Repository: kvcache-ai/ktransformers
Branch: main
Commit: 8561a71dd11e
Files: 1146
Total size: 12.2 MB

Directory structure:
gitextract_0e22n38f/

├── .github/
│   ├── CODE_OF_CONDUCT.md
│   ├── CONTRIBUTING.md
│   ├── ISSUE_TEMPLATE/
│   │   ├── -bug-.yaml
│   │   ├── -feature-.yaml
│   │   └── config.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── SECURITY.md
│   └── workflows/
│       ├── book-ci.yml
│       ├── deploy.yml
│       ├── docker-image.yml
│       ├── kt-kernel-tests.yml
│       ├── release-fake-tag.yml
│       ├── release-pypi.yml
│       ├── release-sglang-kt.yml
│       └── sync-sglang-submodule.yml
├── .gitignore
├── .gitmodules
├── LICENSE
├── MAINTAINERS.md
├── README.md
├── README_ZH.md
├── archive/
│   ├── .devcontainer/
│   │   ├── Dockerfile
│   │   └── devcontainer.json
│   ├── .flake8
│   ├── .gitmodules
│   ├── .pylintrc
│   ├── Dockerfile
│   ├── Dockerfile.xpu
│   ├── LICENSE
│   ├── MANIFEST.in
│   ├── Makefile
│   ├── README.md
│   ├── README_LEGACY.md
│   ├── README_ZH.md
│   ├── README_ZH_LEGACY.md
│   ├── SECURITY.md
│   ├── book.toml
│   ├── config.json
│   ├── csrc/
│   │   ├── balance_serve/
│   │   │   └── CMakeLists.txt
│   │   ├── custom_marlin/
│   │   │   ├── __init__.py
│   │   │   ├── binding.cpp
│   │   │   ├── gptq_marlin/
│   │   │   │   ├── gptq_marlin.cu
│   │   │   │   ├── gptq_marlin.cuh
│   │   │   │   ├── gptq_marlin_dtypes.cuh
│   │   │   │   ├── gptq_marlin_repack.cu
│   │   │   │   └── ops.h
│   │   │   ├── setup.py
│   │   │   ├── test_cuda_graph.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── format24.py
│   │   │       ├── marlin_24_perms.py
│   │   │       ├── marlin_perms.py
│   │   │       ├── marlin_utils.py
│   │   │       └── quant_utils.py
│   │   └── ktransformers_ext/
│   │       ├── CMakeLists.txt
│   │       ├── bench/
│   │       │   ├── bench_attention.py
│   │       │   ├── bench_attention_torch.py
│   │       │   ├── bench_linear.py
│   │       │   ├── bench_linear_torch.py
│   │       │   ├── bench_mlp.py
│   │       │   ├── bench_mlp_torch.py
│   │       │   ├── bench_moe.py
│   │       │   ├── bench_moe_amx.py
│   │       │   └── bench_moe_torch.py
│   │       ├── cmake/
│   │       │   └── FindSIMD.cmake
│   │       ├── cpu_backend/
│   │       │   ├── backend.cpp
│   │       │   ├── backend.h
│   │       │   ├── cpuinfer.h
│   │       │   ├── shared_mem_buffer.cpp
│   │       │   ├── shared_mem_buffer.h
│   │       │   ├── task_queue.cpp
│   │       │   ├── task_queue.h
│   │       │   └── vendors/
│   │       │       ├── README.md
│   │       │       ├── cuda.h
│   │       │       ├── hip.h
│   │       │       ├── musa.h
│   │       │       └── vendor.h
│   │       ├── cuda/
│   │       │   ├── binding.cpp
│   │       │   ├── custom_gguf/
│   │       │   │   ├── dequant.cu
│   │       │   │   └── ops.h
│   │       │   ├── gptq_marlin/
│   │       │   │   ├── gptq_marlin.cu
│   │       │   │   ├── gptq_marlin.cuh
│   │       │   │   ├── gptq_marlin_dtypes.cuh
│   │       │   │   └── ops.h
│   │       │   ├── setup.py
│   │       │   └── test_dequant.py
│   │       ├── examples/
│   │       │   ├── test_attention.py
│   │       │   ├── test_linear.py
│   │       │   ├── test_mlp.py
│   │       │   └── test_moe.py
│   │       ├── ext_bindings.cpp
│   │       ├── operators/
│   │       │   ├── amx/
│   │       │   │   ├── la/
│   │       │   │   │   ├── amx.hpp
│   │       │   │   │   └── utils.hpp
│   │       │   │   └── moe.hpp
│   │       │   ├── kvcache/
│   │       │   │   ├── kvcache.h
│   │       │   │   ├── kvcache_attn.cpp
│   │       │   │   ├── kvcache_load_dump.cpp
│   │       │   │   ├── kvcache_read_write.cpp
│   │       │   │   └── kvcache_utils.cpp
│   │       │   └── llamafile/
│   │       │       ├── conversion.h
│   │       │       ├── linear.cpp
│   │       │       ├── linear.h
│   │       │       ├── mlp.cpp
│   │       │       ├── mlp.h
│   │       │       ├── moe.cpp
│   │       │       └── moe.h
│   │       └── vendors/
│   │           ├── cuda.h
│   │           ├── hip.h
│   │           ├── musa.h
│   │           └── vendor.h
│   ├── install-with-cache.sh
│   ├── install.bat
│   ├── install.sh
│   ├── ktransformers/
│   │   ├── __init__.py
│   │   ├── configs/
│   │   │   ├── config.yaml
│   │   │   └── log_config.ini
│   │   ├── ktransformers_ext/
│   │   │   ├── operators/
│   │   │   │   └── custom_marlin/
│   │   │   │       └── quantize/
│   │   │   │           └── utils/
│   │   │   │               ├── __init__.py
│   │   │   │               ├── format_24.py
│   │   │   │               ├── marlin_24_perms.py
│   │   │   │               ├── marlin_perms.py
│   │   │   │               ├── marlin_utils.py
│   │   │   │               └── quant_utils.py
│   │   │   └── triton/
│   │   │       └── fp8gemm.py
│   │   ├── local_chat.py
│   │   ├── local_chat_test.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── ascend/
│   │   │   │   ├── custom_ascend_modeling_deepseek_v3.py
│   │   │   │   └── custom_ascend_modeling_qwen3.py
│   │   │   ├── configuration_deepseek.py
│   │   │   ├── configuration_deepseek_v3.py
│   │   │   ├── configuration_glm4_moe.py
│   │   │   ├── configuration_llama.py
│   │   │   ├── configuration_qwen2_moe.py
│   │   │   ├── configuration_qwen3_moe.py
│   │   │   ├── configuration_qwen3_next.py
│   │   │   ├── configuration_smallthinker.py
│   │   │   ├── custom_cache.py
│   │   │   ├── custom_modeling_deepseek_v2.py
│   │   │   ├── custom_modeling_deepseek_v3.py
│   │   │   ├── custom_modeling_glm4_moe.py
│   │   │   ├── custom_modeling_qwen2_moe.py
│   │   │   ├── custom_modeling_qwen3_moe.py
│   │   │   ├── custom_modeling_qwen3_next.py
│   │   │   ├── custom_modeling_smallthinker.py
│   │   │   ├── modeling_deepseek.py
│   │   │   ├── modeling_deepseek_v3.py
│   │   │   ├── modeling_glm4_moe.py
│   │   │   ├── modeling_llama.py
│   │   │   ├── modeling_mixtral.py
│   │   │   ├── modeling_qwen2_moe.py
│   │   │   ├── modeling_qwen3_moe.py
│   │   │   ├── modeling_qwen3_next.py
│   │   │   └── modeling_smallthinker.py
│   │   ├── operators/
│   │   │   ├── RoPE.py
│   │   │   ├── __init__.py
│   │   │   ├── ascend/
│   │   │   │   ├── ascend_attention.py
│   │   │   │   ├── ascend_experts.py
│   │   │   │   ├── ascend_gate.py
│   │   │   │   ├── ascend_layernorm.py
│   │   │   │   ├── ascend_linear.py
│   │   │   │   └── ascend_mlp.py
│   │   │   ├── attention.py
│   │   │   ├── balance_serve_attention.py
│   │   │   ├── base_operator.py
│   │   │   ├── cpuinfer.py
│   │   │   ├── dynamic_attention.py
│   │   │   ├── experts.py
│   │   │   ├── flashinfer_batch_prefill_wrapper.py
│   │   │   ├── flashinfer_wrapper.py
│   │   │   ├── gate.py
│   │   │   ├── layernorm.py
│   │   │   ├── linear.py
│   │   │   ├── mlp.py
│   │   │   ├── models.py
│   │   │   ├── triton_attention.py
│   │   │   └── triton_attention_prefill.py
│   │   ├── optimize/
│   │   │   ├── optimize.py
│   │   │   └── optimize_rules/
│   │   │       ├── DeepSeek-V2-Chat-multi-gpu-4.yaml
│   │   │       ├── DeepSeek-V2-Chat-multi-gpu.yaml
│   │   │       ├── DeepSeek-V2-Chat.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-gpu-cpu.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-multi-gpu.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat.yaml
│   │   │       ├── DeepSeek-V3-Chat-amx.yaml
│   │   │       ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve-amx.yaml
│   │   │       ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml
│   │   │       ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-4.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-8.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-marlin.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu.yaml
│   │   │       ├── DeepSeek-V3-Chat-npu.yaml
│   │   │       ├── DeepSeek-V3-Chat-serve.yaml
│   │   │       ├── DeepSeek-V3-Chat.yaml
│   │   │       ├── Glm4Moe-serve.yaml
│   │   │       ├── Internlm2_5-7b-Chat-1m.yaml
│   │   │       ├── Mixtral.yaml
│   │   │       ├── Moonlight-16B-A3B-serve.yaml
│   │   │       ├── Moonlight-16B-A3B.yaml
│   │   │       ├── Qwen2-57B-A14B-Instruct-multi-gpu.yaml
│   │   │       ├── Qwen2-57B-A14B-Instruct.yaml
│   │   │       ├── Qwen2-serve-amx.yaml
│   │   │       ├── Qwen2-serve.yaml
│   │   │       ├── Qwen3Moe-serve-amx.yaml
│   │   │       ├── Qwen3Moe-serve.yaml
│   │   │       ├── Qwen3Next-serve.yaml
│   │   │       ├── Smallthinker-serve.yaml
│   │   │       ├── npu/
│   │   │       │   ├── DeepSeek-V3-Chat-300IA2-npu-serve.yaml
│   │   │       │   ├── DeepSeek-V3-Chat-300IA2-npu.yaml
│   │   │       │   └── Qwen3-Chat-300IA2-npu-serve.yaml
│   │   │       ├── rocm/
│   │   │       │   └── DeepSeek-V3-Chat.yaml
│   │   │       └── xpu/
│   │   │           ├── DeepSeek-V2-Chat.yaml
│   │   │           ├── DeepSeek-V3-Chat.yaml
│   │   │           └── Qwen3Moe-Chat.yaml
│   │   ├── server/
│   │   │   ├── __init__.py
│   │   │   ├── api/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── ollama/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── completions.py
│   │   │   │   ├── openai/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── assistants/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── assistants.py
│   │   │   │   │   │   ├── messages.py
│   │   │   │   │   │   ├── runs.py
│   │   │   │   │   │   └── threads.py
│   │   │   │   │   ├── endpoints/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── chat.py
│   │   │   │   │   └── legacy/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── completions.py
│   │   │   │   └── web/
│   │   │   │       ├── __init__.py
│   │   │   │       └── system.py
│   │   │   ├── args.py
│   │   │   ├── backend/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── base.py
│   │   │   │   ├── context_manager.py
│   │   │   │   └── interfaces/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── balance_serve.py
│   │   │   │       ├── exllamav2.py
│   │   │   │       ├── ktransformers.py
│   │   │   │       └── transformers.py
│   │   │   ├── balance_serve/
│   │   │   │   ├── inference/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── config.py
│   │   │   │   │   ├── distributed/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── communication_op.py
│   │   │   │   │   │   ├── cuda_wrapper.py
│   │   │   │   │   │   ├── custom_all_reduce.py
│   │   │   │   │   │   ├── custom_all_reduce_utils.py
│   │   │   │   │   │   ├── parallel_state.py
│   │   │   │   │   │   ├── pynccl.py
│   │   │   │   │   │   ├── pynccl_wrapper.py
│   │   │   │   │   │   └── utils.py
│   │   │   │   │   ├── forward_batch.py
│   │   │   │   │   ├── model_runner.py
│   │   │   │   │   ├── query_manager.py
│   │   │   │   │   └── sampling/
│   │   │   │   │       ├── penaltylib/
│   │   │   │   │       │   ├── __init__.py
│   │   │   │   │       │   ├── orchestrator.py
│   │   │   │   │       │   └── penalizers/
│   │   │   │   │       │       ├── frequency_penalty.py
│   │   │   │   │       │       ├── min_new_tokens.py
│   │   │   │   │       │       ├── presence_penalty.py
│   │   │   │   │       │       └── repetition_penalty.py
│   │   │   │   │       └── sampler.py
│   │   │   │   ├── sched_rpc.py
│   │   │   │   └── settings.py
│   │   │   ├── config/
│   │   │   │   ├── config.py
│   │   │   │   ├── log.py
│   │   │   │   └── singleton.py
│   │   │   ├── crud/
│   │   │   │   ├── __init__.py
│   │   │   │   └── assistants/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── assistants.py
│   │   │   │       ├── messages.py
│   │   │   │       ├── runs.py
│   │   │   │       └── threads.py
│   │   │   ├── exceptions.py
│   │   │   ├── main.py
│   │   │   ├── models/
│   │   │   │   ├── __init__.py
│   │   │   │   └── assistants/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── assistants.py
│   │   │   │       ├── messages.py
│   │   │   │       ├── run_steps.py
│   │   │   │       ├── runs.py
│   │   │   │       └── threads.py
│   │   │   ├── requirements.txt
│   │   │   ├── schemas/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── assistants/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── assistants.py
│   │   │   │   │   ├── messages.py
│   │   │   │   │   ├── runs.py
│   │   │   │   │   ├── streaming.py
│   │   │   │   │   ├── threads.py
│   │   │   │   │   └── tool.py
│   │   │   │   ├── base.py
│   │   │   │   ├── conversation.py
│   │   │   │   ├── endpoints/
│   │   │   │   │   └── chat.py
│   │   │   │   └── legacy/
│   │   │   │       ├── __init__.py
│   │   │   │       └── completions.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── create_interface.py
│   │   │       ├── multi_timer.py
│   │   │       ├── serve_profiling.py
│   │   │       └── sql_utils.py
│   │   ├── tests/
│   │   │   ├── .gitignore
│   │   │   ├── AIME_2024/
│   │   │   │   ├── eval_api.py
│   │   │   │   ├── evaluation.py
│   │   │   │   └── prompts.py
│   │   │   ├── UT/
│   │   │   │   ├── test_kdeepseek_attention_w8a8a2serve_npu.py
│   │   │   │   └── test_kdeepseek_ln_npu.py
│   │   │   ├── dequant_gpu.py
│   │   │   ├── dequant_gpu_t.py
│   │   │   ├── function_call_test.py
│   │   │   ├── humaneval/
│   │   │   │   ├── eval_api.py
│   │   │   │   ├── evaluation.py
│   │   │   │   └── prompts.py
│   │   │   ├── mmlu_pro_test.py
│   │   │   ├── mmlu_test.py
│   │   │   ├── mmlu_test_multi.py
│   │   │   ├── parse_cover_info.py
│   │   │   ├── score.py
│   │   │   ├── test_client.py
│   │   │   ├── test_prefix.py
│   │   │   ├── test_pytorch_q8.py
│   │   │   ├── test_speed.py
│   │   │   └── triton_fp8gemm_test.py
│   │   ├── util/
│   │   │   ├── ascend/
│   │   │   │   └── ascend_utils.py
│   │   │   ├── cuda_graph_runner.py
│   │   │   ├── custom_gguf.py
│   │   │   ├── custom_loader.py
│   │   │   ├── modeling_rope_utils.py
│   │   │   ├── npu_graph_runner.py
│   │   │   ├── textstream.py
│   │   │   ├── utils.py
│   │   │   ├── vendors.py
│   │   │   └── weight_loader.py
│   │   └── website/
│   │       ├── .browserslistrc
│   │       ├── .eslintrc.js
│   │       ├── .gitignore
│   │       ├── README.md
│   │       ├── config.d.ts
│   │       ├── jest.config.js
│   │       ├── package.json
│   │       ├── public/
│   │       │   ├── config.js
│   │       │   ├── css/
│   │       │   │   └── reset.css
│   │       │   └── index.html
│   │       ├── src/
│   │       │   ├── App.vue
│   │       │   ├── api/
│   │       │   │   ├── api-client.ts
│   │       │   │   ├── assistant.ts
│   │       │   │   ├── message.ts
│   │       │   │   ├── run.ts
│   │       │   │   └── thread.ts
│   │       │   ├── assets/
│   │       │   │   ├── css/
│   │       │   │   │   └── mixins.styl
│   │       │   │   └── iconfont/
│   │       │   │       ├── demo.css
│   │       │   │       ├── demo_index.html
│   │       │   │       ├── iconfont.css
│   │       │   │       ├── iconfont.js
│   │       │   │       └── iconfont.json
│   │       │   ├── components/
│   │       │   │   └── chat/
│   │       │   │       └── index.vue
│   │       │   ├── conf/
│   │       │   │   └── config.ts
│   │       │   ├── locals/
│   │       │   │   ├── en.js
│   │       │   │   ├── index.js
│   │       │   │   └── zh.js
│   │       │   ├── main.ts
│   │       │   ├── router/
│   │       │   │   └── index.ts
│   │       │   ├── shims-vue.d.ts
│   │       │   ├── store/
│   │       │   │   └── index.ts
│   │       │   ├── utils/
│   │       │   │   ├── copy.ts
│   │       │   │   └── types.ts
│   │       │   └── views/
│   │       │       └── home.vue
│   │       ├── tests/
│   │       │   └── unit/
│   │       │       └── example.spec.ts
│   │       ├── tsconfig.json
│   │       └── vue.config.js
│   ├── merge_tensors/
│   │   ├── merge_safetensor_gguf.py
│   │   └── merge_safetensor_gguf_for_qwen3.py
│   ├── pyproject.toml
│   ├── requirements-local_chat.txt
│   ├── setup.py
│   └── third_party/
│       ├── llamafile/
│       │   ├── README.md
│       │   ├── bench.h
│       │   ├── flags.cpp
│       │   ├── flags.h
│       │   ├── iqk_mul_mat.inc
│       │   ├── iqk_mul_mat_amd_avx2.cpp
│       │   ├── iqk_mul_mat_amd_zen4.cpp
│       │   ├── iqk_mul_mat_arm.inc
│       │   ├── iqk_mul_mat_arm82.cpp
│       │   ├── iqk_mul_mat_x86.inc
│       │   ├── macros.h
│       │   ├── micros.h
│       │   ├── numba.h
│       │   ├── sgemm.cpp
│       │   ├── sgemm.h
│       │   ├── sgemm_arm.cpp
│       │   ├── sgemm_x86.cpp
│       │   ├── tinyblas_cpu.h
│       │   ├── tinyblas_cpu_mixmul.inc
│       │   ├── tinyblas_cpu_mixmul_amd_avx.cpp
│       │   ├── tinyblas_cpu_mixmul_amd_avx2.cpp
│       │   ├── tinyblas_cpu_mixmul_amd_avx512f.cpp
│       │   ├── tinyblas_cpu_mixmul_amd_avxvnni.cpp
│       │   ├── tinyblas_cpu_mixmul_amd_fma.cpp
│       │   ├── tinyblas_cpu_mixmul_amd_zen4.cpp
│       │   ├── tinyblas_cpu_mixmul_arm80.cpp
│       │   ├── tinyblas_cpu_mixmul_arm82.cpp
│       │   ├── tinyblas_cpu_sgemm.inc
│       │   ├── tinyblas_cpu_sgemm_amd_avx.cpp
│       │   ├── tinyblas_cpu_sgemm_amd_avx2.cpp
│       │   ├── tinyblas_cpu_sgemm_amd_avx512f.cpp
│       │   ├── tinyblas_cpu_sgemm_amd_avxvnni.cpp
│       │   ├── tinyblas_cpu_sgemm_amd_fma.cpp
│       │   ├── tinyblas_cpu_sgemm_amd_zen4.cpp
│       │   ├── tinyblas_cpu_sgemm_arm.inc
│       │   ├── tinyblas_cpu_sgemm_arm80.cpp
│       │   ├── tinyblas_cpu_sgemm_arm82.cpp
│       │   ├── tinyblas_cpu_sgemm_x86.inc
│       │   └── tinyblas_cpu_unsupported.cpp
│       └── nlohmann/
│           ├── json.hpp
│           └── json_fwd.hpp
├── book.toml
├── doc/
│   ├── SUMMARY.md
│   ├── basic/
│   │   ├── note1.md
│   │   └── note2.md
│   ├── en/
│   │   ├── AMX.md
│   │   ├── DeepseekR1_V3_tutorial.md
│   │   ├── Docker.md
│   │   ├── Docker_xpu.md
│   │   ├── FAQ.md
│   │   ├── Kimi-K2-Thinking.md
│   │   ├── Kimi-K2.5.md
│   │   ├── Kimi-K2.md
│   │   ├── Kllama_tutorial_DeepSeekV2Lite.ipynb
│   │   ├── MiniMax-M2.5.md
│   │   ├── Qwen3-Next.md
│   │   ├── Qwen3.5.md
│   │   ├── ROCm.md
│   │   ├── SFT/
│   │   │   ├── DPO_tutorial.md
│   │   │   ├── KTransformers-Fine-Tuning_Developer-Technical-Notes.md
│   │   │   ├── KTransformers-Fine-Tuning_User-Guide.md
│   │   │   ├── README.md
│   │   │   └── injection_tutorial.md
│   │   ├── SFT_Installation_Guide_KimiK2.5.md
│   │   ├── SFT_Installation_Guide_KimiK2.md
│   │   ├── SmallThinker_and_Glm4moe.md
│   │   ├── V3-success.md
│   │   ├── api/
│   │   │   └── server/
│   │   │       ├── api.md
│   │   │       ├── server.md
│   │   │       ├── tabby.md
│   │   │       └── website.md
│   │   ├── balance-serve.md
│   │   ├── benchmark.md
│   │   ├── deepseek-v2-injection.md
│   │   ├── fp8_kernel.md
│   │   ├── install.md
│   │   ├── kt-kernel/
│   │   │   ├── GLM-5-Tutorial.md
│   │   │   ├── Kimi-K2-Thinking-Native.md
│   │   │   ├── MiniMax-M2.1-Tutorial.md
│   │   │   ├── Native-Precision-Tutorial.md
│   │   │   ├── Qwen3-Coder-Next-Tutorial.md
│   │   │   ├── README.md
│   │   │   ├── amd_blis.md
│   │   │   ├── deepseek-v3.2-sglang-tutorial.md
│   │   │   ├── experts-sched-Tutorial.md
│   │   │   └── kt-cli.md
│   │   ├── llama4.md
│   │   ├── long_context_introduction.md
│   │   ├── long_context_tutorial.md
│   │   ├── makefile_usage.md
│   │   ├── multi-gpu-tutorial.md
│   │   ├── operators/
│   │   │   └── llamafile.md
│   │   ├── prefix_cache.md
│   │   └── xpu.md
│   └── zh/
│       ├── DeepseekR1_V3_tutorial_zh.md
│       ├── DeepseekR1_V3_tutorial_zh_for_Ascend_NPU.md
│       ├── KTransformers-Fine-Tuning_Developer-Technical-Notes_zh.md
│       ├── KTransformers-Fine-Tuning_User-Guide_zh.md
│       ├── Qwen3-MoE_tutorial_zh_for_Ascend_NPU.md
│       ├── api/
│       │   └── server/
│       │       ├── api.md
│       │       ├── server.md
│       │       ├── tabby.md
│       │       └── website.md
│       └── clawdbot_integration_guide.md
├── docker/
│   ├── Dockerfile
│   ├── README-packaging.md
│   ├── docker-utils.sh
│   └── push-to-dockerhub.sh
├── install.sh
├── kt-kernel/
│   ├── .clang-format
│   ├── .githooks/
│   │   ├── commit-msg
│   │   └── pre-commit
│   ├── .gitignore
│   ├── .gitmodules
│   ├── CMakeLists.txt
│   ├── CMakePresets.json
│   ├── MANIFEST.in
│   ├── README.md
│   ├── README_zh.md
│   ├── bench/
│   │   ├── .gitignore
│   │   ├── Makefile
│   │   ├── bench_attention.py
│   │   ├── bench_attention_torch.py
│   │   ├── bench_bf16_moe.py
│   │   ├── bench_fp8_moe.py
│   │   ├── bench_fp8_perchannel_moe.py
│   │   ├── bench_k2_moe_amx.py
│   │   ├── bench_k2_write_buffer.py
│   │   ├── bench_linear.py
│   │   ├── bench_linear_torch.py
│   │   ├── bench_mla.py
│   │   ├── bench_mlp.py
│   │   ├── bench_mlp_torch.py
│   │   ├── bench_moe.py
│   │   ├── bench_moe_amx.py
│   │   ├── bench_moe_amx_k.py
│   │   ├── bench_moe_kernel.py
│   │   ├── bench_moe_kernel_tiling.py
│   │   ├── bench_moe_kml.py
│   │   ├── bench_moe_torch.py
│   │   ├── bench_write_buffer.py
│   │   ├── compare_moe_performance.py
│   │   ├── multi_bench_moe.py
│   │   └── upload-bench-json.py
│   ├── cmake/
│   │   ├── DetectCPU.cmake
│   │   └── FindSIMD.cmake
│   ├── cpu_backend/
│   │   ├── cpuinfer.h
│   │   ├── shared_mem_buffer.cpp
│   │   ├── shared_mem_buffer.h
│   │   ├── task_queue.cpp
│   │   ├── task_queue.h
│   │   ├── vendors/
│   │   │   ├── README.md
│   │   │   ├── cuda.h
│   │   │   ├── hip.h
│   │   │   ├── musa.h
│   │   │   └── vendor.h
│   │   ├── worker_pool.cpp
│   │   └── worker_pool.h
│   ├── cuda/
│   │   ├── binding.cpp
│   │   ├── custom_gguf/
│   │   │   ├── dequant.cu
│   │   │   └── ops.h
│   │   ├── gptq_marlin/
│   │   │   ├── gptq_marlin.cu
│   │   │   ├── gptq_marlin.cuh
│   │   │   ├── gptq_marlin_dtypes.cuh
│   │   │   └── ops.h
│   │   ├── moe/
│   │   │   ├── moe_topk_softmax_kernels.cu
│   │   │   ├── ops.h
│   │   │   └── utils.h
│   │   ├── setup.py
│   │   └── test_dequant.py
│   ├── demo/
│   │   ├── .gitignore
│   │   ├── Makefile
│   │   ├── bench_reorder_bandwidth.cpp
│   │   ├── bf16-test.cpp
│   │   ├── fp16-test.cpp
│   │   ├── plot.py
│   │   ├── simple_test.cpp
│   │   ├── simple_test_aocl.cpp
│   │   └── tflops.py
│   ├── examples/
│   │   ├── .gitignore
│   │   ├── bench_moe_amx_int8.py
│   │   ├── configuration_deepseek_v3.py
│   │   ├── modeling_deepseek_v3.py
│   │   ├── repro_llamafile_re.py
│   │   ├── test-debug.py
│   │   ├── test_apply_rope.py
│   │   ├── test_attention.py
│   │   ├── test_awq_moe_amx.py
│   │   ├── test_bf16_moe.py
│   │   ├── test_deepseekv3.py
│   │   ├── test_deepseekv3_prefill.py
│   │   ├── test_deepseekv3_prefill_speed.py
│   │   ├── test_fp8_moe.py
│   │   ├── test_fp8_perchannel_moe.py
│   │   ├── test_gate.py
│   │   ├── test_k2_moe_amx.py
│   │   ├── test_k2_write_buffer.py
│   │   ├── test_linear.py
│   │   ├── test_mla.py
│   │   ├── test_mla_qlen.py
│   │   ├── test_mla_quant.py
│   │   ├── test_mla_simple.py
│   │   ├── test_mla_torch.py
│   │   ├── test_mlp.py
│   │   ├── test_moe.py
│   │   ├── test_moe_amx.py
│   │   ├── test_moe_kernel.py
│   │   ├── test_moe_kml.py
│   │   ├── test_rope.cpp
│   │   ├── test_rope.py
│   │   ├── test_softmax.py
│   │   ├── test_write_buffer.py
│   │   └── torch_attention.py
│   ├── ext_bindings.cpp
│   ├── install.sh
│   ├── operators/
│   │   ├── amx/
│   │   │   ├── awq-moe.hpp
│   │   │   ├── bf16-moe.hpp
│   │   │   ├── fp8-moe.hpp
│   │   │   ├── fp8-perchannel-moe.hpp
│   │   │   ├── k2-moe.hpp
│   │   │   ├── la/
│   │   │   │   ├── amx-example.cpp
│   │   │   │   ├── amx.hpp
│   │   │   │   ├── amx_buffers.hpp
│   │   │   │   ├── amx_config.hpp
│   │   │   │   ├── amx_kernels.hpp
│   │   │   │   ├── amx_quantization.hpp
│   │   │   │   ├── amx_raw_buffers.hpp
│   │   │   │   ├── amx_raw_kernels.hpp
│   │   │   │   ├── amx_utils.hpp
│   │   │   │   ├── pack.hpp
│   │   │   │   └── utils.hpp
│   │   │   ├── moe.hpp
│   │   │   ├── moe_base.hpp
│   │   │   └── test/
│   │   │       ├── amx-bkgroup-test.cpp
│   │   │       ├── amx-c-reduce-test.cpp
│   │   │       ├── amx-kgroup-test.cpp
│   │   │       ├── amx-test.cpp
│   │   │       ├── analyze-error.cpp
│   │   │       ├── avx-test.cpp
│   │   │       ├── debug-kgroup-details.cpp
│   │   │       ├── debug-kgroup.cpp
│   │   │       ├── debug-specific-dims.cpp
│   │   │       ├── mat-test.hpp
│   │   │       ├── mmq-test.cpp
│   │   │       ├── mmq.cpp
│   │   │       ├── mmq.h
│   │   │       ├── test-kgroup-128.cpp
│   │   │       ├── test-kgroup-kernel.cpp
│   │   │       ├── test-specific-dims.cpp
│   │   │       ├── thread_test.sh
│   │   │       ├── timer.hh
│   │   │       └── verify-kgroup.cpp
│   │   ├── common.hpp
│   │   ├── kvcache/
│   │   │   ├── kvcache.h
│   │   │   ├── kvcache_attn.cpp
│   │   │   ├── kvcache_load_dump.cpp
│   │   │   ├── kvcache_read_write.cpp
│   │   │   └── kvcache_utils.cpp
│   │   ├── llamafile/
│   │   │   ├── conversion.h
│   │   │   ├── linear.cpp
│   │   │   ├── linear.h
│   │   │   ├── mla.hpp
│   │   │   ├── mlp.cpp
│   │   │   ├── mlp.h
│   │   │   └── moe.hpp
│   │   ├── mla-tp.hpp
│   │   ├── moe-tp.hpp
│   │   ├── moe_kernel/
│   │   │   ├── api/
│   │   │   │   ├── common.h
│   │   │   │   └── mat_kernel.h
│   │   │   ├── la/
│   │   │   │   ├── kernel.hpp
│   │   │   │   ├── mat_kernel.cpp
│   │   │   │   └── utils.hpp
│   │   │   ├── mat_kernel/
│   │   │   │   ├── aocl_kernel/
│   │   │   │   │   └── kernel.cpp
│   │   │   │   └── batch_gemm_api.hpp
│   │   │   ├── moe.hpp
│   │   │   └── test/
│   │   │       ├── convert-test.cpp
│   │   │       ├── debug.hpp
│   │   │       ├── int4_mul-test.cpp
│   │   │       ├── mat_test.cpp
│   │   │       └── utils_test.cpp
│   │   ├── reduce.hpp
│   │   ├── rms-norm.hpp
│   │   ├── rope.hpp
│   │   ├── softmax.hpp
│   │   └── tp.hpp
│   ├── pyproject.toml
│   ├── pytest.ini
│   ├── python/
│   │   ├── __init__.py
│   │   ├── _cpu_detect.py
│   │   ├── cli/
│   │   │   ├── __init__.py
│   │   │   ├── commands/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── bench.py
│   │   │   │   ├── chat.py
│   │   │   │   ├── config.py
│   │   │   │   ├── doctor.py
│   │   │   │   ├── model.py
│   │   │   │   ├── quant.py
│   │   │   │   ├── run.py
│   │   │   │   ├── sft.py
│   │   │   │   └── version.py
│   │   │   ├── completions/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _kt
│   │   │   │   ├── kt-completion.bash
│   │   │   │   └── kt.fish
│   │   │   ├── config/
│   │   │   │   ├── __init__.py
│   │   │   │   └── settings.py
│   │   │   ├── i18n.py
│   │   │   ├── main.py
│   │   │   ├── requirements/
│   │   │   │   ├── inference.txt
│   │   │   │   └── sft.txt
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── analyze_moe_model.py
│   │   │       ├── console.py
│   │   │       ├── debug_configs.py
│   │   │       ├── download_helper.py
│   │   │       ├── environment.py
│   │   │       ├── input_validators.py
│   │   │       ├── kv_cache_calculator.py
│   │   │       ├── model_discovery.py
│   │   │       ├── model_registry.py
│   │   │       ├── model_scanner.py
│   │   │       ├── model_table_builder.py
│   │   │       ├── model_verifier.py
│   │   │       ├── port_checker.py
│   │   │       ├── quant_interactive.py
│   │   │       ├── repo_detector.py
│   │   │       ├── run_configs.py
│   │   │       ├── run_interactive.py
│   │   │       ├── sglang_checker.py
│   │   │       ├── tuna_engine.py
│   │   │       └── user_model_registry.py
│   │   ├── experts.py
│   │   ├── experts_base.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── amx.py
│   │       ├── llamafile.py
│   │       ├── loader.py
│   │       └── moe_kernel.py
│   ├── requirements.txt
│   ├── scripts/
│   │   ├── README.md
│   │   ├── check.py
│   │   ├── check_cpu_features.py
│   │   ├── compare_weights.py
│   │   ├── convert_cpu_weights.py
│   │   ├── convert_gpu_weights.py
│   │   ├── convert_kimi_k2_fp8_to_bf16_cpu.py
│   │   ├── convert_moe_to_bf16.py
│   │   └── install-git-hooks.sh
│   ├── setup.py
│   └── test/
│       ├── __init__.py
│       ├── ci/
│       │   ├── __init__.py
│       │   ├── ci_register.py
│       │   └── ci_utils.py
│       ├── per_commit/
│       │   ├── __init__.py
│       │   ├── test_amd_placeholder.py
│       │   ├── test_basic_cpu.py
│       │   ├── test_cuda_placeholder.py
│       │   ├── test_moe_amx_accuracy_int4.py
│       │   ├── test_moe_amx_accuracy_int4_1.py
│       │   ├── test_moe_amx_accuracy_int4_1k.py
│       │   ├── test_moe_amx_accuracy_int8.py
│       │   ├── test_moe_amx_bench_int4.py
│       │   ├── test_moe_amx_bench_int4_1.py
│       │   ├── test_moe_amx_bench_int4_1k.py
│       │   └── test_moe_amx_bench_int8.py
│       ├── run_suite.py
│       └── test_generate_gpu_experts_masks.py
├── kt-sft/
│   ├── .flake8
│   ├── .gitignore
│   ├── .gitmodules
│   ├── .pylintrc
│   ├── Dockerfile
│   ├── Dockerfile.xpu
│   ├── LICENSE
│   ├── MANIFEST.in
│   ├── Makefile
│   ├── README.md
│   ├── SECURITY.md
│   ├── autosetup.sh
│   ├── book.toml
│   ├── csrc/
│   │   ├── custom_marlin/
│   │   │   ├── __init__.py
│   │   │   ├── binding.cpp
│   │   │   ├── gptq_marlin/
│   │   │   │   ├── gptq_marlin.cu
│   │   │   │   ├── gptq_marlin.cuh
│   │   │   │   ├── gptq_marlin_dtypes.cuh
│   │   │   │   ├── gptq_marlin_repack.cu
│   │   │   │   └── ops.h
│   │   │   ├── setup.py
│   │   │   ├── test_cuda_graph.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── format24.py
│   │   │       ├── marlin_24_perms.py
│   │   │       ├── marlin_perms.py
│   │   │       ├── marlin_utils.py
│   │   │       └── quant_utils.py
│   │   └── ktransformers_ext/
│   │       ├── CMakeLists.txt
│   │       ├── bench/
│   │       │   ├── bench_attention.py
│   │       │   ├── bench_attention_torch.py
│   │       │   ├── bench_linear.py
│   │       │   ├── bench_linear_torch.py
│   │       │   ├── bench_mlp.py
│   │       │   ├── bench_mlp_torch.py
│   │       │   ├── bench_moe.py
│   │       │   ├── bench_moe_amx.py
│   │       │   └── bench_moe_torch.py
│   │       ├── cmake/
│   │       │   └── FindSIMD.cmake
│   │       ├── cpu_backend/
│   │       │   ├── backend.cpp
│   │       │   ├── backend.h
│   │       │   ├── cpuinfer.h
│   │       │   ├── shared_mem_buffer.cpp
│   │       │   ├── shared_mem_buffer.h
│   │       │   ├── task_queue.cpp
│   │       │   ├── task_queue.h
│   │       │   └── vendors/
│   │       │       ├── README.md
│   │       │       ├── cuda.h
│   │       │       ├── hip.h
│   │       │       ├── musa.h
│   │       │       └── vendor.h
│   │       ├── cuda/
│   │       │   ├── binding.cpp
│   │       │   ├── custom_gguf/
│   │       │   │   ├── dequant.cu
│   │       │   │   └── ops.h
│   │       │   ├── gptq_marlin/
│   │       │   │   ├── gptq_marlin.cu
│   │       │   │   ├── gptq_marlin.cuh
│   │       │   │   ├── gptq_marlin_dtypes.cuh
│   │       │   │   └── ops.h
│   │       │   ├── setup.py
│   │       │   └── test_dequant.py
│   │       ├── examples/
│   │       │   ├── test_attention.py
│   │       │   ├── test_linear.py
│   │       │   ├── test_mlp.py
│   │       │   ├── test_moe.py
│   │       │   ├── test_sft_amx_moe.py
│   │       │   └── test_sft_moe.py
│   │       ├── ext_bindings.cpp
│   │       ├── operators/
│   │       │   ├── amx/
│   │       │   │   ├── debug_sft_moe.hpp
│   │       │   │   ├── debug_tools_sft_moe.hpp
│   │       │   │   ├── la/
│   │       │   │   │   ├── amx.hpp
│   │       │   │   │   └── utils.hpp
│   │       │   │   ├── moe.hpp
│   │       │   │   └── sft_moe.hpp
│   │       │   ├── kvcache/
│   │       │   │   ├── kvcache.h
│   │       │   │   ├── kvcache_attn.cpp
│   │       │   │   ├── kvcache_load_dump.cpp
│   │       │   │   ├── kvcache_read_write.cpp
│   │       │   │   └── kvcache_utils.cpp
│   │       │   └── llamafile/
│   │       │       ├── conversion.h
│   │       │       ├── linear.cpp
│   │       │       ├── linear.h
│   │       │       ├── mlp.cpp
│   │       │       ├── mlp.h
│   │       │       ├── moe.cpp
│   │       │       ├── moe.h
│   │       │       ├── sft_moe.cpp
│   │       │       ├── sft_moe.h
│   │       │       └── sft_moe_forward_cache.h
│   │       └── vendors/
│   │           ├── cuda.h
│   │           ├── hip.h
│   │           ├── musa.h
│   │           └── vendor.h
│   ├── install-with-cache.sh
│   ├── install.bat
│   ├── install.sh
│   ├── ktransformers/
│   │   ├── __init__.py
│   │   ├── configs/
│   │   │   ├── config.yaml
│   │   │   ├── log_config.ini
│   │   │   └── model_config/
│   │   │       ├── config.json
│   │   │       └── configuration_deepseek.py
│   │   ├── ktransformers_ext/
│   │   │   ├── operators/
│   │   │   │   └── custom_marlin/
│   │   │   │       └── quantize/
│   │   │   │           └── utils/
│   │   │   │               ├── __init__.py
│   │   │   │               ├── format_24.py
│   │   │   │               ├── marlin_24_perms.py
│   │   │   │               ├── marlin_perms.py
│   │   │   │               ├── marlin_utils.py
│   │   │   │               └── quant_utils.py
│   │   │   └── triton/
│   │   │       └── fp8gemm.py
│   │   ├── local_chat.py
│   │   ├── local_chat.sh
│   │   ├── lora_test_module.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── configuration_deepseek.py
│   │   │   ├── configuration_deepseek_v3.py
│   │   │   ├── configuration_llama.py
│   │   │   ├── configuration_qwen2_moe.py
│   │   │   ├── configuration_qwen3_moe.py
│   │   │   ├── custom_cache.py
│   │   │   ├── custom_modeling_deepseek_v2.py
│   │   │   ├── custom_modeling_deepseek_v3.py
│   │   │   ├── custom_modeling_qwen2_moe.py
│   │   │   ├── custom_modeling_qwen3_moe.py
│   │   │   ├── modeling_deepseek.py
│   │   │   ├── modeling_deepseek_v3.py
│   │   │   ├── modeling_llama.py
│   │   │   ├── modeling_mixtral.py
│   │   │   ├── modeling_qwen2_moe.py
│   │   │   └── modeling_qwen3_moe.py
│   │   ├── moe_test_module.py
│   │   ├── moe_test_module_old.py
│   │   ├── operators/
│   │   │   ├── RoPE.py
│   │   │   ├── __init__.py
│   │   │   ├── attention.py
│   │   │   ├── balance_serve_attention.py
│   │   │   ├── base_operator.py
│   │   │   ├── cpuinfer.py
│   │   │   ├── dynamic_attention.py
│   │   │   ├── experts.py
│   │   │   ├── flashinfer_batch_prefill_wrapper.py
│   │   │   ├── flashinfer_wrapper.py
│   │   │   ├── gate.py
│   │   │   ├── layernorm.py
│   │   │   ├── linear.py
│   │   │   ├── mlp.py
│   │   │   ├── models.py
│   │   │   ├── triton_attention.py
│   │   │   └── triton_attention_prefill.py
│   │   ├── optimize/
│   │   │   ├── optimize.py
│   │   │   └── optimize_rules/
│   │   │       ├── DeepSeek-V2-Chat-multi-gpu-4.yaml
│   │   │       ├── DeepSeek-V2-Chat-multi-gpu.yaml
│   │   │       ├── DeepSeek-V2-Chat-sft-amx.yaml
│   │   │       ├── DeepSeek-V2-Chat.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-multi-gpu.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-sft-amx.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-sft.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-use-adapter.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat.yaml
│   │   │       ├── DeepSeek-V3-Chat-amx.yaml
│   │   │       ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve-amx.yaml
│   │   │       ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml
│   │   │       ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-4.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-8.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-marlin.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu.yaml
│   │   │       ├── DeepSeek-V3-Chat-serve.yaml
│   │   │       ├── DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml
│   │   │       ├── DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
│   │   │       ├── DeepSeek-V3-Chat-sft-amx.yaml
│   │   │       ├── DeepSeek-V3-Chat.yaml
│   │   │       ├── Internlm2_5-7b-Chat-1m.yaml
│   │   │       ├── Mixtral.yaml
│   │   │       ├── Moonlight-16B-A3B-serve.yaml
│   │   │       ├── Moonlight-16B-A3B.yaml
│   │   │       ├── Qwen2-57B-A14B-Instruct-multi-gpu.yaml
│   │   │       ├── Qwen2-57B-A14B-Instruct.yaml
│   │   │       ├── Qwen2-serve-amx.yaml
│   │   │       ├── Qwen2-serve.yaml
│   │   │       ├── Qwen3Moe-serve-amx.yaml
│   │   │       ├── Qwen3Moe-serve.yaml
│   │   │       ├── Qwen3Moe-sft-amx.yaml
│   │   │       ├── rocm/
│   │   │       │   └── DeepSeek-V3-Chat.yaml
│   │   │       └── xpu/
│   │   │           ├── DeepSeek-V2-Chat.yaml
│   │   │           ├── DeepSeek-V3-Chat.yaml
│   │   │           └── Qwen3Moe-Chat.yaml
│   │   ├── server/
│   │   │   ├── __init__.py
│   │   │   ├── api/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── ollama/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── completions.py
│   │   │   │   ├── openai/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── assistants/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── assistants.py
│   │   │   │   │   │   ├── messages.py
│   │   │   │   │   │   ├── runs.py
│   │   │   │   │   │   └── threads.py
│   │   │   │   │   ├── endpoints/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── chat.py
│   │   │   │   │   └── legacy/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── completions.py
│   │   │   │   └── web/
│   │   │   │       ├── __init__.py
│   │   │   │       └── system.py
│   │   │   ├── args.py
│   │   │   ├── backend/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── base.py
│   │   │   │   ├── context_manager.py
│   │   │   │   └── interfaces/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── balance_serve.py
│   │   │   │       ├── exllamav2.py
│   │   │   │       ├── ktransformers.py
│   │   │   │       └── transformers.py
│   │   │   ├── balance_serve/
│   │   │   │   ├── inference/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── config.py
│   │   │   │   │   ├── distributed/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── communication_op.py
│   │   │   │   │   │   ├── cuda_wrapper.py
│   │   │   │   │   │   ├── custom_all_reduce.py
│   │   │   │   │   │   ├── custom_all_reduce_utils.py
│   │   │   │   │   │   ├── parallel_state.py
│   │   │   │   │   │   ├── pynccl.py
│   │   │   │   │   │   ├── pynccl_wrapper.py
│   │   │   │   │   │   └── utils.py
│   │   │   │   │   ├── forward_batch.py
│   │   │   │   │   ├── model_runner.py
│   │   │   │   │   ├── query_manager.py
│   │   │   │   │   └── sampling/
│   │   │   │   │       ├── penaltylib/
│   │   │   │   │       │   ├── __init__.py
│   │   │   │   │       │   ├── orchestrator.py
│   │   │   │   │       │   └── penalizers/
│   │   │   │   │       │       ├── frequency_penalty.py
│   │   │   │   │       │       ├── min_new_tokens.py
│   │   │   │   │       │       ├── presence_penalty.py
│   │   │   │   │       │       └── repetition_penalty.py
│   │   │   │   │       └── sampler.py
│   │   │   │   ├── sched_rpc.py
│   │   │   │   └── settings.py
│   │   │   ├── config/
│   │   │   │   ├── config.py
│   │   │   │   ├── log.py
│   │   │   │   └── singleton.py
│   │   │   ├── crud/
│   │   │   │   ├── __init__.py
│   │   │   │   └── assistants/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── assistants.py
│   │   │   │       ├── messages.py
│   │   │   │       ├── runs.py
│   │   │   │       └── threads.py
│   │   │   ├── exceptions.py
│   │   │   ├── main.py
│   │   │   ├── models/
│   │   │   │   ├── __init__.py
│   │   │   │   └── assistants/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── assistants.py
│   │   │   │       ├── messages.py
│   │   │   │       ├── run_steps.py
│   │   │   │       ├── runs.py
│   │   │   │       └── threads.py
│   │   │   ├── schemas/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── assistants/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── assistants.py
│   │   │   │   │   ├── messages.py
│   │   │   │   │   ├── runs.py
│   │   │   │   │   ├── streaming.py
│   │   │   │   │   ├── threads.py
│   │   │   │   │   └── tool.py
│   │   │   │   ├── base.py
│   │   │   │   ├── conversation.py
│   │   │   │   ├── endpoints/
│   │   │   │   │   └── chat.py
│   │   │   │   └── legacy/
│   │   │   │       ├── __init__.py
│   │   │   │       └── completions.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── create_interface.py
│   │   │       ├── multi_timer.py
│   │   │       └── sql_utils.py
│   │   ├── sft/
│   │   │   ├── __init__.py
│   │   │   ├── flops_utils/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── custom_profile.py
│   │   │   │   └── lora_test_utils.py
│   │   │   ├── lora.py
│   │   │   ├── metrics.py
│   │   │   ├── metrics_utils/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── constants.py
│   │   │   │   ├── env.py
│   │   │   │   ├── logging.py
│   │   │   │   ├── misc.py
│   │   │   │   ├── packages.py
│   │   │   │   └── ploting.py
│   │   │   ├── monkey_patch_torch_module.py
│   │   │   ├── peft_utils/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── lora_layer.py
│   │   │   │   ├── lora_model.py
│   │   │   │   ├── mapping.py
│   │   │   │   └── peft_model.py
│   │   │   └── torchviz_test.py
│   │   ├── tests/
│   │   │   ├── .gitignore
│   │   │   ├── AIME_2024/
│   │   │   │   ├── eval_api.py
│   │   │   │   ├── evaluation.py
│   │   │   │   └── prompts.py
│   │   │   ├── dequant_gpu.py
│   │   │   ├── dequant_gpu_t.py
│   │   │   ├── function_call_test.py
│   │   │   ├── humaneval/
│   │   │   │   ├── eval_api.py
│   │   │   │   ├── evaluation.py
│   │   │   │   └── prompts.py
│   │   │   ├── mmlu_pro_test.py
│   │   │   ├── mmlu_test.py
│   │   │   ├── mmlu_test_multi.py
│   │   │   ├── score.py
│   │   │   ├── test_client.py
│   │   │   ├── test_pytorch_q8.py
│   │   │   ├── test_speed.py
│   │   │   └── triton_fp8gemm_test.py
│   │   ├── util/
│   │   │   ├── cuda_graph_runner.py
│   │   │   ├── custom_gguf.py
│   │   │   ├── custom_loader.py
│   │   │   ├── globals.py
│   │   │   ├── grad_wrapper.py
│   │   │   ├── inference_state.py
│   │   │   ├── modeling_rope_utils.py
│   │   │   ├── textstream.py
│   │   │   ├── utils.py
│   │   │   ├── vendors.py
│   │   │   └── weight_loader.py
│   │   └── website/
│   │       ├── .browserslistrc
│   │       ├── .eslintrc.js
│   │       ├── .gitignore
│   │       ├── README.md
│   │       ├── config.d.ts
│   │       ├── jest.config.js
│   │       ├── package.json
│   │       ├── public/
│   │       │   ├── config.js
│   │       │   ├── css/
│   │       │   │   └── reset.css
│   │       │   └── index.html
│   │       ├── src/
│   │       │   ├── App.vue
│   │       │   ├── api/
│   │       │   │   ├── api-client.ts
│   │       │   │   ├── assistant.ts
│   │       │   │   ├── message.ts
│   │       │   │   ├── run.ts
│   │       │   │   └── thread.ts
│   │       │   ├── assets/
│   │       │   │   ├── css/
│   │       │   │   │   └── mixins.styl
│   │       │   │   └── iconfont/
│   │       │   │       ├── demo.css
│   │       │   │       ├── demo_index.html
│   │       │   │       ├── iconfont.css
│   │       │   │       ├── iconfont.js
│   │       │   │       └── iconfont.json
│   │       │   ├── components/
│   │       │   │   └── chat/
│   │       │   │       └── index.vue
│   │       │   ├── conf/
│   │       │   │   └── config.ts
│   │       │   ├── locals/
│   │       │   │   ├── en.js
│   │       │   │   ├── index.js
│   │       │   │   └── zh.js
│   │       │   ├── main.ts
│   │       │   ├── router/
│   │       │   │   └── index.ts
│   │       │   ├── shims-vue.d.ts
│   │       │   ├── store/
│   │       │   │   └── index.ts
│   │       │   ├── utils/
│   │       │   │   ├── copy.ts
│   │       │   │   └── types.ts
│   │       │   └── views/
│   │       │       └── home.vue
│   │       ├── tests/
│   │       │   └── unit/
│   │       │       └── example.spec.ts
│   │       ├── tsconfig.json
│   │       └── vue.config.js
│   ├── merge_tensors/
│   │   └── merge_safetensor_gguf.py
│   ├── pyproject.toml
│   ├── requirements-sft.txt
│   ├── setup.py
│   ├── test_adapter/
│   │   ├── data_transfer.py
│   │   ├── infer_with_adapter.py
│   │   ├── inspect_adapter.py
│   │   ├── pred2metrics.py
│   │   ├── test_grad.py
│   │   └── time_test_lora_train.py
│   └── withoutKT_PEFT.py
├── pyproject.toml
├── setup.py
├── third_party/
│   └── llamafile/
│       ├── README.md
│       ├── bench.h
│       ├── flags.cpp
│       ├── flags.h
│       ├── iqk_mul_mat.inc
│       ├── iqk_mul_mat_amd_avx2.cpp
│       ├── iqk_mul_mat_amd_zen4.cpp
│       ├── iqk_mul_mat_arm.inc
│       ├── iqk_mul_mat_arm82.cpp
│       ├── macros.h
│       ├── micros.h
│       ├── numba.h
│       ├── sgemm.cpp
│       ├── sgemm.h
│       ├── tinyblas_cpu.h
│       ├── tinyblas_cpu_mixmul.inc
│       ├── tinyblas_cpu_mixmul_amd_avx.cpp
│       ├── tinyblas_cpu_mixmul_amd_avx2.cpp
│       ├── tinyblas_cpu_mixmul_amd_avx512f.cpp
│       ├── tinyblas_cpu_mixmul_amd_avxvnni.cpp
│       ├── tinyblas_cpu_mixmul_amd_fma.cpp
│       ├── tinyblas_cpu_mixmul_amd_zen4.cpp
│       ├── tinyblas_cpu_mixmul_arm80.cpp
│       ├── tinyblas_cpu_mixmul_arm82.cpp
│       ├── tinyblas_cpu_sgemm.inc
│       ├── tinyblas_cpu_sgemm_amd_avx.cpp
│       ├── tinyblas_cpu_sgemm_amd_avx2.cpp
│       ├── tinyblas_cpu_sgemm_amd_avx512f.cpp
│       ├── tinyblas_cpu_sgemm_amd_avxvnni.cpp
│       ├── tinyblas_cpu_sgemm_amd_fma.cpp
│       ├── tinyblas_cpu_sgemm_amd_zen4.cpp
│       ├── tinyblas_cpu_sgemm_arm80.cpp
│       ├── tinyblas_cpu_sgemm_arm82.cpp
│       └── tinyblas_cpu_unsupported.cpp
└── version.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/CODE_OF_CONDUCT.md
================================================

# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful.

Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at [INSERT CONTACT METHOD]. All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series of actions.

**Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior,  harassment of an individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within the community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, available at [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].

Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder][Mozilla CoC].

For answers to common questions about this code of conduct, see the FAQ at [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at [https://www.contributor-covenant.org/translations][translations].

[homepage]: https://www.contributor-covenant.org
[v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations


================================================
FILE: .github/CONTRIBUTING.md
================================================
## Before Commit!

Your commit message must follow Conventional Commits (https://www.conventionalcommits.org/) and your code should be formatted. The Git hooks will do most of the work automatically:

### Tool Requirements

You need a recent `clang-format` (>= 18). In a conda environment you can install:

```shell
conda install -c conda-forge clang-format=18
```

If you previously configured with an older version, remove the build directory and reconfigure:

```shell
rm -rf kt-kernel/build
```

Install `black` for Python formatting:

```shell
conda install black
```

### Install hook:
```shell
bash kt-kernel/scripts/install-git-hooks.sh
#or just cmake the kt-kernel
cmake -S kt-kernel -B kt-kernel/build
```

There are manual commands if you need format.

```shell
cmake -S kt-kernel -B kt-kernel/build
cmake --build kt-kernel/build --target format
```

## Developer Note

Formatting and commit message rules are enforced by Git hooks. After installing `clang-format` and `black`, just commit normally—the hooks will run formatting for you.

> [!NOTE]
> If formatting modifies files, the commit is aborted after staging those changes. Review them and run `git commit` again. Repeat until no further formatting changes appear.

---

### Conventional Commit Regex (Reference)

The commit-msg hook enforces this pattern:

```text
regex='^\[(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip)\](\([^\)]+\))?(!)?: .+'
```

Meaning (English):
* `[type]` required — one of feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip
* Optional scope: `(scope)` — any chars except `)`
* Optional breaking change marker: `!` right after type or scope
* Separator: `: ` (colon + space)
* Subject: free text (at least one character)

Examples:
```text
[feat]: add adaptive batching
[fix(parser)]: handle empty token list
[docs]!: update API section for breaking rename
```

You can bypass locally (not recommended) with:
```shell
git commit --no-verify
```
## 提交前提醒

提交信息必须满足 Conventional Commits 规范 (https://www.conventionalcommits.org/)，代码需要符合格式要求。Git 钩子已经集成了大部分工作：
### 软件要求

需要较新的 `clang-format` (>= 18)，在 conda 环境中安装：

```shell
conda install -c conda-forge clang-format=18
```

如果之前用老版本配置过，请删除构建目录重新配置：

```shell
rm -rf kt-kernel/build
```

安装 `black` 以进行 Python 文件格式化：

```shell
conda install black
```
### 安装钩子
```shell
bash kt-kernel/scripts/install-git-hooks.sh
#or just cmake the kt-kernel
cmake -S kt-kernel -B kt-kernel/build
```
如果你需要手动格式化：
```shell
cmake -S kt-kernel -B kt-kernel/build
cmake --build kt-kernel/build --target format
```

## 开发者说明

本仓库通过 Git hooks 自动执行代码格式化与提交信息规范检查。只需安装好 `clang-format` 与 `black` 后正常执行提交即可，钩子会自动格式化。

> [!NOTE]
> 如果格式化修改了文件，钩子会终止提交并已暂存这些改动。请查看修改后再次执行 `git commit`，重复直到没有新的格式化变更。

### 提交信息正则（参考）

钩子使用如下正则检查提交信息：
```text
regex='^\[(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip)\](\([^\)]+\))?(!)?: .+'
```
含义：
* `[type]` 必填：feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip
* 作用域可选：`(scope)`，不能包含右括号
* 可选的破坏性标记：`!`
* 分隔符：冒号+空格 `: `
* 描述：至少一个字符

示例：
```text
[feat]: 增加自适应 batch 功能
[fix(tokenizer)]: 修复空 token 列表处理
[docs]!: 更新接口文档（存在破坏性修改）
```

跳过钩子（不推荐，仅紧急时）：
```shell
git commit --no-verify
```


================================================
FILE: .github/ISSUE_TEMPLATE/-bug-.yaml
================================================
name: "\U0001F41B Bug / Help"
description: Create a report to help us improve the ktransformers project
labels: ["pending"]
body:
  - type: markdown
    attributes:
      value: |
        Issues included in **[FAQs](https://github.com/kvcache-ai/ktransformers/issues/1608)** or those with **insufficient** information may be closed without a response.
        已经包含在 **[常见问题](https://github.com/kvcache-ai/ktransformers/issues/1608)** 内或提供信息**不完整**的 issues 可能不会被回复。

  - type: checkboxes
    id: reminder
    attributes:
      label: Reminder
      description: |
        Please ensure you have read the above rules carefully and searched the existing issues (including FAQs).
        请确保您已经认真阅读了上述规则并且搜索过现有的 issues（包括常见问题）。

      options:
        - label: I have read the above rules and searched the existing issues.
          required: true

  - type: textarea
    id: system-info
    validations:
      required: true
    attributes:
      label: System Info
      description: |
        Please share your system info with us. You can run the command **lscpu**, ** nvidia-smi ** etc. and copy-paste its output below.
        请提供您的系统信息。您可以在命令行运行 **lscpu**, **nvidia-smi** 等命令，并将其输出复制到该文本框中。

      placeholder: ktransformers version,sglang version, platform, python version, cpu info, GPU/NPU info ...

  - type: textarea
    id: reproduction
    validations:
      required: true
    attributes:
      label: Reproduction
      description: |
        Please provide entry arguments, error messages and stack traces that reproduces the problem.
        请提供入口参数，错误日志以及异常堆栈以便于我们复现问题。

      value: |
        ```text
        Put your message here.
        ```

  - type: textarea
    id: others
    validations:
      required: false
    attributes:
      label: Others

================================================
FILE: .github/ISSUE_TEMPLATE/-feature-.yaml
================================================
name: "\U0001F680 Feature request"
description: Submit a request for a new feature
labels: ["enhancement", "pending"]
body:
  - type: markdown
    attributes:
      value: |
        Please do not create issues that are not related to new features under this category.
        请勿在此分类下创建和新特性无关的 issues。

  - type: checkboxes
    id: reminder
    attributes:
      label: Reminder
      description: |
        Please ensure you have read the above rules carefully and searched the existing issues.
        请确保您已经认真阅读了上述规则并且搜索过现有的 issues。

      options:
        - label: I have read the above rules and searched the existing issues.
          required: true

  - type: textarea
    id: description
    validations:
      required: true
    attributes:
      label: Description
      description: |
        A clear and concise description of the feature proposal.
        请详细描述您希望加入的新功能特性。

  - type: textarea
    id: contribution
    validations:
      required: false
    attributes:
      label: Pull Request
      description: |
        Have you already created the relevant PR and submitted the code?
        您是否已经创建了相关 PR 并提交了代码？

================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links:
  - name: 📚 FAQs | 常见问题
    url: https://github.com/kvcache-ai/ktransformers/issues/1608
    about: Reading in advance is recommended | 建议提前阅读

================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
# What does this PR do?

Fixes # (issue)

## Before submitting

- [ ] Did you read the [contributor guideline](https://github.com/kvcache-ai/ktransformers/blob/main/.github/CONTRIBUTING.md)?
- [ ] Did you write any new necessary tests?

================================================
FILE: .github/SECURITY.md
================================================
# Reporting Security Issues

To report a security issue, please use the GitHub Security Advisory ["Report a Vulnerability"](https://github.com/kvcache-ai/ktransformers/security/advisories/new) tab.

We will send a response indicating the next steps in handling your report. After the initial reply to your report, the security team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance.

Report security bugs in third-party modules to the person or team maintaining the module.

================================================
FILE: .github/workflows/book-ci.yml
================================================
name: Book-CI

on:
  push:
    branches:
      - main
      # - server_support

  pull_request:
    branches:
      - main
      # - server_support
jobs:
  test:
    name: test
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
    steps:
      - uses: actions/checkout@v4
      - name: Install Rust
        run: |
          rustup set profile minimal
          rustup toolchain install stable
          rustup default stable
      - name: Setup mdBook
        uses: peaceiris/actions-mdbook@v2
        with:
          mdbook-version: "latest"
      # - name: Run tests
      #   run: mdbook test

================================================
FILE: .github/workflows/deploy.yml
================================================
name: Deploy

on:
  push:
    branches:
      - main
      # - server_support

  pull_request:
    branches:
      - main
      # - server_support

defaults:
  run:
    shell: bash

permissions:
  contents: write

jobs:
  deploy:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
    steps:
      - uses: actions/checkout@v4
      - name: Install Rust
        run: |
          rustup set profile minimal
          rustup toolchain install stable
          rustup default stable
      - name: Setup mdBook
        uses: peaceiris/actions-mdbook@v2
        with:
          mdbook-version: "latest"
      - run: mdbook build
      # - name: Copy Assets
      #   run: |
      #     chmod +x ci/copy-assets.sh
      #     ci/copy-assets.sh ${{ matrix.os }}
      - name: Deploy
        uses: peaceiris/actions-gh-pages@v3
        # or || github.ref == 'refs/heads/server_support'
        if: ${{ github.ref == 'refs/heads/main' }}
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: ./book

================================================
FILE: .github/workflows/docker-image.yml
================================================
name: DockerHub CI

on:
  release:
    types: [published]
  workflow_dispatch:
    inputs:
      push_to_dockerhub:
        description: 'Push image to DockerHub? (true/false)'
        required: true
        default: 'false'
        type: boolean
      cuda_version:
        description: 'CUDA version (e.g., 12.8.1)'
        required: false
        default: '12.8.1'
        type: string
      push_simplified_tag:
        description: 'Also push simplified tag? (true/false)'
        required: false
        default: 'true'
        type: boolean
      ubuntu_mirror:
        description: 'Use Tsinghua Ubuntu mirror? (0/1)'
        required: false
        default: '0'
        type: string

  # push:
  #   branches:
  #     - main
env:
  DOCKERHUB_REPO: ${{ secrets.DOCKERHUB_USERNAME }}/ktransformers
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Run tests
        run: |
          if [ -f docker-compose.test.yml ]; then
            docker-compose --file docker-compose.test.yml build
            docker-compose --file docker-compose.test.yml run sut
          else
            docker build . --file docker/Dockerfile
          fi

  build-and-push:
    needs: test
    name: Build and Push Multi-Variant Docker Image
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Move Docker data directory
        run: |
          sudo systemctl stop docker
          sudo mkdir -p /mnt/docker
          sudo rsync -avz /var/lib/docker/ /mnt/docker
          sudo rm -rf /var/lib/docker
          sudo ln -s /mnt/docker /var/lib/docker
          sudo systemctl start docker

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Determine build parameters
        id: params
        run: |
          # Determine if we should push
          if [ "${{ github.event_name }}" = "release" ]; then
            echo "should_push=true" >> $GITHUB_OUTPUT
            echo "push_simplified=true" >> $GITHUB_OUTPUT
          elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            echo "should_push=${{ inputs.push_to_dockerhub }}" >> $GITHUB_OUTPUT
            echo "push_simplified=${{ inputs.push_simplified_tag }}" >> $GITHUB_OUTPUT
          else
            echo "should_push=false" >> $GITHUB_OUTPUT
            echo "push_simplified=false" >> $GITHUB_OUTPUT
          fi

          # Determine CUDA version
          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ -n "${{ inputs.cuda_version }}" ]; then
            echo "cuda_version=${{ inputs.cuda_version }}" >> $GITHUB_OUTPUT
          else
            echo "cuda_version=12.8.1" >> $GITHUB_OUTPUT
          fi

          # Determine Ubuntu mirror setting
          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ -n "${{ inputs.ubuntu_mirror }}" ]; then
            echo "ubuntu_mirror=${{ inputs.ubuntu_mirror }}" >> $GITHUB_OUTPUT
          else
            echo "ubuntu_mirror=0" >> $GITHUB_OUTPUT
          fi

      - name: Build and push Docker image
        run: |
          cd docker

          # Build command arguments
          BUILD_ARGS=(
            --cuda-version "${{ steps.params.outputs.cuda_version }}"
            --ubuntu-mirror "${{ steps.params.outputs.ubuntu_mirror }}"
            --repository "${{ env.DOCKERHUB_REPO }}"
          )

          # Add simplified tag option if enabled
          if [ "${{ steps.params.outputs.push_simplified }}" = "true" ]; then
            BUILD_ARGS+=(--also-push-simplified)
          fi

          # Add HTTP proxy if available
          if [ -n "${{ secrets.HTTP_PROXY }}" ]; then
            BUILD_ARGS+=(--http-proxy "${{ secrets.HTTP_PROXY }}")
          fi

          # Add HTTPS proxy if available
          if [ -n "${{ secrets.HTTPS_PROXY }}" ]; then
            BUILD_ARGS+=(--https-proxy "${{ secrets.HTTPS_PROXY }}")
          fi

          # Dry run if not pushing
          if [ "${{ steps.params.outputs.should_push }}" != "true" ]; then
            BUILD_ARGS+=(--dry-run)
          fi

          # Execute build script
          ./push-to-dockerhub.sh "${BUILD_ARGS[@]}"

      - name: Display image information
        if: steps.params.outputs.should_push == 'true'
        run: |
          echo "::notice title=Docker Image::Image pushed successfully to ${{ env.DOCKERHUB_REPO }}"
          echo "Pull command: docker pull ${{ env.DOCKERHUB_REPO }}:v\$(VERSION)-cu\$(CUDA_SHORT)"


================================================
FILE: .github/workflows/kt-kernel-tests.yml
================================================
name: PR KT-Kernel Test

on:
  pull_request:
    branches:
      - main
      - develop
    types: [synchronize, labeled]
  workflow_dispatch:

concurrency:
  group: pr-kt-kernel-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # =============================================== check changes ====================================================
  check-changes:
    runs-on: ubuntu-latest
    outputs:
      kt_kernel: ${{ steps.filter.outputs.kt_kernel }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Fail if the PR does not have the 'run-ci' label
        if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'run-ci')
        run: |
          echo "This pull request does not have the 'run-ci' label. Failing the workflow."
          exit 1

      - name: Fail if the PR is a draft
        if: github.event_name == 'pull_request' && github.event.pull_request.draft == true
        run: |
          echo "This pull request is a draft. Failing the workflow."
          exit 1

      - name: Detect file changes
        id: filter
        uses: dorny/paths-filter@v3
        with:
          filters: |
            kt_kernel:
              - "kt-kernel/**"
              - ".github/workflows/kt-kernel-tests.yml"

  # =============================================== KT-Kernel tests ====================================================
  per-commit-kt-kernel-cpu:
    needs: [check-changes]
    if: always() && !failure() && !cancelled() &&
      (needs.check-changes.outputs.kt_kernel == 'true' || github.event_name == 'workflow_dispatch')
    runs-on: kt-cpu
    continue-on-error: false
    steps:
      - name: Cleanup
        run: |
          sudo rm -rf $GITHUB_WORKSPACE/* || true

      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Install KT-Kernel
        run: |
          cd kt-kernel
          bash install.sh build

      - name: Run KT-Kernel CPU tests
        timeout-minutes: 60
        run: |
          cd kt-kernel/test
          python3 run_suite.py --hw cpu --suite default

  # =============================================== finish ====================================================
  pr-test-kt-kernel-finish:
    needs: [check-changes, per-commit-kt-kernel-cpu]
    if: always()
    runs-on: ubuntu-latest
    steps:
      - name: Check all dependent job statuses
        run: |
          # Convert the 'needs' context to a JSON string
          json_needs='${{ toJson(needs) }}'

          # Get a list of all job names from the JSON keys
          job_names=$(echo "$json_needs" | jq -r 'keys_unsorted[]')

          for job in $job_names; do
            # For each job, extract its result
            result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result')

            # Print the job name and its result
            echo "$job: $result"

            # Check for failure or cancellation and exit if found
            if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
              echo "The above jobs failed."
              exit 1
            fi
          done

          # If the loop completes, all jobs were successful
          echo "All jobs completed successfully"
          exit 0


================================================
FILE: .github/workflows/release-fake-tag.yml
================================================
name: Release Fake Tag

on:
  push:
    branches:
      - main
    paths:
      - "version.py"
  workflow_dispatch:

permissions:
  contents: write

jobs:
  publish:
    if: github.repository == 'kvcache-ai/ktransformers'
    runs-on: ubuntu-latest
    environment: 'prod'
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Get version
        id: get_version
        run: |
          version=$(cat version.py | grep '__version__' | cut -d'"' -f2)
          echo "TAG=v$version" >> $GITHUB_OUTPUT

      - name: Create and push tag
        run: |
          git config user.name "ktransformers-bot"
          git config user.email "ktransformers-bot@users.noreply.github.com"
          git tag ${{ steps.get_version.outputs.TAG }}
          git push origin ${{ steps.get_version.outputs.TAG }}


================================================
FILE: .github/workflows/release-pypi.yml
================================================
name: Release to PyPI

on:
  push:
    branches:
      - main
    paths:
      - "version.py"
  workflow_dispatch:
    inputs:
      test_pypi:
        description: 'Publish to TestPyPI instead of PyPI (for testing)'
        required: false
        default: 'false'
        type: choice
        options:
          - 'true'
          - 'false'

permissions:
  contents: read

jobs:
  # ── sglang-kt (must be on PyPI before users can pip install kt-kernel) ──
  build-and-publish-sglang-kt:
    name: Build & publish sglang-kt
    runs-on: [self-hosted, linux, x64]
    if: github.repository == 'kvcache-ai/ktransformers' && github.ref == 'refs/heads/main'
    environment: prod
    permissions:
      id-token: write
      contents: read

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.12'

      - name: Install build tools
        run: |
          python -m pip install --upgrade pip
          pip install build wheel setuptools twine

      - name: Build sglang-kt wheel
        working-directory: third_party/sglang/python
        run: |
          KT_VERSION=$(python3 -c "exec(open('${{ github.workspace }}/version.py').read()); print(__version__)")
          export SGLANG_KT_VERSION="$KT_VERSION"
          echo "Building sglang-kt v${KT_VERSION} wheel..."
          python -m build --wheel -v
          ls dist/ | grep -q "sglang_kt" || (echo "ERROR: Wheel name does not contain sglang_kt" && exit 1)

      - name: Publish sglang-kt to PyPI
        if: github.event.inputs.test_pypi != 'true'
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
        run: |
          python -m twine upload --skip-existing --verbose third_party/sglang/python/dist/*.whl

      - name: Publish sglang-kt to TestPyPI (if requested)
        if: github.event.inputs.test_pypi == 'true'
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }}
        run: |
          python -m twine upload --repository testpypi --skip-existing --verbose third_party/sglang/python/dist/*.whl

  # ── kt-kernel ──
  build-kt-kernel:
    name: Build kt-kernel (Python ${{ matrix.python-version }})
    runs-on: [self-hosted, linux, x64, gpu]
    strategy:
      fail-fast: false
      matrix:
        python-version: ['3.11', '3.12']

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Verify CUDA availability
        run: |
          nvidia-smi || (echo "ERROR: GPU not available" && exit 1)
          nvcc --version || (echo "ERROR: CUDA toolkit not found" && exit 1)

      - name: Install dependencies
        run: |
          apt-get update && apt-get install -y cmake libhwloc-dev pkg-config libnuma-dev
          python -m pip install --upgrade pip
          pip install build wheel setuptools torch --index-url https://download.pytorch.org/whl/cu118

      - name: Build kt-kernel wheel
        working-directory: kt-kernel
        env:
          CPUINFER_BUILD_ALL_VARIANTS: '1'
          CPUINFER_USE_CUDA: '1'
          CPUINFER_CUDA_ARCHS: '80;86;89;90'
          CPUINFER_CUDA_STATIC_RUNTIME: '1'
          CPUINFER_BUILD_TYPE: 'Release'
          CPUINFER_PARALLEL: '4'
          CPUINFER_FORCE_REBUILD: '1'
          CUDA_HOME: '/usr/local/cuda-11.8'
        run: |
          echo "Building kt-kernel with:"
          echo "  - CUDA support (SM 80, 86, 89, 90)"
          echo "  - CPU multi-variant (AMX, AVX512, AVX2)"
          python -m build --wheel -v

      - name: Verify wheel
        working-directory: kt-kernel
        run: |
          echo "Generated wheel:"
          ls -lh dist/

          # Install and test
          pip install dist/*.whl
          python -c "import kt_kernel; print(f'✓ Version: {kt_kernel.__version__}')"
          python -c "import kt_kernel; print(f'✓ CPU variant: {kt_kernel.__cpu_variant__}')"

          # Verify CUDA support
          python -c "
          from kt_kernel import kt_kernel_ext
          cpu_infer = kt_kernel_ext.CPUInfer(4)
          methods = dir(cpu_infer)
          has_cuda = 'submit_with_cuda_stream' in methods
          print(f'✓ CUDA support: {has_cuda}')
          "

          # Verify CPU multi-variant support
          echo "Checking CPU variants in wheel..."
          python -m zipfile -l dist/*.whl | grep "_kt_kernel_ext_" || echo "Warning: No variant .so files found"
          python -m zipfile -l dist/*.whl | grep "_kt_kernel_ext_amx.cpython" && echo "✓ AMX variant found" || echo "Note: AMX variant missing"
          python -m zipfile -l dist/*.whl | grep "_kt_kernel_ext_avx512" && echo "✓ AVX512 variants found" || echo "Note: AVX512 variants missing"
          python -m zipfile -l dist/*.whl | grep "_kt_kernel_ext_avx2.cpython" && echo "✓ AVX2 variant found" || echo "Note: AVX2 variant missing"

          # Verify static linking (should NOT depend on libcudart.so)
          rm -rf /tmp/check
          unzip -q dist/*.whl -d /tmp/check
          if ldd /tmp/check/kt_kernel/*.so 2>/dev/null | grep -q "libcudart.so"; then
            echo "ERROR: Dynamic cudart found, should be statically linked"
            exit 1
          else
            echo "✓ CUDA runtime statically linked"
          fi

      - name: Repair wheel for manylinux
        working-directory: kt-kernel
        run: |
          pip install auditwheel patchelf
          mkdir -p wheelhouse
          for wheel in dist/*.whl; do
            auditwheel repair "$wheel" --plat manylinux_2_17_x86_64 --exclude libcuda.so.1 -w wheelhouse/ || \
              cp "$wheel" wheelhouse/$(basename "$wheel" | sed 's/linux_x86_64/manylinux_2_17_x86_64/')
          done
          rm -f dist/*.whl && cp wheelhouse/*.whl dist/

      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: kt-kernel-wheels-py${{ matrix.python-version }}
          path: kt-kernel/dist/*.whl
          retention-days: 7

  publish-pypi:
    name: Publish kt-kernel to PyPI
    needs: [build-and-publish-sglang-kt, build-kt-kernel]
    runs-on: [self-hosted, linux, x64]
    if: github.repository == 'kvcache-ai/ktransformers' && github.ref == 'refs/heads/main'
    environment: prod
    permissions:
      id-token: write  # For trusted publishing (OIDC)
      contents: read

    steps:
      - name: Download all wheel artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts/

      - name: Organize wheels into dist/
        run: |
          mkdir -p dist/
          find artifacts/ -name "*.whl" -exec cp {} dist/ \;
          echo "Wheels to publish:"
          ls -lh dist/

      - name: Get version from wheel
        id: get_version
        run: |
          # Extract version from first wheel filename
          wheel_name=$(ls dist/*.whl | head -1 | xargs basename)
          # Extract version (format: kt_kernel-X.Y.Z-...)
          version=$(echo "$wheel_name" | sed 's/kt_kernel-\([0-9.]*\)-.*/\1/')
          echo "VERSION=$version" >> $GITHUB_OUTPUT
          echo "Publishing version: $version"

      - name: Install twine
        run: |
          python -m pip install --upgrade pip
          pip install twine

      - name: Publish to TestPyPI (if requested)
        if: github.event.inputs.test_pypi == 'true'
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }}
        run: |
          python -m twine upload \
            --repository testpypi \
            --skip-existing \
            --verbose \
            dist/*.whl

      - name: Publish to PyPI
        if: github.event.inputs.test_pypi != 'true'
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
        run: |
          python -m twine upload \
            --skip-existing \
            --verbose \
            dist/*.whl

      - name: Create release summary
        run: |
          echo "## 🎉 kt-kernel v${{ steps.get_version.outputs.VERSION }} Published to PyPI" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "### Installation" >> $GITHUB_STEP_SUMMARY
          echo '```bash' >> $GITHUB_STEP_SUMMARY
          echo "pip install kt-kernel==${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
          echo '```' >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "### Published Wheels" >> $GITHUB_STEP_SUMMARY
          echo "Total: $(ls -1 dist/*.whl | wc -l) wheels (Python 3.10, 3.11, 3.12)" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "### Features" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "**CPU Multi-Variant Support:**" >> $GITHUB_STEP_SUMMARY
          echo "- ✅ AMX (Intel Sapphire Rapids+, 2023)" >> $GITHUB_STEP_SUMMARY
          echo "- ✅ AVX512 Base/VNNI/VBMI/BF16 (Intel Skylake-X/Ice Lake/Cascade Lake, 2017+)" >> $GITHUB_STEP_SUMMARY
          echo "- ✅ AVX2 (Maximum compatibility, 2013+)" >> $GITHUB_STEP_SUMMARY
          echo "- 🔧 Runtime CPU detection: Automatically selects optimal variant" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "**CUDA Support:**" >> $GITHUB_STEP_SUMMARY
          echo "- ✅ SM 80 (Ampere: A100, RTX 3000 series)" >> $GITHUB_STEP_SUMMARY
          echo "- ✅ SM 86 (Ampere: RTX 3060-3090)" >> $GITHUB_STEP_SUMMARY
          echo "- ✅ SM 89 (Ada Lovelace: RTX 4000 series)" >> $GITHUB_STEP_SUMMARY
          echo "- ✅ SM 90 (Hopper: H100)" >> $GITHUB_STEP_SUMMARY
          echo "- 🔧 Static CUDA runtime: Compatible with CUDA 11.8+ and 12.x drivers" >> $GITHUB_STEP_SUMMARY
          echo "- 🔧 Works on CPU-only systems (CUDA features disabled gracefully)" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "**Requirements:**" >> $GITHUB_STEP_SUMMARY
          echo "- Python 3.10, 3.11, or 3.12" >> $GITHUB_STEP_SUMMARY
          echo "- Linux x86-64 (manylinux_2_17 compatible)" >> $GITHUB_STEP_SUMMARY
          echo "- For CUDA features: NVIDIA driver with CUDA 11.8+ or 12.x support" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "PyPI link: https://pypi.org/project/kt-kernel/${{ steps.get_version.outputs.VERSION }}/" >> $GITHUB_STEP_SUMMARY


================================================
FILE: .github/workflows/release-sglang-kt.yml
================================================
name: Release sglang-kt to PyPI

on:
  push:
    branches:
      - main
    paths:
      - "third_party/sglang"
      - "version.py"
  workflow_dispatch:
    inputs:
      test_pypi:
        description: 'Publish to TestPyPI instead of PyPI (for testing)'
        required: false
        default: 'false'
        type: choice
        options:
          - 'true'
          - 'false'

permissions:
  contents: read

jobs:
  build-sglang-kt:
    name: Build sglang-kt wheel
    runs-on: [self-hosted, linux, x64]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.12'

      - name: Install build tools
        run: |
          python -m pip install --upgrade pip
          pip install build wheel setuptools

      - name: Build sglang-kt wheel
        working-directory: third_party/sglang/python
        run: |
          # Read version from ktransformers version.py
          KT_VERSION=$(python3 -c "exec(open('${{ github.workspace }}/version.py').read()); print(__version__)")
          export SGLANG_KT_VERSION="$KT_VERSION"
          echo "Building sglang-kt v${KT_VERSION} wheel..."
          python -m build --wheel -v

      - name: Verify wheel
        working-directory: third_party/sglang/python
        run: |
          echo "Generated wheel:"
          ls -lh dist/
          # Verify the wheel has the correct package name
          ls dist/ | grep -q "sglang_kt" || (echo "ERROR: Wheel name does not contain sglang_kt" && exit 1)
          echo "Wheel name verified."

      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: sglang-kt-wheel
          path: third_party/sglang/python/dist/*.whl
          retention-days: 7

  publish-pypi:
    name: Publish sglang-kt to PyPI
    needs: [build-sglang-kt]
    runs-on: [self-hosted, linux, x64]
    if: github.repository == 'kvcache-ai/ktransformers' && github.ref == 'refs/heads/main'
    environment: prod
    permissions:
      id-token: write
      contents: read

    steps:
      - name: Download wheel artifact
        uses: actions/download-artifact@v4
        with:
          name: sglang-kt-wheel
          path: dist/

      - name: Display wheels
        run: |
          echo "Wheels to publish:"
          ls -lh dist/

      - name: Install twine
        run: |
          python -m pip install --upgrade pip
          pip install twine

      - name: Publish to TestPyPI (if requested)
        if: github.event.inputs.test_pypi == 'true'
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }}
        run: |
          python -m twine upload \
            --repository testpypi \
            --skip-existing \
            --verbose \
            dist/*.whl

      - name: Publish to PyPI
        if: github.event.inputs.test_pypi != 'true'
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
        run: |
          python -m twine upload \
            --skip-existing \
            --verbose \
            dist/*.whl

      - name: Create release summary
        run: |
          echo "## sglang-kt Published to PyPI" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "### Installation" >> $GITHUB_STEP_SUMMARY
          echo '```bash' >> $GITHUB_STEP_SUMMARY
          echo "pip install sglang-kt" >> $GITHUB_STEP_SUMMARY
          echo '```' >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "This is the kvcache-ai fork of SGLang with kt-kernel support." >> $GITHUB_STEP_SUMMARY
          echo "PyPI link: https://pypi.org/project/sglang-kt/" >> $GITHUB_STEP_SUMMARY


================================================
FILE: .github/workflows/sync-sglang-submodule.yml
================================================
name: Sync sglang submodule

on:
  schedule:
    # Run daily at 08:00 UTC
    - cron: "0 8 * * *"
  workflow_dispatch:

permissions:
  contents: write
  pull-requests: write

jobs:
  sync:
    name: Check for sglang-kt updates
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Update sglang submodule to latest main
        id: update
        run: |
          OLD_SHA=$(git -C third_party/sglang rev-parse HEAD)
          git submodule update --remote third_party/sglang
          NEW_SHA=$(git -C third_party/sglang rev-parse HEAD)

          echo "old_sha=$OLD_SHA" >> "$GITHUB_OUTPUT"
          echo "new_sha=$NEW_SHA" >> "$GITHUB_OUTPUT"

          if [ "$OLD_SHA" = "$NEW_SHA" ]; then
            echo "changed=false" >> "$GITHUB_OUTPUT"
            echo "sglang submodule is already up to date ($OLD_SHA)"
          else
            echo "changed=true" >> "$GITHUB_OUTPUT"

            # Collect commit log between old and new
            COMMITS=$(git -C third_party/sglang log --oneline "$OLD_SHA..$NEW_SHA" | head -20)
            echo "commits<<EOF" >> "$GITHUB_OUTPUT"
            echo "$COMMITS" >> "$GITHUB_OUTPUT"
            echo "EOF" >> "$GITHUB_OUTPUT"

            # sglang-kt version = ktransformers version (from version.py)
            VERSION=$(python3 -c "exec(open('version.py').read()); print(__version__)" 2>/dev/null || echo "unknown")
            echo "version=$VERSION" >> "$GITHUB_OUTPUT"

            echo "sglang submodule updated: $OLD_SHA -> $NEW_SHA (v$VERSION)"
          fi

      - name: Create pull request
        if: steps.update.outputs.changed == 'true'
        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: |
            [build]: sync sglang submodule to ${{ steps.update.outputs.new_sha }}
          branch: auto/sync-sglang
          delete-branch: true
          title: "[build] Sync sglang-kt submodule (v${{ steps.update.outputs.version }})"
          body: |
            Automated sync of `third_party/sglang` submodule to latest `main`.

            **Old ref:** `${{ steps.update.outputs.old_sha }}`
            **New ref:** `${{ steps.update.outputs.new_sha }}`
            **sglang-kt version:** `${{ steps.update.outputs.version }}`

            ### Commits included
            ```
            ${{ steps.update.outputs.commits }}
            ```

            ---
            *This PR was created automatically by the [sync-sglang-submodule](${{ github.server_url }}/${{ github.repository }}/actions/workflows/sync-sglang-submodule.yml) workflow.*
          labels: |
            dependencies
            automated


================================================
FILE: .gitignore
================================================
__pycache__
build
.vscode
*.so
*.cache
server.db
logs
node_modules
*.nsys-rep
.vs/
*pycache*
*build/
.DS_Store
compile_commands.json
*.egg-info*
*dist/
ktransformers/server/local_store/
ktransformers/server_test1.db
*.patch
img/
tmp*.txt
test.txt
book
ktransformers/tests/chat_txt.txt
mmlu_result*
ktransformers/ktransformers_ext/cuda_musa/
test_prompt.txt
csrc/demo
build*
CMakeFiles/
kvc2/
sched/
*.png

================================================
FILE: .gitmodules
================================================
[submodule "third_party/llama.cpp"]
	path = third_party/llama.cpp
	url = https://github.com/ggerganov/llama.cpp.git
[submodule "third_party/pybind11"]
	path = third_party/pybind11
	url = https://github.com/pybind/pybind11.git
[submodule "third_party/custom_flashinfer"]
	path = third_party/custom_flashinfer
	url = https://github.com/kvcache-ai/custom_flashinfer.git
	branch = fix-precision-mla-merge-main
[submodule "third_party/sglang"]
	path = third_party/sglang
	url = https://github.com/kvcache-ai/sglang.git
	branch = main


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: MAINTAINERS.md
================================================
# Maintainers

This document lists the current maintainers and outlines their responsibilities.

## Current Maintainers

| Name | GitHub | Role | Affiliation | Email |
|------|--------|------|-------------|-------|
| Weiyu Xie | [@ErvinXie](https://github.com/ErvinXie) | Maintainer | [MADSys Lab](https://madsys.cs.tsinghua.edu.cn/) @ Tsinghua University | xwy21@mails.tsinghua.edu.cn |
| Hongtao Chen | [@chenht2022](https://github.com/chenht2022) | Maintainer | [MADSys Lab](https://madsys.cs.tsinghua.edu.cn/) @ Tsinghua University | cht22@mails.tsinghua.edu.cn |
| Jianwei Dong | [@ovowei](https://github.com/ovowei) | Maintainer | [MADSys Lab](https://madsys.cs.tsinghua.edu.cn/) @ Tsinghua University | dongjw24@mails.tsinghua.edu.cn |
| Ziwei Yuan | [@KMSorSMS](https://github.com/KMSorSMS) | Maintainer | [Approaching.AI](http://approaching.ai/) | 2022090910005@std.uestc.edu.cn |
| Qingliang Ou | [@ouqingliang](https://github.com/ouqingliang) | Maintainer | [MADSys Lab](https://madsys.cs.tsinghua.edu.cn/) @ Tsinghua University | oql@bupt.edu.cn |
| Jiaqi Liao | [@SkqLiao](https://github.com/SkqLiao) | Maintainer | [Approaching.AI](http://approaching.ai/) | jiaqi.liao@bit.edu.cn |
| Peilin Li | [@JimmyPeilinLi](https://github.com/JimmyPeilinLi) | Maintainer | [Approaching.AI](http://approaching.ai/) | lipeilin@mail.nwpu.edu.cn |
| Xingxing Hao | [@mrhaoxx](https://github.com/mrhaoxx) | Maintainer | [Approaching.AI](http://approaching.ai/) | mr.haoxx@gmail.com |
| Boxin Zhang | [@Atream](https://github.com/Atream) | Maintainer | [MADSys Lab](https://madsys.cs.tsinghua.edu.cn/) @ Tsinghua University | zhangbx24@mails.tsinghua.edu.cn |
| Jingqi Tang | [@Azure-Tang](https://github.com/Azure-Tang) | Maintainer | [MADSys Lab](https://madsys.cs.tsinghua.edu.cn/) @ Tsinghua University | tangjq25@mails.tsinghua.edu.cn |
| Jiahao Wang | [@qiyuxinlin](https://github.com/qiyuxinlin) | Maintainer | [Approaching.AI](http://approaching.ai/) | 202241050020@hdu.edu.cn |

## Responsibilities

Maintainers steward the project and keep it healthy for users and contributors.

- Review and approve pull requests; ensure changes meet quality, testing, and documentation standards.
- Triage issues, keep labels organized, and respond to questions in a timely manner.
- Uphold the project’s code of conduct and report violations when needed.
- Maintain CI reliability and address regressions promptly.
- Oversee releases and keep compatibility with supported dependency versions.
- Protect project security and follow the security disclosure process.

## Becoming a Maintainer

We welcome contributors who show sustained, high-quality contributions and collaborative behavior. If you are interested, please contact an existing maintainer and share your recent contributions and areas of focus.


================================================
FILE: README.md
================================================
<div align="center">
  <p align="center">

<picture>
    <img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>

</picture>

</p>
  <h3>A Flexible Framework for Experiencing Cutting-edge LLM Inference/Fine-tune Optimizations</h3>
  <strong><a href="#-overview">🎯 Overview</a> | <a href="#-kt-kernel---high-performance-inference-kernels">🚀 kt-kernel</a> | <a href="#-kt-sft---fine-tuning-framework">🎓 kt-sft</a> | <a href="#-citation">🔥 Citation</a> | <a href="https://github.com/kvcache-ai/ktransformers/issues/1582">🚀 Roadmap(2025Q4)</a>  </strong>
</div>

## 🎯 Overview

KTransformers is a research project focused on efficient inference and fine-tuning of large language models through CPU-GPU heterogeneous computing. The project has evolved into **two core modules**: [kt-kernel](https://github.com/kvcache-ai/ktransformers/tree/main/kt-kernel/) and [kt-sft](https://github.com/kvcache-ai/ktransformers/tree/main/kt-sft).

## 🔥 Updates

* **Feb 13, 2026**: MiniMax-M2.5 Day0 Support! ([Tutorial](./doc/en/MiniMax-M2.5.md))
* **Feb 12, 2026**: GLM-5 Day0 Support! ([Tutorial](./doc/en/kt-kernel/GLM-5-Tutorial.md))
* **Jan 27, 2026**: Kimi-K2.5 Day0 Support! ([Tutorial](./doc/en/Kimi-K2.5.md)) ([SFT Tutorial](./doc/en/SFT_Installation_Guide_KimiK2.5.md))
* **Jan 22, 2026**: Support [CPU-GPU Expert Scheduling](./doc/en/kt-kernel/experts-sched-Tutorial.md), [Native BF16 and FP8 per channel Precision](./doc/en/kt-kernel/Native-Precision-Tutorial.md) and [AutoDL unified fine-tuning and inference](./doc/zh/【云端低价训推】%20KTransformers%2BAutoDL%2BLlamaFactory：随用随租的低成本超大模型「微调%2B推理」一体化流程.pdf)
* **Dec 24, 2025**: Support Native MiniMax-M2.1 inference. ([Tutorial](./doc/en/kt-kernel/MiniMax-M2.1-Tutorial.md))
* **Dec 22, 2025**: Support RL-DPO fine-tuning with LLaMA-Factory. ([Tutorial](./doc/en/SFT/DPO_tutorial.md))
* **Dec 5, 2025**: Support Native Kimi-K2-Thinking inference ([Tutorial](./doc/en/kt-kernel/Kimi-K2-Thinking-Native.md))
* **Nov 6, 2025**: Support Kimi-K2-Thinking inference ([Tutorial](./doc/en/Kimi-K2-Thinking.md)) and fine-tune ([Tutorial](./doc/en/SFT_Installation_Guide_KimiK2.md))
* **Nov 4, 2025**: KTransformers Fine-Tuning × LLaMA-Factory Integration. ([Tutorial](./doc/en/KTransformers-Fine-Tuning_User-Guide.md))
* **Oct 27, 2025**: Support Ascend NPU. ([Tutorial](./doc/zh/DeepseekR1_V3_tutorial_zh_for_Ascend_NPU.md))
* **Oct 10, 2025**: Integrating into SGLang. ([Roadmap](https://github.com/sgl-project/sglang/issues/11425), [Blog](https://lmsys.org/blog/2025-10-22-KTransformers/))
* **Sept 11, 2025**: Support Qwen3-Next. ([Tutorial](./doc/en/Qwen3-Next.md))
* **Sept 05, 2025**: Support Kimi-K2-0905. ([Tutorial](./doc/en/Kimi-K2.md))
* **July 26, 2025**: Support SmallThinker and GLM4-MoE. ([Tutorial](./doc/en/SmallThinker_and_Glm4moe.md))
* **July 11, 2025**: Support Kimi-K2. ([Tutorial](./doc/en/Kimi-K2.md))
* **June 30, 2025**: Support 3-layer (GPU-CPU-Disk) [prefix cache](./doc/en/prefix_cache.md) reuse.
* **May 14, 2025**: Support Intel Arc GPU ([Tutorial](./doc/en/xpu.md)).
* **Apr 29, 2025**: Support AMX-Int8、 AMX-BF16 and Qwen3MoE ([Tutorial](./doc/en/AMX.md))
* **Apr 9, 2025**: Experimental support for LLaMA 4 models ([Tutorial](./doc/en/llama4.md)).
* **Apr 2, 2025**: Support Multi-concurrency. ([Tutorial](./doc/en/balance-serve.md)).
* **Mar 15, 2025**: Support ROCm on AMD GPU ([Tutorial](./doc/en/ROCm.md)).
* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022--v023-longer-context--fp8-kernel) for DeepSeek-V3 and R1 in 24GB VRAM.
* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed （+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. For detailed show case and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
* **Aug 15, 2024**: Update detailed [tutorial](doc/en/injection_tutorial.md) for injection and multi-GPU.
* **Aug 14, 2024**: Support llamfile as linear backend.
* **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B  and 8\*22B; Support q2k, q3k, q5k dequant on gpu.
* **Aug 9, 2024**: Support windows native.

---

## 📦 Core Modules

### 🚀 [kt-kernel](./kt-kernel/) - High-Performance Inference Kernels

CPU-optimized kernel operations for heterogeneous LLM inference.

<img width="1049" height="593" alt="image" src="https://github.com/user-attachments/assets/68f423da-3f55-4025-bdc9-9ceaa554f00b" />


**Key Features:**
- **AMX/AVX Acceleration**: Intel AMX and AVX512/AVX2 optimized kernels for INT4/INT8 quantized inference
- **MoE Optimization**: Efficient Mixture-of-Experts inference with NUMA-aware memory management
- **Quantization Support**: CPU-side INT4/INT8 quantized weights, GPU-side GPTQ support
- **Easy Integration**: Clean Python API for SGLang and other frameworks

**Quick Start:**
```bash
cd kt-kernel
pip install .
```

**Use Cases:**

- CPU-GPU hybrid inference for large MoE models
- Integration with SGLang for production serving
- Heterogeneous expert placement (hot experts on GPU, cold experts on CPU)

**Performance Examples:**
| Model | Hardware Configuration | Total Throughput | Output Throughput |
|-------|------------------------|------------------|-------------------|
| DeepSeek-R1-0528 (FP8) | 8×L20 GPU + Xeon Gold 6454S | 227.85 tokens/s | 87.58 tokens/s (8-way concurrency) |

👉 **[Full Documentation →](./kt-kernel/README.md)**

---

### 🎓 [kt-sft](./kt-sft/) - Fine-Tuning Framework

KTransformers × LLaMA-Factory integration for ultra-large MoE model fine-tuning.

![image-20251011010558909](https://raw.githubusercontent.com/kvcache-ai/ktransformers/main/doc/assets/image-20251011010558909.png)

**Key Features:**

- **Resource Efficient**: Fine-tune 671B DeepSeek-V3 with just **70GB GPU memory** + 1.3TB RAM
- **LoRA Support**: Full LoRA fine-tuning with heterogeneous acceleration
- **LLaMA-Factory Integration**: Seamless integration with popular fine-tuning framework
- **Production Ready**: Chat, batch inference, and metrics evaluation

**Performance Examples:**

| Model | Configuration | Throughput | GPU Memory |
|-------|--------------|------------|------------|
| DeepSeek-V3 (671B) | LoRA + AMX | ~40 tokens/s | 70GB (multi-GPU) |
| DeepSeek-V2-Lite (14B) | LoRA + AMX | ~530 tokens/s | 6GB |

**Quick Start:**
```bash
cd kt-sft
# Install environment following kt-sft/README.md
USE_KT=1 llamafactory-cli train examples/train_lora/deepseek3_lora_sft_kt.yaml
```

👉 **[Full Documentation →](./kt-sft/README.md)**

---

## 🔥 Citation

If you use KTransformers in your research, please cite our paper:

```bibtex
@inproceedings{10.1145/3731569.3764843,
  title = {KTransformers: Unleashing the Full Potential of CPU/GPU Hybrid Inference for MoE Models},
  author = {Chen, Hongtao and Xie, Weiyu and Zhang, Boxin and Tang, Jingqi and Wang, Jiahao and Dong, Jianwei and Chen, Shaoyuan and Yuan, Ziwei and Lin, Chen and Qiu, Chengyu and Zhu, Yuening and Ou, Qingliang and Liao, Jiaqi and Chen, Xianglin and Ai, Zhiyuan and Wu, Yongwei and Zhang, Mingxing},
  booktitle = {Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles},
  year = {2025}
}
```

## 👥 Contributors & Team

Developed and maintained by:
- [MADSys Lab](https://madsys.cs.tsinghua.edu.cn/) @ Tsinghua University
- [Approaching.AI](http://approaching.ai/)
- [9#AISoft](https://github.com/aisoft9)
- Community contributors

We welcome contributions! Please feel free to submit issues and pull requests.

## 💬 Community & Support

- **GitHub Issues**: [Report bugs or request features](https://github.com/kvcache-ai/ktransformers/issues)
- **WeChat Group**: See [archive/WeChatGroup.png](./archive/WeChatGroup.png)

## 📦 KT original Code

The original integrated KTransformers framework has been archived to the [`archive/`](./archive/) directory for reference. The project now focuses on the two core modules above for better modularity and maintainability.

For the original documentation with full quick-start guides and examples, see:
- [archive/README.md](./archive/README.md) (English)
- [archive/README_ZH.md](./archive/README_ZH.md) (中文)


================================================
FILE: README_ZH.md
================================================
<div align="center">
  <p align="center">

<picture>
    <img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>

</picture>

</p>
  <h3>一个用于体验尖端 LLM 推理/微调优化的灵活框架</h3>
  <strong><a href="#-概览">🎯 概览</a> | <a href="#-kt-kernel---高性能推理内核">🚀 kt-kernel</a> | <a href="#-kt-sft---微调框架">🎓 kt-sft</a> | <a href="#-引用">🔥 引用</a> </strong>
</div>

## 🎯 概览

KTransformers 是一个专注于通过 CPU-GPU 异构计算实现大语言模型高效推理和微调的研究项目。该项目已发展为**两个核心模块**：[kt-kernel](./kt-kernel/) 和 [kt-sft](./kt-sft/)。

## 🔥 更新

* **2025 年 12 月 5 日**：支持原生 Kimi-K2-Thinking 推理（[教程](./doc/en/Kimi-K2-Thinking-Native.md)）
* **2025 年 11 月 6 日**：支持 Kimi-K2-Thinking 推理（[教程](./doc/en/Kimi-K2-Thinking.md)）和微调（[教程](./doc/en/SFT_Installation_Guide_KimiK2.md)）
* **2025 年 11 月 4 日**：KTransformers 微调 × LLaMA-Factory 集成（[教程](./doc/en/KTransformers-Fine-Tuning_User-Guide.md)）
* **2025 年 10 月 27 日**：支持昇腾 NPU（[教程](./doc/zh/DeepseekR1_V3_tutorial_zh_for_Ascend_NPU.md)）
* **2025 年 10 月 10 日**：集成到 SGLang（[路线图](https://github.com/sgl-project/sglang/issues/11425)，[博客](https://lmsys.org/blog/2025-10-22-KTransformers/)）
* **2025 年 9 月 11 日**：支持 Qwen3-Next（[教程](./doc/en/Qwen3-Next.md)）
* **2025 年 9 月 5 日**：支持 Kimi-K2-0905（[教程](./doc/en/Kimi-K2.md)）
* **2025 年 7 月 26 日**：支持 SmallThinker 和 GLM4-MoE（[教程](./doc/en/SmallThinker_and_Glm4moe.md)）
* **2025 年 7 月 11 日**：支持 Kimi-K2（[教程](./doc/en/Kimi-K2.md)）
* **2025 年 6 月 30 日**：支持 3 层（GPU-CPU-磁盘）[前缀缓存](./doc/en/prefix_cache.md)复用
* **2025 年 5 月 14 日**：支持 Intel Arc GPU（[教程](./doc/en/xpu.md)）
* **2025 年 4 月 29 日**：支持 AMX-Int8、AMX-BF16 和 Qwen3MoE（[教程](./doc/en/AMX.md)）
* **2025 年 4 月 9 日**：实验性支持 LLaMA 4 模型（[教程](./doc/en/llama4.md)）
* **2025 年 4 月 2 日**：支持多并发（[教程](./doc/en/balance-serve.md)）
* **2025 年 3 月 15 日**：支持 AMD GPU 上的 ROCm（[教程](./doc/en/ROCm.md)）
* **2025 年 3 月 5 日**：支持 unsloth 1.58/2.51 位权重和 [IQ1_S/FP8 混合](./doc/en/fp8_kernel.md)权重。在 24GB VRAM 中支持 DeepSeek-V3 和 R1 的 139K [更长上下文](./doc/en/DeepseekR1_V3_tutorial.md#v022--v023-longer-context--fp8-kernel)
* **2025 年 2 月 25 日**：为 DeepSeek-V3 和 R1 支持 [FP8 GPU 内核](./doc/en/fp8_kernel.md)；[更长上下文](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context)
* **2025 年 2 月 15 日**：更长上下文（24GB VRAM 从 4K 到 8K）& 速度稍快（+15%，最高 16 Tokens/s），更新[文档](./doc/en/DeepseekR1_V3_tutorial.md)和[在线手册](https://kvcache-ai.github.io/ktransformers/)
* **2025 年 2 月 10 日**：支持 Deepseek-R1 和 V3 在单 GPU（24GB VRAM）/多 GPU 和 382GB DRAM 上运行，速度提升高达 3~28 倍。详细案例展示和复现教程请参见[这里](./doc/en/DeepseekR1_V3_tutorial.md)
* **2024 年 8 月 28 日**：将 DeepseekV2 所需的 VRAM 从 21GB 降低到 11GB
* **2024 年 8 月 15 日**：更新了关于注入和多 GPU 的详细[教程](doc/en/injection_tutorial.md)
* **2024 年 8 月 14 日**：支持 llamfile 作为线性后端
* **2024 年 8 月 12 日**：支持多 GPU；支持新模型：mixtral 8\*7B 和 8\*22B；支持 GPU 上的 q2k、q3k、q5k 去量化
* **2024 年 8 月 9 日**：支持 Windows 原生环境

---

## 📦 核心模块

### 🚀 [kt-kernel](./kt-kernel/) - 高性能推理内核

用于异构 LLM 推理的 CPU 优化内核操作。

![image-20251011010558909](./doc/assets/heterogeneous_computing.png)

**主要特性：**
- **AMX/AVX 加速**：Intel AMX 和 AVX512/AVX2 优化的内核，用于 INT4/INT8 量化推理
- **MoE 优化**：高效的专家混合推理，具有 NUMA 感知内存管理
- **量化支持**：CPU 端 INT4/INT8 量化权重，GPU 端 GPTQ 支持
- **易于集成**：为 SGLang 和其他框架提供简洁的 Python API

**快速开始：**
```bash
cd kt-kernel
pip install .
```

**使用场景：**

- 大型 MoE 模型的 CPU-GPU 混合推理
- 与 SGLang 集成用于生产服务
- 异构专家放置（热专家在 GPU 上，冷专家在 CPU 上）

**性能示例：**
| 模型 | 硬件配置 | 总吞吐量 | 输出吞吐量 |
|-------|------------------------|------------------|-------------------|
| DeepSeek-R1-0528 (FP8) | 8×L20 GPU + Xeon Gold 6454S | 227.85 tokens/s | 87.58 tokens/s（8 路并发）|

👉 **[完整文档 →](./kt-kernel/README.md)**

---

### 🎓 [kt-sft](./kt-sft/) - 微调框架

KTransformers × LLaMA-Factory 集成，用于超大型 MoE 模型微调。

![image-20251011010558909](./doc/assets/image-20251011010558909.png)

**主要特性：**

- **资源高效**：仅需 **70GB GPU 显存** + 1.3TB 内存即可微调 671B DeepSeek-V3
- **LoRA 支持**：完整的 LoRA 微调，带有异构加速
- **LLaMA-Factory 集成**：与流行的微调框架无缝集成
- **生产就绪**：聊天、批量推理和指标评估

**性能示例：**

| 模型 | 配置 | 吞吐量 | GPU 显存 |
|-------|--------------|------------|--------------|
| DeepSeek-V3 (671B) | LoRA + AMX | ~40 tokens/s | 70GB（多 GPU）|
| DeepSeek-V2-Lite (14B) | LoRA + AMX | ~530 tokens/s | 6GB |

**快速开始：**
```bash
cd kt-sft
# 按照 kt-sft/README.md 安装环境
USE_KT=1 llamafactory-cli train examples/train_lora/deepseek3_lora_sft_kt.yaml
```

👉 **[完整文档 →](./kt-sft/README.md)**

---

## 🔥 引用

如果您在研究中使用了 KTransformers，请引用我们的论文：

```bibtex
@inproceedings{10.1145/3731569.3764843,
  title = {KTransformers: Unleashing the Full Potential of CPU/GPU Hybrid Inference for MoE Models},
  author = {Chen, Hongtao and Xie, Weiyu and Zhang, Boxin and Tang, Jingqi and Wang, Jiahao and Dong, Jianwei and Chen, Shaoyuan and Yuan, Ziwei and Lin, Chen and Qiu, Chengyu and Zhu, Yuening and Ou, Qingliang and Liao, Jiaqi and Chen, Xianglin and Ai, Zhiyuan and Wu, Yongwei and Zhang, Mingxing},
  booktitle = {Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles},
  year = {2025}
}
```

## 👥 贡献者与团队

由以下团队开发和维护：
- 清华大学 [MADSys 实验室](https://madsys.cs.tsinghua.edu.cn/)
- [Approaching.AI](http://approaching.ai/)
- 社区贡献者

我们欢迎贡献！请随时提交问题和拉取请求。

## 💬 社区与支持

- **GitHub Issues**：[报告问题或请求功能](https://github.com/kvcache-ai/ktransformers/issues)
- **微信群**：请参见 [archive/WeChatGroup.png](./archive/WeChatGroup.png)

## 📦 KT原仓库

原始的集成 KTransformers 框架已归档到 [`archive/`](./archive/) 目录以供参考。该项目现在专注于上述两个核心模块，以获得更好的模块化和可维护性。

有关原始文档以及完整的快速入门指南和示例，请参见：
- [archive/README.md](./archive/README.md)（英文）
- [archive/README_ZH.md](./archive/README_ZH.md)（中文）


================================================
FILE: archive/.devcontainer/Dockerfile
================================================
FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server
WORKDIR /workspace
ENV CUDA_HOME /usr/local/cuda
RUN <<EOF
apt update -y &&  apt install -y  --no-install-recommends \
    git \
    wget \
    vim \
    gcc \
    g++ \
    cmake && 
rm -rf /var/lib/apt/lists/* &&
pip install --upgrade pip &&
pip install ninja pyproject numpy cpufeature &&
pip install flash-attn &&
cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
EOF
# Set the default shell to bash
CMD ["/bin/bash"]


================================================
FILE: archive/.devcontainer/devcontainer.json
================================================
{
    "name": "Ktrans Dev Container",
    "privileged": true,
    "build": {
        "dockerfile": "Dockerfile",
        "context": "..",
        "args": {
            "http_proxy": "${env:http_proxy}",
            "https_proxy": "${env:https_proxy}",
        }
    },
    "runArgs": [
        "--network=host",
        "--gpus",
        "all"
        // "--gpu all"
    ],
    "workspaceFolder": "/workspace",
    "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=cached",
    "mounts": [
        "source=/mnt/data,target=/mnt/incontainer,type=bind,consistency=cached"
    ],
    "customizations": {
        "vscode": {
            "extensions": [
            ],
            "settings": {
                "terminal.integrated.shell.linux": "/bin/bash",
                "cmake.configureOnOpen": true,
                "cmake.generator": "Ninja"
            }
        }
    }
}

================================================
FILE: archive/.flake8
================================================
[flake8]
max-line-length = 120
extend-select = B950
extend-ignore = E203,E501,E701, B001,B006,B007,B008,B009,B010,B011,B016,B028,B031,B950,E265,E266,E401,E402,E711,E712,E713,E721,E722,E731,F401,F403,F405,F541,F811,F821,F841,W391

================================================
FILE: archive/.gitmodules
================================================
[submodule "third_party/llama.cpp"]
	path = archive/third_party/llama.cpp
	url = https://github.com/ggerganov/llama.cpp.git
[submodule "third_party/pybind11"]
	path = archive/third_party/pybind11
	url = https://github.com/pybind/pybind11.git
[submodule "third_party/spdlog"]
	path = archive/third_party/spdlog
	url = https://github.com/gabime/spdlog.git
[submodule "third_party/custom_flashinfer"]
	path = archive/third_party/custom_flashinfer
	url = https://github.com/kvcache-ai/custom_flashinfer.git
	branch = fix-precision-mla-merge-main
[submodule "third_party/xxHash"]
	path = archive/third_party/xxHash
	url = https://github.com/Cyan4973/xxHash.git
[submodule "third_party/prometheus-cpp"]
	path = archive/third_party/prometheus-cpp
	url = https://github.com/jupp0r/prometheus-cpp
[submodule "third_party/PhotonLibOS"]
	path = archive/third_party/PhotonLibOS
	url = https://github.com/alibaba/PhotonLibOS.git
[submodule "kt-kernel/third_party/llama.cpp"]
	path = kt-kernel/third_party/llama.cpp
	url = https://github.com/ggerganov/llama.cpp.git
[submodule "kt-kernel/third_party/pybind11"]
	path = kt-kernel/third_party/pybind11
	url = https://github.com/pybind/pybind11.git


================================================
FILE: archive/.pylintrc
================================================
[MASTER]
extension-pkg-whitelist=pydantic
max-line-length=120

[MESSAGES CONTROL]
disable=missing-function-docstring

================================================
FILE: archive/Dockerfile
================================================
FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server


ARG CPU_INSTRUCT=NATIVE

# 设置工作目录和 CUDA 路径
WORKDIR /workspace
ENV CUDA_HOME=/usr/local/cuda


# 安装依赖
RUN apt update -y
RUN apt install -y --no-install-recommends \
    libtbb-dev \
    libssl-dev \
    libcurl4-openssl-dev \
    libaio1 \
    libaio-dev \
    libfmt-dev \
    libgflags-dev \
    zlib1g-dev \
    patchelf \
    git \
    wget \
    vim \
    gcc \
    g++ \
    cmake
# 拷贝代码
RUN git clone https://github.com/kvcache-ai/ktransformers.git 
# 清理 apt 缓存
RUN rm -rf /var/lib/apt/lists/*

# 进入项目目录
WORKDIR /workspace/ktransformers
# 初始化子模块
RUN git submodule update --init --recursive

# 升级 pip
RUN pip install --upgrade pip

# 安装构建依赖
RUN pip install ninja pyproject numpy cpufeature aiohttp zmq openai

# 安装 flash-attn（提前装可以避免后续某些编译依赖出错）
RUN pip install flash-attn

# 安装 ktransformers 本体（含编译）
RUN CPU_INSTRUCT=${CPU_INSTRUCT} \
    USE_BALANCE_SERVE=1 \
    KTRANSFORMERS_FORCE_BUILD=TRUE \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" \
    pip install . --no-build-isolation --verbose

RUN pip install third_party/custom_flashinfer/
# 清理 pip 缓存
RUN pip cache purge

# 拷贝 C++ 运行时库
RUN cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/

# 保持容器运行（调试用）
ENTRYPOINT ["tail", "-f", "/dev/null"]

================================================
FILE: archive/Dockerfile.xpu
================================================
# Base image
FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04

ARG http_proxy
ARG https_proxy

ENV DEBIAN_FRONTEND=noninteractive
ENV CONDA_DIR=/opt/conda

# Install dependencies
RUN apt-get update && apt-get install -y \
    wget \
    curl \
    bash \
    git \
    vim \
    ca-certificates \
    binutils \
    cmake \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Install Miniforge
RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O /tmp/miniforge.sh && \
    bash /tmp/miniforge.sh -b -p $CONDA_DIR && \
    rm /tmp/miniforge.sh && \
    $CONDA_DIR/bin/conda clean -afy

# Add conda to PATH
ENV PATH=$CONDA_DIR/bin:$PATH

RUN bash -c "\
    source /opt/conda/etc/profile.d/conda.sh && \
    conda create --name ktransformers python=3.11 -y && \
    conda activate ktransformers && \
    conda env list && \
    conda install -c conda-forge libstdcxx-ng -y && \
    strings \$(find /opt/conda/envs/ktransformers/lib -name 'libstdc++.so.6') | grep GLIBCXX | grep 3.4.32 \
"

RUN bash -c "\
    source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ktransformers && \
    pip install ipex-llm[xpu_2.6]==2.3.0b20250518 --extra-index-url https://download.pytorch.org/whl/xpu && \
    pip uninstall -y torch torchvision torchaudio && \
    pip install torch==2.7+xpu torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu && \
    pip uninstall -y intel-opencl-rt dpcpp-cpp-rt && \
    pip list \
"

# Clone and set up ktransformers repo
RUN bash -c "\
    source $CONDA_DIR/etc/profile.d/conda.sh && \
    conda activate ktransformers && \
    git clone https://github.com/kvcache-ai/ktransformers.git && \
    cd ktransformers && \
    git submodule update --init && \
    sed -i 's/torch\.xpu\.is_available()/True/g' setup.py && \
    bash install.sh --dev xpu \
"

# Init conda and prepare bashrc
RUN conda init bash && \
    echo "source $CONDA_DIR/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate ktransformers" >> ~/.bashrc

WORKDIR /ktransformers/
CMD ["bash"]


================================================
FILE: archive/LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: archive/MANIFEST.in
================================================
graft third_party
graft ktransformers
graft local_chat.py
graft csrc
include LICENSE README.md
prune ktransformers/website
prune ktransformers/logs
prune ktransformers.egg-info
prune third_party/llama.cpp/models
graft ktransformers/website/dist
global-exclude __pycache__
include KTransformersOps.*.so
include cpuinfer_ext.*.so


================================================
FILE: archive/Makefile
================================================
flake_find:
	cd ktransformers && flake8 | grep -Eo '[A-Z][0-9]{3}' | sort | uniq| paste -sd ',' - 
format:
	@cd ktransformers && black .
	@black setup.py
dev_install:
# clear build dirs
	rm -rf build
	rm -rf *.egg-info
	rm -rf ktransformers/ktransformers_ext/build
	rm -rf ktransformers/ktransformers_ext/cuda/build
	rm -rf ktransformers/ktransformers_ext/cuda/dist
	rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info

# install ktransformers
	echo "Installing python dependencies from requirements.txt"
	pip install -r requirements-local_chat.txt

	echo "Installing ktransformers"
	KTRANSFORMERS_FORCE_BUILD=TRUE pip install -e . -v --no-build-isolation
	echo "Installation completed successfully"
clean:
	rm -rf build
	rm -rf *.egg-info
	rm -rf ktransformers/ktransformers_ext/build
	rm -rf ktransformers/ktransformers_ext/cuda/build
	rm -rf ktransformers/ktransformers_ext/cuda/dist
	rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info	
install_numa:
	USE_NUMA=1 make dev_install
install_no_numa:
	env -u USE_NUMA make dev_install

================================================
FILE: archive/README.md
================================================
<div align="center">
  <p align="center">
    <picture>
      <img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>
    </picture>
  </p>
  <h3>High-Performance CPU-GPU Hybrid Inference for Large Language Models</h3>
</div>

## 🎯 Overview

KTransformers is a research project focused on efficient inference and fine-tuning of large language models through CPU-GPU heterogeneous computing. The project has evolved into **two core modules**: [kt-kernel](./kt-kernel/) and [kt-sft](./kt-sft/).

## 🔥 Updates

* **Nov 6, 2025**: Support Kimi-K2-Thinking inference and fine-tune
* **Nov 4, 2025**: KTransformers Fine-Tuning × LLaMA-Factory Integration
* **Oct 27, 2025**: Support Ascend NPU
* **Oct 10, 2025**: Integrating into SGLang ([Roadmap](https://github.com/sgl-project/sglang/issues/11425), [Blog](https://lmsys.org/blog/2025-10-22-KTransformers/))
* **Sept 11, 2025**: Support Qwen3-Next
* **Sept 05, 2025**: Support Kimi-K2-0905
* **July 26, 2025**: Support SmallThinker and GLM4-MoE
* **June 30, 2025**: Support 3-layer (GPU-CPU-Disk) prefix cache reuse
* **May 14, 2025**: Support Intel Arc GPU
* **Apr 29, 2025**: Support AMX-Int8、AMX-BF16 and Qwen3MoE
* **Apr 9, 2025**: Experimental support for LLaMA 4 models
* **Apr 2, 2025**: Support Multi-concurrency
* **Mar 15, 2025**: Support ROCm on AMD GPU
* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and IQ1_S/FP8 hybrid weights; 139K longer context for DeepSeek-V3/R1
* **Feb 25, 2025**: Support FP8 GPU kernel for DeepSeek-V3 and R1
* **Feb 10, 2025**: Support Deepseek-R1 and V3, up to 3~28x speedup

---

## 📦 Core Modules

### 🚀 [kt-kernel](./kt-kernel/) - High-Performance Inference Kernels

CPU-optimized kernel operations for heterogeneous LLM inference.

![image-20251011010558909](./doc/assets/heterogeneous_computing.png)

**Key Features:**
- **AMX/AVX Acceleration**: Intel AMX and AVX512/AVX2 optimized kernels for INT4/INT8 quantized inference
- **MoE Optimization**: Efficient Mixture-of-Experts inference with NUMA-aware memory management
- **Quantization Support**: CPU-side INT4/INT8 quantized weights, GPU-side GPTQ support
- **Easy Integration**: Clean Python API for SGLang and other frameworks

**Quick Start:**
```bash
cd kt-kernel
pip install .
```

**Use Cases:**

- CPU-GPU hybrid inference for large MoE models
- Integration with SGLang for production serving
- Heterogeneous expert placement (hot experts on GPU, cold experts on CPU)

**Performance Examples:**
| Model | Hardware Configuration | Total Throughput | Output Throughput |
|-------|------------------------|------------------|-------------------|
| DeepSeek-R1-0528 (FP8) | 8×L20 GPU + Xeon Gold 6454S | 227.85 tokens/s | 87.58 tokens/s (8-way concurrency) |

👉 **[Full Documentation →](./kt-kernel/README.md)**

---

### 🎓 [kt-sft](./kt-sft/) - Fine-Tuning Framework

KTransformers × LLaMA-Factory integration for ultra-large MoE model fine-tuning.

![image-20251011010558909](./doc/assets/image-20251011010558909.png)

**Key Features:**

- **Resource Efficient**: Fine-tune 671B DeepSeek-V3 with just **70GB GPU memory** + 1.3TB RAM
- **LoRA Support**: Full LoRA fine-tuning with heterogeneous acceleration
- **LLaMA-Factory Integration**: Seamless integration with popular fine-tuning framework
- **Production Ready**: Chat, batch inference, and metrics evaluation

**Performance Examples:**

| Model | Configuration | Throughput | GPU Memory |
|-------|--------------|------------|------------|
| DeepSeek-V3 (671B) | LoRA + AMX | ~40 tokens/s | 70GB (multi-GPU) |
| DeepSeek-V2-Lite (14B) | LoRA + AMX | ~530 tokens/s | 6GB |

**Quick Start:**
```bash
cd kt-sft
# Install environment following kt-sft/README.md
USE_KT=1 llamafactory-cli train examples/train_lora/deepseek3_lora_sft_kt.yaml
```

👉 **[Full Documentation →](./kt-sft/README.md)**

---

## 🔥 Citation

If you use KTransformers in your research, please cite our paper:

```bibtex
@inproceedings{10.1145/3731569.3764843,
  title = {KTransformers: Unleashing the Full Potential of CPU/GPU Hybrid Inference for MoE Models},
  author = {Chen, Hongtao and Xie, Weiyu and Zhang, Boxin and Tang, Jingqi and Wang, Jiahao and Dong, Jianwei and Chen, Shaoyuan and Yuan, Ziwei and Lin, Chen and Qiu, Chengyu and Zhu, Yuening and Ou, Qingliang and Liao, Jiaqi and Chen, Xianglin and Ai, Zhiyuan and Wu, Yongwei and Zhang, Mingxing},
  booktitle = {Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles},
  year = {2025}
}
```

## 👥 Contributors & Team

Developed and maintained by:
- [MADSys Lab](https://madsys.cs.tsinghua.edu.cn/) @ Tsinghua University
- [Approaching.AI](http://approaching.ai/)
- Community contributors

We welcome contributions! Please feel free to submit issues and pull requests.

## 💬 Community & Support

- **GitHub Issues**: [Report bugs or request features](https://github.com/kvcache-ai/ktransformers/issues)
- **GitHub Discussions**: [Ask questions and share ideas](https://github.com/kvcache-ai/ktransformers/discussions)
- **WeChat Group**: See [archive/WeChatGroup.png](./archive/WeChatGroup.png)

## 📦 Legacy Code

The original integrated KTransformers framework has been archived to the [`archive/`](./archive/) directory for reference. The project now focuses on the two core modules above for better modularity and maintainability.

For the original documentation with full quick-start guides and examples, see:
- [archive/README_LEGACY.md](./archive/README_LEGACY.md) (English)
- [archive/README_ZH_LEGACY.md](./archive/README_ZH_LEGACY.md) (中文)


================================================
FILE: archive/README_LEGACY.md
================================================
<div align="center">
  <!-- <h1>KTransformers</h1> -->
  <p align="center">

<picture>
    <img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>

</picture>

</p>
  <h3>A Flexible Framework for Experiencing Cutting-edge LLM Inference Optimizations</h3>
  <strong><a href="#show-cases">🌟 Show Cases</a> | <a href="#quick-start">🚀 Quick Start</a> | <a href="#tutorial">📃 Tutorial</a> | <a href="#Citation">🔥  Citation </a> | <a href="https://github.com/kvcache-ai/ktransformers/discussions">💬  Discussion </a>|<a href="#FAQ"> 🙋 FAQ</a> </strong>
</div>

<h2 id="intro">🎉 Introduction</h2>
KTransformers, pronounced as Quick Transformers, is designed to enhance your 🤗 <a href="https://github.com/huggingface/transformers">Transformers</a> experience with advanced kernel optimizations and placement/parallelism strategies.
<br/><br/>
KTransformers is a flexible, Python-centric framework designed with extensibility at its core. 
By implementing and injecting an optimized module with a single line of code, users gain access to a Transformers-compatible
interface, RESTful APIs compliant with OpenAI and Ollama, and even a simplified ChatGPT-like web UI. 
<br/><br/>
Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations. Please let us know if you need any other features.

<h2 id="Updates">🔥 Updates</h2>

* **Nov 6, 2025**: Support Kimi-K2-Thinking inference ([Tutorial](./doc/en/Kimi-K2-Thinking.md)) and fine-tune ([Tutorial](./doc/en/SFT_Installation_Guide_KimiK2.md))
* **Nov 4, 2025**: KTransformers Fine-Tuning × LLaMA-Factory Integration. ([Tutorial](./doc/en/KTransformers-Fine-Tuning_User-Guide.md))
* **Oct 27, 2025**: Support Ascend NPU. ([Tutorial](./doc/zh/DeepseekR1_V3_tutorial_zh_for_Ascend_NPU.md))
* **Oct 10, 2025**: Integrating into SGLang. ([Roadmap](https://github.com/sgl-project/sglang/issues/11425))
* **Sept 11, 2025**: Support Qwen3-Next. ([Tutorial](./doc/en/Qwen3-Next.md))
* **Sept 05, 2025**: Support Kimi-K2-0905. ([Tutorial](./doc/en/Kimi-K2.md))
* **July 26, 2025**: Support SmallThinker and GLM4-MoE. ([Tutorial](./doc/en/SmallThinker_and_Glm4moe.md))
* **July 11, 2025**: Support Kimi-K2. ([Tutorial](./doc/en/Kimi-K2.md))
* **June 30, 2025**: Support 3-layer (GPU-CPU-Disk) [prefix cache](./doc/en/prefix_cache.md) reuse.
* **May 14, 2025**: Support Intel Arc GPU ([Tutorial](./doc/en/xpu.md)).
* **Apr 29, 2025**: Support AMX-Int8、 AMX-BF16 and Qwen3MoE ([Tutorial](./doc/en/AMX.md))

https://github.com/user-attachments/assets/fafe8aec-4e22-49a8-8553-59fb5c6b00a2

* **Apr 9, 2025**: Experimental support for LLaMA 4 models ([Tutorial](./doc/en/llama4.md)).
* **Apr 2, 2025**: Support Multi-concurrency. ([Tutorial](./doc/en/balance-serve.md)).

https://github.com/user-attachments/assets/faa3bda2-928b-45a7-b44f-21e12ec84b8a

* **Mar 15, 2025**: Support ROCm on AMD GPU ([Tutorial](./doc/en/ROCm.md)).
* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022--v023-longer-context--fp8-kernel) for DeepSeek-V3 and R1 in 24GB VRAM.
* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed （+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. For detailed show case and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
* **Aug 15, 2024**: Update detailed [tutorial](doc/en/injection_tutorial.md) for injection and multi-GPU.
* **Aug 14, 2024**: Support llamfile as linear backend.
* **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B  and 8\*22B; Support q2k, q3k, q5k dequant on gpu.
* **Aug 9, 2024**: Support windows native.

<!-- * **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md). -->

<h2 id="show-cases">🌟 Show Cases</h2>

<div>
<h3>GPT-4/o1-level Local VSCode Copilot on a Desktop with only 24GB VRAM</h3>
</div>

https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285

</p>

- **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM([Tutorial](./doc/en/DeepseekR1_V3_tutorial.md)).
  
  - Prefill Speed (tokens/s):
    - KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
    - Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**.
  - Decode Speed (tokens/s):
    - KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
    - Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**.
  - Upcoming Open Source Release:
    - AMX optimizations and selective expert activation will be open-sourced in V0.3.
    - Currently available only in preview binary distribution, which can be downloaded [here](./doc/en/DeepseekR1_V3_tutorial.md).
- **Local 236B DeepSeek-Coder-V2:** Running its Q4_K_M version using only 21GB VRAM and 136GB DRAM, attainable on a local desktop machine, which scores even better than GPT4-0613 in [BigCodeBench](https://huggingface.co/blog/leaderboard-bigcodebench).

<p align="center">
  <picture>
    <img alt="DeepSeek-Coder-V2 Score" src="https://github.com/user-attachments/assets/d052924e-8631-44de-aad2-97c54b965693" width=100%>
  </picture>
</p>

- **Faster Speed:** Achieving 126 tokens/s for 2K prompt prefill and 13.6 tokens/s for generation through MoE offloading and injecting advanced kernels from [Llamafile](https://github.com/Mozilla-Ocho/llamafile/tree/main) and [Marlin](https://github.com/IST-DASLab/marlin).
- **VSCode Integration:** Wrapped into an OpenAI and Ollama compatible API for seamless integration as a backend for [Tabby](https://github.com/TabbyML/tabby) and various other frontends.

<p align="center">

https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c

</p>

<!-- <h3>1M Context Local Inference on a Desktop with Only 24GB VRAM</h3>
<p align="center">

https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12

* **1M Context InternLM 2.5 7B**: Operates at full bf16 precision, utilizing 24GB VRAM and 150GB DRAM, which is feasible on a local desktop setup. It achieves a 92.88% success rate on the 1M "Needle In a Haystack" test and 100% on the 128K NIAH test.

<p align="center">
  <picture>
    <img alt="Single Needle Retrieval 128K" src="./doc/assets/needle_128K.png" width=100%>
  </picture>
</p>

<p align="center">
  <picture>
    <img alt="Single Needle Retrieval 1000K" src="./doc/assets/needle_1M.png" width=100%>
  </picture>
</p>

* **Enhanced Speed**: Reaches 16.91 tokens/s for generation with a 1M context using sparse attention, powered by llamafile kernels. This method is over 10 times faster than full attention approach of llama.cpp.

* **Flexible Sparse Attention Framework**: Offers a flexible block sparse attention framework for CPU offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. Further information is available [here](./doc/en/long_context_introduction.md).
 -->

<strong>More advanced features will coming soon, so stay tuned!</strong>

<h2 id="quick-start">🚀 Quick Start</h2>

Getting started with KTransformers is simple! Follow the steps below to set up and start using it.

we have already supported vendors:

- Metax
- Sanechips (ZhuFeng V1.0)
- Intel
- Ascend
- Kunpeng
- AMD

### 📥 Installation

To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).

<h2 id="tutorial">📃 Brief Injection Tutorial</h2>
At the heart of KTransformers is a user-friendly, template-based injection framework. 
This allows researchers to easily replace original torch modules with optimized variants. It also simplifies the process of combining multiple optimizations, allowing the exploration of their synergistic effects.

</br>
<p align="center">
  <picture>
    <img alt="Inject-Struction" src="https://github.com/user-attachments/assets/6b4c1e54-9f6d-45c5-a3fc-8fa45e7d257e" width=65%>
  </picture>
</p>

Given that vLLM already serves as a great framework for large-scale deployment optimizations, KTransformers is particularly focused on local deployments that are constrained by limited resources. We pay special attention to heterogeneous computing opportunities, such as GPU/CPU offloading of quantized models. For example, we support the efficient <a herf="https://github.com/Mozilla-Ocho/llamafile/tree/main">Llamafile</a> and <a herf="https://github.com/IST-DASLab/marlin">Marlin</a> kernels for CPU and GPU, respectively. More details can be found <a herf="doc/en/operators/llamafile.md">here</a>.

<h3>Example Usage</h3>
To utilize the provided kernels, users only need to create a YAML-based injection template and add the call to `optimize_and_load_gguf` before using the Transformers model.

```python
with torch.device("meta"):
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
...
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000)
```

In this example, the AutoModel is first initialized on the meta device to avoid occupying any memory resources. Then, `optimize_and_load_gguf` iterates through all sub-modules of the model, matches rules specified in your YAML rule file, and replaces them with advanced modules as specified.

After injection, the original `generate` interface is available, but we also provide a compatible `prefill_and_generate` method, which enables further optimizations like CUDAGraph to improve generation speed.

<h3>How to custom your model</h3>

A detailed tutorial of the injection and multi-GPU using DeepSeek-V2 as an example is given [here](doc/en/injection_tutorial.md).

Below is an example of a YAML template for replacing all original Linear modules with Marlin, an advanced 4-bit quantization kernel.

```yaml
- match:
    name: "^model\\.layers\\..*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized Kernel on quantized data types
    device: "cpu"   # which devices to load this module when initializing
    kwargs:
      generate_device: "cuda"
      generate_linear_type: "QuantizedLinearMarlin"
```

Each rule in the YAML file has two parts: `match` and `replace`. The `match` part specifies which module should be replaced, and the `replace` part specifies the module to be injected into the model along with the initialization keywords.

You can find example rule templates for optimizing DeepSeek-V2 and Qwen2-57B-A14, two SOTA MoE models, in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory. These templates are used to power the `local_chat.py` demo.

If you are interested in our design principles and the implementation of the injection framework, please refer to the [design document](doc/en/deepseek-v2-injection.md).

<h2 id="Citation">🔥 Citation</h2>

If you use KTransformers for your research, please cite our [paper](https://madsys.cs.tsinghua.edu.cn/publication/ktransformers-unleashing-the-full-potential-of-cpu/gpu-hybrid-inference-for-moe-models/):

```
@inproceedings{10.1145/3731569.3764843,
title = {KTransformers: Unleashing the Full Potential of CPU/GPU Hybrid Inference for MoE Models},
author = {Chen, Hongtao and Xie, Weiyu and Zhang, Boxin and Tang, Jingqi and Wang, Jiahao and Dong, Jianwei and Chen, Shaoyuan and Yuan, Ziwei and Lin, Chen and Qiu, Chengyu and Zhu, Yuening and Ou, Qingliang and Liao, Jiaqi and Chen, Xianglin and Ai, Zhiyuan and Wu, Yongwei and Zhang, Mingxing},
booktitle = {Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles},
year = {2025}
}
```

<h2 id="ack">Acknowledgment and Contributors</h2>

The development of KTransformers is based on the flexible and versatile framework provided by Transformers. We also benefit from advanced kernels such as GGUF/GGML, Llamafile, Marlin, sglang and flashinfer. We are planning to contribute back to the community by upstreaming our modifications.

KTransformers is actively maintained and developed by contributors from the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members from <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformers faster and easier to use.

<h2 id="ack">Discussion</h2>

If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR Code: [WeChat Group](WeChatGroup.png)

<h2 id="FAQ">🙋 FAQ</h2>

Some common questions are answered in the [FAQ](doc/en/FAQ.md).


================================================
FILE: archive/README_ZH.md
================================================
<div align="center">
  <p align="center">
    <picture>
      <img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>
    </picture>
  </p>
  <h3>高性能 CPU-GPU 异构大语言模型推理</h3>
</div>

## 🎯 项目概述

KTransformers 是一个专注于大语言模型高效推理和微调的研究项目，通过 CPU-GPU 异构计算实现资源受限环境下的模型部署。项目已演进为**两个核心模块**：[kt-kernel](./kt-kernel/) 和 [kt-sft](./kt-sft/)。

## 🔥 更新

* **2025年11月6日**：支持 Kimi-K2-Thinking 推理和微调
* **2025年11月4日**：KTransformers 微调 × LLaMA-Factory 集成
* **2025年10月27日**：支持 Ascend NPU
* **2025年10月10日**：集成到 SGLang ([路线图](https://github.com/sgl-project/sglang/issues/11425), [博客](https://lmsys.org/blog/2025-10-22-KTransformers/))
* **2025年9月11日**：支持 Qwen3-Next
* **2025年9月5日**：支持 Kimi-K2-0905
* **2025年7月26日**：支持 SmallThinker 和 GLM4-MoE
* **2025年6月30日**：支持 3层（GPU-CPU-磁盘）前缀缓存复用
* **2025年5月14日**：支持 Intel Arc GPU
* **2025年4月29日**：支持 AMX-Int8、AMX-BF16 和 Qwen3MoE
* **2025年4月9日**：实验性支持 LLaMA 4 模型
* **2025年4月2日**：支持多并发
* **2025年3月15日**：支持 AMD GPU 的 ROCm
* **2025年3月5日**：支持 unsloth 1.58/2.51 bits 权重和 IQ1_S/FP8 混合权重；DeepSeek-V3/R1 支持 139K 长上下文
* **2025年2月25日**：支持 DeepSeek-V3 和 R1 的 FP8 GPU 内核
* **2025年2月10日**：支持 Deepseek-R1 和 V3，速度提升最高达 3~28 倍

---

## 📦 核心模块

### 🚀 [kt-kernel](./kt-kernel/) - 高性能推理内核

面向异构 LLM 推理的 CPU 优化内核操作库。

![image-20251011010558909](./doc/assets/heterogeneous_computing.png)

**核心特性：**
- **AMX/AVX 加速**：Intel AMX 和 AVX512/AVX2 优化内核，支持 INT4/INT8 量化推理
- **MoE 优化**：高效的专家混合推理，支持 NUMA 感知内存管理
- **量化支持**：CPU 端 INT4/INT8 量化权重，GPU 端 GPTQ 支持
- **易于集成**：简洁的 Python API，可集成到 SGLang 等框架

**快速开始：**
```bash
cd kt-kernel
pip install .
```

**应用场景：**
- 大型 MoE 模型的 CPU-GPU 混合推理
- 与 SGLang 集成用于生产服务
- 异构专家放置（热门专家在 GPU，冷门专家在 CPU）

**性能示例：**
| 模型 | 硬件配置 | 总吞吐量 | 输出吞吐量 |
|------|---------|---------|-----------|
| DeepSeek-R1-0528 (FP8) | 8×L20 GPU + Xeon Gold 6454S | 227.85 tokens/s | 87.58 tokens/s（8路并发）|

👉 **[完整文档 →](./kt-kernel/README.md)**

---

### 🎓 [kt-sft](./kt-sft/) - 微调框架

KTransformers × LLaMA-Factory 集成，支持超大 MoE 模型微调。

![image-20251011010558909](./doc/assets/image-20251011010558909.png)

**核心特性：**
- **资源高效**：仅需 **70GB 显存** + 1.3TB 内存即可微调 671B DeepSeek-V3
- **LoRA 支持**：完整的 LoRA 微调与异构加速
- **LLaMA-Factory 集成**：与流行微调框架无缝集成
- **生产就绪**：支持对话、批量推理和指标评估

**性能示例：**
| 模型 | 配置 | 吞吐量 | GPU 显存 |
|------|------|--------|----------|
| DeepSeek-V3 (671B) | LoRA + AMX | ~40 tokens/s | 70GB (多卡) |
| DeepSeek-V2-Lite (14B) | LoRA + AMX | ~530 tokens/s | 6GB |

**快速开始：**
```bash
cd kt-sft
# 按照 kt-sft/README.md 安装环境
USE_KT=1 llamafactory-cli train examples/train_lora/deepseek3_lora_sft_kt.yaml
```

👉 **[完整文档 →](./kt-sft/README.md)**

---

## 🔥 引用

如果您在研究中使用了 KTransformers，请引用我们的论文：

```bibtex
@inproceedings{10.1145/3731569.3764843,
  title = {KTransformers: Unleashing the Full Potential of CPU/GPU Hybrid Inference for MoE Models},
  author = {Chen, Hongtao and Xie, Weiyu and Zhang, Boxin and Tang, Jingqi and Wang, Jiahao and Dong, Jianwei and Chen, Shaoyuan and Yuan, Ziwei and Lin, Chen and Qiu, Chengyu and Zhu, Yuening and Ou, Qingliang and Liao, Jiaqi and Chen, Xianglin and Ai, Zhiyuan and Wu, Yongwei and Zhang, Mingxing},
  booktitle = {Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles},
  year = {2025}
}
```

## 👥 贡献者与团队

由以下团队开发和维护：
- 清华大学 [MADSys 实验室](https://madsys.cs.tsinghua.edu.cn/)
- [Approaching.AI](http://approaching.ai/)
- 社区贡献者

我们欢迎贡献！请随时提交 issues 和 pull requests。

## 💬 社区与支持

- **GitHub Issues**：[报告 bug 或请求功能](https://github.com/kvcache-ai/ktransformers/issues)
- **GitHub Discussions**：[提问和分享想法](https://github.com/kvcache-ai/ktransformers/discussions)
- **微信群**：查看 [archive/WeChatGroup.png](./archive/WeChatGroup.png)

## 📦 历史代码

原完整的 KTransformers 框架代码已归档至 [`archive/`](./archive/) 目录供参考。项目现专注于上述两个核心模块，以实现更好的模块化和可维护性。

关于原始完整文档（包含快速入门指南和示例），请查看：
- [archive/README_LEGACY.md](./archive/README_LEGACY.md) (English)
- [archive/README_ZH_LEGACY.md](./archive/README_ZH_LEGACY.md) (中文)


================================================
FILE: archive/README_ZH_LEGACY.md
================================================
<div align="center">
  <!-- <h1>KTransformers</h1> -->
  <p align="center">

<picture>
    <img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>

</picture>

</p>
  <h3>一个用于体验尖端 LLM 推理优化的灵活框架</h3>
  <strong><a href="#show-cases">🌟 案例展示</a> | <a href="#quick-start">🚀 快速入门</a> | <a href="#tutorial">📃 教程</a> | <a href="https://github.com/kvcache-ai/ktransformers/discussions">💬 讨论</a> | <a href="#FAQ">🙋 常见问题</a> </strong>
</div>

<h2 id="intro">🎉 介绍</h2>
KTransformers（发音为 Quick Transformers）旨在通过先进的内核优化和放置/并行策略来增强您对 🤗 [Transformers](https://github.com/huggingface/transformers) 的体验。
<br/><br/>
KTransformers 是一个以 Python 为中心的灵活框架，其核心是可扩展性。通过用一行代码实现并注入优化模块，用户可以获得与 Transformers 兼容的接口、符合 OpenAI 和 Ollama 的 RESTful API，甚至是一个简化的类似 ChatGPT 的 Web 界面。
<br/><br/>
我们对 KTransformers 的愿景是成为一个用于实验创新 LLM 推理优化的灵活平台。如果您需要任何其他功能，请告诉我们。

<h2 id="Updates">🔥 更新</h2>

* **2025 年 2 月 15 日**：为DeepSeek-V3/R1支持[FP8 GPU内核](./doc/en/fp8_kernel.md); 支持更长的上下文([教程](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context)).
* **2025 年 2 月 15 日**：长上下文(从4K到8K，24GB VRAM) & 稍快的速度(+15%)(最快 16 Tokens/s)，文档请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md) 和 [在线指南](https://kvcache-ai.github.io/ktransformers/) 。
* **2025 年 2 月 10 日**：支持 Deepseek-R1 和 V3 在单个（24GB VRAM）/多 GPU 和 382G DRAM 上运行，速度提升高达 3~28 倍。详细教程请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md)。
* **2024 年 8 月 28 日**：支持 InternLM2.5-7B-Chat-1M 模型下的 1M 上下文，使用 24GB 的 VRAM 和 150GB 的 DRAM。详细教程请参见 [这里](./doc/en/long_context_tutorial.md)。
* **2024 年 8 月 28 日**：将 DeepseekV2 所需的 VRAM 从 21G 降低到 11G。
* **2024 年 8 月 15 日**：更新了详细的 [教程](doc/en/injection_tutorial.md)，介绍注入和多 GPU 的使用。
* **2024 年 8 月 14 日**：支持 llamfile 作为线性后端。
* **2024 年 8 月 12 日**：支持多 GPU；支持新模型：mixtral 8\*7B 和 8\*22B；支持 q2k、q3k、q5k 在 GPU 上的去量化。
* **2024 年 8 月 9 日**：支持 Windows。

<h2 id="show-cases">🌟 案例展示</h2>

<div>
<h3>在仅 24GB VRAM 的桌面上运行 GPT-4/o1 级别的本地 VSCode Copilot</h3>
</div>

https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285

</p>

- **[NEW!!!] 本地 671B DeepSeek-Coder-V3/R1**：使用其 Q4_K_M 版本，仅需 14GB VRAM 和 382GB DRAM 即可运行（教程请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md)）。
	- 预填充速度（tokens/s）：
 		- KTransformers：54.21（32 核）→ 74.362（双插槽，2×32 核）→ 255.26（优化的 AMX 基 MoE 内核，仅 V0.3）→ 286.55（选择性使用 6 个专家，仅 V0.3）
 		- 与 llama.cpp 在 2×32 核下相比，达到 **27.79× 速度提升**。
 	- 解码速度（tokens/s）：
 		- KTransformers：8.73（32 核）→ 11.26（双插槽，2×32 核）→ 13.69（选择性使用 6 个专家，仅 V0.3）
 		- 与 llama.cpp 在 2×32 核下相比，达到 **3.03× 速度提升**。
	- 即将开源发布：
		- AMX 优化和选择性专家激活将在 V0.3 中开源。
		- 目前仅在预览二进制分发中可用，可从 [这里](./doc/en/DeepseekR1_V3_tutorial.md) 下载。

- **本地 236B DeepSeek-Coder-V2**：使用其 Q4_K_M 版本，仅需 21GB VRAM 和 136GB DRAM 即可运行，甚至在 [BigCodeBench](https://huggingface.co/blog/leaderboard-bigcodebench) 中得分超过 GPT4-0613。

<p align="center">
  <picture>
    <img alt="DeepSeek-Coder-V2 Score" src="https://github.com/user-attachments/assets/d052924e-8631-44de-aad2-97c54b965693" width=100%>
  </picture>
</p>

- **更快的速度**：通过 MoE 卸载和注入来自 [Llamafile](https://github.com/Mozilla-Ocho/llamafile/tree/main) 和 [Marlin](https://github.com/IST-DASLab/marlin) 的高级内核，实现了 2K 提示预填充 126 tokens/s 和生成 13.6 tokens/s 的速度。
- **VSCode 集成**：封装成符合 OpenAI 和 Ollama 的 API，可无缝集成到 [Tabby](https://github.com/TabbyML/tabby) 和其他前端的后端。

<p align="center">

https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c

</p>

<!-- <h3>在仅 24GB VRAM 的桌面上进行 1M 上下文本地推理</h3>
<p align="center"> -->

<!-- https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12 -->
<!-- 
* **1M 上下文 InternLM 2.5 7B**：以全 bf16 精度运行，使用 24GB VRAM 和 150GB DRAM，可在本地桌面设置中实现。在 1M "针在干草堆中" 测试中达到 92.88% 的成功率，在 128K NIAH 测试中达到 100%。

<p align="center">
  <picture>
    <img alt="Single Needle Retrieval 128K" src="./doc/assets/needle_128K.png" width=100%>
  </picture>
</p>

<p align="center">
  <picture>
    <img alt="Single Needle Retrieval 1000K" src="./doc/assets/needle_1M.png" width=100%>
  </picture>
</p>

* **增强的速度**：使用稀疏注意力，通过 llamafile 内核实现 1M 上下文生成 16.91 tokens/s 的速度。这种方法比 llama.cpp 的全注意力方法快 10 倍以上。

* **灵活的稀疏注意力框架**：提供了一个灵活的块稀疏注意力框架，用于 CPU 卸载解码。与 SnapKV、Quest 和 InfLLm 兼容。更多信息请参见 [这里](./doc/en/long_context_introduction.md)。 -->

<strong>更多高级功能即将推出，敬请期待！</strong>

<h2 id="quick-start">🚀 快速入门</h2>


KTransformers 的入门非常简单！请参考我们的[安装指南]((https://kvcache-ai.github.io/ktransformers/))进行安装。

<h2 id="tutorial">📃 简要注入教程</h2>
KTransformers 的核心是一个用户友好的、基于模板的注入框架。这使得研究人员可以轻松地将原始 torch 模块替换为优化的变体。它还简化了多种优化的组合过程，允许探索它们的协同效应。
</br>
<p align="center">
  <picture>
    <img alt="Inject-Struction" src="https://github.com/user-attachments/assets/6b4c1e54-9f6d-45c5-a3fc-8fa45e7d257e" width=65%>
  </picture>
</p>

鉴于 vLLM 已经是一个用于大规模部署优化的优秀框架，KTransformers 特别关注受资源限制的本地部署。我们特别关注异构计算时机，例如量化模型的 GPU/CPU 卸载。例如，我们支持高效的 <a herf="https://github.com/Mozilla-Ocho/llamafile/tree/main">Llamafile</a> 和<a herf="https://github.com/IST-DASLab/marlin">Marlin</a> 内核，分别用于 CPU 和 GPU。 更多详细信息可以在 <a herf="doc/en/operators/llamafile.md">这里</a>找到。


<h3>示例用法</h3>
要使用提供的内核，用户只需创建一个基于 YAML 的注入模板，并在使用 Transformers 模型之前添加对 `optimize_and_load_gguf` 的调用。

```python
with torch.device("meta"):
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
...
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000)
```

在这个示例中，首先在 meta 设备上初始化 AutoModel，以避免占用任何内存资源。然后，`optimize_and_load_gguf` 遍历模型的所有子模块，匹配您的 YAML 规则文件中指定的规则，并将它们替换为指定的高级模块。

注入后，原始的 `generate` 接口仍然可用，但我们还提供了一个兼容的 `prefill_and_generate` 方法，这使得可以进一步优化，例如使用 CUDAGraph 提高生成速度。

<h3>如何自定义您的模型</h3>

一个详细的使用 DeepSeek-V2 作为示例的注入和 multi-GPU 教程在 [这里](doc/en/injection_tutorial.md)。

以下是一个将所有原始 Linear 模块替换为 Marlin 的 YAML 模板示例，Marlin 是一个高级的 4 位量化内核。

```yaml
- match:
    name: "^model\\.layers\\..*$"  # 正则表达式 
    class: torch.nn.Linear  # 仅匹配同时符合名称和类的模块
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # 量化数据类型的优化内核
    device: "cpu"   # 初始化时加载该模块的 device
    kwargs:
      generate_device: "cuda"
      generate_linear_type: "QuantizedLinearMarlin"
```

YAML 文件中的每个规则都有两部分：`match` 和 `replace`。`match` 部分指定应替换的模块，`replace` 部分指定要注入到模型中的模块以及初始化关键字。

您可以在 [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) 目录中找到用于优化 DeepSeek-V2 和 Qwen2-57B-A14 的示例规则模板。这些模板用于为 `local_chat.py` 示例提供支持。

如果您对我们的设计原则和注入框架的实现感兴趣，请参考 [设计文档](doc/en/deepseek-v2-injection.md)。

<h2 id="ack">致谢和贡献者</h2>

KTransformers 的开发基于 Transformers 提供的灵活和多功能框架。我们还受益于 GGUF/GGML、Llamafile 、 Marlin、sglang和flashinfer 等高级内核。我们计划通过向上游贡献我们的修改来回馈社区。

KTransformers 由清华大学 <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> 小组的成员以及 <a href="http://approaching.ai/">Approaching.AI</a> 的成员积极维护和开发。我们欢迎新的贡献者加入我们，使 KTransformers 更快、更易于使用。


<h2 id="ack">讨论</h2>

如果您有任何问题，欢迎随时提出 issue。或者，您可以加入我们的微信群进行进一步讨论。二维码： [微信群](WeChatGroup.png)

<h2 id="FAQ">🙋 常见问题</h2>

一些常见问题的答案可以在 [FAQ](doc/en/FAQ.md) 中找到。 


================================================
FILE: archive/SECURITY.md
================================================
# Security Policy

## Supported Versions

Use this section to tell people about which versions of your project are
currently being supported with security updates.

| Version | Supported          |
| ------- | ------------------ |
| 5.1.x   | :white_check_mark: |
| 5.0.x   | :x:                |
| 4.0.x   | :white_check_mark: |
| < 4.0   | :x:                |

## Reporting a Vulnerability

Use this section to tell people how to report a vulnerability.

Tell them where to go, how often they can expect to get an update on a
reported vulnerability, what to expect if the vulnerability is accepted or
declined, etc.


================================================
FILE: archive/book.toml
================================================
[book]
authors = ["kvcache-ai"]
language = "zh-CN"
title = "Ktransformers"
src = "doc"

[output.html]
git-repository-url = "https://github.com/kvcache-ai/ktransformers"
edit-url-template = "https://github.com/kvcache-ai/ktransformers/edit/main/{path}"

[output.html.playground]
editable = true
copy-js = true
# line-numbers = true

[output.html.fold]
enable = true
level = 0

================================================
FILE: archive/config.json
================================================


================================================
FILE: archive/csrc/balance_serve/CMakeLists.txt
================================================
option(KTRANSFORMERS_USE_NPU                 "ktransformers: use NPU"                           OFF)
if(KTRANSFORMERS_USE_NPU)
    add_definitions(-DKTRANSFORMERS_USE_NPU=1)
endif()

if(KTRANSFORMERS_USE_NPU)
    set(ASCEND_HOME_PATH "$ENV{ASCEND_HOME_PATH}")
    message(STATUS "ASCEND_HOME_PATH is ${ASCEND_HOME_PATH}")
    include_directories(${ASCEND_HOME_PATH}/include)
    
    link_directories(${TORCH_INSTALL_PREFIX}/../torch.libs)
    # find torch_npu
    execute_process(
            COMMAND python -c "import torch; import torch_npu; print(torch_npu.__path__[0])"
            OUTPUT_VARIABLE TORCH_NPU_PATH
            OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    message(STATUS "Found PTA at: ${TORCH_NPU_PATH}")
    find_library(PTA_LIBRARY torch_npu PATH "${TORCH_NPU_PATH}/lib")
endif()

cmake_minimum_required(VERSION 3.21)
find_program(GCC_COMPILER NAMES g++-13 g++-12 g++-11 g++ REQUIRED)
set(CMAKE_CXX_COMPILER ${GCC_COMPILER})

# 显示选定的编译器
message(STATUS "Using compiler: ${CMAKE_CXX_COMPILER}")


project(balance_serve VERSION 0.1.0)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fPIC")
set(CMAKE_BUILD_TYPE "Debug")
# set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -fPIC")
# set(CMAKE_BUILD_TYPE "Release")


if(NOT DEFINED _GLIBCXX_USE_CXX11_ABI)
    find_package(Python3 REQUIRED COMPONENTS Interpreter)

    execute_process(
        COMMAND ${Python3_EXECUTABLE} -c
        "import torch; print('1' if torch.compiled_with_cxx11_abi() else '0')"
        OUTPUT_VARIABLE ABI_FLAG
        OUTPUT_STRIP_TRAILING_WHITESPACE
    )

    set(_GLIBCXX_USE_CXX11_ABI ${ABI_FLAG} CACHE STRING "C++11 ABI setting from PyTorch" FORCE)
endif()

# 无论是否是自动检测，都传给编译器
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})

message(STATUS "_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}")

file(GLOB_RECURSE FMT_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h")

add_custom_target(
    format
    COMMAND clang-format
    -i
    -style=file
    ${FMT_SOURCES}
    COMMENT "Running clang-format on all source files"
)

set(BUILD_SHARED_LIBS ON)
set(ENABLE_PUSH OFF)
set(ENABLE_COMPRESSION OFF)

# set(CMAKE_BUILD_TYPE "Release")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

set(THIRD_PARTY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
set(THIRD_PARTY_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/third_party)
add_subdirectory(${THIRD_PARTY_DIR}/prometheus-cpp ${THIRD_PARTY_BUILD_DIR}/prometheus-cpp EXCLUDE_FROM_ALL)
add_subdirectory(${THIRD_PARTY_DIR}/xxHash/cmake_unofficial ${THIRD_PARTY_BUILD_DIR}/xxHash EXCLUDE_FROM_ALL)
set_target_properties(xxhash PROPERTIES POSITION_INDEPENDENT_CODE ON)

# add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third_party/prometheus-cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/prometheus-cpp)
set(SPDLOG_DIR ${THIRD_PARTY_DIR}/spdlog)
set(FMT_DIR ${THIRD_PARTY_DIR}/fmt)

set(KVC2_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/kvc2/src)

include_directories(${THIRD_PARTY_DIR})

add_subdirectory(${THIRD_PARTY_DIR}/pybind11 ${THIRD_PARTY_BUILD_DIR}/pybind11)

execute_process(
    COMMAND python3 -c "import torch; print(torch.__path__[0])"
    OUTPUT_VARIABLE TORCH_INSTALL_PREFIX
    OUTPUT_STRIP_TRAILING_WHITESPACE
)

message(STATUS "Found PyTorch at: ${TORCH_INSTALL_PREFIX}")

# set(TORCH_INSTALL_PREFIX "/home/xwy/.conda/envs/kvc/lib/python3.12/site-packages/torch")
find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib")
find_package(Torch REQUIRED PATHS "${TORCH_INSTALL_PREFIX}/share/cmake/Torch" NO_DEFAULT_PATH)

add_subdirectory(kvc2)
add_subdirectory(sched)

# add_subdirectory(test)


================================================
FILE: archive/csrc/custom_marlin/__init__.py
================================================


================================================
FILE: archive/csrc/custom_marlin/binding.cpp
================================================
/**
 * @Description  :
 * @Author       : Azure-Tang
 * @Date         : 2024-07-25 13:38:30
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-12 03:05:04
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "gptq_marlin/ops.h"
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/extension.h>
#include <torch/library.h>
#include <torch/torch.h>
// namespace py = pybind11;

PYBIND11_MODULE(vLLMMarlin, m) {

    /*m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q5_k", &dequantize_q5_k, "Function to dequantize q5_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q4_k",  &dequantize_q4_k, "Function to dequantize q4_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q3_k",  &dequantize_q3_k, "Function to dequantize q3_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q2_k",  &dequantize_q2_k, "Function to dequantize q2_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_iq4_xs",  &dequantize_iq4_xs, "Function to dequantize
    iq4_xs data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));*/
    m.def("gptq_marlin_gemm", &gptq_marlin_gemm,
          "Function to perform GEMM using Marlin quantization.", py::arg("a"),
          py::arg("b_q_weight"), py::arg("b_scales"), py::arg("g_idx"),
          py::arg("perm"), py::arg("workspace"), py::arg("num_bits"), py::arg("size_m_tensor"),
          py::arg("size_m"), py::arg("size_n"), py::arg("size_k"),
          py::arg("sms"), py::arg("is_k_full"));
    m.def("gptq_marlin_repack", &gptq_marlin_repack,
            "gptq_marlin repack from GPTQ");
}

================================================
FILE: archive/csrc/custom_marlin/gptq_marlin/gptq_marlin.cu
================================================
/*
 * Modified by Neural Magic
 * Copyright (C) Marlin.2024 Elias Frantar
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 /*
  * Adapted from https://github.com/IST-DASLab/marlin
  */
  /*
   * Adapted from
   * https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
   */
#include "gptq_marlin.cuh"
#include "gptq_marlin_dtypes.cuh"
#include <c10/cuda/CUDAGuard.h>
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)                              \
    static_assert(std::is_same<scalar_t, half>::value ||                       \
                      std::is_same<scalar_t, nv_bfloat16>::value,              \
                  "only float16 and bfloat16 is supported");

template <typename T> inline std::string str(T x) { return std::to_string(x); }

namespace gptq_marlin {

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800

    __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
        int const* __restrict__ perm_int_ptr,
        int4* __restrict__ out_int4_ptr, int size_m,
        int size_k, int block_rows) {}

    template <typename scalar_t,         // compute dtype, half or nv_float16
        const int num_bits,        // number of bits used for weights
        const int threads,         // number of threads in a threadblock
        const int thread_m_blocks, // number of 16x16 blocks in the m
        // dimension (batchsize) of the
        // threadblock
        const int thread_n_blocks, // same for n dimension (output)
        const int thread_k_blocks, // same for k dimension (reduction)
        const int stages, // number of stages for the async global->shared
        // fetch pipeline
        const bool has_act_order,   // whether act_order is enabled
        const int group_blocks = -1 // number of consecutive 16x16 blocks
        // with a separate quantization scale
    >
    __global__ void
        Marlin(const int4* __restrict__ A, // fp16 input matrix of shape mxk
            const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn
            int4* __restrict__ C,       // fp16 output buffer of shape mxn
            const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape
            // (k/groupsize)xn
            const int* __restrict__ g_idx, // int32 group indices of shape k
            int num_groups, // number of scale groups per output channel
            int prob_m,     // batch dimension m
            int prob_n,     // output dimension n
            int prob_k,     // reduction dimension k
            int* locks      // extra global storage for barrier synchronization
        ) {}

} // namespace gptq_marlin

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
    torch::Tensor& b_scales, torch::Tensor& g_idx,
    torch::Tensor& perm, torch::Tensor& workspace,
    int64_t num_bits, int64_t size_m, int64_t size_n,
    int64_t size_k, bool is_k_full) {
    TORCH_CHECK_NOT_IMPLEMENTED(false,
        "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
    return torch::empty({ 1, 1 });
}

#else

    // m16n8k16 tensor core mma instruction with fp16 inputs and fp32
    // output/accumulation.
    template <typename scalar_t>
    __device__ inline void mma(const typename ScalarType<scalar_t>::FragA& a_frag,
        const typename ScalarType<scalar_t>::FragB& frag_b,
        typename ScalarType<scalar_t>::FragC& frag_c) {
        const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
        const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
        float* c = reinterpret_cast<float*>(&frag_c);
        if constexpr (std::is_same<scalar_t, half>::value) {
            asm volatile(
                "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
                "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
                : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
                : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
                "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
        }
        else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
            asm volatile(
                "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
                "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
                : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
                : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
                "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
        }
        else {
            STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
        }
    }

    // Instruction for loading a full 16x16 matrix fragment of operand A from shared
    // memory, directly in tensor core layout.
    template <typename scalar_t>
    __device__ inline void ldsm4(typename ScalarType<scalar_t>::FragA& frag_a,
        const void* smem_ptr) {
        uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
        uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
        asm volatile(
            "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
            : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
            : "r"(smem));
    }

    // Lookup-table based 3-input logical operation; explicitly used for
    // dequantization as the compiler does not seem to automatically recognize it in
    // all cases.
    template <int lut> __device__ inline int lop3(int a, int b, int c) {
        int res;
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
            : "=r"(res)
            : "r"(a), "r"(b), "r"(c), "n"(lut));
        return res;
    }

    // Constructs destination register by taking bytes from 2 sources (based on
    // mask)
    template <int start_byte, int mask>
    __device__ inline uint32_t prmt(uint32_t a) {
        uint32_t res;
        asm volatile("prmt.b32 %0, %1, %2, %3;\n"
            : "=r"(res)
            : "r"(a), "n"(start_byte), "n"(mask));
        return res;
    }

    // Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
    // values. We mostly follow the strategy in the link below, with some small
    // changes:
    // - FP16:
    // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
    // - BF16:
    // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385
    template <typename scalar_t>
    __device__ inline typename ScalarType<scalar_t>::FragB dequant_4bit(int q) {
        STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
    }

    template <>
    __device__ inline typename ScalarType<half>::FragB dequant_4bit<half>(int q) {
        const int LO = 0x000f000f;
        const int HI = 0x00f000f0;
        const int EX = 0x64006400;
        // Guarantee that the `(a & b) | c` operations are LOP3s.
        int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
        int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
        // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
        // directly into `SUB` and `ADD`.
        const int SUB = 0x64086408;
        const int MUL = 0x2c002c00;
        const int ADD = 0xd480d480;
        typename ScalarType<half>::FragB frag_b;
        frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
            *reinterpret_cast<const half2*>(&SUB));
        frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
            *reinterpret_cast<const half2*>(&MUL),
            *reinterpret_cast<const half2*>(&ADD));
        return frag_b;
    }

    template <>
    __device__ inline typename ScalarType<nv_bfloat16>::FragB
        dequant_4bit<nv_bfloat16>(int q) {
        static constexpr uint32_t MASK = 0x000f000f;
        static constexpr uint32_t EX = 0x43004300;

        // Guarantee that the `(a & b) | c` operations are LOP3s.

        int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
        q >>= 4;
        int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);

        typename ScalarType<nv_bfloat16>::FragB frag_b;
        static constexpr uint32_t MUL = 0x3F803F80;
        static constexpr uint32_t ADD = 0xC308C308;

        frag_b[0] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&lo),
            *reinterpret_cast<const nv_bfloat162*>(&MUL),
            *reinterpret_cast<const nv_bfloat162*>(&ADD));
        frag_b[1] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&hi),
            *reinterpret_cast<const nv_bfloat162*>(&MUL),
            *reinterpret_cast<const nv_bfloat162*>(&ADD));
        return frag_b;
    }

    // Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or
    // bf16 Reference:
    // - FP16:
    // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
    // - BF16:
    // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175
    template <typename scalar_t>
    __device__ inline typename ScalarType<scalar_t>::FragB dequant_8bit(int q) {
        STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
    }

    template <>
    __device__ inline typename ScalarType<half>::FragB dequant_8bit<half>(int q) {
        static constexpr uint32_t mask_for_elt_01 = 0x5250;
        static constexpr uint32_t mask_for_elt_23 = 0x5351;
        static constexpr uint32_t start_byte_for_fp16 = 0x64646464;

        uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
        uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);

        static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;

        typename ScalarType<half>::FragB frag_b;
        frag_b[0] =
            __hsub2(*reinterpret_cast<half2*>(&lo),
                *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
        frag_b[1] =
            __hsub2(*reinterpret_cast<half2*>(&hi),
                *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
        return frag_b;
    }

    template <>
    __device__ inline typename ScalarType<nv_bfloat16>::FragB
        dequant_8bit<nv_bfloat16>(int q) {
        typename ScalarType<nv_bfloat16>::FragB frag_b;

        float fp32_intermediates[4];
        uint32_t* fp32_intermediates_casted =
            reinterpret_cast<uint32_t*>(fp32_intermediates);

        static constexpr uint32_t fp32_base = 0x4B000000;
        fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
        fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
        fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
        fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);

        fp32_intermediates[0] -= 8388736.f;
        fp32_intermediates[1] -= 8388736.f;
        fp32_intermediates[2] -= 8388736.f;
        fp32_intermediates[3] -= 8388736.f;

        uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(&frag_b);
        bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
            fp32_intermediates_casted[1], 0x7632);
        bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
            fp32_intermediates_casted[3], 0x7632);

        return frag_b;
    }

    // Multiply dequantized values by the corresponding quantization scale; used
    // only for grouped quantization.
    template <typename scalar_t>
    __device__ inline void scale(typename ScalarType<scalar_t>::FragB& frag_b,
        typename ScalarType<scalar_t>::FragS& frag_s,
        int i) {
        using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
        scalar_t2 s = ScalarType<scalar_t>::num2num2(
            reinterpret_cast<scalar_t*>(&frag_s)[i]);
        frag_b[0] = __hmul2(frag_b[0], s);
        frag_b[1] = __hmul2(frag_b[1], s);
    }

    // Same as above, but for act_order (each K is multiplied individually)
    template <typename scalar_t>
    __device__ inline void scale4(typename ScalarType<scalar_t>::FragB& frag_b,
        typename ScalarType<scalar_t>::FragS& frag_s_1,
        typename ScalarType<scalar_t>::FragS& frag_s_2,
        typename ScalarType<scalar_t>::FragS& frag_s_3,
        typename ScalarType<scalar_t>::FragS& frag_s_4,
        int i) {
        using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
        scalar_t2 s_val_1_2;
        s_val_1_2.x = reinterpret_cast<scalar_t*>(&frag_s_1)[i];
        s_val_1_2.y = reinterpret_cast<scalar_t*>(&frag_s_2)[i];

        scalar_t2 s_val_3_4;
        s_val_3_4.x = reinterpret_cast<scalar_t*>(&frag_s_3)[i];
        s_val_3_4.y = reinterpret_cast<scalar_t*>(&frag_s_4)[i];

        frag_b[0] = __hmul2(frag_b[0], s_val_1_2);
        frag_b[1] = __hmul2(frag_b[1], s_val_3_4);
    }

    // Given 2 floats multiply by 2 scales (halves)
    template <typename scalar_t>
    __device__ inline void scale_float(float* c,
        typename ScalarType<scalar_t>::FragS& s) {
        scalar_t* s_ptr = reinterpret_cast<scalar_t*>(&s);
        c[0] = __fmul_rn(c[0], ScalarType<scalar_t>::num2float(s_ptr[0]));
        c[1] = __fmul_rn(c[1], ScalarType<scalar_t>::num2float(s_ptr[1]));
    }

    // Wait until barrier reaches `count`, then lock for current threadblock.
    __device__ inline void barrier_acquire(int* lock, int count) {
        if (threadIdx.x == 0) {
            int state = -1;
            do
                // Guarantee that subsequent writes by this threadblock will be
                // visible globally.
                asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
                    : "=r"(state)
                    : "l"(lock));
            while (state != count);
        }
        __syncthreads();
    }

    // Release barrier and increment visitation count.
    __device__ inline void barrier_release(int* lock, bool reset = false) {
        __syncthreads();
        if (threadIdx.x == 0) {
            if (reset) {
                lock[0] = 0;
                return;
            }
            int val = 1;
            // Make sure that all writes since acquiring this barrier are visible
            // globally, while releasing the barrier.
            asm volatile("fence.acq_rel.gpu;\n");
            asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
                :
            : "l"(lock), "r"(val));
        }
    }

    // For a given "a" of size [M,K] performs a permutation of the K columns based
    // on the given "perm" indices.
    __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
        int const* __restrict__ perm_int_ptr,
        int4* __restrict__ out_int4_ptr, int size_m,
        int size_k, int block_rows) {
        int start_row = block_rows * blockIdx.x;
        int finish_row = start_row + block_rows;
        if (finish_row > size_m) {
            finish_row = size_m;
        }
        int cur_block_rows = finish_row - start_row;

        int row_stride = size_k * sizeof(half) / 16;

        auto permute_row = [&](int row) {
            int iters = size_k / default_threads;
            int rest = size_k % default_threads;

            int offset = row * row_stride;

            half const* a_row_half =
                reinterpret_cast<half const*>(a_int4_ptr + offset);
            half* out_half = reinterpret_cast<half*>(out_int4_ptr + offset);

            int base_k = 0;

            for (int i = 0; i < iters; i++) {
                int cur_k = base_k + threadIdx.x;
                int src_pos = perm_int_ptr[cur_k];

                out_half[cur_k] = a_row_half[src_pos];

                base_k += default_threads;
            }

            if (rest) {
                if (threadIdx.x < rest) {
                    int cur_k = base_k + threadIdx.x;
                    int src_pos = perm_int_ptr[cur_k];

                    out_half[cur_k] = a_row_half[src_pos];
                }
            }
            };

        for (int i = 0; i < cur_block_rows; i++) {
            int cur_row = start_row + i;
            if (cur_row < size_m) {
                permute_row(cur_row);
            }
        }
    }

    template <typename scalar_t,         // compute dtype, half or nv_float16
        const int num_bits,        // number of bits used for weights
        const int threads,         // number of threads in a threadblock
        const int thread_m_blocks, // number of 16x16 blocks in the m
        // dimension (batchsize) of the
        // threadblock
        const int thread_n_blocks, // same for n dimension (output)
        const int thread_k_blocks, // same for k dimension (reduction)
        const int stages, // number of stages for the async global->shared
        // fetch pipeline
        const bool has_act_order,   // whether act_order is enabled
        const int group_blocks = -1 // number of consecutive 16x16 blocks
        // with a separate quantization scale
    >
    __device__ void
        Marlin(const int4* __restrict__ A, // fp16 input matrix of shape mxk
            const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn
            int4* __restrict__ C,       // fp16 output buffer of shape mxn
            const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape
            // (k/groupsize)xn
            const int* __restrict__ g_idx, // int32 group indices of shape k
            int num_groups, // number of scale groups per output channel
            int prob_m,     // batch dimension m, should be divisible by (16 * thread_m_blocks) if bigger than that
            int prob_n,     // output dimension n
            int prob_k,     // reduction dimension k
            int* locks      // extra global storage for barrier synchronization
        ) {
        // Each threadblock processes one "stripe" of the B matrix with (roughly) the
        // same size, which might involve multiple column "slices" (of width 16 *
        // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
        // example:
        //   0 1 3
        //   0 2 3
        //   1 2 4
        // While this kind of partitioning makes things somewhat more complicated, it
        // ensures good utilization of all SMs for many kinds of shape and GPU
        // configurations, while requiring as few slow global cross-threadblock
        // reductions as possible.
        using Dtype = ScalarType<scalar_t>;
        using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
        using FragA = typename ScalarType<scalar_t>::FragA;
        using FragB = typename ScalarType<scalar_t>::FragB;
        using FragC = typename ScalarType<scalar_t>::FragC;
        using FragS = typename ScalarType<scalar_t>::FragS;

        constexpr int pack_factor = 32 / num_bits;

        // int prob_m = *prob_m_ptr;
        // const int thread_m_blocks = min(div_ceil(prob_m, 16), template_thread_m_blocks);
        // constexpr int thread_m_blocks = template_thread_m_blocks;

        // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
        // better partitioning with less reductions
        int parallel = 1;
        if (prob_m > 16 * thread_m_blocks) {
            parallel = prob_m / (16 * thread_m_blocks);
            prob_m = 16 * thread_m_blocks;
        }

        int k_tiles = prob_k / 16 / thread_k_blocks;
        int n_tiles = prob_n / 16 / thread_n_blocks;
        int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x);

        if constexpr (!has_act_order && group_blocks != -1) {
            if (group_blocks >= thread_k_blocks) {
                // Ensure that the number of tiles in each stripe is a multiple of the
                // groupsize; this avoids an annoying special case where a stripe starts
                // in the middle of group.
                iters = (group_blocks / thread_k_blocks) *
                    div_ceil(iters, (group_blocks / thread_k_blocks));
            }
        }

        int slice_row = (iters * blockIdx.x) % k_tiles;
        int slice_col_par = (iters * blockIdx.x) / k_tiles;
        int slice_col = slice_col_par;
        int slice_iters;  // number of threadblock tiles in the current slice
        int slice_count =
            0;          // total number of active threadblocks in the current slice
        int slice_idx;  // index of threadblock in current slice; numbered bottom to
        // top

    // We can easily implement parallel problem execution by just remapping
    // indices and advancing global pointers
        if (slice_col_par >= n_tiles) {
            A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
            C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
            locks += (slice_col_par / n_tiles) * n_tiles;
            slice_col = slice_col_par % n_tiles;
        }

        // Compute all information about the current slice which is required for
        // synchronization.
        auto init_slice = [&]() {
            slice_iters =
                iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
            if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0;
            if (slice_iters == 0) return;
            if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row;
            slice_count = 1;
            slice_idx = 0;
            int col_first = iters * div_ceil(k_tiles * slice_col_par, iters);
            if (col_first <= k_tiles * (slice_col_par + 1)) {
                int col_off = col_first - k_tiles * slice_col_par;
                slice_count = div_ceil(k_tiles - col_off, iters);
                if (col_off > 0) slice_count++;
                int delta_first = iters * blockIdx.x - col_first;
                if (delta_first < 0 || (col_off == 0 && delta_first == 0))
                    slice_idx = slice_count - 1;
                else {
                    slice_idx = slice_count - 1 - delta_first / iters;
                    if (col_off > 0) slice_idx--;
                }
            }
            if (slice_col == n_tiles) {
                A += 16 * thread_m_blocks * prob_k / 8;
                C += 16 * thread_m_blocks * prob_n / 8;
                locks += n_tiles;
                slice_col = 0;
            }
            };
        init_slice();

        // A sizes/strides

        // stride of the A matrix in global memory
        int a_gl_stride = prob_k / 8;
        // stride of an A matrix tile in shared memory
        constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
        // delta between subsequent A tiles in global memory
        constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8;
        // between subsequent accesses within a tile
        int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
        // between shared memory writes
        constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
        // between shared memory tile reads
        constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4));
        // within a shared memory tile
        constexpr int a_sh_rd_delta_i = a_sh_stride * 16;
        // overall size of a tile
        constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks);
        // number of shared write iterations for a tile
        constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta);

        // B sizes/strides
        int b_gl_stride = 16 * prob_n / (pack_factor * 4);
        constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
        constexpr int b_thread_vecs = num_bits == 4 ? 1 : 2;
        constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;

        int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
        int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
        constexpr int b_sh_wr_delta = threads * b_thread_vecs;
        constexpr int b_sh_rd_delta = threads * b_thread_vecs;
        constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
        constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;

        // Scale sizes/strides without act_order
        int s_gl_stride = prob_n / 8;
        constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
        constexpr int s_tb_groups =
            !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks
            ? thread_k_blocks / group_blocks
            : 1;
        constexpr int s_sh_stage = s_tb_groups * s_sh_stride;
        int s_gl_rd_delta = s_gl_stride;

        // Scale size/strides with act_order
        constexpr int tb_k = 16 * thread_k_blocks;
        constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0;
        // constexpr int act_s_row_stride      = 1;
        // int           act_s_col_stride      = act_s_row_stride * num_groups;
        int act_s_col_stride = 1;
        int act_s_col_warp_stride = act_s_col_stride * 8;
        int tb_n_warps = thread_n_blocks / 4;
        int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;

        // Global A read index of current thread.
        int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
            (threadIdx.x % a_gl_rd_delta_o);
        a_gl_rd += a_gl_rd_delta_o * slice_row;
        // Shared write index of current thread.
        int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) +
            (threadIdx.x % a_gl_rd_delta_o);
        // Shared read index.
        int a_sh_rd =
            a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
        a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));

        int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
            (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
        b_gl_rd += b_sh_stride * slice_col;
        b_gl_rd += b_gl_rd_delta_o * slice_row;
        int b_sh_wr = threadIdx.x * b_thread_vecs;
        int b_sh_rd = threadIdx.x * b_thread_vecs;

        // For act_order
        constexpr int k_iter_size = tb_k / b_sh_wr_iters;
        int slice_k_start = tb_k * slice_row;
        int slice_k_finish = slice_k_start + tb_k * slice_iters;
        int slice_k_start_shared_fetch = slice_k_start;
        int slice_n_offset = act_s_col_tb_stride * slice_col;

        // No act_order
        int s_gl_rd;
        if constexpr (!has_act_order) {
            if constexpr (group_blocks == -1) {
                s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
            }
            else {
                s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
                    s_sh_stride * slice_col + threadIdx.x;
            }
        }
        int s_sh_wr = threadIdx.x;
        bool s_sh_wr_pred = threadIdx.x < s_sh_stride;

        // We use a different scale layout for grouped and column-wise quantization as
        // we scale a `half2` tile in column-major layout in the former and in
        // row-major in the latter case.
        int s_sh_rd;
        if constexpr (group_blocks != -1)
            s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
            (threadIdx.x % 32) / 4;
        else
            s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
            (threadIdx.x % 32) % 4;

        // Precompute which thread should not read memory in which iterations; this is
        // needed if there are more threads than required for a certain tilesize or
        // when the batchsize is not a multiple of 16.
        bool a_sh_wr_pred[a_sh_wr_iters];
#pragma unroll
        for (int i = 0; i < a_sh_wr_iters; i++) {
            a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m;
        }

        // To ensure that writing and reading A tiles to/from shared memory, the
        // latter in fragment format, is fully bank conflict free, we need to use a
        // rather fancy XOR-based layout. The key here is that neither reads nor
        // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
        // same shared memory banks. Further, it seems (based on NSight-Compute) that
        // each warp must also write a consecutive memory segment?
        auto transform_a = [&](int i) {
            int row = i / a_gl_rd_delta_o;
            return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row;
            };
        // Since the computation of this remapping is non-trivial and, due to our main
        // loop unrolls, all shared memory accesses are static, we simply precompute
        // both transformed reads and writes.
        int a_sh_wr_trans[a_sh_wr_iters];
#pragma unroll
        for (int i = 0; i < a_sh_wr_iters; i++) {
            a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr);
        }
        int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks];
#pragma unroll
        for (int i = 0; i < b_sh_wr_iters; i++) {
#pragma unroll
            for (int j = 0; j < thread_m_blocks; j++)
            {
                a_sh_rd_trans[i][j] =
                    transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
            }
        }

        // Since B-accesses have non-constant stride they have to be computed at
        // runtime; we break dependencies between subsequent accesses with a tile by
        // maintining multiple pointers (we have enough registers), a tiny
        // optimization.
        const int4* B_ptr[b_sh_wr_iters];
#pragma unroll
        for (int i = 0; i < b_sh_wr_iters; i++)
            B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;

        extern __shared__ int4 sh[];
        // Shared memory storage for global fetch pipelines.
        int4* sh_a = sh;
        int4* sh_b = sh_a + (stages * a_sh_stage);
        int4* sh_g_idx = sh_b + (stages * b_sh_stage);
        int4* sh_s = sh_g_idx + (stages * g_idx_stage);

        // Register storage for double buffer of shared memory reads.
        FragA frag_a[2][thread_m_blocks];
        I4 frag_b_quant[2][b_thread_vecs];
        FragC frag_c[thread_m_blocks][4][2];
        FragS frag_s[2][4];         // No act-order
        FragS act_frag_s[2][4][4];  // For act-order

        // Zero accumulators.
        auto zero_accums = [&]() {
#pragma unroll
            for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
            {
                reinterpret_cast<float*>(frag_c)[i] = 0;
            }
            };

        int sh_first_group_id = -1;
        int sh_num_groups = -1;
        constexpr int sh_max_num_groups = 32;

        auto fetch_scales_to_shared = [&](bool is_async, int first_group_id,
            int last_group_id) {
                sh_first_group_id = first_group_id;
                sh_num_groups = last_group_id - first_group_id + 1;

                if (sh_num_groups < sh_max_num_groups) {
                    sh_num_groups = sh_max_num_groups;
                }

                if (sh_first_group_id + sh_num_groups > num_groups) {
                    sh_num_groups = num_groups - sh_first_group_id;
                }

                int row_offset = first_group_id * s_gl_stride;

                if (is_async) {
                    for (int i = 0; i < sh_num_groups; i++) {
                        if (threadIdx.x < s_sh_stride) {
                            cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x],
                                &scales_ptr[row_offset + (i * s_gl_stride) +
                                slice_n_offset + threadIdx.x]);
                        }
                    }
                }
                else {
                    for (int i = 0; i < sh_num_groups; i++) {
                        if (threadIdx.x < s_sh_stride) {
                            sh_s[(i * s_sh_stride) + threadIdx.x] =
                                scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset +
                                threadIdx.x];
                        }
                    }
                }
            };
        // Asynchronously fetch the next A, B and s tile from global to the next
        // shared memory pipeline location.
        auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
            if (pred) {
                int4* sh_a_stage = sh_a + a_sh_stage * pipe;
#pragma unroll
                for (int i = 0; i < a_sh_wr_iters; i++) {
                    cp_async4_pred(
                        &sh_a_stage[a_sh_wr_trans[i]],
                        &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off],
                        a_sh_wr_pred[i]);
                }
                int4* sh_b_stage = sh_b + b_sh_stage * pipe;
#pragma unroll
                for (int i = 0; i < b_sh_wr_iters; i++) {
#pragma unroll
                    for (int j = 0; j < b_thread_vecs; j++) {
                        cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
                    }

                    B_ptr[i] += b_gl_rd_delta_o;
                }

                if constexpr (has_act_order) {
                    // Fetch g_idx thread-block portion
                    int full_pipe = a_off;
                    int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe;
                    if (cur_k < prob_k && cur_k < slice_k_finish) {
                        int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;

                        int4 const* cur_g_idx_stage_ptr =
                            reinterpret_cast<int4 const*>(&g_idx[cur_k]);

                        if (threadIdx.x < g_idx_stage) {
                            cp_async4_pred(&sh_g_idx_stage[threadIdx.x],
                                &cur_g_idx_stage_ptr[threadIdx.x]);
                        }
                    }
                }
                else {
                    if constexpr (group_blocks != -1) {
                        int4* sh_s_stage = sh_s + s_sh_stage * pipe;

                        if constexpr (group_blocks >= thread_k_blocks) {
                            // Only fetch scales if this tile starts a new group
                            if (pipe % (group_blocks / thread_k_blocks) == 0) {
                                if (s_sh_wr_pred) {
                                    cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
                                }
                                s_gl_rd += s_gl_rd_delta;
                            }
                        }
                        else {
                            for (int i = 0; i < s_tb_groups; i++) {
                                if (s_sh_wr_pred) {
                                    cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr],
                                        &scales_ptr[s_gl_rd]);
                                }
                                s_gl_rd += s_gl_rd_delta;
                            }
                        }
                    }
                }
            }
            // Insert a fence even when we are winding down the pipeline to ensure that
            // waiting is also correct at this point.
            cp_async_fence();
            };

        // Wait until the next thread tile has been loaded to shared memory.
        auto wait_for_stage = [&]() {
            // We only have `stages - 2` active fetches since we are double buffering
            // and can only issue the next fetch when it is guaranteed that the previous
            // shared memory load is fully complete (as it may otherwise be
            // overwritten).
            cp_async_wait<stages - 2>();
            __syncthreads();
            };

        // Load the next sub-tile from the current location in the shared memory pipe
        // into the current register buffer.
        auto fetch_to_registers = [&](int k, int pipe) {
            int4* sh_a_stage = sh_a + a_sh_stage * pipe;
#pragma unroll
            for (int i = 0; i < thread_m_blocks; i++)
            {
                ldsm4<scalar_t>(frag_a[k % 2][i],
                    &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
            }

            int4* sh_b_stage = sh_b + b_sh_stage * pipe;

#pragma unroll
            for (int i = 0; i < b_thread_vecs; i++) {
                frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(
                    &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
            }
            };

        bool is_same_group[stages];
        int same_group_id[stages];

        auto init_same_group = [&](int pipe) {
            if constexpr (!has_act_order) {
                is_same_group[pipe] = false;
                same_group_id[pipe] = 0;
                return;
            }

            int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
            int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);

            int group_id_1 = sh_g_idx_int_ptr[0];
            int group_id_2 = sh_g_idx_int_ptr[tb_k - 1];

            is_same_group[pipe] = group_id_1 == group_id_2;
            same_group_id[pipe] = group_id_1;
            };

        auto fetch_scales_to_registers = [&](int k, int full_pipe) {
            int pipe = full_pipe % stages;

            if constexpr (!has_act_order) {
                // No act-order case
                if constexpr (group_blocks != -1) {
                    if constexpr (group_blocks >= thread_k_blocks) {
                        int4* sh_s_stage =
                            sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
                                (pipe / (group_blocks / thread_k_blocks)));
                        reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
                    }
                    else {
                        int warp_id = threadIdx.x / 32;
                        int n_warps = thread_n_blocks / 4;

                        int warp_row = warp_id / n_warps;

                        int cur_k = warp_row * 16;
                        cur_k += k_iter_size * (k % b_sh_wr_iters);

                        int k_blocks = cur_k / 16;
                        int cur_group_id = k_blocks / group_blocks;

                        int4* sh_s_stage = sh_s + s_sh_stage * pipe;

                        reinterpret_cast<int4*>(&frag_s[k % 2])[0] =
                            sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
                    }
                }

                return;
            }

            // Act-order case

            // Determine K of the "current" thread-block
            int cur_k = slice_k_start + tb_k * full_pipe;
            if (cur_k >= prob_k || cur_k >= slice_k_finish) {
                return;
            }

            // Reset (to current thread-block) since we read g_idx portion from the
            // shared memory
            cur_k = 0;

            // Progress to current iteration
            cur_k += k_iter_size * (k % b_sh_wr_iters);

            // Determine "position" inside the thread-block (based on warp and
            // thread-id)
            int warp_id = threadIdx.x / 32;
            int n_warps =
                thread_n_blocks / 4;  // Each warp processes 4 16-size tiles over N

            int warp_row = warp_id / n_warps;
            int warp_col = warp_id % n_warps;

            cur_k += warp_row * 16;

            int th_id = threadIdx.x % 32;
            cur_k += (th_id % 4) * 2;  // Due to tensor-core layout for fp16 B matrix

            int s_col_shift =
                /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) +
                (th_id / 4) * act_s_col_stride;

            if (is_same_group[pipe]) {
                if (k % 2 == 0) {
                    *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
                        sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride +
                        s_col_shift];
                }
                else {
                    *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
                        *(reinterpret_cast<int4*>(&(act_frag_s[(k - 1) % 2][0][0])));
                }

                for (int i = 1; i < 4; i++) {
                    *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
                        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0])));
                }
                return;
            }

            int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
            int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);

            constexpr int k_frag_offsets[4] = { 0, 1, 8,
                                               9 };  // Tensor core offsets per thread

#pragma unroll
            for (int i = 0; i < 4; i++) {
                int actual_k = cur_k + k_frag_offsets[i];

                int group_id = sh_g_idx_int_ptr[actual_k];
                int rel_group_id = group_id - sh_first_group_id;

                *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
                    sh_s[rel_group_id * s_sh_stride + s_col_shift];
            }
            };

        // Execute the actual tensor core matmul of a sub-tile.
        auto matmul = [&](int k) {
            // We have the m dimension as the inner loop in order to encourage overlapping
            // dequantization and matmul operations.
#pragma unroll
            for (int j = 0; j < 4; j++) {
                FragB frag_b0;
                FragB frag_b1;
                if constexpr (num_bits == 4) {
                    int b_quant = frag_b_quant[k % 2][0][j];
                    int b_quant_shift = b_quant >> 8;

                    frag_b0 = dequant_4bit<scalar_t>(b_quant);
                    frag_b1 = dequant_4bit<scalar_t>(b_quant_shift);

                }
                else {
                    int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
                    int b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
                    int b_quant_1 = frag_b_quant_ptr[j * 2 + 1];

                    frag_b0 = dequant_8bit<scalar_t>(b_quant_0);
                    frag_b1 = dequant_8bit<scalar_t>(b_quant_1);
                }

                // Apply scale to frag_b0
                if constexpr (has_act_order) {
                    scale4<scalar_t>(frag_b0, act_frag_s[k % 2][0][j],
                        act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j],
                        act_frag_s[k % 2][3][j], 0);
                }
                else {
                    if constexpr (group_blocks != -1) {
                        scale<scalar_t>(frag_b0, frag_s[k % 2][j], 0);
                    }
                }

                // Apply scale to frag_b1
                if constexpr (has_act_order) {
                    scale4<scalar_t>(frag_b1, act_frag_s[k % 2][0][j],
                        act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j],
                        act_frag_s[k % 2][3][j], 1);

                }
                else {
                    if constexpr (group_blocks != -1) {
                        scale<scalar_t>(frag_b1, frag_s[k % 2][j], 1);
                    }
                }

#pragma unroll
                for (int i = 0; i < thread_m_blocks; i++) {
                    mma<scalar_t>(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]);
                    mma<scalar_t>(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]);
                }
            }
            };

        // Since we slice across the k dimension of a tile in order to increase the
        // number of warps while keeping the n dimension of a tile reasonable, we have
        // multiple warps that accumulate their partial sums of the same output
        // location; which we have to reduce over in the end. We do in shared memory.
        auto thread_block_reduce = [&]() {
            constexpr int red_off = threads / b_sh_stride_threads / 2;
            if (red_off >= 1) {
                int red_idx = threadIdx.x / b_sh_stride_threads;
                constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
                constexpr int red_sh_delta = b_sh_stride_threads;
                int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
                    (threadIdx.x % b_sh_stride_threads);

                // Parallel logarithmic shared memory reduction. We make sure to avoid any
                // unnecessary read or write iterations, e.g., for two warps we write only
                // once by warp 1 and read only once by warp 0.

#pragma unroll
                for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
#pragma unroll
                    for (int i = red_off; i > 0; i /= 2) {
                        if (i <= red_idx && red_idx < 2 * i) {
#pragma unroll
                            for (int j = 0; j < 4 * 2; j++) {
                                int red_sh_wr =
                                    red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
                                if (i < red_off) {
                                    float* c_rd =
                                        reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
                                    float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
#pragma unroll
                                    for (int k = 0; k < 4; k++)
                                        reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
                                        c_rd[k] + c_wr[k];
                                }
                                sh[red_sh_wr] =
                                    reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
                            }
                        }
                        __syncthreads();
                    }
                    if (red_idx == 0) {
#pragma unroll
                        for (int i = 0; i < 4 * 2; i++) {
                            float* c_rd =
                                reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
#pragma unroll
                            for (int j = 0; j < 4; j++)
                                reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
                                c_rd[j];
                        }
                    }
                    __syncthreads();
                }
            }
            };

        // Since multiple threadblocks may process parts of the same column slice, we
        // finally have to globally reduce over the results. As the striped
        // partitioning minimizes the number of such reductions and our outputs are
        // usually rather small, we perform this reduction serially in L2 cache.
        auto global_reduce = [&](bool first = false, bool last = false) {
            // We are very careful here to reduce directly in the output buffer to
            // maximize L2 cache utilization in this step. To do this, we write out
            // results in FP16 (but still reduce with FP32 compute).
            constexpr int active_threads = 32 * thread_n_blocks / 4;
            if (threadIdx.x < active_threads) {
                int c_gl_stride = prob_n / 8;
                int c_gl_wr_delta_o = 8 * c_gl_stride;
                int c_gl_wr_delta_i = 4 * (active_threads / 32);
                int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) +
                    4 * (threadIdx.x / 32) + threadIdx.x % 4;
                c_gl_wr += (2 * thread_n_blocks) * slice_col;
                constexpr int c_sh_wr_delta = active_threads;
                int c_sh_wr = threadIdx.x;

                int row = (threadIdx.x % 32) / 4;

                if (!first) {
                    // Interestingly, doing direct global accesses here really seems to mess up
                    // the compiler and lead to slowdowns, hence we also use async-copies even
                    // though these fetches are not actually asynchronous.
#pragma unroll
                    for (int i = 0; i < thread_m_blocks * 4; i++) {
                        cp_async4_pred(
                            &sh[c_sh_wr + c_sh_wr_delta * i],
                            &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
                            c_gl_wr_delta_i * (i % 2)],
                            i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
                    }
                    cp_async_fence();
                    cp_async_wait<0>();
                }

#pragma unroll
                for (int i = 0; i < thread_m_blocks * 4; i++) {
                    if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
                        if (!first) {
                            int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
#pragma unroll
                            for (int j = 0; j < 2 * 4; j++) {
                                reinterpret_cast<float*>(
                                    &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] +=
                                    Dtype::num2float(reinterpret_cast<scalar_t*>(&c_red)[j]);
                            }
                        }
                        if (!last) {
                            int4 c;
#pragma unroll
                            for (int j = 0; j < 2 * 4; j++) {
                                reinterpret_cast<scalar_t*>(&c)[j] =
                                    Dtype::float2num(reinterpret_cast<float*>(
                                        &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]);
                            }
                            C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] =
                                c;
                        }
                    }
                }
            }
            };

        // Write out the reduce final result in the correct layout. We only actually
        // reshuffle matrix fragments in this step, the reduction above is performed
        // in fragment layout.
        auto write_result = [&]() {
            int c_gl_stride = prob_n / 8;
            constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
            int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
            constexpr int c_sh_rd_delta =
                c_sh_stride * (threads / (2 * thread_n_blocks));

            int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) +
                (threadIdx.x % (2 * thread_n_blocks));
            c_gl_wr += (2 * thread_n_blocks) * slice_col;
            int c_sh_wr =
                (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4;
            c_sh_wr += 32 * (threadIdx.x / 32);
            int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) +
                (threadIdx.x % (2 * thread_n_blocks));

            int c_gl_wr_end = c_gl_stride * prob_m;

            // We first reorder in shared memory to guarantee the most efficient final
            // global write patterns
            auto write = [&](int idx, float c0, float c1, FragS& s) {
                scalar_t2 res =
                    Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1));

                // For per-column quantization we finally apply the scale here (only for
                // 4-bit)
                if constexpr (!has_act_order && group_blocks == -1 && num_bits == 4) {
                    res = __hmul2(res, s[0]);
                }

                ((scalar_t2*)sh)[idx] = res;
                };

            if (threadIdx.x / 32 < thread_n_blocks / 4) {
#pragma unroll
                for (int i = 0; i < thread_m_blocks; i++) {
#pragma unroll
                    for (int j = 0; j < 4; j++) {
                        int wr = c_sh_wr + 8 * j;
                        write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
                            frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
                        write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
                            frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
                        write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
                            frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
                        write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
                            frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
                    }
                    c_sh_wr += 16 * (4 * c_sh_stride);
                }
            }
            __syncthreads();

#pragma unroll
            for (int i = 0;
                i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
                i++) {
                if (c_gl_wr < c_gl_wr_end) {
                    C[c_gl_wr] = sh[c_sh_rd];
                    c_gl_wr += c_gl_wr_delta;
                    c_sh_rd += c_sh_rd_delta;
                }
            }
            };

        // Start global fetch and register load pipelines.
        auto start_pipes = [&]() {

#pragma unroll
            for (int i = 0; i < stages - 1; i++) {
                if (has_act_order && i == 0) {
                    int last_g_idx = slice_k_start + stages * tb_k * 2;
                    if (last_g_idx >= prob_k) {
                        last_g_idx = prob_k - 1;
                    }
                    fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]);
                }
                fetch_to_shared(i, i, i < slice_iters);
            }

            zero_accums();
            wait_for_stage();
            init_same_group(0);
            fetch_to_registers(0, 0);
            fetch_scales_to_registers(0, 0);
            a_gl_rd += a_gl_rd_delta_o * (stages - 1);
            slice_k_start_shared_fetch += tb_k * (stages - 1);
            };
        if (slice_iters) {
            start_pipes();
        }

        // Main loop.
        while (slice_iters) {
            // We unroll over both the global fetch and the register load pipeline to
            // ensure all shared memory accesses are static. Note that both pipelines
            // have even length meaning that the next iteration will always start at
            // index 0.

#pragma unroll
            for (int pipe = 0; pipe < stages;) {
#pragma unroll
                for (int k = 0; k < b_sh_wr_iters; k++) {
                    fetch_to_registers(k + 1, pipe % stages);
                    fetch_scales_to_registers(k + 1, pipe);
                    if (k == b_sh_wr_iters - 2) {
                        fetch_to_shared((pipe + stages - 1) % stages, pipe,
                            slice_iters >= stages);
                        pipe++;
                        wait_for_stage();
                        init_same_group(pipe % stages);
                    }
                    matmul(k);
                }
                slice_iters--;
                if (slice_iters == 0) {
                    break;
                }
            }

            a_gl_rd += a_gl_rd_delta_o * stages;
            slice_k_start += tb_k * stages;
            slice_k_start_shared_fetch += tb_k * stages;

            if constexpr (has_act_order) {
                int first_group_id = g_idx[slice_k_start];
                int last_g_idx = slice_k_start + stages * tb_k * 2;
                if (last_g_idx >= prob_k) {
                    last_g_idx = prob_k - 1;
                }
                int last_group_id = g_idx[last_g_idx];
                if (last_group_id >= sh_first_group_id + sh_num_groups) {
                    fetch_scales_to_shared(false, first_group_id, last_group_id);
                    __syncthreads();
                }
            }

            // Process results and, if necessary, proceed to the next column slice.
            // While this pattern may not be the most readable, other ways of writing
            // the loop seemed to noticeably worse performance after compilation.
            if (slice_iters == 0) {
                cp_async_wait<0>();
                bool last = slice_idx == slice_count - 1;
                // For per-column scales, we only fetch them here in the final step before
                // write-out
                if constexpr (!has_act_order && group_blocks == -1) {
                    if constexpr (num_bits == 8) {
                        if (s_sh_wr_pred) {
                            cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
                        }
                        cp_async_fence();
                    }
                    else {
                        if (last) {
                            if (s_sh_wr_pred) {
                                cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
                            }
                            cp_async_fence();
                        }
                    }
                }

                thread_block_reduce();
                if constexpr (!has_act_order && group_blocks == -1) {
                    if constexpr (num_bits == 8) {
                        cp_async_wait<0>();
                        __syncthreads();
                        if (threadIdx.x / 32 < thread_n_blocks / 4) {
                            reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
                            reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
                        }

                    }
                    else {
                        if (last) {
                            cp_async_wait<0>();
                            __syncthreads();
                            if (threadIdx.x / 32 < thread_n_blocks / 4) {
                                reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
                                reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
                            }
                        }
                    }
                }

                // For 8-bit channelwise, we apply the scale before the global reduction
                // that converts the fp32 results to fp16 (so that we avoid possible
                // overflow in fp16)
                if constexpr (!has_act_order && group_blocks == -1 && num_bits == 8) {
                    if (threadIdx.x / 32 < thread_n_blocks / 4) {
#pragma unroll
                        for (int i = 0; i < thread_m_blocks; i++) {
#pragma unroll
                            for (int j = 0; j < 4; j++) {
                                scale_float<scalar_t>(
                                    reinterpret_cast<float*>(&frag_c[i][j][0][0]),
                                    frag_s[j / 2][2 * (j % 2) + 0]);
                                scale_float<scalar_t>(
                                    reinterpret_cast<float*>(&frag_c[i][j][0][2]),
                                    frag_s[j / 2][2 * (j % 2) + 0]);

                                scale_float<scalar_t>(
                                    reinterpret_cast<float*>(&frag_c[i][j][1][0]),
                                    frag_s[j / 2][2 * (j % 2) + 1]);
                                scale_float<scalar_t>(
                                    reinterpret_cast<float*>(&frag_c[i][j][1][2]),
                                    frag_s[j / 2][2 * (j % 2) + 1]);
                            }
                        }
                    }
                }

                if (slice_count > 1) {  // only globally reduce if there is more than one
                    // block in a slice
                    barrier_acquire(&locks[slice_col], slice_idx);
                    global_reduce(slice_idx == 0, last);
                    barrier_release(&locks[slice_col], last);
                }
                if (last)  // only the last block in a slice actually writes the result
                    write_result();
                slice_row = 0;
                slice_col_par++;
                slice_col++;
                init_slice();
                if (slice_iters) {
                    a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
                        (threadIdx.x % a_gl_rd_delta_o);
#pragma unroll
                    for (int i = 0; i < b_sh_wr_iters; i++)
                        B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
                    if (slice_col == 0) {
#pragma unroll
                        for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
                    }

                    // Update slice k/n for scales loading
                    if constexpr (has_act_order) {
                        slice_k_start = tb_k * slice_row;
                        slice_k_finish = slice_k_start + tb_k * slice_iters;
                        slice_k_start_shared_fetch = slice_k_start;
                        slice_n_offset = act_s_col_tb_stride * slice_col;

                    }
                    else {
                        s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
                    }

                    start_pipes();
                }
            }
        }
    }

    template <typename scalar_t,         // compute dtype, half or nv_float16
        const int num_bits,        // number of bits used for weights
        const int threads,         // number of threads in a threadblock
        const int template_thread_m_blocks, // number of 16x16 blocks in the m
        // dimension (batchsize) of the
        // threadblock
        const int thread_n_blocks, // same for n dimension (output)
        const int thread_k_blocks, // same for k dimension (reduction)
        const int stages, // number of stages for the async global->shared
        // fetch pipeline
        const bool has_act_order,   // whether act_order is enabled
        const int group_blocks = -1 // number of consecutive 16x16 blocks
        // with a separate quantization scale
    >
    __global__ void
        Marlin_wrapper(const int4* __restrict__ A, // fp16 input matrix of shape mxk
            const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn
            int4* __restrict__ C,       // fp16 output buffer of shape mxn
            const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape
            // (k/groupsize)xn
            const int* __restrict__ g_idx, // int32 group indices of shape k
            int num_groups, // number of scale groups per output channel
            const int* __restrict__ prob_m_ptr,     // batch dimension m
            int prob_n,     // output dimension n
            int prob_k,     // reduction dimension k
            int* locks      // extra global storage for barrier synchronization
        ) {
        int prob_m = *prob_m_ptr;
        prob_m = min(prob_m, 1024);
        const int thread_m_blocks = min(div_ceil(prob_m, 16), template_thread_m_blocks);
        if(prob_m > 16 * thread_m_blocks)
            prob_m = (16 * thread_m_blocks) * div_ceil(prob_m, (16 * thread_m_blocks));
        /*if (blockIdx.x == 0 && threadIdx.x == 0)
            printf("marlin prob_m %d\n", prob_m);*/
        if (thread_m_blocks == 1) {
            Marlin<scalar_t, num_bits, threads, 1,
                thread_n_blocks, thread_k_blocks, stages, has_act_order,
                group_blocks>(
                    A, B, C, scales_ptr, g_idx, num_groups, prob_m, prob_n,
                    prob_k, locks);
        }
        else if (thread_m_blocks == 2) {
            Marlin<scalar_t, num_bits, threads, 2,
                thread_n_blocks, thread_k_blocks, stages, has_act_order,
                group_blocks>(
                    A, B, C, scales_ptr, g_idx, num_groups, prob_m, prob_n,
                    prob_k, locks);
        }
        else if (thread_m_blocks == 3) {
            Marlin<scalar_t, num_bits, threads, 3,
                thread_n_blocks, thread_k_blocks, stages, has_act_order,
                group_blocks>(
                    A, B, C, scales_ptr, g_idx, num_groups, prob_m, prob_n,
                    prob_k, locks);
        }
        else if (thread_m_blocks == 4) {
            Marlin<scalar_t, num_bits, threads, 4,
                thread_n_blocks, thread_k_blocks, stages, has_act_order,
                group_blocks>(
                    A, B, C, scales_ptr, g_idx, num_groups, prob_m, prob_n,
                    prob_k, locks);
        }
    }

#define __CALL_IF(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
                  HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS)                    \
    else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS &&     \
             thread_n_blocks == THREAD_N_BLOCKS &&                             \
             thread_k_blocks == THREAD_K_BLOCKS &&                             \
             has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS && \
             num_threads == NUM_THREADS) {                                     \
        cudaFuncSetAttribute(                                                  \
            Marlin_wrapper<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,           \
                   THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages,              \
                   HAS_ACT_ORDER, GROUP_BLOCKS>,                               \
            cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);      \
        Marlin_wrapper<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,               \
               THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER,   \
               GROUP_BLOCKS><<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
            A_ptr, B_ptr, C_ptr, s_ptr, g_idx_ptr, num_groups, prob_m_ptr, prob_n, \
            prob_k, locks);                                                    \
    }

    typedef struct {
        int thread_k;
        int thread_n;
        int num_threads;
    } thread_config_t;

    typedef struct {
        int max_m_blocks;
        thread_config_t tb_cfg;
    } exec_config_t;

    thread_config_t small_batch_thread_configs[] = {
        // Ordered by priority

        // thread_k, thread_n, num_threads
        {128, 128, 256},
        {64, 128, 128},
        {128, 64, 128},
    };

    thread_config_t large_batch_thread_configs[] = {
        // Ordered by priority

        // thread_k, thread_n, num_threads
        {64, 256, 256},
        // {128, 128, 256},
        {64, 128, 128},
        {128, 64, 128},

    };

    int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
        int prob_n, int prob_k, int num_bits, int group_size,
        bool has_act_order, bool is_k_full) {
        bool cache_scales_chunk = has_act_order && !is_k_full;

        int tb_n = th_config.thread_n;
        int tb_k = th_config.thread_k;

        // Get max scale groups per thread-block
        int tb_groups;
        if (group_size == -1) {
            tb_groups = 1;
        }
        else if (group_size == 0) {
            tb_groups = div_ceil(tb_k, 32); // Worst case is 32 group size
        }
        else {
            tb_groups = div_ceil(tb_k, group_size);
        }

        if (cache_scales_chunk) {
            int load_groups =
                tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K
            load_groups = max(load_groups, 32); // We load at least 32 scale groups
            return load_groups * tb_n * 2;

        }
        else {
            int tb_scales = tb_groups * tb_n * 2;

            return tb_scales * pipe_stages;
        }
    }

    bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
        int prob_m, int prob_n, int prob_k, int num_bits,
        int scales_cache_size, int max_shared_mem) {
        int pack_factor = 32 / num_bits;

        // Get B size
        int tb_k = th_config.thread_k;
        int tb_n = th_config.thread_n;

        int b_size = (tb_k * tb_n / pack_factor) * 4;

        // Get A size
        int m_blocks = div_ceil(prob_m, 16);
        int tb_max_m = 16;

        // zbx: too ugly
        // origin
        /*while (true) {
          if (m_blocks >= max_m_blocks) {
            tb_max_m *= max_m_blocks;
            break;
          }

          max_m_blocks--;
          if (max_m_blocks == 0) {
            TORCH_CHECK(false, "Unexpected m_blocks = ", m_blocks);
          }
        }*/
        // refactor
        tb_max_m *= std::min(m_blocks, max_m_blocks);

        int a_size = (tb_max_m * tb_k) * 2;

        float pipe_size = (a_size + b_size) * pipe_stages;

        TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity
        return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
    }

    bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,
        int prob_m, int prob_n, int prob_k, int num_bits,
        int group_size, bool has_act_order, bool is_k_full,
        int max_shared_mem) {
        // Sanity
        if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
            th_config.num_threads == -1) {
            return false;
        }

        // Verify K/N are divisible by thread K/N
        if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
            return false;
        }

        // Verify min for thread K/N
        if (th_config.thread_n < min_thread_n ||
            th_config.thread_k < min_thread_k) {
            return false;
        }

        // num_threads must be at least 128 (= 4 warps)
        if (th_config.num_threads < 128) {
            return false;
        }

        //  Determine cache for scales
        int scales_cache_size =
            get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
                group_size, has_act_order, is_k_full);

        // Check that pipeline fits into cache
        if (!is_valid_cache_size(th_config, max_m_blocks, prob_m, prob_n, prob_k,
            num_bits, scales_cache_size, max_shared_mem)) {
            return false;
        }

        return true;
    }

    exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
        int num_bits, int group_size,
        bool has_act_order, bool is_k_full,
        int max_shared_mem) {
        int max_m_blocks = 4;
        while (max_m_blocks > 0) {
            if (prob_m <= 16) {
                for (auto th_config : small_batch_thread_configs) {
                    if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n,
                        prob_k, num_bits, group_size, has_act_order,
                        is_k_full, max_shared_mem)) {
                        return exec_config_t{ max_m_blocks, th_config };
                    }
                }
            }
            else {
                for (auto th_config : large_batch_thread_configs) {
                    if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n,
                        prob_k, num_bits, group_size, has_act_order,
                        is_k_full, max_shared_mem)) {
                        return exec_config_t{ max_m_blocks, th_config };
                    }
                }
            }

            max_m_blocks--; // Process less M blocks per invocation to reduce cache
            // usage
        }

        return exec_config_t{ 0, {-1, -1, -1} };
    }

#define CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS)                     \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)        

    template <typename scalar_t>
    void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
        void* g_idx, void* perm, void* a_tmp, int* prob_m_ptr, int prob_m,
        int prob_n, int prob_k, void* workspace, int num_bits,
        bool has_act_order, bool is_k_full, int num_groups,
        int group_size, int dev, cudaStream_t stream, int thread_k,
        int thread_n, int sms, int max_par) {
        TORCH_CHECK(num_bits == 4 || num_bits == 8,
            "num_bits must be 4 or 8. Got = ", num_bits);
        TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [",
            prob_m, ", ", prob_n, ", ", prob_k, "]");

        int tot_m = prob_m;
        int tot_m_blocks = div_ceil(tot_m, 16);
        int pad = 16 * tot_m_blocks - tot_m;

        if (sms == -1) {
            cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
        }

        int max_shared_mem = 0;
        cudaDeviceGetAttribute(&max_shared_mem,
            cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
        TORCH_CHECK(max_shared_mem > 0);

        // Set thread config
        exec_config_t exec_cfg;
        if (thread_k != -1 && thread_n != -1) {
            // User-defined config
            exec_cfg = exec_config_t{
                4, thread_config_t{thread_k, thread_n, default_threads} };
        }
        else {
            // Auto config
            exec_cfg = determine_thread_config(prob_m, prob_n, prob_k, num_bits,
                group_size, has_act_order, is_k_full,
                max_shared_mem);
        }

        TORCH_CHECK(
            exec_cfg.max_m_blocks > 0 &&
            is_valid_config(exec_cfg.tb_cfg, exec_cfg.max_m_blocks, prob_m,
                prob_n, prob_k, num_bits, group_size, has_act_order,
                is_k_full, max_shared_mem),
            "Invalid thread config: max_m_blocks = ", exec_cfg.max_m_blocks,
            ", thread_k = ", exec_cfg.tb_cfg.thread_k,
            ", thread_n = ", exec_cfg.tb_cfg.thread_n,
            ", num_threads = ", exec_cfg.tb_cfg.num_threads, " for MKN = [", prob_m,
            ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
            ", group_size = ", group_size, ", has_act_order = ", has_act_order,
            ", is_k_full = ", is_k_full, ", max_shared_mem = ", max_shared_mem);

        int num_threads = exec_cfg.tb_cfg.num_threads;
        thread_k = exec_cfg.tb_cfg.thread_k;
        thread_n = exec_cfg.tb_cfg.thread_n;

        int thread_k_blocks = thread_k / 16;
        int thread_n_blocks = thread_n / 16;

        int blocks = sms;

        TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
            " is not divisible by thread_n = ", thread_n);
        TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
            " is not divisible by thread_k = ", thread_k);

        int group_blocks = 0;
        if (has_act_order) {
            if (is_k_full) {
                TORCH_CHECK(group_size != -1);
                group_blocks = group_size / 16;
                TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
                    " is not divisible by group_blocks = ", group_blocks);
            }
            else {
                TORCH_CHECK(group_size == 0);
                group_blocks = 0;
            }

        }
        else {
            if (group_size == -1) {
                group_blocks = -1;
            }
            else {
                group_blocks = group_size / 16;
                TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
                    " is not divisible by group_blocks = ", group_blocks);
            }
        }

        const int4* A_ptr = (const int4*)A;
        const int4* B_ptr = (const int4*)B;
        int4* C_ptr = (int4*)C;
        const int4* s_ptr = (const int4*)s;
        const int* g_idx_ptr = (const int*)g_idx;
        const int* perm_ptr = (const int*)perm;
        int4* a_tmp_ptr = (int4*)a_tmp;

        int* locks = (int*)workspace;

        if (has_act_order) {
            // Permute A columns
            int block_rows = div_ceil(prob_m, blocks);
            permute_cols_kernel << <blocks, default_threads, 0, stream >> > (
                A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, block_rows);
            A_ptr = a_tmp_ptr;
        }

        // If we have a full K, then we can run the non-act-order version of Marlin
        // (since the weight rows are reordered by increasing group ids, and by
        // having a full K, we have full original groups)
        if (is_k_full) {
            has_act_order = false;
        }

        // Main loop
        for (int i = 0; i < tot_m_blocks; i += exec_cfg.max_m_blocks) {
            int thread_m_blocks = tot_m_blocks - i;
            prob_m = tot_m - 16 * i;
            int par = 1;
            if (thread_m_blocks > exec_cfg.max_m_blocks) {
                // Note that parallel > 1 currently only works for inputs without
                // any padding
                par = (16 * thread_m_blocks - pad) / (16 * exec_cfg.max_m_blocks);
                if (par > max_par)
                    par = max_par;
                prob_m = (16 * exec_cfg.max_m_blocks) * par;
                i += exec_cfg.max_m_blocks * (par - 1);
                thread_m_blocks = exec_cfg.max_m_blocks;
            }

            // Define kernel configurations
#define undefined_error                                                        \
    TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +    \
                           str(prob_n) + ", " + str(prob_k) + "]" +            \
                           ", has_act_order = " + str(has_act_order) +         \
                           ", num_groups = " + str(num_groups) +               \
                           ", group_size = " + str(group_size) +               \
                           ", thread_m_blocks = " + str(thread_m_blocks) +     \
                           ", thread_n_blocks = " + str(thread_n_blocks) +     \
                           ", thread_k_blocks = " + str(thread_k_blocks));

        /* std::cout << "MNK = [" + str(prob_m) + ", " + \
             str(prob_n) + ", " + str(prob_k) + "]" + \
             ", has_act_order = " + str(has_act_order) + \
             ", num_groups = " + str(num_groups) + \
             ", group_size = " + str(group_size) + \
             ", thread_m_blocks = " + str(thread_m_blocks) + \
             ", thread_n_blocks = " + str(thread_n_blocks) + \
             ", thread_k_blocks = " + str(thread_k_blocks) << std::endl;*/

             /*if (false) {
             }
             // CALL_IF(4, 32, 2, 256)
             // CALL_IF(4, 16, 4, 256)
             __CALL_IF(4, 1, 16, 4, false, 4, 256)
             __CALL_IF(4, 2, 16, 4, false, 4, 256)
             // CALL_IF(4, 8, 8, 256)
             __CALL_IF(4, 1, 8, 8, false, 4, 256)
             __CALL_IF(4, 2, 8, 8, false, 4, 256)
             // CALL_IF(4, 16, 4, 128)
             __CALL_IF(4, 1, 16, 4, false, 4, 128)
             __CALL_IF(4, 2, 16, 4, false, 4, 128)
             // CALL_IF(4, 8, 8, 128)
             __CALL_IF(4, 1, 8, 8, false, 4, 128)
             __CALL_IF(4, 2, 8, 8, false, 4, 128)
             else {undefined_error}*/

            if (num_bits == 4 && num_threads == 256)
            {
                if (false) {
                }
                CALL_IF(4, 32, 2, 256)
                    CALL_IF(4, 16, 4, 256)
                    CALL_IF(4, 8, 8, 256)
                else {
                    undefined_error
                }
            }
            else if (num_bits == 4 && num_threads == 128)
            {
                if (false) {
                }
                CALL_IF(4, 8, 4, 128)
                    CALL_IF(4, 16, 4, 128)
                    CALL_IF(4, 4, 8, 128)
                else {
                    undefined_error
                }
            }
            // else if (num_bits == 8 && num_threads == 256)
            // {
            //     if (false) {
            //     }
            //     CALL_IF(8, 32, 2, 256)
            //     CALL_IF(8, 16, 4, 256)
            //     CALL_IF(8, 8, 8, 256)
            //     else {
            //         undefined_error
            //     }
            // }
            // else if (num_bits == 8 && num_threads == 128)
            // {
            //     if (false) {
            //     }
            //     CALL_IF(8, 8, 4, 128)
            //     CALL_IF(8, 16, 4, 128)
            //     CALL_IF(8, 4, 8, 128)
            //     else {
            //         undefined_error
            //     }
            // }
            else {
                undefined_error
            }

            A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
            C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
        }
    }

} // namespace gptq_marlin

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
    torch::Tensor& b_scales, torch::Tensor& g_idx,
    torch::Tensor& perm, torch::Tensor& workspace,
    int64_t num_bits, torch::Tensor size_m_tensor, int64_t size_m, int64_t size_n,
    int64_t size_k, int sms, bool is_k_full) {
    const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
    // Verify num_bits
    TORCH_CHECK(num_bits == 4 || num_bits == 8,
        "num_bits must be 4 or 8. Got = ", num_bits);
    int pack_factor = 32 / num_bits;

    // Verify A
    TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
        ", size_m = ", size_m);
    TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1),
        ", size_k = ", size_k);

    // Verify B
    TORCH_CHECK(size_k % gptq_marlin::tile_size == 0, "size_k = ", size_k,
        " is not divisible by tile_size = ", gptq_marlin::tile_size);
    TORCH_CHECK((size_k / gptq_marlin::tile_size) == b_q_weight.size(0),
        "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
        ", size_k = ", size_k,
        ", tile_size = ", gptq_marlin::tile_size);
    TORCH_CHECK(b_q_weight.size(1) % gptq_marlin::tile_size == 0,
        "b_q_weight.size(1) = ", b_q_weight.size(1),
        " is not divisible by tile_size = ", gptq_marlin::tile_size);
    int actual_size_n =
        (b_q_weight.size(1) / gptq_marlin::tile_size) * pack_factor;
    TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
        ", actual_size_n = ", actual_size_n);

    // Verify device and strides
    TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
    TORCH_CHECK(a.is_contiguous(), "A is not contiguous");

    TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
    TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");

    TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
    TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");

    TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
    TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");

    TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
    TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");

    // Alloc buffers
    auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
    torch::Tensor c = torch::empty({ size_m, size_n }, options);
    torch::Tensor a_tmp = torch::empty({ size_m, size_k }, options);

    // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
    // auto -1)
    int thread_k = -1;
    // thread_n: `n` size of a thread_tile in `weights` (can usually be left as
    // auto -1)
    int thread_n = -1;
    // sms: number of SMs to use for the kernel (can usually be left as auto -1)
    // int sms = -1; //zbx

    // Verify g_idx and perm
    TORCH_CHECK((g_idx.size(0) == 0 && perm.size(0) == 0) ||
        (g_idx.size(0) == size_k && perm.size(0) == size_k),
        "Unexpected g_idx.size(0) = ", g_idx.size(0),
        " and perm.size(0) = ", perm.size(0),
        ", where size_k = ", size_k);

    // Detect groupsize and act_order
    int num_groups = -1;
    int group_size = -1;
    bool has_act_order = g_idx.size(0) != 0;

    int b_rank = b_scales.sizes().size();
    TORCH_CHECK(b_rank == 2, "b_scales rank = ", b_rank, " is not 2");
    TORCH_CHECK(b_scales.size(1) == size_n,
        "b_scales dim 1 = ", b_scales.size(1),
        " is not size_n = ", size_n);
    num_groups = b_scales.size(0);

    if (has_act_order) {
        if (is_k_full) {
            TORCH_CHECK(num_groups > 1,
                "For act_order, num_groups must be > 1");
            TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k,
                ", is not divisible by num_groups = ", num_groups);
            group_size = size_k / num_groups;
        }
        else {
            group_size = 0;
        }

    }
    else {
        if (num_groups > 1) {
            TORCH_CHECK(
                size_k % num_groups == 0, "size_k = ", size_k,
                ", is not divisible by b_scales.size(0) = ", b_scales.size(0));
            group_size = size_k / num_groups;
        }
        else {
            group_size = -1;
        }
    }

    // Verify workspace size
    TORCH_CHECK(
        size_n % gptq_marlin::min_thread_n == 0, "size_n = ", size_n,
        ", is not divisible by min_thread_n = ", gptq_marlin::min_thread_n);
    int min_workspace_size =
        (size_n / gptq_marlin::min_thread_n) * gptq_marlin::max_par;
    TORCH_CHECK(workspace.numel() >= min_workspace_size,
        "workspace.numel = ", workspace.numel(),
        " is below min_workspace_size = ", min_workspace_size);

    int dev = a.get_device();
    if (a.scalar_type() == at::ScalarType::Half) {
        gptq_marlin::marlin_mm_f16i4<half>(
            a.data_ptr<at::Half>(), b_q_weight.data_ptr(),
            c.data_ptr<at::Half>(), b_scales.data_ptr<at::Half>(),
            g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::Half>(),
            size_m_tensor.data_ptr<int>(),
            size_m, size_n, size_k, workspace.data_ptr(), num_bits,
            has_act_order, is_k_full, num_groups, group_size, dev,
            at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
            gptq_marlin::max_par);
    }
    else if (a.scalar_type() == at::ScalarType::BFloat16) {
        gptq_marlin::marlin_mm_f16i4<nv_bfloat16>(
            a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
            c.data_ptr<at::BFloat16>(), b_scales.data_ptr<at::BFloat16>(),
            g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
            size_m_tensor.data_ptr<int>(),
            size_m, size_n, size_k, workspace.data_ptr(), num_bits,
            has_act_order, is_k_full, num_groups, group_size, dev,
            at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
            gptq_marlin::max_par);
    }
    else {
        TORCH_CHECK(false,
            "gpt_marlin_gemm only supports bfloat16 and float16");
    }

    return c;
}

#endif

================================================
FILE: archive/csrc/custom_marlin/gptq_marlin/gptq_marlin.cuh
================================================
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyrigth 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#pragma once

#include <torch/all.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>

namespace gptq_marlin {

// 8 warps are a good choice since every SM has 4 schedulers and having more
// than 1 warp per schedule allows some more latency hiding. At the same time,
// we want relatively few warps to have many registers per warp and small tiles.
static constexpr int default_threads = 256;

static constexpr int pipe_stages =
    4; // 4 pipeline stages fit into shared memory

static constexpr int min_thread_n = 64;
static constexpr int min_thread_k = 64;

static constexpr int tile_size = 16;
static constexpr int max_par = 16;

template <typename T, int n> struct Vec {
    T elems[n];
    __device__ T &operator[](int i) { return elems[i]; }
};

using I4 = Vec<int, 4>;

constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// No support for async
#else

__device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr,
                                      bool pred = true) {
    const int BYTES = 16;
    uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
    asm volatile("{\n"
                 "   .reg .pred p;\n"
                 "   setp.ne.b32 p, %0, 0;\n"
                 "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
                 "}\n" ::"r"((int)pred),
                 "r"(smem), "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async4(void *smem_ptr, const void *glob_ptr) {
    const int BYTES = 16;
    uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
    asm volatile("{\n"
                 "   cp.async.cg.shared.global [%0], [%1], %2;\n"
                 "}\n" ::"r"(smem),
                 "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async_fence() {
    asm volatile("cp.async.commit_group;\n" ::);
}

template <int n> __device__ inline void cp_async_wait() {
    asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}

#endif

} // namespace gptq_marlin

================================================
FILE: archive/csrc/custom_marlin/gptq_marlin/gptq_marlin_dtypes.cuh
================================================
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyrigth 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#ifndef _data_types_cuh
#define _data_types_cuh
#include "gptq_marlin.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>

namespace gptq_marlin {

template <typename scalar_t> class ScalarType {};

template <> class ScalarType<half> {
  public:
    using scalar_t = half;
    using scalar_t2 = half2;

    // Matrix fragments for tensor core instructions; their precise layout is
    // documented here:
    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
    using FragA = Vec<half2, 4>;
    using FragB = Vec<half2, 2>;
    using FragC = Vec<float, 4>;
    using FragS = Vec<half2, 1>;

    static __device__ float inline num2float(const half x) {
        return __half2float(x);
    }

    static __device__ half2 inline num2num2(const half x) {
        return __half2half2(x);
    }

    static __device__ half2 inline nums2num2(const half x1, const half x2) {
        return __halves2half2(x1, x2);
    }

    static __host__ __device__ half inline float2num(const float x) {
        return __float2half(x);
    }
};

template <> class ScalarType<nv_bfloat16> {
  public:
    using scalar_t = nv_bfloat16;
    using scalar_t2 = nv_bfloat162;

    using FragA = Vec<nv_bfloat162, 4>;
    using FragB = Vec<nv_bfloat162, 2>;
    using FragC = Vec<float, 4>;
    using FragS = Vec<nv_bfloat162, 1>;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
    static __device__ float inline num2float(const nv_bfloat16 x) {
        return __bfloat162float(x);
    }

    static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
        return __bfloat162bfloat162(x);
    }

    static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
                                                    const nv_bfloat16 x2) {
        return __halves2bfloat162(x1, x2);
    }

    static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
        return __float2bfloat16(x);
    }
#endif
};

} // namespace gptq_marlin

#endif

================================================
FILE: archive/csrc/custom_marlin/gptq_marlin/gptq_marlin_repack.cu
================================================
#include "gptq_marlin.cuh"

namespace gptq_marlin {

static constexpr int repack_stages = 8;

static constexpr int repack_threads = 256;

static constexpr int tile_k_size = tile_size;
static constexpr int tile_n_size = tile_k_size * 4;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800

template <int const num_threads, int const num_bits, bool const has_perm>
__global__ void marlin_repack_kernel(
    uint32_t const* __restrict__ b_q_weight_ptr,
    uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr,
    int size_k, int size_n) {}

}  // namespace gptq_marlin

torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits) {
  TORCH_CHECK_NOT_IMPLEMENTED(
      false, "marlin_repack_from_gptq(..) requires CUDA_ARCH >= 8.0");
  return torch::empty({1, 1});
}

#else

template <int const num_threads, int const num_bits, bool const has_perm>
__global__ void marlin_repack_kernel(
    uint32_t const* __restrict__ b_q_weight_ptr,
    uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr,
    int size_k, int size_n) {
  constexpr int pack_factor = 32 / num_bits;

  int k_tiles = size_k / tile_k_size;
  int n_tiles = size_n / tile_n_size;
  int block_k_tiles = div_ceil(k_tiles, gridDim.x);

  int start_k_tile = blockIdx.x * block_k_tiles;
  if (start_k_tile >= k_tiles) {
    return;
  }

  int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);

  // Wait until the next thread tile has been loaded to shared memory.
  auto wait_for_stage = [&]() {
    // We only have `stages - 2` active fetches since we are double buffering
    // and can only issue the next fetch when it is guaranteed that the previous
    // shared memory load is fully complete (as it may otherwise be
    // overwritten).
    cp_async_wait<repack_stages - 2>();
    __syncthreads();
  };

  extern __shared__ int4 sh[];

  constexpr int perm_size = tile_k_size / 4;

  int4* sh_perm_ptr = sh;
  int4* sh_pipe_ptr = sh_perm_ptr;
  if constexpr (has_perm) {
    sh_pipe_ptr += perm_size;
  }

  constexpr int tile_ints = tile_k_size / pack_factor;

  constexpr int stage_n_threads = tile_n_size / 4;
  constexpr int stage_k_threads = has_perm ? tile_k_size : tile_ints;
  constexpr int stage_size = stage_k_threads * stage_n_threads;

  auto load_perm_to_shared = [&](int k_tile_id) {
    int first_k_int4 = (k_tile_id * tile_k_size) / 4;

    int4 const* perm_int4_ptr = reinterpret_cast<int4 const*>(perm_ptr);

    if (threadIdx.x < perm_size) {
      sh_perm_ptr[threadIdx.x] = perm_int4_ptr[first_k_int4 + threadIdx.x];
    }
    __syncthreads();
  };

  auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
    if (n_tile_id >= n_tiles) {
      cp_async_fence();
      return;
    }

    int first_n = n_tile_id * tile_n_size;

    int4* sh_ptr = sh_pipe_ptr + stage_size * pipe;

    if constexpr (has_perm) {
      if (threadIdx.x < stage_size) {
        int k_id = threadIdx.x / stage_n_threads;
        int n_id = threadIdx.x % stage_n_threads;

        uint32_t const* sh_perm_int_ptr =
            reinterpret_cast<uint32_t const*>(sh_perm_ptr);

        int src_k = sh_perm_int_ptr[k_id];
        int src_k_packed = src_k / pack_factor;

        cp_async4(
            &sh_ptr[k_id * stage_n_threads + n_id],
            reinterpret_cast<int4 const*>(&(
                b_q_weight_ptr[src_k_packed * size_n + first_n + (n_id * 4)])));
      }

    } else {
      if (threadIdx.x < stage_size) {
        int k_id = threadIdx.x / stage_n_threads;
        int n_id = threadIdx.x % stage_n_threads;

        int first_k = k_tile_id * tile_k_size;
        int first_k_packed = first_k / pack_factor;

        cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
                  reinterpret_cast<int4 const*>(
                      &(b_q_weight_ptr[(first_k_packed + k_id) * size_n +
                                       first_n + (n_id * 4)])));
      }
    }

    cp_async_fence();
  };

  auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
    if (n_tile_id >= n_tiles) {
      return;
    }

    int warp_id = threadIdx.x / 32;
    int th_id = threadIdx.x % 32;

    if (warp_id >= 4) {
      return;
    }

    int tc_col = th_id / 4;
    int tc_row = (th_id % 4) * 2;

    constexpr int tc_offsets[4] = {0, 1, 8, 9};

    int cur_n = warp_id * 16 + tc_col;

    constexpr int sh_stride = 64;
    constexpr uint32_t mask = (1 << num_bits) - 1;

    int4* sh_stage_ptr = sh_pipe_ptr + stage_size * pipe;
    uint32_t* sh_stage_int_ptr = reinterpret_cast<uint32_t*>(sh_stage_ptr);

    uint32_t* sh_perm_int_ptr = reinterpret_cast<uint32_t*>(sh_perm_ptr);

    uint32_t vals[8];

    if constexpr (has_perm) {
      for (int i = 0; i < 4; i++) {
        int k_idx = tc_row + tc_offsets[i];

        uint32_t src_k = sh_perm_int_ptr[k_idx];
        uint32_t src_k_pos = src_k % pack_factor;

        uint32_t b1_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n];
        uint32_t b1_cur_val = (b1_val >> (src_k_pos * num_bits)) & mask;

        uint32_t b2_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n + 8];
        uint32_t b2_cur_val = (b2_val >> (src_k_pos * num_bits)) & mask;

        vals[i] = b1_cur_val;
        vals[4 + i] = b2_cur_val;
      }

    } else {
      uint32_t b1_vals[tile_ints];
      uint32_t b2_vals[tile_ints];

  #pragma unroll
      for (int i = 0; i < tile_ints; i++) {
        b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i];
        b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i];
      }

  #pragma unroll
      for (int i = 0; i < 4; i++) {
        int cur_elem = tc_row + tc_offsets[i];
        int cur_int = cur_elem / pack_factor;
        int cur_pos = cur_elem % pack_factor;

        vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask;
        vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask;
      }
    }

    constexpr int tile_size = tile_k_size * tile_n_size / pack_factor;
    int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;

    // Result of:
    // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
    if constexpr (num_bits == 4) {
      constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};

      uint32_t res = 0;
  #pragma unroll
      for (int i = 0; i < 8; i++) {
        res |= vals[pack_idx[i]] << (i * 4);
      }

      out_ptr[out_offset + th_id * 4 + warp_id] = res;

    } else {
      constexpr int pack_idx[4] = {0, 2, 1, 3};

      uint32_t res1 = 0;
      uint32_t res2 = 0;
  #pragma unroll
      for (int i = 0; i < 4; i++) {
        res1 |= vals[pack_idx[i]] << (i * 8);
        res2 |= vals[4 + pack_idx[i]] << (i * 8);
      }

      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
    }
  };

  auto start_pipes = [&](int k_tile_id, int n_tile_id) {
  #pragma unroll
    for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
      fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
    }

    wait_for_stage();
  };
  #pragma unroll
  for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
    int n_tile_id = 0;

    if constexpr (has_perm) {
      load_perm_to_shared(k_tile_id);
    }

    start_pipes(k_tile_id, n_tile_id);

    while (n_tile_id < n_tiles) {
  #pragma unroll
      for (int pipe = 0; pipe < repack_stages; pipe++) {
        fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
                        n_tile_id + pipe + repack_stages - 1);
        repack_tile(pipe, k_tile_id, n_tile_id + pipe);
        wait_for_stage();
      }
      n_tile_id += repack_stages;
    }
  }
}

}  // namespace gptq_marlin

  #define CALL_IF(NUM_BITS, HAS_PERM)                                          \
    else if (num_bits == NUM_BITS && has_perm == HAS_PERM) {                   \
      cudaFuncSetAttribute(                                                    \
          gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads,       \
                                            NUM_BITS, HAS_PERM>,               \
          cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);        \
      gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, NUM_BITS, \
                                        HAS_PERM>                              \
          <<<blocks, gptq_marlin::repack_threads, max_shared_mem, stream>>>(   \
              b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n);              \
    }

torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits) {
  // Verify compatibility with marlin tile of 16x64
  TORCH_CHECK(size_k % gptq_marlin::tile_k_size == 0, "size_k = ", size_k,
              " is not divisible by tile_k_size = ", gptq_marlin::tile_k_size);
  TORCH_CHECK(size_n % gptq_marlin::tile_n_size == 0, "size_n = ", size_n,
              " is not divisible by tile_n_size = ", gptq_marlin::tile_n_size);

  TORCH_CHECK(num_bits == 4 || num_bits == 8,
              "num_bits must be 4 or 8. Got = ", num_bits);
  int const pack_factor = 32 / num_bits;

  // Verify B
  TORCH_CHECK((size_k / pack_factor) == b_q_weight.size(0),
              "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
              ", size_k = ", size_k, ", pack_factor = ", pack_factor);
  TORCH_CHECK(b_q_weight.size(1) == size_n,
              "b_q_weight.size(1) = ", b_q_weight.size(1),
              " is not size_n = ", size_n);

  // Verify device and strides
  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
  TORCH_CHECK(b_q_weight.dtype() == at::kInt, "b_q_weight type is not kInt");

  TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
  TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
  TORCH_CHECK(perm.dtype() == at::kInt, "perm type is not at::kInt");

  // Alloc buffers
  const at::cuda::OptionalCUDAGuard device_guard(device_of(b_q_weight));
  auto options = torch::TensorOptions()
                     .dtype(b_q_weight.dtype())
                     .device(b_q_weight.device());
  torch::Tensor out =
      torch::empty({size_k / gptq_marlin::tile_size,
                    size_n * gptq_marlin::tile_size / pack_factor},
                   options);

  // Detect if there is act_order
  bool has_perm = perm.size(0) != 0;

  // Get ptrs
  uint32_t const* b_q_weight_ptr =
      reinterpret_cast<uint32_t const*>(b_q_weight.data_ptr());
  uint32_t const* perm_ptr = reinterpret_cast<uint32_t const*>(perm.data_ptr());
  uint32_t* out_ptr = reinterpret_cast<uint32_t*>(out.data_ptr());

  // Get dev info
  int dev = b_q_weight.get_device();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
  int blocks;
  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);

  int max_shared_mem = 0;
  cudaDeviceGetAttribute(&max_shared_mem,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
  TORCH_CHECK(max_shared_mem > 0);

  if (false) {
  }
  CALL_IF(4, false)
  CALL_IF(4, true)
  CALL_IF(8, false)
  CALL_IF(8, true)
  else {
    TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits,
                ", has_perm = ", has_perm);
  }

  return out;
}

#endif

================================================
FILE: archive/csrc/custom_marlin/gptq_marlin/ops.h
================================================
/**
 * @Description  :
 * @Author       : Azure
 * @Date         : 2024-07-22 09:27:55
 * @Version      : 1.0.0
 * @LastEditors  : Azure
 * @LastEditTime : 2024-07-26 08:35:00
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once

#include <torch/extension.h>
#include <torch/library.h>
#include <torch/torch.h>

torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
                               torch::Tensor &b_scales, torch::Tensor &g_idx,
                               torch::Tensor &perm, torch::Tensor &workspace,
                               int64_t num_bits, torch::Tensor size_m_tensor, int64_t size_m, int64_t size_n,
                               int64_t size_k, int sms, bool is_k_full);

torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor&perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits);

================================================
FILE: archive/csrc/custom_marlin/setup.py
================================================
from setuptools import setup, Extension
from torch.utils import cpp_extension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
    name='vLLMMarlin',
    ext_modules=[
        CUDAExtension(
            'vLLMMarlin', [
                #'custom_gguf/dequant.cu',
                'binding.cpp',
                'gptq_marlin/gptq_marlin.cu',
                'gptq_marlin/gptq_marlin_repack.cu',
            ],
            extra_compile_args={
                'cxx': ['-O3'],
                'nvcc': [
                    '-O3',
                    '--use_fast_math',
                    '-Xcompiler', '-fPIC',
                ]
            },
        )
    ],
    cmdclass={'build_ext': BuildExtension}
)

================================================
FILE: archive/csrc/custom_marlin/test_cuda_graph.py
================================================
import csv
import torch
import torch.nn as nn
import vLLMMarlin
torch.set_grad_enabled(False)
from utils.marlin_utils import (
	MarlinWorkspace,
	marlin_quantize,
	GPTQ_MARLIN_MIN_THREAD_N,
	GPTQ_MARLIN_MIN_THREAD_K,
	GPTQ_MARLIN_MAX_PARALLEL,
)

def setup_seed(seed):
	torch.manual_seed(seed)
	torch.cuda.manual_seed_all(seed)

setup_seed(20241223)

torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
global_dtype=torch.bfloat16
global_device=torch.device("cuda",0)
global_num_cases:int=int(50)
torch.cuda.set_device(0)
torch.backends.cudnn.enabled =True
torch.backends.cudnn.benchmark = True

max_batch_size = 512
max_tp = 8
L2_size = 73728 * 1024

def get_usable_mem():
	properties = torch.cuda.get_device_properties(global_device)
	#print(f"Total memory: {properties.total_memory / (1024 ** 3):.2f} GB")
	allocated_memory = torch.cuda.memory_allocated(global_device)
	#print(f"Currently allocated memory: {allocated_memory / (1024 ** 2):.2f} MB")
	reserved_memory = torch.cuda.memory_reserved(global_device)
	#print(f"Currently reserved memory: {reserved_memory / (1024 ** 2):.2f} MB")
	return properties.total_memory - 512 * 1024 ** 2 - allocated_memory# - reserved_memory

def exp_range(start, stop, step = 2):
	now = start
	while now <= stop:
		yield now
		now *= step

def timing(func, iters, epochs=100):
	#warmup
	for idx in range(iters):
		func(idx)
		
	torch.cuda.synchronize()
	cuda_graph = torch.cuda.CUDAGraph()
	with torch.cuda.graph(cuda_graph):
		for idx in range(iters):
			func(idx)

	for _ in range(2000):
		cuda_graph.replay()

	start_event = torch.cuda.Event(enable_timing=True)
	end_event = torch.cuda.Event(enable_timing=True)
	stream = torch.cuda.Stream()
	torch.cuda.synchronize()
	#with torch.cuda.stream(stream):
	start_event.record()
	for _ in range(10):
		cuda_graph.replay()
	end_event.record()
	torch.cuda.synchronize()
	elapsed_time_ms0 = start_event.elapsed_time(end_event)
	
	start_event = torch.cuda.Event(enable_timing=True)
	end_event = torch.cuda.Event(enable_timing=True)
	torch.cuda.synchronize()
	#with torch.cuda.stream(stream):
	start_event.record()
	for _ in range(epochs+10):
		cuda_graph.replay()
	end_event.record()
	torch.cuda.synchronize()
	elapsed_time_ms = start_event.elapsed_time(end_event) - elapsed_time_ms0
	
	#print(elapsed_time_ms0, elapsed_time_ms)
	return elapsed_time_ms/iters/epochs

class LinearMarlin(nn.Linear):
	marlin_q_w: torch.Tensor
	marlin_s: torch.Tensor
	g_idx: torch.Tensor
	sort_indices: torch.Tensor
	has_bias: bool
	def __init__(
		self,
		in_features,
		out_features,
		bias = False,
		device: str = "cuda",
		num_bits: int = 4,  # 4-bit/8-bit is supported
		group_size: int = 64,  # -1, 32, 64, 128
		act_order: bool = False,
		is_k_full=True,
		sms = -1, # sms in GPU
		**kwargs,
	):
		self.padding = False
		assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
		if in_features%GPTQ_MARLIN_MIN_THREAD_K!=0 or out_features%GPTQ_MARLIN_MIN_THREAD_K!=0:
			#print(f"warning!, in_features={in_features} or out_features={out_features} is undivisible by GPTQ_MARLIN_MIN_THREAD_K={GPTQ_MARLIN_MIN_THREAD_K} and GPTQ_MARLIN_MIN_THREAD_N={GPTQ_MARLIN_MIN_THREAD_N}, padding")
			self.padding = True
			self.orin_in_features = in_features
			self.orin_out_features = out_features
			in_features = (in_features+GPTQ_MARLIN_MIN_THREAD_K-1)//GPTQ_MARLIN_MIN_THREAD_K*GPTQ_MARLIN_MIN_THREAD_K
			out_features = (out_features+GPTQ_MARLIN_MIN_THREAD_N-1)//GPTQ_MARLIN_MIN_THREAD_N*GPTQ_MARLIN_MIN_THREAD_N
			#print(f"After padding: in_features={in_features}, out_features={out_features}")
			

		super().__init__(in_features, out_features, bias, device)
		self.has_bias = bias
		self.device = device
		self.num_bits = num_bits
		self.group_size = group_size
		self.act_order = act_order
		# TODO: optimize every shape GEMM
		
		blocks_k, blocks_n = in_features//128, out_features//128

		self.sms = sms

		self.is_k_full = is_k_full
		
		self.weight.requires_grad = False
		self.weight.t_()
		# Pack Marlin linear
		#w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
		#    self.weight, self.num_bits, self.group_size, self.act_order
		#)
		marlin_q_w = torch.randint(int(-1e9), int(1e9), (in_features//16, out_features*2), device=device, dtype=torch.int)
		marlin_s = torch.randn((in_features//64, out_features), device=device)
		self.workspace = MarlinWorkspace(
			self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL, self.device
		)
		self.marlin_q_w = marlin_q_w
		self.marlin_s = marlin_s
		self.g_idx = torch.empty((0), dtype=torch.int32, device=self.device)
		self.sort_indices = torch.empty((0), dtype=torch.int32, device=self.device)
		self.k = self.weight.shape[0]
		self.n = self.weight.shape[1]
		self.weight = None
		"""
		print(in_features, out_features)
		print(marlin_q_w.shape)
		print(marlin_q_w.dtype)
		print(marlin_s.shape)
		print(marlin_s.dtype)
		print(self.workspace.scratch.shape)
		print(self.workspace.scratch.dtype)
		print(self.g_idx.shape)
		print(self.g_idx.dtype)
		print(self.sort_indices.shape)
		print(self.sort_indices.dtype)
		#print(w_ref.shape)
		#print(w_ref.dtype)
		"""
		#w_ref = None

	def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor) -> torch.Tensor:
		# Only support input x as BF16 and FP16
		x = x.to(self.device)
		orig_shape = list(x.shape)
		orig_dtype = x.dtype
		x = x.reshape(-1, x.shape[-1])
		if self.padding:
			padding_input=torch.empty(x.shape[0], self.in_features, device=x.device, dtype=x.dtype)
			padding_input[:,:self.orin_in_features] = x
			x = padding_input
		marlin_s = self.marlin_s.to(x.dtype)
		#print(self.sms * ((orig_shape[0]+63)//64))
		
		sms = self.sms

		x = vLLMMarlin.gptq_marlin_gemm(
			x,
			self.marlin_q_w,
			marlin_s,
			self.g_idx,
			self.sort_indices,
			self.workspace.scratch,
			self.num_bits,
			bsz_tensor,
			x.shape[0],
			self.n,
			x.shape[-1],
			sms,
			self.is_k_full,
		)
		# TODO: don't padding bias
		if self.has_bias:
			x = x + self.bias
		if self.padding:
			x = x[:,:self.orin_out_features]
			orig_shape[-1] = self.orin_out_features
		else:
			orig_shape[-1] = self.out_features
		return x.reshape(orig_shape).to(orig_dtype)

def benchLinearMarlin(input_dim, output_dim):#, out_file
	print("benchmarking MLP Marlin")
	print("-----------------------------------------------------------")
	headers = ["batch_size", "tp", "used_time", "bandwidth GB/s", "TFLOPS", "cases", "padding", "sms"]
	print(" | ".join(headers) + "\n")
	rows = []
	for batch_size in exp_range(1, 64):
		for tp in exp_range(1, max_tp):
			torch.cuda.empty_cache()
			if output_dim % tp != 0:
				continue
			cur_output_dim = output_dim // tp
			modules = []
			inputs = []
			data_size = int(0.53125*input_dim*cur_output_dim)
			input_size = int(2*batch_size*input_dim)
			output_size = int(2*batch_size*cur_output_dim)
			usable_mem = get_usable_mem() - 2 * input_dim * cur_output_dim
			min_cases = max(global_num_cases, (2*L2_size) // (data_size+input_size))
			cases = int(min(min_cases, (usable_mem * 0.8) // (data_size+input_size)))
			#print(usable_mem, data_size, input_size, cases)
				
			bsz_tensor = torch.tensor([batch_size], device=global_device, dtype=torch.int32)

			if cases == 0:
				row = [f"{batch_size}", "OOM", "OOM", "OOM", "0", "False"]
				rows.append(row)
				break
			for _ in range(cases):
				modules.append(LinearMarlin(input_dim, cur_output_dim, sms=56, non_equal_division=False).to(device=global_device).eval())
				inputs.append(torch.randn(batch_size, 1, input_dim, device=global_device))
				
			def forward(case_id):
				modules[case_id](inputs[case_id], bsz_tensor)
				
			used_time = timing(forward, iters=cases)
			bandwidth = (data_size+input_size+output_size)/used_time/1e6
			flops = 2*batch_size*input_dim*cur_output_dim
			tflops = flops/used_time/1e9
			cur_sms = modules[0].sms
			row = [f"{batch_size}", f"{tp}", f"{used_time}", f"{bandwidth}", f"{tflops}", f"{cases}", modules[0].padding, cur_sms]
			rows.append(row)
			print(f"{batch_size}", f"{tp}", f"{used_time}", f"{bandwidth}", f"{tflops}", f"{cases}", modules[0].padding, cur_sms)
	
	"""
	with open(out_file, 'w', newline='') as csvfile:
		csvwriter = csv.writer(csvfile)
		csvwriter.writerow(headers)
		for row in rows:
			csvwriter.writerow(row)
	"""
	
	"""
	markdown_table = " | ".join(headers) + "\n"
	markdown_table += " | ".join(["---"] * len(headers)) + "\n"
	for row in rows:
		markdown_table += " | ".join(row) + "\n"

	print(markdown_table)
	"""
	#print("finish write file", out_file)
	#print("-------------------------------------------------------------")

if __name__ == "__main__":
	
	benchLinearMarlin(5120, 3584)
	exit(0)
	
	max_batch = 1
	cur_batch = 1


	marlin_linear = LinearMarlin(5120, 3584)

	input_tensor = torch.randn(max_batch, 1, 5120, device="cuda", dtype=torch.bfloat16)
	bsz_tensor = torch.tensor([max_batch], device="cuda", dtype=torch.int32)

	out_truth = marlin_linear(input_tensor, bsz_tensor)

	print(out_truth)

	g = torch.cuda.CUDAGraph()
	with torch.cuda.graph(g):
		out_buf = marlin_linear(input_tensor, bsz_tensor)
	
	for i in range(10000):
		g.replay()
	
	#torch.testing.assert_close(out_buf, out_truth, rtol=1e-3, atol=1e-3)
	
	marlin_linear = LinearMarlin(5120, 3584)
	g = torch.cuda.CUDAGraph()
	with torch.cuda.graph(g):
		out_buf = marlin_linear(input_tensor, bsz_tensor)
	
	new_input = torch.randn(cur_batch, 1, 5120, device="cuda", dtype=torch.bfloat16)
	bsz_tensor.copy_(torch.tensor([cur_batch], device="cuda", dtype=torch.int32))
	
	new_out_truth = marlin_linear(new_input, bsz_tensor)
	input_tensor[:cur_batch].copy_(new_input)
	input_tensor[cur_batch:] = 0
	
	g.replay()
	
	torch.cuda.synchronize()

	def printMinMax(tensor):
		abs_tensor = torch.abs(tensor)

		min_val = torch.min(abs_tensor)
		max_val = torch.max(abs_tensor)

		min_indices = (abs_tensor == min_val).nonzero(as_tuple=True)
		max_indices = (abs_tensor == max_val).nonzero(as_tuple=True)

		print(f"min: {min_val.item()}")
		print(f"min idx: {min_indices}")
		print(f"max: {max_val.item()}")
		print(f"max idx: {max_indices}")

	print(out_buf[:cur_batch].shape)
	print(new_out_truth.shape)


	printMinMax(out_buf[:cur_batch])
	printMinMax(new_out_truth)

	#torch.testing.assert_close(out_buf[:cur_batch, 0, :], new_out_truth[:cur_batch, 0, :], rtol=1e-3, atol=1e-3)


================================================
FILE: archive/csrc/custom_marlin/utils/__init__.py
================================================


================================================
FILE: archive/csrc/custom_marlin/utils/format24.py
================================================
#
# Modified by Roberto Lopez Castro (roberto.lopez.castro@udc.es).
#

import torch


# This is PyTorch implementation of main part of reorder_meta()
# function, from tools/util/include/cutlass/util/host_reorder.h file
# of CUTLASS source tree.  Furthermore, CUTLASS template for sparse
# GEMM decides upon layout of this matrix, and at the moment for the
# sparse GEMM executed on tensor cores, this is layout described by
# ColumnMajorInterleaved<2> data structure, in
# include/cutlass/layout/matrix.h of CUTLASS source tree.  The
# reordering of meta matrix into meta_reordered matrix calculated
# according to these segments of CUTLASS code is re-implemented here.
# Note that this calculation produces offsets for scattering metadata
# matrix elements into reordered metadata matrix elements (or,
# equivalently, for gathering reordered metadata matrix element back
# into metadata matrix elements).
def _calculate_meta_reordering_scatter_offsets(m, meta_ncols, meta_dtype,
                                               device):
    dst_rows = torch.arange(0, m, device=device)[:, None].repeat(1, meta_ncols)
    dst_cols = torch.arange(0, meta_ncols, device=device).repeat(m, 1)

    # Reorder the rows, then swizzle the 2x2 blocks.
    group_x = 64
    group_y = 32 if meta_dtype.itemsize == 2 else 16

    dst_rows = (dst_rows // group_x * group_x + (dst_rows % 2) * 2 +
                (dst_rows % 8) // 4 + ((dst_rows % group_y) % 4) // 2 * 32 +
                ((dst_rows % group_x) // 8) * 4)

    topright = ((dst_rows % 2 == 0) & (dst_cols % 2 == 1)).to(torch.int8)
    bottomleft = ((dst_rows % 2 == 1) & (dst_cols % 2 == 0)).to(torch.int8)
    dst_rows += topright - bottomleft
    dst_cols -= topright - bottomleft

    # Assumed that meta tensor is to be stored in CUTLASS
    # InterleavedColumnMajor layout, and reverse engineered
    # corresponding code to store values into this tensor.
    interleave = 2
    cols_maj = dst_cols // interleave
    cols_min = dst_cols % interleave
    return (cols_maj * m * interleave + dst_rows * interleave +
            cols_min).view(-1)


# This function converts dense matrix into sparse semi-structured
# representation, producing "compressed" matrix, in the layout used by
# CUTLASS backend, and corresponding metadata matrix.
def sparse_semi_structured_from_dense_cutlass(dense):
    if dense.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional dense tensor, got {dense.dim()}-dimensional tensor"  # noqa: E501
        )

    m, k = dense.shape
    device = dense.device

    meta_dtype = torch.int8
    if dense.dtype == torch.int8:
        meta_dtype = torch.int32
    elif dense.dtype in [torch.half, torch.bfloat16, torch.float, torch.int32]:
        meta_dtype = torch.int16
    else:
        raise RuntimeError(f"Invalid datatype {dense.dtype} of dense matrix")
    quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4
    if quadbits_per_meta_elem not in (4, 8):
        raise RuntimeError(
            "Invalid number of elements per meta element calculated")

    if meta_dtype == torch.int32:
        if m % 16 != 0:
            raise RuntimeError(
                f"Number of rows of dense matrix {m} must be divisible by 16")
    else:
        if m % 32 != 0:
            raise RuntimeError(
                f"Number of rows of dense matrix {m} must be divisible by 32")
    if k % (4 * quadbits_per_meta_elem) != 0:
        raise RuntimeError(
            f"Number of columns of dense matrix {k} must be divisible by {4 * quadbits_per_meta_elem}"  # noqa: E501
        )

    if dense.dtype != torch.float:
        ksparse = 4
        dense_4 = dense.view(-1, k // ksparse, ksparse)
        m0, m1, m2, m3 = (dense_4 != 0).unbind(-1)
    else:
        ksparse = 2
        dense_2 = dense.view(-1, k // ksparse, ksparse)
        m0, m2 = m1, m3 = (dense_2 != 0).unbind(-1)
    meta_ncols = k // (ksparse * quadbits_per_meta_elem)

    # Encoding quadruples of True/False values as follows:
    #     [True,  True,  False, False] -> 0b0100
    #     [True,  False, True,  False] -> 0b1000
    #     [False, True,  True,  False] -> 0b1001
    #     [True,  False, False, True ] -> 0b1100
    #     [False, True,  False, True ] -> 0b1101
    #     [False, False, True,  True ] -> 0b1110
    # Thus, lower two bits in the encoding are index of the True value
    # at the lowest index in the quadruple, and the higher two bits in
    # the encoding are index of the other True value in the quadruple.
    # In case there are less than two True values, than False value or
    # values at some index or indices are considered True for the
    # encoding.  In case there are more than two True values, then the
    # excess True value(s) at some indices are considered False for
    # the encoding.  The exact encodings used for these cases are as
    # follows:
    #     [False, False, False, False] -> 0b1110
    #     [False, False, False, True ] -> 0b1110
    #     [False, False, True,  False] -> 0b1110
    #     [False, True,  False, False] -> 0b1001
    #     [False, True,  True,  True ] -> 0b1101
    #     [True,  False, False, False] -> 0b1000
    #     [True,  False, True,  True ] -> 0b1100
    #     [True,  True,  False, True ] -> 0b0100
    #     [True,  True,  True,  False] -> 0b0100
    #     [True,  True,  True,  True ] -> 0b0100
    # These particular encodings are chosen, with the help of Espresso
    # logic minimizer software, for the purpose of minimization of
    # corresponding Boolean functions, that translate non-zero flags
    # into encoding bits.  Note also possible choices for the first
    # and last of these encodings were limited only to (0b0100,
    # 0b1110), in order to produce valid encodings for 1:2 sparsity
    # case.

    expr0 = m0 & m1
    expr1 = ~m0 & m1
    expr2 = ~m0 & ~m1
    bit0 = expr1
    bit1 = expr2
    bit2 = expr0 | expr2 | m3
    bit3 = expr1 | ~m1
    idxs0 = bit0 | (bit1.to(torch.int64) << 1)
    idxs1 = bit2 | (bit3.to(torch.int64) << 1)

    if dense.dtype != torch.float:
        sparse0 = dense_4.gather(
            -1, idxs0.unsqueeze(-1))  # type: ignore[possibly-undefined]
        sparse1 = dense_4.gather(-1, idxs1.unsqueeze(-1))
        sparse = torch.stack((sparse0, sparse1), dim=-1).view(m, k // 2)
    else:
        sparse = dense_2.gather(-1,
                                idxs0.unsqueeze(-1) // 2).view(
                                    m,
                                    k // 2)  # type: ignore[possibly-undefined]

    meta_4 = idxs0 | (idxs1 << 2)
    meta_n = meta_4.view(
        (-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype)

    if quadbits_per_meta_elem == 4:
        meta = (meta_n[:, :, 0]
                | (meta_n[:, :, 1] << 4)
                | (meta_n[:, :, 2] << 8)
                | (meta_n[:, :, 3] << 12))
    elif quadbits_per_meta_elem == 8:
        meta = (meta_n[:, :, 0]
                | (meta_n[:, :, 1] << 4)
                | (meta_n[:, :, 2] << 8)
                | (meta_n[:, :, 3] << 12)
                | (meta_n[:, :, 4] << 16)
                | (meta_n[:, :, 5] << 20)
                | (meta_n[:, :, 6] << 24)
                | (meta_n[:, :, 7] << 28))

    # Reorder meta tensor elements.
    meta_reordered = meta.new_empty(
        (m * meta_ncols, ))  # type: ignore[possibly-undefined]
    meta_offsets = _calculate_meta_reordering_scatter_offsets(
        m, meta_ncols, meta_dtype, device)
    meta_reordered.scatter_(0, meta_offsets, meta.view(-1))

    return (sparse, meta_reordered.view(m, meta_ncols))


# This function performs reverse of the function above - it
# reconstructs dense matrix from a pair of "compressed" matrix, given
# in the layout used by CUTLASS backend, and accompanying metadata
# matrix.
def sparse_semi_structured_to_dense_cutlass(sparse, meta_reordered):
    if sparse.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional sparse tensor, got {sparse.dim()}-dimensional tensor"  # noqa: E501
        )

    m, k = sparse.shape
    device = sparse.device

    if meta_reordered.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional meta tensor, got {meta_reordered.dim()}-dimensional tensor"  # noqa: E501
        )
    if meta_reordered.device != device:
        raise RuntimeError(
            f"Expected meta matrix to be on {device} device, got matrix on {meta_reordered.device} device"  # noqa: E501
        )

    meta_dtype = meta_reordered.dtype
    if meta_dtype not in (torch.int16, torch.int32):
        raise RuntimeError(f"Invalid datatype {meta_dtype} of meta matrix")
    quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4

    ksparse = 4 if sparse.dtype != torch.float else 2

    meta_nrows, meta_ncols = meta_reordered.shape
    if meta_nrows != m:
        raise RuntimeError(
            f"Number of rows of meta matrix {meta_nrows} must be equal to number of columns of spase matrix {m}"  # noqa: E501
        )
    if meta_ncols * ksparse * quadbits_per_meta_elem != 2 * k:
        raise RuntimeError(
            f"Number of columns of sparse matrix {k} different from the {meta_ncols * ksparse * quadbits_per_meta_elem // 2}, "  # noqa: E501
            "expected according to the number of columns of meta matrix")

    # Undo meta tensor elements reordering.
    meta_offsets = _calculate_meta_reordering_scatter_offsets(
        m, meta_ncols, meta_dtype, device)
    meta = torch.gather(meta_reordered.view(-1), 0,
                        meta_offsets).view(m, meta_ncols)

    # Unpack sparse tensor back to original dense tensor, using
    # information provided by meta tensor.  Note that torch.float
    # datatype is handled pretty much the same as
    # torch.half/torch.bfloat16, as metadata for a pair of torch.float
    # value is encoded as if underlying 8 bytes contain four
    # torch.half/torch.bfloat16 values, where either first two or last
    # two are zeros.
    meta_2 = torch.empty(
        (m, meta_ncols, 2 * quadbits_per_meta_elem),
        dtype=meta_dtype,
        device=device,
    )
    if quadbits_per_meta_elem == 4:
        meta_2[:, :, 0] = meta & 0b11
        meta_2[:, :, 1] = (meta >> 2) & 0b11
        meta_2[:, :, 2] = (meta >> 4) & 0b11
        meta_2[:, :, 3] = (meta >> 6) & 0b11
        meta_2[:, :, 4] = (meta >> 8) & 0b11
        meta_2[:, :, 5] = (meta >> 10) & 0b11
        meta_2[:, :, 6] = (meta >> 12) & 0b11
        meta_2[:, :, 7] = (meta >> 14) & 0b11
    elif quadbits_per_meta_elem == 8:
        meta_2[:, :, 0] = meta & 0b11
        meta_2[:, :, 1] = (meta >> 2) & 0b11
        meta_2[:, :, 2] = (meta >> 4) & 0b11
        meta_2[:, :, 3] = (meta >> 6) & 0b11
        meta_2[:, :, 4] = (meta >> 8) & 0b11
        meta_2[:, :, 5] = (meta >> 10) & 0b11
        meta_2[:, :, 6] = (meta >> 12) & 0b11
        meta_2[:, :, 7] = (meta >> 14) & 0b11
        meta_2[:, :, 8] = (meta >> 16) & 0b11
        meta_2[:, :, 9] = (meta >> 18) & 0b11
        meta_2[:, :, 10] = (meta >> 20) & 0b11
        meta_2[:, :, 11] = (meta >> 22) & 0b11
        meta_2[:, :, 12] = (meta >> 24) & 0b11
        meta_2[:, :, 13] = (meta >> 26) & 0b11
        meta_2[:, :, 14] = (meta >> 28) & 0b11
        meta_2[:, :, 15] = (meta >> 30) & 0b11

    dense_offsets = meta_2.view(-1) + (
        torch.arange(0, 2 * m * k // ksparse, device=device) * 4).view(
            -1, 1).repeat(1, 2).view(-1)

    dense = torch.zeros((m * 2 * k, ), dtype=sparse.dtype, device=device)
    if sparse.dtype != torch.float:
        # dense.scatter_(0, dense_offsets, sparse.view(-1))
        dense.scatter_(0, dense_offsets, sparse.reshape(-1))
    else:
        dense.view(torch.half).scatter_(0, dense_offsets,
                                        sparse.view(torch.half).view(-1))

    return dense.view(m, 2 * k)


def mask_creator(tensor):
    """
    Class for creating N:M sparsity masks.
    Masks will be created using the N:M ratio, where for every block of 
    M weights, N will be pruned based on ranked weight value. Each mask 
    will correspond to the given tensor.

    :param N: The number of weights in a group to keep
    :param M: The size of a weight group
    """
    N = 2
    M = 4

    mask = None
    # for i, tensor in enumerate(tensors):
    if tensor.numel() % M != 0:
        raise ValueError(
            f"Tensor of size {tensor.shape} can't be evenly divided into "
            f"{M} groups")

    num_groups = tensor.numel() // M

    # N:M sparsity for linear layers
    tensor_temp = tensor.detach().abs().reshape(num_groups, M)
    index = torch.argsort(tensor_temp, dim=1)[:, :int(M - N)]

    w_b = torch.ones(tensor_temp.shape, device=tensor_temp.device)
    mask = w_b.scatter_(dim=1, index=index, value=0).reshape(tensor.shape)

    return mask

================================================
FILE: archive/csrc/custom_marlin/utils/marlin_24_perms.py
================================================
'''
Date: 2024-11-08 02:46:07
LastEditors: djw
LastEditTime: 2024-11-08 02:46:41
'''
"""This file is used for /tests and /benchmarks"""
from typing import Dict, List

import numpy
import torch


# Precompute permutations for Marlin24 weight and scale shuffling # noqa: E501
#
# Marlin works on [16*2,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501
# with the tensor-core format that is described here:
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
#
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
# (without the need to use ldmatrix instructions) # noqa: E501
def get_perms_24(num_bits: int):
    perm_list: List[int] = []
    for i in range(32):
        perm1: List[int] = []
        col = i // 4
        col_o = col // 2
        for block in [0, 1]:
            for row in [
                    2 * (i % 4),
                    2 * (i % 4) + 1,
                    2 * (i % 4 + 4),
                    2 * (i % 4 + 4) + 1,
            ]:
                perm1.append(16 * row + col_o * 256 + 8 * (col % 2) +
                             4 * block)
        for j in range(4):
            perm_list.extend([p + 1 * j for p in perm1])
    perm = numpy.array(perm_list)

    if num_bits == 4:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    elif num_bits == 8:
        interleave = numpy.array([0, 2, 1, 3])
    else:
        raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits))

    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
    perm = torch.from_numpy(perm)
    scale_perm: List[int] = []
    for i in range(8):
        scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]])
    scale_perm_single: List[int] = []
    for i in range(8):
        scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]])
    return perm, scale_perm, scale_perm_single


marlin_24_perm: Dict[int, torch.Tensor] = {}
marlin_24_scale_perm: Dict[int, List[int]] = {}
marlin_24_scale_perm_single: Dict[int, List[int]] = {}
for num_bits in [4, 8]:
    perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits)
    marlin_24_perm[num_bits] = perm_24
    marlin_24_scale_perm[num_bits] = scale_perm_24
    marlin_24_scale_perm_single[num_bits] = scale_perm_single_24

================================================
FILE: archive/csrc/custom_marlin/utils/marlin_perms.py
================================================
'''
Date: 2024-11-08 02:46:47
LastEditors: djw
LastEditTime: 2024-11-08 02:46:55
'''
"""This file is used for /tests and /benchmarks"""
from typing import Dict, List

import numpy
import torch


# Precompute permutations for Marlin weight and scale shuffling # noqa: E501
#
# Marlin works on [16,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501
# with the tensor-core format that is described here:
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
#
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
# (without the need to use ldmatrix instructions) # noqa: E501
def get_perms(num_bits: int):
    perm_list: List[int] = []
    for i in range(32):
        perm1: List[int] = []
        col = i // 4
        for block in [0, 1]:
            for row in [
                    2 * (i % 4),
                    2 * (i % 4) + 1,
                    2 * (i % 4 + 4),
                    2 * (i % 4 + 4) + 1,
            ]:
                perm1.append(16 * row + col + 8 * block)
        for j in range(4):
            perm_list.extend([p + 256 * j for p in perm1])

    perm = numpy.array(perm_list)

    if num_bits == 4:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    elif num_bits == 8:
        interleave = numpy.array([0, 2, 1, 3])
    else:
        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))

    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
    perm = torch.from_numpy(perm)
    scale_perm: List[int] = []
    for i in range(8):
        scale_perm.extend([i + 8 * j for j in range(8)])
    scale_perm_single: List[int] = []
    for i in range(4):
        scale_perm_single.extend(
            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
    return perm, scale_perm, scale_perm_single


marlin_perm: Dict[int, torch.Tensor] = {}
marlin_scale_perm: Dict[int, List[int]] = {}
marlin_scale_perm_single: Dict[int, List[int]] = {}
for num_bits in [4, 8]:
    perm, scale_perm, scale_perm_single = get_perms(num_bits)
    marlin_perm[num_bits] = perm
    marlin_scale_perm[num_bits] = scale_perm
    marlin_scale_perm_single[num_bits] = scale_perm_single

================================================
FILE: archive/csrc/custom_marlin/utils/marlin_utils.py
================================================
"""This file is used for /tests and /benchmarks"""
import random

import numpy
import torch

from .format24 import (
    mask_creator, sparse_semi_structured_from_dense_cutlass)
from .marlin_24_perms import (
    marlin_24_perm, marlin_24_scale_perm, marlin_24_scale_perm_single)
from .marlin_perms import (
    marlin_perm, marlin_scale_perm, marlin_scale_perm_single)
from .quant_utils import (
    get_pack_factor, quantize_weights, sort_weights, dequantize_weights)


__cuda_arch = torch.cuda.get_device_capability()

MARLIN_TILE = 16

GPTQ_MARLIN_TILE = 16
GPTQ_MARLIN_MIN_THREAD_N = 64
GPTQ_MARLIN_MIN_THREAD_K = 128
GPTQ_MARLIN_MAX_PARALLEL = 16

GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4, 8]
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
GPTQ_MARLIN_SUPPORTED_SYM = [True]

def is_marlin_supported():
    return __cuda_arch[0] >= 8


def marlin_permute_weights(q_w, size_k, size_n, perm, tile=MARLIN_TILE):
    assert q_w.shape == (size_k, size_n)
    assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}"
    assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}"

    # Permute weights to 16x64 marlin tiles
    q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile))
    q_w = q_w.permute((0, 2, 1, 3))
    q_w = q_w.reshape((size_k // tile, size_n * tile))

    q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape)

    return q_w


def marlin_weights(q_w, size_k, size_n, num_bits, perm):
    # Permute
    q_w = marlin_permute_weights(q_w, size_k, size_n, perm)

    # Pack
    pack_factor = get_pack_factor(num_bits)
    orig_device = q_w.device

    q_w = q_w.cpu().numpy().astype(numpy.uint32)

    q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor),
                           dtype=numpy.uint32)
    for i in range(pack_factor):
        q_packed |= q_w[:, i::pack_factor] << num_bits * i

    q_packed = torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device)

    return q_packed


def marlin_permute_scales(s, size_k, size_n, group_size, scale_perm,
                          scale_perm_single):
    if group_size < size_k and group_size != -1:
        s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
    else:
        s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
    s = s.reshape((-1, size_n)).contiguous()

    return s


def marlin_quantize(
    w: torch.Tensor,
    num_bits: int,
    group_size: int,
    act_order: bool,
):
    size_k, size_n = w.shape

    # Normalize group_size
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    # Quantize (and apply act_order if provided)
    w_ref, q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size,
                                                       act_order)

    # For act_order, sort the "weights" and "g_idx" so that group ids are
    # increasing
    sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
    if act_order:
        q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)

    # Reformat to marlin
    marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits,
                                marlin_perm[num_bits])
    marlin_s = marlin_permute_scales(s, size_k, size_n, group_size,
                                     marlin_scale_perm[num_bits],
                                     marlin_scale_perm_single[num_bits])

    # Create result
    res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
    for i in range(len(res_list)):
        res_list[i] = res_list[i].to(w.device)

    return res_list


def inject_24(w, size_k, size_n):
    assert w.shape == (size_k, size_n)

    mask = mask_creator(w.t()).t().cuda().bool()

    return (mask * w).contiguous(), mask.contiguous()


def check_24(w, num_rows_to_sample=50, _verbose=False):
    BLOCK_SIZE = 4
    MAX_NON_ZEROS = 2

    w = w.t().contiguous()

    print("check_24: w.shape = {}".format(w.shape))

    num_rows, num_cols = w.shape
    sampled_row_idxs = random.choices(range(num_rows), k=num_rows_to_sample)
    if _verbose:
        print(f"Sampled row idxs = {sampled_row_idxs}")

    total_segments = 0
    non_24_segments = 0
    for i in sampled_row_idxs:
        for j in range(0, num_cols - BLOCK_SIZE, BLOCK_SIZE):
            total_segments += 1
            block = w[i, j:j + BLOCK_SIZE]
            num_nonzero = torch.count_nonzero(block)
            if num_nonzero > MAX_NON_ZEROS:
                print("i = {} j = {} block = {}".format(i, j, block))
                non_24_segments += 1

    print(f"{non_24_segments} / {total_segments} do not have 2:4 structure.")


def compress_quantized_24_weight(q_24, size_k, size_n, num_bits):
    assert q_24.shape == (size_k, size_n)

    # Remove zp to normalize over 0
    max_q_val = (1 << num_bits) - 1
    zp = (max_q_val + 1) // 2
    q_24_no_zp = q_24 - zp

    # Compress
    q_24_no_zp = q_24_no_zp.t().contiguous()
    q_24_no_zp_comp, meta = sparse_semi_structured_from_dense_cutlass(
        q_24_no_zp)
    q_24_no_zp_comp = q_24_no_zp_comp.t().contiguous()

    # Restore zp
    q_24_comp = q_24_no_zp_comp + zp

    # Resize meta to its actual shape (without moving any data)
    meta = meta.resize_(meta.shape[1] // 2, meta.shape[0] * 2)

    return q_24_comp, meta


def marlin_24_quantize(
    w: torch.Tensor,
    num_bits: int,
    group_size: int,
):
    size_k, size_n = w.shape

    # Normalize group_size
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    # Inject 2:4 sparsity
    w_24, mask_24 = inject_24(w, size_k, size_n)

    # Quantize
    w_24_ref, q_w_24, s, g_idx, rand_perm = quantize_weights(w_24,
                                                             num_bits,
                                                             group_size,
                                                             act_order=False)

    # Compress quantized weight
    q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n,
                                                     num_bits)
    size_k_comp = size_k // 2

    # Reformat to marlin
    marlin_24_q_w_comp = marlin_weights(q_w_24_comp, size_k_comp, size_n,
                                        num_bits, marlin_24_perm[num_bits])
    marlin_24_s = marlin_permute_scales(s, size_k, size_n, group_size,
                                        marlin_24_scale_perm[num_bits],
                                        marlin_24_scale_perm_single[num_bits])

    # Create result
    res_list = [w_24_ref, marlin_24_q_w_comp, meta, marlin_24_s]
    for i in range(len(res_list)):
        res_list[i] = res_list[i].to(w.device)

    return res_list


def compute_max_diff(output, output_ref):
    return torch.mean(torch.abs(output - output_ref)) / torch.mean(
        torch.abs(output_ref))


class MarlinWorkspace:

    def __init__(self, out_features, min_thread_n, max_parallel, device):
        assert (out_features % min_thread_n == 0), (
            "out_features = {} is undivisible by min_thread_n = {}".format(
                out_features, min_thread_n))

        max_workspace_size = ((out_features // min_thread_n) * max_parallel)

        self.scratch = torch.zeros(max_workspace_size,
                                   dtype=torch.int,
                                   device=device)

================================================
FILE: archive/csrc/custom_marlin/utils/quant_utils.py
================================================
"""This file is used for /tests and /benchmarks"""
import numpy
import torch

SUPPORTED_NUM_BITS = [4, 8]
SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]


def get_pack_factor(num_bits):
    assert num_bits in SUPPORTED_NUM_BITS, f"Unsupported num_bits = {num_bits}"
    return 32 // num_bits


def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int):
    assert q_w.shape == w_ref.shape

    orig_device = q_w.device
    k_size, _ = q_w.shape

    g_idx = torch.zeros((k_size, ), dtype=torch.int32)
    for i in range(k_size):
        g_idx[i] = i // group_size

    # Simulate act_order by doing a random permutation on K
    rand_perm = torch.randperm(k_size)

    g_idx = g_idx[rand_perm].contiguous()
    q_w = q_w[rand_perm, :].contiguous()
    w_ref = w_ref[rand_perm, :].contiguous()

    return (
        w_ref.to(device=orig_device),
        q_w.to(device=orig_device),
        g_idx.to(device=orig_device),
        rand_perm.to(device=orig_device),
    )


# Function: Dequantize quantized weights
def dequantize_weights(qweight, qzeros, scales, g_idx, bits=4, group_size=128, device='cuda:0'):
    # Create a tensor for bitwise right shift operation
    wf = torch.tensor(list(range(0, 32, bits)), dtype=torch.int32, device=device).unsqueeze(0)

    # Apply bitwise right shift and convert qzeros to the appropriate type
    zeros = torch.bitwise_right_shift(torch.unsqueeze(qzeros, 2).expand(-1, -1, 32 // bits), wf.unsqueeze(0)).to(torch.int16 if bits == 8 else torch.int8)
    torch.bitwise_and(zeros, (2 ** bits) - 1, out=zeros)

    # Reshape the zeros tensor
    zeros = zeros + 1
    zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])

    # Reshape the scales tensor
    scales = scales.reshape(-1, 1, scales.shape[-1])

    # Similar bitwise right shift operation for qweight and reshape
    weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1), wf.unsqueeze(-1)).to(torch.int16 if bits == 8 else torch.int8)
    torch.bitwise_and(weight, (2 ** bits) - 1, out=weight)
    weight = weight.reshape(-1, group_size, weight.shape[2])

    # Apply dequantization formula and reshape the final weight
    weight = (scales * (weight - zeros))
    weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])

    # Return the transposed weight
    return weight.transpose(0, 1)

def quantize_weights(w: torch.Tensor, num_bits: int, group_size: int,
                     act_order: bool):
    orig_device = w.device
    size_k, size_n = w.shape

    assert w.is_floating_point(), "w must be float"
    assert num_bits in SUPPORTED_NUM_BITS, f"Unsupported num_bits = {num_bits}"
    assert group_size in SUPPORTED_GROUP_SIZES + [
        size_k
    ], f"Unsupported groupsize = {group_size}"

    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    max_q_val = 2**num_bits - 1
    half_q_val = (max_q_val + 1) // 2

    # Reshape to [groupsize, -1]
    if group_size < size_k:
        w = w.view((-1, group_size, size_n))
        w = w.permute(1, 0, 2)
        w = w.reshape((group_size, -1))

    # Compute scale for each group
    s = torch.max(torch.abs(w), 0, keepdim=True)[0]
    s *= 2 / max_q_val  # 2 => symmetric

    # Quantize
    q_w = torch.round(w / s).int()
    q_w += half_q_val
    q_w = torch.clamp(q_w, 0, max_q_val)

    # Compute ref (dequantized)
    w_ref = (q_w - half_q_val).half() * s

    # Restore original shapes
    if group_size < size_k:

        def reshape_w(w):
            w = w.reshape((group_size, -1, size_n))
            w = w.permute(1, 0, 2)
            w = w.reshape((size_k, size_n)).contiguous()
            return w

        q_w = reshape_w(q_w)
        w_ref = reshape_w(w_ref)

    s = s.reshape((-1, size_n)).contiguous()

    # Apply act_order
    g_idx = torch.empty(0, dtype=torch.int, device=w.device)
    rand_perm = torch.empty(0, dtype=torch.int, device=w.device)
    if act_order:
        assert (
            group_size < size_k
        ), "For act_order, groupsize = {} must be less than size_k = {}".format(
            group_size, size_k)

        w_ref, q_w, g_idx, rand_perm = permute_rows(q_w, w_ref, group_size)

    return (
        w_ref.to(device=orig_device),
        q_w.to(device=orig_device),
        s.to(device=orig_device),
        g_idx.to(device=orig_device),
        rand_perm.to(device=orig_device),
    )


def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor):
    orig_device = q_w.device

    sort_indices = torch.argsort(g_idx).to(
        dtype=torch.int32)  # Sort based on g_idx

    g_idx = g_idx[sort_indices].contiguous()
    q_w = q_w[sort_indices, :].contiguous()

    return (
        q_w.to(device=orig_device),
        g_idx.to(device=orig_device),
        sort_indices.to(device=orig_device),
    )


def gptq_pack(
    q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    assert q_w.shape == (size_k, size_n)

    pack_factor = get_pack_factor(num_bits)
    assert size_k % pack_factor == 0

    orig_device = q_w.device

    q_w = q_w.cpu().numpy().astype(numpy.uint32)

    q_res = numpy.zeros((size_k // pack_factor, size_n), dtype=numpy.uint32)

    for i in range(pack_factor):
        q_res |= q_w[i::pack_factor, :] << num_bits * i

    q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
    return q_res

def gptq_unpack(
    q_res: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    pack_factor = 32 // num_bits
    assert size_k % pack_factor == 0

    orig_device = q_res.device

    q_res = q_res.cpu().numpy()

    q_w = numpy.zeros((size_k, size_n), dtype=numpy.uint32)

    for i in range(pack_factor):
        q_w[i::pack_factor, :] = (q_res >> (num_bits * i)) & ((1 << num_bits) - 1)

    q_w = torch.from_numpy(q_w.astype(numpy.int32)).to(orig_device)
    return q_w

================================================
FILE: archive/csrc/ktransformers_ext/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.16)
project(cpuinfer_ext VERSION 0.1.0)


set(CMAKE_CXX_STANDARD 17)


set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -ffast-math -fopenmp")
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})
set(CMAKE_BUILD_TYPE "Release")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -ffast-math -fopenmp")
# set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)


include(CheckCXXCompilerFlag)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)


option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)

# instruction set specific
if (LLAMA_NATIVE)
    set(INS_ENB OFF)
else()
    set(INS_ENB ON)
endif()

option(LLAMA_AVX                             "llama: enable AVX"                                OFF)
option(LLAMA_AVX2                            "llama: enable AVX2"                               OFF)
option(LLAMA_AVX512                          "llama: enable AVX512"                             OFF)
option(LLAMA_AVX512_VBMI                     "llama: enable AVX512-VBMI"                        OFF)
option(LLAMA_AVX512_VNNI                     "llama: enable AVX512-VNNI"                        OFF)
option(LLAMA_AVX512_BF16                     "llama: enable AVX512-BF16"                        OFF)
option(LLAMA_FMA                             "llama: enable FMA"                                OFF)
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)
    option(LLAMA_F16C                        "llama: enable F16C"                               OFF)
endif()
option(LLAMA_AVX512_FANCY_SIMD               "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI"                        OFF)
option(KTRANSFORMERS_USE_CUDA                "ktransformers: use CUDA"                          ON)
option(KTRANSFORMERS_USE_MUSA                "ktransformers: use MUSA"                          OFF)
option(KTRANSFORMERS_USE_ROCM                "ktransformers: use ROCM"                          OFF)
option(KTRANSFORMERS_USE_XPU                 "ktransformers: use XPU"                           OFF)
option(KTRANSFORMERS_USE_NPU                 "ktransformers: use NPU"                           OFF)

if(KTRANSFORMERS_USE_NPU)
    add_definitions(-DKTRANSFORMERS_USE_NPU=1)
endif()

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
#       feel free to update the Makefile for your architecture and send a pull request or issue
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if (MSVC)
    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
else ()
    set(CMAKE_GENERATOR_PLATFORM_LWR "")
endif ()

if (NOT MSVC)
    if (LLAMA_STATIC)
        add_link_options(-static)
        if (MINGW)
            add_link_options(-static-libgcc -static-libstdc++)
        endif()
    endif()
    if (LLAMA_GPROF)
        add_compile_options(-pg)
    endif()
endif()

set(ARCH_FLAGS "")

if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
    (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
     CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
    message(STATUS "ARM detected")
    if (MSVC)
        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
        add_compile_definitions(__ARM_NEON)
        add_compile_definitions(__ARM_FEATURE_FMA)

        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
        if (GGML_COMPILER_SUPPORT_DOTPROD)
            add_compile_definitions(__ARM_FEATURE_DOTPROD)
        endif ()
        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
        endif ()
        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
    else()
        if(KTRANSFORMERS_USE_NPU)
            list(APPEND ARCH_FLAGS -march=armv8.2-a+fp16+fp16fml+dotprod -lnuma)
        endif()
        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
            # Raspberry Pi 1, Zero
            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
                # Android armeabi-v7a
                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
            else()
                # Raspberry Pi 2
                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
            endif()
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
            # Android arm64-v8a
            # Raspberry Pi 3, 4, Zero 2 (32-bit)
            list(APPEND ARCH_FLAGS -mno-unaligned-access)
        endif()
    endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
    message(STATUS "x86 detected")
    if(NOT KTRANSFORMERS_USE_NPU)
        set(HOST_IS_X86 TRUE)
        set(HAS_AVX512 TRUE)
        set(__HAS_AMX__ TRUE)
        add_compile_definitions(__x86_64__)
        # check AVX512
        execute_process(
            COMMAND lscpu
            OUTPUT_VARIABLE LSCPU_OUTPUT
            OUTPUT_STRIP_TRAILING_WHITESPACE
        )
        # message(STATUS "LSCPU_OUTPUT: ${LSCPU_OUTPUT}")
    
        string(FIND "${LSCPU_OUTPUT}" "avx512" COMPILER_SUPPORTS_AVX512F)
        
        if (COMPILER_SUPPORTS_AVX512F GREATER -1)
            message(STATUS "Compiler and CPU support AVX512F (tested by compiling a program)")
            add_compile_definitions(__HAS_AVX512F__)
        else()
            message(STATUS "Compiler and/or CPU do NOT support AVX512F")
            set(HAS_AVX512 False)
        endif()
    
        # check AMX
        string(FIND "${LSCPU_OUTPUT}" "amx" COMPILER_SUPPORTS_AMX)
        
        if(COMPILER_SUPPORTS_AMX GREATER -1)
            message(STATUS "Compiler supports AMX")
            add_compile_definitions(__HAS_AMX__)
        else()
            message(STATUS "Compiler does NOT support AMX")
        endif()
    endif()
    if (MSVC)
        # instruction set detection for MSVC only
        if (LLAMA_NATIVE)
            include(cmake/FindSIMD.cmake)
        endif ()
        if (LLAMA_AVX512)
            list(APPEND ARCH_FLAGS /arch:AVX512)
            # MSVC has no compile-time flags enabling specific
            # AVX512 extensions, neither it defines the
            # macros corresponding to the extensions.
            # Do it manually.
            if (LLAMA_AVX512_VBMI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
            endif()
            if (LLAMA_AVX512_VNNI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
            if (LLAMA_AVX512_FANCY_SIMD)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VL__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VL__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BW__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BW__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512DQ__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512DQ__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
            if (LLAMA_AVX512_BF16)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
            endif()
        elseif (LLAMA_AVX2)
            list(APPEND ARCH_FLAGS /arch:AVX2)
        elseif (LLAMA_AVX)
            list(APPEND ARCH_FLAGS /arch:AVX)
        endif()
    else()
        if (LLAMA_NATIVE)
            list(APPEND ARCH_FLAGS -mfma -mavx -mavx2)
            list(APPEND ARCH_FLAGS -march=native)
        endif()
        if (LLAMA_F16C)
            list(APPEND ARCH_FLAGS -mf16c)
        endif()
        if (LLAMA_FMA)
            list(APPEND ARCH_FLAGS -mfma)
        endif()
        if (LLAMA_AVX)
            list(APPEND ARCH_FLAGS -mavx)
        endif()
        if (LLAMA_AVX2)
            list(APPEND ARCH_FLAGS -mavx2)
        endif()
        if (LLAMA_AVX512)
            list(APPEND ARCH_FLAGS -mavx512f)
            list(APPEND ARCH_FLAGS -mavx512bw)
        endif()
        if (LLAMA_AVX512_VBMI)
            list(APPEND ARCH_FLAGS -mavx512vbmi)
        endif()
        if (LLAMA_AVX512_VNNI)
            list(APPEND ARCH_FLAGS -mavx512vnni)
        endif()
        if (LLAMA_AVX512_FANCY_SIMD)
            message(STATUS "AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI enabled")
            list(APPEND ARCH_FLAGS -mavx512vl)
            list(APPEND ARCH_FLAGS -mavx512bw)
            list(APPEND ARCH_FLAGS -mavx512dq)
            list(APPEND ARCH_FLAGS -mavx512vnni)
            list(APPEND ARCH_FLAGS -mavx512vpopcntdq)
        endif()
        if (LLAMA_AVX512_BF16)
            list(APPEND ARCH_FLAGS -mavx512bf16)
        endif()
    endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
    message(STATUS "PowerPC detected")
    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
    else()
        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
    endif()
else()
    message(STATUS "Unknown architecture")
endif()

# message(STATUS "CUDAToolkit_ROOT:${CUDAToolkit_ROOT}")
# find_package(FindCUDAToolkit REQUIRED)
# if(CUDAToolkit_FOUND)
#     message(STATUS "Found CUDA cudart lib at:${CUDAToolkit_LIBRARY_DIR}")
# else()
#     message(STATUS "Can't found CUDA lib")
# endif()

if (NOT EXISTS $ENV{ROCM_PATH})
    if (NOT EXISTS /opt/rocm)
        set(ROCM_PATH /usr)
    else()
        set(ROCM_PATH /opt/rocm)
    endif()
else()
    set(ROCM_PATH $ENV{ROCM_PATH})
endif()

list(APPEND CMAKE_PREFIX_PATH  ${ROCM_PATH})
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")

if (NOT EXISTS $ENV{MUSA_PATH})
    if (NOT EXISTS /opt/musa)
        set(MUSA_PATH /usr/local/musa)
    else()
        set(MUSA_PATH /opt/musa)
    endif()
else()
    set(MUSA_PATH $ENV{MUSA_PATH})
endif()

list(APPEND CMAKE_MODULE_PATH "${MUSA_PATH}/cmake")

add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")

add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/third_party/pybind11)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llama.cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/llama.cpp)

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
if (WIN32)
    include_directories("$ENV{CUDA_PATH}/include")
    add_compile_definitions(KTRANSFORMERS_USE_CUDA=1)
elseif (UNIX)
    if (KTRANSFORMERS_USE_ROCM)
        find_package(HIP REQUIRED)
        if(HIP_FOUND)
            include_directories("${HIP_INCLUDE_DIRS}")
            add_compile_definitions(KTRANSFORMERS_USE_ROCM=1)
        endif()
    elseif (KTRANSFORMERS_USE_MUSA)
        if (NOT EXISTS $ENV{MUSA_PATH})
            if (NOT EXISTS /opt/musa)
                set(MUSA_PATH /usr/local/musa)
            else()
                set(MUSA_PATH /opt/musa)
            endif()
        else()
            set(MUSA_PATH $ENV{MUSA_PATH})
        endif()

        list(APPEND CMAKE_MODULE_PATH "${MUSA_PATH}/cmake")

        find_package(MUSAToolkit)
        if (MUSAToolkit_FOUND)
            message(STATUS "MUSA Toolkit found")
            add_compile_definitions(KTRANSFORMERS_USE_MUSA=1)
        endif()
    elseif (KTRANSFORMERS_USE_XPU)
        add_compile_definitions(KTRANSFORMERS_USE_XPU=1)
    elseif (KTRANSFORMERS_USE_CUDA)
        find_package(CUDA REQUIRED)
        include_directories("${CUDA_INCLUDE_DIRS}")
        include(CheckLanguage)
        check_language(CUDA)
        if(CMAKE_CUDA_COMPILER)
            message(STATUS "CUDA detected")
            find_package(CUDAToolkit REQUIRED)
            include_directories(${CUDAToolkit_INCLUDE_DIRS})
        endif()
        message(STATUS "enabling CUDA")
        enable_language(CUDA)
        add_compile_definitions(KTRANSFORMERS_USE_CUDA=1)
    endif()
endif()

aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} SOURCE_DIR1)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/llamafile SOURCE_DIR3)
# aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llamafile SOURCE_DIR4)
file(GLOB LLAMAFILE_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llamafile/*.cpp")
list(REMOVE_ITEM LLAMAFILE_SOURCES
    "${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llamafile/sgemm_arm.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llamafile/sgemm_x86.cpp"
)
set(SOURCE_DIR4 ${LLAMAFILE_SOURCES})
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/kvcache SOURCE_DIR5)

if (HOST_IS_X86 AND HAS_AVX512 AND __HAS_AMX__)
    aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/amx SOURCE_DIR6)
endif()


set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4} ${SOURCE_DIR5} ${SOURCE_DIR6})

file(GLOB_RECURSE FMT_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h")

add_custom_target(
    format
    COMMAND clang-format
    -i
    -style=file
    ${FMT_SOURCES}
    COMMENT "Running clang-format on all source files"
)


add_library(llamafile STATIC ${SOURCE_DIR4})

message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
message(STATUS "ARCH_FLAGS: ${ARCH_FLAGS}")
pybind11_add_module(${PROJECT_NAME} MODULE ${ALL_SOURCES})
target_link_libraries(${PROJECT_NAME} PRIVATE llama)


if(WIN32)
    target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_PATH}/lib/x64/cudart.lib")#CUDA::cudart
elseif(UNIX)
    if (KTRANSFORMERS_USE_ROCM)
        add_compile_definitions(USE_HIP=1)
        target_link_libraries(${PROJECT_NAME} PRIVATE "${ROCM_PATH}/lib/libamdhip64.so")
        message(STATUS "Building for HIP")
    elseif(KTRANSFORMERS_USE_MUSA)
        target_link_libraries(${PROJECT_NAME} PRIVATE MUSA::musart)
    elseif(KTRANSFORMERS_USE_XPU)
    elseif(KTRANSFORMERS_USE_CUDA AND NOT KTRANSFORMERS_USE_MUSA)
        target_link_libraries(${PROJECT_NAME} PRIVATE "${CUDAToolkit_LIBRARY_DIR}/libcudart.so")
    endif()
endif()

# Define the USE_NUMA option
option(USE_NUMA "Disable NUMA support" OFF)

# Check if the USE_NUMA environment variable is set
if(DEFINED ENV{USE_NUMA})
    set(USE_NUMA ON)
endif()

if(USE_NUMA)
    message(STATUS "NUMA support is enabled")
else()
    message(STATUS "NUMA support is disabled")
endif()

find_library(NUMA_LIBRARY NAMES numa)

if(NUMA_LIBRARY AND USE_NUMA)
    message(STATUS "NUMA library found: ${NUMA_LIBRARY} - enabling NUMA support")
    target_link_libraries(${PROJECT_NAME} PRIVATE ${NUMA_LIBRARY})
    target_compile_definitions(${PROJECT_NAME} PRIVATE USE_NUMA)
else()
    if(USE_NUMA)
        message(FATAL_ERROR "NUMA library not found - maybe sudo apt install libnuma-dev")
    else()
        message(STATUS "NUMA library not found or user not set USE_NUMA - disabling NUMA support")
    endif()
endif()


================================================
FILE: archive/csrc/ktransformers_ext/bench/bench_attention.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :  
Author       : Jianwei Dong
Date         : 2024-08-28 10:32:05
Version      : 1.0.0
LastEditors  : Jianwei Dong 
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
import torch

layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1

anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 64
max_batch_size: int = 1
max_block_num: int = 1024
CPUInfer = cpuinfer_ext.CPUInfer(max_thread_num)

warm_up_iter = 1000
test_iter = 10000


def bench_linear(cache_seqlen: int):
    with torch.inference_mode(mode=True):
        cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
        seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")

        config = cpuinfer_ext.kvcache.KVCacheConfig(
            layer_num,
            kv_head_num,
            q_head_num,
            head_dim,
            block_len,
            anchor_num,
            anchor_type,
            kv_type,
            retrieval_type,
            layer_step,
            token_step,
            layer_offset,
            max_block_num,
            max_batch_size,
            max_thread_num,
        )
        local_kvcache = cpuinfer_ext.kvcache.KVCache(config)
        block_table = (
            torch.arange(max_block_num, dtype=torch.int32, device="cpu")
            .contiguous()
            .view(1, -1)
        )

        for layer_idx in range(layer_num):
            k_cache = torch.randn(
                (1, cache_seqlen, kv_head_num, head_dim),
                dtype=torch.float16,
                device="cpu",
            ).contiguous()
            v_cache = torch.randn(
                (1, cache_seqlen, kv_head_num, head_dim),
                dtype=torch.float16,
                device="cpu",
            ).contiguous()

            CPUInfer.submit(
                local_kvcache.update_kvcache_fp16(
                    k_cache.data_ptr(),
                    v_cache.data_ptr(),
                    layer_idx,
                    block_table.data_ptr(),
                    1,
                    max_block_num,
                    seqlens_zero.data_ptr(),
                    cache_seqlen,
                )
            )
            CPUInfer.sync()

        input = torch.randn(
            (1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
        ).contiguous()
        output = torch.empty(
            (1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
        ).contiguous()

        # attn_lse: (bsz, q_len, q_head_num)
        attn_lse = torch.empty(
            (1, 1, q_head_num), dtype=torch.float32, device="cpu"
        ).contiguous()
        input = input / 100

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                local_kvcache.attn(
                    input.data_ptr(),
                    output.data_ptr(),
                    attn_lse.data_ptr(),
                    i % layer_num,
                    0,
                    1,
                    1,
                    max_block_num,
                    block_table.data_ptr(),
                    cache_seqlens.data_ptr(),
                    -1,
                    -1,
                    -1,
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                local_kvcache.attn(
                    input.data_ptr(),
                    output.data_ptr(),
                    attn_lse.data_ptr(),
                    i % layer_num,
                    0,
                    1,
                    1,
                    max_block_num,
                    block_table.data_ptr(),
                    cache_seqlens.data_ptr(),
                    -1,
                    -1,
                    -1,
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print("cache sequence length: ", cache_seqlen)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", total_time / test_iter * 1000000)
        print(
            "Bandwidth: ",
            cache_seqlen
            * kv_head_num
            * head_dim
            * 2
            * 2
            * test_iter
            / total_time
            / 1000
            / 1000
            / 1000,
            "GB/s",
        )
        print("")


bench_linear(1024)
bench_linear(4096)
bench_linear(16384)
bench_linear(32768)
bench_linear(65536)


================================================
FILE: archive/csrc/ktransformers_ext/bench/bench_attention_torch.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :  
Author       : Jianwei Dong
Date         : 2024-08-28 10:32:05
Version      : 1.0.0
LastEditors  : Jianwei Dong 
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
import torch

layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
warm_up_iter = 1000
test_iter = 10000


def bench_linear(cache_seqlen: int, device):
    with torch.inference_mode(mode=True):

        kvcaches = []

        for layer_idx in range(layer_num):
            k_cache = torch.randn(
                (1, 32, cache_seqlen, head_dim),
                dtype=torch.float16,
                device=device,
            ).contiguous()
            v_cache = torch.randn(
                (1, 32, cache_seqlen, head_dim),
                dtype=torch.float16,
                device=device,
            ).contiguous()

            kvcaches.append((k_cache, v_cache))

        input = torch.randn(
            (1, q_head_num, 1, head_dim), dtype=torch.float16, device=device
        ).contiguous()
        input = input / 100

        # warm up
        for i in range(warm_up_iter):
            k_cache = kvcaches[i % layer_num][0]
            v_cache = kvcaches[i % layer_num][1]
            torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            k_cache = kvcaches[i % layer_num][0]
            v_cache = kvcaches[i % layer_num][1]
            torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)
        end = time.perf_counter()
        total_time = end - start
        print("cache sequence length: ", cache_seqlen)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", total_time / test_iter * 1000000)
        print(
            "Bandwidth: ",
            cache_seqlen
            * q_head_num
            * head_dim
            * 2
            * 2
            * test_iter
            / total_time
            / 1000
            / 1000
            / 1000,
            "GB/s",
        )
        print("")


bench_linear(1024, "cpu")
bench_linear(4096, "cpu")
bench_linear(1024, "cuda")
bench_linear(4096, "cuda")
bench_linear(16384, "cuda")
bench_linear(32768, "cuda")
bench_linear(65536, "cuda")


================================================
FILE: archive/csrc/ktransformers_ext/bench/bench_linear.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:31:59
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:35:35
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

input_size = 16384
output_size = 5120
stride = 16
group_max_len = 1024
layer_num = 10
qlen = 1
CPUInfer = cpuinfer_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000

def bench_linear(quant_mode: str):
    with torch.inference_mode(mode=True):

        hidden_type = 30 # ggml_type::GGML_TYPE_BF16
        if quant_mode == "fp32":
            proj_type = 0 # ggml_type::GGML_TYPE_F32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = 1 # ggml_type::GGML_TYPE_F16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = 30 # ggml_type::GGML_TYPE_BF16
            bytes_per_elem = 2.000000
        elif quant_mode == "q8_0":
            proj_type = 8 # ggml_type::GGML_TYPE_Q8_0
            bytes_per_elem = 1.062500
        elif quant_mode == "q6_k":
            proj_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.820312
        elif quant_mode == "q5_k_m":
            proj_type = 13 # ggml_type::GGML_TYPE_Q5_K
            bytes_per_elem = 0.687500
        elif quant_mode == "q4_k_m":
            proj_type = 12 # ggml_type::GGML_TYPE_Q4_K
            bytes_per_elem = 0.562500
        elif quant_mode == "q3_k_m":
            proj_type = 11 # ggml_type::GGML_TYPE_Q3_K
            bytes_per_elem = 0.429688
        elif quant_mode == "q2_k":
            proj_type = 10 # ggml_type::GGML_TYPE_Q2_K
            bytes_per_elem = 0.328125
        elif quant_mode == "iq3_xs":
            proj_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            bytes_per_elem = 0.429688
        elif quant_mode == "iq2_xxs":
            proj_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            bytes_per_elem = 0.257812
        else:
            assert(False)

        linears = []
        projs = []
        for _ in range(layer_num):
            proj = torch.randn((output_size, input_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
            linear = cpuinfer_ext.linear.Linear(config)
            projs.append(proj)
            linears.append(linear)
        input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                linears[i % layer_num].forward(
                    qlen, 
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr()
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                linears[i % layer_num].forward(
                    qlen, 
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr()
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_linear("fp32")
bench_linear("fp16")
bench_linear("bf16")
bench_linear("q8_0")
bench_linear("q6_k")
bench_linear("q5_k_m")
bench_linear("q4_k_m")
bench_linear("q3_k_m")
bench_linear("q2_k")
# Not supported on __x86_64__
# bench_linear("iq3_xs")
# bench_linear("iq2_xxs")


================================================
FILE: archive/csrc/ktransformers_ext/bench/bench_linear_torch.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:31:59
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-07-25 10:32:48
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq

scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset

input_size = 16384
output_size = 5120
layer_num = 10
qlen = 1
warm_up_iter = 1000
test_iter = 10000

def bench_linear(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "fp32":
            proj_type = torch.float32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = torch.float16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = torch.bfloat16
            bytes_per_elem = 2.000000
        elif quant_mode == "qint8":
            proj_type = torch.qint8
            bytes_per_elem = 1.000000
        else:
            assert(False)

        projs = []
        for _ in range(layer_num):
            proj = torch.randn((output_size, input_size), dtype = torch.float32, device = "cuda").to("cpu").contiguous()
            if quant_mode == "qint8":
                proj_q = torch.quantize_per_tensor(proj, scale, zero_point, torch.qint8)
                quantized_layer = nnq.Linear(input_size, output_size)
                quantized_layer.set_weight_bias(proj_q, None)
                projs.append(quantized_layer)
            else:
                projs.append(proj.to(proj_type))
        input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            if isinstance(projs[i % layer_num], nnq.Linear):
                input_q = torch.quantize_per_tensor(input[i % layer_num].to(torch.float32), scale, zero_point, torch.quint8)
                t_output = projs[i % layer_num](input_q)
            else:
                t_output = torch.mm(input[i % layer_num].to(proj_type), projs[i % layer_num].t())

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            if isinstance(projs[i % layer_num], nnq.Linear):
                input_q = torch.quantize_per_tensor(input[i % layer_num].to(torch.float32), scale, zero_point, torch.quint8)
                t_output = projs[i % layer_num](input_q)
            else:
                t_output = torch.mm(input[i % layer_num].to(proj_type), projs[i % layer_num].t())
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_linear("fp32")
bench_linear("fp16")
bench_linear("bf16")
bench_linear("qint8")


================================================
FILE: archive/csrc/ktransformers_ext/bench/bench_mlp.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-16 10:43:18
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:36:04
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

hidden_size = 5120
intermediate_size = 3072
stride = 16
group_max_len = 1024
layer_num = 10
qlen = 1
CPUInfer = cpuinfer_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000

def bench_mlp(quant_mode: str):
    with torch.inference_mode(mode=True):

        hidden_type = 30 # ggml_type::GGML_TYPE_BF16
        if quant_mode == "fp32":
            gate_type = 0 # ggml_type::GGML_TYPE_F32
            up_type = 0 # ggml_type::GGML_TYPE_F32
            down_type = 0 # ggml_type::GGML_TYPE_F32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            gate_type = 1 # ggml_type::GGML_TYPE_F16
            up_type = 1 # ggml_type::GGML_TYPE_F16
            down_type = 1 # ggml_type::GGML_TYPE_F16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            gate_type = 30 # ggml_type::GGML_TYPE_BF16
            up_type = 30 # ggml_type::GGML_TYPE_BF16
            down_type = 30 # ggml_type::GGML_TYPE_BF16
            bytes_per_elem = 2.000000
        elif quant_mode == "q8_0":
            gate_type = 8 # ggml_type::GGML_TYPE_Q8_0
            up_type = 8 # ggml_type::GGML_TYPE_Q8_0
            down_type = 8 # ggml_type::GGML_TYPE_Q8_0
            bytes_per_elem = 1.062500
        elif quant_mode == "q6_k":
            gate_type = 14 # ggml_type::GGML_TYPE_Q6_K
            up_type = 14 # ggml_type::GGML_TYPE_Q6_K
            down_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.820312
        elif quant_mode == "q5_k_m":
            gate_type = 13 # ggml_type::GGML_TYPE_Q5_K
            up_type = 13 # ggml_type::GGML_TYPE_Q5_K
            down_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.731771
        elif quant_mode == "q4_k_m":
            gate_type = 12 # ggml_type::GGML_TYPE_Q4_K
            up_type = 12 # ggml_type::GGML_TYPE_Q4_K
            down_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.648437
        elif quant_mode == "q3_k_m":
            gate_type = 11 # ggml_type::GGML_TYPE_Q3_K
            up_type = 11 # ggml_type::GGML_TYPE_Q3_K
            down_type = 13 # ggml_type::GGML_TYPE_Q5_K
            bytes_per_elem = 0.515625
        elif quant_mode == "q2_k":
            gate_type = 10 # ggml_type::GGML_TYPE_Q2_K
            up_type = 10 # ggml_type::GGML_TYPE_Q2_K
            down_type = 11 # ggml_type::GGML_TYPE_Q3_K
            bytes_per_elem = 0.328125
        elif quant_mode == "iq3_xs":
            gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            up_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            down_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            bytes_per_elem = 0.429688
        elif quant_mode == "iq2_xxs":
            gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            up_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            down_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            bytes_per_elem = 0.257812
        else:
            assert(False)


        mlps = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
            mlp = cpuinfer_ext.mlp.MLP(config)
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            mlps.append(mlp)
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                mlps[i % layer_num].forward( 
                    qlen, 
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr()
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                mlps[i % layer_num].forward( 
                    qlen, 
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr()
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_mlp("fp32")
bench_mlp("fp16")
bench_mlp("bf16")
bench_mlp("q8_0")
bench_mlp("q6_k")
bench_mlp("q5_k_m")
bench_mlp("q4_k_m")
bench_mlp("q3_k_m")
bench_mlp("q2_k")
# Not supported on __x86_64__
# bench_linear("iq3_xs")
# bench_linear("iq2_xxs")


================================================
FILE: archive/csrc/ktransformers_ext/bench/bench_mlp_torch.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-16 10:43:18
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-07-25 10:32:53
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq

scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset

hidden_size = 5120
intermediate_size = 3072
layer_num = 10
qlen = 1
warm_up_iter = 1000
test_iter = 10000

def act_fn(x):
    return x / (1.0 + torch.exp(-x))

def mlp_torch(input, gate_proj, up_proj, down_proj):
    if isinstance(gate_proj, nnq.Linear):
        input_q = torch.quantize_per_tensor(input.to(torch.float32), scale, zero_point, torch.quint8)
        gate_buf = gate_proj(input_q)
        up_buf = up_proj(input_q)
        gate_buf = gate_buf.dequantize()
        up_buf = up_buf.dequantize()
        intermediate = act_fn(gate_buf) * up_buf
        intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
        expert_output = down_proj(intermediate_q)
        ret = expert_output.dequantize()
    else:
        gate_buf = torch.mm(input.to(gate_proj.dtype), gate_proj.t())
        up_buf = torch.mm(input.to(up_proj.dtype), up_proj.t())
        intermediate = act_fn(gate_buf) * up_buf
        ret = torch.mm(intermediate.to(down_proj.dtype), down_proj.t())
    return ret

def bench_mlp(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "fp32":
            proj_type = torch.float32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = torch.float16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = torch.bfloat16
            bytes_per_elem = 2.000000
        elif quant_mode == "qint8":
            proj_type = torch.qint8
            bytes_per_elem = 1.000000
        else:
            assert(False)

        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            if quant_mode == "qint8":
                gate_proj_q = torch.quantize_per_tensor(gate_proj, scale, zero_point, torch.qint8)
                quantized_gate = nnq.Linear(hidden_size, intermediate_size)
                quantized_gate.set_weight_bias(gate_proj_q, None)
                up_proj_q = torch.quantize_per_tensor(up_proj, scale, zero_point, torch.qint8)
                quantized_up = nnq.Linear(hidden_size, intermediate_size)
                quantized_up.set_weight_bias(up_proj_q, None)
                down_proj_q = torch.quantize_per_tensor(down_proj, scale, zero_point, torch.qint8)
                quantized_down = nnq.Linear(intermediate_size, hidden_size)
                quantized_down.set_weight_bias(down_proj_q, None)
                gate_projs.append(quantized_gate)
                up_projs.append(quantized_up)
                down_projs.append(quantized_down)
            else:
                gate_projs.append(gate_proj.to(proj_type))
                up_projs.append(up_proj.to(proj_type))
                down_projs.append(down_proj.to(proj_type))
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            mlp_torch(input[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            mlp_torch(input[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_mlp("fp32")
bench_mlp("fp16")
bench_mlp("bf16")
bench_mlp("qint8")


================================================
FILE: archive/csrc/ktransformers_ext/bench/bench_moe.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:41:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

expert_num = 160
hidden_size = 5120
intermediate_size = 1536
stride = 16
group_min_len = 10
group_max_len = 1024
n_routed_experts = 6
layer_num = 10
qlen = 1
CPUInfer = cpuinfer_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000

def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        hidden_type = 30 # ggml_type::GGML_TYPE_BF16
        if quant_mode == "fp32":
            gate_type = 0 # ggml_type::GGML_TYPE_F32
            up_type = 0 # ggml_type::GGML_TYPE_F32
            down_type = 0 # ggml_type::GGML_TYPE_F32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            gate_type = 1 # ggml_type::GGML_TYPE_F16
            up_type = 1 # ggml_type::GGML_TYPE_F16
            down_type = 1 # ggml_type::GGML_TYPE_F16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            gate_type = 30 # ggml_type::GGML_TYPE_BF16
            up_type = 30 # ggml_type::GGML_TYPE_BF16
            down_type = 30 # ggml_type::GGML_TYPE_BF16
            bytes_per_elem = 2.000000
        elif quant_mode == "q8_0":
            gate_type = 8 # ggml_type::GGML_TYPE_Q8_0
            up_type = 8 # ggml_type::GGML_TYPE_Q8_0
            down_type = 8 # ggml_type::GGML_TYPE_Q8_0
            bytes_per_elem = 1.062500
        elif quant_mode == "q6_k":
            gate_type = 14 # ggml_type::GGML_TYPE_Q6_K
            up_type = 14 # ggml_type::GGML_TYPE_Q6_K
            down_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.820312
        elif quant_mode == "q5_k_m":
            gate_type = 13 # ggml_type::GGML_TYPE_Q5_K
            up_type = 13 # ggml_type::GGML_TYPE_Q5_K
            down_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.731771
        elif quant_mode == "q4_k_m":
            gate_type = 12 # ggml_type::GGML_TYPE_Q4_K
            up_type = 12 # ggml_type::GGML_TYPE_Q4_K
            down_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.648437
        elif quant_mode == "q3_k_m":
            gate_type = 11 # ggml_type::GGML_TYPE_Q3_K
            up_type = 11 # ggml_type::GGML_TYPE_Q3_K
            down_type = 13 # ggml_type::GGML_TYPE_Q5_K
            bytes_per_elem = 0.515625
        elif quant_mode == "q2_k":
            gate_type = 10 # ggml_type::GGML_TYPE_Q2_K
            up_type = 10 # ggml_type::GGML_TYPE_Q2_K
            down_type = 11 # ggml_type::GGML_TYPE_Q3_K
            bytes_per_elem = 0.328125
        elif quant_mode == "iq3_xs":
            gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            up_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            down_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            bytes_per_elem = 0.429688
        elif quant_mode == "iq2_xxs":
            gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            up_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            down_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            bytes_per_elem = 0.257812
        else:
            assert(False)


        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            config = cpuinfer_ext.moe.MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, stride, group_min_len, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
            moe = cpuinfer_ext.moe.MOE(config)
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)
        expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device = "cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
        weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward( 
                    qlen, 
                    n_routed_experts, 
                    expert_ids[i % layer_num].data_ptr(), 
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr()
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward( 
                    qlen, 
                    n_routed_experts, 
                    expert_ids[i % layer_num].data_ptr(), 
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr()
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_moe("fp32")
bench_moe("fp16")
bench_moe("bf16")
bench_moe("q8_0")
bench_moe("q6_k")
bench_moe("q5_k_m")
bench_moe("q4_k_m")
bench_moe("q3_k_m")
bench_moe("q2_k")
# Not supported on __x86_64__
# bench_linear("iq3_xs")
# bench_linear("iq2_xxs")


================================================
FILE: archive/csrc/ktransformers_ext/bench/bench_moe_amx.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2025-04-25 18:28:12
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2025-04-25 18:28:12
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

expert_num = 8
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
n_routed_experts = 8
layer_num = 10
qlen = 1024
CPUInfer = cpuinfer_ext.CPUInfer(65)
warm_up_iter = 100
test_iter = 100

def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "bf16":
            bytes_per_elem = 2.000000
        elif quant_mode == "int8":
            bytes_per_elem = 1.000000
        else:
            assert(False)


        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            config = cpuinfer_ext.moe.AMX_MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr())
            if quant_mode == "bf16":
                moe = cpuinfer_ext.moe.AMXBF16_MOE(config)
                CPUInfer.submit(moe.load_weights())
                CPUInfer.sync()
            elif quant_mode == "int8":
                moe = cpuinfer_ext.moe.AMXInt8_MOE(config)
                CPUInfer.submit(moe.load_weights())
                CPUInfer.sync()
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)
        expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device = "cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
        weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
        qlen_tensor = torch.tensor([qlen], dtype=torch.int32)

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward( 
                    qlen, 
                    n_routed_experts, 
                    expert_ids[i % layer_num].data_ptr(), 
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr(),
                    qlen_tensor.data_ptr()
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward( 
                    qlen, 
                    n_routed_experts, 
                    expert_ids[i % layer_num].data_ptr(), 
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr(),
                    qlen_tensor.data_ptr()
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('Flops: ', hidden_size * intermediate_size * qlen * 3 * n_routed_experts * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GFLOPS')
        print('')

bench_moe("bf16")
bench_moe("int8")


================================================
FILE: archive/csrc/ktransformers_ext/bench/bench_moe_torch.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-07-25 10:32:57
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq

scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset

expert_num = 160
hidden_size = 5120
intermediate_size = 1536
n_routed_experts = 6
layer_num = 10
qlen = 1
warm_up_iter = 1000
test_iter = 10000

def act_fn(x):
    return x / (1.0 + torch.exp(-x))

def mlp_torch(input, gate_proj, up_proj, down_proj):
    if isinstance(gate_proj, nnq.Linear):
        input_q = torch.quantize_per_tensor(input.to(torch.float32), scale, zero_point, torch.quint8)
        gate_buf = gate_proj(input_q)
        up_buf = up_proj(input_q)
        gate_buf = gate_buf.dequantize()
        up_buf = up_buf.dequantize()
        intermediate = act_fn(gate_buf) * up_buf
        intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
        expert_output = down_proj(intermediate_q)
        ret = expert_output.dequantize()
    else:
        gate_buf = torch.mm(input.to(gate_proj.dtype), gate_proj.t())
        up_buf = torch.mm(input.to(up_proj.dtype), up_proj.t())
        intermediate = act_fn(gate_buf) * up_buf
        ret = torch.mm(intermediate.to(down_proj.dtype), down_proj.t())
    return ret

def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0
    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )
    return t_output

def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "fp32":
            proj_type = torch.float32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = torch.float16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = torch.bfloat16
            bytes_per_elem = 2.000000
        elif quant_mode == "qint8":
            proj_type = torch.qint8
            bytes_per_elem = 1.000000
        else:
            assert(False)

        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            if quant_mode == "qint8":
                quantized_gate_proj = []
                quantized_up_proj = []
                quantized_down_proj = []
                for i in range(expert_num):
                    gate_proj_q = torch.quantize_per_tensor(gate_proj[i], scale, zero_point, torch.qint8)
                    quantized_gate = nnq.Linear(hidden_size, intermediate_size)
                    quantized_gate.set_weight_bias(gate_proj_q, None)
                    quantized_gate_proj.append(quantized_gate)
                    up_proj_q = torch.quantize_per_tensor(up_proj[i], scale, zero_point, torch.qint8)
                    quantized_up = nnq.Linear(hidden_size, intermediate_size)
                    quantized_up.set_weight_bias(up_proj_q, None)
                    quantized_up_proj.append(quantized_up)
                    down_proj_q = torch.quantize_per_tensor(down_proj[i], scale, zero_point, torch.qint8)
                    quantized_down = nnq.Linear(intermediate_size, hidden_size)
                    quantized_down.set_weight_bias(down_proj_q, None)
                    quantized_down_proj.append(quantized_down)
                gate_projs.append(quantized_gate_proj)
                up_projs.append(quantized_up_proj)
                down_projs.append(quantized_down_proj)
            else:
                gate_projs.append(gate_proj.to(proj_type))
                up_projs.append(up_proj.to(proj_type))
                down_projs.append(down_proj.to(proj_type))
        expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device = "cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
        weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            moe_torch(input[i % layer_num], expert_ids[i % layer_num], weights[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            moe_torch(input[i % layer_num], expert_ids[i % layer_num], weights[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_moe("fp32")
bench_moe("fp16")
bench_moe("bf16")
bench_moe("qint8")


================================================
FILE: archive/csrc/ktransformers_ext/cmake/FindSIMD.cmake
================================================
include(CheckCSourceRuns)

set(AVX_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256 a;
        a = _mm256_set1_ps(0);
        return 0;
    }
")

set(AVX512_CODE "
    #include <immintrin.h>
    int main()
    {
        __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0);
        __m512i b = a;
        __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
        return 0;
    }
")

set(AVX2_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256i a = {0};
        a = _mm256_abs_epi16(a);
        __m256i x;
        _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
        return 0;
    }
")

set(FMA_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256 acc = _mm256_setzero_ps();
        const __m256 d = _mm256_setzero_ps();
        const __m256 p = _mm256_setzero_ps();
        acc = _mm256_fmadd_ps( d, p, acc );
        return 0;
    }
")

macro(check_sse type flags)
    set(__FLAG_I 1)
    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
    foreach (__FLAG ${flags})
        if (NOT ${type}_FOUND)
            set(CMAKE_REQUIRED_FLAGS ${__FLAG})
            check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
            if (HAS_${type}_${__FLAG_I})
                set(${type}_FOUND TRUE CACHE BOOL "${type} support")
                set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
            endif()
            math(EXPR __FLAG_I "${__FLAG_I}+1")
        endif()
    endforeach()
    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})

    if (NOT ${type}_FOUND)
        set(${type}_FOUND FALSE CACHE BOOL "${type} support")
        set(${type}_FLAGS "" CACHE STRING "${type} flags")
    endif()

    mark_as_advanced(${type}_FOUND ${type}_FLAGS)
endmacro()

# flags are for MSVC only!
check_sse("AVX" " ;/arch:AVX")
if (NOT ${AVX_FOUND})
    set(LLAMA_AVX OFF)
else()
    set(LLAMA_AVX ON)
endif()

check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2")
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
    set(LLAMA_AVX2 OFF)
else()
    set(LLAMA_AVX2 ON)
endif()

check_sse("AVX512" " ;/arch:AVX512")
if (NOT ${AVX512_FOUND})
    set(LLAMA_AVX512 OFF)
else()
    set(LLAMA_AVX512 ON)
endif()


================================================
FILE: archive/csrc/ktransformers_ext/cpu_backend/backend.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:05
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:33:34
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "backend.h"

#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>

thread_local int Backend::numa_node = -1;
#endif

thread_local int Backend::thread_local_id = -1;

Backend::Backend(int max_thread_num) {
    max_thread_num_ = max_thread_num;
    thread_state_.resize(max_thread_num_);
    for (int i = 0; i < max_thread_num_; i++) {
        thread_state_[i].curr = std::make_unique<std::atomic<int>>();
        thread_state_[i].status =
            std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
    }
    workers_.resize(max_thread_num_);
    for (int i = 1; i < max_thread_num_; i++) {
        workers_[i] = std::thread(&Backend::worker_thread, this, i);
    }
}

Backend::~Backend() {
    for (int i = 0; i < max_thread_num_; i++) {
        thread_state_[i].status->store(ThreadStatus::EXIT,
                                       std::memory_order_release);
    }
    for (int i = 1; i < max_thread_num_; i++) {
        if (workers_[i].joinable()) {
            workers_[i].join();
        }
    }
}

int Backend::get_thread_num() { return max_thread_num_; }

void Backend::do_work_stealing_job(int task_num,
                                   std::function<void(int)> init_func,
                                   std::function<void(int)> compute_func,
                                   std::function<void(int)> finalize_func) {
    init_func_ = init_func;
    compute_func_ = compute_func;
    finalize_func_ = finalize_func;
#ifdef USE_NUMA
    // numa node location will be calculated based on the number of threads
    thread_num_ = max_thread_num_;
#else
    thread_num_ = std::min(max_thread_num_, task_num);
#endif
    int base = task_num / thread_num_;
    int remain = task_num % thread_num_;
    thread_state_[0].end = base + (0 < remain);

    // 为主线程设置 thread_local_id
    thread_local_id = 0;

    for (int i = 1; i < thread_num_; i++) {
        thread_state_[i].curr->store(thread_state_[i - 1].end,
                                     std::memory_order_relaxed);
        thread_state_[i].end = thread_state_[i - 1].end + base + (i < remain);
        thread_state_[i].status->store(ThreadStatus::WORKING,
                                       std::memory_order_release);
    }
    thread_state_[0].curr->store(0, std::memory_order_relaxed);
    thread_state_[0].status->store(ThreadStatus::WORKING,
                                   std::memory_order_release);
    process_tasks(0);
    for (int i = 1; i < thread_num_; i++) {
        while (thread_state_[i].status->load(std::memory_order_acquire) ==
               ThreadStatus::WORKING) {
        }
    }
}

void Backend::process_tasks(int thread_id) {
    
    #ifdef USE_NUMA
    if(numa_node == -1){
        numa_node = thread_id * numa_num_configured_nodes() / thread_num_;
        struct bitmask* mask = numa_bitmask_alloc(numa_num_configured_nodes());
        numa_bitmask_setbit(mask, numa_node);
        numa_bind(mask);
    }
    #endif

    if (init_func_ != nullptr) {
        init_func_(thread_id);
    }
    while (true) {
        int task_id = thread_state_[thread_id].curr->fetch_add(
            1, std::memory_order_acq_rel);
        if (task_id >= thread_state_[thread_id].end) {
            break;
        }
        compute_func_(task_id);
    }
    for (int t_offset = 1; t_offset < thread_num_; t_offset++) {
        int t_i = (thread_id + t_offset) % thread_num_;
        if (thread_state_[t_i].status->load(std::memory_order_acquire) !=
            ThreadStatus::WORKING) {
            continue;
        }
        while (true) {
            int task_id = thread_state_[t_i].curr->fetch_add(
                1, std::memory_order_acq_rel);
            if (task_id >= thread_state_[t_i].end) {
                break;
            }
            compute_func_(task_id);
        }
    }
    if (finalize_func_ != nullptr) {
        finalize_func_(thread_id);
    }
    thread_state_[thread_id].status->store(ThreadStatus::WAITING,
                                           std::memory_order_release);
}

void Backend::worker_thread(int thread_id) {
    auto start = std::chrono::steady_clock::now();
    thread_local_id = thread_id; // 设置线程本地变量
    while (true) {
        ThreadStatus status =
            thread_state_[thread_id].status->load(std::memory_order_acquire);
        if (status == ThreadStatus::WORKING) {
            process_tasks(thread_id);
            start = std::chrono::steady_clock::now();
        } else if (status == ThreadStatus::WAITING) {
            auto now = std::chrono::steady_clock::now();
            auto duration =
                std::chrono::duration_cast<std::chrono::milliseconds>(now -
                                                                      start)
                    .count();
            if (duration > 50) {
                std::this_thread::sleep_for(std::chrono::milliseconds(1));
            }
        } else if (status == ThreadStatus::EXIT) {
            return;
        }
    }
}

================================================
FILE: archive/csrc/ktransformers_ext/cpu_backend/backend.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:05
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:33:38
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_BACKEND_H
#define CPUINFER_BACKEND_H

#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

enum ThreadStatus {
    WORKING,
    WAITING,
    EXIT,
};

struct ThreadState {
    std::unique_ptr<std::atomic<ThreadStatus>> status;
    std::unique_ptr<std::atomic<int>> curr;
    int end;
};

class Backend {
  public:
    Backend(int);
    ~Backend();
    int get_thread_num();
    void do_work_stealing_job(int, std::function<void(int)>,
                              std::function<void(int)>,
                              std::function<void(int)>);
    #ifdef USE_NUMA
    static thread_local int numa_node;
    #endif
    static thread_local int thread_local_id;

  private:
    int thread_num_;
    int max_thread_num_;
    std::vector<ThreadState> thread_state_; // [thread_num]
    std::function<void(int)> init_func_;
    std::function<void(int)> compute_func_;
    std::function<void(int)> finalize_func_;
    std::vector<std::thread> workers_;

    void process_tasks(int);
    void worker_thread(int);
};
#endif

================================================
FILE: archive/csrc/ktransformers_ext/cpu_backend/cpuinfer.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-16 10:43:18
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-08-07 09:47:43
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
 #ifndef CPUINFER_CPUINFER_H
 #define CPUINFER_CPUINFER_H
 
 #include <atomic>
 #include <condition_variable>
 #include <functional>
 #include <mutex>
 #include <queue>
 #include <thread>
 #include <vector>
 #include <stdexcept>
 #ifdef KTRANSFORMERS_USE_CUDA
 #include "vendors/cuda.h"
 #elif KTRANSFORMERS_USE_MUSA
 #include "vendors/musa.h"
 #elif KTRANSFORMERS_USE_ROCM
 #define __HIP_PLATFORM_AMD__
 #include "vendors/hip.h"
 #endif
 
 #include "backend.h"
 #include "task_queue.h"
 #include "./vendors/vendor.h"
 
 #include "llama.cpp/ggml-impl.h"
 
 class CPUInfer {
    public:
     CPUInfer(int thread_num) {
         backend_ = new Backend(thread_num - 1);
         task_queue_ = new TaskQueue();
         for (int i = 0; i < (1 << 16); ++i) {
             ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i);
         }
     }
 
     ~CPUInfer() {
         delete backend_;
         delete task_queue_;
     }
 
     template <typename Func, typename Obj, typename... Args>
     void enqueue(Func f, Obj* obj, Args... args) {
         task_queue_->enqueue([=]() {
             std::invoke(f, *obj, args..., backend_);
         });
     }
 
     void submit(std::pair<intptr_t, intptr_t> params) {
         void (*func)(void*) = (void (*)(void*))params.first;
         void* args = (void*)params.second;
         *((CPUInfer**)args) = this;
         func(args);
     }
 
     void sync() {
         task_queue_->sync();
     }
 
     void submit_with_cuda_stream(intptr_t user_cuda_stream, std::pair<intptr_t, intptr_t> params) {
        #if defined(KTRANSFORMERS_USE_CUDA) || defined(KTRANSFORMERS_USE_MUSA) || defined(KTRANSFORMERS_USE_ROCM)
         void (*func)(void*) = (void (*)(void*))params.first;
         void* args = (void*)params.second;
         *((CPUInfer**)args) = this;
         cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)func, args);
        #else
         throw std::runtime_error("submit_with_cuda_stream is not supported on this platforma");
        #endif
     }
 
     static void sync_(void* cpu_infer_ptr) {
         CPUInfer* cpuinfer = (CPUInfer*)cpu_infer_ptr;
         cpuinfer->sync();
     }
 
     void sync_with_cuda_stream(intptr_t user_cuda_stream) {
        #if defined(KTRANSFORMERS_USE_CUDA) || defined(KTRANSFORMERS_USE_MUSA) || defined(KTRANSFORMERS_USE_ROCM)
         cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)&sync_, (void*)this);
        #else
         throw std::runtime_error("sync_with_cuda_stream is not supported on this platforma");
        #endif
     }
 
    public:
     Backend* backend_;
     TaskQueue* task_queue_;
 };
 
 #endif

================================================
FILE: archive/csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-08-05 04:49:08
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022 
 * @LastEditTime : 2024-08-05 09:21:29
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "shared_mem_buffer.h"
#include <cstdio>

SharedMemBuffer::SharedMemBuffer() {
    buffer_ = nullptr;
    size_ = 0;
}

SharedMemBuffer::~SharedMemBuffer() {
    if (buffer_) {
        free(buffer_);
    }
}

void SharedMemBuffer::alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests) {
    uint64_t size = 0;
    for (auto& request : requests) {
        size += request.second;
    }
    if (size > size_) {
        if (buffer_) {
            free(buffer_);
        }
        buffer_ = std::aligned_alloc(64, size);

        size_ = size;
        for (auto& obj_requests : hist_requests_) {
            for (auto& requests : obj_requests.second) {
                arrange(requests);
            }
        }
    }
    arrange(requests);
    hist_requests_[object].push_back(requests);
}

void SharedMemBuffer::dealloc(void* object) {
    hist_requests_.erase(object);
}

void SharedMemBuffer::arrange(std::vector<std::pair<void**, uint64_t>> requests) {
    uint64_t offset = 0;
    for (auto& request : requests) {
        *(request.first) = (uint8_t*)buffer_ + offset;
        offset += request.second;
    }
}

================================================
FILE: archive/csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-08-05 04:49:08
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022 
 * @LastEditTime : 2024-08-05 06:36:41
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

 #ifndef CPUINFER_SHAREDMEMBUFFER_H
 #define CPUINFER_SHAREDMEMBUFFER_H
 
 #include <cstdint>
 #include <cstdlib>
 #include <map>
 #include <vector>
 
 class SharedMemBuffer {
    public:
     SharedMemBuffer();
     ~SharedMemBuffer();
 
     void alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests);
     void dealloc(void* object);
 
    private:
     void* buffer_;
     uint64_t size_;
     std::map<void*, std::vector<std::vector<std::pair<void**, uint64_t>>>> hist_requests_;
 
     void arrange(std::vector<std::pair<void**, uint64_t>> requests);
 };
 
 static SharedMemBuffer shared_mem_buffer;
 
 #endif

================================================
FILE: archive/csrc/ktransformers_ext/cpu_backend/task_queue.cpp
================================================
/**
 * @Description :
 * @Author    : chenht2022
 * @Date     : 2024-07-17 12:25:51
 * @Version   : 1.0.0
 * @LastEditors : chenht2022
 * @LastEditTime : 2024-10-09 11:08:10
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "task_queue.h"

TaskQueue::TaskQueue() {
    worker = std::thread(&TaskQueue::processTasks, this);
    sync_flag.store(true, std::memory_order_seq_cst);
    exit_flag.store(false, std::memory_order_seq_cst);
}

TaskQueue::~TaskQueue() {
    {
        mutex.lock();
        exit_flag.store(true, std::memory_order_seq_cst);
        mutex.unlock();
    }
    cv.notify_all();
    if (worker.joinable()) {
        worker.join();
    }
}

void TaskQueue::enqueue(std::function<void()> task) {
    {
        mutex.lock();
        tasks.push(task);
        sync_flag.store(false, std::memory_order_seq_cst);
        mutex.unlock();
    }
    cv.notify_one();
}

void TaskQueue::sync() {
    while (!sync_flag.load(std::memory_order_seq_cst))
        ;
}

void TaskQueue::processTasks() {
    while (true) {
        std::function<void()> task;
        {
            mutex.lock();
            cv.wait(mutex, [this]() { return !tasks.empty() || exit_flag.load(std::memory_order_seq_cst); });
            if (exit_flag.load(std::memory_order_seq_cst) && tasks.empty()) {
                return;
            }
            task = tasks.front();
            tasks.pop();
            mutex.unlock();
        }
        task();
        {
            mutex.lock();
            if (tasks.empty()) {
                sync_flag.store(true, std::memory_order_seq_cst);
            }
            mutex.unlock();
        }
    }
}

================================================
FILE: archive/csrc/ktransformers_ext/cpu_backend/task_queue.h
================================================
/**
 * @Description :
 * @Author    : chenht2022
 * @Date     : 2024-07-16 10:43:18
 * @Version   : 1.0.0
 * @LastEditors : chenht
 * @LastEditTime : 2024-10-09 11:08:07
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_TASKQUEUE_H
#define CPUINFER_TASKQUEUE_H

#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>
#ifdef _WIN32
#include <windows.h>
#endif

class custom_mutex {
   private:
#ifdef _WIN32
    CRITICAL_SECTION cs;
#else
    std::mutex mtx;
#endif

   public:
    custom_mutex() {
#ifdef _WIN32
        InitializeCriticalSection(&cs);
#else
        // No initialization required for std::mutex
#endif
    }

    ~custom_mutex() {
#ifdef _WIN32
        DeleteCriticalSection(&cs);
#endif
    }

    void lock() {
#ifdef _WIN32
        EnterCriticalSection(&cs);
#else
        mtx.lock();
#endif
    }

    void unlock() {
#ifdef _WIN32
        LeaveCriticalSection(&cs);
#else
        mtx.unlock();
#endif
    }

#ifdef _WIN32
    CRITICAL_SECTION* get_handle() {
        return &cs;
    }
#else
    std::mutex* get_handle() {
        return &mtx;
    }
#endif
};

class custom_condition_variable {
   private:
#ifdef _WIN32
    CONDITION_VARIABLE cond_var;
#else
    std::condition_variable cond_var;
#endif

   public:
    custom_condition_variable() {
#ifdef _WIN32
        InitializeConditionVariable(&cond_var);
#endif
    }

    template <typename Predicate>
    void wait(custom_mutex& mutex, Predicate pred) {
#ifdef _WIN32
        while (!pred()) {
            SleepConditionVariableCS(&cond_var, mutex.get_handle(), INFINITE);
        }
#else
        std::unique_lock<std::mutex> lock(*mutex.get_handle(), std::adopt_lock);
        cond_var.wait(lock, pred);
        lock.release();
#endif
    }

    void notify_one() {
#ifdef _WIN32
        WakeConditionVariable(&cond_var);
#else
        cond_var.notify_one();
#endif
    }

    void notify_all() {
#ifdef _WIN32
        WakeAllConditionVariable(&cond_var);
#else
        cond_var.notify_all();
#endif
    }
};

class TaskQueue {
   public:
    TaskQueue();
    ~TaskQueue();

    void enqueue(std::function<void()>);

    void sync();

   private:
    void processTasks();

    std::queue<std::function<void()>> tasks;
    custom_mutex mutex;
    custom_condition_variable cv;
    std::thread worker;
    std::atomic<bool> sync_flag;
    std::atomic<bool> exit_flag;
};
#endif

================================================
FILE: archive/csrc/ktransformers_ext/cpu_backend/vendors/README.md
================================================
## TODO

This directory can be removed after updating the version of `llama.cpp`.

================================================
FILE: archive/csrc/ktransformers_ext/cpu_backend/vendors/cuda.h
================================================
#pragma once

#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>

#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020


================================================
FILE: archive/csrc/ktransformers_ext/cpu_backend/vendors/hip.h
================================================
#pragma once

#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bfloat16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__

#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F  HIPBLAS_R_16F
#define CUDA_R_32F  HIPBLAS_R_32F
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cublasOperation_t hipblasOperation_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cuDeviceGet hipDeviceGet
#define CUdevice hipDevice_t
#define CUdeviceptr hipDeviceptr_t
#define cuMemUnmap hipMemUnmap
#define CUmemAccessDesc hipMemAccessDesc
#define cuMemAddressFree hipMemAddressFree
#define cuMemRelease hipMemRelease
#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
#define cuMemCreate hipMemCreate
#define cuMemAddressReserve hipMemAddressReserve
#define cuMemMap hipMemMap
#define cuMemSetAccess hipMemSetAccess
#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
#define CUmemAllocationProp hipMemAllocationProp
#define cuDeviceGetAttribute hipDeviceGetAttribute
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaGraphExec_t hipGraphExec_t
#define cudaGraphNode_t hipGraphNode_t
#define cudaKernelNodeParams hipKernelNodeParams
#define cudaKernelNodeParams hipKernelNodeParams
#define cudaGraphExecDestroy hipGraphExecDestroy
#define cudaGraphLaunch hipGraphLaunch
#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
#define cudaGraphNodeType hipGraphNodeType
#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
#define cudaGraphInstantiate hipGraphInstantiate
#define cudaStreamEndCapture hipStreamEndCapture
#define cudaGraphDestroy hipGraphDestroy
#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
#define cudaGraphNodeGetType hipGraphNodeGetType
#define cudaGraphGetNodes hipGraphGetNodes
#define cudaGraphExecUpdate hipGraphExecUpdate
#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
#define cudaStreamBeginCapture hipStreamBeginCapture
#define cudaGraph_t hipGraph_t
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define cudaHostFn_t hipHostFn_t
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED

#define __CUDA_ARCH__ 1300

#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
#define GCN
#endif

#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
#define CDNA
#endif

#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif

#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif

#if defined(__gfx1010__) || defined(__gfx1012__)
#define RDNA1
#endif

#ifndef __has_builtin
    #define __has_builtin(x) 0
#endif

typedef hip_bfloat16 nv_bfloat16;


================================================
FILE: archive/csrc/ktransformers_ext/cpu_backend/vendors/musa.h
================================================
#pragma once

#include <musa_runtime.h>
#include <musa.h>
#include <mublas.h>
#include <musa_bf16.h>
#include <musa_fp16.h>
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
#define CUDA_R_16F  MUSA_R_16F
#define CUDA_R_32F  MUSA_R_32F
#define cublasComputeType_t cudaDataType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy
#define cublasGemmEx mublasGemmEx
#define cublasGemmBatchedEx mublasGemmBatchedEx
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
#define cublasHandle_t mublasHandle_t
#define cublasSetMathMode mublasSetMathMode
#define cublasSetStream mublasSetStream
#define cublasSgemm mublasSgemm
#define cublasStatus_t mublasStatus_t
#define cublasOperation_t mublasOperation_t
#define cublasGetStatusString mublasStatus_to_string
#define cudaDataType_t musaDataType_t
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags
#define cudaEventDisableTiming musaEventDisableTiming
#define cudaEventRecord musaEventRecord
#define cudaEventSynchronize musaEventSynchronize
#define cudaEvent_t musaEvent_t
#define cudaEventDestroy musaEventDestroy
#define cudaFree musaFree
#define cudaFreeHost musaFreeHost
#define cudaGetDevice musaGetDevice
#define cudaGetDeviceCount musaGetDeviceCount
#define cudaGetDeviceProperties musaGetDeviceProperties
#define cudaGetErrorString musaGetErrorString
#define cudaGetLastError musaGetLastError
#define cudaHostRegister musaHostRegister
#define cudaHostRegisterPortable musaHostRegisterPortable
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
#define cudaHostUnregister musaHostUnregister
#define cudaLaunchHostFunc musaLaunchHostFunc
#define cudaMalloc musaMalloc
#define cudaMallocHost musaMallocHost
#define cudaMallocManaged musaMallocManaged
#define cudaMemcpy musaMemcpy
#define cudaMemcpyAsync musaMemcpyAsync
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
#define cudaMemcpy2DAsync musaMemcpy2DAsync
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
#define cudaMemcpyKind musaMemcpyKind
#define cudaMemset musaMemset
#define cudaMemsetAsync musaMemsetAsync
#define cudaMemGetInfo musaMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
#define cudaSetDevice musaSetDevice
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
#define cudaStreamDestroy musaStreamDestroy
#define cudaStreamFireAndForget musaStreamFireAndForget
#define cudaStreamNonBlocking musaStreamNonBlocking
#define cudaStreamPerThread musaStreamPerThread
#define cudaStreamSynchronize musaStreamSynchronize
#define cudaStreamWaitEvent musaStreamWaitEvent
#define cudaStream_t musaStream_t
#define cudaSuccess musaSuccess

// Additional mappings for MUSA virtual memory pool
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
#define CUdevice MUdevice
#define CUdeviceptr MUdeviceptr
#define CUmemAccessDesc MUmemAccessDesc
#define CUmemAllocationProp MUmemAllocationProp
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
#define cuDeviceGet muDeviceGet
#define cuDeviceGetAttribute muDeviceGetAttribute
#define cuMemAddressFree muMemAddressFree
#define cuMemAddressReserve muMemAddressReserve
#define cuMemCreate muMemCreate
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
#define cuMemMap muMemMap
#define cuMemRelease muMemRelease
#define cuMemSetAccess muMemSetAccess
#define cuMemUnmap muMemUnmap
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
#define cudaFuncSetAttribute musaFuncSetAttribute
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
#define make_cudaExtent make_musaExtent
#define make_cudaPitchedPtr make_musaPitchedPtr

// Additional mappings for MUSA graphs
#define CUDA_SUCCESS MUSA_SUCCESS
#define CUresult MUresult
#define cuGetErrorString muGetErrorString
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
#define cudaGraphDestroy musaGraphDestroy
#define cudaGraphExecDestroy musaGraphExecDestroy
#define cudaGraphExec_t musaGraphExec_t
#define cudaGraphExecUpdate musaGraphExecUpdate
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
#define cudaGraphGetNodes musaGraphGetNodes
#define cudaGraphInstantiate musaGraphInstantiate
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
#define cudaGraphLaunch musaGraphLaunch
#define cudaGraphNodeGetType musaGraphNodeGetType
#define cudaGraphNode_t musaGraphNode_t
#define cudaGraphNodeType musaGraphNodeType
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
#define cudaGraph_t musaGraph_t
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture

typedef mt_bfloat16 nv_bfloat16;


================================================
FILE: archive/csrc/ktransformers_ext/cpu_backend/vendors/vendor.h
================================================
#ifndef CPUINFER_VENDOR_VENDOR_H
#define CPUINFER_VENDOR_VENDOR_H

#ifdef USE_CUDA
#include "cuda.h"
#elif USE_HIP
#define __HIP_PLATFORM_AMD__
#include "hip.h"
#elif USE_MUSA
#include "musa.h"
#endif

#endif  // CPUINFER_VENDOR_VENDOR_H

================================================
FILE: archive/csrc/ktransformers_ext/cuda/binding.cpp
================================================
/**
 * @Description  :
 * @Author       : Azure-Tang, Boxin Zhang
 * @Date         : 2024-07-25 13:38:30
 * @Version      : 0.2.2
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/

#include "custom_gguf/ops.h"
#ifdef KTRANSFORMERS_USE_CUDA
#include "gptq_marlin/ops.h"
#endif
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
// namespace py = pybind11;

PYBIND11_MODULE(KTransformersOps, m) {

    m.def("dequantize_q8_0", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q8_0((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize q8_0 data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

    m.def("dequantize_q6_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q6_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize q6_k data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

    m.def("dequantize_q5_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q5_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize q5_k data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

    m.def("dequantize_q4_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q4_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize q4_k data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

    m.def("dequantize_q3_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q3_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize q3_k data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

    m.def("dequantize_q2_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q2_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize q2_k data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

    m.def("dequantize_iq4_xs", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_iq4_xs((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize iq4_xs data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

#ifdef KTRANSFORMERS_USE_CUDA
    m.def("gptq_marlin_gemm", &gptq_marlin_gemm, "Function to perform GEMM using Marlin quantization.",
        py::arg("a"), py::arg("b_q_weight"), py::arg("b_scales"), py::arg("g_idx"),
        py::arg("perm"), py::arg("workspace"), py::arg("num_bits"), py::arg("size_m"),
        py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"));
#endif
}

================================================
FILE: archive/csrc/ktransformers_ext/cuda/custom_gguf/dequant.cu
================================================
/*
 * @Description  :  
 * @Author       : Azure-Tang, Boxin Zhang
 * @Date         : 2024-07-25 13:38:30
 * @Version      : 0.2.2
 * Adapted from https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c
 * Copyright (c) 2023-2024 The ggml authors
 * Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
 */
#include <cuda_runtime.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
#include <cstdint>
#include <c10/cuda/CUDAGuard.h>

#ifdef __HIP_PLATFORM_AMD__
typedef __hip_bfloat16 nv_bfloat16;
#endif

__global__ void dequantize_q8_0_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
        const int8_t* cur_block = data + block_id * blk_size;
        float scale = __half2float(*((half*)cur_block));
        cur_block += 2;
        for (int i = 0; i < ele_per_blk; i++){
            output_blk[i] = scale * cur_block[i];
        }
    }
}

__global__ void dequantize_q8_0_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
        const int8_t* cur_block = data + block_id * blk_size;
        float scale = __half2float(*((half*)cur_block));
        cur_block += 2;
        for (int i = 0; i < ele_per_blk; i++) {
            output_blk[i] = __float2half(scale * cur_block[i]);
        }
    }
}

__global__ void dequantize_q8_0_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
        const int8_t* cur_block = data + block_id * blk_size;
        float scale = __half2float(*((half*)cur_block));
        cur_block += 2;
        for (int i = 0; i < ele_per_blk; i++) {
            output_blk[i] = __float2bfloat16(scale * cur_block[i]);
        }
    }
}

// __device__ void get_scale_min_k4(int j, const uint8_t * __restrict__ q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
__device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
    if (j < 4) {
        *d = q[j] & 63; *m = q[j + 4] & 63;
    } else {
        *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
        *m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
    }
}

__global__ void dequantize_q2_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 80)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 82)));

        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);

        int is = 0;
        float dl, ml;

        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
                uint8_t sc = *scales;
                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;

                scales = (uint8_t*)(data + block_id * blk_size + (is++));
                sc = *scales;

                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;

                shift += 2;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q2_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 80)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 82)));

        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);

        int is = 0;
        float dl, ml;

        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
                uint8_t sc = *scales;
                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = __float2half(dl * ((int8_t)((q[l] >> shift) & 3)) - ml);

                scales = (uint8_t*)(data + block_id * blk_size + (is++));
                sc = *scales;

                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = __float2half(dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml);

                shift += 2;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q2_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 80)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 82)));

        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);

        int is = 0;
        float dl, ml;

        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
                uint8_t sc = *scales;
                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l] >> shift) & 3)) - ml);

                scales = (uint8_t*)(data + block_id * blk_size + (is++));
                sc = *scales;

                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml);

                shift += 2;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q3_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;    
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);

        uint32_t aux[4];
        const int8_t * scales = (const int8_t*)aux;
        const float d_all = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 108)));

        const uint8_t * __restrict__ q  = (uint8_t*)(data + block_id * blk_size + 32);
        const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
        uint8_t m = 1;


        uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);

        for (int i = 0; i < 3; i++) {  
            aux[i] = 0;  
            for (int j = 0; j < 4; j++) {  
                aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
            }
        }

        uint32_t tmp = aux[2];
        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);

        int is = 0;
        float dl;
        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4));
                }

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4));
                }

                shift += 2;
                m <<= 1;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q3_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;    
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);

        uint32_t aux[4];
        const int8_t * scales = (const int8_t*)aux;
        const float d_all = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 108)));

        const uint8_t * __restrict__ q  = (uint8_t*)(data + block_id * blk_size + 32);
        const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
        uint8_t m = 1;


        uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);

        for (int i = 0; i < 3; i++) {  
            aux[i] = 0;  
            for (int j = 0; j < 4; j++) {  
                aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
            }
        }

        uint32_t tmp = aux[2];
        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);

        int is = 0;
        float dl;
        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = __float2half(dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4)));
                }

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = __float2half(dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4)));
                }

                shift += 2;
                m <<= 1;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q3_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;    
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);

        uint32_t aux[4];
        const int8_t * scales = (const int8_t*)aux;
        const float d_all = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 108)));

        const uint8_t * __restrict__ q  = (uint8_t*)(data + block_id * blk_size + 32);
        const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
        uint8_t m = 1;


        uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);

        for (int i = 0; i < 3; i++) {  
            aux[i] = 0;  
            for (int j = 0; j < 4; j++) {  
                aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
            }
        }

        uint32_t tmp = aux[2];
        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);

        int is = 0;
        float dl;
        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4)));
                }

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4)));
                }

                shift += 2;
                m <<= 1;
            }
            q += 32;
        }
    }
}


__global__ void dequantize_q4_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
        // const uint8_t * q = data[i].qs;
        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 2)));
        int is = 0;
        uint8_t sc, m;
        for (int j = 0; j < ele_per_blk; j += 64) {
            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = d1 * (q[l] & 0xF) - m1;
            for (int l = 0; l < 32; ++l) *output_blk++ = d2 * (q[l]  >> 4) - m2;
            q += 32; is += 2;
        }
    }
}

__global__ void dequantize_q4_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
        // const uint8_t * q = data[i].qs;
        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 2)));
        int is = 0;
        uint8_t sc, m;
        for (int j = 0; j < ele_per_blk; j += 64) {
            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d1 * (q[l] & 0xF) - m1);
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d2 * (q[l]  >> 4) - m2);
            q += 32; is += 2;
        }
    }
}

__global__ void dequantize_q4_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
        // const uint8_t * q = data[i].qs;
        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 2)));
        int is = 0;
        uint8_t sc, m;
        for (int j = 0; j < ele_per_blk; j += 64) {
            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d1 * (q[l] & 0xF) - m1);
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d2 * (q[l]  >> 4) - m2);
            q += 32; is += 2;
        }
    }
}

__global__ void dequantize_q5_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 2)));

        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);

        int is = 0;
        uint8_t sc, m;
        uint8_t u1 = 1, u2 = 2;
        uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);

        for (int j = 0; j < 256; j += 64) {
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1;
            for (int l = 0; l < 32; ++l) *output_blk++ = d2 * ((ql[l]  >> 4) + (qh[l] & u2 ? 16 : 0)) - m2;
            ql += 32; is += 2;
            u1 <<= 2; u2 <<= 2;
        }
    }
}

__global__ void dequantize_q5_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 2)));

        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);

        int is = 0;
        uint8_t sc, m;
        uint8_t u1 = 1, u2 = 2;
        uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);

        for (int j = 0; j < 256; j += 64) {
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1);
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d2 * ((ql[l]  >> 4) + (qh[l] & u2 ? 16 : 0)) - m2);
            ql += 32; is += 2;
            u1 <<= 2; u2 <<= 2;
        }
    }
}

__global__ void dequantize_q5_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 2)));

        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);

        int is = 0;
        uint8_t sc, m;
        uint8_t u1 = 1, u2 = 2;
        uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);

        for (int j = 0; j < 256; j += 64) {
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1);
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d2 * ((ql[l]  >> 4) + (qh[l] & u2 ? 16 : 0)) - m2);
            ql += 32; is += 2;
            u1 <<= 2; u2 <<= 2;
        }
    }
}

__global__ void dequantize_q6_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long  block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 208)));

        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
        const int8_t  * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);


        for (int n = 0; n < ele_per_blk; n += 128) {
            for (int l = 0; l < 32; ++l) {
                int is = l/16;
                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int8_t q3 = (int8_t)((ql[l +  0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int8_t q4 = (int8_t)((ql[l + 32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                output_blk[l +  0] = d * sc[is + 0] * q1;
                output_blk[l + 32] = d * sc[is + 2] * q2;
                output_blk[l + 64] = d * sc[is + 4] * q3;
                output_blk[l + 96] = d * sc[is + 6] * q4;
            }
            output_blk += 128;
            ql += 64;
            qh += 32;
            sc += 8;
        }
    }
}

__global__ void dequantize_q6_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long  block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 208)));

        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
        const int8_t  * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);


        for (int n = 0; n < ele_per_blk; n += 128) {
            for (int l = 0; l < 32; ++l) {
                int is = l/16;
                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int8_t q3 = (int8_t)((ql[l +  0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int8_t q4 = (int8_t)((ql[l + 32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                output_blk[l +  0] = __float2half(d * sc[is + 0] * q1);
                output_blk[l + 32] = __float2half(d * sc[is + 2] * q2);
                output_blk[l + 64] = __float2half(d * sc[is + 4] * q3);
                output_blk[l + 96] = __float2half(d * sc[is + 6] * q4);
            }
            output_blk += 128;
            ql += 64;
            qh += 32;
            sc += 8;
        }
    }
}

__global__ void dequantize_q6_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long  block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 208)));

        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
        const int8_t  * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);


        for (int n = 0; n < ele_per_blk; n += 128) {
            for (int l = 0; l < 32; ++l) {
                int is = l/16;
                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int8_t q3 = (int8_t)((ql[l +  0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int8_t q4 = (int8_t)((ql[l + 32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                output_blk[l +  0] = __float2bfloat16(d * sc[is + 0] * q1);
                output_blk[l + 32] = __float2bfloat16(d * sc[is + 2] * q2);
                output_blk[l + 64] = __float2bfloat16(d * sc[is + 4] * q3);
                output_blk[l + 96] = __float2bfloat16(d * sc[is + 6] * q4);
            }
            output_blk += 128;
            ql += 64;
            qh += 32;
            sc += 8;
        }
    }
}

static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

__global__ void dequantize_iq4_xs_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x) {
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size)));
        const uint16_t scales_h = *(reinterpret_cast<const uint16_t*>(data + block_id * blk_size + 2));
        const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
        const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);

        for (int ib = 0; ib < 8; ++ib) {
            const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
            const float dl = d * (ls - 32);
            for (int j = 0; j < 16; ++j) {
                output_blk[j + 0] = dl * kvalues_iq4nl[qs[j] & 0xf];
                output_blk[j + 16] = dl * kvalues_iq4nl[qs[j] >> 4];
            }
            output_blk += 32;
            qs += 16;
        }
    }
}

__global__ void dequantize_iq4_xs_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x) {
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size)));
        const uint16_t scales_h = *(reinterpret_cast<const uint16_t*>(data + block_id * blk_size + 2));
        const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
        const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);

        for (int ib = 0; ib < 8; ++ib) {
            const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
            const float dl = d * (ls - 32);
            for (int j = 0; j < 16; ++j) {
                output_blk[j + 0] = __float2half(dl * kvalues_iq4nl[qs[j] & 0xf]);
                output_blk[j + 16] = __float2half(dl * kvalues_iq4nl[qs[j] >> 4]);
            }
            output_blk += 32;
            qs += 16;
        }
    }
}

__global__ void dequantize_iq4_xs_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x) {
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size)));
        const uint16_t scales_h = *(reinterpret_cast<const uint16_t*>(data + block_id * blk_size + 2));
        const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
        const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);

        for (int ib = 0; ib < 8; ++ib) {
            const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
            const float dl = d * (ls - 32);
            for (int j = 0; j < 16; ++j) {
                output_blk[j + 0] = __float2bfloat16(dl * kvalues_iq4nl[qs[j] & 0xf]);
                output_blk[j + 16] = __float2bfloat16(dl * kvalues_iq4nl[qs[j] >> 4]);
            }
            output_blk += 32;
            qs += 16;
        }
    }
}

torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({ num_bytes }, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({ num_blocks, 32 }, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q8_0_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q8_0_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q8_0_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }

    cudaDeviceSynchronize();
    return output;
}


torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    // data.numel%blk_size should be 0, else raise err
    int num_blocks = num_bytes / blk_size;

    const at::cuda::OptionalCUDAGuard device_guard(device);
    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q6_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q6_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q6_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q5_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q5_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q5_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    // data.numel%blk_size should be 0, else raise err
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q4_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q4_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q4_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q3_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q3_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q3_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q2_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q2_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q2_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, 256}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_iq4_xs_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_iq4_xs_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_iq4_xs_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}


================================================
FILE: archive/csrc/ktransformers_ext/cuda/custom_gguf/ops.h
================================================
/**
 * @Description  :
 * @Author       : Azure-Tang
 * @Date         : 2024-07-22 09:27:55
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-12 03:48:46
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once

#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>

torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);

================================================
FILE: archive/csrc/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu
================================================
/*
 * Modified by Neural Magic
 * Copyright (C) Marlin.2024 Elias Frantar
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Adapted from https://github.com/IST-DASLab/marlin
 */
/*
 * Adapted from  https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
 */
#include "gptq_marlin.cuh"
#include "gptq_marlin_dtypes.cuh"
#include <c10/cuda/CUDAGuard.h>
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)               \
  static_assert(std::is_same<scalar_t, half>::value ||          \
                    std::is_same<scalar_t, nv_bfloat16>::value, \
                "only float16 and bfloat16 is supported");

template <typename T>
inline std::string str(T x) {
  return std::to_string(x);
}

namespace gptq_marlin {

#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800) || defined(__HIP_PLATFORM_AMD__)

__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
                                    int const* __restrict__ perm_int_ptr,
                                    int4* __restrict__ out_int4_ptr, int size_m,
                                    int size_k, int block_rows) {}

template <typename scalar_t,          // compute dtype, half or nv_float16
          const int num_bits,         // number of bits used for weights
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
                                      // threadblock
          const int thread_n_blocks,  // same for n dimension (output)
          const int thread_k_blocks,  // same for k dimension (reduction)
          const int stages,  // number of stages for the async global->shared
                             // fetch pipeline
          const bool has_act_order,    // whether act_order is enabled
          const int group_blocks = -1  // number of consecutive 16x16 blocks
                                       // with a separate quantization scale
          >
__global__ void Marlin(
    const int4* __restrict__ A,  // fp16 input matrix of shape mxk
    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
    int4* __restrict__ C,        // fp16 output buffer of shape mxn
    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
                                          // (k/groupsize)xn
    const int* __restrict__ g_idx,        // int32 group indices of shape k
    int num_groups,  // number of scale groups per output channel
    int prob_m,      // batch dimension m
    int prob_n,      // output dimension n
    int prob_k,      // reduction dimension k
    int* locks       // extra global storage for barrier synchronization
) {}

}  // namespace gptq_marlin

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               torch::Tensor& b_scales, torch::Tensor& g_idx,
                               torch::Tensor& perm, torch::Tensor& workspace,
                               int64_t num_bits, int64_t size_m, int64_t size_n,
                               int64_t size_k, bool is_k_full) {
  TORCH_CHECK_NOT_IMPLEMENTED(false,
                              "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
  return torch::empty({1, 1});
}

#else

// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
// output/accumulation.
template <typename scalar_t>
__device__ inline void mma(const typename ScalarType<scalar_t>::FragA& a_frag,
                           const typename ScalarType<scalar_t>::FragB& frag_b,
                           typename ScalarType<scalar_t>::FragC& frag_c) {
  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
  float* c = reinterpret_cast<float*>(&frag_c);
  if constexpr (std::is_same<scalar_t, half>::value) {
    asm volatile(
        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
          "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
  } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
    asm volatile(
        "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
          "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
  } else {
    STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
  }
}

// Instruction for loading a full 16x16 matrix fragment of operand A from shared
// memory, directly in tensor core layout.
template <typename scalar_t>
__device__ inline void ldsm4(typename ScalarType<scalar_t>::FragA& frag_a,
                             const void* smem_ptr) {
  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
               : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
               : "r"(smem));
}

// Lookup-table based 3-input logical operation; explicitly used for
// dequantization as the compiler does not seem to automatically recognize it in
// all cases.
template <int lut>
__device__ inline int lop3(int a, int b, int c) {
  int res;
  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
               : "=r"(res)
               : "r"(a), "r"(b), "r"(c), "n"(lut));
  return res;
}

// Constructs destination register by taking bytes from 2 sources (based on
// mask)
template <int start_byte, int mask>
__device__ inline uint32_t prmt(uint32_t a) {
  uint32_t res;
  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
               : "=r"(res)
               : "r"(a), "n"(start_byte), "n"(mask));
  return res;
}

// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
// values. We mostly follow the strategy in the link below, with some small
// changes:
// - FP16:
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
// - BF16:
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385
template <typename scalar_t>
__device__ inline typename ScalarType<scalar_t>::FragB dequant_4bit(int q) {
  STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
}

template <>
__device__ inline typename ScalarType<half>::FragB dequant_4bit<half>(int q) {
  const int LO = 0x000f000f;
  const int HI = 0x00f000f0;
  const int EX = 0x64006400;
  // Guarantee that the `(a & b) | c` operations are LOP3s.
  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
  // directly into `SUB` and `ADD`.
  const int SUB = 0x64086408;
  const int MUL = 0x2c002c00;
  const int ADD = 0xd480d480;
  typename ScalarType<half>::FragB frag_b;
  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
                      *reinterpret_cast<const half2*>(&SUB));
  frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
                      *reinterpret_cast<const half2*>(&MUL),
                      *reinterpret_cast<const half2*>(&ADD));
  return frag_b;
}

template <>
__device__ inline typename ScalarType<nv_bfloat16>::FragB
dequant_4bit<nv_bfloat16>(int q) {
  static constexpr uint32_t MASK = 0x000f000f;
  static constexpr uint32_t EX = 0x43004300;

  // Guarantee that the `(a & b) | c` operations are LOP3s.

  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
  q >>= 4;
  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);

  typename ScalarType<nv_bfloat16>::FragB frag_b;
  static constexpr uint32_t MUL = 0x3F803F80;
  static constexpr uint32_t ADD = 0xC308C308;

  frag_b[0] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&lo),
                      *reinterpret_cast<const nv_bfloat162*>(&MUL),
                      *reinterpret_cast<const nv_bfloat162*>(&ADD));
  frag_b[1] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&hi),
                      *reinterpret_cast<const nv_bfloat162*>(&MUL),
                      *reinterpret_cast<const nv_bfloat162*>(&ADD));
  return frag_b;
}

// Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or
// bf16 Reference:
// - FP16:
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
// - BF16:
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175
template <typename scalar_t>
__device__ inline typename ScalarType<scalar_t>::FragB dequant_8bit(int q) {
  STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
}

template <>
__device__ inline typename ScalarType<half>::FragB dequant_8bit<half>(int q) {
  static constexpr uint32_t mask_for_elt_01 = 0x5250;
  static constexpr uint32_t mask_for_elt_23 = 0x5351;
  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;

  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);

  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;

  typename ScalarType<half>::FragB frag_b;
  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
  frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
  return frag_b;
}

template <>
__device__ inline typename ScalarType<nv_bfloat16>::FragB
dequant_8bit<nv_bfloat16>(int q) {
  typename ScalarType<nv_bfloat16>::FragB frag_b;

  float fp32_intermediates[4];
  uint32_t* fp32_intermediates_casted =
      reinterpret_cast<uint32_t*>(fp32_intermediates);

  static constexpr uint32_t fp32_base = 0x4B000000;
  fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
  fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
  fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
  fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);

  fp32_intermediates[0] -= 8388736.f;
  fp32_intermediates[1] -= 8388736.f;
  fp32_intermediates[2] -= 8388736.f;
  fp32_intermediates[3] -= 8388736.f;

  uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(&frag_b);
  bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
                                   fp32_intermediates_casted[1], 0x7632);
  bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
                                   fp32_intermediates_casted[3], 0x7632);

  return frag_b;
}

// Multiply dequantized values by the corresponding quantization scale; used
// only for grouped quantization.
template <typename scalar_t>
__device__ inline void scale(typename ScalarType<scalar_t>::FragB& frag_b,
                             typename ScalarType<scalar_t>::FragS& frag_s,
                             int i) {
  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
  scalar_t2 s =
      ScalarType<scalar_t>::num2num2(reinterpret_cast<scalar_t*>(&frag_s)[i]);
  frag_b[0] = __hmul2(frag_b[0], s);
  frag_b[1] = __hmul2(frag_b[1], s);
}

// Same as above, but for act_order (each K is multiplied individually)
template <typename scalar_t>
__device__ inline void scale4(typename ScalarType<scalar_t>::FragB& frag_b,
                              typename ScalarType<scalar_t>::FragS& frag_s_1,
                              typename ScalarType<scalar_t>::FragS& frag_s_2,
                              typename ScalarType<scalar_t>::FragS& frag_s_3,
                              typename ScalarType<scalar_t>::FragS& frag_s_4,
                              int i) {
  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
  scalar_t2 s_val_1_2;
  s_val_1_2.x = reinterpret_cast<scalar_t*>(&frag_s_1)[i];
  s_val_1_2.y = reinterpret_cast<scalar_t*>(&frag_s_2)[i];

  scalar_t2 s_val_3_4;
  s_val_3_4.x = reinterpret_cast<scalar_t*>(&frag_s_3)[i];
  s_val_3_4.y = reinterpret_cast<scalar_t*>(&frag_s_4)[i];

  frag_b[0] = __hmul2(frag_b[0], s_val_1_2);
  frag_b[1] = __hmul2(frag_b[1], s_val_3_4);
}

// Given 2 floats multiply by 2 scales (halves)
template <typename scalar_t>
__device__ inline void scale_float(float* c,
                                   typename ScalarType<scalar_t>::FragS& s) {
  scalar_t* s_ptr = reinterpret_cast<scalar_t*>(&s);
  c[0] = __fmul_rn(c[0], ScalarType<scalar_t>::num2float(s_ptr[0]));
  c[1] = __fmul_rn(c[1], ScalarType<scalar_t>::num2float(s_ptr[1]));
}

// Wait until barrier reaches `count`, then lock for current threadblock.
__device__ inline void barrier_acquire(int* lock, int count) {
  if (threadIdx.x == 0) {
    int state = -1;
    do
      // Guarantee that subsequent writes by this threadblock will be visible
      // globally.
      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
                   : "=r"(state)
                   : "l"(lock));
    while (state != count);
  }
  __syncthreads();
}

// Release barrier and increment visitation count.
__device__ inline void barrier_release(int* lock, bool reset = false) {
  __syncthreads();
  if (threadIdx.x == 0) {
    if (reset) {
      lock[0] = 0;
      return;
    }
    int val = 1;
    // Make sure that all writes since acquiring this barrier are visible
    // globally, while releasing the barrier.
    asm volatile("fence.acq_rel.gpu;\n");
    asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
                 :
                 : "l"(lock), "r"(val));
  }
}

// For a given "a" of size [M,K] performs a permutation of the K columns based
// on the given "perm" indices.
__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
                                    int const* __restrict__ perm_int_ptr,
                                    int4* __restrict__ out_int4_ptr, int size_m,
                                    int size_k, int block_rows) {
  int start_row = block_rows * blockIdx.x;
  int finish_row = start_row + block_rows;
  if (finish_row > size_m) {
    finish_row = size_m;
  }
  int cur_block_rows = finish_row - start_row;

  int row_stride = size_k * sizeof(half) / 16;

  auto permute_row = [&](int row) {
    int iters = size_k / default_threads;
    int rest = size_k % default_threads;

    int offset = row * row_stride;

    half const* a_row_half = reinterpret_cast<half const*>(a_int4_ptr + offset);
    half* out_half = reinterpret_cast<half*>(out_int4_ptr + offset);

    int base_k = 0;

    for (int i = 0; i < iters; i++) {
      int cur_k = base_k + threadIdx.x;
      int src_pos = perm_int_ptr[cur_k];

      out_half[cur_k] = a_row_half[src_pos];

      base_k += default_threads;
    }

    if (rest) {
      if (threadIdx.x < rest) {
        int cur_k = base_k + threadIdx.x;
        int src_pos = perm_int_ptr[cur_k];

        out_half[cur_k] = a_row_half[src_pos];
      }
    }
  };

  for (int i = 0; i < cur_block_rows; i++) {
    int cur_row = start_row + i;
    if (cur_row < size_m) {
      permute_row(cur_row);
    }
  }
}

template <typename scalar_t,          // compute dtype, half or nv_float16
          const int num_bits,         // number of bits used for weights
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
                                      // threadblock
          const int thread_n_blocks,  // same for n dimension (output)
          const int thread_k_blocks,  // same for k dimension (reduction)
          const int stages,  // number of stages for the async global->shared
                             // fetch pipeline
          const bool has_act_order,    // whether act_order is enabled
          const int group_blocks = -1  // number of consecutive 16x16 blocks
                                       // with a separate quantization scale
          >
__global__ void Marlin(
    const int4* __restrict__ A,  // fp16 input matrix of shape mxk
    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
    int4* __restrict__ C,        // fp16 output buffer of shape mxn
    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
                                          // (k/groupsize)xn
    const int* __restrict__ g_idx,        // int32 group indices of shape k
    int num_groups,  // number of scale groups per output channel
    int prob_m,      // batch dimension m
    int prob_n,      // output dimension n
    int prob_k,      // reduction dimension k
    int* locks       // extra global storage for barrier synchronization
) {
  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
  // same size, which might involve multiple column "slices" (of width 16 *
  // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
  // example:
  //   0 1 3
  //   0 2 3
  //   1 2 4
  // While this kind of partitioning makes things somewhat more complicated, it
  // ensures good utilization of all SMs for many kinds of shape and GPU
  // configurations, while requiring as few slow global cross-threadblock
  // reductions as possible.
  using Dtype = ScalarType<scalar_t>;
  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
  using FragA = typename ScalarType<scalar_t>::FragA;
  using FragB = typename ScalarType<scalar_t>::FragB;
  using FragC = typename ScalarType<scalar_t>::FragC;
  using FragS = typename ScalarType<scalar_t>::FragS;

  constexpr int pack_factor = 32 / num_bits;

  // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
  // better partitioning with less reductions
  int parallel = 1;
  if (prob_m > 16 * thread_m_blocks) {
    parallel = prob_m / (16 * thread_m_blocks);
    prob_m = 16 * thread_m_blocks;
  }

  int k_tiles = prob_k / 16 / thread_k_blocks;
  int n_tiles = prob_n / 16 / thread_n_blocks;
  int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x);

  if constexpr (!has_act_order && group_blocks != -1) {
    if (group_blocks >= thread_k_blocks) {
      // Ensure that the number of tiles in each stripe is a multiple of the
      // groupsize; this avoids an annoying special case where a stripe starts
      // in the middle of group.
      iters = (group_blocks / thread_k_blocks) *
              div_ceil(iters, (group_blocks / thread_k_blocks));
    }
  }

  int slice_row = (iters * blockIdx.x) % k_tiles;
  int slice_col_par = (iters * blockIdx.x) / k_tiles;
  int slice_col = slice_col_par;
  int slice_iters;  // number of threadblock tiles in the current slice
  int slice_count =
      0;          // total number of active threadblocks in the current slice
  int slice_idx;  // index of threadblock in current slice; numbered bottom to
                  // top

  // We can easily implement parallel problem execution by just remapping
  // indices and advancing global pointers
  if (slice_col_par >= n_tiles) {
    A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
    C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
    locks += (slice_col_par / n_tiles) * n_tiles;
    slice_col = slice_col_par % n_tiles;
  }

  // Compute all information about the current slice which is required for
  // synchronization.
  auto init_slice = [&]() {
    slice_iters =
        iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
    if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0;
    if (slice_iters == 0) return;
    if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row;
    slice_count = 1;
    slice_idx = 0;
    int col_first = iters * div_ceil(k_tiles * slice_col_par, iters);
    if (col_first <= k_tiles * (slice_col_par + 1)) {
      int col_off = col_first - k_tiles * slice_col_par;
      slice_count = div_ceil(k_tiles - col_off, iters);
      if (col_off > 0) slice_count++;
      int delta_first = iters * blockIdx.x - col_first;
      if (delta_first < 0 || (col_off == 0 && delta_first == 0))
        slice_idx = slice_count - 1;
      else {
        slice_idx = slice_count - 1 - delta_first / iters;
        if (col_off > 0) slice_idx--;
      }
    }
    if (slice_col == n_tiles) {
      A += 16 * thread_m_blocks * prob_k / 8;
      C += 16 * thread_m_blocks * prob_n / 8;
      locks += n_tiles;
      slice_col = 0;
    }
  };
  init_slice();

  // A sizes/strides

  // stride of the A matrix in global memory
  int a_gl_stride = prob_k / 8;
  // stride of an A matrix tile in shared memory
  constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
  // delta between subsequent A tiles in global memory
  constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8;
  // between subsequent accesses within a tile
  int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
  // between shared memory writes
  constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
  // between shared memory tile reads
  constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4));
  // within a shared memory tile
  constexpr int a_sh_rd_delta_i = a_sh_stride * 16;
  // overall size of a tile
  constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks);
  // number of shared write iterations for a tile
  constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta);

  // B sizes/strides
  int b_gl_stride = 16 * prob_n / (pack_factor * 4);
  constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
  constexpr int b_thread_vecs = num_bits == 4 ? 1 : 2;
  constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;

  int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
  constexpr int b_sh_wr_delta = threads * b_thread_vecs;
  constexpr int b_sh_rd_delta = threads * b_thread_vecs;
  constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
  constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;

  // Scale sizes/strides without act_order
  int s_gl_stride = prob_n / 8;
  constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
  constexpr int s_tb_groups =
      !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks
          ? thread_k_blocks / group_blocks
          : 1;
  constexpr int s_sh_stage = s_tb_groups * s_sh_stride;
  int s_gl_rd_delta = s_gl_stride;

  // Scale size/strides with act_order
  constexpr int tb_k = 16 * thread_k_blocks;
  constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0;
  // constexpr int act_s_row_stride      = 1;
  // int           act_s_col_stride      = act_s_row_stride * num_groups;
  int act_s_col_stride = 1;
  int act_s_col_warp_stride = act_s_col_stride * 8;
  int tb_n_warps = thread_n_blocks / 4;
  int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;

  // Global A read index of current thread.
  int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
                (threadIdx.x % a_gl_rd_delta_o);
  a_gl_rd += a_gl_rd_delta_o * slice_row;
  // Shared write index of current thread.
  int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) +
                (threadIdx.x % a_gl_rd_delta_o);
  // Shared read index.
  int a_sh_rd =
      a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
  a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));

  int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
                (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
  b_gl_rd += b_sh_stride * slice_col;
  b_gl_rd += b_gl_rd_delta_o * slice_row;
  int b_sh_wr = threadIdx.x * b_thread_vecs;
  int b_sh_rd = threadIdx.x * b_thread_vecs;

  // For act_order
  constexpr int k_iter_size = tb_k / b_sh_wr_iters;
  int slice_k_start = tb_k * slice_row;
  int slice_k_finish = slice_k_start + tb_k * slice_iters;
  int slice_k_start_shared_fetch = slice_k_start;
  int slice_n_offset = act_s_col_tb_stride * slice_col;

  // No act_order
  int s_gl_rd;
  if constexpr (!has_act_order) {
    if constexpr (group_blocks == -1) {
      s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
    } else {
      s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
                s_sh_stride * slice_col + threadIdx.x;
    }
  }
  int s_sh_wr = threadIdx.x;
  bool s_sh_wr_pred = threadIdx.x < s_sh_stride;

  // We use a different scale layout for grouped and column-wise quantization as
  // we scale a `half2` tile in column-major layout in the former and in
  // row-major in the latter case.
  int s_sh_rd;
  if constexpr (group_blocks != -1)
    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
              (threadIdx.x % 32) / 4;
  else
    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
              (threadIdx.x % 32) % 4;

  // Precompute which thread should not read memory in which iterations; this is
  // needed if there are more threads than required for a certain tilesize or
  // when the batchsize is not a multiple of 16.
  bool a_sh_wr_pred[a_sh_wr_iters];
  #pragma unroll
  for (int i = 0; i < a_sh_wr_iters; i++)
    a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m;

  // To ensure that writing and reading A tiles to/from shared memory, the
  // latter in fragment format, is fully bank conflict free, we need to use a
  // rather fancy XOR-based layout. The key here is that neither reads nor
  // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
  // same shared memory banks. Further, it seems (based on NSight-Compute) that
  // each warp must also write a consecutive memory segment?
  auto transform_a = [&](int i) {
    int row = i / a_gl_rd_delta_o;
    return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row;
  };
  // Since the computation of this remapping is non-trivial and, due to our main
  // loop unrolls, all shared memory accesses are static, we simply precompute
  // both transformed reads and writes.
  int a_sh_wr_trans[a_sh_wr_iters];
  #pragma unroll
  for (int i = 0; i < a_sh_wr_iters; i++)
    a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr);
  int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks];
  #pragma unroll
  for (int i = 0; i < b_sh_wr_iters; i++) {
  #pragma unroll
    for (int j = 0; j < thread_m_blocks; j++)
      a_sh_rd_trans[i][j] =
          transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
  }

  // Since B-accesses have non-constant stride they have to be computed at
  // runtime; we break dependencies between subsequent accesses with a tile by
  // maintining multiple pointers (we have enough registers), a tiny
  // optimization.
  const int4* B_ptr[b_sh_wr_iters];
  #pragma unroll
  for (int i = 0; i < b_sh_wr_iters; i++)
    B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;

  extern __shared__ int4 sh[];
  // Shared memory storage for global fetch pipelines.
  int4* sh_a = sh;
  int4* sh_b = sh_a + (stages * a_sh_stage);
  int4* sh_g_idx = sh_b + (stages * b_sh_stage);
  int4* sh_s = sh_g_idx + (stages * g_idx_stage);

  // Register storage for double buffer of shared memory reads.
  FragA frag_a[2][thread_m_blocks];
  I4 frag_b_quant[2][b_thread_vecs];
  FragC frag_c[thread_m_blocks][4][2];
  FragS frag_s[2][4];         // No act-order
  FragS act_frag_s[2][4][4];  // For act-order

  // Zero accumulators.
  auto zero_accums = [&]() {
  #pragma unroll
    for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
      reinterpret_cast<float*>(frag_c)[i] = 0;
  };

  int sh_first_group_id = -1;
  int sh_num_groups = -1;
  constexpr int sh_max_num_groups = 32;

  auto fetch_scales_to_shared = [&](bool is_async, int first_group_id,
                                    int last_group_id) {
    sh_first_group_id = first_group_id;
    sh_num_groups = last_group_id - first_group_id + 1;

    if (sh_num_groups < sh_max_num_groups) {
      sh_num_groups = sh_max_num_groups;
    }

    if (sh_first_group_id + sh_num_groups > num_groups) {
      sh_num_groups = num_groups - sh_first_group_id;
    }

    int row_offset = first_group_id * s_gl_stride;

    if (is_async) {
      for (int i = 0; i < sh_num_groups; i++) {
        if (threadIdx.x < s_sh_stride) {
          cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x],
                         &scales_ptr[row_offset + (i * s_gl_stride) +
                                     slice_n_offset + threadIdx.x]);
        }
      }
    } else {
      for (int i = 0; i < sh_num_groups; i++) {
        if (threadIdx.x < s_sh_stride) {
          sh_s[(i * s_sh_stride) + threadIdx.x] =
              scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset +
                         threadIdx.x];
        }
      }
    }
  };
  // Asynchronously fetch the next A, B and s tile from global to the next
  // shared memory pipeline location.
  auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
    if (pred) {
      int4* sh_a_stage = sh_a + a_sh_stage * pipe;
  #pragma unroll
      for (int i = 0; i < a_sh_wr_iters; i++) {
        cp_async4_pred(
            &sh_a_stage[a_sh_wr_trans[i]],
            &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off],
            a_sh_wr_pred[i]);
      }
      int4* sh_b_stage = sh_b + b_sh_stage * pipe;
  #pragma unroll
      for (int i = 0; i < b_sh_wr_iters; i++) {
  #pragma unroll
        for (int j = 0; j < b_thread_vecs; j++) {
          cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
        }

        B_ptr[i] += b_gl_rd_delta_o;
      }

      if constexpr (has_act_order) {
        // Fetch g_idx thread-block portion
        int full_pipe = a_off;
        int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe;
        if (cur_k < prob_k && cur_k < slice_k_finish) {
          int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;

          int4 const* cur_g_idx_stage_ptr =
              reinterpret_cast<int4 const*>(&g_idx[cur_k]);

          if (threadIdx.x < g_idx_stage) {
            cp_async4_pred(&sh_g_idx_stage[threadIdx.x],
                           &cur_g_idx_stage_ptr[threadIdx.x]);
          }
        }
      } else {
        if constexpr (group_blocks != -1) {
          int4* sh_s_stage = sh_s + s_sh_stage * pipe;

          if constexpr (group_blocks >= thread_k_blocks) {
            // Only fetch scales if this tile starts a new group
            if (pipe % (group_blocks / thread_k_blocks) == 0) {
              if (s_sh_wr_pred) {
                cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
              }
              s_gl_rd += s_gl_rd_delta;
            }
          } else {
            for (int i = 0; i < s_tb_groups; i++) {
              if (s_sh_wr_pred) {
                cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr],
                          &scales_ptr[s_gl_rd]);
              }
              s_gl_rd += s_gl_rd_delta;
            }
          }
        }
      }
    }
    // Insert a fence even when we are winding down the pipeline to ensure that
    // waiting is also correct at this point.
    cp_async_fence();
  };

  // Wait until the next thread tile has been loaded to shared memory.
  auto wait_for_stage = [&]() {
    // We only have `stages - 2` active fetches since we are double buffering
    // and can only issue the next fetch when it is guaranteed that the previous
    // shared memory load is fully complete (as it may otherwise be
    // overwritten).
    cp_async_wait<stages - 2>();
    __syncthreads();
  };

  // Load the next sub-tile from the current location in the shared memory pipe
  // into the current register buffer.
  auto fetch_to_registers = [&](int k, int pipe) {
    int4* sh_a_stage = sh_a + a_sh_stage * pipe;
  #pragma unroll
    for (int i = 0; i < thread_m_blocks; i++)
      ldsm4<scalar_t>(frag_a[k % 2][i],
                      &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
    int4* sh_b_stage = sh_b + b_sh_stage * pipe;

  #pragma unroll
    for (int i = 0; i < b_thread_vecs; i++) {
      frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(
          &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
    }
  };

  bool is_same_group[stages];
  int same_group_id[stages];

  auto init_same_group = [&](int pipe) {
    if constexpr (!has_act_order) {
      is_same_group[pipe] = false;
      same_group_id[pipe] = 0;
      return;
    }

    int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
    int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);

    int group_id_1 = sh_g_idx_int_ptr[0];
    int group_id_2 = sh_g_idx_int_ptr[tb_k - 1];

    is_same_group[pipe] = group_id_1 == group_id_2;
    same_group_id[pipe] = group_id_1;
  };

  auto fetch_scales_to_registers = [&](int k, int full_pipe) {
    int pipe = full_pipe % stages;

    if constexpr (!has_act_order) {
      // No act-order case
      if constexpr (group_blocks != -1) {
        if constexpr (group_blocks >= thread_k_blocks) {
          int4* sh_s_stage =
              sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
                                   (pipe / (group_blocks / thread_k_blocks)));
          reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
        } else {
          int warp_id = threadIdx.x / 32;
          int n_warps = thread_n_blocks / 4;

          int warp_row = warp_id / n_warps;

          int cur_k = warp_row * 16;
          cur_k += k_iter_size * (k % b_sh_wr_iters);

          int k_blocks = cur_k / 16;
          int cur_group_id = k_blocks / group_blocks;

          int4* sh_s_stage = sh_s + s_sh_stage * pipe;

          reinterpret_cast<int4*>(&frag_s[k % 2])[0] =
              sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
        }
      }

      return;
    }

    // Act-order case

    // Determine K of the "current" thread-block
    int cur_k = slice_k_start + tb_k * full_pipe;
    if (cur_k >= prob_k || cur_k >= slice_k_finish) {
      return;
    }

    // Reset (to current thread-block) since we read g_idx portion from the
    // shared memory
    cur_k = 0;

    // Progress to current iteration
    cur_k += k_iter_size * (k % b_sh_wr_iters);

    // Determine "position" inside the thread-block (based on warp and
    // thread-id)
    int warp_id = threadIdx.x / 32;
    int n_warps =
        thread_n_blocks / 4;  // Each warp processes 4 16-size tiles over N

    int warp_row = warp_id / n_warps;
    int warp_col = warp_id % n_warps;

    cur_k += warp_row * 16;

    int th_id = threadIdx.x % 32;
    cur_k += (th_id % 4) * 2;  // Due to tensor-core layout for fp16 B matrix

    int s_col_shift =
        /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) +
        (th_id / 4) * act_s_col_stride;

    if (is_same_group[pipe]) {
      if (k % 2 == 0) {
        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
            sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride +
                 s_col_shift];
      } else {
        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
            *(reinterpret_cast<int4*>(&(act_frag_s[(k - 1) % 2][0][0])));
      }

      for (int i = 1; i < 4; i++) {
        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
            *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0])));
      }
      return;
    }

    int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
    int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);

    constexpr int k_frag_offsets[4] = {0, 1, 8,
                                       9};  // Tensor core offsets per thread

  #pragma unroll
    for (int i = 0; i < 4; i++) {
      int actual_k = cur_k + k_frag_offsets[i];

      int group_id = sh_g_idx_int_ptr[actual_k];
      int rel_group_id = group_id - sh_first_group_id;

      *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
          sh_s[rel_group_id * s_sh_stride + s_col_shift];
    }
  };

  // Execute the actual tensor core matmul of a sub-tile.
  auto matmul = [&](int k) {
  // We have the m dimension as the inner loop in order to encourage overlapping
  // dequantization and matmul operations.
  #pragma unroll
    for (int j = 0; j < 4; j++) {
      FragB frag_b0;
      FragB frag_b1;
      if constexpr (num_bits == 4) {
        int b_quant = frag_b_quant[k % 2][0][j];
        int b_quant_shift = b_quant >> 8;

        frag_b0 = dequant_4bit<scalar_t>(b_quant);
        frag_b1 = dequant_4bit<scalar_t>(b_quant_shift);

      } else {
        int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
        int b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
        int b_quant_1 = frag_b_quant_ptr[j * 2 + 1];

        frag_b0 = dequant_8bit<scalar_t>(b_quant_0);
        frag_b1 = dequant_8bit<scalar_t>(b_quant_1);
      }

      // Apply scale to frag_b0
      if constexpr (has_act_order) {
        scale4<scalar_t>(frag_b0, act_frag_s[k % 2][0][j],
                         act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j],
                         act_frag_s[k % 2][3][j], 0);
      } else {
        if constexpr (group_blocks != -1) {
          scale<scalar_t>(frag_b0, frag_s[k % 2][j], 0);
        }
      }

      // Apply scale to frag_b1
      if constexpr (has_act_order) {
        scale4<scalar_t>(frag_b1, act_frag_s[k % 2][0][j],
                         act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j],
                         act_frag_s[k % 2][3][j], 1);

      } else {
        if constexpr (group_blocks != -1) {
          scale<scalar_t>(frag_b1, frag_s[k % 2][j], 1);
        }
      }

  #pragma unroll
      for (int i = 0; i < thread_m_blocks; i++) {
        mma<scalar_t>(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]);
        mma<scalar_t>(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]);
      }
    }
  };

  // Since we slice across the k dimension of a tile in order to increase the
  // number of warps while keeping the n dimension of a tile reasonable, we have
  // multiple warps that accumulate their partial sums of the same output
  // location; which we have to reduce over in the end. We do in shared memory.
  auto thread_block_reduce = [&]() {
    constexpr int red_off = threads / b_sh_stride_threads / 2;
    if (red_off >= 1) {
      int red_idx = threadIdx.x / b_sh_stride_threads;
      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
      constexpr int red_sh_delta = b_sh_stride_threads;
      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
                      (threadIdx.x % b_sh_stride_threads);

      // Parallel logarithmic shared memory reduction. We make sure to avoid any
      // unnecessary read or write iterations, e.g., for two warps we write only
      // once by warp 1 and read only once by warp 0.

  #pragma unroll
      for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
  #pragma unroll
        for (int i = red_off; i > 0; i /= 2) {
          if (i <= red_idx && red_idx < 2 * i) {
  #pragma unroll
            for (int j = 0; j < 4 * 2; j++) {
              int red_sh_wr =
                  red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
              if (i < red_off) {
                float* c_rd =
                    reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
                float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
  #pragma unroll
                for (int k = 0; k < 4; k++)
                  reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
                      c_rd[k] + c_wr[k];
              }
              sh[red_sh_wr] =
                  reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
            }
          }
          __syncthreads();
        }
        if (red_idx == 0) {
  #pragma unroll
          for (int i = 0; i < 4 * 2; i++) {
            float* c_rd =
                reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
  #pragma unroll
            for (int j = 0; j < 4; j++)
              reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
                  c_rd[j];
          }
        }
        __syncthreads();
      }
    }
  };

  // Since multiple threadblocks may process parts of the same column slice, we
  // finally have to globally reduce over the results. As the striped
  // partitioning minimizes the number of such reductions and our outputs are
  // usually rather small, we perform this reduction serially in L2 cache.
  auto global_reduce = [&](bool first = false, bool last = false) {
    // We are very careful here to reduce directly in the output buffer to
    // maximize L2 cache utilization in this step. To do this, we write out
    // results in FP16 (but still reduce with FP32 compute).
    constexpr int active_threads = 32 * thread_n_blocks / 4;
    if (threadIdx.x < active_threads) {
      int c_gl_stride = prob_n / 8;
      int c_gl_wr_delta_o = 8 * c_gl_stride;
      int c_gl_wr_delta_i = 4 * (active_threads / 32);
      int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) +
                    4 * (threadIdx.x / 32) + threadIdx.x % 4;
      c_gl_wr += (2 * thread_n_blocks) * slice_col;
      constexpr int c_sh_wr_delta = active_threads;
      int c_sh_wr = threadIdx.x;

      int row = (threadIdx.x % 32) / 4;

      if (!first) {
  // Interestingly, doing direct global accesses here really seems to mess up
  // the compiler and lead to slowdowns, hence we also use async-copies even
  // though these fetches are not actually asynchronous.
  #pragma unroll
        for (int i = 0; i < thread_m_blocks * 4; i++) {
          cp_async4_pred(
              &sh[c_sh_wr + c_sh_wr_delta * i],
              &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
                 c_gl_wr_delta_i * (i % 2)],
              i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
        }
        cp_async_fence();
        cp_async_wait<0>();
      }

  #pragma unroll
      for (int i = 0; i < thread_m_blocks * 4; i++) {
        if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
          if (!first) {
            int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
  #pragma unroll
            for (int j = 0; j < 2 * 4; j++) {
              reinterpret_cast<float*>(
                  &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] +=
                  Dtype::num2float(reinterpret_cast<scalar_t*>(&c_red)[j]);
            }
          }
          if (!last) {
            int4 c;
  #pragma unroll
            for (int j = 0; j < 2 * 4; j++) {
              reinterpret_cast<scalar_t*>(&c)[j] =
                  Dtype::float2num(reinterpret_cast<float*>(
                      &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]);
            }
            C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] =
                c;
          }
        }
      }
    }
  };

  // Write out the reduce final result in the correct layout. We only actually
  // reshuffle matrix fragments in this step, the reduction above is performed
  // in fragment layout.
  auto write_result = [&]() {
    int c_gl_stride = prob_n / 8;
    constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
    constexpr int c_sh_rd_delta =
        c_sh_stride * (threads / (2 * thread_n_blocks));

    int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) +
                  (threadIdx.x % (2 * thread_n_blocks));
    c_gl_wr += (2 * thread_n_blocks) * slice_col;
    int c_sh_wr =
        (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4;
    c_sh_wr += 32 * (threadIdx.x / 32);
    int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) +
                  (threadIdx.x % (2 * thread_n_blocks));

    int c_gl_wr_end = c_gl_stride * prob_m;

    // We first reorder in shared memory to guarantee the most efficient final
    // global write patterns
    auto write = [&](int idx, float c0, float c1, FragS& s) {
      scalar_t2 res =
          Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1));

      // For per-column quantization we finally apply the scale here (only for
      // 4-bit)
      if constexpr (!has_act_order && group_blocks == -1 && num_bits == 4) {
        res = __hmul2(res, s[0]);
      }

      ((scalar_t2*)sh)[idx] = res;
    };

    if (threadIdx.x / 32 < thread_n_blocks / 4) {
  #pragma unroll
      for (int i = 0; i < thread_m_blocks; i++) {
  #pragma unroll
        for (int j = 0; j < 4; j++) {
          int wr = c_sh_wr + 8 * j;
          write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
                frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
          write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
                frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
          write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
                frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
          write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
                frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
        }
        c_sh_wr += 16 * (4 * c_sh_stride);
      }
    }
    __syncthreads();

  #pragma unroll
    for (int i = 0;
         i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
         i++) {
      if (c_gl_wr < c_gl_wr_end) {
        C[c_gl_wr] = sh[c_sh_rd];
        c_gl_wr += c_gl_wr_delta;
        c_sh_rd += c_sh_rd_delta;
      }
    }
  };

  // Start global fetch and register load pipelines.
  auto start_pipes = [&]() {

  #pragma unroll
    for (int i = 0; i < stages - 1; i++) {
      if (has_act_order && i == 0) {
        int last_g_idx = slice_k_start + stages * tb_k * 2;
        if (last_g_idx >= prob_k) {
          last_g_idx = prob_k - 1;
        }
        fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]);
      }
      fetch_to_shared(i, i, i < slice_iters);
    }

    zero_accums();
    wait_for_stage();
    init_same_group(0);
    fetch_to_registers(0, 0);
    fetch_scales_to_registers(0, 0);
    a_gl_rd += a_gl_rd_delta_o * (stages - 1);
    slice_k_start_shared_fetch += tb_k * (stages - 1);
  };
  if (slice_iters) {
    start_pipes();
  }

  // Main loop.
  while (slice_iters) {
    // We unroll over both the global fetch and the register load pipeline to
    // ensure all shared memory accesses are static. Note that both pipelines
    // have even length meaning that the next iteration will always start at
    // index 0.

  #pragma unroll
    for (int pipe = 0; pipe < stages;) {
  #pragma unroll
      for (int k = 0; k < b_sh_wr_iters; k++) {
        fetch_to_registers(k + 1, pipe % stages);
        fetch_scales_to_registers(k + 1, pipe);
        if (k == b_sh_wr_iters - 2) {
          fetch_to_shared((pipe + stages - 1) % stages, pipe,
                          slice_iters >= stages);
          pipe++;
          wait_for_stage();
          init_same_group(pipe % stages);
        }
        matmul(k);
      }
      slice_iters--;
      if (slice_iters == 0) {
        break;
      }
    }

    a_gl_rd += a_gl_rd_delta_o * stages;
    slice_k_start += tb_k * stages;
    slice_k_start_shared_fetch += tb_k * stages;

    if constexpr (has_act_order) {
      int first_group_id = g_idx[slice_k_start];
      int last_g_idx = slice_k_start + stages * tb_k * 2;
      if (last_g_idx >= prob_k) {
        last_g_idx = prob_k - 1;
      }
      int last_group_id = g_idx[last_g_idx];
      if (last_group_id >= sh_first_group_id + sh_num_groups) {
        fetch_scales_to_shared(false, first_group_id, last_group_id);
        __syncthreads();
      }
    }

    // Process results and, if necessary, proceed to the next column slice.
    // While this pattern may not be the most readable, other ways of writing
    // the loop seemed to noticeably worse performance after compilation.
    if (slice_iters == 0) {
      cp_async_wait<0>();
      bool last = slice_idx == slice_count - 1;
      // For per-column scales, we only fetch them here in the final step before
      // write-out
      if constexpr (!has_act_order && group_blocks == -1) {
        if constexpr (num_bits == 8) {
          if (s_sh_wr_pred) {
            cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
          }
          cp_async_fence();
        } else {
          if (last) {
            if (s_sh_wr_pred) {
              cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
            }
            cp_async_fence();
          }
        }
      }

      thread_block_reduce();
      if constexpr (!has_act_order && group_blocks == -1) {
        if constexpr (num_bits == 8) {
          cp_async_wait<0>();
          __syncthreads();
          if (threadIdx.x / 32 < thread_n_blocks / 4) {
            reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
            reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
          }

        } else {
          if (last) {
            cp_async_wait<0>();
            __syncthreads();
            if (threadIdx.x / 32 < thread_n_blocks / 4) {
              reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
              reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
            }
          }
        }
      }

      // For 8-bit channelwise, we apply the scale before the global reduction
      // that converts the fp32 results to fp16 (so that we avoid possible
      // overflow in fp16)
      if constexpr (!has_act_order && group_blocks == -1 && num_bits == 8) {
        if (threadIdx.x / 32 < thread_n_blocks / 4) {
  #pragma unroll
          for (int i = 0; i < thread_m_blocks; i++) {
  #pragma unroll
            for (int j = 0; j < 4; j++) {
              scale_float<scalar_t>(
                  reinterpret_cast<float*>(&frag_c[i][j][0][0]),
                  frag_s[j / 2][2 * (j % 2) + 0]);
              scale_float<scalar_t>(
                  reinterpret_cast<float*>(&frag_c[i][j][0][2]),
                  frag_s[j / 2][2 * (j % 2) + 0]);

              scale_float<scalar_t>(
                  reinterpret_cast<float*>(&frag_c[i][j][1][0]),
                  frag_s[j / 2][2 * (j % 2) + 1]);
              scale_float<scalar_t>(
                  reinterpret_cast<float*>(&frag_c[i][j][1][2]),
                  frag_s[j / 2][2 * (j % 2) + 1]);
            }
          }
        }
      }

      if (slice_count > 1) {  // only globally reduce if there is more than one
                              // block in a slice
        barrier_acquire(&locks[slice_col], slice_idx);
        global_reduce(slice_idx == 0, last);
        barrier_release(&locks[slice_col], last);
      }
      if (last)  // only the last block in a slice actually writes the result
        write_result();
      slice_row = 0;
      slice_col_par++;
      slice_col++;
      init_slice();
      if (slice_iters) {
        a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
                  (threadIdx.x % a_gl_rd_delta_o);
  #pragma unroll
        for (int i = 0; i < b_sh_wr_iters; i++)
          B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
        if (slice_col == 0) {
  #pragma unroll
          for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
        }

        // Update slice k/n for scales loading
        if constexpr (has_act_order) {
          slice_k_start = tb_k * slice_row;
          slice_k_finish = slice_k_start + tb_k * slice_iters;
          slice_k_start_shared_fetch = slice_k_start;
          slice_n_offset = act_s_col_tb_stride * slice_col;

        } else {
          s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
        }

        start_pipes();
      }
    }
  }
}

  #define __CALL_IF(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,                \
                    THREAD_K_BLOCKS, HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS) \
    else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS &&     \
             thread_n_blocks == THREAD_N_BLOCKS &&                             \
             thread_k_blocks == THREAD_K_BLOCKS &&                             \
             has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS && \
             num_threads == NUM_THREADS) {                                     \
      cudaFuncSetAttribute(                                                    \
          Marlin<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,             \
                 THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER, \
                 GROUP_BLOCKS>,                                                \
          cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);        \
      Marlin<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,                 \
             THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER,     \
             GROUP_BLOCKS><<<blocks, NUM_THREADS, max_shared_mem, stream>>>(   \
          A_ptr, B_ptr, C_ptr, s_ptr, g_idx_ptr, num_groups, prob_m, prob_n,   \
          prob_k, locks);                                                      \
    }

typedef struct {
  int thread_k;
  int thread_n;
  int num_threads;
} thread_config_t;

typedef struct {
  int max_m_blocks;
  thread_config_t tb_cfg;
} exec_config_t;

thread_config_t small_batch_thread_configs[] = {
    // Ordered by priority

    // thread_k, thread_n, num_threads
    {128, 128, 256},
    {64, 128, 128},
    {128, 64, 128},
};

thread_config_t large_batch_thread_configs[] = {
    // Ordered by priority

    // thread_k, thread_n, num_threads
    {64, 256, 256},
    {64, 128, 128},
    {128, 64, 128},

};

int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
                          int prob_n, int prob_k, int num_bits, int group_size,
                          bool has_act_order, bool is_k_full) {
  bool cache_scales_chunk = has_act_order && !is_k_full;

  int tb_n = th_config.thread_n;
  int tb_k = th_config.thread_k;

  // Get max scale groups per thread-block
  int tb_groups;
  if (group_size == -1) {
    tb_groups = 1;
  } else if (group_size == 0) {
    tb_groups = div_ceil(tb_k, 32);  // Worst case is 32 group size
  } else {
    tb_groups = div_ceil(tb_k, group_size);
  }

  if (cache_scales_chunk) {
    int load_groups =
        tb_groups * pipe_stages * 2;     // Chunk size is 2x pipeline over dim K
    load_groups = max(load_groups, 32);  // We load at least 32 scale groups
    return load_groups * tb_n * 2;

  } else {
    int tb_scales = tb_groups * tb_n * 2;

    return tb_scales * pipe_stages;
  }
}

bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
                         int prob_m, int prob_n, int prob_k, int num_bits,
                         int scales_cache_size, int max_shared_mem) {
  int pack_factor = 32 / num_bits;

  // Get B size
  int tb_k = th_config.thread_k;
  int tb_n = th_config.thread_n;

  int b_size = (tb_k * tb_n / pack_factor) * 4;

  // Get A size
  int m_blocks = div_ceil(prob_m, 16);
  int tb_max_m = 16;

  while (true) {
    if (m_blocks >= max_m_blocks) {
      tb_max_m *= max_m_blocks;
      break;
    }

    max_m_blocks--;
    if (max_m_blocks == 0) {
      TORCH_CHECK(false, "Unexpected m_blocks = ", m_blocks);
    }
  }

  int a_size = (tb_max_m * tb_k) * 2;

  float pipe_size = (a_size + b_size) * pipe_stages;

  TORCH_CHECK(max_shared_mem / 2 > scales_cache_size);  // Sanity

  return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
}

bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,
                     int prob_m, int prob_n, int prob_k, int num_bits,
                     int group_size, bool has_act_order, bool is_k_full,
                     int max_shared_mem) {
  // Sanity
  if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
      th_config.num_threads == -1) {
    return false;
  }

  // Verify K/N are divisible by thread K/N
  if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
    return false;
  }

  // Verify min for thread K/N
  if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
    return false;
  }

  // num_threads must be at least 128 (= 4 warps)
  if (th_config.num_threads < 128) {
    return false;
  }

  //  Determine cache for scales
  int scales_cache_size =
      get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
                            group_size, has_act_order, is_k_full);

  // Check that pipeline fits into cache
  if (!is_valid_cache_size(th_config, max_m_blocks, prob_m, prob_n, prob_k,
                           num_bits, scales_cache_size, max_shared_mem)) {
    return false;
  }

  return true;
}

exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
                                      int num_bits, int group_size,
                                      bool has_act_order, bool is_k_full,
                                      int max_shared_mem) {
  int max_m_blocks = 4;
  while (max_m_blocks > 0) {
    if (prob_m <= 16) {
      for (auto th_config : small_batch_thread_configs) {
        if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k,
                            num_bits, group_size, has_act_order, is_k_full,
                            max_shared_mem)) {
          return exec_config_t{max_m_blocks, th_config};
        }
      }
    } else {
      for (auto th_config : large_batch_thread_configs) {
        if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k,
                            num_bits, group_size, has_act_order, is_k_full,
                            max_shared_mem)) {
          return exec_config_t{max_m_blocks, th_config};
        }
      }
    }

    max_m_blocks--;  // Process less M blocks per invocation to reduce cache
                     // usage
  }

  return exec_config_t{0, {-1, -1, -1}};
}

  #define CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS)           \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
                                                                       \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
                                                                       \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
                                                                       \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
                                                                       \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)

template <typename scalar_t>
void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
                     void* g_idx, void* perm, void* a_tmp, int prob_m,
                     int prob_n, int prob_k, void* workspace, int num_bits,
                     bool has_act_order, bool is_k_full, int num_groups,
                     int group_size, int dev, cudaStream_t stream, int thread_k,
                     int thread_n, int sms, int max_par) {
  TORCH_CHECK(num_bits == 4 || num_bits == 8,
              "num_bits must be 4 or 8. Got = ", num_bits);
  TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
              ", ", prob_n, ", ", prob_k, "]");

  int tot_m = prob_m;
  int tot_m_blocks = div_ceil(tot_m, 16);
  int pad = 16 * tot_m_blocks - tot_m;

  if (sms == -1) {
    cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
  }

  int max_shared_mem = 0;
  cudaDeviceGetAttribute(&max_shared_mem,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
  TORCH_CHECK(max_shared_mem > 0);

  // Set thread config
  exec_config_t exec_cfg;
  if (thread_k != -1 && thread_n != -1) {
    // User-defined config
    exec_cfg =
        exec_config_t{4, thread_config_t{thread_k, thread_n, default_threads}};
  } else {
    // Auto config
    exec_cfg =
        determine_thread_config(prob_m, prob_n, prob_k, num_bits, group_size,
                                has_act_order, is_k_full, max_shared_mem);
  }

  TORCH_CHECK(exec_cfg.max_m_blocks > 0 &&
                  is_valid_config(exec_cfg.tb_cfg, exec_cfg.max_m_blocks,
                                  prob_m, prob_n, prob_k, num_bits, group_size,
                                  has_act_order, is_k_full, max_shared_mem),
              "Invalid thread config: max_m_blocks = ", exec_cfg.max_m_blocks,
              ", thread_k = ", exec_cfg.tb_cfg.thread_k,
              ", thread_n = ", exec_cfg.tb_cfg.thread_n,
              ", num_threads = ", exec_cfg.tb_cfg.num_threads, " for MKN = [",
              prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
              ", group_size = ", group_size,
              ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
              ", max_shared_mem = ", max_shared_mem);

  int num_threads = exec_cfg.tb_cfg.num_threads;
  thread_k = exec_cfg.tb_cfg.thread_k;
  thread_n = exec_cfg.tb_cfg.thread_n;

  int thread_k_blocks = thread_k / 16;
  int thread_n_blocks = thread_n / 16;

  int blocks = sms;

  TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
              " is not divisible by thread_n = ", thread_n);
  TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
              " is not divisible by thread_k = ", thread_k);

  int group_blocks = 0;
  if (has_act_order) {
    if (is_k_full) {
      TORCH_CHECK(group_size != -1);
      group_blocks = group_size / 16;
      TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
                  " is not divisible by group_blocks = ", group_blocks);
    } else {
      TORCH_CHECK(group_size == 0);
      group_blocks = 0;
    }

  } else {
    if (group_size == -1) {
      group_blocks = -1;
    } else {
      group_blocks = group_size / 16;
      TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
                  " is not divisible by group_blocks = ", group_blocks);
    }
  }

  const int4* A_ptr = (const int4*)A;
  const int4* B_ptr = (const int4*)B;
  int4* C_ptr = (int4*)C;
  const int4* s_ptr = (const int4*)s;
  const int* g_idx_ptr = (const int*)g_idx;
  const int* perm_ptr = (const int*)perm;
  int4* a_tmp_ptr = (int4*)a_tmp;

  int* locks = (int*)workspace;

  if (has_act_order) {
    // Permute A columns
    int block_rows = div_ceil(prob_m, blocks);
    permute_cols_kernel<<<blocks, default_threads, 0, stream>>>(
        A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, block_rows);
    A_ptr = a_tmp_ptr;
  }

  // If we have a full K, then we can run the non-act-order version of Marlin
  // (since the weight rows are reordered by increasing group ids, and by having
  // a full K, we have full original groups)
  if (is_k_full) {
    has_act_order = false;
  }

  // Main loop
  for (int i = 0; i < tot_m_blocks; i += exec_cfg.max_m_blocks) {
    int thread_m_blocks = tot_m_blocks - i;
    prob_m = tot_m - 16 * i;
    int par = 1;
    if (thread_m_blocks > exec_cfg.max_m_blocks) {
      // Note that parallel > 1 currently only works for inputs without any
      // padding
      par = (16 * thread_m_blocks - pad) / (16 * exec_cfg.max_m_blocks);
      if (par > max_par) par = max_par;
      prob_m = (16 * exec_cfg.max_m_blocks) * par;
      i += exec_cfg.max_m_blocks * (par - 1);
      thread_m_blocks = exec_cfg.max_m_blocks;
    }


    // Define kernel configurations
#define undefined_error TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + \
    str(prob_n) + ", " + str(prob_k) + "]" + \
        ", has_act_order = " + str(has_act_order) + \
        ", num_groups = " + str(num_groups) + \
        ", group_size = " + str(group_size) + \
        ", thread_m_blocks = " + str(thread_m_blocks) + \
        ", thread_n_blocks = " + str(thread_n_blocks) + \
        ", thread_k_blocks = " + str(thread_k_blocks));


    if (num_bits == 4 && num_threads == 256)
    {
        if (false) {
        }
        CALL_IF(4, 32, 2, 256)
        CALL_IF(4, 16, 4, 256)
        CALL_IF(4, 8, 8, 256)
        else {
            undefined_error
        }
    }
    else if (num_bits == 4 && num_threads == 128)
    {
        if (false) {
        }
        CALL_IF(4, 8, 4, 128)
        CALL_IF(4, 4, 8, 128)
        else {
            undefined_error
        }
    }
    else if (num_bits == 8 && num_threads == 256)
    {
        if (false) {
        }
        CALL_IF(8, 32, 2, 256)
        CALL_IF(8, 16, 4, 256)
        CALL_IF(8, 8, 8, 256)
        else {
            undefined_error
        }
    }
    else if (num_bits == 8 && num_threads == 128)
    {
        if (false) {
        }
        CALL_IF(8, 8, 4, 128)
        CALL_IF(8, 4, 8, 128)
        else {
            undefined_error
        }
    }
    else {
        undefined_error
    }

    A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
    C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
  }
}

}  // namespace gptq_marlin

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               torch::Tensor& b_scales, torch::Tensor& g_idx,
                               torch::Tensor& perm, torch::Tensor& workspace,
                               int64_t num_bits, int64_t size_m, int64_t size_n,
                               int64_t size_k, bool is_k_full) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
  // Verify num_bits
  TORCH_CHECK(num_bits == 4 || num_bits == 8,
              "num_bits must be 4 or 8. Got = ", num_bits);
  int pack_factor = 32 / num_bits;

  // Verify A
  TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
              ", size_m = ", size_m);
  TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1),
              ", size_k = ", size_k);

  // Verify B
  TORCH_CHECK(size_k % gptq_marlin::tile_size == 0, "size_k = ", size_k,
              " is not divisible by tile_size = ", gptq_marlin::tile_size);
  TORCH_CHECK((size_k / gptq_marlin::tile_size) == b_q_weight.size(0),
              "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
              ", size_k = ", size_k, ", tile_size = ", gptq_marlin::tile_size);
  TORCH_CHECK(b_q_weight.size(1) % gptq_marlin::tile_size == 0,
              "b_q_weight.size(1) = ", b_q_weight.size(1),
              " is not divisible by tile_size = ", gptq_marlin::tile_size);
  int actual_size_n =
      (b_q_weight.size(1) / gptq_marlin::tile_size) * pack_factor;
  TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
              ", actual_size_n = ", actual_size_n);

  // Verify device and strides
  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
  TORCH_CHECK(a.is_contiguous(), "A is not contiguous");

  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");

  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");

  TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
  TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");

  TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
  TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");

  // Alloc buffers
  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
  torch::Tensor c = torch::empty({size_m, size_n}, options);
  torch::Tensor a_tmp = torch::empty({size_m, size_k}, options);

  // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
  // auto -1)
  int thread_k = -1;
  // thread_n: `n` size of a thread_tile in `weights` (can usually be left as
  // auto -1)
  int thread_n = -1;
  // sms: number of SMs to use for the kernel (can usually be left as auto -1)
  int sms = -1;

  // Verify g_idx and perm
  TORCH_CHECK((g_idx.size(0) == 0 && perm.size(0) == 0) ||
                  (g_idx.size(0) == size_k && perm.size(0) == size_k),
              "Unexpected g_idx.size(0) = ", g_idx.size(0),
              " and perm.size(0) = ", perm.size(0),
              ", where size_k = ", size_k);

  // Detect groupsize and act_order
  int num_groups = -1;
  int group_size = -1;
  bool has_act_order = g_idx.size(0) != 0;

  int b_rank = b_scales.sizes().size();
  TORCH_CHECK(b_rank == 2, "b_scales rank = ", b_rank, " is not 2");
  TORCH_CHECK(b_scales.size(1) == size_n, "b_scales dim 1 = ", b_scales.size(1),
              " is not size_n = ", size_n);
  num_groups = b_scales.size(0);

  if (has_act_order) {
    if (is_k_full) {
      TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
      TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k,
                  ", is not divisible by num_groups = ", num_groups);
      group_size = size_k / num_groups;
    } else {
      group_size = 0;
    }

  } else {
    if (num_groups > 1) {
      TORCH_CHECK(
          size_k % num_groups == 0, "size_k = ", size_k,
          ", is not divisible by b_scales.size(0) = ", b_scales.size(0));
      group_size = size_k / num_groups;
    } else {
      group_size = -1;
    }
  }

  // Verify workspace size
  TORCH_CHECK(
      size_n % gptq_marlin::min_thread_n == 0, "size_n = ", size_n,
      ", is not divisible by min_thread_n = ", gptq_marlin::min_thread_n);
  int min_workspace_size =
      (size_n / gptq_marlin::min_thread_n) * gptq_marlin::max_par;
  TORCH_CHECK(workspace.numel() >= min_workspace_size,
              "workspace.numel = ", workspace.numel(),
              " is below min_workspace_size = ", min_workspace_size);

  int dev = a.get_device();
  if (a.scalar_type() == at::ScalarType::Half) {
    gptq_marlin::marlin_mm_f16i4<half>(
        a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
        b_scales.data_ptr<at::Half>(), g_idx.data_ptr(), perm.data_ptr(),
        a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k,
        workspace.data_ptr(), num_bits, has_act_order, is_k_full, num_groups,
        group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
        thread_n, sms, gptq_marlin::max_par);
  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
    gptq_marlin::marlin_mm_f16i4<nv_bfloat16>(
        a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
        c.data_ptr<at::BFloat16>(), b_scales.data_ptr<at::BFloat16>(),
        g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
        size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order,
        is_k_full, num_groups, group_size, dev,
        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
        gptq_marlin::max_par);
  } else {
    TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16");
  }

  return c;
}

#endif


================================================
FILE: archive/csrc/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cuh
================================================
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyrigth 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#pragma once

#include <torch/all.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>

namespace gptq_marlin {

// 8 warps are a good choice since every SM has 4 schedulers and having more
// than 1 warp per schedule allows some more latency hiding. At the same time,
// we want relatively few warps to have many registers per warp and small tiles.
static constexpr int default_threads = 256;

static constexpr int pipe_stages =
    4;  // 4 pipeline stages fit into shared memory

static constexpr int min_thread_n = 64;
static constexpr int min_thread_k = 64;

static constexpr int tile_size = 16;
static constexpr int max_par = 16;

template <typename T, int n>
struct Vec {
  T elems[n];
  __device__ T& operator[](int i) { return elems[i]; }
};

using I4 = Vec<int, 4>;

constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }

#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800) || defined (__HIP_PLATFORM_AMD__)
// No support for async
#else

__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
                                      bool pred = true) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile(
      "{\n"
      "   .reg .pred p;\n"
      "   setp.ne.b32 p, %0, 0;\n"
      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
      "}\n" ::"r"((int)pred),
      "r"(smem), "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile(
      "{\n"
      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
      "}\n" ::"r"(smem),
      "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async_fence() {
  asm volatile("cp.async.commit_group;\n" ::);
}

template <int n>
__device__ inline void cp_async_wait() {
  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}

#endif

}  // namespace gptq_marlin


================================================
FILE: archive/csrc/ktransformers_ext/cuda/gptq_marlin/gptq_marlin_dtypes.cuh
================================================
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyrigth 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#ifndef _data_types_cuh
#define _data_types_cuh
#include "gptq_marlin.cuh"
#include <cuda_fp16.h>
#include <cuda_bf16.h>

#ifdef __HIP_PLATFORM_AMD__
typedef __hip_bfloat16 nv_bfloat16;
typedef __hip_bfloat162 nv_bfloat162;
#endif

namespace gptq_marlin {

template <typename scalar_t>
class ScalarType {};

template <>
class ScalarType<half> {
 public:
  using scalar_t = half;
  using scalar_t2 = half2;

  // Matrix fragments for tensor core instructions; their precise layout is
  // documented here:
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
  using FragA = Vec<half2, 4>;
  using FragB = Vec<half2, 2>;
  using FragC = Vec<float, 4>;
  using FragS = Vec<half2, 1>;

  static __device__ float inline num2float(const half x) {
    return __half2float(x);
  }

  static __device__ half2 inline num2num2(const half x) {
    return __half2half2(x);
  }

  static __device__ half2 inline nums2num2(const half x1, const half x2) {
    return __halves2half2(x1, x2);
  }

  static __host__ __device__ half inline float2num(const float x) {
    return __float2half(x);
  }
};

template <>
class ScalarType<nv_bfloat16> {
 public:
  using scalar_t = nv_bfloat16;
  using scalar_t2 = nv_bfloat162;

  using FragA = Vec<nv_bfloat162, 4>;
  using FragB = Vec<nv_bfloat162, 2>;
  using FragC = Vec<float, 4>;
  using FragS = Vec<nv_bfloat162, 1>;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
  static __device__ float inline num2float(const nv_bfloat16 x) {
    return __bfloat162float(x);
  }

  static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
    return __bfloat162bfloat162(x);
  }

  static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
                                                  const nv_bfloat16 x2) {
    return __halves2bfloat162(x1, x2);
  }

  static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
    return __float2bfloat16(x);
  }
#endif
};

}  // namespace gptq_marlin

#endif


================================================
FILE: archive/csrc/ktransformers_ext/cuda/gptq_marlin/ops.h
================================================
/**
 * @Description  :  
 * @Author       : Azure
 * @Date         : 2024-07-22 09:27:55
 * @Version      : 1.0.0
 * @LastEditors  : Azure 
 * @LastEditTime : 2024-07-26 08:35:00
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
**/
#pragma once

#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               torch::Tensor& b_scales, torch::Tensor& g_idx,
                               torch::Tensor& perm, torch::Tensor& workspace,
                               int64_t num_bits, int64_t size_m, int64_t size_n,
                               int64_t size_k, bool is_k_full);

// torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
//                                  int64_t size_k, int64_t size_n,
//                                  int64_t num_bits);

================================================
FILE: archive/csrc/ktransformers_ext/cuda/setup.py
================================================

from setuptools import setup, Extension
from torch.utils import cpp_extension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
    name='KTransformersOps',
    ext_modules=[
        CUDAExtension(
            'KTransformersOps', [
                'custom_gguf/dequant.cu',
                'binding.cpp',
                'gptq_marlin/gptq_marlin.cu',
                # 'gptq_marlin_repack.cu',
            ],
            extra_compile_args={
                'cxx': ['-O3'],
                'nvcc': [
                    '-O3',
                    '--use_fast_math',
                    '-Xcompiler', '-fPIC',
                ]
            },
        )
    ],
    cmdclass={'build_ext': BuildExtension}
)

================================================
FILE: archive/csrc/ktransformers_ext/cuda/test_dequant.py
================================================
import os
import sys
sys.path.insert(0,"/home/zbx/ktransformers")
from ktransformers.util.custom_loader import GGUFLoader
import torch

gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
gguf_loader_2 = GGUFLoader("/mnt/data/chenht/model/gguf_for_ktransformers/DeepSeek-V3-bf16/")

torch.set_default_dtype(torch.bfloat16)

tensor_1 = gguf_loader_1.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda")
tensor_2 = gguf_loader_2.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda")

print(tensor_1[0, -64:])
print(tensor_2[0, -64:])

================================================
FILE: archive/csrc/ktransformers_ext/examples/test_attention.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :  
Author       : Jianwei Dong
Date         : 2024-08-28 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
from flash_attn import flash_attn_with_kvcache
import torch

layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
cache_seqlen = 8192
cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")
anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 2
max_batch_size: int = 1
max_block_num: int = 512
CPUInfer = cpuinfer_ext.CPUInfer(max_thread_num)
validation_iter = 100

with torch.inference_mode(mode=True):
    config = cpuinfer_ext.kvcache.KVCacheConfig(
        layer_num,
        kv_head_num,
        q_head_num,
        head_dim,
        block_len,
        anchor_num,
        anchor_type,
        kv_type,
        retrieval_type,
        layer_step,
        token_step,
        layer_offset,
        max_block_num,
        max_batch_size,
        max_thread_num,
    )
    local_kvcache = cpuinfer_ext.kvcache.KVCache(config)

    kvcaches = []
    block_table = (
        torch.arange(max_block_num, dtype=torch.int32, device="cpu")
        .contiguous()
        .view(1, -1)
    )

    for layer_idx in range(layer_num):
        k_cache = torch.randn(
            (1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
        ).contiguous()
        v_cache = torch.randn(
            (1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
        ).contiguous()

        CPUInfer.submit(
            local_kvcache.update_kvcache_fp16(
                k_cache.data_ptr(),
                v_cache.data_ptr(),
                layer_idx,
                block_table.data_ptr(),
                1,
                max_block_num,
                seqlens_zero.data_ptr(),
                cache_seqlen,
            )
        )
        CPUInfer.sync()

        kvcaches.append((k_cache.to("cuda"), v_cache.to("cuda")))

    # validation
    for i in range(validation_iter):

        k_cache = kvcaches[i % layer_num][0]
        v_cache = kvcaches[i % layer_num][1]
        input = torch.randn(
            (1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
        ).contiguous()
        output = torch.empty(
            (1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
        ).contiguous()

        # attn_lse: (bsz, q_len, q_head_num)
        attn_lse = torch.empty(
            (1, 1, q_head_num), dtype=torch.float32, device="cpu"
        ).contiguous()
        input = input / 100

        CPUInfer.submit(
            local_kvcache.attn(
                input.data_ptr(),
                output.data_ptr(),
                attn_lse.data_ptr(),
                i % layer_num,
                0,
                1,
                1,
                max_block_num,
                block_table.data_ptr(),
                cache_seqlens.data_ptr(),
                -1,
                -1,
                -1,
            )
        )
        CPUInfer.sync()
        # print("cpuinfer output", output)

        t_output = flash_attn_with_kvcache(
            q=input.to("cuda"),
            k_cache=k_cache,
            v_cache=v_cache,
            cache_seqlens=cache_seqlens.to("cuda"),
        )
        # print("torch output", t_output)

        diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(
            torch.abs(t_output)
        )
        print("diff = ", diff)
        assert diff < 0.001


================================================
FILE: archive/csrc/ktransformers_ext/examples/test_linear.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:36:59
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

input_size = 16384
output_size = 5120
stride = 32
group_max_len = 1024
proj_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
qlen = 30
layer_num = 10
CPUInfer = cpuinfer_ext.CPUInfer(48)
validation_iter = 100

with torch.inference_mode(mode=True):
    linears = []
    projs = []
    for _ in range(layer_num):
        proj = torch.randn((output_size, input_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
        linear = cpuinfer_ext.linear.Linear(config)
        projs.append(proj)
        linears.append(linear)

    # validation
    for i in range(validation_iter):
        linear = linears[i % layer_num]
        input = torch.randn((qlen, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((qlen, output_size), dtype=torch.float16).contiguous()
        input = input / 100

        CPUInfer.submit(
            linear.forward(
                qlen,
                input.data_ptr(),
                output.data_ptr()
            )
        )
        CPUInfer.sync()
        # print('cpuinfer output', output)

        proj = projs[i%layer_num]
        t_output = torch.mm(input, proj.t())
        # print('torch output', t_output)

        diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
        print('diff = ', diff)
        assert(diff < 0.001)


================================================
FILE: archive/csrc/ktransformers_ext/examples/test_mlp.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:37:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

hidden_size = 5120
intermediate_size = 3072
stride = 32
group_max_len = 1024
gate_type = 1 # ggml_type::GGML_TYPE_F16
up_type = 1 # ggml_type::GGML_TYPE_F16
down_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
qlen = 30
layer_num = 10
CPUInfer = cpuinfer_ext.CPUInfer(48)
validation_iter = 100

def act_fn(x):
    return x / (1.0 + torch.exp(-x))

def mlp_torch(input, gate_proj, up_proj, down_proj):
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret

with torch.inference_mode(mode=True):
    mlps = []
    gate_projs = []
    up_projs = []
    down_projs = []
    for _ in range(layer_num):
        gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
        mlp = cpuinfer_ext.mlp.MLP(config)
        gate_projs.append(gate_proj)
        up_projs.append(up_proj)
        down_projs.append(down_proj)
        mlps.append(mlp)

    # validation
    for i in range(validation_iter):
        mlp = mlps[i % layer_num]
        input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
        output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
        input = input / 100

        CPUInfer.submit(
            mlp.forward(
                qlen,
                input.data_ptr(), 
                output.data_ptr()
            )
        )
        CPUInfer.sync()
        # print('cpuinfer output', output)

        gate_proj = gate_projs[i%layer_num]
        up_proj = up_projs[i%layer_num]
        down_proj = down_projs[i%layer_num]
        t_output = mlp_torch(input, gate_proj, up_proj, down_proj)
        # print('torch output', t_output)

        diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
        print('diff = ', diff)
        assert(diff < 0.001)


================================================
FILE: archive/csrc/ktransformers_ext/examples/test_moe.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:38:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

expert_num = 160
hidden_size = 5120
intermediate_size = 1536
stride = 32
group_min_len = 10
group_max_len = 1024
gate_type = 1 # ggml_type::GGML_TYPE_F16
up_type = 1 # ggml_type::GGML_TYPE_F16
down_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
n_routed_experts = 6
qlen = 30
layer_num = 10
CPUInfer = cpuinfer_ext.CPUInfer(48)
validation_iter = 100

def act_fn(x):
    return x / (1.0 + torch.exp(-x))

def mlp_torch(input, gate_proj, up_proj, down_proj):
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret

def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0
    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )
    return t_output

with torch.inference_mode(mode=True):
    moes = []
    gate_projs = []
    up_projs = []
    down_projs = []
    for _ in range(layer_num):
        gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        config = cpuinfer_ext.moe.MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, stride, group_min_len, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
        moe = cpuinfer_ext.moe.MOE(config)
        gate_projs.append(gate_proj)
        up_projs.append(up_proj)
        down_projs.append(down_proj)
        moes.append(moe)

    # validation
    for i in range(validation_iter):
        expert_ids = torch.stack([torch.randperm(expert_num)[:n_routed_experts] for _ in range(qlen)]).contiguous()
        weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
        input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
        output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
        input = input / 100
        
        moe = moes[i % layer_num]
        CPUInfer.submit(
            moe.forward( 
                qlen,
                n_routed_experts, 
                expert_ids.data_ptr(), 
                weights.data_ptr(), 
                input.data_ptr(), 
                output.data_ptr()
            )
        )
        CPUInfer.sync()
        # print('cpuinfer output', output)

        gate_proj = gate_projs[i%layer_num]
        up_proj = up_projs[i%layer_num]
        down_proj = down_projs[i%layer_num]
        t_output = moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj)
        # print('torch output', t_output)

        diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
        print('diff = ', diff)
        assert(diff < 0.001)


================================================
FILE: archive/csrc/ktransformers_ext/ext_bindings.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022, Jianwei Dong
 * @Date         : 2024-07-22 02:03:22
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
// Python bindings
#include "cpu_backend/cpuinfer.h"
#if !defined(KTRANSFORMERS_USE_ROCM) && !defined(KTRANSFORMERS_USE_XPU) && !defined(KTRANSFORMERS_USE_NPU)
#include "device_launch_parameters.h"
#endif
#include "llamafile/flags.h"
#include "operators/kvcache/kvcache.h"
#include "operators/llamafile/linear.h"
#include "operators/llamafile/mlp.h"
#include "operators/llamafile/moe.h"

#if defined(__x86_64__) && defined(__HAS_AVX512F__) && defined(__HAS_AMX__)
#include "operators/amx/moe.hpp"
#endif

#include "pybind11/functional.h"
#include "pybind11/operators.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include <cstdint>
#include <iostream>
#include <memory>

namespace py = pybind11;
using namespace pybind11::literals;

// Binding functions for the KVCache class
class KVCacheBindings {
  public:
    class AttnBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            const ggml_fp16_t *q_in;
            ggml_fp16_t *output;
            float *attn_lse;
            int layer_idx;
            int generate_token_idx;
            int q_len;
            int batch_size;
            int max_block_num;
            int *block_table;
            int *cache_seqlens;
            int pick_block_num;
            int init_block_num;
            int local_block_num;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(
                &KVCache::attn, args_->kv_cache, args_->q_in, args_->output,
                args_->attn_lse, args_->layer_idx, args_->generate_token_idx,
                args_->q_len, args_->batch_size, args_->max_block_num,
                args_->block_table, args_->cache_seqlens, args_->pick_block_num,
                args_->init_block_num, args_->local_block_num);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t q_in, intptr_t output,
                           intptr_t attn_lse, int layer_idx,
                           int generate_token_idx, int q_len, int batch_size,
                           int max_block_num, intptr_t block_table,
                           intptr_t cache_seqlens, int pick_block_num,
                           int init_block_num, int local_block_num) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (const ggml_fp16_t *)q_in,
                                  (ggml_fp16_t *)output,
                                  (float *)attn_lse,
                                  layer_idx,
                                  generate_token_idx,
                                  q_len,
                                  batch_size,
                                  max_block_num,
                                  (int *)block_table,
                                  (int *)cache_seqlens,
                                  pick_block_num,
                                  init_block_num,
                                  local_block_num};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class GetAllKVCacheOneLayerBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            int layer_id;
            ggml_fp16_t *k_in;
            ggml_fp16_t *v_in;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::get_all_kvcache_one_layer,
                                     args_->kv_cache, args_->layer_id,
                                     args_->k_in, args_->v_in);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
                           int layer_id) {
            Args *args = new Args{nullptr, &kv_cache, layer_id,
                                  (ggml_fp16_t *)k_in, (ggml_fp16_t *)v_in};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class GetAndUpdateKVCacheFp16Bindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            ggml_fp16_t *k_in;
            ggml_fp16_t *v_in;
            int layer_id;
            int *block_table;
            int batch_size;
            int max_block_num;
            int *cache_seqlens;
            int q_len;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::get_and_update_kvcache_fp16,
                                     args_->kv_cache, args_->k_in, args_->v_in,
                                     args_->layer_id, args_->block_table,
                                     args_->batch_size, args_->max_block_num,
                                     args_->cache_seqlens, args_->q_len);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
                           int layer_id, intptr_t block_table, int batch_size,
                           int max_block_num, intptr_t cache_seqlens,
                           int q_len) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (ggml_fp16_t *)k_in,
                                  (ggml_fp16_t *)v_in,
                                  layer_id,
                                  (int *)block_table,
                                  batch_size,
                                  max_block_num,
                                  (int *)cache_seqlens,
                                  q_len};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class GetKVCacheFp16Bindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            ggml_fp16_t *k_in;
            ggml_fp16_t *v_in;
            int layer_id;
            int *block_table;
            int batch_size;
            int max_block_num;
            int *cache_seqlens;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(
                &KVCache::get_kvcache_fp16, args_->kv_cache, args_->k_in,
                args_->v_in, args_->layer_id, args_->block_table,
                args_->batch_size, args_->max_block_num, args_->cache_seqlens);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
                           int layer_id, intptr_t block_table, int batch_size,
                           int max_block_num, intptr_t cache_seqlens) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (ggml_fp16_t *)k_in,
                                  (ggml_fp16_t *)v_in,
                                  layer_id,
                                  (int *)block_table,
                                  batch_size,
                                  max_block_num,
                                  (int *)cache_seqlens};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class UpdateKVCacheFp16Bindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            ggml_fp16_t *k_in;
            ggml_fp16_t *v_in;
            int layer_id;
            int *block_table;
            int batch_size;
            int max_block_num;
            int *cache_seqlens;
            int q_len;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::update_kvcache_fp16,
                                     args_->kv_cache, args_->k_in, args_->v_in,
                                     args_->layer_id, args_->block_table,
                                     args_->batch_size, args_->max_block_num,
                                     args_->cache_seqlens, args_->q_len);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
                           int layer_id, intptr_t block_table, int batch_size,
                           int max_block_num, intptr_t cache_seqlens,
                           int q_len) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (ggml_fp16_t *)k_in,
                                  (ggml_fp16_t *)v_in,
                                  layer_id,
                                  (int *)block_table,
                                  batch_size,
                                  max_block_num,
                                  (int *)cache_seqlens,
                                  q_len};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class UpdateImportanceBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            const ggml_fp16_t *importance;
            int layer_id;
            int *block_table;
            int batch_size;
            int max_block_num;
            int *offset;
            int width;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(
                &KVCache::update_importance, args_->kv_cache, args_->importance,
                args_->layer_id, args_->block_table, args_->batch_size,
                args_->max_block_num, args_->offset, args_->width);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t importance, int layer_id,
                           intptr_t block_table, int batch_size,
                           int max_block_num, intptr_t offset, int width) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (const ggml_fp16_t *)importance,
                                  layer_id,
                                  (int *)block_table,
                                  batch_size,
                                  max_block_num,
                                  (int *)offset,
                                  width};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class AttnWithKVCacheBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            const ggml_fp16_t *q_in;
            const ggml_fp16_t *k_in;
            const ggml_fp16_t *v_in;
            ggml_fp16_t *output;
            float *attn_lse;
            int layer_idx;
            int generate_token_idx;
            int q_len;
            int batch_size;
            int max_block_num;
            int *block_table;
            int *cache_seqlens;
            int topk;
            int local;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(
                &KVCache::attn_with_kvcache, args_->kv_cache, args_->q_in,
                args_->k_in, args_->v_in, args_->output, args_->attn_lse,
                args_->layer_idx, args_->generate_token_idx, args_->q_len,
                args_->batch_size, args_->max_block_num, args_->block_table,
                args_->cache_seqlens, args_->topk, args_->local);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t q_in, intptr_t k_in,
                           intptr_t v_in, intptr_t output, intptr_t attn_lse,
                           int layer_idx, int generate_token_idx, int q_len,
                           int batch_size, int max_block_num,
                           intptr_t block_table, intptr_t cache_seqlens,
                           int topk, int local) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (const ggml_fp16_t *)q_in,
                                  (const ggml_fp16_t *)k_in,
                                  (const ggml_fp16_t *)v_in,
                                  (ggml_fp16_t *)output,
                                  (float *)attn_lse,
                                  layer_idx,
                                  generate_token_idx,
                                  q_len,
                                  batch_size,
                                  max_block_num,
                                  (int *)block_table,
                                  (int *)cache_seqlens,
                                  topk,
                                  local};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class ClearImportanceAllLayersBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            int *block_table;
            int *cache_seqlens;
            int batch_size;
            int max_block_num;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::clear_importance_all_layers,
                                     args_->kv_cache, args_->block_table,
                                     args_->cache_seqlens, args_->batch_size,
                                     args_->max_block_num);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t block_table,
                           intptr_t cache_seqlens, int batch_size,
                           int max_block_num) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (int *)block_table,
                                  (int *)cache_seqlens,
                                  batch_size,
                                  max_block_num};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class CalcAnchorAllLayersBindinds {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            int *block_table;
            int *cache_seqlens;
            int batch_size;
            int max_block_num;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::calc_anchor_all_layers,
                                     args_->kv_cache, args_->block_table,
                                     args_->cache_seqlens, args_->batch_size,
                                     args_->max_block_num);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t block_table,
                           intptr_t cache_seqlens, int batch_size,
                           int max_block_num) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (int *)block_table,
                                  (int *)cache_seqlens,
                                  batch_size,
                                  max_block_num};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class LoadKVCacheBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            std::string tensor_file_path;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::load_kvcache, args_->kv_cache,
                                     args_->tensor_file_path);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, std::string tensor_file_path) {
            Args *args =
                new Args{nullptr, &kv_cache, (std::string)tensor_file_path};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class DumpKVCacheBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            int *block_table;
            int cache_total_len;
            std::string tensor_file_path;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::dump_kvcache, args_->kv_cache,
                                     args_->block_table, args_->cache_total_len,
                                     args_->tensor_file_path);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t block_table,
                           int cache_total_len, std::string tensor_file_path) {
            Args *args =
                new Args{nullptr, &kv_cache, (int *)block_table,
                         cache_total_len, (std::string)tensor_file_path};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
};

class LinearBindings {
  public:
    class WarmUpBindinds {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            Linear *linear;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&Linear::warm_up, args_->linear);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(Linear &linear) {
            Args *args = new Args{nullptr, &linear};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class ForwardBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            Linear *linear;
            int qlen;
            const void *input;
            void *output;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&Linear::forward, args_->linear,
                                     args_->qlen, args_->input, args_->output);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(Linear &linear, int qlen, intptr_t input,
                           intptr_t output) {
            Args *args = new Args{nullptr, &linear, qlen, (const void *)input,
                                  (void *)output};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
};

class MLPBindings {
  public:
    class WarmUpBindinds {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            MLP *mlp;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&MLP::warm_up, args_->mlp);
        }
        static std::pair<intptr_t, intptr_t> cpuinfer_interface(MLP &mlp) {
            Args *args = new Args{nullptr, &mlp};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class ForwardBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            MLP *mlp;
            int qlen;
            const void *input;
            void *output;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&MLP::forward, args_->mlp, args_->qlen,
                                     args_->input, args_->output);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(MLP &mlp, int qlen, intptr_t input,
                           intptr_t output) {
            Args *args = new Args{nullptr, &mlp, qlen, (const void *)input,
                                  (void *)output};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
};

class MOEBindings {
  public:
    class WarmUpBindinds {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            MOE *moe;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&MOE::warm_up, args_->moe);
        }
        static std::pair<intptr_t, intptr_t> cpuinfer_interface(MOE &moe) {
            Args *args = new Args{nullptr, &moe};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class ForwardBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            MOE *moe;
            int qlen;
            int k;
            const uint64_t *expert_ids;
            const float *weights;
            const void *input;
            void *output;
            int *batch_size_tensor;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(
                &MOE::forward, args_->moe, args_->qlen, args_->k,
                args_->expert_ids, args_->weights, args_->input, args_->output, args_->batch_size_tensor);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(MOE &moe, int qlen, int k, intptr_t expert_ids,
                           intptr_t weights, intptr_t input, intptr_t output, intptr_t batch_size_tensor) {
            Args *args = new Args{nullptr,
                                  &moe,
                                  qlen,
                                  k,
                                  (const uint64_t *)expert_ids,
                                  (const float *)weights,
                                  (const void *)input,
                                  (void *)output,
                                  (int *)batch_size_tensor};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
};


#if defined(__x86_64__) && defined(__HAS_AVX512F__) && defined(__HAS_AMX__)
template<class T>
class AMX_MOEBindings {
  public:
    class WarmUpBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            AMX_MOE<T> *moe;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&AMX_MOE<T>::warm_up, args_->moe);
        }
        static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T> &moe) {
            Args *args = new Args{nullptr, &moe};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class LoadWeightsBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            AMX_MOE<T> *moe;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&AMX_MOE<T>::load_weights, args_->moe);
        }
        static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T> &moe) {
            Args *args = new Args{nullptr, &moe};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class ForwardBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            AMX_MOE<T> *moe;
            int qlen;
            int k;
            const uint64_t *expert_ids;
            const float *weights;
            const void *input;
            void *output;
            int *batch_size_tensor;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(
                &AMX_MOE<T>::forward, args_->moe, args_->qlen, args_->k,
                args_->expert_ids, args_->weights, args_->input, args_->output, args_->batch_size_tensor);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(AMX_MOE<T> &moe, int qlen, int k, intptr_t expert_ids,
                        intptr_t weights, intptr_t input, intptr_t output, intptr_t batch_size_tensor) {
            Args *args = new Args{nullptr,
                                &moe,
                                qlen,
                                k,
                                (const uint64_t *)expert_ids,
                                (const float *)weights,
                                (const void *)input,
                                (void *)output,
                                (int *)batch_size_tensor};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
};
#endif

PYBIND11_MODULE(cpuinfer_ext, m) {
    py::class_<CPUInfer>(m, "CPUInfer")
        .def(py::init<int>())
        .def("submit", &CPUInfer::submit)
        .def("submit_with_cuda_stream", &CPUInfer::submit_with_cuda_stream)
        .def("sync", &CPUInfer::sync)
        .def("sync_with_cuda_stream", &CPUInfer::sync_with_cuda_stream);

    auto linear_module = m.def_submodule("linear");
    py::class_<LinearConfig>(linear_module, "LinearConfig")
        .def(py::init([](int hidden_size, int intermediate_size, int stride,
                         int group_max_len, intptr_t proj, int proj_type,
                         int hidden_type) {
            return LinearConfig(hidden_size, intermediate_size, stride,
                                group_max_len, (void *)proj,
                                (ggml_type)proj_type, (ggml_type)hidden_type);
        }));
    py::class_<Linear>(linear_module, "Linear")
        .def(py::init<LinearConfig>())
        .def("warm_up", &LinearBindings::WarmUpBindinds::cpuinfer_interface)
        .def("forward", &LinearBindings::ForwardBindings::cpuinfer_interface);

    auto mlp_module = m.def_submodule("mlp");
    py::class_<MLPConfig>(mlp_module, "MLPConfig")
        .def(py::init([](int hidden_size, int intermediate_size, int stride,
                         int group_max_len, intptr_t gate_proj,
                         intptr_t up_proj, intptr_t down_proj, int gate_type,
                         int up_type, int down_type, int hidden_type) {
            return MLPConfig(hidden_size, intermediate_size, stride,
                             group_max_len, (void *)gate_proj, (void *)up_proj,
                             (void *)down_proj, (ggml_type)gate_type,
                             (ggml_type)up_type, (ggml_type)down_type,
                             (ggml_type)hidden_type);
        }));
    py::class_<MLP>(mlp_module, "MLP")
        .def(py::init<MLPConfig>())
        .def("warm_up", &MLPBindings::WarmUpBindinds::cpuinfer_interface)
        .def("forward", &MLPBindings::ForwardBindings::cpuinfer_interface);

    auto moe_module = m.def_submodule("moe");
    py::class_<MOEConfig>(moe_module, "MOEConfig")
        .def(py::init([](int expert_num, int routed_expert_num, int hidden_size,
                         int intermediate_size, int stride, int group_min_len,
                         int group_max_len, bool use_silu, intptr_t gate_proj,
                         intptr_t up_proj, intptr_t down_proj, int gate_type,
                         int up_type, int down_type, int hidden_type) {
            return MOEConfig(expert_num, routed_expert_num, hidden_size,
                             intermediate_size, stride, group_min_len,
                             group_max_len, use_silu, (void *)gate_proj, (void *)up_proj,
                             (void *)down_proj, (ggml_type)gate_type,
                             (ggml_type)up_type, (ggml_type)down_type,
                             (ggml_type)hidden_type);
        }));
    py::class_<MOE>(moe_module, "MOE")
        .def(py::init<MOEConfig>())
        .def("warm_up", &MOEBindings::WarmUpBindinds::cpuinfer_interface)
        .def("forward", &MOEBindings::ForwardBindings::cpuinfer_interface);


    #if defined(__x86_64__) && defined(__HAS_AVX512F__) && defined(__HAS_AMX__)
    py::class_<AMX_MOEConfig>(moe_module, "AMX_MOEConfig")
        .def(py::init([](int expert_num, int routed_expert_num, int hidden_size,
                         int intermediate_size,
                         int max_len, bool use_silu, intptr_t gate_proj,
                         intptr_t up_proj, intptr_t down_proj) {
            return AMX_MOEConfig(expert_num, routed_expert_num, hidden_size,
                                 intermediate_size, 
                                 max_len, use_silu, (void *)gate_proj,
                                 (void *)up_proj, (void *)down_proj);
        }));

    py::class_<AMX_MOE<amx::GemmKernel224BF>>(moe_module, "AMXBF16_MOE")
        .def(py::init<AMX_MOEConfig>())
        .def("warm_up", &AMX_MOEBindings<amx::GemmKernel224BF>::WarmUpBindings::cpuinfer_interface)
        .def("load_weights", &AMX_MOEBindings<amx::GemmKernel224BF>::LoadWeightsBindings::cpuinfer_interface)
        .def("forward", &AMX_MOEBindings<amx::GemmKernel224BF>::ForwardBindings::cpuinfer_interface);
    py::class_<AMX_MOE<amx::GemmKernel224Int8>>(moe_module, "AMXInt8_MOE")
        .def(py::init<AMX_MOEConfig>())
        .def("warm_up", &AMX_MOEBindings<amx::GemmKernel224Int8>::WarmUpBindings::cpuinfer_interface)
        .def("load_weights", &AMX_MOEBindings<amx::GemmKernel224Int8>::LoadWeightsBindings::cpuinfer_interface)
        .def("forward", &AMX_MOEBindings<amx::GemmKernel224Int8>::ForwardBindings::cpuinfer_interface);

    #endif

    auto kvcache_module = m.def_submodule("kvcache");

    py::enum_<AnchorType>(kvcache_module, "AnchorType")
        .value("FIXED", AnchorType::FIXED_ANCHOR)
        .value("DYNAMIC", AnchorType::DYNAMIC)
        .value("QUEST", AnchorType::QUEST)
        .value("BLOCK_MAX", AnchorType::BLOCK_MAX)
        .value("BLOCK_MEAN", AnchorType::BLOCK_MEAN);
    py::enum_<ggml_type>(kvcache_module, "ggml_type")
        .value("FP16", ggml_type::GGML_TYPE_F16)
        .value("FP32", ggml_type::GGML_TYPE_F32)
        .value("Q4_0", ggml_type::GGML_TYPE_Q4_0)
        .value("Q8_0", ggml_type::GGML_TYPE_Q8_0);
    py::enum_<RetrievalType>(kvcache_module, "RetrievalType")
        .value("LAYER", RetrievalType::LAYER)
        .value("KVHEAD", RetrievalType::KVHEAD)
        .value("QHEAD", RetrievalType::QHEAD);

    py::class_<KVCacheConfig>(kvcache_module, "KVCacheConfig")
        .def(py::init<int, int, int, int, int, int, AnchorType, ggml_type,
                      RetrievalType, int, int, int, int, int, int>())
        .def_readwrite("layer_num", &KVCacheConfig::layer_num)
        .def_readwrite("kv_head_num", &KVCacheConfig::kv_head_num)
        .def_readwrite("q_head_num", &KVCacheConfig::q_head_num)
        .def_readwrite("head_dim", &KVCacheConfig::head_dim)
        .def_readwrite("block_len", &KVCacheConfig::block_len)
        .def_readwrite("anchor_num", &KVCacheConfig::anchor_num)
        .def_readwrite("anchor_type", &KVCacheConfig::anchor_type)
        .def_readwrite("kv_type", &KVCacheConfig::kv_type)
        .def_readwrite("retrieval_type", &KVCacheConfig::retrieval_type)
        .def_readwrite("layer_step", &KVCacheConfig::layer_step)
        .def_readwrite("token_step", &KVCacheConfig::token_step)
        .def_readwrite("layer_offset", &KVCacheConfig::layer_offset)
        .def_readwrite("max_block_num", &KVCacheConfig::max_block_num)
        .def_readwrite("max_batch_size", &KVCacheConfig::max_batch_size)
        .def_readwrite("max_thread_num", &KVCacheConfig::max_thread_num);
    py::class_<KVCache>(kvcache_module, "KVCache")
        .def(py::init<KVCacheConfig>())
        .def("get_cache_total_len", &KVCache::get_cache_total_len)
        .def("update_cache_total_len",
             [](KVCache &kvcache, int cache_total_len) {
                 kvcache.update_cache_total_len(cache_total_len);
             })
        .def("attn", &KVCacheBindings::AttnBindings::cpuinfer_interface)
        .def(
            "get_all_kvcache_one_layer",
            &KVCacheBindings::GetAllKVCacheOneLayerBindings::cpuinfer_interface)
        .def("get_and_update_kvcache_fp16",
             &KVCacheBindings::GetAndUpdateKVCacheFp16Bindings::
                 cpuinfer_interface)
        .def("get_kvcache_fp16",
             &KVCacheBindings::GetKVCacheFp16Bindings::cpuinfer_interface)
        .def("update_kvcache_fp16",
             &KVCacheBindings::UpdateKVCacheFp16Bindings::cpuinfer_interface)
        .def("update_importance",
             &KVCacheBindings::UpdateImportanceBindings::cpuinfer_interface)
        .def("attn_with_kvcache",
             &KVCacheBindings::AttnWithKVCacheBindings::cpuinfer_interface)
        .def("clear_importance_all_layers",
             &KVCacheBindings::ClearImportanceAllLayersBindings::
                 cpuinfer_interface)
        .def("calc_anchor_all_layers",
             &KVCacheBindings::CalcAnchorAllLayersBindinds::cpuinfer_interface);
}

================================================
FILE: archive/csrc/ktransformers_ext/operators/amx/la/amx.hpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#pragma once
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <immintrin.h>
#include <iostream>
#include <random>
#include <stdexcept>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

#include "utils.hpp"
#include <memory>

#if (defined(_WIN32) || defined(_WIN64))
#define RESTRICT __restrict
#else
#define RESTRICT __restrict__
#endif

#if (defined(_WIN32) || defined(_WIN64))
#define ALWAYS_INLINE __forceinline
#elif __has_attribute(always_inline) || defined(__GNUC__)
#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
#else
#define ALWAYS_INLINE inline
#endif

namespace amx {

#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18

const int TMMCount = 8;
const int MaxTileHeight = 16;
const int MaxTileWidth = 64;

const int AMX_BLK_SIZE = 32;

#define TMM0 0
#define TMM1 1
#define TMM2 2
#define TMM3 3
#define TMM4 4
#define TMM5 5
#define TMM6 6
#define TMM7 7

inline bool enable_amx() {
  static thread_local bool initialized = false;
  if (initialized) {
    return true;
  }
  initialized = true;

  if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
    printf("\n Fail to do XFEATURE_XTILEDATA \n\n");
    return false;
  } else {
    // printf("\n TILE DATA USE SET - OK \n\n");
    return true;
  }
  return true;
}

struct alignas(64) TileConfig {
  uint8_t palette;
  uint8_t start_row;
  std::array<uint8_t, 14> __0 = {};
  std::array<uint16_t, 8> colsb;
  std::array<uint8_t, 16> __1 = {};
  std::array<uint8_t, 8> rows;
  std::array<uint8_t, 8> __2 = {};

  TileConfig() {
    palette = 1;
    start_row = 0;
    for (int i = 0; i < 8; i++) {
      set_row_col(i, 0, 0);
    }
  }

  void set_row_col(int i, uint8_t row, uint16_t col) {
    colsb[i] = col;
    rows[i] = row;
  }

  void set_config() { _tile_loadconfig(this); }

  static void load_data(int to, void *from, size_t stride) {
    switch (to) {
    case 0:
      _tile_loadd(0, from, stride);
      break;
    case 1:
      _tile_loadd(1, from, stride);
      break;
    case 2:
      _tile_loadd(2, from, stride);
      break;
    case 3:
      _tile_loadd(3, from, stride);
      break;
    case 4:
      _tile_loadd(4, from, stride);
      break;
    case 5:
      _tile_loadd(5, from, stride);
      break;
    case 6:
      _tile_loadd(6, from, stride);
      break;
    case 7:
      _tile_loadd(7, from, stride);
      break;
    default:
      throw std::runtime_error("no such tile");
    }
  }

  static void store_data(int from, void *to, size_t stride) {
    switch (from) {
    case 0:
      _tile_stored(0, to, stride);
      break;
    case 1:
      _tile_stored(1, to, stride);
      break;
    case 2:
      _tile_stored(2, to, stride);
      break;
    case 3:
      _tile_stored(3, to, stride);
      break;
    case 4:
      _tile_stored(4, to, stride);
      break;
    case 5:
      _tile_stored(5, to, stride);
      break;
    case 6:
      _tile_stored(6, to, stride);
      break;
    case 7:
      _tile_stored(7, to, stride);
      break;
    default:
      throw std::runtime_error("no such tile");
    }
  }
};

static_assert(sizeof(TileConfig) == 64);

inline void debug_tile(int t) {
  printf("Tile %d\n", t);
  uint8_t data[16][64] = {};
  TileConfig::store_data(t, data, 64);
  for (int i = 0; i < 16; i++) {
    for (int j = 0; j < 64; j++) {
      printf("%3d ", data[i][j]);
    }
    printf("\n");
  }
  printf("\n");
}

inline void debug_tiles(int to = 8) {
  for (int i = 0; i < to; i++) {
    debug_tile(i);
  }
}

inline void debug_m512(__m512 x) {
  float data[16];
  _mm512_storeu_ps(data, x);
  for (int i = 0; i < 16; i++) {
    printf("%f ", data[i]);
  }
  printf("\n");
}

// transpose utils
inline void transpose_16x16_32bit(__m512i *v) {
  __m512i v1[16];
  v1[0] = _mm512_unpacklo_epi32(v[0], v[1]);
  v1[1] = _mm512_unpackhi_epi32(v[0], v[1]);
  v1[2] = _mm512_unpacklo_epi32(v[2], v[3]);
  v1[3] = _mm512_unpackhi_epi32(v[2], v[3]);
  v1[4] = _mm512_unpacklo_epi32(v[4], v[5]);
  v1[5] = _mm512_unpackhi_epi32(v[4], v[5]);
  v1[6] = _mm512_unpacklo_epi32(v[6], v[7]);
  v1[7] = _mm512_unpackhi_epi32(v[6], v[7]);
  v1[8] = _mm512_unpacklo_epi32(v[8], v[9]);
  v1[9] = _mm512_unpackhi_epi32(v[8], v[9]);
  v1[10] = _mm512_unpacklo_epi32(v[10], v[11]);
  v1[11] = _mm512_unpackhi_epi32(v[10], v[11]);
  v1[12] = _mm512_unpacklo_epi32(v[12], v[13]);
  v1[13] = _mm512_unpackhi_epi32(v[12], v[13]);
  v1[14] = _mm512_unpacklo_epi32(v[14], v[15]);
  v1[15] = _mm512_unpackhi_epi32(v[14], v[15]);

  v[0] = _mm512_unpacklo_epi64(v1[0], v1[2]);
  v[1] = _mm512_unpackhi_epi64(v1[0], v1[2]);
  v[2] = _mm512_unpacklo_epi64(v1[1], v1[3]);
  v[3] = _mm512_unpackhi_epi64(v1[1], v1[3]);
  v[4] = _mm512_unpacklo_epi64(v1[4], v1[6]);
  v[5] = _mm512_unpackhi_epi64(v1[4], v1[6]);
  v[6] = _mm512_unpacklo_epi64(v1[5], v1[7]);
  v[7] = _mm512_unpackhi_epi64(v1[5], v1[7]);
  v[8] = _mm512_unpacklo_epi64(v1[8], v1[10]);
  v[9] = _mm512_unpackhi_epi64(v1[8], v1[10]);
  v[10] = _mm512_unpacklo_epi64(v1[9], v1[11]);
  v[11] = _mm512_unpackhi_epi64(v1[9], v1[11]);
  v[12] = _mm512_unpacklo_epi64(v1[12], v1[14]);
  v[13] = _mm512_unpackhi_epi64(v1[12], v1[14]);
  v[14] = _mm512_unpacklo_epi64(v1[13], v1[15]);
  v[15] = _mm512_unpackhi_epi64(v1[13], v1[15]);

  v1[0] = _mm512_shuffle_i32x4(v[0], v[4], 0x88);
  v1[1] = _mm512_shuffle_i32x4(v[1], v[5], 0x88);
  v1[2] = _mm512_shuffle_i32x4(v[2], v[6], 0x88);
  v1[3] = _mm512_shuffle_i32x4(v[3], v[7], 0x88);
  v1[4] = _mm512_shuffle_i32x4(v[0], v[4], 0xdd);
  v1[5] = _mm512_shuffle_i32x4(v[1], v[5], 0xdd);
  v1[6] = _mm512_shuffle_i32x4(v[2], v[6], 0xdd);
  v1[7] = _mm512_shuffle_i32x4(v[3], v[7], 0xdd);
  v1[8] = _mm512_shuffle_i32x4(v[8], v[12], 0x88);
  v1[9] = _mm512_shuffle_i32x4(v[9], v[13], 0x88);
  v1[10] = _mm512_shuffle_i32x4(v[10], v[14], 0x88);
  v1[11] = _mm512_shuffle_i32x4(v[11], v[15], 0x88);
  v1[12] = _mm512_shuffle_i32x4(v[8], v[12], 0xdd);
  v1[13] = _mm512_shuffle_i32x4(v[9], v[13], 0xdd);
  v1[14] = _mm512_shuffle_i32x4(v[10], v[14], 0xdd);
  v1[15] = _mm512_shuffle_i32x4(v[11], v[15], 0xdd);

  v[0] = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88);
  v[1] = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88);
  v[2] = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88);
  v[3] = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88);
  v[4] = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88);
  v[5] = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88);
  v[6] = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88);
  v[7] = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88);
  v[8] = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd);
  v[9] = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd);
  v[10] = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd);
  v[11] = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd);
  v[12] = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd);
  v[13] = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd);
  v[14] = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd);
  v[15] = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd);
}

/*
  Transpose 16x16 32-bit elements
  Note that v must be 64 byte aligned
*/
inline void transpose_16x16_32bit(__m512i *v, size_t stride) {
  assert(reinterpret_cast<intptr_t>(v) % 64 == 0 && "v must be 64 aligned");

  auto stride_v = [=](int i) { return offset_pointer(v, i * stride); };
  __m512i v1[16];

  v1[0] = _mm512_unpacklo_epi32(*stride_v(0), *stride_v(1));
  v1[1] = _mm512_unpackhi_epi32(*stride_v(0), *stride_v(1));
  v1[2] = _mm512_unpacklo_epi32(*stride_v(2), *stride_v(3));
  v1[3] = _mm512_unpackhi_epi32(*stride_v(2), *stride_v(3));
  v1[4] = _mm512_unpacklo_epi32(*stride_v(4), *stride_v(5));
  v1[5] = _mm512_unpackhi_epi32(*stride_v(4), *stride_v(5));
  v1[6] = _mm512_unpacklo_epi32(*stride_v(6), *stride_v(7));
  v1[7] = _mm512_unpackhi_epi32(*stride_v(6), *stride_v(7));
  v1[8] = _mm512_unpacklo_epi32(*stride_v(8), *stride_v(9));
  v1[9] = _mm512_unpackhi_epi32(*stride_v(8), *stride_v(9));
  v1[10] = _mm512_unpacklo_epi32(*stride_v(10), *stride_v(11));
  v1[11] = _mm512_unpackhi_epi32(*stride_v(10), *stride_v(11));
  v1[12] = _mm512_unpacklo_epi32(*stride_v(12), *stride_v(13));
  v1[13] = _mm512_unpackhi_epi32(*stride_v(12), *stride_v(13));
  v1[14] = _mm512_unpacklo_epi32(*stride_v(14), *stride_v(15));
  v1[15] = _mm512_unpackhi_epi32(*stride_v(14), *stride_v(15));

  *stride_v(0) = _mm512_unpacklo_epi64(v1[0], v1[2]);
  *stride_v(1) = _mm512_unpackhi_epi64(v1[0], v1[2]);
  *stride_v(2) = _mm512_unpacklo_epi64(v1[1], v1[3]);
  *stride_v(3) = _mm512_unpackhi_epi64(v1[1], v1[3]);
  *stride_v(4) = _mm512_unpacklo_epi64(v1[4], v1[6]);
  *stride_v(5) = _mm512_unpackhi_epi64(v1[4], v1[6]);
  *stride_v(6) = _mm512_unpacklo_epi64(v1[5], v1[7]);
  *stride_v(7) = _mm512_unpackhi_epi64(v1[5], v1[7]);
  *stride_v(8) = _mm512_unpacklo_epi64(v1[8], v1[10]);
  *stride_v(9) = _mm512_unpackhi_epi64(v1[8], v1[10]);
  *stride_v(10) = _mm512_unpacklo_epi64(v1[9], v1[11]);
  *stride_v(11) = _mm512_unpackhi_epi64(v1[9], v1[11]);
  *stride_v(12) = _mm512_unpacklo_epi64(v1[12], v1[14]);
  *stride_v(13) = _mm512_unpackhi_epi64(v1[12], v1[14]);
  *stride_v(14) = _mm512_unpacklo_epi64(v1[13], v1[15]);
  *stride_v(15) = _mm512_unpackhi_epi64(v1[13], v1[15]);

  v1[0] = _mm512_shuffle_i32x4(*stride_v(0), *stride_v(4), 0x88);
  v1[1] = _mm512_shuffle_i32x4(*stride_v(1), *stride_v(5), 0x88);
  v1[2] = _mm512_shuffle_i32x4(*stride_v(2), *stride_v(6), 0x88);
  v1[3] = _mm512_shuffle_i32x4(*stride_v(3), *stride_v(7), 0x88);
  v1[4] = _mm512_shuffle_i32x4(*stride_v(0), *stride_v(4), 0xdd);
  v1[5] = _mm512_shuffle_i32x4(*stride_v(1), *stride_v(5), 0xdd);
  v1[6] = _mm512_shuffle_i32x4(*stride_v(2), *stride_v(6), 0xdd);
  v1[7] = _mm512_shuffle_i32x4(*stride_v(3), *stride_v(7), 0xdd);
  v1[8] = _mm512_shuffle_i32x4(*stride_v(8), *stride_v(12), 0x88);
  v1[9] = _mm512_shuffle_i32x4(*stride_v(9), *stride_v(13), 0x88);
  v1[10] = _mm512_shuffle_i32x4(*stride_v(10), *stride_v(14), 0x88);
  v1[11] = _mm512_shuffle_i32x4(*stride_v(11), *stride_v(15), 0x88);
  v1[12] = _mm512_shuffle_i32x4(*stride_v(8), *stride_v(12), 0xdd);
  v1[13] = _mm512_shuffle_i32x4(*stride_v(9), *stride_v(13), 0xdd);
  v1[14] = _mm512_shuffle_i32x4(*stride_v(10), *stride_v(14), 0xdd);
  v1[15] = _mm512_shuffle_i32x4(*stride_v(11), *stride_v(15), 0xdd);

  *stride_v(0) = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88);
  *stride_v(1) = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88);
  *stride_v(2) = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88);
  *stride_v(3) = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88);
  *stride_v(4) = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88);
  *stride_v(5) = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88);
  *stride_v(6) = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88);
  *stride_v(7) = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88);
  *stride_v(8) = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd);
  *stride_v(9) = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd);
  *stride_v(10) = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd);
  *stride_v(11) = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd);
  *stride_v(12) = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd);
  *stride_v(13) = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd);
  *stride_v(14) = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd);
  *stride_v(15) = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd);
}

struct GemmKernel224BF {
  using dt = ggml_bf16_t;
  using output_t = float;
  static const int TILE_M = 16;
  static const int TILE_K = 32;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 2;

  static const int M_STEP = TILE_M * 2;
  static const int N_STEP = TILE_N * 2;
  static const int K_STEP = TILE_K;

  static inline const int N_BLOCK = 256;
  static inline const int K_BLOCK = 1792;

  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {
    enable_amx();
    TileConfig tile_config;

    // size is 16 x 32
    for (int i = 0; i < 2; i++)
      tile_config.set_row_col(i, TILE_M, TILE_K * sizeof(dt));

    // size is 16 x 32
    for (int i = 2; i < 4; i++)
      tile_config.set_row_col(i, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK * sizeof(dt));

    // size is 16 x 16
    for (int i = 4; i < 8; i++)
      tile_config.set_row_col(i, TILE_M, TILE_N * sizeof(output_t));

    tile_config.set_config();
  }

  static void load_a(dt *a, size_t lda) {
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
  }

  static void load_b(dt *b, size_t ldb) {
    _tile_loadd(2, b, ldb);
    _tile_loadd(3, offset_pointer(b, ldb * TILE_N), ldb);
  }

  static void clean_c() {
    _tile_zero(4);
    _tile_zero(5);
    _tile_zero(6);
    _tile_zero(7);
  }

  static void load_c(output_t *c, size_t ldc) {
    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_loadd(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
  }

  static void store_c(output_t *c, size_t ldc) {
    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_stored(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
  }

  static void run_tile() {
    _tile_dpbf16ps(4, 0, 2);
    _tile_dpbf16ps(5, 0, 3);
    _tile_dpbf16ps(6, 1, 2);
    _tile_dpbf16ps(7, 1, 3);
  }

  struct BufferA {
    ggml_bf16_t *a;
    int max_m, k;

    static size_t required_size(int max_m, int k) { return max_m * k * sizeof(ggml_bf16_t); }

    BufferA(int max_m, int k, void *ptr) : max_m(max_m), k(k) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(max_m % M_STEP == 0);
      assert(k % K_STEP == 0);
      a = reinterpret_cast<ggml_bf16_t *>(ptr);
    }

    void from_mat(int m, ggml_bf16_t *src, int ith, int nth) {
      assert(m <= max_m);
      assert(ith == 0 && nth == 1);
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
        for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
          int k_block_size = std::min(K_BLOCK, k - k_block_begin);
          for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
            for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
              __m512i *s = (__m512i *)(src + (m_begin + i) * k + k_block_begin + k_begin);
              __m512i *d = (__m512i *)(a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP +
                                       i * K_STEP);
              avx512_copy_32xbf16(s, d);
            }
          }
        }
      }
    }

    ggml_bf16_t *get_submat(int m, int k, int m_begin, int k_begin) {
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
      k_begin -= k_block_begin;
      int k_block_size = std::min(K_BLOCK, k - k_block_begin);
      return a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP;
    }
  };

  struct BufferB {
    ggml_bf16_t *b;
    int n, k;

    static size_t required_size(int n, int k) { return n * k * sizeof(ggml_bf16_t); }

    BufferB(int n, int k, void *ptr) : n(n), k(k) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(n % N_STEP == 0);
      assert(k % K_STEP == 0);
      b = reinterpret_cast<ggml_bf16_t *>(ptr);
    }

    void from_mat(ggml_bf16_t *src, int ith, int nth) {
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
          int k_block_size = std::min(K_BLOCK, k - k_block_begin);
          for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
            for (int i = 0; i < N_STEP; i++) {
              __m512i *s = (__m512i *)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin);
              __m512i *d = (__m512i *)(b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                       k_begin * N_STEP + i * K_STEP);
              avx512_copy_32xbf16(s, d);
            }
            transpose_16x16_32bit((__m512i *)(b + n_block_begin * k + k_block_begin * n_block_size +
                                              n_begin * k_block_size + k_begin * N_STEP));
            transpose_16x16_32bit((__m512i *)(b + n_block_begin * k + k_block_begin * n_block_size +
                                              n_begin * k_block_size + k_begin * N_STEP + TILE_N * K_STEP));
          }
        }
      }
    }

    ggml_bf16_t *get_submat(int n, int k, int n_begin, int k_begin) {
      int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
      n_begin -= n_block_begin;
      int n_block_size = std::min(N_BLOCK, n - n_block_begin);
      int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
      k_begin -= k_block_begin;
      int k_block_size = std::min(K_BLOCK, k - k_block_begin);
      return b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP;
    }
  };

  struct BufferC {
    float *c;
    int max_m, n;

    static size_t required_size(int max_m, int n) { return max_m * n * sizeof(float); }

    BufferC(int max_m, int n, void *ptr) : max_m(max_m), n(n) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(max_m % M_STEP == 0);
      assert(n % N_STEP == 0);
      c = reinterpret_cast<float *>(ptr);
    }

    void to_mat(int m, ggml_bf16_t *dst, int ith, int nth) {
      assert(m <= max_m);
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
        for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
          for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
            __m512 *x0 =
                (__m512 *)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP + i * N_STEP);
            __m512 *x1 = (__m512 *)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP +
                                    i * N_STEP + 16);
            avx512_32xfp32_to_32xbf16(x0, x1, (__m512i *)(dst + (m_begin + i) * n + n_block_begin + n_begin));
          }
        }
      }
    }

    float *get_submat(int m, int n, int m_begin, int n_begin) {
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
      int n_block_size = std::min(N_BLOCK, n - n_block_begin);
      n_begin -= n_block_begin;
      return c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP;
    }
  };
};

struct GemmKernel224Int8 {
  using dt = int8_t;
  using output_t = int32_t;
  static const int TILE_M = 16;
  static const int TILE_K = 64;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 4;

  static const int M_STEP = TILE_M * 2;
  static const int N_STEP = TILE_N * 2;
  static const int K_STEP = TILE_K;

  static inline const int N_BLOCK = 256;
  static inline const int K_BLOCK = 3584;

  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {
    enable_amx();
    TileConfig tile_config;

    // size is 16 x 64
    for (int i = 0; i < 2; i++)
      tile_config.set_row_col(i, TILE_M, TILE_K * sizeof(dt));

    // size is 16 x 64
    for (int i = 2; i < 4; i++)
      tile_config.set_row_col(i, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK * sizeof(dt));

    // size is 16 x 16
    for (int i = 4; i < 8; i++)
      tile_config.set_row_col(i, TILE_M, TILE_N * sizeof(output_t));

    tile_config.set_config();
  }

  static void load_a(dt *a, size_t lda) {
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
  }

  static void load_b(dt *b, size_t ldb) {
    _tile_loadd(2, b, ldb);
    _tile_loadd(3, offset_pointer(b, ldb * TILE_N), ldb);
  }

  static void clean_c() {
    _tile_zero(4);
    _tile_zero(5);
    _tile_zero(6);
    _tile_zero(7);
  }

  static void load_c(output_t *c, size_t ldc) {
    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_loadd(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
  }

  static void store_c(output_t *c, size_t ldc) {
    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_stored(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
  }

  static void run_tile() {
    _tile_dpbssd(4, 0, 2);
    _tile_dpbssd(5, 0, 3);
    _tile_dpbssd(6, 1, 2);
    _tile_dpbssd(7, 1, 3);
  }

  struct BufferA {
    int8_t *a;
    float *d;
    int max_m, k;

    static size_t required_size(int max_m, int k) { return max_m * k * sizeof(int8_t) + max_m * sizeof(float); }

    BufferA(int max_m, int k, void *ptr) : max_m(max_m), k(k) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(max_m % M_STEP == 0);
      assert(k % K_STEP == 0);
      a = reinterpret_cast<int8_t *>(ptr);
      d = reinterpret_cast<float *>(a + max_m * k);
    }

    void from_mat(int m, ggml_bf16_t *src, int ith, int nth) {
      assert(m <= max_m);
      assert(ith == 0 && nth == 1);
      for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
        for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
          float amax = 0.0f;
          for (int j = 0; j < k; j += 32) {
            __m512 f0, f1;
            avx512_32xbf16_to_32xfp32((__m512i *)(src + (m_begin + i) * k + j), &f0, &f1);
            amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
            amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
          }
          d[m_begin + i] = amax / ((1 << 7) - 1);
        }
      }
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
        for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
          int k_block_size = std::min(K_BLOCK, k - k_block_begin);
          for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
            for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
              __m512 id = _mm512_set1_ps(d[m_begin + i] ? 1.0f / d[m_begin + i] : 0.0f);
              int8_t *dst = a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP + i * K_STEP;
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32((__m512i *)(src + (m_begin + i) * k + k_block_begin + k_begin), &f0, &f1);
              avx512_32xbf16_to_32xfp32((__m512i *)(src + (m_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
              __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id));
              __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id));
              __m512i i2 = _mm512_cvtps_epi32(_mm512_mul_ps(f2, id));
              __m512i i3 = _mm512_cvtps_epi32(_mm512_mul_ps(f3, id));
              __m128i s0 = _mm512_cvtsepi32_epi8(i0);
              __m128i s1 = _mm512_cvtsepi32_epi8(i1);
              __m128i s2 = _mm512_cvtsepi32_epi8(i2);
              __m128i s3 = _mm512_cvtsepi32_epi8(i3);
              _mm_storeu_si128((__m128i *)dst, s0);
              _mm_storeu_si128((__m128i *)(dst + 16), s1);
              _mm_storeu_si128((__m128i *)(dst + 32), s2);
              _mm_storeu_si128((__m128i *)(dst + 48), s3);
            }
          }
        }
      }
    }

    int8_t *get_submat(int m, int k, int m_begin, int k_begin) {
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
      k_begin -= k_block_begin;
      int k_block_size = std::min(K_BLOCK, k - k_block_begin);
      return a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP;
    }

    float *get_scale(int m, int m_begin) { return d + m_begin; }
  };

  struct BufferB {
    int8_t *b;
    float *d;
    int n, k;

    static size_t required_size(int n, int k) { return n * k * sizeof(int8_t) + n * sizeof(float); }

    BufferB(int n, int k, void *ptr) : n(n), k(k) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(n % N_STEP == 0);
      assert(k % K_STEP == 0);
      b = reinterpret_cast<int8_t *>(ptr);
      d = reinterpret_cast<float *>(b + n * k);
    }

    void from_mat(ggml_bf16_t *src, int ith, int nth) {
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int i = 0; i < N_STEP; i++) {
          float amax = 0.0f;
          for (int j = 0; j < k; j += 32) {
            __m512 f0, f1;
            avx512_32xbf16_to_32xfp32((__m512i *)(src + (n_block_begin + n_begin + i) * k + j), &f0, &f1);
            amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
            amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
          }
          d[n_block_begin + n_begin + i] = amax / ((1 << 7) - 1);
        }
      }
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
          int k_block_size = std::min(K_BLOCK, k - k_block_begin);
          for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
            for (int i = 0; i < N_STEP; i++) {
              __m512 id = _mm512_set1_ps(d[n_block_begin + n_begin + i] ? 1.0f / d[n_block_begin + n_begin + i] : 0.0f);
              int8_t *dst = b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                            k_begin * N_STEP + i * K_STEP;
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32((__m512i *)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin),
                                        &f0, &f1);
              avx512_32xbf16_to_32xfp32(
                  (__m512i *)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
              __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id));
              __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id));
              __m512i i2 = _mm512_cvtps_epi32(_mm512_mul_ps(f2, id));
              __m512i i3 = _mm512_cvtps_epi32(_mm512_mul_ps(f3, id));
              __m128i s0 = _mm512_cvtsepi32_epi8(i0);
              __m128i s1 = _mm512_cvtsepi32_epi8(i1);
              __m128i s2 = _mm512_cvtsepi32_epi8(i2);
              __m128i s3 = _mm512_cvtsepi32_epi8(i3);
              _mm_storeu_si128((__m128i *)dst, s0);
              _mm_storeu_si128((__m128i *)(dst + 16), s1);
              _mm_storeu_si128((__m128i *)(dst + 32), s2);
              _mm_storeu_si128((__m128i *)(dst + 48), s3);
            }
            transpose_16x16_32bit((__m512i *)(b + n_block_begin * k + k_block_begin * n_block_size +
                                              n_begin * k_block_size + k_begin * N_STEP));
            transpose_16x16_32bit((__m512i *)(b + n_block_begin * k + k_block_begin * n_block_size +
                                              n_begin * k_block_size + k_begin * N_STEP + TILE_N * K_STEP));
          }
        }
      }
    }

    int8_t *get_submat(int n, int k, int n_begin, int k_begin) {
      int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
      n_begin -= n_block_begin;
      int n_block_size = std::min(N_BLOCK, n - n_block_begin);
      int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
      k_begin -= k_block_begin;
      int k_block_size = std::min(K_BLOCK, k - k_block_begin);
      return b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP;
    }

    float *get_scale(int n, int n_begin) { return d + n_begin; }
  };

  struct BufferC {
    float *c;
    int max_m, n;

    static size_t required_size(int max_m, int n) { return max_m * n * sizeof(float); }

    BufferC(int max_m, int n, void *ptr) : max_m(max_m), n(n) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(max_m % M_STEP == 0);
      assert(n % N_STEP == 0);
      c = reinterpret_cast<float *>(ptr);
    }

    void to_mat(int m, ggml_bf16_t *dst, int ith, int nth) {
      assert(m <= max_m);
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
        for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
          for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
            __m512 *x0 =
                (__m512 *)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP + i * N_STEP);
            __m512 *x1 = (__m512 *)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP +
                                    i * N_STEP + 16);
            avx512_32xfp32_to_32xbf16(x0, x1, (__m512i *)(dst + (m_begin + i) * n + n_block_begin + n_begin));
          }
        }
      }
    }

    float *get_submat(int m, int n, int m_begin, int n_begin) {
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
      int n_block_size = std::min(N_BLOCK, n - n_block_begin);
      n_begin -= n_block_begin;
      return c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP;
    }
  };
};

inline void mat_mul(int m, int n, int k, std::shared_ptr<GemmKernel224BF::BufferA> ba,
                    std::shared_ptr<GemmKernel224BF::BufferB> bb, std::shared_ptr<GemmKernel224BF::BufferC> bc, int ith,
                    int nth, bool use_amx) {
  using K = GemmKernel224BF;
  assert(n % K::N_STEP == 0);
  assert(k % K::K_STEP == 0);

  auto [n_start, n_end] = K::split_range_n(n, ith, nth);

  for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K::K_BLOCK) {
    for (int m_begin = 0; m_begin < m; m_begin += K::M_STEP) {
      for (int n_begin = n_start; n_begin < n_end; n_begin += K::N_STEP) {

        float *c = bc->get_submat(m, n, m_begin, n_begin);
        if (!use_amx) {
          __m512 *c512 = (__m512 *)c;
          if (k_block_begin == 0) {
            for (int m_i = 0; m_i < m && m_i < K::M_STEP; m_i++) {
              c512[m_i * 2] = _mm512_setzero_ps();
              c512[m_i * 2 + 1] = _mm512_setzero_ps();
            }
          }

          for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::K_STEP) {
            int32_t *a32 = (int32_t *)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
            __m512bh *b512 = (__m512bh *)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
            for (int m_i = 0; m_i < m && m_i < K::M_STEP; m_i++) {
              for (int k_i = 0; k_i < 16; k_i++) {
                __m512bh ma = (__m512bh)_mm512_set1_epi32(a32[m_i * 16 + k_i]);
                for (int n_i = 0; n_i < 2; n_i++) {
                  c512[m_i * 2 + n_i] = _mm512_dpbf16_ps(c512[m_i * 2 + n_i], ma, b512[n_i * 16 + k_i]);
                }
              }
            }
          }

        } else {
          if (k_block_begin == 0) {
            K::clean_c();
          } else {
            K::load_c(c, K::N_STEP * sizeof(float));
          }
          for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::K_STEP) {
            K::load_a(ba->get_submat(m, k, m_begin, k_block_begin + k_begin), K::K_STEP * sizeof(ggml_bf16_t));
            K::load_b(bb->get_submat(n, k, n_begin, k_block_begin + k_begin), K::K_STEP * sizeof(ggml_bf16_t));
            K::run_tile();
          }
          K::store_c(c, K::N_STEP * sizeof(float));
        }
      }
    }
  }
}

inline __m512i _mm512_dpbssd_epi32(__m512i src, __m512i a, __m512i b) {
  __m256i a_lo = _mm512_extracti64x4_epi64(a, 0);
  __m256i a_hi = _mm512_extracti64x4_epi64(a, 1);
  __m256i b_lo = _mm512_extracti64x4_epi64(b, 0);
  __m256i b_hi = _mm512_extracti64x4_epi64(b, 1);

  b_lo = _mm256_sign_epi8(b_lo, a_lo);
  b_hi = _mm256_sign_epi8(b_hi, a_hi);

  b = _mm512_inserti64x4(b, b_lo, 0);
  b = _mm512_inserti64x4(b, b_hi, 1);

  a = _mm512_abs_epi8(a);

  return _mm512_dpbusd_epi32(src, a, b);
}

inline void mat_mul(int m, int n, int k, std::shared_ptr<GemmKernel224Int8::BufferA> ba,
                    std::shared_ptr<GemmKernel224Int8::BufferB> bb, std::shared_ptr<GemmKernel224Int8::BufferC> bc,
                    int ith, int nth, bool use_amx) {
  using K = GemmKernel224Int8;
  assert(n % K::N_STEP == 0);
  assert(k % K::K_STEP == 0);

  auto [n_start, n_end] = K::split_range_n(n, ith, nth);

  for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K::K_BLOCK) {
    for (int m_begin = 0; m_begin < m; m_begin += K::M_STEP) {
      for (int n_begin = n_start; n_begin < n_end; n_begin += K::N_STEP) {
        float *c = bc->get_submat(m, n, m_begin, n_begin);

        if (!use_amx) {
          __m512i *c512 = (__m512i *)c;
          if (k_block_begin == 0) {
            for (int m_i = 0; m_i < m && m_i < K::M_STEP; m_i++) {
              c512[m_i * 2] = _mm512_setzero_si512();
              c512[m_i * 2 + 1] = _mm512_setzero_si512();
            }
          }

          for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::K_STEP) {
            static_assert(K::K_STEP * sizeof(int8_t) == sizeof(__m512i));
            static_assert(K::N_STEP / K::TILE_N == 2, "Must be lke this");

            int32_t *a32 = (int32_t *)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
            __m512i *b512 = (__m512i *)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
            for (int m_i = 0; m_i < m && m_i < K::M_STEP; m_i++) {
              for (int k_i = 0; k_i < 16; k_i++) {
                __m512i ma = _mm512_set1_epi32(a32[m_i * 16 + k_i]);
                for (int n_i = 0; n_i < 2; n_i++) {
                  c512[m_i * 2 + n_i] = _mm512_dpbssd_epi32(c512[m_i * 2 + n_i], ma, b512[n_i * 16 + k_i]);
                }
              }
            }
          }
        } else {
          if (k_block_begin == 0) {
            K::clean_c();
          } else {
            K::load_c((int32_t *)c, K::N_STEP * sizeof(int32_t));
          }
          for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::K_STEP) {
            K::load_a(ba->get_submat(m, k, m_begin, k_block_begin + k_begin), K::K_STEP * sizeof(int8_t));
            K::load_b(bb->get_submat(n, k, n_begin, k_block_begin + k_begin), K::K_STEP * sizeof(int8_t));
            K::run_tile();
          }
          K::store_c((int32_t *)c, K::N_STEP * sizeof(int32_t));
        }

        if (k_block_begin + K::K_BLOCK >= k) {
          int to = m - m_begin;
          if (m - m_begin > K::M_STEP) {
            to = K::M_STEP;
          }
          for (int i = 0; i < to; i++) {
            __m512 as = _mm512_set1_ps(*ba->get_scale(m, m_begin + i));
            __m512 bs = _mm512_load_ps(bb->get_scale(n, n_begin));
            __m512i now = _mm512_load_si512((__m512i *)(c + i * K::N_STEP));
            __m512 result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
            _mm512_store_ps((__m512 *)(c + i * K::N_STEP), result);
            bs = _mm512_load_ps(bb->get_scale(n, n_begin) + K::TILE_N);
            now = _mm512_load_si512((__m512i *)(c + i * K::N_STEP + K::TILE_N));
            result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
            _mm512_store_ps((__m512 *)(c + i * K::N_STEP + K::TILE_N), result);
          }
        }
      }
    }
  }
}

} // namespace amx

================================================
FILE: archive/csrc/ktransformers_ext/operators/amx/la/utils.hpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#pragma once
#include <cstdint>


template <typename T>
T* offset_pointer(T* ptr, std::size_t byte_offset) {
  return reinterpret_cast<T*>(reinterpret_cast<char*>(ptr) + byte_offset);
}

template <typename T>
const T* offset_pointer(const T* ptr, std::size_t byte_offset) {
  return reinterpret_cast<const T*>(reinterpret_cast<const char*>(ptr) + byte_offset);
}

template <typename T>
T* offset_pointer_row_major(T* t, int row, int col, std::size_t ld) {
  return offset_pointer(t, row * ld) + col;
}

template <typename T>
T* offset_pointer_col_major(T* t, int row, int col, std::size_t ld) {
  return offset_pointer(t, col * ld) + row;
}

static inline void avx512_copy_32xbf16(__m512i* src, __m512i* dst) {
  _mm512_storeu_si512(dst, _mm512_loadu_si512(src));
}

static inline void avx512_32xfp32_to_32xbf16(__m512* src0, __m512* src1, __m512i* dst) {
  _mm512_storeu_si512(dst, __m512i(_mm512_cvtne2ps_pbh(*src1, *src0)));
}

static inline void avx512_32xbf16_to_32xfp32(__m512i* src, __m512* dst0, __m512* dst1) {
  _mm512_storeu_ps(dst0, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(src))), 16)));
  _mm512_storeu_ps(dst1, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(src) + 1)), 16)));
}

================================================
FILE: archive/csrc/ktransformers_ext/operators/amx/moe.hpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_AMX_MOE_H
#define CPUINFER_OPERATOR_AMX_MOE_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

#include "la/amx.hpp"

#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>
void *numa_alloc_aligned(size_t size, int node, size_t alignment) {
  void *ptr = numa_alloc_onnode(size, node);
  assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
  return ptr;
}
#endif

static inline __m512 exp_avx512(__m512 x) {
  const __m512 log2e = _mm512_set1_ps(1.44269504089f);
  const __m512 c1 = _mm512_set1_ps(0.69314718056f);

  __m512 y = _mm512_mul_ps(x, log2e);
  __m512i int_part = _mm512_cvtps_epi32(y);
  __m512 frac_part = _mm512_sub_ps(y, _mm512_cvtepi32_ps(int_part));

  const __m512 poly_1 = _mm512_set1_ps(0.9999999995f);
  const __m512 poly_2 = _mm512_set1_ps(0.6931471805f);
  const __m512 poly_3 = _mm512_set1_ps(0.2402265069f);
  const __m512 poly_4 = _mm512_set1_ps(0.0555041087f);
  const __m512 poly_5 = _mm512_set1_ps(0.0096181291f);
  const __m512 poly_6 = _mm512_set1_ps(0.0013333558f);

  __m512 frac_exp = _mm512_fmadd_ps(
      frac_part, poly_6,
      _mm512_fmadd_ps(frac_part, poly_5,
                      _mm512_fmadd_ps(frac_part, poly_4,
                                      _mm512_fmadd_ps(frac_part, poly_3, _mm512_fmadd_ps(frac_part, poly_2, poly_1)))));

  __m512 two_pow_i = _mm512_scalef_ps(_mm512_set1_ps(1.0f), _mm512_cvtepi32_ps(int_part));
  return _mm512_mul_ps(two_pow_i, frac_exp);
}

static inline __m512 act_fn(__m512 gate_val, __m512 up_val) {
  __m512 neg_gate_val = _mm512_sub_ps(_mm512_setzero_ps(), gate_val);
  __m512 exp_neg_gate = exp_avx512(neg_gate_val);
  __m512 denom = _mm512_add_ps(_mm512_set1_ps(1.0f), exp_neg_gate);
  __m512 act_val = _mm512_div_ps(gate_val, denom);

  return _mm512_mul_ps(act_val, up_val);
}

static inline __m512 relu_act_fn(__m512 gate_val, __m512 up_val) {
  __m512 zero_vec = _mm512_setzero_ps();
  __m512 act_val = _mm512_max_ps(zero_vec, gate_val);
  return _mm512_mul_ps(act_val, up_val);
}

struct AMX_MOEConfig {
  int expert_num;
  int routed_expert_num;
  int hidden_size;
  int intermediate_size;
  int max_len;
  bool use_silu;
  void *gate_proj;
  void *up_proj;
  void *down_proj;

  AMX_MOEConfig() {}

  AMX_MOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int max_len, bool use_silu,
                void *gate_proj, void *up_proj, void *down_proj)
      : expert_num(expert_num), routed_expert_num(routed_expert_num), hidden_size(hidden_size),
        intermediate_size(intermediate_size), max_len(max_len), use_silu(use_silu), gate_proj(gate_proj), up_proj(up_proj),
        down_proj(down_proj) {}
};

template <class T> class AMX_MOE {
private:
  AMX_MOEConfig config_;
  void *gate_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  void *up_proj_;   // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  void *down_proj_; // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]

  ggml_bf16_t *m_local_input_;       // [routed_expert_num * max_len * hidden_size]
  ggml_bf16_t *m_local_gate_output_; // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_up_output_;   // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_down_output_; // [routed_expert_num * max_len * hidden_size]

  std::vector<std::vector<int>> m_local_pos_;          // [max_len, routed_expert_num]
  std::vector<int> m_local_num_;                       // [expert_num]
  std::vector<int> m_expert_id_map_;                   // [expert_num]
  std::vector<ggml_bf16_t *> m_local_input_ptr_;       // [expert_num]
  std::vector<ggml_bf16_t *> m_local_gate_output_ptr_; // [expert_num]
  std::vector<ggml_bf16_t *> m_local_up_output_ptr_;   // [expert_num]
  std::vector<ggml_bf16_t *> m_local_down_output_ptr_; // [expert_num]

  std::vector<std::shared_ptr<typename T::BufferA>> gate_up_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> gate_bc_;
  std::vector<std::shared_ptr<typename T::BufferC>> up_bc_;
  std::vector<std::shared_ptr<typename T::BufferA>> down_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> down_bc_;

#ifdef USE_NUMA
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> gate_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> up_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> down_bb_numa_;
#else
  std::vector<std::shared_ptr<typename T::BufferB>> gate_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> up_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> down_bb_;
#endif

public:
  AMX_MOE(AMX_MOEConfig config) {
    config_ = config;
    gate_proj_ = config_.gate_proj;
    up_proj_ = config_.up_proj;
    down_proj_ = config_.down_proj;

    std::vector<std::pair<void **, uint64_t>> m_mem_requests;
    m_mem_requests.push_back({(void **)&m_local_input_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    m_mem_requests.push_back({(void **)&m_local_gate_output_, sizeof(ggml_bf16_t) * config_.routed_expert_num *
                                                                  config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_up_output_, sizeof(ggml_bf16_t) * config_.routed_expert_num *
                                                                config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_down_output_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    std::vector<void *> gate_up_ba_ptr(config_.expert_num);
    std::vector<void *> gate_bc_ptr(config_.expert_num);
    std::vector<void *> up_bc_ptr(config_.expert_num);
    std::vector<void *> down_ba_ptr(config_.expert_num);
    std::vector<void *> down_bc_ptr(config_.expert_num);
    for (int i = 0; i < config_.expert_num; i++) {
      m_mem_requests.push_back(
          {(void **)&gate_up_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.hidden_size)});
      m_mem_requests.push_back(
          {(void **)&gate_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&up_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&down_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&down_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.hidden_size)});
    }
    shared_mem_buffer.alloc(this, m_mem_requests);

    m_local_pos_.resize(config_.max_len);
    for (int i = 0; i < config_.max_len; i++) {
      m_local_pos_[i].resize(config_.routed_expert_num);
    }
    m_expert_id_map_.resize(config_.expert_num);
    m_local_num_.resize(config_.expert_num);
    m_local_input_ptr_.resize(config_.expert_num);
    m_local_gate_output_ptr_.resize(config_.expert_num);
    m_local_up_output_ptr_.resize(config_.expert_num);
    m_local_down_output_ptr_.resize(config_.expert_num);

    for (uint64_t i = 0; i < config_.expert_num; i++) {
      gate_up_ba_.push_back(
          std::make_shared<typename T::BufferA>(config_.max_len, config_.hidden_size, gate_up_ba_ptr[i]));
      gate_bc_.push_back(
          std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, gate_bc_ptr[i]));
      up_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, up_bc_ptr[i]));
      down_ba_.push_back(
          std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, down_ba_ptr[i]));
      down_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, down_bc_ptr[i]));

#ifdef USE_NUMA
      int numa_nodes = numa_num_configured_nodes();
      gate_bb_numa_.resize(numa_nodes);
      up_bb_numa_.resize(numa_nodes);
      down_bb_numa_.resize(numa_nodes);
      for (int j = 0; j < numa_nodes; j++) {
        void *gate_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
        gate_bb_numa_[j].push_back(
            std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));
        void *up_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
        up_bb_numa_[j].push_back(
            std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));
        void *down_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.hidden_size, config_.intermediate_size), j, 64);
        down_bb_numa_[j].push_back(
            std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
      }
#else
      void *gate_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      gate_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));

      void *up_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      up_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));

      void *down_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
      down_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
#endif
    }
  }

  ~AMX_MOE() { shared_mem_buffer.dealloc(this); }

  void load_weights(Backend *backend) {
    int nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
#ifdef USE_NUMA
          int numa_nodes = numa_num_configured_nodes();
          for (int j = 0; j < numa_nodes; j++) {
            gate_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.gate_proj +
                                                       expert_idx * config_.intermediate_size * config_.hidden_size,
                                                   ith, nth);
            up_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.up_proj +
                                                     expert_idx * config_.intermediate_size * config_.hidden_size,
                                                 ith, nth);
          }
#else
          gate_bb_[expert_idx]->from_mat((ggml_bf16_t *)config_.gate_proj +
                                             expert_idx * config_.intermediate_size * config_.hidden_size,
                                         ith, nth);
          up_bb_[expert_idx]->from_mat(
              (ggml_bf16_t *)config_.up_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
#endif
        },
        nullptr);
    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
#ifdef USE_NUMA
          int numa_nodes = numa_num_configured_nodes();
          for (int j = 0; j < numa_nodes; j++) {
            down_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.down_proj +
                                                       expert_idx * config_.hidden_size * config_.intermediate_size,
                                                   ith, nth);
          }
#else
          down_bb_[expert_idx]->from_mat((ggml_bf16_t *)config_.down_proj +
                                             expert_idx * config_.hidden_size * config_.intermediate_size,
                                         ith, nth);
#endif
        },
        nullptr);
  }

  void warm_up(Backend *backend) {}

  void forward(int qlen, int k, const uint64_t *expert_ids, const float *weights, const void *input, void *output,
               int *batch_size_tensor, Backend *backend) {
    qlen = batch_size_tensor[0];
    bool use_amx = (qlen > 4 * config_.expert_num / config_.routed_expert_num);
    int activated_expert = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
      for (int j = 0; j < k; j++) {
        m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
      }
    }
    for (int i = 0; i < config_.expert_num; i++) {
      if (m_local_num_[i] > 0) {
        m_expert_id_map_[activated_expert] = i;
        activated_expert++;
      }
    }
    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_input_ptr_[i] = m_local_input_ + offset * config_.hidden_size;
      m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
      m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
      m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
      offset += m_local_num_[i];
    }
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int j = 0; j < k; j++) {
            memcpy(m_local_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size,
                   (ggml_bf16_t *)input + i * config_.hidden_size, sizeof(ggml_bf16_t) * config_.hidden_size);
          }
        },
        nullptr);
    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];

          gate_up_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_input_ptr_[expert_idx], 0, 1);
        },
        nullptr);
    int nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_numa_[Backend::numa_node][expert_idx], gate_bc_[expert_idx],
                       ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_numa_[Backend::numa_node][expert_idx], up_bc_[expert_idx], ith,
                       nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_[expert_idx], up_bc_[expert_idx], ith, nth, use_amx);
#endif
          gate_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], ith, nth);
          up_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_up_output_ptr_[expert_idx], ith, nth);
          auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
          if (config_.use_silu) {
            for (int i = 0; i < m_local_num_[expert_idx]; i++) {
                ggml_bf16_t *gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
                ggml_bf16_t *up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
                for (int j = n_start; j < n_end; j += 32) {
                  __m512 gate_val0, gate_val1, up_val0, up_val1;
                  avx512_32xbf16_to_32xfp32((__m512i *)(gate_output_ptr + j), &gate_val0, &gate_val1);
                  avx512_32xbf16_to_32xfp32((__m512i *)(up_output_ptr + j), &up_val0, &up_val1);
                  __m512 result0 = act_fn(gate_val0, up_val0);
                  __m512 result1 = act_fn(gate_val1, up_val1);
                  avx512_32xfp32_to_32xbf16(&result0, &result1, (__m512i *)(gate_output_ptr + j));
                }
              }
          }
          else {
              for (int i = 0; i < m_local_num_[expert_idx]; i++) {
                ggml_bf16_t *gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
                ggml_bf16_t *up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
                for (int j = n_start; j < n_end; j += 32) {
                  __m512 gate_val0, gate_val1, up_val0, up_val1;
                  avx512_32xbf16_to_32xfp32((__m512i *)(gate_output_ptr + j), &gate_val0, &gate_val1);
                  avx512_32xbf16_to_32xfp32((__m512i *)(up_output_ptr + j), &up_val0, &up_val1);
                  __m512 result0 = relu_act_fn(gate_val0, up_val0);
                  __m512 result1 = relu_act_fn(gate_val1, up_val1);
                  avx512_32xfp32_to_32xbf16(&result0, &result1, (__m512i *)(gate_output_ptr + j));
                }
              }
          }
          
        },
        nullptr);
    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          down_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], 0, 1);
        },
        nullptr);
    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
                       down_bb_numa_[Backend::numa_node][expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
                       down_bb_[expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#endif
          down_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_down_output_ptr_[expert_idx], ith, nth);
        },
        nullptr);
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int e = 0; e < config_.hidden_size; e += 32) {
            __m512 x0 = _mm512_setzero_ps();
            __m512 x1 = _mm512_setzero_ps();
            for (int j = 0; j < k; j++) {
              __m512 weight = _mm512_set1_ps(weights[i * k + j]);
              __m512 down_output0, down_output1;
              avx512_32xbf16_to_32xfp32((__m512i *)(m_local_down_output_ptr_[expert_ids[i * k + j]] +
                                                    m_local_pos_[i][j] * config_.hidden_size + e),
                                        &down_output0, &down_output1);
              x0 = _mm512_fmadd_ps(down_output0, weight, x0);
              x1 = _mm512_fmadd_ps(down_output1, weight, x1);
            }
            avx512_32xfp32_to_32xbf16(&x0, &x1, (__m512i *)((ggml_bf16_t *)output + i * config_.hidden_size + e));
          }
        },
        nullptr);
  }
};

#endif


================================================
FILE: archive/csrc/ktransformers_ext/operators/kvcache/kvcache.h
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#ifndef CPUINFER_OPERATOR_KVCACHE_H
#define CPUINFER_OPERATOR_KVCACHE_H

#include <algorithm>
#include <atomic>
#include <cassert>
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <functional>
#include <future>
#include <iostream>
#include <memory>
#include <mutex>
#include <queue>
#include <random>
#include <stdexcept>
#include <thread>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "llama.cpp/ggml-common.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

#define CHUNK_SIZE 32

/**
 * @brief Converts a ggml_type enum value to its corresponding string
 * representation.
 *
 * This function provides a human-readable string representation for a given
 * ggml_type enum value. The string can be used for logging, debugging, or
 * displaying information in a user interface.
 *
 * @param type The ggml_type enum value to convert.
 * @return A string representation of the enum value.
 */
std::string ggml_type_to_string(ggml_type type);

/**
 * @enum AnchorType
 * @brief Defines the types of anchors used in attention mechanisms.
 *
 * This enum specifies different types of anchors that can be used in attention
 * mechanisms, such as fixed anchors, dynamic anchors, or special anchors like
 * QUEST, BLOCK_MEAN, or BLOCK_MAX.
 */
enum AnchorType {
    FIXED_ANCHOR, /**< A fixed anchor that does not change. */
    DYNAMIC,      /**< A dynamic anchor that can change over time. */
    QUEST, /**< A special anchor type used for QUEST (Query and Embedding Space
              Transformation). */
    BLOCK_MEAN, /**< An anchor based on the mean of a block of data. */
    BLOCK_MAX /**< An anchor based on the maximum value within a block of data.
               */
};

/**
 * @brief Converts an AnchorType enum value to its corresponding string
 * representation.
 *
 * This function provides a human-readable string representation for a given
 * AnchorType enum value. The string can be used for logging, debugging, or
 * displaying information in a user interface.
 *
 * @param anchor_type The AnchorType enum value to convert.
 * @return A string representation of the enum value.
 */
std::string AnchorTypeToString(AnchorType anchor_type);

/**
 * @enum RetrievalType
 * @brief Defines the types of retrieval strategies in attention mechanisms.
 *
 * This enum specifies different retrieval strategies that can be used in
 * attention mechanisms, such as layer-level retrieval, key-value head-level
 * retrieval, or query head-level retrieval.
 */
enum RetrievalType {
    LAYER,  /**< Retrieval at the layer level. */
    KVHEAD, /**< Retrieval at the key-value head level. */
    QHEAD   /**< Retrieval at the query head level. */
};

/**
 * @brief Converts a RetrievalType enum value to its corresponding string
 * representation.
 *
 * This function provides a human-readable string representation for a given
 * RetrievalType enum value. The string can be used for logging, debugging, or
 * displaying information in a user interface.
 *
 * @param retrieval_type The RetrievalType enum value to convert.
 * @return A string representation of the enum value.
 */
std::string RetrievalTypeToString(RetrievalType retrieval_type);

/**
 * @struct KVCacheConfig
 * @brief Configuration structure for Key-Value (KV) Cache.
 *
 * This structure holds configuration parameters for setting up and managing
 * a Key-Value (KV) Cache used in various attention mechanisms. It includes
 * parameters such as the number of layers, the number of heads, the dimension
 * of each head, block length, anchor information, and memory-related settings.
 */
struct KVCacheConfig {
    int layer_num;   /**< Number of layers in the model. */
    int kv_head_num; /**< Number of heads in the KV Cache. */
    int q_head_num;  /**< Number of heads in the query. */
    int head_dim;    /**< Dimension of each head. */
    int block_len;   /**< Length of each block in the cache. */
    int anchor_num;  /**< Number of anchors used in attention. */

    ggml_type kv_type; /**< Data type of the KV Cache (e.g., fp16, q8_0). */

    // Controls the pre-allocated memory size
    int max_block_num;  /**< Maximum number of blocks that can be allocated. */
    int max_batch_size; /**< Maximum batch size that can be processed. */
    int max_thread_num; /**< Maximum number of threads that can be used. */

    AnchorType
        anchor_type; /**< Type of anchors used in the attention mechanism. */
    RetrievalType
        retrieval_type; /**< Type of retrieval strategy used in the cache. */

    int layer_step;   /**< Step size between layers. */
    int token_step;   /**< Step size between tokens. */
    int layer_offset; /**< Offset value for layers. */

    /**
     * @brief Default constructor for KVCacheConfig.
     *
     * Initializes the configuration with default values. This constructor
     * does not initialize any member variables explicitly.
     */
    KVCacheConfig() = default;

    /**
     * @brief Parameterized constructor for KVCacheConfig.
     *
     * This constructor initializes the configuration with specific values
     * for all member variables.
     *
     * @param layer_num The number of layers in the model.
     * @param kv_head_num The number of heads in the KV Cache.
     * @param q_head_num The number of heads in the query.
     * @param head_dim The dimension of each head.
     * @param block_len The length of each block in the cache.
     * @param anchor_num The number of anchors used in attention.
     * @param anchor_type The type of anchors used in the attention mechanism.
     * @param kv_type The data type of the KV Cache (e.g., fp16, q8_0).
     * @param retrieval_type The type of retrieval strategy used in the cache.
     * @param layer_step The step size between layers.
     * @param token_step The step size between tokens.
     * @param layer_offset The offset value for layers.
     * @param max_block_num The maximum number of blocks that can be allocated.
     * @param max_batch_size The maximum batch size that can be processed.
     * @param max_thread_num The maximum number of threads that can be used.
     */
    KVCacheConfig(int layer_num, int kv_head_num, int q_head_num, int head_dim,
                  int block_len, int anchor_num, AnchorType anchor_type,
                  ggml_type kv_type, RetrievalType retrieval_type,
                  int layer_step, int token_step, int layer_offset,
                  int max_block_num, int max_batch_size, int max_thread_num);
};

/**
 * @class KVCache
 * @brief Manages the Key-Value (KV) Cache used in attention mechanisms.
 *
 * The KVCache class provides functionality for managing the Key-Value Cache,
 * including resizing the cache, retrieving configuration parameters, and
 * updating internal states. This class is typically used in transformer models
 * to store and manage past key and value states for efficient attention
 * computations.
 */
class KVCache {
  public:
    /**
     * @brief Constructs a KVCache object with the given configuration.
     *
     * Initializes the KVCache with the specified configuration parameters,
     * such as the number of layers, heads, head dimensions, and other
     * relevant settings.
     *
     * @param config The configuration object containing initialization
     * parameters.
     */
    KVCache(KVCacheConfig config);

    /**
     * @brief Resizes the number of threads used by the cache.
     *
     * This function adjusts the number of threads that the cache can utilize.
     * It allows dynamic reconfiguration of the parallel processing capabilities
     * based on the current workload or system resources.
     *
     * @param thread_num The new number of threads to use.
     */
    void ThreadResize(int thread_num);

    /**
     * @brief Resizes the batch size managed by the cache.
     *
     * This function adjusts the batch size that the cache can handle. It
     * is useful when the input batch size changes dynamically, allowing
     * the cache to be reconfigured accordingly.
     *
     * @param batch_size The new batch size.
     */
    void BatchResize(int batch_size);

    /**
     * @brief Resizes the number of blocks managed by the cache.
     *
     * This function adjusts the number of blocks that the cache can manage.
     * It allows dynamic reconfiguration of the block structure based on the
     * current sequence length or other factors.
     *
     * @param block_num The new number of blocks.
     */
    void BlockResize(int block_num);

    /**
     * @brief Gets the number of layers in the cache.
     *
     * @return The number of layers configured in the cache.
     */
    int get_layer_num() { return config_.layer_num; }

    /**
     * @brief Gets the number of KV heads in the cache.
     *
     * @return The number of KV heads configured in the cache.
     */
    int get_kv_head_num() { return config_.kv_head_num; }

    /**
     * @brief Gets the number of query heads in the cache.
     *
     * @return The number of query heads configured in the cache.
     */
    int get_q_head_num() { return config_.q_head_num; }

    /**
     * @brief Gets the dimension of each head in the cache.
     *
     * @return The dimension of each head.
     */
    int get_head_dim() { return config_.head_dim; }

    /**
     * @brief Gets the length of each block in the cache.
     *
     * @return The length of each block.
     */
    int get_block_len() { return config_.block_len; }

    /**
     * @brief Gets the number of blocks for a specific layer.
     *
     * @param layer_id The ID of the layer for which to retrieve the block
     * number.
     * @return The number of blocks in the specified layer.
     */
    int get_block_num(int layer_id) { return past_block_num_[layer_id]; }

    /**
     * @brief Gets the number of anchors in the cache.
     *
     * @return The number of anchors configured in the cache.
     */
    int get_anchor_num() { return config_.anchor_num; }

    /**
     * @brief Gets the total length of the cache.
     *
     * @return The total length of the cache.
     */
    int get_cache_total_len() { return cache_total_len_; }

    /**
     * @brief Gets the total number of blocks in the cache.
     *
     * This function computes and returns the total number of blocks in the
     * cache based on the total cache length and the block length configuration.
     *
     * @return The total number of blocks in the cache.
     */
    int get_cache_total_block_num() {
        return (cache_total_len_ + config_.block_len - 1) / config_.block_len;
    }

    /**
     * @brief Updates the total length of the cache.
     *
     * This function sets a new total length for the cache, allowing dynamic
     * adjustment of the cache size during runtime.
     *
     * @param cache_total_len The new total length of the cache.
     */
    void update_cache_total_len(int cache_total_len) {
        cache_total_len_ = cache_total_len;
    }
    void attn(const ggml_fp16_t *q_in, ggml_fp16_t *output, float *attn_lse,
              int layer_idx, int generate_token_idx, int q_len, int batch_size,
              int max_block_num, int *block_table, int *cache_seqlens,
              int pick_block_num, int init_block_num, int local_block_num,
              Backend *backend);

    void update_kvcache_one_block_fp16(const ggml_fp16_t *k_in,
                                       const ggml_fp16_t *v_in, int layer_id,
                                       int block_idx, Backend *backend);

    void get_kvcache_one_block_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
                                    int layer_id, int block_idx,
                                    Backend *backend);

    void update_importance_one_block(const ggml_fp16_t *importance,
                                     int layer_id, int block_idx,
                                     Backend *backend);
    void get_importance_one_block(ggml_fp16_t *importance, int layer_id,
                                  int block_idx, Backend *backend);

    void get_anchor_one_block(ggml_fp16_t *anchor, int layer_id, int block_idx,
                              Backend *backend);

    void update_anchor_one_block(const ggml_fp16_t *anchor, int layer_id,
                                 int block_idx, Backend *backend);

    void calc_anchor_all_layers(int *block_table, int *cache_seqlens,
                                int batch_size, int max_block_num,
                                Backend *backend);

    void load_kvcache(std::string tensor_file_path, Backend *backend);
    void dump_kvcache(int *block_table, int cache_total_len,
                      std::string tensor_file_path, Backend *backend);

    void get_and_update_kvcache_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
                                     int layer_id, int *block_table,
                                     int batch_size, int max_block_num,
                                     int *cache_seqlens, int q_len,
                                     Backend *backend);

    void get_kvcache_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in, int layer_id,
                          int *block_table, int batch_size, int max_block_num,
                          int *cache_seqlens, Backend *backend);

    void update_kvcache_fp16(const ggml_fp16_t *k_in, const ggml_fp16_t *v_in,
                             int layer_id, int *block_table, int batch_size,
                             int max_block_num, int *cache_seqlens, int q_len,
                             Backend *backend);

    void update_importance(const ggml_fp16_t *importance, int layer_id,
                           int *block_table, int batch_size, int max_block_num,
                           int *offset, int width, Backend *backend);

    void attn_with_kvcache(const ggml_fp16_t *q_in, const ggml_fp16_t *k_in,
                           const ggml_fp16_t *v_in, ggml_fp16_t *output,
                           float *attn_lse, int layer_idx,
                           int generate_token_idx, int q_len, int batch_size,
                           int max_block_num, int *block_table,
                           int *cache_seqlens, int topk, int local,
                           Backend *backend);

    void clear_importance_all_layers(int *block_table, int *cache_seqlens,
                                     int batch_size, int max_block_num,
                                     Backend *backend);

    void clear_kvcache_all_layers(int *block_table, int *cache_seqlens,
                                  int batch_size, int max_block_num,
                                  Backend *backend);

    void get_sincos(ggml_fp16_t *sin, ggml_fp16_t *cos, int seqlen);

    void get_attn_sparsity(const ggml_fp16_t *q_in, float *attn_sparsity,
                           int layer_idx, int generate_token_idx, int q_len,
                           int batch_size, int max_block_num, int *block_table,
                           int *cache_seqlens, int *block_table_origin,
                           int *cache_seqlens_origin, int max_block_num_origin,
                           int topk, int local, Backend *backend);

    void get_all_kvcache_one_layer(int layer_id, ggml_fp16_t *k_in,
                                   ggml_fp16_t *v_in, Backend *backend);

  private:
    // Persistent data
    KVCacheConfig config_;
    int n_gqa_;                            // q_head_num / kv_head_num
    int cache_total_len_;                  // Number of tokens in cache
    std::vector<uint64_t> past_block_num_; // [layer_num]
    std::vector<std::vector<std::vector<std::vector<block_q4_0>>>>
        k_cache_q4; // [layer_num, kv_head_num, past_block_num, block_len *
                    // (head_dim / QK_4)]
    std::vector<std::vector<std::vector<std::vector<block_q4_0>>>>
        v_cache_q4; // [layer_num, kv_head_num, past_block_num, head_dim *
                    // (block_len / QK_4)]
    std::vector<std::vector<std::vector<std::vector<block_q8_0>>>>
        k_cache_q8; // [layer_num, kv_head_num, past_block_num, block_len *
                    // (head_dim / QK_8)]
    std::vector<std::vector<std::vector<std::vector<block_q8_0>>>>
        v_cache_q8; // [layer_num, kv_head_num, past_block_num, head_dim *
                    // (block_len / QK_8)]

    std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
        k_cache_fp16_; // [layer_num, kv_head_num, past_block_num, block_len *
                       // head_dim]
    std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
        v_cache_fp16_; // [layer_num, kv_head_num, past_block_num, head_dim *
                       // block_len]

    std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
        importance_; // [layer_num, past_block_num, block_len,
                     // attention_head_num]

    std::vector<ggml_fp16_t>
        anchor_; // [layer_num * past_block_num * anchor_num *
                 // attention_head_num * head_dim]

    // Runtime data
    int64_t layer_id_;
    int64_t block_idx_;
    int *block_table_;
    uint64_t block_num_;
    int max_block_num_after_retrieval_;

    // Rotary positional embeddings
    std::vector<std::vector<ggml_fp16_t>> sin_; // [seq_len, head_dim]
    std::vector<std::vector<ggml_fp16_t>> cos_; // [seq_len, head_dim]

    // update/get
    int seq_len_;
    uint16_t *k_scales_;        // q4_0
    uint8_t *k_in_;             // q4_0
    uint16_t *v_scales_;        // q4_0
    uint8_t *v_in_;             // q4_0
    uint16_t *k_data_;          // fp16
    uint16_t *v_data_;          // fp16
    uint16_t *importance_data_; // fp16
    uint16_t *anchor_data_;     // fp16

    // sparsity = (sigma(block lse / lse))
    std::vector<std::vector<std::vector<float>>>
        block_lse_; // [batch_size, max_block_num, q_head_num]
    std::vector<std::vector<float>> attn_sparsity_; // [batch_size, q_head_num]

    // attn
    std::vector<std::vector<float>>
        avg_q; // [batch_size, q_head_num * head_dim]

    std::vector<std::vector<ggml_fp16_t>>
        avg_q_fp16; // [batch_size, q_head_num * head_dim]
    std::vector<
        std::priority_queue<std::pair<float, int>,
                            std::vector<std::pair<float, int>>, std::greater<>>>
        top_similar_block_;

    std::vector<std::vector<float>> block_similar_;
    std::vector<std::vector<std::vector<float>>> block_similar_kv_head_;
    std::vector<std::vector<std::vector<float>>> block_similar_q_head_;

    std::vector<int> cache_seqlens_;               // [batch_size]
    std::vector<int> selected_blocks_num_history_; // [layer_num // layer_step]

    std::vector<std::vector<std::vector<int>>> selected_blocks_history_;
    // [layer_num // layer_step, batch_size, max_block_num]

    std::vector<std::vector<std::vector<std::vector<int>>>>
        selected_blocks_history_kvhead_; // [layer_num // layer_step,
                                         // batch_size, max_block_num,
                                         // kv_head_num]

    std::vector<std::vector<int>>
        block_table_before_retrieval_; // [batch_size, max_block_num]
    std::vector<std::vector<int>>
        block_table_after_retrieval_; // [batch_size, pick_block_num]

    std::vector<std::vector<std::vector<int>>>
        block_table_before_retrieval_qhead_; // [batch_size, max_block_num,
                                             // q_head_num]
    std::vector<std::vector<std::vector<int>>>
        block_table_after_retrieval_qhead_; // [batch_size, pick_block_num,
                                            // q_head_num]

    std::vector<std::vector<std::vector<int>>>
        block_table_before_retrieval_kvhead_; // [batch_size, max_block_num,
                                              // kv_head_num]
    std::vector<std::vector<std::vector<int>>>
        block_table_after_retrieval_kvhead_; // [batch_size, pick_block_num,
                                             // kv_head_num]

    std::vector<std::vector<std::unique_ptr<std::mutex>>>
        mutex_; // [batch_size, kv_head_num]
    std::vector<std::vector<std::vector<block_q8_0>>>
        q_q8_0_; // [batch_size, kv_head_num, n_gqa * head_dim / QK8_0]
    std::vector<std::vector<std::vector<float>>>
        q_fp32_; // [batch_size, kv_head_num, n_gqa * head_dim]

    std::vector<std::vector<std::vector<float>>>
        output_fp32_; // [batch_size, kv_head_num, n_gqa * head_dim]
    std::vector<std::vector<std::vector<float>>>
        attn_lse_; // [batch_size, kv_head_num, n_gqa]

    std::vector<std::pair<int, int>> thread_cur_head_idx_; // [thread_num]

    std::vector<std::vector<block_q8_0>>
        thread_local_output_q8_0_; // [thread_num, n_gqa * head_dim / QK8_0]
    std::vector<std::vector<float>>
        thread_local_attn_score_; // [thread_num, n_gqa * block_len]
    std::vector<std::vector<float>>
        thread_local_output_fp32_; // [thread_num, n_gqa * head_dim]
    std::vector<std::vector<float>>
        thread_local_attn_lse_; // [thread_num, n_gqa]
    std::vector<std::vector<float>>
        thread_local_cur_output_fp32_; // [thread_num, n_gqa * head_dim]
    std::vector<std::vector<float>>
        thread_local_cur_attn_lse_; // [thread_num, n_gqa]
    std::vector<std::vector<uint8_t>>
        thread_local_attn_mask_; // [thread_num, block_len // 8]
    std::vector<std::vector<char>>
        thread_local_draft_; // [thread_num, 2 * n_gqa * block_len + 6 * n_gqa *
                             // head_dim + 2 * block_len * head_dim]

    // tmp space
    std::vector<float> q_fp32; // [n_gqa * head_dim]

    void quantize_q_(const uint16_t *q_in_data, int batch_size);
    void attn_initialize_layer_(int batch_size, int layer_idx, int *block_table,
                                int &max_block_num, int *cache_seqlens);
    void attn_initialize_kvhead_(int batch_size, int layer_idx,
                                 int *block_table, int &max_block_num,
                                 int *cache_seqlens);
    void retrieval_kvcache_layer_(const uint16_t *q_in_data, int init_block_num,
                                  int local_block_num, int pick_block_num,
                                  int q_len, int generate_token_idx,
                                  int batch_size, int layer_idx,
                                  int *cache_seqlens, int &max_block_num,
                                  Backend *backend);
    void retrieval_kvcache_kvhead_(const uint16_t *q_in_data,
                                   int init_block_num, int local_block_num,
                                   int pick_block_num, int q_len,
                                   int generate_token_idx, int batch_size,
                                   int layer_idx, int *cache_seqlens,
                                   int &max_block_num, Backend *backend);

    void calculate_block_similarity_layer_(
        const uint16_t *q_in_data, int batch_size, int layer_idx, int q_len,
        int max_block_num, int *cache_seqlens, int init_block_num,
        int local_block_num, int pick_block_num, Backend *backend);
    void calculate_block_similarity_kvhead_(
        const uint16_t *q_in_data, int batch_size, int layer_idx, int q_len,
        int max_block_num, int *cache_seqlens, int init_block_num,
        int local_block_num, int pick_block_num, Backend *backend);

    void select_block_layer_(int batch_size, int layer_idx, int max_block_num,
                             int init_block_num, int local_block_num,
                             int pick_block_num);
    void select_block_kvhead_(int batch_size, int layer_idx, int max_block_num,
                              int init_block_num, int local_block_num,
                              int pick_block_num);

    void calculate_sparsity_layer_(const uint16_t *q_in_data,
                                   float *attn_sparsity, int batch_size,
                                   int max_block_num, int *block_table,
                                   int *cache_seqlens, Backend *backend);
    void calculate_sparsity_kvhead_(const uint16_t *q_in_data,
                                    float *attn_sparsity, int batch_size,
                                    int max_block_num, int *block_table,
                                    int *cache_seqlens, Backend *backend);

    void attention_kvhead_(const uint16_t *q_in_data, ggml_fp16_t *output,
                           float *attn_lse, int batch_size, Backend *backend);
    void attention_layer_(const uint16_t *q_in_data, ggml_fp16_t *output,
                          float *attn_lse, int batch_size, Backend *backend);

    /**
     * @brief Computes attention with KV cache for one block.
     *
     * This function performs attention computation for one block using KV
     * cache. The function supports different data types for Q, K, and V caches,
     * and provides options for quantization. The function does not perform any
     * dynamic memory allocation internally, so all necessary buffers must be
     * pre-allocated externally.
     *
     * @param head_dim The dimension of the head.
     * @param bsz The batch size.
     * @param q_type The data type of Q (GGML data type). Only supports fp16 and
     * q8_0.
     * @param q Pointer to the Q tensor [bsz, head_dim]. The quantization is
     *          always applied along the head_dim dimension. The size must be
     *          bsz * head_dim/32 * qtype_size. If head_dim % 32 != 0, an error
     *          will be raised.
     * @param past_kv_len The length of the past KV cache.
     * @param past_kv_offset The offset in the past KV cache.
     * @param is_full_attn Boolean flag indicating whether to use full attention
     *                     (true for full 1 mask).
     * @param attn_mask Pointer to the attention mask [bsz, past_kv_len]. If
     *                  is_full_attn = false, a bit matrix is passed to
     * represent the mask.
     * @param k_type The data type of K cache (GGML data type). Only supports
     *               fp16, q4_0, and q8_0.
     * @param k_quant_type Quantization type for K cache. 0 for per_token, 1 for
     *                     per_channel. Other values will raise an error.
     * @param k_cache Pointer to the K cache tensor [seq_len, head_dim]. If
     *                quant_type == 0, head_dim % 32 must be 0. If quant_type ==
     * 1, seq_len % 32 must be 0.
     * @param num_k_anchor The number of K anchors. If num_k_anchor == 0, it
     * means no anchor is present.
     * @param k_cache_anchors Pointer to the K cache anchors [num_k_anchor,
     * head_dim]. The k_anchor_type must be fp16.
     * @param k_cache_anchor_pos Pointer to the K cache anchor positions. Each
     * token is associated with the nearest previous anchor position.
     * @param v_type The data type of V cache (GGML data type).
     * @param v_quant_type Quantization type for V cache.
     * @param v_cache Pointer to the V cache tensor [head_dim, seq_len].
     * @param num_v_anchor The number of V anchors.
     * @param v_cache_anchors Pointer to the V cache anchors.
     * @param v_cache_anchor_pos Pointer to the V cache anchor positions.
     * @param attn_score Pre-allocated buffer for attention scores [bsz,
     * past_kv_len].
     * @param output Output tensor [bsz, head_dim] with the same type as q_type.
     * @param lse Pre-allocated buffer [bsz] for the log-sum-exp of the
     * attention scores.
     * @param draft Pre-allocated temporary buffer. The buffer size should be
     * enough to hold (2 * bsz * past_kv_len + 6 * bsz * head_dim + 2 *
     *              past_kv_len * head_dim + past_kv_len * head_dim / 32) bytes.
     * @param rotary_angle Pointer to the rotary angle tensor.
     * @param rotary_cos Pointer to the cosine values for rotary embedding.
     * @param rotary_sin Pointer to the sine values for rotary embedding.
     */
    void attn_with_kvcache_one_block_(
        int head_dim, int bsz,
        ggml_type q_type, // GGML data type of `Q`, only supports fp16 and q8_0
        // [bsz, head_dim]
        // Quantization is always on the head_dim dimension (per_token). If
        // head_dim % 32 != 0, an error will be raised. The size must be bsz *
        // head_dim/32 * qtype_size.
        const void *q,

        int past_kv_len, int past_kv_offset,
        bool is_full_attn, // true indicates a full 1 mask
        // If is_full_attn = false, a bit matrix representing the mask is
        // passed. [bsz, past_kv_len]
        const uint8_t *attn_mask,

        ggml_type k_type, // GGML data type of `K Cache`, only supports fp16,
                          // q4_0, q8_0
        int k_quant_type, // 0 for per_token, 1 for per_channel, others raise an
                          // error
        // [seq_len, head_dim]
        // If quant_type == 0, head_dim % 32 must be 0.
        // If quant_type == 1, seq_len % 32 must be 0.
        const void *k_cache,

        // k_anchor_type must be fp16
        int num_k_anchor, // num_k_anchor == 0 indicates no anchor
        // [num_k_anchor, head_dim]
        const void *k_cache_anchors,
        // Each token is associated with the nearest previous position's anchor,
        // with the same distance.
        const int *k_cache_anchor_pos,

        // v_cache similar to k_cache
        ggml_type v_type, int v_quant_type,
        // [head_dim, seq_len]
        const void *v_cache, int num_v_anchor, const void *v_cache_anchors,
        const int *v_cache_anchor_pos,

        // Pre-allocated buffer for intermediate calculations [bsz,
        // past_kv_len]. No malloc is performed inside this function.
        float *attn_score,

        // Output: [bsz, head_dim], with the same type as q_type
        void *output,
        // [bsz]
        float *lse,

        // Pre-allocated temporary buffer with sufficient size:
        // (2 * bsz * past_kv_len + 6 * bsz * head_dim + 2 * past_kv_len *
        // head_dim + past_kv_len * head_dim / 32) bytes.
        void *draft,

        // Apply rotary embedding online
        const int *rotary_angle, const void *rotary_cos, const void *rotary_sin
        // rotary_cos=None,
        // rotary_sin=None,
        // cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
        // cache_batch_idx: Optional[torch.Tensor] = None,
        // rotary_interleaved=True,

        // // Not supported for now
        // window_size=(-1, -1),  # -1 means infinite context window
        // alibi_slopes=None,
    );
};

/**
 * @brief Scales a float32 vector by a given scalar value.
 *
 * This function multiplies each element of the input vector `y` by a scalar
 * `v`. It uses platform-specific optimizations if available, such as Apple's
 * Accelerate framework or SIMD instructions. If no specific optimization is
 * available, the function falls back to a simple scalar multiplication loop.
 *
 * @param n The number of elements in the vector `y`.
 * @param y The input vector to be scaled. The result will be stored in the same
 * vector.
 * @param v The scalar value by which to scale the vector.
 */
void ggml_vec_scale_f32(const int n, float *y, const float v);
#endif

================================================
FILE: archive/csrc/ktransformers_ext/operators/kvcache/kvcache_attn.cpp
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "kvcache.h"

#include <chrono>

void KVCache::attention_kvhead_(const uint16_t *q_in_data, ggml_fp16_t *output,
                                float *attn_lse, int batch_size,
                                Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    seq_len_ = config_.block_len;

    backend->do_work_stealing_job(
        batch_size * config_.kv_head_num * max_block_num_after_retrieval_,
        [&](int thread_id) {
            thread_cur_head_idx_[thread_id].first = -1;
            thread_cur_head_idx_[thread_id].second = -1;
        },
        [&](int task_id) {
            int batch_id = task_id / (config_.kv_head_num *
                                      max_block_num_after_retrieval_);
            int head_id = (task_id % (config_.kv_head_num *
                                      max_block_num_after_retrieval_)) /
                          max_block_num_after_retrieval_;
            int block_id = task_id % max_block_num_after_retrieval_;
            int thread_id = Backend::thread_local_id;

            // If the block is out of the sequence length, skip it.
            if (cache_seqlens_[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx =
                block_table_after_retrieval_kvhead_[batch_id][block_id]
                                                   [head_id];
            if (cache_seqlens_[batch_id] / config_.block_len == block_id) {
                int seq_len = cache_seqlens_[batch_id] % config_.block_len;
                if (seq_len == 0)
                    return;

                // Prepare the attention mask for the last block.
                int full_blocks = seq_len / 8;
                int remaining_bits = seq_len % 8;
                // Fill full blocks with 1s
                for (int i = 0; i < full_blocks; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0xFF;
                }
                // Fill the remaining bits in the next block
                if (remaining_bits > 0 && full_blocks < seq_len_ / 8) {
                    thread_local_attn_mask_[thread_id][full_blocks] =
                        (1 << remaining_bits) - 1;
                } else {
                    thread_local_attn_mask_[thread_id][full_blocks] = 0;
                }

                for (int i = full_blocks + 1; i < seq_len_ / 8; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0;
                }
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            } else {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, true, nullptr, GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());

                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            }
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (batch_id == cur_batch_idx && head_id == cur_head_id) {
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse =
                        thread_local_cur_attn_lse_[thread_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_attn_lse_[thread_id][i] -
                                     thread_local_cur_attn_lse_[thread_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_[thread_id]
                                                     [i * config_.head_dim +
                                                      j] +=
                            thread_local_output_fp32_[thread_id]
                                                     [i * config_.head_dim + j];
                    }
                    thread_local_cur_attn_lse_[thread_id][i] = new_attn_lse;
                }
            } else {
                if (cur_batch_idx != -1) {
                    mutex_[cur_batch_idx][cur_head_id]->lock();
                    for (int i = 0; i < n_gqa_; i++) {
                        if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                            1e-6) {
                            attn_lse_[cur_batch_idx][cur_head_id][i] =
                                thread_local_cur_attn_lse_[thread_id][i];
                            for (int j = 0; j < config_.head_dim; j++) {
                                output_fp32_[cur_batch_idx][cur_head_id]
                                            [i * config_.head_dim + j] =
                                                thread_local_cur_output_fp32_
                                                    [thread_id]
                                                    [i * config_.head_dim + j];
                            }
                            continue;
                        }
                        float new_attn_lse =
                            attn_lse_[cur_batch_idx][cur_head_id][i] +
                            std::log(
                                1.0 +
                                std::exp(
                                    thread_local_cur_attn_lse_[thread_id][i] -
                                    attn_lse_[cur_batch_idx][cur_head_id][i]));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            output_fp32_[cur_batch_idx][cur_head_id].data() +
                                i * config_.head_dim,
                            std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                     new_attn_lse));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            thread_local_cur_output_fp32_[thread_id].data() +
                                i * config_.head_dim,
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     new_attn_lse));
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] +=
                                thread_local_cur_output_fp32_
                                    [thread_id][i * config_.head_dim + j];
                        }
                        attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                    }
                    mutex_[cur_batch_idx][cur_head_id]->unlock();
                }
                thread_cur_head_idx_[thread_id].first = batch_id;
                thread_cur_head_idx_[thread_id].second = head_id;
                for (int i = 0; i < n_gqa_; i++) {
                    thread_local_cur_attn_lse_[thread_id][i] =
                        thread_local_attn_lse_[thread_id][i];
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_
                            [thread_id][i * config_.head_dim + j] =
                                thread_local_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                }
            }
        },
        // Merge the results of the remaining blocks.
        [&](int thread_id) {
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (cur_head_id != -1) {
                mutex_[cur_batch_idx][cur_head_id]->lock();
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse;
                    if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                        1e-6) {
                        attn_lse_[cur_batch_idx][cur_head_id][i] =
                            thread_local_cur_attn_lse_[thread_id][i];
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] =
                                            thread_local_cur_output_fp32_
                                                [thread_id]
                                                [i * config_.head_dim + j];
                        }
                        continue;
                    }
                    new_attn_lse =
                        attn_lse_[cur_batch_idx][cur_head_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     attn_lse_[cur_batch_idx][cur_head_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        output_fp32_[cur_batch_idx][cur_head_id].data() +
                            i * config_.head_dim,
                        std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        output_fp32_[cur_batch_idx][cur_head_id]
                                    [i * config_.head_dim + j] +=
                            thread_local_cur_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                    attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                }
                mutex_[cur_batch_idx][cur_head_id]->unlock();
            }
        });
    // move the results to output and attn_lse
    uint16_t *output_data = reinterpret_cast<uint16_t *>(output);
    float *attn_lse_data = attn_lse;
    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        for (int i = 0; i < config_.kv_head_num; i++) {
            for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
                output_data[batch_idx * config_.kv_head_num * n_gqa_ *
                                config_.head_dim +
                            i * n_gqa_ * config_.head_dim + j] =
                    GGML_FP32_TO_FP16(output_fp32_[batch_idx][i][j]);
            }
            for (int j = 0; j < n_gqa_; j++) {
                attn_lse_data[batch_idx * config_.kv_head_num * n_gqa_ +
                              i * n_gqa_ + j] = attn_lse_[batch_idx][i][j];
            }
        }
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of computing attention: %f s\n", layer_idx,
    //        diff.count());
}

void KVCache::attention_layer_(const uint16_t *q_in_data, ggml_fp16_t *output,
                               float *attn_lse, int batch_size,
                               Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    seq_len_ = config_.block_len;
    backend->do_work_stealing_job(
        batch_size * config_.kv_head_num * max_block_num_after_retrieval_,
        [&](int thread_id) {
            thread_cur_head_idx_[thread_id].first = -1;
            thread_cur_head_idx_[thread_id].second = -1;
        },
        [&](int task_id) {
            int batch_id = task_id / (config_.kv_head_num *
                                      max_block_num_after_retrieval_);
            int head_id = (task_id % (config_.kv_head_num *
                                      max_block_num_after_retrieval_)) /
                          max_block_num_after_retrieval_;
            int block_id = task_id % max_block_num_after_retrieval_;
            int thread_id = Backend::thread_local_id;
            // If the block is out of the sequence length, skip it.
            if (cache_seqlens_[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx = block_table_after_retrieval_[batch_id][block_id];
            if (cache_seqlens_[batch_id] / config_.block_len == block_id) {
                int seq_len = cache_seqlens_[batch_id] % config_.block_len;
                if (seq_len == 0)
                    return;

                // Prepare the attention mask for the last block.
                int full_blocks = seq_len / 8;
                int remaining_bits = seq_len % 8;

                // Fill full blocks with 1s
                for (int i = 0; i < full_blocks; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0xFF;
                }
                // Fill the remaining bits in the next block
                if (remaining_bits > 0 && full_blocks < seq_len_ / 8) {
                    thread_local_attn_mask_[thread_id][full_blocks] =
                        (1 << remaining_bits) - 1;
                } else {
                    thread_local_attn_mask_[thread_id][full_blocks] = 0;
                }

                for (int i = full_blocks + 1; i < seq_len_ / 8; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0;
                }
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            } else {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, true, nullptr, GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());

                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            }
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (batch_id == cur_batch_idx && head_id == cur_head_id) {
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse =
                        thread_local_cur_attn_lse_[thread_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_attn_lse_[thread_id][i] -
                                     thread_local_cur_attn_lse_[thread_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_[thread_id]
                                                     [i * config_.head_dim +
                                                      j] +=
                            thread_local_output_fp32_[thread_id]
                                                     [i * config_.head_dim + j];
                    }
                    thread_local_cur_attn_lse_[thread_id][i] = new_attn_lse;
                }
            } else {
                if (cur_batch_idx != -1) {
                    mutex_[cur_batch_idx][cur_head_id]->lock();
                    for (int i = 0; i < n_gqa_; i++) {
                        if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                            1e-6) {
                            attn_lse_[cur_batch_idx][cur_head_id][i] =
                                thread_local_cur_attn_lse_[thread_id][i];
                            for (int j = 0; j < config_.head_dim; j++) {
                                output_fp32_[cur_batch_idx][cur_head_id]
                                            [i * config_.head_dim + j] =
                                                thread_local_cur_output_fp32_
                                                    [thread_id]
                                                    [i * config_.head_dim + j];
                            }
                            continue;
                        }
                        float new_attn_lse =
                            attn_lse_[cur_batch_idx][cur_head_id][i] +
                            std::log(
                                1.0 +
                                std::exp(
                                    thread_local_cur_attn_lse_[thread_id][i] -
                                    attn_lse_[cur_batch_idx][cur_head_id][i]));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            output_fp32_[cur_batch_idx][cur_head_id].data() +
                                i * config_.head_dim,
                            std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                     new_attn_lse));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            thread_local_cur_output_fp32_[thread_id].data() +
                                i * config_.head_dim,
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     new_attn_lse));
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] +=
                                thread_local_cur_output_fp32_
                                    [thread_id][i * config_.head_dim + j];
                        }
                        attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                    }
                    mutex_[cur_batch_idx][cur_head_id]->unlock();
                }
                thread_cur_head_idx_[thread_id].first = batch_id;
                thread_cur_head_idx_[thread_id].second = head_id;
                for (int i = 0; i < n_gqa_; i++) {
                    thread_local_cur_attn_lse_[thread_id][i] =
                        thread_local_attn_lse_[thread_id][i];
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_
                            [thread_id][i * config_.head_dim + j] =
                                thread_local_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                }
            }
        },
        // Merge the results of the remaining blocks.
        [&](int thread_id) {
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (cur_head_id != -1) {
                mutex_[cur_batch_idx][cur_head_id]->lock();
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse;
                    if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                        1e-6) {
                        attn_lse_[cur_batch_idx][cur_head_id][i] =
                            thread_local_cur_attn_lse_[thread_id][i];
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] =
                                            thread_local_cur_output_fp32_
                                                [thread_id]
                                                [i * config_.head_dim + j];
                        }
                        continue;
                    }
                    new_attn_lse =
                        attn_lse_[cur_batch_idx][cur_head_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     attn_lse_[cur_batch_idx][cur_head_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        output_fp32_[cur_batch_idx][cur_head_id].data() +
                            i * config_.head_dim,
                        std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        output_fp32_[cur_batch_idx][cur_head_id]
                                    [i * config_.head_dim + j] +=
                            thread_local_cur_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                    attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                }
                mutex_[cur_batch_idx][cur_head_id]->unlock();
            }
        });

    // move the results to output and attn_lse
    uint16_t *output_data = reinterpret_cast<uint16_t *>(output);
    float *attn_lse_data = attn_lse;
    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        for (int i = 0; i < config_.kv_head_num; i++) {
            for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
                output_data[batch_idx * config_.kv_head_num * n_gqa_ *
                                config_.head_dim +
                            i * n_gqa_ * config_.head_dim + j] =
                    GGML_FP32_TO_FP16(output_fp32_[batch_idx][i][j]);
            }
            for (int j = 0; j < n_gqa_; j++) {
                attn_lse_data[batch_idx * config_.kv_head_num * n_gqa_ +
                              i * n_gqa_ + j] = attn_lse_[batch_idx][i][j];
            }
        }
    }
    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    //     printf("layer %d time of computing attention: %f s\n", layer_id_,
    //     diff.count());
}

void KVCache::attn(const ggml_fp16_t *q_in, ggml_fp16_t *output,
                   float *attn_lse, int layer_idx, int generate_token_idx,
                   int q_len, int batch_size, int max_block_num,
                   int *block_table, int *cache_seqlens, int pick_block_num,
                   int init_block_num, int local_block_num, Backend *backend) {

    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    layer_id_ = layer_idx;
    batch_size = batch_size * q_len;

    const uint16_t *q_in_data = const_cast<const uint16_t *>(q_in);

    quantize_q_(q_in_data, batch_size);
    if (config_.retrieval_type == RetrievalType::LAYER) {
        attn_initialize_layer_(batch_size, layer_idx, block_table,
                               max_block_num, cache_seqlens);
        retrieval_kvcache_layer_(q_in_data, init_block_num, local_block_num,
                                 pick_block_num, q_len, generate_token_idx,
                                 batch_size, layer_idx, cache_seqlens,
                                 max_block_num, backend);
        attention_layer_(q_in_data, output, attn_lse, batch_size, backend);
    } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
        attn_initialize_kvhead_(batch_size, layer_idx, block_table,
                                max_block_num, cache_seqlens);
        retrieval_kvcache_kvhead_(q_in_data, init_block_num, local_block_num,
                                  pick_block_num, q_len, generate_token_idx,
                                  batch_size, layer_idx, cache_seqlens,
                                  max_block_num, backend);
        attention_kvhead_(q_in_data, output, attn_lse, batch_size, backend);
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of computing attention: %f s\n", layer_idx,
    //        diff.count());
}

void KVCache::attn_with_kvcache(
    const ggml_fp16_t *q_in, const ggml_fp16_t *k_in, const ggml_fp16_t *v_in,
    ggml_fp16_t *output, float *attn_lse, int layer_idx, int generate_token_idx,
    int q_len, int batch_size, int max_block_num, int *block_table,
    int *cache_seqlens, int topk, int local, Backend *backend) {
    //    printf("attn_with_kvcache start\n");
    assert(q_len == 1);
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_idx;

    update_kvcache_fp16(k_in, v_in, layer_idx, block_table, batch_size,
                        max_block_num, cache_seqlens, q_len, backend);
    //    printf("update finished.\n");

    // cache_seqlens memory is modified.
    for (int i = 0; i < batch_size; i++) {
        cache_seqlens[i] += q_len;
    }
    int init_block_num = 1;
    if (config_.block_len <= 32) {
        init_block_num = 64 / config_.block_len;
    }

    attn(q_in, output, attn_lse, layer_idx, generate_token_idx, q_len,
         batch_size, max_block_num, block_table, cache_seqlens, topk,
         init_block_num, local, backend);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    //     printf("layer %d time of computing attention with kvcache: %f s\n",
    //     layer_idx, diff.count());
}

void KVCache::quantize_q_(const uint16_t *q_in_data, int batch_size) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            // quantize q
            for (int i = 0; i < config_.kv_head_num; i++) {
                for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
                    q_fp32_[batch_idx][i][j] = GGML_FP16_TO_FP32(
                        q_in_data[batch_idx * config_.kv_head_num * n_gqa_ *
                                      config_.head_dim +
                                  i * n_gqa_ * config_.head_dim + j]);
                }
            }
        } else {
            // quantize q
            for (int i = 0; i < config_.kv_head_num; i++) {
                for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
                    q_fp32[j] = GGML_FP16_TO_FP32(
                        q_in_data[batch_idx * config_.kv_head_num * n_gqa_ *
                                      config_.head_dim +
                                  i * n_gqa_ * config_.head_dim + j]);
                }
                quantize_row_q8_0(q_fp32.data(), q_q8_0_[batch_idx][i].data(),
                                  n_gqa_ * config_.head_dim);
            }
        }
    }
    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    // printf("time of quantizing q: %f s\n",
    //        std::chrono::duration<double>(end - start).count());
}
void KVCache::attn_initialize_layer_(int batch_size, int layer_idx,
                                     int *block_table, int &max_block_num,
                                     int *cache_seqlens) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        // initialize output_fp32_ and attn_lse_
        for (int i = 0; i < config_.kv_head_num; i++) {
            for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
                output_fp32_[batch_idx][i][j] = 0;
            }
            for (int j = 0; j < n_gqa_; j++) {
                attn_lse_[batch_idx][i][j] = 0;
            }
        }
        // clear top_similar_block_

        while (!top_similar_block_[batch_idx].empty())
            top_similar_block_[batch_idx].pop();
    }

    // get block_table_before_retrieval_ and cache_seqlens_
    if (block_table == nullptr) {
        max_block_num = past_block_num_[layer_idx];
        for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
            if (cache_total_len_ != 0)
                cache_seqlens_[batch_idx] = cache_total_len_;
            else
                cache_seqlens_[batch_idx] = max_block_num * config_.block_len;
            for (int i = 0; i < max_block_num; i++) {
                block_table_before_retrieval_[batch_idx][i] = i;
                block_similar_[batch_idx][i] = 0;
            }
        }
    } else {
        for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
            cache_seqlens_[batch_idx] = cache_seqlens[batch_idx];
            for (int i = 0; i < max_block_num; i++) {
                block_table_before_retrieval_[batch_idx][i] =
                    block_table[batch_idx * max_block_num + i];
                block_similar_[batch_idx][i] = 0;
            }
        }
    }
    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    // printf("layer %d time of initializing attention: %f s\n", layer_idx,
    //        std::chrono::duration<double>(end - start).count());
}

void KVCache::calculate_block_similarity_layer_(
    const uint16_t *q_in_data, int batch_size, int layer_idx, int q_len,
    int max_block_num, int *cache_seqlens, int init_block_num,
    int local_block_num, int pick_block_num, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    if (batch_size == 1 &&
        config_.anchor_num == 1) { // TODO: improve batch_size > 1
        for (int batch_id = 0; batch_id < batch_size; batch_id++) {
            if (q_len == 1) {
                for (int j = 0; j < config_.head_dim * config_.q_head_num;
                     j++) {
                    avg_q[batch_id][j] = GGML_FP16_TO_FP32(
                        q_in_data[batch_id * q_len * config_.q_head_num *
                                      config_.head_dim +
                                  j]);
                    avg_q_fp16[batch_id][j] =
                        q_in_data[batch_id * q_len * config_.q_head_num *
                                      config_.head_dim +
                                  j];
                }
            } else {
                for (int j = 0; j < config_.head_dim * config_.q_head_num;
                     j++) {
                    avg_q[batch_id][j] = 0;
                }
                for (int i = 0; i < q_len; i++) {
                    for (int j = 0; j < config_.head_dim; j++) {
                        avg_q[batch_id][j] += GGML_FP16_TO_FP32(
                            q_in_data[batch_id * q_len * config_.q_head_num *
                                          config_.head_dim +
                                      i * config_.q_head_num *
                                          config_.head_dim +
                                      j]);
                    }
                }
                for (int j = 0; j < config_.head_dim * config_.q_head_num;
                     j++) {
                    avg_q[batch_id][j] /= q_len;
                    avg_q_fp16[batch_id][j] =
                        GGML_FP32_TO_FP16(avg_q[batch_id][j]);
                }
            }
            int seq_len = cache_seqlens_[batch_id];
            int block_num = (seq_len / config_.block_len) - local_block_num -
                            init_block_num;
            if (block_num <= 0) {
                continue;
            }
            bool is_seq = true;
            for (int i = init_block_num + 1;
                 i < (seq_len / config_.block_len) - local_block_num; i++) {
                if (block_table_before_retrieval_[batch_id][i] !=
                    block_table_before_retrieval_[batch_id][i - 1] + 1) {
                    is_seq = false;
                    break;
                }
            }
            if (is_seq) {
                int nth = backend->get_thread_num();
                backend->do_work_stealing_job(
                    nth, nullptr,
                    [&](int task_id) {
                        int ith = task_id;
                        bool ok = llamafile_sgemm(
                            block_num, 1, config_.q_head_num * config_.head_dim,
                            anchor_.data() +
                                (layer_idx * config_.max_block_num +
                                 block_table_before_retrieval_
                                     [batch_id][init_block_num]) *
                                    config_.anchor_num * config_.q_head_num *
                                    config_.head_dim,
                            config_.q_head_num * config_.head_dim,
                            avg_q_fp16[batch_id].data(),
                            config_.q_head_num * config_.head_dim,
                            block_similar_[batch_id].data() + init_block_num,
                            block_num, ith, nth, GGML_TASK_TYPE_COMPUTE,
                            GGML_TYPE_F16, GGML_TYPE_F16, GGML_TYPE_F32,
                            GGML_PREC_DEFAULT);
                        if (!ok) {
                            printf("llamafile_sgemm failed\n");
                        }
                    },
                    nullptr);
            } else {
                backend->do_work_stealing_job(
                    block_num, nullptr,
                    [&](int task_id) {
                        int block_id = task_id + init_block_num;
                        int block_idx =
                            block_table_before_retrieval_[batch_id][block_id];
                        bool ok = llamafile_sgemm(
                            1, 1, config_.q_head_num * config_.head_dim,
                            anchor_.data() +
                                (layer_idx * config_.max_block_num +
                                 block_table_before_retrieval_[batch_id]
                                                              [block_idx]) *
                                    config_.anchor_num * config_.q_head_num *
                                    config_.head_dim,
                            config_.q_head_num * config_.head_dim,
                            avg_q_fp16[batch_id].data(),
                            config_.q_head_num * config_.head_dim,
                            block_similar_[batch_id].data() + block_id, 1, 0, 1,
                            GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F16,
                            GGML_TYPE_F16, GGML_TYPE_F32, GGML_PREC_DEFAULT);
                        if (!ok) {
                            printf("llamafile_sgemm failed\n");
                        }
                    },
                    nullptr);
            }
        }
    } else {
        backend->do_work_stealing_job(
            batch_size * max_block_num, nullptr,
            [&](int task_id) {
                int batch_id = task_id / max_block_num;
                int block_id = task_id % max_block_num;
                int seq_len = cache_seqlens_[batch_id];

                if (block_id < init_block_num ||
                    block_id >=
                        (seq_len / config_.block_len) - local_block_num) {
                    return;
                }

                int block_idx =
                    block_table_before_retrieval_[batch_id][block_id];
                float sim = 0;

                for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
                    for (int i = 0; i < config_.head_dim; i++) {
                        float q_i = 0,
                              qa_i = std::numeric_limits<float>::lowest();
                        for (int q_id = 0; q_id < q_len; q_id++) {
                            q_i += GGML_FP16_TO_FP32(
                                q_in_data[batch_id * q_len *
                                              config_.q_head_num *
                                              config_.head_dim +
                                          q_id * config_.q_head_num *
                                              config_.head_dim +
                                          head_id * config_.head_dim + i]);
                        }
                        q_i /= q_len;
                        for (int anchor_id = 0; anchor_id < config_.anchor_num;
                             anchor_id++) {
                            qa_i = std::max(
                                qa_i,
                                GGML_FP16_TO_FP32(
                                    anchor_[(long long)layer_idx *
                                                config_.max_block_num *
                                                config_.anchor_num *
                                                config_.q_head_num *
                                                config_.head_dim +
                                            block_idx * config_.anchor_num *
                                                config_.q_head_num *
                                                config_.head_dim +
                                            anchor_id * config_.q_head_num *
                                                config_.head_dim +
                                            head_id * config_.head_dim + i]) *
                                    q_i);
                        }
                        sim += qa_i;
                    }
                }
                block_similar_[batch_id][block_id] = sim;
            },
            nullptr);
    }
    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of calculating similarity: %f s\n", layer_idx,
    //        diff.count());
}

void KVCache::select_block_layer_(int batch_size, int layer_idx,
                                  int max_block_num, int init_block_num,
                                  int local_block_num, int pick_block_num) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {

        if (cache_seqlens_[batch_idx] / config_.block_len <=
            init_block_num + pick_block_num + local_block_num) {
            block_table_after_retrieval_[batch_idx].swap(
                block_table_before_retrieval_[batch_idx]);
            selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                         config_.layer_step] = 0;
            continue;
        }

        for (int block_id = init_block_num;
             block_id <
             (cache_seqlens_[batch_idx] / config_.block_len) - local_block_num;
             block_id++) {
            top_similar_block_[batch_idx].push(std::make_pair(
                block_similar_[batch_idx][block_id],
                block_table_before_retrieval_[batch_idx][block_id]));
            if (top_similar_block_[batch_idx].size() > pick_block_num) {
                top_similar_block_[batch_idx].pop();
            }
        }

        int i = 0;
        for (; i < init_block_num; i++) {
            block_table_after_retrieval_[batch_idx][i] =
                block_table_before_retrieval_[batch_idx][i];
        }
        while (!top_similar_block_[batch_idx].empty()) {
            block_table_after_retrieval_[batch_idx][i] =
                top_similar_block_[batch_idx].top().second;
            top_similar_block_[batch_idx].pop();
            i++;
        }
        for (; i < init_block_num + pick_block_num + local_block_num; i++) {
            block_table_after_retrieval_[batch_idx][i] =
                block_table_before_retrieval_[batch_idx]
                                             [(cache_seqlens_[batch_idx] /
                                               config_.block_len) -
                                              local_block_num + i -
                                              init_block_num - pick_block_num];
        }
        if (cache_seqlens_[batch_idx] % config_.block_len != 0) {
            block_table_after_retrieval_[batch_idx][i] =
                block_table_before_retrieval_[batch_idx][(
                    cache_seqlens_[batch_idx] / config_.block_len)];
            cache_seqlens_[batch_idx] =
                (cache_seqlens_[batch_idx] % config_.block_len) +
                i * config_.block_len;
            i++;
        } else {
            cache_seqlens_[batch_idx] =
                (cache_seqlens_[batch_idx] % config_.block_len) +
                i * config_.block_len;
        }
        for (int j = 0; j < i; j++) {
            selected_blocks_history_[(layer_idx - config_.layer_offset) /
                                     config_.layer_step][batch_idx][j] =
                block_table_after_retrieval_[batch_idx][j];
        }
        selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                     config_.layer_step] = i;
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of selecting blocks: %f s\n", layer_idx,
    //        diff.count());
}

// retrieval kvcache, get the init_block_num block at beginning, top
// pick_block_num similar and last local_block_num blocks. Each task
// calculates the simlarity of a certain block with the query, then push
// the block into the priority queue. Finally, the required blocks are
// pushed into the block_table_after_retrieval_.
void KVCache::retrieval_kvcache_layer_(const uint16_t *q_in_data,
                                       int init_block_num, int local_block_num,
                                       int pick_block_num, int q_len,
                                       int generate_token_idx, int batch_size,
                                       int layer_idx, int *cache_seqlens,
                                       int &max_block_num, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    max_block_num_after_retrieval_ = 0;
    if (pick_block_num != -1 &&
        (generate_token_idx % config_.token_step != 0 ||
         (layer_idx % config_.layer_step != config_.layer_offset))) {

        if (selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                         config_.layer_step] == 0) {
            max_block_num_after_retrieval_ = max_block_num;
            block_table_after_retrieval_.swap(block_table_before_retrieval_);
        } else {
            max_block_num_after_retrieval_ = selected_blocks_num_history_
                [(layer_idx - config_.layer_offset) / config_.layer_step];
            for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
                for (int i = 0; i < max_block_num_after_retrieval_; i++) {
                    block_table_after_retrieval_[batch_idx][i] =
                        selected_blocks_history_[(layer_idx -
                                                  config_.layer_offset) /
                                                 config_.layer_step][batch_idx]
                                                [i];
                }

                if (cache_seqlens[batch_idx] % config_.block_len == 1) {
                    selected_blocks_num_history_[(layer_idx -
                                                  config_.layer_offset) /
                                                 config_.layer_step] += 1;
                    int x =
                        selected_blocks_num_history_[(layer_idx -
                                                      config_.layer_offset) /
                                                     config_.layer_step];
                    int last_block_idx =
                        block_table_before_retrieval_[batch_idx]
                                                     [cache_seqlens[batch_idx] /
                                                      config_.block_len];
                    selected_blocks_history_[(layer_idx -
                                              config_.layer_offset) /
                                             config_.layer_step][batch_idx]
                                            [x - 1] = last_block_idx;
                    block_table_after_retrieval_[batch_idx][x - 1] =
                        last_block_idx;
                }
                cache_seqlens_[batch_idx] =
                    (cache_seqlens_[batch_idx] % config_.block_len) +
                    selected_blocks_num_history_[(layer_idx -
                                                  config_.layer_offset) /
                                                 config_.layer_step] *
                        config_.block_len -
                    config_.block_len;
            }
        }
    } else if (pick_block_num != -1) {
        max_block_num_after_retrieval_ =
            std::min(max_block_num,
                     init_block_num + pick_block_num + local_block_num + 1);
        calculate_block_similarity_layer_(q_in_data, batch_size, layer_idx,
                                          q_len, max_block_num, cache_seqlens,
                                          init_block_num, local_block_num,
                                          pick_block_num, backend);
        select_block_layer_(batch_size, layer_idx, max_block_num,
                            init_block_num, local_block_num, pick_block_num);
    } else {
        selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                     config_.layer_step] = 0;
        max_block_num_after_retrieval_ = max_block_num;
        block_table_after_retrieval_.swap(block_table_before_retrieval_);
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    //     printf("layer %d time of retrieval kvcache: %f s\n", layer_idx,
    //     std::chrono::duration<double>(end - start).count());
}
void KVCache::calculate_sparsity_layer_(const uint16_t *q_in_data,
                                        float *attn_sparsity, int batch_size,
                                        int max_block_num, int *block_table,
                                        int *cache_seqlens, Backend *backend

) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    seq_len_ = config_.block_len;
    backend->do_work_stealing_job(
        batch_size * config_.kv_head_num * max_block_num,
        [&](int thread_id) {
            thread_cur_head_idx_[thread_id].first = -1;
            thread_cur_head_idx_[thread_id].second = -1;
        },
        [&](int task_id) {
            int batch_id = task_id / (config_.kv_head_num * max_block_num);
            int head_id = (task_id % (config_.kv_head_num * max_block_num)) /
                          max_block_num;
            int block_id = task_id % max_block_num;
            int thread_id = Backend::thread_local_id;
            // If the block is out of the sequence length, skip it.
            if (cache_seqlens[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx = block_table[batch_id * max_block_num + block_id];
            if (cache_seqlens_[batch_id] / config_.block_len == block_id) {
                int seq_len = cache_seqlens_[batch_id] % config_.block_len;
                if (seq_len == 0)
                    return;

                // Prepare the attention mask for the last block.
                int full_blocks = seq_len / 8;
                int remaining_bits = seq_len % 8;
                // Fill full blocks with 1s
                for (int i = 0; i < full_blocks; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0xFF;
                }
                // Fill the remaining bits in the next block
                if (remaining_bits > 0 && full_blocks < seq_len_ / 8) {
                    thread_local_attn_mask_[thread_id][full_blocks] =
                        (1 << remaining_bits) - 1;
                } else {
                    thread_local_attn_mask_[thread_id][full_blocks] = 0;
                }

                for (int i = full_blocks + 1; i < seq_len_ / 8; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0;
                }
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            } else {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, true, nullptr, GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());

                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            }
            for (int i = 0; i < n_gqa_; i++) {
                block_lse_[batch_id][block_idx][head_id * n_gqa_ + i] =
                    thread_local_attn_lse_[thread_id][i];
            }
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (batch_id == cur_batch_idx && head_id == cur_head_id) {
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse =
                        thread_local_cur_attn_lse_[thread_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_attn_lse_[thread_id][i] -
                                     thread_local_cur_attn_lse_[thread_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_[thread_id]
                                                     [i * config_.head_dim +
                                                      j] +=
                            thread_local_output_fp32_[thread_id]
                                                     [i * config_.head_dim + j];
                    }
                    thread_local_cur_attn_lse_[thread_id][i] = new_attn_lse;
                }
            } else {
                if (cur_batch_idx != -1) {
                    mutex_[cur_batch_idx][cur_head_id]->lock();
                    for (int i = 0; i < n_gqa_; i++) {
                        if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                            1e-6) {
                            attn_lse_[cur_batch_idx][cur_head_id][i] =
                                thread_local_cur_attn_lse_[thread_id][i];
                            for (int j = 0; j < config_.head_dim; j++) {
                                output_fp32_[cur_batch_idx][cur_head_id]
                                            [i * config_.head_dim + j] =
                                                thread_local_cur_output_fp32_
                                                    [thread_id]
                                                    [i * config_.head_dim + j];
                            }
                            continue;
                        }
                        float new_attn_lse =
                            attn_lse_[cur_batch_idx][cur_head_id][i] +
                            std::log(
                                1.0 +
                                std::exp(
                                    thread_local_cur_attn_lse_[thread_id][i] -
                                    attn_lse_[cur_batch_idx][cur_head_id][i]));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            output_fp32_[cur_batch_idx][cur_head_id].data() +
                                i * config_.head_dim,
                            std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                     new_attn_lse));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            thread_local_cur_output_fp32_[thread_id].data() +
                                i * config_.head_dim,
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     new_attn_lse));
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] +=
                                thread_local_cur_output_fp32_
                                    [thread_id][i * config_.head_dim + j];
                        }
                        attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                    }
                    mutex_[cur_batch_idx][cur_head_id]->unlock();
                }
                thread_cur_head_idx_[thread_id].first = batch_id;
                thread_cur_head_idx_[thread_id].second = head_id;
                for (int i = 0; i < n_gqa_; i++) {
                    thread_local_cur_attn_lse_[thread_id][i] =
                        thread_local_attn_lse_[thread_id][i];
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_
                            [thread_id][i * config_.head_dim + j] =
                                thread_local_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                }
            }
        },
        // Merge the results of the remaining blocks.
        [&](int thread_id) {
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (cur_head_id != -1) {
                mutex_[cur_batch_idx][cur_head_id]->lock();
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse;
                    if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                        1e-6) {
                        attn_lse_[cur_batch_idx][cur_head_id][i] =
                            thread_local_cur_attn_lse_[thread_id][i];
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] =
                                            thread_local_cur_output_fp32_
                                                [thread_id]
                                                [i * config_.head_dim + j];
                        }
                        continue;
                    }
                    new_attn_lse =
                        attn_lse_[cur_batch_idx][cur_head_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     attn_lse_[cur_batch_idx][cur_head_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        output_fp32_[cur_batch_idx][cur_head_id].data() +
                            i * config_.head_dim,
                        std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        output_fp32_[cur_batch_idx][cur_head_id]
                                    [i * config_.head_dim + j] +=
                            thread_local_cur_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                    attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                }
                mutex_[cur_batch_idx][cur_head_id]->unlock();
            }
        });

    for (int i = 0; i < batch_size; i++) {
        for (int j = 0; j < max_block_num_after_retrieval_; j++) {
            int block_idx = block_table_after_retrieval_[i][j];
            for (int k = 0; k < config_.q_head_num; k++) {
                attn_sparsity[i * config_.q_head_num + k] +=
                    std::exp(block_lse_[i][block_idx][k] -
                             attn_lse_[i][k / n_gqa_][k % n_gqa_]);
            }
        }
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of calculating sparsity: %f s\n", layer_id_,
    //        diff.count());
}

void KVCache::attn_initialize_kvhead_(int batch_size, int layer_idx,
                                      int *block_table, int &max_block_num,
                                      int *cache_seqlens) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        // initialize output_fp32_ and attn_lse_
        for (int i = 0; i < config_.kv_head_num; i++) {
            for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
                output_fp32_[batch_idx][i][j] = 0;
            }
            for (int j = 0; j < n_gqa_; j++) {
                attn_lse_[batch_idx][i][j] = 0;
            }
        }

        // clear top_similar_block_
        while (!top_similar_block_[batch_idx].empty())
            top_similar_block_[batch_idx].pop();
    }

    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        cache_seqlens_[batch_idx] = cache_seqlens[batch_idx];
        for (int i = 0; i < max_block_num; i++) {
            for (int j = 0; j < config_.kv_head_num; j++) {
                block_table_before_retrieval_kvhead_[batch_idx][i][j] =
                    block_table[batch_idx * max_block_num + i];
                block_similar_kv_head_[batch_idx][i][j] = 0;
            }
        }
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    // printf("layer %d time of initializing attn: %f s\n", layer_idx,
    //        std::chrono::duration<double>(end - start).count());
}
void KVCache::retrieval_kvcache_kvhead_(const uint16_t *q_in_data,
                                        int init_block_num, int local_block_num,
                                        int pick_block_num, int q_len,
                                        int generate_token_idx, int batch_size,
                                        int layer_idx, int *cache_seqlens,
                                        int &max_block_num, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    max_block_num_after_retrieval_ = 0;
    if (pick_block_num != -1 &&
        (generate_token_idx % config_.token_step != 0 ||
         (layer_idx % config_.layer_step != config_.layer_offset))) {

        if (selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                         config_.layer_step] == 0) {
            max_block_num_after_retrieval_ = max_block_num;
            for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
                for (int i = 0; i < max_block_num; i++) {
                    for (int j = 0; j < config_.kv_head_num; j++) {
                        block_table_after_retrieval_kvhead_[batch_idx][i][j] =
                            block_table_before_retrieval_kvhead_[batch_idx][i]
                                                                [j];
                    }
                }
            }
        } else {

            max_block_num_after_retrieval_ = selected_blocks_num_history_
                [(layer_idx - config_.layer_offset) / config_.layer_step];

            for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
                for (int i = 0; i < max_block_num_after_retrieval_; i++) {
                    for (int j = 0; j < config_.kv_head_num; j++) {
                        block_table_after_retrieval_kvhead_[batch_idx][i][j] =
                            selected_blocks_history_kvhead_
                                [(layer_idx - config_.layer_offset) /
                                 config_.layer_step][batch_idx][i][j];
                    }
                }

                if (cache_seqlens[batch_idx] % config_.block_len == 1) {
                    selected_blocks_num_history_[(layer_idx -
                                                  config_.layer_offset) /
                                                 config_.layer_step] += 1;
                    int x =
                        selected_blocks_num_history_[(layer_idx -
                                                      config_.layer_offset) /
                                                     config_.layer_step];
                    for (int i = 0; i < config_.kv_head_num; i++) {
                        int last_block_idx =
                            block_table_before_retrieval_kvhead_
                                [batch_idx][cache_seqlens[batch_idx] /
                                            config_.block_len][i];
                        selected_blocks_history_kvhead_[(layer_idx -
                                                         config_.layer_offset) /
                                                        config_.layer_step]
                                                       [batch_idx][x - 1][i] =
                                                           last_block_idx;
                        block_table_after_retrieval_kvhead_[batch_idx][x - 1]
                                                           [i] = last_block_idx;
                    }
                }
                cache_seqlens_[batch_idx] = std::min(
                    cache_seqlens_[batch_idx],
                    (cache_seqlens_[batch_idx] % config_.block_len) +
                        (init_block_num + pick_block_num + local_block_num) *
                            config_.block_len);
            }
        }
    } else if (pick_block_num != -1) {
        max_block_num_after_retrieval_ =
            std::min(max_block_num,
                     init_block_num + pick_block_num + local_block_num + 1);
        calculate_block_similarity_kvhead_(q_in_data, batch_size, layer_idx,
                                           q_len, max_block_num, cache_seqlens,
                                           init_block_num, local_block_num,
                                           pick_block_num, backend);
        select_block_kvhead_(batch_size, layer_idx, max_block_num,
                             init_block_num, local_block_num, pick_block_num);
    } else {
        selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                     config_.layer_step] = 0;
        max_block_num_after_retrieval_ = max_block_num;
        for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
            for (int i = 0; i < max_block_num; i++) {
                for (int j = 0; j < config_.kv_head_num; j++) {
                    block_table_after_retrieval_kvhead_[batch_idx][i][j] =
                        block_table_before_retrieval_kvhead_[batch_idx][i][j];
                }
            }
        }
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    // printf("layer %d time of retrieval kvcache: %f s\n", layer_idx,
    //        std::chrono::duration<double>(end - start).count());
}
void KVCache::calculate_sparsity_kvhead_(const uint16_t *q_in_data,
                                         float *attn_sparsity, int batch_size,
                                         int max_block_num, int *block_table,
                                         int *cache_seqlens, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    seq_len_ = config_.block_len;
    backend->do_work_stealing_job(
        batch_size * config_.kv_head_num * max_block_num,
        [&](int thread_id) {
            thread_cur_head_idx_[thread_id].first = -1;
            thread_cur_head_idx_[thread_id].second = -1;
        },
        [&](int task_id) {
            int batch_id = task_id / (config_.kv_head_num * max_block_num);
            int head_id = (task_id % (config_.kv_head_num * max_block_num)) /
                          max_block_num;
            int block_id = task_id % max_block_num;
            int thread_id = Backend::thread_local_id;
            // If the block is out of the sequence length, skip it.
            if (cache_seqlens[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx = block_table[batch_id * max_block_num + block_id];
            if (cache_seqlens_[batch_id] / config_.block_len == block_id) {
                int seq_len = cache_seqlens_[batch_id] % config_.block_len;
                if (seq_len == 0)
                    return;

                // Prepare the attention mask for the last block.
                int full_blocks = seq_len / 8;
                int remaining_bits = seq_len % 8;

                // Fill full blocks with 1s
                for (int i = 0; i < full_blocks; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0xFF;
                }
                // Fill the remaining bits in the next block
                if (remaining_bits > 0 && full_blocks < seq_len_ / 8) {
                    thread_local_attn_mask_[thread_id][full_blocks] =
                        (1 << remaining_bits) - 1;
                } else {
                    thread_local_attn_mask_[thread_id][full_blocks] = 0;
                }

                for (int i = full_blocks + 1; i < seq_len_ / 8; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0;
                }
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            } else {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, true, nullptr, GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());

                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            }
            for (int i = 0; i < n_gqa_; i++) {
                block_lse_[batch_id][block_idx][head_id * n_gqa_ + i] =
                    thread_local_attn_lse_[thread_id][i];
            }
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (batch_id == cur_batch_idx && head_id == cur_head_id) {
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse =
                        thread_local_cur_attn_lse_[thread_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_attn_lse_[thread_id][i] -
                                     thread_local_cur_attn_lse_[thread_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_[thread_id]
                                                     [i * config_.head_dim +
                                                      j] +=
                            thread_local_output_fp32_[thread_id]
                                                     [i * config_.head_dim + j];
                    }
                    thread_local_cur_attn_lse_[thread_id][i] = new_attn_lse;
                }
            } else {
                if (cur_batch_idx != -1) {
                    mutex_[cur_batch_idx][cur_head_id]->lock();
                    for (int i = 0; i < n_gqa_; i++) {
                        if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                            1e-6) {
                            attn_lse_[cur_batch_idx][cur_head_id][i] =
                                thread_local_cur_attn_lse_[thread_id][i];
                            for (int j = 0; j < config_.head_dim; j++) {
                                output_fp32_[cur_batch_idx][cur_head_id]
                                            [i * config_.head_dim + j] =
                                                thread_local_cur_output_fp32_
                                                    [thread_id]
                                                    [i * config_.head_dim + j];
                            }
                            continue;
                        }
                        float new_attn_lse =
                            attn_lse_[cur_batch_idx][cur_head_id][i] +
                            std::log(
                                1.0 +
                                std::exp(
                                    thread_local_cur_attn_lse_[thread_id][i] -
                                    attn_lse_[cur_batch_idx][cur_head_id][i]));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            output_fp32_[cur_batch_idx][cur_head_id].data() +
                                i * config_.head_dim,
                            std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                     new_attn_lse));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            thread_local_cur_output_fp32_[thread_id].data() +
                                i * config_.head_dim,
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     new_attn_lse));
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] +=
                                thread_local_cur_output_fp32_
                                    [thread_id][i * config_.head_dim + j];
                        }
                        attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                    }
                    mutex_[cur_batch_idx][cur_head_id]->unlock();
                }
                thread_cur_head_idx_[thread_id].first = batch_id;
                thread_cur_head_idx_[thread_id].second = head_id;
                for (int i = 0; i < n_gqa_; i++) {
                    thread_local_cur_attn_lse_[thread_id][i] =
                        thread_local_attn_lse_[thread_id][i];
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_
                            [thread_id][i * config_.head_dim + j] =
                                thread_local_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                }
            }
        },
        // Merge the results of the remaining blocks.
        [&](int thread_id) {
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (cur_head_id != -1) {
                mutex_[cur_batch_idx][cur_head_id]->lock();
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse;
                    if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                        1e-6) {
                        attn_lse_[cur_batch_idx][cur_head_id][i] =
                            thread_local_cur_attn_lse_[thread_id][i];
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] =
                                            thread_local_cur_output_fp32_
                                                [thread_id]
                                                [i * config_.head_dim + j];
                        }
                        continue;
                    }
                    new_attn_lse =
                        attn_lse_[cur_batch_idx][cur_head_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     attn_lse_[cur_batch_idx][cur_head_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        output_fp32_[cur_batch_idx][cur_head_id].data() +
                            i * config_.head_dim,
                        std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        output_fp32_[cur_batch_idx][cur_head_id]
                                    [i * config_.head_dim + j] +=
                            thread_local_cur_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                    attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                }
                mutex_[cur_batch_idx][cur_head_id]->unlock();
            }
        });

    for (int i = 0; i < batch_size; i++) {
        for (int j = 0; j < max_block_num_after_retrieval_; j++) {
            for (int k = 0; k < config_.q_head_num; k++) {
                int block_idx =
                    block_table_after_retrieval_kvhead_[i][j][k / n_gqa_];
                attn_sparsity[i * config_.q_head_num + k] +=
                    std::exp(block_lse_[i][block_idx][k] -
                             attn_lse_[i][k / n_gqa_][k % n_gqa_]);
            }
        }
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of calculating sparsity: %f s\n", layer_id_,
    //        diff.count());
}
void KVCache::calculate_block_similarity_kvhead_(
    const uint16_t *q_in_data, int batch_size, int layer_idx, int q_len,
    int max_block_num, int *cache_seqlens, int init_block_num,
    int local_block_num, int pick_block_num, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    backend->do_work_stealing_job(
        batch_size * max_block_num, nullptr,
        [&](int task_id) {
            int batch_id = task_id / max_block_num;
            int block_id = task_id % max_block_num;
            int seq_len = cache_seqlens_[batch_id];

            if (block_id < init_block_num ||
                block_id >= (seq_len / config_.block_len) - local_block_num) {
                return;
            }
            int block_idx =
                block_table_before_retrieval_kvhead_[batch_id][block_id][0];

            for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
                for (int i = 0; i < config_.head_dim; i++) {
                    float q_i = 0, qa_i = std::numeric_limits<float>::lowest();
                    for (int q_id = 0; q_id < q_len; q_id++) {
                        q_i += GGML_FP16_TO_FP32(
                            q_in_data[batch_id * q_len * config_.q_head_num *
                                          config_.head_dim +
                                      q_id * config_.q_head_num *
                                          config_.head_dim +
                                      head_id * config_.head_dim + i]);
                    }
                    q_i /= q_len;
                    for (int anchor_id = 0; anchor_id < config_.anchor_num;
                         anchor_id++) {
                        qa_i = std::max(
                            qa_i,
                            GGML_FP16_TO_FP32(
                                anchor_[layer_idx * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        anchor_id * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + i]) *
                                q_i);
                    }
                    block_similar_kv_head_[batch_id][block_id]
                                          [head_id / n_gqa_] += qa_i;
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of calculating similarity: %f s\n", layer_idx,
    //        diff.count());
}
void KVCache::select_block_kvhead_(int batch_size, int layer_idx,
                                   int max_block_num, int init_block_num,
                                   int local_block_num, int pick_block_num) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        int cache_len_after_retrieval = 0;
        if (cache_seqlens_[batch_idx] / config_.block_len <=
            init_block_num + pick_block_num + local_block_num) {
            selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                         config_.layer_step] = 0;
            for (int i = 0; i < max_block_num; i++) {
                for (int j = 0; j < config_.kv_head_num; j++) {
                    block_table_after_retrieval_kvhead_[batch_idx][i][j] =
                        block_table_before_retrieval_kvhead_[batch_idx][i][j];
                }
            }
            continue;
        }
        for (int head_id = 0; head_id < config_.kv_head_num; head_id++) {

            for (int block_id = init_block_num;
                 block_id < (cache_seqlens_[batch_idx] / config_.block_len) -
                                local_block_num;
                 block_id++) {

                top_similar_block_[batch_idx].push(std::make_pair(
                    block_similar_kv_head_[batch_idx][block_id][head_id],
                    block_table_before_retrieval_kvhead_[batch_idx][block_id]
                                                        [head_id]));
                if (top_similar_block_[batch_idx].size() > pick_block_num) {
                    top_similar_block_[batch_idx].pop();
                }
            }

            int i = 0;
            for (; i < init_block_num; i++) {
                block_table_after_retrieval_kvhead_[batch_idx][i][head_id] =
                    block_table_before_retrieval_kvhead_[batch_idx][i][head_id];
            }
            while (!top_similar_block_[batch_idx].empty()) {
                block_table_after_retrieval_kvhead_[batch_idx][i][head_id] =
                    top_similar_block_[batch_idx].top().second;
                top_similar_block_[batch_idx].pop();
                i++;
            }
            for (; i < init_block_num + pick_block_num + local_block_num; i++) {
                block_table_after_retrieval_kvhead_[batch_idx][i][head_id] =
                    block_table_before_retrieval_kvhead_
                        [batch_idx]
                        [(cache_seqlens_[batch_idx] / config_.block_len) -
                         local_block_num + i - init_block_num - pick_block_num]
                        [head_id];
            }
            if (cache_seqlens_[batch_idx] % config_.block_len != 0) {
                block_table_after_retrieval_kvhead_[batch_idx][i][head_id] =
                    block_table_before_retrieval_kvhead_[batch_idx][(
                        cache_seqlens_[batch_idx] / config_.block_len)]
                                                        [head_id];
                cache_len_after_retrieval =
                    (cache_seqlens_[batch_idx] % config_.block_len) +
                    i * config_.block_len;
                i++;
            } else {
                cache_len_after_retrieval =
                    (cache_seqlens_[batch_idx] % config_.block_len) +
                    i * config_.block_len;
            }
            for (int j = 0; j < i; j++) {
                selected_blocks_history_kvhead_
                    [(layer_idx - config_.layer_offset) / config_.layer_step]
                    [batch_idx][j][head_id] =
                        block_table_after_retrieval_kvhead_[batch_idx][j]
                                                           [head_id];
            }
        }
        cache_seqlens_[batch_idx] = cache_len_after_retrieval;
        selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                     config_.layer_step] =
            (cache_len_after_retrieval + config_.block_len - 1) /
            config_.block_len;
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of selecting block: %f s\n", layer_idx,
    //        diff.count())
}

void KVCache::get_attn_sparsity(const ggml_fp16_t *q_in, float *attn_sparsity,
                                int layer_idx, int generate_token_idx,
                                int q_len, int batch_size, int max_block_num,
                                int *block_table, int *cache_seqlens,
                                int *block_table_origin,
                                int *cache_seqlens_origin,
                                int max_block_num_origin, int topk, int local,
                                Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    layer_id_ = layer_idx;
    int thread_num = backend->get_thread_num();
    batch_size = 1;

    const uint16_t *q_in_data = const_cast<const uint16_t *>(q_in);

    quantize_q_(q_in_data, batch_size);
    if (config_.retrieval_type == RetrievalType::LAYER) {
        attn_initialize_layer_(batch_size, layer_idx, block_table,
                               max_block_num, cache_seqlens);
        retrieval_kvcache_layer_(q_in_data, 1, local, topk, q_len,
                                 generate_token_idx, batch_size, layer_idx,
                                 cache_seqlens, max_block_num, backend);
        calculate_sparsity_layer_(q_in_data, attn_sparsity, batch_size,
                                  max_block_num_origin, block_table_origin,
                                  cache_seqlens_origin, backend);
    } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
        attn_initialize_kvhead_(batch_size, layer_idx, block_table,
                                max_block_num, cache_seqlens);
        retrieval_kvcache_kvhead_(q_in_data, 1, local, topk, q_len,
                                  generate_token_idx, batch_size, layer_idx,
                                  cache_seqlens, max_block_num, backend);
        calculate_sparsity_kvhead_(q_in_data, attn_sparsity, batch_size,
                                   max_block_num_origin, block_table_origin,
                                   cache_seqlens_origin, backend);
    }
}

void KVCache::attn_with_kvcache_one_block_(
    int head_dim, int bsz,
    ggml_type q_type, // GGML data type of `Q`, only supports fp16 and q8_0
    // [bsz, head_dim]
    // Quantization is always on the head_dim dimension (per_token). If
    // head_dim % 32 != 0, an error will be raised. The size must be bsz *
    // head_dim/32 * qtype_size.
    const void *q,

    int past_kv_len, int past_kv_offset,
    bool is_full_attn, // true indicates a full 1 mask
    // If is_full_attn = false, a bit matrix representing the mask is
    // passed. [bsz, past_kv_len]
    const uint8_t *attn_mask,

    ggml_type k_type, // GGML data type of `K Cache`, only supports fp16,
                      // q4_0, q8_0
    int k_quant_type, // 0 for per_token, 1 for per_channel, others raise an
                      // error
    // [seq_len, head_dim]
    // If quant_type == 0, head_dim % 32 must be 0.
    // If quant_type == 1, seq_len % 32 must be 0.
    const void *k_cache,

    // k_anchor_type must be fp16
    int num_k_anchor, // num_k_anchor == 0 indicates no anchor
    // [num_k_anchor, head_dim]
    const void *k_cache_anchors,
    // Each token is associated with the nearest previous position's anchor,
    // with the same distance.
    const int *k_cache_anchor_pos,

    // v_cache similar to k_cache
    ggml_type v_type, int v_quant_type,
    // [head_dim, seq_len]
    const void *v_cache, int num_v_anchor, const void *v_cache_anchors,
    const int *v_cache_anchor_pos,

    // Pre-allocated buffer for intermediate calculations [bsz,
    // past_kv_len]. No malloc is performed inside this function.
    float *attn_score,

    // Output: [bsz, head_dim], with the same type as q_type
    void *output,
    // [bsz]
    float *lse,

    // Pre-allocated temporary buffer with sufficient size:
    // (2 * bsz * past_kv_len + 6 * bsz * head_dim + 2 * past_kv_len *
    // head_dim + past_kv_len * head_dim / 32) bytes.
    void *draft,

    // Apply rotary embedding online
    const int *rotary_angle, const void *rotary_cos, const void *rotary_sin
    // rotary_cos=None,
    // rotary_sin=None,
    // cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
    // cache_batch_idx: Optional[torch.Tensor] = None,
    // rotary_interleaved=True,

    // // Not supported for now
    // window_size=(-1, -1),  # -1 means infinite context window
    // alibi_slopes=None,
) {
    assert(head_dim % 32 == 0);
    assert(k_quant_type == 0);
    assert(v_quant_type == 1);
    assert(q_type == GGML_TYPE_F16 || q_type == GGML_TYPE_Q8_0);
    if (q_type == GGML_TYPE_F16) {
        assert(k_type == GGML_TYPE_F16);
        assert(v_type == GGML_TYPE_F16);

        // attn = q * k + q * k_anchor
        // TODO: anchor
        assert(num_k_anchor == 0);

        if (rotary_angle != nullptr) {
            ggml_fp16_t *k_cache_with_rope_fp16 =
                (reinterpret_cast<ggml_fp16_t *>(draft) +
                 sizeof(block_q8_0) * bsz * past_kv_len / QK8_0 +
                 sizeof(float) * bsz * head_dim);
            // dequant k_cache and apply rope
            // k_rope(i) = k(i) * cos(i) - k(i+l) * sin(i)
            // k_rope(i+l) = k(i+l) * cos(i+l) + k(i) * sin(i)

            // k(i)cos(i) -> k_rope(i)
            // k(i)sin(i+l) -> k_rope(i+l)

            // k(i)cos(i) -> k_rope(i)
            // -k(i)sin(i-l) -> k_rope(i-l)

            std::vector<float> block_fp32(32);
            for (int k = 0; k < past_kv_len; k++) {
                int angle = rotary_angle[k];
                for (int l = 0; l < head_dim / 32; l++) {
                    for (int m = 0; m < 32; m++) {
                        float x = GGML_FP16_TO_FP32((
                            (ggml_fp16_t *)k_cache)[k * head_dim + l * 32 + m]);
                        float sin_val = GGML_FP16_TO_FP32(
                            ((ggml_fp16_t *)
                                 rotary_sin)[angle * head_dim + l * 32 + m]);
                        float cos_val = GGML_FP16_TO_FP32(
                            ((ggml_fp16_t *)
                                 rotary_cos)[angle * head_dim + l * 32 + m]);

                        if (l * 32 + m < head_dim / 2) {
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m] =
                                GGML_FP32_TO_FP16(x * cos_val);
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m +
                                                   head_dim / 2] =
                                GGML_FP32_TO_FP16(-x * sin_val);
                        } else {
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m] =
                                GGML_FP32_TO_FP16(
                                    GGML_FP16_TO_FP32(
                                        k_cache_with_rope_fp16[k * head_dim +
                                                               l * 32 + m]) +
                                    x * sin_val);
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m -
                                                   head_dim / 2] =
                                GGML_FP32_TO_FP16(
                                    GGML_FP16_TO_FP32(
                                        k_cache_with_rope_fp16[k * head_dim +
                                                               l * 32 + m -
                                                               head_dim / 2]) -
                                    x * cos_val);
                        }
                    }
                }
            }

            llamafile_sgemm(past_kv_len, bsz, head_dim,
                            (ggml_fp16_t *)k_cache_with_rope_fp16, head_dim,
                            (ggml_fp16_t *)q, head_dim, attn_score, past_kv_len,
                            0, 1, GGML_TASK_TYPE_COMPUTE, k_type, GGML_TYPE_F16,
                            GGML_TYPE_F32, GGML_PREC_DEFAULT);
        } else {
            bool ok = llamafile_sgemm(
                past_kv_len, bsz, head_dim, (ggml_fp16_t *)k_cache, head_dim,
                (ggml_fp16_t *)q, head_dim, attn_score, past_kv_len, 0, 1,
                GGML_TASK_TYPE_COMPUTE, k_type, GGML_TYPE_F16, GGML_TYPE_F32,
                GGML_PREC_DEFAULT);

            if (!ok) {
                printf("llamafile_sgemm failed\n");
            }
        }
        // attn = attn * scale
        float scale_factor = 1.0 / std::sqrt(float(head_dim));
        ggml_vec_scale_f32(bsz * past_kv_len, attn_score, scale_factor);

        // attn = attn & mask
        if (!is_full_attn) {
            for (int i = 0; i < bsz; i++) {
                for (int j = 0; j < past_kv_len; j++) {
                    int index = i * past_kv_len + j;
                    if (!(attn_mask[j / 8] & (1 << (j % 8)))) {
                        attn_score[index] =
                            std::numeric_limits<float>::lowest();
                    }
                }
            }
        }

        // attn = softmax(attn)
        for (int i = 0; i < bsz; i++) {
            float sum_exp = 0;
            for (int j = 0; j < past_kv_len; j++) {
                attn_score[i * past_kv_len + j] =
                    std::exp(attn_score[i * past_kv_len + j]);
                sum_exp += attn_score[i * past_kv_len + j];
            }
            for (int j = 0; j < past_kv_len; j++) {
                attn_score[i * past_kv_len + j] /= sum_exp;
            }
            if (lse != nullptr) {
                lse[i] = std::log(sum_exp);
            }
        }

        // output = attn * v + attn * v_anchor
        // std::vector<float> sum(bsz * head_dim);
        float *sum = reinterpret_cast<float *>(reinterpret_cast<char *>(draft) +
                                               sizeof(block_q8_0) * bsz *
                                                   past_kv_len / QK8_0);

        // float* attn_score_fp16(bsz, past_kv_len)
        ggml_fp16_t *attn_score_fp16 = (reinterpret_cast<ggml_fp16_t *>(
            reinterpret_cast<char *>(draft) +
            sizeof(block_q8_0) * bsz * past_kv_len / QK8_0 +
            sizeof(float) * bsz * head_dim));

        for (int i = 0; i < bsz * past_kv_len; i++) {
            attn_score_fp16[i] = GGML_FP32_TO_FP16(attn_score[i]);
        }

        // TODO: anchor
        assert(num_v_anchor == 0);
        bool ok = llamafile_sgemm(
            head_dim, bsz, past_kv_len, (ggml_fp16_t *)v_cache, past_kv_len,
            (ggml_fp16_t *)attn_score_fp16, past_kv_len, sum, head_dim, 0, 1,
            GGML_TASK_TYPE_COMPUTE, v_type, GGML_TYPE_F16, GGML_TYPE_F32,
            GGML_PREC_DEFAULT);
        if (!ok) {
            printf("llamafile_sgemm failed\n");
        }

        // copy to output
        for (int i = 0; i < bsz; i++) {
            for (int j = 0; j < head_dim; j++) {
                ((float *)output)[i * head_dim + j] = sum[i * head_dim + j];
            }
        }
    } else {
        assert(k_type == GGML_TYPE_Q4_0 || k_type == GGML_TYPE_Q8_0);
        assert(v_type == GGML_TYPE_Q4_0 || v_type == GGML_TYPE_Q8_0);

        // attn = q * k + q * k_anchor
        // TODO: anchor
        assert(num_k_anchor == 0);

        if (rotary_angle != nullptr) {
            ggml_fp16_t *k_cache_with_rope_fp16 =
                (reinterpret_cast<ggml_fp16_t *>(draft) +
                 sizeof(block_q8_0) * bsz * past_kv_len / QK8_0 +
                 sizeof(float) * bsz * head_dim);
            block_q4_0 *k_cache_with_rope_q4 =
                (reinterpret_cast<block_q4_0 *>(draft) +
                 sizeof(block_q8_0) * bsz * past_kv_len / QK8_0 +
                 sizeof(float) * bsz * head_dim) +
                sizeof(ggml_fp16_t) * bsz * head_dim;
            // dequant k_cache and apply rope
            // k_rope(i) = k(i) * cos(i) - k(i+l) * sin(i)
            // k_rope(i+l) = k(i+l) * cos(i+l) + k(i) * sin(i)

            // k(i)cos(i) -> k_rope(i)
            // k(i)sin(i+l) -> k_rope(i+l)

            // k(i)cos(i) -> k_rope(i)
            // -k(i)sin(i-l) -> k_rope(i-l)

            std::vector<float> block_fp32(32);
            for (int k = 0; k < past_kv_len; k++) {
                int angle = rotary_angle[k];
                for (int l = 0; l < head_dim / 32; l++) {
                    block_q4_0 block =
                        ((block_q4_0 *)k_cache)[k * head_dim / 32 + l];
                    dequantize_row_q4_0(&block, block_fp32.data(), 32);
                    for (int m = 0; m < 32; m++) {
                        float sin_val = GGML_FP16_TO_FP32(
                            ((ggml_fp16_t *)
                                 rotary_sin)[angle * head_dim + l * 32 + m]);
                        float cos_val = GGML_FP16_TO_FP32(
                            ((ggml_fp16_t *)
                                 rotary_cos)[angle * head_dim + l * 32 + m]);

                        if (l * 32 + m < head_dim / 2) {
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m] =
                                GGML_FP32_TO_FP16(block_fp32[m] * cos_val);
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m +
                                                   head_dim / 2] =
                                GGML_FP32_TO_FP16(-block_fp32[m] * sin_val);
                        } else {
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m] +=
                                GGML_FP32_TO_FP16(block_fp32[m] * sin_val);
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m -
                                                   head_dim / 2] -=
                                GGML_FP32_TO_FP16(block_fp32[m] * cos_val);
                        }
                    }
                }
            }
            // quantize k_cache_with_rope_fp16
            for (int k = 0; k < past_kv_len; k++) {
                for (int l = 0; l < head_dim / 32; l++) {
                    for (int m = 0; m < 32; m++) {
                        block_fp32[m] = GGML_FP16_TO_FP32(
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m]);
                    }
                    quantize_row_q4_0(
                        block_fp32.data(),
                        &k_cache_with_rope_q4[k * head_dim / 32 + l], 32);
                }
            }

            llamafile_sgemm(past_kv_len, bsz, head_dim / 32,
                            (block_q4_0 *)k_cache_with_rope_q4, head_dim / 32,
                            (block_q8_0 *)q, head_dim / 32, attn_score,
                            past_kv_len, 0, 1, GGML_TASK_TYPE_COMPUTE, k_type,
                            GGML_TYPE_Q8_0, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        } else {
            llamafile_sgemm(past_kv_len, bsz, head_dim / 32,
                            (block_q4_0 *)k_cache, head_dim / 32,
                            (block_q8_0 *)q, head_dim / 32, attn_score,
                            past_kv_len, 0, 1, GGML_TASK_TYPE_COMPUTE, k_type,
                            GGML_TYPE_Q8_0, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        }

        // attn = attn * scale
        float scale_factor = 1.0 / std::sqrt(float(head_dim));
        ggml_vec_scale_f32(bsz * past_kv_len, attn_score, scale_factor);

        // attn = attn & mask
        if (!is_full_attn) {
            for (int i = 0; i < bsz; i++) {
                for (int j = 0; j < past_kv_len; j++) {
                    int index = i * past_kv_len + j;
                    if (!(attn_mask[j / 8] & (1 << (j % 8)))) {
                        attn_score[index] =
                            std::numeric_limits<float>::lowest();
                    }
                }
            }
        }

        // attn = softmax(attn)
        for (int i = 0; i < bsz; i++) {
            float sum_exp = 0;
            for (int j = 0; j < past_kv_len; j++) {
                attn_score[i * past_kv_len + j] =
                    std::exp(attn_score[i * past_kv_len + j]);
                sum_exp += attn_score[i * past_kv_len + j];
            }
            for (int j = 0; j < past_kv_len; j++) {
                attn_score[i * past_kv_len + j] /= sum_exp;
            }
            if (lse != nullptr) {
                lse[i] = std::log(sum_exp);
            }
        }

        // output = attn * v + attn * v_anchor
        // std::vector<block_q8_0> attn_q8_0(bsz * past_kv_len / QK8_0);
        block_q8_0 *attn_q8_0 = reinterpret_cast<block_q8_0 *>(draft);
        quantize_row_q8_0(attn_score, attn_q8_0, bsz * past_kv_len);
        // std::vector<float> sum(bsz * head_dim);
        float *sum = reinterpret_cast<float *>(reinterpret_cast<char *>(draft) +
                                               sizeof(block_q8_0) * bsz *
                                                   past_kv_len / QK8_0);
        // TODO: anchor
        assert(num_v_anchor == 0);
        llamafile_sgemm(head_dim, bsz, past_kv_len / 32, (block_q4_0 *)v_cache,
                        past_kv_len / 32, attn_q8_0, past_kv_len / 32, sum,
                        head_dim, 0, 1, GGML_TASK_TYPE_COMPUTE, v_type,
                        GGML_TYPE_Q8_0, GGML_TYPE_F32, GGML_PREC_DEFAULT);

        quantize_row_q8_0(sum, (block_q8_0 *)output, bsz * head_dim);
    }
}


================================================
FILE: archive/csrc/ktransformers_ext/operators/kvcache/kvcache_load_dump.cpp
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "kvcache.h"

#include <chrono>

void KVCache::load_kvcache(std::string tensor_file_path, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    std::ifstream ifs_tensor(tensor_file_path, std::ios::binary);
    if (!ifs_tensor) {
        throw std::runtime_error("Failed to open tensor file");
    }
    ifs_tensor.read(reinterpret_cast<char *>(&cache_total_len_),
                    sizeof(cache_total_len_));
    int past_block_num =
        (cache_total_len_ + config_.block_len - 1) / config_.block_len;
    printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len_,
           past_block_num);
    for (int i = 0; i < config_.layer_num; ++i) {
        past_block_num_[i] = past_block_num;
    }
    ifs_tensor.read(reinterpret_cast<char *>(anchor_.data()),
                    anchor_.size() * sizeof(ggml_fp16_t));
    for (int i = 0; i < config_.layer_num; ++i) {
        for (int j = 0; j < config_.kv_head_num; ++j) {
            for (int k = 0; k < past_block_num_[i]; ++k) {
                if (config_.kv_type == GGML_TYPE_F16) {
                    ifs_tensor.read(
                        reinterpret_cast<char *>(k_cache_fp16_[i][j][k].data()),
                        k_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
                    ifs_tensor.read(
                        reinterpret_cast<char *>(v_cache_fp16_[i][j][k].data()),
                        v_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
                } else if (config_.kv_type == GGML_TYPE_Q4_0) {
                    ifs_tensor.read(
                        reinterpret_cast<char *>(k_cache_q4[i][j][k].data()),
                        k_cache_q4[i][j][k].size() * sizeof(block_q4_0));
                    ifs_tensor.read(
                        reinterpret_cast<char *>(v_cache_q4[i][j][k].data()),
                        v_cache_q4[i][j][k].size() * sizeof(block_q4_0));
                }
            }
        }
        for (int k = 0; k < past_block_num_[i]; ++k) {
            for (int l = 0; l < config_.block_len; l++) {
                ifs_tensor.read(
                    reinterpret_cast<char *>(importance_[i][k][l].data()),
                    importance_[i][k][l].size() * sizeof(ggml_fp16_t));
            }
        }
    }
    ifs_tensor.close();
    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    printf("time of load: %f s\n", diff.count());
}
void KVCache::dump_kvcache(int *block_table, int cache_total_len,
                           std::string tensor_file_path, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    std::ofstream ofs(tensor_file_path, std::ios::binary);
    printf("dump_kvcache: %s\n", tensor_file_path.c_str());
    if (!ofs.is_open()) {
        std::cerr << "Cannot open file " << tensor_file_path << std::endl;
        return;
    }
    ofs.write(reinterpret_cast<const char *>(&cache_total_len),
              sizeof(cache_total_len));
    int past_block_num =
        (cache_total_len + config_.block_len - 1) / config_.block_len;
    printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len,
           past_block_num);
    ofs.write(reinterpret_cast<const char *>(anchor_.data()),
              anchor_.size() * sizeof(ggml_fp16_t));
    for (int i = 0; i < config_.layer_num; ++i) {
        for (int j = 0; j < config_.kv_head_num; ++j) {
            for (int k = 0; k < past_block_num; ++k) {
                int block_idx = block_table[k];
                if (config_.kv_type == GGML_TYPE_F16) {
                    ofs.write(reinterpret_cast<const char *>(
                                  k_cache_fp16_[i][j][block_idx].data()),
                              k_cache_fp16_[i][j][block_idx].size() *
                                  sizeof(ggml_fp16_t));
                    ofs.write(reinterpret_cast<const char *>(
                                  v_cache_fp16_[i][j][block_idx].data()),
                              v_cache_fp16_[i][j][block_idx].size() *
                                  sizeof(ggml_fp16_t));

                } else if (config_.kv_type == GGML_TYPE_Q4_0) {
                    ofs.write(reinterpret_cast<const char *>(
                                  k_cache_q4[i][j][block_idx].data()),
                              k_cache_q4[i][j][block_idx].size() *
                                  sizeof(block_q4_0));
                    ofs.write(reinterpret_cast<const char *>(
                                  v_cache_q4[i][j][block_idx].data()),
                              v_cache_q4[i][j][block_idx].size() *
                                  sizeof(block_q4_0));
                }
            }
        }
        for (int k = 0; k < past_block_num; ++k) {
            int block_idx = block_table[k];
            for (int l = 0; l < config_.block_len; l++) {
                ofs.write(reinterpret_cast<const char *>(
                              importance_[i][block_idx][l].data()),
                          importance_[i][block_idx][l].size() *
                              sizeof(ggml_fp16_t));
            }
        }
    }
    ofs.close();
    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    printf("time of dump: %f s\n", diff.count());
}

================================================
FILE: archive/csrc/ktransformers_ext/operators/kvcache/kvcache_read_write.cpp
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "kvcache.h"

#include <chrono>

void KVCache::get_anchor_one_block(ggml_fp16_t *anchor, int layer_id,
                                   int block_idx, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    block_idx = block_idx;
    seq_len_ = config_.block_len;
    anchor_data_ = const_cast<uint16_t *>(anchor);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("layer %d block %d time of reading anchor: %f s\n", layer_id,
           block_idx, duration.count());
}

void KVCache::update_anchor_one_block(const ggml_fp16_t *anchor, int layer_id,
                                      int block_idx, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    block_idx = block_idx;
    seq_len_ = config_.block_len;
    anchor_data_ = const_cast<uint16_t *>(anchor);

    // Each task updates the anchor of a certain position
    // backend->do_work_stealing_job(config_.anchor_num, [&](int task_id) {
    //     int k = task_id % config_.anchor_num;
    //     int head_id = task_id / config_.anchor_num;
    //     memcpy(anchor_[layer_id_][head_id][block_idx].data() +
    //                k * config_.head_dim,
    //            anchor_data_ + k * config_.head_dim,
    //            sizeof(uint16_t) * config_.head_dim);
    // });

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("layer %d block %d time of writting anchor: %f s\n", layer_id,
           block_idx, duration.count());
}

void KVCache::update_importance_one_block(const ggml_fp16_t *importance,
                                          int layer_id, int block_idx,
                                          Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    block_idx = block_idx;
    seq_len_ = config_.block_len;
    importance_data_ = const_cast<uint16_t *>(importance);

    // Each task updates the importance of a certain position
    backend->do_work_stealing_job(
        config_.block_len, nullptr,
        [&](int task_id) {
            int k = task_id;
            memcpy(importance_[layer_id_][block_idx].data() + k,
                   importance_data_ + k, sizeof(uint16_t));
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("layer %d block %d time of writting importance: %f s\n", layer_id,
           block_idx, duration.count());
}

void KVCache::get_importance_one_block(ggml_fp16_t *importance, int layer_id,
                                       int block_idx, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    block_idx = block_idx;
    seq_len_ = config_.block_len;
    importance_data_ = const_cast<uint16_t *>(importance);

    // Each task updates the importance of a certain position
    backend->do_work_stealing_job(
        config_.block_len, nullptr,
        [&](int task_id) {
            int k = task_id;
            memcpy(importance_data_ + k,
                   importance_[layer_id_][block_idx].data() + k,
                   sizeof(uint16_t));
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("layer %d block %d time of reading importance: %f s\n", layer_id,
           block_idx, duration.count());
}

void KVCache::update_kvcache_one_block_fp16(const ggml_fp16_t *k_in,
                                            const ggml_fp16_t *v_in,
                                            int layer_id, int block_idx,
                                            Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    block_idx = block_idx;
    seq_len_ = config_.block_len;
    k_data_ = const_cast<uint16_t *>(k_in);
    v_data_ = const_cast<uint16_t *>(v_in);

    int new_block_num = std::max((int)past_block_num_[layer_id], block_idx + 1);

    importance_[layer_id_].resize(new_block_num);

    for (int i = 0; i < config_.kv_head_num; i++) {
        k_cache_q4[layer_id][i].resize(new_block_num);
        v_cache_q4[layer_id][i].resize(new_block_num);
        // anchor_[layer_id][i].resize(new_block_num);
    }

    for (int i = 0; i < new_block_num; i++) {
        importance_[layer_id][i].resize(config_.block_len);
    }

    // Each task updates the k cache or v cache of a certain header
    backend->do_work_stealing_job(
        config_.kv_head_num * 2, nullptr,
        [&](int task_id) {
            std::vector<float> block_fp32(32);
            int head_id = task_id / 2;
            if (task_id & 1) {
                // fill k_cache_
                k_cache_q4[layer_id_][head_id][block_idx].resize(
                    config_.block_len * config_.head_dim / 32);
                for (int k = 0; k < config_.block_len; k++) {
                    for (int l = 0; l < config_.head_dim / 32; l++) {
                        block_q4_0 block;
                        for (int m = 0; m < 32; m++) {

                            block_fp32[m] = GGML_FP16_TO_FP32(
                                k_data_[((0 * config_.kv_head_num + head_id) *
                                             seq_len_ +
                                         0 * config_.block_len + k) *
                                            config_.head_dim +
                                        l * 32 + m]);
                        }
                        quantize_row_q4_0(block_fp32.data(), &block, 32);
                        k_cache_q4[layer_id_][head_id][block_idx]
                                  [k * config_.head_dim / 32 + l] = block;
                    }
                }
            } else {
                // fill v_cache_
                v_cache_q4[layer_id_][head_id][block_idx].resize(
                    config_.head_dim * config_.block_len / 32);
                for (int k = 0; k < config_.block_len / 32; k++) {
                    for (int l = 0; l < config_.head_dim; l++) {
                        block_q4_0 block;
                        for (int m = 0; m < 32; m++) {

                            block_fp32[m] = GGML_FP16_TO_FP32(
                                v_data_[((0 * config_.kv_head_num + head_id) *
                                             seq_len_ +
                                         0 * config_.block_len + k * 32 + m) *
                                            config_.head_dim +
                                        l]);
                        }
                        quantize_row_q4_0(block_fp32.data(), &block, 32);
                        v_cache_q4[layer_id_][head_id][block_idx]
                                  [l * config_.block_len / 32 + k] = block;
                    }
                }
            }
        },
        nullptr);
    past_block_num_[layer_id] = new_block_num;

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("layer %d block %d time of writting KV Cache: %f s\n", layer_id,
           block_idx, duration.count());
    // printf("get_one_block_fp16 duration: %ld\n", duration);
}

void KVCache::get_kvcache_one_block_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
                                         int layer_id, int block_idx,
                                         Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    seq_len_ = config_.block_len;
    k_data_ = reinterpret_cast<uint16_t *>(k_in);
    v_data_ = reinterpret_cast<uint16_t *>(v_in);

    // printf("layer_id: %d, block_idx: %d\n", layer_id, block_idx);
    // Each task gets the k cache or v cache of a certain header
    backend->do_work_stealing_job(
        config_.kv_head_num * 2, nullptr,
        [&](int task_id) {
            std::vector<float> block_fp32(32);
            int head_id = task_id / 2;
            if (task_id & 1) {
                // get k_cache_
                for (int k = 0; k < config_.block_len; k++) {
                    for (int l = 0; l < config_.head_dim / 32; l++) {
                        block_q4_0 block =
                            k_cache_q4[layer_id_][head_id][block_idx]
                                      [k * config_.head_dim / 32 + l];
                        dequantize_row_q4_0(&block, block_fp32.data(), 32);
                        for (int m = 0; m < 32; m++) {

                            k_data_[((0 * config_.kv_head_num + head_id) *
                                         seq_len_ +
                                     0 * config_.block_len + k) *
                                        config_.head_dim +
                                    l * 32 + m] =
                                GGML_FP32_TO_FP16(block_fp32[m]);
                        }
                    }
                }
            } else {
                // get v_cache_
                for (int k = 0; k < config_.block_len / 32; k++) {
                    for (int l = 0; l < config_.head_dim; l++) {
                        block_q4_0 block =
                            v_cache_q4[layer_id_][head_id][block_idx]
                                      [l * config_.block_len / 32 + k];
                        dequantize_row_q4_0(&block, block_fp32.data(), 32);
                        for (int m = 0; m < 32; m++) {

                            v_data_[((0 * config_.kv_head_num + head_id) *
                                         seq_len_ +
                                     0 * config_.block_len + k * 32 + m) *
                                        config_.head_dim +
                                    l] = GGML_FP32_TO_FP16(block_fp32[m]);
                        }
                    }
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("layer %d block %d time of reading KV Cache: %f s\n", layer_id,
           block_idx, duration.count());
    // printf("get_one_block_fp16 duration: %ld\n", duration);
}

// k_in: (batch_size, seq_len, head_num, head_dim)
// v_in: (batch_size, seq_len, head_num, head_dim)
void KVCache::get_and_update_kvcache_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
                                          int layer_id, int *block_table,
                                          int batch_size, int max_block_num,
                                          int *cache_seqlens, int q_len,
                                          Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    k_data_ = const_cast<uint16_t *>(k_in);
    v_data_ = const_cast<uint16_t *>(v_in);

    // Each task updates the k cache and v cache of a certain header
    backend->do_work_stealing_job(
        config_.kv_head_num * max_block_num * batch_size, nullptr,
        [&](int task_id) {
            // printf("block_idx: %d, task_id: %d\n", block_idx, task_id);
            std::vector<float> block_fp32(32);
            int batch_id = task_id / (config_.kv_head_num * max_block_num);
            int block_id = (task_id / config_.kv_head_num) % max_block_num;
            int head_id = task_id % config_.kv_head_num;
            int block_idx = block_table[batch_id * max_block_num + block_id];
            int seq_len = cache_seqlens[batch_id];
            int block_l = block_id * config_.block_len;
            int block_r = block_id * config_.block_len + config_.block_len;

            if (block_l < seq_len) {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >= seq_len)
                            break;
                        for (int l = 0; l < config_.head_dim; l++) {
                            k_data_
                                [batch_id *
                                     (max_block_num * config_.block_len *
                                      config_.kv_head_num * config_.head_dim) +
                                 block_id *
                                     (config_.block_len * config_.kv_head_num *
                                      config_.head_dim) +
                                 k * (config_.kv_head_num * config_.head_dim) +
                                 head_id * config_.head_dim + l] =
                                    k_cache_fp16_[layer_id_][head_id][block_idx]
                                                 [k * config_.head_dim + l];
                            v_data_
                                [batch_id *
                                     (max_block_num * config_.block_len *
                                      config_.kv_head_num * config_.head_dim) +
                                 block_id *
                                     (config_.block_len * config_.kv_head_num *
                                      config_.head_dim) +
                                 k * (config_.kv_head_num * config_.head_dim) +
                                 head_id * config_.head_dim + l] =
                                    v_cache_fp16_[layer_id_][head_id][block_idx]
                                                 [l * config_.block_len + k];
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    // get k_cache_
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >= seq_len)
                            break;
                        for (int l = 0; l < config_.head_dim / 32; l++) {
                            block_q4_0 block =
                                k_cache_q4[layer_id_][head_id][block_idx]
                                          [k * config_.head_dim / 32 + l];
                            dequantize_row_q4_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                k_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        k * (config_.kv_head_num *
                                             config_.head_dim) +
                                        head_id * config_.head_dim + l * 32 +
                                        m] = GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                    // get v_cache_
                    for (int k = 0; k < config_.block_len / 32; k++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            block_q4_0 block =
                                v_cache_q4[layer_id_][head_id][block_idx]
                                          [l * config_.block_len / 32 + k];
                            dequantize_row_q4_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                if (block_id * config_.block_len + k * 32 + m >=
                                    seq_len)
                                    break;
                                v_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        (k * 32 + m) * config_.kv_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    // get k_cache_
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >= seq_len)
                            break;
                        for (int l = 0; l < config_.head_dim / 32; l++) {
                            block_q8_0 block =
                                k_cache_q8[layer_id_][head_id][block_idx]
                                          [k * config_.head_dim / 32 + l];
                            dequantize_row_q8_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                k_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        k * (config_.kv_head_num *
                                             config_.head_dim) +
                                        head_id * config_.head_dim + l * 32 +
                                        m] = GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                    // get v_cache_
                    for (int k = 0; k < config_.block_len / 32; k++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            block_q8_0 block =
                                v_cache_q8[layer_id_][head_id][block_idx]
                                          [l * config_.block_len / 32 + k];
                            dequantize_row_q8_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                if (block_id * config_.block_len + k * 32 + m >=
                                    seq_len)
                                    break;
                                v_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        (k * 32 + m) * config_.kv_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                }
            }
            if (block_r > seq_len && block_l < seq_len + q_len) {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >=
                                seq_len + q_len ||
                            block_id * config_.block_len + k < seq_len)
                            continue;
                        for (int l = 0; l < config_.head_dim; l++) {
                            k_cache_fp16_[layer_id_][head_id][block_idx]
                                         [k * config_.head_dim + l] = k_data_
                                             [batch_id * (max_block_num *
                                                          config_.block_len *
                                                          config_.kv_head_num *
                                                          config_.head_dim) +
                                              block_id * (config_.block_len *
                                                          config_.kv_head_num *
                                                          config_.head_dim) +
                                              k * (config_.kv_head_num *
                                                   config_.head_dim) +
                                              head_id * config_.head_dim + l];
                            v_cache_fp16_[layer_id_][head_id][block_idx]
                                         [l * config_.block_len + k] = v_data_
                                             [batch_id * (max_block_num *
                                                          config_.block_len *
                                                          config_.kv_head_num *
                                                          config_.head_dim) +
                                              block_id * (config_.block_len *
                                                          config_.kv_head_num *
                                                          config_.head_dim) +
                                              k * (config_.kv_head_num *
                                                   config_.head_dim) +
                                              head_id * config_.head_dim + l];
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    // fill k_cache_
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >=
                                seq_len + q_len ||
                            block_id * config_.block_len + k < seq_len)
                            continue;
                        for (int l = 0; l < config_.head_dim / 32; l++) {
                            block_q4_0 block;
                            for (int m = 0; m < 32; m++) {

                                block_fp32[m] = GGML_FP16_TO_FP32(
                                    k_data_[batch_id * (max_block_num *
                                                        config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            block_id * (config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            k * (config_.kv_head_num *
                                                 config_.head_dim) +
                                            head_id * config_.head_dim +
                                            l * 32 + m]);
                            }
                            quantize_row_q4_0(block_fp32.data(), &block, 32);
                            k_cache_q4[layer_id_][head_id][block_idx]
                                      [k * config_.head_dim / 32 + l] = block;
                        }
                    }

                    // fill v_cache_
                    for (int k = 0; k < config_.block_len / 32; k++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            block_q4_0 block;
                            for (int m = 0; m < 32; m++) {

                                if (block_id * config_.block_len + k * 32 + m >=
                                    seq_len + q_len) {
                                    block_fp32[m] = 0;
                                    continue;
                                }
                                block_fp32[m] = GGML_FP16_TO_FP32(
                                    v_data_[batch_id * (max_block_num *
                                                        config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            block_id * (config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            (k * 32 + m) * config_.kv_head_num *
                                                config_.head_dim +
                                            head_id * config_.head_dim + l]);
                            }
                            quantize_row_q4_0(block_fp32.data(), &block, 32);
                            v_cache_q4[layer_id_][head_id][block_idx]
                                      [l * config_.block_len / 32 + k] = block;
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    // fill k_cache_
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >=
                                seq_len + q_len ||
                            block_id * config_.block_len + k < seq_len)
                            continue;
                        for (int l = 0; l < config_.head_dim / 32; l++) {
                            block_q8_0 block;
                            for (int m = 0; m < 32; m++) {

                                block_fp32[m] = GGML_FP16_TO_FP32(
                                    k_data_[batch_id * (max_block_num *
                                                        config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            block_id * (config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            k * (config_.kv_head_num *
                                                 config_.head_dim) +
                                            head_id * config_.head_dim +
                                            l * 32 + m]);
                            }
                            quantize_row_q8_0(block_fp32.data(), &block, 32);
                            k_cache_q8[layer_id_][head_id][block_idx]
                                      [k * config_.head_dim / 32 + l] = block;
                        }
                    }

                    // fill v_cache_
                    for (int k = 0; k < config_.block_len / 32; k++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            block_q8_0 block;
                            for (int m = 0; m < 32; m++) {

                                if (block_id * config_.block_len + k * 32 + m >=
                                    seq_len + q_len) {
                                    block_fp32[m] = 0;
                                    continue;
                                }
                                block_fp32[m] = GGML_FP16_TO_FP32(
                                    v_data_[batch_id * (max_block_num *
                                                        config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            block_id * (config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            (k * 32 + m) * config_.kv_head_num *
                                                config_.head_dim +
                                            head_id * config_.head_dim + l]);
                            }
                            quantize_row_q8_0(block_fp32.data(), &block, 32);
                            v_cache_q8[layer_id_][head_id][block_idx]
                                      [l * config_.block_len / 32 + k] = block;
                        }
                    }
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;

    // printf("layer %d time of reading and updating KV Cache: %f s\n",
    // layer_id,
    //        duration.count());
}

void KVCache::update_importance(const ggml_fp16_t *importance, int layer_id,
                                int *block_table, int batch_size,
                                int max_block_num, int *offset, int width,
                                Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    importance_data_ = const_cast<uint16_t *>(importance);

    // Each task updates the importance of a certain position
    backend->do_work_stealing_job(
        max_block_num * batch_size, nullptr,
        [&](int task_id) {
            int block_id = task_id % max_block_num;
            int batch_id = task_id / max_block_num;
            int block_idx = block_table[batch_id * max_block_num + block_id];
            if (block_id > (offset[batch_id] + width) / config_.block_len) {
                return;
            }
            for (int k = 0; k < config_.block_len; k++) {
                for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
                    importance_[layer_id_][block_idx][k][head_id] =
                        GGML_FP32_TO_FP16(
                            GGML_FP16_TO_FP32(
                                importance_data_[batch_id * max_block_num *
                                                     config_.block_len *
                                                     config_.q_head_num +
                                                 (block_id * config_.block_len +
                                                  k) *
                                                     config_.q_head_num +
                                                 head_id]) +
                            GGML_FP16_TO_FP32(
                                importance_[layer_id_][block_idx][k][head_id]));
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;

    // printf("layer %d time of updating importance: %f s\n", layer_id,
    //        duration.count());
}

void KVCache::get_kvcache_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
                               int layer_id, int *block_table, int batch_size,
                               int max_block_num, int *cache_seqlens,
                               Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    k_data_ = const_cast<uint16_t *>(k_in);
    v_data_ = const_cast<uint16_t *>(v_in);

    // Each task updates the k cache and v cache of a certain header
    backend->do_work_stealing_job(
        config_.kv_head_num * max_block_num * batch_size, nullptr,
        [&](int task_id) {
            // printf("block_idx: %d, task_id: %d\n", block_idx, task_id);
            std::vector<float> block_fp32(32);
            int batch_id = task_id / (config_.kv_head_num * max_block_num);
            int block_id = (task_id / config_.kv_head_num) % max_block_num;
            int head_id = task_id % config_.kv_head_num;
            int block_idx = block_table[batch_id * max_block_num + block_id];
            int seq_len = cache_seqlens[batch_id];
            int block_l = block_id * config_.block_len;
            int block_r = block_id * config_.block_len + config_.block_len;

            if (block_l < seq_len) {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >= seq_len)
                            break;
                        for (int l = 0; l < config_.head_dim; l++) {
                            k_data_
                                [batch_id *
                                     (max_block_num * config_.block_len *
                                      config_.kv_head_num * config_.head_dim) +
                                 block_id *
                                     (config_.block_len * config_.kv_head_num *
                                      config_.head_dim) +
                                 k * (config_.kv_head_num * config_.head_dim) +
                                 head_id * config_.head_dim + l] =
                                    k_cache_fp16_[layer_id_][head_id][block_idx]
                                                 [k * config_.head_dim + l];
                            v_data_
                                [batch_id *
                                     (max_block_num * config_.block_len *
                                      config_.kv_head_num * config_.head_dim) +
                                 block_id *
                                     (config_.block_len * config_.kv_head_num *
                                      config_.head_dim) +
                                 k * (config_.kv_head_num * config_.head_dim) +
                                 head_id * config_.head_dim + l] =
                                    v_cache_fp16_[layer_id_][head_id][block_idx]
                                                 [l * config_.block_len + k];
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    // get k_cache_
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >= seq_len)
                            break;
                        for (int l = 0; l < config_.head_dim / 32; l++) {
                            block_q4_0 block =
                                k_cache_q4[layer_id_][head_id][block_idx]
                                          [k * config_.head_dim / 32 + l];
                            dequantize_row_q4_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                k_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        k * (config_.kv_head_num *
                                             config_.head_dim) +
                                        head_id * config_.head_dim + l * 32 +
                                        m] = GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                    // get v_cache_
                    for (int k = 0; k < config_.block_len / 32; k++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            block_q4_0 block =
                                v_cache_q4[layer_id_][head_id][block_idx]
                                          [l * config_.block_len / 32 + k];
                            dequantize_row_q4_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                if (block_id * config_.block_len + k * 32 + m >=
                                    seq_len)
                                    break;
                                v_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        (k * 32 + m) * config_.kv_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    // get k_cache_
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >= seq_len)
                            break;
                        for (int l = 0; l < config_.head_dim / 32; l++) {
                            block_q8_0 block =
                                k_cache_q8[layer_id_][head_id][block_idx]
                                          [k * config_.head_dim / 32 + l];
                            dequantize_row_q8_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                k_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        k * (config_.kv_head_num *
                                             config_.head_dim) +
                                        head_id * config_.head_dim + l * 32 +
                                        m] = GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                    // get v_cache_
                    for (int k = 0; k < config_.block_len / 32; k++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            block_q8_0 block =
                                v_cache_q8[layer_id_][head_id][block_idx]
                                          [l * config_.block_len / 32 + k];
                            dequantize_row_q8_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                if (block_id * config_.block_len + k * 32 + m >=
                                    seq_len)
                                    break;
                                v_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        (k * 32 + m) * config_.kv_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
}

void KVCache::update_kvcache_fp16(const ggml_fp16_t *k_in,
                                  const ggml_fp16_t *v_in, int layer_id,
                                  int *block_table, int batch_size,
                                  int max_block_num, int *cache_seqlens,
                                  int q_len, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    k_data_ = const_cast<uint16_t *>(k_in);
    v_data_ = const_cast<uint16_t *>(v_in);
    // Each task updates the k cache and v cache of a certain header
    backend->do_work_stealing_job(
        batch_size * config_.kv_head_num * q_len, nullptr,
        [&](int task_id) {
            int batch_id = task_id / (config_.kv_head_num * q_len);
            int head_id = task_id / q_len % config_.kv_head_num;
            int seq_len = cache_seqlens[batch_id] + task_id % q_len;
            int q_offset = task_id % q_len;

            int block_id = seq_len / config_.block_len;
            int block_idx = block_table[batch_id * max_block_num + block_id];
            int pos_in_block = seq_len % config_.block_len;

            if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                for (int l = 0; l < config_.head_dim; l++) {
                    k_cache_fp16_[layer_id_][head_id][block_idx]
                                 [pos_in_block * config_.head_dim + l] =
                                     k_data_[batch_id *
                                                 (q_len * config_.kv_head_num *
                                                  config_.head_dim) +
                                             q_offset * config_.kv_head_num *
                                                 config_.head_dim +
                                             head_id * config_.head_dim + l];
                    v_cache_fp16_[layer_id_][head_id][block_idx]
                                 [l * config_.block_len + pos_in_block] =
                                     v_data_[batch_id *
                                                 (q_len * config_.kv_head_num *
                                                  config_.head_dim) +
                                             q_offset * config_.kv_head_num *
                                                 config_.head_dim +
                                             head_id * config_.head_dim + l];
                }
            } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                std::vector<float> block_fp32(32);
                // fill k_cache_
                for (int l = 0; l < config_.head_dim / 32; l++) {
                    block_q4_0 block;
                    for (int m = 0; m < 32; m++) {

                        block_fp32[m] = GGML_FP16_TO_FP32(
                            k_data_[batch_id * (q_len * config_.kv_head_num *
                                                config_.head_dim) +
                                    head_id * config_.head_dim + l * 32 + m]);
                    }
                    quantize_row_q4_0(block_fp32.data(), &block, 32);

                    k_cache_q4[layer_id_][head_id][block_idx]
                              [pos_in_block * config_.head_dim / 32 + l] =
                                  block;
                }

                // fill v_cache_
                for (int l = 0; l < config_.head_dim; l++) {
                    block_q4_0 block = v_cache_q4[layer_id_][head_id][block_idx]
                                                 [l * config_.block_len / 32 +
                                                  pos_in_block / 32];
                    dequantize_row_q4_0(&block, block_fp32.data(), 32);
                    block_fp32[pos_in_block % 32] = GGML_FP16_TO_FP32(
                        v_data_[batch_id * (q_len * config_.kv_head_num *
                                            config_.head_dim) +
                                head_id * config_.head_dim + l]);
                    quantize_row_q4_0(block_fp32.data(), &block, 32);
                    v_cache_q4[layer_id_][head_id][block_idx]
                              [l * config_.block_len / 32 + pos_in_block / 32] =
                                  block;
                }
            } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                std::vector<float> block_fp32(32);
                // fill k_cache_
                for (int l = 0; l < config_.head_dim / 32; l++) {
                    block_q8_0 block;
                    for (int m = 0; m < 32; m++) {

                        block_fp32[m] = GGML_FP16_TO_FP32(
                            k_data_[batch_id * (q_len * config_.kv_head_num *
                                                config_.head_dim) +
                                    head_id * config_.head_dim + l * 32 + m]);
                    }
                    quantize_row_q8_0(block_fp32.data(), &block, 32);

                    k_cache_q8[layer_id_][head_id][block_idx]
                              [pos_in_block * config_.head_dim / 32 + l] =
                                  block;
                }

                // fill v_cache_
                for (int l = 0; l < config_.head_dim; l++) {
                    block_q8_0 block = v_cache_q8[layer_id_][head_id][block_idx]
                                                 [l * config_.block_len / 32 +
                                                  pos_in_block / 32];
                    dequantize_row_q8_0(&block, block_fp32.data(), 32);
                    block_fp32[pos_in_block % 32] = GGML_FP16_TO_FP32(
                        v_data_[batch_id * (q_len * config_.kv_head_num *
                                            config_.head_dim) +
                                head_id * config_.head_dim + l]);
                    quantize_row_q8_0(block_fp32.data(), &block, 32);
                    v_cache_q8[layer_id_][head_id][block_idx]
                              [l * config_.block_len / 32 + pos_in_block / 32] =
                                  block;
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    // printf("layer %d time of reading KV Cache: %f s\n", layer_id,
    //        duration.count());
}

void KVCache::get_all_kvcache_one_layer(int layer_id, ggml_fp16_t *k_in,
                                        ggml_fp16_t *v_in, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    seq_len_ = config_.block_len;
    block_num_ = get_cache_total_block_num();
    k_data_ = reinterpret_cast<uint16_t *>(k_in);
    v_data_ = reinterpret_cast<uint16_t *>(v_in);

    // Each task gets the k cache or v cache of a certain header
    backend->do_work_stealing_job(
        config_.kv_head_num * past_block_num_[layer_id] * 2, nullptr,
        [&](int task_id) {
            std::vector<float> block_fp32(32);
            int head_id = task_id / 2 / past_block_num_[layer_id];
            int block_idx = task_id / 2 % past_block_num_[layer_id];
            if (block_idx >= block_num_)
                return;

            int max_offset = 0;
            if (task_id & 1) {
                // get k_cache_
                for (int k = 0; k < config_.block_len; k++) {
                    if (block_idx * seq_len_ + k >= cache_total_len_)
                        break;
                    for (int l = 0; l < config_.head_dim / 32; l++) {
                        block_q4_0 block =
                            k_cache_q4[layer_id_][head_id][block_idx]
                                      [k * config_.head_dim / 32 + l];
                        dequantize_row_q4_0(&block, block_fp32.data(), 32);
                        for (int m = 0; m < 32; m++) {

                            k_data_[(head_id * cache_total_len_ +
                                     block_idx * config_.block_len + k) *
                                        config_.head_dim +
                                    l * 32 + m] =
                                GGML_FP32_TO_FP16(block_fp32[m]);
                            max_offset = std::max(
                                max_offset,
                                (int)(head_id * cache_total_len_ +
                                      block_idx * config_.block_len + k) *
                                        config_.head_dim +
                                    l * 32 + m);
                        }
                    }
                }
            } else {
                // get v_cache_
                for (int k = 0; k < config_.block_len / 32; k++) {
                    for (int l = 0; l < config_.head_dim; l++) {
                        block_q4_0 block =
                            v_cache_q4[layer_id_][head_id][block_idx]
                                      [l * config_.block_len / 32 + k];
                        dequantize_row_q4_0(&block, block_fp32.data(), 32);
                        for (int m = 0; m < 32; m++) {

                            if (block_idx * seq_len_ + k * 32 + m >=
                                cache_total_len_)
                                break;
                            v_data_[(head_id * cache_total_len_ +
                                     block_idx * config_.block_len + k * 32 +
                                     m) *
                                        config_.head_dim +
                                    l] = GGML_FP32_TO_FP16(block_fp32[m]);
                            max_offset =
                                std::max(max_offset,
                                         (int)((head_id * cache_total_len_ +
                                                block_idx * config_.block_len +
                                                k * 32 + m) *
                                                   config_.head_dim +
                                               l));
                        }
                    }
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    // printf("layer %d block num %d time of reading all KV Cache: %f s\n",
    //        layer_id, block_num_, duration.count());
}


================================================
FILE: archive/csrc/ktransformers_ext/operators/kvcache/kvcache_utils.cpp
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "kvcache.h"

#include <chrono>

std::string ggml_type_to_string(ggml_type type) {
    switch (type) {
    case GGML_TYPE_F32:
        return "GGML_TYPE_F32";
    case GGML_TYPE_F16:
        return "GGML_TYPE_F16";
    case GGML_TYPE_Q4_0:
        return "GGML_TYPE_Q4_0";
    case GGML_TYPE_Q8_0:
        return "GGML_TYPE_Q8_0";
    }
    return "UNDIFINED";
}
std::string AnchorTypeToString(AnchorType type) {
    switch (type) {
    case AnchorType::DYNAMIC:
        return "DYNAMIC";
    case AnchorType::BLOCK_MEAN:
        return "BLOCK_MEAN";
    case AnchorType::BLOCK_MAX:
        return "BLOCK_MAX";
    case AnchorType::FIXED_ANCHOR:
        return "FIXED_ANCHOR";
    case AnchorType::QUEST:
        return "QUEST";
    }
    return "UNDIFINED";
}
std::string RetrievalTypeToString(RetrievalType type) {
    switch (type) {
    case RetrievalType::LAYER:
        return "SHARED";
    case RetrievalType::KVHEAD:
        return "SEPARATE";
    case RetrievalType::QHEAD:
        return "INDIVIDUAL";
    }
    return "UNDIFINED";
}
KVCacheConfig::KVCacheConfig(int layer_num, int kv_head_num, int q_head_num,
                             int head_dim, int block_len, int anchor_num,
                             AnchorType anchor_type, ggml_type kv_type,
                             RetrievalType retrieval_type, int layer_step,
                             int token_step, int layer_offset,
                             int max_block_num, int max_batch_size,
                             int max_thread_num)
    : layer_num(layer_num), kv_head_num(kv_head_num), q_head_num(q_head_num),
      head_dim(head_dim), block_len(block_len), anchor_num(anchor_num),
      anchor_type(anchor_type), kv_type(kv_type),
      retrieval_type(retrieval_type), layer_step(layer_step),
      token_step(token_step), layer_offset(layer_offset),
      max_block_num(max_block_num), max_batch_size(max_batch_size),
      max_thread_num(max_thread_num) {
    printf(
        "layer_num: %d, kv_head_num: %d, q_head_num: %d, head_dim: %d, "
        "block_len: %d, anchor_num: %d, anchor_type: %s, kv_type: %s, "
        "retrieval_type: %s, layer_step: %d, token_step: %d, layer_offset: %d,"
        "max_block_num: %d, max_batch_size: %d, max_thread_num: %d\n",
        layer_num, kv_head_num, q_head_num, head_dim, block_len, anchor_num,
        AnchorTypeToString(anchor_type).c_str(),
        ggml_type_to_string(kv_type).c_str(),
        RetrievalTypeToString(retrieval_type).c_str(), layer_step, token_step,
        layer_offset, max_block_num, max_batch_size, max_thread_num);
    assert(q_head_num % kv_head_num == 0);
}
KVCache::KVCache(KVCacheConfig config) {
    this->config_ = config;

    n_gqa_ = config_.q_head_num / config_.kv_head_num;
    if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
        // TODO: Elegant implement
        k_cache_fp16_.resize(config_.layer_num);
        v_cache_fp16_.resize(config_.layer_num);
        selected_blocks_num_history_.resize(config_.layer_num /
                                            config_.layer_step);
        if (config_.retrieval_type == RetrievalType::LAYER) {
            selected_blocks_history_.resize(config_.layer_num /
                                            config_.layer_step);
        } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
            selected_blocks_history_kvhead_.resize(config_.layer_num /
                                                   config_.layer_step);
        } else if (config_.retrieval_type == RetrievalType::QHEAD) {
        }
    } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
        k_cache_q4.resize(config.layer_num);
        v_cache_q4.resize(config.layer_num);
    } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
        k_cache_q8.resize(config.layer_num);
        v_cache_q8.resize(config.layer_num);
    } else {
        assert(false);
    }
    anchor_.resize(config.layer_num * config.max_block_num * config.anchor_num *
                   config.q_head_num * config.head_dim);
    importance_.resize(config.layer_num);
    past_block_num_.resize(config.layer_num);
    for (int i = 0; i < config.layer_num; i++) {
        past_block_num_[i] = 0;
    }

    ThreadResize(config.max_thread_num);
    BatchResize(config.max_batch_size);
    BlockResize(config.max_block_num);
    q_fp32.resize(n_gqa_ * config.head_dim);
}

void KVCache::ThreadResize(int thread_num) {
    thread_local_output_q8_0_.resize(thread_num);
    thread_local_attn_score_.resize(thread_num);
    thread_local_output_fp32_.resize(thread_num);
    thread_local_attn_lse_.resize(thread_num);
    thread_local_cur_output_fp32_.resize(thread_num);
    thread_local_cur_attn_lse_.resize(thread_num);
    thread_local_draft_.resize(thread_num);
    thread_cur_head_idx_.resize(thread_num);
    thread_local_attn_mask_.resize(thread_num);
    for (int i = 0; i < thread_num; i++) {
        thread_local_output_q8_0_[i].resize(n_gqa_ * config_.head_dim / QK8_0);
        thread_local_attn_score_[i].resize(n_gqa_ * config_.block_len);
        thread_local_output_fp32_[i].resize(n_gqa_ * config_.head_dim);
        thread_local_attn_lse_[i].resize(n_gqa_);
        thread_local_cur_output_fp32_[i].resize(n_gqa_ * config_.head_dim);
        thread_local_cur_attn_lse_[i].resize(n_gqa_);
        thread_local_draft_[i].resize(
            2 * n_gqa_ * config_.block_len + 6 * n_gqa_ * config_.head_dim +
            2 * config_.block_len * config_.head_dim +
            config_.block_len * config_.head_dim / QK4_0);
        thread_local_attn_mask_[i].resize(config_.block_len / 8);
    }
}
void KVCache::BatchResize(int batch_size) {
    mutex_.resize(batch_size);
    q_q8_0_.resize(batch_size);
    q_fp32_.resize(batch_size);
    output_fp32_.resize(batch_size);
    attn_lse_.resize(batch_size);
    block_lse_.resize(batch_size);
    attn_sparsity_.resize(batch_size);

    if (config_.retrieval_type == RetrievalType::LAYER) {
        block_table_before_retrieval_.resize(batch_size);
        block_table_after_retrieval_.resize(batch_size);

        for (int i = 0; i < config_.layer_num / config_.layer_step; i++) {
            selected_blocks_history_[i].resize(batch_size);
        }

    } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
        block_table_before_retrieval_kvhead_.resize(batch_size);
        block_table_after_retrieval_kvhead_.resize(batch_size);
        for (int i = 0; i < config_.layer_num / config_.layer_step; i++) {
            selected_blocks_history_kvhead_[i].resize(batch_size);
        }
    } else if (config_.retrieval_type == RetrievalType::QHEAD) {
        block_table_before_retrieval_qhead_.resize(batch_size);
        block_table_after_retrieval_qhead_.resize(batch_size);
    }
    cache_seqlens_.resize(batch_size);
    if (config_.retrieval_type == RetrievalType::LAYER) {
        block_similar_.resize(batch_size);
    } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
        block_similar_kv_head_.resize(batch_size);
    } else if (config_.retrieval_type == RetrievalType::QHEAD) {
        block_similar_q_head_.resize(batch_size);
    }
    for (int i = 0; i < batch_size; i++) {
        top_similar_block_.resize(batch_size);

        mutex_[i].resize(config_.kv_head_num);
        q_q8_0_[i].resize(config_.kv_head_num);
        q_fp32_[i].resize(config_.kv_head_num);
        output_fp32_[i].resize(config_.kv_head_num);
        attn_lse_[i].resize(config_.kv_head_num);

        for (int j = 0; j < config_.kv_head_num; j++) {
            if (!mutex_[i][j]) {
                mutex_[i][j] = std::make_unique<std::mutex>();
            }
            q_q8_0_[i][j].resize(n_gqa_ * config_.head_dim / QK8_0);
            q_fp32_[i][j].resize(n_gqa_ * config_.head_dim);
            output_fp32_[i][j].resize(n_gqa_ * config_.head_dim);
            attn_lse_[i][j].resize(n_gqa_);
        }
    }
    avg_q.resize(batch_size);
    avg_q_fp16.resize(batch_size);
    for (int i = 0; i < batch_size; i++) {
        attn_sparsity_[i].resize(config_.q_head_num);
        avg_q[i].resize(config_.q_head_num * config_.head_dim);
        avg_q_fp16[i].resize(config_.q_head_num * config_.head_dim);
    }
}

void KVCache::BlockResize(int max_block_num) {
    sin_.resize(max_block_num * config_.block_len);
    cos_.resize(max_block_num * config_.block_len);
    for (int i = 0; i < max_block_num * config_.block_len; i++) {
        sin_[i].resize(config_.head_dim);
        cos_[i].resize(config_.head_dim);
    }

    for (int i = 0; i < config_.layer_num / config_.layer_step; i++) {
        for (int j = 0; j < config_.max_batch_size; j++) {
            if (config_.retrieval_type == RetrievalType::LAYER) {
                selected_blocks_history_[i][j].resize(max_block_num);
            } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
                selected_blocks_history_kvhead_[i][j].resize(max_block_num);
                for (int k = 0; k < config_.max_block_num; k++) {
                    selected_blocks_history_kvhead_[i][j][k].resize(
                        config_.kv_head_num);
                }
            } else if (config_.retrieval_type == RetrievalType::QHEAD) {
            }
        }
    }

    for (int layer_id = 0; layer_id < config_.layer_num; layer_id++) {
        importance_[layer_id].resize(max_block_num);

        if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            // TODO: Elegant implement
            k_cache_fp16_[layer_id].resize(config_.kv_head_num);
            v_cache_fp16_[layer_id].resize(config_.kv_head_num);

            for (int i = 0; i < config_.kv_head_num; i++) {
                k_cache_fp16_[layer_id][i].resize(max_block_num);
                v_cache_fp16_[layer_id][i].resize(max_block_num);

                for (int j = 0; j < max_block_num; j++) {
                    k_cache_fp16_[layer_id][i][j].resize(config_.block_len *
                                                         config_.head_dim);
                    v_cache_fp16_[layer_id][i][j].resize(config_.block_len *
                                                         config_.head_dim);
                }
            }

        } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            k_cache_q4[layer_id].resize(config_.kv_head_num);
            v_cache_q4[layer_id].resize(config_.kv_head_num);
            for (int i = 0; i < config_.kv_head_num; i++) {
                k_cache_q4[layer_id][i].resize(max_block_num);
                v_cache_q4[layer_id][i].resize(max_block_num);

                for (int j = 0; j < max_block_num; j++) {
                    k_cache_q4[layer_id][i][j].resize(config_.block_len *
                                                      config_.head_dim / 32);
                    v_cache_q4[layer_id][i][j].resize(config_.block_len *
                                                      config_.head_dim / 32);
                }
            }
        } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            k_cache_q8[layer_id].resize(config_.kv_head_num);
            v_cache_q8[layer_id].resize(config_.kv_head_num);
            for (int i = 0; i < config_.kv_head_num; i++) {
                k_cache_q8[layer_id][i].resize(max_block_num);
                v_cache_q8[layer_id][i].resize(max_block_num);

                for (int j = 0; j < max_block_num; j++) {
                    k_cache_q8[layer_id][i][j].resize(config_.block_len *
                                                      config_.head_dim / 32);
                    v_cache_q8[layer_id][i][j].resize(config_.block_len *
                                                      config_.head_dim / 32);
                }
            }
        } else {
            assert(false);
        }
        for (int i = 0; i < config_.max_batch_size; i++) {
            if (config_.retrieval_type == RetrievalType::LAYER) {
                block_similar_[i].resize(max_block_num);
                block_table_before_retrieval_[i].resize(max_block_num);
                block_table_after_retrieval_[i].resize(max_block_num);
            } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
                block_similar_kv_head_[i].resize(max_block_num);
                block_table_before_retrieval_kvhead_[i].resize(max_block_num);
                block_table_after_retrieval_kvhead_[i].resize(max_block_num);
                for (int j = 0; j < max_block_num; j++) {
                    block_similar_kv_head_[i][j].resize(config_.kv_head_num);
                    block_table_before_retrieval_kvhead_[i][j].resize(
                        config_.kv_head_num);
                    block_table_after_retrieval_kvhead_[i][j].resize(
                        config_.kv_head_num);
                }
            } else if (config_.retrieval_type == RetrievalType::QHEAD) {
                block_similar_q_head_[i].resize(max_block_num);
                block_table_before_retrieval_qhead_[i].resize(max_block_num);
                block_table_after_retrieval_qhead_[i].resize(max_block_num);
                for (int j = 0; j < max_block_num; j++) {
                    block_similar_q_head_[i][j].resize(config_.q_head_num);
                    block_table_before_retrieval_qhead_[i][j].resize(
                        config_.q_head_num);
                    block_table_after_retrieval_qhead_[i][j].resize(
                        config_.q_head_num);
                }
            }
            block_lse_[i].resize(max_block_num);
            for (int j = 0; j < max_block_num; j++) {
                block_lse_[i][j].resize(config_.q_head_num);
            }
        }

        for (int i = 0; i < max_block_num; i++) {
            importance_[layer_id][i].resize(config_.block_len);
            for (int j = 0; j < config_.block_len; j++) {
                importance_[layer_id][i][j].resize(config_.q_head_num);
            }
        }
    }
}

void KVCache::calc_anchor_all_layers(int *block_table, int *cache_seqlens,
                                     int batch_size, int max_block_num,
                                     Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    // Each task updates the importance of a certain block
    seq_len_ = config_.block_len;
    backend->do_work_stealing_job(
        config_.layer_num * batch_size * max_block_num, nullptr,
        [&](int task_id) {
            int layer_id = task_id / (batch_size * max_block_num);
            int batch_id = (task_id / max_block_num) % batch_size;
            int block_id = task_id % max_block_num;
            // If the block is out of the sequence length, skip it. In
            // particular, the last block of the sequence that is shorter than
            // the block length should be skipped.

            if (cache_seqlens[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx = block_table[batch_id * max_block_num + block_id];

            std::vector<float> block_fp32(32);
            if (config_.anchor_type == AnchorType::DYNAMIC) {

                // clear anchor_
                for (int anchor_id = 0; anchor_id < 1; anchor_id++) {
                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            anchor_[layer_id * config_.max_block_num *
                                        config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    block_idx * config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    anchor_id * config_.q_head_num *
                                        config_.head_dim +
                                    head_id * config_.head_dim + l] = 0;
                        }
                    }
                }

                // find top anchor_num importances and their corresponding
                // positions in the importance_ tensor
                // TODO: Move top_importances to the class member to avoid
                // repeated memory allocation
                std::priority_queue<
                    std::pair<float, std::pair<int, int>>,
                    std::vector<std::pair<float, std::pair<int, int>>>,
                    std::greater<>>
                    top_importances;
                for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
                    for (int k = 0; k < seq_len_; k++) {
                        top_importances.push(std::make_pair(
                            GGML_FP16_TO_FP32(
                                importance_[layer_id][block_idx][k][head_id]),
                            std::make_pair(block_idx, k)));
                        // TODO: change to config_ item
                        if (top_importances.size() > config_.anchor_num) {
                            top_importances.pop();
                        }
                    }

                    // fill anchor_

                    for (int l = 0; l < config_.head_dim; l++) {
                        anchor_[layer_id * config_.max_block_num *
                                    config_.anchor_num * config_.q_head_num *
                                    config_.head_dim +
                                block_idx * config_.anchor_num *
                                    config_.q_head_num * config_.head_dim +
                                0 * config_.q_head_num * config_.head_dim +
                                head_id * config_.head_dim + l] = 0;
                    }
                    for (int k = 0; k < config_.anchor_num; k++) {
                        int top_indice = top_importances.top().second.second;
                        int top_block_idx = top_importances.top().second.first;

                        if (config_.kv_type == ggml_type::GGML_TYPE_F16) {

                            for (int l = 0; l < config_.head_dim; l++) {
                                anchor_[layer_id * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        top_block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        0 * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(
                                        GGML_FP16_TO_FP32(
                                            anchor_[layer_id *
                                                        config_.max_block_num *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    top_block_idx *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    0 * config_.q_head_num *
                                                        config_.head_dim +
                                                    head_id * config_.head_dim +
                                                    l]) +
                                        GGML_FP16_TO_FP32(
                                            k_cache_fp16_[layer_id]
                                                         [head_id / n_gqa_]
                                                         [top_block_idx]
                                                         [top_indice *
                                                              config_.head_dim +
                                                          l]));
                            }

                        } else if (config_.kv_type ==
                                   ggml_type::GGML_TYPE_Q4_0) {
                            for (int l = 0; l < config_.head_dim / 32; l++) {
                                block_q4_0 block = k_cache_q4
                                    [layer_id][head_id / n_gqa_][top_block_idx]
                                    [top_indice * config_.head_dim / 32 + l];
                                dequantize_row_q4_0(&block, block_fp32.data(),
                                                    32);
                                for (int m = 0; m < 32; m++) {
                                    anchor_[layer_id * config_.max_block_num *
                                                config_.anchor_num *
                                                config_.q_head_num *
                                                config_.head_dim +
                                            top_block_idx * config_.anchor_num *
                                                config_.q_head_num *
                                                config_.head_dim +
                                            0 * config_.q_head_num *
                                                config_.head_dim +
                                            head_id * config_.head_dim +
                                            l * 32 + m] =
                                        GGML_FP32_TO_FP16(
                                            block_fp32[m] / 4 +
                                            GGML_FP16_TO_FP32(
                                                anchor_[layer_id *
                                                            config_
                                                                .max_block_num *
                                                            config_.anchor_num *
                                                            config_.q_head_num *
                                                            config_.head_dim +
                                                        top_block_idx *
                                                            config_.anchor_num *
                                                            config_.q_head_num *
                                                            config_.head_dim +
                                                        0 * config_.q_head_num *
                                                            config_.head_dim +
                                                        head_id *
                                                            config_.head_dim +
                                                        l * 32 + m]));
                                }
                            }
                        } else if (config_.kv_type ==
                                   ggml_type::GGML_TYPE_Q8_0) {
                            for (int l = 0; l < config_.head_dim / 32; l++) {
                                block_q8_0 block = k_cache_q8
                                    [layer_id][head_id / n_gqa_][top_block_idx]
                                    [top_indice * config_.head_dim / 32 + l];
                                dequantize_row_q8_0(&block, block_fp32.data(),
                                                    32);
                                for (int m = 0; m < 32; m++) {
                                    anchor_[layer_id * config_.max_block_num *
                                                config_.anchor_num *
                                                config_.q_head_num *
                                                config_.head_dim +
                                            top_block_idx * config_.anchor_num *
                                                config_.q_head_num *
                                                config_.head_dim +
                                            0 * config_.q_head_num *
                                                config_.head_dim +
                                            head_id * config_.head_dim +
                                            l * 32 + m] =
                                        GGML_FP32_TO_FP16(
                                            block_fp32[m] / 4 +
                                            GGML_FP16_TO_FP32(
                                                anchor_[layer_id *
                                                            config_
                                                                .max_block_num *
                                                            config_.anchor_num *
                                                            config_.q_head_num *
                                                            config_.head_dim +
                                                        top_block_idx *
                                                            config_.anchor_num *
                                                            config_.q_head_num *
                                                            config_.head_dim +
                                                        0 * config_.q_head_num *
                                                            config_.head_dim +
                                                        head_id *
                                                            config_.head_dim +
                                                        l * 32 + m]));
                                }
                            }
                        }
                        top_importances.pop();
                    }
                }
            } else if (config_.anchor_type == AnchorType::BLOCK_MEAN) {
                // clear anchor_
                for (int anchor_id = 0; anchor_id < config_.anchor_num;
                     anchor_id++) {
                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            anchor_[layer_id * config_.max_block_num *
                                        config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    block_idx * config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    anchor_id * config_.q_head_num *
                                        config_.head_dim +
                                    head_id * config_.head_dim + l] = 0;
                        }
                    }
                }

                // fill anchor_
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {

                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int k = 0; k < config_.block_len; k++) {
                            for (int l = 0; l < config_.head_dim; l++) {
                                anchor_[layer_id * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        0 * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(
                                        GGML_FP16_TO_FP32(
                                            anchor_[layer_id *
                                                        config_.max_block_num *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    block_idx *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    0 * config_.q_head_num *
                                                        config_.head_dim +
                                                    head_id * config_.head_dim +
                                                    l]) +
                                        GGML_FP16_TO_FP32(
                                            k_cache_fp16_[layer_id]
                                                         [head_id / n_gqa_]
                                                         [block_idx]
                                                         [k * config_.head_dim +
                                                          l]) /
                                            config_.block_len);
                            }
                        }
                    }
                }
            } else if (config_.anchor_type == AnchorType::BLOCK_MAX) {
                // clear anchor_
                for (int anchor_id = 0; anchor_id < config_.anchor_num;
                     anchor_id++) {
                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            anchor_[layer_id * config_.max_block_num *
                                        config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    block_idx * config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    anchor_id * config_.q_head_num *
                                        config_.head_dim +
                                    head_id * config_.head_dim + l] = 0;
                        }
                    }
                }

                // fill anchor_
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {

                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int k = 0; k < config_.block_len; k++) {
                            for (int l = 0; l < config_.head_dim; l++) {
                                anchor_[layer_id * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        0 * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(std::max(
                                        GGML_FP16_TO_FP32(
                                            anchor_[layer_id *
                                                        config_.max_block_num *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    block_idx *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    0 * config_.q_head_num *
                                                        config_.head_dim +
                                                    head_id * config_.head_dim +
                                                    l]),
                                        GGML_FP16_TO_FP32(
                                            k_cache_fp16_
                                                [layer_id][head_id / n_gqa_]
                                                [block_idx]
                                                [k * config_.head_dim + l])));
                            }
                        }
                    }
                }
            } else if (config_.anchor_type == AnchorType::FIXED_ANCHOR) {
                // clear anchor_
                for (int anchor_id = 0; anchor_id < 1; anchor_id++) {
                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            anchor_[layer_id * config_.max_block_num *
                                        config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    block_idx * config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    anchor_id * config_.q_head_num *
                                        config_.head_dim +
                                    head_id * config_.head_dim + l] = 0;
                        }
                    }
                }

                // fill anchor_
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {

                    int stride = config_.block_len / config_.anchor_num;
                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int k = 0, tot = 0;
                             k < config_.block_len, tot < config_.anchor_num;
                             k += stride, tot++) {
                            for (int l = 0; l < config_.head_dim; l++) {
                                anchor_[layer_id * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        0 * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(
                                        GGML_FP16_TO_FP32(
                                            anchor_[layer_id *
                                                        config_.max_block_num *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    block_idx *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    0 * config_.q_head_num *
                                                        config_.head_dim +
                                                    head_id * config_.head_dim +
                                                    l]) +
                                        GGML_FP16_TO_FP32(
                                            k_cache_fp16_[layer_id]
                                                         [head_id / n_gqa_]
                                                         [block_idx]
                                                         [k * config_.head_dim +
                                                          l]) /
                                            config_.anchor_num);
                            }
                        }
                    }
                }

            } else if (config_.anchor_type == AnchorType::QUEST) {
                // clear anchor_
                for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
                    for (int l = 0; l < config_.head_dim; l++) {
                        anchor_[layer_id * config_.max_block_num *
                                    config_.anchor_num * config_.q_head_num *
                                    config_.head_dim +
                                block_idx * config_.anchor_num *
                                    config_.q_head_num * config_.head_dim +
                                1 * config_.q_head_num * config_.head_dim +
                                head_id * config_.head_dim + l] =
                            GGML_FP32_TO_FP16(
                                std::numeric_limits<float>::max());

                        anchor_[layer_id * config_.max_block_num *
                                    config_.anchor_num * config_.q_head_num *
                                    config_.head_dim +
                                block_idx * config_.anchor_num *
                                    config_.q_head_num * config_.head_dim +
                                0 * config_.q_head_num * config_.head_dim +
                                head_id * config_.head_dim + l] =
                            GGML_FP32_TO_FP16(
                                std::numeric_limits<float>::min());
                    }
                }

                // fill anchor_

                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    for (int indice = 0; indice < seq_len_; indice++) {
                        for (int head_id = 0; head_id < config_.kv_head_num;
                             head_id++) {
                            for (int l = 0; l < config_.head_dim; l++) {
                                anchor_[layer_id * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        0 * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(std::max(
                                        GGML_FP16_TO_FP32(
                                            k_cache_fp16_
                                                [layer_id][head_id][block_idx]
                                                [indice * config_.head_dim +
                                                 l]),
                                        GGML_FP16_TO_FP32(
                                            anchor_[layer_id *
                                                        config_.max_block_num *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    block_idx *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    0 * config_.q_head_num *
                                                        config_.head_dim +
                                                    head_id * config_.head_dim +
                                                    l])));

                                anchor_[layer_id * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        1 * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(std::min(
                                        GGML_FP16_TO_FP32(
                                            k_cache_fp16_
                                                [layer_id][head_id][block_idx]
                                                [indice * config_.head_dim +
                                                 l]),
                                        GGML_FP16_TO_FP32(
                                            anchor_[layer_id *
                                                        config_.max_block_num *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    block_idx *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    1 * config_.q_head_num *
                                                        config_.head_dim +
                                                    head_id * config_.head_dim +
                                                    l])));
                            }
                        }
                    }

                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    for (int indice = 0; indice < seq_len_; indice++) {
                        for (int head_id = 0; head_id < config_.kv_head_num;
                             head_id++) {
                            for (int l = 0; l < config_.head_dim / 32; l++) {
                                block_q4_0 block =
                                    k_cache_q4[layer_id][head_id][block_idx]
                                              [indice * config_.head_dim / 32 +
                                               l];
                                dequantize_row_q4_0(&block, block_fp32.data(),
                                                    32);

                                for (int m = 0; m < 32; m++) {
                                    for (int gqa_idx = 0; gqa_idx < n_gqa_;
                                         gqa_idx++) {

                                        anchor_[layer_id *
                                                    config_.max_block_num *
                                                    config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                block_idx * config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                0 * config_.q_head_num *
                                                    config_.head_dim +
                                                head_id * config_.head_dim +
                                                l * 32 + m] =
                                            GGML_FP32_TO_FP16(std::max(
                                                block_fp32[m],
                                                GGML_FP16_TO_FP32(
                                                    anchor_
                                                        [layer_id *
                                                             config_
                                                                 .max_block_num *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         block_idx *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         0 *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         head_id *
                                                             config_.head_dim +
                                                         l * 32 + m])));

                                        anchor_[layer_id *
                                                    config_.max_block_num *
                                                    config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                block_idx * config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                1 * config_.q_head_num *
                                                    config_.head_dim +
                                                head_id * config_.head_dim +
                                                l * 32 + m] =
                                            GGML_FP32_TO_FP16(std::min(
                                                block_fp32[m],
                                                GGML_FP16_TO_FP32(
                                                    anchor_
                                                        [layer_id *
                                                             config_
                                                                 .max_block_num *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         block_idx *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         1 *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         head_id *
                                                             config_.head_dim +
                                                         l * 32 + m])));
                                    }
                                }
                            }
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    for (int indice = 0; indice < seq_len_; indice++) {
                        for (int head_id = 0; head_id < config_.kv_head_num;
                             head_id++) {
                            for (int l = 0; l < config_.head_dim / 32; l++) {
                                block_q8_0 block =
                                    k_cache_q8[layer_id][head_id][block_idx]
                                              [indice * config_.head_dim / 32 +
                                               l];
                                dequantize_row_q8_0(&block, block_fp32.data(),
                                                    32);

                                for (int m = 0; m < 32; m++) {
                                    for (int gqa_idx = 0; gqa_idx < n_gqa_;
                                         gqa_idx++) {

                                        anchor_[layer_id *
                                                    config_.max_block_num *
                                                    config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                block_idx * config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                0 * config_.q_head_num *
                                                    config_.head_dim +
                                                head_id * config_.head_dim +
                                                l * 32 + m] =
                                            GGML_FP32_TO_FP16(std::max(
                                                block_fp32[m],
                                                GGML_FP16_TO_FP32(
                                                    anchor_
                                                        [layer_id *
                                                             config_
                                                                 .max_block_num *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         block_idx *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         0 *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         head_id *
                                                             config_.head_dim +
                                                         l * 32 + m])));

                                        anchor_[layer_id *
                                                    config_.max_block_num *
                                                    config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                block_idx * config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                1 * config_.q_head_num *
                                                    config_.head_dim +
                                                head_id * config_.head_dim +
                                                l * 32 + m] =
                                            GGML_FP32_TO_FP16(std::min(
                                                block_fp32[m],
                                                GGML_FP16_TO_FP32(
                                                    anchor_
                                                        [layer_id *
                                                             config_
                                                                 .max_block_num *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         block_idx *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         1 *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         head_id *
                                                             config_.head_dim +
                                                         l * 32 + m])));
                                    }
                                }
                            }
                        }
                    }
                }
            } else {
                assert(false);
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    //    printf("time of calc_anchor_all_layers: %f s\n", duration.count());
}

void KVCache::clear_importance_all_layers(int *block_table, int *cache_seqlens,
                                          int batch_size, int max_block_num,
                                          Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    // Each task updates the importance of a certain block
    seq_len_ = config_.block_len;
    backend->do_work_stealing_job(
        config_.layer_num * batch_size * max_block_num, nullptr,
        [&](int task_id) {
            int layer_id = task_id / (batch_size * max_block_num);
            int batch_id = (task_id / max_block_num) % batch_size;
            int block_id = task_id % max_block_num;
            // If the block is out of the sequence length, skip it. In
            // particular, the last block of the sequence that is shorter than
            // the block length should be skipped.

            if (cache_seqlens[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx = block_table[batch_id * max_block_num + block_id];

            if (config_.anchor_type == AnchorType::DYNAMIC) {

                // clear anchor_
                for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
                    for (int l = 0; l < config_.block_len; l++) {
                        importance_[layer_id][block_idx][l][head_id] = 0;
                    }
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    //    printf("time of clear_importance_all_layerssssss: %f s\n",
    //    duration.count());
}

void KVCache::clear_kvcache_all_layers(int *block_table, int *cache_seqlens,
                                       int batch_size, int max_block_num,
                                       Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    // Each task updates the importance of a certain block
    seq_len_ = config_.block_len;
    backend->do_work_stealing_job(
        config_.layer_num * batch_size * max_block_num * config_.kv_head_num,
        nullptr,
        [&](int task_id) {
            int layer_id =
                task_id / (batch_size * max_block_num * config_.kv_head_num);
            int batch_id =
                (task_id / (max_block_num * config_.kv_head_num)) % batch_size;
            int block_id = task_id / config_.kv_head_num % max_block_num;
            int head_id = task_id % config_.kv_head_num;
            // If the block is out of the sequence length, skip it. In
            // particular, the last block of the sequence that is shorter than
            // the block length should be skipped.
            if (cache_seqlens[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx = block_table[batch_id * max_block_num + block_id];

            if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                for (int l = 0; l < config_.block_len * config_.head_dim; l++) {
                    k_cache_fp16_[layer_id][head_id][block_idx][l] = 0;
                    v_cache_fp16_[layer_id][head_id][block_idx][l] = 0;
                }
            } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                for (int l = 0; l < config_.block_len * config_.head_dim / 32;
                     l++) {
                    k_cache_q4[layer_id][head_id][block_idx][l].d = 0;
                    v_cache_q4[layer_id][head_id][block_idx][l].d = 0;
                }
            } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                for (int l = 0; l < config_.block_len * config_.head_dim / 32;
                     l++) {
                    k_cache_q8[layer_id][head_id][block_idx][l].d = 0;
                    v_cache_q8[layer_id][head_id][block_idx][l].d = 0;
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    //    printf("time of clear_kvcache_all_layers: %f s\n", duration.count());
}

void KVCache::get_sincos(ggml_fp16_t *sin, ggml_fp16_t *cos, int seqlen) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    const uint16_t *sin_data = const_cast<const uint16_t *>(sin);
    const uint16_t *cos_data = const_cast<const uint16_t *>(cos);

    for (int i = 0; i < seqlen; i++) {
        for (int j = 0; j < config_.head_dim; j++) {
            sin_[i][j] = sin_data[i * config_.head_dim + j];
            cos_[i][j] = cos_data[i * config_.head_dim + j];
        }
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("time of get_sincos: %f s\n", duration.count());
}

void ggml_vec_scale_f32(const int n, float *y, const float v) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j * GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

            GGML_F32_VEC_STORE(y + i + j * GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] *= v;
    }
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }
#endif
}

================================================
FILE: archive/csrc/ktransformers_ext/operators/llamafile/conversion.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-12 10:07:58
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022 
 * @LastEditTime : 2024-07-25 10:34:55
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_CONVERSION_H
#define CPUINFER_CONVERSION_H

#include <memory.h>
#include "llama.cpp/ggml.h"

inline void to_float(const void* input, float* output, int size, ggml_type type) {
    if (type == ggml_type::GGML_TYPE_F32) {
        memcpy(output, input, size * sizeof(float));
    } else {
        ggml_internal_get_type_traits(type).to_float(input, output, size);
    }
}

inline void from_float(const float* input, void* output, int size, ggml_type type) {
    if (type == ggml_type::GGML_TYPE_F32) {
        memcpy(output, input, size * sizeof(float));
    } else {
        ggml_internal_get_type_traits(type).from_float(input, output, size);
    }
}

#endif

================================================
FILE: archive/csrc/ktransformers_ext/operators/llamafile/linear.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-12 10:07:58
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-15 07:45:18
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "linear.h"

Linear::Linear(LinearConfig config) {
    config_ = config;
    proj_ = config_.proj;

    std::vector<std::pair<void**, uint64_t>> mem_requests;
    mem_requests.push_back({(void**)&input_fp32_, sizeof(float) * config_.group_max_len * config_.input_size});
    mem_requests.push_back({(void**)&proj_input_, config_.group_max_len * config_.input_size * ggml_type_size(ggml_internal_get_type_traits(config_.proj_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.proj_type).vec_dot_type)});
    mem_requests.push_back({(void**)&proj_output_, sizeof(float) * config_.group_max_len * config_.output_size});
    shared_mem_buffer.alloc(this, mem_requests);
}

Linear::~Linear() {
    shared_mem_buffer.dealloc(this);
}

void Linear::warm_up(Backend *backend) {
    std::vector<float> input_fp32(config_.input_size);
    std::vector<uint8_t> input(config_.input_size *
                               ggml_type_size(config_.hidden_type) /
                               ggml_blck_size(config_.hidden_type));
    std::vector<uint8_t> output(config_.output_size *
                                ggml_type_size(config_.hidden_type) /
                                ggml_blck_size(config_.hidden_type));
    for (int i = 0; i < config_.input_size; i++) {
        input_fp32[i] = 0;
    }
    from_float(input_fp32.data(), input.data(), config_.input_size, config_.hidden_type);
    forward_many(1, input.data(), output.data(), backend);
}

void Linear::forward_many(int qlen, const void* input, void* output, Backend* backend) {
    const void* proj_input_ptr;
    if (config_.hidden_type == ggml_internal_get_type_traits(config_.proj_type).vec_dot_type) {
        proj_input_ptr = input;
    } else {
        to_float(input, input_fp32_, qlen * config_.input_size, config_.hidden_type);
        from_float(input_fp32_, proj_input_, qlen * config_.input_size, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type);
        proj_input_ptr = proj_input_;
    }
    int nth = config_.output_size / config_.stride;
    backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
        int ith = task_id;
        void* proj_ptr = (uint8_t*)proj_ + ith * config_.stride * config_.input_size * ggml_type_size(config_.proj_type) / ggml_blck_size(config_.proj_type);
        float* proj_output_ptr = proj_output_ + ith * config_.stride;
        llamafile_sgemm(config_.stride, qlen, config_.input_size / ggml_blck_size(config_.proj_type), proj_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_input_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_output_ptr, config_.output_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.proj_type, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
            for (int i = 0; i < qlen; i++) {
                float* output_fp32_ptr = proj_output_ + i * config_.output_size + ith * config_.stride;
                void* output_ptr = (uint8_t*)output + i * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
                from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
            }
        }
    }, nullptr);
    if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
        from_float(proj_output_, output, qlen * config_.output_size, config_.hidden_type);
    }
}

void Linear::forward(int qlen, const void* input, void* output, Backend* backend) {
    if (qlen <= 0) {
        return;
    }
    int forward_len = std::min(qlen, config_.group_max_len);
    forward_many(forward_len, input, output, backend);
    forward(qlen - forward_len, (uint8_t*)input + forward_len * config_.input_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
}

================================================
FILE: archive/csrc/ktransformers_ext/operators/llamafile/linear.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-12 10:07:58
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:35:00
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_LINEAR_H
#define CPUINFER_OPERATOR_LINEAR_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

struct LinearConfig {
    int input_size;
    int output_size;
    int stride;
    int group_max_len;
    void* proj;
    ggml_type proj_type;
    ggml_type hidden_type;

    LinearConfig() {}

    LinearConfig(int input_size, int output_size, int stride, int group_max_len, void* proj, ggml_type proj_type, ggml_type hidden_type)
        : input_size(input_size), output_size(output_size), stride(stride), group_max_len(group_max_len), proj(proj), proj_type(proj_type), hidden_type(hidden_type) {}
};

class Linear {
   public:
    Linear(LinearConfig);
    ~Linear();
    void warm_up(Backend* backend);
    void forward_many(int qlen, const void* input, void* output, Backend* backend);
    void forward(int qlen, const void* input, void* output, Backend* backend);

   private:
    LinearConfig config_;
    void* proj_;  // [output_size * input_size ( /32 if quantized)]

    float* input_fp32_;    // [group_max_len * input_size]
    uint8_t* proj_input_;  // [group_max_len * input_size * ggml_type_size(ggml_internal_get_type_traits(proj_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(proj_type).vec_dot_type)]
    float* proj_output_;   // [group_max_len * output_size]
};

#endif

================================================
FILE: archive/csrc/ktransformers_ext/operators/llamafile/mlp.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-16 10:43:18
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-15 07:44:38
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "mlp.h"

MLP::MLP(MLPConfig config) {
    config_ = config;
    gate_proj_ = config_.gate_proj;
    up_proj_ = config_.up_proj;
    down_proj_ = config_.down_proj;

    std::vector<std::pair<void**, uint64_t>> mem_requests;
    mem_requests.push_back({(void**)&input_fp32_, sizeof(float) * config_.group_max_len * config_.hidden_size});
    mem_requests.push_back({(void**)&gate_input_, config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
    mem_requests.push_back({(void**)&up_input_, config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
    mem_requests.push_back({(void**)&gate_output_, sizeof(float) * config_.group_max_len * config_.intermediate_size});
    mem_requests.push_back({(void**)&up_output_, sizeof(float) * config_.group_max_len * config_.intermediate_size});
    mem_requests.push_back({(void**)&intermediate_fp32_, sizeof(float) * config_.group_max_len * config_.intermediate_size});
    mem_requests.push_back({(void**)&down_input_, config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
    mem_requests.push_back({(void**)&down_output_, sizeof(float) * config_.group_max_len * config_.hidden_size});
    shared_mem_buffer.alloc(this, mem_requests);
}

MLP::~MLP() {
    shared_mem_buffer.dealloc(this);
}

void MLP::warm_up(Backend *backend) {
    std::vector<float> input_fp32(config_.hidden_size);
    std::vector<uint8_t> input(config_.hidden_size *
                               ggml_type_size(config_.hidden_type) /
                               ggml_blck_size(config_.hidden_type));
    std::vector<uint8_t> output(config_.hidden_size *
                                ggml_type_size(config_.hidden_type) /
                                ggml_blck_size(config_.hidden_type));
    for (int i = 0; i < config_.hidden_size; i++) {
        input_fp32[i] = 0;
    }
    from_float(input_fp32.data(), input.data(), config_.hidden_size, config_.hidden_type);
    forward_many(1, input.data(), output.data(), backend);
}

static float act_fn(float x) { return x / (1.0f + expf(-x)); }

void MLP::forward_many(int qlen, const void* input, void* output, Backend* backend) {
    const void* gate_input_ptr;
    const void* up_input_ptr;
    if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
        gate_input_ptr = up_input_ptr = input;
    } else {
        to_float(input, input_fp32_, qlen * config_.hidden_size, config_.hidden_type);
        if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
            from_float(input_fp32_, gate_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
            gate_input_ptr = up_input_ptr = gate_input_;
        } else {
            if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
                from_float(input_fp32_, gate_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                gate_input_ptr = gate_input_;
            } else {
                gate_input_ptr = input;
            }
            if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                from_float(input_fp32_, up_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
                up_input_ptr = up_input_;
            } else {
                up_input_ptr = input;
            }
        }
    }
    int nth = config_.intermediate_size / config_.stride;
    backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
        int ith = task_id;
        void* gate_proj_ptr = (uint8_t*)gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        float* gate_output_ptr = gate_output_ + ith * config_.stride;
        llamafile_sgemm(config_.stride, qlen, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        void* up_proj_ptr = (uint8_t*)up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        float* up_output_ptr = up_output_ + ith * config_.stride;
        llamafile_sgemm(config_.stride, qlen, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        for (int i = 0; i < qlen; i++) {
            for (int j = ith * config_.stride; j < (ith + 1) * config_.stride; j++) {
                intermediate_fp32_[i * config_.intermediate_size + j] = act_fn(gate_output_[i * config_.intermediate_size + j]) * up_output_[i * config_.intermediate_size + j];
            }
            if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
                float* intermediate_fp32_ptr = intermediate_fp32_ + i * config_.intermediate_size + ith * config_.stride;
                void* down_input_ptr = (uint8_t*)down_input_ + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
                from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
            }
        }
    }, nullptr);
    if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
        from_float(intermediate_fp32_, down_input_, qlen * config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
    }
    nth = config_.hidden_size / config_.stride;
    backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
        int ith = task_id;
        void* down_proj_ptr = (uint8_t*)down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
        float* down_output_ptr = down_output_ + ith * config_.stride;
        llamafile_sgemm(config_.stride, qlen, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
            for (int i = 0; i < qlen; i++) {
                float* output_fp32_ptr = down_output_ + i * config_.hidden_size + ith * config_.stride;
                void* output_ptr = (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
                from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
            }
        }
    }, nullptr);
    if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
        from_float(down_output_, output, qlen * config_.hidden_size, config_.hidden_type);
    }
}

void MLP::forward(int qlen, const void* input, void* output, Backend* backend) {
    if (qlen <= 0) {
        return;
    }
    int forward_len = std::min(qlen, config_.group_max_len);
    forward_many(forward_len, input, output, backend);
    forward(qlen - forward_len, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
}

================================================
FILE: archive/csrc/ktransformers_ext/operators/llamafile/mlp.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-12 10:07:58
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:35:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_MLP_H
#define CPUINFER_OPERATOR_MLP_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

struct MLPConfig {
    int hidden_size;
    int intermediate_size;
    int stride;
    int group_max_len;
    void* gate_proj;
    void* up_proj;
    void* down_proj;
    ggml_type gate_type;
    ggml_type up_type;
    ggml_type down_type;
    ggml_type hidden_type;

    MLPConfig() {}

    MLPConfig(int hidden_size, int intermediate_size, int stride, int group_max_len, void* gate_proj, void* up_proj, void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type)
        : hidden_size(hidden_size), intermediate_size(intermediate_size), stride(stride), group_max_len(group_max_len), gate_proj(gate_proj), up_proj(up_proj), down_proj(down_proj), gate_type(gate_type), up_type(up_type), down_type(down_type), hidden_type(hidden_type) {}
};

class MLP {
   public:
    MLP(MLPConfig);
    ~MLP();
    void warm_up(Backend* backend);
    void forward_many(int qlen, const void* input, void* output, Backend* backend);
    void forward(int qlen, const void* input, void* output, Backend* backend);

   private:
    MLPConfig config_;
    void* gate_proj_;  // [intermediate_size * hidden_size ( /32 if quantized)]
    void* up_proj_;    // [intermediate_size * hidden_size ( /32 if quantized)]
    void* down_proj_;  // [hidden_size * intermediate_size ( /32 if quantized)]

    float* input_fp32_;         // [group_max_len * hidden_size]
    uint8_t* gate_input_;       // [group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
    uint8_t* up_input_;         // [group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
    float* gate_output_;        // [group_max_len * intermediate_size]
    float* up_output_;          // [group_max_len * intermediate_size]
    float* intermediate_fp32_;  // [group_max_len * intermediate_size]
    uint8_t* down_input_;       // [group_max_len * intermediate_size * ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
    float* down_output_;        // [group_max_len * hidden_size]
};

#endif

================================================
FILE: archive/csrc/ktransformers_ext/operators/llamafile/moe.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:22
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-15 07:43:41
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "moe.h"
#include <iostream>
#include <cstdint>
#include <math.h>

#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>
#endif

MOE::MOE(MOEConfig config) {
    config_ = config;
    gate_proj_ = config_.gate_proj;
    up_proj_ = config_.up_proj;
    down_proj_ = config_.down_proj;
    
    #ifdef USE_NUMA
    int numa_nodes = numa_num_configured_nodes();
    gate_proj_numa_.resize(numa_nodes);
    up_proj_numa_.resize(numa_nodes);
    down_proj_numa_.resize(numa_nodes);
    size_t exp_inter_hidden_mul_ = (size_t)config.expert_num * config.intermediate_size * config.hidden_size;
    for (int i = 0; i < numa_nodes; i++) {
        gate_proj_numa_[i] = numa_alloc_onnode(exp_inter_hidden_mul_* ggml_type_size(config.gate_type) / ggml_blck_size(config.gate_type), i);
        up_proj_numa_[i] = numa_alloc_onnode(exp_inter_hidden_mul_* ggml_type_size(config.up_type) / ggml_blck_size(config.up_type), i);
        down_proj_numa_[i] = numa_alloc_onnode(exp_inter_hidden_mul_* ggml_type_size(config.down_type) / ggml_blck_size(config.down_type), i);
        if (!gate_proj_numa_[i]) {
            std::cout << "Memory allocation failed for gate_proj_numa_ on node " << i << std::endl;
        }
        if (!up_proj_numa_[i]) {
            std::cout << "Memory allocation failed for up_proj_numa_ on node " << i << std::endl;
        }
        if (!down_proj_numa_[i]) {
            std::cout << "Memory allocation failed for down_proj_numa_ on node " << i << std::endl;
        }
        memcpy(gate_proj_numa_[i], gate_proj_, exp_inter_hidden_mul_* ggml_type_size(config.gate_type) / ggml_blck_size(config.gate_type));
        memcpy(up_proj_numa_[i], up_proj_, exp_inter_hidden_mul_* ggml_type_size(config.up_type) / ggml_blck_size(config.up_type));
        memcpy(down_proj_numa_[i], down_proj_, exp_inter_hidden_mul_* ggml_type_size(config.down_type) / ggml_blck_size(config.down_type));
    }
    #endif

    std::vector<std::pair<void**, uint64_t>> s_mem_requests;
    s_mem_requests.push_back({(void**)&s_input_fp32_, sizeof(float) * config_.hidden_size});
    s_mem_requests.push_back({(void**)&s_gate_input_, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
    s_mem_requests.push_back({(void**)&s_up_input_, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
    s_gate_output_.resize(config_.routed_expert_num);
    s_up_output_.resize(config_.routed_expert_num);
    s_intermediate_fp32_.resize(config_.routed_expert_num);
    s_down_input_.resize(config_.routed_expert_num);
    s_down_output_.resize(config_.routed_expert_num);
    for (int i = 0; i < config_.routed_expert_num; i++) {
        s_mem_requests.push_back({(void**)&s_gate_output_[i], sizeof(float) * config_.intermediate_size});
        s_mem_requests.push_back({(void**)&s_up_output_[i], sizeof(float) * config_.intermediate_size});
        s_mem_requests.push_back({(void**)&s_intermediate_fp32_[i], sizeof(float) * config_.intermediate_size});
        s_mem_requests.push_back({(void**)&s_down_input_[i], config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
        s_mem_requests.push_back({(void**)&s_down_output_[i], sizeof(float) * config_.hidden_size});
    }
    s_mem_requests.push_back({(void**)&s_output_fp32_, sizeof(float) * config_.hidden_size});
    shared_mem_buffer.alloc(this, s_mem_requests);

    std::vector<std::pair<void**, uint64_t>> m_mem_requests;
    m_input_fp32_.resize(config_.group_max_len);
    m_gate_input_.resize(config_.group_max_len);
    m_up_input_.resize(config_.group_max_len);
    for (int i = 0; i < config_.group_max_len; i++) {
        m_mem_requests.push_back({(void**)&m_input_fp32_[i], sizeof(float) * config_.hidden_size});
        m_mem_requests.push_back({(void**)&m_gate_input_[i], config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
        m_mem_requests.push_back({(void**)&m_up_input_[i], config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
    }
    m_mem_requests.push_back({(void**)&m_local_gate_input_, config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
    m_mem_requests.push_back({(void**)&m_local_up_input_, config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
    m_mem_requests.push_back({(void**)&m_local_gate_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_up_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_intermediate_fp32_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_down_input_, config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
    m_mem_requests.push_back({(void**)&m_local_down_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size});
    m_output_fp32_.resize(config_.group_max_len);
    for (int i = 0; i < config_.group_max_len; i++) {
        m_mem_requests.push_back({(void**)&m_output_fp32_[i], sizeof(float) * config_.hidden_size});
    }
    shared_mem_buffer.alloc(this, m_mem_requests);

    m_local_pos_.resize(config_.group_max_len);
    for (int i = 0; i < config_.group_max_len; i++) {
        m_local_pos_[i].resize(config_.routed_expert_num);
    }
    m_local_num_.resize(config_.expert_num);
    m_local_gate_input_ptr_.resize(config_.expert_num);
    m_local_up_input_ptr_.resize(config_.expert_num);
    m_local_gate_output_ptr_.resize(config_.expert_num);
    m_local_up_output_ptr_.resize(config_.expert_num);
    m_local_intermediate_fp32_ptr_.resize(config_.expert_num);
    m_local_down_input_ptr_.resize(config_.expert_num);
    m_local_down_output_ptr_.resize(config_.expert_num);
}

MOE::~MOE() {
    shared_mem_buffer.dealloc(this);

    #ifdef USE_NUMA
    int numa_nodes = numa_num_configured_nodes();
    for (int i = 0; i < numa_nodes; i++) {
        numa_free(gate_proj_numa_[i], config_.expert_num * config_.intermediate_size * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type));
        numa_free(up_proj_numa_[i], config_.expert_num * config_.intermediate_size * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type));
        numa_free(down_proj_numa_[i], config_.expert_num * config_.hidden_size * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type));
    }
    #endif
}

void MOE::warm_up(Backend* backend) {
    std::vector<float> input_fp32(config_.hidden_size);
    std::vector<uint8_t> input(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
    std::vector<uint8_t> output(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
    for (int i = 0; i < config_.hidden_size; i++) {
        input_fp32[i] = 0;
    }
    from_float(input_fp32.data(), input.data(), config_.hidden_size, config_.hidden_type);
    for (int i = 0; i < config_.expert_num; i++) {
        uint64_t expert_ids = i;
        float weights = 0;
        forward_one(1, &expert_ids, &weights, input.data(), output.data(), backend);
    }
}

static float act_fn(float x) {
    return x / (1.0f + expf(-x));
}

static float act_fn_relu(float x) {
    if(x > 0.0){
        return x;
    } else {
        return 0.0;
    }
}

void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
    const void* gate_input_ptr;
    const void* up_input_ptr;
    if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
        gate_input_ptr = up_input_ptr = input;
    } else {
        to_float(input, s_input_fp32_, config_.hidden_size, config_.hidden_type);
        if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
            from_float(s_input_fp32_, s_gate_input_, config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
            gate_input_ptr = up_input_ptr = s_gate_input_;
        } else {
            if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
                from_float(s_input_fp32_, s_gate_input_, config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                gate_input_ptr = s_gate_input_;
            } else {
                gate_input_ptr = input;
            }
            if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                from_float(s_input_fp32_, s_up_input_, config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
                up_input_ptr = s_up_input_;
            } else {
                up_input_ptr = input;
            }
        }
    }
    int nth = config_.intermediate_size / config_.stride;
    backend->do_work_stealing_job(nth * k, nullptr, [&](int task_id) {
        int expert_idx = task_id / nth;
        uint64_t expert_id = expert_ids[expert_idx];
        int ith = task_id % nth;
        
        #ifdef USE_NUMA
        void* gate_proj_ptr = (uint8_t*)gate_proj_numa_[Backend::numa_node] + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        #else
        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        #endif

        float* gate_output_ptr = s_gate_output_[expert_idx] + ith * config_.stride;
        llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);

        #ifdef USE_NUMA
        void* up_proj_ptr = (uint8_t*)up_proj_numa_[Backend::numa_node] + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        #else
        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        #endif

        float* up_output_ptr = s_up_output_[expert_idx] + ith * config_.stride;
        llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        if(config_.use_silu){
            // use silu as act fn
            for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
                s_intermediate_fp32_[expert_idx][i] = act_fn(s_gate_output_[expert_idx][i]) * s_up_output_[expert_idx][i];
            }
        } else {
            // use relu as act fn
            for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
                s_intermediate_fp32_[expert_idx][i] = act_fn_relu(s_gate_output_[expert_idx][i]) * s_up_output_[expert_idx][i];
            }
        }
        if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
            float* intermediate_fp32_ptr = s_intermediate_fp32_[expert_idx] + ith * config_.stride;
            void* down_input_ptr = s_down_input_[expert_idx] + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
            from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
        }
    }, nullptr);
    if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
        for (int i = 0; i < k; i++) {
            from_float(s_intermediate_fp32_[i], s_down_input_[i], config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
        }
    }
    nth = config_.hidden_size / config_.stride;
    backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
        int ith = task_id;
        for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
            s_output_fp32_[i] = 0;
        }
        for (int expert_idx = 0; expert_idx < k; expert_idx++) {
            uint64_t expert_id = expert_ids[expert_idx];

            #ifdef USE_NUMA
            void* down_proj_ptr = (uint8_t*)down_proj_numa_[Backend::numa_node] + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
            #else
            void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
            #endif
            
            float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.stride;
            llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
            for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
                s_output_fp32_[i] += s_down_output_[expert_idx][i] * weights[expert_idx];
            }
        }
        if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
            float* output_fp32_ptr = s_output_fp32_ + ith * config_.stride;
            void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
            from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
        }
    }, nullptr);
    if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
        from_float(s_output_fp32_, output, config_.hidden_size, config_.hidden_type);
    }
}

void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
    for (int i = 0; i < config_.expert_num; i++) {
        m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
        for (int j = 0; j < k; j++) {
            m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
        }
    }
    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
        m_local_gate_input_ptr_[i] = m_local_gate_input_ + offset * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
        m_local_up_input_ptr_[i] = m_local_up_input_ + offset * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
        m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
        m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
        m_local_intermediate_fp32_ptr_[i] = m_local_intermediate_fp32_ + offset * config_.intermediate_size;
        m_local_down_input_ptr_[i] = m_local_down_input_ + offset * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
        m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
        offset += m_local_num_[i];
    }
    backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
        const void* gate_input_ptr;
        const void* up_input_ptr;
        if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
            gate_input_ptr = up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
        } else {
            to_float((uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
            if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                gate_input_ptr = up_input_ptr = m_gate_input_[i];
            } else {
                if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
                    from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                    gate_input_ptr = m_gate_input_[i];
                } else {
                    gate_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
                }
                if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                    from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
                    up_input_ptr = m_up_input_[i];
                } else {
                    up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
                }
            }
        }
        for (int j = 0; j < k; j++) {
            memcpy(m_local_gate_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type), gate_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type));
            memcpy(m_local_up_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type), up_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type));
        }
    }, nullptr);
    int stride = QK_K;
    int nth = config_.intermediate_size / stride;
    backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
        uint64_t expert_idx = task_id / nth;
        int ith = task_id % nth;
        void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];

        #ifdef USE_NUMA
        void* gate_proj_ptr = (uint8_t*)gate_proj_numa_[Backend::numa_node] + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        #else
        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        #endif

        float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * stride;
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        void* up_input_ptr = m_local_up_input_ptr_[expert_idx];

        #ifdef USE_NUMA
        void* up_proj_ptr = (uint8_t*)up_proj_numa_[Backend::numa_node] + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        #else
        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        #endif

        float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * stride;
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        for (int i = 0; i < m_local_num_[expert_idx]; i++) {
            if(config_.use_silu){
                for (int j = ith * stride; j < (ith + 1) * stride; j++) {
                    m_local_intermediate_fp32_ptr_[expert_idx][i * config_.intermediate_size + j] = act_fn(m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size + j]) * m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size + j];
                }
            } else {
                for (int j = ith * stride; j < (ith + 1) * stride; j++) {
                    m_local_intermediate_fp32_ptr_[expert_idx][i * config_.intermediate_size + j] = act_fn_relu(m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size + j]) * m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size + j];
                }
            }
            float* intermediate_fp32_ptr = m_local_intermediate_fp32_ptr_[expert_idx] + i * config_.intermediate_size + ith * stride;
            void* down_input_ptr = m_local_down_input_ptr_[expert_idx] + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
            from_float(intermediate_fp32_ptr, down_input_ptr, stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
        }
    }, nullptr);
    stride = QK_K;
    nth = config_.hidden_size / stride;
    backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
        uint64_t expert_idx = task_id / nth;
        int ith = task_id % nth;
        void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
        
        #ifdef USE_NUMA
        void* down_proj_ptr = (uint8_t*)down_proj_numa_[Backend::numa_node] + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
        #else
        void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
        #endif

        float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
    }, nullptr);
    backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
        for (int e = 0; e < config_.hidden_size; e++) {
            m_output_fp32_[i][e] = 0;
        }
        for (int j = 0; j < k; j++) {
            for (int e = 0; e < config_.hidden_size; e++) {
                m_output_fp32_[i][e] += m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] * weights[i * k + j];
            }
        }
        from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
    }, nullptr);
}

void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, int* batch_size_tensor, Backend* backend) {
    qlen = batch_size_tensor[0];
    if (qlen < config_.group_min_len) {
        for (int i = 0; i < qlen; i++) {
            forward_one(k, expert_ids + i * k, weights + i * k, (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
        }
        return;
    }
    int forward_len = std::min(config_.group_max_len, qlen);
    forward_many(forward_len, k, expert_ids, weights, input, output, backend);

    batch_size_tensor[0] -= forward_len;
    forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), batch_size_tensor, backend);
}

================================================
FILE: archive/csrc/ktransformers_ext/operators/llamafile/moe.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:22
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:35:10
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_MOE_H
#define CPUINFER_OPERATOR_MOE_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

struct MOEConfig {
    int expert_num;
    int routed_expert_num;
    int hidden_size;
    int intermediate_size;
    int stride;
    int group_min_len;
    int group_max_len;
    bool use_silu;
    void* gate_proj;
    void* up_proj;
    void* down_proj;
    ggml_type gate_type;
    ggml_type up_type;
    ggml_type down_type;
    ggml_type hidden_type;

    MOEConfig() {}

    MOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int stride, int group_min_len, int group_max_len, bool use_silu, void* gate_proj, void* up_proj, void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type)
        : expert_num(expert_num), routed_expert_num(routed_expert_num), hidden_size(hidden_size), intermediate_size(intermediate_size), stride(stride), group_min_len(group_min_len), group_max_len(group_max_len), use_silu(use_silu), gate_proj(gate_proj), up_proj(up_proj), down_proj(down_proj), gate_type(gate_type), up_type(up_type), down_type(down_type), hidden_type(hidden_type) {}
};

class MOE {
   public:
    MOE(MOEConfig);
    ~MOE();
    void warm_up(Backend* backend);
    void forward_one(int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
    void forward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
    void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, int* batch_size_tensor, Backend* backend);

   private:
    MOEConfig config_;
    void* gate_proj_;  // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    void* up_proj_;    // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    void* down_proj_;  // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]

    #ifdef USE_NUMA
    std::vector<void*> gate_proj_numa_;  // [numa_num, expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    std::vector<void*> up_proj_numa_;    // [numa_num, expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    std::vector<void*> down_proj_numa_;  // [numa_num, expert_num * hidden_size * intermediate_size ( /32 if quantized)]
    #endif

    float* s_input_fp32_;                      // [hidden_size]
    uint8_t* s_gate_input_;                    // [hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
    uint8_t* s_up_input_;                      // [hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
    std::vector<float*> s_gate_output_;        // [routed_expert_num, intermediate_size]
    std::vector<float*> s_up_output_;          // [routed_expert_num, intermediate_size]
    std::vector<float*> s_intermediate_fp32_;  // [routed_expert_num, intermediate_size]
    std::vector<uint8_t*> s_down_input_;       // [routed_expert_num, intermediate_size * ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
    std::vector<float*> s_down_output_;        // [routed_expert_num, hidden_size]
    float* s_output_fp32_;                     // [hidden_size]

    std::vector<float*> m_input_fp32_;    // [group_max_len, hidden_size]
    std::vector<uint8_t*> m_gate_input_;  // [group_max_len, hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
    std::vector<uint8_t*> m_up_input_;    // [group_max_len, hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
    uint8_t* m_local_gate_input_;         // [routed_expert_num * group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
    uint8_t* m_local_up_input_;           // [routed_expert_num * group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
    float* m_local_gate_output_;          // [routed_expert_num * group_max_len * intermediate_size]
    float* m_local_up_output_;            // [routed_expert_num * group_max_len * intermediate_size]
    float* m_local_intermediate_fp32_;    // [routed_expert_num * group_max_len * intermediate_size]
    uint8_t* m_local_down_input_;         // [routed_expert_num * group_max_len * intermediate_size * ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
    float* m_local_down_output_;          // [routed_expert_num * group_max_len * hidden_size]
    std::vector<float*> m_output_fp32_;   // [group_max_len, hidden_size]

    std::vector<std::vector<int>> m_local_pos_;          // [group_max_len, routed_expert_num]
    std::vector<int> m_local_num_;                       // [expert_num]
    std::vector<uint8_t*> m_local_gate_input_ptr_;       // [expert_num]
    std::vector<uint8_t*> m_local_up_input_ptr_;         // [expert_num]
    std::vector<float*> m_local_gate_output_ptr_;        // [expert_num]
    std::vector<float*> m_local_up_output_ptr_;          // [expert_num]
    std::vector<float*> m_local_intermediate_fp32_ptr_;  // [expert_num]
    std::vector<uint8_t*> m_local_down_input_ptr_;       // [expert_num]
    std::vector<float*> m_local_down_output_ptr_;        // [expert_num]
};

#endif

================================================
FILE: archive/csrc/ktransformers_ext/vendors/cuda.h
================================================
#pragma once

#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>

#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020


================================================
FILE: archive/csrc/ktransformers_ext/vendors/hip.h
================================================
#pragma once

#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bfloat16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__

#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F  HIPBLAS_R_16F
#define CUDA_R_32F  HIPBLAS_R_32F
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cublasOperation_t hipblasOperation_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cuDeviceGet hipDeviceGet
#define CUdevice hipDevice_t
#define CUdeviceptr hipDeviceptr_t
#define cuMemUnmap hipMemUnmap
#define CUmemAccessDesc hipMemAccessDesc
#define cuMemAddressFree hipMemAddressFree
#define cuMemRelease hipMemRelease
#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
#define cuMemCreate hipMemCreate
#define cuMemAddressReserve hipMemAddressReserve
#define cuMemMap hipMemMap
#define cuMemSetAccess hipMemSetAccess
#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
#define CUmemAllocationProp hipMemAllocationProp
#define cuDeviceGetAttribute hipDeviceGetAttribute
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaGraphExec_t hipGraphExec_t
#define cudaGraphNode_t hipGraphNode_t
#define cudaKernelNodeParams hipKernelNodeParams
#define cudaKernelNodeParams hipKernelNodeParams
#define cudaGraphExecDestroy hipGraphExecDestroy
#define cudaGraphLaunch hipGraphLaunch
#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
#define cudaGraphNodeType hipGraphNodeType
#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
#define cudaGraphInstantiate hipGraphInstantiate
#define cudaStreamEndCapture hipStreamEndCapture
#define cudaGraphDestroy hipGraphDestroy
#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
#define cudaGraphNodeGetType hipGraphNodeGetType
#define cudaGraphGetNodes hipGraphGetNodes
#define cudaGraphExecUpdate hipGraphExecUpdate
#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
#define cudaStreamBeginCapture hipStreamBeginCapture
#define cudaGraph_t hipGraph_t
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define cudaHostFn_t hipHostFn_t
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED

#define __CUDA_ARCH__ 1300

#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
#define GCN
#endif

#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
#define CDNA
#endif

#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif

#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif

#if defined(__gfx1010__) || defined(__gfx1012__)
#define RDNA1
#endif

#ifndef __has_builtin
    #define __has_builtin(x) 0
#endif

typedef hip_bfloat16 nv_bfloat16;


================================================
FILE: archive/csrc/ktransformers_ext/vendors/musa.h
================================================
#pragma once

#include <musa_runtime.h>
#include <musa.h>
#include <mublas.h>
#include <musa_bf16.h>
#include <musa_fp16.h>
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
#define CUDA_R_16F  MUSA_R_16F
#define CUDA_R_32F  MUSA_R_32F
#define cublasComputeType_t cudaDataType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy
#define cublasGemmEx mublasGemmEx
#define cublasGemmBatchedEx mublasGemmBatchedEx
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
#define cublasHandle_t mublasHandle_t
#define cublasSetMathMode mublasSetMathMode
#define cublasSetStream mublasSetStream
#define cublasSgemm mublasSgemm
#define cublasStatus_t mublasStatus_t
#define cublasOperation_t mublasOperation_t
#define cublasGetStatusString mublasStatus_to_string
#define cudaDataType_t musaDataType_t
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags
#define cudaEventDisableTiming musaEventDisableTiming
#define cudaEventRecord musaEventRecord
#define cudaEventSynchronize musaEventSynchronize
#define cudaEvent_t musaEvent_t
#define cudaEventDestroy musaEventDestroy
#define cudaFree musaFree
#define cudaFreeHost musaFreeHost
#define cudaGetDevice musaGetDevice
#define cudaGetDeviceCount musaGetDeviceCount
#define cudaGetDeviceProperties musaGetDeviceProperties
#define cudaGetErrorString musaGetErrorString
#define cudaGetLastError musaGetLastError
#define cudaHostRegister musaHostRegister
#define cudaHostRegisterPortable musaHostRegisterPortable
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
#define cudaHostUnregister musaHostUnregister
#define cudaLaunchHostFunc musaLaunchHostFunc
#define cudaMalloc musaMalloc
#define cudaMallocHost musaMallocHost
#define cudaMallocManaged musaMallocManaged
#define cudaMemcpy musaMemcpy
#define cudaMemcpyAsync musaMemcpyAsync
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
#define cudaMemcpy2DAsync musaMemcpy2DAsync
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
#define cudaMemcpyKind musaMemcpyKind
#define cudaMemset musaMemset
#define cudaMemsetAsync musaMemsetAsync
#define cudaMemGetInfo musaMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
#define cudaSetDevice musaSetDevice
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
#define cudaStreamDestroy musaStreamDestroy
#define cudaStreamFireAndForget musaStreamFireAndForget
#define cudaStreamNonBlocking musaStreamNonBlocking
#define cudaStreamPerThread musaStreamPerThread
#define cudaStreamSynchronize musaStreamSynchronize
#define cudaStreamWaitEvent musaStreamWaitEvent
#define cudaStream_t musaStream_t
#define cudaSuccess musaSuccess

// Additional mappings for MUSA virtual memory pool
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
#define CUdevice MUdevice
#define CUdeviceptr MUdeviceptr
#define CUmemAccessDesc MUmemAccessDesc
#define CUmemAllocationProp MUmemAllocationProp
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
#define cuDeviceGet muDeviceGet
#define cuDeviceGetAttribute muDeviceGetAttribute
#define cuMemAddressFree muMemAddressFree
#define cuMemAddressReserve muMemAddressReserve
#define cuMemCreate muMemCreate
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
#define cuMemMap muMemMap
#define cuMemRelease muMemRelease
#define cuMemSetAccess muMemSetAccess
#define cuMemUnmap muMemUnmap
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
#define cudaFuncSetAttribute musaFuncSetAttribute
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
#define make_cudaExtent make_musaExtent
#define make_cudaPitchedPtr make_musaPitchedPtr

// Additional mappings for MUSA graphs
#define CUDA_SUCCESS MUSA_SUCCESS
#define CUresult MUresult
#define cuGetErrorString muGetErrorString
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
#define cudaGraphDestroy musaGraphDestroy
#define cudaGraphExecDestroy musaGraphExecDestroy
#define cudaGraphExec_t musaGraphExec_t
#define cudaGraphExecUpdate musaGraphExecUpdate
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
#define cudaGraphGetNodes musaGraphGetNodes
#define cudaGraphInstantiate musaGraphInstantiate
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
#define cudaGraphLaunch musaGraphLaunch
#define cudaGraphNodeGetType musaGraphNodeGetType
#define cudaGraphNode_t musaGraphNode_t
#define cudaGraphNodeType musaGraphNodeType
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
#define cudaGraph_t musaGraph_t
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture

typedef mt_bfloat16 nv_bfloat16;


================================================
FILE: archive/csrc/ktransformers_ext/vendors/vendor.h
================================================
#ifndef CPUINFER_VENDOR_VENDOR_H
#define CPUINFER_VENDOR_VENDOR_H

#ifdef USE_CUDA
#include "cuda.h"
#elif USE_HIP
#define __HIP_PLATFORM_AMD__
#include "hip.h"
#elif USE_MUSA
#include "musa.h"
#endif

#endif  // CPUINFER_VENDOR_VENDOR_H

================================================
FILE: archive/install-with-cache.sh
================================================
#!/bin/bash
set -e  

# clear build dirs
# rm -rf build
# rm -rf *.egg-info
# rm -rf csrc/build
# rm -rf csrc/ktransformers_ext/build
# rm -rf csrc/ktransformers_ext/cuda/build
# rm -rf csrc/ktransformers_ext/cuda/dist
# rm -rf csrc/ktransformers_ext/cuda/*.egg-info
rm -rf ~/.ktransformers
echo "Installing python dependencies from requirements.txt"
pip install -r requirements-local_chat.txt
pip install -r ktransformers/server/requirements.txt
echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE USE_BALANCE_SERVE=1 pip install -v . --no-build-isolation
pip install third_party/custom_flashinfer/ -v

# SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
# echo "Copying thirdparty libs to $SITE_PACKAGES"
# cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
# patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*


echo "Installation completed successfully"


================================================
FILE: archive/install.bat
================================================
@echo off

REM clear build dirs
rmdir /S /Q ktransformers\ktransformers_ext\build
rmdir /S /Q ktransformers\ktransformers_ext\cuda\build
rmdir /S /Q ktransformers\ktransformers_ext\cuda\dist
rmdir /S /Q ktransformers\ktransformers_ext\out
del /F /Q ktransformers\ktransformers_ext\cuda\*.egg-info

echo Installing python dependencies from requirements.txt
pip install -r requirements-local_chat.txt

echo Installing ktransformers
set KTRANSFORMERS_FORCE_BUILD=TRUE
pip install . --no-build-isolation
echo Installation completed successfully

================================================
FILE: archive/install.sh
================================================
#!/bin/bash
set -e  

# default backend
DEV="cuda"

# parse --dev argument
while [[ "$#" -gt 0 ]]; do
    case $1 in
        --dev) DEV="$2"; shift ;;
        *) echo "Unknown parameter passed: $1"; exit 1 ;;
    esac
    shift
done
export DEV_BACKEND="$DEV"
echo "Selected backend: $DEV_BACKEND"

# clear build dirs
rm -rf build
rm -rf *.egg-info
rm -rf csrc/build
rm -rf csrc/ktransformers_ext/build
rm -rf csrc/ktransformers_ext/cuda/build
rm -rf csrc/ktransformers_ext/cuda/dist
rm -rf csrc/ktransformers_ext/cuda/*.egg-info
rm -rf ~/.ktransformers
echo "Installing python dependencies from requirements.txt"
pip install -r requirements-local_chat.txt
pip install -r ktransformers/server/requirements.txt

echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE pip install -v . --no-build-isolation

if [[ "$DEV_BACKEND" == "cuda" ]]; then
    echo "Installing custom_flashinfer for CUDA backend"
    pip install third_party/custom_flashinfer/
fi
# SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
# echo "Copying thirdparty libs to $SITE_PACKAGES"
# cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
# patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*

echo "Installation completed successfully"


================================================
FILE: archive/ktransformers/__init__.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  : 
Author       : kkk1nak0
Date         : 2024-08-15 07:34:46
Version      : 1.0.0
LastEditors  : chenxl 
LastEditTime : 2025-02-15 03:53:02
'''
__version__ = "0.4.1"


================================================
FILE: archive/ktransformers/configs/config.yaml
================================================
log:
  dir: "logs"
  file: "lexllama.log"
  #log level: debug, info, warn, error, crit
  level: "debug"
  backup_count: -1

server:
  ip: 0.0.0.0
  port: 10002

db:
  type: "sqllite"
  database: "server.db"
  host: "./"
  pool_size: 10

user:
  secret_key: "981f1dd2a44e27d68759d0252a486568ed43480b4e616a26e3af3709c3a7ce73"
  algorithm: "HS256"

model:
  # type: transformers
  type: balance_serve
  # type: ktransformers

  name: DeepSeek-Coder-V2-Instruct
  path: deepseek-ai/DeepSeek-V2-Lite-Chat
  gguf_path: /mnt/data/models/Smallthinker-21B

  device: cuda:0
  cache_lens: 16384
  max_new_tokens: 500
web:
  mount: False
  open_cross_domain: True

ext:
  cpu_infer: 10

long_context:
  max_seq_len: 32000
  block_size: 128
  local_windows_len: 4096
  second_select_num: 32
  anchor_type: DYNAMIC
  kv_type: FP16
  dense_layer_num: 2
  anchor_num: 1
  preselect_block: True
  head_select_mode: SHARED
  preselect_block_count: 32
  layer_step: 1
  token_step: 

local_chat:
  prompt_file: ""

async_server:
  sched_strategy: "FCFS"
  sched_port: 56441
  sched_metrics_port: 54321
  kvc2_metrics_port: 54391
  max_batch_size: 4  # decode count + prefill count, in one mini batch

attn:
  page_size: 256
  chunk_size: 256
kvc2:
  gpu_only: true 
  utilization_percentage: 1.0
  cpu_memory_size_GB: 500
  disk_path: /home/wjh/kvc

================================================
FILE: archive/ktransformers/configs/log_config.ini
================================================
[loggers]
keys=root,uvicorn,uvicornError,uvicornAccess

[handlers]
keys=consoleHandler,fileHandler

[formatters]
keys=detailedFormatter

[logger_root]
level=INFO
handlers=consoleHandler

[logger_uvicorn]
level=INFO
handlers=consoleHandler,fileHandler
qualname=uvicorn
propagate=0

[logger_uvicornError]
level=ERROR
handlers=consoleHandler,fileHandler
qualname=uvicorn.error
propagate=0

[logger_uvicornAccess]
level=INFO
handlers=consoleHandler,fileHandler
qualname=uvicorn.access
propagate=0

[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=detailedFormatter
args=(sys.stdout,)

[handler_fileHandler]
class=logging.FileHandler
level=INFO
formatter=detailedFormatter
args=('uvicorn_logs.log', 'a')

[formatter_detailedFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s
datefmt=%Y-%m-%d %H:%M:%S


================================================
FILE: archive/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/__init__.py
================================================


================================================
FILE: archive/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/format_24.py
================================================
#
# Modified by Roberto Lopez Castro (roberto.lopez.castro@udc.es).
#

import torch


# This is PyTorch implementation of main part of reorder_meta()
# function, from tools/util/include/cutlass/util/host_reorder.h file
# of CUTLASS source tree.  Furthermore, CUTLASS template for sparse
# GEMM decides upon layout of this matrix, and at the moment for the
# sparse GEMM executed on tensor cores, this is layout described by
# ColumnMajorInterleaved<2> data structure, in
# include/cutlass/layout/matrix.h of CUTLASS source tree.  The
# reordering of meta matrix into meta_reordered matrix calculated
# according to these segments of CUTLASS code is re-implemented here.
# Note that this calculation produces offsets for scattering metadata
# matrix elements into reordered metadata matrix elements (or,
# equivalently, for gathering reordered metadata matrix element back
# into metadata matrix elements).
def _calculate_meta_reordering_scatter_offsets(m, meta_ncols, meta_dtype,
                                               device):
    dst_rows = torch.arange(0, m, device=device)[:, None].repeat(1, meta_ncols)
    dst_cols = torch.arange(0, meta_ncols, device=device).repeat(m, 1)

    # Reorder the rows, then swizzle the 2x2 blocks.
    group_x = 64
    group_y = 32 if meta_dtype.itemsize == 2 else 16

    dst_rows = (dst_rows // group_x * group_x + (dst_rows % 2) * 2 +
                (dst_rows % 8) // 4 + ((dst_rows % group_y) % 4) // 2 * 32 +
                ((dst_rows % group_x) // 8) * 4)

    topright = ((dst_rows % 2 == 0) & (dst_cols % 2 == 1)).to(torch.int8)
    bottomleft = ((dst_rows % 2 == 1) & (dst_cols % 2 == 0)).to(torch.int8)
    dst_rows += topright - bottomleft
    dst_cols -= topright - bottomleft

    # Assumed that meta tensor is to be stored in CUTLASS
    # InterleavedColumnMajor layout, and reverse engineered
    # corresponding code to store values into this tensor.
    interleave = 2
    cols_maj = dst_cols // interleave
    cols_min = dst_cols % interleave
    return (cols_maj * m * interleave + dst_rows * interleave +
            cols_min).view(-1)


# This function converts dense matrix into sparse semi-structured
# representation, producing "compressed" matrix, in the layout used by
# CUTLASS backend, and corresponding metadata matrix.
def sparse_semi_structured_from_dense_cutlass(dense):
    if dense.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional dense tensor, got {dense.dim()}-dimensional tensor"  # noqa: E501
        )

    m, k = dense.shape
    device = dense.device

    meta_dtype = torch.int8
    if dense.dtype == torch.int8:
        meta_dtype = torch.int32
    elif dense.dtype in [torch.half, torch.bfloat16, torch.float, torch.int32]:
        meta_dtype = torch.int16
    else:
        raise RuntimeError(f"Invalid datatype {dense.dtype} of dense matrix")
    quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4
    if quadbits_per_meta_elem not in (4, 8):
        raise RuntimeError(
            "Invalid number of elements per meta element calculated")

    if meta_dtype == torch.int32:
        if m % 16 != 0:
            raise RuntimeError(
                f"Number of rows of dense matrix {m} must be divisible by 16")
    else:
        if m % 32 != 0:
            raise RuntimeError(
                f"Number of rows of dense matrix {m} must be divisible by 32")
    if k % (4 * quadbits_per_meta_elem) != 0:
        raise RuntimeError(
            f"Number of columns of dense matrix {k} must be divisible by {4 * quadbits_per_meta_elem}"  # noqa: E501
        )

    if dense.dtype != torch.float:
        ksparse = 4
        dense_4 = dense.view(-1, k // ksparse, ksparse)
        m0, m1, m2, m3 = (dense_4 != 0).unbind(-1)
    else:
        ksparse = 2
        dense_2 = dense.view(-1, k // ksparse, ksparse)
        m0, m2 = m1, m3 = (dense_2 != 0).unbind(-1)
    meta_ncols = k // (ksparse * quadbits_per_meta_elem)

    # Encoding quadruples of True/False values as follows:
    #     [True,  True,  False, False] -> 0b0100
    #     [True,  False, True,  False] -> 0b1000
    #     [False, True,  True,  False] -> 0b1001
    #     [True,  False, False, True ] -> 0b1100
    #     [False, True,  False, True ] -> 0b1101
    #     [False, False, True,  True ] -> 0b1110
    # Thus, lower two bits in the encoding are index of the True value
    # at the lowest index in the quadruple, and the higher two bits in
    # the encoding are index of the other True value in the quadruple.
    # In case there are less than two True values, than False value or
    # values at some index or indices are considered True for the
    # encoding.  In case there are more than two True values, then the
    # excess True value(s) at some indices are considered False for
    # the encoding.  The exact encodings used for these cases are as
    # follows:
    #     [False, False, False, False] -> 0b1110
    #     [False, False, False, True ] -> 0b1110
    #     [False, False, True,  False] -> 0b1110
    #     [False, True,  False, False] -> 0b1001
    #     [False, True,  True,  True ] -> 0b1101
    #     [True,  False, False, False] -> 0b1000
    #     [True,  False, True,  True ] -> 0b1100
    #     [True,  True,  False, True ] -> 0b0100
    #     [True,  True,  True,  False] -> 0b0100
    #     [True,  True,  True,  True ] -> 0b0100
    # These particular encodings are chosen, with the help of Espresso
    # logic minimizer software, for the purpose of minimization of
    # corresponding Boolean functions, that translate non-zero flags
    # into encoding bits.  Note also possible choices for the first
    # and last of these encodings were limited only to (0b0100,
    # 0b1110), in order to produce valid encodings for 1:2 sparsity
    # case.

    expr0 = m0 & m1
    expr1 = ~m0 & m1
    expr2 = ~m0 & ~m1
    bit0 = expr1
    bit1 = expr2
    bit2 = expr0 | expr2 | m3
    bit3 = expr1 | ~m1
    idxs0 = bit0 | (bit1.to(torch.int64) << 1)
    idxs1 = bit2 | (bit3.to(torch.int64) << 1)

    if dense.dtype != torch.float:
        sparse0 = dense_4.gather(
            -1, idxs0.unsqueeze(-1))  # type: ignore[possibly-undefined]
        sparse1 = dense_4.gather(-1, idxs1.unsqueeze(-1))
        sparse = torch.stack((sparse0, sparse1), dim=-1).view(m, k // 2)
    else:
        sparse = dense_2.gather(-1,
                                idxs0.unsqueeze(-1) // 2).view(
                                    m,
                                    k // 2)  # type: ignore[possibly-undefined]

    meta_4 = idxs0 | (idxs1 << 2)
    meta_n = meta_4.view(
        (-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype)

    if quadbits_per_meta_elem == 4:
        meta = (meta_n[:, :, 0]
                | (meta_n[:, :, 1] << 4)
                | (meta_n[:, :, 2] << 8)
                | (meta_n[:, :, 3] << 12))
    elif quadbits_per_meta_elem == 8:
        meta = (meta_n[:, :, 0]
                | (meta_n[:, :, 1] << 4)
                | (meta_n[:, :, 2] << 8)
                | (meta_n[:, :, 3] << 12)
                | (meta_n[:, :, 4] << 16)
                | (meta_n[:, :, 5] << 20)
                | (meta_n[:, :, 6] << 24)
                | (meta_n[:, :, 7] << 28))

    # Reorder meta tensor elements.
    meta_reordered = meta.new_empty(
        (m * meta_ncols, ))  # type: ignore[possibly-undefined]
    meta_offsets = _calculate_meta_reordering_scatter_offsets(
        m, meta_ncols, meta_dtype, device)
    meta_reordered.scatter_(0, meta_offsets, meta.view(-1))

    return (sparse, meta_reordered.view(m, meta_ncols))


# This function performs reverse of the function above - it
# reconstructs dense matrix from a pair of "compressed" matrix, given
# in the layout used by CUTLASS backend, and accompanying metadata
# matrix.
def sparse_semi_structured_to_dense_cutlass(sparse, meta_reordered):
    if sparse.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional sparse tensor, got {sparse.dim()}-dimensional tensor"  # noqa: E501
        )

    m, k = sparse.shape
    device = sparse.device

    if meta_reordered.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional meta tensor, got {meta_reordered.dim()}-dimensional tensor"  # noqa: E501
        )
    if meta_reordered.device != device:
        raise RuntimeError(
            f"Expected meta matrix to be on {device} device, got matrix on {meta_reordered.device} device"  # noqa: E501
        )

    meta_dtype = meta_reordered.dtype
    if meta_dtype not in (torch.int16, torch.int32):
        raise RuntimeError(f"Invalid datatype {meta_dtype} of meta matrix")
    quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4

    ksparse = 4 if sparse.dtype != torch.float else 2

    meta_nrows, meta_ncols = meta_reordered.shape
    if meta_nrows != m:
        raise RuntimeError(
            f"Number of rows of meta matrix {meta_nrows} must be equal to number of columns of spase matrix {m}"  # noqa: E501
        )
    if meta_ncols * ksparse * quadbits_per_meta_elem != 2 * k:
        raise RuntimeError(
            f"Number of columns of sparse matrix {k} different from the {meta_ncols * ksparse * quadbits_per_meta_elem // 2}, "  # noqa: E501
            "expected according to the number of columns of meta matrix")

    # Undo meta tensor elements reordering.
    meta_offsets = _calculate_meta_reordering_scatter_offsets(
        m, meta_ncols, meta_dtype, device)
    meta = torch.gather(meta_reordered.view(-1), 0,
                        meta_offsets).view(m, meta_ncols)

    # Unpack sparse tensor back to original dense tensor, using
    # information provided by meta tensor.  Note that torch.float
    # datatype is handled pretty much the same as
    # torch.half/torch.bfloat16, as metadata for a pair of torch.float
    # value is encoded as if underlying 8 bytes contain four
    # torch.half/torch.bfloat16 values, where either first two or last
    # two are zeros.
    meta_2 = torch.empty(
        (m, meta_ncols, 2 * quadbits_per_meta_elem),
        dtype=meta_dtype,
        device=device,
    )
    if quadbits_per_meta_elem == 4:
        meta_2[:, :, 0] = meta & 0b11
        meta_2[:, :, 1] = (meta >> 2) & 0b11
        meta_2[:, :, 2] = (meta >> 4) & 0b11
        meta_2[:, :, 3] = (meta >> 6) & 0b11
        meta_2[:, :, 4] = (meta >> 8) & 0b11
        meta_2[:, :, 5] = (meta >> 10) & 0b11
        meta_2[:, :, 6] = (meta >> 12) & 0b11
        meta_2[:, :, 7] = (meta >> 14) & 0b11
    elif quadbits_per_meta_elem == 8:
        meta_2[:, :, 0] = meta & 0b11
        meta_2[:, :, 1] = (meta >> 2) & 0b11
        meta_2[:, :, 2] = (meta >> 4) & 0b11
        meta_2[:, :, 3] = (meta >> 6) & 0b11
        meta_2[:, :, 4] = (meta >> 8) & 0b11
        meta_2[:, :, 5] = (meta >> 10) & 0b11
        meta_2[:, :, 6] = (meta >> 12) & 0b11
        meta_2[:, :, 7] = (meta >> 14) & 0b11
        meta_2[:, :, 8] = (meta >> 16) & 0b11
        meta_2[:, :, 9] = (meta >> 18) & 0b11
        meta_2[:, :, 10] = (meta >> 20) & 0b11
        meta_2[:, :, 11] = (meta >> 22) & 0b11
        meta_2[:, :, 12] = (meta >> 24) & 0b11
        meta_2[:, :, 13] = (meta >> 26) & 0b11
        meta_2[:, :, 14] = (meta >> 28) & 0b11
        meta_2[:, :, 15] = (meta >> 30) & 0b11

    dense_offsets = meta_2.view(-1) + (
        torch.arange(0, 2 * m * k // ksparse, device=device) * 4).view(
            -1, 1).repeat(1, 2).view(-1)

    dense = torch.zeros((m * 2 * k, ), dtype=sparse.dtype, device=device)
    if sparse.dtype != torch.float:
        # dense.scatter_(0, dense_offsets, sparse.view(-1))
        dense.scatter_(0, dense_offsets, sparse.reshape(-1))
    else:
        dense.view(torch.half).scatter_(0, dense_offsets,
                                        sparse.view(torch.half).view(-1))

    return dense.view(m, 2 * k)


def mask_creator(tensor):
    """
    Class for creating N:M sparsity masks.
    Masks will be created using the N:M ratio, where for every block of 
    M weights, N will be pruned based on ranked weight value. Each mask 
    will correspond to the given tensor.

    :param N: The number of weights in a group to keep
    :param M: The size of a weight group
    """
    N = 2
    M = 4

    mask = None
    # for i, tensor in enumerate(tensors):
    if tensor.numel() % M != 0:
        raise ValueError(
            f"Tensor of size {tensor.shape} can't be evenly divided into "
            f"{M} groups")

    num_groups = tensor.numel() // M

    # N:M sparsity for linear layers
    tensor_temp = tensor.detach().abs().reshape(num_groups, M)
    index = torch.argsort(tensor_temp, dim=1)[:, :int(M - N)]

    w_b = torch.ones(tensor_temp.shape, device=tensor_temp.device)
    mask = w_b.scatter_(dim=1, index=index, value=0).reshape(tensor.shape)

    return mask


================================================
FILE: archive/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/marlin_24_perms.py
================================================
"""This file is used for /tests and /benchmarks"""
from typing import Dict, List

import numpy
import torch


# Precompute permutations for Marlin24 weight and scale shuffling # noqa: E501
#
# Marlin works on [16*2,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501
# with the tensor-core format that is described here:
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
#
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
# (without the need to use ldmatrix instructions) # noqa: E501
def get_perms_24(num_bits: int):
    perm_list: List[int] = []
    for i in range(32):
        perm1: List[int] = []
        col = i // 4
        col_o = col // 2
        for block in [0, 1]:
            for row in [
                    2 * (i % 4),
                    2 * (i % 4) + 1,
                    2 * (i % 4 + 4),
                    2 * (i % 4 + 4) + 1,
            ]:
                perm1.append(16 * row + col_o * 256 + 8 * (col % 2) +
                             4 * block)
        for j in range(4):
            perm_list.extend([p + 1 * j for p in perm1])
    perm = numpy.array(perm_list)

    if num_bits == 4:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    elif num_bits == 8:
        interleave = numpy.array([0, 2, 1, 3])
    else:
        raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits))

    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
    perm = torch.from_numpy(perm)
    scale_perm: List[int] = []
    for i in range(8):
        scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]])
    scale_perm_single: List[int] = []
    for i in range(8):
        scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]])
    return perm, scale_perm, scale_perm_single


marlin_24_perm: Dict[int, torch.Tensor] = {}
marlin_24_scale_perm: Dict[int, List[int]] = {}
marlin_24_scale_perm_single: Dict[int, List[int]] = {}
for num_bits in [4, 8]:
    perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits)
    marlin_24_perm[num_bits] = perm_24
    marlin_24_scale_perm[num_bits] = scale_perm_24
    marlin_24_scale_perm_single[num_bits] = scale_perm_single_24


================================================
FILE: archive/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/marlin_perms.py
================================================
"""This file is used for /tests and /benchmarks"""
from typing import Dict, List

import numpy
import torch


# Precompute permutations for Marlin weight and scale shuffling # noqa: E501
#
# Marlin works on [16,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501
# with the tensor-core format that is described here:
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
#
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
# (without the need to use ldmatrix instructions) # noqa: E501
def get_perms(num_bits: int):
    perm_list: List[int] = []
    for i in range(32):
        perm1: List[int] = []
        col = i // 4
        for block in [0, 1]:
            for row in [
                    2 * (i % 4),
                    2 * (i % 4) + 1,
                    2 * (i % 4 + 4),
                    2 * (i % 4 + 4) + 1,
            ]:
                perm1.append(16 * row + col + 8 * block)
        for j in range(4):
            perm_list.extend([p + 256 * j for p in perm1])

    perm = numpy.array(perm_list)

    if num_bits == 4:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    elif num_bits == 8:
        interleave = numpy.array([0, 2, 1, 3])
    else:
        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))

    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
    perm = torch.from_numpy(perm)
    scale_perm: List[int] = []
    for i in range(8):
        scale_perm.extend([i + 8 * j for j in range(8)])
    scale_perm_single: List[int] = []
    for i in range(4):
        scale_perm_single.extend(
            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
    return perm, scale_perm, scale_perm_single


marlin_perm: Dict[int, torch.Tensor] = {}
marlin_scale_perm: Dict[int, List[int]] = {}
marlin_scale_perm_single: Dict[int, List[int]] = {}
for num_bits in [4, 8]:
    perm, scale_perm, scale_perm_single = get_perms(num_bits)
    marlin_perm[num_bits] = perm
    marlin_scale_perm[num_bits] = scale_perm
    marlin_scale_perm_single[num_bits] = scale_perm_single


================================================
FILE: archive/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/marlin_utils.py
================================================
"""This file is used for /tests and /benchmarks"""
import random

import numpy
import torch

from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.format_24 import (
    mask_creator, sparse_semi_structured_from_dense_cutlass)
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_24_perms import (
    marlin_24_perm, marlin_24_scale_perm, marlin_24_scale_perm_single)
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_perms import (
    marlin_perm, marlin_scale_perm, marlin_scale_perm_single)
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.quant_utils import (
    get_pack_factor, quantize_weights, sort_weights)

__cuda_arch = torch.cuda.get_device_capability()

MARLIN_TILE = 16

GPTQ_MARLIN_TILE = 16
GPTQ_MARLIN_MIN_THREAD_N = 64
GPTQ_MARLIN_MIN_THREAD_K = 128
GPTQ_MARLIN_MAX_PARALLEL = 16

GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4, 8]
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
GPTQ_MARLIN_SUPPORTED_SYM = [True]

def is_marlin_supported():
    return __cuda_arch[0] >= 8


def marlin_permute_weights(q_w, size_k, size_n, perm, tile=MARLIN_TILE):
    assert q_w.shape == (size_k, size_n)
    assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}"
    assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}"

    # Permute weights to 16x64 marlin tiles
    q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile))
    q_w = q_w.permute((0, 2, 1, 3))
    q_w = q_w.reshape((size_k // tile, size_n * tile))

    q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape)

    return q_w


def marlin_weights(q_w, size_k, size_n, num_bits, perm):
    # Permute
    q_w = marlin_permute_weights(q_w, size_k, size_n, perm)

    # Pack
    pack_factor = get_pack_factor(num_bits)
    orig_device = q_w.device

    q_w = q_w.cpu().numpy().astype(numpy.uint32)

    q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor),
                           dtype=numpy.uint32)
    for i in range(pack_factor):
        q_packed |= q_w[:, i::pack_factor] << num_bits * i

    q_packed = torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device)

    return q_packed


def marlin_permute_scales(s, size_k, size_n, group_size, scale_perm,
                          scale_perm_single):
    if group_size < size_k and group_size != -1:
        s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
    else:
        s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
    s = s.reshape((-1, size_n)).contiguous()

    return s


def marlin_quantize(
    w: torch.Tensor,
    num_bits: int,
    group_size: int,
    act_order: bool,
):
    size_k, size_n = w.shape

    # Normalize group_size
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    # Quantize (and apply act_order if provided)
    q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size,
                                                       act_order)

    # For act_order, sort the "weights" and "g_idx" so that group ids are
    # increasing
    sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
    if act_order:
        q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)

    # Reformat to marlin
    marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits,
                                marlin_perm[num_bits])
    marlin_s = marlin_permute_scales(s, size_k, size_n, group_size,
                                     marlin_scale_perm[num_bits],
                                     marlin_scale_perm_single[num_bits])

    # Create result
    res_list = [marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
    for i in range(len(res_list)):
        res_list[i] = res_list[i].to(w.device)

    return res_list


def vllm_marlin_quantize(
    w: torch.Tensor,
    num_bits: int,
    group_size: int,
    act_order: bool,
):
    size_k, size_n = w.shape

    # Normalize group_size
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    # Quantize (and apply act_order if provided)
    w_ref, q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size,
                                                       act_order)

    # For act_order, sort the "weights" and "g_idx" so that group ids are
    # increasing
    sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
    if act_order:
        q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)

    # Reformat to marlin
    marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits,
                                marlin_perm[num_bits])
    marlin_s = marlin_permute_scales(s, size_k, size_n, group_size,
                                     marlin_scale_perm[num_bits],
                                     marlin_scale_perm_single[num_bits])

    # Create result
    res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
    for i in range(len(res_list)):
        res_list[i] = res_list[i].to(w.device)

    return res_list


def inject_24(w, size_k, size_n):
    assert w.shape == (size_k, size_n)

    mask = mask_creator(w.t()).t().cuda().bool()

    return (mask * w).contiguous(), mask.contiguous()


def check_24(w, num_rows_to_sample=50, _verbose=False):
    BLOCK_SIZE = 4
    MAX_NON_ZEROS = 2

    w = w.t().contiguous()

    print("check_24: w.shape = {}".format(w.shape))

    num_rows, num_cols = w.shape
    sampled_row_idxs = random.choices(range(num_rows), k=num_rows_to_sample)
    if _verbose:
        print(f"Sampled row idxs = {sampled_row_idxs}")

    total_segments = 0
    non_24_segments = 0
    for i in sampled_row_idxs:
        for j in range(0, num_cols - BLOCK_SIZE, BLOCK_SIZE):
            total_segments += 1
            block = w[i, j:j + BLOCK_SIZE]
            num_nonzero = torch.count_nonzero(block)
            if num_nonzero > MAX_NON_ZEROS:
                print("i = {} j = {} block = {}".format(i, j, block))
                non_24_segments += 1

    print(f"{non_24_segments} / {total_segments} do not have 2:4 structure.")


def compress_quantized_24_weight(q_24, size_k, size_n, num_bits):
    assert q_24.shape == (size_k, size_n)

    # Remove zp to normalize over 0
    max_q_val = (1 << num_bits) - 1
    zp = (max_q_val + 1) // 2
    q_24_no_zp = q_24 - zp

    # Compress
    q_24_no_zp = q_24_no_zp.t().contiguous()
    q_24_no_zp_comp, meta = sparse_semi_structured_from_dense_cutlass(
        q_24_no_zp)
    q_24_no_zp_comp = q_24_no_zp_comp.t().contiguous()

    # Restore zp
    q_24_comp = q_24_no_zp_comp + zp

    # Resize meta to its actual shape (without moving any data)
    meta = meta.resize_(meta.shape[1] // 2, meta.shape[0] * 2)

    return q_24_comp, meta


def marlin_24_quantize(
    w: torch.Tensor,
    num_bits: int,
    group_size: int,
):
    size_k, size_n = w.shape

    # Normalize group_size
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    # Inject 2:4 sparsity
    w_24, mask_24 = inject_24(w, size_k, size_n)

    # Quantize
    w_24_ref, q_w_24, s, g_idx, rand_perm = quantize_weights(w_24,
                                                             num_bits,
                                                             group_size,
                                                             act_order=False)

    # Compress quantized weight
    q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n,
                                                     num_bits)
    size_k_comp = size_k // 2

    # Reformat to marlin
    marlin_24_q_w_comp = marlin_weights(q_w_24_comp, size_k_comp, size_n,
                                        num_bits, marlin_24_perm[num_bits])
    marlin_24_s = marlin_permute_scales(s, size_k, size_n, group_size,
                                        marlin_24_scale_perm[num_bits],
                                        marlin_24_scale_perm_single[num_bits])

    # Create result
    res_list = [w_24_ref, marlin_24_q_w_comp, meta, marlin_24_s]
    for i in range(len(res_list)):
        res_list[i] = res_list[i].to(w.device)

    return res_list


def compute_max_diff(output, output_ref):
    return torch.mean(torch.abs(output - output_ref)) / torch.mean(
        torch.abs(output_ref))


class MarlinWorkspace:

    def __init__(self, out_features, min_thread_n, max_parallel, device):
        assert (out_features % min_thread_n == 0), (
            "out_features = {} is undivisible by min_thread_n = {}".format(
                out_features, min_thread_n))

        max_workspace_size = ((out_features // min_thread_n) * max_parallel)

        self.scratch = torch.zeros(max_workspace_size,
                                   dtype=torch.int,
                                   device=device)


================================================
FILE: archive/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/quant_utils.py
================================================
"""This file is used for /tests and /benchmarks"""
import numpy
import torch

SUPPORTED_NUM_BITS = [4, 8]
SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]


def get_pack_factor(num_bits):
    assert num_bits in SUPPORTED_NUM_BITS, f"Unsupported num_bits = {num_bits}"
    return 32 // num_bits


def permute_rows(q_w: torch.Tensor, group_size: int):

    orig_device = q_w.device
    k_size, _ = q_w.shape

    g_idx = torch.zeros((k_size, ), dtype=torch.int32)
    for i in range(k_size):
        g_idx[i] = i // group_size

    # Simulate act_order by doing a random permutation on K
    rand_perm = torch.randperm(k_size)

    g_idx = g_idx[rand_perm].contiguous()
    q_w = q_w[rand_perm, :].contiguous()

    return (
        q_w.to(device=orig_device),
        g_idx.to(device=orig_device),
        rand_perm.to(device=orig_device),
    )


def quantize_weights(w: torch.Tensor, num_bits: int, group_size: int,
                     act_order: bool):
    orig_device = w.device
    size_k, size_n = w.shape

    assert w.is_floating_point(), "w must be float"
    assert num_bits in SUPPORTED_NUM_BITS, f"Unsupported num_bits = {num_bits}"
    assert group_size in SUPPORTED_GROUP_SIZES + [
        size_k
    ], f"Unsupported groupsize = {group_size}"

    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    max_q_val = 2**num_bits - 1
    half_q_val = (max_q_val + 1) // 2

    # Reshape to [groupsize, -1]
    if group_size < size_k:
        w = w.view((-1, group_size, size_n))
        w = w.permute(1, 0, 2)
        w = w.reshape((group_size, -1))

    # Compute scale for each group
    s = torch.max(torch.abs(w), 0, keepdim=True)[0]
    s *= 2 / max_q_val  # 2 => symmetric

    # Quantize
    q_w = torch.round(w / s).int()
    q_w += half_q_val
    q_w = torch.clamp(q_w, 0, max_q_val)

    # Restore original shapes
    if group_size < size_k:

        def reshape_w(w):
            w = w.reshape((group_size, -1, size_n))
            w = w.permute(1, 0, 2)
            w = w.reshape((size_k, size_n)).contiguous()
            return w

        q_w = reshape_w(q_w)

    s = s.reshape((-1, size_n)).contiguous()

    # Apply act_order
    g_idx = torch.empty(0, dtype=torch.int, device=w.device)
    rand_perm = torch.empty(0, dtype=torch.int, device=w.device)
    if act_order:
        assert (
            group_size < size_k
        ), "For act_order, groupsize = {} must be less than size_k = {}".format(
            group_size, size_k)

        q_w, g_idx, rand_perm = permute_rows(q_w, group_size)

    return (
        q_w.to(device=orig_device),
        s.to(device=orig_device),
        g_idx.to(device=orig_device),
        rand_perm.to(device=orig_device),
    )


def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor):
    orig_device = q_w.device

    sort_indices = torch.argsort(g_idx).to(
        dtype=torch.int32)  # Sort based on g_idx

    g_idx = g_idx[sort_indices].contiguous()
    q_w = q_w[sort_indices, :].contiguous()

    return (
        q_w.to(device=orig_device),
        g_idx.to(device=orig_device),
        sort_indices.to(device=orig_device),
    )


def gptq_pack(
    q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    assert q_w.shape == (size_k, size_n)

    pack_factor = get_pack_factor(num_bits)
    assert size_k % pack_factor == 0

    orig_device = q_w.device

    q_w = q_w.cpu().numpy().astype(numpy.uint32)

    q_res = numpy.zeros((size_k // pack_factor, size_n), dtype=numpy.uint32)

    for i in range(pack_factor):
        q_res |= q_w[i::pack_factor, :] << num_bits * i

    q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
    return q_res


================================================
FILE: archive/ktransformers/ktransformers_ext/triton/fp8gemm.py
================================================
# Adopted from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py
from typing import Tuple

import torch
import triton
import triton.language as tl
from triton import Config


@triton.jit
def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
    """
    Quantizes the input tensor `x_ptr` and stores the result in `y_ptr` and the scaling factor in `s_ptr`.

    Args:
        x_ptr (triton.Pointer): Pointer to the input tensor.
        y_ptr (triton.Pointer): Pointer to the output tensor where quantized values will be stored.
        s_ptr (triton.Pointer): Pointer to the output tensor where scaling factors will be stored.
        BLOCK_SIZE (tl.constexpr): The size of the block to be processed by each program instance.

    Returns:
        None
    """
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    x = tl.load(x_ptr + offs).to(tl.float32)
    s = tl.max(tl.abs(x)) / 448.
    y = x / s
    y = y.to(y_ptr.dtype.element_ty)
    tl.store(y_ptr + offs, y)
    tl.store(s_ptr + pid, s)


def act_quant(x: torch.Tensor, block_size: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Quantizes the input tensor `x` using block-wise quantization.

    Args:
        x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
        block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - The quantized tensor with dtype `torch.float8_e4m3fn`.
            - A tensor of scaling factors with dtype `torch.float32`.
    """
    assert x.is_contiguous(), 'Input tensor must be contiguous'
    assert x.size(-1) % block_size == 0, f'Last dimension size must be divisible by block_size (block_size={block_size})'
    y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
    s = x.new_empty(*x.size()[:-1], x.size(-1) // block_size, dtype=torch.float32)
    grid = lambda meta: (triton.cdiv(x.numel(), meta['BLOCK_SIZE']), )
    act_quant_kernel[grid](x, y, s, BLOCK_SIZE=block_size)
    return y, s


@triton.jit
def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr):
    """
    Dequantizes weights using the provided scaling factors and stores the result.

    Args:
        x_ptr (tl.pointer): Pointer to the quantized weights.
        s_ptr (tl.pointer): Pointer to the scaling factors.
        y_ptr (tl.pointer): Pointer to the output buffer for dequantized weights.
        M (int): Number of rows in the weight matrix.
        N (int): Number of columns in the weight matrix.
        BLOCK_SIZE (tl.constexpr): Size of the block for tiling.

    Returns:
        None
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    n = tl.cdiv(N, BLOCK_SIZE)
    offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs = offs_m[:, None] * N + offs_n[None, :]
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    x = tl.load(x_ptr + offs, mask=mask).to(tl.float32)
    s = tl.load(s_ptr + pid_m * n + pid_n)
    y = x * s
    tl.store(y_ptr + offs, y, mask=mask)


def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor:
    """
    Dequantizes the given weight tensor using the provided scale tensor.

    Args:
        x (torch.Tensor): The quantized weight tensor of shape (M, N).
        s (torch.Tensor): The scale tensor of shape (M, N).
        block_size (int, optional): The block size to use for dequantization. Defaults to 128.

    Returns:
        torch.Tensor: The dequantized weight tensor of the same shape as `x`.

    Raises:
        AssertionError: If `x` or `s` are not contiguous or if their dimensions are not 2.
    """
    assert x.is_contiguous() and s.is_contiguous(), 'Input tensors must be contiguous'
    assert x.dim() == 2 and s.dim() == 2, 'Input tensors must have 2 dimensions'
    M, N = x.size()
    y = torch.empty_like(x, dtype=torch.get_default_dtype())
    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE']), triton.cdiv(N, meta['BLOCK_SIZE']))
    with torch.cuda.device(x.device):
        weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size)
    return y


fp8_gemm_configs = [
    Config({'BLOCK_SIZE_M': block_m, 'BLOCK_SIZE_N': block_n, 'BLOCK_SIZE_K': 128}, num_stages=num_stages, num_warps=8)
    for block_m in [16, 32, 64] for block_n in [32, 64, 128] for num_stages in [3, 4, 5, 6]
]

@triton.autotune(configs=fp8_gemm_configs, key=['N', 'K'])
@triton.jit
def fp8_gemm_kernel(a_ptr, b_ptr, c_ptr,
                    a_s_ptr, b_s_ptr,
                    M, N: tl.constexpr, K: tl.constexpr,
                    BLOCK_SIZE_M: tl.constexpr,
                    BLOCK_SIZE_N: tl.constexpr,
                    BLOCK_SIZE_K: tl.constexpr):
    """
    Performs a matrix multiplication operation on FP8 matrices with scaling factors.

    Args:
        a_ptr (tl.tensor): Pointer to the first input matrix A.
        b_ptr (tl.tensor): Pointer to the second input matrix B.
        c_ptr (tl.tensor): Pointer to the output matrix C.
        a_s_ptr (tl.tensor): Pointer to the scaling factors for matrix A.
        b_s_ptr (tl.tensor): Pointer to the scaling factors for matrix B.
        M (int): Number of rows in matrix A and C.
        N (tl.constexpr): Number of columns in matrix B and C.
        K (tl.constexpr): Number of columns in matrix A and rows in matrix B.
        BLOCK_SIZE_M (tl.constexpr): Block size for the M dimension.
        BLOCK_SIZE_N (tl.constexpr): Block size for the N dimension.
        BLOCK_SIZE_K (tl.constexpr): Block size for the K dimension.

    Returns:
        None
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    k = tl.cdiv(K, BLOCK_SIZE_K)
    offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = a_ptr + offs_m[:, None] * K + offs_k[None, :]
    b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
    a_s_ptrs = a_s_ptr + offs_m * k
    b_s_ptrs = b_s_ptr + (offs_n // BLOCK_SIZE_K) * k

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for i in range(k):
        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0.0)
        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0.0)
        a_s = tl.load(a_s_ptrs)
        b_s = tl.load(b_s_ptrs)
        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
        a_ptrs += BLOCK_SIZE_K
        b_ptrs += BLOCK_SIZE_K
        a_s_ptrs += 1
        b_s_ptrs += 1
    c = accumulator.to(c_ptr.dtype.element_ty)
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + offs_m[:, None] * N + offs_n[None, :]
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tl.store(c_ptrs, c, mask=mask)


def fp8_gemm(a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor):
    """
    Perform a matrix multiplication using FP8 precision.

    Args:
        a (torch.Tensor): The first input matrix, must be contiguous.
        a_s (torch.Tensor): The scaling factor for the first input matrix, must be contiguous.
        b (torch.Tensor): The second input matrix, must be contiguous.
        b_s (torch.Tensor): The scaling factor for the second input matrix, must be contiguous.

    Returns:
        torch.Tensor: The result of the matrix multiplication.
    """
    assert a.is_contiguous() and b.is_contiguous(), 'Input tensors must be contiguous'
    assert a_s.is_contiguous() and b_s.is_contiguous(), 'Scaling factor tensors must be contiguous'
    K = a.size(-1)
    M = a.numel() // K
    N = b.size(0)
    c = a.new_empty(*a.size()[:-1], N, dtype=torch.get_default_dtype())
    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']), triton.cdiv(N, META['BLOCK_SIZE_N']))
    fp8_gemm_kernel[grid](a, b, c, a_s, b_s, M, N, K)
    return c

================================================
FILE: archive/ktransformers/local_chat.py
================================================
"""
Description  :  
Author       : Boxin Zhang, Azure-Tang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""

import os
import platform
import sys

project_dir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, project_dir)
import torch
try:
    import torch_npu
    from torch_npu.contrib import transfer_to_npu
    from ktransformers.util.ascend.ascend_utils import get_absort_weight, setup_model_parallel, get_tensor_parallel_group
    from ktransformers.util import utils, npu_graph_runner
except:
    pass
import torch.distributed as dist

import logging
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GenerationConfig,
    TextStreamer,
)
import json
import fire
from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
from ktransformers.models.modeling_llama import LlamaForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
from ktransformers.util.utils import prefill_and_generate, get_compute_capability, xpu_fp16_model
from ktransformers.util import utils
from ktransformers.models.custom_cache import StaticCache
from ktransformers.server.config.config import Config
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
from ktransformers.util.vendors import device_manager, get_device, to_device, GPUVendor

custom_models = {
    "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
    "DeepseekV3ForCausalLM": DeepseekV3ForCausalLM,
    "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
    "LlamaForCausalLM": LlamaForCausalLM,
    "MixtralForCausalLM": MixtralForCausalLM,
}

ktransformer_rules_dir = (
    os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
)
default_optimize_rules = {
    "DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
    "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat.yaml",
    "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
    "LlamaForCausalLM": ktransformer_rules_dir + "Internlm2_5-7b-Chat-1m.yaml",
    "MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",
}

try:
    torch.npu.config.allow_internal_format = True
    torch.npu.set_compile_mode(jit_compile=False)
except:
    pass

import sys, signal, faulthandler
faulthandler.register(signal.SIGUSR1, file=sys.stderr, all_threads=True, chain=False)


def local_chat(
    model_path: str | None = None,
    optimize_config_path: str = None,
    gguf_path: str | None = None,
    max_new_tokens: int = 1000,
    cpu_infer: int = Config().cpu_infer,
    use_cuda_graph: bool = True,
    prompt_file : str | None = None,
    mode: str = "normal",
    force_think: bool = False,
    chunk_size: int = 8192,
    device: str = "cuda",
    tp: int = 1,
):
    Config().cpu_infer = cpu_infer

    local_rank, world_size = setup_model_parallel(tp=tp)
    torch.set_grad_enabled(False)
    if utils.CUR_DEVICE is None:
        utils.CUR_DEVICE = f"npu:{torch.npu.current_device()}"

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    config.chunk_size = chunk_size
    npu_graph_runner.LAYER_ID = config.num_hidden_layers
    if mode == 'long_context':
        assert config.architectures[0] == "LlamaForCausalLM", "only LlamaForCausalLM support long_context mode"
        torch.set_default_dtype(torch.float16)
    else:
        torch.set_default_dtype(config.torch_dtype)

    with torch.device("meta"):
        if config.architectures[0] in custom_models:
            print("using custom modeling_xxx.py.")
            if (
                "Qwen2Moe" in config.architectures[0]
            ):  # Qwen2Moe must use flash_attention_2 to avoid overflow.
                config._attn_implementation = "flash_attention_2"
            if "Llama" in config.architectures[0]:
                config._attn_implementation = "eager"
            if "Mixtral" in config.architectures[0]:
                config._attn_implementation = "flash_attention_2"

            model = custom_models[config.architectures[0]](config)
        else:
            model = AutoModelForCausalLM.from_config(
                config, trust_remote_code=True, attn_implementation="flash_attention_2"
            )

    if optimize_config_path is None:
        if config.architectures[0] in default_optimize_rules:
            print("using default_optimize_rule for", config.architectures[0]) if local_rank == 0 else None
            optimize_config_path = default_optimize_rules[config.architectures[0]]
            print(f'{optimize_config_path=}') if local_rank == 0 else None
        else:
            optimize_config_path = input(
                "please input the path of your rule file(yaml file containing optimize rules):"
            )

    if gguf_path is None:
        gguf_path = input(
            "please input the path of your gguf file(gguf file in the dir containing input gguf file must all belong to current model):"
        )
    optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
    # 提前absorbed
    get_absort_weight(model, config)

    try:
        model.generation_config = GenerationConfig.from_pretrained(model_path)
    except Exception as e:
        print(f"generation config can't auto create, make default. Message: {e}")
        gen_config = GenerationConfig(
            temperature=0.6,
            top_p=0.95,
            do_sample=True
        )
        model.generation_config = gen_config
    # model.generation_config = GenerationConfig.from_pretrained(model_path)
    if model.generation_config.pad_token_id is None:
        model.generation_config.pad_token_id = model.generation_config.eos_token_id
    model.eval()
    logging.basicConfig(level=logging.INFO)

    system = platform.system()
    if system == "Windows":
        os.system("cls") if local_rank == 0 else None
    else:
        os.system("clear") if local_rank == 0 else None

    print(f"{model=}") if local_rank == 0 else None

    batch_size, seq_length = 1, 16384  # default cache pool params
    device_map = model.gguf_loader.tensor_device_map
    static_cache = StaticCache(
        config = model.config, max_batch_size = batch_size, max_cache_len = seq_length + max_new_tokens, device = device_map, dtype = model.dtype
    )

    torch.distributed.barrier()
    while True:
        if local_rank == 0:
            try:
                content = input("Chat: \n").strip()
            except KeyboardInterrupt:
                dist.barrier()
                print('Exit all rank with KeyboardInterrupt!')
                sys.exit(0)
            if content.startswith('"""'):  # prefix """
                # multi lines input
                content = content[3:] + "\n"
                while True:
                    line = input("")
                    if line.endswith('"""'):
                        # end multi lines input
                        line = line[:-3]  # suffix """
                        if line:
                            content += line + "\n"
                        break
                    else:
                        content += line + "\n"

            if content == "":
                if prompt_file != None:
                    content = open(prompt_file, "r").read()
                else:
                    continue
            elif os.path.isfile(content):
                f = open(content, "r")
                content = f.readlines()
                f.close()
            else:
                content = [f"{len(content)},{max_new_tokens},{content}"]
        else:
            content = [""]

        for line in content:
            content_tensor = torch.tensor(bytearray(line.encode()), dtype=torch.uint8).to(device=utils.CUR_DEVICE)
            if world_size > 1:
                content_size = torch.tensor(len(content_tensor), dtype=torch.int64).to(device=utils.CUR_DEVICE)
                all_content_sizes = [torch.zeros((1,), dtype=torch.int64).to(device=utils.CUR_DEVICE) for _ in range(world_size)]
                dist.all_gather(all_content_sizes, content_size)
                max_content_size = max([size.item() for size in all_content_sizes])

                padded_content_tensor = torch.zeros((max_content_size,), dtype=torch.uint8).to(device=utils.CUR_DEVICE)
                padded_content_tensor[:len(content_tensor)] = content_tensor

                all_content_tensors = [torch.zeros((max_content_size,), dtype=torch.uint8).to(device=utils.CUR_DEVICE) for _ in range(world_size)]
                dist.all_gather(all_content_tensors, padded_content_tensor)
                content_tensor = all_content_tensors[0][:all_content_sizes[0].item()]
                line = bytes(content_tensor.cpu().numpy()).decode()

            parts = line.split(",")
            input_tokens = int(parts[0])
            max_new_tokens = int(parts[1])
            line = line[line.index(",", line.index(",") + 1) + 1:]
            
            messages = [{"role": "user", "content": line}]
            input_tensor = tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            )
            if force_think:
                token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
                input_tensor = torch.cat(
                    [input_tensor, token_thinks], dim=1
                )
            if mode == 'long_context':
                assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
                "please change max_seq_len in  ~/.ktransformers/config.yaml"

            if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8 and device_manager.gpu_vendor == GPUVendor.NVIDIA:
                generated = prefill_and_generate(
                    model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_size,
                    use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim,
                    static_cache=static_cache
                )
            else:
                generated = prefill_and_generate(
                    model, tokenizer, input_tensor.to(device=utils.CUR_DEVICE), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_size,
                    static_cache=static_cache
                )


if __name__ == "__main__":
    fire.Fire(local_chat)


================================================
FILE: archive/ktransformers/local_chat_test.py
================================================
"""
Description  :  
Author       : Boxin Zhang, Azure-Tang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""

import os
import platform
import sys

project_dir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, project_dir)
import torch
import logging
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GenerationConfig,
    TextStreamer,
)
import json
import fire
from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
from ktransformers.models.modeling_llama import LlamaForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
from ktransformers.util.utils import prefill_and_generate, get_compute_capability
from ktransformers.server.config.config import Config
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled

custom_models = {
    "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
    "DeepseekV3ForCausalLM": DeepseekV3ForCausalLM,
    "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
    "LlamaForCausalLM": LlamaForCausalLM,
    "MixtralForCausalLM": MixtralForCausalLM,
}

ktransformer_rules_dir = (
    os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
)
default_optimize_rules = {
    "DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
    "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat.yaml",
    "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
    "LlamaForCausalLM": ktransformer_rules_dir + "Internlm2_5-7b-Chat-1m.yaml",
    "MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",
}


def local_chat(
    model_path: str | None = None,
    optimize_config_path: str = None,
    gguf_path: str | None = None,
    max_new_tokens: int = 1000,
    cpu_infer: int = Config().cpu_infer,
    use_cuda_graph: bool = True,
    prompt_file : str | None = None,
    mode: str = "normal",
    force_think: bool = False,
    chunk_prefill_size: int = 8192
):

    torch.set_grad_enabled(False)

    Config().cpu_infer = cpu_infer

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    if mode == 'long_context':
        assert config.architectures[0] == "LlamaForCausalLM", "only LlamaForCausalLM support long_context mode"
        torch.set_default_dtype(torch.float16)
    else:
        torch.set_default_dtype(config.torch_dtype)

    with torch.device("meta"):
        if config.architectures[0] in custom_models:
            print("using custom modeling_xxx.py.")
            if (
                "Qwen2Moe" in config.architectures[0]
            ):  # Qwen2Moe must use flash_attention_2 to avoid overflow.
                config._attn_implementation = "flash_attention_2"
            if "Llama" in config.architectures[0]:
                config._attn_implementation = "eager"
            if "Mixtral" in config.architectures[0]:
                config._attn_implementation = "flash_attention_2"

            model = custom_models[config.architectures[0]](config)
        else:
            model = AutoModelForCausalLM.from_config(
                config, trust_remote_code=True, attn_implementation="flash_attention_2"
            )

    if optimize_config_path is None:
        if config.architectures[0] in default_optimize_rules:
            print("using default_optimize_rule for", config.architectures[0])
            optimize_config_path = default_optimize_rules[config.architectures[0]]
        else:
            optimize_config_path = input(
                "please input the path of your rule file(yaml file containing optimize rules):"
            )

    if gguf_path is None:
        gguf_path = input(
            "please input the path of your gguf file(gguf file in the dir containing input gguf file must all belong to current model):"
        )
    optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
    
    try:
        model.generation_config = GenerationConfig.from_pretrained(model_path)
    except Exception as e:
        print(f"generation config can't auto create, make default. Message: {e}")
        gen_config = GenerationConfig(
            temperature=0.6,
            top_p=0.95,
            do_sample=True
        )
        model.generation_config = gen_config
    # model.generation_config = GenerationConfig.from_pretrained(model_path)
    if model.generation_config.pad_token_id is None:
        model.generation_config.pad_token_id = model.generation_config.eos_token_id
    model.eval()
    logging.basicConfig(level=logging.INFO)

    system = platform.system()
    if system == "Windows":
        os.system("cls")
    else:
        os.system("clear")

    if prompt_file != None:
        assert os.path.isfile(prompt_file), "prompt file not exist"
        print(f"prompt file is {prompt_file}")
        content = open(prompt_file, "r").read()
    else:
        content = "Please write a piece of quicksort code in C++."

    print('Start Testing...(1 round)')
    print('Prompt:', content)

    while True:
        messages = [{"role": "user", "content": content}]
        input_tensor = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt"
        )
        if force_think:
            token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
            input_tensor = torch.cat(
                [input_tensor, token_thinks], dim=1
            )
        if mode == 'long_context':
            assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
            "please change max_seq_len in  ~/.ktransformers/config.yaml"
        
        if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
            generated = prefill_and_generate(
                model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_prefill_size = chunk_prefill_size,
                use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim
            )
        else:
            generated = prefill_and_generate(
                model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_prefill_size = chunk_prefill_size,
            )
        break

if __name__ == "__main__":
    fire.Fire(local_chat)


================================================
FILE: archive/ktransformers/models/__init__.py
================================================


================================================
FILE: archive/ktransformers/models/ascend/custom_ascend_modeling_deepseek_v3.py
================================================
"""
Date: 2024-11-06 10:05:11
LastEditors: djw
LastEditTime: 2024-11-13 07:50:51
"""

import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch_npu
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ktransformers.server.config.config import Config

from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.models.custom_cache import KVC2StaticCache
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3Model,  DeepseekV3PreTrainedModel
from ktransformers.models.configuration_deepseek_v3 import DeepseekV3Config
import ktransformers.util.utils as utils


torch.set_grad_enabled(False)
torch.set_default_dtype(torch.float16)


class KNPUDeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):

    # cache: KVC2StaticCache
    use_cuda_graph = False

    def __init__(
        self,
        config: DeepseekV3Config,
        stream = None,
        default_type=torch.float16
    ):
        super().__init__(config)
        self.model = DeepseekV3Model(config)
        self.config = config
        self.config.backend_type = "balance_serve"
        # self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.default_type = default_type
        self.stream = torch_npu.npu.current_stream() if stream is None else stream
        self.para_stream = torch_npu.npu.Stream()
        self.call_stream = torch_npu.npu.Stream()
        
    def init_wrapper(self, use_cuda_graph, device, max_batch_size, max_pages):
        print('[WARN] this custom modeling do not support flash infer, skip this part...')

    def batch_embeddings(self, batch: ForwardBatchInput, device="npu:0", is_prefill=True):
        features = []
        if is_prefill:
            start_ids = 0
            seq_lens = []
            for i in range(batch.minibatch.prefill_batch):
                assert batch.minibatch.p_kv_len[i] == batch.minibatch.p_q_len[i], \
                    "[ERROR] current prefill do not support chunk or prefix cache"
                tokens = batch.minibatch.p_tokens[start_ids: start_ids+batch.minibatch.p_q_len[i]].contiguous()
                start_ids += batch.minibatch.p_q_len[i]
                feature = (
                    self.model.embed_tokens(tokens.to(torch.device('cpu')))
                    .to(self.default_type)
                    .to(device=device)
                )
                features.append(feature)
                seq_lens.append(feature.shape[0])

            max_seq_len = max(seq_lens) if seq_lens else 0

            padded_features = []
            for feat in features:
                curr_len = feat.shape[0]
                if curr_len < max_seq_len:
                    pad_len = max_seq_len - curr_len
                    padded_feat = torch.nn.functional.pad(
                        feat,
                        (0, 0, 0, pad_len),
                        mode='constant',
                        value=0.0
                    )
                    padded_features.append(padded_feat)
                else:
                    padded_features.append(feat)

            features_t = torch.stack(padded_features)

        else:
            for i in range(batch.minibatch.decode_batch):
                if batch.minibatch.d_tokens.dim() == 1:
                    tokens = batch.minibatch.d_tokens.contiguous()
                else:
                    tokens = batch.minibatch.d_tokens[i].contiguous()

                feature = (
                    self.model.embed_tokens(tokens.to(torch.device('cpu')))
                    .to(self.default_type)
                    .to(device=device)
                )
                features.append(feature)

            features_t = torch.stack(features)
        return features_t

    def print_callback(self, param):
        with torch.npu.stream(self.call_stream):
            hidden_states = param[0]
            print("########################################")
            print("hidden_states is ", hidden_states)
            print("########################################")

    def forward(
        self,
        batch: ForwardBatchInput | None = None,
        features: torch.Tensor | None = None,
        past_key_value: KVC2StaticCache | None = None,
        bsz_tensors: torch.Tensor | None = None,
        num_tokens_tensors: torch.Tensor | None = None,
        page_idx: torch.Tensor | None = None,
        page_offset: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        block_tables: torch.Tensor | None = None,
        cuda_graph_idx: int | None = -1,
        is_prefill: bool = True
    ) -> ForwardBatchOutput:
        # NPU use direct block table from ForwardBatchInput instead of page_idx & page_offset

        if features.ndim == 2:
            hidden_states = features.unsqueeze(0)
        elif features.ndim == 1:
            hidden_states = features.unsqueeze(0).unsqueeze(0)  # (bsz, seqlen, hidden)
        else:
            hidden_states = features

        (bsz, q_len, hidden_size) = hidden_states.shape

        if is_prefill:
            position_ids = -1 * torch.ones(bsz, q_len).to(batch.minibatch.p_position_ids.device)
            bsz_real = torch.zeros(bsz).to(batch.minibatch.p_position_ids.device)
            # convert merged into batched
            start_ids = 0
            for i, qlen in enumerate(batch.minibatch.p_q_len):
                position_ids[i, 0:qlen] = batch.minibatch.p_position_ids[start_ids:start_ids+qlen]
                start_ids += qlen
                bsz_real[i] = qlen
            block_tables = batch.minibatch.p_block_tables
            kv_len = batch.minibatch.p_kv_len[0]
            q_len_raw = batch.minibatch.p_q_len
            kv_len_raw = batch.minibatch.p_kv_len
        else:
            position_ids = batch.minibatch.d_position_ids
            if len(position_ids.shape) == 1:
                position_ids = position_ids.unsqueeze(0)
            block_tables = batch.minibatch.d_block_tables
            kv_len = batch.minibatch.d_kv_len[0]
            q_len_raw = None
            kv_len_raw = batch.minibatch.d_kv_len_list
            bsz_real = None

        for i, decode_layer in enumerate(self.model.layers):
            residual = hidden_states
            hidden_states = decode_layer.input_layernorm(hidden_states)

            # generate chunk_mask automatically.
            if is_prefill:
                attn_mask = -65504.0 * torch.triu(torch.ones(q_len, kv_len, device=hidden_states.device), diagonal=1)
                attn_mask = attn_mask.unsqueeze(0).unsqueeze(0) # (bsz, 1, q_len, kv_len)
                if bsz > 1:
                    attn_mask = attn_mask.expand(bsz, attn_mask.shape[1], attn_mask.shape[2], attn_mask.shape[3])
            else:
                attn_mask = None
            # print_ex(f"####: before self_attn of layer {i}...")
            hidden_states, _, _ = decode_layer.self_attn(hidden_states,
                                                            position_ids=position_ids,
                                                            attention_mask=attn_mask,
                                                            past_key_value=past_key_value,
                                                            num_tokens_tensors=num_tokens_tensors,
                                                            page_idx=page_idx,
                                                            page_offset=page_offset,
                                                            block_table=block_tables,
                                                            q_len_raw=q_len_raw,
                                                            kv_len_raw=kv_len_raw,
                                                            is_prefill=is_prefill,
                                                            stream = self.stream,
                                                            )
            hidden_states = residual + hidden_states
            # mlp
            residual = hidden_states
            hidden_states = decode_layer.post_attention_layernorm(hidden_states)
            # print_ex(f"####: before mlp of layer {i}...")
            hidden_states = decode_layer.mlp(hidden_states, self.stream, self.para_stream)
            hidden_states = hidden_states.squeeze(0)
            hidden_states = residual + hidden_states
        # print_ex(f"####: fill output...")
        forward_batch_output = ForwardBatchOutput()
        # with torch_npu.npu.stream(self.stream):
        hidden_states_without_norm = hidden_states.clone()
        local_logit = self.lm_head(self.model.norm(hidden_states))
        for bsz in range(local_logit.size(0)):
            if bsz_real is not None:
                index = int(bsz_real[bsz].item())
                result = local_logit[bsz][:index]
            else:
                result = local_logit[bsz]
            forward_batch_output.logits.append(result)
            forward_batch_output.pre_hidden_states.append(hidden_states_without_norm[bsz])
        return forward_batch_output

    def flash_infer_attn_plan(self, batch: ForwardBatchInput, bsz_tensors, num_tokens_tensors,
        num_heads: int,
        head_dim_ckv: int,
        head_dim_kpe: int,
        page_size: int,
        causal: bool,
        sm_scale: float,
        q_data_type: torch.dtype,
        kv_data_type: torch.dtype,):
        print('[WARN] this custom modeling do not support flash infer, skip this part...')


================================================
FILE: archive/ktransformers/models/ascend/custom_ascend_modeling_qwen3.py
================================================
# coding=utf-8
# Copyright (c) 2025. Huawei Technologies Co., Ltd. All rights reserved.
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import math
from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch_npu
from dataclasses import dataclass
from torch.nn import functional as F
import torch.utils.checkpoint

from ktransformers.server.config.config import Config
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.models.custom_cache import KVC2Qwen3Cache
from ktransformers.models.modeling_qwen3_moe import Qwen3MoePreTrainedModel, Qwen3MoeModel
from ktransformers.models.configuration_qwen3_moe import Qwen3MoeConfig
import ktransformers.util.utils as utils
from ktransformers.operators.ascend.ascend_layernorm import KQwen3FinalRMSNormNPU

torch.set_grad_enabled(False)
torch.set_default_dtype(torch.float16)

class KNPUQwen3MoeForCausalLM(Qwen3MoePreTrainedModel):

    cache: "KVC2Qwen3Cache"
    use_cuda_graph = False

    def __init__(
        self,
        config: "Qwen3MoeConfig",
        cache: "KVC2Qwen3Cache",
        stream: Optional["torch_npu.npu.Stream"] = None,
        default_type: torch.dtype = torch.float16,
    ):
        super().__init__(config)

        self.model = Qwen3MoeModel(config)
        self.config = config
        self.config.backend_type = "balance_serve" 
        self.cache = cache
        self.vocab_size = config.vocab_size

        self.model.to(torch.float16)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.default_type = default_type

        self.stream = torch_npu.npu.current_stream() if stream is None else stream
        self.para_stream = torch_npu.npu.Stream()
        self.call_stream = torch_npu.npu.Stream()

        if hasattr(self.model, "embed_tokens"):
            self.model.embed_tokens.weight.data = self.model.embed_tokens.weight.data.to(torch.float16)

        if hasattr(self.model, "norm"):
            self.model.norm.weight.data = self.model.norm.weight.data.to(torch.float16)
            if getattr(self.model.norm, "bias", None) is not None:
                self.model.norm.bias.data = self.model.norm.bias.data.to(torch.float16)


        try:
            orig_norm = self.model.norm
            self.model.norm = KQwen3FinalRMSNormNPU(orig_norm)
        except Exception as e:
            print(f"[INIT][WARN] replace model.norm failed: {e}", flush=True)

    def init_wrapper(self):
        print("[WARN] KNPUQwen3MoeForCausalLM does not use flashinfer wrapper on NPU, skip init_wrapper...")

    # ---------------------------------------------------
    # Embedding：support prefill / decode modes
    # ---------------------------------------------------
    def batch_embeddings(
        self,
        batch: "ForwardBatchInput",
        device: str = "npu:0",
        is_prefill: bool = True,
    ) -> torch.Tensor:
        features = []

        if is_prefill:
            start_ids = 0
            seq_lens = []

            for i in range(batch.minibatch.prefill_batch):
                qlen = int(batch.minibatch.p_q_len[i])
                kvlen = int(batch.minibatch.p_kv_len[i])

                if kvlen < qlen:
                    raise AssertionError(
                        f"[ERROR] p_kv_len({kvlen}) < p_q_len({qlen}) "
                        f"for prefill idx={i}, this should not happen"
                    )

                tokens = batch.minibatch.p_tokens[start_ids: start_ids + qlen].contiguous()
                start_ids += qlen

                feat = (
                    self.model.embed_tokens(tokens.to(torch.device("cpu")))
                    .to(self.default_type)
                    .to(device=device)
                )

                features.append(feat)
                seq_lens.append(qlen)

            max_seq_len = max(seq_lens) if seq_lens else 0

            # Pad the current chunk to the maximum q_len with [bsz, max_q_len, hidden].
            padded_features = []
            for feat in features:
                curr_len = feat.shape[0]
                if curr_len < max_seq_len:
                    pad_len = max_seq_len - curr_len
                    padded_feat = torch.nn.functional.pad(
                        feat,
                        (0, 0, 0, pad_len),
                        mode="constant",
                        value=0.0,
                    )
                    padded_features.append(padded_feat)
                else:
                    padded_features.append(feat)
            features_t = torch.stack(padded_features, dim=0)  # [bsz, max_seq_len, hidden]
        else:
            for i in range(batch.minibatch.decode_batch):
                if batch.minibatch.d_tokens.dim() == 1:
                    tokens = batch.minibatch.d_tokens.contiguous()
                else:
                    tokens = batch.minibatch.d_tokens[i].contiguous()
                feature = (
                    self.model.embed_tokens(tokens.to(torch.device("cpu")))
                    .to(self.default_type)
                    .to(device=device)
                )
                features.append(feature)
            features_t = torch.stack(features)  # [decode_bsz, decode_q_len, hidden]

        return features_t

    def forward(
            self,
            batch: Optional["ForwardBatchInput"] = None,
            features: torch.Tensor | None = None,
            cache=None,
            bsz_tensors: torch.Tensor | None = None,
            num_tokens_tensors: torch.Tensor | None = None,
            page_idx: torch.Tensor | None = None,
            page_offset: torch.Tensor | None = None,
            position_ids: torch.Tensor | None = None,
            block_tables: torch.Tensor | None = None,
            cuda_graph_idx: int | None = 0,
            is_prefill: bool = True,
        ) -> "ForwardBatchOutput":
        try:
            is_capturing = torch.npu.is_current_stream_capturing()
        except Exception:
            is_capturing = False
        # features: [bsz, q_len, hidden]
        if features.ndim == 2:
            hidden_states = features.unsqueeze(0)
        elif features.ndim == 1:
            hidden_states = features.unsqueeze(0).unsqueeze(0)
        else:
            hidden_states = features
        bsz, q_len, hidden_size = hidden_states.shape
        minibatch = batch.minibatch
        if is_prefill:
            device_pos = minibatch.p_position_ids.device
            position_ids = -1 * torch.ones(
                bsz,
                q_len,
                dtype=minibatch.p_position_ids.dtype,
                device=device_pos,
            )
            bsz_real = torch.zeros(bsz, dtype=torch.int32, device=device_pos)
            start_ids = 0
            for i, qlen in enumerate(minibatch.p_q_len):
                position_ids[i, :qlen] = minibatch.p_position_ids[start_ids:start_ids + qlen]
                start_ids += int(qlen.item())
                bsz_real[i] = qlen
            block_tables = minibatch.p_block_tables
            kv_len = minibatch.p_kv_len[0]
            q_len_raw = minibatch.p_q_len
            kv_len_raw = minibatch.p_kv_len
            kv_len_tensor = kv_len_raw
        else:
            position_ids = minibatch.d_position_ids
            if position_ids.dim() == 1:
                position_ids = position_ids.unsqueeze(0)
            block_tables = minibatch.d_block_tables
            kv_len = minibatch.d_kv_len[0]
            q_len_raw = None
            kv_len_tensor = minibatch.d_kv_len_list
            bsz_real = None

        # ==================== layer loop ====================
        for i, decode_layer in enumerate(self.model.layers):
            # ---------- Attention Block ----------
            attn_residual = hidden_states

            hidden_states = decode_layer.input_layernorm(hidden_states)

            attn_out = decode_layer.self_attn(
                hidden_states,
                past_key_value=self.cache,
                position_ids=position_ids,
                num_tokens_tensors=num_tokens_tensors,
                page_idx=page_idx,
                page_offset=page_offset,
                block_table=block_tables,
                q_len_raw=q_len_raw,
                kv_len_raw=kv_len_tensor,
                is_prefill=is_prefill,
                stream=self.stream,
            )

            hidden_states = attn_residual + attn_out
            # ---------- MLP Block ----------
            mlp_residual = hidden_states
            hidden_states = decode_layer.post_attention_layernorm(hidden_states)
            mlp_in = hidden_states
            mlp_out = decode_layer.mlp(
                mlp_in,
                num_tokens_tensors,
                cuda_graph_idx,
            )

            if isinstance(mlp_out, tuple):
                moe_y = mlp_out[0]
            else:
                moe_y = mlp_out

            hidden_states = mlp_residual + moe_y
        forward_batch_output = ForwardBatchOutput()

        hidden_states_without_norm = hidden_states.clone()

        normed = self.model.norm(hidden_states)

        local_logit = self.lm_head(normed)
        B_out = local_logit.size(0)
        for b in range(B_out):
            if (bsz_real is not None) and (not is_capturing):
                valid_len = int(bsz_real[b].item())
                result = local_logit[b, :valid_len]
                pre_h = hidden_states_without_norm[b, :valid_len]
            else:
                result = local_logit[b]
                pre_h = hidden_states_without_norm[b]

            forward_batch_output.logits.append(result)
            forward_batch_output.pre_hidden_states.append(pre_h)
        return forward_batch_output


    def flash_infer_attn_plan(
        self,
        batch: "ForwardBatchInput",
        bsz_tensors: torch.Tensor,
        num_tokens_tensors: torch.Tensor,
        num_q_heads: int,
        num_kv_heads: int,
        head_dim: int,
        page_size: int,
        causal: bool,
        q_data_type: torch.dtype,
        kv_data_type: torch.dtype,
        cuda_graph_idx: int = 0,
    ):
        print("[WARN] KNPUQwen3MoeForCausalLM on NPU does not support flashinfer, skip flash_infer_attn_plan...")


================================================
FILE: archive/ktransformers/models/configuration_deepseek.py
================================================
# Adapted from
# https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628/blob/main/configuration_deepseek.py
# Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class DeepseekV2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate an DeepSeek
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the DeepSeek-V2.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 102400):
            Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`DeepseekV2Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1407):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        n_shared_experts (`int`, *optional*, defaults to None):
            Number of shared experts, None means dense model.
        n_routed_experts (`int`, *optional*, defaults to None):
            Number of routed experts, None means dense model.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor or routed experts.
        topk_method (`str`, *optional*, defaults to `gready`):
            Topk method used in routed gate.
        n_group (`int`, *optional*, defaults to None):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to None):
            Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to None):
            Number of selected experts, None means dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 0):
            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                            \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to False):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to 'softmax'):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux = (`bool`, *optional*, defaults to True):
            Whether to compute the auxiliary loss for each individual sample.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    ```python
    >>> from transformers import DeepseekV2Model, DeepseekV2Config
    >>> # Initializing a Deepseek-V2 style configuration
    >>> configuration = DeepseekV2Config()
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "deepseek_v2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size = 1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts = None,
        n_routed_experts = None,
        ep_size = 1,
        routed_scaling_factor = 1.0,
        kv_lora_rank = 512,
        q_lora_rank = 1536,
        qk_rope_head_dim = 64,
        v_head_dim = 128,
        qk_nope_head_dim = 128,
        topk_method = 'gready',
        n_group = None,
        topk_group = None,
        num_experts_per_tok = None,
        moe_layer_freq = 1,
        first_k_dense_replace = 0,
        norm_topk_prob = False,
        scoring_func = 'softmax',
        aux_loss_alpha = 0.001,
        seq_aux = True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        cpu_quant=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        self.cpu_quant = cpu_quant

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


================================================
FILE: archive/ktransformers/models/configuration_deepseek_v3.py
================================================
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class DeepseekV3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the DeepSeek-V3.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 129280):
            Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`DeepseekV3Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1407):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Number of nextn predict layers in the DeepSeekV3 Model.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        n_shared_experts (`int`, *optional*, defaults to None):
            Number of shared experts, None means dense model.
        n_routed_experts (`int`, *optional*, defaults to None):
            Number of routed experts, None means dense model.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor or routed experts.
        topk_method (`str`, *optional*, defaults to `gready`):
            Topk method used in routed gate.
        n_group (`int`, *optional*, defaults to None):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to None):
            Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to None):
            Number of selected experts, None means dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 0):
            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                            \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to False):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to 'softmax'):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux = (`bool`, *optional*, defaults to True):
            Whether to compute the auxiliary loss for each individual sample.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    ```python
    >>> from transformers import DeepseekV3Model, DeepseekV3Config
    >>> # Initializing a Deepseek-V3 style configuration
    >>> configuration = DeepseekV3Config()
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size = 2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts = 1,
        n_routed_experts = 256,
        ep_size = 1,
        routed_scaling_factor = 2.5,
        kv_lora_rank = 512,
        q_lora_rank = 1536,
        qk_rope_head_dim = 64,
        v_head_dim = 128,
        qk_nope_head_dim = 128,
        topk_method = 'noaux_tc',
        n_group = 8,
        topk_group = 4,
        num_experts_per_tok = 8,
        moe_layer_freq = 1,
        first_k_dense_replace = 3,
        norm_topk_prob = True,
        scoring_func = 'sigmoid',
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

================================================
FILE: archive/ktransformers/models/configuration_glm4_moe.py
================================================
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/glm4_moe/modular_glm4_moe.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_glm4_moe.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation


class Glm4MoeConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Glm4MoeModel`]. It is used to instantiate a
    Glm4Moe model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of [THUDM/GLM-4-100B-A10B](https://huggingface.co/THUDM/GLM-4-100B-A10B).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 151552):
            Vocabulary size of the Glm4Moe model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Glm4MoeModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 10944):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 46):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 96):
            Number of attention heads for each attention layer in the Transformer encoder.
        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
            The factor of the partial rotary position.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.

        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        moe_intermediate_size (`int`, *optional*, defaults to 1408):
            Intermediate size of the routed expert.
        num_experts_per_tok (`int`, *optional*, defaults to 8):
            number of experts per token.
        n_shared_experts (`int`, *optional*, defaults to 1):
            Number of shared experts.
        n_routed_experts (`int`, *optional*, defaults to 128):
            Number of routed experts.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor or routed experts.
        n_group (`int`, *optional*, defaults to 1):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to 1):
            Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
        first_k_dense_replace (`int`, *optional*, defaults to 1):
            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                            \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the topk probabilities.
        use_qk_norm (`bool`, *optional*, defaults to `False`):
            Whether to use query-key normalization in the attention
    ```python
    >>> from transformers import Glm4MoeModel, Glm4MoeConfig

    >>> # Initializing a Glm4Moe style configuration
    >>> configuration = Glm4MoeConfig()

    >>> # Initializing a model from the GLM-4-MOE-100B-A10B style configuration
    >>> model = Glm4MoeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "glm4_moe"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `Glm4Moe`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.experts.*.gate_proj": "colwise",
        "layers.*.mlp.experts.*.up_proj": "colwise",
        "layers.*.mlp.experts.*.down_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151552,
        hidden_size=4096,
        intermediate_size=10944,
        num_hidden_layers=46,
        num_attention_heads=96,
        partial_rotary_factor=0.5,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=131072,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        moe_intermediate_size=1408,
        num_experts_per_tok=8,
        n_shared_experts=1,
        n_routed_experts=128,
        routed_scaling_factor=1.0,
        n_group=1,
        topk_group=1,
        first_k_dense_replace=1,
        norm_topk_prob=True,
        use_qk_norm=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.partial_rotary_factor = partial_rotary_factor

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        # MoE arguments
        self.moe_intermediate_size = moe_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.n_group = n_group
        self.topk_group = topk_group
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.routed_scaling_factor = routed_scaling_factor
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.use_qk_norm = use_qk_norm

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["Glm4MoeConfig"]

================================================
FILE: archive/ktransformers/models/configuration_llama.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""LLaMA model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation


class LlamaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the LLaMA-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`LlamaModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
            Llama 2 up to 4096, CodeLlama up to 16384.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
            understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
            results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.

    ```python
    >>> from transformers import LlamaModel, LlamaConfig

    >>> # Initializing a LLaMA llama-7b style configuration
    >>> configuration = LlamaConfig()

    >>> # Initializing a model from the llama-7b style configuration
    >>> model = LlamaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "llama"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


================================================
FILE: archive/ktransformers/models/configuration_qwen2_moe.py
================================================
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen2MoE model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class Qwen2MoeConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen2MoeModel`]. It is used to instantiate a
    Qwen2MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen1.5-MoE-A2.7B" [Qwen/Qwen1.5-MoE-A2.7B"](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B").

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the Qwen2MoE model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Qwen2MoeModel`]
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 5632):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 16):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        decoder_sparse_step (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer.
        moe_intermediate_size (`int`, *optional*, defaults to 1408):
            Intermediate size of the routed expert.
        shared_expert_intermediate_size (`int`, *optional*, defaults to 5632):
            Intermediate size of the shared expert.
        num_experts_per_tok (`int`, *optional*, defaults to 4):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 60):
            Number of routed experts.
        norm_topk_prob (`bool`, *optional*, defaults to `False`):
            Whether to normalize the topk probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabeling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        mlp_only_layers (`List[int]`, *optional*, defaults to `[]`):
            Indicate which layers use Qwen2MoeMLP rather than Qwen2MoeSparseMoeBlock
            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.

    ```python
    >>> from transformers import Qwen2MoeModel, Qwen2MoeConfig

    >>> # Initializing a Qwen2MoE style configuration
    >>> configuration = Qwen2MoeConfig()

    >>> # Initializing a model from the Qwen1.5-MoE-A2.7B" style configuration
    >>> model = Qwen2MoeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen2_moe"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=2048,
        intermediate_size=5632,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=16,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        decoder_sparse_step=1,
        moe_intermediate_size=1408,
        shared_expert_intermediate_size=5632,
        num_experts_per_tok=4,
        num_experts=60,
        norm_topk_prob=False,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        mlp_only_layers=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window if use_sliding_window else None
        self.max_window_layers = max_window_layers

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        # MoE arguments
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

================================================
FILE: archive/ktransformers/models/configuration_qwen3_moe.py
================================================
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen3MoE model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging


logger = logging.get_logger(__name__)


class Qwen3MoeConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen3MoeModel`]. It is used to instantiate a
    Qwen3MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of [Qwen/Qwen3-MoE-15B-A2B](https://huggingface.co/Qwen/Qwen3-15B-A2B).
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the Qwen3MoE model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Qwen3MoeModel`]
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 6144):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        decoder_sparse_step (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer.
        moe_intermediate_size (`int`, *optional*, defaults to 768):
            Intermediate size of the routed expert.
        num_experts_per_tok (`int`, *optional*, defaults to 8):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 128):
            Number of routed experts.
        norm_topk_prob (`bool`, *optional*, defaults to `False`):
            Whether to normalize the topk probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabeling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        mlp_only_layers (`List[int]`, *optional*, defaults to `[]`):
            Indicate which layers use Qwen3MoeMLP rather than Qwen3MoeSparseMoeBlock
            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
    ```python
    >>> from transformers import Qwen3MoeModel, Qwen3MoeConfig
    >>> # Initializing a Qwen3MoE style configuration
    >>> configuration = Qwen3MoeConfig()
    >>> # Initializing a model from the Qwen3-15B-A2B" style configuration
    >>> model = Qwen3MoeModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen3_moe"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `Qwen3Moe`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=2048,
        intermediate_size=6144,
        num_hidden_layers=24,
        num_attention_heads=32,
        num_key_value_heads=4,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        decoder_sparse_step=1,
        moe_intermediate_size=768,
        num_experts_per_tok=8,
        num_experts=128,
        norm_topk_prob=False,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        mlp_only_layers=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window if use_sliding_window else None
        self.max_window_layers = max_window_layers

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        # MoE arguments
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["Qwen3MoeConfig"]

================================================
FILE: archive/ktransformers/models/configuration_qwen3_next.py
================================================
# coding=utf-8
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen3-Next model configuration"""

from transformers.configuration_utils import PretrainedConfig, layer_type_validation
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging


logger = logging.get_logger(__name__)


class Qwen3NextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
    Qwen3-Next model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of
    Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
            `inputs_ids`.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 5632):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            The non-linear activation function in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        partial_rotary_factor (`float`, *optional*, defaults to 0.25):
            Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        head_dim (`int`, *optional*, defaults to 256):
            Projection weights dimension in multi-head attention.
        linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
            Kernel size of the convolution used in linear attention layers.
        linear_key_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each key head in linear attention.
        linear_value_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each value head in linear attention.
        linear_num_key_heads (`int`, *optional*, defaults to 16):
            Number of key heads used in linear attention layers.
        linear_num_value_heads (`int`, *optional*, defaults to 32):
            Number of value heads used in linear attention layers.
        decoder_sparse_step (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer.
        moe_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the routed expert.
        shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the shared expert.
        num_experts_per_tok (`int`, *optional*, defaults to 10):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 512):
            Number of routed experts.
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the topk probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
            Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock
            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
        layer_types (`list[str]`, *optional*):
            Types of each layer (attention or linear).
    ```python
    >>> from transformers import Qwen3NextModel, Qwen3NextConfig
    >>> # Initializing a Qwen3Next style configuration
    >>> configuration =  Qwen3NextConfig()
    >>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
    >>> model = Qwen3NextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "qwen3_next"
    keys_to_ignore_at_inference = ["past_key_values"]

    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.experts.*.gate_proj": "colwise",
        "layers.*.mlp.experts.*.up_proj": "colwise",
        "layers.*.mlp.experts.*.down_proj": "rowwise",
        "layers.*.mlp.shared_experts.gate_proj": "colwise",
        "layers.*.mlp.shared_experts.up_proj": "colwise",
        "layers.*.mlp.shared_experts.down_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=2048,
        intermediate_size=5632,
        num_hidden_layers=48,
        num_attention_heads=16,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.25,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=256,
        linear_conv_kernel_dim=4,
        linear_key_head_dim=128,
        linear_value_head_dim=128,
        linear_num_key_heads=16,
        linear_num_value_heads=32,
        decoder_sparse_step=1,
        moe_intermediate_size=512,
        shared_expert_intermediate_size=512,
        num_experts_per_tok=10,
        num_experts=512,
        norm_topk_prob=True,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        mlp_only_layers=[],
        layer_types=None,
        **kwargs,
    ):
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.partial_rotary_factor = partial_rotary_factor
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.head_dim = head_dim
        rope_config_validation(self)

        self.layer_types = layer_types
        if self.layer_types is None:
            self.layer_types = [
                "linear_attention" if bool((i + 1) % 4) else "full_attention" for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)

        # linear attention part
        self.linear_conv_kernel_dim = linear_conv_kernel_dim
        self.linear_key_head_dim = linear_key_head_dim
        self.linear_value_head_dim = linear_value_head_dim
        self.linear_num_key_heads = linear_num_key_heads
        self.linear_num_value_heads = linear_num_value_heads

        # MoE arguments
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.mlp_only_layers = mlp_only_layers


__all__ = ["Qwen3NextConfig"]

================================================
FILE: archive/ktransformers/models/configuration_smallthinker.py
================================================
# coding=utf-8
from transformers.configuration_utils import PretrainedConfig

class SmallthinkerConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`SmallthinkerModel`]. 
    It is used to instantiate a Smallthinker model according to the specified arguments, defining the model architecture. 
    The default values for each of the parameters are the same as the ones used in the original Smallthinker 4B model.

    General configs:
    - model_type: "smallthinker"
    - model_name
    - num_hidden_layers
    - hidden_size

    Tokenizer configs:
    - pad_token_id
    - bos_token_id
    - eos_token_id

    Embedding configs:
    - vocab_size

    RMSNorm configs:
    - rms_norm_eps

    Attention configs:
    - num_attention_heads
    - num_key_value_heads
    - head_dim
    - use_cache
    - use_qk_norm
    - rope_layout: array of 0 or 1s, 0 for nope, 1 for rope
    - rope_theta
    - max_position_embeddings
    - sliding_window_layout: array of 0 or 1s, 0 for normal attention, 1 for SWA
    - sliding_window_size

    General FFN configs:
    - moe_layer_layout: array of 0 or 1s, 0 for dense layer, 1 for MoE layer
    
    Dense FFN configs:
    - dense_ffn_hidden_size

    MoE FFN configs:
    - moe_num_primary_experts
    - moe_shared_primary_experts
    - moe_ffn_hidden_size
    - moe_enable_early_router: Use attention output as router input if true
    - moe_primary_router_use_sigmoid: Use normalized sigmoid 
    - moe_num_active_primary_experts
    - moe_enable_secondary_experts
    - moe_num_secondary_experts
    - moe_secondary_expert_size

    LM Head configs:
    - tie_word_embeddings

    Visibility configs:
    - profile_sparsity

    Other configs:
    - initializer_range
    """
    def __init__(self,
        model_type = "smallthinker",
        model_name="smallthinker_4b_base",
        num_hidden_layers=32,
        hidden_size=1536,
        pad_token_id=None,
        bos_token_id=151643,
        eos_token_id=[151643,151645],
        vocab_size=151936,
        rms_norm_eps=1e-6,
        num_attention_heads=12,
        num_key_value_heads=2,
        head_dim=128,
        use_cache=True,
        use_qk_norm=False,
        rope_layout=[1]*32,
        rope_theta=1e6,
        max_position_embeddings=4096 * 32,
        sliding_window_layout=[0]*32,
        sliding_window_size=4096,
        moe_layer_layout=[1]*32,
        dense_ffn_hidden_size=4096,
        moe_num_primary_experts=32,
        moe_shared_primary_experts=0,
        moe_ffn_hidden_size=768,
        moe_enable_early_router=True,
        moe_primary_router_apply_softmax=False,
        moe_num_active_primary_experts=4,
        moe_enable_secondary_experts=False,
        moe_num_secondary_experts=0,
        moe_secondary_expert_size=0,
        tie_word_embeddings=True,
        initializer_range=0.02,
        **kwargs,
    ):
        moe_layer_layout = [1]*num_hidden_layers
        # Configuration sanitizers
        assert num_attention_heads % num_key_value_heads == 0,      "[Smallthinker config sanitizer] num_attention_heads must be divisible by num_key_value_heads"
        assert len(rope_layout) == num_hidden_layers,               "[Smallthinker config sanitizer] rope_layout must have the same length as num_hidden_layers"
        assert len(sliding_window_layout) == num_hidden_layers,     "[Smallthinker config sanitizer] sliding_window_layout must have the same length as num_hidden_layers"
        assert len(moe_layer_layout) == num_hidden_layers,          "[Smallthinker config sanitizer] moe_layer_layout must have the same length as num_hidden_layers"

        if any(moe_layer_layout):
            assert moe_num_primary_experts != 0,                    "[Smallthinker config sanitizer] moe_num_primary_experts must be set non-zero if there is any MoE layer"
            assert moe_ffn_hidden_size != 0,                        "[Smallthinker config sanitizer] moe_ffn_hidden_size must be set non-zero if there is any MoE layer"
            assert moe_num_active_primary_experts != 0,             "[Smallthinker config sanitizer] moe_num_active_primary_experts must be set non-zero if there is any MoE layer"
            if moe_enable_secondary_experts:
                assert moe_num_secondary_experts != 0,              "[Smallthinker config sanitizer] moe_num_secondary_experts must be set non-zero if moe_enable_secondary_experts is True"
                assert moe_secondary_expert_size != 0,              "[Smallthinker config sanitizer] moe_secondary_expert_size must be set non-zero if moe_enable_secondary_experts is True"
                assert moe_num_secondary_experts * moe_secondary_expert_size == moe_ffn_hidden_size, "[Smallthinker config sanitizer] moe_num_secondary_experts * moe_secondary_expert_size must equal moe_ffn_hidden_size"

        if not all(moe_layer_layout):
            assert dense_ffn_hidden_size != 0,                      "[Smallthinker config sanitizer] dense_ffn_hidden_size must be set non-zero if there is any dense FFN layer"

        # General configs
        self.model_type = model_type
        self.model_name = model_name
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size

        # Tokenizer configs
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # Embedding configs
        self.vocab_size = vocab_size

        # RMSNorm configs
        self.rms_norm_eps = rms_norm_eps

        # Attention configs
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.use_cache = use_cache
        self.use_qk_norm = use_qk_norm
        self.rope_layout = rope_layout
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings
        self.sliding_window_layout = sliding_window_layout
        self.sliding_window_size = sliding_window_size

        # General FFN configs
        self.moe_layer_layout = moe_layer_layout

        # Dense FFN configs
        self.dense_ffn_hidden_size = dense_ffn_hidden_size

        # MoE FFN configs
        self.moe_num_primary_experts = moe_num_primary_experts
        self.moe_shared_primary_experts = moe_shared_primary_experts
        self.moe_ffn_hidden_size = moe_ffn_hidden_size
        self.num_experts_per_tok = moe_num_active_primary_experts
        self.moe_intermediate_size = moe_ffn_hidden_size
        self.moe_enable_early_router = moe_enable_early_router
        self.moe_primary_router_apply_softmax = moe_primary_router_apply_softmax
        self.moe_num_active_primary_experts = moe_num_active_primary_experts
        self.moe_enable_secondary_experts = moe_enable_secondary_experts
        self.moe_num_secondary_experts = moe_num_secondary_experts
        self.moe_secondary_expert_size = moe_secondary_expert_size

        # Logging configs
        # self.output_router_logits = False

        # Other configs
        self.initializer_range = initializer_range

        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs)

        self._attn_implementation = "eager" # SDPA is not allowed for now

        # if self._attn_implementation != "flash_attention_2":
        #     raise NotImplementedError("SDPA impl is buggy for now. NEVER TRY TO USE IT.")
        
__all__ = ["SmallthinkerConfig"]


================================================
FILE: archive/ktransformers/models/custom_cache.py
================================================
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
'''
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/cache_utils.py
# Copyright 2018- The Hugging Face team. All rights reserved.
# Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
import torch
import torch.nn as nn
import transformers
from transformers import Cache, PretrainedConfig
from typing import List, Optional, Dict, Any, Tuple

try:
    import torch_npu
    from ktransformers.util import utils
    from ktransformers.server.balance_serve.inference.forward_batch import ForwardMiniBatchCombine, ForwardMiniBatchSplit
    
    use_torch_npu = torch_npu.npu.is_available()
except:
    use_torch_npu = False
from transformers.models.llama.modeling_llama import LlamaDecoderLayer
from ktransformers.server.balance_serve.settings import sched_ext

class StaticCache(transformers.StaticCache):
    """
    Static Cache class to be used with `torch.compile(model)`.

    Parameters:
        config (`PretrainedConfig):
            The configuration file defining the shape-related attributes required to initialize the static cache.
        max_batch_size (`int`):
            The maximum batch size with which the model will be used.
        max_cache_len (`int`):
            The maximum sequence length with which the model will be used.
        device (`torch.device` or `dict`):
            The device on which the cache should be initialized. Should be the same as the layer.
            If a `dict`, it should contain the `device` key with the device name as the value.
        dtype (*optional*, defaults to `torch.float32`):
            The default `dtype` to use when initializing the layer.
    """

    def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device: torch.device| dict, dtype=None) -> None:
        Cache.__init__(self, layer_class_to_replicate=LlamaDecoderLayer)
        self._max_batch_size = max_batch_size

        if use_torch_npu:
            self.position = [0]

        self._max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
        # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads
        if config.architectures[0] == "DeepseekV3ForCausalLM":
            self.head_dim = config.qk_rope_head_dim
        else:
            self.head_dim = (
                config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
            )

        self.dtype = dtype if dtype is not None else torch.float32
        self.num_key_value_heads = (
            config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads
        )

        self.key_cache: List[torch.Tensor] = []
        self.value_cache: List[torch.Tensor] = []
        cache_shape = (max_batch_size, self.num_key_value_heads, self._max_cache_len, self.head_dim)
        if config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM":
            # TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically

            if use_torch_npu:
                self.page_size = 128
                self.page_size_tensor = torch.tensor(
                self.page_size,
                dtype=torch.int32,
                    ).npu()
                self.max_pages_per_batch = (self._max_cache_len + self.page_size - 1) // self.page_size
                self.max_pages = (self._max_cache_len + self.page_size - 1) // self.page_size * self._max_batch_size
            else:
                self.page_size = 64
                self.max_pages = (self._max_cache_len + self.page_size - 1) // self.page_size
            latent_shape = (self.max_pages, self.page_size, 1, config.kv_lora_rank + config.qk_rope_head_dim)
            self.kv_lora_rank = config.kv_lora_rank
            self.qk_rope_head_dim = config.qk_rope_head_dim
            # TODO: support real page table
            self.page_table_map = dict()
            self.page_table_list = []
            for idx in range(config.num_hidden_layers):
                if isinstance(device, dict):
                    target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
                else:
                    target_device = device
                
                if target_device not in self.page_table_map:
                    if use_torch_npu:
                        page_table = torch.zeros((max_batch_size, self.max_pages_per_batch), dtype=torch.int32, device=target_device)
                        for seq_id in range(max_batch_size):
                            page_table[seq_id, :] = torch.arange(seq_id * self.max_pages_per_batch, seq_id * self.max_pages_per_batch + self.max_pages_per_batch, dtype=torch.int32, device=target_device)
                    else:
                        page_table = torch.zeros((max_batch_size, self.max_pages), dtype=torch.int32, device=target_device)
                        for seq_id in range(max_batch_size):
                            page_table[seq_id, :] = torch.arange(seq_id * self.max_pages, seq_id * self.max_pages + self.max_pages, dtype=torch.int32, device=target_device)
                    self.page_table_map[target_device] = page_table
                    
                self.page_table_list.append(self.page_table_map[target_device])
                    
            self.is_MLA = True
            self.is_page = True
        else:
            key_shape = cache_shape
            value_shape = cache_shape
            self.is_MLA = False

        self.past_tokens = []
        self.num_hidden_layers = config.num_hidden_layers
        for idx in range(self.num_hidden_layers):
            # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
            # breaks when updating the cache.
            if isinstance(device, dict):
                target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
            else:
                target_device = device
            
            if self.is_MLA:
                new_layer_key_cache = torch.zeros(latent_shape, dtype=self.dtype, device=target_device)
                new_layer_value_cache = None
                torch._dynamo.mark_static_address(new_layer_key_cache)
            else:
                new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=target_device)
                new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=target_device)
                torch._dynamo.mark_static_address(new_layer_key_cache)
                torch._dynamo.mark_static_address(new_layer_value_cache)
                
            self.key_cache.append(new_layer_key_cache)
            self.value_cache.append(new_layer_value_cache)
            self.past_tokens.append(0)

    @property
    def max_batch_size(self):
        return self._max_batch_size

    @property
    def max_cache_len(self):
        return self._max_cache_len

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
        It is VERY important to index using a tensor, otherwise you introduce a copy to the device.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input
                to know how where to write in the cache.

        Return:
            A tuple containing the updated key and value states.
        """
        cache_position = cache_kwargs.get("cache_position")
        k_out = self.key_cache[layer_idx]
        v_out = self.value_cache[layer_idx]
        self.past_tokens[layer_idx] += cache_position.size(0)
        #print(cache_position)
        if self.is_MLA:
            if use_torch_npu:
                page_idx = cache_position // self.page_size_tensor
                page_offset = cache_position % self.page_size_tensor

                page_idx = page_idx.unsqueeze(0).expand(self.max_batch_size, -1)
                page_offset = page_offset.unsqueeze(0).expand(self.max_batch_size, -1)

                page_idx_offset = torch.arange(self.max_batch_size, device=page_idx.device) * self.max_pages_per_batch
                page_idx = page_idx + page_idx_offset.unsqueeze(1)

                combined = torch.cat([key_states, value_states], dim=-1)
                combined = combined.contiguous()
                # key shape (self.max_pages, self.page_size, 1, config.kv_lora_rank + config.qk_rope_head_dim)
                k_out[page_idx, page_offset] = combined
            else:
                page_idx = cache_position // self.page_size
                page_offset = cache_position % self.page_size
                # key shape (self.max_pages, self.page_size, 1, config.kv_lora_rank + config.qk_rope_head_dim)
                k_out[page_idx, page_offset, :, :self.kv_lora_rank] = key_states
                k_out[page_idx, page_offset, :, self.kv_lora_rank:] = value_states
            return k_out, self.page_table_list[layer_idx]
        else:
            k_out[:, :, cache_position] = key_states
            v_out[:, :, cache_position] = value_states
            return k_out, v_out

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states that were seen by the model."""
        # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
        # limit the check to the first batch member and head dimension.
        # TODO: deprecate this function in favor of `cache_position`
        return self.past_tokens[layer_idx]
    
    def change_seq_length(self, bias: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states that were seen by the model."""
        # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
        # limit the check to the first batch member and head dimension.
        # TODO: deprecate this function in favor of `cache_position`
        for layer_idx in range(self.num_hidden_layers):
            self.past_tokens[layer_idx] += bias

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states."""
        return self.max_cache_len
    
    def get_usable_length(self, kv_seq_len, layer_idx: Optional[int] = 0) -> int:
        return 0

    def reset(self):
        """Resets the cache values while preserving the objects"""
        for layer_idx in range(len(self.key_cache)):
            # In-place ops prevent breaking the static address
            self.key_cache[layer_idx].zero_()
            if self.value_cache[layer_idx] is not None:
                self.value_cache[layer_idx].zero_()
            self.past_tokens[layer_idx] = 0
        
        if use_torch_npu:
            self.position = [0]

    def remove_suffix(self, start_pos):
        for layer_idx in range(len(self.key_cache)):
            # In-place ops prevent breaking the static address
            if self.is_MLA:
                k_cache = self.key_cache[layer_idx]
                k_cache.view(-1, k_cache.shape[-1])[start_pos:].zero_()
            else:
                self.key_cache[layer_idx][..., start_pos:, :].zero_()
                self.value_cache[layer_idx][..., start_pos:, :].zero_()
            self.past_tokens[layer_idx] = start_pos
    
    def get_max_cache_shape(self) -> Tuple[int, int, int, int]:
        """Returns the maximum shape of the cache."""
        return self.max_cache_len

class KVC2StaticCache:
    """
    Static Cache class connect with KVC2
    remind: page_idx & page_offset info need to refs to forward batching, only contains KV Block Tensor here
    """
    def __init__(self, config: PretrainedConfig, max_batch_size, page_size: int = 256, dtype=torch.bfloat16, device=None) -> None:
        super().__init__()
        self.config = config
        self.dtype = dtype
        self.device = torch.device("npu:0")
        self.kv_lora_rank = config.kv_lora_rank
        self.max_batch_size = max_batch_size
        self.page_size = page_size
        self.k_caches = []
        self.v_caches = []

        self.num_hidden_layers = config.num_hidden_layers

        self.is_MLA = True if config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"] else False
        # kv cache stored in kvc2
        # self.past_tokens = []

    def load(self, inference_context):
        # assert self.is_MLA and len(inference_context.k_cache) == 1, "currently only support MLA and Cache Pool TP=1"
        from ktransformers.util.utils import get_current_device
        for i in range(self.config.num_hidden_layers):
            new_layer_key_cache = inference_context.k_cache[int(torch.distributed.get_rank())][i].to(get_current_device())
            torch._dynamo.mark_static_address(new_layer_key_cache)

            self.k_caches.append(
                new_layer_key_cache  # [TP_idx, layer_idx, page_idx, page_size, kv_head_num, kv_head_dim]
            )

            self.v_caches.append(None)
        self.max_cache_len = self.k_caches[0].shape[0] * self.k_caches[0].shape[1]  # page_len * page_size

    def update(
        self,
        combined: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                must have page_idx (`torch.Tensor`): & page_offset (`torch.Tensor`) & cache_position (`torch.Tensor`)

        Return:
            A tuple containing the updated key and value states.
        """
        page_idx, page_offset = cache_kwargs.get("page_idx"), cache_kwargs.get("page_offset")
        if page_idx is None or page_offset is None:
            raise ValueError('[ERROR] block info:page_idx & page_offset missing!')

        k_out = self.k_caches[layer_idx]
        assert self.is_MLA, "currently only support DeepSeekV3 on NPU balance server"

        if page_idx.dim() == 1:
            page_idx_tmp = page_idx.unsqueeze(0)
            page_offset_tmp = page_offset.unsqueeze(0)
        else:
             page_idx_tmp = page_idx
             page_offset_tmp = page_offset

        k_out[page_idx_tmp, page_offset_tmp] = combined
        return k_out, page_idx

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states that were seen by the model."""
        raise ValueError('kvc2 cache pool no longer hold seq_length info, refer to forward batching')

    def get_usable_length(self, kv_seq_len, layer_idx: Optional[int] = 0) -> int:
        return 0

    def change_seq_length(self, bias: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states that were seen by the model."""
        raise ValueError('kvc2 cache pool no longer hold seq_length info, refer to forward batching')

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states."""
        return self.max_cache_len

    def reset(self, inference_context):
        assert self.is_MLA and len(inference_context.k_cache) == 1, "currently only support MLA and Cache Pool TP=1"
        self.k_caches = []
        self.v_caches = []
        for i in range(self.config.num_hidden_layers):
            self.k_caches.append(
                inference_context.k_cache[0][i]
            )
            self.v_caches.append(None)
        self.max_cache_len = self.k_caches[0].shape[0] * self.k_caches[0].shape[1]  # page_len * page_size

    def get_page_table(self, mini_batch, bsz_tensors: torch.tensor = None, is_prefill=True):
        if is_prefill:
            # TODO add padding support
            q_lens = [mini_batch.p_q_len[idx] for idx in range(mini_batch.prefill_batch)]
            page_local_idx = -1 * torch.ones(mini_batch.prefill_batch, max(q_lens),
                                             dtype=mini_batch.p_position_ids.dtype, device=mini_batch.p_position_ids.device)
            page_offset = -1 * torch.ones_like(page_local_idx)
            # convert merged into batched
            start_ids = 0
            for i in range(mini_batch.prefill_batch):
                page_offset[i, 0:q_lens[i]] = mini_batch.p_position_ids[start_ids:start_ids+q_lens[i]] % self.page_size
                page_local_idx[i, 0:q_lens[i]] = mini_batch.p_position_ids[start_ids:start_ids+q_lens[i]] // self.page_size
                for j in range(q_lens[i]):
                    # get global page idx index by local page idx from block table, as followed decode
                    page_local_idx[i, j] = mini_batch.p_block_tables[i, page_local_idx[i, j]]
                start_ids += q_lens[i]
            page_idx = page_local_idx
            # only padding will cause page_local_idx/page_offset still have -1 value
            # you can use following code as check
            # indices = torch.where(page_offset == -1)
            # assert not indices[0].numel() > 0, 'there still have un-calculated page_idx value'
        else:
            page_local_idx = mini_batch.d_position_ids // self.page_size

            page_offset = mini_batch.d_position_ids % self.page_size
            
            for i in range(mini_batch.decode_batch):
                page_local_idx[i] = mini_batch.d_block_tables[i, page_local_idx[i]]
            
            page_idx = page_local_idx
            
        return page_idx, page_offset

class KDeepSeekV3Cache(nn.Module):
    def __init__(
        self,
        config: PretrainedConfig,
        page_size: int = 256,
        dtype=torch.bfloat16,
        device=torch.device("cuda:0"),
        
    ):
        super().__init__()
        self.config = config
        self.dtype = dtype
        self.device = device
        self.kv_lora_rank = config.kv_lora_rank
        self.page_size = page_size
        self.k_caches = []
        self.v_caches = []
        

    def load(self, inference_context: "sched_ext.InferenceContext"):
        
        for i in range(self.config.num_hidden_layers):
            self.k_caches.append(
                inference_context.k_cache[0][i] 
            )
        self.max_cache_len = self.k_caches[0].shape[0]*self.k_caches[0].shape[1]

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,

        page_idx: torch.Tensor,
        page_offset: torch.Tensor,

        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
        It is VERY important to index using a tensor, otherwise you introduce a copy to the device.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input
                to know how where to write in the cache.

        Return:
            A tuple containing the updated key and value states.
        """
        k_out = self.k_caches[layer_idx]

        k_out[page_idx, page_offset, :, :self.kv_lora_rank] = key_states.reshape(-1, *key_states.shape[2:])
        k_out[page_idx, page_offset, :, self.kv_lora_rank:] = value_states.reshape(-1, *value_states.shape[2:])
        return k_out

        
    def get_page_table(self, cache_position: torch.Tensor, q_indptr: torch.Tensor, kv_indptr: torch.Tensor, kv_indices: torch.Tensor, bsz_tensors: torch.tensor):
        page_offset = cache_position % self.page_size  
        page_idx_local = cache_position // self.page_size  
        query_ids = torch.zeros_like(cache_position)
        for i in range(len(q_indptr) - 1):
            start_idx = q_indptr[i]
            end_idx = q_indptr[i + 1]
            query_ids[start_idx:end_idx] = i
        page_idx = torch.zeros_like(page_idx_local)
        for i in range(bsz_tensors[0]):
            query_id = query_ids[i]
            local_block = page_idx_local[i]
            start_block = kv_indptr[query_id]
            if local_block < kv_indptr[query_id + 1] - kv_indptr[query_id]:
                page_idx[i] = kv_indices[start_block + local_block]
        
        return page_idx, page_offset
    
class KGQACache(nn.Module):
    def __init__(
        self,
        config: PretrainedConfig,
        page_size: int = 256,
        dtype=torch.bfloat16,
        device=torch.device("cuda:0"),
        
    ):
        super().__init__()
        self.config = config
        self.dtype = dtype
        self.device = device
        self.page_size = page_size
        self.k_caches = []
        self.v_caches = []
        

    def load(self, inference_context: "sched_ext.InferenceContext"):
        print(self.config.num_hidden_layers)
        for i in range(self.config.num_hidden_layers):
            self.k_caches.append(
                inference_context.k_cache[0][i] 
            )
            self.v_caches.append(
                inference_context.v_cache[0][i]
            )


        self.max_cache_len = self.k_caches[0].shape[0]*self.k_caches[0].shape[1]


    def get_page_table(self, cache_position: torch.Tensor, q_indptr: torch.Tensor, kv_indptr: torch.Tensor, kv_indices: torch.Tensor, bsz_tensors: torch.tensor):
        page_offset = cache_position % self.page_size  
        page_idx_local = cache_position // self.page_size  
        query_ids = torch.zeros_like(cache_position)
        for i in range(len(q_indptr) - 1):
            start_idx = q_indptr[i]
            end_idx = q_indptr[i + 1]
            query_ids[start_idx:end_idx] = i
        page_idx = torch.zeros_like(page_idx_local)
        for i in range(bsz_tensors[0]):
            query_id = query_ids[i]
            local_block = page_idx_local[i]
            start_block = kv_indptr[query_id]
            if local_block < kv_indptr[query_id + 1] - kv_indptr[query_id]:
                page_idx[i] = kv_indices[start_block + local_block]
        
        return page_idx, page_offset

    def get_k_cache(self, layer_idx):
        return self.k_caches[layer_idx]

    def get_v_cache(self, layer_idx):
        return self.v_caches[layer_idx]


class KVC2Qwen3Cache(nn.Module):

    def __init__(self, config, max_batch_size, page_size=256,
                 dtype=torch.bfloat16, device=None):
        super().__init__()
        self.config = config
        self.max_batch_size = max_batch_size
        self.page_size = page_size
        self.dtype = dtype
        self.device = device if device else torch.device("npu:0")

        self.num_layers = config.num_hidden_layers
        self.num_kv_heads = config.num_key_value_heads
        self.head_dim = config.head_dim

        self.k_caches = []
        self.v_caches = []


    # ------------------------- 绑定到底层 kvc2 pool -------------------------

    def load(self, inference_context):
        from ktransformers.util.utils import get_current_device
        dev = get_current_device()

        self.k_caches = []
        self.v_caches = []

        rank = (
            torch.distributed.get_rank()
            if (torch.distributed.is_available() and torch.distributed.is_initialized())
            else 0
        )

        for i in range(self.num_layers):
            k_buf = inference_context.k_cache[rank][i].to(dev).to(self.dtype)
            v_buf = inference_context.v_cache[rank][i].to(dev).to(self.dtype)

            torch._dynamo.mark_static_address(k_buf)
            torch._dynamo.mark_static_address(v_buf)

            self.k_caches.append(k_buf)
            self.v_caches.append(v_buf)

        # num_pages * page_size
        self.max_cache_len = self.k_caches[0].shape[0] * self.k_caches[0].shape[1]

    # ------------------------- 写 KV -------------------------
    @torch.no_grad()
    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ):
        if cache_kwargs is None:
            raise ValueError("[KVC2Qwen3Cache] cache_kwargs must contain page_idx & page_offset")

        page_idx: Optional[torch.Tensor] = cache_kwargs.get("page_idx", None)
        page_offset: Optional[torch.Tensor] = cache_kwargs.get("page_offset", None)

        if page_idx is None or page_offset is None:
            raise ValueError("[KVC2Qwen3Cache] page_idx & page_offset are required in cache_kwargs")

        k_out = self.k_caches[layer_idx]
        v_out = self.v_caches[layer_idx]

        # -------- 1) 修正维度顺序：[B, KvH, Q, D] -> [B, Q, KvH, D] --------
        if key_states.dim() == 4 and key_states.shape[1] == self.num_kv_heads:
            key_states = key_states.transpose(1, 2).contiguous()
            value_states = value_states.transpose(1, 2).contiguous()

        if key_states.shape != value_states.shape:
            raise ValueError(
                f"[KVC2Qwen3Cache] key_states.shape {key_states.shape} "
                f"!= value_states.shape {value_states.shape}"
            )

        if key_states.dim() != 4:
            raise ValueError(
                f"[KVC2Qwen3Cache] expect key_states dim=4, got {key_states.dim()} "
                f"(shape={key_states.shape})"
            )

        bsz, q_len, kv_heads, head_dim = key_states.shape

        if kv_heads != self.num_kv_heads or head_dim != self.head_dim:
            raise ValueError(
                f"[KVC2Qwen3Cache] KV shape mismatch: "
                f"got num_kv_heads={kv_heads}, head_dim={head_dim}, "
                f"expected num_kv_heads={self.num_kv_heads}, head_dim={self.head_dim}"
            )

        # -------- 2) flatten page_idx / page_offset 为一维 --------
        page_idx = page_idx.reshape(-1)
        page_offset = page_offset.reshape(-1)

        # -------- 3) flatten KV，并强制 dtype 与 cache 对齐 --------
        val_dtype = k_out.dtype
        flat_k = key_states.to(val_dtype).reshape(-1, kv_heads, head_dim)
        flat_v = value_states.to(val_dtype).reshape(-1, kv_heads, head_dim)

        # -------- 4) 真正写入 K / V --------
        # k_out / v_out: [num_pages, page_size, num_kv_heads, head_dim]
        k_out[page_idx, page_offset] = flat_k
        v_out[page_idx, page_offset] = flat_v

    # ------------------------- get K/V -------------------------
    def get_k_cache(self, layer_idx):
        return self.k_caches[layer_idx]

    def get_v_cache(self, layer_idx):
        return self.v_caches[layer_idx]

    # ------------------------- page table 计算 -------------------------
    def get_page_table(
        self,
        mini_batch,
        bsz_tensors: torch.Tensor = None,
        is_prefill: bool = True,
    ):
        if is_prefill:
            # prefill: merged positions => batched (B, T_chunk)
            q_lens = [int(mini_batch.p_q_len[idx]) for idx in range(mini_batch.prefill_batch)]
            if len(q_lens) == 0:
                return None, None

            max_q_len = max(q_lens)

            page_local_idx = -1 * torch.ones(
                mini_batch.prefill_batch,
                max_q_len,
                dtype=mini_batch.p_position_ids.dtype,
                device=mini_batch.p_position_ids.device,
            )
            page_offset = -1 * torch.ones_like(page_local_idx)

            start_ids = 0
            for i in range(mini_batch.prefill_batch):
                cur_len = q_lens[i]
                pos = mini_batch.p_position_ids[start_ids:start_ids + cur_len]  # global pos of this chunk

                # local block + offset by page_size
                page_offset[i, 0:cur_len] = pos % self.page_size
                page_local_idx[i, 0:cur_len] = pos // self.page_size

                # local block -> global page id via block_tables
                for j in range(cur_len):
                    blk = page_local_idx[i, j]
                    page_local_idx[i, j] = mini_batch.p_block_tables[i, blk]

                start_ids += cur_len

            page_idx = page_local_idx
        else:
            # decode: decode_batch = 当前 step 的 batch_size, 每条样本通常 1 个 token
            page_local_idx = mini_batch.d_position_ids // self.page_size
            page_offset = mini_batch.d_position_ids % self.page_size

            for i in range(mini_batch.decode_batch):
                blk = page_local_idx[i]
                page_local_idx[i] = mini_batch.d_block_tables[i, blk]

            page_idx = page_local_idx

        return page_idx, page_offset


================================================
FILE: archive/ktransformers/models/custom_modeling_deepseek_v2.py
================================================
import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.models.custom_cache import KDeepSeekV3Cache
from  ktransformers.models.modeling_deepseek import DeepseekV2Model,  DeepseekV2PreTrainedModel
from ktransformers.models.configuration_deepseek import DeepseekV2Config


torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
import flashinfer

class KDeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):

    kv_cache: KDeepSeekV3Cache
    use_cuda_graph = False
    def __init__(
        self,
        config,
        kv_cache,

    ):
        super().__init__(config)
        self.model = DeepseekV2Model(config)
        self.config = config
        self.kv_cache = kv_cache

        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        

    def init_wrapper(self, use_cuda_graph, device, max_batch_size, max_pages):
        self.use_cuda_graph = use_cuda_graph
        self.workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8).to(0)
        self.qo_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
        self.paged_kv_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
        self.paged_kv_indices_buf = torch.empty((max_pages,), dtype=torch.int32, device=device)
        self.paged_kv_len_buf = torch.empty((max_batch_size,), dtype=torch.int32, device=device)

		
        self.wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(
            self.workspace_buffer, use_cuda_graph=use_cuda_graph,
            qo_indptr=self.qo_indptr_buf,kv_indptr=self.paged_kv_indptr_buf,
            kv_indices=self.paged_kv_indices_buf,kv_len_arr=self.paged_kv_len_buf,
            backend = "fa2",
        )

    def batch_embeddings(self, batch: ForwardBatchInput, device="cuda:0"):
        features = []
        for i in range(batch.batch_size):
            tokens = batch.minibatch.tokens.contiguous()
            feature = (
                self.model.embed_tokens(tokens.to(torch.device('cpu')))
                .to(torch.bfloat16)
                .to(device=device)
            )
            features.append(feature)

        return features


    def forward(
        self,
        batch: ForwardBatchInput | None = None,
        features: List[torch.Tensor] | None = None,
        bsz_tensors: torch.Tensor | None = None,
        num_tokens_tensors: torch.Tensor | None = None,
        page_idx: torch.Tensor | None = None,
        page_offset: torch.Tensor | None = None,
    ) -> ForwardBatchOutput:
        current_stream = torch.cuda.current_stream()

        forward_batch_output = ForwardBatchOutput()

        
        hidden_states = features[0]


        with torch.cuda.stream(current_stream):
            residual = torch.zeros_like(hidden_states)
            for i, decode_layer in enumerate(self.model.layers):
                if self.model.transfer_map is not None and i in self.model.transfer_map:
                    prev_stream = torch.cuda.current_stream()
                    cur_device = self.model.transfer_map[i]
                    if cur_device not in self.model.stream_device_map:
                        self.model.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
                    torch.cuda.set_device(cur_device)
                    self.model.stream_device_map[cur_device].wait_stream(prev_stream)
                    torch.cuda.set_stream(self.model.stream_device_map[cur_device])
                    hidden_states = hidden_states.to(
                        self.model.transfer_map[i], non_blocking=True
                    )

                    batch.minibatch.position_ids = (
                        batch.minibatch.position_ids.to(self.model.transfer_map[i], non_blocking=True)
                        if batch.minibatch.position_ids is not None
                        else None
                    )
                hidden_states, residual = decode_layer.input_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.self_attn(hidden_states, self.kv_cache, 
                                                       position_ids=batch.minibatch.position_ids, 
                                                       wrapper=self.wrapper, bsz_tensors=num_tokens_tensors, 
                                                       cache_position=batch.minibatch.positions, 
                                                       batch_indices=batch.minibatch.batch_indices,
                                                       kv_indices=batch.minibatch.kv_indices,
                                                       kv_indptr=batch.minibatch.kv_indptr,
                                                       kv_last_page_len=batch.minibatch.kv_last_page_len,
                                                       q_indptr=batch.minibatch.q_indptr,
                                                       page_idx=page_idx,
                                                       page_offset=page_offset
                                                       )

                hidden_states, residual = decode_layer.post_attention_layernorm(hidden_states, num_tokens_tensors, residual)
                if i < 3:
                    hidden_states = decode_layer.mlp(hidden_states, num_tokens_tensors)
                else:
                    hidden_states = decode_layer.mlp(hidden_states.unsqueeze(0), num_tokens_tensors)
                    hidden_states = hidden_states.squeeze(0)
        forward_batch_output = ForwardBatchOutput()
        assert  batch.batch_size == 1
        with torch.cuda.stream(current_stream):

            local_logit = self.lm_head(self.model.norm(hidden_states[batch.minibatch.logits_start], num_tokens_tensors, residual[batch.minibatch.logits_start])[0])
            # local_logit = local_logit[batch.minibatch.logits_start]
            forward_batch_output.logits.append(local_logit)

        return forward_batch_output
    

    def flash_infer_attn_plan(self, batch: ForwardBatchInput, bsz_tensors, num_tokens_tensors,
        num_heads: int,
        head_dim_ckv: int,
        head_dim_kpe: int,
        page_size: int,
        causal: bool,
        sm_scale: float,
        q_data_type: torch.dtype,
        kv_data_type: torch.dtype,):
        minibatch = batch.minibatch
        
        self.wrapper.plan(minibatch.q_indptr, minibatch.kv_indptr, minibatch.kv_indices, 
                          minibatch.kv_len, num_heads, head_dim_ckv, head_dim_kpe, page_size, causal, sm_scale, q_data_type, kv_data_type)
        

================================================
FILE: archive/ktransformers/models/custom_modeling_deepseek_v3.py
================================================
"""
Date: 2024-11-06 10:05:11
LastEditors: djw
LastEditTime: 2024-11-13 07:50:51
"""

import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.models.custom_cache import KDeepSeekV3Cache
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3Model,  DeepseekV3PreTrainedModel
from ktransformers.models.configuration_deepseek_v3 import DeepseekV3Config


torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
import flashinfer

class KDeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):

    cache: KDeepSeekV3Cache
    use_cuda_graph = False
    def __init__(
        self,
        config: DeepseekV3Config,
        cache,
    ):
        super().__init__(config)
        self.model = DeepseekV3Model(config)
        self.config = config
        self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        
    def init_wrapper(self, use_cuda_graph, device, max_batch_size, max_pages):
        self.use_cuda_graph = use_cuda_graph
        self.workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8).to(0)
        self.qo_indptr_buf = torch.empty((max_batch_size+2,), dtype=torch.int32, device=device)
        self.paged_kv_indptr_buf = torch.empty((max_batch_size+2,), dtype=torch.int32, device=device)
        self.paged_kv_indices_buf = torch.empty((max_pages,), dtype=torch.int32, device=device)
        self.paged_kv_len_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
        self.bsz_tensor_buf = torch.empty((1, ), dtype=torch.int32, device=device)
		

        self.wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(
            self.workspace_buffer, use_cuda_graph=use_cuda_graph,
            qo_indptr=self.qo_indptr_buf,kv_indptr=self.paged_kv_indptr_buf,
            kv_indices=self.paged_kv_indices_buf,kv_len_arr=self.paged_kv_len_buf,
            bsz_tensor=self.bsz_tensor_buf,
            backend = "fa2",
        )

    def batch_embeddings(self, batch: ForwardBatchInput, device="cuda:0"):
        features = []
        for i in range(batch.batch_size):
            tokens = batch.minibatch.tokens.contiguous()
            feature = (
                self.model.embed_tokens(tokens.to(torch.device('cpu')))
                .to(torch.bfloat16)
                .to(device=device)
            )
            features.append(feature)

        return features


    def forward(
        self,
        batch: ForwardBatchInput | None = None,
        features: List[torch.Tensor] | None = None,
        bsz_tensors: torch.Tensor | None = None,
        num_tokens_tensors: torch.Tensor | None = None,
        page_idx: torch.Tensor | None = None,
        page_offset: torch.Tensor | None = None,
        cuda_graph_idx: int | None = -1
    ) -> ForwardBatchOutput:
        current_stream = torch.cuda.current_stream()

        forward_batch_output = ForwardBatchOutput()

        
        hidden_states = features[0]

        with torch.cuda.stream(current_stream):
            residual = torch.zeros_like(hidden_states)
            for i, decode_layer in enumerate(self.model.layers):
                # can't use now, only one flashinfer wrapper
                if self.model.transfer_map is not None and i in self.model.transfer_map:
                    prev_stream = torch.cuda.current_stream()
                    cur_device = self.model.transfer_map[i]
                    if cur_device not in self.model.stream_device_map:
                        self.model.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
                    torch.cuda.set_device(cur_device)
                    self.model.stream_device_map[cur_device].wait_stream(prev_stream)
                    torch.cuda.set_stream(self.model.stream_device_map[cur_device])
                    hidden_states = hidden_states.to(
                        self.model.transfer_map[i], non_blocking=True
                    )

                    batch.minibatch.position_ids = (
                        batch.minibatch.position_ids.to(self.model.transfer_map[i], non_blocking=True)
                        if batch.minibatch.position_ids is not None
                        else None
                    )
                hidden_states, residual = decode_layer.input_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.self_attn(hidden_states, self.cache, 
                                                       position_ids=batch.minibatch.position_ids, 
                                                       wrapper=self.wrapper, num_tokens_tensors=num_tokens_tensors, 
                                                       page_idx=page_idx,
                                                       page_offset=page_offset
                                                       )

                hidden_states, residual = decode_layer.post_attention_layernorm(hidden_states, num_tokens_tensors, residual)
                if i < self.config.first_k_dense_replace:
                    hidden_states = decode_layer.mlp(hidden_states, num_tokens_tensors)
                else:
                    hidden_states = decode_layer.mlp(hidden_states.unsqueeze(0), num_tokens_tensors, cuda_graph_idx)
                    hidden_states = hidden_states.squeeze(0)
        forward_batch_output = ForwardBatchOutput()
        with torch.cuda.stream(current_stream):
            local_logit = self.lm_head(self.model.norm(hidden_states, num_tokens_tensors, residual)[0], num_tokens_tensors)
            forward_batch_output.logits.append(local_logit)

        return forward_batch_output
    

    def flash_infer_attn_plan(self, batch: ForwardBatchInput, bsz_tensors, num_tokens_tensors,
        num_heads: int,
        head_dim_ckv: int,
        head_dim_kpe: int,
        page_size: int,
        causal: bool,
        sm_scale: float,
        q_data_type: torch.dtype,
        kv_data_type: torch.dtype,):
        minibatch = batch.minibatch
        self.wrapper.plan(minibatch.q_indptr, minibatch.kv_indptr, minibatch.kv_indices, 
                          minibatch.kv_len, num_heads, head_dim_ckv, head_dim_kpe, page_size, causal, sm_scale, q_data_type, kv_data_type, bsz_tensors)
        

================================================
FILE: archive/ktransformers/models/custom_modeling_glm4_moe.py
================================================
"""
Date: 2024-11-06 10:05:11
LastEditors: djw
LastEditTime: 2024-11-13 07:50:51
"""

import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.models.custom_cache import KGQACache
from ktransformers.models.modeling_glm4_moe import Glm4MoeModel,  Glm4MoePreTrainedModel
from ktransformers.models.configuration_glm4_moe import Glm4MoeConfig
from ktransformers.operators.flashinfer_batch_prefill_wrapper import flashInferAttn

torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
import flashinfer

class KGlm4MoeForCausalLM(Glm4MoePreTrainedModel):

    cache: KGQACache
    use_cuda_graph = False
    def __init__(
        self,
        config: Glm4MoeConfig,
        cache,
    ):

        super().__init__(config)
        self.model = Glm4MoeModel(config)
        self.config = config
        self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.attn = [None] * 100
        
    def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
        self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)


    def batch_embeddings(self, batch: ForwardBatchInput, device="cuda:0"):
        features = []
        for i in range(batch.batch_size):
            tokens = batch.minibatch.tokens.contiguous()
            feature = (
                self.model.embed_tokens(tokens.to(torch.device('cpu')))
                .to(torch.bfloat16)
                .to(device=device)
            )
            features.append(feature)

        return features


    def forward(
        self,
        batch: ForwardBatchInput | None = None,
        features: List[torch.Tensor] | None = None,
        bsz_tensors: torch.Tensor | None = None,
        num_tokens_tensors: torch.Tensor | None = None,
        page_idx: torch.Tensor | None = None,
        page_offset: torch.Tensor | None = None,
        cuda_graph_idx: int | None = 0
    ) -> ForwardBatchOutput:
        current_stream = torch.cuda.current_stream()

        forward_batch_output = ForwardBatchOutput()

        
        hidden_states = features[0]
        self.attn[cuda_graph_idx].calc_batch_indices(hidden_states.shape[0])

        freqs_cis = self.model.rotary_emb(hidden_states.unsqueeze(0), batch.minibatch.position_ids.unsqueeze(0))


        with torch.cuda.stream(current_stream):
            residual = torch.zeros_like(hidden_states)
            for i, decode_layer in enumerate(self.model.layers):

                hidden_states, residual = decode_layer.input_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.self_attn(hidden_states, self.cache, 
                                                       freqs_cis,
                                                       wrapper=self.attn[cuda_graph_idx], bsz_tensors=num_tokens_tensors, 
                                                       position_ids=batch.minibatch.position_ids
                                                       )

                hidden_states, residual = decode_layer.post_attention_layernorm(hidden_states, num_tokens_tensors, residual)
                if i < self.model.config.first_k_dense_replace:
                    hidden_states = decode_layer.mlp(hidden_states, num_tokens_tensors)
                else:
                    hidden_states = decode_layer.mlp(hidden_states, num_tokens_tensors, cuda_graph_idx)
                    # hidden_states = hidden_states.squeeze(0)

        forward_batch_output = ForwardBatchOutput()
        with torch.cuda.stream(current_stream):
            local_logit = self.lm_head(self.model.norm(hidden_states, num_tokens_tensors, residual)[0], num_tokens_tensors)
            forward_batch_output.logits.append(local_logit)

        return forward_batch_output
    

    def flash_infer_attn_plan(self, batch: ForwardBatchInput, bsz_tensors, num_tokens_tensors,
        num_q_heads: int,
        num_kv_heads: int,
        head_dim: int,
        page_size: int,
        causal: bool,
        q_data_type: torch.dtype,
        kv_data_type: torch.dtype,
        cuda_graph_idx: int = 0
        ):
        minibatch = batch.minibatch
        self.attn[cuda_graph_idx].plan(minibatch.q_indptr, minibatch.kv_indptr, minibatch.kv_indices, 
                          minibatch.kv_last_page_len, bsz_tensors, num_tokens_tensors, num_q_heads, num_kv_heads, head_dim, page_size, causal=causal, q_data_type=q_data_type, kv_data_type=kv_data_type)
        

================================================
FILE: archive/ktransformers/models/custom_modeling_qwen2_moe.py
================================================
"""
Date: 2024-11-06 10:05:11
LastEditors: djw
LastEditTime: 2024-11-13 07:50:51
"""

import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.models.custom_cache import KGQACache
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeModel, Qwen2MoePreTrainedModel
from ktransformers.models.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.operators.flashinfer_batch_prefill_wrapper import flashInferAttn

torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
import flashinfer

class KQwen2MoeForCausalLM(Qwen2MoePreTrainedModel):

    cache: KGQACache
    use_cuda_graph = False
    def __init__(
        self,
        config: Qwen2MoeConfig,
        cache,
    ):
        super().__init__(config)
        self.model = Qwen2MoeModel(config)
        self.config = config
        self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.attn = [None] * 100
        
    def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
        self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)


    def batch_embeddings(self, batch: ForwardBatchInput, device="cuda:0"):
        features = []
        for i in range(batch.batch_size):
            tokens = batch.minibatch.tokens.contiguous()
            feature = (
                self.model.embed_tokens(tokens.to(torch.device('cpu')))
                .to(torch.bfloat16)
                .to(device=device)
            )
            features.append(feature)

        return features


    def forward(
        self,
        batch: ForwardBatchInput | None = None,
        features: List[torch.Tensor] | None = None,
        bsz_tensors: torch.Tensor | None = None,
        num_tokens_tensors: torch.Tensor | None = None,
        page_idx: torch.Tensor | None = None,
        page_offset: torch.Tensor | None = None,
        cuda_graph_idx: int | None = 0
    ) -> ForwardBatchOutput:
        current_stream = torch.cuda.current_stream()

        forward_batch_output = ForwardBatchOutput()

        
        hidden_states = features[0]
        self.attn[cuda_graph_idx].calc_batch_indices(hidden_states.shape[0])

        with torch.cuda.stream(current_stream):
            residual = torch.zeros_like(hidden_states)
            for i, decode_layer in enumerate(self.model.layers):
                if self.model.transfer_map is not None and i in self.model.transfer_map:
                    prev_stream = torch.cuda.current_stream()
                    cur_device = self.model.transfer_map[i]
                    if cur_device not in self.model.stream_device_map:
                        self.model.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
                    torch.cuda.set_device(cur_device)
                    self.model.stream_device_map[cur_device].wait_stream(prev_stream)
                    torch.cuda.set_stream(self.model.stream_device_map[cur_device])
                    hidden_states = hidden_states.to(
                        self.model.transfer_map[i], non_blocking=True
                    )

                    batch.minibatch.position_ids = (
                        batch.minibatch.position_ids.to(self.model.transfer_map[i], non_blocking=True)
                        if batch.minibatch.position_ids is not None
                        else None
                    )
                hidden_states, residual = decode_layer.input_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.self_attn(hidden_states, self.cache, 
                                                       position_ids=batch.minibatch.position_ids, 
                                                       wrapper=self.attn[cuda_graph_idx], bsz_tensors=num_tokens_tensors, 
                                                       page_idx=page_idx,
                                                       page_offset=page_offset
                                                       )

                hidden_states, residual = decode_layer.post_attention_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.mlp(hidden_states.unsqueeze(0), num_tokens_tensors, cuda_graph_idx)
                hidden_states = hidden_states.squeeze(0)
        forward_batch_output = ForwardBatchOutput()
        with torch.cuda.stream(current_stream):
            local_logit = self.lm_head(self.model.norm(hidden_states, num_tokens_tensors, residual)[0], num_tokens_tensors)
            forward_batch_output.logits.append(local_logit)

        return forward_batch_output
    

    def flash_infer_attn_plan(self, batch: ForwardBatchInput, bsz_tensors, num_tokens_tensors,
        num_q_heads: int,
        num_kv_heads: int,
        head_dim: int,
        page_size: int,
        causal: bool,
        q_data_type: torch.dtype,
        kv_data_type: torch.dtype,
        cuda_graph_idx: int = 0
        ):
        minibatch = batch.minibatch
        self.attn[cuda_graph_idx].plan(minibatch.q_indptr, minibatch.kv_indptr, minibatch.kv_indices, 
                          minibatch.kv_last_page_len, bsz_tensors, num_tokens_tensors,num_q_heads, num_kv_heads, head_dim, page_size, causal=causal, q_data_type=q_data_type, kv_data_type=kv_data_type)
        

================================================
FILE: archive/ktransformers/models/custom_modeling_qwen3_moe.py
================================================
"""
Date: 2024-11-06 10:05:11
LastEditors: djw
LastEditTime: 2024-11-13 07:50:51
"""

import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.models.custom_cache import KGQACache
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeModel, Qwen3MoePreTrainedModel
from ktransformers.models.configuration_qwen3_moe import Qwen3MoeConfig
from ktransformers.operators.flashinfer_batch_prefill_wrapper import flashInferAttn

torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
import flashinfer

class KQwen3MoeForCausalLM(Qwen3MoePreTrainedModel):

    cache: KGQACache
    use_cuda_graph = False
    def __init__(
        self,
        config: Qwen3MoeConfig,
        cache = None,
    ):
        super().__init__(config)
        self.model = Qwen3MoeModel(config)
        self.config = config
        self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.attn = [None] * 100
        
    def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
        self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)


    def batch_embeddings(self, batch: ForwardBatchInput, device="cuda:0"):
        features = []
        for i in range(batch.batch_size):
            tokens = batch.minibatch.tokens.contiguous()
            feature = (
                self.model.embed_tokens(tokens.to(torch.device('cpu')))
                .to(torch.bfloat16)
                .to(device=device)
            )
            features.append(feature)

        return features


    def forward(
        self,
        batch: ForwardBatchInput | None = None,
        features: List[torch.Tensor] | None = None,
        bsz_tensors: torch.Tensor | None = None,
        num_tokens_tensors: torch.Tensor | None = None,
        page_idx: torch.Tensor | None = None,
        page_offset: torch.Tensor | None = None,
        cuda_graph_idx: int | None = 0
    ) -> ForwardBatchOutput:
        current_stream = torch.cuda.current_stream()

        forward_batch_output = ForwardBatchOutput()

        
        hidden_states = features[0]
        self.attn[cuda_graph_idx].calc_batch_indices(hidden_states.shape[0])

        with torch.cuda.stream(current_stream):
            residual = torch.zeros_like(hidden_states)
            for i, decode_layer in enumerate(self.model.layers):
                if self.model.transfer_map is not None and i in self.model.transfer_map:
                    prev_stream = torch.cuda.current_stream()
                    cur_device = self.model.transfer_map[i]
                    if cur_device not in self.model.stream_device_map:
                        self.model.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
                    torch.cuda.set_device(cur_device)
                    self.model.stream_device_map[cur_device].wait_stream(prev_stream)
                    torch.cuda.set_stream(self.model.stream_device_map[cur_device])
                    hidden_states = hidden_states.to(
                        self.model.transfer_map[i], non_blocking=True
                    )

                    batch.minibatch.position_ids = (
                        batch.minibatch.position_ids.to(self.model.transfer_map[i], non_blocking=True)
                        if batch.minibatch.position_ids is not None
                        else None
                    )
                hidden_states, residual = decode_layer.input_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.self_attn(hidden_states, self.cache, 
                                                       position_ids=batch.minibatch.position_ids, 
                                                       wrapper=self.attn[cuda_graph_idx], bsz_tensors=num_tokens_tensors, 
                                                       page_idx=page_idx,
                                                       page_offset=page_offset
                                                       )

                hidden_states, residual = decode_layer.post_attention_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.mlp(hidden_states.unsqueeze(0), num_tokens_tensors, cuda_graph_idx)
                hidden_states = hidden_states.squeeze(0)
        forward_batch_output = ForwardBatchOutput()
        with torch.cuda.stream(current_stream):
            local_logit = self.lm_head(self.model.norm(hidden_states, num_tokens_tensors, residual)[0], num_tokens_tensors)
            forward_batch_output.logits.append(local_logit)

        return forward_batch_output
    

    def flash_infer_attn_plan(self, batch: ForwardBatchInput, bsz_tensors, num_tokens_tensors,
        num_q_heads: int,
        num_kv_heads: int,
        head_dim: int,
        page_size: int,
        causal: bool,
        q_data_type: torch.dtype,
        kv_data_type: torch.dtype,
        cuda_graph_idx: int = 0
        ):
        minibatch = batch.minibatch
        self.attn[cuda_graph_idx].plan(minibatch.q_indptr, minibatch.kv_indptr, minibatch.kv_indices, 
                          minibatch.kv_last_page_len, bsz_tensors, num_tokens_tensors, num_q_heads, num_kv_heads, head_dim, page_size, causal=causal, q_data_type=q_data_type, kv_data_type=kv_data_type)
        

================================================
FILE: archive/ktransformers/models/custom_modeling_qwen3_next.py
================================================
"""
Date: 2024-11-06 10:05:11
LastEditors: djw
LastEditTime: 2024-11-13 07:50:51
"""

import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.models.custom_cache import KGQACache
from ktransformers.models.modeling_qwen3_next import Qwen3NextModel, Qwen3NextPreTrainedModel
from ktransformers.models.configuration_qwen3_next import Qwen3NextConfig
from ktransformers.operators.flashinfer_batch_prefill_wrapper import flashInferAttn

torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
import flashinfer

class KQwen3NextForCausalLM(Qwen3NextPreTrainedModel):

    cache: KGQACache
    use_cuda_graph = False
    def __init__(
        self,
        config: Qwen3NextConfig,
        cache = None,
    ):
        super().__init__(config)
        self.model = Qwen3NextModel(config)
        self.config = config
        self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.attn = [None] * 100
        self.conv_states = [None for _ in range(config.num_hidden_layers)]
        self.recurrent_states = [None for _ in range(config.num_hidden_layers)]
        
    def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
        self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)


    def batch_embeddings(self, batch: ForwardBatchInput, device="cuda:0"):
        features = []
        for i in range(batch.batch_size):
            tokens = batch.minibatch.tokens.contiguous()
            feature = (
                self.model.embed_tokens(tokens.to(torch.device('cpu')))
                .to(torch.bfloat16)
                .to(device=device)
            )
            features.append(feature)

        return features

    def reset_conv_states(self):
        for i in range(self.config.num_hidden_layers):
            self.conv_states[i] = None
            self.recurrent_states[i] = None


    def forward(
        self,
        batch: ForwardBatchInput | None = None,
        features: List[torch.Tensor] | None = None,
        bsz_tensors: torch.Tensor | None = None,
        num_tokens_tensors: torch.Tensor | None = None,
        page_idx: torch.Tensor | None = None,
        page_offset: torch.Tensor | None = None,
        cuda_graph_idx: int | None = 0
    ) -> ForwardBatchOutput:
        current_stream = torch.cuda.current_stream()

        forward_batch_output = ForwardBatchOutput()

        q_len = features[0].size(0)
        if q_len > 1:
            self.reset_conv_states()

        hidden_states = features[0]
        self.attn[cuda_graph_idx].calc_batch_indices(hidden_states.shape[0])
        freqs_cis = self.model.rotary_emb(hidden_states.unsqueeze(0), batch.minibatch.position_ids.unsqueeze(0))

        residual = torch.zeros_like(hidden_states)
        for i, decode_layer in enumerate(self.model.layers):
            hidden_states = hidden_states.contiguous().clone()   # 断开别名 + 连续
            residual      = residual.contiguous().clone()

            hidden_states, residual = decode_layer.input_layernorm(hidden_states, num_tokens_tensors, residual)
            hidden_states = hidden_states.contiguous()
            residual = residual.contiguous()

            if self.config.layer_types[i] != "linear_attention":
                hidden_states = decode_layer.self_attn(hidden_states, self.cache, freqs_cis,
                                                    wrapper=self.attn[cuda_graph_idx],
                                                    bsz_tensors=num_tokens_tensors)
            else:
                hs = hidden_states.unsqueeze(0).contiguous().clone()
                hs = decode_layer.linear_attn(hs, self.conv_states, self.recurrent_states,
                                            bsz_tensors=num_tokens_tensors)
                hidden_states = hs.squeeze(0).contiguous()

            hidden_states, residual = decode_layer.post_attention_layernorm(hidden_states, num_tokens_tensors, residual)

            hs2 = hidden_states.unsqueeze(0).contiguous().clone()
            hidden_states = decode_layer.mlp(hs2, num_tokens_tensors, cuda_graph_idx).squeeze(0).contiguous()

            if not torch.isfinite(hidden_states).all():
                raise RuntimeError(f"NaN after layer {i}")
            # print(f"Layer {i} output: {hidden_states}")
        forward_batch_output = ForwardBatchOutput()
        with torch.cuda.stream(current_stream):
            local_logit = self.lm_head(self.model.norm(hidden_states, num_tokens_tensors, residual)[0], num_tokens_tensors)
            forward_batch_output.logits.append(local_logit)

        return forward_batch_output
    

    def flash_infer_attn_plan(self, batch: ForwardBatchInput, bsz_tensors, num_tokens_tensors,
        num_q_heads: int,
        num_kv_heads: int,
        head_dim: int,
        page_size: int,
        causal: bool,
        q_data_type: torch.dtype,
        kv_data_type: torch.dtype,
        cuda_graph_idx: int = 0
        ):
        minibatch = batch.minibatch
        self.attn[cuda_graph_idx].plan(minibatch.q_indptr, minibatch.kv_indptr, minibatch.kv_indices, 
                          minibatch.kv_last_page_len, bsz_tensors, num_tokens_tensors, num_q_heads, num_kv_heads, head_dim, page_size, causal=causal, q_data_type=q_data_type, kv_data_type=kv_data_type)
        

================================================
FILE: archive/ktransformers/models/custom_modeling_smallthinker.py
================================================
"""
Date: 2024-11-06 10:05:11
LastEditors: djw
LastEditTime: 2024-11-13 07:50:51
"""

import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.models.custom_cache import KGQACache
from ktransformers.models.modeling_smallthinker import SmallthinkerModel,  SmallthinkerPreTrainedModel
from ktransformers.models.configuration_smallthinker import SmallthinkerConfig
from ktransformers.operators.flashinfer_batch_prefill_wrapper import flashInferAttn

torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
import flashinfer

class KSmallThinkerForCausalLM(SmallthinkerPreTrainedModel):

    cache: KGQACache
    use_cuda_graph = False
    def __init__(
        self,
        config: SmallthinkerConfig,
        cache,
    ):

        super().__init__(config)
        self.model = SmallthinkerModel(config)
        self.config = config
        self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.attn = [None] * 100
        
    def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
        self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)


    def batch_embeddings(self, batch: ForwardBatchInput, device="cuda:0"):
        features = []
        for i in range(batch.batch_size):
            tokens = batch.minibatch.tokens.contiguous()
            feature = (
                self.model.embed_tokens(tokens.to(torch.device('cpu')))
                .to(torch.bfloat16)
                .to(device=device)
            )
            features.append(feature)

        return features


    def forward(
        self,
        batch: ForwardBatchInput | None = None,
        features: List[torch.Tensor] | None = None,
        bsz_tensors: torch.Tensor | None = None,
        num_tokens_tensors: torch.Tensor | None = None,
        page_idx: torch.Tensor | None = None,
        page_offset: torch.Tensor | None = None,
        cuda_graph_idx: int | None = 0
    ) -> ForwardBatchOutput:
        current_stream = torch.cuda.current_stream()

        forward_batch_output = ForwardBatchOutput()

        
        hidden_states = features[0]
        self.attn[cuda_graph_idx].calc_batch_indices(hidden_states.shape[0])

        freqs_cis = self.model.rotary_emb(hidden_states.unsqueeze(0), batch.minibatch.position_ids.unsqueeze(0))

        with torch.cuda.stream(current_stream):
            residual = torch.zeros_like(hidden_states)
            for i, decode_layer in enumerate(self.model.layers):
                router_input = hidden_states
                hidden_states, residual = decode_layer.input_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.self_attn(hidden_states, self.cache, 
                                                       freqs_cis if self.model.rope_layout[i] else None, 
                                                       wrapper=self.attn[cuda_graph_idx], bsz_tensors=num_tokens_tensors, 
                                                       position_ids=batch.minibatch.position_ids
                                                       )

                hidden_states, residual = decode_layer.post_attention_layernorm(hidden_states, num_tokens_tensors, residual)
                if not self.config.moe_layer_layout[i]:
                    hidden_states = decode_layer.block_sparse_moe(hidden_states, num_tokens_tensors)
                else:
                    hidden_states = decode_layer.block_sparse_moe(router_input, hidden_states, num_tokens_tensors, cuda_graph_idx)
                    # hidden_states = hidden_states.squeeze(0)

        forward_batch_output = ForwardBatchOutput()
        with torch.cuda.stream(current_stream):
            local_logit = self.lm_head(self.model.norm(hidden_states, num_tokens_tensors, residual)[0], num_tokens_tensors)
            forward_batch_output.logits.append(local_logit)

        return forward_batch_output
    

    def flash_infer_attn_plan(self, batch: ForwardBatchInput, bsz_tensors, num_tokens_tensors,
        num_q_heads: int,
        num_kv_heads: int,
        head_dim: int,
        page_size: int,
        causal: bool,
        q_data_type: torch.dtype,
        kv_data_type: torch.dtype,
        cuda_graph_idx: int = 0
        ):
        minibatch = batch.minibatch
        self.attn[cuda_graph_idx].plan(minibatch.q_indptr, minibatch.kv_indptr, minibatch.kv_indices, 
                          minibatch.kv_last_page_len, bsz_tensors, num_tokens_tensors, num_q_heads, num_kv_heads, head_dim, page_size, causal=causal, q_data_type=q_data_type, kv_data_type=kv_data_type)
        

================================================
FILE: archive/ktransformers/models/modeling_deepseek.py
================================================
# coding=utf-8
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
'''
# Adapted from
# https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628/blob/main/modeling_deepseek.py
# Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
# 
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch DeepSeek model."""
import math
import warnings
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_attention_mask,
    _prepare_4d_causal_attention_mask,
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import (
    ALL_LAYERNORM_LAYERS,
    is_torch_greater_or_equal_than_1_13,
)
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from transformers.utils.import_utils import is_torch_fx_available
from .configuration_deepseek import DeepseekV2Config
import torch.distributed as dist
import numpy as np

if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func, flash_attn_with_kvcache
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa


# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
# It means that the function will not be traced through and simply appear as a node in the graph.
if is_torch_fx_available():
    if not is_torch_greater_or_equal_than_1_13:
        import torch.fx

    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "DeepseekV2Config"


def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(
        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)
    )
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


class DeepseekV2RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        DeepseekV2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        self.hidden_size = hidden_size

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight * hidden_states).to(input_dtype)


ALL_LAYERNORM_LAYERS.append(DeepseekV2RMSNorm)

# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->DeepseekV2
class DeepseekV2RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        super().__init__()
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # For BC we register cos and sin cached
        self.max_seq_len_cached = max_position_embeddings

    @torch.no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)   

# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV2
class DeepseekV2LinearScalingRotaryEmbedding(DeepseekV2RotaryEmbedding):
    """DeepseekV2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        raise NotImplementedError("LinearScalingRotaryEmbedding is not supported now.")
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        t = t / self.scaling_factor

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->DeepseekV2
class DeepseekV2DynamicNTKScalingRotaryEmbedding(DeepseekV2RotaryEmbedding):
    """DeepseekV2RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        raise NotImplementedError("DynamicNTKScalingRotaryEmbedding is not supported now.")
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings)
                - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (
                base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


# Inverse dim formula to find dim based on number of rotations
def yarn_find_correction_dim(
    num_rotations, dim, base=10000, max_position_embeddings=2048
):
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
        2 * math.log(base)
    )


# Find dim range bounds based on rotations
def yarn_find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    low = math.floor(
        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
    )
    high = math.ceil(
        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
    )
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case


def yarn_get_mscale(scale=1, mscale=1):
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


def yarn_linear_ramp_mask(min, max, dim):
    if min == max:
        max += 0.001  # Prevent singularity

    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
    ramp_func = torch.clamp(linear_func, 0, 1)
    return ramp_func

class DeepseekV2YarnRotaryEmbedding(DeepseekV2RotaryEmbedding):
    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        nn.Module.__init__(self)
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.mscale_all_dim = mscale_all_dim
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

        freq_extra = 1.0 / (
            self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )
        freq_inter = 1.0 / (
            self.scaling_factor
            * self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            dim,
            self.base,
            self.original_max_position_embeddings,
        )
        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(
            device=device, dtype=torch.float32
        )
        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self._mscale = float(
            yarn_get_mscale(self.scaling_factor, self.mscale)
            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
        )
        # For BC we register cos and sin cached
        self.max_seq_len_cached = max_position_embeddings

    @torch.no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()* self._mscale
            sin = emb.sin()* self._mscale
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  

# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    b, h, s, d = q.shape
    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
    b, h, s, d = k.shape
    k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

class DeepseekV2MLP(nn.Module):
    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
        self.intermediate_size = (
            config.intermediate_size if intermediate_size is None else intermediate_size
        )

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        act = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        down_proj = self.down_proj(act)
        return down_proj

class MoEGate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.scoring_func = config.scoring_func
        self.alpha = config.aux_loss_alpha
        self.seq_aux = config.seq_aux
        self.topk_method = config.topk_method
        self.n_group = config.n_group
        self.topk_group = config.topk_group

        # topk selection algorithm
        self.norm_topk_prob = config.norm_topk_prob
        self.gating_dim = config.hidden_size
        self.weight = nn.Parameter(
            torch.empty((self.n_routed_experts, self.gating_dim))
        )
        self.reset_parameters()

    def reset_parameters(self) -> None:
        import torch.nn.init as init

        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, hidden_states):
        bsz, seq_len, h = hidden_states.shape
        ### compute gating score
        hidden_states = hidden_states.view(-1, h)
        logits = F.linear(
            hidden_states.type(torch.float32), self.weight.type(torch.float32), None
        )
        if self.scoring_func == "softmax":
            scores = logits.softmax(dim=-1, dtype=torch.float32)
        else:
            raise NotImplementedError(
                f"insupportable scoring function for MoE gating: {self.scoring_func}"
            )

        ### select top-k experts
        if self.topk_method == "greedy":
            topk_weight, topk_idx = torch.topk(
                scores, k=self.top_k, dim=-1, sorted=False
            )
        elif self.topk_method == "group_limited_greedy":
            group_scores = (
                scores.view(bsz * seq_len, self.n_group, -1).max(dim=-1).values
            )  # [n, n_group]
            group_idx = torch.topk(
                group_scores, k=self.topk_group, dim=-1, sorted=False
            )[
                1
            ]  # [n, top_k_group]
            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
            score_mask = (
                group_mask.unsqueeze(-1)
                .expand(
                    bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
                )
                .reshape(bsz * seq_len, -1)
            )  # [n, e]
            tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
            topk_weight, topk_idx = torch.topk(
                tmp_scores, k=self.top_k, dim=-1, sorted=False
            )

        ### norm gate to sum 1
        if self.top_k > 1 and self.norm_topk_prob:
            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
            topk_weight = topk_weight / denominator
        else:
            topk_weight = topk_weight * self.routed_scaling_factor
        ### expert-level computation auxiliary loss
        if self.training and self.alpha > 0.0:
            scores_for_aux = scores
            aux_topk = self.top_k
            # always compute aux loss based on the naive greedy topk method
            topk_idx_for_aux_loss = topk_idx.view(bsz, -1)
            if self.seq_aux:
                scores_for_seq_aux = scores_for_aux.view(bsz, seq_len, -1)
                ce = torch.zeros(
                    bsz, self.n_routed_experts, device=hidden_states.device
                )
                ce.scatter_add_(
                    1,
                    topk_idx_for_aux_loss,
                    torch.ones(bsz, seq_len * aux_topk, device=hidden_states.device),
                ).div_(seq_len * aux_topk / self.n_routed_experts)
                aux_loss = (ce * scores_for_seq_aux.mean(dim=1)).sum(
                    dim=1
                ).mean() * self.alpha
            else:
                mask_ce = F.one_hot(
                    topk_idx_for_aux_loss.view(-1), num_classes=self.n_routed_experts
                )
                ce = mask_ce.float().mean(0)
                Pi = scores_for_aux.mean(0)
                fi = ce * self.n_routed_experts
                aux_loss = (Pi * fi).sum() * self.alpha
        else:
            aux_loss = None
        return topk_idx, topk_weight, aux_loss


class AddAuxiliaryLoss(torch.autograd.Function):
    """
    The trick function of adding auxiliary (aux) loss,
    which includes the gradient of the aux loss during backpropagation.
    """

    @staticmethod
    def forward(ctx, x, loss):
        assert loss.numel() == 1
        ctx.dtype = loss.dtype
        ctx.required_aux_loss = loss.requires_grad
        return x

    @staticmethod
    def backward(ctx, grad_output):
        grad_loss = None
        if ctx.required_aux_loss:
            grad_loss = torch.ones(1, dtype=ctx.dtype, device=grad_output.device)
        return grad_output, grad_loss

class DeepseekV2MoE(nn.Module):
    """
    A mixed expert module containing shared experts.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_experts_per_tok = config.num_experts_per_tok

        if hasattr(config, "ep_size") and config.ep_size > 1:
            assert config.ep_size == dist.get_world_size()
            self.ep_size = config.ep_size
            self.experts_per_rank = config.n_routed_experts // config.ep_size
            self.ep_rank = dist.get_rank()
            self.experts = nn.ModuleList(
                [
                    (
                        DeepseekV2MLP(
                            config, intermediate_size=config.moe_intermediate_size
                        )
                        if i >= self.ep_rank * self.experts_per_rank
                        and i < (self.ep_rank + 1) * self.experts_per_rank
                        else None
                    )
                    for i in range(config.n_routed_experts)
                ]
            )
        else:
            self.ep_size = 1
            self.experts_per_rank = config.n_routed_experts
            self.ep_rank = 0
            self.experts = nn.ModuleList(
                [
                    DeepseekV2MLP(config, intermediate_size=config.moe_intermediate_size)
                    for i in range(config.n_routed_experts)
                ]
            )
        self.gate = MoEGate(config)
        if config.n_shared_experts is not None:
            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
            self.shared_experts = DeepseekV2MLP(
                config=config, intermediate_size=intermediate_size
            )

    def forward(self, hidden_states):
        identity = hidden_states
        orig_shape = hidden_states.shape
        topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        flat_topk_idx = topk_idx.view(-1)
        if self.training:
            hidden_states = hidden_states.repeat_interleave(
                self.num_experts_per_tok, dim=0
            )
            y = torch.empty_like(hidden_states)
            for i, expert in enumerate(self.experts):
                y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
            y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
            y = y.view(*orig_shape)
            y = AddAuxiliaryLoss.apply(y, aux_loss)
        else:
            y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
        if self.config.n_shared_experts is not None:
            y = y + self.shared_experts(identity)
        return y

    @torch.no_grad()
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        sorted_tokens_shape = sorted_tokens.shape
        if self.ep_size > 1:
            tokens_per_ep_rank = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
            tokens_per_expert_group = tokens_per_expert.new_empty(
                tokens_per_expert.shape[0]
            )
            dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert)
            output_splits = (
                tokens_per_expert_group.view(self.ep_size, -1)
                .sum(1)
                .cpu()
                .numpy()
                .tolist()
            )
            gathered_tokens = sorted_tokens.new_empty(
                tokens_per_expert_group.sum(dim=0).cpu().item(), sorted_tokens.shape[1]
            )
            input_split_sizes = tokens_per_ep_rank.cpu().numpy().tolist()
            dist.all_to_all(
                list(gathered_tokens.split(output_splits)),
                list(sorted_tokens.split(input_split_sizes)),
            )
            tokens_per_expert_post_gather = tokens_per_expert_group.view(
                self.ep_size, self.experts_per_rank
            ).sum(dim=0)
            gatherd_idxs = np.zeros(shape=(gathered_tokens.shape[0],), dtype=np.int32)
            s = 0
            for i, k in enumerate(tokens_per_expert_group.cpu().numpy()):
                gatherd_idxs[s : s + k] = i % self.experts_per_rank
                s += k
            gatherd_idxs = gatherd_idxs.argsort()
            sorted_tokens = gathered_tokens[gatherd_idxs]
            tokens_per_expert = tokens_per_expert_post_gather
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
        if self.ep_size > 1:
            new_x = torch.empty_like(outs)
            new_x[gatherd_idxs] = outs
            gathered_tokens = new_x.new_empty(*sorted_tokens_shape)
            dist.all_to_all(
                list(gathered_tokens.split(input_split_sizes)),
                list(new_x.split(output_splits)),
            )
            outs = gathered_tokens

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out

# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV2
class DeepseekV2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: DeepseekV2Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.q_lora_rank = config.q_lora_rank
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.kv_lora_rank = config.kv_lora_rank
        self.v_head_dim = config.v_head_dim
        self.qk_nope_head_dim = config.qk_nope_head_dim
        self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim

        self.is_causal = True

        if self.q_lora_rank is None:
            self.q_proj = nn.Linear(
                self.hidden_size, self.num_heads * self.q_head_dim, bias=False
            )
        else:
            self.q_a_proj = nn.Linear(
                self.hidden_size, config.q_lora_rank, bias=config.attention_bias
            )
            self.q_a_layernorm = DeepseekV2RMSNorm(config.q_lora_rank)
            self.q_b_proj = nn.Linear(
                config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
            )

        self.kv_a_proj_with_mqa = nn.Linear(
            self.hidden_size,
            config.kv_lora_rank + config.qk_rope_head_dim,
            bias=config.attention_bias,
        )
        self.kv_a_layernorm = DeepseekV2RMSNorm(config.kv_lora_rank)
        self.kv_b_proj = nn.Linear(
            config.kv_lora_rank,
            self.num_heads
            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
            bias=False,
        )

        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            self.hidden_size,
            bias=config.attention_bias,
        )
        self._init_rope()

        self.softmax_scale = self.q_head_dim ** (-0.5)
        if self.config.rope_scaling is not None:
            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
            scaling_factor = self.config.rope_scaling["factor"]
            if mscale_all_dim:
                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
                self.softmax_scale = self.softmax_scale * mscale * mscale

    def _init_rope(self):
        if self.config.rope_scaling is None:
            self.rotary_emb = DeepseekV2RotaryEmbedding(
                self.qk_rope_head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = DeepseekV2LinearScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = DeepseekV2DynamicNTKScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "yarn":
                kwargs = {
                    key: self.config.rope_scaling[key]
                    for key in [
                        "original_max_position_embeddings",
                        "beta_fast",
                        "beta_slow",
                        "mscale",
                        "mscale_all_dim",
                    ]
                    if key in self.config.rope_scaling
                }
                self.rotary_emb = DeepseekV2YarnRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                    **kwargs,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(q_pe, position_ids)
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin)

        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        attn_weights = (
            torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale
        )

        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.attention_dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV2
class DeepseekV2FlashAttention2(DeepseekV2Attention):
    """
    DeepseekV2 flash attention module. This module inherits from `DeepseekV2Attention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        # DeepseekV2FlashAttention2 attention does not support output_attentions
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

            # overwrite attention_mask with padding_mask
            attention_mask = kwargs.pop("padding_mask")

        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        # Flash attention requires the input to have the shape
        # batch_size x seq_length x head_dim x hidden_dim
        # therefore we just need to keep the original shape
        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]

        kv_seq_len = value_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(q_pe, position_ids)
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin)

        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe

        if self.q_head_dim != self.v_head_dim:
            value_states = F.pad(value_states, [0, self.q_head_dim - self.v_head_dim])

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (DeepseekV2RMSNorm handles it correctly)

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            # Handle the case where the model is quantized
            if hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            elif torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            else:
                target_dtype = self.q_a_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = self._flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            softmax_scale=self.softmax_scale,
        )
        if self.q_head_dim != self.v_head_dim:
            attn_output = attn_output[:, :, :, : self.v_head_dim]

        attn_output = attn_output.reshape(
            bsz, q_len, self.num_heads * self.v_head_dim
        ).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        position_ids,
        dropout=0.0,
        softmax_scale=None,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.
        # Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`int`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV2FlashAttention2 __init__.
            causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            (
                query_states,
                key_states,
                value_states,
                indices_q,
                cu_seq_lens,
                max_seq_lens,
            ) = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            attn_output = pad_input(
                attn_output_unpad, indices_q, batch_size, query_length
            )
        else:
            if query_length == 1:
                position_ids = position_ids.to(dtype=torch.int32).squeeze(1)
                attn_output = flash_attn_with_kvcache(
                    query_states,
                    key_states,
                    value_states,
                    cache_seqlens=position_ids,
                    softmax_scale=softmax_scale,
                    causal=causal,
                )   
            else:
                attn_output = flash_attn_func(
                    query_states,
                    key_states,
                    value_states,
                    dropout,
                    softmax_scale=softmax_scale,
                    causal=causal,
                )

        return attn_output

    def _upad_input(
        self, query_layer, key_layer, value_layer, attention_mask, query_length
    ):
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
                indices_k,
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
                query_layer, attention_mask
            )

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )


ATTENTION_CLASSES = {
    "eager": DeepseekV2Attention,
    "flash_attention_2": DeepseekV2FlashAttention2,
}

class DeepseekV2DecoderLayer(nn.Module):
    def __init__(self, config: DeepseekV2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = ATTENTION_CLASSES[config._attn_implementation](
            config=config, layer_idx=layer_idx
        )

        self.mlp = (
            DeepseekV2MoE(config)
            if (
                config.n_routed_experts is not None
                and layer_idx >= config.first_k_dense_replace
                and layer_idx % config.moe_layer_freq == 0
            )
            else DeepseekV2MLP(config)
        )
        self.input_layernorm = DeepseekV2RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = DeepseekV2RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)

        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


DeepseekV2_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`DeepseekV2Config`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare DeepseekV2 Model outputting raw hidden-states without any specific head on top.",
    DeepseekV2_START_DOCSTRING,
)
class DeepseekV2PreTrainedModel(PreTrainedModel):
    config_class = DeepseekV2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["DeepseekV2DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_cache_class = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


DeepseekV2_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare DeepseekV2 Model outputting raw hidden-states without any specific head on top.",
    DeepseekV2_START_DOCSTRING,
)
class DeepseekV2Model(DeepseekV2PreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]

    Args:
        config: DeepseekV2Config
    """

    def __init__(self, config: DeepseekV2Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                DeepseekV2DecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.norm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`transformers."
                )
                use_cache = False

        past_key_values_length = 0
        if use_cache:
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        # embed positions
        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = (
                next_decoder_cache.to_legacy_cache()
                if use_legacy_cache
                else next_decoder_cache
            )
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
    
    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            if attention_mask.max() != 0:
                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
            causal_mask = attention_mask
        else:
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask


class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = DeepseekV2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
    )
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM

        >>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states[:,-1:,:]).float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        use_cache=True,
        **kwargs,
    ):
        past_length = 0
        # Omit tokens covered by past_key_values
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length()
                max_cache_length = (
                    torch.tensor(past_key_values.get_max_length(), device=input_ids.device)
                    if past_key_values.get_max_length() is not None
                    else None
                )
                cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length)
            # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_length == 0:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1]
        if cache_position is None:
            cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device)
        elif use_cache:
            cache_position = cache_position[-input_length:]

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(
                    past_state.index_select(0, beam_idx.to(past_state.device))
                    for past_state in layer_past
                ),
            )
        return reordered_past


@add_start_docstrings(
    """
    The DeepseekV2 Model transformer with a sequence classification head on top (linear layer).

    [`DeepseekV2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    DeepseekV2_START_DOCSTRING,
)
class DeepseekV2ForSequenceClassification(DeepseekV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = DeepseekV2Model(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, transformers.,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError(
                "Cannot handle batch sizes > 1 if no padding token is defined."
            )
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = (
                    torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                ).to(logits.device)
            else:
                sequence_lengths = -1

        pooled_logits = logits[
            torch.arange(batch_size, device=logits.device), sequence_lengths
        ]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(
                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
                )
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


================================================
FILE: archive/ktransformers/models/modeling_deepseek_v3.py
================================================
# coding=utf-8
# Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch DeepSeek model."""
import math
import warnings
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.generation import GenerationMixin
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_attention_mask,
    _prepare_4d_causal_attention_mask,
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import (
    ALL_LAYERNORM_LAYERS,
    is_torch_greater_or_equal_than_1_13,
)
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from transformers.utils.import_utils import is_torch_fx_available
from .configuration_deepseek_v3 import DeepseekV3Config
import torch.distributed as dist
import numpy as np

if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa


# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
# It means that the function will not be traced through and simply appear as a node in the graph.
if is_torch_fx_available():
    if not is_torch_greater_or_equal_than_1_13:
        import torch.fx

    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)

try:
    import torch_npu
    use_torch_npu = torch_npu.npu.is_available()
except:
    use_torch_npu = False


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "DeepseekV3Config"


def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(
        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)
    )
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


class DeepseekV3RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        DeepseekV3RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        self.hidden_size = hidden_size

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


ALL_LAYERNORM_LAYERS.append(DeepseekV3RMSNorm)


class DeepseekV3RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (
            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )
        self.max_seq_len_cached = None

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(t, self.inv_freq.to(t.device))
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )


# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV3
class DeepseekV3LinearScalingRotaryEmbedding(DeepseekV3RotaryEmbedding):
    """DeepseekV3RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        t = t / self.scaling_factor

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->DeepseekV3
class DeepseekV3DynamicNTKScalingRotaryEmbedding(DeepseekV3RotaryEmbedding):
    """DeepseekV3RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings)
                - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (
                base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


# Inverse dim formula to find dim based on number of rotations
def yarn_find_correction_dim(
    num_rotations, dim, base=10000, max_position_embeddings=2048
):
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
        2 * math.log(base)
    )


# Find dim range bounds based on rotations
def yarn_find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    low = math.floor(
        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
    )
    high = math.ceil(
        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
    )
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case


def yarn_get_mscale(scale=1, mscale=1):
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


def yarn_linear_ramp_mask(min, max, dim):
    if min == max:
        max += 0.001  # Prevent singularity

    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
    ramp_func = torch.clamp(linear_func, 0, 1)
    return ramp_func


class DeepseekV3YarnRotaryEmbedding(DeepseekV3RotaryEmbedding):

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.mscale_all_dim = mscale_all_dim
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        dim = self.dim

        freq_extra = 1.0 / (
            self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )
        freq_inter = 1.0 / (
            self.scaling_factor
            * self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            dim,
            self.base,
            self.original_max_position_embeddings,
        )
        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(
            device=device, dtype=torch.float32
        )
        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(seq_len, device=device, dtype=torch.float32)

        freqs = torch.outer(t, inv_freq)

        _mscale = float(
            yarn_get_mscale(self.scaling_factor, self.mscale)
            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
        )

        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer(
            "cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False
        )
        self.register_buffer(
            "sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False
        )


# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)

    b, h, s, d = q.shape
    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)

    b, h, s, d = k.shape
    k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


class DeepseekV3MLP(nn.Module):
    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
        self.intermediate_size = (
            config.intermediate_size if intermediate_size is None else intermediate_size
        )

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class MoEGate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.scoring_func = config.scoring_func
        self.topk_method = config.topk_method
        self.n_group = config.n_group
        self.topk_group = config.topk_group

        # topk selection algorithm
        self.norm_topk_prob = config.norm_topk_prob
        self.gating_dim = config.hidden_size
        self.weight = nn.Parameter(
            torch.empty((self.n_routed_experts, self.gating_dim))
        )
        if self.topk_method == "noaux_tc":
            self.e_score_correction_bias = nn.Parameter(
                torch.empty((self.n_routed_experts))
            )
        self.reset_parameters()

    def reset_parameters(self) -> None:
        import torch.nn.init as init

        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, hidden_states):
        bsz, seq_len, h = hidden_states.shape
        ### compute gating score
        hidden_states = hidden_states.view(-1, h)
        logits = F.linear(
            hidden_states.type(torch.float32), self.weight.type(torch.float32), None
        )
        if self.scoring_func == "sigmoid":
            scores = logits.sigmoid()
        else:
            raise NotImplementedError(
                f"insupportable scoring function for MoE gating: {self.scoring_func}"
            )

        ### select top-k experts
        if self.topk_method == "noaux_tc":
            #assert not self.training
            scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
            group_scores = (
                scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim = -1)
            )  # [n, n_group]
            group_idx = torch.topk(
                group_scores, k=self.topk_group, dim=-1, sorted=False
            )[
                1
            ]  # [n, top_k_group]
            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
            score_mask = (
                group_mask.unsqueeze(-1)
                .expand(
                    bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
                )
                .reshape(bsz * seq_len, -1)
            )  # [n, e]
            tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf"))  # [n, e]
            _, topk_idx = torch.topk(
                tmp_scores, k=self.top_k, dim=-1, sorted=False
            )
            topk_weight = scores.gather(1, topk_idx)
        else:
            raise NotImplementedError(
                f"insupportable TopK function for MoE gating: {self.topk_method}"
            )

        ### norm gate to sum 1
        if self.top_k > 1 and self.norm_topk_prob:
            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
            topk_weight = topk_weight / denominator
        topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor

        return topk_idx, topk_weight

class DeepseekV3MoE(nn.Module):
    """
    A mixed expert module containing shared experts.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_experts_per_tok = config.num_experts_per_tok

        if hasattr(config, "ep_size") and config.ep_size > 1:
            assert config.ep_size == dist.get_world_size()
            self.ep_size = config.ep_size
            self.experts_per_rank = config.n_routed_experts // config.ep_size
            self.ep_rank = dist.get_rank()
            self.experts = nn.ModuleList(
                [
                    (
                        DeepseekV3MLP(
                            config, intermediate_size=config.moe_intermediate_size
                        )
                        if i >= self.ep_rank * self.experts_per_rank
                        and i < (self.ep_rank + 1) * self.experts_per_rank
                        else None
                    )
                    for i in range(config.n_routed_experts)
                ]
            )
        else:
            self.ep_size = 1
            self.experts_per_rank = config.n_routed_experts
            self.ep_rank = 0
            self.experts = nn.ModuleList(
                [
                    DeepseekV3MLP(
                        config, intermediate_size=config.moe_intermediate_size
                    )
                    for i in range(config.n_routed_experts)
                ]
            )
        self.gate = MoEGate(config)
        if config.n_shared_experts is not None:
            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
            self.shared_experts = DeepseekV3MLP(
                config=config, intermediate_size=intermediate_size
            )

    def forward(self, hidden_states):
        identity = hidden_states
        orig_shape = hidden_states.shape
        topk_idx, topk_weight = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        flat_topk_idx = topk_idx.view(-1)
        if not self.training:
            y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
        if self.config.n_shared_experts is not None:
            y = y + self.shared_experts(identity)
        return y

    @torch.no_grad()
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        sorted_tokens_shape = sorted_tokens.shape
        if self.ep_size > 1:
            tokens_per_ep_rank = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
            tokens_per_expert_group = tokens_per_expert.new_empty(
                tokens_per_expert.shape[0]
            )
            dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert)
            output_splits = (
                tokens_per_expert_group.view(self.ep_size, -1)
                .sum(1)
                .cpu()
                .numpy()
                .tolist()
            )
            gathered_tokens = sorted_tokens.new_empty(
                tokens_per_expert_group.sum(dim=0).cpu().item(), sorted_tokens.shape[1]
            )
            input_split_sizes = tokens_per_ep_rank.cpu().numpy().tolist()
            dist.all_to_all(
                list(gathered_tokens.split(output_splits)),
                list(sorted_tokens.split(input_split_sizes)),
            )
            tokens_per_expert_post_gather = tokens_per_expert_group.view(
                self.ep_size, self.experts_per_rank
            ).sum(dim=0)
            gatherd_idxs = np.zeros(shape=(gathered_tokens.shape[0],), dtype=np.int32)
            s = 0
            for i, k in enumerate(tokens_per_expert_group.cpu().numpy()):
                gatherd_idxs[s : s + k] = i % self.experts_per_rank
                s += k
            gatherd_idxs = gatherd_idxs.argsort()
            sorted_tokens = gathered_tokens[gatherd_idxs]
            tokens_per_expert = tokens_per_expert_post_gather
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
        if self.ep_size > 1:
            new_x = torch.empty_like(outs)
            new_x[gatherd_idxs] = outs
            gathered_tokens = new_x.new_empty(*sorted_tokens_shape)
            dist.all_to_all(
                list(gathered_tokens.split(input_split_sizes)),
                list(new_x.split(output_splits)),
            )
            outs = gathered_tokens

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out


# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV3
class DeepseekV3Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: DeepseekV3Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.q_lora_rank = config.q_lora_rank
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.kv_lora_rank = config.kv_lora_rank
        self.v_head_dim = config.v_head_dim
        self.qk_nope_head_dim = config.qk_nope_head_dim
        self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim

        self.is_causal = True

        if self.q_lora_rank is None:
            self.q_proj = nn.Linear(
                self.hidden_size, self.num_heads * self.q_head_dim, bias=False
            )
        else:
            self.q_a_proj = nn.Linear(
                self.hidden_size, config.q_lora_rank, bias=config.attention_bias
            )
            self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank)
            self.q_b_proj = nn.Linear(
                config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
            )

        self.kv_a_proj_with_mqa = nn.Linear(
            self.hidden_size,
            config.kv_lora_rank + config.qk_rope_head_dim,
            bias=config.attention_bias,
        )
        self.kv_a_layernorm = DeepseekV3RMSNorm(config.kv_lora_rank)
        self.kv_b_proj = nn.Linear(
            config.kv_lora_rank,
            self.num_heads
            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
            bias=False,
        )

        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            self.hidden_size,
            bias=config.attention_bias,
        )
        self._init_rope()

        self.softmax_scale = self.q_head_dim ** (-0.5)
        if self.config.rope_scaling is not None:
            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
            scaling_factor = self.config.rope_scaling["factor"]
            if mscale_all_dim:
                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
                self.softmax_scale = self.softmax_scale * mscale * mscale

    def _init_rope(self):
        if self.config.rope_scaling is None:
            self.rotary_emb = DeepseekV3RotaryEmbedding(
                self.qk_rope_head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = DeepseekV3LinearScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = DeepseekV3DynamicNTKScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "yarn":
                kwargs = {
                    key: self.config.rope_scaling[key]
                    for key in [
                        "original_max_position_embeddings",
                        "beta_fast",
                        "beta_slow",
                        "mscale",
                        "mscale_all_dim",
                    ]
                    if key in self.config.rope_scaling
                }
                self.rotary_emb = DeepseekV3YarnRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                    **kwargs,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)

        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        attn_weights = (
            torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale
        )

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )
        assert attention_mask is not None
        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.attention_dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV3
class DeepseekV3FlashAttention2(DeepseekV3Attention):
    """
    DeepseekV3 flash attention module. This module inherits from `DeepseekV3Attention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        # DeepseekV3FlashAttention2 attention does not support output_attentions
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

            # overwrite attention_mask with padding_mask
            attention_mask = kwargs.pop("padding_mask")

        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        # Flash attention requires the input to have the shape
        # batch_size x seq_length x head_dim x hidden_dim
        # therefore we just need to keep the original shape
        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]

        kv_seq_len = value_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)

        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe

        if self.q_head_dim != self.v_head_dim:
            value_states = F.pad(value_states, [0, self.q_head_dim - self.v_head_dim])

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (DeepseekV3RMSNorm handles it correctly)

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            # Handle the case where the model is quantized
            if hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            elif torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            else:
                target_dtype = (
                    self.q_proj.weight.dtype
                    if self.q_lora_rank is None
                    else self.q_a_proj.weight.dtype
                )

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = self._flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=dropout_rate,
            softmax_scale=self.softmax_scale,
        )
        if self.q_head_dim != self.v_head_dim:
            attn_output = attn_output[:, :, :, : self.v_head_dim]

        attn_output = attn_output.reshape(
            bsz, q_len, self.num_heads * self.v_head_dim
        ).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        dropout=0.0,
        softmax_scale=None,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`int`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV3FlashAttention2 __init__.
            causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            (
                query_states,
                key_states,
                value_states,
                indices_q,
                cu_seq_lens,
                max_seq_lens,
            ) = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            attn_output = pad_input(
                attn_output_unpad, indices_q, batch_size, query_length
            )
        else:
            attn_output = flash_attn_func(
                query_states,
                key_states,
                value_states,
                dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

        return attn_output

    def _upad_input(
        self, query_layer, key_layer, value_layer, attention_mask, query_length
    ):
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
                indices_k,
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
                query_layer, attention_mask
            )

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )


ATTENTION_CLASSES = {
    "eager": DeepseekV3Attention,
    "flash_attention_2": DeepseekV3FlashAttention2,
}


class DeepseekV3DecoderLayer(nn.Module):
    def __init__(self, config: DeepseekV3Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = ATTENTION_CLASSES[config._attn_implementation](
            config=config, layer_idx=layer_idx
        )

        self.mlp = (
            DeepseekV3MoE(config)
            if (
                config.n_routed_experts is not None
                and layer_idx >= config.first_k_dense_replace
                and layer_idx % config.moe_layer_freq == 0
            )
            else DeepseekV3MLP(config)
        )
        self.input_layernorm = DeepseekV3RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = DeepseekV3RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        is_prefill: Optional[bool] = False,
        **kwargs,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            is_prefill=is_prefill,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


DeepseekV3_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`DeepseekV3Config`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
    DeepseekV3_START_DOCSTRING,
)
class DeepseekV3PreTrainedModel(PreTrainedModel):
    config_class = DeepseekV3Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["DeepseekV3DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_cache_class = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


DeepseekV3_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
    DeepseekV3_START_DOCSTRING,
)
class DeepseekV3Model(DeepseekV3PreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV3DecoderLayer`]

    Args:
        config: DeepseekV3Config
    """

    def __init__(self, config: DeepseekV3Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                DeepseekV3DecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.norm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        past_key_values_length = 0
        if use_cache:
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length,
                seq_length + past_key_values_length,
                dtype=torch.long,
                device=device,
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self._use_flash_attention_2:
            # 2d mask is passed through the layers
            attention_mask = (
                attention_mask
                if (attention_mask is not None and 0 in attention_mask)
                else None
            )
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
            )

        # embed positions
        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = (
                next_decoder_cache.to_legacy_cache()
                if use_legacy_cache
                else next_decoder_cache
            )
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            if attention_mask.max() != 0:
                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
            causal_mask = attention_mask
        else:
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = DeepseekV3Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
    )
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        is_prefill: Optional[bool] = False,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, DeepseekV3ForCausalLM

        >>> model = DeepseekV3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            is_prefill=is_prefill,
        )

        hidden_states = outputs[0]
        if use_torch_npu:
            hidden_states_without_norm = outputs[-1]
            logits = self.lm_head(hidden_states)
        else:
            logits = self.lm_head(hidden_states[:,-1:,:])
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            if use_torch_npu:
                output = (logits,) + outputs[1:] + (hidden_states_without_norm,)
            else:
                output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
                max_cache_length = past_key_values.get_max_length()
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if (
                attention_mask is not None
                and attention_mask.shape[1] > input_ids.shape[1]
            ):
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(
                    past_state.index_select(0, beam_idx.to(past_state.device))
                    for past_state in layer_past
                ),
            )
        return reordered_past


@add_start_docstrings(
    """
    The DeepseekV3 Model transformer with a sequence classification head on top (linear layer).

    [`DeepseekV3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    DeepseekV3_START_DOCSTRING,
)
class DeepseekV3ForSequenceClassification(DeepseekV3PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = DeepseekV3Model(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, transformers.,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError(
                "Cannot handle batch sizes > 1 if no padding token is defined."
            )
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = (
                    torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                ).to(logits.device)
            else:
                sequence_lengths = -1

        pooled_logits = logits[
            torch.arange(batch_size, device=logits.device), sequence_lengths
        ]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(
                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
                )
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


================================================
FILE: archive/ktransformers/models/modeling_glm4_moe.py
================================================
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/glm4_moe/modular_glm4_moe.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_glm4_moe.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Callable, Optional, Union

import torch
import torch.nn.functional as F
from torch import nn

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache
from transformers.generation import GenerationMixin
# from transformers.integrations import use_kernel_forward_from_hub
from transformers.masking_utils import create_causal_mask
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_layers import GradientCheckpointingLayer
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.processing_utils import Unpack
# from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple
from transformers.utils import auto_docstring, can_return_tuple
# from transformers.utils.generic import check_model_inputs
from .configuration_glm4_moe import Glm4MoeConfig


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    # **kwargs: Unpack[TransformersKwargs],
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    # Keep half or full tensor for later concatenation
    rotary_dim = cos.shape[-1]
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]

    # Apply rotary embeddings on the first half or full tensor
    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)

    # Concatenate back to full shape
    q_embed = torch.cat([q_embed, q_pass], dim=-1)
    k_embed = torch.cat([k_embed, k_pass], dim=-1)
    return q_embed, k_embed


class Glm4MoeAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.use_qk_norm = config.use_qk_norm
        if self.use_qk_norm:
            self.q_norm = Glm4MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
            self.k_norm = Glm4MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape)
        key_states = self.k_proj(hidden_states).view(hidden_shape)
        value_states = self.v_proj(hidden_states).view(hidden_shape)

        if self.use_qk_norm:  # main diff from Llama
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; position_ids needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Glm4MoeMLP(nn.Module):
    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
        self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class Glm4MoeTopkRouter(nn.Module):
    def __init__(self, config: Glm4MoeConfig):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.n_group = config.n_group
        self.topk_group = config.topk_group
        self.norm_topk_prob = config.norm_topk_prob

        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size)))
        self.register_buffer("e_score_correction_bias", torch.zeros((self.n_routed_experts), dtype=torch.float32))

    @torch.no_grad()
    def get_topk_indices(self, scores):
        scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0)
        group_scores = (
            scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group)
            .topk(2, dim=-1)[0]
            .sum(dim=-1)
        )
        group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]
        group_mask = torch.zeros_like(group_scores)
        group_mask.scatter_(1, group_idx, 1)
        score_mask = (
            group_mask.unsqueeze(-1)
            .expand(-1, self.n_group, self.n_routed_experts // self.n_group)
            .reshape(-1, self.n_routed_experts)
        )
        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
        topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1]
        return topk_indices

    def forward(self, hidden_states):
        hidden_states = hidden_states.view(-1, self.config.hidden_size)
        router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32))
        scores = router_logits.sigmoid()
        topk_indices = self.get_topk_indices(scores)
        topk_weights = scores.gather(1, topk_indices)
        if self.norm_topk_prob:
            denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20
            topk_weights /= denominator
        topk_weights = topk_weights * self.routed_scaling_factor
        return topk_indices, topk_weights


# @use_kernel_forward_from_hub("RMSNorm")
class Glm4MoeRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Glm4MoeRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class Glm4MoeMoE(nn.Module):
    """
    A mixed expert module containing shared experts.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.experts = nn.ModuleList(
            [
                Glm4MoeMLP(config, intermediate_size=config.moe_intermediate_size)
                for _ in range(config.n_routed_experts)
            ]
        )
        self.gate = Glm4MoeTopkRouter(config)
        self.shared_experts = Glm4MoeMLP(
            config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts
        )

    def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
        r"""
        CALL FOR CONTRIBUTION! I don't have time to optimise this right now, but expert weights need to be fused
        to not have to do a loop here (deepseek has 256 experts soooo yeah).
        """
        final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
        expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
        expert_mask = expert_mask.permute(2, 0, 1)

        for expert_idx in range(len(self.experts)):
            expert = self.experts[expert_idx]
            mask = expert_mask[expert_idx]
            token_indices, weight_indices = torch.where(mask)

            if token_indices.numel() > 0:
                expert_weights = topk_weights[token_indices, weight_indices]
                expert_input = hidden_states[token_indices]
                expert_output = expert(expert_input)
                weighted_output = expert_output * expert_weights.unsqueeze(-1)
                final_hidden_states.index_add_(0, token_indices, weighted_output)

        # in original deepseek, the output of the experts are gathered once we leave this module
        # thus the moe module is itelsf an IsolatedParallel module
        # and all expert are "local" meaning we shard but we don't gather
        return final_hidden_states.type(hidden_states.dtype)

    def forward(self, hidden_states):
        residuals = hidden_states
        orig_shape = hidden_states.shape
        topk_indices, topk_weights = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
        hidden_states = hidden_states + self.shared_experts(residuals)
        return hidden_states


class Glm4MoeDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Glm4MoeConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = Glm4MoeAttention(config=config, layer_idx=layer_idx)

        if layer_idx >= config.first_k_dense_replace:
            self.mlp = Glm4MoeMoE(config)
        else:
            self.mlp = Glm4MoeMLP(config)

        self.input_layernorm = Glm4MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Glm4MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        # **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        # Self Attention
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states


@auto_docstring
class Glm4MoePreTrainedModel(PreTrainedModel):
    config: Glm4MoeConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Glm4MoeDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_static_cache = False
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": Glm4MoeDecoderLayer,
        "attentions": Glm4MoeAttention,
    }

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, Glm4MoeRMSNorm):
            module.weight.data.fill_(1.0)
        elif isinstance(module, Glm4MoeTopkRouter):
            module.weight.data.normal_(mean=0.0, std=std)


class Glm4MoeRotaryEmbedding(nn.Module):
    def __init__(self, config: Glm4MoeConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


@auto_docstring
class Glm4MoeModel(Glm4MoePreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [r"model\.layers\.92.*", r"model\.layers\.46.*"]

    def __init__(self, config: Glm4MoeConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Glm4MoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Glm4MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Glm4MoeRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    # @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        # **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position: torch.Tensor = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )


@auto_docstring
class Glm4MoeForCausalLM(Glm4MoePreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = Glm4MoeModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        # **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Glm4MoeForCausalLM

        >>> model = Glm4MoeForCausalLM.from_pretrained("meta-glm4_moe/Glm4Moe-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-glm4_moe/Glm4Moe-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            # **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["Glm4MoePreTrainedModel", "Glm4MoeModel", "Glm4MoeForCausalLM"]

================================================
FILE: archive/ktransformers/models/modeling_llama.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
from transformers.modeling_flash_attention_utils import _flash_attention_forward
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from .configuration_llama import LlamaConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "LlamaConfig"


class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)


class LlamaRotaryEmbedding(nn.Module):
    def __init__(
        self,
        dim=None,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        rope_type="default",
        config: Optional[LlamaConfig] = None,
    ):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.device = device
        self.scaling_factor = scaling_factor
        self.rope_type = rope_type
        self.config = config
        # TODO (joao): remove the `if` below, only used for BC
        self.rope_kwargs = {}
        if config is None:
            logger.warning_once(
                "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the "
                "`config` argument. All other arguments will be removed in v4.45"
            )
            self.rope_kwargs = {
                "rope_type": rope_type,
                "factor": scaling_factor,
                "dim": dim,
                "base": base,
                "max_position_embeddings": max_position_embeddings,
            }
            self.rope_type = rope_type
            self.max_seq_len_cached = max_position_embeddings
            self.original_max_seq_len = max_position_embeddings
        else:
            # BC: "rope_type" was originally "type"
            if config.rope_scaling is not None:
                self.rope_type = config.rope_scaling.get(
                    "rope_type", config.rope_scaling.get("type")
                )
            else:
                self.rope_type = "default"
            self.max_seq_len_cached = config.max_position_embeddings
            self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(
            self.config, device, **self.rope_kwargs
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = torch.max(position_ids) + 1
        # seq_len = position_ids[0, -1] + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, device, seq_len=seq_len, **self.rope_kwargs
            )
            self.register_buffer(
                "inv_freq", inv_freq, persistent=False
            )  # TODO joao: may break with compilation
            self.max_seq_len_cached = seq_len

        if (
            seq_len < self.original_max_seq_len
            and self.max_seq_len_cached > self.original_max_seq_len
        ):  # reset
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @torch.no_grad()
    def forward(self, x, position_ids):
        # if "dynamic" in self.rope_type:
        #     self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block
        inv_freq_expanded = (
            self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        )
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        device_type = (
            device_type
            if isinstance(device_type, str) and device_type != "mps"
            else "cpu"
        )
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (
                inv_freq_expanded.float() @ position_ids_expanded.float()
            ).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
    """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(self, *args, **kwargs):
        logger.warning_once(
            "`LlamaLinearScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use "
            "`LlamaRotaryEmbedding`, which now also does linear scaling (simply pass the model config to __init__)."
        )
        kwargs["rope_type"] = "linear"
        super().__init__(*args, **kwargs)


class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
    """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(self, *args, **kwargs):
        logger.warning_once(
            "`LlamaDynamicNTKScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use "
            "`LlamaRotaryEmbedding`, which now also does dynamic ntk scaling (simply pass the model config to "
            "__init__)."
        )
        kwargs["rope_type"] = "dynamic"
        super().__init__(*args, **kwargs)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


class LlamaMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(
            self.hidden_size, self.intermediate_size, bias=config.mlp_bias
        )
        self.up_proj = nn.Linear(
            self.hidden_size, self.intermediate_size, bias=config.mlp_bias
        )
        self.down_proj = nn.Linear(
            self.intermediate_size, self.hidden_size, bias=config.mlp_bias
        )
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        if self.config.pretraining_tp > 1:
            slice = self.intermediate_size // self.config.pretraining_tp
            gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
            up_proj_slices = self.up_proj.weight.split(slice, dim=0)
            down_proj_slices = self.down_proj.weight.split(slice, dim=1)

            gate_proj = torch.cat(
                [
                    F.linear(x, gate_proj_slices[i])
                    for i in range(self.config.pretraining_tp)
                ],
                dim=-1,
            )
            up_proj = torch.cat(
                [
                    F.linear(x, up_proj_slices[i])
                    for i in range(self.config.pretraining_tp)
                ],
                dim=-1,
            )

            intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
            down_proj = [
                F.linear(intermediate_states[i], down_proj_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            down_proj = sum(down_proj)
        else:
            down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

        return down_proj


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


class LlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        self.q_proj = nn.Linear(
            self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            self.hidden_size,
            self.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.v_proj = nn.Linear(
            self.hidden_size,
            self.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.o_proj = nn.Linear(
            self.hidden_size, self.hidden_size, bias=config.attention_bias
        )

        # TODO (joao): remove in v4.45 (RoPE is computed in the model, not in the decoder layers)
        self.rotary_emb = LlamaRotaryEmbedding(config=self.config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            Tuple[torch.Tensor, torch.Tensor]
        ] = None,  # will become mandatory in v4.45
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        if self.config.pretraining_tp > 1:
            key_value_slicing = (
                self.num_key_value_heads * self.head_dim
            ) // self.config.pretraining_tp
            query_slices = self.q_proj.weight.split(
                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
            )
            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

            query_states = [
                F.linear(hidden_states, query_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            query_states = torch.cat(query_states, dim=-1)

            key_states = [
                F.linear(hidden_states, key_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            key_states = torch.cat(key_states, dim=-1)

            value_states = [
                F.linear(hidden_states, value_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            value_states = torch.cat(value_states, dim=-1)

        else:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        query_states = query_states.view(
            bsz, q_len, self.num_heads, self.head_dim
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
        ) / math.sqrt(self.head_dim)

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.attention_dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, -1)

        if self.config.pretraining_tp > 1:
            attn_output = attn_output.split(
                self.hidden_size // self.config.pretraining_tp, dim=2
            )
            o_proj_slices = self.o_proj.weight.split(
                self.hidden_size // self.config.pretraining_tp, dim=1
            )
            attn_output = sum(
                [
                    F.linear(attn_output[i], o_proj_slices[i])
                    for i in range(self.config.pretraining_tp)
                ]
            )
        else:
            attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


class LlamaFlashAttention2(LlamaAttention):
    """
    Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            Tuple[torch.Tensor, torch.Tensor]
        ] = None,  # will become mandatory in v4.45
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if isinstance(past_key_value, StaticCache):
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
            )

        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # Flash attention requires the input to have the shape
        # batch_size x seq_length x head_dim x hidden_dim
        # therefore we just need to keep the original shape
        query_states = query_states.view(
            bsz, q_len, self.num_heads, self.head_dim
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (LlamaRMSNorm handles it correctly)

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            is_causal=self.is_causal,
        )

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


class LlamaSdpaAttention(LlamaAttention):
    """
    Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `LlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    # Adapted from LlamaAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            Tuple[torch.Tensor, torch.Tensor]
        ] = None,  # will become mandatory in v4.45
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(
            bsz, q_len, self.num_heads, self.head_dim
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and causal_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value


LLAMA_ATTENTION_CLASSES = {
    "eager": LlamaAttention,
    "flash_attention_2": LlamaFlashAttention2,
    "sdpa": LlamaSdpaAttention,
}


class LlamaDecoderLayer(nn.Module):
    def __init__(self, config: LlamaConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](
            config=config, layer_idx=layer_idx
        )

        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            Tuple[torch.Tensor, torch.Tensor]
        ] = None,  # will become mandatory in v4.45
        **kwargs,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


LLAMA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`LlamaConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaPreTrainedModel(PreTrainedModel):
    config_class = LlamaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlamaDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


LLAMA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaModel(LlamaPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]

    Args:
        config: LlamaConfig
    """

    def __init__(self, config: LlamaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                LlamaDecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = LlamaRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        return_legacy_cache = False
        if (
            use_cache and not isinstance(past_key_values, Cache) and not self.training
        ):  # kept for BC (non `Cache` `past_key_values` inputs)
            return_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
            )

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask,
            inputs_embeds,
            cache_position,
            past_key_values,
            output_attentions,
        )
        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = (
            past_key_values.get_seq_length() if past_key_values is not None else 0
        )
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and not using_static_cache
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            if attention_mask.max() != 0:
                raise ValueError(
                    "Custom 4D attention mask should be passed in inverted form with max==0`"
                )
            causal_mask = attention_mask
        else:
            causal_mask = torch.full(
                (sequence_length, target_length),
                fill_value=min_dtype,
                dtype=dtype,
                device=device,
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(
                target_length, device=device
            ) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(
                input_tensor.shape[0], 1, -1, -1
            )
            if attention_mask is not None:
                causal_mask = (
                    causal_mask.clone()
                )  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = (
                    causal_mask[:, :, :, :mask_length]
                    + attention_mask[:, None, None, :]
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[
                    :, :, :, :mask_length
                ].masked_fill(padding_mask, min_dtype)
        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(
                causal_mask, min_dtype
            )

        return causal_mask


class LlamaForCausalLM(LlamaPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
    )
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LlamaForCausalLM

        >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            lm_head_slices = self.lm_head.weight.split(
                self.vocab_size // self.config.pretraining_tp, dim=0
            )
            logits = [
                F.linear(hidden_states, lm_head_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            logits = torch.cat(logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        # logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        if past_key_values is not None:
            if inputs_embeds is not None:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif (
                input_ids.shape[1] != cache_position.shape[0]
            ):  # Default case (the "else", a no op, is Exception 2)
                input_ids = input_ids[:, cache_position]

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and cache_position[0] == 0:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {
                "input_ids": input_ids.contiguous()
            }  # `contiguous()` needed for compilation use cases

        model_inputs.update(
            {
                "position_ids": position_ids,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
            }
        )
        return model_inputs


@add_start_docstrings(
    """
    The LLaMa Model transformer with a sequence classification head on top (linear layer).

    [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    LLAMA_START_DOCSTRING,
)
class LlamaForSequenceClassification(LlamaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = LlamaModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError(
                "Cannot handle batch sizes > 1 if no padding token is defined."
            )
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
                sequence_lengths = (
                    torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                )
                sequence_lengths = sequence_lengths % input_ids.shape[-1]
                sequence_lengths = sequence_lengths.to(logits.device)
            else:
                sequence_lengths = -1

        pooled_logits = logits[
            torch.arange(batch_size, device=logits.device), sequence_lengths
        ]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(
                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
                )
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
The Llama Model transformer with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    LLAMA_START_DOCSTRING,
)
class LlamaForQuestionAnswering(LlamaPreTrainedModel):
    base_model_prefix = "transformer"

    # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Llama
    def __init__(self, config):
        super().__init__(config)
        self.transformer = LlamaModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.transformer.embed_tokens

    def set_input_embeddings(self, value):
        self.transformer.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1).to(start_logits.device)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1).to(end_logits.device)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    The Llama Model transformer with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    LLAMA_START_DOCSTRING,
)
class LlamaForTokenClassification(LlamaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = LlamaModel(config)
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


================================================
FILE: archive/ktransformers/models/modeling_mixtral.py
================================================
# coding=utf-8
'''
Description  : 
Author       : kkk1nak0
Date         : 2024-07-29 02:58:57
Version      : 1.0.0
LastEditors  : kkk1nak0
LastEditTime : 2024-08-02 06:08:34
'''

# Adapted from 
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/mixtral/modeling_mixtral.py
# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Mixtral model."""

import inspect 
import math
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_causal_attention_mask,
)
from transformers.modeling_outputs import (
    MoeCausalLMOutputWithPast,
    MoeModelOutputWithPast,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_13
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    logging,
    replace_return_docstrings,
)
from transformers.utils.import_utils import is_torch_fx_available
from transformers.models.mixtral.configuration_mixtral import MixtralConfig


if is_flash_attn_2_available():
    from flash_attn import flash_attn_varlen_func, flash_attn_func, flash_attn_with_kvcache
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)

# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
# It means that the function will not be traced through and simply appear as a node in the graph.
if is_torch_fx_available():
    if not is_torch_greater_or_equal_than_1_13:
        import torch.fx

    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "MixtralConfig"


def load_balancing_loss_func(
    gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2, attention_mask: Optional[torch.Tensor] = None
) -> float:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        attention_mask (`torch.Tensor`, None):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.
        num_experts (`int`, *optional*):
            Number of experts

    Returns:
        The auxiliary loss.
    """
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0

    if isinstance(gate_logits, tuple):
        compute_device = gate_logits[0].device
        concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)

    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)

    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)

        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
        expert_attention_mask = (
            attention_mask[None, :, :, None, None]
            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
            .reshape(-1, top_k, num_experts)
            .to(compute_device)
        )

        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
            expert_attention_mask, dim=0
        )

        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(compute_device)
        )

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
            router_per_expert_attention_mask, dim=0
        )

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts


# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mixtral
class MixtralRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        MixtralRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


# copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Mixtral
# TODO @longjie no longer copied from Mistral after static cache
class MixtralRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self.max_seq_len_cached = max_position_embeddings

    @torch.no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


# copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
# TODO @longjie no longer copied from Mistral after static cache
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


# copied from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->Mixtral
# TODO @longjie no longer copied from Mistral after static cache
class MixtralAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".
    """

    def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.rotary_emb = MixtralRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


# copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Mixtral
# TODO @longjie no longer copied from Mistral after static cache
class MixtralFlashAttention2(MixtralAttention):
    """
    Mixtral flash attention module. This module inherits from `MixtralAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ):
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(value_states, position_ids)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        use_sliding_windows = (
            _flash_supports_window_size
            and getattr(self.config, "sliding_window", None) is not None
            and kv_seq_len > self.config.sliding_window
            and self.config.use_sliding_window
        )

        if not _flash_supports_window_size:
            logger.warning_once(
                "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
                " make sure to upgrade flash-attn library."
            )

        if past_key_value is not None:
            # Activate slicing cache only if the config has a value `sliding_windows` attribute
            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
            if (
                getattr(self.config, "sliding_window", None) is not None
                and kv_seq_len > self.config.sliding_window
                and cache_has_contents
            ):
                slicing_tokens = 1 - self.config.sliding_window

                past_key = past_key_value[self.layer_idx][0]
                past_value = past_key_value[self.layer_idx][1]

                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
                past_value = past_value[:, :, slicing_tokens:, :].contiguous()

                if past_key.shape[-2] != self.config.sliding_window - 1:
                    raise ValueError(
                        f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
                        f" {past_key.shape}"
                    )

                if attention_mask is not None:
                    attention_mask = attention_mask[:, slicing_tokens:]
                    attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)

            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

            # we slice the states for static kv cache to be supported in FA2. Not sure it's a must as compile fails
            # for bsz == 1, avoid using slice to capture cuda graph
            if cache_position is not None and q_len > 1:
                key_states = key_states[:, :, : cache_position[-1] + 1, :]
                value_states = value_states[:, :, : cache_position[-1] + 1, :]

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in float16 just to be sure everything works as expected.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reashape to the expected shape for Flash Attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attn_output = self._flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=getattr(self.config, "sliding_window", None),
            is_causal=self.is_causal,
        )

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
    

    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        q_len,
        position_ids,
        dropout,
        sliding_window,
        is_causal,
        softmax_scale=None,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`float`):
                Attention dropout
            
        """
        
        # Decide whether to use SWA or not by layer index.
        # if use_sliding_windows and self.layer_idx >= self.config.max_window_layers:
        #     use_sliding_windows = False
        use_sliding_windows = False

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, q_len
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            if not use_sliding_windows:
                attn_output_unpad = flash_attn_varlen_func(
                    query_states,
                    key_states,
                    value_states,
                    cu_seqlens_q=cu_seqlens_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_q=max_seqlen_in_batch_q,
                    max_seqlen_k=max_seqlen_in_batch_k,
                    dropout_p=dropout,
                    softmax_scale=softmax_scale,
                    causal=is_causal,
                )
            else:
                attn_output_unpad = flash_attn_varlen_func(
                    query_states,
                    key_states,
                    value_states,
                    cu_seqlens_q=cu_seqlens_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_q=max_seqlen_in_batch_q,
                    max_seqlen_k=max_seqlen_in_batch_k,
                    dropout_p=dropout,
                    softmax_scale=softmax_scale,
                    causal=is_causal,
                    window_size=(self.config.sliding_window, self.config.sliding_window),
                )

            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, q_len)
        else:
            if not use_sliding_windows:
                if q_len == 1:
                    position_ids = position_ids.to(dtype=torch.int32).squeeze(1)
                    attn_output = flash_attn_with_kvcache(
                        query_states,
                        key_states,
                        value_states,
                        cache_seqlens=position_ids,
                        softmax_scale=softmax_scale,
                        causal=is_causal,
                    )   
                else:
                    attn_output = flash_attn_func(
                        query_states,
                        key_states,
                        value_states,
                        dropout,
                        softmax_scale=softmax_scale,
                        causal=is_causal,
                    )
            else:
                attn_output = flash_attn_func(
                    query_states,
                    key_states,
                    value_states,
                    dropout,
                    softmax_scale=softmax_scale,
                    causal=is_causal,
                    window_size=(self.config.sliding_window, self.config.sliding_window),
                )

        return attn_output

    # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape

        # On the first iteration we need to properly re-create the padding mask
        # by slicing it on the proper place
        if kv_seq_len != attention_mask.shape[-1]:
            attention_mask_num_tokens = attention_mask.shape[-1]
            attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]

        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)

        key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
        value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)

        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )


# copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Mixtral
# TODO @longjie no longer copied from Mistral after static cache
class MixtralSdpaAttention(MixtralAttention):
    """
    Mixtral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `MixtralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    # Adapted from MixtralAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "MixtralModel is using MixtralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, position_ids)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value


MIXTRAL_ATTENTION_CLASSES = {
    "eager": MixtralAttention,
    "flash_attention_2": MixtralFlashAttention2,
    "sdpa": MixtralSdpaAttention,
}


class MixtralBlockSparseTop2MLP(nn.Module):
    def __init__(self, config: MixtralConfig):
        super().__init__()
        self.ffn_dim = config.intermediate_size
        self.hidden_dim = config.hidden_size

        self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)  # gate
        self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)  # down
        self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)  # up

        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states):
        current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
        current_hidden_states = self.w2(current_hidden_states)
        return current_hidden_states


class MixtralSparseMoeBlock(nn.Module):
    """
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accomodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_dim = config.hidden_size
        self.ffn_dim = config.intermediate_size
        self.num_experts = config.num_local_experts
        self.top_k = config.num_experts_per_tok

        # gating
        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)

        self.experts = nn.ModuleList([MixtralBlockSparseTop2MLP(config) for _ in range(self.num_experts)])

        # Jitter parameters
        self.jitter_noise = config.router_jitter_noise

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        if self.training and self.jitter_noise > 0:
            hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise)
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits


class MixtralDecoderLayer(nn.Module):
    def __init__(self, config: MixtralConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = MIXTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

        self.block_sparse_moe = MixtralSparseMoeBlock(config)
        self.input_layernorm = MixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = MixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        """

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states, router_logits = self.block_sparse_moe(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


MIXTRAL_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MixtralConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare Mixtral Model outputting raw hidden-states without any specific head on top.",
    MIXTRAL_START_DOCSTRING,
)
# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2PreTrainedModel with Qwen2->Mixtral
class MixtralPreTrainedModel(PreTrainedModel):
    config_class = MixtralConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MixtralDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


MIXTRAL_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        output_router_logits (`bool`, *optional*):
            Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
            should not be returned during inference.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare Mixtral Model outputting raw hidden-states without any specific head on top.",
    MIXTRAL_START_DOCSTRING,
)
# copied from transformers.models.mistral.modeling_mistral.MistralModel with MISTRAL->MIXTRAL,Mistral->Mixtral
# TODO @longjie no longer copied from Mistral after static cache
class MixtralModel(MixtralPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MixtralDecoderLayer`]

    Args:
        config: MixtralConfig
    """

    def __init__(self, config: MixtralConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [MixtralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self._attn_implementation = config._attn_implementation
        self.norm = MixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    # Ignore copy
    @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, MoeModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        use_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
            use_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    output_router_logits,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    output_router_logits=output_router_logits,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
                if v is not None
            )
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )

    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            if attention_mask.max() != 0:
                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
            causal_mask = attention_mask
        else:
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask


class MixtralForCausalLM(MixtralPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = MixtralModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_local_experts
        self.num_experts_per_tok = config.num_experts_per_tok
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    # Ignore copy
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MixtralForCausalLM

        >>> model = MixtralForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_router_logits=output_router_logits,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits if return_dict else outputs[-1],
                self.num_experts,
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device

        if not return_dict:
            output = (logits,) + outputs[1:]
            if output_router_logits:
                output = (aux_loss,) + output
            return (loss,) + output if loss is not None else output

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        output_router_logits=False,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        if past_key_values is not None:
            if inputs_embeds is not None:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
                input_ids = input_ids[:, cache_position]

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and cache_position[0] == 0:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases

        model_inputs.update(
            {
                "position_ids": position_ids,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "output_router_logits": output_router_logits,
            }
        )
        return model_inputs


@add_start_docstrings(
    """
    The Mixtral Model transformer with a sequence classification head on top (linear layer).

    [`MixtralForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    MIXTRAL_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mixtral, LLAMA->MIXTRAL
class MixtralForSequenceClassification(MixtralPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = MixtralModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                sequence_lengths = sequence_lengths % input_ids.shape[-1]
                sequence_lengths = sequence_lengths.to(logits.device)
            else:
                sequence_lengths = -1

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
    The Mixtral Model transformer with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    MIXTRAL_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Mixtral, LLAMA->MIXTRAL
class MixtralForTokenClassification(MixtralPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = MixtralModel(config)
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

================================================
FILE: archive/ktransformers/models/modeling_qwen2_moe.py
================================================
# coding=utf-8
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
''' 
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.42.3/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
# 
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Qwen2MoE model."""

import inspect
import math
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
)
from transformers.modeling_outputs import (
    MoeCausalLMOutputWithPast,
    MoeModelOutputWithPast,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig


if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func, flash_attn_with_kvcache
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)

logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Qwen/Qwen1.5-MoE-A2.7B"
_CONFIG_FOR_DOC = "Qwen2MoeConfig"


# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
def load_balancing_loss_func(
    gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2, attention_mask: Optional[torch.Tensor] = None
) -> float:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        attention_mask (`torch.Tensor`, None):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.
        num_experts (`int`, *optional*):
            Number of experts

    Returns:
        The auxiliary loss.
    """
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0

    if isinstance(gate_logits, tuple):
        compute_device = gate_logits[0].device
        concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)

    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)

    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)

        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
        expert_attention_mask = (
            attention_mask[None, :, :, None, None]
            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
            .reshape(-1, top_k, num_experts)
            .to(compute_device)
        )

        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
            expert_attention_mask, dim=0
        )

        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(compute_device)
        )

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
            router_per_expert_attention_mask, dim=0
        )

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts


# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2Moe
class Qwen2MoeRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Qwen2MoeRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Qwen2Moe
class Qwen2MoeRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        super().__init__()
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # For BC we register cos and sin cached
        self.max_seq_len_cached = max_position_embeddings

    @torch.no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


# Modified from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2Moe
class Qwen2MoeMLP(nn.Module):
    def __init__(self, config, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))


# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2Attention with Qwen2->Qwen2Moe
class Qwen2MoeAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".
    """

    def __init__(self, config: Qwen2MoeConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.rotary_emb = Qwen2MoeRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2FlashAttention2 with Qwen2->Qwen2Moe
class Qwen2MoeFlashAttention2(Qwen2MoeAttention):
    """
    Qwen2Moe flash attention module, following Qwen2Moe attention module. This module inherits from `Qwen2MoeAttention`
    as the weights of the module stays untouched. The only required change would be on the forward pass
    where it needs to correctly call the public API of flash attention and deal with padding tokens
    in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
    config.max_window_layers layers.
    """

    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ):
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        use_sliding_windows = (
            _flash_supports_window_size
            and getattr(self.config, "sliding_window", None) is not None
            and kv_seq_len > self.config.sliding_window
            and self.config.use_sliding_window
        )

        if not _flash_supports_window_size:
            logger.warning_once(
                "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
                " make sure to upgrade flash-attn library."
            )

        if past_key_value is not None:
            # Activate slicing cache only if the config has a value `sliding_windows` attribute
            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
            if (
                getattr(self.config, "sliding_window", None) is not None
                and kv_seq_len > self.config.sliding_window
                and cache_has_contents
            ):
                slicing_tokens = 1 - self.config.sliding_window

                past_key = past_key_value[self.layer_idx][0]
                past_value = past_key_value[self.layer_idx][1]

                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
                past_value = past_value[:, :, slicing_tokens:, :].contiguous()

                if past_key.shape[-2] != self.config.sliding_window - 1:
                    raise ValueError(
                        f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
                        f" {past_key.shape}"
                    )

                if attention_mask is not None:
                    attention_mask = attention_mask[:, slicing_tokens:]
                    attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)

            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
            # we slice the states for static kv cache to be supported in FA2. Not sure it's a must as compile fails
            # for bsz == 1, avoid using slice to capture cuda graph
            if cache_position is not None and q_len > 1:
                key_states = key_states[:, :, : cache_position[-1] + 1, :]
                value_states = value_states[:, :, : cache_position[-1] + 1, :]

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in float16 just to be sure everything works as expected.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reashape to the expected shape for Flash Attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attn_output = self._flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            use_sliding_windows=use_sliding_windows,
        )

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        position_ids,
        dropout=0.0,
        softmax_scale=None,
        use_sliding_windows=False,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`float`):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
            use_sliding_windows (`bool`, *optional*):
                Whether to activate sliding window attention.
        """
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
            causal = self.is_causal and query_length != 1

        # Decide whether to use SWA or not by layer index.
        if use_sliding_windows and self.layer_idx >= self.config.max_window_layers:
            use_sliding_windows = False

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            if not use_sliding_windows:
                attn_output_unpad = flash_attn_varlen_func(
                    query_states,
                    key_states,
                    value_states,
                    cu_seqlens_q=cu_seqlens_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_q=max_seqlen_in_batch_q,
                    max_seqlen_k=max_seqlen_in_batch_k,
                    dropout_p=dropout,
                    softmax_scale=softmax_scale,
                    causal=causal,
                )
            else:
                attn_output_unpad = flash_attn_varlen_func(
                    query_states,
                    key_states,
                    value_states,
                    cu_seqlens_q=cu_seqlens_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_q=max_seqlen_in_batch_q,
                    max_seqlen_k=max_seqlen_in_batch_k,
                    dropout_p=dropout,
                    softmax_scale=softmax_scale,
                    causal=causal,
                    window_size=(self.config.sliding_window, self.config.sliding_window),
                )

            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            if not use_sliding_windows:
                if query_length == 1:
                    position_ids = position_ids.to(dtype=torch.int32).squeeze(1)
                    attn_output = flash_attn_with_kvcache(
                        query_states,
                        key_states,
                        value_states,
                        cache_seqlens=position_ids,
                        softmax_scale=softmax_scale,
                        causal=causal,
                    )   
                else:
                    attn_output = flash_attn_func(
                        query_states,
                        key_states,
                        value_states,
                        dropout,
                        softmax_scale=softmax_scale,
                        causal=causal,
                    )
            else:
                attn_output = flash_attn_func(
                    query_states,
                    key_states,
                    value_states,
                    dropout,
                    softmax_scale=softmax_scale,
                    causal=causal,
                    window_size=(self.config.sliding_window, self.config.sliding_window),
                )

        return attn_output

    # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape

        # On the first iteration we need to properly re-create the padding mask
        # by slicing it on the proper place
        if kv_seq_len != attention_mask.shape[-1]:
            attention_mask_num_tokens = attention_mask.shape[-1]
            attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]

        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)

        key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
        value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)

        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )


# Copied from transformers.models.mixtral.modeling_mixtral.MixtralSdpaAttention with Mixtral->Qwen2Moe
class Qwen2MoeSdpaAttention(Qwen2MoeAttention):
    """
    Qwen2Moe attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `Qwen2MoeAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    # Adapted from Qwen2MoeAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "Qwen2MoeModel is using Qwen2MoeSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value


QWEN2MOE_ATTENTION_CLASSES = {
    "eager": Qwen2MoeAttention,
    "flash_attention_2": Qwen2MoeFlashAttention2,
    "sdpa": Qwen2MoeSdpaAttention,
}


class Qwen2MoeSparseMoeBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_experts = config.num_experts
        self.top_k = config.num_experts_per_tok
        self.norm_topk_prob = config.norm_topk_prob

        # gating
        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
        self.experts = nn.ModuleList(
            [Qwen2MoeMLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(self.num_experts)]
        )

        self.shared_expert = Qwen2MoeMLP(config, intermediate_size=config.shared_expert_intermediate_size)
        self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        if self.norm_topk_prob:
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))

        shared_expert_output = self.shared_expert(hidden_states)
        shared_expert_output = F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output

        final_hidden_states = final_hidden_states + shared_expert_output

        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits


class Qwen2MoeDecoderLayer(nn.Module):
    def __init__(self, config: Qwen2MoeConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = QWEN2MOE_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

        if (layer_idx not in config.mlp_only_layers) and (
            config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
        ):
            self.mlp = Qwen2MoeSparseMoeBlock(config)
        else:
            self.mlp = Qwen2MoeMLP(config, intermediate_size=config.intermediate_size)

        self.input_layernorm = Qwen2MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen2MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss,
                and should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        """

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)

        hidden_states = self.mlp(hidden_states)
        if isinstance(hidden_states, tuple):
            hidden_states, router_logits = hidden_states
        else:
            router_logits = None

        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


QWEN2MOE_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`Qwen2MoeConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
    QWEN2MOE_START_DOCSTRING,
)
class Qwen2MoePreTrainedModel(PreTrainedModel):
    config_class = Qwen2MoeConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen2MoeDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


QWEN2MOE_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        output_router_logits (`bool`, *optional*):
            Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
            should not be returned during inference.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
    QWEN2MOE_START_DOCSTRING,
)
class Qwen2MoeModel(Qwen2MoePreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`]

    Args:
        config: Qwen2MoeConfig
    """

    def __init__(self, config: Qwen2MoeConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Qwen2MoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self._attn_implementation = config._attn_implementation
        self.norm = Qwen2MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, MoeModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        use_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            use_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    output_router_logits,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    output_router_logits=output_router_logits,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits and layer_outputs[-1] is not None:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
                if v is not None
            )
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )

    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            if attention_mask.max() != 0:
                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
            causal_mask = attention_mask
        else:
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask


class Qwen2MoeForCausalLM(Qwen2MoePreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen2MoeModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_experts
        self.num_experts_per_tok = config.num_experts_per_tok
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen2MoeForCausalLM

        >>> model = Qwen2MoeForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_router_logits=output_router_logits,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits if return_dict else outputs[-1],
                self.num_experts,
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device

        if not return_dict:
            output = (logits,) + outputs[1:]
            if output_router_logits:
                output = (aux_loss,) + output
            return (loss,) + output if loss is not None else output

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        use_cache=True,
        **kwargs,
    ):
        past_length = 0
        # Omit tokens covered by past_key_values
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length()
                max_cache_length = (
                    torch.tensor(past_key_values.get_max_length(), device=input_ids.device)
                    if past_key_values.get_max_length() is not None
                    else None
                )
                cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length)
            # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_length == 0:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1]
        if cache_position is None:
            cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device)
        elif use_cache:
            cache_position = cache_position[-input_length:]

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


@add_start_docstrings(
    """
    The Qwen2MoE Model transformer with a sequence classification head on top (linear layer).

    [`Qwen2MoeForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    QWEN2MOE_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Qwen2Moe, LLAMA->QWEN2MOE
class Qwen2MoeForSequenceClassification(Qwen2MoePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen2MoeModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                sequence_lengths = sequence_lengths % input_ids.shape[-1]
                sequence_lengths = sequence_lengths.to(logits.device)
            else:
                sequence_lengths = -1

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
    The Qwen2MoE Model transformer with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    QWEN2MOE_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Qwen2Moe, LLAMA->QWEN2MOE
class Qwen2MoeForTokenClassification(Qwen2MoePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen2MoeModel(config)
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


================================================
FILE: archive/ktransformers/models/modeling_qwen3_moe.py
================================================
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/qwen3_moe/modular_qwen3_moe.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_qwen3_moe.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
from transformers.generation import GenerationMixin
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
# from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    MoeCausalLMOutputWithPast,
    MoeModelOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
# from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.modeling_utils import PreTrainedModel
# from transformers.processing_utils import Unpack
from transformers.utils import (
    # LossKwargs,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from transformers.utils.deprecation import deprecate_kwarg
from .configuration_qwen3_moe import Qwen3MoeConfig

from ktransformers.models.modeling_qwen2_moe import Qwen2MoeRotaryEmbedding

logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Qwen/Qwen3-MoE-15B-A2B"
_CONFIG_FOR_DOC = "Qwen3MoeConfig"


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Qwen3MoeAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Qwen3MoeConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.num_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        self.q_norm = Qwen3MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)  # unlike olmo, only on the head dim!
        self.k_norm = Qwen3MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)  # thus post q_norm does not need reshape

        self.rotary_emb = Qwen2MoeRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

        self.sliding_window = config.sliding_window
        if not (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            self.sliding_window = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        # **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        # if self.config._attn_implementation != "eager":
        #     if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
        #         logger.warning_once(
        #             "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
        #             'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
        #         )
        #     else:
        #         attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,  # diff with Llama
            # **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Qwen3MoeMLP(nn.Module):
    def __init__(self, config, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class Qwen3MoeSparseMoeBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_experts = config.num_experts
        self.top_k = config.num_experts_per_tok
        self.norm_topk_prob = config.norm_topk_prob

        # gating
        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
        self.experts = nn.ModuleList(
            [Qwen3MoeMLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(self.num_experts)]
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        if self.norm_topk_prob:  # only diff with mixtral sparse moe block!
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits


class Qwen3MoeRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Qwen3MoeRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        self.hidden_size = hidden_size

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class Qwen3MoeDecoderLayer(nn.Module):
    def __init__(self, config: Qwen3MoeConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = Qwen3MoeAttention(config, layer_idx)
        self.mlp = Qwen3MoeMLP(config)

        self.self_attn = Qwen3MoeAttention(config, layer_idx)

        if (layer_idx not in config.mlp_only_layers) and (
            config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
        ):
            self.mlp = Qwen3MoeSparseMoeBlock(config)
        else:
            self.mlp = Qwen3MoeMLP(config, intermediate_size=config.intermediate_size)

        self.input_layernorm = Qwen3MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen3MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        # **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss,
                and should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)

        hidden_states = self.mlp(hidden_states)
        if isinstance(hidden_states, tuple):
            hidden_states, router_logits = hidden_states
        else:
            router_logits = None

        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


def _compute_default_rope_parameters(
    config: Optional[Qwen3MoeConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
        dim = int(config.head_dim * partial_rotary_factor)

    attention_factor = 1.0  # Unused in this type of RoPE

    # Compute the inverse frequencies
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
    return inv_freq, attention_factor

class Qwen3MoeRotaryEmbedding(nn.Module):
    def __init__(self, config: Qwen3MoeConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        self.scaling_factor = 1.0
        self.dim = config.head_dim
        self.max_position_embeddings = config.max_position_embeddings
        self.base = config.rope_theta
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))

        inv_freq, self.attention_scaling = _compute_default_rope_parameters(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
            self.max_seq_len_cached = seq_len

        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
            # This .to() is needed if the model has been moved to a device after being initialized (because
            # the buffer is automatically moved, but not the original copy)
            self.original_inv_freq = self.original_inv_freq.to(device)
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @torch.no_grad()
    def forward(self, x, position_ids):
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"

        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


QWEN3_MOE_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`Qwen3MoeConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare Qwen3Moe Model outputting raw hidden-states without any specific head on top.",
    QWEN3_MOE_START_DOCSTRING,
)
class Qwen3MoePreTrainedModel(PreTrainedModel):
    config_class = Qwen3MoeConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen3MoeDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = False  # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


QWEN3_MOE_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare Qwen3Moe Model outputting raw hidden-states without any specific head on top.",
    QWEN3_MOE_START_DOCSTRING,
)
class Qwen3MoeModel(Qwen3MoePreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen3MoeDecoderLayer`]

    Args:
        config: Qwen3MoeConfig
    """

    def __init__(self, config: Qwen3MoeConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Qwen3MoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Qwen3MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Qwen3MoeRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        # **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    output_router_logits,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    output_router_logits=output_router_logits,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                    # **flash_attn_kwargs,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        output = MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )
        return output if return_dict else output.to_tuple()

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and past_key_values is not None:
                is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
                if is_padding_right:
                    raise ValueError(
                        "You are attempting to perform batched generation with padding_side='right'"
                        " this may lead to unexpected behaviour for Flash Attention version of Qwen3Moe. Make sure to "
                        " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
                    )
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)
        using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and not (using_static_cache or using_sliding_window_cache)
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                sliding_window=self.config.sliding_window,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        # SlidingWindowCache or StaticCache
        if using_sliding_window_cache or using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        # DynamicCache or no cache
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
            config=self.config,
            past_key_values=past_key_values,
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        device: torch.device,
        cache_position: torch.Tensor,
        batch_size: int,
        config: Qwen3MoeConfig,
        past_key_values: Cache,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
            config (`Qwen3MoeConfig`):
                The model's configuration class
            past_key_values (`Cache`):
                The cache class that is being used currently to generate
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            if config.sliding_window is not None:
                # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
                # the check is needed to verify is current checkpoint was trained with sliding window or not
                if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
                    sliding_attend_mask = torch.arange(target_length, device=device) <= (
                        cache_position.reshape(-1, 1) - config.sliding_window
                    )
                    diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
            causal_mask *= diagonal_attend_mask
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                if attention_mask.shape[-1] > target_length:
                    attention_mask = attention_mask[:, :target_length]
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        return causal_mask


# class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
class KwargsForCausalLM(): ...


def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, Tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
    top_k=2,
    attention_mask: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, int]:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    """
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0

    if isinstance(gate_logits, tuple):
        compute_device = gate_logits[0].device
        concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)

    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)

    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)

        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
        expert_attention_mask = (
            attention_mask[None, :, :, None, None]
            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
            .reshape(-1, top_k, num_experts)
            .to(compute_device)
        )

        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
            expert_attention_mask, dim=0
        )

        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(compute_device)
        )

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
            router_per_expert_attention_mask, dim=0
        )

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts


class Qwen3MoeForCausalLM(Qwen3MoePreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen3MoeModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_experts
        self.num_experts_per_tok = config.num_experts_per_tok

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
    @add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        # **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            logits_to_keep (`int` or `torch.Tensor`, *optional*):
                If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
                If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
                This is useful when using packed tensor format (single dimension for batch and sequence length).

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3MoeForCausalLM

        >>> model = Qwen3MoeForCausalLM.from_pretrained("Qwen/Qwen3-MoE-15B-A2B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-MoE-15B-A2B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_router_logits=output_router_logits,
            return_dict=return_dict,
            cache_position=cache_position,
            # **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits if return_dict else outputs[-1],
                self.num_experts,
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device

        if not return_dict:
            output = (logits,) + outputs[1:]
            if output_router_logits:
                output = (aux_loss,) + output
            return (loss,) + output if loss is not None else output

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )


@add_start_docstrings(
    """
    The Qwen3Moe Model transformer with a sequence classification head on top (linear layer).

    [`Qwen3MoeForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    QWEN3_MOE_START_DOCSTRING,
)
class Qwen3MoeForSequenceClassification(Qwen3MoePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen3MoeModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
    The Qwen3Moe Model transformer with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    QWEN3_MOE_START_DOCSTRING,
)
class Qwen3MoeForTokenClassification(Qwen3MoePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen3MoeModel(config)
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.config)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
The Qwen3Moe Model transformer with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    QWEN3_MOE_START_DOCSTRING,
)
class Qwen3MoeForQuestionAnswering(Qwen3MoePreTrainedModel):
    base_model_prefix = "transformer"

    def __init__(self, config):
        super().__init__(config)
        self.transformer = Qwen3MoeModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.transformer.embed_tokens

    def set_input_embeddings(self, value):
        self.transformer.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        loss = None
        if start_positions is not None and end_positions is not None:
            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "Qwen3MoeForCausalLM",
    "Qwen3MoeForQuestionAnswering",
    "Qwen3MoeModel",
    "Qwen3MoePreTrainedModel",
    "Qwen3MoeForSequenceClassification",
    "Qwen3MoeForTokenClassification",
]

================================================
FILE: archive/ktransformers/models/modeling_qwen3_next.py
================================================
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/qwen3_next/modular_qwen3_next.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_qwen3_next.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Optional, Union

import torch
import torch.nn.functional as F
from torch import nn

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache
from transformers.generation import GenerationMixin
from transformers.masking_utils import create_causal_mask
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_layers import (
    GenericForQuestionAnswering,
    GenericForSequenceClassification,
    GenericForTokenClassification,
    GradientCheckpointingLayer,
)
from transformers.modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.processing_utils import Unpack
from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from transformers.utils.deprecation import deprecate_kwarg
from transformers.utils.generic import OutputRecorder, check_model_inputs
try:
    from transformers.utils.import_utils import (
        is_causal_conv1d_available,
        is_flash_linear_attention_available,
    )
except ImportError:
    is_causal_conv1d_available = lambda: False


try:
    from transformers.utils.import_utils import (
        is_flash_linear_attention_available,
    )
except ImportError:
    is_flash_linear_attention_available = lambda: False


from .configuration_qwen3_next import Qwen3NextConfig


if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
    causal_conv1d_update, causal_conv1d_fn = None, None


if is_flash_linear_attention_available():
    from fla.modules import FusedRMSNormGated
    from fla.ops.gated_delta_rule import chunk_gated_delta_rule, fused_recurrent_gated_delta_rule
else:
    chunk_gated_delta_rule, fused_recurrent_gated_delta_rule = None, None
    FusedRMSNormGated = None

logger = logging.get_logger(__name__)


class Qwen3NextRMSNormGated(nn.Module):
    def __init__(self, hidden_size, eps=1e-6, **kwargs):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states, gate=None):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        # Norm before gate
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        hidden_states = self.weight * hidden_states.to(input_dtype)
        hidden_states = hidden_states * F.silu(gate.to(torch.float32))

        return hidden_states.to(input_dtype)


class Qwen3NextDynamicCache:
    """
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the linear attention
    cache (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for gated deltanet cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For linear attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `recurrent_states` represents the recurrent state and has a shape of `(batch_size, d_inner, d_state)`.
    """

    is_compileable = False

    def __init__(self, config: Qwen3NextConfig):
        super().__init__()
        self.layer_types = config.layer_types
        self.transformer_layers = [
            i for i in range(config.num_hidden_layers) if self.layer_types[i] == "full_attention"
        ]
        self.last_linear_layer = len(self.layer_types) - 1 - self.layer_types[::-1].index("linear_attention")

        # Initialize everything to None -> will be lazy initialized to allow multi-gpu (device_map) inference
        self.conv_states = [None for _ in range(config.num_hidden_layers)]
        self.recurrent_states = [None for _ in range(config.num_hidden_layers)]
        self.key_cache = [None for _ in range(config.num_hidden_layers)]
        self.value_cache = [None for _ in range(config.num_hidden_layers)]

    def __len__(self):
        return len(self.layer_types)

    def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        return self.key_cache[layer_idx], self.value_cache[layer_idx]

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[dict[str, Any]] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        if self.key_cache[layer_idx] is None:
            self.key_cache[layer_idx] = key_states
            self.value_cache[layer_idx] = value_states
        else:
            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2)
            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2)

        return self.key_cache[layer_idx], self.value_cache[layer_idx]

    def reorder_cache(self, beam_idx: torch.LongTensor):
        """Reorders the cache for beam search, given the selected beam indices."""
        for layer_idx in range(len(self.key_cache)):
            if self.key_cache[layer_idx] is not None:
                device = self.key_cache[layer_idx].device
                beam_idx = beam_idx.to(device)
                self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx)
                self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx)

            if self.conv_states[layer_idx] is not None:
                device = self.conv_states[layer_idx].device
                beam_idx = beam_idx.to(device)
                self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx)
                self.recurrent_states[layer_idx] = self.recurrent_states[layer_idx].index_select(0, beam_idx)

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        # take any layer that contains cache and not empty tensor
        layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
        if len(self.key_cache) <= layer_idx or self.key_cache[layer_idx] is None:
            return 0
        return self.key_cache[layer_idx].shape[-2]

    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
        """
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
        the given layer at `layer_idx`.
        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns for each layer.
        """
        kv_offset = 0
        query_length = cache_position.shape[0]
        past_seen_tokens = self.get_seq_length(layer_idx)
        kv_length = query_length + past_seen_tokens
        return kv_length, kv_offset

    @property
    def has_previous_state(self):
        """We have a previous state if the last linear (conv) layer was already updated."""
        return self.conv_states[self.last_linear_layer] is not None


class Qwen3NextRotaryEmbedding(nn.Module):
    inv_freq: torch.Tensor  # fix linting for `register_buffer`

    def __init__(self, config: Qwen3NextConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


class Qwen3NextRMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.hidden_size = dim
        self.variance_epsilon = eps
        self.eps = eps
        self.weight = nn.Parameter(torch.zeros(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float())
        # Llama does x.to(float16) * w whilst Qwen3Next is (x * w).to(float16)
        # See https://github.com/huggingface/transformers/pull/29402
        output = output * (1.0 + self.weight.float())
        return output.type_as(x)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.eps}"


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


# Adapted from transformers.models.glm.modular_glm.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    # Keep half or full tensor for later concatenation
    rotary_dim = cos.shape[-1]
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]

    # Apply rotary embeddings on the first half or full tensor
    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)

    # Concatenate back to full shape
    q_embed = torch.cat([q_embed, q_pass], dim=-1)
    k_embed = torch.cat([k_embed, k_pass], dim=-1)
    return q_embed, k_embed


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Qwen3NextAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Qwen3NextConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_attention_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True
        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim * 2, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        self.q_norm = Qwen3NextRMSNorm(self.head_dim, eps=config.rms_norm_eps)  # unlike olmo, only on the head dim!
        self.k_norm = Qwen3NextRMSNorm(
            self.head_dim, eps=config.rms_norm_eps
        )  # thus post q_norm does not need reshape

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states, gate = torch.chunk(
            self.q_proj(hidden_states).view(*input_shape, -1, self.head_dim * 2), 2, dim=-1
        )
        gate = gate.reshape(*input_shape, -1)

        query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = attn_output * torch.sigmoid(gate)

        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


def apply_mask_to_padding_states(hidden_states, attention_mask):
    """
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    """
    if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
        dtype = hidden_states.dtype
        hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)

    return hidden_states


is_fast_path_available = all(
    (causal_conv1d_fn, causal_conv1d_update, chunk_gated_delta_rule, fused_recurrent_gated_delta_rule)
)


def torch_causal_conv1d_update(
    hidden_states,
    conv_state,
    weight,
    bias=None,
    activation=None,
):
    _, hidden_size, seq_len = hidden_states.shape
    state_len = conv_state.shape[-1]

    hidden_states_new = torch.cat([conv_state, hidden_states], dim=-1).to(weight.dtype)
    conv_state.copy_(hidden_states_new[:, :, -state_len:])
    out = F.conv1d(hidden_states_new, weight.unsqueeze(1), bias, padding=0, groups=hidden_size)
    out = F.silu(out[:, :, -seq_len:])
    out = out.to(hidden_states.dtype)
    return out


def torch_chunk_gated_delta_rule(
    query,
    key,
    value,
    g,
    beta,
    chunk_size=64,
    initial_state=None,
    output_final_state=False,
    use_qk_l2norm_in_kernel=False,
):
    initial_dtype = query.dtype
    if use_qk_l2norm_in_kernel:
        query = F.normalize(query, p=2, dim=-1)
        key = F.normalize(key, p=2, dim=-1)
    query, key, value, beta, g = [
        x.transpose(1, 2).contiguous().to(torch.float32) for x in (query, key, value, beta, g)
    ]

    batch_size, sequence_length, num_heads, k_head_dim = key.shape
    v_head_dim = value.shape[-1]
    pad_size = (chunk_size - num_heads % chunk_size) % chunk_size
    query = F.pad(query, (0, 0, 0, pad_size))
    key = F.pad(key, (0, 0, 0, pad_size))
    value = F.pad(value, (0, 0, 0, pad_size))
    beta = F.pad(beta, (0, pad_size))
    g = F.pad(g, (0, pad_size))
    tot_heads = num_heads + pad_size
    scale = 1 / (query.shape[-1] ** 0.5)
    query = query * scale

    v_beta = value * beta.unsqueeze(-1)
    k_beta = key * beta.unsqueeze(-1)
    # reshape to chunks
    query, key, value, k_beta, v_beta = [
        x.reshape(x.shape[0], x.shape[1], -1, chunk_size, x.shape[-1]) for x in (query, key, value, k_beta, v_beta)
    ]
    g = g.reshape(g.shape[0], g.shape[1], -1, chunk_size)
    mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), diagonal=0)

    # chunk decay
    g = g.cumsum(dim=-1)
    decay_mask = ((g.unsqueeze(-1) - g.unsqueeze(-2)).tril().exp().float()).tril()
    attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
    for i in range(1, chunk_size):
        row = attn[..., i, :i].clone()
        sub = attn[..., :i, :i].clone()
        attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
    attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
    value = attn @ v_beta
    k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
    last_recurrent_state = (
        torch.zeros(batch_size, sequence_length, k_head_dim, v_head_dim).to(value)
        if initial_state is None
        else initial_state.to(value)
    )
    core_attn_out = torch.zeros_like(value)
    mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), diagonal=1)

    # for each chunk
    for i in range(0, tot_heads // chunk_size):
        q_i, k_i, v_i = query[:, :, i], key[:, :, i], value[:, :, i]
        attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
        v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
        v_new = v_i - v_prime
        attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
        core_attn_out[:, :, i] = attn_inter + attn @ v_new
        last_recurrent_state = (
            last_recurrent_state * g[:, :, i, -1, None, None].exp()
            + (k_i * (g[:, :, i, -1, None] - g[:, :, i]).exp()[..., None]).transpose(-1, -2) @ v_new
        )

    if not output_final_state:
        last_recurrent_state = None
    core_attn_out = core_attn_out.reshape(core_attn_out.shape[0], core_attn_out.shape[1], -1, core_attn_out.shape[-1])
    core_attn_out = core_attn_out[:, :, :num_heads]
    core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype)
    return core_attn_out, last_recurrent_state


def torch_recurrent_gated_delta_rule(
    query, key, value, g, beta, initial_state, output_final_state, use_qk_l2norm_in_kernel=False
):
    initial_dtype = query.dtype
    if use_qk_l2norm_in_kernel:
        query = F.normalize(query, p=2, dim=-1)
        key = F.normalize(key, p=2, dim=-1)
    query, key, value, beta, g = [
        x.transpose(1, 2).contiguous().to(torch.float32) for x in (query, key, value, beta, g)
    ]

    batch_size, sequence_length, num_heads, k_head_dim = key.shape
    v_head_dim = value.shape[-1]
    scale = 1 / (query.shape[-1] ** 0.5)
    query = query * scale

    core_attn_out = torch.zeros(batch_size, sequence_length, num_heads, v_head_dim).to(value)
    last_recurrent_state = (
        torch.zeros(batch_size, sequence_length, k_head_dim, v_head_dim).to(value)
        if initial_state is None
        else initial_state.to(value)
    )

    for i in range(num_heads):
        q_t = query[:, :, i]
        k_t = key[:, :, i]
        v_t = value[:, :, i]
        g_t = g[:, :, i].exp().unsqueeze(-1).unsqueeze(-1)
        beta_t = beta[:, :, i].unsqueeze(-1)

        last_recurrent_state = last_recurrent_state * g_t
        kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
        delta = (v_t - kv_mem) * beta_t
        last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta.unsqueeze(-2)
        core_attn_out[:, :, i] = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)

    if not output_final_state:
        last_recurrent_state = None
    core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype)
    return core_attn_out, last_recurrent_state


class Qwen3NextGatedDeltaNet(nn.Module):
    def __init__(self, config: Qwen3NextConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_v_heads = config.linear_num_value_heads
        self.num_k_heads = config.linear_num_key_heads
        self.head_k_dim = config.linear_key_head_dim
        self.head_v_dim = config.linear_value_head_dim
        self.key_dim = self.head_k_dim * self.num_k_heads
        self.value_dim = self.head_v_dim * self.num_v_heads

        self.conv_kernel_size = config.linear_conv_kernel_dim
        self.layer_idx = layer_idx
        self.activation = config.hidden_act
        self.act = ACT2FN[config.hidden_act]
        self.layer_norm_epsilon = config.rms_norm_eps
        
        self.config = config

        # QKV
        self.conv_dim = self.key_dim * 2 + self.value_dim
        self.conv1d = nn.Conv1d(
            in_channels=self.conv_dim,
            out_channels=self.conv_dim,
            bias=False,
            kernel_size=self.conv_kernel_size,
            groups=self.conv_dim,
            padding=self.conv_kernel_size - 1,
        )

        # projection of the input hidden states
        projection_size_qkvz = self.key_dim * 2 + self.value_dim * 2
        projection_size_ba = self.num_v_heads * 2
        self.in_proj_qkvz = nn.Linear(self.hidden_size, projection_size_qkvz, bias=False)
        self.in_proj_ba = nn.Linear(self.hidden_size, projection_size_ba, bias=False)

        # time step projection (discretization)
        # instantiate once and copy inv_dt in init_weights of PretrainedModel
        self.dt_bias = nn.Parameter(torch.ones(self.num_v_heads))

        A = torch.empty(self.num_v_heads).uniform_(0, 16)
        self.A_log = nn.Parameter(torch.log(A))

        self.norm = (
            Qwen3NextRMSNormGated(self.head_v_dim, eps=self.layer_norm_epsilon)
            if FusedRMSNormGated is None
            else FusedRMSNormGated(
                self.head_v_dim,
                eps=self.layer_norm_epsilon,
                activation=self.activation,
                device=torch.cuda.current_device(),
                dtype=config.dtype if config.dtype is not None else torch.get_current_dtype(),
            )
        )

        self.out_proj = nn.Linear(self.value_dim, self.hidden_size, bias=False)

        self.causal_conv1d_fn = causal_conv1d_fn
        self.causal_conv1d_update = causal_conv1d_update or torch_causal_conv1d_update
        self.chunk_gated_delta_rule = chunk_gated_delta_rule or torch_chunk_gated_delta_rule
        self.recurrent_gated_delta_rule = fused_recurrent_gated_delta_rule or torch_recurrent_gated_delta_rule

        if not is_fast_path_available:
            logger.warning_once(
                "The fast path is not available because one of the required library is not installed. Falling back to "
                "torch implementation. To install follow https://github.com/fla-org/flash-linear-attention#installation and"
                " https://github.com/Dao-AILab/causal-conv1d"
            )

    def fix_query_key_value_ordering(self, mixed_qkvz, mixed_ba):
        """
        Derives `query`, `key` and `value` tensors from `mixed_qkvz` and `mixed_ba`.
        """

        new_tensor_shape_qkvz = mixed_qkvz.size()[:-1] + (
            self.num_k_heads,
            2 * self.head_k_dim + 2 * self.head_v_dim * self.num_v_heads // self.num_k_heads,
        )
        new_tensor_shape_ba = mixed_ba.size()[:-1] + (self.num_k_heads, 2 * self.num_v_heads // self.num_k_heads)

        mixed_qkvz = mixed_qkvz.view(*new_tensor_shape_qkvz)
        mixed_ba = mixed_ba.view(*new_tensor_shape_ba)
        split_arg_list_qkvz = [
            self.head_k_dim,
            self.head_k_dim,
            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
        ]
        split_arg_list_ba = [self.num_v_heads // self.num_k_heads, self.num_v_heads // self.num_k_heads]
        query, key, value, z = torch.split(mixed_qkvz, split_arg_list_qkvz, dim=3)
        b, a = torch.split(mixed_ba, split_arg_list_ba, dim=3)
        # [b, sq, ng, np/ng * hn] -> [b, sq, np, hn]
        value = value.reshape(value.size(0), value.size(1), -1, self.head_v_dim)
        z = z.reshape(z.size(0), z.size(1), -1, self.head_v_dim)
        b = b.reshape(b.size(0), b.size(1), self.num_v_heads)
        a = a.reshape(a.size(0), a.size(1), self.num_v_heads)
        return query, key, value, z, b, a

    def forward(
        self,
        hidden_states: torch.Tensor,
        cache_params: Optional[Qwen3NextDynamicCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)

        # Set up dimensions for reshapes later
        batch_size, seq_len, _ = hidden_states.shape

        use_precomputed_states = (
            cache_params is not None
            and cache_params.has_previous_state
            and seq_len == 1
            and cache_position is not None
        )

        # getting projected states from cache if it exists
        if cache_params is not None:
            conv_state = cache_params.conv_states[self.layer_idx]
            recurrent_state = cache_params.recurrent_states[self.layer_idx]

        projected_states_qkvz = self.in_proj_qkvz(hidden_states)
        projected_states_ba = self.in_proj_ba(hidden_states)
        query, key, value, z, b, a = self.fix_query_key_value_ordering(projected_states_qkvz, projected_states_ba)
        query, key, value = (x.reshape(x.shape[0], x.shape[1], -1) for x in (query, key, value))

        mixed_qkv = torch.cat((query, key, value), dim=-1)
        mixed_qkv = mixed_qkv.transpose(1, 2)

        if use_precomputed_states:
            # 2. Convolution sequence transformation
            # NOTE: the conv state is updated in `causal_conv1d_update`
            mixed_qkv = self.causal_conv1d_update(
                mixed_qkv,
                conv_state,
                self.conv1d.weight.squeeze(1),
                self.conv1d.bias,
                self.activation,
            )
        else:
            if cache_params is not None:
                conv_state = F.pad(mixed_qkv, (self.conv_kernel_size - mixed_qkv.shape[-1], 0))
                cache_params.conv_states[self.layer_idx] = conv_state
            if self.causal_conv1d_fn is not None:
                mixed_qkv = self.causal_conv1d_fn(
                    x=mixed_qkv,
                    weight=self.conv1d.weight.squeeze(1),
                    bias=self.conv1d.bias,
                    activation=self.activation,
                    seq_idx=None,
                )
            else:
                mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len])

        mixed_qkv = mixed_qkv.transpose(1, 2)
        query, key, value = torch.split(
            mixed_qkv,
            [
                self.key_dim,
                self.key_dim,
                self.value_dim,
            ],
            dim=-1,
        )
        query = query.reshape(query.shape[0], query.shape[1], -1, self.head_k_dim)
        key = key.reshape(key.shape[0], key.shape[1], -1, self.head_k_dim)
        value = value.reshape(value.shape[0], value.shape[1], -1, self.head_v_dim)

        beta = b.sigmoid()
        # If the model is loaded in fp16, without the .float() here, A might be -inf
        g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
        if self.num_v_heads // self.num_k_heads > 1:
            query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)
            key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)

        if not use_precomputed_states:
            core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule(
                query,
                key,
                value,
                g=g,
                beta=beta,
                initial_state=None,
                output_final_state=cache_params is not None,
                use_qk_l2norm_in_kernel=True,
            )

        else:
            core_attn_out, last_recurrent_state = self.recurrent_gated_delta_rule(
                query,
                key,
                value,
                g=g,
                beta=beta,
                initial_state=recurrent_state,
                output_final_state=cache_params is not None,
                use_qk_l2norm_in_kernel=True,
            )

        # Update cache
        if cache_params is not None:
            cache_params.recurrent_states[self.layer_idx] = last_recurrent_state

        z_shape_og = z.shape
        # reshape input data into 2D tensor
        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
        z = z.reshape(-1, z.shape[-1])
        core_attn_out = self.norm(core_attn_out, z)
        core_attn_out = core_attn_out.reshape(z_shape_og)
        core_attn_out = core_attn_out.reshape(core_attn_out.shape[0], core_attn_out.shape[1], -1)

        output = self.out_proj(core_attn_out)
        return output


class Qwen3NextMLP(nn.Module):
    def __init__(self, config, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class Qwen3NextSparseMoeBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_experts = config.num_experts
        self.top_k = config.num_experts_per_tok
        self.norm_topk_prob = config.norm_topk_prob

        # gating
        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
        self.experts = nn.ModuleList(
            [Qwen3NextMLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(self.num_experts)]
        )

        self.shared_expert = Qwen3NextMLP(config, intermediate_size=config.shared_expert_intermediate_size)
        self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        if self.norm_topk_prob:
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
        for expert_idx in expert_hit:
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))

        shared_expert_output = self.shared_expert(hidden_states)
        shared_expert_output = F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output

        final_hidden_states = final_hidden_states + shared_expert_output

        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits


class Qwen3NextDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Qwen3NextConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        # token mixer
        self.layer_type = config.layer_types[layer_idx]
        if self.layer_type == "linear_attention":
            self.linear_attn = Qwen3NextGatedDeltaNet(config, layer_idx)
        elif self.layer_type == "full_attention":
            self.self_attn = Qwen3NextAttention(config, layer_idx)

        if (layer_idx not in config.mlp_only_layers) and (
            config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
        ):
            self.mlp = Qwen3NextSparseMoeBlock(config)
        else:
            self.mlp = Qwen3NextMLP(config, intermediate_size=config.intermediate_size)

        self.input_layernorm = Qwen3NextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen3NextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[tuple[torch.Tensor]] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> torch.FloatTensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss,
                and should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Token Mixer
        if self.layer_type == "linear_attention":
            hidden_states = self.linear_attn(
                hidden_states=hidden_states,
                cache_params=past_key_values,
                cache_position=cache_position,
                attention_mask=attention_mask,
            )
        elif self.layer_type == "full_attention":
            # Self Attention
            hidden_states, _ = self.self_attn(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        # For the MoE layers, we need to unpack
        if isinstance(hidden_states, tuple):
            hidden_states, _ = hidden_states
        hidden_states = residual + hidden_states

        return hidden_states


class Qwen3NextPreTrainedModel(PreTrainedModel):
    config: Qwen3NextConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen3NextDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _keys_to_ignore_on_load_unexpected = [r"^mtp.*"]
    _can_record_outputs = {
        "router_logits": OutputRecorder(Qwen3NextSparseMoeBlock, index=1),
        "hidden_states": Qwen3NextDecoderLayer,
        "attentions": Qwen3NextAttention,
    }
    _is_stateful = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, Qwen3NextGatedDeltaNet):
            module.dt_bias.data.fill_(1.0)
            module.A_log.data.uniform_(0, 16).log_()


class Qwen3NextModel(Qwen3NextPreTrainedModel):
    def __init__(self, config: Qwen3NextConfig):
        super().__init__(config)
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
        self.layers = nn.ModuleList(
            [Qwen3NextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Qwen3NextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Qwen3NextRotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = Qwen3NextDynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )
        linear_attn_mask = self._update_linear_attn_mask(attention_mask, cache_position)

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            layer_mask = linear_attn_mask if decoder_layer.layer_type == "linear_attention" else causal_mask

            hidden_states = decoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=layer_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)

        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )

    def _update_linear_attn_mask(self, attention_mask, cache_position):
        """
        NOTE: Left-padding is used for linear attention mask.
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        """
        linear_attn_mask = attention_mask
        if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)):
            linear_attn_mask = None
        return linear_attn_mask


def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
    top_k=2,
    attention_mask: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, int]:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    """
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0

    if isinstance(gate_logits, tuple):
        compute_device = gate_logits[0].device
        concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)

    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)

    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)

        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
        expert_attention_mask = (
            attention_mask[None, :, :, None, None]
            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
            .reshape(-1, top_k, num_experts)
            .to(compute_device)
        )

        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
            expert_attention_mask, dim=0
        )

        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(compute_device)
        )

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
            router_per_expert_attention_mask, dim=0
        )

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts


@auto_docstring
class Qwen3NextForCausalLM(Qwen3NextPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen3NextModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_experts
        self.num_experts_per_tok = config.num_experts_per_tok

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Qwen3NextDynamicCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeCausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3NextForCausalLM

        >>> model = Qwen3NextForCausalLM.from_pretrained("Qwen/Qwen3-Next-80B-A3B-Instruct")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Next-80B-A3B-Instruct")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs: MoeModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_router_logits=output_router_logits,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits,
                self.num_experts,
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )


class Qwen3NextForSequenceClassification(GenericForSequenceClassification, Qwen3NextPreTrainedModel):
    pass


class Qwen3NextForTokenClassification(GenericForTokenClassification, Qwen3NextPreTrainedModel):
    pass


class Qwen3NextForQuestionAnswering(GenericForQuestionAnswering, Qwen3NextPreTrainedModel):
    base_model_prefix = "transformer"  # For BC, where `transformer` was used instead of `model`


__all__ = [
    "Qwen3NextForCausalLM",
    "Qwen3NextForQuestionAnswering",
    "Qwen3NextModel",
    "Qwen3NextPreTrainedModel",
    "Qwen3NextForSequenceClassification",
    "Qwen3NextForTokenClassification",
]

================================================
FILE: archive/ktransformers/models/modeling_smallthinker.py
================================================
# coding=utf-8
from functools import partial
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
from transformers.generation import GenerationMixin
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_outputs import (
    MoeCausalLMOutputWithPast,
    MoeModelOutputWithPast
)
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.processing_utils import Unpack
from transformers.utils import can_return_tuple, is_torch_flex_attn_available, logging
from .configuration_smallthinker import SmallthinkerConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from transformers.integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class SmallthinkerHierarchicalMLP(nn.Module):
    def __init__(self, config: SmallthinkerConfig):
        super().__init__()
        self.config = config
        self.hidden_dim = config.hidden_size
        self.ffn_dim = config.moe_ffn_hidden_size
        self.moe_enable_secondary_experts = config.moe_enable_secondary_experts
        if self.moe_enable_secondary_experts:
            self.num_secondary_experts = config.moe_num_secondary_experts
            self.secondary_expert_size = config.moe_secondary_expert_size
            self.secondary_gate = nn.Linear(self.hidden_dim, self.num_secondary_experts, bias=False)

        self.up = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
        self.gate = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
        self.down = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
        
    def forward(self, secondary_gate_input: torch.Tensor, hidden_states: torch.Tensor):
        if self.moe_enable_secondary_experts:
            secondary_gate_logits = F.sigmoid(self.secondary_gate(secondary_gate_input)) > 0.5
            secondary_gate_mask = secondary_gate_logits.unsqueeze(-1)

        current_hidden_states = self.up(hidden_states) * F.relu(self.gate(hidden_states))
        activated_output =  current_hidden_states
        batch_size, intermediate_size = activated_output.shape

        if self.moe_enable_secondary_experts:
            num_groups = intermediate_size // self.secondary_expert_size
            activated_output = activated_output.view(batch_size, num_groups, self.secondary_expert_size)
            output = activated_output * secondary_gate_mask
        else:
            output = activated_output

        current_hidden_states = output.view(batch_size, -1)
        current_hidden_states = self.down(current_hidden_states)
        return current_hidden_states


class SmallthinkerMoeBlock(nn.Module):
    def __init__(self, config: SmallthinkerConfig):
        super().__init__()
        self.hidden_dim = config.hidden_size
        self.num_primary_experts = config.moe_num_primary_experts
        self.enable_early_router = config.moe_enable_early_router
        self.moe_primary_router_apply_softmax = config.moe_primary_router_apply_softmax
        self.num_active_primary_experts = config.moe_num_active_primary_experts
        self.primary_router = nn.Linear(self.hidden_dim, self.num_primary_experts, bias=False)
        self.experts = nn.ModuleList([SmallthinkerHierarchicalMLP(config) for _ in range(self.num_primary_experts)])

    def forward(self, router_input: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        # Flatten the tokens into (bs * sl, hidden_dim)
        hidden_states = hidden_states.view(-1, hidden_dim)
        router_input = router_input.view(-1, hidden_dim)
        # Primary router logits: (bs * sl, n_experts)
        if self.enable_early_router:
            router_logits = self.primary_router(router_input)
        else:
            router_logits = self.primary_router(hidden_states)

        router_logits, selected_experts = torch.topk(router_logits, self.num_active_primary_experts, dim=-1)

        if self.moe_primary_router_apply_softmax:
            routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        else:
            routing_weights = F.sigmoid(router_logits)
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)

        routing_weights = routing_weights.to(hidden_states.dtype)

        # Prepare the final tensor
        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_primary_experts).permute(2, 1, 0)
        expert_hitted = (expert_mask.sum(dim=(-1, -2)) > 0).nonzero(as_tuple=True)[0].tolist()

        for expert_idx in expert_hitted:
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx])
            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            # current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
            # current_router_input = router_input[None, top_x].reshape(-1, hidden_dim)
            current_state = hidden_states[top_x].reshape(-1, hidden_dim)
            current_router_input = router_input[top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer(current_router_input, current_state) * routing_weights[top_x, idx, None]

            # However `index_add_` only support torch tensors for indexing so we'll use the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits
    

class SmallthinkerDenseMlpBlock(nn.Module):
    def __init__(self, config: SmallthinkerConfig):
        super().__init__()
        hidden_dim = config.hidden_size
        ffn_dim = config.dense_ffn_hidden_size
        self.up = nn.Linear(hidden_dim, ffn_dim, bias=False)
        self.gate = nn.Linear(hidden_dim, ffn_dim, bias=False)
        self.down = nn.Linear(ffn_dim, hidden_dim, bias=False)

    # Offer unified interface for SmallthinkerMoeBlock and SmallthinkerDenseMlpBlock, though router_input is not used here
    def forward(self, router_input: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
        current_hidden_states = self.up(hidden_states) * F.relu(self.gate(hidden_states))
        current_hidden_states = self.down(current_hidden_states)
        return current_hidden_states, None


class SmallthinkerRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        SmallthinkerRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class SmallthinkerAttention(nn.Module):
    def __init__(self, config: SmallthinkerConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx # For KVCache management
        self.head_dim = config.head_dim
        self.num_attention_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.is_causal = True
        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.sliding_window = config.sliding_window_size if config.sliding_window_layout[layer_idx] else None
        self.use_qk_norm = config.use_qk_norm

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if self.use_qk_norm:
            raise NotImplementedError("use_qk_norm is not implemented yet")

        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        
        if position_embeddings:
            cos, sin = position_embeddings
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
        else:
            cos, sin = None, None
        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward

        if self.config._attn_implementation == "sdpa":
            raise NotImplementedError("SDPA impl is buggy for now. NEVER TRY TO USE IT.")

        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0,
            scaling=self.scaling,
            sliding_window=self.sliding_window,  # main diff with Llama
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class SmallthinkerDecoderLayer(nn.Module):
    def __init__(self, config: SmallthinkerConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = SmallthinkerAttention(config, layer_idx)

        self.block_sparse_moe = SmallthinkerMoeBlock(config) if config.moe_layer_layout[layer_idx] else SmallthinkerDenseMlpBlock(config)
        self.input_layernorm = SmallthinkerRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = SmallthinkerRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        """
        # print(f"hidden states, shape {hidden_states.shape}: {hidden_states}") # debug print
        residual = hidden_states
        router_input = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        # Self Attention 
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states, router_logits = self.block_sparse_moe(router_input, hidden_states)
        hidden_states = residual + hidden_states # SYNC after_moe_residual_value=hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


class SmallthinkerRotaryEmbedding(nn.Module):
    def __init__(self, config: SmallthinkerConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


class SmallthinkerPreTrainedModel(PreTrainedModel):
    config_class = SmallthinkerConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["SmallthinkerDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = False  # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, SmallthinkerRMSNorm):
            module.weight.data.fill_(1.0)


# @auto_docstring
class SmallthinkerModel(SmallthinkerPreTrainedModel):
    def __init__(self, config: SmallthinkerConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [SmallthinkerDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = SmallthinkerRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = SmallthinkerRotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        self.rope_layout = config.rope_layout

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @can_return_tuple
    # @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> MoeModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # print("atten mask:", attention_mask) # debug print

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        # print("causal mask:", causal_mask) # debug print
        hidden_states = inputs_embeds
        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None

        for layer_idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    partial(decoder_layer.__call__, **flash_attn_kwargs),
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    output_router_logits,
                    use_cache,
                    cache_position,
                    position_embeddings if self.rope_layout[layer_idx] else None,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    output_router_logits=output_router_logits,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings if self.rope_layout[layer_idx] else None,
                    **flash_attn_kwargs,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and past_key_values is not None:
                is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
                if is_padding_right:
                    raise ValueError(
                        "You are attempting to perform batched generation with padding_side='right'"
                        " this may lead to unexpected behaviour for Flash Attention version of Smallthinker. Make sure to "
                        " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
                    )
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)
        using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and not (using_static_cache or using_sliding_window_cache)
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                sliding_window=self.config.sliding_window,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        # SlidingWindowCache or StaticCache
        if using_sliding_window_cache or using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        # DynamicCache or no cache
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
            config=self.config,
            past_key_values=past_key_values,
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        config: SmallthinkerConfig,
        past_key_values: Cache,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
            config (`SmallthinkerConfig`):
                The model's configuration class
            past_key_values (`Cache`):
                The cache class that is being used currently to generate
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            diagonal_attend_mask = torch.arange(target_length, device=cache_position.device) > cache_position.reshape(
                -1, 1
            )
            if config.get_text_config().sliding_window is not None:
                # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
                # the check is needed to verify is current checkpoint was trained with sliding window or not
                if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
                    sliding_attend_mask = torch.arange(target_length, device=cache_position.device) <= (
                        cache_position.reshape(-1, 1) - config.get_text_config().sliding_window
                    )
                    diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
            causal_mask *= diagonal_attend_mask
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                if attention_mask.shape[-1] > target_length:
                    attention_mask = attention_mask[:, :target_length]
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        return causal_mask


class KwargsForCausalLM(FlashAttentionKwargs): ...


def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, Tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
    top_k=2,
    attention_mask: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, int]:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    """
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0

    if isinstance(gate_logits, tuple):
        compute_device = gate_logits[0].device
        concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)

    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)

    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)

        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
        expert_attention_mask = (
            attention_mask[None, :, :, None, None]
            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
            .reshape(-1, top_k, num_experts)
            .to(compute_device)
        )

        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
            expert_attention_mask, dim=0
        )

        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(compute_device)
        )

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
            router_per_expert_attention_mask, dim=0
        )

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts


# @auto_docstring
class SmallThinkerForCausalLM(SmallthinkerPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    def __init__(self, config):
        super().__init__(config)
        self.model = SmallthinkerModel(config)
        self.vocab_size = config.vocab_size
        # Handle tie / untie word embeddings
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)        
        # self.num_experts = config.num_local_experts
        # self.num_experts_per_tok = config.num_experts_per_tok
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @can_return_tuple
#     @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> MoeCausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, SmallThinkerForCausalLM

        >>> model = SmallThinkerForCausalLM.from_pretrained("mistralai/Smallthinker-8x7B-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Smallthinker-8x7B-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs: MoeModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_router_logits=output_router_logits,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits,
                self.num_experts,
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )

# No such functions for now
# #@auto_docstring(
#     custom_intro="""
#     The Smallthinker Model transformer with a sequence classification head on top (linear layer).

#     [`SmallthinkerForSequenceClassification`] uses the last token in order to do the classification, as other causal models
#     (e.g. GPT-2) do.

#     Since it does classification on the last token, it requires to know the position of the last token. If a
#     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
#     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
#     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
#     each row of the batch).
#     """
# )
# class SmallthinkerForSequenceClassification(SmallthinkerPreTrainedModel):
#     def __init__(self, config):
#         super().__init__(config)
#         self.num_labels = config.num_labels
#         self.model = SmallthinkerModel(config)
#         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

#         # Initialize weights and apply final processing
#         self.post_init()

#     def get_input_embeddings(self):
#         return self.model.embed_tokens

#     def set_input_embeddings(self, value):
#         self.model.embed_tokens = value

#     @can_return_tuple
#     #@auto_docstring
#     def forward(
#         self,
#         input_ids: Optional[torch.LongTensor] = None,
#         attention_mask: Optional[torch.Tensor] = None,
#         position_ids: Optional[torch.LongTensor] = None,
#         past_key_values: Optional[Cache] = None,
#         inputs_embeds: Optional[torch.FloatTensor] = None,
#         labels: Optional[torch.LongTensor] = None,
#         use_cache: Optional[bool] = None,
#         output_attentions: Optional[bool] = None,
#         output_hidden_states: Optional[bool] = None,
#     ) -> SequenceClassifierOutputWithPast:
#         r"""
#         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
#             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
#             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
#             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
#         """

#         transformer_outputs: BaseModelOutputWithPast = self.model(
#             input_ids,
#             attention_mask=attention_mask,
#             position_ids=position_ids,
#             past_key_values=past_key_values,
#             inputs_embeds=inputs_embeds,
#             use_cache=use_cache,
#             output_attentions=output_attentions,
#             output_hidden_states=output_hidden_states,
#         )
#         hidden_states = transformer_outputs.last_hidden_state
#         logits = self.score(hidden_states)

#         if input_ids is not None:
#             batch_size = input_ids.shape[0]
#         else:
#             batch_size = inputs_embeds.shape[0]

#         if self.config.pad_token_id is None and batch_size != 1:
#             raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
#         if self.config.pad_token_id is None:
#             last_non_pad_token = -1
#         elif input_ids is not None:
#             # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
#             non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
#             token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
#             last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
#         else:
#             last_non_pad_token = -1
#             logger.warning_once(
#                 f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
#                 "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
#             )

#         pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

#         loss = None
#         if labels is not None:
#             loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

#         return SequenceClassifierOutputWithPast(
#             loss=loss,
#             logits=pooled_logits,
#             past_key_values=transformer_outputs.past_key_values,
#             hidden_states=transformer_outputs.hidden_states,
#             attentions=transformer_outputs.attentions,
#         )


# #@auto_docstring
# class SmallthinkerForTokenClassification(SmallthinkerPreTrainedModel):
#     def __init__(self, config):
#         super().__init__(config)
#         self.num_labels = config.num_labels
#         self.model = SmallthinkerModel(config)
#         if getattr(config, "classifier_dropout", None) is not None:
#             classifier_dropout = config.classifier_dropout
#         elif getattr(config, "hidden_dropout", None) is not None:
#             classifier_dropout = config.hidden_dropout
#         else:
#             classifier_dropout = 0.1
#         self.dropout = nn.Dropout(classifier_dropout)
#         self.score = nn.Linear(config.hidden_size, config.num_labels)

#         # Initialize weights and apply final processing
#         self.post_init()

#     def get_input_embeddings(self):
#         return self.model.embed_tokens

#     def set_input_embeddings(self, value):
#         self.model.embed_tokens = value

#     @can_return_tuple
#     #@auto_docstring
#     def forward(
#         self,
#         input_ids: Optional[torch.LongTensor] = None,
#         attention_mask: Optional[torch.Tensor] = None,
#         position_ids: Optional[torch.LongTensor] = None,
#         past_key_values: Optional[Cache] = None,
#         inputs_embeds: Optional[torch.FloatTensor] = None,
#         labels: Optional[torch.LongTensor] = None,
#         use_cache: Optional[bool] = None,
#         output_attentions: Optional[bool] = None,
#         output_hidden_states: Optional[bool] = None,
#     ) -> TokenClassifierOutput:
#         r"""
#         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
#             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
#             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
#             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
#         """

#         outputs: BaseModelOutputWithPast = self.model(
#             input_ids,
#             attention_mask=attention_mask,
#             position_ids=position_ids,
#             past_key_values=past_key_values,
#             inputs_embeds=inputs_embeds,
#             use_cache=use_cache,
#             output_attentions=output_attentions,
#             output_hidden_states=output_hidden_states,
#         )
#         sequence_output = outputs.last_hidden_state
#         sequence_output = self.dropout(sequence_output)
#         logits = self.score(sequence_output)

#         loss = None
#         if labels is not None:
#             loss = self.loss_function(logits, labels, self.config)

#         return TokenClassifierOutput(
#             loss=loss,
#             logits=logits,
#             hidden_states=outputs.hidden_states,
#             attentions=outputs.attentions,
#         )


# #@auto_docstring
# class SmallthinkerForQuestionAnswering(SmallthinkerPreTrainedModel):
#     base_model_prefix = "model"

#     def __init__(self, config):
#         super().__init__(config)
#         self.qa_outputs = nn.Linear(config.hidden_size, 2)
#         self.model = SmallthinkerModel(config)  # diff with Llama: transformer->model

#         # Initialize weights and apply final processing
#         self.post_init()

#     def get_input_embeddings(self):
#         return self.model.embed_tokens

#     def set_input_embeddings(self, value):
#         self.model.embed_tokens = value

#     @can_return_tuple
#     #@auto_docstring
#     def forward(
#         self,
#         input_ids: Optional[torch.LongTensor] = None,
#         attention_mask: Optional[torch.Tensor] = None,
#         position_ids: Optional[torch.LongTensor] = None,
#         past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
#         inputs_embeds: Optional[torch.FloatTensor] = None,
#         start_positions: Optional[torch.LongTensor] = None,
#         end_positions: Optional[torch.LongTensor] = None,
#         output_attentions: Optional[bool] = None,
#         output_hidden_states: Optional[bool] = None,
#         **kwargs,
#     ) -> QuestionAnsweringModelOutput:
#         r"""
#         start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
#             Labels for position (index) of the start of the labelled span for computing the token classification loss.
#             Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
#             are not taken into account for computing the loss.
#         end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
#             Labels for position (index) of the end of the labelled span for computing the token classification loss.
#             Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
#             are not taken into account for computing the loss.
#         """

#         outputs: BaseModelOutputWithPast = self.model(
#             input_ids,
#             attention_mask=attention_mask,
#             position_ids=position_ids,
#             past_key_values=past_key_values,
#             inputs_embeds=inputs_embeds,
#             output_attentions=output_attentions,
#             output_hidden_states=output_hidden_states,
#         )

#         sequence_output = outputs.last_hidden_state

#         logits = self.qa_outputs(sequence_output)
#         start_logits, end_logits = logits.split(1, dim=-1)
#         start_logits = start_logits.squeeze(-1).contiguous()
#         end_logits = end_logits.squeeze(-1).contiguous()

#         loss = None
#         if start_positions is not None and end_positions is not None:
#             loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)

#         return QuestionAnsweringModelOutput(
#             loss=loss,
#             start_logits=start_logits,
#             end_logits=end_logits,
#             hidden_states=outputs.hidden_states,
#             attentions=outputs.attentions,
#         )


__all__ = [
    "SmallThinkerForCausalLM",
    "SmallthinkerForQuestionAnswering",
    "SmallthinkerModel",
    "SmallthinkerPreTrainedModel",
    "SmallthinkerForSequenceClassification",
    "SmallthinkerForTokenClassification",
]

if __name__ == "__main__":
    from transformers import AutoTokenizer, AutoModelForCausalLM

    test_config = SmallthinkerConfig()
    tokenizer = AutoTokenizer.from_pretrained("./qwen-tokenizer")
    text = "Once upon a day"
    tokens = tokenizer.encode_plus( text,add_special_tokens=True,return_tensors='pt')
    # print(tokens)
    test_model = AutoModelForCausalLM.from_pretrained(".").cuda()

    output = test_model.generate(tokens)
    otokens = tokenizer.decode(output[0])
    # print(otokens)


================================================
FILE: archive/ktransformers/operators/RoPE.py
================================================
"""
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""

from torch import nn
from transformers import ROPE_INIT_FUNCTIONS
from ktransformers.models.modeling_llama import (
    LlamaRotaryEmbedding,
    LlamaLinearScalingRotaryEmbedding,
    LlamaDynamicNTKScalingRotaryEmbedding,
)
from ktransformers.models.modeling_deepseek_v3 import (
    DeepseekV3RotaryEmbedding
)
from ktransformers.models.modeling_deepseek import (
    DeepseekV2YarnRotaryEmbedding,
    DeepseekV2RotaryEmbedding,
    yarn_get_mscale,
    yarn_linear_ramp_mask,
    yarn_find_correction_range
)
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import InferenceState
from transformers.configuration_utils import PretrainedConfig
from ktransformers.models.modeling_smallthinker import SmallthinkerRotaryEmbedding
from ktransformers.models.modeling_glm4_moe import Glm4MoeRotaryEmbedding
import torch

# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2Moe
class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.orig_module.__init__(
            orig_module.dim, orig_module.max_position_embeddings, orig_module.base
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def load(self):
        self.orig_module.__init__(
            self.orig_module.dim,
            self.orig_module.max_position_embeddings,
            self.orig_module.base,
            self.device,
        )


class RotaryEmbeddingV3(BaseInjectedModule):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device
    
    @torch.no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)   

    def load(self):
        self._init(
            dim=self.config.qk_rope_head_dim,
            max_position_embeddings=self.config.max_position_embeddings,
            base=self.config.rope_theta,
            device=self.device,
        )
    def _init(self, dim, max_position_embeddings, base, device, scaling_factor=1.0):
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        # self.register_buffer("inv_freq", inv_freq, persistent=False)
        # For BC we register cos and sin cached
        self.max_seq_len_cached = max_position_embeddings

class RotaryEmbeddingV2(BaseInjectedModule, LlamaRotaryEmbedding):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.orig_module.__init__(
            orig_module.dim,
            orig_module.max_position_embeddings,
            orig_module.base,
            None,
            orig_module.scaling_factor,
            orig_module.rope_type,
            orig_module.config,
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def load(self):
        self.orig_module.__init__(
            self.orig_module.dim,
            self.orig_module.max_position_embeddings,
            self.orig_module.base,
            self.device,
            self.orig_module.scaling_factor,
            self.orig_module.rope_type,
            self.orig_module.config,
        )

class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.orig_module.__init__(
            orig_module.dim,
            orig_module.max_position_embeddings,
            orig_module.base,
            None,  # device
            orig_module.scaling_factor,
            orig_module.original_max_position_embeddings,
            orig_module.beta_fast,
            orig_module.beta_slow,
            orig_module.mscale,
            orig_module.mscale_all_dim,
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def load(self):
        self.orig_module.__init__(
            self.orig_module.dim,
            self.orig_module.max_position_embeddings,
            self.orig_module.base,
            self.generate_device,
            self.orig_module.scaling_factor,
            self.orig_module.original_max_position_embeddings,
            self.orig_module.beta_fast,
            self.orig_module.beta_slow,
            self.orig_module.mscale,
            self.orig_module.mscale_all_dim,
        )

# class DeepSeekV3YarnRotaryEmbedding(BaseInjectedModule, DeepseekV3RotaryEmbedding):
#     def __init__(
#         self,
#         key: str,
#         gguf_loader: GGUFLoader,
#         config: PretrainedConfig,
#         orig_module: nn.Module,
#         #  device: str = "cuda",
#         generate_device: str = "cuda",
#         prefill_device: str = "cuda",
#         **kwargs,
#     ):
#         BaseInjectedModule.__init__(
#             self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
#         )
#         self.generate_device = generate_device
#         self.prefill_device = prefill_device

#     def load(self):
#         # TODO support perlayer prefill
#         self.orig_module.__init__(
#             self.config,
#             device=self.generate_device
#         )
#         return

class YarnRotaryEmbeddingV3(BaseInjectedModule):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device
    
    def load(self):
        kwargs = {
            key: self.config.rope_scaling[key]
            for key in [
                "original_max_position_embeddings",
                "beta_fast",
                "beta_slow",
                "mscale",
                "mscale_all_dim",
            ]
            if key in self.config.rope_scaling
        }
        self._init(
            dim=self.config.qk_rope_head_dim,
            max_position_embeddings=self.config.max_position_embeddings,
            base=self.config.rope_theta,
            device=self.device,
            scaling_factor=self.config.rope_scaling["factor"],
            **kwargs,
        )

    @torch.no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()* self._mscale
            sin = emb.sin()* self._mscale
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  

    def _init(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        self.original_max_position_embeddings = original_max_position_embeddings
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.mscale_all_dim = mscale_all_dim
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

        freq_extra = 1.0 / (
            self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )
        freq_inter = 1.0 / (
            self.scaling_factor
            * self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            dim,
            self.base,
            self.original_max_position_embeddings,
        )
        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(
            device=device, dtype=torch.float32
        )
        self.inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
        self._mscale = float(
            yarn_get_mscale(self.scaling_factor, self.mscale)
            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
        )
        # For BC we register cos and sin cached
        self.max_seq_len_cached = max_position_embeddings

class DynamicNTKScalingRotaryEmbedding(
    BaseInjectedModule, LlamaDynamicNTKScalingRotaryEmbedding
):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        prefill_device: str = "cuda",
        generate_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.orig_module.__init__(
            orig_module.dim,
            orig_module.max_position_embeddings,
            orig_module.base,
            None,  # device
            orig_module.scaling_factor,
            orig_module.rope_type,
            orig_module.config,
        )

    def load(self):
        self.orig_module.__init__(
            self.orig_module.dim,
            self.orig_module.max_position_embeddings,
            self.orig_module.base,
            self.orig_module.device,
            self.orig_module.scaling_factor,
            self.orig_module.rope_type,
            self.orig_module.config,
        )


class RotaryEmbeddingV4(BaseInjectedModule):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, generate_device, **kwargs
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device
    
    @torch.no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)   

    def load(self):
        self._init(
            dim=self.config.qk_rope_head_dim,
            max_position_embeddings=self.config.max_position_embeddings,
            base=self.config.rope_theta,
            device=self.device,
        )
    def _init(self, dim, max_position_embeddings, base, device, scaling_factor=1.0):
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        # self.register_buffer("inv_freq", inv_freq, persistent=False)
        # For BC we register cos and sin cached
        self.max_seq_len_cached = max_position_embeddings

class KQwen3MoeRotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.orig_module.__init__(
            config,
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def load(self):
        self.orig_module.__init__(
            self.orig_module.config
        )
    

class KSmallthinkerRotaryEmbedding(BaseInjectedModule, SmallthinkerRotaryEmbedding):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.orig_module.__init__(
            config
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def load(self):
        self.orig_module.__init__(
            self.orig_module.config,
            device = self.generate_device,
        )
        
    @torch.no_grad()
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

class KGlm4MoeRotaryEmbedding(BaseInjectedModule, Glm4MoeRotaryEmbedding):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.orig_module.__init__(
            config
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def load(self):
        self.orig_module.__init__(
            self.orig_module.config,
            device = self.generate_device,
        )
        
    @torch.no_grad()
    def forward(self, x, position_ids):
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)
        # Core RoPE block
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        # print(inv_freq_expanded.device)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

================================================
FILE: archive/ktransformers/operators/__init__.py
================================================


================================================
FILE: archive/ktransformers/operators/ascend/ascend_attention.py
================================================
# coding=utf-8
# Copyright (c) 2025. Huawei Technologies Co., Ltd. All rights reserved.
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import warnings
from typing import Optional, Tuple

import torch
import torch_npu
from torch import nn
import torch.nn.functional as F
from transformers.configuration_utils import PretrainedConfig

from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeAttention, apply_rotary_pos_emb
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import get_compute_capability, get_use_npu_graph, get_current_device
from ktransformers.models.custom_cache import StaticCache
from ktransformers.server.balance_serve.inference.forward_batch import ForwardMiniBatchSplit
from ktransformers.util.ascend.ascend_utils import get_tensor_parallel_size, allredeuce_warpper, get_tensor_parallel_group
from ktransformers.util.vendors import device_manager, GPUVendor
from ktransformers.util import utils


def apply_rotary_pos_emb_fusion(q, k, cos, sin, unsqueeze_dim=1):
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    b, h, s, d = q.shape
    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
    b, h, s, d = k.shape
    k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
    q_embed = torch_npu.npu_rotary_mul(q, cos, sin)
    k_embed = torch_npu.npu_rotary_mul(k, cos, sin)
    return q_embed, k_embed


class MatMulOps(object):
    def execute(self, x_input):
        """
            :param x, weight, quant_bia, deq_scale
            :return:
        """
        quant_out = x_input[0]
        weight = x_input[1]
        quant_bia = x_input[2]
        deq_scale = x_input[3]
        return [torch_npu.npu_quant_matmul(quant_out, weight.T, deq_scale, bias=quant_bia, output_dtype=torch.float16)]


class DynamicQuantOps(object):
    """
        :param x
        :return
    """
    def execute(self, x_input):
        out = torch.empty_like(x_input[0], dtype=torch.int8)
        torch_npu._npu_quantize_per_tensor(x_input[0], x_input[1], x_input[2], out)
        return [out]

class KDeepseekV2AttentionW8A8A2(BaseInjectedModule, DeepseekV2Attention):
    """Multi-headed attention from 'Attention Is All You Need' paper"""
    attn_mask: Optional[torch.Tensor] = None

    class PageKVWrapper(object):
        """
        wrap the difference of KV Cache and Block info between offline model & direct serving & sched serving
        succession should keep the function api
        """
        def __init__(self, past_key_value: StaticCache):
            self.kv_cache = past_key_value
            self.page_size = self.kv_cache.page_size
            self.position = self.kv_cache.position

            self.page_idx = None # staticKV can get from itself
            self.page_offset = None

        def update(self, compressed_kv, k_pe, layer_idx, cache_kwargs):
            return self.kv_cache.update(compressed_kv, k_pe, layer_idx, cache_kwargs)
        
        def get_usable_length(self, kv_seq_len, layer_idx):
            return self.kv_cache.get_usable_length(kv_seq_len, layer_idx)
        
        def get_seq_length(self, layer_idx):
            return self.kv_cache.get_seq_length(layer_idx)
        
        def get_block_table(self, layer_idx):
            return self.kv_cache.page_table_list[layer_idx]

    def init_page_kv_wrapper(self, past_key_value: StaticCache):
        self.page_kv_wrapper = self.PageKVWrapper(past_key_value)

    def __init__(self,
                 key: str,
                 gguf_loader: GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 absorb_for_prefill: bool = False,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
                                  orig_module.layer_idx)
        self.chunck_size = config.chunk_size
        self.mla_wrapper = None
        self.page_kv_wrapper = None
        self.absorb_for_prefill = absorb_for_prefill
        self.use_merge = os.getenv("USE_MERGE", "0")
        tp = get_tensor_parallel_size()
        if tp > 1:
            self.num_heads //= tp

        if self.use_merge == "0":
            self.elewise_quant = DynamicQuantOps()
            self.matmulDequant_operation = MatMulOps()
            self.matmulDequant_operation_aclnn = MatMulOps()
        elif self.use_merge == "1":
            print("--Use torch npu FA OP !--")
        else:
            print("--Use default op !--")
        
        self.sparse_mode = 0

    @allredeuce_warpper
    def forward_chunck(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[StaticCache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        is_prefill: bool = True,
        **kwargs
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()
        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            hidden_states_quant = self.elewise_quant.execute([hidden_states, self.q_a_proj.input_scale, self.q_a_proj.input_offset])[0]
            q_a_proj_out = self.matmulDequant_operation.execute([hidden_states_quant, self.q_a_proj.weight,
                                                                 self.q_a_proj.quant_bias, self.q_a_proj.deq_scale])[0]
            q_a_proj_out = self.q_a_layernorm(q_a_proj_out)
            q_a_proj_out = self.elewise_quant.execute([q_a_proj_out, self.q_b_proj.input_scale, self.q_b_proj.input_offset])[0]
            q = self.matmulDequant_operation.execute([q_a_proj_out, self.q_b_proj.weight,
                                                      self.q_b_proj.quant_bias, self.q_b_proj.deq_scale])[0]
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        hidden_states_quant = self.elewise_quant.execute([hidden_states, self.kv_a_proj_with_mqa.input_scale, self.kv_a_proj_with_mqa.input_offset])[0]
        compressed_kv = self.matmulDequant_operation.execute([hidden_states_quant, self.kv_a_proj_with_mqa.weight,
                                                              self.kv_a_proj_with_mqa.quant_bias, self.kv_a_proj_with_mqa.deq_scale])[0]
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        compressed_kv = self.kv_a_layernorm(compressed_kv)
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)

        kv_seq_len = k_pe.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since transformer version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += self.page_kv_wrapper.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(q_pe, position_ids)
        q_pe, k_pe = apply_rotary_pos_emb_fusion(q_pe, k_pe, cos, sin)

        # update KV
        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            cache_kwargs["page_idx"] = self.page_kv_wrapper.page_idx
            cache_kwargs["page_offset"] = self.page_kv_wrapper.page_offset
            k_pe = k_pe.transpose(1, 2)                 # k_pe [bsz, 1, q_len, self.qk_rope_head_dim]
            compressed_kv = compressed_kv.unsqueeze(2)  # compressed_kv [bsz, q_len, self.kv_lora_rank]
            compressed_kv_with_k_pe, _ = self.page_kv_wrapper.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
            if is_prefill:
                compressed_kv_prefill = compressed_kv.clone() # clone for prefill infer
                k_pe_prefill = k_pe.clone()
            compressed_kv, k_pe = torch.split(
                compressed_kv_with_k_pe, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
            )

        weight_uk = self.q_absorb
        weight_uv = self.out_absorb

        # ATB-MLA-FA+PA
        if self.use_merge == "0" and is_prefill:
            # if self.layer_idx == 0:
            #   print(self.page_kv_wrapper.get_seq_length(self.layer_idx)
            #   self.page_kv_wrapper.get_block_table(self.layer_idx), self.page_kv_wrapper.position)
            current_sqenLen = self.page_kv_wrapper.get_seq_length(self.layer_idx)
            attention_mask = attention_mask[0, :, :, :current_sqenLen].squeeze(0).squeeze(0)

            # FIXME this is wrong in random choose pages for sched, currently just use kv without history
            # compressed_kv = compressed_kv.view(bsz, 1, -1, self.kv_lora_rank)[:,:,:current_sqenLen,:]
            # k_pe = k_pe.view(bsz, 1, -1, self.qk_rope_head_dim)[:,:,:current_sqenLen,:]
            compressed_kv = compressed_kv_prefill.transpose(1,2).contiguous()
            k_pe = k_pe_prefill.transpose(1,2).contiguous()

            k_pe_repeated = k_pe.repeat(1, self.num_heads, 1, 1)
            k_up = torch.matmul(compressed_kv, weight_uk.mT)
            v_up = torch.matmul(compressed_kv, weight_uv)

            qTensor = torch.cat((q_nope, q_pe), dim=-1).transpose(1, 2).contiguous().view(
                                        bsz, q_len, self.num_heads, (self.qk_nope_head_dim + self.qk_rope_head_dim))
            kTensor = torch.cat((k_up, k_pe_repeated), dim=-1).transpose(1, 2).contiguous().view(
                                        bsz, current_sqenLen, self.num_heads, (self.qk_nope_head_dim + self.qk_rope_head_dim))
            vTensor = torch.cat((v_up, k_pe_repeated), dim=-1).transpose(1, 2).contiguous().view(
                                        bsz, current_sqenLen, self.num_heads, (self.v_head_dim + self.qk_rope_head_dim))

            seq_len_data = [q_len] * bsz

            infer_attention_output, _ = torch_npu.npu_fused_infer_attention_score(
                qTensor, kTensor, vTensor,
                atten_mask = attention_mask.type(torch.int8),
                actual_seq_lengths = seq_len_data,
                scale = self.softmax_scale,
                num_heads = self.num_heads,
                num_key_value_heads = self.num_heads,
                input_layout = "BSND")
            
            attn_output = infer_attention_output[..., :self.v_head_dim]
            if tuple(attn_output.size()) != (bsz, q_len, self.num_heads, self.v_head_dim):
                raise ValueError(
                    f"`attn_output` should be of size {(bsz, q_len, self.num_heads, self.v_head_dim)}, but is"
                    f" {tuple(attn_output.size())}"
                )

            attn_output = attn_output.contiguous().view(bsz, q_len, self.num_heads * self.v_head_dim)
            attn_output = self.elewise_quant.execute([attn_output, self.o_proj.input_scale, self.o_proj.input_offset])[0]
            attn_output = self.matmulDequant_operation_aclnn.execute([attn_output, self.o_proj.weight,
                                                                self.o_proj.quant_bias, self.o_proj.deq_scale])[0]

            return attn_output, None, past_key_value

        elif self.use_merge == "0" and not is_prefill:
            return self.forward_paged(q_pe=q_pe,
                                      q_nope=q_nope,
                                      compressed_kv_with_k_pe=compressed_kv_with_k_pe,
                                      past_key_value=past_key_value,
                                      cache_position=cache_position)

        if self.use_merge == "1":
            k_pe_repeated = k_pe.repeat(1, self.num_heads, 1, 1)
            k_up = torch.matmul(compressed_kv, weight_uk.mT)
            v_up = torch.matmul(compressed_kv, weight_uv)
            qTensor = torch.cat((q_nope, q_pe), dim=-1)
            kTensor = torch.cat((k_up, k_pe_repeated), dim=-1)
            vTensor = torch.cat((v_up, k_pe_repeated), dim=-1)

            if q_len != 1:
                attn_output = torch_npu.npu_prompt_flash_attention(
                    qTensor, kTensor, vTensor,
                    num_heads=self.num_heads, scale_value=self.softmax_scale, input_layout="BNSD")
            else:
                attn_output = torch_npu.npu_incre_flash_attention(
                    qTensor, kTensor, vTensor,
                    num_heads=self.num_heads, scale_value=self.softmax_scale, input_layout="BNSD")
            attn_output = attn_output[:, :, :, :self.v_head_dim]
        else:
            q_nope = torch.matmul(q_nope, self.q_absorb)

            attn_weights = (torch.matmul(q_pe, k_pe.mT) + torch.matmul(q_nope, compressed_kv.mT)) * self.softmax_scale

            compressed_kv = compressed_kv.squeeze(1)
            """
            if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                    f" {attn_weights.size()}"
                )
            assert attention_mask is not None
            """
        if attention_mask is not None:
            """
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            """
            attn_weights = attn_weights + attention_mask

            # upcast attention to fp32
            attn_weights = nn.functional.softmax(
                attn_weights, dim=-1, dtype=torch.float32
            ).to(q_pe.dtype)
            attn_weights = nn.functional.dropout(
                attn_weights, p=self.attention_dropout, training=self.training
            )
            attn_output = torch.einsum('bhql,blc->bhqc', attn_weights, compressed_kv)

            attn_output = torch.matmul(attn_output, self.out_absorb)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value

    def forward_paged(
        self,
        q_pe: torch.Tensor,
        q_nope: torch.Tensor,
        compressed_kv_with_k_pe: torch.Tensor,
        past_key_value: Optional[StaticCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        # if self.layer_idx == 1:
        #   print(self.page_kv_wrapper.get_block_table(self.layer_idx), self.page_kv_wrapper.position)
        bsz, _, q_len, _ = q_nope.size()
        q_nope = torch.einsum('b h q d, h d k -> b h q k', q_nope, self.q_absorb)  # torch.Size([1, 128, 1, 512])
        compressed_kv = compressed_kv_with_k_pe.permute(0, 2, 1, 3)
        kvCache = compressed_kv[:, :, :, :self.kv_lora_rank].contiguous()
        kRopeCache = compressed_kv[:, :, :, self.kv_lora_rank:].contiguous()
        if get_use_npu_graph():
            from ktransformers.util.npu_graph_runner import get_or_create_runner
            npu_graph_runner = get_or_create_runner(get_current_device())
            stream = npu_graph_runner.main_stream
            if npu_graph_runner.past_key_value is None:
                npu_graph_runner.past_key_value = past_key_value
            if npu_graph_runner.workspace is None:
                workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(
                    q_nope,
                    kvCache,
                    kvCache,
                    query_rope=q_pe,
                    key_rope=kRopeCache,
                    num_heads=self.num_heads,
                    num_key_value_heads=1,
                    input_layout="BNSD",
                    scale=self.softmax_scale,
                    antiquant_mode=0,
                    antiquant_scale=None,
                    block_table=self.page_kv_wrapper.get_block_table(self.layer_idx),
                    block_size=self.page_kv_wrapper.page_size,
                    actual_seq_lengths_kv=self.page_kv_wrapper.position,
                    sparse_mode = self.sparse_mode)
                npu_graph_runner.workspace = workspace
            attn_output = torch.zeros_like(q_nope, dtype=torch.float16, device=get_current_device())
            softmax_lse = torch.empty(1, dtype=torch.float16, device=get_current_device())
            torch_npu.npu_fused_infer_attention_score.out(
                q_nope,
                kvCache,
                kvCache,
                workspace=npu_graph_runner.workspace,
                query_rope=q_pe,
                key_rope=kRopeCache,
                num_heads=self.num_heads,
                num_key_value_heads=1,
                input_layout="BNSD",
                scale=self.softmax_scale,
                antiquant_mode=0,
                antiquant_scale=None,
                block_table=self.page_kv_wrapper.get_block_table(self.layer_idx),
                block_size=self.page_kv_wrapper.page_size,
                actual_seq_lengths_kv=self.page_kv_wrapper.position,
                sparse_mode = self.sparse_mode,
                out=[attn_output, softmax_lse])

        else:
            attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
                q_nope,
                kvCache,
                kvCache,
                query_rope=q_pe,
                key_rope=kRopeCache,
                num_heads=self.num_heads,
                num_key_value_heads=1,
                input_layout="BNSD",
                scale=self.softmax_scale,
                antiquant_mode=0,
                antiquant_scale=None,
                block_table=self.page_kv_wrapper.get_block_table(self.layer_idx),
                block_size=self.page_kv_wrapper.page_size,
                actual_seq_lengths_kv=self.page_kv_wrapper.position,
                sparse_mode = self.sparse_mode
            )

        attn_output = torch.einsum('b h q k, h k v -> b q h v', attn_output, self.out_absorb)
        attn_output = attn_output.contiguous().view(bsz, q_len, self.num_heads * self.v_head_dim)
        attn_output = self.elewise_quant.execute([attn_output, self.o_proj.input_scale, self.o_proj.input_offset])[0]
        attn_output = self.matmulDequant_operation_aclnn.execute([attn_output, self.o_proj.weight,
                                                            self.o_proj.quant_bias, self.o_proj.deq_scale])[0]
        return attn_output, None, past_key_value

    def forward_windows(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[StaticCache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        is_prefill: bool = True,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

        self.init_page_kv_wrapper(past_key_value)
        bsz, q_len, _ = hidden_states.size()

        if q_len <= self.chunck_size:
            return self.forward_chunck(
                hidden_states,
                attention_mask,
                position_ids,
                past_key_value,
                output_attentions,
                use_cache,
                cache_position,
                is_prefill,
                **kwargs
            )

        assert output_attentions == False, "output_attentions is not supported when using chunked attention"
        attn_output = None
        cur_idx = 0
        while cur_idx < q_len:
            if attention_mask is not None:
                chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + self.chunck_size, q_len), ...]
            else:
                # generate chunk_mask automatically.
                self.attn_mask = \
                    torch.zeros(1, 1, self.chunck_size, past_key_value.max_cache_len, device=hidden_states.device) \
                        if self.attn_mask is None \
                        else self.attn_mask
                self.attn_mask[:, :, :, cur_idx:min(cur_idx + self.chunck_size, past_key_value.max_cache_len)] = \
                    -65504.0 * torch.triu(torch.ones(self.chunck_size, self.chunck_size, device=hidden_states.device), diagonal=1) \
                        [:, :min(self.chunck_size, min(past_key_value.max_cache_len - cur_idx, self.chunck_size))]
                self.attn_mask[:, :, :, cur_idx + self.chunck_size:] = -65504.0
                self.attn_mask[:, :, :, :cur_idx] = 0
                chunk_mask = torch.narrow(self.attn_mask, 2, 0, min(self.chunck_size, q_len - cur_idx))

            cur_output, _, _ = self.forward_chunck(
                hidden_states[:, cur_idx:min(cur_idx + self.chunck_size, q_len), ...],
                chunk_mask,
                position_ids[:, cur_idx:min(cur_idx + self.chunck_size, q_len)],
                past_key_value,
                output_attentions,
                use_cache,
                cache_position[cur_idx:min(cur_idx + self.chunck_size, q_len)],
                **kwargs
            )
            cur_idx += self.chunck_size
            if attn_output is None:
                attn_output = cur_output
            else:
                attn_output = torch.cat((attn_output, cur_output), dim=-2)

        return attn_output, None, past_key_value

    def forward(
            self,
            hidden_states: torch.Tensor,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            past_key_value: Optional[StaticCache] = None,
            output_attentions: bool = False,
            use_cache: bool = False,
            cache_position: Optional[torch.LongTensor] = None,
            is_prefill: bool = True,
            **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        # TODO: remove cache_position since it do not support multi-batch 
        return self.forward_windows(
            hidden_states,
            attention_mask,
            position_ids,
            past_key_value,
            output_attentions,
            use_cache,
            cache_position,
            is_prefill,
            **kwargs,
        )


class KDeepseekV2AttentionW8A8A2Serve(BaseInjectedModule, DeepseekV2Attention):
    """Multi-headed attention from 'Attention Is All You Need' paper"""
    attn_mask: Optional[torch.Tensor] = None

    def __init__(self,
                 key: str,
                 gguf_loader: GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1024,
                 absorb_for_prefill: bool = False,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        self.orig_module.__init__(orig_module.config, orig_module.layer_idx)

        # self.chunck_size = chunck_size
        self.absorb_for_prefill = absorb_for_prefill
        self.elewise_quant = DynamicQuantOps()
        self.matmulDequant_operation = MatMulOps()
        self.matmulDequant_operation_aclnn = MatMulOps()
        # tp切分
        tp = get_tensor_parallel_size()
        if tp > 1:
            self.num_heads //= tp
        
        self.sparse_mode = 0
    
    def print_callback(self, param):
        with torch.npu.stream(torch.npu.Stream(device="npu:0")):
            hidden_states, position_ids, cache_position, page_idx, page_offset, block_table = param
            print("########################################")
            print("hidden_states is ", hidden_states)
            print("position_ids is ", position_ids)
            print("cache_position is ", cache_position)
            print("page_idx is ", page_idx)
            print("page_offset is ", page_offset)
            print("block_table is ", block_table)
            print("########################################")
    
    @allredeuce_warpper
    def forward(
            self,
            hidden_states: torch.Tensor,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            past_key_value: Optional[StaticCache] = None,
            output_attentions: bool = False,
            use_cache: bool = False,
            cache_position: Optional[torch.LongTensor] = None,
            is_prefill: Optional[bool] = None,
            page_idx: Optional[torch.Tensor] = None,
            page_offset: Optional[torch.Tensor] = None,
            block_table: Optional[torch.Tensor] = None,
            q_len_raw: Optional[torch.Tensor] = None,
            kv_len_raw: Optional[torch.Tensor] = None,
            stream: Optional[torch.npu.Stream] = None,
            **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:

        def create_causal_mask(q_lens, kv_lens):
            q_lens = torch.tensor(q_lens)
            kv_lens = torch.tensor(kv_lens)
            bsz = q_lens.size(0)

            max_q_len = q_lens.max().item()
            max_kv_len = kv_lens.max().item()

            # causal mask [max_q_len, max_kv_len]
            base_causal = torch.tril(torch.ones((max_q_len, max_kv_len), dtype=torch.bool))

            # mask initialize: [bsz, max_q_len, max_kv_len] to False
            mask = torch.zeros((bsz, max_q_len, max_kv_len), dtype=torch.bool)

            for i in range(bsz):
                ql, kl = q_lens[i].item(), kv_lens[i].item()
                # copy base_causal to mask
                mask[i, :ql, :kl] = base_causal[:ql, :kl]
            
            return mask
        
        bsz, q_len, _ = hidden_states.size()
        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            hidden_states_quant = self.elewise_quant.execute([hidden_states, self.q_a_proj.input_scale, self.q_a_proj.input_offset])[0]
            q_a_proj_out = self.matmulDequant_operation.execute([hidden_states_quant, self.q_a_proj.weight,
                                                                 self.q_a_proj.quant_bias, self.q_a_proj.deq_scale])[0]
            q_a_proj_out = self.q_a_layernorm(q_a_proj_out)
            q_a_proj_out = self.elewise_quant.execute([q_a_proj_out, self.q_b_proj.input_scale, self.q_b_proj.input_offset])[0]
            q = self.matmulDequant_operation.execute([q_a_proj_out, self.q_b_proj.weight,
                                                      self.q_b_proj.quant_bias, self.q_b_proj.deq_scale])[0]
        
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        hidden_states_quant = self.elewise_quant.execute([hidden_states, self.kv_a_proj_with_mqa.input_scale, self.kv_a_proj_with_mqa.input_offset])[0]
        compressed_kv = self.matmulDequant_operation.execute([hidden_states_quant, self.kv_a_proj_with_mqa.weight,
                                                              self.kv_a_proj_with_mqa.quant_bias, self.kv_a_proj_with_mqa.deq_scale])[0]
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        compressed_kv = self.kv_a_layernorm(compressed_kv)
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)

        kv_seq_len = k_pe.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since transformer version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(q_pe, position_ids)
        q_pe, k_pe = apply_rotary_pos_emb_fusion(q_pe, k_pe, cos, sin)

        # update KV
        compressed_kv_prefill, k_pe_prefill = None, None
        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
            cache_kwargs["page_idx"], cache_kwargs["page_offset"] = page_idx, page_offset
            k_pe = k_pe.transpose(1, 2)                # k_pe [bsz, 1, q_len, self.qk_rope_head_dim]
            compressed_kv = compressed_kv.unsqueeze(2) # compressed_kv [bsz, q_len, self.kv_lora_rank]
            combined = torch.cat([compressed_kv, k_pe], dim=-1) # shape: [batch_size, num_heads, 2*self.kv_lora_rank]
            # combined = combined.contiguous()

            compressed_kv_with_k_pe, _ = past_key_value.update(combined, self.layer_idx, cache_kwargs)
            if is_prefill:
                compressed_kv_prefill = compressed_kv.clone()
                k_pe_prefill = k_pe.clone()
            
        weight_uk = self.q_absorb
        weight_uv = self.out_absorb

        if is_prefill:
            kTensor_list = []
            vTensor_list = []
            qTensor_list = []
            attention_mask_list = []
            seq_len_data = []
            kv_len_list = []

            for sample_idx in range(bsz):
                current_q_len = q_len_raw[sample_idx].item() if (q_len_raw is not None and sample_idx < len(q_len_raw)) else hidden_states.shape[1]
                current_kv_len = kv_len_raw[sample_idx].item() if (kv_len_raw is not None and sample_idx < len(kv_len_raw)) else current_q_len
                current_q_len = max(1, current_q_len)
                current_kv_len = max(1, current_kv_len)
                seq_len_data.append(current_q_len)
                kv_len_list.append(current_kv_len)

                if attention_mask is not None:
                    mask_sample = attention_mask[
                        sample_idx:sample_idx+1, :, :, :current_kv_len
                    ].squeeze(0).squeeze(0)
                    if mask_sample.shape[0] < current_q_len:
                        mask_sample = torch.nn.functional.pad(mask_sample, (0, 0, 0, current_q_len - mask_sample.shape[0]), value=1)
                    elif mask_sample.shape[0] > current_q_len:
                        mask_sample = mask_sample[:current_q_len, :]
                    if mask_sample.shape[1] < current_kv_len:
                        mask_sample = torch.nn.functional.pad(mask_sample, (0, current_kv_len - mask_sample.shape[1]), value=1)
                    elif mask_sample.shape[1] > current_kv_len:
                        mask_sample = mask_sample[:, :current_kv_len]
                    mask_sample = torch.where(
                        (mask_sample > -1e-6) & (mask_sample < 1e-6),
                        torch.tensor(0, device=mask_sample.device, dtype=torch.int8),
                        torch.tensor(1, device=mask_sample.device, dtype=torch.int8)
                    )
                else:
                    mask_sample = torch.ones((current_q_len, current_kv_len), device=hidden_states.device, dtype=torch.int8)
                    valid_len = min(current_q_len, current_kv_len)
                    mask_sample[:, :valid_len] = 0

                attention_mask_list.append(mask_sample)

                compressed_kv_sample = compressed_kv_prefill[sample_idx:sample_idx+1, :current_q_len, ...].transpose(1, 2).contiguous()
                k_pe_sample = k_pe_prefill[sample_idx:sample_idx+1, :current_q_len, ...].transpose(1, 2).contiguous()
                k_pe_repeated_sample = k_pe_sample.repeat(1, self.num_heads, 1, 1)

                q_nope_sample = q_nope[sample_idx:sample_idx+1, :, :current_q_len, :].contiguous()
                q_pe_sample = q_pe[sample_idx:sample_idx+1, :, :current_q_len, :].contiguous()
                q_concat_sample = torch.cat((q_nope_sample, q_pe_sample), dim=-1)
                q_transposed_sample = q_concat_sample.transpose(1, 2).contiguous()
                qTensor_sample = q_transposed_sample.view(current_q_len, self.num_heads, self.qk_nope_head_dim + self.qk_rope_head_dim)
                qTensor_list.append(qTensor_sample)

                k_up_sample = torch.matmul(compressed_kv_sample, weight_uk.mT)
                k_concat_sample = torch.cat((k_up_sample, k_pe_repeated_sample), dim=-1)
                k_transposed_sample = k_concat_sample.transpose(1, 2).contiguous()
                kTensor_sample = k_transposed_sample.view(current_kv_len, self.num_heads, self.qk_nope_head_dim + self.qk_rope_head_dim)
                kTensor_list.append(kTensor_sample)

                v_up_sample = torch.matmul(compressed_kv_sample, weight_uv)
                v_concat_sample = torch.cat((v_up_sample, k_pe_repeated_sample), dim=-1)
                v_transposed_sample = v_concat_sample.transpose(1, 2).contiguous()
                vTensor_sample = v_transposed_sample.view(current_kv_len, self.num_heads, self.v_head_dim + self.qk_rope_head_dim)
                vTensor_list.append(vTensor_sample)
        
            max_kv_len = max(kv_len_list)
            max_q_len = max(seq_len_data)

            qTensor = torch.nn.utils.rnn.pad_sequence(qTensor_list, batch_first=True, padding_value=0.0).contiguous()
            kTensor = torch.nn.utils.rnn.pad_sequence(kTensor_list, batch_first=True, padding_value=0.0).contiguous()
            vTensor = torch.nn.utils.rnn.pad_sequence(vTensor_list, batch_first=True, padding_value=0.0).contiguous()

            attention_mask = ~create_causal_mask(seq_len_data, kv_len_list).to(qTensor.device)

            infer_attention_output, _ = torch_npu.npu_fused_infer_attention_score(
                    qTensor, kTensor, vTensor,
                    atten_mask = attention_mask.type(torch.int8),
                    actual_seq_lengths = seq_len_data,
                    scale = self.softmax_scale,
                    num_heads = self.num_heads,
                    num_key_value_heads = self.num_heads,
                    input_layout = "BSND")
                
            attn_output = infer_attention_output[..., :self.v_head_dim]

            if tuple(attn_output.size()) != (bsz, max_q_len, self.num_heads, self.v_head_dim):
                raise ValueError(
                    f"`attn_output` should be of size {(bsz, max_q_len, self.num_heads, self.v_head_dim)}, but is {tuple(attn_output.size())}"
                )
            attn_output = attn_output.contiguous().view(bsz, max_q_len, self.num_heads * self.v_head_dim)
            attn_output = self.elewise_quant.execute([attn_output, self.o_proj.input_scale, self.o_proj.input_offset])[0]
            attn_output = self.matmulDequant_operation_aclnn.execute([attn_output, self.o_proj.weight,
                                                                        self.o_proj.quant_bias, self.o_proj.deq_scale])[0]


            return attn_output, None, past_key_value
        else:
            return self.forward_paged(q_pe = q_pe,
                                      q_nope = q_nope,
                                      compressed_kv_with_k_pe = compressed_kv_with_k_pe,
                                      past_key_value = past_key_value,
                                      cache_position = cache_position,
                                      block_table = block_table,
                                      page_size = past_key_value.page_size,
                                      q_len_raw = q_len_raw,
                                      kv_len_raw = kv_len_raw,
                                      stream = stream)
    
    @allredeuce_warpper
    def forward_paged(
        self,
        q_pe: torch.Tensor,
        q_nope: torch.Tensor,
        compressed_kv_with_k_pe: torch.Tensor,
        past_key_value: Optional[StaticCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        block_table: Optional[torch.Tensor] = None,
        page_size: Optional[int] = None,
        q_len_raw: Optional[torch.Tensor] = None,
        kv_len_raw: Optional[torch.Tensor] = None,
        stream: Optional[torch.npu.Stream] = None,
        **kwargs
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        # if self.layer_idx == 0:
        #     print(self.page_kv_wrapper.get_block_table(self.layer_idx), self.page_kv_wrapper.position)
        bsz, _, q_len, _ = q_nope.size()
        # print(f"{q_nope.size()=}")
        q_nope = torch.einsum('b h q d, h d k -> b h q k', q_nope, self.q_absorb)   # torch.size([1, 128, 1, 512])
        compressed_kv = compressed_kv_with_k_pe.permute(0,2,1,3)
        kvCache = compressed_kv[:,:,:,:self.kv_lora_rank].contiguous()
        kRopeCache = compressed_kv[:,:,:,self.kv_lora_rank:].contiguous()
        if get_use_npu_graph():
            from ktransformers.server.balance_serve.inference.model_runner import ModelRunner, get_or_create_model_runner
            npu_graph_runner = get_or_create_model_runner(device=get_current_device())
            npu_graph_idx = bsz - 1
            if npu_graph_runner.workspace[npu_graph_idx] is None:
                workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(
                    q_nope,
                    kvCache,
                    kvCache,
                    query_rope=q_pe,
                    key_rope=kRopeCache,
                    num_heads=self.num_heads,
                    num_key_value_heads=1,
                    input_layout="BNSD",
                    scale=self.softmax_scale,
                    antiquant_mode=0,
                    antiquant_scale=None,
                    block_table=block_table,
                    block_size=page_size,
                    actual_seq_lengths_kv=kv_len_raw,
                    sparse_mode = self.sparse_mode)
                npu_graph_runner.workspace[npu_graph_idx] = workspace
            
            attn_output = torch.zeros_like(q_nope, dtype=torch.float16, device=get_current_device())
            softmax_lse = torch.empty(1, dtype=torch.float16, device=get_current_device())

            torch_npu.npu_fused_infer_attention_score.out(
                q_nope,
                kvCache,
                kvCache,
                workspace=npu_graph_runner.workspace[npu_graph_idx],
                query_rope = q_pe,
                key_rope = kRopeCache,
                num_heads = self.num_heads,
                num_key_value_heads = 1,
                input_layout = "BNSD",
                scale = self.softmax_scale,
                antiquant_mode = 0,
                antiquant_scale = None,
                block_table = block_table,
                block_size = page_size,
                actual_seq_lengths_kv = kv_len_raw,
                sparse_mode = self.sparse_mode,
                out=[attn_output, softmax_lse])
        else:
            tp_group = get_tensor_parallel_group()
            torch.distributed.barrier(tp_group)
            attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
                q_nope,
                kvCache,
                kvCache,
                query_rope = q_pe,
                key_rope = kRopeCache,
                num_heads = self.num_heads,
                num_key_value_heads = 1,
                input_layout = "BNSD",
                scale = self.softmax_scale,
                antiquant_mode = 0,
                antiquant_scale = None,
                block_table = block_table,
                block_size = page_size,
                actual_seq_lengths_kv = kv_len_raw,
                sparse_mode = self.sparse_mode
            )

        attn_output = torch.einsum('b h q k, h k v -> b q h v', attn_output, self.out_absorb)
        attn_output = attn_output.contiguous().view(bsz, q_len, self.num_heads*self.v_head_dim)
        attn_output = self.elewise_quant.execute([attn_output, self.o_proj.input_scale, self.o_proj.input_offset])[0]
        attn_output = self.matmulDequant_operation_aclnn.execute([attn_output, self.o_proj.weight,
                                                                  self.o_proj.quant_bias, self.o_proj.deq_scale])[0]
        return attn_output, None, past_key_value

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


class KQwen3MoeAttentionW8A8A2Serve(BaseInjectedModule, Qwen3MoeAttention):

    attn_mask: Optional[torch.Tensor] = None

    def __init__(self,
                 key: str,
                 gguf_loader: GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "npu",
                 generate_device: str = "npu",
                 chunck_size: int = 1024,
                 absorb_for_prefill: bool = False,
                 **kwargs):

        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module,
                                    prefill_device, generate_device, **kwargs)

        self.orig_module.__init__(orig_module.config, orig_module.layer_idx)

        self.absorb_for_prefill = absorb_for_prefill

        self.elewise_quant = DynamicQuantOps()
        self.matmulDequant_operation = MatMulOps()
        self.matmulDequant_operation_aclnn = MatMulOps()

        self.softmax_scale = self.scaling
        self.sparse_mode = 0

        self._prefill_step = 0
        self._cur_prefill_dir: Optional[str] = None

        if hasattr(self, "rotary_emb"):
            if hasattr(self.rotary_emb, "cos_cached"):
                self.rotary_emb.cos_cached = self.rotary_emb.cos_cached.to(torch.float16)
                self.rotary_emb.sin_cached = self.rotary_emb.sin_cached.to(torch.float16)
            if hasattr(self.rotary_emb, "inv_freq"):
                self.rotary_emb.inv_freq = self.rotary_emb.inv_freq.to(torch.float16)

    def _linear_w8a8a2(self, x: torch.Tensor, proj: nn.Module, name: str) -> torch.Tensor:
        if x.dtype == torch.bfloat16:
            x = x.to(torch.float16)
        B, Q, H_in = x.shape
        x_2d = x.view(-1, H_in)   # [T, H_in], T = B * Q
        x_q = self.elewise_quant.execute([
            x_2d,
            proj.input_scale,
            proj.input_offset
        ])[0]
        y_2d = self.matmulDequant_operation.execute([
            x_q,
            proj.weight,
            proj.quant_bias,
            proj.deq_scale
        ])[0]
        return y_2d.view(B, Q, -1)
    # -------------------------------------------------------
    # forward
    # -------------------------------------------------------
    def forward(self,
                hidden_states: torch.Tensor,
                attention_mask=None,
                position_ids=None,
                past_key_value=None,
                output_attentions=False,
                use_cache=False,
                cache_position=None,
                is_prefill=None,
                page_idx=None,
                page_offset=None,
                block_table=None,
                q_len_raw=None,
                kv_len_raw=None,
                stream=None,
                **kwargs):

        if hidden_states.dim() == 2:
            hidden_states = hidden_states.unsqueeze(0)
        bsz, q_len, hidden = hidden_states.shape

        # -------- QKV --------
        q_proj_out = self._linear_w8a8a2(hidden_states, self.q_proj, "Q")
        B, S, _ = q_proj_out.shape
        q = q_proj_out.view(B, S, self.num_heads, self.head_dim)  # [B, S, H, Dh]
        q = self.q_norm(q)
        q_in = q.view(B, S, -1)

        k_proj_out = self._linear_w8a8a2(hidden_states, self.k_proj, "K")
        k = k_proj_out.view(B, S, self.num_key_value_heads, self.head_dim)
        k = self.k_norm(k)
        k_in = k.view(B, S, -1)

        v_in = self._linear_w8a8a2(hidden_states, self.v_proj, "V")

        q = q_in.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = k_in.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        v = v_in.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        # -------- RoPE --------
        cos, sin = self.rotary_emb(v, position_ids)
        q, k = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1)

        # -------- prefill / decode --------
        if is_prefill:
            out = self._forward_prefill(
                q, k, v,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                q_len_raw=q_len_raw,
                kv_len_raw=kv_len_raw,
                page_idx=page_idx,
                page_offset=page_offset,
                block_table=block_table,
            )
            return out
        else:
            return self.forward_paged(
                q=q, k=k, v=v,
                past_key_value=past_key_value,
                cache_position=cache_position,
                block_table=block_table,
                page_size=getattr(past_key_value, "page_size", None),
                q_len_raw=q_len_raw,
                kv_len_raw=kv_len_raw,
                page_idx=page_idx,
                page_offset=page_offset,
                stream=stream
            )

    # -------------------------------------------------------
    # Prefill
    # -------------------------------------------------------
    def _forward_prefill(
        self,
        q: torch.Tensor,   # [B, H, Q, Dh]
        k: torch.Tensor,   # [B, KvH, Q, Dh]
        v: torch.Tensor,   # [B, KvH, Q, Dh]
        attention_mask=None,
        position_ids=None,
        past_key_value=None,
        q_len_raw=None,
        kv_len_raw=None,
        page_idx=None,
        page_offset=None,
        block_table=None,
        **kwargs,
    ) -> torch.Tensor:

        B, H, Q, Dh = q.shape
        KvH = k.shape[1]

        # ---------- 1) 写 KV cache ----------
        if (
            past_key_value is not None
            and page_idx is not None
            and page_offset is not None
        ):
            try:
                past_key_value.update(
                    key_states=k,
                    value_states=v,
                    layer_idx=self.layer_idx,
                    cache_kwargs={
                        "page_idx": page_idx,
                        "page_offset": page_offset,
                    },
                )
            except Exception as e:
                print(f"[PREFILL-QWEN3][WARN] KV cache update failed: {e}", flush=True)

        # ---------- 2) GQA：4 KV → 32 Q heads ----------
        if KvH != self.num_key_value_heads:
            print(
                f"[PREFILL-QWEN3][WARN] KvH ({KvH}) != config.num_key_value_heads "
                f"({self.num_key_value_heads}), 使用 k.shape[1] 作为 KvH",
                flush=True,
            )
            KvH = k.shape[1]

        if H % KvH != 0:
            raise ValueError(
                f"[PREFILL-QWEN3] num_heads={H} 不是 num_kv_heads={KvH} 的整数倍"
            )

        group_size = H // KvH
        k_full = k.repeat_interleave(group_size, dim=1)
        v_full = v.repeat_interleave(group_size, dim=1)

        print("[PREFILL-QWEN3] k_full/v_full:", k_full.shape, v_full.shape, flush=True)

        # ---------- 3) BSND + causal mask ----------
        q_bsnd = q.permute(0, 2, 1, 3).contiguous()      # [B, Q, H, Dh]
        k_bsnd = k_full.permute(0, 2, 1, 3).contiguous()
        v_bsnd = v_full.permute(0, 2, 1, 3).contiguous()

        if q_len_raw is None:
            seq_len_data = [Q for _ in range(B)]
            kv_len_list = [Q for _ in range(B)]
        else:
            seq_len_data = []
            kv_len_list = []
            for b_idx in range(B):
                cur_q = int(q_len_raw[b_idx].item())
                if kv_len_raw is not None:
                    cur_kv = int(kv_len_raw[b_idx].item())
                else:
                    cur_kv = cur_q
                cur_q = max(1, cur_q)
                cur_kv = max(1, cur_kv)
                seq_len_data.append(cur_q)
                kv_len_list.append(cur_kv)

        def create_causal_mask(q_lens, kv_lens):
            q_lens_t = torch.tensor(q_lens, device=q_bsnd.device)
            kv_lens_t = torch.tensor(kv_lens, device=q_bsnd.device)
            bsz = q_lens_t.size(0)
            max_q = int(q_lens_t.max().item())
            max_kv = int(kv_lens_t.max().item())
            base_causal = torch.tril(
                torch.ones((max_q, max_kv), dtype=torch.bool, device=q_bsnd.device)
            )
            mask = torch.zeros(
                (bsz, max_q, max_kv), dtype=torch.bool, device=q_bsnd.device
            )
            for i in range(bsz):
                ql = int(q_lens_t[i].item())
                kl = int(kv_lens_t[i].item())
                mask[i, :ql, :kl] = base_causal[:ql, :kl]
            return mask

        max_q_len = max(seq_len_data) if len(seq_len_data) > 0 else Q
        max_kv_len = max(kv_len_list) if len(kv_len_list) > 0 else Q

        q_list, k_list, v_list = [], [], []
        for b_idx in range(B):
            cur_q = seq_len_data[b_idx]
            cur_kv = kv_len_list[b_idx]

            q_sample = q_bsnd[b_idx, :cur_q, :, :].contiguous()
            k_sample = k_bsnd[b_idx, :cur_kv, :, :].contiguous()
            v_sample = v_bsnd[b_idx, :cur_kv, :, :].contiguous()

            q_list.append(q_sample)
            k_list.append(k_sample)
            v_list.append(v_sample)

        qTensor = torch.nn.utils.rnn.pad_sequence(
            q_list, batch_first=True, padding_value=0.0
        ).contiguous()
        kTensor = torch.nn.utils.rnn.pad_sequence(
            k_list, batch_first=True, padding_value=0.0
        ).contiguous()
        vTensor = torch.nn.utils.rnn.pad_sequence(
            v_list, batch_first=True, padding_value=0.0
        ).contiguous()

        causal_mask = create_causal_mask(seq_len_data, kv_len_list)
        atten_mask = (~causal_mask).to(torch.int8)

        print("[PREFILL-QWEN3] qTensor/kTensor/vTensor:",
              qTensor.shape, kTensor.shape, vTensor.shape, flush=True)

        # ---------- 4) NPU fused attention ----------
        infer_attention_output, _ = torch_npu.npu_fused_infer_attention_score(
            qTensor, kTensor, vTensor,
            atten_mask=atten_mask,
            actual_seq_lengths=seq_len_data,
            scale=self.softmax_scale,
            num_heads=H,
            num_key_value_heads=H,
            input_layout="BSND",
        )

        attn_output = infer_attention_output

        # ---------- 5) reshape + W8A8 o_proj ----------
        attn_output = attn_output.contiguous().view(B, max_q_len, H * Dh)

        attn_output_q = self.elewise_quant.execute(
            [attn_output, self.o_proj.input_scale, self.o_proj.input_offset]
        )[0]

        attn_output = self.matmulDequant_operation_aclnn.execute(
            [attn_output_q, self.o_proj.weight, self.o_proj.quant_bias, self.o_proj.deq_scale]
        )[0]

        print("[PREFILL-QWEN3] attn_output(after o_proj):", attn_output.shape, attn_output.dtype, flush=True)

        return attn_output

    def forward_paged(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        past_key_value,
        cache_position,
        block_table,
        page_size,
        q_len_raw,
        kv_len_raw,
        page_idx,
        page_offset,
        stream,
        **kwargs,
    ):
        B, H, Q, Dh = q.shape
        KvH = k.shape[1]

        # ========= 1) 更新 KV cache =========
        past_key_value.update(
            key_states=k,
            value_states=v,
            layer_idx=self.layer_idx,
            cache_kwargs={
                "page_idx": page_idx,
                "page_offset": page_offset,
            },
        )

        Kcache = past_key_value.get_k_cache(self.layer_idx)
        Vcache = past_key_value.get_v_cache(self.layer_idx)
        
        q_bnsd = q.contiguous()
        k_bnsd = Kcache.contiguous().to(torch.float16).transpose(1, 2)
        v_bnsd = Vcache.contiguous().to(torch.float16).transpose(1, 2)

        use_graph = get_use_npu_graph()
        device = get_current_device()

        if use_graph:
            from ktransformers.server.balance_serve.inference.model_runner import get_or_create_model_runner
            npu_graph_runner = get_or_create_model_runner(device=device)
            npu_graph_idx = B - 1

            if npu_graph_runner.workspace[npu_graph_idx] is None:
                workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(
                q_bnsd,
                k_bnsd,
                v_bnsd,
                num_heads=H,
                num_key_value_heads=KvH,
                input_layout="BNSD",
                scale=self.softmax_scale,
                antiquant_mode=0,
                antiquant_scale=None,
                block_table=block_table,
                block_size=page_size,
                actual_seq_lengths_kv=kv_len_raw,
                sparse_mode=self.sparse_mode,
            )
                npu_graph_runner.workspace[npu_graph_idx] = workspace

            attn_output = torch.zeros_like(q_bnsd, dtype=torch.float16, device=device)
            softmax_lse = torch.empty(1, dtype=torch.float16, device=device)
            torch_npu.npu_fused_infer_attention_score.out(
                q_bnsd,
                k_bnsd,
                v_bnsd,
                workspace=npu_graph_runner.workspace[npu_graph_idx],
                num_heads=H,
                num_key_value_heads=KvH,
                input_layout="BNSD",
                scale=self.softmax_scale,
                antiquant_mode=0,
                antiquant_scale=None,
                block_table=block_table,
                block_size=page_size,
                actual_seq_lengths_kv=kv_len_raw,
                sparse_mode=self.sparse_mode,
                out=[attn_output, softmax_lse]
            )
        else:
            attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
                q_bnsd,
                k_bnsd,
                v_bnsd,
                num_heads=H,
                num_key_value_heads=KvH,
                input_layout="BNSD",
                scale=self.softmax_scale,
                antiquant_mode=0,
                antiquant_scale=None,
                block_table=block_table,
                block_size=page_size,
                actual_seq_lengths_kv=kv_len_raw,
                sparse_mode=self.sparse_mode,
            )

        attn_output = attn_output.permute(0, 2, 1, 3).contiguous().view(B, -1, H * Dh)

        attn_output_q = self.elewise_quant.execute(
            [attn_output, self.o_proj.input_scale, self.o_proj.input_offset]
        )[0]

        attn_output = self.matmulDequant_operation_aclnn.execute(
            [attn_output_q, self.o_proj.weight, self.o_proj.quant_bias, self.o_proj.deq_scale]
        )[0]

        return attn_output

================================================
FILE: archive/ktransformers/operators/ascend/ascend_experts.py
================================================
# coding=utf-8
# Copyright (c) 2025. Huawei Technologies Co., Ltd. All rights reserved.
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import os
from typing import Optional

import bisect
import torch
import numpy as np
from torch import nn
import torch_npu
from transformers import PretrainedConfig
import torch.nn.functional as F

from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.ascend.ascend_utils import get_tensor_parallel_size, get_tensor_parallel_group
from ktransformers.operators.experts import cuda_graphs, KExpertsBase, KExpertsCPU, KTransformersExperts, EXPERTS_MAP, KDeepseekV3MoE
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MoE
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.utils import CUR_DEVICE, get_use_npu_graph, InferenceState
from ktransformers.operators.experts import cuda_graphs as npu_graphs
from ktransformers.util import utils

class KExpertsCPUW8A8(KExpertsCPU):

    def forward(self, input_tensor, expert_ids, weights, bsz_tensor=None, cuda_graph_idx=None, use_npu_graph=False):
        if use_npu_graph:
            seq_len = input_tensor.size(0)
            cuda_graph_idx = seq_len - 1 if cuda_graph_idx is None else cuda_graph_idx # input_tensor is seq & batch merged
            self.cpu_infer.submit(self.moe.forward(KExpertsCPU.expert_ids_cpu[cuda_graph_idx][0].size(0),
                                                    KExpertsCPU.expert_ids_cpu[cuda_graph_idx][0].size(1),
                                                    KExpertsCPU.expert_ids_cpu[cuda_graph_idx][0].data_ptr(),
                                                    KExpertsCPU.weights_cpu[cuda_graph_idx][0].data_ptr(),
                                                    KExpertsCPU.input_tensor_cpu[cuda_graph_idx][0].data_ptr(),
                                                    KExpertsCPU.output_cpu[cuda_graph_idx][0].data_ptr(),
                                                    KExpertsCPU.bsz_tensor_cpu[cuda_graph_idx][0].data_ptr()
                                                    ))
            self.cpu_infer.sync()
        else:
            if bsz_tensor is None:
                bsz_tensor = torch.tensor([input_tensor.size(0)], device=input_tensor.device, dtype=torch.int32)
            # if torch.cuda.is_current_stream_capturing():
            org_type = input_tensor.dtype
            input_tensor = input_tensor.contiguous().cpu()
            input_tensor = input_tensor.to(torch.bfloat16)
            expert_ids = expert_ids.contiguous().cpu()
            weights = weights.contiguous().to(torch.float32).cpu()
            bsz_tensor = bsz_tensor.contiguous().cpu()
            output = torch.empty_like(input_tensor).contiguous()
            self.cpu_infer.submit(self.moe.forward(expert_ids.size(0), expert_ids.size(1), expert_ids.data_ptr(), weights.data_ptr(), input_tensor.data_ptr(), output.data_ptr(), bsz_tensor.data_ptr()))
            self.cpu_infer.sync()
            return output.to(org_type).to(device=utils.get_current_device())

EXPERTS_MAP["KExpertsCPUW8A8"] = KExpertsCPUW8A8

class KTransformersExpertsW8A8(KTransformersExperts):
    def forward(self, input_tensor, expert_ids, weights, cuda_graph_idx=None, use_npu_graph=False):
        if self.mode == InferenceState.GENERATE:
            assert self.generate_experts is not None, "generate_experts is None"
            return self.generate_experts.forward(input_tensor, expert_ids, weights, cuda_graph_idx=cuda_graph_idx, use_npu_graph=use_npu_graph)
        elif self.mode == InferenceState.PREFILL:
            assert self.prefill_experts is not None, "prefill_experts is None"
            return self.prefill_experts.forward(input_tensor, expert_ids, weights, cuda_graph_idx=cuda_graph_idx, use_npu_graph=use_npu_graph)
        else:
            raise ValueError("load or set_inference_mode before forward")


class KDeepseekV3MoEW8A8(KDeepseekV3MoE):
    def forward(self, hidden_states, stream=None, para_stream=None):
        tp_size = get_tensor_parallel_size()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()
        identity = hidden_states
        orig_shape = hidden_states.shape

        def share_experts_forward():
            if self.config.n_shared_experts is not None:
                return self.shared_experts(identity).squeeze(0)

        if rank == 0:
            topk_idx, topk_weight = self.gate(hidden_states)
            hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
            if get_use_npu_graph():
                org_type = hidden_states.dtype
                if hasattr(self.config, "backend_type"):
                    if self.config.backend_type == "ktransformers":
                        from ktransformers.util.npu_graph_runner import get_or_create_runner
                        npu_graph_runner = get_or_create_runner(utils.get_current_device())
                        stream = npu_graph_runner.main_stream
                        para_stream = npu_graph_runner.share_experts_stream
                    event = torch.npu.Event()
                    event.record(stream)
                    with torch.npu.stream(para_stream):
                        event.wait(para_stream)
                        y_ = share_experts_forward() if share_experts_forward is not None else None
                        event.record(para_stream)
            
                    input_tensor = hidden_states.to(torch.bfloat16)
                    topk_weight = topk_weight.contiguous().to(torch.float32)
                    cuda_graph_idx = orig_shape[0] - 1
                    self.moe_kexperts_param = (hidden_states, topk_idx, topk_weight, cuda_graph_idx, True)
                    if cuda_graph_idx < len(npu_graphs):
                        expert_ids = topk_idx
                        KExpertsCPU.input_tensor_cpu[cuda_graph_idx][0].copy_(input_tensor, non_blocking = True)
                        KExpertsCPU.expert_ids_cpu[cuda_graph_idx][0].copy_(expert_ids, non_blocking = True)
                        KExpertsCPU.weights_cpu[cuda_graph_idx][0].copy_(topk_weight, non_blocking = True)
                        torch_npu.npu._launch_host_func(stream, self.cpu_moe_kexperts, self.moe_kexperts_param)

                        y = self.experts.generate_experts.output_cpu[cuda_graph_idx][0].to(utils.get_current_device(), non_blocking = True)
                        y = y.view(*orig_shape).to(device=hidden_states.device)
                        y = y.to(org_type)
                    event.wait(stream)
                else:
                    from ktransformers.util.npu_graph_runner import get_or_create_runner
                    npu_graph_runner = get_or_create_runner(utils.get_current_device())
                    event = torch.npu.Event()
                    event.record(npu_graph_runner.main_stream)
                    with torch.npu.stream(npu_graph_runner.share_experts_stream):
                        event.wait(npu_graph_runner.share_experts_stream)
                        y_ = share_experts_forward() if share_experts_forward is not None else None
                        event.record(npu_graph_runner.share_experts_stream)
                    topk_weight = topk_weight.contiguous().to(torch.float32)
                    self.moe_kexperts_param = (hidden_states, topk_idx, topk_weight, None, True)

                    org_type = hidden_states.dtype
                    input_tensor = hidden_states.to(torch.bfloat16)

                    cuda_graph_idx = bisect.bisect_left(npu_graphs, 1)
                    if cuda_graph_idx < len(npu_graphs):

                        immediate_expert_ids = topk_idx
                        KExpertsCPU.input_tensor_cpu[cuda_graph_idx][0].copy_(input_tensor, non_blocking = True)
                        KExpertsCPU.expert_ids_cpu[cuda_graph_idx][0].copy_(immediate_expert_ids, non_blocking = True)
                        KExpertsCPU.weights_cpu[cuda_graph_idx][0].copy_(topk_weight, non_blocking = True)

                        npu_graph_runner.launch_callback(
                            self.cpu_moe_kexperts,
                            self.moe_kexperts_param,
                            1, npu_graph_runner.main_stream)
                        y = self.experts.generate_experts.output_cpu[cuda_graph_idx][0].to(utils.get_current_device(), non_blocking = True)

                        y = y.to(org_type)
                        y = y.view(*orig_shape).to(device=hidden_states.device)
                    event.wait(npu_graph_runner.main_stream)
            else:
                y = self.moe_kexperts(hidden_states, topk_idx, topk_weight)
                y_ = share_experts_forward() if share_experts_forward is not None else None
                y = y.view(*orig_shape).to(device=hidden_states.device)
                y_ = y_.view(*orig_shape)
        else:
            hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
            y = torch.zeros(orig_shape, dtype=torch.float16, device=CUR_DEVICE)
            y_ = share_experts_forward() if share_experts_forward is not None else None

        if tp_size > 1 and world_size == tp_size:
            torch.distributed.all_reduce(y, op=torch.distributed.ReduceOp.SUM, group=get_tensor_parallel_group())
        if self.config.n_shared_experts is not None:
            y += y_
        return y

    @torch.no_grad()
    def cpu_moe_kexperts(self, moe_kexperts_param) -> torch.Tensor:
        x, topk_ids, topk_weight, cuda_graph_idx, use_npu_graph = moe_kexperts_param
        _ = self.experts(x, topk_ids, topk_weight, cuda_graph_idx=cuda_graph_idx, use_npu_graph=use_npu_graph)

    @torch.no_grad()
    def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
        outs = self.experts(x, topk_ids, topk_weight)
        return outs

class KQwen3MoeSparseMoeBlockW8A8(BaseInjectedModule):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        prefill_device: str = "npu",
        generate_device: str = "npu",
        **kwargs,
    ):
        super().__init__(
            key,
            gguf_loader,
            config,
            orig_module,
            prefill_device=prefill_device,
            generate_device=generate_device,
            **kwargs,
        )

        self.gate = orig_module.gate
        self.top_k = orig_module.top_k
        self.norm_topk_prob = orig_module.norm_topk_prob
        self.output_router_logits = getattr(orig_module, "output_router_logits", False)

        experts_key = f"{key}.experts"

        print(f"[NPU-MOE][INIT] build experts at key={experts_key}", flush=True)
        self.experts = KTransformersExpertsW8A8(
            key=experts_key,
            gguf_loader=gguf_loader,
            config=config,
            orig_module=orig_module.experts,
            prefill_device=prefill_device,
            prefill_op="KExpertsTorch",
            generate_device="cpu",
            generate_op="KExpertsCPUW8A8",
            out_device=prefill_device,
        )

    def set_inference_mode(self, mode: InferenceState):
        if isinstance(self.experts, KExpertsBase):
            self.experts.set_inference_mode(mode)

    @torch.no_grad()
    def cpu_moe_kexperts(self, moe_kexperts_param):
        x, topk_ids, topk_weight, cuda_graph_idx, use_npu_graph = moe_kexperts_param
        _ = self.experts(
            x,
            topk_ids,
            topk_weight,
            cuda_graph_idx=cuda_graph_idx,
            use_npu_graph=use_npu_graph,
        )

    @torch.no_grad()
    def moe_kexperts(
        self,
        x: torch.Tensor,
        topk_ids: torch.Tensor,
        topk_weight: torch.Tensor,
        bsz_tensor: torch.Tensor = None,
        cuda_graph_idx: int = 0,
        use_npu_graph: bool = False,
    ) -> torch.Tensor:
        outs = self.experts(
            x,
            topk_ids,
            topk_weight,
            cuda_graph_idx=cuda_graph_idx,
            use_npu_graph=use_npu_graph,
        )
        return outs

    def forward(
        self,
        hidden_states: torch.Tensor,
        bsz_tensor: torch.Tensor = None,
        cuda_graph_idx: int = 0,
        *args,
        **kwargs,
    ):

        if hidden_states.dim() == 3:
            B, S, H = hidden_states.shape
        else:
            orig_shape = hidden_states.shape
            hidden_states = hidden_states.view(1, -1, orig_shape[-1])
            B, S, H = hidden_states.shape

        orig_device = hidden_states.device
        orig_shape = (B, S, H)

        output_router_logits_flag = kwargs.pop("output_router_logits", False)
        need_router_logits = output_router_logits_flag or self.output_router_logits

        # ===== 1) flatten =====
        hidden_states_flat = hidden_states.view(-1, H)
        T = hidden_states_flat.shape[0]

        # ===== 2) gate =====
        router_logits = self.gate(hidden_states_flat)
        try:
            router_logits_bs = router_logits.view(B, S, -1)
        except Exception:
            router_logits_bs = router_logits
        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(
            routing_weights, self.top_k, dim=-1
        )
        if self.norm_topk_prob:
            rw_sum = routing_weights.sum(dim=-1, keepdim=True)
            routing_weights = routing_weights / rw_sum

        routing_weights = routing_weights.to(hidden_states_flat.dtype)

        # ===== 3) MoE experts =====
        use_npu_graph = get_use_npu_graph()
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            tp_size = get_tensor_parallel_size()
            world_size = torch.distributed.get_world_size()
            rank = torch.distributed.get_rank()
        else:
            tp_size = 1
            world_size = 1
            rank = 0
        y = None
        if isinstance(self.experts, KExpertsBase):
            if getattr(self.experts, "mode", None) == InferenceState.UNLOAD:
                self.experts.set_inference_mode(InferenceState.GENERATE)

            if rank == 0:
                if use_npu_graph:
                    org_type = hidden_states_flat.dtype
                    input_tensor = hidden_states_flat.to(torch.bfloat16)
                    topk_weight_f32 = routing_weights.contiguous().to(torch.float32)
                    self.moe_kexperts_param = (
                        hidden_states_flat,
                        selected_experts,
                        topk_weight_f32,
                        cuda_graph_idx,
                        True,
                    )
                    if cuda_graph_idx < len(npu_graphs):
                        KExpertsCPU.input_tensor_cpu[cuda_graph_idx][0].copy_(input_tensor, non_blocking=True)
                        KExpertsCPU.expert_ids_cpu[cuda_graph_idx][0].copy_(selected_experts, non_blocking=True)
                        KExpertsCPU.weights_cpu[cuda_graph_idx][0].copy_(topk_weight_f32, non_blocking=True)

                        stream = torch.npu.current_stream()
                        torch_npu.npu._launch_host_func(
                            stream,
                            self.cpu_moe_kexperts,
                            self.moe_kexperts_param,
                        )

                        y_flat = self.experts.generate_experts.output_cpu[cuda_graph_idx][0].to(
                            utils.get_current_device(),
                            non_blocking=True,
                        )
                        y_flat = y_flat.to(org_type)
                        y = y_flat.view(*orig_shape).to(device=orig_device)
                    else:
                        tmp_bsz_tensor = torch.tensor([B], dtype=torch.int32, device=orig_device)
                        y_flat = self.moe_kexperts(
                            hidden_states_flat,
                            selected_experts,
                            routing_weights,
                            bsz_tensor=tmp_bsz_tensor,
                            cuda_graph_idx=cuda_graph_idx,
                            use_npu_graph=False,
                        )
                        y = y_flat.view(*orig_shape).to(device=orig_device)
                else:
                    if bsz_tensor is None:
                        bsz_tensor = torch.tensor(
                            [B],
                            dtype=torch.int32,
                            device=orig_device,
                        )

                    y_flat = self.moe_kexperts(
                        hidden_states_flat,
                        selected_experts,
                        routing_weights,
                        bsz_tensor=bsz_tensor,
                        cuda_graph_idx=cuda_graph_idx,
                        use_npu_graph=False,
                    )
                    y = y_flat.view(*orig_shape).to(device=orig_device)
            else:
                y = torch.zeros(orig_shape, dtype=hidden_states.dtype, device=orig_device)
        else:
            y = hidden_states

        if tp_size > 1 and world_size == tp_size:
            torch.distributed.all_reduce(y, op=torch.distributed.ReduceOp.SUM, group=get_tensor_parallel_group())
        # print("================ [NPU-MOE] EXIT MLP =======================\n")
        if need_router_logits:
            num_experts = router_logits.shape[-1]
            router_logits_bs = router_logits.view(B, S, num_experts)
            return y, router_logits_bs


        return y


================================================
FILE: archive/ktransformers/operators/ascend/ascend_gate.py
================================================
import torch
import torch_npu
import torch.nn as nn
import torch.nn.functional as F
from ktransformers.operators.gate import KMoEGate


class KDeepseekV3GateA2(KMoEGate):
    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None):
        if device is None:
            device = self.device
        if w is None:
            w = self.load_weights(device=device)

        if isinstance(w, dict):
            self.weight_type = w["weight_type"]
            self.e_score_correction_bias_type = w["e_score_correction_bias_type"]
            self.orig_module.weight = nn.Parameter(w["weight"])
            self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
        else:
            raise ValueError("Invalid weight type")
        self.orig_module.weight = nn.Parameter(self.orig_module.weight.to(device).to(torch.float32))
        self.orig_module.e_score_correction_bias = nn.Parameter(self.orig_module.e_score_correction_bias.to(device).to(torch.float32))

    def forward(self, hidden_states) -> torch.Tensor:
        h = hidden_states.shape[-1]
        # compute gating score
        hidden_states = hidden_states.view(-1, h)
        logits = F.linear(hidden_states.type(torch.float32), self.weight, None)
        topk_weight, topk_idx, _ = torch_npu.npu_moe_gating_top_k(
            logits,
            k=self.top_k,
            bias=self.e_score_correction_bias,
            k_group=self.topk_group,
            group_count=self.n_group,
            group_select_mode=1,
            renorm=0,
            norm_type=1,
            routed_scaling_factor=self.routed_scaling_factor,
            eps=float(1e-20))
        return topk_idx.type(torch.int64), topk_weight


================================================
FILE: archive/ktransformers/operators/ascend/ascend_layernorm.py
================================================
# coding=utf-8
# Copyright (c) 2025. Huawei Technologies Co., Ltd. All rights reserved.
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
from typing import Optional, Union, Tuple

import torch
import torch_npu
from torch import nn
from transformers import PretrainedConfig

from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeRMSNorm
from ktransformers.util import utils
from ktransformers.util.custom_loader import GGUFLoader


class KDeepseekV3RMSNormW8A8(BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader: GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "npu",
                 generate_device: str = "npu",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.weight = nn.Parameter(torch.ones(self.orig_module.hidden_size))
        self.bias = nn.Parameter(torch.ones(self.orig_module.hidden_size))
        self.variance_epsilon = self.orig_module.variance_epsilon

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        out = torch_npu.npu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0] + self.bias
        return out.to(input_dtype)

    def load(self):
        self.weight = self.gguf_loader.safetensor_loader.load_tensor(self.key + ".weight").to(utils.get_current_device())
        self.bias = self.gguf_loader.safetensor_loader.load_tensor(self.key + ".bias").to(utils.get_current_device())

    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.bias is not None:
            self.bias = None

class KQwen3MoeRMSNormW8A8(BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader: GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "npu",
                 generate_device: str = "npu",
                 **kwargs):

        super().__init__(key, gguf_loader, config, orig_module,
                         prefill_device, generate_device, **kwargs)

        self.hidden_size = orig_module.hidden_size
        self.variance_epsilon = orig_module.variance_epsilon
        self.weight = nn.Parameter(orig_module.weight.data.clone())

    def forward(self, x: torch.Tensor):
        x = x.to(torch.float16)
        gamma = self.weight.to(torch.float16)

        input_dtype = x.dtype
        out = torch_npu.npu_rms_norm(
            x,
            gamma,
            self.variance_epsilon
        )[0]

        return out.to(input_dtype)

    def load(self):
        device = utils.get_current_device()
        self.weight = self.gguf_loader.safetensor_loader.load_tensor(self.key + ".weight").to(device)

        try:
            self.bias = (
                self.gguf_loader.safetensor_loader
                .load_tensor(self.key + ".bias")
                .to(device)
            )
        except KeyError:
            self.bias = None

    def unload(self):
        self.weight = None
        self.bias = None

class KQwen3FinalRMSNormNPU(nn.Module):
    def __init__(self, orig_module: nn.Module):
        super().__init__()
        assert hasattr(orig_module, "weight"), "orig_module must have weight"
        self.variance_epsilon = getattr(orig_module, "variance_epsilon", 1e-6)

        w = orig_module.weight.detach()
        if w.dtype not in (torch.float16, torch.bfloat16, torch.float32):
            w = w.to(torch.float16)
        else:
            if w.dtype == torch.float32:
                w = w.to(torch.float16)

        self.weight = nn.Parameter(w)

    def forward(self, x: torch.Tensor):
        input_dtype = x.dtype
        x = x.contiguous()
        gamma = self.weight
        x_rms = x.to(dtype=gamma.dtype)

        out = torch_npu.npu_rms_norm(
            x_rms,
            gamma,
            self.variance_epsilon
        )[0]

        return out.to(input_dtype)

================================================
FILE: archive/ktransformers/operators/ascend/ascend_linear.py
================================================
# coding=utf-8
# Copyright (c) 2025. Huawei Technologies Co., Ltd. All rights reserved.
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod

import torch
import torch_npu
import torch.distributed as dist
from torch import nn
from transformers import PretrainedConfig

from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.operators.linear import KLinearBase, LINEAR_MAP
from ktransformers.util import utils
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import InferenceState
from ktransformers.util.ascend.ascend_utils import get_safetensors_cut_weight, get_tensor_parallel_size, get_tensor_parallel_group
from ktransformers.util.custom_gguf import translate_name_to_gguf


class KLinearW8A8(KLinearBase):
    def __init__(
            self,
            key: str,
            gguf_loader: GGUFLoader,
            config: PretrainedConfig,
            orig_module: nn.Module = None,
            device: str = "cuda",
            **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)

    def load_weight(self, override_key: str | None = None, device: str | None = None):
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]
        fake_tensor = torch.tensor([1])
        for key in keys:
            if device is None:
                device = utils.get_current_device()
            
            key = translate_name_to_gguf(key)
            if key == "lm_head":
                key = "output"
                
            if key + ".weight" in self.gguf_loader.safetensor_loader.tensor_file_map:
                if key + ".deq_scale" in self.gguf_loader.safetensor_loader.tensor_file_map:
                    qweight = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight")
                    deq_scale = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.deq_scale")
                    quant_bias = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.quant_bias")
                    input_scale = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.input_scale")
                    input_offset = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.input_offset")
                    tensors = (qweight, deq_scale, quant_bias, input_scale, input_offset)
                    return tensors
                elif key + ".weight_scale" in self.gguf_loader.safetensor_loader.tensor_file_map:
                    if key.endswith("ffn_gate_shexp"):
                        parts = key.split(".")
                        layer = parts[1]
                        gate_weight = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_gate_shexp.weight")
                        gate_weight = get_safetensors_cut_weight(self.key, gate_weight).t()
                        up_weight = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_up_shexp.weight")
                        up_weight = get_safetensors_cut_weight(self.key, up_weight).t()
                        gate_scale = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_gate_shexp.weight_scale")
                        gate_scale = get_safetensors_cut_weight(self.key, gate_scale)
                        up_scale = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_up_shexp.weight_scale")
                        up_scale = get_safetensors_cut_weight(self.key, up_scale)
                        gate_up_weight = torch.cat((gate_weight, up_weight), 1)
                        gate_up_scale = torch.cat((gate_scale, up_scale), 0)
                        gate_offset = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_gate_shexp.weight_offset")
                        up_offset = self.gguf_loader.safetensor_loader.load_tensor(f"blk.{layer}.ffn_up_shexp.weight_offset")
                        gate_up_offset = torch.cat((gate_offset, up_offset), 0)
                        tensors = (gate_up_weight, gate_up_scale, gate_up_offset)
                    elif key.endswith("ffn_up_shexp"):
                        return fake_tensor
                    else:
                        qweight = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight")
                        weight_scale = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight_scale")
                        weight_offset = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight_offset")
                        tensors = (qweight, weight_scale, weight_offset)
                    return tensors
                else:
                    weight = self.gguf_loader.safetensor_loader.load_tensor(f"{key}.weight")
                    return weight
            else:
                raise FileNotFoundError(f"Weight file not found for key {key}")

    @abstractmethod
    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = "cuda"):
        pass

    @abstractmethod
    def unload(self):
        pass


class KLinearTorchW8A8A2(KLinearW8A8):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cuda",
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.has_bias = False
        self.dtype = torch.get_default_dtype()
        self.weight = None
        self.input_scale = None
        self.input_offset = None
        self.quant_bias = None
        self.deq_scale = None
        self.weight_scale = None
        self.weight_offset = None

    def forward(self, x: torch.Tensor, bsz_tensor) -> torch.Tensor:
        if x.dtype != self.weight.dtype:
            x = x.to(self.weight.dtype)
        return torch.matmul(x, self.weight)

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None):
        if device is None: device = utils.get_current_device()
        device = utils.CUR_DEVICE
        if w is None:
            w = self.load_weight()

        if isinstance(w, nn.Parameter):
            try:
                self.weight = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T.contiguous()
            except:
                self.weight = w.to(dtype=self.dtype).T.contiguous()
            self.weight = self.weight.to(device)
            if self.has_bias:
                self.bias = self.bias.to(device)
        elif isinstance(w, tuple):
            w_list = list(w)
            if len(w_list) == 3:
                self.weight = w_list[0]
                self.weight_scale = w_list[1].view(-1)
                self.weight_offset = w_list[2]
                self.weight = self.weight.to(utils.CUR_DEVICE)
                self.weight_scale = self.weight_scale.to(utils.CUR_DEVICE)
                if self.key.endswith("ffn_gate_shexp") is not True:
                    self.weight = get_safetensors_cut_weight(self.key, self.weight).t()
                    weight_scale = get_safetensors_cut_weight(self.key, self.weight_scale)
                    self.weight_scale = weight_scale.clone()
                    del weight_scale
            else:
                for i in range(len(w_list)):
                    w_list[i] = get_safetensors_cut_weight(self.key, w_list[i])
                    w_list[i] = w_list[i].to(utils.CUR_DEVICE)
                self.weight = w_list[0]
                self.deq_scale = w_list[1]
                self.quant_bias = w_list[2]
                if "attn_output" in self.key or "ffn_down" in self.key:
                    if torch.distributed.get_rank(get_tensor_parallel_group()) != 0:
                        self.quant_bias = torch.zeros_like(self.quant_bias, dtype=self.quant_bias.dtype, device=self.quant_bias.device)

                self.input_scale = w_list[3]
                self.input_offset = w_list[4]
        elif isinstance(w, torch.Tensor):
            self.weight = w.T.contiguous()
            self.weight = self.weight.to(device)
            if "kv_b" not in self.key and ("output" in  self.key or "eh_proj" in self.key):
                self.weight = torch_npu.npu_format_cast(self.weight, 29)
        else:
            raise ValueError(f"Invalid weight type {self.key=} {type(w)=}")

    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.has_bias:
            self.bias = None
        self.input_scale = None
        self.input_offset = None
        self.quant_bias = None
        self.deq_scale = None
        self.weight_scale = None
        self.weight_offset = None


LINEAR_MAP["KLinearTorchW8A8A2"] = KLinearTorchW8A8A2


class KTransformersLinearW8A8A2(BaseInjectedModule, KLinearW8A8):
    def __init__(
            self,
            key: str,
            gguf_loader: GGUFLoader,
            config: PretrainedConfig,
            orig_module: nn.Module,
            generate_device: str = "cuda",
            generate_op: str | None = "KLinearMarlin",
            prefill_device: str = "cuda",
            prefill_op: str | None = "KLinearTorch",
            **kwargs,
    ):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        KLinearW8A8.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        # build all the linear operators
        if prefill_op is not None:
            assert prefill_op in LINEAR_MAP, f"linear_type {prefill_op} not supported"
            self.prefill_linear = LINEAR_MAP[prefill_op](key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        else:
            self.prefill_linear = None

        if generate_op is not None:
            assert generate_op in LINEAR_MAP, f"linear_type {generate_op} not supported"
            self.generate_linear = LINEAR_MAP[generate_op](key, gguf_loader, config, orig_module, generate_device, **kwargs)
        else:
            self.generate_linear = None
        self.mode = InferenceState.UNLOAD

    def forward(self, x, bsz_tensor=None):
        if self.mode == InferenceState.PREFILL:
            assert self.prefill_linear is not None, "cpu linear is not initialized"
            y = self.prefill_linear.forward(x, bsz_tensor)
        else:
            assert self.generate_linear is not None, "gpu linear is not initialized"
            y = self.generate_linear.forward(x, bsz_tensor)
        return y

    def load(self, w: dict | nn.Parameter | tuple | None = None, mode: InferenceState = InferenceState.GENERATE):
        if not mode:
            mode = InferenceState.GENERATE
        # load to device
        if mode == InferenceState.PREFILL:
            self.generate_linear.unload()
            self.prefill_linear.load(w=w)
            self.device = self.prefill_linear.device
            self.weight = self.prefill_linear.weight  # modeling_xxx.py may use linear.weight
            self.input_scale = self.prefill_linear.input_scale
            self.input_offset = self.prefill_linear.input_offset
            self.quant_bias = self.prefill_linear.quant_bias
            self.deq_scale = self.prefill_linear.deq_scale
            self.weight_scale = self.prefill_linear.weight_scale
            self.weight_offset = self.prefill_linear.weight_offset
        elif mode == InferenceState.GENERATE:
            self.prefill_linear.unload()
            self.generate_linear.load(w=w)
            self.device = self.generate_linear.device
            self.weight = self.generate_linear.weight  # modeling_xxx.py may use linear.weight
            self.input_scale = self.generate_linear.input_scale
            self.input_offset = self.generate_linear.input_offset
            self.quant_bias = self.generate_linear.quant_bias
            self.deq_scale = self.generate_linear.deq_scale
            self.weight_scale = self.generate_linear.weight_scale
            self.weight_offset = self.generate_linear.weight_offset
        elif mode == InferenceState.UNLOAD:
            self.prefill_linear.unload()
            self.generate_linear.unload()
            self.device = "cpu"
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")
        self.mode = mode

    def unload(self):
        if self.prefill_linear is not None:
            self.prefill_linear.unload()
        if self.generate_linear is not None:
            self.generate_linear.unload()
        self.device = self.generate_linear.device

    def set_inference_mode(self, mode: InferenceState):
        if not mode:
            mode = InferenceState.GENERATE
        if mode == InferenceState.GENERATE:
            self.load(mode=InferenceState.GENERATE)
        elif mode == InferenceState.PREFILL:
            self.load(mode=InferenceState.PREFILL)
        elif mode == InferenceState.UNLOAD:
            self.unload()
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")


================================================
FILE: archive/ktransformers/operators/ascend/ascend_mlp.py
================================================
# coding=utf-8
# Copyright (c) 2025. Huawei Technologies Co., Ltd. All rights reserved.
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch_npu

from ktransformers.util.ascend.ascend_utils import allredeuce_warpper
from ktransformers.util.utils import CUR_DEVICE
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MLP
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeMLP

class KDeepseekV3MLPW8A8A2V1(BaseInjectedModule, DeepseekV3MLP):
    @allredeuce_warpper
    def forward(self, x, is_prefill=None, use_cuda_graph=False):
        original_dtype = x.dtype
        quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x)
        dynamic_scale = dynamic_scale.view(-1)
        quant_out = quant_out.view(-1, quant_out.shape[-1])
        gate_x = torch_npu.npu_quant_matmul(
            quant_out,
            self.orig_module.gate_proj.weight,
            self.orig_module.gate_proj.weight_scale,
            pertoken_scale=dynamic_scale,
            bias=None,
            output_dtype=original_dtype,
        )
        up_x = torch_npu.npu_quant_matmul(
            quant_out,
            self.orig_module.up_proj.weight,
            self.orig_module.up_proj.weight_scale,
            pertoken_scale=dynamic_scale,
            bias=None,
            output_dtype=original_dtype,
        )
        down_x = self.act_fn(gate_x) * up_x
        down_quant_out, down_dynamic_scale = torch_npu.npu_dynamic_quant(down_x)
        down_dynamic_scale = down_dynamic_scale.view(-1)
        down_proj = torch_npu.npu_quant_matmul(
            down_quant_out,
            self.orig_module.down_proj.weight,
            self.orig_module.down_proj.weight_scale,
            pertoken_scale=down_dynamic_scale,
            bias=None,
            output_dtype=original_dtype,
        )
        down_proj = down_proj.reshape(x.shape)
        return down_proj

class KDeepseekV3MLPW8A8A2V2(BaseInjectedModule, DeepseekV3MLP):
    @allredeuce_warpper
    def forward(self, x, is_prefill=None, use_cuda_graph=False):
        original_dtype = x.dtype
        quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x)
        dynamic_scale = dynamic_scale.view(-1)
        quant_out = quant_out.view(-1, quant_out.shape[-1])
        gate_up_x = torch_npu.npu_quant_matmul(
            quant_out,
            self.orig_module.gate_proj.weight,
            self.orig_module.gate_proj.weight_scale,
            pertoken_scale=dynamic_scale,
            bias=None,
            output_dtype=original_dtype,
        )
        down_x = torch_npu.npu_swiglu(gate_up_x, -1)
        down_quant_out, down_dynamic_scale = torch_npu.npu_dynamic_quant(down_x)
        down_dynamic_scale = down_dynamic_scale.view(-1)
        down_proj = torch_npu.npu_quant_matmul(
            down_quant_out,
            self.orig_module.down_proj.weight,
            self.orig_module.down_proj.weight_scale,
            pertoken_scale=down_dynamic_scale,
            bias=None,
            output_dtype=original_dtype,
        )
        down_proj = down_proj.reshape(x.shape)
        return down_proj

class KQwen3MoeMLPW8A8A2(BaseInjectedModule, Qwen3MoeMLP):
    @allredeuce_warpper
    def forward(self, x, is_prefill=None, use_cuda_graph=False):
        original_dtype = x.dtype
        quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x)
        dynamic_scale = dynamic_scale.view(-1)
        quant_out = quant_out.view(-1, quant_out.shape[-1])
        
        gate_x = torch_npu.npu_quant_matmul(
            quant_out,
            self.orig_module.gate_proj.weight,
            self.orig_module.gate_proj.weight_scale,
            pertoken_scale=dynamic_scale,
            bias=None,
            output_dtype=original_dtype,
        )
        up_x = torch_npu.npu_quant_matmul(
            quant_out,
            self.orig_module.up_proj.weight,
            self.orig_module.up_proj.weight_scale,
            pertoken_scale=dynamic_scale,
            bias=None,
            output_dtype=original_dtype,
        )
        
        down_x = torch.nn.functional.silu(gate_x) * up_x
        
        down_quant_out, down_dynamic_scale = torch_npu.npu_dynamic_quant(down_x)
        down_dynamic_scale = down_dynamic_scale.view(-1)
        
        down_proj = torch_npu.npu_quant_matmul(
            down_quant_out,
            self.orig_module.down_proj.weight,
            self.orig_module.down_proj.weight_scale,
            pertoken_scale=down_dynamic_scale,
            bias=None,
            output_dtype=original_dtype,
        )
        down_proj = down_proj.reshape(x.shape)
        return down_proj

================================================
FILE: archive/ktransformers/operators/attention.py
================================================
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import torch
from torch import nn
import warnings
import torch.nn.functional as F
from ktransformers.operators.models import KLlamaModel
from ktransformers.models.configuration_deepseek import DeepseekV2Config
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.models.modeling_llama import LlamaRotaryEmbedding
from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeAttention
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import get_compute_capability
import logging
from transformers.configuration_utils import PretrainedConfig
from transformers.cache_utils import Cache
from ktransformers.util.vendors import device_manager, get_device, to_device, GPUVendor

try:
    from flash_attn import flash_attn_func
except:
    pass
from ktransformers.operators.triton_attention import decode_attention_fwd_grouped 
from ktransformers.operators.triton_attention_prefill import context_attention_fwd
import os
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
if flashinfer_enabled:
    from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton
    from flashinfer.mla import BatchMLAPagedAttentionWrapper
from ktransformers.models.custom_cache import KDeepSeekV3Cache
logger = logging.getLogger("attention")

# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

# V3 MLA is same to V2
class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
    """Multi-headed attention from 'Attention Is All You Need' paper"""
    attn_mask: Optional[torch.Tensor] = None

    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 absorb_for_prefill: bool = False,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
        self.mla_wrapper = None
        self.absorb_for_prefill = absorb_for_prefill

    def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
        if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
            kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
            self.q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
            self.out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
            
        return self.q_absorb, self.out_absorb

    def forward_chunck(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()
        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )
        # q_nope [bsz, self.num_heads, q_len, self.qk_nope_head_dim]
        # q_pe [bsz, self.num_heads, q_len, self.qk_rope_head_dim]

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        compressed_kv = self.kv_a_layernorm(compressed_kv)
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)

        kv_seq_len = k_pe.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since transformer version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(q_pe, position_ids)
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            
            # compressed_kv [bsz, q_len, self.kv_lora_rank]
            # k_pe [bsz, 1, q_len, self.qk_rope_head_dim]
            k_pe = k_pe.transpose(1,2)
            compressed_kv = compressed_kv.unsqueeze(2)
            compressed_kv_with_k_pe, _ = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
            compressed_kv, k_pe = torch.split(
                compressed_kv_with_k_pe, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
            )
            # k_pe [pages, page_size, 1, self.qk_rope_head_dim]
            # compressed_kv [pages, page_size, 1, self.kv_lora_rank]
            
        q_absorb, out_absorb = self.get_absorbed()

        # q_nope [bsz, self.num_heads, q_len, self.qk_nope_head_dim]
        # q_pe [bsz, self.num_heads, q_len, self.qk_rope_head_dim]
        k_pe = k_pe.view(bsz, 1, -1, self.qk_rope_head_dim)[:,:,:attention_mask.size(-1),:]
        compressed_kv = compressed_kv.view(bsz, 1, -1, self.kv_lora_rank)[:,:,:attention_mask.size(-1),:]
        # k_pe [bsz, 1, cache_len, self.qk_rope_head_dim]
        # compressed_kv [bsz, 1, cache_len,self.kv_lora_rank]
        q_nope = torch.matmul(q_nope, q_absorb)
        #print(q_pe.shape)
        #print(k_pe.shape)
        #print(q_nope.shape)
        #print(compressed_kv.shape)
        
        attn_weights = (torch.matmul(q_pe, k_pe.mT) + torch.matmul(q_nope, compressed_kv.mT)) * self.softmax_scale
        
        #attn_weights [bsz, self.num_heads, q_len, kv_seq_len]
        compressed_kv = compressed_kv.squeeze(1)
        """
        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )
        assert attention_mask is not None
        """
        if attention_mask is not None:
            """
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            """
            #causal_mask = attention_mask[:, :, :, : kv_seq_len]
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(q_pe.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.attention_dropout, training=self.training
        )
        
        attn_output = torch.einsum('bhql,blc->bhqc', attn_weights, compressed_kv)
        
        attn_output = torch.matmul(attn_output, out_absorb.mT) 

        if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        
        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value

    def forward_linux_triton(
            self,
            hidden_states: torch.Tensor,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            past_key_value: Optional[Cache] = None,
            output_attentions: bool = False,
            use_cache: bool = False,
            cache_position: Optional[torch.LongTensor] = None,
            **kwargs,
        ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:

        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        compressed_kv = self.kv_a_layernorm(compressed_kv)
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim)
        compressed_kv = compressed_kv.view(bsz, q_len, 1, self.kv_lora_rank)

        kv_seq_len = q_len
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since transformer version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        
        cos, sin = self.rotary_emb(q_pe, position_ids)
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, unsqueeze_dim=2)
        # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim] k_pe [bsz, q_len, 1, self.qk_rope_head_dim]
        
        # decode
        if q_len == 1:
            if past_key_value is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
                compressed_kv_with_k_pe, page_table = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
                compressed_kv = compressed_kv_with_k_pe [:, :, :, :self.kv_lora_rank] # for speed
                # compressed_kv_with_k_pe [bsz, q_len, 1, self.kv_lora_rank + self.qk_rope_head_dim]
                # compressed_kv [bsz, q_len, 1, self.kv_lora_rank]

            # q_nope [bsz, q_len, self.num_heads, self.qk_nope_head_dim]
            # q_absorb [self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank]
            q_absorb, out_absorb = self.get_absorbed()
            q_nope = q_nope.transpose(1, 2) # q_len is 1, no GPU overhead, same below
            q_nope = torch.matmul(q_nope, q_absorb) # batched MM
            q_nope = q_nope.transpose(1, 2)
            #assert q_nope.is_contiguous()
            
            # q_nope [bsz, q_len, self.num_heads, self.kv_lora_rank]
            # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim]
            query_states = torch.cat([q_nope, q_pe], dim=-1)
            
            query_states = query_states.squeeze(1)
            attn_output = torch.zeros_like(q_nope) # [bsz, q_len, self.num_heads, self.kv_lora_rank]
            
            attn_logits = torch.empty(
                    (
                        bsz,
                        self.num_heads,
                        4, #num_kv_splits # follow vLLM, fix it TODO
                        self.kv_lora_rank + 1, 
                    ),
                    dtype=torch.float32,
                    device = attn_output.device
                )

            """
            print("query_states", torch.isnan(query_states).any())
            print("compressed_kv_with_k_pe", torch.isnan(compressed_kv_with_k_pe[:,:,0,:]).any())
            print("compressed_kv", torch.isnan(compressed_kv[:,:,0,:]).any())
            print("position_ids", torch.isnan(position_ids).any())
            """

            # flash attn doesn't support head_dim bigger than 256
            # use triton attention kernel adapted from vLLM and SGLang for MQA
            decode_attention_fwd_grouped(query_states, compressed_kv_with_k_pe, compressed_kv, attn_output,
                             page_table,
                             position_ids.squeeze(0).to(torch.int32)+1, attn_logits,
                             4, #num_kv_splits # follow vLLM, fix it TODO
                             self.softmax_scale,
                             past_key_value.page_size)
            
            # attn_output [bsz, q_len, self.num_heads, self.kv_lora_rank]
            # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
            attn_output = attn_output.transpose(1, 2)
            attn_output = torch.matmul(attn_output, out_absorb.mT)
            attn_output = attn_output.transpose(1, 2)
            
            attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
            attn_output = self.o_proj(attn_output)
            
            #print("attn_output", torch.isnan(attn_output).any())
            return attn_output, None, past_key_value
        else:
            if past_key_value is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
                k_pe.squeeze(0)
                compressed_kv.squeeze(0)
                compressed_kv_with_k_pe, _ = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
                compressed_kv, k_pe = torch.split(
                    compressed_kv_with_k_pe, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
                )
            k_pe = k_pe.view(bsz, -1, self.qk_rope_head_dim)
            k_pe = k_pe[:, :kv_seq_len]
            compressed_kv = compressed_kv.view(bsz, -1, self.kv_lora_rank)
            compressed_kv = compressed_kv[:, :kv_seq_len]
            kv = (
                self.kv_b_proj(compressed_kv)
                .view(bsz, kv_seq_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            )
            k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
            query_states = k_pe.new_empty(bsz, q_len, self.num_heads, self.q_head_dim)
            query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
            query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

            key_states = k_pe.new_empty(bsz, kv_seq_len, self.num_heads, self.q_head_dim)
            key_states[:, :, :, :self.qk_nope_head_dim] = k_nope
            key_states[:, :, :, self.qk_nope_head_dim:] = k_pe.view(bsz, kv_seq_len, 1, -1)
            
            value_states = value_states.view(bsz, kv_seq_len, self.num_heads, self.v_head_dim)
            value_states_padded = torch.nn.functional.pad(value_states, [0, query_states.shape[-1] - value_states.shape[-1]], value=0)

            attn_output = flash_attn_func(
                query_states,
                key_states,
                value_states_padded,
                softmax_scale=self.softmax_scale,
                causal=True,
            )

            if self.q_head_dim != self.v_head_dim:
                attn_output = attn_output[:, :, :, : self.v_head_dim]

            attn_output = attn_output.reshape(
                bsz, q_len, self.num_heads * self.v_head_dim
            ).contiguous()
            attn_output = self.o_proj(attn_output)
            return attn_output, None, past_key_value

    def forward_linux_flashinfer(
            self,
            hidden_states: torch.Tensor,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            past_key_value: Optional[Cache] = None,
            output_attentions: bool = False,
            use_cache: bool = False,
            cache_position: Optional[torch.Tensor] = None,
            **kwargs,
        ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:

        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        compressed_kv = self.kv_a_layernorm(compressed_kv)
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim)
        compressed_kv = compressed_kv.view(bsz, q_len, 1, self.kv_lora_rank)

        kv_seq_len = q_len
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version transformer verision v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        
        cos, sin = self.rotary_emb(q_pe, position_ids)
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, unsqueeze_dim=2)
        # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim] k_pe [bsz, q_len, 1, self.qk_rope_head_dim]
        
        # decode
        if q_len == 1 or self.absorb_for_prefill:
            if past_key_value is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
                compressed_kv_with_k_pe, page_table = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
                compressed_kv = compressed_kv_with_k_pe [:, :, :, :self.kv_lora_rank].view(-1, past_key_value.page_size, self.kv_lora_rank)
                k_pe = compressed_kv_with_k_pe [:, :, :, self.kv_lora_rank:].view(-1, past_key_value.page_size, self.qk_rope_head_dim)
                # k_pe [max_pages, page_size, self.qk_rope_head_dim]
                # compressed_kv [max_pages, page_size, self.kv_lora_rank]

            # q_nope [bsz, q_len, self.num_heads, self.qk_nope_head_dim]
            # q_absorb [self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank]
            q_absorb, out_absorb = self.get_absorbed()
            q_nope = q_nope.transpose(1, 2) # q_len is 1, no GPU overhead, same below
            q_nope = torch.matmul(q_nope, q_absorb) # batched MM
            q_nope = q_nope.transpose(1, 2)
            q_nope = q_nope.contiguous()
            #assert q_nope.is_contiguous()
            
            # q_nope [bsz, q_len, self.num_heads, self.kv_lora_rank]
            # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim]
            q_nope.squeeze_(0)
            q_pe.squeeze_(0)

            # flash attn doesn't support head_dim bigger than 256, use flashinfer
            if self.mla_wrapper is None:
                self.mla_wrapper = MLAWrapperSingleton.get_instance(self.device, 1, past_key_value.max_pages, use_cuda_graph = True)
            if self.mla_wrapper.need_plan:
                self.mla_wrapper.need_plan = False
                if q_len == 1:
                    self.mla_wrapper.plan(None,None,None,
                                        position_ids.squeeze(1)+1,
                                        None,
                                        self.num_heads,
                                        self.kv_lora_rank,
                                        self.qk_rope_head_dim,
                                        past_key_value.page_size,
                                        self.softmax_scale,
                                        q_nope.dtype,
                                        compressed_kv.dtype)
                else:
                    qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device=self.device)
                    kv_len_arr = torch.tensor([position_ids[0, -1].item()+1], dtype=torch.int32, device=self.device)
                    self.mla_wrapper.plan(qo_indptr,None,None,
                                        kv_len_arr,
                                        None,
                                        self.num_heads,
                                        self.kv_lora_rank,
                                        self.qk_rope_head_dim,
                                        past_key_value.page_size,
                                        self.softmax_scale,
                                        q_nope.dtype,
                                        compressed_kv.dtype)
            attn_output = self.mla_wrapper.run(q_nope, q_pe, compressed_kv, k_pe).view(bsz, q_len, self.num_heads, self.kv_lora_rank)
            """
            k = (
                torch.cat([compressed_kv, k_pe], dim=-1)
                .view(-1, 1, 512 + 64)
                .repeat_interleave(self.num_heads, dim=1)
            )
            v = compressed_kv.view(-1, 1, 512).repeat_interleave(self.num_heads, dim=1)
            lens = position_ids.item() + 1
            #print("lens", lens)
            attn_ref, lse_ref = attention_ref(
                1,
                torch.cat([q_nope, q_pe], dim=-1),
                k[:lens],
                v[:lens],
                False,
                self.softmax_scale
            )
            attn_output = attn_ref.view(bsz, q_len, self.num_heads, self.kv_lora_rank)
            """
            
            # mla_wrapper run output: [tokens, self.num_heads, self.kv_lora_rank]
            # attn_output [bsz, q_len, self.num_heads, self.kv_lora_rank]
            # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
            attn_output = attn_output.transpose(1, 2) # [bsz, self.num_heads, q_len, self.kv_lora_rank]
            attn_output = torch.matmul(attn_output, out_absorb.mT) # [bsz, self.num_heads, q_len, self.v_head_dim]
            attn_output = attn_output.transpose(1, 2).contiguous() # [bsz, q_len, self.num_heads, self.kv_lora_rank]
            
            attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) # [bsz, q_len, self.num_heads * self.v_head_dim]
            attn_output = self.o_proj(attn_output)
            
            return attn_output, None, past_key_value
        else:
            if past_key_value is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
                k_pe.squeeze(0)
                compressed_kv.squeeze(0)
                compressed_kv_with_k_pe, _ = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
                compressed_kv, k_pe = torch.split(
                    compressed_kv_with_k_pe, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
                )
            k_pe = k_pe.view(bsz, -1, self.qk_rope_head_dim)
            k_pe = k_pe[:, :kv_seq_len]
            compressed_kv = compressed_kv.view(bsz, -1, self.kv_lora_rank)
            compressed_kv = compressed_kv[:, :kv_seq_len]
            kv = (
                self.kv_b_proj(compressed_kv)
                .view(bsz, kv_seq_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            )
            k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
            query_states = k_pe.new_empty(bsz, q_len, self.num_heads, self.q_head_dim)
            query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
            query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

            key_states = k_pe.new_empty(bsz, kv_seq_len, self.num_heads, self.q_head_dim)
            key_states[:, :, :, :self.qk_nope_head_dim] = k_nope
            key_states[:, :, :, self.qk_nope_head_dim:] = k_pe.view(bsz, kv_seq_len, 1, -1)
            
            value_states = value_states.view(bsz, kv_seq_len, self.num_heads, self.v_head_dim)
            value_states_padded = torch.nn.functional.pad(value_states, [0, query_states.shape[-1] - value_states.shape[-1]], value=0)

            attn_output = flash_attn_func(
                query_states,
                key_states,
                value_states_padded,
                softmax_scale=self.softmax_scale,
                causal=True,
            )

            if self.q_head_dim != self.v_head_dim:
                attn_output = attn_output[:, :, :, : self.v_head_dim]

            attn_output = attn_output.reshape(
                bsz, q_len, self.num_heads * self.v_head_dim
            ).contiguous()
            attn_output = self.o_proj(attn_output)
            return attn_output, None, past_key_value
        
    def forward_windows(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        bsz, q_len, _ = hidden_states.size()

        if q_len <= self.chunck_size:
            return self.forward_chunck(
                            hidden_states,
                            attention_mask,
                            position_ids,
                            past_key_value,
                            output_attentions,
                            use_cache,
                            cache_position,
                            **kwargs
                        )

        assert output_attentions == False, "output_attentions is not supported when using chunked attention"
        attn_output = None
        cur_idx = 0
        while cur_idx < q_len:
            if attention_mask is not None:
                chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + self.chunck_size, q_len), ...]
            else:
                # generate chunk_mask automatically.
                self.attn_mask = \
                    torch.zeros(1, 1, self.chunck_size, past_key_value.max_cache_len, device=hidden_states.device) \
                        if self.attn_mask is None \
                            else self.attn_mask
                self.attn_mask[:, :, :, cur_idx:min(cur_idx+self.chunck_size, past_key_value.max_cache_len)] = \
                    -1e+38 * torch.triu(torch.ones(self.chunck_size, self.chunck_size, device=hidden_states.device), diagonal=1)\
                        [:,:min(self.chunck_size, min(past_key_value.max_cache_len-cur_idx, self.chunck_size))]
                self.attn_mask[:, :, :, cur_idx+self.chunck_size:] = -1e+38
                self.attn_mask[:, :, :, :cur_idx] = 0
                chunk_mask = torch.narrow(self.attn_mask, 2, 0, min(self.chunck_size, q_len-cur_idx))

            cur_output, _, _ = self.forward_chunck(
                            hidden_states[:, cur_idx:min(cur_idx + self.chunck_size, q_len), ...],
                            chunk_mask,
                            position_ids[:, cur_idx:min(cur_idx + self.chunck_size, q_len)],
                            past_key_value,
                            output_attentions,
                            use_cache,
                            cache_position[cur_idx:min(cur_idx + self.chunck_size, q_len)],
                            **kwargs
                        )
            cur_idx += self.chunck_size
            if attn_output is None:
                attn_output = cur_output
            else:
                attn_output = torch.cat((attn_output, cur_output), dim=-2)
                
        return attn_output, None, past_key_value

    def forward_xpu(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        query_states = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        position_embeddings = kwargs.get("position_embeddings", None)
        if position_embeddings is not None:
            cos, sin = position_embeddings
            key_states = torch.cat(
                [k_nope, k_pe.expand([-1, self.num_heads, -1, -1])],
                dim=-1
            )
            from ipex_llm.transformers.models.common import rotary_two_with_cache_inplaced
            rotary_two_with_cache_inplaced(query_states[:, :, :, self.qk_nope_head_dim :],
                                           key_states[:, :, :, self.qk_nope_head_dim:],
                                           cos, sin, True)
        else:
            q_nope, q_pe = torch.split(
                query_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
            )
            cos, sin = self.rotary_emb(q_pe, position_ids)
            q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin)
            query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
            query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
            query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

            key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
            key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
            key_states[:, :, :, self.qk_nope_head_dim :] = k_pe

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states.half(), value_states.half(), self.layer_idx, cache_kwargs
            )

        attn_weights = None
        from ipex_llm.transformers.models.common import scaled_dot_product_attention
        attn_output = scaled_dot_product_attention(
            query_states.half(), key_states, value_states,
            attention_mask.half(), q_len == kv_seq_len, self.softmax_scale
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
        attn_output = self.o_proj(attn_output).to(hidden_states.dtype)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if torch.xpu.is_available():
            return self.forward_xpu(
                hidden_states,
                attention_mask,
                position_ids,
                past_key_value,
                output_attentions,
                use_cache,
                cache_position,
                **kwargs,
            )
        elif (os.name == 'nt'
              or get_compute_capability() < 8
              or hidden_states.device.type == 'cpu'
              or device_manager.gpu_vendor != GPUVendor.NVIDIA):
            return self.forward_windows(
                hidden_states,
                attention_mask,
                position_ids,
                past_key_value,
                output_attentions,
                use_cache,
                cache_position,
                **kwargs,
            )
        else:
            if flashinfer_enabled:
                return self.forward_linux_flashinfer(
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_value,
                    output_attentions,
                    use_cache,
                    cache_position,
                    **kwargs,
                )
            else:
                return self.forward_linux_triton(
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_value,
                    output_attentions,
                    use_cache,
                    cache_position,
                    **kwargs,
                )


class KLlamaAttention(BaseInjectedModule):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
    def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
        """Applies Rotary Position Embedding to the query and key tensors.

        Args:
            q (`torch.Tensor`): The query tensor.
            k (`torch.Tensor`): The key tensor.
            cos (`torch.Tensor`): The cosine part of the rotary embedding.
            sin (`torch.Tensor`): The sine part of the rotary embedding.
            position_ids (`torch.Tensor`, *optional*):
                Deprecated and unused.
            unsqueeze_dim (`int`, *optional*, defaults to 1):
                The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
                sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
                that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
                k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
                cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
                the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
        Returns:
            `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
        """
        cos = cos.unsqueeze(unsqueeze_dim)
        sin = sin.unsqueeze(unsqueeze_dim)
        q_embed = (q * cos) + (rotate_half(q) * sin)
        k_embed = (k * cos) + (rotate_half(k) * sin)
        return q_embed, k_embed
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.45
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        if self.config.pretraining_tp > 1:
            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
            query_slices = self.q_proj.weight.split(
                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
            )
            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
            query_states = torch.cat(query_states, dim=-1)

            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
            key_states = torch.cat(key_states, dim=-1)

            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
            value_states = torch.cat(value_states, dim=-1)

        else:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if position_embeddings is None:

            logger.warning(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = self.apply_rotary_pos_emb(query_states, key_states, cos, sin)
        if q_len == 1:
            position_ids = position_ids[0][-1].unsqueeze(0).unsqueeze(0)
            query_states = query_states[:, :, -1:]
            key_states = key_states[:, :, -1:]

        attn_output = KLlamaModel.dynamic_sdpa.apply(
            self.layer_idx,
            bsz,
            position_ids[0][0],
            query_states.transpose(1, 2).to(torch.float16),
            key_states.transpose(1, 2).to(torch.float16),
            value_states.transpose(1, 2).to(torch.float16),
            mode="prefill" if q_len > 1 else "generate",
        )


        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, -1)

        if self.config.pretraining_tp > 1:
            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
        else:
            attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


class KQwen3MoeAttentionIPEXLLM(BaseInjectedModule, Qwen3MoeAttention):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "xpu",
                 generate_device: str = "xpu",
                 chunck_size: int = 1000,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
        assert prefill_device.lower()[:3] == "xpu", "KQwen3MoeAttentionIPEXLLM only supports XPU device"
        assert generate_device.lower()[:3] == "xpu", "KQwen3MoeAttentionIPEXLLM only supports XPU device"

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_ids: Optional[torch.Tensor],
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        bsz, q_len, _ = hidden_states.size()
        input_dtype = hidden_states.dtype
        hidden_shape = (*input_shape, -1, self.head_dim)

        if not hasattr(self, 'qkv_proj'):
            from ipex_llm.transformers.models.common import merge_quantized_qkv
            merge_quantized_qkv(self.q_proj.generate_linear, self.k_proj.generate_linear, self.v_proj.generate_linear, self.orig_module)

        qkv = self.qkv_proj(hidden_states)
        qkv = qkv.view(bsz, q_len, -1, self.head_dim)
        qkv = qkv.transpose(1, 2)
        query_states, key_states, value_states = qkv.split([self.config.num_attention_heads,
                                                            self.config.num_key_value_heads,
                                                            self.config.num_key_value_heads], dim=1)
        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        if position_embeddings is None:
            position_embeddings = self.rotary_emb(hidden_states, position_ids)

        cos, sin = position_embeddings

        from ipex_llm.transformers.models.common import rotary_half_with_cache_inplaced
        rotary_half_with_cache_inplaced(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states.half(), value_states.half(),
                                                             self.layer_idx, cache_kwargs)

        attn_weights = None
        from ipex_llm.transformers.models.common import scaled_dot_product_attention
        attn_output = scaled_dot_product_attention(
            query_states.half(), key_states, value_states,
            attention_mask.half(), q_len == key_states.size(2), self.scaling
        )
        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output).to(input_dtype)
        return attn_output, attn_weights


================================================
FILE: archive/ktransformers/operators/balance_serve_attention.py
================================================
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.2.5
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import torch
from torch import nn
from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeAttention
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeAttention
from ktransformers.models.modeling_smallthinker import SmallthinkerAttention
from ktransformers.models.modeling_glm4_moe import Glm4MoeAttention
from ktransformers.models.modeling_qwen3_next import Qwen3NextGatedDeltaNet
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader
import logging
from transformers.configuration_utils import PretrainedConfig
from flashinfer import BatchMLAPagedAttentionWrapper
from ktransformers.operators.flashinfer_batch_prefill_wrapper import flashInferAttn
from ktransformers.models.custom_cache import KDeepSeekV3Cache, KGQACache
logger = logging.getLogger("attention")

# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

class flashinfer_attn(BaseInjectedModule, DeepseekV2Attention):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.


    def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
        if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
            kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
            q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].reshape(-1, self.kv_lora_rank)
            out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].reshape(-1, self.kv_lora_rank)
            self.q_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.qk_nope_head_dim, 
                                      bias=False, dtype=q_absorb.dtype, device=q_absorb.device)
            self.q_absorb.weight.data = q_absorb
            self.out_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.v_head_dim, 
                                        bias=False, dtype=out_absorb.dtype, device=out_absorb.device)
            self.out_absorb.weight.data = out_absorb
            #del self.orig_module.kv_b_proj
        q_absorb = self.q_absorb.weight.view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
        out_absorb = self.out_absorb.weight.view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
        return q_absorb, out_absorb
    

    def forward(self,
                hidden_states: torch.Tensor,
                kv_cache: KDeepSeekV3Cache,
                position_ids: torch.Tensor,
                wrapper: BatchMLAPagedAttentionWrapper,
                num_tokens_tensors: torch.Tensor,
                page_idx: torch.Tensor,
                page_offset: torch.Tensor,
                ):
        q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states, num_tokens_tensors)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states, num_tokens_tensors), num_tokens_tensors), num_tokens_tensors)
        q = q.view(q_len, self.num_heads, self.q_head_dim)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states, num_tokens_tensors)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        compressed_kv = compressed_kv.contiguous()
        compressed_kv = self.kv_a_layernorm(compressed_kv, num_tokens_tensors)
        k_pe = k_pe.view(q_len, 1, self.qk_rope_head_dim)
        compressed_kv = compressed_kv.view(q_len, 1, self.kv_lora_rank)
        
        cos, sin = self.rotary_emb(q_pe, position_ids.unsqueeze(0))
        q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=2)
        q_pe = q_pe.squeeze(0)
        if kv_cache is not None:
            
            # page_idx, page_offset = kv_cache.get_page_table(position_ids, q_indptr, kv_indptr, kv_indices)
            cache_kwargs = {"sin": sin, "cos": cos, "page_idx": page_idx, "page_offset": page_offset}  # Specific to RoPE models
            compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, self.layer_idx, page_idx, page_offset, cache_kwargs)
            compressed_kv = compressed_kv_with_k_pe [:, :, :, :self.kv_lora_rank].view(-1, kv_cache.page_size, self.kv_lora_rank)
            k_pe = compressed_kv_with_k_pe [:, :, :, self.kv_lora_rank:].view(-1, kv_cache.page_size, self.qk_rope_head_dim)
            
        q_absorb, out_absorb = self.get_absorbed()
        q_nope = q_nope.transpose(0, 1) # q_len is 1, no GPU overhead, same below
        q_nope = torch.matmul(q_nope, q_absorb) # batched MM
        q_nope = q_nope.transpose(0, 1)
        # q_nope.squeeze_(1)
        # q_pe.squeeze_(1)

        attn_output = wrapper.run(q_nope, q_pe, compressed_kv, k_pe).view(q_len, self.num_heads, self.kv_lora_rank)
        attn_output = attn_output.transpose(0, 1)
        attn_output = torch.matmul(attn_output, out_absorb.mT) # [self.num_heads, q_len, self.v_head_dim]
        attn_output = attn_output.transpose(0, 1)
        attn_output = attn_output.reshape(q_len, self.num_heads * self.v_head_dim)
        attn_output = self.o_proj(attn_output, num_tokens_tensors)
        return attn_output

class KQwen2MoeAttention(BaseInjectedModule, Qwen2MoeAttention):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.


    # Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
    def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
        """Applies Rotary Position Embedding to the query and key tensors.

        Args:
            q (`torch.Tensor`): The query tensor.
            k (`torch.Tensor`): The key tensor.
            cos (`torch.Tensor`): The cosine part of the rotary embedding.
            sin (`torch.Tensor`): The sine part of the rotary embedding.
            position_ids (`torch.Tensor`):
                Deprecated and unused.
            unsqueeze_dim (`int`, *optional*, defaults to 1):
                The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
                sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
                that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
                k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
                cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
                the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
        Returns:
            `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
        """
        cos = cos.unsqueeze(unsqueeze_dim)
        sin = sin.unsqueeze(unsqueeze_dim)
        q_embed = (q * cos) + (rotate_half(q) * sin)
        k_embed = (k * cos) + (rotate_half(k) * sin)
        return q_embed, k_embed


    def forward(self,
                hidden_states: torch.Tensor,
                kv_cache: KGQACache,
                position_ids: torch.Tensor,
                wrapper: flashInferAttn,
                bsz_tensors: torch.Tensor,
                page_idx: torch.Tensor,
                page_offset: torch.Tensor,
                ):
        q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states, bsz_tensors)
        key_states = self.k_proj(hidden_states, bsz_tensors)
        value_states = self.v_proj(hidden_states, bsz_tensors)


        query_states = query_states.view(q_len, self.num_heads, self.head_dim)
        key_states = key_states.view(q_len, self.num_key_value_heads, self.head_dim)
        value_states = value_states.view(q_len, self.num_key_value_heads, self.head_dim)
        
        cos, sin = self.rotary_emb(value_states.unsqueeze(0), position_ids.unsqueeze(0))
        query_states, key_states = self.apply_rotary_pos_emb(query_states.unsqueeze(0), key_states.unsqueeze(0), cos, sin, unsqueeze_dim=2)

        query_states = query_states.view(q_len, self.num_heads, self.head_dim)
        key_states = key_states.view(
            q_len, self.num_key_value_heads, self.head_dim
        )
        value_states = value_states.view(
            q_len, self.num_key_value_heads, self.head_dim
        )

        k_cache = kv_cache.get_k_cache(self.layer_idx)
        v_cache = kv_cache.get_v_cache(self.layer_idx)


        attn_output = wrapper.forward(query_states, k_cache, v_cache, key_states, value_states)
  

        attn_output = self.o_proj(attn_output.view(q_len, self.num_heads * self.head_dim), bsz_tensors)

        return attn_output

class KQwen3MoeAttention(BaseInjectedModule, Qwen3MoeAttention):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.


    # Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
    def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
        """Applies Rotary Position Embedding to the query and key tensors.

        Args:
            q (`torch.Tensor`): The query tensor.
            k (`torch.Tensor`): The key tensor.
            cos (`torch.Tensor`): The cosine part of the rotary embedding.
            sin (`torch.Tensor`): The sine part of the rotary embedding.
            position_ids (`torch.Tensor`):
                Deprecated and unused.
            unsqueeze_dim (`int`, *optional*, defaults to 1):
                The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
                sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
                that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
                k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
                cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
                the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
        Returns:
            `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
        """
        cos = cos.unsqueeze(unsqueeze_dim)
        sin = sin.unsqueeze(unsqueeze_dim)
        q_embed = (q * cos) + (rotate_half(q) * sin)
        k_embed = (k * cos) + (rotate_half(k) * sin)
        return q_embed, k_embed


    def forward(self,
                hidden_states: torch.Tensor,
                kv_cache: KGQACache,
                position_ids: torch.Tensor,
                wrapper: flashInferAttn,
                bsz_tensors: torch.Tensor,
                page_idx: torch.Tensor,
                page_offset: torch.Tensor,
                ):
        q_len, _ = hidden_states.size()

        bsz_tensors_q = bsz_tensors * self.num_heads
        bsz_tensors_kv = bsz_tensors * self.num_key_value_heads

        query_states = self.q_norm(self.q_proj(hidden_states, bsz_tensors), bsz_tensors_q)
        key_states = self.k_norm(self.k_proj(hidden_states, bsz_tensors), bsz_tensors_kv)
        value_states = self.v_proj(hidden_states, bsz_tensors)


        query_states = query_states.view(q_len, self.num_heads, self.head_dim)
        key_states = key_states.view(q_len, self.num_key_value_heads, self.head_dim)
        value_states = value_states.view(q_len, self.num_key_value_heads, self.head_dim)
        
        cos, sin = self.rotary_emb(value_states.unsqueeze(0), position_ids.unsqueeze(0))
        query_states, key_states = self.apply_rotary_pos_emb(query_states.unsqueeze(0), key_states.unsqueeze(0), cos, sin, unsqueeze_dim=2)

        query_states = query_states.view(q_len, self.num_heads, self.head_dim)
        key_states = key_states.view(
            q_len, self.num_key_value_heads, self.head_dim
        )
        value_states = value_states.view(
            q_len, self.num_key_value_heads, self.head_dim
        )

        k_cache = kv_cache.get_k_cache(self.layer_idx)
        v_cache = kv_cache.get_v_cache(self.layer_idx)


        attn_output = wrapper.forward(query_states, k_cache, v_cache, key_states, value_states)
  

        attn_output = self.o_proj(attn_output.view(q_len, self.num_heads * self.head_dim), bsz_tensors)

        return attn_output


class deepseek_torch_attn(BaseInjectedModule, DeepseekV2Attention):
    def __init__(self,
                    key: str,
                    gguf_loader : GGUFLoader,
                    config: PretrainedConfig,
                    orig_module: nn.Module,
                    prefill_device: str = "cuda",
                    generate_device: str = "cuda",
                    chunck_size: int = 1000,
                    **kwargs):
            BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
            self.orig_module.__init__(orig_module.config,
                orig_module.layer_idx)
            self.chunck_size = chunck_size # TODO, generate chunck_size automatically.


    def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
        if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
            kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
            q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].reshape(-1, self.kv_lora_rank)
            out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].reshape(-1, self.kv_lora_rank)
            self.q_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.qk_nope_head_dim, 
                                    bias=False, dtype=q_absorb.dtype, device=q_absorb.device)
            self.q_absorb.weight.data = q_absorb
            self.out_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.v_head_dim, 
                                        bias=False, dtype=out_absorb.dtype, device=out_absorb.device)
            self.out_absorb.weight.data = out_absorb
            #del self.orig_module.kv_b_proj
        q_absorb = self.q_absorb.weight.view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
        out_absorb = self.out_absorb.weight.view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
        return q_absorb, out_absorb
    

    def forward(self,
                hidden_states: torch.Tensor,
                kv_cache: KDeepSeekV3Cache,
                position_ids: torch.Tensor,
                wrapper: None,
                num_tokens_tensors: torch.Tensor,
                page_idx: torch.Tensor,
                page_offset: torch.Tensor,
                attention_masks: Optional[list[torch.Tensor]] = None,
                q_indptr: Optional[torch.Tensor] = None,
                kv_indices: Optional[torch.Tensor] = None,
                kv_indptr: Optional[torch.Tensor] = None,
                bsz_tensors: Optional[torch.Tensor] = None,
                last_page_len: Optional[torch.Tensor] = None,
                ):
        # range bsz_tensors
        final_attention_output = torch.tensor([], device=hidden_states.device)
        for i in range(bsz_tensors[0]):
            batch_num_tokens_tensors = q_indptr[i+1] - q_indptr[i]
            batch_last_page_len = last_page_len[i]
            # kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe
            batch_page_idx = page_idx[q_indptr[i]:q_indptr[i+1]]
            batch_page_offset = page_offset[q_indptr[i]:q_indptr[i+1]]
            # kv_page_nums is the number of pages for the current batch
            kv_page_nums = kv_indptr[i+1] - kv_indptr[i]
            # kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm)
            kv_total_len = kv_page_nums * kv_cache.page_size
            if batch_last_page_len is not None:
                kv_total_len = kv_total_len - (kv_cache.page_size - batch_last_page_len)
            # print(f"kv_total_len's shape {kv_total_len.shape}")
            # kv_index is the index of the kv cache pages for the current batch
            kv_index = kv_indices[kv_indptr[i]:kv_indptr[i+1]]
            # we can index [kv_index, page_offset_indices] to get the kv cache for the current batch
            # from q_indptr[i] to q_indptr[i+1] is the range of the current batch
            batch_hidden_states = hidden_states[q_indptr[i]:q_indptr[i+1]]
            batch_position_ids = position_ids[q_indptr[i]:q_indptr[i+1]]
            q_len, _ = batch_hidden_states.size()
            # print("q_len -> ", q_len)

            if self.q_lora_rank is None:
                q = self.q_proj(batch_hidden_states, batch_num_tokens_tensors)
            else:
                q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(batch_hidden_states, batch_num_tokens_tensors), batch_num_tokens_tensors), batch_num_tokens_tensors)
            # for v3, bsz, q_len, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
            q = q.view(q_len, self.num_heads, self.q_head_dim)
            # q_nope is [q_len, num_heads(128), qk_nope_head_dim(128)]
            # q_pe is [q_len, num_heads(128), qk_rope_head_dim(64)]
            q_nope, q_pe = torch.split(
                q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
            )
            # compressed_kv is [q_len, kv_lora_rank(512) + rope(64)]
            compressed_kv = self.kv_a_proj_with_mqa(batch_hidden_states, batch_num_tokens_tensors)
            # compressed_kv is [q_len, kv_lora_rank(512)], k_pe is [q_len, rope(64)]
            compressed_kv, k_pe = torch.split(
                compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
            )
            compressed_kv = compressed_kv.contiguous()
            compressed_kv = self.kv_a_layernorm(compressed_kv, batch_num_tokens_tensors)
            # k_pe is [q_len, 1, qk_rope_head_dim(64)]
            k_pe = k_pe.view(q_len, 1, self.qk_rope_head_dim)
            # compressed_kv is [q_len, 1, kv_lora_rank(512)]
            compressed_kv = compressed_kv.view(q_len, 1, self.kv_lora_rank)
            
            cos, sin = self.rotary_emb(q_pe, batch_position_ids.unsqueeze(0))
            # print(f"q_pe shape{q_pe.shape}, k_pe shape {k_pe.shape}")
            q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=2)
            q_pe = q_pe.squeeze(0)
            # q_pe is [num_heads(128), q_len, qk_rope_head_dim(64)]
            q_pe.transpose_(0, 1)            
            if kv_cache is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset}  # Specific to RoPE models
                compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, self.layer_idx, batch_page_idx, batch_page_offset, cache_kwargs)
                compressed_kv = compressed_kv_with_k_pe [:, :, :, :self.kv_lora_rank].view(-1, kv_cache.page_size, self.kv_lora_rank)
                k_pe = compressed_kv_with_k_pe [:, :, :, self.kv_lora_rank:].view(-1, kv_cache.page_size, self.qk_rope_head_dim)
            # q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
            # out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
            q_absorb, out_absorb = self.get_absorbed()
            # q_nope is [num_heads(128), q_len, qk_nope_head_dim(128)]
            q_nope = q_nope.transpose(0, 1) # q_len is 1, no GPU overhead, same below
            # q_nope is [num_heads(128), q_len, kv_lora_rank(512)]
            q_nope = torch.matmul(q_nope, q_absorb) # batched MM

            # # q_nope is [q_len, num_heads(128), kv_lora_rank(512)]
            # q_nope = q_nope.transpose(0, 1)

            # we need to index out the compressed_kv and k_pe for the current batch
            batch_compressed_kv = None
            batch_k_pe = None
            for page_index in kv_index:
                if kv_total_len > kv_cache.page_size:
                    tmp_compressed_kv = compressed_kv[page_index, 0:kv_cache.page_size, :]
                    tmp_k_pe = k_pe[page_index, 0:kv_cache.page_size, :]
                    if batch_compressed_kv is None or batch_k_pe is None:
                        batch_compressed_kv = tmp_compressed_kv
                        batch_k_pe = tmp_k_pe
                    else: 
                        batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
                        batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
                    kv_total_len -= kv_cache.page_size
                else:
                    tmp_compressed_kv = compressed_kv[page_index, 0:kv_total_len, :]
                    tmp_k_pe = k_pe[page_index, 0:kv_total_len, :]
                    if batch_compressed_kv is None or batch_k_pe is None:
                        batch_compressed_kv = tmp_compressed_kv
                        batch_k_pe = tmp_k_pe
                    else: 
                        batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
                        batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
                    break
            # batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
            # batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
            attention_weights = (torch.matmul(q_pe,batch_k_pe.mT) + torch.matmul(q_nope, batch_compressed_kv.mT)) * self.softmax_scale
            # attention_weights is [num_heads(128), q_len, k_len]
            
            # attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(q_len,-1,-1).transpose(0,1)
            
            # attention_masks[i] is [q_len, k_len]
            
            attention_weights = (attention_weights + attention_masks[i])
            # attention_weights shape is [num_heads(128), q_len, k_len]
            attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=torch.float32).to(q_pe.dtype)
            attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),q_len, lora_rank(512)]
            # out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
            out_absorb = out_absorb.transpose(1,2)
            # q for q_len, n for num_heads, h for v_head_dim, v for kv_lora_rank
            attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), q_len, v_head_dim(128)]
            attn_output = attn_output.transpose(0, 1) # [q_len, num_heads(128), v_head_dim(128)]
            attn_output = attn_output.reshape(q_len, self.num_heads * self.v_head_dim)
            attn_output = self.o_proj(attn_output, batch_num_tokens_tensors)
            final_attention_output = torch.cat((final_attention_output, attn_output), dim=0)
        return final_attention_output

class KSmallthinkerAttention(BaseInjectedModule, SmallthinkerAttention):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.

    def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
        """Applies Rotary Position Embedding to the query and key tensors.

        Args:
            q (`torch.Tensor`): The query tensor.
            k (`torch.Tensor`): The key tensor.
            cos (`torch.Tensor`): The cosine part of the rotary embedding.
            sin (`torch.Tensor`): The sine part of the rotary embedding.
            position_ids (`torch.Tensor`, *optional*):
                Deprecated and unused.
            unsqueeze_dim (`int`, *optional*, defaults to 1):
                The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
                sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
                that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
                k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
                cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
                the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
        Returns:
            `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
        """
        cos = cos.unsqueeze(unsqueeze_dim)
        sin = sin.unsqueeze(unsqueeze_dim)
        q_embed = (q * cos) + (rotate_half(q) * sin)
        k_embed = (k * cos) + (rotate_half(k) * sin)
        return q_embed, k_embed

    def forward(self,
                hidden_states: torch.Tensor,
                kv_cache: KGQACache,
                freqs_cis: torch.Tensor,
                wrapper: flashInferAttn,
                bsz_tensors: torch.Tensor,
                position_ids: torch.Tensor = None,
                ):

        if self.use_qk_norm:
            raise NotImplementedError("use_qk_norm is not implemented yet")

        q_len, _ = hidden_states.size()
        query_states = self.q_proj(hidden_states, bsz_tensors)
        key_states = self.k_proj(hidden_states, bsz_tensors)
        value_states = self.v_proj(hidden_states, bsz_tensors)

        query_states = query_states.view(q_len, self.num_attention_heads, self.head_dim)
        key_states = key_states.view(q_len, self.num_key_value_heads, self.head_dim)
        value_states = value_states.view(q_len, self.num_key_value_heads, self.head_dim)
        
        # cos, sin = freqs_cis
        """
        print(query_states.shape)
        print(key_states.shape)
        print(cos.shape)
        print(sin.shape)
        """
        if freqs_cis:  
            cos, sin = freqs_cis
            query_states, key_states = self.apply_rotary_pos_emb(query_states.unsqueeze(0), key_states.unsqueeze(0), cos, sin, unsqueeze_dim=2)


        query_states = query_states.view(q_len, self.num_attention_heads, self.head_dim)
        key_states = key_states.view(q_len, self.num_key_value_heads, self.head_dim)
        value_states = value_states.view(q_len, self.num_key_value_heads, self.head_dim)

        k_cache = kv_cache.get_k_cache(self.layer_idx)
        v_cache = kv_cache.get_v_cache(self.layer_idx)


        attn_output = wrapper.forward(query_states, k_cache, v_cache, key_states, value_states)
  

        attn_output = self.o_proj(attn_output.view(q_len, self.num_attention_heads * self.head_dim), bsz_tensors)

        return attn_output


class KGlm4MoeAttention(BaseInjectedModule, Glm4MoeAttention):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.

    def apply_rotary_pos_emb(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        freqs_cis: Tuple[torch.Tensor, torch.Tensor],
        unsqueeze_dim=2
    ) -> Tuple[torch.Tensor, torch.Tensor]:

        # Keep half or full tensor for later concatenation
        cos = freqs_cis[0]
        sin = freqs_cis[1]
        rotary_dim = cos.shape[-1]

        cos = cos.unsqueeze(unsqueeze_dim)
        sin = sin.unsqueeze(unsqueeze_dim)

        q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
        k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]

        # Apply rotary embeddings on the first half or full tensor
        q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
        k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)

        # Concatenate back to full shape
        q_embed = torch.cat([q_embed, q_pass], dim=-1)
        k_embed = torch.cat([k_embed, k_pass], dim=-1)
        return q_embed, k_embed

    def forward(self,
                hidden_states: torch.Tensor,
                kv_cache: KGQACache,
                freqs_cis: torch.Tensor,
                wrapper: flashInferAttn,
                bsz_tensors: torch.Tensor,
                position_ids: torch.Tensor = None,
                ):

        q_len, _ = hidden_states.size()
        query_states = self.q_proj(hidden_states, bsz_tensors)
        key_states = self.k_proj(hidden_states, bsz_tensors)
        value_states = self.v_proj(hidden_states, bsz_tensors)


        if self.use_qk_norm:
            query_states = self.q_norm(query_states, bsz_tensors)
            key_states = self.k_norm(key_states, bsz_tensors)

        # cos, sin = freqs_cis
        """
        print(query_states.shape)
        print(key_states.shape)
        print(cos.shape)
        print(sin.shape)
        """

        query_states = query_states.view(q_len, self.config.num_attention_heads, self.head_dim)
        key_states = key_states.view(q_len, self.config.num_key_value_heads, self.head_dim)
        value_states = value_states.view(q_len, self.config.num_key_value_heads, self.head_dim)

        if freqs_cis is not None:  
            query_states, key_states = self.apply_rotary_pos_emb(query_states.unsqueeze(0), key_states.unsqueeze(0), freqs_cis)

        query_states = query_states.view(q_len, self.config.num_attention_heads, self.head_dim)
        key_states = key_states.view(q_len, self.config.num_key_value_heads, self.head_dim)
        value_states = value_states.view(q_len, self.config.num_key_value_heads, self.head_dim)


        k_cache = kv_cache.get_k_cache(self.layer_idx)
        v_cache = kv_cache.get_v_cache(self.layer_idx)


        print(f"{k_cache.shape=}, {v_cache.shape=}, {query_states.shape=}, {key_states.shape=}, {value_states.shape=}")
        attn_output = wrapper.forward(query_states, k_cache, v_cache, key_states, value_states)
  

        attn_output = self.o_proj(attn_output.view(q_len, self.config.num_attention_heads * self.head_dim), bsz_tensors)

        return attn_output
    
from ktransformers.models.modeling_qwen3_next import apply_mask_to_padding_states
import torch.nn.functional as F

from ktransformers.models.modeling_qwen3_next import Qwen3NextAttention

class KQwen3NextAttention(BaseInjectedModule, Qwen3NextAttention):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.

    # Adapted from transformers.models.glm.modular_glm.apply_rotary_pos_emb
    def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
        """Applies Rotary Position Embedding to the query and key tensors.

        Removes the interleaving of cos and sin from GLM

        Args:
            q (`torch.Tensor`): The query tensor.
            k (`torch.Tensor`): The key tensor.
            cos (`torch.Tensor`): The cosine part of the rotary embedding.
            sin (`torch.Tensor`): The sine part of the rotary embedding.
            position_ids (`torch.Tensor`, *optional*):
                Deprecated and unused.
            unsqueeze_dim (`int`, *optional*, defaults to 1):
                The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
                sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
                that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
                k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
                cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
                the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
        Returns:
            `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
        """
        cos = cos.unsqueeze(unsqueeze_dim)
        sin = sin.unsqueeze(unsqueeze_dim)

        # Keep half or full tensor for later concatenation
        rotary_dim = cos.shape[-1]
        q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
        k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]

        # Apply rotary embeddings on the first half or full tensor
        q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
        k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)

        # Concatenate back to full shape
        q_embed = torch.cat([q_embed, q_pass], dim=-1)
        k_embed = torch.cat([k_embed, k_pass], dim=-1)
        return q_embed, k_embed

    def forward(self,
                hidden_states: torch.Tensor,
                kv_cache: KGQACache,
                freqs_cis: torch.Tensor,
                wrapper: flashInferAttn,
                bsz_tensors: torch.Tensor,
                position_ids: Optional[torch.Tensor] = None,
                attention_mask: Optional[torch.Tensor] = None,
                ):

        q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states, bsz_tensors)

        query_states, gate = torch.chunk(
            self.q_proj(hidden_states).view(q_len, -1, self.head_dim * 2), 2, dim=-1
        )
        gate = gate.reshape(q_len, -1)

        key_states = self.k_proj(hidden_states, bsz_tensors)

        query_states = query_states.reshape(q_len, -1)
        query_states = self.q_norm(query_states, bsz_tensors)
        key_states = self.k_norm(key_states, bsz_tensors)


        value_states = self.v_proj(hidden_states, bsz_tensors)


        query_states = query_states.view(q_len, self.num_attention_heads, self.head_dim)
        key_states = key_states.view(q_len, self.num_key_value_heads, self.head_dim)
        value_states = value_states.view(q_len, self.num_key_value_heads, self.head_dim)

        if freqs_cis:  
            cos, sin = freqs_cis
            query_states, key_states = self.apply_rotary_pos_emb(query_states.unsqueeze(0), key_states.unsqueeze(0), cos, sin, unsqueeze_dim=2)
            query_states, key_states = query_states.squeeze(0), key_states.squeeze(0)


        k_cache = kv_cache.get_k_cache(self.layer_idx)
        v_cache = kv_cache.get_v_cache(self.layer_idx)

        attn_output = wrapper.forward(query_states, k_cache, v_cache, key_states, value_states)
  
        attn_output = attn_output.reshape(q_len, -1).contiguous()
        attn_output = attn_output * torch.sigmoid(gate)


        attn_output = self.o_proj(attn_output.view(q_len, self.num_attention_heads * self.head_dim), bsz_tensors)

        return attn_output


class KQwen3NextGatedDeltaNet(BaseInjectedModule, Qwen3NextGatedDeltaNet):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.

    def fix_query_key_value_ordering(self, mixed_qkvz, mixed_ba):
        """
        Derives `query`, `key` and `value` tensors from `mixed_qkvz` and `mixed_ba`.
        """

        new_tensor_shape_qkvz = mixed_qkvz.size()[:-1] + (
            self.num_k_heads,
            2 * self.head_k_dim + 2 * self.head_v_dim * self.num_v_heads // self.num_k_heads,
        )
        new_tensor_shape_ba = mixed_ba.size()[:-1] + (self.num_k_heads, 2 * self.num_v_heads // self.num_k_heads)

        mixed_qkvz = mixed_qkvz.view(*new_tensor_shape_qkvz)
        mixed_ba = mixed_ba.view(*new_tensor_shape_ba)
        split_arg_list_qkvz = [
            self.head_k_dim,
            self.head_k_dim,
            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
        ]
        split_arg_list_ba = [self.num_v_heads // self.num_k_heads, self.num_v_heads // self.num_k_heads]
        query, key, value, z = torch.split(mixed_qkvz, split_arg_list_qkvz, dim=3)
        b, a = torch.split(mixed_ba, split_arg_list_ba, dim=3)
        # [b, sq, ng, np/ng * hn] -> [b, sq, np, hn]
        value = value.reshape(value.size(0), value.size(1), -1, self.head_v_dim)
        z = z.reshape(z.size(0), z.size(1), -1, self.head_v_dim)
        b = b.reshape(b.size(0), b.size(1), self.num_v_heads)
        a = a.reshape(a.size(0), a.size(1), self.num_v_heads)
        return query, key, value, z, b, a

    def forward(
        self,
        hidden_states: torch.Tensor,
        conv_states: Optional[list[torch.Tensor]] = None,
        recurrent_states: Optional[list[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        bsz_tensors: Optional[torch.Tensor] = None,
    ):
        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)

        # Set up dimensions for reshapes later
        batch_size, seq_len, _ = hidden_states.shape

        conv_state = conv_states[self.layer_idx] if conv_states is not None else None
        recurrent_state = (
            recurrent_states[self.layer_idx] if recurrent_states is not None else None
        )

        use_precomputed_states = (
            conv_state is not None
            and recurrent_state is not None
            and seq_len == 1
        )

        projected_states_qkvz = self.in_proj_qkvz(hidden_states, bsz_tensors)
        projected_states_ba = self.in_proj_ba(hidden_states, bsz_tensors)
        query, key, value, z, b, a = self.fix_query_key_value_ordering(projected_states_qkvz, projected_states_ba)
        query, key, value = (x.reshape(x.shape[0], x.shape[1], -1) for x in (query, key, value))

        mixed_qkv = torch.cat((query, key, value), dim=-1)
        mixed_qkv = mixed_qkv.transpose(1, 2)

        if use_precomputed_states:
            # 2. Convolution sequence transformation
            # NOTE: the conv state is updated in `causal_conv1d_update`
            mixed_qkv = self.causal_conv1d_update(
                mixed_qkv,
                conv_state,
                self.conv1d.weight.squeeze(1),
                self.conv1d.bias,
                self.activation,
            )
        else:
            conv_state = F.pad(mixed_qkv, (self.conv_kernel_size - mixed_qkv.shape[-1], 0))

            if self.causal_conv1d_fn is not None:
                mixed_qkv = self.causal_conv1d_fn(
                    x=mixed_qkv,
                    weight=self.conv1d.weight.squeeze(1),
                    bias=self.conv1d.bias,
                    activation=self.activation,
                    seq_idx=None,
                )
            else:
                mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len])

        mixed_qkv = mixed_qkv.transpose(1, 2)
        query, key, value = torch.split(
            mixed_qkv,
            [
                self.key_dim,
                self.key_dim,
                self.value_dim,
            ],
            dim=-1,
        )
        query = query.reshape(query.shape[0], query.shape[1], -1, self.head_k_dim)
        key = key.reshape(key.shape[0], key.shape[1], -1, self.head_k_dim)
        value = value.reshape(value.shape[0], value.shape[1], -1, self.head_v_dim)

        beta = b.sigmoid()
        # If the model is loaded in fp16, without the .float() here, A might be -inf
        g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
        if self.num_v_heads // self.num_k_heads > 1:
            query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)
            key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)

        if not use_precomputed_states:
            core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule(
                query,
                key,
                value,
                g=g,
                beta=beta,
                initial_state=None,
                output_final_state=conv_state is not None,
                use_qk_l2norm_in_kernel=True,
            )

        else:
            core_attn_out, last_recurrent_state = self.recurrent_gated_delta_rule(
                query,
                key,
                value,
                g=g,
                beta=beta,
                initial_state=recurrent_state,
                output_final_state=conv_state is not None,
                use_qk_l2norm_in_kernel=True,
            )

        # Update cache
        recurrent_state = last_recurrent_state

        z_shape_og = z.shape
        # reshape input data into 2D tensor
        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
        z = z.reshape(-1, z.shape[-1])
        core_attn_out = self.norm(core_attn_out, z)
        core_attn_out = core_attn_out.reshape(z_shape_og)
        core_attn_out = core_attn_out.reshape(core_attn_out.shape[0], core_attn_out.shape[1], -1)

        output = self.out_proj(core_attn_out, bsz_tensors)

        if conv_state is not None:
            conv_states[self.layer_idx] = conv_state
        if recurrent_state is not None:
            recurrent_states[self.layer_idx] = recurrent_state

        return output

================================================
FILE: archive/ktransformers/operators/base_operator.py
================================================
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
from typing import Any
from torch import nn, Tensor
from ktransformers.util.custom_loader import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
import ktransformers.util.utils as utils
class BaseInjectedModule(nn.Module):
    
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        nn.Module.__init__(self)
        nn.Module.__setattr__(self, "orig_module", orig_module)
        object.__setattr__(self, "key", key)
        object.__setattr__(self, "gguf_loader", gguf_loader)
        object.__setattr__(self, "config", config)
        object.__setattr__(self, "prefill_device", prefill_device)
        object.__setattr__(self, "generate_device", generate_device)
        object.__setattr__(self, "device", generate_device)
        
    def __getattr__(self, name: str) -> Any:
        # __getattr__ in nn.Module doesn't call super().__getattribute__ when name is not in nn.Module.__dict__,
        # but __setattr__ in nn.Module call super().__setattr__ in that case, there may be some attribute set 
        # but can't get using __getattr__, typically these attr is build in attr of the class, so class.attr does not
        # call __getattr__.
        # Example:
        # ...import torch
        # ...l=torch.nn.Linear(100,200)
        # ...l.out_features # 200
        # ...l.__getattr__("out_features") # AttributeError: 'Linear' object has no attribute 'out_features'
        try:
            return object.__getattribute__(self, name) # if this attr belongs to BaseInjectedModule
        except:
            if name == "orig_module":
                return nn.Module.__getattr__(self, "orig_module")
            try:
                return nn.Module.__getattr__(self, "orig_module").__getattr__(name) # if this attr belongs to orig_module
            except:
                return super(nn.Module, nn.Module.__getattr__(self, "orig_module")).__getattribute__(name) # if this attr belongs to orig_module but not in nn.Module.__dict__

    def __setattr__(self, name: str, value: Tensor | nn.Module) -> None:
        if name == "orig_module":
            return nn.Module.__setattr__(self, "orig_module", value)
        elif hasattr(self, name):
            return object.__setattr__(self, name, value)
        return nn.Module.__getattr__(self, "orig_module").__setattr__(name, value)
    
    def forward(self, *args, **kwargs):
        return self.orig_module.forward(*args, **kwargs)
    
    def load(self):
        for name, child in self._modules.items():
            utils.load_weights(child, self.gguf_loader, self.key+".")


================================================
FILE: archive/ktransformers/operators/cpuinfer.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  : This script defines the `CPUInferKVCache` and `CPUInfer` classes for performing inference 
               with a Key-Value Cache on the CPU. The `CPUInferKVCache` class is responsible for configuring 
               and managing key-value caches, updating and retrieving cache data, and handling attention 
               operations. It supports different cache types (e.g., Q4_0, FP16) and retrieval strategies 
               (e.g., shared, separate). The `CPUInfer` class handles task submission and synchronization 
               on the CPU, with optional CUDA stream integration for tasks involving GPU acceleration. 
               These classes facilitate efficient caching and memory management for deep learning models 
               that leverage key-value attention mechanisms, particularly on CPU-based systems.
Author       : djw
Date         : 2024-08-26 23:25:24
Version      : 1.0.0
LastEditors  : djw 
LastEditTime : 2024-08-26 23:25:24
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import sys, os
from typing import Any
import torch
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Release"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
import cpuinfer_ext
from ktransformers.server.config.config import Config


class CPUInferKVCache:
    def __init__(
        self,
        layer_num: int = 32,
        kv_head_num: int = 8,
        q_head_num: int = 32,
        head_dim: int = 128,
        block_len: int = 256,
        anchor_num: int = 4,
        anchor_type: str = "FIXED",
        kv_type: str = "Q4_0",
        retrieval_type: str = "SHARED",
        layer_step: int = 1,
        token_step: int = 1,
        layer_offset: int = 0,
        max_thread_num: int = 32,
        max_batch_size: int = 4,
        max_block_num: int = 512,
    ):

        if anchor_type == "FIXED":
            anchor_type = cpuinfer_ext.kvcache.AnchorType.FIXED
        elif anchor_type == "QUEST":
            anchor_type = cpuinfer_ext.kvcache.AnchorType.QUEST
        elif anchor_type == "DYNAMIC":
            anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
        elif anchor_type == "BLOCK_MEAN":
            anchor_type = cpuinfer_ext.kvcache.AnchorType.BLOCK_MEAN
        elif anchor_type == "BLOCK_MAX":
            anchor_type = cpuinfer_ext.kvcache.AnchorType.BLOCK_MAX
        else:
            raise ValueError(f"Unknown anchor type: {anchor_type}")

        if kv_type == "FP16":
            kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
        elif kv_type == "FP32":
            assert False, "FP32 is not supported yet."
            kv_type = cpuinfer_ext.kvcache.ggml_type.FP32
        elif kv_type == "Q4_0":
            kv_type = cpuinfer_ext.kvcache.ggml_type.Q4_0
        elif kv_type == "Q8_0":
            kv_type = cpuinfer_ext.kvcache.ggml_type.Q8_0
        else:
            raise ValueError(f"Unknown kv type: {kv_type}")

        if retrieval_type == "SHARED":
            retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
        elif retrieval_type == "INDIVIDUAL":
            retrieval_type = cpuinfer_ext.kvcache.RetrievalType.QHEAD
        elif retrieval_type == "SEPARATE":
            retrieval_type = cpuinfer_ext.kvcache.RetrievalType.KVHEAD

        self.config = cpuinfer_ext.kvcache.KVCacheConfig(
            layer_num,
            kv_head_num,
            q_head_num,
            head_dim,
            block_len,
            anchor_num,
            anchor_type,
            kv_type,
            retrieval_type,
            layer_step,
            token_step,
            layer_offset,
            max_block_num,
            max_batch_size,
            max_thread_num,
        )
        self.kvcache = cpuinfer_ext.kvcache.KVCache(self.config)

    def load_kvcache(self, tensor_file_path: str):
        if not os.path.exists(tensor_file_path):
            raise FileNotFoundError(f"The file {tensor_file_path} does not exist.")
        return self.kvcache.load_kvcache(tensor_file_path,)

    def dump_kvcache(
        self, block_table: torch.Tensor, cache_total_len: int, tensor_file_path: str
    ):
        assert (
            block_table.dim() == 1
            and block_table.dtype == torch.int
            and block_table.is_contiguous()
            and block_table.device == torch.device("cpu")
        ), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            block_table.dim(),
            block_table.size(),
            block_table.dtype,
            block_table.is_contiguous(),
            block_table.device,
        )

        assert (
            cache_total_len > 0
            and cache_total_len <= self.config.block_len * block_table.size(0)
        ), "cache_total_len: {}".format(cache_total_len)

        if not os.path.exists(os.path.dirname(tensor_file_path)):
            os.makedirs(os.path.dirname(tensor_file_path))

        return self.kvcache.dump_kvcache(
            block_table.data_ptr(),
            cache_total_len,
            tensor_file_path,
        )

    def update_cache_total_len(self, cache_total_len: int):
        assert cache_total_len > 0, "cache_total_len: {}".format(cache_total_len)
        self.kvcache.update_cache_total_len(cache_total_len)

    # q_in: (bsz, q_len, q_head_num, head_dim)
    # output: (bsz, q_len, q_head_num, head_dim)
    # attn_lse: (bsz, q_len, q_head_num)
    # block_table: (bsz, max_block_num)
    def attn(
        self,
        q_in: torch.Tensor,
        output: torch.Tensor,
        attn_lse: torch.Tensor,
        layer_idx: int,
        generate_token_idx: int,
        block_table: torch.Tensor | None = None,
        cache_seqlens: torch.Tensor | None = None,
        pick_block_num: int | None = None,
        init_block_num: int | None = None,
        local_block_num: int | None = None,
    ):

        assert (
            q_in.dim() == 4
            and q_in.size(2) == self.config.q_head_num
            and q_in.size(3) == self.config.head_dim
            and q_in.dtype == torch.float16
            and q_in.is_contiguous()
            and q_in.device == torch.device("cpu")
        ), "q_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            q_in.dim(), q_in.size(), q_in.dtype, q_in.is_contiguous(), q_in.device
        )

        batch_size = q_in.size(0)
        q_len = q_in.size(1)

        assert (block_table is None) or (
            block_table.dim() == 2
            and block_table.size(0) == batch_size
            and block_table.dtype == torch.int
            and block_table.is_contiguous()
            and block_table.device == torch.device("cpu")
        ), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            block_table.dim(),
            block_table.size(),
            block_table.dtype,
            block_table.is_contiguous(),
            block_table.device,
        )

        max_block_num = block_table.size(1) if block_table is not None else 0

        assert (
            output.dim() == 4
            and output.size(0) == batch_size
            and output.size(2) == self.config.q_head_num
            and output.size(1) == q_len
            and output.size(3) == self.config.head_dim
            and output.dtype == torch.float16
            and output.is_contiguous()
            and output.device == torch.device("cpu")
        ), "output dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            output.dim(),
            output.size(),
            output.dtype,
            output.is_contiguous(),
            output.device,
        )

        assert (
            attn_lse.dim() == 3
            and attn_lse.size(0) == batch_size
            and attn_lse.size(1) == q_len
            and attn_lse.size(2) == self.config.q_head_num
            and attn_lse.dtype == torch.float32
            and attn_lse.is_contiguous()
            and attn_lse.device == torch.device("cpu")
        ), "attn_lse dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            attn_lse.dim(),
            attn_lse.size(),
            attn_lse.dtype,
            attn_lse.is_contiguous(),
            attn_lse.device,
        )

        assert (
            layer_idx >= 0 and layer_idx < self.config.layer_num
        ), "layer_idx: {}".format(layer_idx)

        assert (cache_seqlens is None) or (
            cache_seqlens.dim() == 1
            and cache_seqlens.size(0) == batch_size
            and cache_seqlens.dtype == torch.int
            and cache_seqlens.is_contiguous()
            and cache_seqlens.device == torch.device("cpu")
        ), "cache_seqlens dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            cache_seqlens.dim(),
            cache_seqlens.size(),
            cache_seqlens.dtype,
            cache_seqlens.is_contiguous(),
            cache_seqlens.device,
        )

        return self.kvcache.attn(
            q_in.data_ptr(),
            output.data_ptr(),
            attn_lse.data_ptr(),
            layer_idx,
            generate_token_idx,
            q_len,
            batch_size,
            max_block_num,
            block_table.data_ptr() if block_table is not None else 0,
            cache_seqlens.data_ptr() if cache_seqlens is not None else 0,
            pick_block_num,
            init_block_num,
            local_block_num,
        )

    # k_in: (block_len, kv_head_num, head_dim)
    # v_in: (block_len, kv_head_num, head_dim)
    def update_kvcache_one_block_fp16(
        self, k_in: torch.Tensor, v_in: torch.Tensor, layer_id: int, block_idx: int
    ):
        assert (
            k_in.dim() == 3
            and k_in.size(1) == self.config.block_len
            and k_in.size(0) == self.config.kv_head_num
            and k_in.size(2) == self.config.head_dim
            and k_in.dtype == torch.float16
            and k_in.is_contiguous()
            and k_in.device == torch.device("cpu")
        ), "k_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            k_in.dim(), k_in.size(), k_in.dtype, k_in.is_contiguous(), k_in.device
        )
        assert (
            v_in.dim() == 3
            and v_in.size(1) == self.config.block_len
            and v_in.size(0) == self.config.kv_head_num
            and v_in.size(2) == self.config.head_dim
            and v_in.dtype == torch.float16
            and v_in.is_contiguous()
            and v_in.device == torch.device("cpu")
        ), "v_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            v_in.dim(), v_in.size(), v_in.dtype, v_in.is_contiguous(), v_in.device
        )
        assert (
            layer_id >= 0 and layer_id < self.config.layer_num
        ), "layer_id: {}".format(layer_id)
        assert block_idx >= 0, "block_idx: {}".format(block_idx)
        return self.kvcache.update_one_block_fp16(
            k_in.data_ptr(),
            v_in.data_ptr(),
            layer_id,
            block_idx,
        )

    def get_kvcache_one_block_fp16(
        self, k_in: torch.Tensor, v_in: torch.Tensor, layer_id: int, block_idx: int
    ):
        assert (
            k_in.dim() == 3
            and k_in.size(1) == self.config.block_len
            and k_in.size(0) == self.config.kv_head_num
            and k_in.size(2) == self.config.head_dim
            and k_in.dtype == torch.float16
            and k_in.is_contiguous()
            and k_in.device == torch.device("cpu")
        ), "k_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            k_in.dim(), k_in.size(), k_in.dtype, k_in.is_contiguous(), k_in.device
        )
        assert (
            v_in.dim() == 3
            and v_in.size(1) == self.config.block_len
            and v_in.size(0) == self.config.kv_head_num
            and v_in.size(2) == self.config.head_dim
            and v_in.dtype == torch.float16
            and v_in.is_contiguous()
            and v_in.device == torch.device("cpu")
        ), "v_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            v_in.dim(), v_in.size(), v_in.dtype, v_in.is_contiguous(), v_in.device
        )
        assert (
            layer_id >= 0 and layer_id < self.config.layer_num
        ), "layer_id: {}".format(layer_id)
        assert block_idx >= 0, "block_idx: {}".format(block_idx)
        return self.kvcache.get_one_block_fp16(
            k_in.data_ptr(),
            v_in.data_ptr(),
            layer_id,
            block_idx,
        )

    def update_importance_one_block(
        self, importance: torch.Tensor, layer_id: int, block_idx: int
    ):
        assert (
            importance.dim() == 1
            and importance.size(0) == self.config.block_len
            and importance.dtype == torch.float16
            and importance.is_contiguous()
            and importance.device == torch.device("cpu")
        ), "importance dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            importance.dim(),
            importance.size(),
            importance.dtype,
            importance.is_contiguous(),
            importance.device,
        )
        assert (
            layer_id >= 0 and layer_id < self.config.layer_num
        ), "layer_id: {}".format(layer_id)
        assert block_idx >= 0, "block_idx: {}".format(block_idx)
        return self.kvcache.update_importance_one_block(
            importance.data_ptr(),
            layer_id,
            block_idx,
        )

    def get_importance_one_block(
        self, importance: torch.Tensor, layer_id: int, block_idx: int
    ):
        assert (
            importance.dim() == 1
            and importance.size(0) == self.config.block_len
            and importance.dtype == torch.float16
            and importance.is_contiguous()
            and importance.device == torch.device("cpu")
        ), "importance dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            importance.dim(),
            importance.size(),
            importance.dtype,
            importance.is_contiguous(),
            importance.device,
        )
        assert (
            layer_id >= 0 and layer_id < self.config.layer_num
        ), "layer_id: {}".format(layer_id)
        assert block_idx >= 0, "block_idx: {}".format(block_idx)
        return self.kvcache.get_importance_one_block(
            importance.data_ptr(),
            layer_id,
            block_idx,
        )

    def get_anchor_one_block(self, anchor: torch.Tensor, layer_id: int, block_idx: int):
        assert (
            anchor.dim() == 3
            and anchor.size(0) == self.config.kv_head_num
            and anchor.size(1) == self.config.anchor_num
            and anchor.size(2) == self.config.head_dim
            and anchor.dtype == torch.float16
            and anchor.is_contiguous()
            and anchor.device == torch.device("cpu")
        ), "anchor dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            anchor.dim(),
            anchor.size(),
            anchor.dtype,
            anchor.is_contiguous(),
            anchor.device,
        )
        assert (
            layer_id >= 0 and layer_id < self.config.layer_num
        ), "layer_id: {}".format(layer_id)
        assert block_idx >= 0, "block_idx: {}".format(block_idx)
        return self.kvcache.get_anchor_one_block(
            anchor.data_ptr(),
            layer_id,
            block_idx,
        )

    def update_anchor_one_block(
        self, anchor: torch.Tensor, layer_id: int, block_idx: int
    ):
        assert (
            anchor.dim() == 3
            and anchor.size(0) == self.config.kv_head_num
            and anchor.size(1) == self.config.anchor_num
            and anchor.size(2) == self.config.head_dim
            and anchor.dtype == torch.float16
            and anchor.is_contiguous()
            and anchor.device == torch.device("cpu")
        ), "anchor dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            anchor.dim(),
            anchor.size(),
            anchor.dtype,
            anchor.is_contiguous(),
            anchor.device,
        )
        assert (
            layer_id >= 0 and layer_id < self.config.layer_num
        ), "layer_id: {}".format(layer_id)
        assert block_idx >= 0, "block_idx: {}".format(block_idx)
        return self.kvcache.update_anchor_one_block(
            anchor.data_ptr(),
            layer_id,
            block_idx,
        )

    def calc_anchor_all_layers(
        self,
        block_table: torch.Tensor,
        cache_seqlens: torch.Tensor,
    ):
        assert (
            block_table.dim() == 2
            and block_table.size(0) == cache_seqlens.size(0)
            and block_table.dtype == torch.int
            and block_table.is_contiguous()
            and block_table.device == torch.device("cpu")
        ), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            block_table.dim(),
            block_table.size(),
            block_table.dtype,
            block_table.is_contiguous(),
            block_table.device,
        )
        assert (
            cache_seqlens.dim() == 1
            and cache_seqlens.dtype == torch.int
            and cache_seqlens.is_contiguous()
            and cache_seqlens.device == torch.device("cpu")
        ), "cache_seqlens dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            cache_seqlens.dim(),
            cache_seqlens.size(),
            cache_seqlens.dtype,
            cache_seqlens.is_contiguous(),
            cache_seqlens.device,
        )
        batch_size = block_table.size(0)
        max_block_num = block_table.size(1)
        return self.kvcache.calc_anchor_all_layers(
            block_table.data_ptr(),
            cache_seqlens.data_ptr(),
            batch_size,
            max_block_num,
        )

    def clear_importance_all_layers(
        self,
        block_table: torch.Tensor,
        cache_seqlens: torch.Tensor,
    ):
        assert (
            block_table.dim() == 2
            and block_table.size(0) == cache_seqlens.size(0)
            and block_table.dtype == torch.int
            and block_table.is_contiguous()
            and block_table.device == torch.device("cpu")
        ), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            block_table.dim(),
            block_table.size(),
            block_table.dtype,
            block_table.is_contiguous(),
            block_table.device,
        )
        assert (
            cache_seqlens.dim() == 1
            and cache_seqlens.dtype == torch.int
            and cache_seqlens.is_contiguous()
            and cache_seqlens.device == torch.device("cpu")
        ), "cache_seqlens dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            cache_seqlens.dim(),
            cache_seqlens.size(),
            cache_seqlens.dtype,
            cache_seqlens.is_contiguous(),
            cache_seqlens.device,
        )
        batch_size = block_table.size(0)
        max_block_num = block_table.size(1)
        return self.kvcache.clear_importance_all_layers(
            block_table.data_ptr(),
            cache_seqlens.data_ptr(),
            batch_size,
            max_block_num,
        )

    def get_cache_total_len(self):
        return self.kvcache.get_cache_total_len()

    def update_kvcache_q4(
        self,
        k_in: torch.Tensor,
        k_scales: torch.Tensor,
        v_in: torch.Tensor,
        v_scales: torch.Tensor,
        layer_id: int,
        seq_offset: int | None = None,
        seq_len: int | None = None,
        block_table: torch.Tensor | None = None,
    ):
        raise NotImplementedError

    def update_kvcache_fp16(
        self,
        k_in: torch.Tensor,
        v_in: torch.Tensor,
        layer_idx,
        block_table: torch.Tensor,
        max_block_num,
        past_len: torch.Tensor,
        q_len,
    ):
        batch_size = block_table.size(0)
        return self.kvcache.get_kvcache_fp16(
            k_in.data_ptr(),
            v_in.data_ptr(),
            layer_idx,
            block_table.data_ptr(),
            batch_size,
            max_block_num,
            past_len.data_ptr(),
            q_len
        )

    def get_kvcache_q4(
        self,
        k_in: torch.Tensor,
        k_scales: torch.Tensor,
        v_in: torch.Tensor,
        v_scales: torch.Tensor,
        layer_id: int,
        seq_offset: int | None = None,
        seq_len: int | None = None,
        block_table: torch.Tensor | None = None,
    ):
        raise NotImplementedError

    def get_kvcache_fp16(
        self,
        k_in: torch.Tensor,
        v_in: torch.Tensor,
        layer_id: int,
        layer_idx,
        block_table: torch.Tensor,
        max_block_num,
        past_len: torch.Tensor,
    ):
        batch_size = block_table.size(0)
        return self.kvcache.get_kvcache_fp16(
            k_in.data_ptr(),
            v_in.data_ptr(),
            layer_idx,
            block_table.data_ptr(),
            batch_size,
            max_block_num,
            past_len.data_ptr(),
        )

    def get_and_update_kvcache_fp16(
        self,
        k_cache_cpu: torch.Tensor,
        v_cache_cpu: torch.Tensor,
        layer_idx,
        block_table: torch.Tensor,
        max_block_num,
        past_len: torch.Tensor,
        q_len,
    ):
        batch_size = block_table.size(0)
        return self.kvcache.get_and_update_kvcache_fp16(
            k_cache_cpu.data_ptr(),
            v_cache_cpu.data_ptr(),
            layer_idx,
            block_table.data_ptr(),
            batch_size,
            max_block_num,
            past_len.data_ptr(),
            q_len,
        )

    def update_importance(
        self,
        importance_cache: torch.Tensor,
        layer_idx,
        block_table: torch.Tensor,
        max_block_num,
        offset: torch.Tensor,
        width,
    ):
        batch_size = block_table.size(0)
        return self.kvcache.update_importance(
            importance_cache.data_ptr(),
            layer_idx,
            block_table.data_ptr(),
            batch_size,
            max_block_num,
            offset.data_ptr(),
            width,
        )

    # attn_sparsity: ((bsz, q_len, q_head_num), dtype = torch.float32)
    def get_attn_sparsity(
        self,
        q_in: torch.Tensor,
        attn_sparsity: torch.Tensor,
        layer_idx: int,
        block_table: torch.Tensor,
        cache_seqlens: torch.Tensor,
        block_table_origin: torch.Tensor,
        cache_seqlens_origin: torch.Tensor,
        generate_token_idx: int = 0,
        topk: int | None = None,
        local: int | None = None,
    ):
        batch_size = block_table.size(0)
        max_block_num = block_table.size(1)
        max_block_num_origin = block_table_origin.size(1)
        q_len = q_in.size(1)

        if topk is None or local is None or topk + local >= max_block_num:
            topk = -1
            local = -1
        return self.kvcache.get_attn_sparsity(
            q_in.data_ptr(),
            attn_sparsity.data_ptr(),
            layer_idx,
            generate_token_idx,
            q_len,
            batch_size,
            max_block_num,
            block_table.data_ptr(),
            cache_seqlens.data_ptr(),
            block_table_origin.data_ptr(),
            cache_seqlens_origin.data_ptr(),
            max_block_num_origin,
            topk,
            local,
        )

    def attn_with_kvcache(
        self,
        q_in: torch.Tensor,
        k_in: torch.Tensor,
        v_in: torch.Tensor,
        output: torch.Tensor,
        attn_lse: torch.Tensor,
        layer_idx: int,
        block_table: torch.Tensor,
        cache_seqlens: torch.Tensor,
        generate_token_idx: int = 0,
        topk: int | None = None,
        local: int | None = None,
    ):

        batch_size = block_table.size(0)
        max_block_num = block_table.size(1)
        q_len = q_in.size(1)

        if topk is None or local is None or topk + local >= max_block_num:
            topk = -1
            local = -1
        return self.kvcache.attn_with_kvcache(
            q_in.data_ptr(),
            k_in.data_ptr(),
            v_in.data_ptr(),
            output.data_ptr(),
            attn_lse.data_ptr(),
            layer_idx,
            generate_token_idx,
            q_len,
            batch_size,
            max_block_num,
            block_table.data_ptr(),
            cache_seqlens.data_ptr(),
            topk,
            local,
        )

    def get_all_kvcache_one_layer(
        self, k_in: torch.Tensor, v_in: torch.Tensor, layer_id: int
    ):
        return self.kvcache.get_all_kvcache_one_layer(
            k_in.data_ptr(),
            v_in.data_ptr(),
            layer_id,
        )

    def get_importance(
        self,
        importance: torch.Tensor,
        block_table: torch.Tensor,
    ):
        raise NotImplementedError

    def get_anchor(
        self,
        anchor: torch.Tensor,
        block_table: torch.Tensor,
    ):
        raise NotImplementedError


class CPUInfer:
    cpuinfer = None
    cur_backend_thread_num = 0
    
    def __init__(self, thread_num):
        if thread_num > CPUInfer.cur_backend_thread_num:
            CPUInfer.cur_backend_thread_num = thread_num
            del CPUInfer.cpuinfer
            CPUInfer.cpuinfer = cpuinfer_ext.CPUInfer(thread_num)

    def submit(self, task):
        CPUInfer.cpuinfer.submit(task)

    def submit_with_cuda_stream(self, current_cuda_stream, task):
        CPUInfer.cpuinfer.submit_with_cuda_stream(current_cuda_stream, task)

    def sync(self):
        CPUInfer.cpuinfer.sync()

    def sync_with_cuda_stream(self, current_cuda_stream):
        CPUInfer.cpuinfer.sync_with_cuda_stream(current_cuda_stream)


================================================
FILE: archive/ktransformers/operators/dynamic_attention.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :  
Author       : Jianwei Dong
Date         : 2024-08-26 23:25:24
Version      : 1.0.0
LastEditors  : Jianwei Dong
LastEditTime : 2024-08-26 23:25:24
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""

import torch
from transformers import AutoConfig
import sys, os
import logging
logger = logging.getLogger("dynamic_attention")
sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/cpu_backend")
from ktransformers.operators.cpuinfer import CPUInfer, CPUInferKVCache
try:
    from flash_attn import flash_attn_func, flash_attn_with_kvcache
except:
    print("falsh attn not found")


import math
import json


class DynamicScaledDotProductAttention:
    remaining_length: int
    cpu_infer = None

    def __init__(
        self,
        max_seq_len: int,
        block_size: int,
        config: AutoConfig,
        device: torch.device,
        local_windows_len: int,
        topk: int,
        threads_num: int,
        anchor_type: str = "DYNAMIC",
        kv_type: str = "FP16",
        dense_layer_num: int = 0,
        anchor_num: int = 1,
        block_selection_mode: str = "SHARED",
        layer_step: int = 1,
        token_step: int = 1,
        preselect_block: bool = False,
        preselect_block_count: int = 96,
        prefill_chunk_size: int = 20480,
        use_attn_sparsity: bool = False,
    ):
        # assert anchor_num == 1
        # assert anchor_type == "DYNAMIC"
        self.remaining_length = 0
        valid_anchor_types = ["DYNAMIC", "FIXED", "BLOCK_MEAN", "BLOCK_MAX", "QUEST"]
        assert anchor_type in valid_anchor_types
        if anchor_type == "QUEST":
            assert anchor_num == 2
        elif anchor_type != "FIXED" and anchor_type != "DYNAMIC":
            assert anchor_num == 1

        valid_kv_types = ["FP16", "FP32", "Q4_0", "Q8_0"]
        assert kv_type in valid_kv_types
        if kv_type != "FP16" and kv_type != "FP32":
            assert block_size % 32 == 0

        valid_block_selection_modes = ["SHARED", "SEPARATE"]  # individual
        assert block_selection_mode in valid_block_selection_modes

        self.max_seq_len = max_seq_len
        self.block_num = max_seq_len // block_size
        self.block_size = block_size
        self.anchor_type = anchor_type
        self.kv_type = kv_type
        self.anchor_num = anchor_num
        self.threads_num = threads_num
        self.layer_step = layer_step
        self.token_step = token_step
        self.preselect_block = preselect_block
        self.preselect_block_count = preselect_block_count
        self.block_selection_mode = block_selection_mode
        self.use_attn_sparsity = use_attn_sparsity

        # model config
        self.kv_head_num = config.num_key_value_heads
        self.q_head_num = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.layer_num = config.num_hidden_layers

        self.device = device
        self.local_windows_len = local_windows_len
        self.local_block_num = self.local_windows_len // self.block_size + 1
        self.prefill_chunk_size = prefill_chunk_size

        self.topk = topk
        self.dense_layer_num = dense_layer_num
        # self.dense_layer_num = 32
        self.cache_key_states = torch.zeros(
            (self.block_num, block_size, self.kv_head_num, self.head_dim),
            device=device,
            dtype=torch.float16,
        )
        self.cache_value_states = torch.zeros(
            (self.block_num, block_size, self.kv_head_num, self.head_dim),
            device=device,
            dtype=torch.float16,
        )
        # [max_num_block, block_size, head_num]
        self.cache_importance = torch.zeros(
            (self.block_num, block_size, self.q_head_num),
            device=device,
            dtype=torch.float16,
        )

        # key_states: [bsz, q_len, kv_head_num, head_dim]
        # value_states: [bsz, q_len, kv_head_num, head_dim]
        # query_states: [bsz, q_len, q_head_num, head_dim]
        self.q_in_cpu = torch.zeros(
            (1, 1, self.q_head_num, self.head_dim),
            device="cpu",
            dtype=torch.float16,
            pin_memory=True,
        )
        self.k_in_cpu = torch.zeros(
            (1, 1, self.kv_head_num, self.head_dim),
            device="cpu",
            dtype=torch.float16,
            pin_memory=True,
        )
        self.v_in_cpu = torch.zeros(
            (1, 1, self.kv_head_num, self.head_dim),
            device="cpu",
            dtype=torch.float16,
            pin_memory=True,
        )

        self.cache_seqlens_cpu = torch.empty(
            (1,), device="cpu", dtype=torch.int32, pin_memory=True
        )

        self.cache_seqlens_cuda = torch.empty((1,), device=device, dtype=torch.int32)

        self.prefix_block_table = torch.arange(
            self.block_num, device="cpu", dtype=torch.int32, pin_memory=True
        ).view(1, -1)

        self.block_table_cpu = torch.arange(
            self.block_num, device="cpu", dtype=torch.int32, pin_memory=True
        ).view(1, -1)

        # assert (
        #     self.local_windows_len // self.block_size + 1 + self.preselect_block_count
        #     <= self.block_num
        # )

        self.output_cpu = torch.empty(
            (1, 1, self.q_head_num, self.head_dim),
            device="cpu",
            dtype=torch.float16,
            pin_memory=True,
        )
        self.lse_cpu = torch.empty(
            (1, 1, self.q_head_num), device="cpu", dtype=torch.float32, pin_memory=True
        )

        self.output_cuda = torch.empty(
            (1, 1, self.q_head_num, self.head_dim), device=device, dtype=torch.float16
        )

        self.attn_sparsity = torch.zeros(
            (1, 1, self.q_head_num), device="cpu", dtype=torch.float32, pin_memory=True
        )

        if preselect_block == True:
            self.preselect_block_table = torch.zeros(
                self.layer_num,
                self.preselect_block_count,
                device=device,
                dtype=torch.int32,
            )
            self.preselect_block_num = 0  # block_num before preselect
            self.evict_tokens = 0

        if DynamicScaledDotProductAttention.cpu_infer is None:
            DynamicScaledDotProductAttention.cpu_infer = CPUInfer(threads_num)
            self.cpu_infer = DynamicScaledDotProductAttention.cpu_infer
        self.local_thread = CPUInferKVCache(
            self.layer_num,
            self.kv_head_num,
            self.q_head_num,
            self.head_dim,
            self.block_size,
            anchor_num=self.anchor_num,
            anchor_type=anchor_type,
            kv_type=self.kv_type,
            retrieval_type=self.block_selection_mode,
            layer_step=self.layer_step,
            token_step=self.token_step,
            layer_offset=self.dense_layer_num % self.layer_step,
            max_batch_size=1,
            max_block_num=self.block_num,
            max_thread_num=self.threads_num,
        )

        print(
            f"local_windows_len: {local_windows_len}, topk: {topk}, dense_layer_num: {dense_layer_num}, kv_type: {self.kv_type}, anchor_type: {self.anchor_type}, preselect_block: {self.preselect_block}, preselect_block_count: {self.preselect_block_count}, token_step: {self.token_step}, layer_step: {self.layer_step}"
        )

        self.shape_mask = (
            self.q_head_num,
            self.block_size,
            self.block_size,
        )

        mask = torch.zeros(
            self.shape_mask, dtype=torch.uint8, device=device
        ).contiguous()
        elm_idx = torch.arange(self.block_size, device=device)

        for i in range(mask.size(-2)):
            idx = i + mask.size(-1) - mask.size(-2) - elm_idx
            idx = idx[idx >= 0]
            mask[..., i, idx] = 1

        self.tril_mask = mask
        self.triu_mask = mask ^ 1

        self.generate_token_idx = 0

    def get_attn_score_one_block(
        self,
        batch_idx: int,
        max_block_num: int,
        query: torch.Tensor,
        key: torch.Tensor,
        offset: int,
        width: int,
        mask_mode: str | None = None,
        use_softmax: bool = True,
    ):
        n_rep = self.q_head_num // self.kv_head_num
        importance = self.cache_importance.view(-1, self.q_head_num)
        importance = importance.narrow(0, batch_idx * max_block_num + offset, width)
        n_gqa_ = self.q_head_num // self.kv_head_num 
        for head_idx in range(self.q_head_num):
            key_item = key[..., head_idx // n_gqa_, :].view(key.size(0), -1)
            qk = torch.einsum(
                "qd,kd->qk", query[:,head_idx,:], key_item
            )  # (num_attention_heads, len_q, len_k)

            if mask_mode == "tril":
                mask = self.tril_mask
                mask = mask[0, -qk.size(-2) :, -qk.size(-1) :]
                qk = qk * mask
            elif mask_mode == "triu":
                mask = self.triu_mask
                mask = mask[0, -qk.size(-2) :, -qk.size(-1) :]
                qk = qk * mask

            if use_softmax:
                qk = torch.nn.functional.softmax(
                    qk / math.sqrt(self.head_dim), dim=-1, dtype=torch.float32
                ).to(torch.float16)
              
            qk = torch.sum(qk, dim=-2)
            importance[...,head_idx] += qk

    def get_preselect_block_table_and_attn_score(
        self,
        layer_idx: int,
        batch_size: int,
        offset: torch.Tensor,
        width: int,
        query: torch.Tensor,
        key: torch.Tensor,
        union_with_last_layer: bool = True,
    ):
        max_seqs_len = offset.max().item() + width
        max_block_num = (max_seqs_len + self.block_size - 1) // self.block_size

        for batch_idx in range(batch_size):
            query_cur = query[batch_idx][-128:]
            self.get_attn_score_one_block(
                batch_idx,
                max_block_num,
                query_cur,
                key[batch_idx][: offset[batch_idx].item() + width],
                0,
                offset[batch_idx].item() + width,
                mask_mode=None,
            )

        if self.preselect_block:
            self.prefill_block_num = max(
                0, max_block_num - self.local_windows_len // self.block_size
            )
            self.evict_tokens = (
                max(self.prefill_block_num - self.preselect_block_count, 0)
                * self.block_size
            )

            if self.prefill_block_num != 0:
                importance_cache = self.cache_importance.narrow(
                    0, 0, self.prefill_block_num * batch_size
                ).view(
                    batch_size, self.prefill_block_num, self.block_size, self.q_head_num
                )

                importance_r = importance_cache[:, 1:, : self.block_size // 4]
                pad_r = torch.zeros_like(importance_r[:, :1])
                importance_r = torch.cat((importance_r, pad_r), dim=1)
                importance_l = importance_cache[:, :-1, -self.block_size // 4 :]
                pad_l = torch.zeros_like(importance_l[:, :1])
                importance_l = torch.cat((pad_l, importance_l), dim=1)
                importance = torch.cat(
                    (importance_l, importance_cache, importance_r), dim=2
                )
                importance = importance.mean(dim=-1)
                importance = importance.mean(dim=-1)
                # importance: (batch_size, max_block_num)
                topk = min(self.preselect_block_count, self.prefill_block_num)
                values, indices = torch.topk(
                    importance,
                    k=topk,
                    dim=1,
                )

                self.preselect_block_table[
                    layer_idx : layer_idx + 1,
                    :topk,
                ].copy_(indices)

                if union_with_last_layer and layer_idx == 31:
                    for tmp_layer_idx in range(self.layer_num - 1):
                        for i in range(1, min(topk, 6)):
                            x = self.preselect_block_table[-1, i]
                            if x not in self.preselect_block_table[tmp_layer_idx]:
                                self.preselect_block_table[tmp_layer_idx, topk - i] = x
        if self.anchor_type == "DYNAMIC":
            importance_cache = self.cache_importance.narrow(
                0, 0, max_block_num * batch_size
            ).view(batch_size, max_block_num * self.block_size, self.q_head_num)
            importance_cache_cpu = torch.empty_like(
                importance_cache, device="cpu", pin_memory=True
            )

            importance_cache_cpu.copy_(importance_cache)

            block_table_cpu = self.prefix_block_table[:, :max_block_num].to("cpu")
            offset_cpu = offset.contiguous().to("cpu")

            self.cpu_infer.submit(
                self.local_thread.update_importance(
                    importance_cache_cpu,
                    layer_idx,
                    block_table_cpu,
                    max_block_num,
                    offset_cpu,
                    width,
                )
            )
            self.cpu_infer.sync()

        importance_cache = self.cache_importance.narrow(
            0, 0, max_block_num * batch_size
        ).view(batch_size, max_block_num * self.block_size, self.q_head_num)
        importance_cache.zero_()

    # key: [bsz, past_len, head_num, head_dim] float16
    # query: [bsz, q_len, q_head_num, head_dim] float16
    def get_attn_score(
        self,
        layer_idx: int,
        batch_size: int,
        offset: torch.Tensor,
        width: int,
        query: torch.Tensor,
        key: torch.Tensor,
    ):
        max_seqs_len = offset.max().item() + width
        max_block_num = (max_seqs_len + self.block_size - 1) // self.block_size

        for batch_idx in range(batch_size):
            for idx in range(width // self.block_size):
                offset_cur = idx * self.block_size
                query_cur = query[batch_idx, offset_cur : offset_cur + self.block_size]
                self.get_attn_score_one_block(
                    batch_idx,
                    max_block_num,
                    query_cur,
                    key[
                        batch_idx,
                        offset[batch_idx]
                        + offset_cur : offset[batch_idx]
                        + offset_cur
                        + self.block_size,
                    ],
                    offset[batch_idx].item() + offset_cur,
                    self.block_size,
                    mask_mode="tril",
                    use_softmax=False,
                )

                offset_key = (
                    offset[batch_idx].item()
                    + idx * self.block_size
                    - self.local_windows_len
                )
                if offset_key >= 0:
                    self.get_attn_score_one_block(
                        batch_idx,
                        max_block_num,
                        query_cur,
                        key[batch_idx, offset_key : offset_key + self.block_size],
                        offset_key,
                        self.block_size,
                        mask_mode="triu",
                        use_softmax=False,
                    )

                offset_key = max(0, offset_key + self.block_size)
                width_key = (
                    offset[batch_idx].item() + idx * self.block_size - offset_key
                )
                if width_key > 0:
                    self.get_attn_score_one_block(
                        batch_idx,
                        max_block_num,
                        query_cur,
                        key[batch_idx, offset_key : offset_key + width_key],
                        offset_key,
                        width_key,
                        mask_mode=None,
                        use_softmax=False,
                    )

        importance_cache = self.cache_importance.narrow(
            0, 0, max_block_num * batch_size
        ).view(batch_size, max_block_num * self.block_size, self.q_head_num)
        importance_cache_cpu = torch.empty_like(
            importance_cache, device="cpu", pin_memory=True
        )

        importance_cache_cpu.copy_(importance_cache)

        block_table_cpu = self.prefix_block_table[:, :max_block_num].to("cpu")
        offset_cpu = offset.contiguous().to("cpu")

        self.cpu_infer.submit(
            self.local_thread.update_importance(
                importance_cache_cpu,
                layer_idx,
                block_table_cpu,
                max_block_num,
                offset_cpu,
                width,
            )
        )
        self.cpu_infer.sync()
        importance_cache.zero_()

    # key: [bsz, q_len, head_num, head_dim] float16
    # value: [bsz, q_len, head_num, head_dim] float16
    def swap_in_and_swap_out(self, layer_idx, past_len, q_len, key, value):
        batch_size = 1
        max_seqs_len = past_len.max().item() + q_len
        max_block_num = (max_seqs_len + self.block_size - 1) // self.block_size
        k_cache = self.cache_key_states.narrow(0, 0, max_block_num * batch_size).view(
            batch_size, max_block_num * self.block_size, self.kv_head_num, self.head_dim
        )
        v_cache = self.cache_value_states.narrow(0, 0, max_block_num * batch_size).view(
            batch_size, max_block_num * self.block_size, self.kv_head_num, self.head_dim
        )

        for batch_idx in range(batch_size):
            offset = past_len[batch_idx]
            width = q_len
            k_cache[batch_idx][offset : offset + width].copy_(
                key[batch_idx].view(-1, self.kv_head_num, self.head_dim)
            )
            v_cache[batch_idx][offset : offset + width].copy_(
                value[batch_idx].view(-1, self.kv_head_num, self.head_dim)
            )

        k_cache_cpu = torch.empty_like(k_cache, device="cpu", pin_memory=True)
        v_cache_cpu = torch.empty_like(v_cache, device="cpu", pin_memory=True)

        k_cache_cpu.copy_(k_cache)
        v_cache_cpu.copy_(v_cache)

        cur_block_num = (
            q_len + past_len[0].item() + self.block_size - 1
        ) // self.block_size
        block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
        past_len_cpu = past_len.contiguous().to("cpu")

        self.cpu_infer.submit(
            self.local_thread.get_and_update_kvcache_fp16(
                k_cache_cpu,
                v_cache_cpu,
                layer_idx,
                block_table_cpu,
                max_block_num,
                past_len_cpu,
                q_len,
            )
        )

        self.cpu_infer.sync()
        k_cache.copy_(k_cache_cpu)
        v_cache.copy_(v_cache_cpu)

        return k_cache, v_cache

    def calc_anchor(self, cache_seqlens: int):
        cur_block_num = (cache_seqlens + self.block_size - 1) // self.block_size
        block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
        cache_seqlens_cpu = torch.tensor(
            [cache_seqlens], device="cpu", dtype=torch.int32
        )

        self.cpu_infer.submit(
            self.local_thread.calc_anchor_all_layers(
                block_table_cpu,
                cache_seqlens_cpu,
            )
        )
        self.cpu_infer.sync()

    def clear_importance(self, cache_seqlens: int):
        print(f"clear importance: {cache_seqlens}")
        cur_block_num = (cache_seqlens + self.block_size - 1) // self.block_size
        block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
        cache_seqlens_cpu = torch.tensor(
            [cache_seqlens], device="cpu", dtype=torch.int32
        )

        self.cpu_infer.submit(
            self.local_thread.clear_importance_all_layers(
                block_table_cpu,
                cache_seqlens_cpu,
            )
        )
        self.cpu_infer.sync()

    def clear_kvcache(self, cache_seqlens: int):
        cur_block_num = (cache_seqlens + self.block_size - 1) // self.block_size
        block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
        cache_seqlens_cpu = torch.tensor(
            [cache_seqlens], device="cpu", dtype=torch.int32
        )

        self.cpu_infer.submit(
            self.local_thread.clear_kvcache_all_layers(
                block_table_cpu,
                cache_seqlens_cpu,
            )
        )
        self.cpu_infer.sync()

    def get_attn_sparsity(
        self,
        q_in: torch.Tensor,
        layer_idx: int,
        block_table: torch.Tensor,
        cache_seqlens: torch.Tensor,
        block_table_origin: torch.Tensor,
        cache_seqlens_origin: torch.Tensor,
        generate_token_idx: int = 0,
        topk: int | None = None,
        local: int | None = None,
        output_path: str = "./attn_sparsity.json",
    ):
        self.attn_sparsity.zero_()
        self.pcinfer.submit(
            self.local_thread.get_attn_sparsity(
                q_in,
                self.attn_sparsity,
                layer_idx,
                block_table,
                cache_seqlens,
                block_table_origin,
                cache_seqlens_origin,
                generate_token_idx,
                topk,
                local,
            )
        )
        self.cpu_infer.sync()
        with open(output_path, "a") as file:
            for head_idx in range(self.q_head_num):
                sparsity = self.attn_sparsity[0][0][head_idx].item()
                json_obj = {
                    "token_idx": generate_token_idx,
                    "layer_idx": layer_idx,
                    "head_idx": head_idx,
                    "sparsity": sparsity,
                }
                json.dump(json_obj, file)
                file.write("\n")

    def apply(
        self,
        layer_idx: int,
        bsz: int,
        past_len: int,
        query_states: torch.Tensor,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        mode: str = "prefill",
        generate_token_idx: int = -1,
    ):

        # key_states: [bsz, q_len, kv_head_num, head_dim]
        # value_states: [bsz, q_len, kv_head_num, head_dim]
        # query_states: [bsz, q_len, q_head_num, head_dim]
        assert query_states.dtype == torch.float16
        assert key_states.dtype == torch.float16
        assert value_states.dtype == torch.float16

        assert key_states.size(2) == self.kv_head_num
        assert value_states.size(2) == self.kv_head_num
        assert query_states.size(2) == self.q_head_num

        q_len = query_states.size(1)
        batch_size = query_states.size(0)
        self.cache_seqlens_cuda.fill_(past_len)
        last_chunk = False
        if self.remaining_length <= self.prefill_chunk_size and q_len != 1:
            last_chunk = True
        device = query_states.device
        if layer_idx == 0:
            if q_len == 1:
                self.generate_token_idx += 1
            elif last_chunk:
                self.generate_token_idx = -1

        if mode == "prefill":
            key, value = self.swap_in_and_swap_out(
                layer_idx,
                self.cache_seqlens_cuda,
                q_len,
                key_states,
                value_states,
            )

            if last_chunk and (self.anchor_type == "DYNAMIC" or self.preselect_block):
                self.get_preselect_block_table_and_attn_score(
                    layer_idx,
                    bsz,
                    self.cache_seqlens_cuda,
                    q_len,
                    query_states,
                    key,
                )
            output = flash_attn_with_kvcache(
                q=query_states,
                k_cache=key,
                v_cache=value,
                cache_seqlens=self.cache_seqlens_cuda + q_len,
                causal=True,
            )
            return output.transpose(1, 2)

        elif mode == "generate":
            assert self.generate_token_idx >= 0
            self.q_in_cpu.copy_(query_states, non_blocking=True)
            self.k_in_cpu.copy_(key_states, non_blocking=True)
            self.v_in_cpu.copy_(value_states, non_blocking=True)
            self.cache_seqlens_cpu.copy_(self.cache_seqlens_cuda, non_blocking=True)
            #            print(layer_idx)
            if layer_idx < self.dense_layer_num:
                self.block_table_cpu.copy_(self.prefix_block_table, non_blocking=True)
                self.cpu_infer.submit_with_cuda_stream(
                    torch.cuda.current_stream("cuda").cuda_stream,
                    self.local_thread.attn_with_kvcache(
                        q_in=self.q_in_cpu,
                        k_in=self.k_in_cpu,
                        v_in=self.v_in_cpu,
                        output=self.output_cpu,
                        attn_lse=self.lse_cpu,
                        layer_idx=layer_idx,
                        block_table=self.block_table_cpu,
                        cache_seqlens=self.cache_seqlens_cpu,
                    ),
                )
            else:
                if self.preselect_block:
                    self.cache_seqlens_cpu.copy_(
                        self.cache_seqlens_cuda - self.evict_tokens, non_blocking=True
                    )
                    if self.preselect_block_count < self.prefill_block_num:
                        self.block_table_cpu[:, : self.preselect_block_count].copy_(
                            self.preselect_block_table[layer_idx : layer_idx + 1],
                            non_blocking=True,
                        )

                        self.block_table_cpu[
                            :,
                            self.preselect_block_count : self.preselect_block_count
                            + self.local_block_num,
                        ].copy_(
                            self.prefix_block_table[
                                :,
                                self.prefill_block_num : self.prefill_block_num
                                + self.local_block_num,
                            ],
                            non_blocking=True,
                        )
                    #                   print("submit_with_cuda_stream")
                    self.cpu_infer.submit_with_cuda_stream(
                        torch.cuda.current_stream("cuda").cuda_stream,
                        self.local_thread.attn_with_kvcache(
                            q_in=self.q_in_cpu,
                            k_in=self.k_in_cpu,
                            v_in=self.v_in_cpu,
                            output=self.output_cpu,
                            attn_lse=self.lse_cpu,
                            layer_idx=layer_idx,
                            generate_token_idx=self.generate_token_idx,
                            block_table=self.block_table_cpu,
                            cache_seqlens=self.cache_seqlens_cpu,
                            topk=(
                                self.topk
                                if self.topk <= self.preselect_block_count
                                else None
                            ),
                            local=self.local_windows_len // self.block_size,
                        ),
                    )
                #                    print("submit_with_cuda_stream enqueue\n")
                else:
                    self.block_table_cpu.copy_(
                        self.prefix_block_table, non_blocking=True
                    )
                    self.cpu_infer.submit_with_cuda_stream(
                        torch.cuda.current_stream("cuda").cuda_stream,
                        self.local_thread.attn_with_kvcache(
                            q_in=self.q_in_cpu,
                            k_in=self.k_in_cpu,
                            v_in=self.v_in_cpu,
                            output=self.output_cpu,
                            attn_lse=self.lse_cpu,
                            layer_idx=layer_idx,
                            generate_token_idx=self.generate_token_idx,
                            block_table=self.block_table_cpu,
                            cache_seqlens=self.cache_seqlens_cpu,
                            topk=self.topk,
                            local=self.local_windows_len // self.block_size,
                        ),
                    )
            self.cpu_infer.sync_with_cuda_stream(
                torch.cuda.current_stream("cuda").cuda_stream
            )
            #            print("submit_with_cuda_stream finished\n")
            self.output_cuda.copy_(self.output_cpu, non_blocking=True)
            return self.output_cuda.transpose(1, 2)

    def save(self, path: str, length: int):
        cur_block_num = (length + self.block_size - 1) // self.block_size
        block_table_cpu = self.prefix_block_table[0, :cur_block_num].to("cpu")
        cache_seqlens_cpu = torch.tensor([length], device="cpu", dtype=torch.int32)
        self.cpu_infer.submit(
            self.local_thread.dump_kvcache(
                block_table_cpu,
                cache_seqlens_cpu,
                path,
            )
        )
        self.cpu_infer.sync()

    def load(self, path: str, length: int):
        self.cpu_infer.submit(
            self.local_thread.load_kvcache(
                path,
            )
        )
        self.cpu_infer.sync()


================================================
FILE: archive/ktransformers/operators/experts.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : Azure-Tang, Boxin Zhang, chenht2022
Date         : 2024-07-25 11:25:24
Version      : 0.1.0
LastEditors  : Azure 
LastEditTime : 2024-08-29 09:41:10
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''

from typing import Any, Union
import numpy as np
import numpy.typing as npt
from torch import Tensor, nn
import torch.nn.functional as F
import torch
import sys, os
from ktransformers.operators.base_operator import BaseInjectedModule
from tqdm import tqdm

sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Release"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
import cpuinfer_ext
from cpuinfer_ext.moe import MOEConfig, MOE
import ctypes
from ktransformers.util.custom_gguf import GGMLQuantizationType, translate_name_to_gguf
from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader, ModelLoader
from ktransformers.util.utils import InferenceState
from ktransformers.server.config.config import Config
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from abc import ABC, abstractmethod
from ktransformers.operators.linear import KLinearMarlin, KLinearTorch, KTransformersLinear
import time
from ktransformers.operators.cpuinfer import CPUInfer

try:
    import torch_npu
    from ktransformers.util.ascend.ascend_utils import get_tensor_parallel_size
    use_torch_npu = torch_npu.npu.is_available()
except:
    use_torch_npu = False


def deduplicate_and_sort(lst):
    return sorted(set(lst))
def generate_cuda_graphs(chunk_size: int) -> list:
    assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must <= 1024 or a multiple of 1024"
    base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size]

    if chunk_size <= 1024:
        return deduplicate_and_sort(base_list)

    multiples = [i for i in range(1024, chunk_size + 1, 1024)]

    return deduplicate_and_sort(base_list + multiples)
#cuda_graphs = [Config().chunk_size] 
if torch.cuda.is_available():
    cuda_graphs = generate_cuda_graphs(Config().chunk_size)
elif use_torch_npu:
    cuda_graphs = deduplicate_and_sort([1, 2, 3, 4])
else:
    cuda_graphs = 1
# class Base(BaseInjectedModule, ABC):
class KExpertsBase(ABC):
    def __init__(self, key: str, gguf_loader: GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, device: str = "cuda", **kwargs):
        # super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.key = key
        self.gguf_loader = gguf_loader
        self.config = config
        self.device = device
    
    @abstractmethod
    def forward(self, input_tensor, expert_ids, weights):
        pass

    @abstractmethod
    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu", warmup: bool = False):
        pass
    
    @abstractmethod
    def unload():
        pass

    def load_weights(self, override_key: str | None = None, device: str = "cpu"):
        res = {}
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        gate = None
        up = None
        down = None
        gate_type = None
        up_type = None
        down_type = None

        for key in keys:
            if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
                targets = [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight" ]
                tensors = self.load_multi(key, targets, device=device)
                gate = tensors[".ffn_gate_exps.weight"]
                up = tensors[".ffn_up_exps.weight"]
                down = tensors[".ffn_down_exps.weight"]
                gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
                up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
                down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
            elif self.gguf_loader.has_tensor(key + ".ffn_down.0.weight"):
                # for supporting  Mixtral-8x7B-Instuct  
                gate = []
                up = []
                down = []
                for i in range(8):
                    gatei, upi, downi = f".ffn_gate.{i}.weight", f".ffn_up.{i}.weight", f".ffn_down.{i}.weight"
                    targets = [gatei, upi, downi]
                    tensors = self.load_multi(key, targets, device=device)
                    gate_it, up_it, down_it = tensors[gatei], tensors[upi], tensors[downi]
                    gate.append(gate_it)
                    up.append(up_it)
                    down.append(down_it)
                gate = torch.stack(gate)
                up = torch.stack(up)
                down = torch.stack(down)
                gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate.0.weight"]["ggml_type"]
                up_type = self.gguf_loader.tensor_info[key + ".ffn_up.0.weight"]["ggml_type"]
                down_type = self.gguf_loader.tensor_info[key + ".ffn_down.0.weight"]["ggml_type"]
            else:
                raise ValueError(f"Experts {key} not found in gguf_loader")
            res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
        return res
    
    def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
        tensors = {}
        for k in keys:
            tensors[k] = self.gguf_loader.load_gguf_tensor(key + k, device=device)
        return tensors


class KExpertsCPU(KExpertsBase):
    input_tensor_cpu:Tensor = None
    expert_ids_cpu:Tensor = None
    weights_cpu:Tensor = None
    output_cpu:Tensor = None
    output_gpu_map:dict = {} # Manage output tensor buffer on different gpu
    #stream_map:dict = {} # Manage cuda stream on different gpu
    # @TODO add yaml
    CPU_INFER = CPUInfer(Config().cpu_infer)
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        n_routed_experts: int,
        orig_module: nn.Module = None,
        device: str = "cpu",
        out_device: str = "cuda", # this device mean which device the output should on. TODO: support cpu.
        **kwargs
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        assert device.lower() == "cpu", "KExpertsCPU can only be loaded on CPU"
        self.n_routed_experts = n_routed_experts
        self.out_device = out_device
        self.backend = kwargs.get("backend", "llamafile")

    def load(self, w: dict | nn.Parameter | tuple | None = None, device:str|None = None, warmup:bool = False):
        if use_torch_npu and get_tensor_parallel_size() != 1 and (
            not torch.distributed.is_initialized() or torch.distributed.get_rank() != 0):
            return

        if device:
            assert device.lower() == "cpu", "KExpertsCPU can only be loaded on CPU, Parameter \"device\" can be cpu or None."
        if w is None: w = self.load_weights()[self.key]
        self.gate = w["gate"]
        self.up = w["up"]
        self.down = w["down"]
        self.gate_type = w["gate_type"]
        self.up_type = w["up_type"]
        self.down_type = w["down_type"]
        gate_ptr = ctypes.addressof(
            ctypes.cast(self.gate.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
        )
        up_ptr = ctypes.addressof(
            ctypes.cast(self.up.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
        )
        down_ptr = ctypes.addressof(
            ctypes.cast(self.down.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
        )
        # print(self.gate_qtype, self.up_qtype, self.down_qtype)
        n_routed_experts = self.n_routed_experts
        self.cpu_infer = KExpertsCPU.CPU_INFER
        # n_routed_experts = len(self.orig_module)
        model_dtype = torch.get_default_dtype()
        if torch.xpu.is_available() and model_dtype == torch.float16:
            hidden_type = 1 # fp16
        else:
            hidden_type = 30 # bf16
        if self.backend == "llamafile":
            moe_config = MOEConfig(
                n_routed_experts,
                self.config.num_experts_per_tok,
                self.config.hidden_size,
                self.config.moe_intermediate_size,
                64,
                10,
                1024,
                self.config.hidden_act == 'silu',
                gate_ptr,
                up_ptr,
                down_ptr,
                self.gate_type,
                self.up_type,
                self.down_type,
                hidden_type, # TODO: get from model.dtype
            )
            self.moe = MOE(moe_config)
        elif self.backend == "AMXBF16":
            from cpuinfer_ext.moe import AMX_MOEConfig, AMXBF16_MOE
            assert self.gate_type == GGMLQuantizationType.BF16
            assert self.up_type == GGMLQuantizationType.BF16
            assert self.down_type == GGMLQuantizationType.BF16
            moe_config = AMX_MOEConfig(
                n_routed_experts,
                self.config.num_experts_per_tok,
                self.config.hidden_size,
                self.config.moe_intermediate_size,
                max(cuda_graphs) if isinstance(cuda_graphs, list) else Config().chunk_size,
                self.config.hidden_act == 'silu',
                gate_ptr,
                up_ptr,
                down_ptr,
            )
            self.moe = AMXBF16_MOE(moe_config)
            self.cpu_infer.submit(self.moe.load_weights())
            self.cpu_infer.sync()
        elif self.backend == "AMXInt8":
            from cpuinfer_ext.moe import AMX_MOEConfig, AMXInt8_MOE
            assert self.gate_type == GGMLQuantizationType.BF16
            assert self.up_type == GGMLQuantizationType.BF16
            assert self.down_type == GGMLQuantizationType.BF16
            moe_config = AMX_MOEConfig(
                n_routed_experts,
                self.config.num_experts_per_tok,
                self.config.hidden_size,
                self.config.moe_intermediate_size,
                max(cuda_graphs) if isinstance(cuda_graphs, list) else Config().chunk_size,
                self.config.hidden_act == 'silu',
                gate_ptr,
                up_ptr,
                down_ptr,
            )
            self.moe = AMXInt8_MOE(moe_config)
            self.cpu_infer.submit(self.moe.load_weights())
            self.cpu_infer.sync()
        # print(n_routed_experts, hidden_size, moe_intermediate_size)
        num_experts_per_tok = self.config.num_experts_per_tok
        if warmup:
            self.cpu_infer.submit(self.moe.warm_up())
            self.cpu_infer.sync()
        if self.out_device not in KExpertsCPU.output_gpu_map:
            if isinstance(cuda_graphs, list):
                KExpertsCPU.output_gpu_map[self.out_device] = [torch.zeros((cuda_graphs[i], self.config.hidden_size), device=self.out_device) for i in range(len(cuda_graphs))]
            else:
                KExpertsCPU.output_gpu_map[self.out_device] = torch.zeros((cuda_graphs, self.config.hidden_size), device=self.out_device)
        if KExpertsCPU.input_tensor_cpu == None:
            if isinstance(cuda_graphs, list):
                if use_torch_npu:
                    KExpertsCPU.input_tensor_cpu = [[torch.zeros((cuda_graphs[i], self.config.hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16)] for i in range(len(cuda_graphs))]
                    KExpertsCPU.expert_ids_cpu = [[torch.zeros((cuda_graphs[i], num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True)] for i in range(len(cuda_graphs))]
                    KExpertsCPU.weights_cpu = [[torch.zeros((cuda_graphs[i], num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True)] for i in range(len(cuda_graphs))]
                    KExpertsCPU.output_cpu = [[torch.zeros((cuda_graphs[i], self.config.hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16)] for i in range(len(cuda_graphs))]
                    KExpertsCPU.bsz_tensor_cpu = [[torch.tensor([cuda_graphs[i]], device="cpu", dtype=torch.int32, pin_memory=True)] for i in range(len(cuda_graphs))]                    
                else:
                    KExpertsCPU.input_tensor_cpu = [torch.zeros((cuda_graphs[i], self.config.hidden_size), device="cpu", pin_memory=True) for i in range(len(cuda_graphs))]
                    KExpertsCPU.expert_ids_cpu = [torch.zeros((cuda_graphs[i], num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True) for i in range(len(cuda_graphs))]
                    KExpertsCPU.weights_cpu = [torch.zeros((cuda_graphs[i], num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True) for i in range(len(cuda_graphs))]
                    KExpertsCPU.output_cpu = [torch.zeros((cuda_graphs[i], self.config.hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16) for i in range(len(cuda_graphs))]
                    KExpertsCPU.bsz_tensor_cpu = [torch.zeros((1), device="cpu", dtype=torch.int32, pin_memory=True) for i in range(len(cuda_graphs))]
            else:
                KExpertsCPU.input_tensor_cpu = torch.zeros((cuda_graphs, self.config.hidden_size), device="cpu", pin_memory=True)
                KExpertsCPU.expert_ids_cpu = torch.zeros((cuda_graphs, num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True)
                KExpertsCPU.weights_cpu = torch.zeros((cuda_graphs, num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True)
                if torch.xpu.is_available():
                    KExpertsCPU.output_cpu = torch.zeros((cuda_graphs, self.config.hidden_size), device="cpu", pin_memory=True, dtype=model_dtype)
                    KExpertsCPU.bsz_tensor_cpu = torch.ones((1), device="cpu", dtype=torch.int32, pin_memory=True)
                else:
                    KExpertsCPU.output_cpu = torch.zeros((cuda_graphs, self.config.hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16)
                    KExpertsCPU.bsz_tensor_cpu = torch.zeros((1), device="cpu", dtype=torch.int32, pin_memory=True)
            
    def submit_for_one_decode(self, input_tensor, expert_ids, weights, bsz_tensor=None, cuda_graph_idx=0):
        if bsz_tensor is None:
            bsz_tensor = torch.ones(1, device=input_tensor.device, dtype=torch.int32)
        if cuda_graph_idx != -1:
            KExpertsCPU.input_tensor_cpu[cuda_graph_idx].copy_(input_tensor, non_blocking=True)
            KExpertsCPU.expert_ids_cpu[cuda_graph_idx].copy_(expert_ids, non_blocking=True)
            KExpertsCPU.weights_cpu[cuda_graph_idx].copy_(weights, non_blocking=True)
            KExpertsCPU.bsz_tensor_cpu[cuda_graph_idx].copy_(bsz_tensor, non_blocking=True)
            self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream, self.moe.forward(1, expert_ids.size(-1), KExpertsCPU.expert_ids_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.weights_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.input_tensor_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.output_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.bsz_tensor_cpu[cuda_graph_idx].data_ptr()))
        else:
            KExpertsCPU.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
            KExpertsCPU.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
            KExpertsCPU.weights_cpu.copy_(weights, non_blocking=True)
            KExpertsCPU.bsz_tensor_cpu.copy_(bsz_tensor, non_blocking=True)
            self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream, self.moe.forward(1, expert_ids.size(-1), KExpertsCPU.expert_ids_cpu.data_ptr(), KExpertsCPU.weights_cpu.data_ptr(), KExpertsCPU.input_tensor_cpu.data_ptr(), KExpertsCPU.output_cpu.data_ptr(), KExpertsCPU.bsz_tensor_cpu.data_ptr()))
        

    def sync_for_one_decode(self, cuda_graph_idx=0):
        if cuda_graph_idx != -1:
            self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream)
            KExpertsCPU.output_gpu_map[self.out_device][cuda_graph_idx].copy_(KExpertsCPU.output_cpu[cuda_graph_idx], non_blocking=True)
            return KExpertsCPU.output_gpu_map[self.out_device][cuda_graph_idx]
        else:
            self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream)
            KExpertsCPU.output_gpu_map[self.out_device].copy_(KExpertsCPU.output_cpu, non_blocking=True)
            return KExpertsCPU.output_gpu_map[self.out_device]

    def forward(self, input_tensor, expert_ids, weights, bsz_tensor=None, cuda_graph_idx=0):
        # generate, capture and run cuda graph
        # print(expert_ids)
        if bsz_tensor is None and (not torch.xpu.is_available() or input_tensor.size(0) > 1):
            bsz_tensor = torch.tensor([input_tensor.size(0)], device=input_tensor.device, dtype=torch.int32)
        if torch.cuda.is_available() and torch.cuda.is_current_stream_capturing():
            if cuda_graph_idx != -1:
                KExpertsCPU.input_tensor_cpu[cuda_graph_idx].copy_(input_tensor, non_blocking=True)
                KExpertsCPU.expert_ids_cpu[cuda_graph_idx].copy_(expert_ids, non_blocking=True)
                KExpertsCPU.weights_cpu[cuda_graph_idx].copy_(weights, non_blocking=True)
                KExpertsCPU.bsz_tensor_cpu[cuda_graph_idx].copy_(bsz_tensor, non_blocking=True)
                self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(expert_ids.size(0), expert_ids.size(-1), KExpertsCPU.expert_ids_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.weights_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.input_tensor_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.output_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.bsz_tensor_cpu[cuda_graph_idx].data_ptr()))
                self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
                KExpertsCPU.output_gpu_map[self.out_device][cuda_graph_idx].copy_(KExpertsCPU.output_cpu[cuda_graph_idx], non_blocking=True)
                return KExpertsCPU.output_gpu_map[self.out_device][cuda_graph_idx]

            else:
                KExpertsCPU.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
                KExpertsCPU.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
                KExpertsCPU.weights_cpu.copy_(weights, non_blocking=True)
                KExpertsCPU.bsz_tensor_cpu.copy_(bsz_tensor, non_blocking=True)
                self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(expert_ids.size(0), expert_ids.size(-1), KExpertsCPU.expert_ids_cpu.data_ptr(), KExpertsCPU.weights_cpu.data_ptr(), KExpertsCPU.input_tensor_cpu.data_ptr(), KExpertsCPU.output_cpu.data_ptr(), KExpertsCPU.bsz_tensor_cpu.data_ptr()))
                self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
                KExpertsCPU.output_gpu_map[self.out_device].copy_(KExpertsCPU.output_cpu, non_blocking=True)
                return KExpertsCPU.output_gpu_map[self.out_device]
        elif input_tensor.size(0)==1 and torch.xpu.is_available():
            KExpertsCPU.input_tensor_cpu.copy_(input_tensor.view(-1), non_blocking=True)
            KExpertsCPU.expert_ids_cpu.copy_(expert_ids.view(-1), non_blocking=True)
            KExpertsCPU.weights_cpu.copy_(weights.view(-1), non_blocking=True)
            # KExpertsCPU.bsz_tensor_cpu.copy_(bsz_tensor.view(-1), non_blocking=True)
            self.cpu_infer.submit(self.moe.forward(expert_ids.size(0), expert_ids.size(1), KExpertsCPU.expert_ids_cpu.data_ptr(), KExpertsCPU.weights_cpu.data_ptr(), KExpertsCPU.input_tensor_cpu.data_ptr(), KExpertsCPU.output_cpu.data_ptr(), KExpertsCPU.bsz_tensor_cpu.data_ptr()))
            self.cpu_infer.sync()
            KExpertsCPU.output_gpu_map[self.out_device].copy_(KExpertsCPU.output_cpu, non_blocking=True)
            return KExpertsCPU.output_gpu_map[self.out_device].view(1, -1)
        else:
            input_tensor = input_tensor.contiguous().cpu()
            expert_ids = expert_ids.contiguous().cpu()
            weights = weights.contiguous().to(torch.float32).cpu()
            bsz_tensor = bsz_tensor.contiguous().cpu()
            output = torch.empty_like(input_tensor).contiguous()
            self.cpu_infer.submit(self.moe.forward(expert_ids.size(0), expert_ids.size(1), expert_ids.data_ptr(), weights.data_ptr(), input_tensor.data_ptr(), output.data_ptr(), bsz_tensor.data_ptr()))
            self.cpu_infer.sync()
            return output.to(device=object.__getattribute__(self, "out_device"))
    
    def unload(self):
        return

    def load_weights(self, override_key: str | None = None, device: str = "cpu"):
        # TODO: support Bias
        res = {}
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        gate = None
        up = None
        down = None
        gate_type = None
        up_type = None
        down_type = None

        for key in keys:
            if isinstance(self.gguf_loader, SafeTensorLoader):
                res = self.gguf_loader.load_experts(key)
                return {key: res}
            elif self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
                # gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
                # up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
                # down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
                gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate_exps.weight")
                up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up_exps.weight")
                down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down_exps.weight")
            
            elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
                gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
                up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
                down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
            elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info:
                # for supporting  Mixtral-8x7B-Instuct  
                gate = []
                up = []
                down = []
                for i in range(8):
                    gate_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_gate.{i}.weight")
                    up_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_up.{i}.weight")
                    down_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_down.{i}.weight")
                    gate.append(gate_it)
                    up.append(up_it)
                    down.append(down_it)
                gate = np.stack(gate)
                up = np.stack(up)
                down = np.stack(down)
                gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate.0.weight")
                up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up.0.weight")
                down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down.0.weight")
            elif self.gguf_loader.safetensor_loader is not None:
                # for npu
                # using a temp ugly way to temprary load the tensor
                translate_key = translate_name_to_gguf(key)
                gate = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_gate_exps.weight").numpy()
                up = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_up_exps.weight").numpy()
                down = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_down_exps.weight").numpy()
                gate_type = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_gate_exps.ggml_type").item()
                up_type = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_up_exps.ggml_type").item()
                down_type = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_down_exps.ggml_type").item()
            else:
                raise ValueError(f"Experts {key} not found in gguf_loader")
            res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
        return res
    
class KExpertsMarlin(KExpertsBase):
    expert_num: int
    loaded_experts_idx: list[int]
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        n_routed_experts: int,
        orig_module: nn.Module = None,
        device: str = "cuda",
        **kwargs
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.expert_num = n_routed_experts
        self.loaded_experts_idx = []
        self.act_fn = ACT2FN[config.hidden_act]
        assert device.lower() != "cpu", "Marlin experts can only be loaded on GPU"
        self.device = device
        self.elements_per_tensor = config.moe_intermediate_size * config.hidden_size

        # create empty marlin experts according to the number of experts per token
        # up
        self.up_projs = [KLinearMarlin(key+ "." + "ffn_up_exps", gguf_loader, config, device=device) for i in range(self.expert_num)]
        # gate
        self.gate_projs = [KLinearMarlin(key+ "." + "ffn_gate_exps", gguf_loader, config, device=device) for i in range(self.expert_num)]
        # down
        self.down_projs = [KLinearMarlin(key+ "." + "ffn_down_exps", gguf_loader, config, device=device) for i in range(self.expert_num)]

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False):
        if device is None: device = self.device
        assert device.lower() != "cpu", "Marlin experts can only be loaded on GPU"
        if w is None:
            w = self.load_weights()
            load_by_experts = True

        if load_by_experts:
            if isinstance(w, dict):
                self.gate = w["gate"]
                self.up = (w["up"])
                self.down = (w["down"])
                for i in tqdm(range(self.expert_num), desc=f"Dequanting and quanting for KExpertsMarlin {self.key}"):
                    up_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_up_exps.weight", self.up, i, self.elements_per_tensor, device=self.device)
                    gate_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_gate_exps.weight", self.gate, i, self.elements_per_tensor, device=self.device)
                    down_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_down_exps.weight", self.down, i, self.elements_per_tensor, device=self.device)
                    
                    self.up_projs[i].load(nn.Parameter(up_weights), device=device)
                    self.gate_projs[i].load(nn.Parameter(gate_weights), device=device)
                    self.down_projs[i].load(nn.Parameter(down_weights), device=device)
                    self.loaded_experts_idx.append(i)
        else:
            if isinstance(w, dict):
                self.gate = w["gate"]
                self.up = (w["up"])
                self.down = (w["down"])
                for i in range(self.expert_num):
                    self.up_projs[i].load(nn.Parameter(self.up[i,...]), device=device)
                    self.gate_projs[i].load(nn.Parameter(self.gate[i,...]), device=device)
                    self.down_projs[i].load(nn.Parameter(self.down[i,...]), device=device)
                    self.loaded_experts_idx.append(i)
        return 

    def unload(self):
        for i in self.loaded_experts_idx:
            self.up_projs[i].unload()
            self.gate_projs[i].unload()
            self.down_projs[i].unload()
        self.loaded_experts_idx = []

    def load_weights(self, override_key: str | None = None):
        res = {}
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        gate = None
        up = None
        down = None

        for key in keys:
            if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
            res = {"gate": gate, "up": up, "down": down}
        return res

    def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
        org_dtype = hidden_states_cpu.dtype
        org_device = hidden_states_cpu.device
        hidden_states_cpu = hidden_states_cpu.to(self.device)
        selected_experts_cpu = selected_experts_cpu.to(self.device)
        routing_weights_cpu = routing_weights_cpu.to(self.device).to(org_dtype)
        
        batch_sequence_length, hidden_dim = hidden_states_cpu.size()

        final_hidden_states = torch.zeros(
            (batch_sequence_length, hidden_dim), dtype=hidden_states_cpu.dtype, device=hidden_states_cpu.device
        )
        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts_cpu, num_classes=self.expert_num).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.expert_num):
            if not expert_mask[expert_idx].any():
                continue
            idx, top_x = torch.where(expert_mask[expert_idx])
            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states_cpu[None, top_x].reshape(-1, hidden_dim)
            G = self.gate_projs[expert_idx].forward(current_state)
            A = self.act_fn(G)
            U = self.up_projs[expert_idx].forward(current_state)
            H = A * U  # Element-wise multiplication
            current_hidden_states = self.down_projs[expert_idx].forward(H) * routing_weights_cpu[top_x, idx, None]
            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states)
        
        return final_hidden_states.to(dtype=org_dtype, device=org_device)
    
# untested, CUDA OOM
class KExpertsTorch(KExpertsBase):
    expert_num: int
    loaded_experts_idx: list[int]
    gate: torch.Tensor
    up: torch.Tensor
    down: torch.Tensor
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        n_routed_experts: int,
        orig_module: nn.Module = None,
        device: str = "cpu",
        **kwargs
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.expert_num = n_routed_experts
        # self.loaded_experts_idx = []
        self.act_fn = ACT2FN[config.hidden_act]
        self.device = device
        self.elements_per_tensor = config.moe_intermediate_size * config.hidden_size
        self.gate = [None for _ in range(self.expert_num)]
        self.up = [None for _ in range(self.expert_num)]
        self.down = [None for _ in range(self.expert_num)]
        self.dtype = torch.get_default_dtype()

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False):
        if device is None: device = self.device
        if w is None:
            w = self.load_weights()
            load_by_experts = True

        if load_by_experts:
            if isinstance(w, dict):
                for i in tqdm(range(self.expert_num), desc=f"Dequanting for KExpertsTorch {self.key}"):
                    up_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_up_exps.weight", w["up"], i, self.elements_per_tensor, device=self.device)
                    gate_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_gate_exps.weight", w["gate"], i, self.elements_per_tensor, device=self.device)
                    down_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_down_exps.weight", w["down"], i, self.elements_per_tensor, device=self.device)
                    
                    self.up[i] = up_weights
                    self.gate[i] = gate_weights
                    self.down[i] = down_weights
        else:
            if isinstance(w, dict):
                for i in range(self.expert_num):
                    self.gate[i] = w["gate"][i, ...].to(device=device, dtype=self.dtype)
                    self.up[i] = w["up"][i, ...].to(device=device, dtype=self.dtype)
                    self.down[i] = w["down"][i, ...].to(device=device, dtype=self.dtype)
        
        self.up = torch.stack(self.up, dim=0)
        self.gate = torch.stack(self.gate, dim=0)
        self.down = torch.stack(self.down, dim=0)
        return 

    def unload(self):
        if self.gate is not None:
            self.gate = None
            self.up = None
            self.down = None

    def load_weights(self, override_key: str | None = None):
        res = {}
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        gate = None
        up = None
        down = None

        for key in keys:
            if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
            res = {"gate": gate, "up": up, "down": down}
        return res

    def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:

        org_device = hidden_states_cpu.device
        hidden_states_cpu = hidden_states_cpu.to(self.device)
        selected_experts_cpu = selected_experts_cpu.to(self.device)
        routing_weights_cpu = routing_weights_cpu.to(self.device)
        
        batch_sequence_length, hidden_dim = hidden_states_cpu.size()

        final_hidden_states = torch.zeros(
            (batch_sequence_length, hidden_dim), dtype=self.gate.dtype, device=hidden_states_cpu.device
        )
        org_dtype = hidden_states_cpu.dtype
        hidden_states_cpu = hidden_states_cpu.to(self.gate.dtype)
        routing_weights_cpu = routing_weights_cpu.to(self.gate.dtype)
        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts_cpu, num_classes=self.expert_num).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.expert_num):
            idx, top_x = torch.where(expert_mask[expert_idx])
            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states_cpu[None, top_x].reshape(-1, hidden_dim)
            G = current_state @ self.gate[expert_idx,...].T
            A = self.act_fn(G)
            U = current_state @ self.up[expert_idx,...].T
            H = A * U  # Element-wise multiplication
            current_hidden_states = H @ self.down[expert_idx,...].T * routing_weights_cpu[top_x, idx, None]
            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states)


        return final_hidden_states.to(dtype=org_dtype, device=org_device)

EXPERTS_MAP = {
    "KExpertsCPU": KExpertsCPU,
    "KExpertsTorch": KExpertsTorch,
    "KExpertsMarlin": KExpertsMarlin,
}

class KTransformersExperts(BaseInjectedModule, KExpertsBase):
    def __init__(self,
                 key: str,
                 gguf_loader: GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                #  device: str = "cuda",
                 prefill_device:str = "cuda",
                 prefill_op: str | None = "KExpertsTorch",
                 generate_device: str = "cpu",
                 generate_op: str | None = "KExpertsCPU",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        KExpertsBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        if generate_op is not None:
            self.generate_experts = EXPERTS_MAP[generate_op](key, gguf_loader, config, len(orig_module), device=generate_device, **kwargs)
        else:
            self.generate_experts = None
        if prefill_op is not None:
            self.prefill_experts = EXPERTS_MAP[prefill_op](key, gguf_loader, config, len(orig_module), device=prefill_device, **kwargs)
        else:
            self.prefill_experts = None
        self.gpu_mlp_type = prefill_op
        self.cpu_mlp_type = generate_op
        self.mode = InferenceState.UNLOAD

    def load(self, w: dict = None,  mode: InferenceState = None, warmup: bool = True):
        # TODO support w as input
        if not mode: mode = InferenceState.GENERATE
        if mode == InferenceState.GENERATE:
            self.prefill_experts.unload()
            self.generate_experts.load(w, warmup=warmup)
            self.device = self.generate_experts.device
            self.mode = mode
        elif mode == InferenceState.PREFILL:
            self.generate_experts.unload()
            self.prefill_experts.load(w, warmup=warmup)
            self.device = self.prefill_experts.device
            self.mode = mode
        elif mode == InferenceState.UNLOAD:
            self.unload()
            self.mode = mode
            self.device = self.generate_experts.device
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")

    def unload(self):
        if self.generate_experts is not None:
            self.generate_experts.unload()
        if self.prefill_experts is not None:
            self.prefill_experts.unload()
        self.device = self.generate_experts.device

    def forward(self, input_tensor, expert_ids, weights):
        if self.mode == InferenceState.GENERATE:
            assert self.generate_experts is not None, "generate_experts is None"
            return self.generate_experts.forward(input_tensor, expert_ids, weights)
        elif self.mode == InferenceState.PREFILL:
            assert self.prefill_experts is not None, "prefill_experts is None"
            return self.prefill_experts.forward(input_tensor, expert_ids, weights)
        else:
            raise ValueError("load or set_inference_mode before forward")

    def set_inference_mode(self, mode: InferenceState):
        if mode == InferenceState.GENERATE:
            self.load(mode=InferenceState.GENERATE, warmup=False)
        elif mode == InferenceState.PREFILL:
            self.load(mode=InferenceState.PREFILL, warmup=False)
        elif mode == InferenceState.UNLOAD:
            self.unload()
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")


from ktransformers.models.modeling_deepseek import DeepseekV2MoE
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MoE
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock
from ktransformers.models.modeling_mixtral import MixtralSparseMoeBlock
from ktransformers.models.modeling_smallthinker import SmallthinkerMoeBlock
from ktransformers.models.modeling_glm4_moe import Glm4MoeMoE
from ktransformers.models.modeling_qwen3_next import Qwen3NextSparseMoeBlock


class KQwen2MoeSparseMoeBlock(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        orig_shape = hidden_states.shape
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        if self.norm_topk_prob:
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)
        
        if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"):
            self.experts.generate_experts.submit_for_one_decode(hidden_states[0], selected_experts[0], routing_weights[0])
            shared_expert_output = self.shared_expert(hidden_states)
            shared_expert_output = F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output
            y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)
            y += shared_expert_output
            y.resize_(*orig_shape)
            return y, router_logits
        
        hidden_states_expert = hidden_states.to(self.experts.device)  if isinstance(self.experts, KExpertsBase) else hidden_states.cpu()
        selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts.cpu()
        routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights.cpu()

        shared_expert_output = self.shared_expert(hidden_states)
        shared_expert_output = (
            F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output
        )

        if isinstance(self.experts, KExpertsBase):
            y = (
                self.moe_kexperts(
                    hidden_states_expert, selected_experts_expert, routing_weights_expert
                )
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        elif hidden_states_expert.size(0) > 10:
            y = self.moe_infer(
                hidden_states_expert, selected_experts_expert, routing_weights_expert, orig_shape
            ).to(device=hidden_states.device)
        else:
            y = self.moe_infer_simple(
                hidden_states_expert, selected_experts_expert, routing_weights_expert
            ).to(device=hidden_states.device)
        y += shared_expert_output
        y.resize_(*orig_shape)
        return y, router_logits
    
    @torch.no_grad()
    def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
        outs = self.experts(x, topk_ids, topk_weight)
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer_simple(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
        '''
        hidden_states_cpu: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        '''
        outs = torch.zeros_like(hidden_states_cpu)
        for token_idx in range(selected_experts_cpu.size(0)):
            for expert_idx in range(selected_experts_cpu.size(1)):
                expert = self.experts[selected_experts_cpu[token_idx, expert_idx]]
                outs[token_idx] += expert.forward(hidden_states_cpu[token_idx]) * routing_weights_cpu[token_idx, expert_idx]
        return outs
    
    @torch.no_grad()
    # TODO may bugs here
    def moe_infer(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor, orig_shape: tuple) -> torch.Tensor:
        
        batch_size, sequence_length, hidden_dim = orig_shape

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states_cpu.dtype, device=hidden_states_cpu.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts_cpu, num_classes=self.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states_cpu[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer.forward(current_state) * routing_weights_cpu[top_x, idx, None]

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states_cpu.dtype))

        return final_hidden_states

class KDeepseekV2MoE(BaseInjectedModule, DeepseekV2MoE):
    def forward(self, hidden_states):
        identity = hidden_states
        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]
        topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        
        if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing():
            self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
            if self.config.n_shared_experts is not None:
                y_ = self.shared_experts(identity).squeeze(0)
            y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)
            y += y_
            y.resize_(*orig_shape)
            return y

        if self.config.n_shared_experts is not None:
            y_ = self.shared_experts(identity).squeeze(0)
            
        if isinstance(self.experts, KExpertsBase):
            y = self.moe_kexperts(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        if self.config.n_shared_experts is not None:
            y += y_
        return y

    @torch.no_grad()
    def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
        outs = self.experts(x, topk_ids, topk_weight)
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer_simple(
        self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out

class KDeepseekV3MoE(BaseInjectedModule, DeepseekV3MoE):
    
    def forward(self, hidden_states):
        identity = hidden_states
        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]
        topk_idx, topk_weight = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        
        # only for generate phase
        if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing():
            self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
            if self.config.n_shared_experts is not None:
                y_ = self.shared_experts(identity).squeeze(0)
            y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)
            y += y_
            y.resize_(*orig_shape)
            return y

        if self.config.n_shared_experts is not None:
            y_ = self.shared_experts(identity).squeeze(0)
            
        if isinstance(self.experts, KExpertsBase):
            y = self.moe_kexperts(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        if self.config.n_shared_experts is not None:
            y += y_
        return y


    @torch.no_grad()
    def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
        outs = self.experts(x, topk_ids, topk_weight)
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer_simple(
        self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out

class KMistralSparseMoEBlock(BaseInjectedModule, MixtralSparseMoeBlock):
    
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        orig_shape = hidden_states.shape
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        if self.training and self.jitter_noise > 0:
            hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise)
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)
        
        if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"):
            self.experts.generate_experts.submit_for_one_decode(hidden_states[0], selected_experts[0], routing_weights[0])
            y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)
            y.resize_(*orig_shape)
            return y, router_logits
        
        hidden_states_expert = hidden_states.to(self.experts.device)  if isinstance(self.experts, KExpertsBase) else hidden_states_expert.cpu()
        selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts_expert.cpu()
        routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights_expert.cpu()

        if isinstance(self.experts, KExpertsBase):
            y = (
                self.moe_kexperts(
                    hidden_states_expert, selected_experts_expert, routing_weights_expert
                )
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        elif hidden_states_expert.size(0) > 10:
            y = self.moe_infer(
                hidden_states_expert, selected_experts_expert, routing_weights_expert, orig_shape
            ).to(device=hidden_states.device)
        else:
            y = self.moe_infer_simple(
                hidden_states_expert, selected_experts_expert, routing_weights_expert
            ).to(device=hidden_states.device)
            
        y.resize_(*orig_shape)
        return y, router_logits
    
    @torch.no_grad()
    def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
        outs = self.experts(x, topk_ids, topk_weight)
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer_simple(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
        '''
        hidden_states_cpu: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        '''
        outs = torch.zeros_like(hidden_states_cpu)
        for token_idx in range(selected_experts_cpu.size(0)):
            for expert_idx in range(selected_experts_cpu.size(1)):
                expert = self.experts[selected_experts_cpu[token_idx, expert_idx]]
                outs[token_idx] += expert.forward(hidden_states_cpu[token_idx]) * routing_weights_cpu[token_idx, expert_idx]
        return outs
    
    @torch.no_grad()
    # TODO may bugs here
    def moe_infer(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor, orig_shape: tuple) -> torch.Tensor:
        
        batch_size, sequence_length, hidden_dim = orig_shape

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states_cpu.dtype, device=hidden_states_cpu.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts_cpu, num_classes=self.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states_cpu[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer.forward(current_state) * routing_weights_cpu[top_x, idx, None]

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states_cpu.dtype))

        return final_hidden_states

class KDeepseekV3MoEV2(BaseInjectedModule, DeepseekV3MoE):
    def forward(self, hidden_states, bsz_tensor, cuda_graph_idx=0):
        identity = hidden_states
        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]
        topk_idx, topk_weight = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        

        # only for generate phase
        if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
            self.experts.generate_experts.submit_for_one_decode(hidden_states, topk_idx, topk_weight, bsz_tensor, cuda_graph_idx)
            if self.config.n_shared_experts is not None:
                y_ = self.shared_experts(identity, bsz_tensor).squeeze(0)
            y = self.experts.generate_experts.sync_for_one_decode(cuda_graph_idx).unsqueeze(0)
            y += y_
            y.resize_(*orig_shape)
            return y

        if self.config.n_shared_experts is not None:
            y_ = self.shared_experts(identity, bsz_tensor).squeeze(0)
            
        if isinstance(self.experts, KExpertsBase):
            y = self.moe_on_cpuinfer(hidden_states, topk_idx, topk_weight, bsz_tensor, cuda_graph_idx).view(*orig_shape).to(device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        if self.config.n_shared_experts is not None:
            y += y_
        return y

    @torch.no_grad()
    def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor, bsz_tensor, cuda_graph_idx=0) -> torch.Tensor:
        outs = torch.empty_like(x)
        outs = self.experts(x, topk_ids, topk_weight, bsz_tensor, cuda_graph_idx)
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer_simple(
        self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out

class KTransformersExpertsV2(BaseInjectedModule, KExpertsBase):
    def __init__(self,
                 key: str,
                 gguf_loader: GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                #  device: str = "cuda",
                 prefill_device:str = "cuda",
                 prefill_op: str | None = "KExpertsTorch",
                 generate_device: str = "cpu",
                 generate_op: str | None = "KExpertsCPU",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        KExpertsBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)

        if prefill_op == 'None':
            prefill_op = None
        if generate_op == 'None':
            generate_op = None

        if generate_op is not None:
            self.generate_experts = EXPERTS_MAP[generate_op](key, gguf_loader, config, len(orig_module), device=generate_device, **kwargs)
        else:
            self.generate_experts = None
        if prefill_op is not None:
            self.prefill_experts = EXPERTS_MAP[prefill_op](key, gguf_loader, config, len(orig_module), device=prefill_device, **kwargs)
        else:
            self.prefill_experts = None
        self.gpu_mlp_type = prefill_op
        self.cpu_mlp_type = generate_op
        self.mode = InferenceState.UNLOAD

    def load(self, w: dict = None,  mode: InferenceState = None, warmup: bool = True):
        # TODO support w as input
        if not mode: mode = InferenceState.GENERATE
        if mode == InferenceState.GENERATE:
            self.prefill_experts.unload()
            self.generate_experts.load(w, warmup=warmup)
            self.device = self.generate_experts.device
            self.mode = mode
        elif mode == InferenceState.PREFILL:
            self.generate_experts.unload()
            self.prefill_experts.load(w, warmup=warmup)
            self.device = self.prefill_experts.device
            self.mode = mode
        elif mode == InferenceState.UNLOAD:
            self.unload()
            self.mode = mode
            self.device = self.generate_experts.device
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")

    def unload(self):
        if self.generate_experts is not None:
            self.generate_experts.unload()
        if self.prefill_experts is not None:
            self.prefill_experts.unload()
        self.device = self.generate_experts.device

    def forward(self, input_tensor, expert_ids, weights, bsz_tensor, cuda_graph_idx=0):
        if self.mode == InferenceState.GENERATE:
            assert self.generate_experts is not None, "generate_experts is None"
            return self.generate_experts.forward(input_tensor, expert_ids, weights, bsz_tensor, cuda_graph_idx)
        elif self.mode == InferenceState.PREFILL:
            assert self.prefill_experts is not None, "prefill_experts is None"
            return self.prefill_experts.forward(input_tensor, expert_ids, weights, bsz_tensor, cuda_graph_idx)
        else:
            raise ValueError("load or set_inference_mode before forward")

    def set_inference_mode(self, mode: InferenceState):
        if mode == InferenceState.GENERATE:
            self.load(mode=InferenceState.GENERATE, warmup=False)
        elif mode == InferenceState.PREFILL:
            self.load(mode=InferenceState.PREFILL, warmup=False)
        elif mode == InferenceState.UNLOAD:
            self.unload()
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")


class KSmallthinkerExperts(BaseInjectedModule, KExpertsBase):
    def __init__(self,
                 key: str,
                 gguf_loader: GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                #  device: str = "cuda",
                 prefill_device:str = "cuda",
                 prefill_op: str | None = "KExpertsTorch",
                 generate_device: str = "cpu",
                 generate_op: str | None = "KExpertsCPU",
                 **kwargs):

        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        KExpertsBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        if generate_op is not None:
            self.generate_experts = EXPERTS_MAP[generate_op](key, gguf_loader, config, len(orig_module), device=generate_device, **kwargs)
        else:
            self.generate_experts = None
        if prefill_op is not None:
            self.prefill_experts = None
        self.gpu_mlp_type = prefill_op
        self.cpu_mlp_type = generate_op
        self.mode = InferenceState.UNLOAD

    def load(self, w: dict = None,  mode: InferenceState = None, warmup: bool = True):
        # TODO support w as input
        if not mode: mode = InferenceState.GENERATE
        if mode == InferenceState.GENERATE:
            # self.prefill_experts.unload()
            self.generate_experts.load(w, warmup=warmup)
            self.device = self.generate_experts.device
            self.mode = mode
        elif mode == InferenceState.PREFILL:
            self.generate_experts.unload()
            self.prefill_experts.load(w, warmup=warmup)
            self.device = self.prefill_experts.device
            self.mode = mode
        elif mode == InferenceState.UNLOAD:
            self.unload()
            self.mode = mode
            self.device = self.generate_experts.device
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")

    def unload(self):
        if self.generate_experts is not None:
            self.generate_experts.unload()
        if self.prefill_experts is not None:
            self.prefill_experts.unload()
        self.device = self.generate_experts.device

    def forward(self, input_tensor, expert_ids, weights, bsz_tensor, cuda_graph_idx=0):
        if self.mode == InferenceState.GENERATE:
            assert self.generate_experts is not None, "generate_experts is None"
            return self.generate_experts.forward(input_tensor, expert_ids, weights, bsz_tensor, cuda_graph_idx)
        elif self.mode == InferenceState.PREFILL:
            assert self.prefill_experts is not None, "prefill_experts is None"
            return self.prefill_experts.forward(input_tensor, expert_ids, weights, bsz_tensor, cuda_graph_idx)
        else:
            raise ValueError("load or set_inference_mode before forward")

    def set_inference_mode(self, mode: InferenceState):
        if mode == InferenceState.GENERATE:
            self.load(mode=InferenceState.GENERATE, warmup=False)
        elif mode == InferenceState.PREFILL:
            self.load(mode=InferenceState.PREFILL, warmup=False)
        elif mode == InferenceState.UNLOAD:
            self.unload()
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")

class KGlm4Experts(BaseInjectedModule, KExpertsBase):
    def __init__(self,
                 key: str,
                 gguf_loader: GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                #  device: str = "cuda",
                 prefill_device:str = "cuda",
                 prefill_op: str | None = "KExpertsTorch",
                 generate_device: str = "cpu",
                 generate_op: str | None = "KExpertsCPU",
                 **kwargs):

        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        KExpertsBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        if generate_op is not None:
            self.generate_experts = EXPERTS_MAP[generate_op](key, gguf_loader, config, len(orig_module), device=generate_device, **kwargs)
        else:
            self.generate_experts = None
        if prefill_op is not None:
            self.prefill_experts = None
        self.gpu_mlp_type = prefill_op
        self.cpu_mlp_type = generate_op
        self.mode = InferenceState.UNLOAD

    def load(self, w: dict = None,  mode: InferenceState = None, warmup: bool = True):
        # TODO support w as input
        if not mode: mode = InferenceState.GENERATE
        if mode == InferenceState.GENERATE:
            # self.prefill_experts.unload()
            self.generate_experts.load(w, warmup=warmup)
            self.device = self.generate_experts.device
            self.mode = mode
        elif mode == InferenceState.PREFILL:
            self.generate_experts.unload()
            self.prefill_experts.load(w, warmup=warmup)
            self.device = self.prefill_experts.device
            self.mode = mode
        elif mode == InferenceState.UNLOAD:
            self.unload()
            self.mode = mode
            self.device = self.generate_experts.device
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")

    def unload(self):
        if self.generate_experts is not None:
            self.generate_experts.unload()
        if self.prefill_experts is not None:
            self.prefill_experts.unload()
        self.device = self.generate_experts.device

    def forward(self, input_tensor, expert_ids, weights, bsz_tensor, cuda_graph_idx=0):
        if self.mode == InferenceState.GENERATE:
            assert self.generate_experts is not None, "generate_experts is None"
            return self.generate_experts.forward(input_tensor, expert_ids, weights, bsz_tensor, cuda_graph_idx)
        elif self.mode == InferenceState.PREFILL:
            assert self.prefill_experts is not None, "prefill_experts is None"
            return self.prefill_experts.forward(input_tensor, expert_ids, weights, bsz_tensor, cuda_graph_idx)
        else:
            raise ValueError("load or set_inference_mode before forward")

    def set_inference_mode(self, mode: InferenceState):
        if mode == InferenceState.GENERATE:
            self.load(mode=InferenceState.GENERATE, warmup=False)
        elif mode == InferenceState.PREFILL:
            self.load(mode=InferenceState.PREFILL, warmup=False)
        elif mode == InferenceState.UNLOAD:
            self.unload()
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")


class KQwen2MoeSparseMoeBlockV2(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
    def forward(self, hidden_states, bsz_tensor, cuda_graph_idx=0):

        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]

        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

        router_logits = self.gate(hidden_states, bsz_tensor)        

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        if self.norm_topk_prob:
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        # only for generate phase
        if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
            self.experts.generate_experts.submit_for_one_decode(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx)
            y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
            y_ = F.sigmoid(self.shared_expert_gate(hidden_states)) * y_    

            y = self.experts.generate_experts.sync_for_one_decode(cuda_graph_idx).unsqueeze(0)
            
            y += y_
            y.resize_(*orig_shape)
            return y

        y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
        y_ = (
            F.sigmoid(self.shared_expert_gate(hidden_states)) * y_
        )


        if isinstance(self.experts, KExpertsBase):
            y = self.moe_on_cpuinfer(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx).view(*orig_shape).to(device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            ) 
        y += y_
        return y

    @torch.no_grad()
    def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor, bsz_tensor, cuda_graph_idx=0) -> torch.Tensor:
        outs = torch.empty_like(x)
        outs = self.experts(x, topk_ids, topk_weight, bsz_tensor, cuda_graph_idx)
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer_simple(
        self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out

class KQwen3MoeSparseMoeBlockV2(BaseInjectedModule, Qwen3MoeSparseMoeBlock):
    def forward(self, hidden_states, bsz_tensor=None, cuda_graph_idx=0):

        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]

        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

        if bsz_tensor is None:
            router_logits = self.gate(hidden_states)
        else:
            router_logits = self.gate(hidden_states, bsz_tensor)

        if router_logits.device.type == "xpu":
            from ipex_llm.transformers.models.common import moe_softmax_topk
            selected_experts, routing_weights = moe_softmax_topk(
                router_logits.half(), self.top_k, self.norm_topk_prob
            )
        else:
            routing_weights = torch.nn.functional.softmax(router_logits, dim=1, dtype=torch.float)
            routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
            if self.norm_topk_prob:
                routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        # only for generate phase
        if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
            self.experts.generate_experts.submit_for_one_decode(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx)
            # y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
            # y_ = F.sigmoid(self.shared_expert_gate(hidden_states)) * y_    

            y = self.experts.generate_experts.sync_for_one_decode(cuda_graph_idx).unsqueeze(0)
            
            # y += y_
            y.resize_(*orig_shape)
            return y

        # y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
        # y_ = (
        #     F.sigmoid(self.shared_expert_gate(hidden_states)) * y_
        # )


        if isinstance(self.experts, KExpertsBase):
            y = self.moe_on_cpuinfer(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx).view(*orig_shape).to(device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            ) 
        # y += y_
        return y

    @torch.no_grad()
    def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor, bsz_tensor, cuda_graph_idx=0) -> torch.Tensor:
        outs = torch.empty_like(x)
        outs = self.experts(x, topk_ids, topk_weight, bsz_tensor, cuda_graph_idx)
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer_simple(
        self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out


class KSmallthinkerMoeBlock(BaseInjectedModule, SmallthinkerMoeBlock):
    def forward(self, router_input: torch.Tensor, hidden_states: torch.Tensor, bsz_tensor=None, cuda_graph_idx=0):

        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]

        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

        if bsz_tensor is None:
            if self.enable_early_router:
                router_logits = self.primary_router(router_input)
            else:
                router_logits = self.primary_router(hidden_states)
        else:
            if self.enable_early_router:
                router_logits = self.primary_router(router_input, bsz_tensor)
            else:
                router_logits = self.primary_router(hidden_states, bsz_tensor)

        router_logits, selected_experts = torch.topk(router_logits, self.num_active_primary_experts, dim=-1)


        if router_logits.device.type == "xpu":
            # TODO: support self.moe_primary_router_apply_softmax False case
            from ipex_llm.transformers.models.common import moe_softmax_topk
            selected_experts, routing_weights = moe_softmax_topk(
                router_logits.half(), self.top_k, self.norm_topk_prob
            )
        else:
            if self.moe_primary_router_apply_softmax:
                routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
            else:
                routing_weights = F.sigmoid(router_logits)
                routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        # only for generate phase
        if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
            self.experts.generate_experts.submit_for_one_decode(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx)
            # y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
            # y_ = F.sigmoid(self.shared_expert_gate(hidden_states)) * y_    

            y = self.experts.generate_experts.sync_for_one_decode(cuda_graph_idx).unsqueeze(0)
            
            # y += y_
            y.resize_(*orig_shape)
            return y

        # y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
        # y_ = (
        #     F.sigmoid(self.shared_expert_gate(hidden_states)) * y_
        # )


        if isinstance(self.experts, KExpertsBase):
            y = self.moe_on_cpuinfer(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx).view(*orig_shape).to(device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            ) 
        # y += y_
        return y

    @torch.no_grad()
    def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor, bsz_tensor, cuda_graph_idx=0) -> torch.Tensor:
        outs = torch.empty_like(x)
        outs = self.experts(x, topk_ids, topk_weight, bsz_tensor, cuda_graph_idx)
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer_simple(
        self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out


class KGlm4MoeMoE(BaseInjectedModule, Glm4MoeMoE):
    def forward(self, hidden_states, bsz_tensor=None, cuda_graph_idx=0):

        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]

        topk_idx, topk_weight = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

        # only for generate phase
        if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
            self.experts.generate_experts.submit_for_one_decode(hidden_states, topk_idx, topk_weight, bsz_tensor, cuda_graph_idx)
            y_ = self.shared_experts(hidden_states, bsz_tensor).squeeze(0)
            # y_ = F.sigmoid(self.shared_expert_gate(hidden_states)) * y_    

            y = self.experts.generate_experts.sync_for_one_decode(cuda_graph_idx).unsqueeze(0)
            
            y += y_
            y.resize_(*orig_shape)
            return y

        y_ = self.shared_experts(hidden_states, bsz_tensor).squeeze(0)
        # y_ = (
        #     F.sigmoid(self.shared_expert_gate(hidden_states)) * y_
        # )


        if isinstance(self.experts, KExpertsBase):
            y = self.moe_on_cpuinfer(hidden_states, topk_idx, topk_weight, bsz_tensor, cuda_graph_idx).view(*orig_shape).to(device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            ) 
        y += y_
        return y

    @torch.no_grad()
    def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor, bsz_tensor, cuda_graph_idx=0) -> torch.Tensor:
        outs = torch.empty_like(x)
        outs = self.experts(x, topk_ids, topk_weight, bsz_tensor, cuda_graph_idx)
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer_simple(
        self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out


class KQwen3NextSparseMoeBlockV2(BaseInjectedModule, Qwen3NextSparseMoeBlock):
    def forward(self, hidden_states, bsz_tensor=None, cuda_graph_idx=0):

        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]

        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

        if bsz_tensor is None:
            router_logits = self.gate(hidden_states)
        else:
            router_logits = self.gate(hidden_states, bsz_tensor)

        if router_logits.device.type == "xpu":
            from ipex_llm.transformers.models.common import moe_softmax_topk
            selected_experts, routing_weights = moe_softmax_topk(
                router_logits.half(), self.top_k, self.norm_topk_prob
            )
        else:
            routing_weights = torch.nn.functional.softmax(router_logits, dim=1, dtype=torch.float)
            routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
            if self.norm_topk_prob:
                routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        if self.norm_topk_prob:
            routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)

        # only for generate phase
        if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
            self.experts.generate_experts.submit_for_one_decode(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx)
            y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
            y_ = F.sigmoid(self.shared_expert_gate(hidden_states)) * y_    

            y = self.experts.generate_experts.sync_for_one_decode(cuda_graph_idx).unsqueeze(0)
            
            y += y_
            y.resize_(*orig_shape)
            return y

        y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
        y_ = (
            F.sigmoid(self.shared_expert_gate(hidden_states)) * y_
        )


        if isinstance(self.experts, KExpertsBase):
            y = self.moe_on_cpuinfer(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx).view(*orig_shape).to(device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            ) 
        y += y_
        return y

    @torch.no_grad()
    def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor, bsz_tensor, cuda_graph_idx=0) -> torch.Tensor:
        outs = torch.empty_like(x)
        outs = self.experts(x, topk_ids, topk_weight, bsz_tensor, cuda_graph_idx)
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer_simple(
        self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @torch.no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out

================================================
FILE: archive/ktransformers/operators/flashinfer_batch_prefill_wrapper.py
================================================
import torch
import flashinfer
import gc
try:
    from flash_attn import flash_attn_with_kvcache
    print("found flash_attn")
    
except ImportError:
    print("flash_attn not found, flashinfer unit test needed it. If you are using balance serve, ignore this.")

from typing import Union, Optional

def setup_seed(seed):
	torch.manual_seed(seed)
	torch.cuda.manual_seed_all(seed)

setup_seed(998244353)

try:
    import torch_npu
    use_torch_npu = torch_npu.npu.is_available()
except:
    use_torch_npu = False

if not use_torch_npu:
	torch.set_grad_enabled(False)
	torch.set_default_dtype(torch.bfloat16)
	global_dtype=torch.bfloat16
	global_device=torch.device("cuda",0)
	torch.cuda.set_device(0)
	torch.backends.cudnn.enabled =True
	torch.backends.cudnn.benchmark = True

class flashInferAttn():
	
	float_workspace_buffer = None
	def __init__(self,
			max_batch_token,
			max_batch_size,
			max_pages,
			device = "cuda:0",
			kv_layout: str = "NHD",
			use_cuda_graph: bool = False,
			) -> None:
		self.device = device
		self.max_batch_token = max_batch_token
		self.kv_layout = kv_layout
		self.use_cuda_graph = use_cuda_graph
		if flashInferAttn.float_workspace_buffer is None:
			flashInferAttn.float_workspace_buffer = torch.empty(max_batch_token * 1024 * 1024, dtype=torch.uint8, device=device)
		self.qo_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
		self.paged_kv_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
		self.paged_kv_indices_buf = torch.empty((max_pages,), dtype=torch.int32, device=device)
		self.paged_kv_last_page_len_buf = torch.empty((max_batch_size,), dtype=torch.int32, device=device)
		self.batch_size_tensor_buf = torch.empty((1,), dtype=torch.int32, device=device)
		self.num_tokens_tensor_buf = torch.empty((1,), dtype=torch.uint32, device=device)
	
		# TODO: custom mask
		self.custom_mask_buf = None
		self.qk_indptr_buf = None
		self.warpper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
			flashInferAttn.float_workspace_buffer,
			self.kv_layout,
			use_cuda_graph=self.use_cuda_graph,
			qo_indptr_buf=self.qo_indptr_buf,
			paged_kv_indptr_buf=self.paged_kv_indptr_buf,
			paged_kv_indices_buf=self.paged_kv_indices_buf,
			paged_kv_last_page_len_buf=self.paged_kv_last_page_len_buf,
			backend = "fa2",
		)

	def plan(self,
		qo_indptr: torch.Tensor,
		paged_kv_indptr: torch.Tensor,
		paged_kv_indices: torch.Tensor,
		paged_kv_last_page_len: torch.Tensor,
		batch_size_tensor: torch.Tensor,
		num_tokens_tensor: torch.Tensor,
		num_qo_heads: int,
		num_kv_heads: int,
		head_dim: int,
		page_size: int,
		causal: bool = True, 
		pos_encoding_mode: str = "NONE",
		q_data_type: Union[str, torch.dtype] = torch.bfloat16,
		kv_data_type: Optional[Union[str, torch.dtype]] = None):
		
		self.batch_size_tensor_buf.copy_(batch_size_tensor, non_blocking=True)
		self.num_tokens_tensor_buf.copy_(num_tokens_tensor, non_blocking=True)
		self.page_size = page_size
		self.warpper.plan(
			qo_indptr,
			paged_kv_indptr,
			paged_kv_indices,
			paged_kv_last_page_len,
			num_qo_heads,
			num_kv_heads,
			head_dim,
			page_size,
			causal = causal,
			pos_encoding_mode = pos_encoding_mode,
			q_data_type = q_data_type,
			kv_data_type = kv_data_type
			)

	def calc_batch_indices(self, ragged_size = None):
		if self.use_cuda_graph:
			self.batch_indices, self.positions = flashinfer.get_batch_indices_positions(
				self.qo_indptr_buf, flashinfer.get_seq_lens(self.paged_kv_indptr_buf, self.paged_kv_last_page_len_buf, self.page_size), self.batch_size_tensor_buf, self.max_batch_token)
		else:
			self.batch_indices, self.positions = flashinfer.get_batch_indices_positions(
				self.warpper._qo_indptr_buf, flashinfer.get_seq_lens(self.warpper._paged_kv_indptr_buf, self.warpper._paged_kv_last_page_len_buf, self.page_size), self.batch_size_tensor_buf, ragged_size)

	def forward(self, q, k_cache, v_cache, k, v):
		if self.use_cuda_graph:
			flashinfer.page.append_paged_kv_cache(k, v, self.batch_indices, self.positions, (k_cache, v_cache), self.paged_kv_indices_buf, self.paged_kv_indptr_buf, self.paged_kv_last_page_len_buf, self.num_tokens_tensor_buf)
			return self.warpper.run(q, (k_cache, v_cache))
		else:
			flashinfer.page.append_paged_kv_cache(k, v, self.batch_indices, self.positions, (k_cache, v_cache), self.warpper._paged_kv_indices_buf, self.warpper._paged_kv_indptr_buf, self.warpper._paged_kv_last_page_len_buf, self.num_tokens_tensor_buf)
			return self.warpper.run(q, (k_cache, v_cache))


def testCudaGraph():
	
	# use max batch to create buffer
	batch_decode = 8
	prefill_chunk = 48
	past_kv_0 = 4090
	past_kv_1 = 4096
	raged_size = prefill_chunk + batch_decode
	num_key_value_heads = 8
	head_dim = 128
	num_attention_heads = 64
	page_size = 256
	num_pages_per_seq = (past_kv_1 + page_size - 1) // page_size
	total_num_pages = (num_pages_per_seq + 1) * (batch_decode + 1) + prefill_chunk // page_size
	attn = flashInferAttn(raged_size, batch_decode+1, total_num_pages, use_cuda_graph=True)

	batch_size_tensor = torch.tensor([batch_decode + 1], device=global_device, dtype=torch.int32)
	
	k_caches = []	
	v_caches = []
	ks = []
	vs = []
	qs = []
	for layer_idx in range(3):
		k_caches.append(torch.randn(total_num_pages, page_size, num_key_value_heads, head_dim, device=global_device, dtype=torch.bfloat16))
		v_caches.append(torch.randn(total_num_pages, page_size, num_key_value_heads, head_dim, device=global_device, dtype=torch.bfloat16))
		ks.append(torch.randn(raged_size, num_key_value_heads, head_dim, device=global_device, dtype=torch.bfloat16))
		vs.append(torch.randn(raged_size, num_key_value_heads, head_dim, device=global_device, dtype=torch.bfloat16))
		qs.append(torch.randn(raged_size, num_attention_heads, head_dim, device=global_device, dtype=torch.bfloat16))
	
	# warmup and capture small batch
	past_kv_0 = 250
	past_kv_1 = 256
	num_pages_per_seq = (past_kv_1 + page_size - 1) // page_size
	total_num_pages = (num_pages_per_seq + 1) * (batch_decode + 1) + prefill_chunk // page_size
	q_indptr = torch.empty((batch_decode + 2,), dtype=torch.int32, device=global_device)
	q_indptr[0] = 0
	q_indptr[1:] = torch.arange(prefill_chunk, prefill_chunk + batch_decode + 1, device=global_device, dtype=torch.int32)
	kv_indptr = torch.arange(0, batch_decode + 2, device=global_device, dtype=torch.int32) * num_pages_per_seq
	kv_indices = torch.arange(0, total_num_pages, device=global_device, dtype=torch.int32)
	kv_last_page_len = torch.empty((batch_decode + 1,), dtype=torch.int32, device=global_device)
	kv_last_page_len[:1+batch_decode//2] = int((past_kv_0 - 1) % page_size + 1)
	kv_last_page_len[1+batch_decode//2:] = int((past_kv_1 - 1) % page_size + 1)

	print(q_indptr)
	print(kv_indptr)
	print(kv_indices)
	print(kv_last_page_len)
	attn.plan(q_indptr,
			kv_indptr,
			kv_indices,
			kv_last_page_len,
			batch_size_tensor,
			num_attention_heads,
			num_key_value_heads,
			head_dim,
			page_size,
			causal = True,
			pos_encoding_mode="NONE",
			q_data_type=torch.bfloat16)

	attn.calc_batch_indices(raged_size)
	for layer_idx in range(3):
		attn.forward(qs[layer_idx], k_caches[layer_idx], v_caches[layer_idx], ks[layer_idx], vs[layer_idx])
		torch.cuda.synchronize()

	outs = []
	g = torch.cuda.CUDAGraph()
	with torch.cuda.graph(g):
		for layer_idx in range(3):
			outs.append(attn.forward(qs[layer_idx], k_caches[layer_idx], v_caches[layer_idx], ks[layer_idx], vs[layer_idx]))
	g.replay()
	
	kv_last_page_len[:1+batch_decode//2] = int(past_kv_0)
	kv_last_page_len[1+batch_decode//2:] = int(past_kv_1)
	for layer_idx in range(3):
		for i in range(batch_decode + 1):
			
			qi = qs[layer_idx][q_indptr[i] : q_indptr[i + 1]]
			o_ref_i = flash_attn_with_kvcache(
				qi.unsqueeze(0),
				k_caches[layer_idx],
				v_caches[layer_idx],
				causal=True,
				block_table=kv_indices[kv_indptr[i]:kv_indptr[i+1]].unsqueeze(0),
				cache_seqlens=torch.tensor([past_kv_0 if i < 1+batch_decode//2 else past_kv_1], device=global_device, dtype=torch.int32)
			)
			o_i = outs[layer_idx][q_indptr[i] : q_indptr[i + 1]]
			print(layer_idx, i)
			torch.testing.assert_close(o_i.unsqueeze(0), o_ref_i, rtol=5e-3, atol=5e-3)

	# run another batch size use capture cuda graph
	past_kv_0 = 4090
	past_kv_1 = 4096
	prefill_chunk = 24
	batch_decode = 4
	num_pages_per_seq = (past_kv_1 + page_size - 1) // page_size
	total_num_pages = (num_pages_per_seq + 1) * (batch_decode + 1) + prefill_chunk // page_size
	batch_size_tensor = torch.tensor([batch_decode + 1], device=global_device, dtype=torch.int32)
	num_tokens_tensor = torch.tensor([batch_decode + prefill_chunk], device=global_device, dtype=torch.int32)

	q_indptr = torch.empty((batch_decode + 2,), dtype=torch.int32, device=global_device)
	q_indptr[0] = 0
	q_indptr[1:] = torch.arange(prefill_chunk, prefill_chunk + batch_decode + 1, device=global_device, dtype=torch.int32)
	kv_indptr = torch.arange(0, batch_decode + 2, device=global_device, dtype=torch.int32) * num_pages_per_seq
	kv_indices = torch.arange(0, total_num_pages, device=global_device, dtype=torch.int32)
	kv_last_page_len = torch.empty((batch_decode + 1,), dtype=torch.int32, device=global_device)
	kv_last_page_len[:1+batch_decode//2] = int((past_kv_0 - 1) % page_size + 1)
	kv_last_page_len[1+batch_decode//2:] = int((past_kv_1 - 1) % page_size + 1)
	attn.plan(q_indptr,
			kv_indptr,
			kv_indices,
			kv_last_page_len,
			batch_size_tensor,
			num_attention_heads,
			num_key_value_heads,
			head_dim,
			page_size,
			causal = True,
			pos_encoding_mode="NONE",
			q_data_type=torch.bfloat16)
	attn.calc_batch_indices(raged_size)
	g.replay()
	
	kv_last_page_len[:1+batch_decode//2] = int(past_kv_0)
	kv_last_page_len[1+batch_decode//2:] = int(past_kv_1)
	for layer_idx in range(3):
		for i in range(batch_decode + 1):
			
			qi = qs[layer_idx][q_indptr[i] : q_indptr[i + 1]]
			o_ref_i = flash_attn_with_kvcache(
				qi.unsqueeze(0),
				k_caches[layer_idx],
				v_caches[layer_idx],
				causal=True,
				block_table=kv_indices[kv_indptr[i]:kv_indptr[i+1]].unsqueeze(0),
				cache_seqlens=torch.tensor([past_kv_0 if i < 1+batch_decode//2 else past_kv_1], device=global_device, dtype=torch.int32)
			)
			o_i = outs[layer_idx][q_indptr[i] : q_indptr[i + 1]]
			print(layer_idx, i)
			torch.testing.assert_close(o_i.unsqueeze(0), o_ref_i, rtol=5e-3, atol=5e-3)
			

def testAttentionFlashInfer(	
	):
	batch_decode = 32
	prefill_chunk = 64
	past_kv_0 = 510
	past_kv_1 = 512
	raged_size = prefill_chunk + batch_decode
	num_key_value_heads = 8
	head_dim = 128
	num_attention_heads = 64
	cases = 1
	page_size = 32
	num_pages_per_seq = (past_kv_1 + page_size - 1) // page_size
	total_num_pages = (num_pages_per_seq + 1) * (batch_decode + 1) + prefill_chunk // page_size
	workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
	qs = []
	kvs = []
	q_indptrs = []
	kv_indptrs = []
	kv_indicess = []
	kv_last_page_lens = []
	wrappers = []
	for case_id in range(cases):
		kvs.append(torch.randn(total_num_pages, 2, page_size, num_key_value_heads, head_dim, device=global_device, dtype=torch.bfloat16))
		qs.append(torch.randn(raged_size, num_attention_heads, head_dim, device=global_device, dtype=torch.bfloat16))
		q_indptr = torch.empty((batch_decode + 2,), dtype=torch.int32, device=global_device)
		q_indptr[0] = 0
		q_indptr[1:] = torch.arange(prefill_chunk, prefill_chunk + batch_decode + 1, device=global_device, dtype=torch.int32)
		q_indptrs.append(q_indptr)
		kv_indptrs.append(torch.arange(0, batch_decode + 2, device=global_device, dtype=torch.int32) * num_pages_per_seq)
		kv_indicess.append(torch.arange(0, total_num_pages, device=global_device, dtype=torch.int32))
		kv_last_page_len = torch.empty((batch_decode + 1,), dtype=torch.int32, device=global_device)
		kv_last_page_len[:1+batch_decode//2] = int((past_kv_0 - 1) % page_size + 1)
		kv_last_page_len[1+batch_decode//2:] = int((past_kv_1 - 1) % page_size + 1)
		kv_last_page_lens.append(kv_last_page_len)
		wrappers.append(flashinfer.BatchPrefillWithPagedKVCacheWrapper(
			workspace_buffer,
			"NHD",
			use_cuda_graph=True,
			qo_indptr_buf=q_indptrs[case_id],
			paged_kv_indptr_buf=kv_indptrs[case_id],
			paged_kv_indices_buf=kv_indicess[case_id],
			paged_kv_last_page_len_buf=kv_last_page_lens[case_id],
		))
		wrappers[case_id].plan(
			q_indptrs[case_id],
			kv_indptrs[case_id],
			kv_indicess[case_id],
			kv_last_page_lens[case_id],
			num_attention_heads,
			num_key_value_heads,
			head_dim,
			page_size,
			causal = True,
			pos_encoding_mode="ROPE_LLAMA",
			q_data_type=torch.bfloat16
		)
					
	def custom_forward(case_id):
		out = wrappers[case_id].run(qs[case_id], kvs[case_id])
	
	custom_forward(0)

# testCudaGraph()
# pass

================================================
FILE: archive/ktransformers/operators/flashinfer_wrapper.py
================================================
'''
Description  : flashinfer MLA wrapper
Author       : Boxin Zhang
Version      : 0.2.3
'''
import torch
import os

flashinfer_enabled = False

try:
    import flashinfer
    flashinfer_enabled = True
    print("found flashinfer")
    
except ImportError:
    print("flashinfer not found, use triton for linux")

try:
    import torch_npu
    use_torch_npu = torch_npu.npu.is_available()
except:
    use_torch_npu = False

if not use_torch_npu:
    from ktransformers.operators.triton_attention import decode_attention_fwd_grouped

import math

def attention_ref_torch(
    batch_size,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    causal: bool,
    sm_scale: float,
) -> torch.Tensor:
    qo_len = q.shape[0] // batch_size
    kv_len = k.shape[0] // batch_size
    num_qo_heads = q.shape[1]
    head_dim_qk = q.shape[2]
    head_dim_vo = v.shape[2]
    logits = (
        torch.einsum(
            "bmhd,bnhd->bhmn",
            q.view(batch_size, qo_len, num_qo_heads, head_dim_qk).float(),
            k.view(batch_size, kv_len, num_qo_heads, head_dim_qk).float(),
        )
        * sm_scale
    )

    #print("attn weights", logits)

    if causal:
        mask = (
            torch.arange(kv_len - qo_len, kv_len).unsqueeze(1)
            >= torch.arange(0, kv_len).unsqueeze(0)
        ).to(q.device)
    else:
        mask = torch.ones(qo_len, kv_len).to(q.device)

    logits = logits.masked_fill(mask.unsqueeze(0).unsqueeze(0) == 0, float("-inf"))
    lse_ref = torch.logsumexp(logits, -1).transpose(-1, -2)
    p = torch.softmax(logits, dim=-1)
    o_ref = (
        torch.einsum(
            "bhmn,bnhd->bmhd",
            p,
            v.view(batch_size, kv_len, num_qo_heads, head_dim_vo).float(),
        )
        .contiguous()
        .view(batch_size * qo_len, num_qo_heads, head_dim_vo)
        .to(q)
    )

    return o_ref, lse_ref * math.log2(math.e)

class MLAWrapper():
    def __init__(self,
                 max_batch_size,
                 max_pages,
                 use_cuda_graph = True,
                 device = "cuda",
                 ):
        self.float_workspace_buffer = torch.empty(128*1024*1024, dtype=torch.int8, device=device)
        self.max_batch_size = max_batch_size
        self.max_pages = max_pages
        if use_cuda_graph:
            if self.max_batch_size == 1:
                self.qo_indptr_buf = torch.arange(0, max_batch_size+1, dtype=torch.int32, device=device)
                self.kv_indptr_buf = torch.tensor([0, max_pages], dtype=torch.int32, device=device)
                self.kv_indices_buf = torch.arange(0, max_pages, dtype=torch.int32, device=device)
            else:
                self.qo_indptr_buf = torch.empty(max_batch_size+1, dtype=torch.int32, device=device)
                self.kv_indptr_buf = torch.empty(max_batch_size+1, dtype=torch.int32, device=device)
                self.kv_indices_buf = torch.empty(max_pages, dtype=torch.int32, device=device)
            self.batch_size_tensor_buf = torch.tensor([self.max_batch_size], dtype=torch.int32, device=device)
            self.kv_len_arr_buf = torch.empty(max_batch_size, dtype=torch.int32, device=device)
        else:
            self.qo_indptr_buf = None
            self.kv_indptr_buf = None
            self.kv_indices_buf = None
            self.kv_len_arr_buf = None
        self.wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(
            self.float_workspace_buffer,
            use_cuda_graph=use_cuda_graph,
            qo_indptr=self.qo_indptr_buf,
            kv_indptr=self.kv_indptr_buf,
            kv_indices=self.kv_indices_buf,
            kv_len_arr=self.kv_len_arr_buf,
            bsz_tensor=self.batch_size_tensor_buf,
            backend = "fa2",
        )
        self.need_plan = True

    
    def plan(self,
             qo_indptr,
             kv_indptr,
             kv_indices,
             kv_len_arr,
             bsz_tensor,
             num_heads,
             head_dim_ckv,
             head_dim_kpe,
             page_size,
             sm_scale,
             q_data_type,
             kv_data_type,
             ):
        if qo_indptr is None:
            assert self.max_batch_size == 1
            qo_indptr = self.qo_indptr_buf
        if kv_indptr is None:
            assert self.max_batch_size == 1
            kv_indptr = self.kv_indptr_buf
        if kv_indices is None:
            assert self.max_batch_size == 1
            kv_indices = self.kv_indices_buf
        if bsz_tensor is None:
            assert self.max_batch_size == 1
            bsz_tensor = self.batch_size_tensor_buf
        
        self.wrapper.plan(
            qo_indptr,
            kv_indptr,
            kv_indices,
            kv_len_arr,
            num_heads,
            head_dim_ckv,
            head_dim_kpe,
            page_size,
            True, # causal
            sm_scale,
            q_data_type,
            kv_data_type,
            bsz_tensor
        )

    def run(self, q_nope, q_pe, ckv, k_pe, return_lse = False):
        return self.wrapper.run(q_nope, q_pe, ckv, k_pe, return_lse = return_lse)

class MLAWrapperSingleton():
    wrappers:dict = {}

    @classmethod
    def get_instance(cls, device, *args, **kwargs)->MLAWrapper:
        if device not in cls.wrappers:
            cls.make_instance(device, *args, **kwargs)
        return cls.wrappers[device]
    
    @classmethod
    def make_instance(cls, device, *args, **kwargs):
        cls.wrappers[device] = MLAWrapper(*args, **kwargs, device=device)

    @classmethod
    def plan_all(cls, qo_indptr,
             kv_indptr,
             kv_indices,
             kv_len_arr,
             bsz_tensor,
             num_heads,
             head_dim_ckv,
             head_dim_kpe,
             page_size,
             sm_scale,
             q_data_type,
             kv_data_type,):
        for device, wrapper in cls.wrappers.items():
            kv_len_arr_cur_device = kv_len_arr.to(device)
            wrapper.plan(qo_indptr,
                kv_indptr,
                kv_indices,
                kv_len_arr_cur_device,
                bsz_tensor,
                num_heads,
                head_dim_ckv,
                head_dim_kpe,
                page_size,
                sm_scale,
                q_data_type,
                kv_data_type,)
            wrapper.need_plan = False
            
    @classmethod
    def need_plan_all(cls):
        for device, wrapper in cls.wrappers.items():
            wrapper.need_plan = True
        
    @classmethod
    def reset_buffer(cls):
        for device, wrapper in cls.wrappers.items():
            wrapper.qo_indptr_buf[1] = 1 # assert max_batch_size=1 here.
            
    @classmethod
    def update_buffer(cls, max_pages):
        for device, wrapper in cls.wrappers.items():
            wrapper.kv_indptr_buf[1] = max_pages # assert max_batch_size=1 here.
            wrapper.kv_indices_buf = torch.arange(0, max_pages, dtype=torch.int32, device=device)
            wrapper.wrapper._kv_indices_buf = wrapper.kv_indices_buf

def checksame():
    flashinfer_folder = "./flashinfer_output"
    flashinfer_folder = "./kv_cache_flashinfer"
    triton_folder = "./triton_output"
    triton_folder = "./kv_cache_triton"
    
    max_layer_id = 1
    max_forward_id = 2

    for forward_id in range(0, 19):
        print("forward_id", forward_id)
        for layer_id in range(max_layer_id):
            print(layer_id)
            #file_name = f"layer_{layer_id}_forward_{forward_id}_attn_output.pt"
            #file_name = f"layer_{layer_id}_forward_{forward_id}_q_pe.pt"
            file_name = f"layer_{layer_id}.pt"
            
            flashinfer_path = os.path.join(flashinfer_folder, file_name)
            triton_path = os.path.join(triton_folder, file_name)
            
            if not os.path.exists(triton_path):
                print(f"{file_name} not exist in {triton_folder}")
                continue
            if not os.path.exists(flashinfer_path):
                print(f"{file_name} not exist in {flashinfer_folder}")
                continue
            
            
            flashinfer_tensor = torch.load(flashinfer_path)[1:2, :62]#
            triton_tensor = torch.load(triton_path)[1:2, :62]#.squeeze(1)#
            try:
                torch.testing.assert_close(flashinfer_tensor, triton_tensor, rtol=1e-9, atol=1e-9)
            except AssertionError as e:
                print(e)

if __name__ == "__main__":
    
    #checksame()
    #exit(0)

    max_batch_size = 2
    max_batch_tokens = 256
    max_pages = 128
    page_size = 64
    num_heads = 128
    
    # warm-up
    kv_len = 4023
    q_len = 1
    q_nope_buf = torch.randn((max_batch_tokens, num_heads, 512), dtype=torch.bfloat16, device="cuda")
    q_pe_buf = torch.randn((max_batch_tokens, num_heads, 64), dtype=torch.bfloat16, device="cuda")
    kv_buf = torch.randn((max_pages, page_size, 576), dtype=torch.bfloat16, device="cuda")
    ckv, k_pe = torch.split(kv_buf, [512, 64], dim=-1)
    

    wrapper = MLAWrapperSingleton.get_instance(
        "cuda",
        max_batch_size,
        max_pages,
    )
    
    used_pages = (kv_len + page_size - 1)// page_size
    kv_len_arr = torch.tensor([kv_len], dtype=torch.int32, device="cuda")
    qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device="cuda")
    kv_indptr = torch.tensor([0, used_pages], dtype=torch.int32, device="cuda")
    kv_indices = torch.empty(max_pages, dtype=torch.int32, device="cuda")
    kv_indices[:used_pages] = torch.arange(0, used_pages, dtype=torch.int32, device="cuda")
    bsz_tensor = torch.tensor([1], dtype=torch.int32, device="cuda")
    wrapper.plan(
        qo_indptr,
        kv_indptr,
        kv_indices,
        kv_len_arr,
        bsz_tensor,
        128,
        512,
        64,
        page_size,
        192 ** (-0.5),
        torch.bfloat16,
        torch.bfloat16,
    )

    attn_output = wrapper.run(q_nope_buf[:q_len], q_pe_buf[:q_len], ckv, k_pe)
    print(attn_output.shape)
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        attn_output = wrapper.run(q_nope_buf, q_pe_buf, ckv, k_pe)
    graph.replay()

    q = torch.cat([q_nope_buf, q_pe_buf], dim=-1)
    k = (
        torch.cat([ckv, k_pe], dim=-1)
        .view(-1, 1, 512 + 64)
        .repeat_interleave(num_heads, dim=1)
    )
    v = ckv.view(-1, 1, 512).repeat_interleave(num_heads, dim=1)
    attn_ref, lse_ref = attention_ref_torch(
        1,
        q[:q_len],
        k[:kv_len],
        v[:kv_len],
        True,
        192 ** (-0.5)
    )
    torch.testing.assert_close(attn_output[:q_len], attn_ref, rtol=5e-3, atol=5e-3)
    # warm-up finished

    kv_len = 512
    q_len = 128
    pages = max_pages
    used_pages = (kv_len + page_size - 1)// page_size
    q_nope = torch.randn((q_len*2, num_heads, 512), dtype=torch.bfloat16, device="cuda")
    q_nope[q_len:] = q_nope[:q_len]
    q_pe = torch.randn((q_len*2, num_heads, 64), dtype=torch.bfloat16, device="cuda")
    q_pe[q_len:] = q_pe[:q_len]
    kv_cache = torch.randn((max_pages, page_size, 576), dtype=torch.bfloat16, device="cuda")
    kv_cache[used_pages:2*used_pages] = kv_cache[:used_pages]
    ckv, k_pe = torch.split(kv_cache, [512, 64], dim=-1)
    
    kv_len_arr = torch.tensor([kv_len, kv_len], dtype=torch.int32, device="cuda")
    qo_indptr = torch.tensor([0, q_len, q_len*2], dtype=torch.int32, device="cuda")
    kv_indptr = torch.tensor([0, used_pages, used_pages*2], dtype=torch.int32, device="cuda")
    kv_indices = torch.empty(max_pages, dtype=torch.int32, device="cuda")
    kv_indices[:2*used_pages] = torch.arange(0, 2*used_pages, dtype=torch.int32, device="cuda")
    bsz_tensor = torch.tensor([2], dtype=torch.int32, device="cuda")
    wrapper.plan(
        qo_indptr,
        kv_indptr,
        kv_indices,
        kv_len_arr,
        bsz_tensor,
        128,
        512,
        64,
        page_size,
        192 ** (-0.5),
        torch.bfloat16,
        torch.bfloat16,
    )
    
    q_nope_buf.copy_(q_nope)
    q_pe_buf.copy_(q_pe)
    kv_buf[:pages].copy_(kv_cache)

    torch.cuda.synchronize()
    graph.replay()
    torch.cuda.synchronize()

    # ref_torch
    q = torch.cat([q_nope, q_pe], dim=-1)
    k = (
        torch.cat([ckv, k_pe], dim=-1)
        .view(-1, 1, 512 + 64)
        .repeat_interleave(num_heads, dim=1)
    )
    v = ckv.view(-1, 1, 512).repeat_interleave(num_heads, dim=1)
    attn_ref, lse_ref = attention_ref_torch(
        max_batch_size,
        q,
        k[:2*kv_len],
        v[:2*kv_len],
        True,
        192 ** (-0.5)
    )
    
    torch.testing.assert_close(attn_ref[:q_len], attn_ref[q_len:q_len*2], rtol=1e-9, atol=1e-9)
    torch.testing.assert_close(attn_output[:q_len], attn_output[q_len:q_len*2], rtol=1e-9, atol=1e-9)
    torch.testing.assert_close(attn_output[:q_len], attn_ref[:q_len], rtol=5e-3, atol=5e-3)
    torch.testing.assert_close(attn_output[q_len:q_len*2], attn_ref[q_len:q_len*2], rtol=5e-3, atol=5e-3)
    #torch.testing.assert_close(attn_output[:q_len], attn_output[q_len:q_len*2], rtol=1e-9, atol=1e-9)
    #torch.testing.assert_close(attn_output, attn_ref, rtol=5e-3, atol=5e-3)

    exit(0)

    for forward_id in range(0, 1):
        print("forward_id", forward_id)
        for layer_id in range(1):
            print(layer_id)
            flashinfer_folder = "./kv_cache_flashinfer"
            forward_id = 17
            layer_id = 0
            file_name = f"layer_{layer_id}.pt"
            kv_cache_path = os.path.join(flashinfer_folder, file_name)
            flashinfer_folder = "./flashinfer_output"

            q_len = 1
            kv_len = 126
            file_name = f"layer_{layer_id}_forward_{forward_id}_q_nope.pt"
            q_nope = torch.load(os.path.join(flashinfer_folder, file_name)).view(q_len,128,512).to(device="cuda")
            file_name = f"layer_{layer_id}_forward_{forward_id}_q_pe.pt"
            q_pe = torch.load(os.path.join(flashinfer_folder, file_name)).view(q_len,128,64).to(device="cuda")
            q = torch.cat([q_nope, q_pe], dim=-1)
            kv_cache = torch.load(kv_cache_path).to(device="cuda")
            pages, page_size, _, head_dim = kv_cache.shape
            kv_cache = kv_cache.view(pages, page_size, head_dim)
            ckv, k_pe = torch.split(kv_cache, [512, 64], dim=-1)
    
            kv_len_arr = torch.tensor([kv_len], dtype=torch.int32, device="cuda")
            qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device="cuda")
            wrapper.plan(
                None,
                None,
                None,
                kv_len_arr,
                128,
                512,
                64,
                page_size,
                192 ** (-0.5),
                torch.bfloat16,
                torch.bfloat16,
            )
    
            q_nope_buf.copy_(q_nope)
            q_pe_buf.copy_(q_pe)
            kv_buf[:pages].copy_(kv_cache)

            torch.cuda.synchronize()
            graph.replay()
            torch.cuda.synchronize()

            # ref_torch
            k = (
                torch.cat([ckv, k_pe], dim=-1)
                .view(-1, 1, 512 + 64)
                .repeat_interleave(num_heads, dim=1)
            )
            v = ckv.view(-1, 1, 512).repeat_interleave(num_heads, dim=1)
            attn_ref, lse_ref = attention_ref_torch(
                max_batch_size,
                q,
                k[:kv_len],
                v[:kv_len],
                False,
                192 ** (-0.5)
            )
            torch.testing.assert_close(attn_output, attn_ref, rtol=1e-3, atol=1e-3)
    
            # ref_triton
            attn_logits = torch.empty(
                    (
                        max_batch_size,
                        num_heads,
                        4, #num_kv_splits # follow vLLM, fix it TODO
                        512 + 1, 
                    ),
                    dtype=torch.float32,
                    device = "cuda"
                )
            
            triton_ref = torch.zeros_like(q_nope)
            page_table = torch.arange(max_pages, dtype=torch.int32, device="cuda")
            ckv_with_pe = torch.cat([ckv, k_pe], dim=-1).contiguous().view(pages, page_size, 1, 576)
            ckv = ckv.view(pages, page_size, 1, 512)
            decode_attention_fwd_grouped(q, ckv_with_pe, ckv, triton_ref,
                page_table,
                kv_len_arr, attn_logits,
                4, #num_kv_splits # follow vLLM, fix it TODO
                192 ** (-0.5),
                page_size)

            torch.testing.assert_close(attn_output, triton_ref, rtol=1e-3, atol=1e-3)
            
            #file_name = f"./flashinfer_output/layer_{layer_id}_forward_{forward_id}_attn_output.pt"
            #ktrans_output = torch.load(file_name)
            #torch.testing.assert_close(attn_output, ktrans_output.squeeze(1), rtol=1e-3, atol=1e-3)
            print("test past")

================================================
FILE: archive/ktransformers/operators/gate.py
================================================
from typing import Optional
from torch import nn
import torch
import torch.nn.functional as F
import os
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.operators.linear import KTransformersLinear
from ktransformers.util.custom_loader import GGUFLoader, ModelLoader, SafeTensorLoader, translate_name_to_gguf
from transformers.configuration_utils import PretrainedConfig
from abc import ABC, abstractmethod


# class Base(BaseInjectedModule, ABC):
class KMoEGateBase(ABC):
    def __init__(self, 
                 key: str, 
                 gguf_loader: GGUFLoader, 
                 config: PretrainedConfig, 
                 orig_module: nn.Module, 
                 device: str = "cuda", 
                 **kwargs):
        # super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        super().__init__()
        self.key = key
        self.gguf_loader = gguf_loader
        self.config = config
        self.device = device
        self.orig_module = orig_module
    
    @abstractmethod
    def forward(self, input_tensor, expert_ids, weights):
        pass

    @abstractmethod
    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu", warmup: bool = False):
        pass
    
    @abstractmethod
    def unload():
        pass

    def load_weights(self, override_key: str | None = None, device: str = "cpu"):
        res = {}
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        gate = None
        up = None
        down = None
        gate_type = None
        up_type = None
        down_type = None

        for key in keys:
            if self.gguf_loader.safetensor_loader is not None:
                # for npu
                translate_key = translate_name_to_gguf(key)
                translate_key = ".".join(translate_key.split(".")[:2])
                targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
                weight = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".ffn_gate_inp.weight")
                e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(translate_key + ".exp_probs_b.bias")
                weight_type = weight.dtype
                e_score_correction_bias_type = e_score_correction_bias.dtype
                res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
            # key = ".".join(key.split(".")[:-1])
            elif isinstance(self.gguf_loader, SafeTensorLoader):
                res = self.gguf_loader.load_gate(key, device=device)
            elif self.gguf_loader.has_tensor(key+".weight"):
                # targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
                targets = [".weight", ".e_score_correction_bias"]
                tensors = self.load_multi(key, targets, device=device)
                weight = tensors[".weight"]
                e_score_correction_bias = tensors[".e_score_correction_bias"]
                # weight_type = self.gguf_loader.tensor_info[key + ".weight"]["ggml_type"]
                res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias}
            else:
                raise ValueError(f"Experts {key} not found in gguf_loader")

        return res
    
    def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
        tensors = {}
        for k in keys:
            tensors[k] = self.gguf_loader.load_gguf_tensor(key + k, device=device)
        return tensors


class KMoEGate(BaseInjectedModule, KMoEGateBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        KMoEGateBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def forward(self, hidden_states) -> torch.Tensor:
        return self.orig_module.forward(hidden_states)

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if device is None: device = self.device
        if w is None: w = self.load_weights(device=device)
        
        if isinstance(w, dict):
            self.orig_module.weight = nn.Parameter(w["weight"])
            self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
        else:
            raise ValueError("Invalid weight type")
        self.orig_module.weight = nn.Parameter(self.orig_module.weight.to(device))
        self.orig_module.e_score_correction_bias = nn.Parameter(self.orig_module.e_score_correction_bias.to(device))

    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.e_score_correction_bias is not None:
            self.e_score_correction_bias = None


class KMoEGateQwen2Moe(BaseInjectedModule, KMoEGateBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        generate_device: str = "cuda",
        generate_op: str| None = "KLinearMarlin",
        prefill_device: str = "cuda",
        prefill_op: str| None = "KLinearMarlin",
        use_quant: bool = False,
        **kwargs,
    ):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        KMoEGateBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        self.generate_device = generate_device
        self.prefill_device = prefill_device
        self.generate_op = generate_op
        self.prefill_op = prefill_op
        self.is_windows = os.name == 'nt'
        self.use_quant = use_quant
        if not self.is_windows and use_quant:
            self.gate_linear = nn.Linear(self.gating_dim, self.n_routed_experts, device=generate_device)
            self.gate_linear = KTransformersLinear(key + ".ffn_gate_inp", 
                                               gguf_loader, config, self.gate_linear, #orig_module
                                               generate_device, generate_op, prefill_device, prefill_op)
        else:
            self.gate_linear = None

    def forward(self, hidden_states) -> torch.Tensor:
        if self.is_windows:
            return self.orig_module.forward(hidden_states)
        
        bsz, seq_len, h = hidden_states.shape
        ### compute gating score
        hidden_states = hidden_states.view(-1, h)
        if self.use_quant:
            logits = self.gate_linear.forward(logits)
        else:
            logits = F.linear(
                hidden_states.type(torch.float32), self.weight.type(torch.float32), None
            )
            
        return grouped_topk(hidden_states, logits,
                            self.top_k, self.norm_topk_prob,
                            self.n_group, self.topk_group)

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if device is None: device = self.device
        if w is None: w = self.load_weights(device=device)
        
        if isinstance(w, dict):
            self.orig_module.weight = nn.Parameter(w["weight"])
            self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
        else:
            raise ValueError("Invalid weight type")
        self.orig_module.weight = nn.Parameter(self.orig_module.weight.to(device))
        self.orig_module.e_score_correction_bias = nn.Parameter(self.orig_module.e_score_correction_bias.to(device))
        if not self.is_windows and self.use_quant:
            self.gate_linear.load(self.orig_module.weight)

    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.e_score_correction_bias is not None:
            self.e_score_correction_bias = None


class KMoEGateIPEXLLM(KMoEGate):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        generate_device: str = "xpu",
        prefill_device: str = "xpu",
        **kwargs,
    ):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        KMoEGate.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def forward(self, hidden_states) -> torch.Tensor:
        x = hidden_states.view(-1, hidden_states.size(-1))
        logits = torch.nn.functional.linear(
            x.type(torch.float32), self.orig_module.weight.type(torch.float32), None
        )
        scores = logits.sigmoid()

        from ipex_llm.transformers.models.common import moe_group_topk
        topk_idx, topk_weight = moe_group_topk(scores, self.orig_module.e_score_correction_bias,
                                               self.n_group, self.topk_group, self.top_k,
                                               self.norm_topk_prob, self.routed_scaling_factor)
        return topk_idx, topk_weight.to(x.dtype)


================================================
FILE: archive/ktransformers/operators/layernorm.py
================================================
'''
Date: 2024-11-13 15:05:52
LastEditors: Xie Weiyu ervinxie@qq.com
LastEditTime: 2024-11-25 08:59:19
'''
"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

"""Fused operators for normalization layers."""

import logging
from typing import Optional, Tuple, Union
from transformers import PretrainedConfig
import torch
import torch.nn as nn
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3RMSNorm
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeRMSNorm
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeRMSNorm
from ktransformers.models.modeling_qwen3_next import Qwen3NextRMSNorm
from ktransformers.models.modeling_smallthinker import SmallthinkerRMSNorm
from ktransformers.models.modeling_glm4_moe import Glm4MoeRMSNorm
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader
if not torch.xpu.is_available():
    from flashinfer.norm import (
        fused_add_rmsnorm,
        rmsnorm,
    )


logger = logging.getLogger(__name__)


class RMSNorm(DeepseekV3RMSNorm, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.hidden_size,
            orig_module.variance_epsilon)

    def forward(
        self,
        x: torch.Tensor,
        batch_size_tensor: torch.Tensor = None,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        #return self.forward_native(x, residual)
        if batch_size_tensor is None:
            return self.forward_native(x)
        if residual is not None:
            fused_add_rmsnorm(x, residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            #residual = x + residual
            #out = rmsnorm(residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            return x, residual
        # print(x.shape, self.weight.data.shape, self.variance_epsilon, x.dtype, self.weight.data.dtype, x.device, self.weight.device, x.is_contiguous(), self.weight.data.is_contiguous())
        out = rmsnorm(x, self.weight.data, batch_size_tensor,self.variance_epsilon)
        return out

    def forward_native(
        self, hidden_states    
    ):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


class KQwen2MoeRMSNorm(Qwen2MoeRMSNorm, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(config.hidden_size,
            orig_module.variance_epsilon)

    def forward(
        self,
        x: torch.Tensor,
        batch_size_tensor: torch.Tensor = None,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        #return self.forward_native(x, residual)
        if batch_size_tensor is None:
            return self.forward_native(x)
        if residual is not None:
            fused_add_rmsnorm(x, residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            #residual = x + residual
            #out = rmsnorm(residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            return x, residual
        # print(x.shape, self.weight.data.shape, self.variance_epsilon, x.dtype, self.weight.data.dtype, x.device, self.weight.device, x.is_contiguous(), self.weight.data.is_contiguous())
        out = rmsnorm(x, self.weight.data, batch_size_tensor,self.variance_epsilon)
        return out

    def forward_native(
        self, hidden_states    
    ):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


class KQwen3MoeRMSNorm(Qwen3MoeRMSNorm, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.hidden_size,
            orig_module.variance_epsilon)

    def forward(
        self,
        x: torch.Tensor,
        batch_size_tensor: torch.Tensor = None,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        #return self.forward_native(x, residual)
        bsz, hidden_size = x.shape
        x = x.view(-1, self.orig_module.hidden_size)
        if batch_size_tensor is None:
            return self.forward_native(x)
        if residual is not None:
            fused_add_rmsnorm(x, residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            #residual = x + residual
            #out = rmsnorm(residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            return x, residual
        # print(x.shape, self.weight.data.shape, self.variance_epsilon, x.dtype, self.weight.data.dtype, x.device, self.weight.device, x.is_contiguous(), self.weight.data.is_contiguous())
        out = rmsnorm(x, self.weight.data, batch_size_tensor,self.variance_epsilon)
        out = out.view(bsz, hidden_size)
        return out

    def forward_native(
        self, hidden_states    
    ):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)
    
class KQwen3NextRMSNorm(Qwen3NextRMSNorm, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.hidden_size,
            orig_module.variance_epsilon)

    def _norm(self, x):
            return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x, num_tokens_tensors, residual = None):
        if residual is not None:
            x = x + residual
            residual = x
        x = x.view(-1, self.orig_module.hidden_size)
        output = self._norm(x.float())
        # Llama does x.to(float16) * w whilst Qwen3Next is (x * w).to(float16)
        # See https://github.com/huggingface/transformers/pull/29402
        output = output * (1.0 + self.weight.float())
        if residual is None:
            return output.type_as(x)

        return output.type_as(x), residual

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.eps}"


class KSmallthinkerRMSNorm(SmallthinkerRMSNorm, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.hidden_size,
            orig_module.variance_epsilon)

    def forward(
        self,
        x: torch.Tensor,
        batch_size_tensor: torch.Tensor = None,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        #return self.forward_native(x, residual)
        bsz, hidden_size = x.shape
        x = x.view(-1, self.orig_module.hidden_size)
        if batch_size_tensor is None:
            return self.forward_native(x)
        if residual is not None:
            fused_add_rmsnorm(x, residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            #residual = x + residual
            #out = rmsnorm(residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            return x, residual
        # print(x.shape, self.weight.data.shape, self.variance_epsilon, x.dtype, self.weight.data.dtype, x.device, self.weight.device, x.is_contiguous(), self.weight.data.is_contiguous())
        out = rmsnorm(x, self.weight.data, batch_size_tensor,self.variance_epsilon)
        out = out.view(bsz, hidden_size)
        return out

    def forward_native(
        self, hidden_states    
    ):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

class KGlm4MoeRMSNorm(Glm4MoeRMSNorm, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.hidden_size,
            orig_module.variance_epsilon)

    def forward(
        self,
        x: torch.Tensor,
        batch_size_tensor: torch.Tensor = None,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        #return self.forward_native(x, residual)
        bsz, hidden_size = x.shape
        x = x.view(-1, self.orig_module.hidden_size)
        if batch_size_tensor is None:
            return self.forward_native(x)
        if residual is not None:
            fused_add_rmsnorm(x, residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            #residual = x + residual
            #out = rmsnorm(residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            return x, residual
        # print(x.shape, self.weight.data.shape, self.variance_epsilon, x.dtype, self.weight.data.dtype, x.device, self.weight.device, x.is_contiguous(), self.weight.data.is_contiguous())
        out = rmsnorm(x, self.weight.data, batch_size_tensor,self.variance_epsilon)
        out = out.view(bsz, hidden_size)
        return out

    def forward_native(
        self, hidden_states    
    ):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


class DeepseekV3RMSNormTorch(DeepseekV3RMSNorm, BaseInjectedModule):
    def __init__(self,
                key: str,
                gguf_loader : GGUFLoader,
                config: PretrainedConfig,
                orig_module: nn.Module,
                prefill_device: str = "cuda",
                generate_device: str = "cuda",
                **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.hidden_size,
            orig_module.variance_epsilon)

    def forward(
        self, 
        x,
        batch_size_tensor: torch.Tensor = None,
        residual: Optional[torch.Tensor] = None,
    )-> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        if residual is not None:
            x = x + residual
            residual = x
        # range batch_size_tensor for x
        input_dtype = x.dtype
        x = x.to(torch.float32)
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.variance_epsilon)
        if residual is not None:
            return self.weight * x.to(input_dtype), residual
        return self.weight * x.to(input_dtype)


class KDeepseekRMSNormIPEXLLM(DeepseekV3RMSNorm, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "xpu",
                 generate_device: str = "xpu",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.weight.shape[0],
            orig_module.variance_epsilon)
        self.eps = orig_module.variance_epsilon

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        from ipex_llm.transformers.models.common import rms_norm_forward
        if x.dtype not in [torch.float32, torch.float16]:
            output = rms_norm_forward(self, x.float())
        else:
            output = rms_norm_forward(self, x)
        return output.to(x.dtype)

    def load(self):
        BaseInjectedModule.load(self)
        if self.weight.dtype not in [torch.float32, torch.float16]:
            self.weight = self.weight.float()

================================================
FILE: archive/ktransformers/operators/linear.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : Azure-Tang, Boxin Zhang
Date         : 2024-07-25 11:25:24
Version      : 0.1.0
LastEditors  : Azure 
LastEditTime : 2024-08-29 09:11:16
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''


import ctypes
import torch
from torch import Tensor, nn

try:
    import torch_npu

    use_torch_npu = torch_npu.npu.is_available()
except:
    use_torch_npu = False

if not torch.xpu.is_available() and not use_torch_npu:
    import KTransformersOps
    import vLLMMarlin
from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader
from ktransformers.util.utils import InferenceState
if not torch.xpu.is_available():
    from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_utils import (
        MarlinWorkspace,
        marlin_quantize,
        GPTQ_MARLIN_MIN_THREAD_N,
        GPTQ_MARLIN_MIN_THREAD_K,
        GPTQ_MARLIN_MAX_PARALLEL,
        vllm_marlin_quantize
    )
from ktransformers.operators.base_operator import BaseInjectedModule
from transformers.configuration_utils import PretrainedConfig
try:
    from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
except:
    print("no triton")
from abc import ABC, abstractmethod
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Release"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
import cpuinfer_ext
from ktransformers.operators.cpuinfer import CPUInfer
from ktransformers.server.config.config import Config
from typing import Dict, Tuple, Optional, Union
import numpy as np

#class KLinearBase(BaseInjectedModule, ABC):
class KLinearBase(ABC):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cuda",
        **kwargs,
    ):
        # super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        super().__init__()
        self.key = key
        self.gguf_loader = gguf_loader
        self.device = device
        self.config = config

        self.has_bias = False
        self.dtype = torch.get_default_dtype()
        if orig_module is not None:
            self.in_features = orig_module.in_features
            self.out_features = orig_module.out_features
        else:
            shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"]
            if len(shape) == 1:
                print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF")
            self.in_features  = self.gguf_loader.tensor_info[key + ".weight"]["shape"][0]
            self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1]

        self.loaded = False # for lm_head pre-load, TODO: use new way to do lm_head pre-load when layer wise prefill.

    @abstractmethod
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        pass

    def load_weight(self, override_key: str | None = None, device: str | None = None):
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        for key in keys:
            if isinstance(self.gguf_loader, SafeTensorLoader):
                # using safetensor_loader
                tensor = self.gguf_loader.load_tensor(key+'.weight')
                try:
                    bias = self.gguf_loader.load_tensor(key+'.bias')
                except:
                    bias = None
                if self.gguf_loader.has_tensor(key+'.weight_scale_inv'):
                    weight_scale_inv = self.gguf_loader.load_tensor(key+'.weight_scale_inv')
                    return nn.Parameter(tensor), nn.Parameter(weight_scale_inv)
                if bias is not None:
                    return nn.Parameter(tensor), nn.Parameter(bias)
                else:
                    return nn.Parameter(tensor)
                
            elif self.gguf_loader.has_tensor(key + ".weight") or "kv_b_proj" in key:
                if key + ".bias" in self.gguf_loader.tensor_file_map:
                    tensors = self.load_multi(key, ["weight", "bias"], device=device)
                    tensor = tensors["weight"]
                    bias = tensors["bias"]
                    # self.qtype = GGML_TYPE_QTYPE_MAP[tensorinfo[key + ".weight"]["ggml_type"]]
                    # print(torch.isinf(tensor).any(), torch.isinf(bias).any())
                    return nn.Parameter(tensor), nn.Parameter(bias)
                elif "kv_b_proj" in key and not self.gguf_loader.has_tensor(key + ".weight"):
                    attn_k_b_tensors = self.load_multi(key.replace("self_attn.kv_b_proj", "attn_k_b"), ["weight"], device=device)
                    attn_k_b = attn_k_b_tensors["weight"]
                    del attn_k_b_tensors
                    attn_k_b = attn_k_b.transpose(1, 2).contiguous()
                    attn_v_b_tensors = self.load_multi(key.replace("self_attn.kv_b_proj", "attn_v_b"), ["weight"], device=device)
                    attn_v_b = attn_v_b_tensors["weight"]
                    del attn_v_b_tensors
                    kv_b_proj = torch.cat((attn_k_b, attn_v_b), dim=1)
                    kv_b_proj = kv_b_proj.contiguous() if kv_b_proj.ndim == 2 else kv_b_proj.flatten(0, 1).contiguous()
                    del attn_k_b
                    del attn_v_b
                    return nn.Parameter(kv_b_proj)
                else:
                    tensors = self.load_multi(key, ["weight"], device=device)
                    tensor = tensors["weight"]
                    # self.qtype = GGML_TYPE_QTYPE_MAP[tensorinfo[key + ".weight"]["ggml_type"]]
                    return nn.Parameter(tensor)
            else:
                raise FileNotFoundError(f"Weight file not found for key {key}")

    def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
        tensors = {}
        for k in keys:
            tensors[k] = self.gguf_loader.load_gguf_tensor(key + "." + k, device=device)
        return tensors

    @abstractmethod
    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = "cuda"):
        pass

    @abstractmethod
    def unload(self):
        pass


class KLinearTorch(KLinearBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cuda",
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.has_bias = False
        self.dtype = torch.get_default_dtype()
        self.weight = None
        self.has_bias = False

    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor=None, **kwargs) -> torch.Tensor:
        dtype = x.dtype
        out_device = x.device
        # TODO: support CUDA Graph when using cpu, but CPUInfer is recommended.
        x = x.to(device=self.device, dtype=self.dtype)
        x = x @ self.weight
        if self.has_bias:
            x = x + self.bias
        x = x.to(dtype=dtype, device=out_device)
        return x

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if self.loaded: return
        if device is None: device = self.device
        if w is None: w = self.load_weight(device=device)
        # else: self.out_features = w.shape[0], self.in_features = w.shape[1]
        
        if isinstance(w, nn.Parameter):
            try:
                self.weight = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T
            except: 
                self.weight = w.to(dtype=self.dtype).T
            self.has_bias = False
        elif isinstance(w, tuple):
            try:
                self.weight = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T
            except:
                self.weight = w[0].to(dtype=self.dtype).T
            self.bias = w[1].to(dtype=self.dtype)
            self.has_bias = True
        else:
            raise ValueError("Invalid weight type")
        # self.linear = self.linear.to(device)
        self.weight = self.weight.to(device)
        if self.has_bias:
            self.bias = self.bias.to(device)
        self.loaded = True

    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.has_bias:
            self.bias = None

class KLinearQ8(KLinearBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cuda",
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.has_bias = False
        self.compute_dtype = torch.float32
        self.weight = None
        self.weight_scale = None
        self.weight_zero_point = None
        self.bias = None
        self.loaded = False
    
    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor=None) -> torch.Tensor:
        orig_dtype = x.dtype
        out_device = x.device
        
        x = x.to(device=self.device, dtype=self.compute_dtype)
        
        # 使用原始权重做矩阵乘法，模拟原始行为

        # 反量化权重进行矩阵乘法
        weight_dequant = self._dequantize_weight(self.weight, self.weight_scale, bits=8)
        out = x @ weight_dequant.T
        
        if self.has_bias:
            out = out + self.bias
        
        return out.to(dtype=orig_dtype, device=out_device)
    
    def _dequantize_weight(self, q_matrix, scales, bits=8):
        """
        Dequantize a low-precision matrix back to floating-point
        
        Args:
            q_matrix (torch.Tensor): Quantized int matrix
            scales (torch.Tensor): Scale factors for each column
            bits (int): Quantization bits used (8 or 4)
        
        Returns:
            torch.Tensor: Dequantized floating-point matrix
        """
        # Ensure inputs are torch tensors
        if not isinstance(q_matrix, torch.Tensor):
            q_matrix = torch.tensor(q_matrix, dtype=torch.int8)
        if not isinstance(scales, torch.Tensor):
            scales = torch.tensor(scales, dtype=torch.float32)
        
        # Convert to correct dtype if needed
        if q_matrix.dtype != torch.int8:
            q_matrix = q_matrix.to(torch.int8)
        if scales.dtype != torch.float32:
            scales = scales.to(torch.float32)
        
        # For Q4, ensure the values stay within 4-bit range
        if bits == 4:
            q_matrix = torch.clamp(q_matrix, -7, 7)
        rows, cols = q_matrix.shape
        dequant_matrix = q_matrix.to(torch.float32)
        scales_broadcast = scales.view(1, cols)
        # Apply dequantization to all columns at once using matrix multiplication
        dequant_matrix = dequant_matrix * scales_broadcast
        
        return dequant_matrix

    
    def _quantize_weight(self, matrix, bits=8):
        """
        Quantize a floating-point matrix to lower precision (Q8 or Q4)
        
        Args:
            matrix (torch.Tensor): Input matrix in floating-point format
            bits (int): Quantization bits, either 8 or 4
        
        Returns:
            tuple: (quantized int matrix, scale factors for each column)
        """
        if not isinstance(matrix, torch.Tensor):
            matrix = torch.tensor(matrix, dtype=torch.float32)
        
        # Convert to float32 if needed
        if matrix.dtype != torch.float32:
            matrix = matrix.to(torch.float32)
        
        # Get matrix shape
        rows, cols = matrix.shape
        
        # Determine quantization parameters based on bits
        if bits == 8:
            max_int = 127
            qtype = torch.int8
        elif bits == 4:
            max_int = 7
            qtype = torch.int8  # We'll still use int8 storage but limit to 4-bit range, wait for native support
        else:
            raise ValueError("Quantization bits must be either 8 or 4")
       
        scales = torch.zeros(cols, dtype=torch.float32, device=matrix.device)
        
        # Calculate max absolute value for each column
        max_abs_vals, _ = torch.max(torch.abs(matrix), dim=0)
        
        # Handle zero columns (avoid division by zero)
        zero_cols = max_abs_vals == 0
        max_abs_vals[zero_cols] = 1.0
        
        # Calculate scale factors for all columns at once
        scales = max_abs_vals / max_int
        
        # Prepare the scales for broadcasting [1, cols]
        scales_broadcast = scales.view(1, cols)
        
        # Apply quantization to the entire matrix at once
        q_matrix = torch.round(matrix / scales_broadcast).to(qtype)
        
        # For Q4, clamp values to ensure they stay within 4-bit range
        if bits == 4:
            q_matrix = torch.clamp(q_matrix, -max_int, max_int)
        
        return q_matrix, scales
    
    def load(self, w: Union[Dict, nn.Parameter, Tuple, None] = None, device: Optional[str] = None):
        if self.loaded: return
        if device is None: device = self.device 
        if w is None: w = self.load_weight(device=device)
        
        if isinstance(w, nn.Parameter):
            try:
                weight = w.to(dtype=self.compute_dtype).view(self.out_features, self.in_features)
            except:
                weight = w.to(dtype=self.compute_dtype)
            self.has_bias = False
        elif isinstance(w, tuple):
            try:
                weight = w[0].to(dtype=self.compute_dtype).view(self.out_features, self.in_features)
            except:
                weight = w[0].to(dtype=self.compute_dtype)
            self.bias = w[1].to(dtype=self.compute_dtype).to(device)
            self.has_bias = True
        else:
            raise ValueError("Invalid weight type")
        
        self.weight, self.weight_scale = self._quantize_weight(weight, bits=8)
        
        self.weight = self.weight.to(device)
        self.weight_scale = self.weight_scale.to(device)
        
        if self.has_bias:
            self.bias = self.bias.to(device)
            
        self.loaded = True
    
    def unload(self):
        self.weight = None
        self.weight_scale = None
        self.weight_zero_point = None
        self._orig_weight = None
        
        if self.has_bias:
            self.bias = None
            
        self.loaded = False


class KLinearFP8(KLinearBase):
    # this kernel requires special handling for weight
    # Please load the weight file downloaded from KVCache.AI
    has_bias: bool
    weight: torch.Tensor
    bias: torch.Tensor
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cuda",
        block_size: int = 128,
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.has_bias = False
        self.dtype = torch.get_default_dtype()
        self.block_size = block_size
    
    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor) -> torch.Tensor:
        x = x.to(self.device)
        orig_dtype = x.dtype        
        x_quantized, scale_x = act_quant(x, self.block_size)
        y = fp8_gemm(x_quantized, scale_x, self.weight, self.weight_scale_inv)
        return y.to(dtype=orig_dtype)
    
    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if device is None: device = self.device
        if w is None: 
            w = self.load_weight(device=device) 
        ### TODO fit weight_inv format
        if isinstance(w, tuple):
            self.weight = w[0].to(device)
            self.weight_scale_inv = w[1].to(device)
            self.has_bias = False
        else:
            raise ValueError("Invalid weight type")
        self.weight = self.weight.to(device)
        if self.has_bias:
            self.bias = self.bias.to(device)
        
    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.has_bias:
            self.bias = None

# TODO: merge two marlin class

class VLinearMarlin(KLinearBase):
    marlin_q_w: torch.Tensor
    marlin_s: torch.Tensor
    g_idx: torch.Tensor
    sort_indices: torch.Tensor
    has_bias: bool
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cuda",
        num_bits: int = 4,  # 4-bit/8-bit is supported
        group_size: int = 64,  # -1, 32, 64, 128
        act_order: bool = False,
        is_k_full=True,
        **kwargs,
    ):
        assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.num_bits = num_bits
        self.group_size = group_size
        self.act_order = act_order
        self.is_k_full = is_k_full
        self.padding = False
        self.orin_in_features = self.in_features
        self.orin_out_features = self.out_features
        if self.in_features%GPTQ_MARLIN_MIN_THREAD_K!=0 or self.out_features%GPTQ_MARLIN_MIN_THREAD_K!=0:
            #print(f"warning!, in_features={in_features} or out_features={out_features} is undivisible by GPTQ_MARLIN_MIN_THREAD_K={GPTQ_MARLIN_MIN_THREAD_K} and GPTQ_MARLIN_MIN_THREAD_N={GPTQ_MARLIN_MIN_THREAD_N}, padding")
            self.padding = True
            self.in_features = (self.in_features+GPTQ_MARLIN_MIN_THREAD_K-1)//GPTQ_MARLIN_MIN_THREAD_K*GPTQ_MARLIN_MIN_THREAD_K
            self.out_features = (self.out_features+GPTQ_MARLIN_MIN_THREAD_N-1)//GPTQ_MARLIN_MIN_THREAD_N*GPTQ_MARLIN_MIN_THREAD_N
            #print(f"After padding: in_features={in_features}, out_features={out_features}")
        
        self.k = self.in_features
        self.n = self.out_features

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if self.loaded: return
        if device is None: device = self.device
        assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
        
        #if self.in_features * self.out_features:
        if w is None: 
            w = self.load_weight(device=device) 

        if isinstance(w, nn.Parameter):
            # pad weight
            weight = w.view(self.orin_out_features, self.orin_in_features).T
            self.has_bias = False
        elif isinstance(w, tuple):
            w = list(w)
            weight = w[0].view(self.orin_out_features, self.orin_in_features).T
            self.bias = w[1].view(self.orin_out_features)
            self.bias = w[1]
            self.has_bias = True
        else:
            raise ValueError("Invalid weight type")
        weight = weight.to(device)
        if self.has_bias:
            self.bias = self.bias.to(device)
            
        if self.padding:
            padded_weight = torch.zeros(self.in_features, self.out_features, device=self.device)
            padded_weight[:self.orin_in_features, :self.orin_out_features] = weight
            weight = padded_weight

        # Pack Marlin linear
        marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
            weight, self.num_bits, self.group_size, self.act_order
        )
        self.workspace = MarlinWorkspace(
            self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL,self.device
        )
        self.weight = marlin_q_w
        self.marlin_q_w = marlin_q_w
        self.marlin_s = marlin_s
        self.g_idx = g_idx
        self.sort_indices = sort_indices
        self.k = weight.shape[0]
        self.n = weight.shape[1]
        # self.shape_buffer = torch.tensor([60], dtype=torch.int32, device=self.device)
        self.loaded = True


    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None) -> torch.Tensor:
        if bsz_tensor is None:
            bsz_tensor = torch.tensor([x.shape[0]], dtype=torch.int32, device=self.device)


        # Only support input x as BF16 and FP16
        x = x.to(self.device)
        orig_shape = list(x.shape)
        orig_dtype = x.dtype
        x = x.reshape(-1, orig_shape[-1])
        marlin_s = self.marlin_s.to(x.dtype)
        sms = -1

        # padding x.shape[0] to avoid CUDA illegal memory access error
        x, orig_size_m = self._pad_input(x)

        x = vLLMMarlin.gptq_marlin_gemm(
            x,
            self.marlin_q_w,
            marlin_s,
            self.g_idx,
            self.sort_indices,
            self.workspace.scratch,
            self.num_bits,
            bsz_tensor,
            x.shape[0],
            self.n,
            x.shape[-1],
            sms,
            self.is_k_full,
        )

        x = x[:orig_size_m]

        if self.has_bias:
            x = x + self.bias
        orig_shape[-1] = self.n
        return x.reshape(orig_shape).to(orig_dtype)

    def unload(self):

        if self.has_bias:
            self.bias = None
        self.marlin_q_w = None
        self.marlin_s = None
        self.g_idx = None
        self.sort_indices = None
        self.workspace = None  

    def _pad_input(self, x):

        size_m = x.shape[0]
        size_k = x.shape[1]

        # size_m and align value depends on VLinearMarlin implementation
        if size_m > 1024:
            align = 1024
        elif size_m > 64:
            align = 64
        else:
            align = 1

        padded_size_m = ((size_m + align - 1) // align) * align

        if padded_size_m > size_m:
            pad_len = padded_size_m - size_m
            pad_tensor = torch.zeros((pad_len, size_k), dtype=x.dtype, device=x.device)
            x = torch.cat([x, pad_tensor], dim = 0).contiguous()
        return x, size_m

class KLinearMarlin(KLinearBase):
    marlin_q_w: torch.Tensor
    marlin_s: torch.Tensor
    g_idx: torch.Tensor
    sort_indices: torch.Tensor
    has_bias: bool
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cuda",
        num_bits: int = 4,  # 4-bit/8-bit is supported
        group_size: int = 64,  # -1, 32, 64, 128
        act_order: bool = False,
        is_k_full=True,
        **kwargs,
    ):
        assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.num_bits = num_bits
        self.group_size = group_size
        self.act_order = act_order
        self.is_k_full = is_k_full
        self.padding = False
        self.orin_in_features = self.in_features
        self.orin_out_features = self.out_features
        if self.in_features%GPTQ_MARLIN_MIN_THREAD_K!=0 or self.out_features%GPTQ_MARLIN_MIN_THREAD_K!=0:
            #print(f"warning!, in_features={in_features} or out_features={out_features} is undivisible by GPTQ_MARLIN_MIN_THREAD_K={GPTQ_MARLIN_MIN_THREAD_K} and GPTQ_MARLIN_MIN_THREAD_N={GPTQ_MARLIN_MIN_THREAD_N}, padding")
            self.padding = True
            self.in_features = (self.in_features+GPTQ_MARLIN_MIN_THREAD_K-1)//GPTQ_MARLIN_MIN_THREAD_K*GPTQ_MARLIN_MIN_THREAD_K
            self.out_features = (self.out_features+GPTQ_MARLIN_MIN_THREAD_N-1)//GPTQ_MARLIN_MIN_THREAD_N*GPTQ_MARLIN_MIN_THREAD_N
            #print(f"After padding: in_features={in_features}, out_features={out_features}")
        
        self.k = self.in_features
        self.n = self.out_features

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if self.loaded: return
        if device is None: device = self.device
        assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
        
        #if self.in_features * self.out_features:
        if w is None: 
            w = self.load_weight(device=device) 

        if isinstance(w, nn.Parameter):
            # pad weight
            weight = w.view(self.orin_out_features, self.orin_in_features).T
            self.has_bias = False
        elif isinstance(w, tuple):
            w = list(w)
            weight = w[0].view(self.orin_out_features, self.orin_in_features).T
            self.bias = w[1].view(self.orin_out_features)
            self.bias = w[1]
            self.has_bias = True
        else:
            raise ValueError("Invalid weight type")
        weight = weight.to(device)
        if self.has_bias:
            self.bias = self.bias.to(device)
            
        if self.padding:
            padded_weight = torch.zeros(self.in_features, self.out_features, device=self.device)
            padded_weight[:self.orin_in_features, :self.orin_out_features] = weight
            weight = padded_weight

        # Pack Marlin linear
        marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
            weight, self.num_bits, self.group_size, self.act_order
        )
        self.workspace = MarlinWorkspace(
            self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL,self.device
        )
        self.weight = marlin_q_w # modeling_xxx.py may use linear.weight
        self.marlin_q_w = marlin_q_w
        self.marlin_s = marlin_s
        self.g_idx = g_idx
        self.sort_indices = sort_indices
        self.k = weight.shape[0]
        self.n = weight.shape[1]
        self.loaded = True

    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor=None, **kwargs) -> torch.Tensor:
        # Only support input x as BF16 and FP16
        x = x.to(self.device)
        orig_shape = list(x.shape)
        orig_dtype = x.dtype
        x = x.reshape(-1, orig_shape[-1])
        x = x.reshape(-1, x.shape[-1])
        if self.padding:
            padding_input=torch.empty(x.shape[0], self.in_features, device=x.device, dtype=x.dtype)
            padding_input[:,:self.orin_in_features] = x
            x = padding_input
        marlin_s = self.marlin_s.to(x.dtype)
        x = KTransformersOps.gptq_marlin_gemm(
            x,
            self.marlin_q_w,
            marlin_s,
            self.g_idx,
            self.sort_indices,
            self.workspace.scratch,
            self.num_bits,
            x.shape[0],
            self.n,
            x.shape[-1],
            self.is_k_full,
        )
        if self.padding:
            x = x[:,:self.orin_out_features]
            orig_shape[-1] = self.orin_out_features
        else:
            orig_shape[-1] = self.out_features
        if self.has_bias:
            x = x + self.bias
        return x.reshape(orig_shape).to(orig_dtype)

    def unload(self):

        if self.has_bias:
            self.bias = None
        self.marlin_q_w = None
        self.marlin_s = None
        self.g_idx = None
        self.sort_indices = None
        self.workspace = None

class KLinearCPUInfer(KLinearBase):
    CPU_INFER = None
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cpu",
        out_device: str = "cuda", # this device mean which device the output should on. TODO: support cpu.
        stride = 16,
        group_max_len = 1024,
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        if KLinearCPUInfer.CPU_INFER is None:
            KLinearCPUInfer.CPU_INFER = CPUInfer(Config().cpu_infer)
        self.has_bias = False
        self.dtype = torch.get_default_dtype()
        self.w = None
        self.has_bias = False
        self.stride = stride
        self.group_max_len = group_max_len
        self.out_device = out_device

    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None) -> torch.Tensor:
        origin_shape = x.shape # [batch_size, q_len, hidden_size]
        if origin_shape[1] == 1 and torch.cuda.is_current_stream_capturing():
            out_device = x.device
            self.input_tensor_cpu.copy_(x, non_blocking=True)
            qlen = origin_shape[1]
            KLinearCPUInfer.CPU_INFER.submit_with_cuda_stream(
                torch.cuda.current_stream().cuda_stream,
                self.linear.forward(
                    qlen, 
                    self.input_tensor_cpu.data_ptr(), 
                    self.output_cpu.data_ptr()
                )
            )
            KLinearCPUInfer.CPU_INFER.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
            self.output_gpu.copy_(self.output_cpu, non_blocking=True)
            if self.has_bias:
                self.output_gpu += self.bias
            return self.output_gpu
        else:
            dtype = x.dtype
            out_device = x.device
            x = x.to(device=self.device)
            qlen = origin_shape[1]
            output_shape = (*origin_shape[:-1], self.out_features)
            output = torch.empty(output_shape, device=x.device, dtype=x.dtype)
            KLinearCPUInfer.CPU_INFER.submit(
                self.linear.forward(
                    qlen, 
                    x.data_ptr(), 
                    output.data_ptr()
                )
            )
            KLinearCPUInfer.CPU_INFER.sync()
            if self.has_bias:
                output = output + self.bias
            output = output.to(dtype=dtype, device=out_device)
            return output

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None, warmup:bool = True):
        print(f"loading {self.key} to {self.device} using CPUInfer")
        if device is None: device = self.device
        self.load_weights(w=w, device=device)
        if self.bias is not None:
            self.has_bias = True
            self.bias = self.bias.to(device)
            
        weight_ptr = ctypes.addressof(
            ctypes.cast(self.weight.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
        )
        config = cpuinfer_ext.linear.LinearConfig(self.in_features, self.out_features, self.stride, self.group_max_len, weight_ptr, self.weight_type, 30)
        self.linear = cpuinfer_ext.linear.Linear(config)
        
        if warmup:
            KLinearCPUInfer.CPU_INFER.submit(self.linear.warm_up())
            KLinearCPUInfer.CPU_INFER.sync()
        self.input_tensor_cpu = torch.zeros((1, 1, self.in_features), device="cpu", pin_memory=True)
        self.output_cpu = torch.zeros((1, 1, self.out_features), device="cpu", pin_memory=True, dtype=torch.bfloat16)
        self.output_gpu = torch.zeros((1, 1, self.out_features), device=self.out_device)

    def load_weights(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu"):
        if self.gguf_loader.has_tensor(self.key + ".weight"):
            if self.key + ".bias" in self.gguf_loader.tensor_file_map:
                self.weight = self.gguf_loader.get_mmap_tensor(self.key + ".weight")
                self.weight_type = self.gguf_loader.tensor_info[self.key + ".weight"]["ggml_type"]
                self.bias = self.gguf_loader.load_gguf_tensor(self.key + ".bias", device=device)
            else:
                self.weight = self.gguf_loader.get_mmap_tensor(self.key + ".weight")
                self.weight_type = self.gguf_loader.tensor_info[self.key + ".weight"]["ggml_type"]
                self.bias = None
        else:
            raise ValueError(f"Linear {self.key} not found in gguf_loader")

    def unload(self):
        if self.w is not None:
            self.w = None
        if self.has_bias:
            self.bias = None       

class KLinearIPEXLLM(KLinearBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "xpu",
        precision: str = "sym_int4",
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.has_bias = False
        self.dtype = torch.get_default_dtype()
        self.weight = None
        self.has_bias = False
        self.precision = precision
        self.qtype = None

    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None) -> torch.Tensor:
        dtype = x.dtype
        out_device = x.device
        from ipex_llm.transformers.models.common import linear_forward
        x = linear_forward(x.half(), self.weight, self.qtype, self.out_features)

        if self.has_bias:
            x = x + self.bias
        x = x.to(dtype=dtype, device=out_device)
        return x

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if self.loaded: return
        if device is None: device = self.device
        assert device.lower()[:3] == "xpu", "IPEX-LLM quantized linear only supports XPU device"
        if w is None: w = self.load_weight(device=device)

        if isinstance(w, nn.Parameter):
            try:
                weight = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T
            except:
                weight = w.to(dtype=self.dtype).T
            self.has_bias = False
        elif isinstance(w, tuple):
            try:
                weight = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T
            except:
                weight = w[0].to(dtype=self.dtype).T
            self.bias = w[1].to(dtype=self.dtype)
            self.has_bias = True
        else:
            raise ValueError("Invalid weight type")
        weight = weight.to("cpu").float().transpose(0, 1).contiguous()

        if self.has_bias:
            self.bias = self.bias.to(device)

        # quantize linear weight
        from ipex_llm.transformers.models.common import quantize_linear
        paramsLowBit, qtype = quantize_linear(weight, self.in_features, self.precision)
        self.weight = paramsLowBit.to(device)
        self.qtype = qtype
        self.loaded = True

    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.has_bias:
            self.bias = None

LINEAR_MAP = {
    "KLinearMarlin": KLinearMarlin,
    "KLinearTorch": KLinearTorch,
    "KLinearCPUInfer": KLinearCPUInfer,
    "VLinearMarlin": VLinearMarlin,
    "KLinearFP8": KLinearFP8,
    "KLinearQ8": KLinearQ8,
    "KLinearIPEXLLM": KLinearIPEXLLM,
}

class KTransformersLinear(BaseInjectedModule, KLinearBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        generate_device: str = "cuda",
        generate_op: str| None = "KLinearMarlin",
        prefill_device: str = "cuda",
        prefill_op: str| None = "KLinearTorch",
        **kwargs,
    ):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        KLinearBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        # build all the linear operators
        if prefill_op is not None:
            assert prefill_op in LINEAR_MAP, f"linear_type {prefill_op} not supported"
            self.prefill_linear = LINEAR_MAP[prefill_op](key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        else:
            self.prefill_linear = None

        if generate_op is not None:
            assert generate_op in LINEAR_MAP, f"linear_type {generate_op} not supported"
            self.generate_linear = LINEAR_MAP[generate_op](key, gguf_loader, config, orig_module, generate_device, **kwargs)
        else:
            self.generate_linear = None
        self.mode = InferenceState.UNLOAD

    def forward(self, x, bsz_tensor=None):
        if self.mode == InferenceState.PREFILL:
            assert self.prefill_linear is not None, "cpu linear is not initialized"
            y = self.prefill_linear.forward(x, bsz_tensor)
        else:
            assert self.generate_linear is not None, "gpu linear is not initialized"
            y = self.generate_linear.forward(x, bsz_tensor)
        return y

    def load(self, w: dict | nn.Parameter | tuple | None = None, mode: InferenceState = InferenceState.GENERATE):
        if not mode:
            mode = InferenceState.GENERATE
        # load to device
        if mode == InferenceState.PREFILL:
            self.generate_linear.unload()
            self.prefill_linear.load(w=w)
            self.device = self.prefill_linear.device
            self.weight = self.prefill_linear.weight # modeling_xxx.py may use linear.weight
        elif mode == InferenceState.GENERATE:
            self.prefill_linear.unload()
            self.generate_linear.load(w=w)
            self.device = self.generate_linear.device
            self.weight = self.generate_linear.weight # modeling_xxx.py may use linear.weight
        elif mode == InferenceState.UNLOAD:
            self.prefill_linear.unload()
            self.generate_linear.unload()
            self.device = "cpu"
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")
        self.mode = mode

    def unload(self):
        if self.prefill_linear is not None:
            self.prefill_linear.unload()
        if self.generate_linear is not None:
            self.generate_linear.unload()
        self.device = self.generate_linear.device

    def set_inference_mode(self, mode: InferenceState):
        if not mode: 
            mode = InferenceState.GENERATE
        if mode == InferenceState.GENERATE:
            self.load(mode=InferenceState.GENERATE)
        elif mode == InferenceState.PREFILL:
            self.load(mode=InferenceState.PREFILL)
        elif mode == InferenceState.UNLOAD:
            self.unload()
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")


================================================
FILE: archive/ktransformers/operators/mlp.py
================================================

from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader
from transformers import PretrainedConfig
import torch.nn as nn
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MLP
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeMLP
from ktransformers.models.modeling_smallthinker import SmallthinkerDenseMlpBlock
from ktransformers.models.modeling_glm4_moe import Glm4MoeMLP
class kDeepseekV3MLP(DeepseekV3MLP, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.hidden_size, orig_module.intermediate_size)
    def forward(self, x, bsz_tensor):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x, bsz_tensor)) * self.up_proj(x, bsz_tensor), bsz_tensor)
        return down_proj
class KQwen2MoeMLP(Qwen2MoeMLP, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.intermediate_size)
    def forward(self, x, bsz_tensor):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x, bsz_tensor)) * self.up_proj(x, bsz_tensor), bsz_tensor)
        return down_proj


class KSmallthinkerDenseMlpBlock(SmallthinkerDenseMlpBlock, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config)
    def forward(self, x, bsz_tensor):
        down_proj = self.down(nn.functional.relu(self.gate(x, bsz_tensor)) * self.up(x, bsz_tensor), bsz_tensor)
        return down_proj

class KGlm4MoeMLP(Glm4MoeMLP, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config, orig_module.hidden_size, orig_module.intermediate_size)
    def forward(self, x, bsz_tensor):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x, bsz_tensor)) * self.up_proj(x, bsz_tensor), bsz_tensor)
        return down_proj

================================================
FILE: archive/ktransformers/operators/models.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :  
Author       : Azure-Tang
Date         : 2024-07-25 11:25:24
Version      : 1.0.0
LastEditors  : Azure 
LastEditTime : 2024-08-27 07:29:04
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""

import inspect
import math
from typing import List, Optional, Tuple, Union
import time
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ktransformers.operators.dynamic_attention import DynamicScaledDotProductAttention
from ktransformers.server.config.config import Config
import os
import yaml
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
)
from transformers.modeling_outputs import (
    MoeCausalLMOutputWithPast,
    MoeModelOutputWithPast,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from ktransformers.models.modeling_qwen2_moe import (
    Qwen2MoeSparseMoeBlock,
    Qwen2MoeMLP,
    Qwen2MoeDecoderLayer,
)
from ktransformers.models.modeling_deepseek import (
    BaseModelOutputWithPast,
    DeepseekV2DecoderLayer,
    DeepseekV2MoE,
)
from ktransformers.util.vendors import device_manager, get_device, to_device, GPUVendor
from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.utils import InferenceState, get_compute_capability
from ktransformers.util.custom_loader import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
from ktransformers.models.modeling_llama import (
    LlamaDecoderLayer,
    LlamaRMSNorm,
    LlamaRotaryEmbedding,
)

if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

    _flash_supports_window_size = "window_size" in list(
        inspect.signature(flash_attn_func).parameters
    )

try:
    import torch_npu
    use_torch_npu = torch_npu.npu.is_available()
except:
    use_torch_npu = False

logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Qwen/Qwen1.5-MoE-A2.7B"
_CONFIG_FOR_DOC = "Qwen2MoeConfig"

QWEN2MOE_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`Qwen2MoeConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

QWEN2MOE_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        output_router_logits (`bool`, *optional*):
            Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
            should not be returned during inference.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
    QWEN2MOE_START_DOCSTRING,
)
class KQwen2MoeModel(BaseInjectedModule):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`]

    Args:
        config: Qwen2MoeConfig
    """

    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        device: str = "cuda",
        per_layer_prefill_intput_threshold: int = 30000,  # if None, no per-layer prefill
        transfer_map: dict = None,
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, device, **kwargs
        )
        self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
        self.transfer_map = transfer_map
        self.stream_device_map = dict()

    @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        per_layer_prefill_intput_threshold: (
            int | None
        ) = None,  # if None or 0, close per-layer prefill
    ) -> Union[Tuple, MoeModelOutputWithPast]:
        # print(f'Total length of input_ids: {input_ids.size(1)}, {input_ids.size()}')

        if per_layer_prefill_intput_threshold is None:
            per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
        per_layer_prefill_flag = False
        seq_lenth = (
            inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
        )
        if (
            per_layer_prefill_intput_threshold
            and per_layer_prefill_intput_threshold < seq_lenth
        ):
            per_layer_prefill_flag = True
            for layer in self.layers:
                self.load_layer_to(layer, InferenceState.UNLOAD)
        else:
            pass
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_router_logits = (
            output_router_logits
            if output_router_logits is not None
            else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        use_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            use_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
            )

        if inputs_embeds is None:
            input_ids = input_ids.to("cpu")
            inputs_embeds = self.embed_tokens(input_ids)
            inputs_embeds = inputs_embeds.to("cuda")

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask,
            inputs_embeds,
            cache_position,
            past_key_values,
            output_attentions,
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        if torch.xpu.is_available() and inputs_embeds.device.type == "xpu":
            position_embeddings = self.rotary_emb(hidden_states, position_ids)
        else:
            position_embeddings = None

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None
        next_decoder_cache = None

        for i, decoder_layer in enumerate(self.layers):
            if self.transfer_map is not None and i in self.transfer_map:
                prev_stream = torch.cuda.current_stream()
                cur_device = self.transfer_map[i]
                if cur_device not in self.stream_device_map:
                    self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
                torch.cuda.set_device(cur_device)
                self.stream_device_map[cur_device].wait_stream(prev_stream)
                torch.cuda.set_stream(self.stream_device_map[cur_device])
                hidden_states = hidden_states.to(
                    self.transfer_map[i], non_blocking=True
                )
                causal_mask = (
                    causal_mask.to(self.transfer_map[i], non_blocking=True)
                    if causal_mask is not None
                    else None
                )
                position_ids = (
                    position_ids.to(self.transfer_map[i], non_blocking=True)
                    if position_ids is not None
                    else None
                )
                cache_position = (
                    cache_position.to(self.transfer_map[i], non_blocking=True)
                    if cache_position is not None
                    else None
                )

            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    output_router_logits,
                    use_cache,
                    cache_position,
                )
            else:
                if per_layer_prefill_flag:
                    # print(f"to gpu")
                    self.load_layer_to(decoder_layer, InferenceState.PREFILL)
                    torch.cuda.empty_cache()
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    output_router_logits=output_router_logits,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                )
                if per_layer_prefill_flag:
                    # print(f"to cpu")
                    self.load_layer_to(decoder_layer, InferenceState.UNLOAD)
                    torch.cuda.empty_cache()
            hidden_states = layer_outputs[0]

            if use_cache and len(layer_outputs) > 1:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
            else:
                next_decoder_cache = None

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits and layer_outputs[-1] is not None:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        if per_layer_prefill_flag:
            per_layer_prefill_flag = False
            for layer in self.layers:
                self.load_layer_to(layer, InferenceState.GENERATE)
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            if next_decoder_cache is not None:
                next_cache = (
                    next_decoder_cache.to_legacy_cache()
                    if use_legacy_cache
                    else next_decoder_cache
                )
            else:
                next_cache = past_key_values

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_cache,
                    all_hidden_states,
                    all_self_attns,
                    all_router_logits,
                ]
                if v is not None
            )
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )

    def load_layer_to(self, layer: Qwen2MoeDecoderLayer, target: InferenceState):
        assert isinstance(
            layer, Qwen2MoeDecoderLayer
        ), "module should be nn.ModuleList of decoder layers"

        # TODO Support restore to original device, not only cuda
        device = "cpu" if target == InferenceState.UNLOAD else "cuda"

        # attn
        layer.self_attn.q_proj.set_inference_mode(target)
        layer.self_attn.k_proj.set_inference_mode(target)
        layer.self_attn.v_proj.set_inference_mode(target)
        layer.self_attn.o_proj.set_inference_mode(target)
        layer.self_attn.rotary_emb = layer.self_attn.rotary_emb.to(device)

        # mlp
        if isinstance(layer.mlp, Qwen2MoeSparseMoeBlock):
            layer.mlp.gate.set_inference_mode(target)
            layer.mlp.experts.set_inference_mode(target)
            layer.mlp.shared_expert.gate_proj.set_inference_mode(target)
            layer.mlp.shared_expert.up_proj.set_inference_mode(target)
            layer.mlp.shared_expert.down_proj.set_inference_mode(target)
            layer.mlp.shared_expert.act_fn.to(device)
            layer.mlp.shared_expert_gate.to(device)
        else:
            layer.mlp.gate_proj.set_inference_mode(target)
            layer.mlp.up_proj.set_inference_mode(target)
            layer.mlp.down_proj.set_inference_mode(target)
            layer.mlp.act_fn.to(device)
        # layer norm
        layer.input_layernorm.to(device)
        layer.post_attention_layernorm.to(device)


DeepseekV2_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class KDeepseekV2Model(BaseInjectedModule):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]

    Args:
        config: DeepseekV2Config
    """

    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        device: str = "cuda",
        per_layer_prefill_intput_threshold: int = 30000,  # if None, no per-layer prefill
        transfer_map: dict = None,
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, device, **kwargs
        )
        self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
        self.transfer_map = transfer_map
        self.stream_device_map = dict()

    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        per_layer_prefill_intput_threshold: (
            int | None
        ) = None,  # if None, no per-layer prefill
        is_prefill: Optional[bool] = False,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        if per_layer_prefill_intput_threshold is None:
            per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
        per_layer_prefill_flag = False
        seq_lenth = (
            inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
        )
        if (
            per_layer_prefill_intput_threshold
            and per_layer_prefill_intput_threshold < seq_lenth
        ):
            per_layer_prefill_flag = True
            for layer in self.layers:
                self.load_layer_to(layer, InferenceState.UNLOAD)
            torch.cuda.empty_cache()
        else:
            pass
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`transformers."
                )
                use_cache = False

        past_key_values_length = 0
        if use_cache:
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)
        
        if inputs_embeds is None:
            org_device = input_ids.device
            # TODO move to embed_tokens's device, not hard code to cpu
            input_ids = input_ids.to("cpu")
            inputs_embeds = self.embed_tokens(input_ids).to(org_device)
            input_ids = input_ids.to(org_device)

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        if inputs_embeds.device.type == "xpu" and position_ids is not None:
            cos, sin = self.layers[0].self_attn.rotary_emb(inputs_embeds,
                                                           position_ids)
            position_embeddings = (cos, sin)
        else:
            position_embeddings = None

        if per_layer_prefill_flag:
            causal_mask = None
        elif use_torch_npu and not is_prefill:
            causal_mask = None
        else:
            if (use_torch_npu
                or os.name == 'nt'
                or get_compute_capability() < 8
                or (self.transfer_map is not None and 'cpu' in self.transfer_map.values())
                or device_manager.gpu_vendor != GPUVendor.NVIDIA):
                # print("for Windows or GPU before ampere, use forward_windows")
                # only use mask in forward windows or can't flash attn
                causal_mask = self._update_causal_mask(
                    attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
                )
            else:
                causal_mask = None

        # embed positions
        hidden_states = inputs_embeds
        if per_layer_prefill_flag:
            print(f"Total length of input_ids: {hidden_states.size(1)}")

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        t_gpu = 0
        t_cpu = 0
        t_f = 0

        for i, decoder_layer in enumerate(self.layers):
            # print(f"@@@@@@@@@@@@@@@@@layer {i}@@@@@@@@@@@@@@@@@@@@ \n")
            if self.transfer_map is not None and i in self.transfer_map:
                prev_stream = torch.cuda.current_stream()
                cur_device = self.transfer_map[i]
                if cur_device not in self.stream_device_map and cur_device.lower() != "cpu":
                    self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
                if cur_device.lower() != "cpu":
                    torch.cuda.set_device(cur_device)
                    self.stream_device_map[cur_device].wait_stream(prev_stream)
                    torch.cuda.set_stream(self.stream_device_map[cur_device])
                hidden_states = hidden_states.to(
                    self.transfer_map[i], non_blocking=True
                )
                causal_mask = (
                    causal_mask.to(self.transfer_map[i], non_blocking=True)
                    if causal_mask is not None
                    else None
                )
                position_ids = (
                    position_ids.to(self.transfer_map[i], non_blocking=True)
                    if position_ids is not None
                    else None
                )
                cache_position = (
                    cache_position.to(self.transfer_map[i], non_blocking=True)
                    if cache_position is not None
                    else None
                )

            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                t3 = time.time()
                if per_layer_prefill_flag:
                    # print(f"to gpu")
                    self.load_layer_to(decoder_layer, InferenceState.PREFILL)
                    torch.cuda.empty_cache()
                t4 = time.time()
                # with open("log.txt", "a") as f:
                #     f.write(f"@@@@@@@@@@@@@@@@@layer {i}@@@@@@@@@@@@@@@@@@@@ \n")
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                    is_prefill = is_prefill,
                )
                t5 = time.time()
                if per_layer_prefill_flag:
                    # print(f"to cpu")
                    self.load_layer_to(decoder_layer, InferenceState.UNLOAD)
                    torch.cuda.empty_cache()
                t6 = time.time()
            t_gpu += t4 - t3
            t_cpu += t6 - t5
            t_f += t5 - t4

            hidden_states = layer_outputs[0]

            # @@@@@@@ TODO open this notes, tmp close to fit deepseekv3
            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if use_torch_npu:
            hidden_states_without_norm = hidden_states.clone()
        hidden_states = self.norm(hidden_states)
        # with open("log.txt", "a") as f:
        #     f.write(f"@@@After layers\n")
        #     f.write(f"hidden_states={hidden_states}\n")
        #     f.write(f"hidden_states.shape={hidden_states.shape}\n")

        if per_layer_prefill_flag:
            t6 = time.time()
            # print(f"restore")
            per_layer_prefill_flag = False
            for layer in self.layers:
                self.load_layer_to(layer, InferenceState.GENERATE)
            torch.cuda.empty_cache()
            t7 = time.time()

            print(
                f"total time: {t7-t3}, \n layer num{len(self.layers)}, gpu time: {t_gpu}, cpu time: {t_cpu}, forward time: {t_f}, restore time: {t7-t6}"
            )

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = (
                next_decoder_cache.to_legacy_cache()
                if use_legacy_cache
                else next_decoder_cache
            )
        if not return_dict:
            if use_torch_npu:
                return tuple(
                    v
                    for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, hidden_states_without_norm]
                    if v is not None
                )
            else:
                return tuple(
                    v
                    for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                    if v is not None
                )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def load_layer_to(self, layer: DeepseekV2DecoderLayer, target: InferenceState):
        assert isinstance(
            layer, DeepseekV2DecoderLayer
        ), "module should be nn.ModuleList of decoder layers"

        # TODO Support restore to original device, not only cuda
        device = "cpu" if target == InferenceState.UNLOAD else "cuda"

        # TODO Support DFS to auto use {to, set_inference_mode} according to the module type

        # attn
        layer.self_attn.to(device)  #

        # mlp
        if isinstance(layer.mlp, DeepseekV2MoE):
            layer.mlp.gate.to(device)
            layer.mlp.experts.set_inference_mode(target)
            layer.mlp.shared_experts.gate_proj.set_inference_mode(target)
            layer.mlp.shared_experts.up_proj.set_inference_mode(target)
            layer.mlp.shared_experts.down_proj.set_inference_mode(target)
            layer.mlp.shared_experts.act_fn.to(device)
            # layer.mlp.shared_expert_gate.to(device)
        else:
            layer.mlp.gate_proj.set_inference_mode(target)
            layer.mlp.up_proj.set_inference_mode(target)
            layer.mlp.down_proj.set_inference_mode(target)
            layer.mlp.act_fn.to(device)
        # layer norm
        layer.input_layernorm.to(device)
        layer.post_attention_layernorm.to(device)


LLAMA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`LlamaConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

LLAMA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaPreTrainedModel(PreTrainedModel):
    config_class = LlamaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlamaDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class KLlamaModel(BaseInjectedModule):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]

    Args:
        config: LlamaConfig
    """

    dynamic_sdpa = None

    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        device: str = "cuda",
        per_layer_prefill_intput_threshold: int = 30000,  # if None, no per-layer prefill
        transfer_map: dict = None,
        **kwargs,
    ):

        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, device, **kwargs
        )
        self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
        self.transfer_map = transfer_map
        self.stream_device_map = dict()
        user_path: str = os.path.expanduser('~')
        localstore_path: str = os.path.join(user_path,'.ktransformers')
        config_path: str = os.path.join(localstore_path,Config.CONFIG_FILE_NAME)
        with open(config_path,"r") as file:
            config_yaml = yaml.safe_load(file.read())
            self.long_context_config = config_yaml.get("long_context")
            self.ext_config = config_yaml.get("ext")

        KLlamaModel.dynamic_sdpa = DynamicScaledDotProductAttention(
            max_seq_len=self.long_context_config["max_seq_len"],
            block_size=self.long_context_config["block_size"],
            config=config,
            device=torch.device("cuda"),
            local_windows_len=self.long_context_config["local_windows_len"],
            topk=self.long_context_config["second_select_num"],
            threads_num=self.ext_config["cpu_infer"],
            anchor_type=self.long_context_config["anchor_type"],
            kv_type=self.long_context_config["kv_type"],
            dense_layer_num=self.long_context_config["dense_layer_num"],
            anchor_num=self.long_context_config["anchor_num"],
            preselect_block=self.long_context_config["preselect_block"],
            block_selection_mode=self.long_context_config["head_select_mode"],
            preselect_block_count=self.long_context_config["preselect_block_count"],
            layer_step=self.long_context_config["layer_step"],
            token_step=self.long_context_config["token_step"],
            prefill_chunk_size=self.long_context_config["chunk_size"],
            use_attn_sparsity=False,
        )

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        return_legacy_cache = False
        if (
            use_cache and not isinstance(past_key_values, Cache) and not self.training
        ):  # kept for BC (non `Cache` `past_key_values` inputs)
            return_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
            )

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device="cuda",
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = None
        chunck_size = self.long_context_config["chunk_size"]
        cur_idx = 0
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids.to("cpu"))
        q_len = cache_position.size(0)

        # generate
        if q_len == 1:
            x = inputs_embeds[:, -1:, :]
            position_ids = position_ids[:, -1:]
            return self.forward_chunk(
                x,
                causal_mask,
                position_ids,
                past_key_values,
                output_attentions,
                use_cache,
                cache_position,
                output_hidden_states,
                return_dict,
            )
        elif q_len <= chunck_size:
            inputs_embeds = inputs_embeds.to('cuda')
            output = self.forward_chunk(
                inputs_embeds,
                causal_mask,
                position_ids,
                past_key_values,
                output_attentions,
                use_cache,
                cache_position,
                output_hidden_states,
                return_dict,
            )
            KLlamaModel.dynamic_sdpa.calc_anchor(cache_position[-1] + 1)
            KLlamaModel.dynamic_sdpa.clear_importance(cache_position[-1] + 1)
            return output
        cur_idx = 0
        assert (
            output_attentions == False
        ), "output_attentions is not supported when using chunked attention"
        attn_output = None
        # prefill
        KLlamaModel.dynamic_sdpa.remaining_length = q_len
        while cur_idx < q_len:
            print(f'current prefill length: {cur_idx}')
            chunk_mask = None
            if inputs_embeds.device.type == 'cpu':
                tmp_inputs_embeds = inputs_embeds[:, cur_idx : min(cur_idx + chunck_size, q_len)].to("cuda")
            else:
                tmp_inputs_embeds = inputs_embeds[:, cur_idx : min(cur_idx + chunck_size, q_len)]
            output_with_past = self.forward_chunk(
                tmp_inputs_embeds,
                chunk_mask,
                position_ids[:, cur_idx : min(cur_idx + chunck_size, q_len)],
                past_key_values,
                output_attentions,
                use_cache,
                cache_position[cur_idx : min(cur_idx + chunck_size, q_len)],
            )
            cur_output = output_with_past.last_hidden_state
            KLlamaModel.dynamic_sdpa.remaining_length -= (
                min(cur_idx + chunck_size, q_len) - cur_idx
            )
            cur_idx += chunck_size
            # if attn_output is None:
            attn_output = cur_output
            # else:
            #     attn_output = torch.cat((attn_output, cur_output), dim=-2)

        KLlamaModel.dynamic_sdpa.calc_anchor(cache_position[-1] + 1)
        KLlamaModel.dynamic_sdpa.clear_importance(cache_position[-1] + 1)
        return BaseModelOutputWithPast(last_hidden_state=attn_output)

    def forward_chunk(
        self,
        inputs_embeds,
        causal_mask,
        position_ids,
        past_key_values,
        output_attentions,
        use_cache,
        cache_position,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):

        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_legacy_cache = False
        if use_cache and not isinstance(
            past_key_values, Cache
        ):  # kept for BC (non `Cache` `past_key_values` inputs)
            return_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None
        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = (
            past_key_values.get_seq_length() if past_key_values is not None else 0
        )
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and not using_static_cache
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            if attention_mask.max() != 0:
                raise ValueError(
                    "Custom 4D attention mask should be passed in inverted form with max==0`"
                )
            causal_mask = attention_mask
        else:
            causal_mask = torch.full(
                (sequence_length, target_length),
                fill_value=min_dtype,
                dtype=dtype,
                device=device,
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(
                target_length, device=device
            ) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(
                input_tensor.shape[0], 1, -1, -1
            )
            if attention_mask is not None:
                causal_mask = (
                    causal_mask.clone()
                )  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = (
                    causal_mask[:, :, :, :mask_length]
                    + attention_mask[:, None, None, :]
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[
                    :, :, :, :mask_length
                ].masked_fill(padding_mask, min_dtype)
        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(
                causal_mask, min_dtype
            )

        return causal_mask


================================================
FILE: archive/ktransformers/operators/triton_attention.py
================================================
# Adapted from
# https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
# which was originally adapted from
# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py
# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage2.py

import triton
import triton.language as tl
from ktransformers.util.vendors import device_manager, get_device, to_device, GPUVendor
@triton.jit
def tanh(x):
    # Tanh is just a scaled sigmoid
    return 2 * tl.sigmoid(2 * x) - 1

@triton.jit
def _fwd_grouped_kernel_stage1(
    Q,
    K_Buffer,
    V_Buffer,
    sm_scale,
    Req_to_tokens,
    B_Seqlen,
    Att_Out,
    stride_req_to_tokens_b,
    stride_qbs,
    stride_qh,
    stride_buf_kbs,
    stride_buf_kh,
    stride_buf_vbs,
    stride_buf_vh,
    stride_mid_ob,
    stride_mid_oh,
    stride_mid_os,
    kv_group_num: tl.constexpr,
    q_head_num: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    BLOCK_DPE: tl.constexpr,
    BLOCK_DV: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_H: tl.constexpr,
    NUM_KV_SPLITS: tl.constexpr,
    PAGE_SIZE: tl.constexpr,
    logit_cap: tl.constexpr,
    Lk: tl.constexpr,
    Lv: tl.constexpr,
):
    cur_batch = tl.program_id(0)
    cur_head_id = tl.program_id(1)
    cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H)
    split_kv_id = tl.program_id(2)

    if kv_group_num > BLOCK_H:
        VALID_BLOCK_H: tl.constexpr = BLOCK_H
    else:
        VALID_BLOCK_H: tl.constexpr = kv_group_num
    cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H)
    mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H
    mask_h = mask_h & (cur_head < q_head_num)

    offs_d = tl.arange(0, BLOCK_DMODEL)
    offs_dv = tl.arange(0, BLOCK_DV)
    mask_d = offs_d < Lk
    mask_dv = offs_dv < Lv
    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
    cur_batch_req_idx = cur_batch

    offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[
        None, :]
    q = tl.load(Q + offs_q,
                mask=(mask_h[:, None]) & (mask_d[None, :]),
                other=0.0)

    if BLOCK_DPE > 0:
        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
        mask_dpe = offs_dpe < Lk
        off_qpe = (cur_batch * stride_qbs + cur_head[:, None] * stride_qh +
                   offs_dpe[None, :])
        qpe = tl.load(Q + off_qpe,
                      mask=(mask_h[:, None]) & (mask_dpe[None, :]),
                      other=0.0)

    kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
    split_kv_start = kv_len_per_split * split_kv_id
    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
                              cur_batch_seq_len)
    
    e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf")
    e_sum = tl.zeros([BLOCK_H], dtype=tl.float32)
    acc = tl.zeros([BLOCK_H, BLOCK_DV], dtype=tl.float32)

    if split_kv_end > split_kv_start:
        for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
            offs_n = start_n + tl.arange(0, BLOCK_N)
            kv_page_number = tl.load(
                Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx +
                offs_n // PAGE_SIZE,
                mask=offs_n < split_kv_end,
                other=0,
            )
            kv_loc = kv_page_number * PAGE_SIZE + offs_n % PAGE_SIZE
            offs_buf_k = (kv_loc[None, :] * stride_buf_kbs +
                          cur_kv_head * stride_buf_kh + offs_d[:, None])
            k = tl.load(
                K_Buffer + offs_buf_k,
                mask=(offs_n[None, :] < split_kv_end) & (mask_d[:, None]),
                other=0.0,
            )
            qk = tl.dot(q, k.to(q.dtype))
            
            if BLOCK_DPE > 0:
                offs_buf_kpe = (kv_loc[None, :] * stride_buf_kbs +
                                cur_kv_head * stride_buf_kh +
                                offs_dpe[:, None])
                kpe = tl.load(
                    K_Buffer + offs_buf_kpe,
                    mask=(offs_n[None, :] < split_kv_end) &
                    (mask_dpe[:, None]),
                    other=0.0,
                )
                qk += tl.dot(qpe, kpe.to(qpe.dtype))
            qk *= sm_scale

            if logit_cap > 0:
                qk = logit_cap * tanh(qk / logit_cap)

            qk = tl.where(mask_h[:, None] & (offs_n[None, :] < split_kv_end),
                          qk, float("-inf"))

            offs_buf_v = (kv_loc[:, None] * stride_buf_vbs +
                          cur_kv_head * stride_buf_vh + offs_dv[None, :])
            v = tl.load(
                V_Buffer + offs_buf_v,
                mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]),
                other=0.0,
            )

            n_e_max = tl.maximum(tl.max(qk, 1), e_max)
            re_scale = tl.exp(e_max - n_e_max)
            p = tl.exp(qk - n_e_max[:, None])
            acc *= re_scale[:, None]
            acc += tl.dot(p.to(v.dtype), v)

            e_sum = e_sum * re_scale + tl.sum(p, 1)
            e_max = n_e_max

        offs_mid_o = (cur_batch * stride_mid_ob +
                      cur_head[:, None] * stride_mid_oh +
                      split_kv_id * stride_mid_os + offs_dv[None, :])

        tl.store(
            Att_Out + offs_mid_o,
            acc / e_sum[:, None],
            mask=(mask_h[:, None]) & (mask_dv[None, :]),
        )

        offs_mid_o_1 = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh +
                        split_kv_id * stride_mid_os + Lv)

        tl.store(
            Att_Out + offs_mid_o_1,
            e_max + tl.log(e_sum),
            mask=mask_h,
        )

def _decode_grouped_att_m_fwd(
    q,
    k_buffer,
    v_buffer,
    att_out,
    Req_to_tokens,
    B_Seqlen,
    num_kv_splits,
    sm_scale,
    page_size,
    logit_cap,
):
    BLOCK = 32
    Lk = k_buffer.shape[-1]
    Lv = v_buffer.shape[-1]

    # [TODO] work around shmem limit on MI3xx
    
    # TODO: support hip
    if device_manager.gpu_vendor == GPUVendor.AMD and Lk >= 576:
       BLOCK = 16

    if Lk == 576:
        BLOCK_DMODEL = 512
        BLOCK_DPE = 64
    elif Lk == 288:
        BLOCK_DMODEL = 256
        BLOCK_DPE = 32
    else:
        BLOCK_DMODEL = triton.next_power_of_2(Lk)
        BLOCK_DPE = 0
    BLOCK_DV = triton.next_power_of_2(Lv)

    batch, head_num = q.shape[0], q.shape[1]
    kv_group_num = q.shape[1] // k_buffer.shape[-2]

    BLOCK_H = 16
    NUM_KV_SPLITS = num_kv_splits
    grid = (
        batch,
        triton.cdiv(head_num, min(BLOCK_H, kv_group_num)),
        NUM_KV_SPLITS,
    )

    extra_kargs = {}
    # TODO: support hip
    """
    if is_hip_:
        # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
        # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
        extra_kargs = {
            "waves_per_eu": 4,
            "matrix_instr_nonkdim": 16,
            "kpack": 2
        }
    """
    
    _fwd_grouped_kernel_stage1[grid](
        q,
        k_buffer,
        v_buffer,
        sm_scale,
        Req_to_tokens,
        B_Seqlen,
        att_out,
        Req_to_tokens.stride(0),
        q.stride(0),
        q.stride(1),
        k_buffer.stride(-3),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
        k_buffer.stride(-2),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
        v_buffer.stride(-3),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
        v_buffer.stride(-2),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
        att_out.stride(0),
        att_out.stride(1),
        att_out.stride(2),
        kv_group_num=kv_group_num,
        q_head_num=head_num,
        BLOCK_DMODEL=BLOCK_DMODEL,
        BLOCK_DPE=BLOCK_DPE,
        BLOCK_DV=BLOCK_DV,
        BLOCK_N=BLOCK,
        BLOCK_H=BLOCK_H,
        NUM_KV_SPLITS=NUM_KV_SPLITS,
        PAGE_SIZE=page_size,
        logit_cap=logit_cap,
        num_warps=4,
        num_stages=2,
        Lk=Lk,
        Lv=Lv,
        **extra_kargs,
    )

@triton.jit
def _fwd_kernel_stage2(
    Mid_O,
    o,
    B_Seqlen,
    stride_mid_ob,
    stride_mid_oh,
    stride_mid_os,
    stride_obs,
    stride_oh,
    NUM_KV_SPLITS: tl.constexpr,
    BLOCK_DV: tl.constexpr,
    Lv: tl.constexpr,
):
    cur_batch = tl.program_id(0)
    cur_head = tl.program_id(1)

    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)

    offs_d = tl.arange(0, BLOCK_DV)
    mask_d = offs_d < Lv

    e_sum = 0.0
    e_max = -float("inf")
    acc = tl.zeros([BLOCK_DV], dtype=tl.float32)

    offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d
    offs_logic = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + Lv

    for split_kv_id in range(0, NUM_KV_SPLITS):
        kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
        split_kv_start = kv_len_per_split * split_kv_id
        split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
                                  cur_batch_seq_len)

        if split_kv_end > split_kv_start:
            tv = tl.load(Mid_O + offs_v + split_kv_id * stride_mid_os,
                         mask=mask_d,
                         other=0.0)
            tlogic = tl.load(Mid_O + offs_logic + split_kv_id * stride_mid_os)
            n_e_max = tl.maximum(tlogic, e_max)

            old_scale = tl.exp(e_max - n_e_max)
            acc *= old_scale
            exp_logic = tl.exp(tlogic - n_e_max)
            acc += exp_logic * tv

            e_sum = e_sum * old_scale + exp_logic
            e_max = n_e_max

    tl.store(
        o + cur_batch * stride_obs + cur_head * stride_oh + offs_d,
        acc / e_sum,
        mask=mask_d,
    )

def _decode_softmax_reducev_fwd(
    logits,
    q,
    o,
    v_buffer,
    b_seq_len,
    num_kv_splits,
):
    batch, head_num = q.shape[0], q.shape[1]
    Lv = v_buffer.shape[-1]
    BLOCK_DV = triton.next_power_of_2(Lv)

    NUM_KV_SPLITS = num_kv_splits

    extra_kargs = {}
    # TODO: support hip
    """
    if is_hip_:
        # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
        # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
        extra_kargs = {
            "waves_per_eu": 4,
            "matrix_instr_nonkdim": 16,
            "kpack": 2
        }
    """
    
    grid = (batch, head_num)
    _fwd_kernel_stage2[grid](
        logits,
        o,
        b_seq_len,
        logits.stride(0),
        logits.stride(1),
        logits.stride(2),
        o.stride(0),
        o.stride(1),
        NUM_KV_SPLITS=NUM_KV_SPLITS,
        BLOCK_DV=BLOCK_DV,
        Lv=Lv,
        num_warps=4,
        num_stages=2,
        **extra_kargs,
    )

def decode_attention_fwd_grouped(
    q,
    k_buffer,
    v_buffer,
    o,
    req_to_token,
    b_seq_len,
    attn_logits,
    num_kv_splits,
    sm_scale,
    page_size,
    logit_cap=0.0,
):
    _decode_grouped_att_m_fwd(
        q,
        k_buffer,
        v_buffer,
        attn_logits,
        req_to_token,
        b_seq_len,
        num_kv_splits,
        sm_scale,
        page_size,
        logit_cap,
    )

    _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, b_seq_len,
                                num_kv_splits)


================================================
FILE: archive/ktransformers/operators/triton_attention_prefill.py
================================================

# Adapted from
# https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/prefill_attention.py
# which was originally adapted from
# https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1

"""
Memory-efficient attention for prefill.
It supporst page size = 1.
"""

# Adapted from
# https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1
import torch
import triton
import triton.language as tl

is_cuda_available = torch.cuda.is_available()
if is_cuda_available:
    CUDA_CAPABILITY = torch.cuda.get_device_capability()


@triton.jit
def _fwd_kernel(
    Q,
    K,
    V,
    sm_scale,
    B_Start_Loc,
    B_Seqlen,
    Out,
    stride_qbs,
    stride_qh,
    stride_kbs,
    stride_kh,
    stride_vbs,
    stride_vh,
    stride_obs,
    stride_oh,
    kv_group_num: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    BLOCK_N: tl.constexpr,
    IS_CAUSAL: tl.constexpr,
    Lk: tl.constexpr,
):
    cur_batch = tl.program_id(0)
    cur_head = tl.program_id(1)
    start_m = tl.program_id(2)

    cur_kv_head = cur_head // kv_group_num

    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
    cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)

    block_start_loc = BLOCK_M * start_m

    # initialize offsets
    offs_n = tl.arange(0, BLOCK_N)
    offs_d = tl.arange(0, BLOCK_DMODEL)
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    off_q = (
        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs
        + cur_head * stride_qh
        + offs_d[None, :]
    )
    off_k = offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None]
    off_v = offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :]

    mask_d = offs_d < Lk

    q = tl.load(
        Q + off_q,
        mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :]),
        other=0.0,
    )

    k_ptrs = K + off_k
    v_ptrs = V + off_v

    # initialize pointer to m and l
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)

    block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0)

    end_n = (
        cur_batch_seq_len
        if not IS_CAUSAL
        else tl.minimum((start_m + 1) * BLOCK_M, cur_batch_seq_len)
    )
    for start_n in range(0, block_mask * end_n, BLOCK_N):
        start_n = tl.multiple_of(start_n, BLOCK_N)
        # -- compute qk ----
        k = tl.load(
            k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs,
            mask=((start_n + offs_n[None, :]) < cur_batch_seq_len) & (mask_d[:, None]),
            other=0.0,
        )
        # mask = tl.load(mask_ptrs + start_n, mask=start_n + offs_n < cur_batch_end_loc, other=0.0)

        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk += tl.dot(q, k)
        qk *= sm_scale

        if IS_CAUSAL:
            qk += tl.where(
                (start_n + offs_n[None, :] < cur_batch_seq_len)
                & (offs_m[:, None] >= (start_n + offs_n[None, :])),
                0,
                float("-inf"),
            )
        else:
            qk += tl.where(
                (start_n + offs_n[None, :]) < cur_batch_seq_len, 0, float("-inf")
            )

        # -- compute m_ij, p, l_ij
        m_ij = tl.max(qk, 1)
        p = tl.exp(qk - m_ij[:, None])
        l_ij = tl.sum(p, 1)
        # -- update m_i and l_i
        m_i_new = tl.maximum(m_i, m_ij)
        alpha = tl.exp(m_i - m_i_new)
        beta = tl.exp(m_ij - m_i_new)
        l_i_new = alpha * l_i + beta * l_ij
        # -- update output accumulator --
        # scale p
        p_scale = beta / l_i_new
        p = p * p_scale[:, None]
        # scale acc
        acc_scale = l_i / l_i_new * alpha
        acc = acc * acc_scale[:, None]
        # update acc
        v = tl.load(
            v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs,
            mask=((start_n + offs_n[:, None]) < cur_batch_seq_len) & (mask_d[None, :]),
            other=0.0,
        )

        p = p.to(v.dtype)
        acc += tl.dot(p, v)
        # update m_i and l_i
        l_i = l_i_new
        m_i = m_i_new
    # initialize pointers to output
    off_o = (
        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs
        + cur_head * stride_oh
        + offs_d[None, :]
    )
    out_ptrs = Out + off_o
    tl.store(
        out_ptrs, acc, mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :])
    )


def context_attention_fwd(
    q, k, v, o, b_start_loc, b_seq_len, max_input_len, is_causal=True
):
    """
    q, k, v: [b * s, head, head_dim]
    b_start_loc: [b]
    b_seq_len: [b]
    out: [b * s, head, head_dim]
    """
    if is_cuda_available and CUDA_CAPABILITY[0] > 8:
        BLOCK = 128
    else:
        BLOCK = 64

    Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]

    sm_scale = 1.0 / (Lq**0.5)
    batch, head = b_seq_len.shape[0], q.shape[1]
    kv_group_num = q.shape[1] // k.shape[1]

    grid = (batch, head, triton.cdiv(max_input_len, BLOCK))
    num_warps = 4 if Lk <= 64 else 8

    _fwd_kernel[grid](
        q,
        k,
        v,
        sm_scale,
        b_start_loc,
        b_seq_len,
        o,
        q.stride(0),
        q.stride(1),
        k.stride(0),
        k.stride(1),
        v.stride(0),
        v.stride(1),
        o.stride(0),
        o.stride(1),
        kv_group_num=kv_group_num,
        BLOCK_M=BLOCK,
        BLOCK_DMODEL=triton.next_power_of_2(Lk),
        BLOCK_N=BLOCK,
        IS_CAUSAL=is_causal,
        num_warps=num_warps,
        num_stages=1,
        Lk=Lk,
    )

================================================
FILE: archive/ktransformers/optimize/optimize.py
================================================
'''
Description  :  
Author       : Boxin Zhang, Azure-Tang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
from typing import Mapping, List
import torch
import yaml
import re
from torch import nn
from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
# from operators import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader, ModelLoaderFactory
from ktransformers.util.custom_gguf import translate_name_to_gguf
from ktransformers.util import utils
from ktransformers.util.utils import set_module, load_weights
import itertools
import copy

try:
    import torch_npu
    use_torch_npu = torch_npu.npu.is_available()
except:
    use_torch_npu = False

def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader:GGUFLoader, prefix=''):
    for name, child in module._modules.items():
        if child is not None:
            child_prefix = prefix + name
            if child_prefix in local_optimization_dict:
                inject_module_meta=local_optimization_dict[child_prefix]
                if inject_module_meta["class"] != "default":
                    import_path = inject_module_meta["class"].split(".")
                    import_module_name = ".".join(import_path[:-1])
                    gguf_loader.tensor_device_map[inject_module_meta["key"]] = inject_module_meta["kwargs"] if "kwargs" in inject_module_meta else dict()
                    import_class_name = import_path[-1]
                    module_cls=getattr(__import__(import_module_name, fromlist=[""]), import_class_name)
                    if use_torch_npu:
                        print(f"Injecting {child_prefix} as", import_module_name, ".",
                            import_class_name) if torch.distributed.get_rank() == 0 else None #TODO 分布式
                    else: 
                        print(f"Injecting {child_prefix} as", import_module_name, ".", import_class_name)
                    inject_module=module_cls(key = inject_module_meta["key"], gguf_loader = gguf_loader, config = model_config, orig_module=child, **inject_module_meta["kwargs"])
                    set_module(module, name, inject_module)
                elif inject_module_meta["class"] == "default":
                    print(f"Injecting {child_prefix} as default")
                    gguf_loader.tensor_device_map[inject_module_meta["key"]] = inject_module_meta["kwargs"] if "kwargs" in inject_module_meta else dict()
                else:
                    raise Exception("inject_module_meta[\"class\"] must be \"default\" or a class path")
                child_prefix += "."
                child_optimization_dict = {k: v for k, v in local_optimization_dict.items() if k.startswith(child_prefix)}
                inject(child, child_optimization_dict, model_config, gguf_loader, child_prefix)

def del_meta(module:nn.Module):
    #print("default loading weights", prefix)
    persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
    local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
    local_state = {k: v for k, v in local_name_params if v is not None}
    for name, param in local_state.items():
        if param.device == "meta" or param.device == torch.device("meta"):
            module.__delattr__(name)
    for name, child in module._modules.items():
        del_meta(child)

def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, prefix: str="", default_device: str = "cuda:0"):
    module_name = prefix[:-1]
    if use_torch_npu:
        translated_name = translate_name_to_gguf(prefix)[:-1]
    recursive = True
    for rule in rule_list:
        match_meta = rule["match"]
        if "class" not in match_meta and "name" not in match_meta:
            raise Exception("match must have at least one of \"class\" and \"name\"")
        if "class" in match_meta:
            import_path = match_meta["class"].split(".")
            import_module_name = ".".join(import_path[:-1])
            import_class_name = import_path[-1]
            module_cls=getattr(__import__(import_module_name, fromlist=[""]), import_class_name)
            if not isinstance(module, module_cls):
                continue
        if "name" in match_meta:
            if re.search(match_meta["name"], module_name) is None:
                continue
        if "replace" not in rule:
            raise Exception("replace must be in rule")
        if "replace" in rule:
            replace_meta = rule["replace"]
            if module_name not in out_data:
                out_data[module_name]={"key": module_name if not use_torch_npu else translated_name,
                                    "class": replace_meta["class"] if "class" in replace_meta else "default",
                                    # "device": replace_meta["device"] if "device" in replace_meta else default_device,
                                    "kwargs": copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict()}
            else:
                if out_data[module_name]["class"] == "default":
                    out_data[module_name]["class"] = replace_meta["class"] if "class" in replace_meta else "default"
                out_data[module_name]["kwargs"].update(copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict())
        if "recursive" in rule:
            recursive = bool(rule["recursive"])
        break
            
    if module_name not in out_data:
        out_data[module_name]= {
            "class": "default",
            "key": module_name if not use_torch_npu else translated_name,
            "kwargs": {"generate_device": default_device,
                       "prefill_device": default_device}
        }

    #print(out_data[module_name])
    #input()

    if recursive:
        for name, child in module._modules.items():
            if child is not None:
                child_prefix = prefix + name + "."
                gen_optimize_config(child, out_data, rule_list, child_prefix, default_device = default_device)
    

def translate_model_config(model_config: PretrainedConfig):
    # for supporting some special model 
    if model_config.model_type == "mixtral":
        model_config.moe_intermediate_size = model_config.intermediate_size
    
    return model_config


def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, model_config: PretrainedConfig, default_device: str = "cuda:0", q4_gguf_path=""):
    with open(rule_file, 'r', encoding='utf-8') as f:
        rule_list = yaml.load(f.read(), Loader=yaml.FullLoader)
    
    optimize_config = dict()
    gen_optimize_config(module, optimize_config, rule_list, default_device = default_device)
    
    model_config = translate_model_config(model_config)

    if use_torch_npu:
        if q4_gguf_path:
            q4_gguf_loader = GGUFLoader(q4_gguf_path)
            utils.Q4_GGUF_LODER = q4_gguf_loader
        gguf_loader = GGUFLoader(gguf_path, getattr(model_config, "quantize", None))
        with torch.device("meta"):
            inject(module, optimize_config, model_config, gguf_loader)
        # pre load lm_head because its big inter result
        load_weights(module.lm_head, gguf_loader, "lm_head.")
        load_weights(module, gguf_loader)
        module.gguf_loader = gguf_loader
    else:
        weights_loader = ModelLoaderFactory.create_loader(gguf_path)
        with torch.device("meta"):
            inject(module, optimize_config, model_config, weights_loader)
        # pre load lm_head because its big inter result
        load_weights(module.lm_head, weights_loader, "lm_head.", device=default_device)
        load_weights(module, weights_loader, device=default_device)
        module.gguf_loader = weights_loader
    del_meta(module)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.xpu.is_available():
        torch.xpu.empty_cache()
    else:
        torch.cuda.empty_cache()


================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.([0-9]|[1][0-4])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

- match:
    name: "^model\\.layers\\.([0-9]|[1][0-4])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([0-9]|[1][0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

- match:
    name: "^model\\.layers\\.([0-9]|[1][0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:2"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:2"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:3"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:3"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([0-9]|[1][0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        15: "cuda:1"
        30: "cuda:2"
        45: "cuda:3"

- match:
    name: "^model\\.layers\\.([0-9]|[1][0-4])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "(^model\\.layers\\.([2][0-9]|[1][5-9])\\.)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "(^model\\.layers\\.([3][0-9]|[4][0-4])\\.)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
      
- match:
    name: "(^model\\.layers\\.([5][0-9]|[4][5-9])\\.)|(^model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([345][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
  
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        30: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "(^model\\.layers\\.([345][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-gpu-cpu.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

# === Rotary Embedding Replacement ===

# GPU 0: layers 0–9
- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
# CPU: layers 10-29
- match:
    name: "^model\\.layers\\.([12][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

# === Linear Layers Replacement (excluding self_attn) ===

# GPU 0: layers 0–9
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
# CPU: layers 10-29
- match:
    name: "^model\\.layers\\.([12][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
      generate_op: "KLinearCPUInfer"
      prefill_op: "KLinearTorch"
      out_device: "cpu"

# === MLP (MoE) Replacement ===

# GPU 0: layers 0–9
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
# CPU: layers 10-29
- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

# === MLP Gate Replacement ===

# GPU 0: layers 0–9
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
# CPU: layers 10-29
- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

# === MLP Experts Replacement ===

# GPU 0: layers 0–9
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module
# CPU: layers 10-29
- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cpu"
  recursive: False # don't recursively inject submodules of this module

# === Self-Attention Replacement ===

# GPU 0: layers 0–9
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
# CPU: layers 10-29
- match:
    name: "^model\\.layers\\.([12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

# === Overall Model Replacement with Transfer Map ===

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map:
        10: "cpu"

# === Default Catch-All for Other Modules ===#
# GPU 0: layers 0–9
- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

#lmm_head on GPU 0
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# CPU: layers 10-29
- match:
    name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"


================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\.(?!self_attn).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([12][0-9])\\.(?!self_attn).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
  
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        10: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-amx.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve-amx.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoEV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "llamafile"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.flashinfer_attn # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
  replace:
    class: ktransformers.operators.layernorm.RMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
  replace:
    class:  ktransformers.operators.mlp.kDeepseekV3MLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoEV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.flashinfer_attn # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
  replace:
    class: ktransformers.operators.layernorm.RMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
  replace:
    class:  ktransformers.operators.mlp.kDeepseekV3MLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

# === Rotary Embedding Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === Linear Layers Replacement (excluding self_attn.kv_b_proj) ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# === MLP (MoE) Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === MLP Gate Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === MLP Experts Replacement ===
# replace with marlin expert. Open and modify layer-num as needed.
# Each layer of malin experts takes about 6GB of GPU memory.
# !!!Do remember 'close' cuda graph if you are using marlin expert.!!!
# !!!KExpertsTorch is untested, we don't have enough VRAM.!!!

# GPU 0: layers 3–4
# - match:
#     name: "^model\\.layers\\.([3-4])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:0"
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 1: layers 15–17
# - match:
#     name: "^model\\.layers\\.(1[5-7])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:1"
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 2: layers 30–32
# - match:
#     name: "^model\\.layers\\.(3[0-2])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:2"
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 3: layers 45–46
# - match:
#     name: "^model\\.layers\\.(4[5-6])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:3"
#       generate_op:  "KExpertsMarlin"
#   recursive: False


# === MLP Experts Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:2"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:2"
  recursive: False

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:3"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:3"
  recursive: False

# === Self-Attention Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      absorb_for_prefill: False

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      absorb_for_prefill: False

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      absorb_for_prefill: False

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      absorb_for_prefill: False

# === Overall Model Replacement with Transfer Map ===

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 means close layer‐wise prefill
      transfer_map:
        15: "cuda:1" # Layers 15+ on GPU 1
        30: "cuda:2" # Layers 30+ on GPU 2
        45: "cuda:3" # Layers 45+ on GPU 3

# === Default Catch-All for Other Modules ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# For final modules (model.norm), ensure they are on GPU 3 (as in your original config)
- match:
    name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"


================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

# === Rotary Embedding Replacement ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.([3][2-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:4"
      prefill_device: "cuda:4"

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:5"
      prefill_device: "cuda:5"

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:6"
      prefill_device: "cuda:6"

# GPU 7: layers 56–60
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"


# === Linear Layers Replacement (excluding self_attn.kv_b_proj) ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.(3[2-9])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:4"
      prefill_device: "cuda:4"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:5"
      prefill_device: "cuda:5"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:6"
      prefill_device: "cuda:6"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 7: layers 56–63
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"


# === MLP (MoE) Replacement ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.(3[2-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:4"
      prefill_device: "cuda:4"

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:5"
      prefill_device: "cuda:5"

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:6"
      prefill_device: "cuda:6"

# GPU 7: layers 56–60
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"

# === MLP Gate Replacement ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.(3[2-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:4"
      prefill_device: "cuda:4"

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:5"
      prefill_device: "cuda:5"

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:6"
      prefill_device: "cuda:6"

# GPU 7: layers 56–60
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"


# === MLP Experts Replacement ===
# replace with marlin expert. Open and modify layer-num as needed.
# Each layer of malin experts takes about 6GB of GPU memory.
# !!!Do remember 'close' cuda graph if you are using marlin expert.!!!
# !!!Loading marlin expert will take signifcant time.!!!

# GPU 0: layers 0–7
# - match:
#     name: "^model\\.layers\\.([0-7])\\.mlp\\.experts$" # inject experts in layer 0~4 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts  
#     kwargs:
#       generate_device: "cuda:0"
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 1: layers 8–15
# - match:
#     name: "^model\\.layers\\.([8-9]|1[0-5)\\.mlp\\.experts$" # inject experts in layer 30~31 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:1"
#       generate_op:  "KExpertsMarlin"
#   recursive: False 

# # GPU 2: layers 16–23
# - match:
#     name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.experts$" # inject experts in layer 0~4 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts  
#     kwargs:
#       generate_device: "cuda:0" 
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 3: layers 24–31
# - match:
#     name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.experts$" # inject experts in layer 30~31 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:1"
#       generate_op:  "KExpertsMarlin"
#   recursive: False 

# # GPU 4: layers 32–39
# - match:
#     name: "^model\\.layers\\.(3[2-9])\\.mlp\\.experts$" # inject experts in layer 0~4 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts  
#     kwargs:
#       generate_device: "cuda:0" 
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 5: layers 40–47
# - match:
#     name: "^model\\.layers\\.(4[0-7])\\.mlp\\.experts$" # inject experts in layer 30~31 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:1"
#       generate_op:  "KExpertsMarlin"
#   recursive: False 

# # GPU 6: layers 48–55
# - match:
#     name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.experts$" # inject experts in layer 0~4 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts  
#     kwargs:
#       generate_device: "cuda:0"
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 7: layers 56–60
# - match:
#     name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.experts$" # inject experts in layer 30~31 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:1"
#       generate_op:  "KExpertsMarlin"
#   recursive: False 


# === MLP Experts Replacement ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:2"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:2"
  recursive: False

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:3"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:3"
  recursive: False

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.(3[2-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:4"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:4"
  recursive: False

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:5"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:5"
  recursive: False

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:6"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:6"
  recursive: False

# GPU 7: layers 56–60
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:7"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:7"
  recursive: False


# === Self-Attention Replacement ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.(3[2-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:4"
      prefill_device: "cuda:4"

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:5"
      prefill_device: "cuda:5"

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:6"
      prefill_device: "cuda:6"

# GPU 7: layers 56–60
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"

# === Overall Model Replacement with Transfer Map ===

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 means close layer‐wise prefill
      transfer_map:
        8: "cuda:1"
        16: "cuda:2"
        24: "cuda:3"
        32: "cuda:4"
        40: "cuda:5"
        48: "cuda:6"
        56: "cuda:7"

# === Default Catch-All for Other Modules ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.(3[2-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:4"
      prefill_device: "cuda:4"

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:5"
      prefill_device: "cuda:5"

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:6"
      prefill_device: "cuda:6"

# GPU 7: layers 56–63
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# For final modules (model.norm), ensure they are on GPU 7 (as in your original config)
- match:
    name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"
  
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        30: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


- match:
    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
  
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-4])\\.mlp\\.experts$" # inject experts in layer 0~4 as marlin expert
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  
    kwargs:
      generate_device: "cuda:0" # run in cuda:0
      generate_op:  "KExpertsMarlin"
  recursive: False

- match:
    name: "^model\\.layers\\.([3][0])\\.mlp\\.experts$" # inject experts in layer 30~31 as marlin expert
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      generate_device: "cuda:1"
      generate_op:  "KExpertsMarlin"
  recursive: False 

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        30: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
  
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        30: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-npu.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "npu:0"
      prefill_device: "npu:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "npu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "npu"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-serve.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoEV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.flashinfer_attn # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
  replace:
    class: ktransformers.operators.layernorm.RMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
  replace:
    class:  ktransformers.operators.mlp.kDeepseekV3MLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/Glm4Moe-serve.yaml
================================================
- match:
    class: ktransformers.models.modeling_glm4_moe.Glm4MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.KGlm4MoeRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

# - match:
#     name: "^model\\.layers\\..*$"  # regular expression 
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_glm4_moe.Glm4MoeMoE
  replace:
    class: ktransformers.operators.experts.KGlm4MoeMoE
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KGlm4Experts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: None
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KGlm4MoeAttention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_glm4_moe.Glm4MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KGlm4MoeRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_glm4_moe.Glm4MoeMLP
  replace:
    class:  ktransformers.operators.mlp.KGlm4MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/Internlm2_5-7b-Chat-1m.yaml
================================================
- match:
    class: ktransformers.models.modeling_llama.LlamaRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbeddingV2
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"
- match:
    class: ktransformers.models.modeling_llama.LlamaModel
  replace:
    class: ktransformers.operators.models.KLlamaModel
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill

- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KLlamaAttention
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"


================================================
FILE: archive/ktransformers/optimize/optimize_rules/Mixtral.yaml
================================================
- match:
    class: ktransformers.models.modeling_mixtral.MixtralRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*$"
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.block_sparse_moe$"
    class: ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock
  replace: 
    class: ktransformers.operators.experts.KMistralSparseMoEBlock
- match:
    name: "^model\\.layers\\..*\\.block_sparse_moe\\.experts$"
  replace: 
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\..*\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B-serve.yaml
================================================


- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoEV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.flashinfer_attn # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
  replace:
    class: ktransformers.operators.layernorm.RMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
  replace:
    class:  ktransformers.operators.mlp.kDeepseekV3MLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbeddingV4
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
# if want to use more VRAM, use experts Marlin and disable CUDA Graph(disable CUDA Graph may cause low performance)
#- match:
#    name: "^model\\.layers\\..*\\.mlp\\.experts$"
#  replace:
#    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
#    kwargs:
#      prefill_device: "cuda"
#      prefill_op: "KExpertsTorch"
#      generate_device: "cuda"
#      generate_op: "KExpertsMarlin"
#  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
================================================
- match:
    name: "^model\\.layers\\.([012])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([012])$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([012])\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock     # mlp module with custom forward function
- match:
    name: "^model\\.layers\\.([012])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    # device: "cpu"   # which devices to load this module when initializing
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock     # mlp module with custom forward function
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    # device: "cpu"   # which devices to load this module when initializing
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "(^model.norm)"
  replace:
    class: "default"
    kwargs:
        generate_device: "cuda:1"
        prefill_device: "cuda:1"

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        3: "cuda:1"

- match:
    name: "^model\\.layers\\.([012])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


================================================
FILE: archive/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
================================================
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    # device: "cpu"   # which devices to load this module when initializing
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"
- match:
    name: "^model\\.layers\\..*\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"


================================================
FILE: archive/ktransformers/optimize/optimize_rules/Qwen2-serve-amx.yaml
================================================
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# - match:
#     name: "^model\\.layers\\..*$"  # regular expression 
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlockV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen2MoeAttention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KQwen2MoeRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeMLP
  replace:
    class:  ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/Qwen2-serve.yaml
================================================
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# - match:
#     name: "^model\\.layers\\..*$"  # regular expression 
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlockV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen2MoeAttention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KQwen2MoeRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeMLP
  replace:
    class:  ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/Qwen3Moe-serve-amx.yaml
================================================
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

# - match:
#     name: "^model\\.layers\\..*$"  # regular expression 
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlockV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXBF16" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen3MoeAttention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KQwen3MoeRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeMLP
  replace:
    class:  ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/Qwen3Moe-serve.yaml
================================================
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

# - match:
#     name: "^model\\.layers\\..*$"  # regular expression 
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlockV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen3MoeAttention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KQwen3MoeRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeMLP
  replace:
    class:  ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/Qwen3Next-serve.yaml
================================================
- match:
    class: ktransformers.models.modeling_qwen3_next.Qwen3NextRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.KQwen3MoeRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen3_next.Qwen3NextSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen3NextSparseMoeBlockV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    class: ktransformers.models.modeling_qwen3_next.Qwen3NextGatedDeltaNet
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen3NextGatedDeltaNet # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_qwen3_next.Qwen3NextAttention
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen3NextAttention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"


- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_qwen3_next.Qwen3NextRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KQwen3NextRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_qwen3_next.Qwen3NextMLP
  replace:
    class:  ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/Smallthinker-serve.yaml
================================================
- match:
    class: ktransformers.models.modeling_smallthinker.SmallthinkerRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.KSmallthinkerRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

# - match:
#     name: "^model\\.layers\\..*$"  # regular expression 
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*feed_forward\\.shared_expert_gate).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.block_sparse_moe$"
    class: ktransformers.models.modeling_smallthinker.SmallthinkerMoeBlock
  replace:
    class: ktransformers.operators.experts.KSmallthinkerMoeBlock
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.block_sparse_moe\\.experts$"
  replace:
    class: ktransformers.operators.experts.KSmallthinkerExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: None
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KSmallthinkerAttention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_smallthinker.SmallthinkerRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KSmallthinkerRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_smallthinker.SmallthinkerDenseMlpBlock
  replace:
    class:  ktransformers.operators.mlp.KSmallthinkerDenseMlpBlock
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/npu/DeepSeek-V3-Chat-300IA2-npu-serve.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"

- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.ascend.ascend_linear.KTransformersLinearW8A8A2  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      generate_op: "KLinearTorchW8A8A2"
      prefill_op: "KLinearTorchW8A8A2"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.ascend.ascend_linear.KTransformersLinearW8A8A2  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      generate_op: "KLinearTorchW8A8A2"
      prefill_op: "KLinearTorchW8A8A2"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.ascend.ascend_experts.KDeepseekV3MoEW8A8     # mlp module with custom forward function
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"

- match:
    name: "^model\\.layers\\.([0-2])\\.mlp$"
    class: "ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP"
  replace:
    class: "ktransformers.operators.ascend.ascend_mlp.KDeepseekV3MLPW8A8A2V1"
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.shared_experts$"
    class: "ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP"
  replace:
    class: "ktransformers.operators.ascend.ascend_mlp.KDeepseekV3MLPW8A8A2V2"
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"

- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.ascend.ascend_gate.KDeepseekV3GateA2
    kwargs:
      generate_device: "npu:0"
      prefill_device: "npu:0"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.ascend.ascend_experts.KTransformersExpertsW8A8
    kwargs:
      prefill_device: "npu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPUW8A8"
      out_device: "npu"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
    class: ktransformers.operators.experts.KExpertsCPU
  replace:
    class: ktransformers.operators.ascend.ascend_experts.KExpertsCPUW8A8

- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.ascend.ascend_attention.KDeepseekV2AttentionW8A8A2Serve # optimized MLA implementation
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    name: "^model..*norm"
  replace:
    class: ktransformers.operators.ascend.ascend_layernorm.KDeepseekV3RMSNormW8A8
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/npu/DeepSeek-V3-Chat-300IA2-npu.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"

- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.ascend.ascend_linear.KTransformersLinearW8A8A2  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      generate_op: "KLinearTorchW8A8A2"
      prefill_op: "KLinearTorchW8A8A2"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.ascend.ascend_linear.KTransformersLinearW8A8A2  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      generate_op: "KLinearTorchW8A8A2"
      prefill_op: "KLinearTorchW8A8A2"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.ascend.ascend_experts.KDeepseekV3MoEW8A8     # mlp module with custom forward function
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"

- match:
    name: "^model\\.layers\\.([0-2])\\.mlp$"
    class: "ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP"
  replace:
    class: "ktransformers.operators.ascend.ascend_mlp.KDeepseekV3MLPW8A8A2V1"
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.shared_experts$"
    class: "ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP"
  replace:
    class: "ktransformers.operators.ascend.ascend_mlp.KDeepseekV3MLPW8A8A2V2"
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"

- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.ascend.ascend_gate.KDeepseekV3GateA2
    kwargs:
      generate_device: "npu:0"
      prefill_device: "npu:0"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.ascend.ascend_experts.KTransformersExpertsW8A8
    kwargs:
      prefill_device: "npu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPUW8A8"
      out_device: "npu"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
    class: ktransformers.operators.experts.KExpertsCPU
  replace:
    class: ktransformers.operators.ascend.ascend_experts.KExpertsCPUW8A8

- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.ascend.ascend_attention.KDeepseekV2AttentionW8A8A2 # optimized MLA implementation
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    name: "^model..*norm"
  replace:
    class: ktransformers.operators.ascend.ascend_layernorm.KDeepseekV3RMSNormW8A8
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/npu/Qwen3-Chat-300IA2-npu-serve.yaml
================================================
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"

- match:
    name: "^lm_head$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.ascend.ascend_linear.KTransformersLinearW8A8A2
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      generate_op: "KLinearTorchW8A8A2"
      prefill_op: "KLinearTorchW8A8A2"

- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate)(?!.*mlp\\.gate)(?!.*mlp\\.experts).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.ascend.ascend_linear.KTransformersLinearW8A8A2
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      generate_op: "KLinearTorchW8A8A2"
      prefill_op: "KLinearTorchW8A8A2"

- match:
    name: "^model\\.layers\\.(?!.*mlp\\.gate)(?!.*self_attn\\.kv_b_proj)(?!.*mlp\\.experts).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.ascend.ascend_linear.KTransformersLinearW8A8A2
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      generate_op: "KLinearTorchW8A8A2"
      prefill_op: "KLinearTorchW8A8A2"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.ascend.ascend_experts.KQwen3MoeSparseMoeBlockW8A8
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      dump_enable: False
      dump_dir: "/mnt/dump_from_mindie/dump_from_kt_moe"

- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.ascend.ascend_attention.KQwen3MoeAttentionW8A8A2Serve
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      absorb_for_prefill: False
      dump_enable: False
      dump_dir: "/mnt/dump_from_mindie/dump_from_kt_attn"

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0


- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeRMSNorm
  replace:
    class: ktransformers.operators.ascend.ascend_layernorm.KQwen3MoeRMSNormW8A8
    kwargs:
      generate_device: "npu"
      prefill_device: "npu"
      dump_enable: False
      dump_dir: "/mnt/dump_from_mindie/dump_from_kt_rms"


================================================
FILE: archive/ktransformers/optimize/optimize_rules/rocm/DeepSeek-V3-Chat.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cpu"
      prefill_device: "cuda"
      generate_op: "KLinearCPUInfer"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearQ8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/xpu/DeepSeek-V2-Chat.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^model\\.layers\\..*"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
      generate_op: "KLinearIPEXLLM"
      prefill_op: "KLinearIPEXLLM"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "xpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "xpu"
  recursive: False # don't recursively inject submodules of this module
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2RMSNorm
  replace:
    class: ktransformers.operators.layernorm.KDeepseekRMSNormIPEXLLM
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      device: "xpu"
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/xpu/DeepSeek-V3-Chat.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
      generate_op: "KLinearIPEXLLM"
      prefill_op: "KLinearIPEXLLM"
- match:
    name: "^model\\.layers\\..*"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
      generate_op: "KLinearIPEXLLM"
      prefill_op: "KLinearIPEXLLM"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
  replace:
    class: ktransformers.operators.layernorm.KDeepseekRMSNormIPEXLLM
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGateIPEXLLM
    kwargs:
      generate_device: "xpu:0"
      prefill_device: "xpu:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "xpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "xpu"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: archive/ktransformers/optimize/optimize_rules/xpu/Qwen3Moe-Chat.yaml
================================================
- match:
    name: "rotary_emb$"
  replace:
    class: ktransformers.operators.RoPE.KQwen3MoeRotaryEmbedding
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
      generate_op: "KLinearIPEXLLM"
      prefill_op: "KLinearIPEXLLM"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.gate).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
      generate_op: "KLinearIPEXLLM"
      prefill_op: "KLinearIPEXLLM"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: transformers.models.qwen3_moe.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlockV2     # mlp module with custom forward function
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "xpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "xpu"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KQwen3MoeAttentionIPEXLLM
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    class: transformers.models.qwen3_moe.modeling_qwen3_moe.Qwen3MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KDeepseekRMSNormIPEXLLM
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    class: transformers.models.qwen3_moe.modeling_qwen3_moe.Qwen3MoeMLP
  replace:
    class:  ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"


================================================
FILE: archive/ktransformers/server/__init__.py
================================================


================================================
FILE: archive/ktransformers/server/api/__init__.py
================================================
from fastapi import APIRouter

from .ollama import router as ollama_router
from .openai import router as openai_router,post_db_creation_operations
from .web import router as web_router

router = APIRouter()
router.include_router(ollama_router)
router.include_router(openai_router)
router.include_router(web_router)


================================================
FILE: archive/ktransformers/server/api/ollama/__init__.py
================================================
from fastapi import APIRouter

from .completions import router as completions_router

router = APIRouter()
router.include_router(completions_router)


================================================
FILE: archive/ktransformers/server/api/ollama/completions.py
================================================
from datetime import datetime
from http.client import NOT_IMPLEMENTED
import json
from time import time
from uuid import uuid4
from typing import List, Optional

from fastapi import APIRouter, Request
from pydantic import BaseModel, Field

from ktransformers.server.config.config import Config
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import check_link_response
from ktransformers.server.backend.base import BackendInterfaceBase

from ktransformers.server.schemas.endpoints.chat import RawUsage

router = APIRouter(prefix='/api')

# https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion
class OllamaGenerateCompletionRequest(BaseModel):
    model: str = Field(..., description="The model name, which is required.")
    prompt: Optional[str] = Field(
        None, description="The prompt to generate a response for.")
    images: Optional[List[str]] = Field(
        None, description="A list of base64-encoded images for multimodal models such as llava.")
    # Advanced parameters
    format: Optional[str] = Field(
        None, description="The format to return a response in, accepted value is json.")
    options: Optional[dict] = Field(
        None, description="Additional model parameters as listed in the documentation.")
    system: Optional[str] = Field(
        None, description="System message to override what is defined in the Modelfile.")
    template: Optional[str] = Field(
        None, description="The prompt template to use, overriding what is defined in the Modelfile.")
    context: Optional[str] = Field(
        None, description="The context parameter from a previous request to keep a short conversational memory.")
    stream: Optional[bool] = Field(
        None, description="If false, the response will be returned as a single response object.")
    raw: Optional[bool] = Field(
        None, description="If true, no formatting will be applied to the prompt.")
    keep_alive: Optional[str] = Field(
        "5m", description="Controls how long the model will stay loaded into memory following the request.")

class OllamaGenerationStreamResponse(BaseModel):
    model: str
    created_at: str
    response: str
    done: bool = Field(...)

class OllamaGenerationResponse(BaseModel):
    model: str
    created_at: str
    response: str
    done: bool

@router.post("/generate", tags=['ollama'])
async def generate(request: Request, input: OllamaGenerateCompletionRequest):
    id = str(uuid4())
    interface: BackendInterfaceBase = get_interface()
    print(f'COMPLETION INPUT:----\n{input.prompt}\n----')
    config = Config()

    if input.stream:
        async def inner():
            async for res in interface.inference(input.prompt, id):
                if isinstance(res, RawUsage):
                    raw_usage = res
                else: 
                    token, finish_reason = res
                    d = OllamaGenerationStreamResponse(
                        model=config.model_name,
                        created_at=str(datetime.now()),
                        response=token,
                        done=False
                    )
                    yield d.model_dump_json() + '\n'
            d = OllamaGenerationStreamResponse(
                model=config.model_name,
                created_at=str(datetime.now()),
                response='',
                done=True
            )
            yield d.model_dump_json() + '\n'
        return check_link_response(request, inner())
    else:
        complete_response = ""
        async for res in interface.inference(input.prompt, id):
            if isinstance(res, RawUsage):
                raw_usage = res
            else: 
                token, finish_reason = res
                complete_response += token
        response = OllamaGenerationResponse(
            model=config.model_name,
            created_at=str(datetime.now()),
            response=complete_response,
            done=True
        )
        return response
    
# https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-chat-completion
class OllamaChatCompletionMessage(BaseModel):
    role: str
    content: str

class OllamaChatCompletionRequest(BaseModel):
    model: str = Field(..., description="The model name, which is required.")
    messages: List[OllamaChatCompletionMessage] = Field(
        ..., description="A list of messages to generate a response for.")
    stream: bool = Field(True, description="If true, the response will be streamed.")

class OllamaChatCompletionStreamResponse(BaseModel):
    model: str
    created_at: str
    message: dict
    done: bool = Field(...)
    done_reason: Optional[str] = Field("", description="done_reason")
    total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds")
    load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds")
    prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt")
    prompt_eval_duration: Optional[int] = Field(None, description="Time spent evaluating prompt in nanoseconds")
    eval_count: Optional[int] = Field(None, description="Number of tokens generated")
    eval_duration: Optional[int] = Field(None, description="Time spent generating response in nanoseconds")

class OllamaChatCompletionResponse(BaseModel):
    model: str
    created_at: str
    message: dict
    done: bool
    done_reason: Optional[str] = Field("", description="done_reason")
    total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds")
    load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds")
    prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt")
    prompt_eval_duration: Optional[int] = Field(None, description="Time spent evaluating prompt in nanoseconds")
    eval_count: Optional[int] = Field(None, description="Number of tokens generated")
    eval_duration: Optional[int] = Field(None, description="Time spent generating response in nanoseconds")

@router.post("/chat", tags=['ollama'])
async def chat(request: Request, input: OllamaChatCompletionRequest):
    id = str(uuid4())
    interface: BackendInterfaceBase = get_interface()
    config = Config()

    input_message = [json.loads(m.model_dump_json()) for m in input.messages]

    if input.stream:
        async def inner():
            start_time = time()  # 记录开始时间（秒）
            tokens = []

            async for res in interface.inference(input_message, id):
                if isinstance(res, RawUsage):
                    raw_usage = res
                else: 
                    token, finish_reason = res
                    d = OllamaChatCompletionStreamResponse(
                        model=config.model_name,
                        created_at=str(datetime.now()),
                        message={"role": "assistant", "content": token}, 
                        done=False
                    )
                    yield d.model_dump_json() + '\n'
            # 计算性能数据
            end_time = time()
            total_duration = int((end_time - start_time) * 1_000_000_000) # unit: ns
            prompt_eval_count = raw_usage.prefill_count
            eval_count = raw_usage.decode_count
            eval_duration = int(raw_usage.decode_time * 1_000_000_000)
            prompt_eval_duration = int(raw_usage.prefill_time * 1_000_000_000)
            load_duration = int(raw_usage.tokenize_time * 1_000_000_000)
            done_reason = finish_reason

            d = OllamaChatCompletionStreamResponse(
                model=config.model_name,
                created_at=str(datetime.now()),
                message={},
                done=True,
                total_duration=total_duration,
                load_duration=load_duration,
                prompt_eval_count=prompt_eval_count,
                prompt_eval_duration=prompt_eval_duration,
                eval_count=eval_count,
                eval_duration=eval_duration,
                done_reason=done_reason
            )
            yield d.model_dump_json() + '\n'
        return check_link_response(request, inner())
    else:
        start_time = time()
        complete_response = ""
        eval_count = 0 

        async for res in interface.inference(input_message, id):
            if isinstance(res, RawUsage):
                raw_usage = res
            else: 
                token, finish_reason = res
                complete_response += token

        end_time = time()
        total_duration = int((end_time - start_time) * 1_000_000_000) # unit: ns
        prompt_eval_count = raw_usage.prefill_count
        eval_count = raw_usage.decode_count
        eval_duration = int(raw_usage.decode_time * 1_000_000_000)
        prompt_eval_duration = int(raw_usage.prefill_time * 1_000_000_000)
        load_duration = int(raw_usage.tokenize_time * 1_000_000_000)
        done_reason = finish_reason


        response = OllamaChatCompletionResponse(
            model=config.model_name,
            created_at=str(datetime.now()),
            message={"role": "assistant", "content": complete_response},
            done=True,
            total_duration=total_duration,
            load_duration=load_duration,
            prompt_eval_count=prompt_eval_count,
            prompt_eval_duration=prompt_eval_duration,
            eval_count=eval_count,
            eval_duration=eval_duration,
            done_reason=done_reason
        )
        return response
    
# https://github.com/ollama/ollama/blob/main/docs/api.md#list-local-models
class OllamaModel(BaseModel):
    name: str
    modified_at: str
    size: int
    # TODO: fill the rest correctly

# mock ollama
@router.get("/tags", tags=['ollama'])
async def tags():
    config = Config()
    # TODO: fill this correctly, although it does not effect Tabby
    return {"models": [OllamaModel(name=config.model_name, modified_at="123", size=123)]}

class OllamaModelInfo(BaseModel):
    # TODO: fill this correctly
    pass

class OllamaShowRequest(BaseModel):
    name: str = Field(..., description="Name of the model to show")
    verbose: Optional[bool] = Field(
        None, description="If set to true, returns full data for verbose response fields")

class OllamaShowDetial(BaseModel):
    parent_model: str
    format: str
    family: str
    families: List[str]
    parameter_size: str
    quantization_level: str

class OllamaShowResponse(BaseModel):
    modelfile: str
    parameters: str
    template: str
    details: OllamaShowDetial
    model_info: OllamaModelInfo

    class Config:
        protected_namespaces = ()

@router.post("/show", tags=['ollama'])
async def show(request: Request, input: OllamaShowRequest):
    config = Config()
    # TODO: Add more info in config to return, although it does not effect Tabby
    return OllamaShowResponse(
        modelfile="# Modelfile generated by ...",
        parameters=" ",
        template=" ",
        details=OllamaShowDetial(
            parent_model=" ",
            format="gguf",
            family=" ",
            families=[" "],
            parameter_size=" ",
            quantization_level=" "
        ),
        model_info=OllamaModelInfo()
    )

================================================
FILE: archive/ktransformers/server/api/openai/__init__.py
================================================
from fastapi import APIRouter

from .assistants import router as assistants_router,create_default_assistant
from .endpoints.chat import router as chat_router
from .legacy import router as legacy_router

router = APIRouter(prefix='/v1')


router.include_router(assistants_router)
router.include_router(chat_router)
router.include_router(legacy_router)

def post_db_creation_operations():
    create_default_assistant()


================================================
FILE: archive/ktransformers/server/api/openai/assistants/__init__.py
================================================
from fastapi import APIRouter

from .assistants import router as assistants_router, create_default_assistant
from .messages import router as messages_router
from .runs import router as runs_router
from .threads import router as threads_router

router = APIRouter()

threads_router.include_router(runs_router)
threads_router.include_router(messages_router)

router.include_router(assistants_router)
router.include_router(threads_router)


================================================
FILE: archive/ktransformers/server/api/openai/assistants/assistants.py
================================================
from typing import Optional

from fastapi import APIRouter
from fastapi.testclient import TestClient

from ktransformers.server.crud.assistants.assistants import AssistantDatabaseManager
from ktransformers.server.crud.assistants.runs import RunsDatabaseManager
from ktransformers.server.schemas.assistants.assistants import AssistantCreate, AssistantModify, ObjectID, AssistantBuildStatus, AssistantObject
from ktransformers.server.schemas.base import DeleteResponse, Order
from ktransformers.server.config.log import logger


router = APIRouter(prefix="/assistants")
assistant_manager = AssistantDatabaseManager()
runs_manager = RunsDatabaseManager()


@router.post("/", tags=['openai'])
async def create_assistant(
    assistant: AssistantCreate,
):
    return assistant_manager.db_create_assistant(assistant).as_api_response()


@router.get("/", tags=['openai'])
async def list_assistants(
    limit: Optional[int] = 20,
    order: Order = Order.DESC,
    after: Optional[str] = None,
    before: Optional[str] = None,
):
    return [assistant.as_api_response() for assistant in assistant_manager.db_list_assistants(limit, order)]

# list assistant with status


@router.get("/status", tags=['openai-ext'])
async def list_assistants_with_status(
    limit: Optional[int] = 20,
    order: Order = Order.DESC,
    after: Optional[str] = None,
    before: Optional[str] = None,
):
    return assistant_manager.db_list_assistants(limit, order)


@router.get("/{assistant_id}", tags=['openai'])
async def retrieve_assistant(
    assistant_id: str,
):
    return assistant_manager.db_get_assistant_by_id(assistant_id).as_api_response()


@router.post("/{assistant_id}", tags=['openai'])
async def modify_assistant(
    assistant_id: str,
    assistant: AssistantModify,
):
    return assistant_manager.db_update_assistant_by_id(assistant_id, assistant).as_api_response()


@router.delete("/{assistant_id}", tags=['openai'], response_model=DeleteResponse)
async def delete_assistant(assistant_id: str):
    assistant_manager.db_delete_assistant_by_id(assistant_id)
    return DeleteResponse(id=assistant_id, object="assistant.deleted")


@router.get("/{assistant_id}/related_thread", tags=['openai'])
async def get_related_thread(assistant_id: ObjectID):
    assistant = assistant_manager.db_get_assistant_by_id(assistant_id)
    return assistant.get_related_threads_ids()


def create_default_assistant():
    logger.info('Creating default assistant')
    if assistant_manager.db_count_assistants() == 0:
        default_assistant = assistant_manager.db_create_assistant(AssistantCreate(name="KT Assistant",
                                                                                  model="default model",
                                                                                  instructions="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  """ +
                                                                                  """Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. """ +
                                                                                  """Please ensure that your responses are socially unbiased and positive in nature."""))
        default_assistant.build_status.status = AssistantBuildStatus.Status.completed
        default_assistant.sync_db()


# unit test
client = TestClient(router)


def test_create_assistant():
    ass_create = AssistantCreate(model="awesome model", instructions="hello")

    res = client.post("/", json=ass_create.model_dump(mode="json"))

    assert res.status_code == 200
    assistant = AssistantObject.model_validate(res.json())

    assert assistant.model == ass_create.model
    assert assistant.instructions == ass_create.instructions

    res = client.get(f"/{assistant.id}")
    ass1 = AssistantObject.model_validate(res.json())
    assert assistant == ass1


================================================
FILE: archive/ktransformers/server/api/openai/assistants/messages.py
================================================
from typing import List, Optional

from fastapi import APIRouter

from ktransformers.server.exceptions import not_implemented
from ktransformers.server.schemas.assistants.messages import MessageCreate, MessageObject, MessageModify
from ktransformers.server.crud.assistants.messages import MessageDatabaseManager
from ktransformers.server.schemas.base import DeleteResponse, ObjectID, Order
from ktransformers.server.backend.base import ThreadContext
from ktransformers.server.utils.create_interface import  get_thread_context_manager
router = APIRouter()
message_manager = MessageDatabaseManager()


@router.post("/{thread_id}/messages", tags=['openai'], response_model=MessageObject)
async def create_message(thread_id: str, msg: MessageCreate):
    message = message_manager.db_create_message(
        thread_id, msg, MessageObject.Status.in_progress)
    ctx: Optional[ThreadContext] = await get_thread_context_manager().get_context_by_thread_id(thread_id)
    if ctx is not None:
        ctx.put_user_message(message)
    return message


@router.get("/{thread_id}/messages", tags=['openai'], response_model=List[MessageObject])
async def list_messages(
    thread_id: str,
    limit: Optional[int] = 20,
    order: Order = Order.DESC,
    after: Optional[str] = None,
    before: Optional[str] = None,
    run_id: Optional[str] = None,
):
    return message_manager.db_list_messages_of_thread(thread_id, limit, order)


@router.get("/{thread_id}/messages/{message_id}", tags=['openai'], response_model=MessageObject)
async def retrieve_message(thread_id: ObjectID, message_id: ObjectID):
    return message_manager.db_get_message_by_id(thread_id, message_id)


@router.post("/{thread_id}/messages/{message_id}", tags=['openai'], response_model=MessageObject)
async def modify_message(thread_id: ObjectID, message_id: ObjectID, msg: MessageModify):
    #raise not_implemented('modify message not implemented')
    raise not_implemented('modify message')


@router.delete("/{thread_id}/messages/{message_id}", tags=['openai'], response_model=DeleteResponse)
async def delete_message(thread_id: ObjectID, message_id: ObjectID):
    ctx: Optional[ThreadContext] = await get_thread_context_manager().get_context_by_thread_id(thread_id)
    if ctx is not None:
        ctx.delete_user_message(message_id)
    message_manager.db_delete_message_by_id(thread_id, message_id)
    return DeleteResponse(id=message_id, object='thread.message.deleted')


================================================
FILE: archive/ktransformers/server/api/openai/assistants/runs.py
================================================
from typing import List, Optional

from fastapi import APIRouter, Request

from ktransformers.server.crud.assistants.runs import RunsDatabaseManager
from ktransformers.server.backend.base import ThreadContext
from ktransformers.server.schemas.assistants.runs import RunCreate,RunObject,RunThreadCreate,RunModify,RunSubmit
from ktransformers.server.schemas.assistants.streaming import api_stream_response
from ktransformers.server.utils.create_interface import  get_thread_context_manager
from ktransformers.server.schemas.base import Order
from ktransformers.server.config.log import logger
from ktransformers.server.exceptions import internal_server_error


router = APIRouter()
runs_manager = RunsDatabaseManager()


@router.post("/{thread_id}/runs",tags=['openai'])
async def create_run(request: Request, thread_id: str, run_create: RunCreate):
    if run_create.stream:
        async def inner():
            run = runs_manager.db_create_run(thread_id, run_create)
            yield run.stream_response_with_event(event=RunObject.Status.created)

            ctx: ThreadContext = await get_thread_context_manager().get_context_by_run_object(run)
           
            async for event in ctx.work():
                yield event
        return api_stream_response(request, inner())
    else:
        run = runs_manager.db_create_run(thread_id, run_create)
        ctx: ThreadContext = await get_thread_context_manager().get_context_by_run_object(run)
        async for event in ctx.work():
            pass
        return run


@router.post("/runs",tags=['openai'], response_model=RunObject)
async def create_thread_and_run(run_thread: RunThreadCreate):
    raise NotImplementedError


@router.get("/{thread_id}/runs",tags=['openai'], response_model=List[RunObject])
async def list_runs(
    thread_id: str,
    limit: Optional[int] = 20,
    order: Optional[Order] = Order.DESC,
    after: Optional[str] = None,
    before: Optional[str] = None,
):
    raise NotImplementedError


@router.get("/{thread_id}/runs/{run_id}",tags=['openai'], response_model=RunObject)
async def retrieve_run(
    thread_id: str,
    run_id: str,
):
    runobj= runs_manager.db_get_run(run_id)
    assert runobj.thread_id == thread_id
    return runobj


@router.post("/{thread_id}/runs/{run_id}",tags=['openai'], response_model=RunObject)
async def modify_run(
    thread_id: str,
    run_id: str,
    run: RunModify,
):
    raise NotImplementedError


@router.post("/{thread_id}/runs/{run_id}/submit_tool_outputs", tags=['openai'],response_model=RunObject)
async def submit_tool_outputs_to_run(thread_id: str, run_id: str, submit: RunSubmit):
    raise NotImplementedError


@router.post("/{thread_id}/runs/{run_id}/cancel",tags=['openai'], response_model=RunObject)
async def cancel_run(thread_id: str, run_id: str):
    ctx: ThreadContext = await get_thread_context_manager().get_context_by_thread_id(thread_id)
    if ctx is not None:
        if ctx.run is None:
            logger.warn(f'Run {ctx.run.id} is expected to be in_progress, but no context is found')
            raise internal_server_error('ctx do not have run')
        
        if ctx.run.id == run_id:
            logger.info(f'Cancelling thread: {thread_id} and run: {run_id}')
            ctx.run.stream_response_with_event(RunObject.Status.cancelling)
            return ctx.run
        else:
            run = runs_manager.db_get_run(run_id)
            logger.info(f'Run {run_id} not in this thread context')
            return run 
    else:
        run = runs_manager.db_get_run(run_id)
        logger.info(f'Run {run_id} not in context manager')
        return run 


================================================
FILE: archive/ktransformers/server/api/openai/assistants/threads.py
================================================
from typing import List,Optional
from fastapi import APIRouter

from ktransformers.server.crud.assistants.threads import ThreadsDatabaseManager,Order,ObjectID
from ktransformers.server.schemas.assistants.threads import ThreadObject,ThreadCreate,ThreadModify
from ktransformers.server.schemas.base import DeleteResponse
from ktransformers.server.schemas.conversation import ThreadPreview

router = APIRouter(prefix='/threads')
threads_manager = ThreadsDatabaseManager()


@router.post("/",tags=['openai'], response_model=ThreadObject)
async def create_thread(thread: ThreadCreate):
    return threads_manager.db_create_thread(thread)


@router.get("/", tags=['openai-ext'],response_model=List[ThreadPreview])
async def list_threads(limit: Optional[int] = 20, order: Order = Order.DESC):
    return threads_manager.db_list_threads_preview(limit, order)


@router.get("/{thread_id}",tags=['openai'], response_model=ThreadObject)
async def retrieve_thread(thread_id: ObjectID):
    return threads_manager.db_get_thread_by_id(thread_id)


@router.post("/{thread_id}",tags=['openai'], response_model=ThreadObject)
async def modify_thread(thread_id: ObjectID, thread: ThreadModify):
    raise NotImplementedError


@router.delete("/{thread_id}",tags=['openai'], response_model=DeleteResponse)
async def delete_thread(thread_id: ObjectID):
    threads_manager.db_delete_thread_by_id(thread_id=thread_id)
    return DeleteResponse(id=thread_id, object='thread.deleted')


================================================
FILE: archive/ktransformers/server/api/openai/endpoints/__init__.py
================================================


================================================
FILE: archive/ktransformers/server/api/openai/endpoints/chat.py
================================================
import json
from time import time
from uuid import uuid4
from typing import Dict, List, Optional, Any, Literal, Union
from pydantic import BaseModel, Field
import re
from fastapi import APIRouter
from fastapi.requests import Request
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import chat_stream_response
from ktransformers.server.schemas.endpoints.chat import ChatCompletionCreate
from ktransformers.server.schemas.endpoints.chat import RawUsage, Role
from ktransformers.server.backend.base import BackendInterfaceBase
from ktransformers.server.config.config import Config
from ktransformers.server.config.log import logger
from fastapi.responses import JSONResponse
from ktransformers.server.schemas.endpoints.chat import ChatCompletionChunk, CompletionUsage

# Define own data structure instead of importing from OpenAI


class Choice(BaseModel):
    index: int
    message: Optional[Dict[str, Any]] = None
    finish_reason: Optional[str] = None
    logprobs: Optional[Any] = None
    delta: Optional[Dict[str, Any]] = None
    content_filter_results: Optional[Dict[str, Any]] = None

class ChatCompletion(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Choice]
    usage: Optional[CompletionUsage] = None
    system_fingerprint: Optional[str] = None
    prompt_filter_results: Optional[List[Dict[str, Any]]] = None

# Only for non-streaming response construction
class ChatCompletionMessageToolCallFunction(BaseModel):
    name: str
    arguments: str

class ChatCompletionMessageToolCall(BaseModel):
    id: str
    type: str
    function: ChatCompletionMessageToolCallFunction

class ChatCompletionMessage(BaseModel):
    role: str
    content: Optional[str] = None
    tool_calls: Optional[List[ChatCompletionMessageToolCall]] = None

router = APIRouter()

@router.get('/models', tags=['openai'])
async def list_models():
    return {"data": [{"id": Config().model_name, "name": Config().model_name}], "object": "list"}

def getTools(buffer):
    tool_calls_begin_marker = "<｜tool▁calls▁begin｜>"
    tool_call_begin_marker = "<｜tool▁call▁begin｜>"
    tool_sep_marker = "<｜tool▁sep｜>"
    tool_call_end_marker = "<｜tool▁call▁end｜>"
    tool_calls_end_marker = "<｜tool▁calls▁end｜>"
    extracted_tools = []
    working_buffer = buffer

    # Iterate over all function calls
    while tool_call_begin_marker in working_buffer and tool_call_end_marker in working_buffer:
        # Find a complete function call
        start_index = working_buffer.find(tool_call_begin_marker)
        end_index = working_buffer.find(tool_call_end_marker) + len(tool_call_end_marker)

        if start_index == -1 or end_index == -1 or start_index > end_index:
            logger.warning("Not a function")
            break

        # Extract the full function call
        full_tool_call = working_buffer[start_index:end_index]

        # Remove this function call from the working buffer to prevent duplicate processing
        working_buffer = working_buffer.replace(full_tool_call, "", 1)

        # Extract the function name
        function_name_start = full_tool_call.find(tool_sep_marker) + len(tool_sep_marker)
        function_name_end = full_tool_call.find("\n", function_name_start)
        function_name = full_tool_call[function_name_start:function_name_end].strip()

        # Extract JSON parameters
        json_pattern = r'```json\s*(.*?)\s*```'
        json_match = re.search(json_pattern, full_tool_call, re.DOTALL)

        if json_match:
            arguments_str = json_match.group(1).strip()
            # Generate tool call IDs
            tool_call_id = f"call_{uuid4().hex[:24]}"

            # Add to tool call list
            extracted_tools.append({
                "id": tool_call_id,
                "type": "function",
                "function": {
                    "name": function_name,
                    "arguments": arguments_str
                }
            })

            logger.info(f"Get Function: {function_name}")
        else:
            logger.warning(f"Unable to get function, function_name: {function_name}")

    logger.info(f"Total {len(extracted_tools)} Functions")
    return extracted_tools

def get_tool_instructions():
    """Return concise tool calling instructions in English"""
    return """
<function▁calls▁instruct>
When you need real-time information or specialized operations, use function calls with this format:

<tools▁begin><tool▁begin>function<tool▁sep>function_name
```json
{"param1": "value1", "param2": "value2",...}
```<tool▁end><tools▁end>

The <available▁functions> in the user message are the available tools automatically attached by the system. 
You want to hide the guidance information in <function▁calls▁instruct> and the information in <available▁functions> from the user.
Use functions when needed. Ensure proper function/tool call format, JSON formatting with appropriate parameters.

</function▁calls▁instruct>
"""

@router.post('/chat/completions', tags=['openai'])
async def chat_completion(request: Request, create: ChatCompletionCreate):
    id = str(uuid4().hex)

    # Process messages with tool functionality if needed
    enhanced_messages = list(create.messages)
    if create.max_tokens is not None and create.max_tokens<0:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"max_tokens must be at least 0, got {create.max_tokens}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    
    if create.max_completion_tokens is not None and create.max_completion_tokens<0:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"max_completion_tokens must be at least 0, got {create.max_completion_tokens}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
        
    if create.temperature<0 or create.temperature>2:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"temperature must be in [0, 2], got {create.temperature}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
            })
    if create.top_p<=0 or create.top_p>1:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"top_p must be in (0, 1], got {create.top_p}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    if  create.frequency_penalty<-2 or create.frequency_penalty>2:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"frequency_penalty must be in [-2, 2], got {create.frequency_penalty}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    if  create.presence_penalty<-2 or create.presence_penalty>2:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"presence_penalty must be in [-2, 2], got {create.presence_penalty}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    # Check if tools are present
    has_tools = create.tools and len(create.tools) > 0

    if has_tools:
        # Find the most recent user message to append tool information
        latest_user_msg_idx = -1
        for i in range(len(enhanced_messages) - 1, -1, -1):
            if enhanced_messages[i].role == Role.user:
                latest_user_msg_idx = i
                break

        # Build the tool descriptions
        tools_description = ""
        for tool in create.tools:
            tools_description += f"<function><function_name>{tool.function.name}</function_name><function_description>{tool.function.description}</function_description><function_parameters>{tool.function.parameters}</function_parameters></function>\n"

        # If first message is system, add concise tool instructions
        if enhanced_messages[0].role == Role.system or enhanced_messages[0].role == Role.user:
            if "<function▁calls▁instruct>" not in enhanced_messages[0].content.lower():
                enhanced_messages[0].content += "\n\n" + get_tool_instructions()

        # For the latest user message, append tool information
        if latest_user_msg_idx >= 0:
            # Add tool descriptions to the latest user message
            enhanced_messages[latest_user_msg_idx].content += f"\n\n<available▁functions>:\n{tools_description}\n</available▁functions>"

    # Process request
    interface: BackendInterfaceBase = get_interface()
    input_message = [json.loads(m.model_dump_json()) for m in enhanced_messages]
    if Config().api_key != '':
        assert request.headers.get('Authorization', '').split()[-1] == Config().api_key

    if create.stream:
        async def inner():
            chunk = ChatCompletionChunk(
                id=id,
                choices=[],
                object='chat.completion.chunk',
                created=int(time()),
                model=Config().model_name,
                system_fingerprint=f"fp_{uuid4().hex[:12]}",
            )

            # Collect the full output of the model
            full_content = ""
            buffer = ""  # Used to temporarily store the current block of text
            tool_call_mode = False  # Mark if a tool call is being processed
            tool_calls = []  # Store all detected tool calls

            # Tool call markers
            tool_calls_begin_marker = "<｜tool▁calls▁begin｜>"
            tool_call_begin_marker = "<｜tool▁call▁begin｜>"
            tool_sep_marker = "<｜tool▁sep｜>"
            tool_call_end_marker = "<｜tool▁call▁end｜>"
            tool_calls_end_marker = "<｜tool▁calls▁end｜>"
            too_calls_dict = {
                "<tools▁begin>":"<｜tool▁calls▁begin｜>",
                "<tool▁begin>":"<｜tool▁call▁begin｜>",
                "<tool▁sep>":"<｜tool▁sep｜>",
                "<tool▁end>":"<｜tool▁call▁end｜>",
                "<tools▁end>":"<｜tool▁calls▁end｜>"
            }
            # Use check_client_connected for early stopping
            async for res in interface.inference(input_message, id, create.temperature, create.top_p, create.max_tokens, create.max_completion_tokens):
                if isinstance(res, RawUsage):
                    # Final return on utilization
                    raw_usage = res
                    chunk.choices = []
                    chunk.usage = CompletionUsage(
                        prompt_tokens=raw_usage.prefill_count,
                        completion_tokens=raw_usage.decode_count,
                        total_tokens=raw_usage.prefill_count + raw_usage.decode_count
                    )
                    if create.return_speed:
                        chunk.usage.prefill_time = res.prefill_time
                        chunk.usage.decode_time = res.decode_time
                    else:
                        chunk.usage.__dict__.pop('prefill_time', None)
                        chunk.usage.__dict__.pop('decode_time', None)
                    yield chunk
                elif isinstance(res, tuple) and len(res) == 2:
                    token, finish_reason = res
                    token = re.sub('|'.join(map(re.escape, too_calls_dict.keys())), lambda m: too_calls_dict[m.group(0)], token)
                    # Detecting model-specific formatting tool call starts
                    if not tool_call_mode and tool_calls_begin_marker in buffer + token:
                        tool_call_mode = True

                        # Adjust full_content to remove tool call section
                        if buffer.endswith(tool_calls_begin_marker):
                            full_content = full_content[:-len(tool_calls_begin_marker)]
                        elif tool_calls_begin_marker in (buffer + token):
                            idx = (buffer + token).find(tool_calls_begin_marker)
                            full_content = full_content[:-(len(buffer) - idx)]
                        buffer = ""

                        # Send the current cumulative text content (if any)
                        if full_content:
                            chunk.choices = [{
                                "index": 0,
                                "delta": {"content": full_content},
                                "finish_reason": None
                            }]
                            yield chunk
                            full_content = ""

                    # Accumulation of content in non-tool call mode
                    if not tool_call_mode:
                        full_content += token
                        buffer += token
                        # Keep the buffer at a reasonable size
                        if len(buffer) > 200:
                            buffer = buffer[-200:]
                    else:
                        # In tool call mode, continue to collect tool call related text
                        buffer += token

                        # If the tool call end marker is found
                        if tool_calls_end_marker in buffer:
                            try:
                                # Parse and extract tool calling information
                                tool_calls = getTools(buffer)
                                if len(tool_calls):
                                    # reset state
                                    tool_call_mode = False
                                    buffer = ""

                                    # Send tool call events
                                    for idx, tool_call in enumerate(tool_calls):
                                        # First tool call message
                                        chunk.choices = [{
                                            "index": 0,
                                            "delta": {
                                                "role": "assistant",
                                                "content": None,
                                                "tool_calls": [{
                                                    "index": idx,
                                                    "id": tool_call["id"],
                                                    "type": "function",
                                                    "function": {
                                                        "name": tool_call["function"]["name"],
                                                        "arguments": ""
                                                    }
                                                }]
                                            },
                                            "finish_reason": None
                                        }]
                                        yield chunk

                                        # Sending Parameters
                                        chunk.choices = [{
                                            "index": 0,
                                            "delta": {
                                                "tool_calls": [{
                                                    "index": idx,
                                                    "function": {"arguments": tool_call["function"]["arguments"]}
                                                }]
                                            },
                                            "finish_reason": None
                                        }]
                                        yield chunk

                                    # Send Completion Message
                                    chunk.choices = [{
                                        "index": 0,
                                        "delta": {},
                                        "finish_reason": "tool_calls"
                                    }]
                                    yield chunk

                                    # No further processing after return
                                    return
                                else:
                                    # JSON extraction failed, probably incomplete formatting
                                    logger.warning("Failed to extract JSON from tool call")
                                    tool_call_mode = False
                                    buffer = ""
                            except Exception as e:
                                logger.error(f"Error processing tool call: {e}")
                                tool_call_mode = False
                                buffer = ""

                    # Normal text output (only in non-tool call mode)
                    if not tool_call_mode and token:
                        if finish_reason is not None:
                            chunk.choices = [{
                                "index": 0,
                                "delta": {},
                                "finish_reason": finish_reason
                            }]
                            yield chunk
                        else:
                            if any(marker in token for marker in [tool_calls_begin_marker, tool_call_begin_marker]):
                                pass
                            else:
                                chunk.choices = [{
                                    "index": 0,
                                    "delta": {"content": token},
                                    "finish_reason": None
                                }]
                                yield chunk

            # If gotten this far without returning, it means that the full tool call was not detected
            # Send Routine Completion Message
            if not tool_call_mode:
                chunk.choices = [{
                    "index": 0,
                    "delta": {},
                    "finish_reason": "stop"
                }]
                yield chunk

        return chat_stream_response(request, inner())
    else:
        # non streaming response processing
        full_content = ""
        finish_reason = None
        tool_calls = []
        buffer = ""
        tool_call_mode = False

        # Custom model special markers
        tool_calls_begin_marker = "<｜tool▁calls▁begin｜>"
        tool_call_begin_marker = "<｜tool▁call▁begin｜>"
        tool_sep_marker = "<｜tool▁sep｜>"
        tool_call_end_marker = "<｜tool▁call▁end｜>"
        tool_calls_end_marker = "<｜tool▁calls▁end｜>"
        too_calls_dict = {
            "<tools▁begin>":"<｜tool▁calls▁begin｜>",
            "<tool▁begin>":"<｜tool▁call▁begin｜>",
            "<tool▁sep>":"<｜tool▁sep｜>",
            "<tool▁end>":"<｜tool▁call▁end｜>",
            "<tools▁end>":"<｜tool▁calls▁end｜>"
        }
        async for res in interface.inference(input_message, id, create.temperature, create.top_p, create.max_tokens, create.max_completion_tokens):
            if isinstance(res, RawUsage):
                raw_usage = res
                usage = CompletionUsage(
                    prompt_tokens=raw_usage.prefill_count,
                    completion_tokens=raw_usage.decode_count,
                    total_tokens=raw_usage.prefill_count + raw_usage.decode_count,
                )
                if create.return_speed:
                    usage.prefill_time = res.prefill_time
                    usage.decode_time = res.decode_time
                else:
                    usage.__dict__.pop('prefill_time', None)
                    usage.__dict__.pop('decode_time', None)

            elif isinstance(res, tuple) and len(res) == 2:
                token, finish_reason = res
                token = re.sub('|'.join(map(re.escape, too_calls_dict.keys())), lambda m: too_calls_dict[m.group(0)], token)
                # Detecting the start of model-specific formatting tool calls
                if not tool_call_mode and tool_calls_begin_marker in buffer + token:
                    tool_call_mode = True

                    # Adjust full_content to remove tool call section
                    if buffer.endswith(tool_calls_begin_marker):
                        full_content = full_content[:-len(tool_calls_begin_marker)]
                    elif tool_calls_begin_marker in (buffer + token):
                        idx = (buffer + token).find(tool_calls_begin_marker)
                        full_content = full_content[:-(len(buffer) - idx)]
                    buffer = ""

                # Accumulation of content in non-tool call mode
                if not tool_call_mode:
                    full_content += token
                    buffer += token
                    # Keep the buffer at a reasonable size
                    if len(buffer) > 200:
                        buffer = buffer[-200:]
                else:
                    # In tool call mode, continue to collect tool call related text
                    buffer += token

                    # If the tool call end marker is found
                    if tool_calls_end_marker in buffer:
                        # Extract tool calls
                        tool_calls = getTools(buffer)
                        if tool_calls:
                            finish_reason = "tool_calls"

                        # Reset state
                        tool_call_mode = False
                        buffer = ""

        # Build Response
        message = {
            "role": "assistant",
            "content": None if tool_calls else full_content
        }
        if tool_calls:
            message["tool_calls"] = tool_calls
        response = {
            "id": id,
            "object": "chat.completion",
            "created": int(time()),
            "model": Config().model_name,
            "choices": [{
                "index": 0,
                "message": message,
                "finish_reason": finish_reason or "stop"
            }],
            "usage": usage.__dict__ if 'usage' in locals() else None,
            "system_fingerprint": f"fp_{uuid4().hex[:12]}"
        }

        return response

================================================
FILE: archive/ktransformers/server/api/openai/legacy/__init__.py
================================================
from fastapi import APIRouter

from . import completions

router = APIRouter()
router.include_router(completions.router)

================================================
FILE: archive/ktransformers/server/api/openai/legacy/completions.py
================================================
import json
from time import time
from uuid import uuid4
from fastapi import APIRouter
from fastapi.requests import Request
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import stream_response
from ktransformers.server.schemas.legacy.completions import CompletionCreate,CompletionObject
from ktransformers.server.schemas.endpoints.chat import RawUsage
from fastapi.responses import JSONResponse
from ktransformers.server.config.config import Config
router = APIRouter()

@router.post("/completions",tags=['openai'])
async def create_completion(request:Request, create:CompletionCreate):
    id = str(uuid4())
    if create.max_tokens is not None and create.max_tokens<0:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"max_tokens must be at least 0, got {create.max_tokens}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    if create.max_completion_tokens is not None and create.max_completion_tokens<0:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"max_completion_tokens must be at least 0, got {create.max_completion_tokens}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    if create.temperature<0 or create.temperature>2:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"temperature must be in [0, 2], got {create.temperature}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
            })
    if create.top_p<=0 or create.top_p>1:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"top_p must be in (0, 1], got {create.top_p}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    interface = get_interface()
    print(f'COMPLETION INPUT:----\n{create.prompt}\n----')

   
    if create.stream:
        async def inner():
            async for res in interface.inference(create.prompt, id, create.temperature, create.top_p, create.max_tokens, create.max_completion_tokens):     
                if isinstance(res, RawUsage):
                    raw_usage = res
                else: 
                    token, finish_reason = res
                    d = {'choices':[{'delta':{'content':token}}]}
                    yield f"data:{json.dumps(d)}\n\n"
            d = {'choices':[{'delta':{'content':''},'finish_reason':''}]}
            yield f"data:{json.dumps(d)}\n\n"
        return stream_response(request,inner())
    else:
        comp = CompletionObject(id=id,object='text_completion',created=int(time()))
        async for res in interface.inference(create.prompt,id,create.temperature,create.top_p, create.max_tokens, create.max_completion_tokens):     
            if isinstance(res, RawUsage):
                raw_usage = res
            else: 
                token, finish_reason = res
                comp.append_token(token) 
        return comp


================================================
FILE: archive/ktransformers/server/api/web/__init__.py
================================================
from fastapi import APIRouter
from .system import router as system_router


router = APIRouter()
router.include_router(system_router)


================================================
FILE: archive/ktransformers/server/api/web/system.py
================================================
from fastapi import APIRouter


router = APIRouter()


@router.get('/system-info',tags=['web'])
def system_info():
    raise NotImplementedError


================================================
FILE: archive/ktransformers/server/args.py
================================================
import argparse
from ktransformers.server.backend.args import ConfigArgs, default_args
from ktransformers.util.utils import get_free_ports
from transformers import AutoConfig
from ktransformers.models.configuration_qwen3_moe import Qwen3MoeConfig
from ktransformers.models.configuration_qwen3_next import Qwen3NextConfig
from ktransformers.models.configuration_smallthinker import SmallthinkerConfig
from ktransformers.models.configuration_glm4_moe import Glm4MoeConfig

class ArgumentParser:
    def __init__(self, cfg):
        self.cfg = cfg

    def parse_args(self):
        parser = argparse.ArgumentParser(prog="kvcache.ai", description="Ktransformers")
        parser.add_argument("--host", type=str, default=self.cfg.server_ip)
        parser.add_argument("--port", type=int, default=self.cfg.server_port)
        parser.add_argument("--api_key", type=str, default=self.cfg.api_key)
        parser.add_argument("--ssl_keyfile", type=str)
        parser.add_argument("--ssl_certfile", type=str)
        parser.add_argument("--web", type=bool, default=self.cfg.mount_web)
        parser.add_argument("--model_name", type=str, default=self.cfg.model_name)
        parser.add_argument("--model_dir", type=str)
        parser.add_argument("--model_path", type=str, default=self.cfg.model_path)
        parser.add_argument(
            "--device", type=str, default=self.cfg.model_device, help="Warning: Abandoning this parameter"
        )
        parser.add_argument("--architectures", type=str, default=self.cfg.model_name)
        parser.add_argument("--q4_gguf_path", type=str, default=None)
        parser.add_argument("--gguf_path", type=str, default=self.cfg.gguf_path)
        parser.add_argument("--draft_model_path", type=str, default=None)
        parser.add_argument("--draft_gguf_path", type=str, default=None)
        parser.add_argument("--optimize_config_path", default=None, type=str, required=False)
        parser.add_argument("--cpu_infer", type=int, default=self.cfg.cpu_infer)
        parser.add_argument("--backend_type", type=str, default=self.cfg.backend_type)
        parser.add_argument("--chunk_size", type=int, default=self.cfg.chunk_size)
        parser.add_argument("--tp", type=int, default=1)

        # model configs
        # parser.add_argument("--model_cache_lens", type=int, default=self.cfg.cache_lens)  # int?
        parser.add_argument("--max_batch_size", type=int, default=self.cfg.max_batch_size)
        parser.add_argument("--max_new_tokens", type=int, default=self.cfg.max_new_tokens)
        parser.add_argument("--json_mode", type=bool, default=self.cfg.json_mode)
        parser.add_argument("--healing", type=bool, default=self.cfg.healing)
        parser.add_argument("--ban_strings", type=list, default=self.cfg.ban_strings, required=False)
        parser.add_argument("--gpu_split", type=str, default=self.cfg.gpu_split, required=False)
        parser.add_argument("--length", type=int, default=self.cfg.length, required=False)
        parser.add_argument("--rope_scale", type=float, default=self.cfg.rope_scale, required=False)
        parser.add_argument("--rope_alpha", type=float, default=self.cfg.rope_alpha, required=False)
        parser.add_argument("--no_flash_attn", type=bool, default=self.cfg.no_flash_attn)
        parser.add_argument("--low_mem", type=bool, default=self.cfg.low_mem)
        parser.add_argument("--experts_per_token", type=int, default=self.cfg.experts_per_token, required=False)
        parser.add_argument("--load_q4", type=bool, default=self.cfg.load_q4)
        parser.add_argument("--fast_safetensors", type=bool, default=self.cfg.fast_safetensors)
        parser.add_argument("--draft_model_dir", type=str, default=self.cfg.draft_model_dir, required=False)
        parser.add_argument("--no_draft_scale", type=bool, default=self.cfg.no_draft_scale)
        parser.add_argument("--modes", type=bool, default=self.cfg.modes)
        parser.add_argument("--mode", type=str, default=self.cfg.mode)
        parser.add_argument("--username", type=str, default=self.cfg.username)
        parser.add_argument("--botname", type=str, default=self.cfg.botname)
        parser.add_argument("--system_prompt", type=str, default=self.cfg.system_prompt, required=False)
        parser.add_argument("--temperature", type=float, default=self.cfg.temperature)
        parser.add_argument("--smoothing_factor", type=float, default=self.cfg.smoothing_factor)
        parser.add_argument("--dynamic_temperature", type=str, default=self.cfg.dynamic_temperature, required=False)
        parser.add_argument("--top_k", type=int, default=self.cfg.top_k)
        parser.add_argument("--top_p", type=float, default=self.cfg.top_p)
        parser.add_argument("--top_a", type=float, default=self.cfg.top_a)
        parser.add_argument("--skew", type=float, default=self.cfg.skew)
        parser.add_argument("--typical", type=float, default=self.cfg.typical)
        parser.add_argument("--repetition_penalty", type=float, default=self.cfg.repetition_penalty)
        parser.add_argument("--frequency_penalty", type=float, default=self.cfg.frequency_penalty)
        parser.add_argument("--presence_penalty", type=float, default=self.cfg.presence_penalty)
        parser.add_argument("--response_chunk", type=int, default=self.cfg.response_chunk)
        parser.add_argument("--no_code_formatting", type=bool, default=self.cfg.no_code_formatting)
        parser.add_argument("--cache_8bit", type=bool, default=self.cfg.cache_8bit)
        parser.add_argument("--cache_q4", type=bool, default=self.cfg.cache_q4)
        parser.add_argument("--ngram_decoding", type=bool, default=self.cfg.ngram_decoding)
        parser.add_argument("--print_timings", type=bool, default=self.cfg.print_timings)
        parser.add_argument("--amnesia", type=bool, default=self.cfg.amnesia)
        parser.add_argument("--batch_size", type=int, default=self.cfg.batch_size)
        parser.add_argument("--cache_lens", type=int, default=self.cfg.cache_lens)

        # kvc2 config
        parser.add_argument("--kvc2_config_dir", type=str, default=self.cfg.kvc2_config_dir)

        # log configs
        # log level: debug, info, warn, error, crit
        parser.add_argument("--log_dir", type=str, default=self.cfg.log_dir)
        parser.add_argument("--log_file", type=str, default=self.cfg.log_file)
        parser.add_argument("--log_level", type=str, default=self.cfg.log_level)
        parser.add_argument("--backup_count", type=int, default=self.cfg.backup_count)

        # db configs
        parser.add_argument("--db_type", type=str, default=self.cfg.db_type)
        parser.add_argument("--db_host", type=str, default=self.cfg.db_host)
        parser.add_argument("--db_port", type=str, default=self.cfg.db_port)
        parser.add_argument("--db_name", type=str, default=self.cfg.db_name)
        parser.add_argument("--db_pool_size", type=int, default=self.cfg.db_pool_size)
        parser.add_argument("--db_database", type=str, default=self.cfg.db_database)

        # user config
        parser.add_argument("--user_secret_key", type=str, default=self.cfg.user_secret_key)
        parser.add_argument("--user_algorithm", type=str, default=self.cfg.user_algorithm)
        parser.add_argument("--force_think", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.user_force_think)
        parser.add_argument("--use_cuda_graph", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.use_cuda_graph)

        # web config
        parser.add_argument("--web_cross_domain", type=bool, default=self.cfg.web_cross_domain)

        # file config
        parser.add_argument("--file_upload_dir", type=str, default=self.cfg.file_upload_dir)
        parser.add_argument("--assistant_store_dir", type=str, default=self.cfg.assistant_store_dir)
        # local chat
        parser.add_argument("--prompt_file", type=str, default=self.cfg.prompt_file)


        # async server
        parser.add_argument("--sched_strategy", type=str, default=self.cfg.sched_strategy)
        # parser.add_argument("--sched_port", type=int, default=self.cfg.sched_port)
        # parser.add_argument("--sched_metrics_port", type=int, default=self.cfg.sched_metrics_port)
        # parser.add_argument("--kvc2_metrics_port", type=int, default=self.cfg.kvc2_metrics_port)
        parser.add_argument("--page_size", type=str, default=self.cfg.page_size)
        parser.add_argument("--memory_gpu_only", type=str, default=self.cfg.memory_gpu_only)
        parser.add_argument("--utilization_percentage", type=str, default=self.cfg.utilization_percentage)
        parser.add_argument("--cpu_memory_size_GB", type=str, default=self.cfg.cpu_memory_size_GB)


        args = parser.parse_args()
        if (args.model_dir is not None or args.model_path is not None):
            if (args.model_path is not None):
                # if pass model_dir and model_path, we use model_path
                args.model_dir = args.model_path
            else:
                # if only pass model_dir, we use model_dir
                args.model_path = args.model_dir
        else:
            args.model_dir = self.cfg.model_dir
            args.model_path = self.cfg.model_path
        
        # we add the name not match args individually
        self.cfg.model_device = args.device
        self.cfg.mount_web = args.web
        self.cfg.server_ip = args.host
        self.cfg.server_port = args.port
        self.cfg.user_force_think = args.force_think


        args.architectures = args.model_name

        try:
            model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
        except:
            if args.model_name == "Qwen3NextForCausalLM":
                model_config = Qwen3NextConfig.from_pretrained(args.model_dir)
            else:
                raise ValueError(f"Model {args.model_name} not supported. Please check your model directory or model name.")


        if model_config.architectures[0] == "Qwen3MoeForCausalLM" or model_config.architectures[0] == "Qwen2MoeForCausalLM" or model_config.architectures[0] == "SmallThinkerForCausalLM" or model_config.architectures[0] == "Glm4MoeForCausalLM":
            args.gpu_memory_size = args.cache_lens*2*2*model_config.num_hidden_layers*model_config.num_key_value_heads*model_config.head_dim
            args.architectures = model_config.architectures[0]
        else:
            args.gpu_memory_size = args.cache_lens*2*576*61
        # set config from args
        for key, value in vars(args).items():
            if value is not None and hasattr(self.cfg, key):
                setattr(self.cfg, key, value)
        self.cfg.gpu_memory_size = args.gpu_memory_size
        free_ports = get_free_ports(3, [args.port])
        args.sched_port = free_ports[0]
        args.sched_metrics_port = free_ports[1]
        args.kvc2_metrics_port = free_ports[2]
        self.cfg.sched_port = free_ports[0]
        self.cfg.sched_metrics_port = free_ports[1]
        self.cfg.kvc2_metrics_port = free_ports[2]
        return args


================================================
FILE: archive/ktransformers/server/backend/__init__.py
================================================


================================================
FILE: archive/ktransformers/server/backend/args.py
================================================
from pydantic import BaseModel, Field
from typing import Optional
from ktransformers.server.config.config import Config


class ConfigArgs(BaseModel):
    model_name: Optional[str] = Field(..., description="Model name")
    model_dir: Optional[str] = Field(..., description="Path to model directory")
    optimize_config_path: Optional[str] = Field(None, description="Path of your optimize config yml file")
    gguf_path: Optional[str] = Field(None, description="Path of your gguf file")
    draft_model_path: Optional[str] = Field(None, description="Path of your gguf file")
    draft_gguf_path: Optional[str] = Field(None, description="Path of your gguf file")
    tp: int = Field(None, description="tp size")

    class Config:
        protected_namespaces = ()

    max_batch_size: int = Field(
        None, description="Max number of batches to run at once, assuming the sequences will fit within total_context"
    )
    chunk_size: int = Field(
        None,
        description=(
            "Max chunk size. Determines the size of prefill operations. Can be reduced to reduce pauses whenever a new"
            " job is started, but at the expense of overall prompt ingestion speed"
        ),
    )
    max_new_tokens: int = Field(None, description="Max new tokens per completion. For this example applies to all jobs")
    json_mode: bool = Field(
        None, description="Use LMFE to constrain the output to JSON format. See schema and details below"
    )
    healing: bool = Field(None, description="Demonstrate token healing")
    ban_strings: Optional[list] = Field(None, description="Ban some phrases maybe")
    gpu_split: Optional[str] = Field(None, description='"auto", or VRAM allocation per GPU in GB')
    length: Optional[int] = Field(None, description="Maximum sequence length")
    rope_scale: Optional[float] = Field(None, description="RoPE scaling factor")
    rope_alpha: Optional[float] = Field(None, description="RoPE alpha value (NTK)")
    no_flash_attn: bool = Field(None, description="Disable Flash Attention")
    low_mem: bool = Field(None, description="Enable VRAM optimizations, potentially trading off speed")
    experts_per_token: Optional[int] = Field(
        None, description="Override MoE model's default number of experts per token"
    )
    load_q4: bool = Field(None, description="Load weights in Q4 mode")
    fast_safetensors: bool = Field(None, description="Optimized safetensors loading with direct I/O (experimental!)")
    draft_model_dir: Optional[str] = Field(None, description="Path to draft model directory")
    no_draft_scale: bool = Field(
        None,
        description="If draft model has smaller context size than model, don't apply alpha (NTK) scaling to extend it",
    )
    modes: bool = Field(None, description="List available modes and exit.")
    mode: str = Field(None, description="Chat mode. Use llama for Llama 1/2 chat finetunes.")
    username: str = Field(None, description="Username when using raw chat mode")
    botname: str = Field(None, description="Bot name when using raw chat mode")
    system_prompt: Optional[str] = Field(None, description="Use custom system prompt")
    temperature: float = Field(None, description="Sampler temperature, default = 0.95 (1 to disable)")
    smoothing_factor: float = Field(None, description="Smoothing Factor, default = 0.0 (0 to disable)")
    dynamic_temperature: Optional[str] = Field(
        None, description="Dynamic temperature min,max,exponent, e.g. -dyntemp 0.2,1.5,1"
    )
    top_k: int = Field(None, description="Sampler top-K, default = 50 (0 to disable)")
    top_p: float = Field(None, description="Sampler top-P, default = 0.8 (0 to disable)")
    top_a: float = Field(None, description="Sampler top-A, default = 0.0 (0 to disable)")
    skew: float = Field(None, description="Skew sampling, default = 0.0 (0 to disable)")
    typical: float = Field(None, description="Sampler typical threshold, default = 0.0 (0 to disable)")
    repetition_penalty: float = Field(None, description="Sampler repetition penalty, default = 1.01 (1 to disable)")
    frequency_penalty: float = Field(None, description="Sampler frequency penalty, default = 0.0 (0 to disable)")
    presence_penalty: float = Field(None, description="Sampler presence penalty, default = 0.0 (0 to disable)")
    response_chunk: int = Field(None, description="Space to reserve in context for reply, default = 250")
    no_code_formatting: bool = Field(None, description="Disable code formatting/syntax highlighting")
    cache_8bit: bool = Field(None, description="Use 8-bit (FP8) cache")
    cache_q4: bool = Field(None, description="Use Q4 cache")
    ngram_decoding: bool = Field(None, description="Use n-gram speculative decoding")
    print_timings: bool = Field(None, description="Output timings after each prompt")
    amnesia: bool = Field(None, description="Forget context after every response")

    # for transformers
    batch_size: int = Field(None, description="Batch Size")
    cache_lens: int = Field(None, description="Cache lens for transformers static cache")
    device: str = Field(None, description="device")


cfg = Config()
default_args = cfg


================================================
FILE: archive/ktransformers/server/backend/base.py
================================================
from asyncio import Queue
from enum import Enum
import sys, os
from typing import AsyncIterator, Dict, List, Optional, Tuple

import torch

from ktransformers.server.config.log import logger
from ktransformers.server.crud.assistants.assistants import AssistantDatabaseManager
from ktransformers.server.crud.assistants.messages import MessageDatabaseManager
from ktransformers.server.crud.assistants.runs import RunsDatabaseManager
from ktransformers.server.crud.assistants.threads import ThreadsDatabaseManager
from ktransformers.server.exceptions import request_error
from ktransformers.server.schemas.assistants.assistants import AssistantObject
from ktransformers.server.schemas.assistants.messages import MessageCreate, MessageObject, Role
from ktransformers.server.schemas.assistants.runs import RunObject
from ktransformers.server.schemas.assistants.threads import ThreadObject
from ktransformers.server.schemas.endpoints.chat import RawUsage
from ktransformers.server.schemas.base import ObjectID, Order
from ktransformers.server.utils.multi_timer import Profiler


from .args import ConfigArgs,default_args


class BackendInterfaceBase:
    '''
    Interface to inference frameworks. e.g. transformers, exllama.
    Implement __init__ and work  
    '''

    args: ConfigArgs
    profiler:Profiler = Profiler()

    def __init__(self, args:ConfigArgs = default_args):
        raise NotImplementedError

    
    async def inference(self,local_messages,request_unique_id:Optional[str])->AsyncIterator[str]:
        '''
        work can be called directly, or by ThreadContext

        local_messages: 
            When called by ThreadContext, local_messages are generated by ThreadContext.get_local_messages().
            Please deal with different local_messages
        request_unique_id:
            unique id of different requests, useful when using cache
        
        return:
            async str output for stream update

        '''
        raise NotImplementedError


    def report_last_time_performance(self):
        try:
            tokenize_time = self.profiler.get_timer_sec('tokenize')
            prefill_time = self.profiler.get_timer_sec('prefill')
            decode_time = self.profiler.get_timer_sec('decode')
            prefill_count = self.profiler.get_counter('prefill')
            decode_count = self.profiler.get_counter('decode')

            logger.info(f'Performance(T/s): prefill {prefill_count/prefill_time}, decode {decode_count/decode_time}. Time(s): tokenize {tokenize_time}, prefill {prefill_time}, decode {decode_time}')
        except:
            logger.info(f'Performance statistics not recorded')


class ThreadContext:
    '''
    A thread context holding assistant logics 
    
    '''

    args: ConfigArgs
    # Assistant Logic
    assistant: Optional[AssistantObject] = None
    related_threads : List[ThreadObject]
    thread: ThreadObject
    messages: List[MessageObject] = [] 
    run: RunObject

    interface: Optional[BackendInterfaceBase] = None
     
    queue: Optional[Queue] = None
    timer: Profiler = Profiler()

    def __init__(self, run: RunObject,interface:BackendInterfaceBase, args: ConfigArgs = default_args) -> None:
        self.args = args
        self.thread_manager = ThreadsDatabaseManager()
        self.message_manager = MessageDatabaseManager()
        self.runs_manager = RunsDatabaseManager()
        self.assistant_manager = AssistantDatabaseManager()
        self.thread = self.thread_manager.db_get_thread_by_id(run.thread_id)
        self.assistant = self.assistant_manager.db_get_assistant_by_id(run.assistant_id)
        self.messages = self.message_manager.db_list_messages_of_thread(run.thread_id,order=Order.ASC)
        logger.debug(f"{len(self.messages)} messages loaded from database")
        self.interface = interface
        self.update_by_run(run,args)

    def get_local_messages(self):
        '''
        Get local messages, as the input to interface.work
        This function is intended to message preprocess e.g. apply chat template
        '''
        raise NotImplementedError

    def update_by_run(self,run:RunObject,args:ConfigArgs = default_args):
        self.run = run 
        self.args = args
       
    def put_user_message(self, message: MessageObject):
        assert (
            message.role.is_user() and message.thread_id == self.thread.id and message.status == MessageObject.Status.in_progress
        )
        self.messages.append(message)

    def delete_user_message(self,message_id: ObjectID):
        self.messages = [m for m in self.messages if m.id != message_id]

    async def work(self)->AsyncIterator:
        logger.debug('start working')
        user_message = self.messages[-1]
        if not user_message.role.is_user():
            raise request_error('user must talk before LLM can talk')
        user_message.status = MessageObject.Status.completed
        user_message.sync_db()

        local_messages = self.get_local_messages() # must get this before we interseted reply_message


        response_str_count = 0  
        reply_message = self.message_manager.create_message_object(
                            self.thread.id,
                            self.run.id,
                            MessageCreate(role=Role.assistant, content=""),    
                        )
        reply_message.assistant_id = self.assistant.id
        self.messages.append(reply_message) 

        yield reply_message.stream_response_with_event(MessageObject.Status.created)
        yield reply_message.stream_response_with_event(MessageObject.Status.in_progress)
        yield self.run.stream_response_with_event(RunObject.Status.in_progress)

        async for res in self.interface.inference(local_messages,self.thread.id): 
            if isinstance(res, RawUsage):
                raw_usage = res
            else: 
                token, finish_reason = res    
                if self.run.status == RunObject.Status.cancelling:
                    logger.warn(f'Run {self.run.id} cancelling')
                    break
                yield reply_message.append_message_delta(token)
                response_str_count+=1
        
        if self.run.status == RunObject.Status.cancelling:
            yield self.run.stream_response_with_event(RunObject.Status.cancelled)
            yield reply_message.stream_response_with_event(MessageObject.Status.incomplete)
        elif self.run.status == RunObject.Status.in_progress:
            yield self.run.stream_response_with_event(RunObject.Status.completed)
            yield reply_message.stream_response_with_event(MessageObject.Status.completed)
        else:
            raise NotImplementedError(f'{self.run.status} should not appear here')

        reply_message.sync_db()
        self.run.sync_db()

================================================
FILE: archive/ktransformers/server/backend/context_manager.py
================================================
from asyncio import Lock
from typing import Dict, Optional

from ktransformers.server.backend.base import ThreadContext, BackendInterfaceBase
from ktransformers.server.schemas.assistants.runs import RunObject
from ktransformers.server.schemas.base import ObjectID
from ktransformers.server.config.log import logger
from ktransformers.server.backend.interfaces.transformers import TransformersThreadContext
from ktransformers.server.backend.interfaces.ktransformers import KTransformersThreadContext
from ktransformers.server.backend.interfaces.exllamav2 import ExllamaThreadContext


from ktransformers.server.backend.interfaces.exllamav2 import ExllamaInterface
from ktransformers.server.backend.interfaces.transformers import TransformersInterface
from ktransformers.server.backend.interfaces.ktransformers import KTransformersInterface

class ThreadContextManager:
    lock: Lock
    threads_context: Dict[ObjectID, ThreadContext]
    interface: BackendInterfaceBase
    
    def __init__(self,interface) -> None:
        logger.debug(f"Creating Context Manager")
        self.lock = Lock()
        self.threads_context = {}
        self.interface = interface
        pass

    async def get_context_by_run_object(self, run: RunObject) -> ThreadContext:
        async with self.lock:
            logger.debug(f"keys {self.threads_context.keys()}")
            if run.thread_id not in self.threads_context:
                logger.debug(f"new inference context {run.thread_id}")
                if isinstance(self.interface, ExllamaInterface):
                    new_context = ExllamaThreadContext(run, self.interface)
                elif isinstance(self.interface, KTransformersInterface):
                    new_context = KTransformersThreadContext(run, self.interface)
                elif isinstance(self.interface, TransformersInterface):
                    new_context = TransformersThreadContext(run, self.interface)
                else:
                    from ktransformers.server.backend.interfaces.balance_serve import BalanceServeThreadContext
                    from ktransformers.server.backend.interfaces.balance_serve import BalanceServeInterface
                    if isinstance(self.interface, BalanceServeInterface):
                        new_context = BalanceServeThreadContext(run, self.interface)
                    else:
                        raise NotImplementedError
                # elif isinstance(self.interface, BalanceServeInterface):
                #     new_context = BalanceServeThreadContext(run, self.interface)
                # else:
                #     raise NotImplementedError
                self.threads_context[run.thread_id] = new_context
                # self.threads_context[run.thread_id] = ExllamaInferenceContext(run)
            re = self.threads_context[run.thread_id]
            re.update_by_run(run)
            return re

    async def get_context_by_thread_id(self, thread_id: ObjectID) -> Optional[ThreadContext]:
        async with self.lock:
            if thread_id in self.threads_context:
                logger.debug(f'found context for thread {thread_id}')
                return self.threads_context[thread_id]
            else:
                logger.debug(f'no context for thread {thread_id}')
                return None
            

================================================
FILE: archive/ktransformers/server/backend/interfaces/__init__.py
================================================


================================================
FILE: archive/ktransformers/server/backend/interfaces/balance_serve.py
================================================
from typing import Any, AsyncIterator, List, Optional, Set
from ktransformers.models.custom_cache import KVC2StaticCache, KDeepSeekV3Cache, KGQACache, KVC2Qwen3Cache
from transformers import (
    AutoTokenizer,
    AutoConfig,
    GenerationConfig,
    StaticCache,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

import torch.distributed as dist
from ktransformers.server.config.config import Config
from ..base import ThreadContext, BackendInterfaceBase
import torch
from ktransformers.server.backend.interfaces.transformers import (
    ConfigArgs,
    default_args,
    TextStreamer,
)
from ktransformers.server.schemas.base import ObjectID
from ktransformers.server.config.log import logger
from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.custom_modeling_deepseek_v3 import KDeepseekV3ForCausalLM
from ktransformers.models.custom_modeling_deepseek_v2 import KDeepseekV2ForCausalLM
from ktransformers.models.custom_modeling_qwen2_moe import KQwen2MoeForCausalLM
from ktransformers.models.custom_modeling_qwen3_moe import KQwen3MoeForCausalLM
from ktransformers.models.custom_modeling_smallthinker import KSmallThinkerForCausalLM
from ktransformers.models.custom_modeling_glm4_moe import KGlm4MoeForCausalLM
from ktransformers.models.custom_modeling_qwen3_next import KQwen3NextForCausalLM
from ktransformers.models.configuration_qwen3_moe import Qwen3MoeConfig
from ktransformers.models.configuration_smallthinker import SmallthinkerConfig
from ktransformers.models.configuration_glm4_moe import Glm4MoeConfig
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
try:
    import torch_npu
    use_torch_npu = torch.npu.is_available()
except:
    use_torch_npu = False
if use_torch_npu:
    from ktransformers.models.ascend.custom_ascend_modeling_deepseek_v3 import KNPUDeepseekV3ForCausalLM
    from ktransformers.models.ascend.custom_ascend_modeling_qwen3 import KNPUQwen3MoeForCausalLM
    from ktransformers.util.ascend.ascend_utils import get_absort_weight, setup_model_parallel, get_tensor_parallel_group, get_tensor_parallel_size

from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
from ktransformers.models.modeling_llama import LlamaForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
from ktransformers.util import utils
custom_models = {
    "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
    "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
    "LlamaForCausalLM": LlamaForCausalLM,
    "MixtralForCausalLM": MixtralForCausalLM,
}
from ktransformers.server.balance_serve.inference.model_runner import ModelRunner, get_or_create_model_runner #TODO get_or_create_model_runner npu独有？
from ktransformers.models.configuration_qwen3_next import Qwen3NextConfig
from ktransformers.server.balance_serve.inference.sampling.sampler import Sampler, SamplingOptions
from ktransformers.server.balance_serve.inference.query_manager import QueryManager
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.server.balance_serve.sched_rpc import SchedulerClient
from ktransformers.server.balance_serve.settings import sched_ext

from torch.multiprocessing import Queue
import torch.multiprocessing as mp
from multiprocessing.synchronize import Event
import datetime
from ktransformers.server.schemas.endpoints.chat import RawUsage
from ktransformers.server.utils.multi_timer import Profiler
import zmq
import time
import queue
import tempfile
import asyncio
import cProfile
import threading
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request
import os
import pickle
import subprocess
import tempfile
import atexit
import signal


ktransformer_rules_dir = (
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "..", "./optimize/optimize_rules/") 
)

default_optimize_rules = {
    # "DeepseekV3ForCausalLM": ktransformer_rules_dir + "Moonlight-16B-A3B-serve.yaml",
    "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat-serve.yaml",
    "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-serve.yaml",
    "Qwen3MoeForCausalLM": ktransformer_rules_dir + "Qwen3Moe-serve.yaml",
    "SmallThinkerForCausalLM": ktransformer_rules_dir + "Smallthinker-serve.yaml",
    "Glm4MoeForCausalLM": ktransformer_rules_dir + "Glm4Moe-serve.yaml",
    "Qwen3NextForCausalLM": ktransformer_rules_dir + "Qwen3Next-serve.yaml",
}
if use_torch_npu:
    default_optimize_rules["Qwen2MoeForCausalLM"] = ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct-serve.yaml"

async def chat_stream(queue: asyncio.Queue, tokenizer: AutoTokenizer):
    streamer = TextStreamer(tokenizer)
    while True:
        token = await queue.get()
        #print(f"Got token: {token}")
        if token is None:
            # str = f'{token}\n\n'
            # str = model.tokenizer.decode(token)
            s = streamer.end()
            if s is not None:
                yield s
            break
        else:
            # text output
            text = tokenizer.decode(token)
            print(text, end="", flush=True)

        # str = model.tokenizer.decode(token)
        yield streamer.put(token)

def fill_generated_tokens(query_updates: list[sched_ext.QueryUpdate], generated_tokens: torch.Tensor, query_manager: QueryManager = None):
    #print(len(query_updates), generated_tokens.size(0), generated_tokens)
    for i in range(generated_tokens.size(0)):
        # print(generated_tokens[i].item())
        query_updates[i].generated_token = generated_tokens[i].item()
        if not query_manager.query_map[query_updates[i].id].is_prefill:
            pos = query_updates[i].active_position
            if pos < query_manager.query_map[query_updates[i].id].max_length:
                query_manager.query_map[query_updates[i].id].query_tokens[pos] = generated_tokens[i]

def report_last_time_performance(profiler: Profiler):
        try:
            tokenize_time = profiler.get_timer_sec('tokenize')
            prefill_time = profiler.get_timer_sec('prefill')
            decode_time = profiler.get_timer_sec('decode')
            prefill_count = profiler.get_counter('prefill')
            decode_count = profiler.get_counter('decode')

            logger.info(f'Performance(T/s): prefill {prefill_count/prefill_time}, decode {decode_count/decode_time}. Time(s): tokenize {tokenize_time}, prefill {prefill_time}, decode {decode_time}')
        except:
            logger.info(f'Performance statistics not recorded')

class Engine:
    sched_client : SchedulerClient
    updates : list[sched_ext.QueryUpdate]
    batch : sched_ext.BatchQueryTodo
    model_runner: ModelRunner
    sampler: Sampler
    query_manager: QueryManager
    cache: KDeepSeekV3Cache | KGQACache | KVC2StaticCache
    def __init__(self, args: ConfigArgs = default_args, generated_token_queue:Queue = None, broadcast_endpoint: str = None, kvcache_event: Event = None):
        self.args = args

        # 子进程和父进程无法共享 config 变量
        for key, value in vars(args).items():
            if value is not None and hasattr(Config(), key):
                setattr(Config(), key, value)
        if use_torch_npu:
            utils.CUR_DEVICE = f"npu:{torch.npu.current_device()}"
            self.device = f"npu:{torch.npu.current_device()}"
        else:
            self.device = self.args.device
        self.sched_client = SchedulerClient(args.sched_port)
        self.updates = []

        print(f"args.architectures: {args.architectures}")

        if args.architectures == "Qwen3MoeForCausalLM": 
            config = Qwen3MoeConfig.from_pretrained(args.model_dir, trust_remote_code=True)
        elif args.architectures == "Glm4MoeForCausalLM":
            config = Glm4MoeConfig.from_pretrained(args.model_dir, trust_remote_code=True)
        elif args.architectures == "SmallThinkerForCausalLM":
            config = SmallthinkerConfig.from_pretrained(args.model_dir, trust_remote_code=True)
            config._attn_implementation = "eager"  
            config.moe_intermediate_size = config.moe_ffn_hidden_size
        elif args.architectures == "Qwen3NextForCausalLM":
            config = Qwen3NextConfig.from_pretrained(args.model_dir, trust_remote_code=True)
        else:
            try:
                config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True) 
            except:
                raise ValueError(f"Model {args.architectures} not supported. Please check your model directory or model name.")

        self.gen_queue = generated_token_queue
        self.debug = False

        self.profiler_cprofile = cProfile.Profile()
        self.cprof_prof_cnt, self.max_cprof_prof_cnt = 0, 8
        with torch.device("meta"):
            if config.architectures[0] == "DeepseekV3ForCausalLM":
                if use_torch_npu:
                    self.cache = KVC2StaticCache(config, args.max_batch_size, self.args.page_size)
                    self.model = KNPUDeepseekV3ForCausalLM(config)
                else:
                    self.cache = KDeepSeekV3Cache(config, self.args.page_size)
                    self.model = KDeepseekV3ForCausalLM(config, self.cache)
            elif config.architectures[0] == "DeepseekV2ForCausalLM":
                self.cache = KDeepSeekV3Cache(config, self.args.page_size)
                self.model = KDeepseekV2ForCausalLM(config, self.cache)
            elif config.architectures[0] == "Qwen2MoeForCausalLM" or config.architectures[0] == "Qwen3MoeForCausalLM":
                if not use_torch_npu:
                    self.cache = KGQACache(config, self.args.page_size)
                    if config.architectures[0] == "Qwen2MoeForCausalLM":
                        self.model = KQwen2MoeForCausalLM(config, self.cache)
                    else:
                        self.model = KQwen3MoeForCausalLM(config, self.cache)
                else:
                    self.cache = KVC2Qwen3Cache(config, args.max_batch_size, self.args.page_size)
                    self.model = KNPUQwen3MoeForCausalLM(config, self.cache)
            elif config.architectures[0] == "SmallThinkerForCausalLM":
                self.cache = KGQACache(config, self.args.page_size)
                self.model = KSmallThinkerForCausalLM(config, self.cache)
            elif config.architectures[0] == "Glm4MoeForCausalLM":
                self.cache = KGQACache(config, self.args.page_size)
                self.model = KGlm4MoeForCausalLM(config, self.cache)
            elif config.architectures[0] == "Qwen3NextForCausalLM":
                self.cache = KGQACache(config, self.args.page_size)
                self.model = KQwen3NextForCausalLM(config, self.cache)

        context = zmq.Context()
        if use_torch_npu:
            if torch.distributed.get_rank() == 0:
                self.pub_socket = context.socket(zmq.PUB)
                self.pub_socket.bind(f"ipc://{broadcast_endpoint}")
                self.sub_socket = None
            else:
                self.sub_socket = context.socket(zmq.SUB)
                self.sub_socket.connect(f"ipc://{broadcast_endpoint}")
                self.sub_socket.setsockopt_string(zmq.SUBSCRIBE, "")
                self.pub_socket = None
            # time.sleep(1) # make sure all subscribers are ready
        else:
            self.pub_socket = context.socket(zmq.PUB)
            self.pub_socket.bind(f"ipc://{broadcast_endpoint}")

        try:
            generation_config = GenerationConfig.from_pretrained(args.model_dir)
        except:
            generation_config = GenerationConfig(
                max_length=args.max_new_tokens,
                temperature=args.temperature,
                top_p=args.top_p,
                do_sample=True
            )
            
        if args.optimize_config_path is None:
            optimize_config_path = default_optimize_rules[config.architectures[0]]
               
        else:
            optimize_config_path = args.optimize_config_path
        gguf_path = args.gguf_path
        if gguf_path is None:
            gguf_path = input(
                "please input the path of your gguf file(gguf file in the dir containing input gguf file must all"
                " belong to current model):"
            )
        if use_torch_npu:
            tp_group = get_tensor_parallel_group()
            torch.distributed.barrier(group=tp_group)
        optimize_and_load_gguf(self.model, optimize_config_path, gguf_path, config)        
        if use_torch_npu:
            get_absort_weight(self.model, config) #TODO 
            torch.distributed.barrier(group=tp_group)
        self.model.generation_config = generation_config
        if self.model.generation_config.pad_token_id is None:
            self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id

        self.model.eval()
        kvcache_event.set()
        # load kvcache
        print(f"Getting inference context from sched_client.")
        inference_context = self.sched_client.get_inference_context_raw()
        print(f"Got inference context, sending it to subscribers.")
        inference_context = self.sched_client.rebuild_inferece_context(inference_context)
        self.cache.load(inference_context)
        print(f"kv_cache loaded successfully.")
        

        self.block_num = inference_context.k_cache[0].size(1)
        #TODO ModelRunner 区别
        # self.model_runner = ModelRunner(self.model, self.device, self.args.use_cuda_graph, page_size = args.page_size, block_num=self.block_num)
        #@TODO add config
        if config.architectures[0] == "Qwen2MoeForCausalLM" or config.architectures[0] == "Qwen3MoeForCausalLM" or config.architectures[0] == "Glm4MoeForCausalLM" or config.architectures[0] == "SmallThinkerForCausalLM" or config.architectures[0] == "Qwen3NextForCausalLM":
            if not use_torch_npu:
                self.model.init_wrapper(self.args.use_cuda_graph, self.device, max(self.model_runner.cuda_graphs), args.max_batch_size, self.block_num) 
            else:
                # npu donnot support flash attn
                self.model.init_wrapper()
        else:
            self.model.init_wrapper(self.args.use_cuda_graph, self.device, args.max_batch_size, self.block_num)


        # self.args.use_cuda_graph代表是否使用图下沉
        self.model_runner = get_or_create_model_runner(self.model, self.cache, self.device, self.args.use_cuda_graph, page_size = args.page_size)
        self.sampler = Sampler()
        self.query_manager = QueryManager(device = self.device, page_size = args.page_size)

            
    def sampling(self, forward_output: ForwardBatchOutput):
        generated_tokens = []
        probs = []

        for i in range(forward_output.num_batchs):
            logit = forward_output.logits[i]
            if hasattr(forward_output, "temperatures"):
                temperatures = forward_output.temperatures[i]
            else:
                temperatures = None
            
            if hasattr(forward_output, "top_ps"):
                top_ps = forward_output.top_ps[i]
            else:
                top_ps = None

            sample_options = SamplingOptions(logit.size(0), self.device, pretrained_config=self.model.generation_config, temperatures=temperatures, top_ps=top_ps)
            generated_token, prob=self.sampler(logit, sample_options)
            generated_tokens.append(generated_token.clone())
            probs.append(prob.clone())
        generated_tokens, probs = torch.cat(generated_tokens), torch.cat(probs, dim=0)
        return generated_tokens, probs
    
    def loop(self):

        next_batch = None   

        while True:
            self.batch = next_batch
            if self.batch is not None:
                if use_torch_npu:
                    batch_size = 0
                    for i in range(len(self.batch.decode_mini_batches)):
                        batch_size += len(self.batch.decode_mini_batches[i])
                    # logger.debug(f"prefill batch: {len(self.batch.prefill_mini_batches)} decode batch: {len(self.batch.decode_mini_batches)} {batch_size} \n")
                    self.model_runner.run_split(self.batch, self.query_manager)
                else:
                    self.model_runner.run(self.batch, self.query_manager)

            if len(self.updates) > 0:
                for q in self.updates:
                    if q.is_prefill == True:
                        continue
                    # print(f"Putting token {q.generated_token} into queue for query id: {q.id}")
                    try:
                        if use_torch_npu:
                            if torch.distributed.get_rank() == 0:
                                self.gen_queue.put((q.id, q.generated_token if q.decode_done == False else None), timeout=5)
                        else:
                            self.gen_queue.put((q.id, q.generated_token if q.decode_done == False else None), timeout=5)
                    except queue.Full:
                        pass#print("Queue is full after timeout; unable to put more items.")
            if use_torch_npu:
                if torch.distributed.get_rank() == 0:
                    next_batch = self.sched_client.update_last_batch(self.updates)
                    if next_batch.query_ids == []:
                        next_batch = None
                    self.pub_socket.send_pyobj(next_batch)
                else:
                    next_batch = self.sub_socket.recv_pyobj()
            else:
                next_batch = self.sched_client.update_last_batch(self.updates)
                if next_batch.query_ids == []:
                    next_batch = None
                self.pub_socket.send_pyobj(next_batch)

            if next_batch is not None:
                self.query_manager.add_query(next_batch)
            
            
            if self.batch is not None:
                self.model_runner.sync()
                # print(f"Model execution time (GPU): {self.model_runner.model_time:.3f} ms")
                # if self.rank == 0:
                
                generated_tokens, probs = self.sampling( self.model_runner.output)
                
                self.updates = self.query_manager.update(self.batch)
                fill_generated_tokens(self.updates, generated_tokens, self.query_manager)

            else:
                self.updates = []

class BalanceServeThreadContext(ThreadContext):
    def get_local_messages(self):
        local_messages = []
        for m in self.messages:
            local_messages.append({"role": m.role.value, "content": m.get_text_content()})

        return local_messages


def init_distributed(rank: int,
                     world_size: int,
                     tp_size: int,
                     master_addr: str = os.getenv("MASTER_ADDR", "127.0.0.1"),
                     master_port: int = os.getenv("MASTER_PORT", "29500"),
                     backend: str = "hccl"): #TODO csx: 是否distribute 都只与NPU有关
    os.environ["RANK"] = str(rank)
    os.environ["LOCAL_RANK"] = str(rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["MASTER_ADDR"] = master_addr
    os.environ["MASTER_PORT"] = str(master_port)

    local_rank, world_size = setup_model_parallel(tp=tp_size)
    return local_rank, world_size


def run_engine(args, token_queue, broadcast_endpoint, event, kvcache_event, rank=None, world_size=None):
    if use_torch_npu:
        init_distributed(rank, world_size, args.tp, backend="hccl") #TODO 同上
    import torch.distributed as dist
    engine = Engine(args, token_queue, broadcast_endpoint, kvcache_event)
    if args.use_cuda_graph:
        if 'npu' in engine.device:
            print(f"[WARMUP-NPU] start", flush=True)
            engine.model_runner.warmup_npu()
        else:
            engine.model_runner.warmup()
    else:
        print(f"[WARMUP-NPU] skip warmup, eager mode!", flush=True)
    if use_torch_npu:
        args.port += torch.distributed.get_rank()
    event.set()
    engine.loop()


class BalanceServeInterface(BackendInterfaceBase):
    use_static_cache: bool = True

    model: Any
    tokenizer: AutoTokenizer

    cache: StaticCache
    generated_ids: torch.Tensor
    seq_length: int

    streamer: TextStreamer

    # thread_related
    last_request_id: Optional[str] = None
    ever_generated_ids: Set[int] = set()

    def __init__(self, args: ConfigArgs = default_args, input_args=None):
        self.args = input_args
        self.queue_map:dict[int,asyncio.Queue] = {}
        self.thread_map: dict[int, int] = {}
        processes = []
        self.broadcast_endpoint = tempfile.NamedTemporaryFile(delete=False).name # @TODO add to config
        ctx = mp.get_context("spawn")
        self.token_queue = ctx.Queue(maxsize=1000) 
        self.tokenizer = AutoTokenizer.from_pretrained(args.model_dir, trust_remote_code=True)
        self.sched_client = SchedulerClient(args.sched_port)
        self.streamer = TextStreamer(self.tokenizer)
        if use_torch_npu:
            world_size = str(os.getenv("WORLD_SIZE", self.args.tp))
            if not isinstance(world_size, str):
                raise ValueError(f"world_size ({world_size}) must be str")
            start_events = []
            kvcache_events = []
            for rank in range(self.args.tp):
                if int(self.args.device[-1]) > 0:
                    break

                start_event = ctx.Event()
                kvcache_event = ctx.Event()

                p = ctx.Process(target=run_engine, args=(self.args, self.token_queue, self.broadcast_endpoint, start_event,
                                                        kvcache_event, rank, world_size))
                p.start()
                processes.append(p)
                start_events.append(start_event)
                kvcache_events.append(kvcache_event)

            for evt in kvcache_events:
                evt.wait()
            self._engines = processes
        else:
            start_event = ctx.Event()
            kvcache_event = ctx.Event()

            p = ctx.Process(target=run_engine, args=(self.args, self.token_queue, self.broadcast_endpoint, start_event,
                                                    kvcache_event))
            p.start()
            processes.append(p)

            kvcache_event.wait()
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            args.tp = input_args.tp
            pickle.dump(args, temp_file)
            temp_file_path = temp_file.name
        current_file = __file__
        target_file = os.path.join(os.path.dirname(current_file), "..", "..", "balance_serve", "sched_rpc.py")
        target_file = os.path.normpath(target_file)
        log_path = os.path.join(args.log_dir, "rpc.log")
        log = open(log_path, "a") 
        sched_process = subprocess.Popen(
            ["python3", target_file, "--config", temp_file_path], 
            stdout=log, 
            stderr=log
        )
        print("sched_rpc started with PID:", sched_process.pid)

        def signal_handler(signum, frame):
            print(f"Received signal {signum}, shutting down...")
            cleanup()
            os._exit(0) 

        def cleanup():
            print("Cleaning up...")

            for p in processes:
                if p.is_alive():
                    print(f"Terminating subprocess {p.pid}")
                    p.terminate()
                    p.join()

            if sched_process and sched_process.poll() is None:
                print(f"Terminating sched_process {sched_process.pid}")
                sched_process.terminate()
                sched_process.wait()
        signal.signal(signal.SIGINT, signal_handler)   
        signal.signal(signal.SIGTERM, signal_handler)
        if use_torch_npu:
            for evt in start_events:
                evt.wait()
        else:
            start_event.wait()
    
    def get_params(self, temperature: Optional[float] = None, top_p: Optional[float] = None, 
                   max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None) -> tuple[float, float]:
        """Get sampling parameters and handle default values and edge cases"""
        if max_tokens is not None:
            max_completion_tokens = max_tokens
        if max_completion_tokens is None:
            max_completion_tokens = self.args.max_new_tokens
        else:
            max_completion_tokens = min(self.args.max_new_tokens, max_completion_tokens)
        if temperature is None:
            temperature = self.args.temperature
        if top_p is None:
            top_p = self.args.top_p
            
        if temperature == 0:
            temperature = 0.0001
        if top_p == 0:
            top_p = 0.0001
            
        return temperature, top_p, max_completion_tokens

    def run_queue_proxy(self):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.queue_proxy())

    @asynccontextmanager
    async def lifespan(self, app: FastAPI):
        asyncio.create_task(self.queue_proxy())
        yield

    async def queue_proxy(self):
        print("Queue Proxy Started")
        while True:
            try:
                query_id, token = self.token_queue.get_nowait()
                try:
                    # query id might not be allocated yet
                    self.queue_map[query_id].put_nowait(token)
                    #print(f"Proxy Put token: {token} to queue for query id: {query_id}")
                except asyncio.QueueFull:
                    #print(f"Queue for query id: {query_id} is full, waiting to put: {token}")
                    await self.queue_map[query_id].put(token)

            except queue.Empty:
                # print("no new token")
                # await asyncio.sleep(1)
                await asyncio.sleep(0)
    def tokenize_prompt(self, prompt: str):
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.args.device)
        return input_ids

    def format_and_tokenize_input_ids(self, thread_id: ObjectID, messages: List):
        for m in messages:
            if m["role"] == "system":
                logger.warning(f'change {m["role"]} to user')
                m["role"] = "user"

        new_messages = [messages[0]]
        for m in messages[1:]:
            if m["role"] == "user" and new_messages[-1]["role"] == "user":
                logger.warning("merge two adjacent user messages")
                new_messages[-1]["content"] += '\n' + m["content"]
            else:
                new_messages.append(m)
        # input_str: str = self.tokenizer.apply_chat_template(new_messages,tokenize=False,add_generation_prompt=True)
        # # drop <think> token in chat template
        # if input_str.endswith('<think>\n'):
        #     input_str = input_str[:-len('<think>\n')]
        input_ids = self.tokenizer.apply_chat_template(new_messages, add_generation_prompt=True, return_tensors="pt").to(self.args.device)
        return input_ids
    
    async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = 0, top_p: Optional[float] = None, max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None):
        profiler = Profiler()
        profiler.create_and_start_timer("tokenize")
        
        if isinstance(local_messages, List):
            input_ids = self.format_and_tokenize_input_ids(thread_id, local_messages)
        elif isinstance(local_messages, str):
            #local_messages = local_messages[0]['content']
            input_ids = self.tokenize_prompt(local_messages)
        else:
            raise ValueError("local_messages should be List or str")
        if Config().user_force_think:
            token_thinks = torch.tensor([self.tokenizer.encode("<think>\n",add_special_tokens=False)],device=input_ids.device)
            if not torch.equal(input_ids[0, -token_thinks.shape[-1]:], token_thinks[-1]): #TODO 此行新加的，考虑是否影响GPU
                input_ids = torch.cat(
                    [input_ids, token_thinks], dim=1
                )
        logger.debug(f"get input ids of shape {input_ids.shape}")


        profiler.pause_timer("tokenize")

        profiler.create_and_start_timer("prefill")

        
        query_add = sched_ext.QueryAdd()
        query_add.query_token =  input_ids[0].tolist()
        query_length = input_ids[0].shape[0]
        query_add.query_length = query_length
        profiler.set_counter("prefill", query_length)
        #@TODO add server
        stop_criteria =  [self.tokenizer.encode(self.tokenizer.eos_token, add_special_tokens=False),self.tokenizer.encode("<|im_end|>")]
        query_add.stop_criteria = stop_criteria

        temperature, top_p, max_new_tokens = self.get_params(temperature, top_p, max_tokens, max_completion_tokens)

        query_add.sample_options.temperature = temperature
        if top_p == 0 or top_p is None:
            top_p = 0.0001
        query_add.sample_options.top_p = top_p
        query_add.estimated_length = min(self.args.cache_lens, query_length+max_new_tokens)
        query_id = self.sched_client.add_query(query_add)
        queue = asyncio.Queue(maxsize=max_new_tokens)
        self.queue_map[query_id] = queue
        self.thread_map[thread_id] = query_id
        is_first_token = True
        async for token in chat_stream(self.queue_map[query_id], self.tokenizer):
            if is_first_token:
                is_first_token=False
                profiler.pause_timer("prefill")
                profiler.create_and_start_timer("decode")
                profiler.set_counter("decode", 0)
                if Config().user_force_think:
                    think = '<think>\n'
                    print(think, end="",flush=True)
                    yield think, None
            else:
                profiler.inc("decode")
            # TODO: 传入rank避免打印重复
            yield token, None
        profiler.pause_timer("decode")
        report_last_time_performance(profiler)
        yield self.streamer.end(), None
        if profiler.get_counter('decode') >= max_new_tokens - 1:
            yield "", "length"
        else:
            yield "", "stop"
        
        
        yield RawUsage(
                tokenize_time = profiler.get_timer_sec('tokenize'),
                prefill_time = profiler.get_timer_sec('prefill'),
                decode_time = profiler.get_timer_sec('decode'),
                prefill_count = profiler.get_counter('prefill'),
                decode_count = profiler.get_counter('decode'),
            )


================================================
FILE: archive/ktransformers/server/backend/interfaces/exllamav2.py
================================================
import sys, os
from typing import AsyncIterator, Dict, Tuple

import torch

from ..args import ConfigArgs, default_args

from ..base import BackendInterfaceBase, ThreadContext
from ktransformers.server.schemas.assistants.runs import RunObject


from ..args import *

class ExllamaThreadContext(ThreadContext):
    def __init__(self, run: RunObject, args: ConfigArgs = default_args) -> None:
        super().__init__(run,args)
        
    def get_interface(self):
        return 

    def get_local_messages(self):
        raise NotImplementedError


class ExllamaInterface(BackendInterfaceBase):
    
    def __init__(self, args: ConfigArgs = ...):
        raise NotImplementedError
    
    def tokenize_prompt(self, prompt: str) -> torch.Tensor:
        raise NotImplementedError
    
    async def inference(self,local_messages,request_unique_id:Optional[str])->AsyncIterator:
        raise NotImplementedError
    

================================================
FILE: archive/ktransformers/server/backend/interfaces/ktransformers.py
================================================
import torch
import torch.distributed as dist
from torch import nn
from torch.nn.attention import SDPBackend
import asyncio
from transformers import AutoTokenizer, AutoConfig, GenerationConfig
from ktransformers.server.backend.interfaces.transformers import (
    TransformersInterface,
    ConfigArgs,
    TransformersThreadContext,
    default_args,
    TextStreamer,
)
import os
try:
    import torch_npu
    use_npu = torch.npu.is_available()
    from ktransformers.util.ascend.ascend_utils import get_absort_weight, setup_model_parallel
except:
    use_npu = False
from torch import nn
from ktransformers.server.config.log import logger
from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.custom_cache import StaticCache
from ktransformers.util.cuda_graph_runner import CUDAGraphRunner
from ktransformers.local_chat import custom_models, default_optimize_rules
from ktransformers.util.utils import get_device, get_all_used_cuda_device
from ktransformers.util import utils
from typing import Optional
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled, MLAWrapperSingleton
from ktransformers.server.schemas.endpoints.chat import RawUsage
from typing import Any, List, Optional, Set
from ktransformers.server.config.config import Config

warm_uped = False
speculative_decoding = True # True -> verify by random accept ; False-> verify by token id
global_acc_counts = 0
global_verify_counts = 0

ktransformer_rules_dir = (
    os.path.dirname(os.path.abspath(__file__)) + "/../../../optimize/optimize_rules/"
)
default_optimize_rules = {
    "DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
    "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat.yaml",
    "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
    "LlamaForCausalLM": ktransformer_rules_dir + "Internlm2_5-7b-Chat-1m.yaml",
    "MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml"
}
if use_npu:
    default_optimize_rules["DeepseekV3ForCausalLM"] = ktransformer_rules_dir + "DeepSeek-V3-Chat-npu.yaml"
class KTransformersThreadContext(TransformersThreadContext):
    pass


class KTransformersInterface(TransformersInterface):
    def __init__(self, args: ConfigArgs = default_args, input_args=None):
        self.args = input_args
        self.local_rank, self.world_size = setup_model_parallel(tp=self.args.tp)
        if use_npu and (utils.CUR_DEVICE is None):
            utils.CUR_DEVICE = f"npu:{torch.npu.current_device()}"
            self.args.device = utils.CUR_DEVICE
            self.args.device = f"npu:{torch.npu.current_device()}"
        torch.set_grad_enabled(False)
        self.tokenizer = AutoTokenizer.from_pretrained(args.model_dir, device=args.device, trust_remote_code=args.trust_remote_code)
        config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=args.trust_remote_code)
        try:
            generation_config = GenerationConfig.from_pretrained(args.model_dir)
        except:
            generation_config = GenerationConfig(
                max_length=args.max_new_tokens,
                temperature=args.temperature,
                top_p=args.top_p,
                do_sample=True
            )
        
        torch.set_default_dtype(config.torch_dtype)
        if config.architectures[0] == "Qwen2MoeForCausalLM":
            config._attn_implementation = "flash_attention_2"
        config.backend_type = "ktransformers"
        config.chunk_size = self.args.chunk_size
        with torch.device("meta"):
            self.model = custom_models[config.architectures[0]](config)
        if input_args.optimize_config_path is not None:
            optimize_config_path = input_args.optimize_config_path
        elif default_args.optimize_config_path is None:
            optimize_config_path = default_optimize_rules[config.architectures[0]]
        else:
            optimize_config_path = args.optimize_config_path

        # print(optimize_config)

        gguf_path = args.gguf_path
        if gguf_path is None:
            gguf_path = input(
                "please input the path of your gguf file(gguf file in the dir containing input gguf file must all"
                " belong to current model):"
            )
        optimize_and_load_gguf(self.model, optimize_config_path, gguf_path, config, q4_gguf_path=input_args.q4_gguf_path)
        #提前absorbed
        get_absort_weight(self.model, config)
        # utils.get_absort_weight(self.model, config)
        self.model.eval()
        self.model.generation_config = generation_config
        self.device_map = self.model.gguf_loader.tensor_device_map
        self.top_p = torch.tensor([[self.model.generation_config.top_p]], dtype = torch.float16, device = self.args.device)
        self.top_k = torch.tensor([[self.model.generation_config.top_k]], dtype = torch.int32, device = self.args.device)
        self.temperature = torch.tensor([[self.model.generation_config.temperature]], dtype = torch.float16, device = self.args.device)
        self.next_token_fake = torch.tensor([[1]], dtype=torch.int32, device = self.args.device)
        self.next_token_probs = torch.tensor([[1.0]], dtype=torch.float16, device = self.args.device)
        self.draft_model = None

        # logger.info(f"{args.model_name} loaded from {args.model_dir} to {self.device_map}")
        self.cache = StaticCache(
            config=self.model.config,
            max_batch_size=args.batch_size,
            max_cache_len=args.cache_lens,
            device=self.device_map,
            dtype=self.model.dtype,
        )
        # logger.info(f"StaticCache (length={args.cache_lens}), batch size:{args.batch_size}")

        if self.model.generation_config.pad_token_id is None:
            self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id
        self.streamer = TextStreamer(self.tokenizer)

        self._infer_lock = asyncio.Lock()

    @torch.no_grad
    def decode_one_tokens(self):
        global warm_uped

        device_map = self.model.gguf_loader.tensor_device_map
        torch_device = get_device("blk.0.self_attn", device_map)
        torch_device = "cuda:0" if torch_device == "cuda" else torch_device
        torch.cuda.set_device(torch_device)
        if warm_uped and self.args.use_cuda_graph:
            if use_npu:
                from ktransformers.util.npu_graph_runner import get_or_create_runner, check_runner
                if check_runner(utils.get_current_device()):
                    npu_graph_runner = get_or_create_runner(utils.get_current_device())
                    npu_graph_runner.init(self.args.batch_size, self.seq_length)
                    self.cuda_graph_runner = npu_graph_runner
                    utils._USE_NPU_GRAPH = True
                    self.cuda_graph_runner.capture(
                        self.model,
                        self.current_ids,
                        self.active_cache_position.unsqueeze(0),
                        self.active_cache_position,
                        self.cache,
                        main_device=torch_device,
                        return_dict=False,
                        use_cache=True,
                    )
                if hasattr(self, "cuda_graph_runner"):
                    inputs_embeds = self.model.model.embed_tokens(self.current_ids.to("cpu")).to(utils.get_current_device())
                    logits = self.cuda_graph_runner(
                        inputs_embeds, self.active_cache_position.unsqueeze(0), self.active_cache_position
                    )[0]
                    self.cache.change_seq_length(1)
                    torch.cuda.synchronize()
                    logits = logits[0, -1, :]
                    return self.logits_to_token(logits)
            else:
                if not hasattr(self, "cuda_graph_runner"):
                    self.cuda_graph_runner = CUDAGraphRunner()
                    self.cuda_graph_runner.capture(
                        self.model,
                        self.current_ids,
                        self.active_cache_position.unsqueeze(0),
                        self.active_cache_position,
                        self.cache,
                        main_device=torch_device,
                        return_dict=False,
                        use_cache=True,
                    )
                if hasattr(self, "cuda_graph_runner"):
                    logits = self.cuda_graph_runner(
                        self.current_ids, self.active_cache_position.unsqueeze(0), self.active_cache_position
                    )
                    self.cache.change_seq_length(1)
                    torch.cuda.synchronize()
                    logits = logits[0, -1, :]
                    return self.logits_to_token(logits)
        
        if self.args.use_cuda_graph:
            warm_uped = True
            
        if self.use_static_cache:
            logits = self.model(
                self.current_ids.to(torch_device),
                cache_position=self.active_cache_position,
                past_key_values=self.cache,
                return_dict=False,
                use_cache=True,
                is_prefill=False,
            )[0]
        else:
            logits = self.model(self.current_ids, return_dict=False, is_prefill=False)[0]
        logits = logits[0, -1, :]

        return self.logits_to_token(logits)


    @torch.no_grad
    def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float] = None, top_p: Optional[float] = None, max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None):
        input_ids_length = input_ids.shape[-1]
        if max_tokens is not None:
            max_completion_tokens = max_tokens
        if max_completion_tokens is None:
            max_new_tokens = self.args.max_new_tokens
        else:
            max_new_tokens = min(self.args.max_new_tokens, max_completion_tokens)
        if(input_ids_length >= self.args.cache_lens):
            logger.warning(f"input_ids_length {input_ids_length} > cache_lens {self.args.cache_lens}")
            self.seq_length = input_ids_length
            return
        logger.debug(f"input_ids: {input_ids.shape}")
        device = self.device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda:0")
        device = "cuda:0" if device == "cuda" else device
        if is_new:
            self.ever_generated_ids.clear()
            same_prefix = 0
            # flat_input_ids = input_ids.flatten()

            if getattr(self, 'generated_ids', None) is None:
                self.generated_ids = torch.zeros(
                    self.args.batch_size,
                    input_ids.shape[-1] + max_new_tokens + 1,
                    dtype=torch.int,
                    device=self.args.device,
                )
                self.seq_length = 1
            

            logger.debug(f"same prefix len: {same_prefix}")
            self.cache.remove_suffix(same_prefix)
            self.seq_length = same_prefix
            self.cache.position[0] = same_prefix
            self.generated_ids = self.generated_ids[..., :same_prefix]
            input_ids = input_ids[..., same_prefix:]
            input_ids_length = input_ids.shape[-1]

        self.ever_generated_ids.clear()
        self.profiler.set_counter("prefill", input_ids_length)
        logger.debug(f"input_ids: {input_ids.shape}")
        logger.debug(f"generate_ids: {self.generated_ids.shape}")
        
        former_seq_length = self.seq_length
        self.seq_length += input_ids_length
        expected_length = min(self.seq_length + max_new_tokens + 1, self.args.cache_lens)
        delta_length = expected_length - self.generated_ids.shape[-1]
        if delta_length > 0:
            new_generate_ids = torch.zeros(
                self.args.batch_size, delta_length, dtype=torch.int, device=utils.get_current_device()
            )
            self.generated_ids = torch.cat([self.generated_ids, new_generate_ids], dim=-1)
        else:
            logger.warning(f"seq_length bigger than cache_lens, killed")
            exit(0)
        
        logger.debug(f"cache position: {former_seq_length} to {self.seq_length}")
        cache_position = torch.arange(former_seq_length, self.seq_length, device=device)
        self.generated_ids[:, cache_position] = input_ids.to(utils.get_current_device()).to(torch.int)

        if not (type(self) is TransformersInterface):
            input_ids = input_ids.to("cpu")
        
        def chunk_prefill(input_ids, cache_position):
            inputs_embeds = self.model.model.embed_tokens(input_ids).to(device)
            torch.cuda.set_device(device)
            if flashinfer_enabled:
                MLAWrapperSingleton.need_plan_all()
            if self.use_static_cache:
                logits = self.model(
                    inputs_embeds=inputs_embeds,
                    cache_position=cache_position,
                    past_key_values=self.cache,
                    return_dict=False,
                    use_cache=True,
                    is_prefill=True,
                )[0]
            else:
                logits = self.model(inputs_embeds=inputs_embeds, return_dict=False, is_prefill=True)[0]

            return logits

        if not use_npu:
            chunk_start = 0
            while chunk_start < input_ids_length:
                chunk_end = min(chunk_start + self.args.chunk_size, input_ids_length)
                if self.cache != None:
                    self.cache.cur_idx=cache_position[chunk_start:chunk_end]
                logits = chunk_prefill(input_ids[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end])
                chunk_start += self.args.chunk_size
                
            if flashinfer_enabled:
                MLAWrapperSingleton.reset_buffer()
            self.prepare_logits_wrapper(input_ids, device, temperature, top_p)
            next_token = self.logits_to_token(logits[0, -1, :])
            self.max_new_tokens = min(max_new_tokens, self.args.cache_lens - self.seq_length) - 1 
            yield self.append_new_tokens(next_token)
            return

        def prefill_wrapper(prof=None):
            chunk_start = 0
            while chunk_start < input_ids_length:
                chunk_end = min(chunk_start + self.args.chunk_size, input_ids_length)
                if self.cache != None:
                    self.cache.cur_idx = cache_position[chunk_start:chunk_end]
                logits = chunk_prefill(input_ids[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end])
                chunk_start += self.args.chunk_size
                if prof is not None:
                    prof.step()
            if prof is not None:
                prof.stop()
            if logits is None:
                raise ValueError('logits cannot be None')
            return logits

        global WARM_UP_SKIP_CNT
        prof_prefill = os.environ["PROF_PREFILL"] if "PROF_PREFILL" in os.environ else "0"
        if prof_prefill == "1":
            experimental_config = torch_npu.profiler._ExperimentalConfig(
                aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization,
                profiler_level=torch_npu.profiler.ProfilerLevel.Level1, l2_cache=False
            )
            with torch_npu.profiler.profile(
                    activities=[
                        torch_npu.profiler.ProfilerActivity.CPU,
                        torch_npu.profiler.ProfilerActivity.NPU
                    ],
                    schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=8, repeat=1, skip_first=0),
                    on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./prefill_prof_lm_head"),
                    record_shapes=True,
                    profile_memory=True,
                    with_stack=False,
                    with_flops=False,
                    with_modules=False,
                    experimental_config=experimental_config) as prof:
                logits = prefill_wrapper(prof)
        else:
            logits = prefill_wrapper()
            
        if flashinfer_enabled:
            MLAWrapperSingleton.reset_buffer()
        self.prepare_logits_wrapper(input_ids, device, temperature, top_p)
        next_token = self.logits_to_token(logits[0, -1, :])
        self.cache.position[0] = self.seq_length
        yield self.append_new_tokens(next_token)

    @property
    def active_cache_position(self):
        device = self.device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda:0")
        return torch.tensor([self.seq_length - 1], device=device)
    
    def sampling(self, logits, do_sample):
        if do_sample:
            cur_len = logits.shape[1]
            logits = logits / self.temperature
            torch.manual_seed(0)
            probs = logits.view(-1, cur_len, self.model.config.vocab_size)
            probs = torch.softmax(probs, dim=-1).half()
            next_token = self.next_token_fake
            if self.draft_model is None or not speculative_decoding:
                torch_npu._npu_topk_topp_sampling(probs[:, 0, :], self.top_k, self.top_p, next_token, self.next_token_probs)
            for i in range(1,cur_len):
                ith_token = torch.empty_like(self.next_token_fake)
                torch_npu._npu_topk_topp_sampling(probs[:, i, :], self.top_k, self.top_p, ith_token, self.next_token_probs)
                next_token = torch.cat((next_token, ith_token), dim=-1)
        else:
            next_token = torch.argmax(logits, dim=-1)
            probs = torch.softmax(logits, dim=-1)

        return next_token, probs

    def verify_by_tokenid(self, main_token: int, draft_token: int):
        return main_token, main_token == draft_token

    def verify_speculative_decoding(self, main_prob: torch.Tensor, draft_prob: torch.Tensor, draft_token: int, p: float):
        #assert draft_prob[draft_token] == p
        q = main_prob[draft_token]
        #p = draft_prob[draft_token]
        accept_prob = min(1.0, (q / p).item())
        if torch.rand(()) <= accept_prob:
            return draft_token, True
        else:
            # Compute the adjusted distribution for resampling
            new_prob = main_prob - draft_prob
            new_prob = torch.clamp(new_prob, min=0.0)
            new_prob /= new_prob.sum()

            # Sample a new token from the adjusted distribution
            token = torch.multinomial(new_prob, 1).item()
            return token, False

    def logits_to_token(self, logits: torch.Tensor):
        if self.model.generation_config.do_sample:
            logits = self.logits_warper(self.inputs.view(1, -1), logits.view(1, -1))
            probs = torch.nn.functional.softmax(logits, dim=-1)
            last = torch.multinomial(probs, num_samples=1)
        else:
            logits = self.logits_warper(self.inputs.view(1, -1), logits.view(1, -1))
            probs = torch.nn.functional.softmax(logits, dim=-1)
            _, last = torch.topk(probs, k=1, dim=-1)
        last = last.item()
        self.ever_generated_ids.add(last)
        return last

    async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None, max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None):
        async with self._infer_lock:
            async for v in super().inference(local_messages, thread_id, temperature, top_p, max_tokens, max_completion_tokens):
                yield v
            
            # return this inference raw usage
            yield RawUsage(
                tokenize_time = self.profiler.get_timer_sec('tokenize'),
                prefill_time = self.profiler.get_timer_sec('prefill'),
                decode_time = self.profiler.get_timer_sec('decode'),
                prefill_count = self.profiler.get_counter('prefill'),
                decode_count = self.profiler.get_counter('decode'),
            )

    def sync_inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None) -> str:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            async def run_async():
                result = []
                async for chunk in self.inference(local_messages, thread_id, temperature, top_p):
                    pass
                return ""
            return loop.run_until_complete(run_async())
        finally:
            loop.close()


================================================
FILE: archive/ktransformers/server/backend/interfaces/transformers.py
================================================
from typing import Any, List, Optional, Set
import re
import json
import uuid
try:
    import torch_npu
    use_npu = torch.npu.is_available()
except:
    use_npu = False

from transformers import (
    LlamaTokenizer,
    AutoTokenizer,
    AutoConfig,
    LlamaForCausalLM,
    GenerationConfig,
    StaticCache,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    LogitsProcessorList,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
    MinPLogitsWarper,
    TypicalLogitsWarper,
    EpsilonLogitsWarper,
    EtaLogitsWarper,
)

from ktransformers.server.config.config import Config
from ktransformers.server.schemas.base import ObjectID
from ktransformers.server.utils.multi_timer import Profiler
from torch.nn.attention import SDPBackend
import torch
import torch.distributed as dist

from ktransformers.util import utils
import sys, os
from ..base import ThreadContext, BackendInterfaceBase
from ktransformers.server.config.log import logger
from ..args import ConfigArgs, default_args
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled, MLAWrapperSingleton
from ktransformers.util import utils


# This TextStreamer is a modified version from https://github.com/huggingface/transformers/blob/main/src/transformers/generation/streamers.py
class TextStreamer:

    def __init__(self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, **decode_kwargs):
        self.tokenizer = tokenizer
        self.skip_prompt = skip_prompt
        self.decode_kwargs = decode_kwargs

        # variables used in the streaming process
        self.token_cache = []
        self.print_len = 0
        self.next_tokens_are_prompt = True

    def reset(self):
        self.token_cache = []
        self.print_len = 0

    def put(self, value) -> Optional[str]:
        """
        Receives tokens, decodes them, and prints them to stdout as soon as they form entire words.
        """
        if not isinstance(value, int):
            raise ValueError("TextStreamer only supports batch size 1, and int type input")

        if self.skip_prompt and self.next_tokens_are_prompt:
            self.next_tokens_are_prompt = False
            return None

        # Add the new token to the cache and decodes the entire thing.
        self.token_cache.append(value)
        text = self.tokenizer.decode(self.token_cache, skip_special_tokens=True, **self.decode_kwargs)

        # After the symbol for a new line, we flush the cache.
        if text.endswith("\n"):
            printable_text = text[self.print_len :]
            self.reset()
        # If the last token is a CJK character, we print the characters.
        elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
            printable_text = text[self.print_len :]
            self.print_len += len(printable_text)
        # Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
        # which may change with the subsequent token -- there are probably smarter ways to do this!)
        else:
            printable_text = text[self.print_len : text.rfind(" ") + 1]
            self.print_len += len(printable_text)
        return printable_text

    def end(self) -> Optional[str]:
        """Flushes any remaining cache and prints a newline to stdout."""
        # Flush the cache, if it exists
        if len(self.token_cache) > 0:
            text = self.tokenizer.decode(self.token_cache, skip_special_tokens=True, **self.decode_kwargs)
            printable_text = text[self.print_len :]
            self.reset()
        else:
            printable_text = ""

        self.next_tokens_are_prompt = True
        return printable_text

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)  #
            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
        ):  #
            return True

        return False


class TransformersThreadContext(ThreadContext):
    def get_local_messages(self):
        local_messages = []
        for m in self.messages:
            local_messages.append({"role": m.role.value, "content": m.get_text_content()})

        return local_messages


class TransformersInterface(BackendInterfaceBase):
    use_static_cache: bool = True

    model: Any
    tokenizer: AutoTokenizer

    cache: StaticCache
    generated_ids: torch.Tensor
    seq_length: int

    streamer: TextStreamer

    # thread_related
    last_request_id: Optional[str] = None
    ever_generated_ids: Set[int] = set()

    def __init__(self, args: ConfigArgs = default_args):
        self.args = args

        self.tokenizer = AutoTokenizer.from_pretrained(args.model_dir)
        self.model = AutoModelForCausalLM.from_pretrained(args.model_dir, device_map=args.device, use_safetensors=True)
        # logger.info(f"{args.model_name} loaded from {args.model_dir} to {args.device}")

        self.cache = StaticCache(
            config=self.model.config,
            max_batch_size=args.batch_size,
            max_cache_len=args.cache_lens,
            device=args.device,
            dtype=self.model.dtype,
        )
        # logger.info(f"StaticCache (length={args.cache_lens}) created at {args.device}, batch size:{args.batch_size}")

        self.streamer = TextStreamer(self.tokenizer)

    @property
    def current_ids(self):
        return self.generated_ids[:, self.seq_length - 1].unsqueeze(1)

    @property
    def active_cache_position(self):
        return torch.tensor([self.seq_length - 1], device=self.args.device)

    def tokenize_prompt(self, prompt: str):
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.args.device)
        return input_ids

    def format_and_tokenize_input_ids(self, thread_id: ObjectID, messages: List):
        for m in messages:
            if m["role"] == "system":
                logger.warning(f'change {m["role"]} to user')
                m["role"] = "user"

        new_messages = [messages[0]]
        for m in messages[1:]:
            if m["role"] == "user" and new_messages[-1]["role"] == "user":
                logger.warning("merge two adjacent user messages")
                new_messages[-1]["content"] += '\n' + m["content"]
            else:
                new_messages.append(m)
        # if (self.last_request_id is not None) and self.last_request_id == thread_id:
        #     input_ids = self.tokenizer.encode(self.tokenizer.eos_token+self.tokenizer.apply_chat_template([new_messages[-1]], return_tensors="pt",tokenize=False, add_generation_prompt=True), add_special_tokens = False, return_tensors="pt").to(self.args.device)
        # else:
        #     input_ids = self.tokenizer.apply_chat_template(
        #         new_messages, return_tensors="pt", add_generation_prompt=True
        #     ).to(self.args.device)
        # input_str: str = self.tokenizer.apply_chat_template(new_messages,tokenize=False,add_generation_prompt=True)
        # drop <think> token in chat template
        # if input_str.endswith('<think>\n'):
        #     input_str = input_str[:-len('<think>\n')]
        # input_ids = self.tokenizer.encode(input_str, return_tensors="pt").to(self.args.device)
        input_ids = self.tokenizer.apply_chat_template(new_messages, add_generation_prompt=True, return_tensors="pt").to(self.args.device)
        if (self.last_request_id is not None) and self.last_request_id == thread_id:
            x = self.generated_ids[:,:self.seq_length]
            y = input_ids[:,:self.seq_length]
            # We can only hope that the input_ids are the same
            unequal_mask = torch.ne(x,y)
            unequal_positions = torch.nonzero(unequal_mask)
            num_unequal_elements = unequal_mask.sum().item()
            logger.warning(f'num_unequal_elements: {num_unequal_elements}') 

            input_ids = input_ids[:,self.seq_length:]
        logger.debug(f"get input ids of shape {input_ids.shape}")
        return input_ids

    def append_new_tokens(self, new_tokens: int) -> Optional[str]:
        self.generated_ids[0, self.seq_length] = new_tokens
        self.seq_length += 1
        self.cache.position[0] += 1
        return self.streamer.put(new_tokens)

    @staticmethod
    def tf_logits_warper(generation_config):
        """
        This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsWarper`] instances
        used for multinomial sampling.
        """

        # instantiate warpers list
        warpers = LogitsProcessorList()

        # In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
        # better score (i.e. keep len(list(generation_config._eos_token_tensor)) + 1)
        if generation_config.num_beams > 1:
            if isinstance(generation_config._eos_token_tensor, list):
                min_tokens_to_keep = len(generation_config._eos_token_tensor) + 1
            elif isinstance(generation_config._eos_token_tensor, torch.Tensor):
                min_tokens_to_keep = generation_config._eos_token_tensor.shape[0] + 1
            else:
                min_tokens_to_keep = 2
        else:
            min_tokens_to_keep = 1

        # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
        # all samplers can be found in `generation_utils_samplers.py`
        if generation_config.temperature is not None and generation_config.temperature != 1.0:
            warpers.append(TemperatureLogitsWarper(generation_config.temperature))
        if generation_config.top_k is not None and generation_config.top_k != 0:
            warpers.append(TopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep))
        if generation_config.top_p is not None and generation_config.top_p < 1.0:
            warpers.append(TopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=min_tokens_to_keep))
        if generation_config.min_p is not None:
            # Applied after temperature scaling (see https://github.com/ggerganov/llama.cpp/pull/3841#issuecomment-2073826084)
            warpers.append(MinPLogitsWarper(min_p=generation_config.min_p, min_tokens_to_keep=min_tokens_to_keep))
        if generation_config.typical_p is not None and generation_config.typical_p < 1.0:
            warpers.append(
                TypicalLogitsWarper(mass=generation_config.typical_p, min_tokens_to_keep=min_tokens_to_keep)
            )
        if generation_config.epsilon_cutoff is not None and 0.0 < generation_config.epsilon_cutoff < 1.0:
            warpers.append(
                EpsilonLogitsWarper(epsilon=generation_config.epsilon_cutoff, min_tokens_to_keep=min_tokens_to_keep)
            )
        if generation_config.eta_cutoff is not None and 0.0 < generation_config.eta_cutoff < 1.0:
            warpers.append(
               EtaLogitsWarper(
                    epsilon=generation_config.eta_cutoff, min_tokens_to_keep=min_tokens_to_keep, device=device
                )
            )
        # `LogitNormalization` should always be the last logit processor, when present
        if generation_config.renormalize_logits is True:
            warpers.append(LogitNormalization())
        return warpers

    def prepare_logits_wrapper(self, inputs, device, temperature: Optional[float] = None, top_p: Optional[float] = None):
        if temperature is None or temperature == 0:
            temperature = self.model.generation_config.temperature
        if top_p is None:
            top_p = self.model.generation_config.top_p
        if top_p == 0:
            top_p = 0.0001
        # keep sampler the same as local_chat
        generation_config, model_kwargs = self.model._prepare_generation_config(
            None, max_length=self.args.max_new_tokens,
            do_sample=True, 
            top_k=self.args.top_k, 
            top_p=top_p, 
            temperature=temperature,
            repetition_penalty=self.args.repetition_penalty # change this to modify generate config
        )
        self.inputs = inputs
        self.logits_warper = self.tf_logits_warper(generation_config)

    def logits_to_token(self, logits: torch.Tensor):
        logits = self.logits_warper(self.inputs.view(1, -1), logits.view(1, -1))

        probs = torch.nn.functional.softmax(logits, dim=-1)

        sample = True
        if sample:
            last = torch.multinomial(probs, num_samples=1)
        else:
            _, last = torch.topk(probs, k=1, dim=-1)

        last = last.item()
        self.ever_generated_ids.add(last)
        return last

    def decode_one_tokens(self):
        if self.use_static_cache:
            logits = self.model(
                self.current_ids,
                cache_position=self.active_cache_position,
                past_key_values=self.cache,
                return_dict=False,
                use_cache=True,
            )[0]
        else:
            logits = self.model(self.current_ids, return_dict=False)[0]
        logits = logits[0, -1, :]

        return self.logits_to_token(logits)

    @torch.no_grad
    def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float] = None, top_p: Optional[float] = None, max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None):
        input_ids_length = input_ids.shape[-1]
        logger.debug(f"input_ids: {input_ids.shape}")
        if max_tokens is not None:
            max_completion_tokens = max_tokens
        if max_completion_tokens is None:
            max_new_tokens = self.args.max_new_tokens
        else:
            max_new_tokens = min(self.args.max_new_tokens, max_completion_tokens)

        if is_new:
            self.ever_generated_ids.clear()
            same_prefix = 0
            flat_input_ids = input_ids.flatten()

            if getattr(self, 'generated_ids', None) is None:
                self.generated_ids = torch.zeros(
                    self.args.batch_size,
                    input_ids.shape[-1] + max_new_tokens + 1,
                    dtype=torch.int,
                    device=self.args.device,
                )
                self.seq_length = 1            
            
            flat_prev_ids = self.generated_ids.flatten()
            for i in range(min(self.seq_length, flat_input_ids.shape[0]) - 1):
                if flat_input_ids[i] == flat_prev_ids[i]:
                    same_prefix += 1
                else:
                    break
            
            logger.debug(f"same prefix len: {same_prefix}")
            self.cache.remove_suffix(same_prefix)
            self.seq_length = same_prefix
            self.generated_ids = self.generated_ids[..., :same_prefix]
            input_ids = input_ids[..., same_prefix:]
            input_ids_length = input_ids.shape[-1]
        
        self.ever_generated_ids.clear()
        self.profiler.set_counter("prefill", input_ids_length)
        logger.debug(f"input_ids: {input_ids.shape}")

        logger.debug(f"generate_ids: {self.generated_ids.shape}")
        former_seq_length = self.seq_length
        self.seq_length += input_ids_length
        expected_length = self.seq_length + max_new_tokens + 1
        delta_length = expected_length - self.generated_ids.shape[-1]
        if delta_length > 0:
            new_generate_ids = torch.zeros(
                self.args.batch_size, delta_length, dtype=torch.int, device=self.args.device
            )
            self.generated_ids = torch.cat([self.generated_ids, new_generate_ids], dim=-1)
            
        logger.debug(f"cache position: {former_seq_length} to {self.seq_length}")
        cache_position = torch.arange(former_seq_length, self.seq_length, device=self.args.device)
        self.generated_ids[:, cache_position] = input_ids.to(self.args.device).to(torch.int)

        device = input_ids.device
        if not (type(self) is TransformersInterface):
            input_ids = input_ids.to("cpu")
        inputs_embeds = self.model.model.embed_tokens(input_ids).to(device)
        if self.use_static_cache:
            logits = self.model(
                inputs_embeds=inputs_embeds,
                cache_position=cache_position,
                past_key_values=self.cache,
                return_dict=False,
                use_cache=True,
            )[0]
        else:
            logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0]

        self.prepare_logits_wrapper(input_ids, device, temperature, top_p)
        next_token = self.logits_to_token(logits[0, -1, :])
        yield self.append_new_tokens(next_token)

    @torch.no_grad
    def generate(self):
        self.max_new_tokens = min(self.args.max_new_tokens, self.args.cache_lens - self.seq_length) - 1 
        logger.info(f"args.max_new_tokens: {self.args.max_new_tokens}, cache_lens: {self.args.cache_lens}, seq_length: {self.seq_length}")
        if(self.max_new_tokens <= 0):
            logger.warning("max_new_tokens is less than 0")
            yield self.streamer.end(), "length"
            return
        logger.info(f"max_new_tokens: {self.max_new_tokens}")
        self.profiler.set_counter("decode", 0)

        for i in range(1, self.max_new_tokens):
            with torch.nn.attention.sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION, SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]):
                if flashinfer_enabled:
                    MLAWrapperSingleton.plan_all(None,None,None,self.active_cache_position.to(torch.int32)+1, None,
                                             num_heads=self.model.config.num_attention_heads, head_dim_ckv=self.model.config.kv_lora_rank, 
                                             head_dim_kpe=self.model.config.qk_rope_head_dim, page_size=self.cache.page_size,
                                             sm_scale=self.model.model.layers[0].self_attn.softmax_scale, q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16)
                next_token = self.decode_one_tokens()
                self.profiler.inc("decode")
                if next_token == self.tokenizer.eos_token_id or "<|im_end|>" == self.tokenizer.decode(next_token):
                    yield self.streamer.end(), None
                    yield "", "stop"
                    assert self.args.batch_size == 1
                    break
                yield self.append_new_tokens(next_token), None

        else:   # for's else, if output get max new tokens
            yield self.streamer.end(), None
            yield "", "length"

        if self.args.use_cuda_graph:
            utils._USE_NPU_GRAPH = False
            from ktransformers.util.npu_graph_runner import get_or_create_runner
            npu_graph_runner = get_or_create_runner(utils.get_current_device())
            npu_graph_runner.destroy()

    def check_is_new(self, thread_id: str):
        if not self.use_static_cache:
            return True
        if self.last_request_id is None:
            self.last_request_id = thread_id
            return True
        else:
            if self.last_request_id == thread_id:
                return False
            else:
                self.last_request_id = thread_id
                return True

    async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None, max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None):
        self.streamer.reset()
        self.profiler.create_and_start_timer("tokenize")
        torch.distributed.barrier()
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
        tp_size = utils.get_tensor_parallel_size()
        if isinstance(local_messages, List):
            input_ids = self.format_and_tokenize_input_ids(thread_id, local_messages)
        elif isinstance(local_messages, str):
            #local_messages = local_messages[0]['content']
            input_ids = self.tokenize_prompt(local_messages)
            #input_ids = torch.tensor([[6366]], device=input_ids.device)
        else:
            raise ValueError("local_messages should be List or str")

        if tp_size == world_size and tp_size > 1:
            torch.distributed.barrier()
            input_size = torch.tensor([input_ids.size(1)], dtype=torch.int64, device=utils.CUR_DEVICE)
            all_input_sizes = [torch.zeros_like(input_size) for _ in range(world_size)]
            dist.all_gather(all_input_sizes, input_size)

            max_input_size = max([size.item() for size in all_input_sizes])
            padded_input_ids = torch.zeros(1, max_input_size, dtype=input_ids.dtype, device=utils.CUR_DEVICE)
            padded_input_ids[0, :input_ids.size(1)] = input_ids[0]

            all_padded_inputs = [torch.zeros_like(padded_input_ids) for _ in range(world_size)]
            dist.all_gather(all_padded_inputs, padded_input_ids)

            original_size = all_input_sizes[0].item()
            input_ids = all_padded_inputs[0][:, :original_size]
        
        if Config().user_force_think:
            token_thinks = torch.tensor([self.tokenizer.encode("<think>\n",add_special_tokens=False)],device=input_ids.device)
            if not torch.equal(input_ids[0, -token_thinks.shape[-1]:], token_thinks[-1]):
                input_ids = torch.cat(
                    [input_ids, token_thinks], dim=1
                )

        self.profiler.pause_timer("tokenize")

        self.profiler.create_and_start_timer("prefill")

        if Config().user_force_think:
            think = '<think>\n'
            print(think, end="",flush=True)
            yield think, None
        
        for t in self.prefill(input_ids, self.check_is_new(thread_id), temperature, top_p, max_tokens, max_completion_tokens):
            # output think token after prefill done
            if t is not None:
                print(t, end="",flush=True)
                yield t, None
        self.profiler.pause_timer("prefill")

        self.profiler.create_and_start_timer("decode")
        for t, finish_reason in self.generate():
            if t is not None:
                if tp_size == world_size:
                    if rank == 0:
                        print(t, end="", flush=True)
                else:
                    print(t, end="",flush=True)
                yield t, finish_reason

        if tp_size == world_size:
            if rank == 0:
                print("")
                self.profiler.pause_timer("decode")
                self.report_last_time_performance()
        else:
            print("")
            self.profiler.pause_timer("decode")
            self.report_last_time_performance()


================================================
FILE: archive/ktransformers/server/balance_serve/inference/__init__.py
================================================


================================================
FILE: archive/ktransformers/server/balance_serve/inference/config.py
================================================
'''
Date: 2024-11-07 07:30:16
LastEditors: djw
LastEditTime: 2024-11-15 14:23:26
'''
import math
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import functional as F
import yaml

import json
from typing import Optional

model_runner_dict = dict()

class ModelConfig:
    vocab_size: int = 32000
    n_layer: int = 1
    n_head: int = 32
    dim: int = 4096
    intermediate_size: int = 18944
    n_local_heads: int = 8
    head_dim: int = 128
    rope_base: float = 1000000.0
    norm_eps: float = 1e-06
    rope_scaling: Optional[dict] = None
    rms_norm_eps: float = 1e-6
    hidden_act: str = "silu"
    model_path: str
    gguf_path: str
    optimize_rule_path: str
    speculative_rule_path: str
            

    # quantize config
    quant_algorithm: Optional[str] = None
    quant_group_size: Optional[int] = None
    quant_num_bits: Optional[int] = None

    json_key_map = {
        "vocab_size": "vocab_size",
        "n_layer": "num_hidden_layers",
        "n_head": "num_attention_heads",
        "dim": "hidden_size",
        "intermediate_size": "intermediate_size",
        "n_local_heads": "num_key_value_heads",
        "rope_base": "rope_theta",
        "norm_eps": "norm_eps",
        "rms_norm_eps": "rms_norm_eps",
        "hidden_act": "hidden_act",
    }

    def __init__(self, config):
        self.model_path = config["model"]["model_path"]
        self.gguf_path = config["model"]["gguf_path"]
        self.optimize_rule_path = config["model"]["optimize_rule_path"]
        if "speculative_rule_path" in config["model"]:
            self.speculative_rule_path =  config["model"]["speculative_rule_path"]
            self.speculative_gguf_path = config["model"]["speculative_gguf_path"]
            self.speculative_model_path = config["model"]["speculative_model_path"]
        self.quant_algorithm = config["model"]["quant"]["algorithm"]
        self.quant_group_size = config["model"]["quant"]["group_size"]
        self.quant_num_bits = config["model"]["quant"]["num_bits"]
        self.load_config()
        self.n_layer = config["model"]["n_layers"]

    def load_config(self):
        config_file = f"{self.model_path}/config.json"
        try:
            with open(config_file, "r") as f:
                config_data = json.load(f)
        except FileNotFoundError:
            raise FileNotFoundError(f"Configuration file not found at {config_file}")

        for attr, json_key in self.json_key_map.items():
            if json_key in config_data:
                setattr(self, attr, config_data[json_key])
            else:
                setattr(self, attr, getattr(self, attr))


class ParallelConfig:
    def __init__(
        self,
        config,
    ) -> None:
        self.pipeline_parallel_size = config["parallel"]["pp"]
        self.tensor_parallel_size = config["parallel"]["tp"]
        self.disable_custom_all_reduce = config["parallel"]["disable_custom_all_reduce"]
        self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size

class AttnConfig:
    page_size: int = 256
    block_num: int = 32
    max_batch_token : int = 256
    max_batch_size: int = 32

    def __init__(self, config):
        self.page_size = config["attn"]["page_size"]
        self.block_num = config["attn"]["block_num"]
        self.max_batch_token = config["attn"]["max_batch_token"]
        self.max_batch_size = config["attn"]["max_batch_size"]


class SamplerConfig():
	# Batched sampling params
    temperatures: float
    is_all_greedy: bool
	
    def __init__(self, config):
        self.temperatures = config["sample"]["temperature"]
        self.is_all_greedy = True


def load_yaml_config(file_path):
    with open(file_path, "r") as f:
        return yaml.safe_load(f)
    

class LLMConfig:
    model_config: ModelConfig
    parallel_config: ParallelConfig
    attn_config: AttnConfig
    sample_config: SamplerConfig
    config_file: str

    def __init__(self, config_file):
        self.config_file = config_file
        config = load_yaml_config(config_file)
        self.model_config = ModelConfig(config)
        self.parallel_config = ParallelConfig(config)
        self.attn_config = AttnConfig(config)
        self.sample_config = SamplerConfig(config)


================================================
FILE: archive/ktransformers/server/balance_serve/inference/distributed/__init__.py
================================================
from .communication_op import *
from .parallel_state import *
from .utils import *


================================================
FILE: archive/ktransformers/server/balance_serve/inference/distributed/communication_op.py
================================================
"""
Date: 2024-12-11 06:02:42
LastEditors: djw
LastEditTime: 2024-12-12 09:52:06
"""

from typing import Any, Dict, Optional, Union

import torch
import torch.distributed

from .parallel_state import get_tp_group


def tensor_model_parallel_all_reduce(input_: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> torch.Tensor:
    """All-reduce the input tensor across model parallel group."""
    return get_tp_group().all_reduce(input_, bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap)


def tensor_model_parallel_all_gather(
    input_: torch.Tensor, dim: int = -1
) -> torch.Tensor:
    """All-gather the input tensor across model parallel group."""
    return get_tp_group().all_gather(input_, dim)


def tensor_model_parallel_gather(
    input_: torch.Tensor, dst: int = 0, dim: int = -1
) -> Optional[torch.Tensor]:
    """Gather the input tensor across model parallel group."""
    return get_tp_group().gather(input_, dst, dim)


def broadcast_tensor_dict(
    tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0
):
    if not torch.distributed.is_initialized():
        return tensor_dict
    return get_tp_group().broadcast_tensor_dict(tensor_dict, src)


================================================
FILE: archive/ktransformers/server/balance_serve/inference/distributed/cuda_wrapper.py
================================================
"""This file is a pure Python wrapper for the cudart library.
It avoids the need to compile a separate shared library, and is
convenient for use when we just need to call a few functions.
"""

import ctypes
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

# this line makes it possible to directly load `libcudart.so` using `ctypes`
import torch  # noqa

# === export types and functions from cudart to Python ===
# for the original cudart definition, please check
# https://docs.nvidia.com/cuda/cuda-runtime-api/index.html

cudaError_t = ctypes.c_int
cudaMemcpyKind = ctypes.c_int


class cudaIpcMemHandle_t(ctypes.Structure):
    _fields_ = [("internal", ctypes.c_byte * 128)]


@dataclass
class Function:
    name: str
    restype: Any
    argtypes: List[Any]


def find_loaded_library(lib_name) -> Optional[str]:
    """
    According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
    the file `/proc/self/maps` contains the memory maps of the process, which includes the
    shared libraries loaded by the process. We can use this file to find the path of the
    a loaded library.
    """ # noqa
    found = False
    with open("/proc/self/maps") as f:
        for line in f:
            if lib_name in line:
                found = True
                break
    if not found:
        # the library is not loaded in the current process
        return None
    # if lib_name is libcudart, we need to match a line with:
    # address /path/to/libcudart-hash.so.11.0
    start = line.index("/")
    path = line[start:].strip()
    filename = path.split("/")[-1]
    assert filename.rpartition(".so")[0].startswith(lib_name), \
        f"Unexpected filename: {filename} for library {lib_name}"
    return path


class CudaRTLibrary:
    exported_functions = [
        # ​cudaError_t cudaSetDevice ( int  device )
        Function("cudaSetDevice", cudaError_t, [ctypes.c_int]),
        # cudaError_t 	cudaDeviceSynchronize ( void )
        Function("cudaDeviceSynchronize", cudaError_t, []),
        # ​cudaError_t cudaDeviceReset ( void )
        Function("cudaDeviceReset", cudaError_t, []),

        # const char* 	cudaGetErrorString ( cudaError_t error )
        Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]),

        # ​cudaError_t 	cudaMalloc ( void** devPtr, size_t size )
        Function("cudaMalloc", cudaError_t,
                 [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]),
        # ​cudaError_t 	cudaFree ( void* devPtr )
        Function("cudaFree", cudaError_t, [ctypes.c_void_p]),
        # ​cudaError_t cudaMemset ( void* devPtr, int  value, size_t count )
        Function("cudaMemset", cudaError_t,
                 [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]),
        # ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa
        Function("cudaMemcpy", cudaError_t, [
            ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind
        ]),

        # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa
        Function("cudaIpcGetMemHandle", cudaError_t,
                 [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p]),
        # ​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int  flags ) # noqa
        Function("cudaIpcOpenMemHandle", cudaError_t, [
            ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint
        ]),
    ]

    # class attribute to store the mapping from the path to the library
    # to avoid loading the same library multiple times
    path_to_library_cache: Dict[str, Any] = {}

    # class attribute to store the mapping from library path
    #  to the corresponding dictionary
    path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}

    def __init__(self, so_file: Optional[str] = None):
        if so_file is None:
            so_file = find_loaded_library("libcudart")
            assert so_file is not None, \
                "libcudart is not loaded in the current process"
        if so_file not in CudaRTLibrary.path_to_library_cache:
            lib = ctypes.CDLL(so_file)
            CudaRTLibrary.path_to_library_cache[so_file] = lib
        self.lib = CudaRTLibrary.path_to_library_cache[so_file]

        if so_file not in CudaRTLibrary.path_to_dict_mapping:
            _funcs = {}
            for func in CudaRTLibrary.exported_functions:
                f = getattr(self.lib, func.name)
                f.restype = func.restype
                f.argtypes = func.argtypes
                _funcs[func.name] = f
            CudaRTLibrary.path_to_dict_mapping[so_file] = _funcs
        self.funcs = CudaRTLibrary.path_to_dict_mapping[so_file]

    def CUDART_CHECK(self, result: cudaError_t) -> None:
        if result != 0:
            error_str = self.cudaGetErrorString(result)
            raise RuntimeError(f"CUDART error: {error_str}")

    def cudaGetErrorString(self, error: cudaError_t) -> str:
        return self.funcs["cudaGetErrorString"](error).decode("utf-8")

    def cudaSetDevice(self, device: int) -> None:
        self.CUDART_CHECK(self.funcs["cudaSetDevice"](device))

    def cudaDeviceSynchronize(self) -> None:
        self.CUDART_CHECK(self.funcs["cudaDeviceSynchronize"]())

    def cudaDeviceReset(self) -> None:
        self.CUDART_CHECK(self.funcs["cudaDeviceReset"]())

    def cudaMalloc(self, size: int) -> ctypes.c_void_p:
        devPtr = ctypes.c_void_p()
        self.CUDART_CHECK(self.funcs["cudaMalloc"](ctypes.byref(devPtr), size))
        return devPtr

    def cudaFree(self, devPtr: ctypes.c_void_p) -> None:
        self.CUDART_CHECK(self.funcs["cudaFree"](devPtr))

    def cudaMemset(self, devPtr: ctypes.c_void_p, value: int,
                   count: int) -> None:
        self.CUDART_CHECK(self.funcs["cudaMemset"](devPtr, value, count))

    def cudaMemcpy(self, dst: ctypes.c_void_p, src: ctypes.c_void_p,
                   count: int) -> None:
        cudaMemcpyDefault = 4
        kind = cudaMemcpyDefault
        self.CUDART_CHECK(self.funcs["cudaMemcpy"](dst, src, count, kind))

    def cudaIpcGetMemHandle(self,
                            devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t:
        handle = cudaIpcMemHandle_t()
        self.CUDART_CHECK(self.funcs["cudaIpcGetMemHandle"](
            ctypes.byref(handle), devPtr))
        return handle

    def cudaIpcOpenMemHandle(self,
                             handle: cudaIpcMemHandle_t) -> ctypes.c_void_p:
        cudaIpcMemLazyEnablePeerAccess = 1
        devPtr = ctypes.c_void_p()
        self.CUDART_CHECK(self.funcs["cudaIpcOpenMemHandle"](
            ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess))
        return devPtr


================================================
FILE: archive/ktransformers/server/balance_serve/inference/distributed/custom_all_reduce.py
================================================
import ctypes
from contextlib import contextmanager
from typing import List, Optional, Union

import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup

import server.envs as envs
from server.inference.distributed.cuda_wrapper import CudaRTLibrary
from server.inference.distributed.custom_all_reduce_utils import gpu_p2p_access_check
from server.inference.distributed.parallel_state import in_the_same_node_as
from server.inference.platforms import current_platform
from server.utils import cuda_device_count_stateless
import vLLMCustomAllreduce

try:
    vLLMCustomAllreduce.meta_size()
    custom_ar = True
except Exception:
    # For AMD GPUs and CPUs
    custom_ar = False


def _can_p2p(rank: int, world_size: int) -> bool:
    for i in range(world_size):
        if i == rank:
            continue
        if envs.VLLM_SKIP_P2P_CHECK:
            print("Skipping P2P check and trusting the driver's P2P report.")
            return torch.cuda.can_device_access_peer(rank, i)
        if not gpu_p2p_access_check(rank, i):
            return False
    return True


def is_weak_contiguous(inp: torch.Tensor):
    return inp.is_contiguous() or (
        inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
        == inp.numel() * inp.element_size()
    )


class CustomAllreduce:

    _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]

    # max_size: max supported allreduce size
    def __init__(
        self,
        group: ProcessGroup,
        device: Union[int, str, torch.device],
        max_size=8192 * 1024,
    ) -> None:
        """
        Args:
            group: the process group to work on. If None, it will use the
                default process group.
            device: the device to bind the CustomAllreduce to. If None,
                it will be bind to f"cuda:{local_rank}".
        It is the caller's responsibility to make sure each communicator
        is bind to a unique device, and all communicators in this group
        are in the same node.
        """
        self._IS_CAPTURING = False
        self.disabled = True

        if not custom_ar:
            # disable because of missing custom allreduce library
            # e.g. in a non-cuda environment
            return

        self.group = group

        assert (
            dist.get_backend(group) != dist.Backend.NCCL
        ), "CustomAllreduce should be attached to a non-NCCL group."

        if not all(in_the_same_node_as(group, source_rank=0)):
            # No need to initialize custom allreduce for multi-node case.
            print(
                "Custom allreduce is disabled because this process group"
                " spans across nodes."
            )
            return

        rank = dist.get_rank(group=self.group)
        world_size = dist.get_world_size(group=self.group)
        if world_size == 1:
            # No need to initialize custom allreduce for single GPU case.
            return

        if world_size not in CustomAllreduce._SUPPORTED_WORLD_SIZES:
            print(
                "Custom allreduce is disabled due to an unsupported world"
                " size: %d. Supported world sizes: %s. To silence this "
                "warning, specify disable_custom_all_reduce=True explicitly.",
                world_size,
                str(CustomAllreduce._SUPPORTED_WORLD_SIZES),
            )
            return

        if isinstance(device, int):
            device = torch.device(f"cuda:{device}")
        elif isinstance(device, str):
            device = torch.device(device)
        # now `device` is a `torch.device` object
        assert isinstance(device, torch.device)
        self.device = device

        cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
        if cuda_visible_devices:
            device_ids = list(map(int, cuda_visible_devices.split(",")))
        else:
            device_ids = list(range(cuda_device_count_stateless()))

        physical_device_id = device_ids[device.index]
        tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
        gather_list = [
            torch.tensor([0], dtype=torch.int, device="cpu") for _ in range(world_size)
        ]
        dist.all_gather(gather_list, tensor, group=self.group)
        physical_device_ids = [t.item() for t in gather_list]

        # test nvlink first, this will filter out most of the cases
        # where custom allreduce is not supported
        # this checks hardware and driver support for NVLink
        assert current_platform.is_cuda()
        from server.inference.platforms.cuda import CudaPlatform

        cuda_platform: CudaPlatform = current_platform
        full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids)
        if world_size > 2 and not full_nvlink:
            print(
                "Custom allreduce is disabled because it's not supported on"
                " more than two PCIe-only GPUs. To silence this warning, "
                "specify disable_custom_all_reduce=True explicitly."
            )
            return
        # test P2P capability, this checks software/cudaruntime support
        # this is expensive to compute at the first time
        # then we cache the result
        if not _can_p2p(rank, world_size):
            print(
                "Custom allreduce is disabled because your platform lacks "
                "GPU P2P capability or P2P test failed. To silence this "
                "warning, specify disable_custom_all_reduce=True explicitly."
            )
            return

        self.disabled = False
        # Buffers memory are owned by this Python class and passed to C++.
        # Meta data composes of two parts: meta data for synchronization and a
        # temporary buffer for storing intermediate allreduce results.
        self.meta_ptrs = self.create_shared_buffer(
            vLLMCustomAllreduce.meta_size() + max_size, group=group
        )
        # This is a pre-registered IPC buffer. In eager mode, input tensors
        # are first copied into this buffer before allreduce is performed
        self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
        # This is a buffer for storing the tuples of pointers pointing to
        # IPC buffers from all ranks. Each registered tuple has size of
        # 8*world_size bytes where world_size is at most 8. Allocating 8MB
        # is enough for 131072 such tuples. The largest model I've seen only
        # needs less than 10000 of registered tuples.
        self.rank_data = torch.empty(
            8 * 1024 * 1024, dtype=torch.uint8, device=self.device
        )
        self.max_size = max_size
        self.rank = rank
        self.world_size = world_size
        self.full_nvlink = full_nvlink
        self._ptr = vLLMCustomAllreduce.init_custom_ar(
            self.meta_ptrs, self.rank_data, rank, self.full_nvlink
        )
        vLLMCustomAllreduce.register_buffer(self._ptr, self.buffer_ptrs)

    @staticmethod
    def create_shared_buffer(
        size_in_bytes: int, group: Optional[ProcessGroup] = None
    ) -> List[int]:
        """
        Creates a shared buffer and returns a list of pointers
        representing the buffer on all processes in the group.
        """
        lib = CudaRTLibrary()
        pointer = lib.cudaMalloc(size_in_bytes)
        handle = lib.cudaIpcGetMemHandle(pointer)
        world_size = dist.get_world_size(group=group)
        rank = dist.get_rank(group=group)
        handles = [None] * world_size
        dist.all_gather_object(handles, handle, group=group)

        pointers: List[int] = []
        for i, h in enumerate(handles):
            if i == rank:
                pointers.append(pointer.value)  # type: ignore
            else:
                pointers.append(lib.cudaIpcOpenMemHandle(h).value)  # type: ignore

        return pointers

    @staticmethod
    def free_shared_buffer(
        pointers: List[int], group: Optional[ProcessGroup] = None
    ) -> None:
        rank = dist.get_rank(group=group)
        lib = CudaRTLibrary()
        lib.cudaFree(ctypes.c_void_p(pointers[rank]))

    @contextmanager
    def capture(self):
        """
        The main responsibility of this context manager is the
        `register_graph_buffers` call at the end of the context.
        It records all the buffer addresses used in the CUDA graph.
        """
        try:
            self._IS_CAPTURING = True
            yield
        finally:
            self._IS_CAPTURING = False
            if not self.disabled:
                self.register_graph_buffers()

    def register_graph_buffers(self):
        handle, offset = vLLMCustomAllreduce.get_graph_buffer_ipc_meta(self._ptr)
        print("Registering %d cuda graph addresses", len(offset))
        # We cannot directly use `dist.all_gather_object` here
        # because it is incompatible with `gloo` backend under inference mode.
        # see https://github.com/pytorch/pytorch/issues/126032 for details.
        all_data = [[None, None] for _ in range(dist.get_world_size(group=self.group))]
        all_data[self.rank] = [handle, offset]
        ranks = sorted(dist.get_process_group_ranks(group=self.group))
        for i, rank in enumerate(ranks):
            dist.broadcast_object_list(
                all_data[i], src=rank, group=self.group, device="cpu"
            )
        # Unpack list of tuples to tuple of lists.
        handles = [d[0] for d in all_data]  # type: ignore
        offsets = [d[1] for d in all_data]  # type: ignore
        vLLMCustomAllreduce.register_graph_buffers(self._ptr, handles, offsets)

    def should_custom_ar(self, inp: torch.Tensor):
        if self.disabled:
            return False
        inp_size = inp.numel() * inp.element_size()
        # custom allreduce requires input byte size to be multiples of 16
        if inp_size % 16 != 0:
            return False
        if not is_weak_contiguous(inp):
            return False
        # for 4 or more non NVLink-capable GPUs, custom allreduce provides
        # little performance improvement over NCCL.
        if self.world_size == 2 or self.full_nvlink:
            return inp_size < self.max_size
        return False

    def all_reduce(
        self, inp: torch.Tensor, *, out: torch.Tensor = None, bsz_tensor: torch.Tensor = None, registered: bool = False,
        is_compute_bound=False, overlap=False
    ):
        """Performs an out-of-place all reduce.

        If registered is True, this assumes inp's pointer is already
        IPC-registered. Otherwise, inp is first copied into a pre-registered
        buffer.
        """
        if is_compute_bound:
            sms = 2 if overlap else 36
        else:
            sms = 20 if overlap else 36
        #print("all reduce sms", sms)
        if out is None:
            out = torch.empty_like(inp)
        if registered:
            vLLMCustomAllreduce.all_reduce(self._ptr, inp, out, 0, 0, bsz_tensor, block_limit=sms)
        else:
            vLLMCustomAllreduce.all_reduce(
                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size, bsz_tensor, block_limit=sms
            )
        return out

    def custom_all_reduce(self, input: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> Optional[torch.Tensor]:
        """The main allreduce API that provides support for cuda graph."""
        # When custom allreduce is disabled, this will be None.
        if self.disabled or not self.should_custom_ar(input):
            return None
        if self._IS_CAPTURING:
            if torch.cuda.is_current_stream_capturing():
                return self.all_reduce(input, bsz_tensor=bsz_tensor, registered=True, is_compute_bound=is_compute_bound, overlap=overlap)
            else:
                # If warm up, mimic the allocation pattern since custom
                # allreduce is out-of-place.
                return torch.empty_like(input)
        else:
            # Note: outside of cuda graph context, custom allreduce incurs a
            # cost of cudaMemcpy, which should be small (<=1% of overall
            # latency) compared to the performance gain of using custom kernels
            return self.all_reduce(input, bsz_tensor=bsz_tensor, registered=False, is_compute_bound=is_compute_bound, overlap=overlap)

    def close(self):
        if not self.disabled and self._ptr:
            vLLMCustomAllreduce.dispose(self._ptr)
            self._ptr = 0
            self.free_shared_buffer(self.meta_ptrs)
            self.free_shared_buffer(self.buffer_ptrs)

    def __del__(self):
        self.close()


================================================
FILE: archive/ktransformers/server/balance_serve/inference/distributed/custom_all_reduce_utils.py
================================================
import ctypes
import json
import os
import pickle
import subprocess
import sys
import tempfile
from itertools import product
from typing import Dict, List, Optional, Sequence

import torch.distributed as dist
import torch.multiprocessing as mp

import server.envs as envs
from server.inference.distributed.cuda_wrapper import CudaRTLibrary
from server.utils import cuda_device_count_stateless, update_environment_variables


def producer(
    batch_src: Sequence[int],
    producer_queue,
    consumer_queue,
    result_queue,
    cuda_visible_devices: Optional[str] = None,
):
    if cuda_visible_devices is not None:
        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})

    lib = CudaRTLibrary()
    for i in batch_src:
        lib.cudaSetDevice(i)
        pointer = lib.cudaMalloc(1024)
        lib.cudaMemset(pointer, 1, 1024)
        lib.cudaDeviceSynchronize()
        handle = lib.cudaIpcGetMemHandle(pointer)
        producer_queue.put(handle)
        open_success = consumer_queue.get()
        if open_success:
            # use two queues to simulate barrier
            producer_queue.put(0)
            consumer_queue.get()
            # check if the memory is modified
            host_data = (ctypes.c_char * 1024)()
            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
            for i in range(1024):
                if ord(host_data[i]) != 2:
                    open_success = False
                    break
        result_queue.put(open_success)
        lib.cudaDeviceReset()


def consumer(
    batch_tgt: Sequence[int],
    producer_queue,
    consumer_queue,
    result_queue,
    cuda_visible_devices: Optional[str] = None,
):
    if cuda_visible_devices is not None:
        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})

    lib = CudaRTLibrary()
    for j in batch_tgt:
        lib.cudaSetDevice(j)
        handle = producer_queue.get()
        open_success = False
        try:
            pointer = lib.cudaIpcOpenMemHandle(handle)  # type: ignore
            open_success = True
        except RuntimeError:
            # cannot error out here, because the producer process
            # is still waiting for the response.
            pass
        consumer_queue.put(open_success)
        if open_success:
            # modify the memory
            lib.cudaMemset(pointer, 2, 1024)
            lib.cudaDeviceSynchronize()
            # use two queues to simulate barrier
            producer_queue.get()
            consumer_queue.put(0)
            # check if the memory is modified
            host_data = (ctypes.c_char * 1024)()
            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
            for i in range(1024):
                if ord(host_data[i]) != 2:
                    open_success = False
                    break
        result_queue.put(open_success)
        lib.cudaDeviceReset()


def can_actually_p2p(
    batch_src: Sequence[int],
    batch_tgt: Sequence[int],
) -> Sequence[bool]:
    """
    Usually, checking if P2P access is enabled can be done by
    `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
    the driver might be broken, and `torch.cuda.can_device_access_peer(src, tgt)`
    returns `True` even if P2P access is not actually possible.
    See https://github.com/vllm-project/vllm/issues/2728 and
    https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10
    Therefore, we have to perform a real P2P access to check if it is actually
    possible.

    Note on p2p and cuda IPC:
    Usually, one process uses one GPU:
    GPU src --> cuda context src --> tensor src --> process src

    We need to combine p2p and cuda IPC, so that:
    GPU src --> cuda context src --> tensor src --> process src
                                      |shared|
    GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
    That is to say, process src creates a tensor in GPU src, passes IPC handle to
    process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
    tensor in process tgt will be reflected in the tensor in process src, because
    they are the same memory segment.
    It is important to note that process tgt accesses the tensor in GPU tgt, not
    GPU src. That's why we need p2p access.

    The most time-consuming part is the process creation. To avoid creating
    processes for every pair of GPUs, we use batched testing. We create two
    processes for testing all pairs of GPUs in batch. The trick is to reset
    the device after each test (which is not available in PyTorch).
    """  # noqa
    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
    # pass the CUDA_VISIBLE_DEVICES to the child process
    # to make sure they see the same set of GPUs

    # make sure the processes are spawned
    smp = mp.get_context("spawn")
    producer_queue = smp.Queue()
    consumer_queue = smp.Queue()
    result_queue = smp.Queue()
    p_src = smp.Process(
        target=producer,
        args=(
            batch_src,
            producer_queue,
            consumer_queue,
            result_queue,
            cuda_visible_devices,
        ),
    )
    p_tgt = smp.Process(
        target=consumer,
        args=(
            batch_tgt,
            producer_queue,
            consumer_queue,
            result_queue,
            cuda_visible_devices,
        ),
    )
    p_src.start()
    p_tgt.start()
    p_src.join()
    p_tgt.join()
    assert p_src.exitcode == 0 and p_tgt.exitcode == 0
    result: List[bool] = []
    for src, tgt in zip(batch_src, batch_tgt):
        a = result_queue.get()
        b = result_queue.get()
        if a != b:
            print(
                "Two processes do not agree on the P2P access"
                " status on %d -> %d, treat as disabled.",
                src,
                tgt,
            )
            result.append(False)
        else:
            result.append(a)
    return result


# why do we need this cache?
# we are testing peer-to-peer (p2p) access between GPUs,across processes.
# if we test it every time, it will be very slow, because we need to create
#  N * N * 2 processes, where N is the world size. This is very slow.
# to reduce the time, we use a cache file to store the p2p access status.
# the cache file is generated by the master process if it does not exist.
# then all the processes can read the cache file to check the p2p access status.
# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we
#  can have different cache files for different CUDA_VISIBLE_DEVICES settings,
#  e.g. used by different vllm engines. The device id in the cache file is a
#  **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
#  of visible devices in the vllm engine.
_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None


def gpu_p2p_access_check(src: int, tgt: int) -> bool:
    """Check if GPU src can access GPU tgt."""

    # if the cache variable is already calculated,
    # read from the cache instead of checking it again
    global _gpu_p2p_access_cache
    if _gpu_p2p_access_cache is not None:
        return _gpu_p2p_access_cache[f"{src}->{tgt}"]

    is_distributed = dist.is_initialized()

    num_dev = cuda_device_count_stateless()
    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
    if cuda_visible_devices is None:
        cuda_visible_devices = ",".join(str(i) for i in range(num_dev))

    path = os.path.join(
        envs.VLLM_CACHE_ROOT, f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
    )
    os.makedirs(os.path.dirname(path), exist_ok=True)
    from server.inference.distributed.parallel_state import get_world_group

    if (not is_distributed or get_world_group().local_rank == 0) and (
        not os.path.exists(path)
    ):
        # only the local master process (with local_rank == 0) can
        #  enter this block to calculate the cache
        print("generating GPU P2P access cache in %s", path)
        cache: Dict[str, bool] = {}
        ids = list(range(num_dev))
        # batch of all pairs of GPUs
        batch_src, batch_tgt = zip(*list(product(ids, ids)))
        # NOTE: we use `subprocess` rather than `multiprocessing` here
        # because the caller might not have `if __name__ == "__main__":`,
        # in that case we cannot use spawn method in multiprocessing.
        # However, `can_actually_p2p` requires spawn method.
        # The fix is, we use `subprocess` to call the function,
        # where we have `if __name__ == "__main__":` in this file.

        # use a temporary file to store the result
        # we don't use the output of the subprocess directly,
        # because the subprocess might produce logging output
        with tempfile.NamedTemporaryFile() as output_file:
            input_bytes = pickle.dumps((batch_src, batch_tgt, output_file.name))
            returned = subprocess.run(
                [sys.executable, __file__], input=input_bytes, capture_output=True
            )
            # check if the subprocess is successful
            try:
                returned.check_returncode()
            except Exception as e:
                # wrap raised exception to provide more information
                raise RuntimeError(
                    f"Error happened when batch testing "
                    f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
                    f"{returned.stderr.decode()}"
                ) from e
            with open(output_file.name, "rb") as f:
                result = pickle.load(f)
        for _i, _j, r in zip(batch_src, batch_tgt, result):
            cache[f"{_i}->{_j}"] = r
        with open(path, "w") as f:
            json.dump(cache, f, indent=4)
    if is_distributed:
        get_world_group().barrier()
    print("reading GPU P2P access cache from %s", path)
    with open(path) as f:
        cache = json.load(f)
    _gpu_p2p_access_cache = cache
    return _gpu_p2p_access_cache[f"{src}->{tgt}"]


__all__ = ["gpu_p2p_access_check"]

if __name__ == "__main__":
    batch_src, batch_tgt, output_file = pickle.loads(sys.stdin.buffer.read())
    result = can_actually_p2p(batch_src, batch_tgt)
    with open(output_file, "wb") as f:
        f.write(pickle.dumps(result))


================================================
FILE: archive/ktransformers/server/balance_serve/inference/distributed/parallel_state.py
================================================
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""vLLM distributed state.
It takes over the control of the distributed environment from PyTorch.
The typical workflow is:

- call `init_distributed_environment` to initialize the distributed environment.
- call `initialize_model_parallel` or `ensure_model_parallel_initialized` to
 initialize the model parallel groups.

- any code dealing with the distributed stuff

- call `destroy_model_parallel` to destroy the model parallel groups.
- call `destroy_distributed_environment` to destroy the distributed environment.

If you only need to use the distributed environment without model/pipeline
 parallelism, you can skip the model parallel initialization and destruction
 steps.
"""
import contextlib
import gc
import pickle
import weakref
from collections import namedtuple
from contextlib import contextmanager, nullcontext
from dataclasses import dataclass
from multiprocessing import shared_memory
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from unittest.mock import patch

import torch
import torch.distributed
from torch.distributed import Backend, ProcessGroup

import server.envs as envs
from server.inference.platforms import current_platform
from server.utils import direct_register_custom_op, supports_custom_op


@dataclass
class GraphCaptureContext:
    stream: torch.cuda.Stream


TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])


def _split_tensor_dict(
    tensor_dict: Dict[str, Union[torch.Tensor, Any]]
) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
    """Split the tensor dictionary into two parts:
    1. A list of (key, value) pairs. If the value is a tensor, it is replaced
         by its metadata.
    2. A list of tensors.
    """
    metadata_list: List[Tuple[str, Any]] = []
    tensor_list: List[torch.Tensor] = []
    for key, value in tensor_dict.items():
        if isinstance(value, torch.Tensor):
            # Note: we cannot use `value.device` here,
            # because it contains not only the device type but also the device
            # index (e.g. "cuda:0"). We only need the device type.
            # receiving side will set the device index.
            device = value.device.type
            metadata_list.append(
                (key, TensorMetadata(device, value.dtype, value.size()))
            )
            tensor_list.append(value)
        else:
            metadata_list.append((key, value))
    return metadata_list, tensor_list


_group_name_counter: Dict[str, int] = {}


def _get_unique_name(name: str) -> str:
    """Get a unique name for the group.
    Example:
    _get_unique_name("tp") -> "tp:0"
    _get_unique_name("tp") -> "tp:1"
    """
    if name not in _group_name_counter:
        _group_name_counter[name] = 0
    newname = f"{name}:{_group_name_counter[name]}"
    _group_name_counter[name] += 1
    return newname


_groups: Dict[str, Callable[[], Optional["GroupCoordinator"]]] = {}


def _register_group(group: "GroupCoordinator") -> None:
    _groups[group.unique_name] = weakref.ref(group)


if supports_custom_op():

    def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None:
        assert group_name in _groups, f"Group {group_name} is not found."
        group = _groups[group_name]()
        if group is None:
            raise ValueError(f"Group {group_name} is destroyed.")
        group._all_reduce_in_place(tensor)

    def inplace_all_reduce_fake(tensor: torch.Tensor, group_name: str) -> None:
        return

    direct_register_custom_op(
        op_name="inplace_all_reduce",
        op_func=inplace_all_reduce,
        mutates_args=["tensor"],
        fake_impl=inplace_all_reduce_fake,
    )

    def outplace_all_reduce(tensor: torch.Tensor, group_name: str, bsz_tensor: torch.Tensor, is_compute_bound: bool = False, overlap: bool = False) -> torch.Tensor:
        assert group_name in _groups, f"Group {group_name} is not found."
        group = _groups[group_name]()
        if group is None:
            raise ValueError(f"Group {group_name} is destroyed.")
        return group._all_reduce_out_place(tensor, bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap)

    def outplace_all_reduce_fake(tensor: torch.Tensor, group_name: str, bsz_tensor: torch.Tensor, is_compute_bound: bool = False, overlap: bool = False) -> torch.Tensor:
        return torch.empty_like(tensor)

    direct_register_custom_op(
        op_name="outplace_all_reduce",
        op_func=outplace_all_reduce,
        mutates_args=[],
        fake_impl=outplace_all_reduce_fake,
    )


class GroupCoordinator:
    """
    PyTorch ProcessGroup wrapper for a group of processes.
    PyTorch ProcessGroup is bound to one specific communication backend,
        e.g. NCCL, Gloo, MPI, etc.
    GroupCoordinator takes charge of all the communication operations among
        the processes in the group. It can route the communication to
        a specific implementation (e.g. switch allreduce implementation
        based on the tensor size and cuda graph mode).
    """

    # available attributes:
    rank: int  # global rank
    ranks: List[int]  # global ranks in the group
    world_size: int  # size of the group
    # difference between `local_rank` and `rank_in_group`:
    # if we have a group of size 4 across two nodes:
    # Process | Node | Rank | Local Rank | Rank in Group
    #   0     |   0  |  0   |     0      |       0
    #   1     |   0  |  1   |     1      |       1
    #   2     |   1  |  2   |     0      |       2
    #   3     |   1  |  3   |     1      |       3
    local_rank: int  # local rank used to assign devices
    rank_in_group: int  # rank inside the group
    cpu_group: ProcessGroup  # group for CPU communication
    device_group: ProcessGroup  # group for device communication
    use_pynccl: bool  # a hint of whether to use PyNccl
    use_custom_allreduce: bool  # a hint of whether to use CustomAllreduce
    # communicators are only created for world size > 1
    pynccl_comm: Optional[Any]  # PyNccl communicator
    ca_comm: Optional[Any]  # Custom allreduce communicator
    mq_broadcaster: Optional[Any]  # shared memory broadcaster

    def __init__(
        self,
        group_ranks: List[List[int]],
        local_rank: int,
        torch_distributed_backend: Union[str, Backend],
        use_pynccl: bool,
        use_custom_allreduce: bool,
        use_tpu_communicator: bool,
        use_hpu_communicator: bool,
        use_xpu_communicator: bool,
        use_message_queue_broadcaster: bool = False,
        group_name: Optional[str] = None,
    ):
        group_name = group_name or "anonymous"
        self.unique_name = _get_unique_name(group_name)
        _register_group(self)

        self.rank = torch.distributed.get_rank()
        self.local_rank = local_rank
        self.device_group = None
        self.cpu_group = None

        for ranks in group_ranks:
            device_group = torch.distributed.new_group(
                ranks, backend=torch_distributed_backend
            )
            # a group with `gloo` backend, to allow direct coordination between
            # processes through the CPU.
            cpu_group = torch.distributed.new_group(ranks, backend="gloo")
            if self.rank in ranks:
                self.ranks = ranks
                self.world_size = len(ranks)
                self.rank_in_group = ranks.index(self.rank)
                self.device_group = device_group
                self.cpu_group = cpu_group

        assert self.cpu_group is not None
        assert self.device_group is not None
        assert current_platform.is_cuda_alike()

        if current_platform.is_cuda_alike():
            self.device = torch.device(f"cuda:{local_rank}")
        else:
            self.device = torch.device("cpu")

        self.use_pynccl = use_pynccl
        self.use_custom_allreduce = use_custom_allreduce
        self.use_tpu_communicator = use_tpu_communicator
        self.use_hpu_communicator = use_hpu_communicator
        self.use_xpu_communicator = use_xpu_communicator

        # lazy import to avoid documentation build error
        from server.inference.distributed.custom_all_reduce import CustomAllreduce
        from server.inference.distributed.pynccl import PyNcclCommunicator

        self.pynccl_comm: Optional[PyNcclCommunicator] = None
        # if use_pynccl and self.world_size > 1:
        #     self.pynccl_comm = PyNcclCommunicator(
        #         group=self.cpu_group,
        #         device=self.device,
        #     )

        self.ca_comm: Optional[CustomAllreduce] = None
        if use_custom_allreduce and self.world_size > 1:
            # Initialize a custom fast all-reduce implementation.
            self.ca_comm = CustomAllreduce(
                group=self.cpu_group,
                device=self.device,
            )

        #### we assume we won't use tpu or hpu or xpu or messagequeue broadcast

        # from vllm.distributed.device_communicators.tpu_communicator import (
        #     TpuCommunicator)
        # self.tpu_communicator: Optional[TpuCommunicator] = None
        # if use_tpu_communicator and self.world_size > 1:
        #     self.tpu_communicator = TpuCommunicator(group=self.cpu_group)
        self.tpu_communicator = None

        # from vllm.distributed.device_communicators.hpu_communicator import (
        #     HpuCommunicator)
        # self.hpu_communicator: Optional[HpuCommunicator]
        # if use_hpu_communicator and self.world_size > 1:
        #     self.hpu_communicator = HpuCommunicator(group=self.device_group)
        self.hpu_communicator = None

        # from vllm.distributed.device_communicators.xpu_communicator import (
        #     XpuCommunicator)
        # self.xpu_communicator: Optional[XpuCommunicator]
        # if use_xpu_communicator and self.world_size > 1:
        #     self.xpu_communicator = XpuCommunicator(group=self.device_group)
        self.xpu_communicator = None

        # from vllm.distributed.device_communicators.shm_broadcast import (
        #     MessageQueue)
        # self.mq_broadcaster: Optional[MessageQueue] = None
        # if use_message_queue_broadcaster and self.world_size > 1:
        #     self.mq_broadcaster = MessageQueue.create_from_process_group(
        #         self.cpu_group, 1 << 22, 6)
        self.mq_broadcaster = None

    @property
    def first_rank(self):
        """Return the global rank of the first process in the group"""
        return self.ranks[0]

    @property
    def last_rank(self):
        """Return the global rank of the last process in the group"""
        return self.ranks[-1]

    @property
    def is_first_rank(self):
        """Return whether the caller is the first process in the group"""
        return self.rank == self.first_rank

    @property
    def is_last_rank(self):
        """Return whether the caller is the last process in the group"""
        return self.rank == self.last_rank

    @property
    def next_rank(self):
        """Return the global rank of the process that follows the caller"""
        rank_in_group = self.rank_in_group
        world_size = self.world_size
        return self.ranks[(rank_in_group + 1) % world_size]

    @property
    def prev_rank(self):
        """Return the global rank of the process that precedes the caller"""
        rank_in_group = self.rank_in_group
        world_size = self.world_size
        return self.ranks[(rank_in_group - 1) % world_size]

    @contextmanager
    def graph_capture(
        self, graph_capture_context: Optional[GraphCaptureContext] = None
    ):
        if graph_capture_context is None:
            stream = torch.cuda.Stream()
            graph_capture_context = GraphCaptureContext(stream)
        else:
            stream = graph_capture_context.stream

        ca_comm = self.ca_comm
        maybe_ca_context = nullcontext() if ca_comm is None else ca_comm.capture()

        # ensure all initialization operations complete before attempting to
        # capture the graph on another stream
        curr_stream = torch.cuda.current_stream()
        if curr_stream != stream:
            stream.wait_stream(curr_stream)

        with torch.cuda.stream(stream), maybe_ca_context:
            # In graph mode, we have to be very careful about the collective
            # operations. The current status is:
            #     allreduce \ Mode   |  Eager  |  Graph  |
            # --------------------------------------------
            # custom allreduce       | enabled | enabled |
            # PyNccl                 | disabled| enabled |
            # torch.distributed      | enabled | disabled|
            #
            # Note that custom allreduce will have a runtime check, if the
            #  tensor size is too large, it will fallback to the next
            #  available option.
            # In summary: When using CUDA graph, we use
            #  either custom all-reduce kernel or pynccl. When not using
            #  CUDA graph, we use either custom all-reduce kernel or
            #  PyTorch NCCL. We always prioritize using custom all-reduce
            #  kernel but fall back to PyTorch or pynccl if it is
            #  disabled or not supported.
            pynccl_comm = self.pynccl_comm
            maybe_pynccl_context: Any
            if not pynccl_comm:
                maybe_pynccl_context = nullcontext()
            else:
                maybe_pynccl_context = pynccl_comm.change_state(
                    enable=True, stream=torch.cuda.current_stream()
                )
            with maybe_pynccl_context:
                yield graph_capture_context

    def all_reduce(self, input_: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> torch.Tensor:
        """
        User-facing all-reduce function before we actually call the
        all-reduce operation.

        We need this because Dynamo does not support passing an arbitrary
        object (`self` in this case) to a custom op. We need to pass the
         group name as a string, and then look up the group coordinator from
         the group name, dispatch the all-reduce operation to the group
         coordinator.

        In addition, PyTorch custom ops do not support mutation or returning
        a new tensor in the same op. So we need to figure out if the op is
        in-place or out-of-place ahead of time.
        """
        # Bypass the function if we are using only 1 GPU.
        if self.world_size == 1:
            return input_

        if input_.is_cpu:
            import intel_extension_for_pytorch as ipex

            ipex.distributed.all_reduce(input_, group=self.device_group)
            return input_

        if not supports_custom_op():
            self._all_reduce_in_place(input_)
            return input_

        if self.tpu_communicator is not None and not self.tpu_communicator.disabled:
            # TPU handles Dynamo with its own logic.
            return self.tpu_communicator.all_reduce(input_)

        if self.hpu_communicator is not None and not self.hpu_communicator.disabled:
            return self.hpu_communicator.all_reduce(input_)

        if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
            return self.xpu_communicator.all_reduce(input_)

        if (
            self.ca_comm is not None
            and not self.ca_comm.disabled
            and self.ca_comm.should_custom_ar(input_)
        ):
            return torch.ops.vllm.outplace_all_reduce(
                input_, group_name=self.unique_name, bsz_tensor=bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap
            )
        else:
            #assert self.ca_comm is not None
            #assert not self.ca_comm.disabled
            #assert self.ca_comm.should_custom_ar(input_)
            torch.ops.vllm.inplace_all_reduce(input_, group_name=self.unique_name)
            return input_

    def _all_reduce_out_place(self, input_: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> torch.Tensor:
        ca_comm = self.ca_comm
        assert ca_comm is not None
        assert not ca_comm.disabled
        out = ca_comm.custom_all_reduce(input_, bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap)
        assert out is not None
        return out

    def _all_reduce_in_place(self, input_: torch.Tensor) -> None:
        pynccl_comm = self.pynccl_comm
        if pynccl_comm is not None and not pynccl_comm.disabled:
            pynccl_comm.all_reduce(input_)
        else:
            torch.distributed.all_reduce(input_, group=self.device_group)

    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
        world_size = self.world_size
        # Bypass the function if we are using only 1 GPU.
        if world_size == 1:
            return input_
        assert (
            -input_.dim() <= dim < input_.dim()
        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"

        # For TPUs, use TPU communicator.
        tpu_comm = self.tpu_communicator
        if tpu_comm is not None and not tpu_comm.disabled:
            return tpu_comm.all_gather(input_, dim)

        # For HPUs, use HPU communicator.
        hpu_comm = self.hpu_communicator
        if hpu_comm is not None and not hpu_comm.disabled:
            return hpu_comm.all_gather(input_, dim)

        if dim < 0:
            # Convert negative dim to positive.
            dim += input_.dim()
        input_size = input_.size()
        # NOTE: we have to use concat-style all-gather here,
        # stack-style all-gather has compatibility issues with
        # torch.compile . see https://github.com/pytorch/pytorch/issues/138795
        output_size = (input_size[0] * world_size,) + input_size[1:]
        # Allocate output tensor.
        output_tensor = torch.empty(
            output_size, dtype=input_.dtype, device=input_.device
        )
        # All-gather.
        torch.distributed.all_gather_into_tensor(
            output_tensor, input_, group=self.device_group
        )
        # Reshape
        output_tensor = output_tensor.reshape((world_size,) + input_size)
        output_tensor = output_tensor.movedim(0, dim)
        output_tensor = output_tensor.reshape(
            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
        )
        return output_tensor

    def gather(
        self, input_: torch.Tensor, dst: int = 0, dim: int = -1
    ) -> Optional[torch.Tensor]:
        """
        NOTE: We assume that the input tensor is on the same device across
        all the ranks.
        NOTE: `dst` is the local rank of the destination rank.
        """
        world_size = self.world_size
        # Bypass the function if we are using only 1 GPU.
        if world_size == 1:
            return input_
        assert (
            -input_.dim() <= dim < input_.dim()
        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
        if dim < 0:
            # Convert negative dim to positive.
            dim += input_.dim()
        if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
            return self.xpu_communicator.gather(input_, self.rank_in_group, dst, dim)
        # Allocate output tensor.
        if self.rank_in_group == dst:
            gather_list = [torch.empty_like(input_) for _ in range(world_size)]
        else:
            gather_list = None
        # Gather.
        torch.distributed.gather(
            input_, gather_list, dst=self.ranks[dst], group=self.device_group
        )
        if self.rank_in_group == dst:
            output_tensor = torch.cat(gather_list, dim=dim)
        else:
            output_tensor = None
        return output_tensor

    def broadcast(self, input_: torch.Tensor, src: int = 0):
        """Broadcast the input tensor.
        NOTE: `src` is the local rank of the source rank.
        """
        assert src < self.world_size, f"Invalid src rank ({src})"

        # Bypass the function if we are using only 1 GPU.
        if self.world_size == 1:
            return input_
        # Broadcast.
        torch.distributed.broadcast(
            input_, src=self.ranks[src], group=self.device_group
        )
        return input_

    def broadcast_object(self, obj: Optional[Any] = None, src: int = 0):
        """Broadcast the input object.
        NOTE: `src` is the local rank of the source rank.
        """
        assert src < self.world_size, f"Invalid src rank ({src})"

        # Bypass the function if we are using only 1 GPU.
        if self.world_size == 1:
            return obj
        if self.mq_broadcaster is not None:
            assert src == 0, "Message queue broadcaster only supports src=0"
            return self.mq_broadcaster.broadcast_object(obj)
        if self.rank_in_group == src:
            torch.distributed.broadcast_object_list(
                [obj], src=self.ranks[src], group=self.cpu_group
            )
            return obj
        else:
            recv = [None]
            torch.distributed.broadcast_object_list(
                recv, src=self.ranks[src], group=self.cpu_group
            )
            return recv[0]

    def broadcast_object_list(
        self, obj_list: List[Any], src: int = 0, group: Optional[ProcessGroup] = None
    ):
        """Broadcast the input object list.
        NOTE: `src` is the local rank of the source rank.
        """
        assert src < self.world_size, f"Invalid src rank ({src})"

        # Bypass the function if we are using only 1 GPU.
        if self.world_size == 1:
            return obj_list
        # Broadcast.
        torch.distributed.broadcast_object_list(
            obj_list, src=self.ranks[src], group=self.device_group
        )
        return obj_list

    def send_object(self, obj: Any, dst: int) -> None:
        """Send the input object list to the destination rank."""
        """NOTE: `dst` is the local rank of the destination rank."""

        assert dst < self.world_size, f"Invalid dst rank ({dst})"

        assert dst != self.rank_in_group, (
            "Invalid destination rank. Destination rank is the same "
            "as the current rank."
        )

        # Serialize object to tensor and get the size as well
        object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8)

        size_tensor = torch.tensor(
            [object_tensor.numel()], dtype=torch.long, device="cpu"
        )

        # Send object size

        torch.distributed.send(size_tensor, dst=self.ranks[dst], group=self.cpu_group)

        # Send object
        torch.distributed.send(object_tensor, dst=self.ranks[dst], group=self.cpu_group)

        return None

    def recv_object(self, src: int) -> Any:
        """Receive the input object list from the source rank."""
        """NOTE: `src` is the local rank of the source rank."""

        assert src < self.world_size, f"Invalid src rank ({src})"

        assert (
            src != self.rank_in_group
        ), "Invalid source rank. Source rank is the same as the current rank."

        size_tensor = torch.empty(1, dtype=torch.long, device="cpu")

        # Receive object size
        rank_size = torch.distributed.recv(
            size_tensor, src=self.ranks[src], group=self.cpu_group
        )

        # Tensor to receive serialized objects into.
        object_tensor = torch.empty(  # type: ignore[call-overload]
            size_tensor.item(),  # type: ignore[arg-type]
            dtype=torch.uint8,
            device="cpu",
        )

        rank_object = torch.distributed.recv(
            object_tensor, src=self.ranks[src], group=self.cpu_group
        )

        assert (
            rank_object == rank_size
        ), "Received object sender rank does not match the size sender rank."

        obj = pickle.loads(object_tensor.numpy().tobytes())

        return obj

    def broadcast_tensor_dict(
        self,
        tensor_dict: Optional[Dict[str, Union[torch.Tensor, Any]]] = None,
        src: int = 0,
        group: Optional[ProcessGroup] = None,
        metadata_group: Optional[ProcessGroup] = None,
    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
        """Broadcast the input tensor dictionary.
        NOTE: `src` is the local rank of the source rank.
        """
        # Bypass the function if we are using only 1 GPU.
        if not torch.distributed.is_initialized() or self.world_size == 1:
            return tensor_dict

        group = self.device_group
        metadata_group = self.cpu_group
        assert src < self.world_size, f"Invalid src rank ({src})"

        rank_in_group = self.rank_in_group
        if rank_in_group == src:
            metadata_list: List[Tuple[Any, Any]] = []
            assert isinstance(
                tensor_dict, dict
            ), f"Expecting a dictionary, got {type(tensor_dict)}"
            metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
            # `metadata_list` lives in CPU memory.
            # `broadcast_object_list` has serialization & deserialization,
            # all happening on CPU. Therefore, we can use the CPU group.
            self.broadcast_object(metadata_list, src=src)
            async_handles = []
            for tensor in tensor_list:
                if tensor.numel() == 0:
                    # Skip broadcasting empty tensors.
                    continue
                if tensor.is_cpu:
                    # use metadata_group for CPU tensors
                    handle = torch.distributed.broadcast(
                        tensor, src=self.ranks[src], group=metadata_group, async_op=True
                    )
                else:
                    # use group for GPU tensors
                    handle = torch.distributed.broadcast(
                        tensor, src=self.ranks[src], group=group, async_op=True
                    )
                async_handles.append(handle)
            for async_handle in async_handles:
                async_handle.wait()

        else:
            metadata_list = self.broadcast_object(None, src=src)
            tensor_dict = {}
            async_handles = []
            for key, value in metadata_list:
                if isinstance(value, TensorMetadata):
                    tensor = torch.empty(
                        value.size, dtype=value.dtype, device=value.device
                    )
                    if tensor.numel() == 0:
                        # Skip broadcasting empty tensors.
                        tensor_dict[key] = tensor
                        continue
                    if tensor.is_cpu:
                        # use metadata_group for CPU tensors
                        handle = torch.distributed.broadcast(
                            tensor,
                            src=self.ranks[src],
                            group=metadata_group,
                            async_op=True,
                        )
                    else:
                        # use group for GPU tensors
                        handle = torch.distributed.broadcast(
                            tensor, src=self.ranks[src], group=group, async_op=True
                        )
                    async_handles.append(handle)
                    tensor_dict[key] = tensor
                else:
                    tensor_dict[key] = value
            for async_handle in async_handles:
                async_handle.wait()
        return tensor_dict

    def send_tensor_dict(
        self,
        tensor_dict: Dict[str, Union[torch.Tensor, Any]],
        dst: Optional[int] = None,
        all_gather_group: Optional["GroupCoordinator"] = None,
    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
        """Send the input tensor dictionary.
        NOTE: `dst` is the local rank of the source rank.
        """
        # Bypass the function if we are using only 1 GPU.
        if not torch.distributed.is_initialized() or self.world_size == 1:
            return tensor_dict

        all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
        all_gather_rank = (
            0 if all_gather_group is None else all_gather_group.rank_in_group
        )

        group = self.device_group
        metadata_group = self.cpu_group

        if dst is None:
            dst = (self.rank_in_group + 1) % self.world_size
        assert dst < self.world_size, f"Invalid dst rank ({dst})"

        metadata_list: List[Tuple[Any, Any]] = []
        assert isinstance(
            tensor_dict, dict
        ), f"Expecting a dictionary, got {type(tensor_dict)}"
        metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
        # `metadata_list` lives in CPU memory.
        # `send_object_list` has serialization & deserialization,
        # all happening on CPU. Therefore, we can use the CPU group.
        self.send_object(metadata_list, dst=dst)
        for tensor in tensor_list:
            if tensor.numel() == 0:
                # Skip sending empty tensors.
                continue

            # send-allgather: send only a slice, then do allgather.
            if all_gather_group is not None and tensor.numel() % all_gather_size == 0:
                tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]

            if tensor.is_cpu:
                # use metadata_group for CPU tensors
                torch.distributed.send(
                    tensor, dst=self.ranks[dst], group=metadata_group
                )
            else:
                # use group for GPU tensors
                torch.distributed.send(tensor, dst=self.ranks[dst], group=group)
        return None

    def recv_tensor_dict(
        self,
        src: Optional[int] = None,
        all_gather_group: Optional["GroupCoordinator"] = None,
    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
        """Recv the input tensor dictionary.
        NOTE: `src` is the local rank of the source rank.
        """
        # Bypass the function if we are using only 1 GPU.
        if not torch.distributed.is_initialized() or self.world_size == 1:
            return None

        all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
        all_gather_rank = (
            0 if all_gather_group is None else all_gather_group.rank_in_group
        )

        group = self.device_group
        metadata_group = self.cpu_group

        if src is None:
            src = (self.rank_in_group - 1) % self.world_size
        assert src < self.world_size, f"Invalid src rank ({src})"

        recv_metadata_list = self.recv_object(src=src)
        tensor_dict: Dict[str, Any] = {}
        for key, value in recv_metadata_list:
            if isinstance(value, TensorMetadata):
                tensor = torch.empty(value.size, dtype=value.dtype, device=value.device)
                if tensor.numel() == 0:
                    # Skip broadcasting empty tensors.
                    tensor_dict[key] = tensor
                    continue

                # send-allgather: send only a slice, then do allgather.
                use_all_gather = (
                    all_gather_group is not None
                    and tensor.numel() % all_gather_size == 0
                )

                if use_all_gather:
                    orig_shape = tensor.shape
                    tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]

                if tensor.is_cpu:
                    # use metadata_group for CPU tensors
                    torch.distributed.recv(
                        tensor, src=self.ranks[src], group=metadata_group
                    )
                else:
                    # use group for GPU tensors
                    torch.distributed.recv(tensor, src=self.ranks[src], group=group)
                if use_all_gather:
                    # do the allgather
                    tensor = all_gather_group.all_gather(tensor, dim=0)  # type: ignore
                    tensor = tensor.reshape(orig_shape)

                tensor_dict[key] = tensor
            else:
                tensor_dict[key] = value
        return tensor_dict

    def barrier(self):
        """Barrier synchronization among the group.
        NOTE: don't use `device_group` here! `barrier` in NCCL is
        terrible because it is internally a broadcast operation with
        secretly created GPU tensors. It is easy to mess up the current
        device. Use the CPU group instead.
        """
        torch.distributed.barrier(group=self.cpu_group)

    def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None:
        """Sends a tensor to the destination rank in a non-blocking way"""
        """NOTE: `dst` is the local rank of the destination rank."""
        if dst is None:
            dst = (self.rank_in_group + 1) % self.world_size

        pynccl_comm = self.pynccl_comm
        if pynccl_comm is not None and not pynccl_comm.disabled:
            pynccl_comm.send(tensor, dst)
        else:
            torch.distributed.send(tensor, self.ranks[dst], self.device_group)

    def recv(
        self, size: torch.Size, dtype: torch.dtype, src: Optional[int] = None
    ) -> torch.Tensor:
        """Receives a tensor from the source rank."""
        """NOTE: `src` is the local rank of the source rank."""
        if src is None:
            src = (self.rank_in_group - 1) % self.world_size

        tensor = torch.empty(size, dtype=dtype, device=self.device)
        pynccl_comm = self.pynccl_comm
        if pynccl_comm is not None and not pynccl_comm.disabled:
            pynccl_comm.recv(tensor, src)
        else:
            torch.distributed.recv(tensor, self.ranks[src], self.device_group)
        return tensor

    def destroy(self):
        if self.device_group is not None:
            torch.distributed.destroy_process_group(self.device_group)
            self.device_group = None
        if self.cpu_group is not None:
            torch.distributed.destroy_process_group(self.cpu_group)
            self.cpu_group = None
        if self.pynccl_comm is not None:
            self.pynccl_comm = None
        if self.ca_comm is not None:
            self.ca_comm = None
        if self.mq_broadcaster is not None:
            self.mq_broadcaster = None


_WORLD: Optional[GroupCoordinator] = None


def get_world_group() -> GroupCoordinator:
    assert _WORLD is not None, "world group is not initialized"
    return _WORLD


def init_world_group(
    ranks: List[int], local_rank: int, backend: str
) -> GroupCoordinator:
    return GroupCoordinator(
        group_ranks=[ranks],
        local_rank=local_rank,
        torch_distributed_backend=backend,
        use_pynccl=False,
        use_custom_allreduce=False,
        use_tpu_communicator=False,
        use_hpu_communicator=False,
        use_xpu_communicator=False,
        group_name="world",
    )


def init_model_parallel_group(
    group_ranks: List[List[int]],
    local_rank: int,
    backend: str,
    use_custom_allreduce: Optional[bool] = None,
    use_message_queue_broadcaster: bool = False,
    group_name: Optional[str] = None,
) -> GroupCoordinator:
    if use_custom_allreduce is None:
        use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
    return GroupCoordinator(
        group_ranks=group_ranks,
        local_rank=local_rank,
        torch_distributed_backend=backend,
        use_pynccl=True,
        use_custom_allreduce=use_custom_allreduce,
        use_tpu_communicator=True,
        use_hpu_communicator=True,
        use_xpu_communicator=True,
        use_message_queue_broadcaster=use_message_queue_broadcaster,
        group_name=group_name,
    )


_TP: Optional[GroupCoordinator] = None


def get_tp_group() -> GroupCoordinator:
    assert _TP is not None, "tensor model parallel group is not initialized"
    return _TP


# kept for backward compatibility
get_tensor_model_parallel_group = get_tp_group

_PP: Optional[GroupCoordinator] = None


def get_pp_group() -> GroupCoordinator:
    assert _PP is not None, "pipeline model parallel group is not initialized"
    return _PP


# kept for backward compatibility
get_pipeline_model_parallel_group = get_pp_group


@contextmanager
def graph_capture():
    """
    `graph_capture` is a context manager which should surround the code that
    is capturing the CUDA graph. Its main purpose is to ensure that the
    some operations will be run after the graph is captured, before the graph
    is replayed. It returns a `GraphCaptureContext` object which contains the
    necessary data for the graph capture. Currently, it only contains the
    stream that the graph capture is running on. This stream is set to the
    current CUDA stream when the context manager is entered and reset to the
    default stream when the context manager is exited. This is to ensure that
    the graph capture is running on a separate stream from the default stream,
    in order to explicitly distinguish the kernels to capture
    from other kernels possibly launched on background in the default stream.
    """
    with get_tp_group().graph_capture() as context, get_pp_group().graph_capture(
        context
    ):
        yield context


_ENABLE_CUSTOM_ALL_REDUCE = True


def set_custom_all_reduce(enable: bool):
    global _ENABLE_CUSTOM_ALL_REDUCE
    _ENABLE_CUSTOM_ALL_REDUCE = enable


def init_distributed_environment(
    world_size: int = -1,
    rank: int = -1,
    distributed_init_method: str = "env://",
    local_rank: int = -1,
    backend: str = "nccl",
):
    print(
        "world_size=%d rank=%d local_rank=%d " "distributed_init_method=%s backend=%s",
        world_size,
        rank,
        local_rank,
        distributed_init_method,
        backend,
    )
    if not torch.distributed.is_initialized():
        assert distributed_init_method is not None, (
            "distributed_init_method must be provided when initializing "
            "distributed environment"
        )
        # this backend is used for WORLD
        torch.distributed.init_process_group(
            backend=backend,
            init_method=distributed_init_method,
            world_size=world_size,
            rank=rank,
        )
    # set the local rank
    # local_rank is not available in torch ProcessGroup,
    # see https://github.com/pytorch/pytorch/issues/122816
    if local_rank == -1:
        # local rank not set, this usually happens in single-node
        # setting, where we can use rank as local rank
        if distributed_init_method == "env://":
            local_rank = envs.LOCAL_RANK
        else:
            local_rank = rank
    global _WORLD
    if _WORLD is None:
        ranks = list(range(torch.distributed.get_world_size()))
        _WORLD = init_world_group(ranks, local_rank, backend)
    else:
        assert (
            _WORLD.world_size == torch.distributed.get_world_size()
        ), "world group already initialized with a different world size"


def initialize_model_parallel(
    tensor_model_parallel_size: int = 1,
    pipeline_model_parallel_size: int = 1,
    backend: Optional[str] = None,
) -> None:
    """
    Initialize model parallel groups.

    Arguments:
        tensor_model_parallel_size: number of GPUs used for tensor model
            parallelism.
        pipeline_model_parallel_size: number of GPUs used for pipeline model
            parallelism.

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
    the model pipeline. The present function will
    create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
        4 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 pipeline model-parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size: int = torch.distributed.get_world_size()
    backend = backend or torch.distributed.get_backend(get_world_group().device_group)

    if world_size != tensor_model_parallel_size * pipeline_model_parallel_size:
        raise RuntimeError(
            f"world_size ({world_size}) is not equal to "
            f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
            f"pipeline_model_parallel_size ({pipeline_model_parallel_size})"
        )

    # Build the tensor model-parallel groups.
    num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
    global _TP
    assert _TP is None, "tensor model parallel group is already initialized"
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        ranks = list(
            range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
        )
        group_ranks.append(ranks)

    # message queue broadcaster is only used in tensor model parallel group
    _TP = init_model_parallel_group(
        group_ranks,
        get_world_group().local_rank,
        backend,
        use_message_queue_broadcaster=True,
        group_name="tp",
    )

    # Build the pipeline model-parallel groups.
    num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
    global _PP
    assert _PP is None, "pipeline model parallel group is already initialized"
    group_ranks = []
    for i in range(num_pipeline_model_parallel_groups):
        ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
        group_ranks.append(ranks)
    # pipeline parallel does not need custom allreduce
    _PP = init_model_parallel_group(
        group_ranks,
        get_world_group().local_rank,
        backend,
        use_custom_allreduce=False,
        group_name="pp",
    )


def ensure_model_parallel_initialized(
    tensor_model_parallel_size: int,
    pipeline_model_parallel_size: int,
    backend: Optional[str] = None,
) -> None:
    """Helper to initialize model parallel groups if they are not initialized,
    or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
    values if the model parallel groups are initialized.
    """
    backend = backend or torch.distributed.get_backend(get_world_group().device_group)
    if not model_parallel_is_initialized():
        initialize_model_parallel(
            tensor_model_parallel_size, pipeline_model_parallel_size, backend
        )
        return

    assert get_tensor_model_parallel_world_size() == tensor_model_parallel_size, (
        "tensor parallel group already initialized, but of unexpected size: "
        f"{get_tensor_model_parallel_world_size()=} vs. "
        f"{tensor_model_parallel_size=}"
    )
    pp_world_size = get_pp_group().world_size
    assert pp_world_size == pipeline_model_parallel_size, (
        "pipeline parallel group already initialized, but of unexpected size: "
        f"{pp_world_size=} vs. "
        f"{pipeline_model_parallel_size=}"
    )


def model_parallel_is_initialized():
    """Check if tensor and pipeline parallel groups are initialized."""
    return _TP is not None and _PP is not None


_TP_STATE_PATCHED = False


@contextmanager
def patch_tensor_parallel_group(tp_group: GroupCoordinator):
    """Patch the tp group temporarily until this function ends.

    This method is for draft workers of speculative decoding to run draft model
    with different tp degree from that of target model workers.

    Args:
        tp_group (GroupCoordinator): the tp group coordinator
    """
    global _TP_STATE_PATCHED
    assert not _TP_STATE_PATCHED, "Should not call when it's already patched"

    _TP_STATE_PATCHED = True
    old_tp_group = get_tp_group()
    global _TP
    _TP = tp_group
    try:
        yield
    finally:
        # restore the original state
        _TP_STATE_PATCHED = False
        _TP = old_tp_group


def get_tensor_model_parallel_world_size():
    """Return world size for the tensor model parallel group."""
    return get_tp_group().world_size


def get_tensor_model_parallel_rank():
    """Return my rank for the tensor model parallel group."""
    return get_tp_group().rank_in_group


def destroy_model_parallel():
    """Set the groups to none and destroy them."""
    global _TP
    if _TP:
        _TP.destroy()
    _TP = None

    global _PP
    if _PP:
        _PP.destroy()
    _PP = None


def destroy_distributed_environment():
    global _WORLD
    if _WORLD:
        _WORLD.destroy()
    _WORLD = None
    if torch.distributed.is_initialized():
        torch.distributed.destroy_process_group()


def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
    destroy_model_parallel()
    destroy_distributed_environment()
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    if shutdown_ray:
        import ray  # Lazy import Ray

        ray.shutdown()
    gc.collect()
    if not current_platform.is_cpu():
        torch.cuda.empty_cache()


def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]:
    """
    This is a collective operation that returns if each rank is in the same node
    as the source rank. It tests if processes are attached to the same
    memory system (shared access to shared memory).
    """
    assert (
        torch.distributed.get_backend(pg) != torch.distributed.Backend.NCCL
    ), "in_the_same_node_as should be tested with a non-NCCL group."
    # local rank inside the group
    rank = torch.distributed.get_rank(group=pg)
    world_size = torch.distributed.get_world_size(group=pg)

    # local tensor in each process to store the result
    is_in_the_same_node = torch.tensor([0] * world_size, dtype=torch.int32)

    # global ranks of the processes in the group
    ranks = torch.distributed.get_process_group_ranks(pg)

    magic_message = b"magic_message"
    shm = None

    try:
        with contextlib.suppress(OSError):
            if rank == source_rank:
                # create a shared memory segment
                shm = shared_memory.SharedMemory(create=True, size=128)
                shm.buf[: len(magic_message)] = magic_message
                torch.distributed.broadcast_object_list(
                    [shm.name], src=ranks[source_rank], group=pg
                )
                is_in_the_same_node[rank] = 1
            else:
                # try to open the shared memory segment
                recv = [None]
                torch.distributed.broadcast_object_list(
                    recv, src=ranks[source_rank], group=pg
                )
                name = recv[0]
                # fix to https://stackoverflow.com/q/62748654/9191338
                # Python incorrectly tracks shared memory even if it is not
                # created by the process. The following patch is a workaround.
                with patch(
                    "multiprocessing.resource_tracker.register",
                    lambda *args, **kwargs: None,
                ):
                    shm = shared_memory.SharedMemory(name=name)
                if shm.buf[: len(magic_message)] == magic_message:
                    is_in_the_same_node[rank] = 1
    except Exception as e:
        print("Error ignored in is_in_the_same_node: %s", e)
    finally:
        if shm:
            shm.close()

    torch.distributed.barrier(group=pg)

    # clean up the shared memory segment
    with contextlib.suppress(OSError):
        if rank == source_rank and shm:
            shm.unlink()
    torch.distributed.all_reduce(is_in_the_same_node, group=pg)

    return [x == 1 for x in is_in_the_same_node.tolist()]


================================================
FILE: archive/ktransformers/server/balance_serve/inference/distributed/pynccl.py
================================================
from contextlib import contextmanager
from typing import Optional, Union

# ===================== import region =====================
import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup, ReduceOp

from server.inference.distributed.pynccl_wrapper import (
    NCCLLibrary,
    buffer_type,
    cudaStream_t,
    ncclComm_t,
    ncclDataTypeEnum,
    ncclRedOpTypeEnum,
    ncclUniqueId,
)
from server.inference.distributed.utils import StatelessProcessGroup


class PyNcclCommunicator:

    def __init__(
        self,
        group: Union[ProcessGroup, StatelessProcessGroup],
        device: Union[int, str, torch.device],
        library_path: Optional[str] = None,
    ):
        """
        Args:
            group: the process group to work on. If None, it will use the
                default process group.
            device: the device to bind the PyNcclCommunicator to. If None,
                it will be bind to f"cuda:{local_rank}".
            library_path: the path to the NCCL library. If None, it will
                use the default library path.
        It is the caller's responsibility to make sure each communicator
        is bind to a unique device.
        """
        if not isinstance(group, StatelessProcessGroup):
            assert dist.is_initialized()
            assert (
                dist.get_backend(group) != dist.Backend.NCCL
            ), "PyNcclCommunicator should be attached to a non-NCCL group."
            # note: this rank is the rank in the group
            self.rank = dist.get_rank(group)
            self.world_size = dist.get_world_size(group)
        else:
            self.rank = group.rank
            self.world_size = group.world_size

        self.group = group

        # if world_size == 1, no need to create communicator
        if self.world_size == 1:
            self.available = False
            self.disabled = True
            self.stream = None
            return
        try:
            self.nccl = NCCLLibrary(library_path)
        except Exception:
            # disable because of missing NCCL library
            # e.g. in a non-GPU environment
            self.available = False
            self.disabled = True
            self.stream = None
            return

        self.available = True
        self.disabled = False

        print("vLLM is using nccl==%s", self.nccl.ncclGetVersion())

        if self.rank == 0:
            # get the unique id from NCCL
            self.unique_id = self.nccl.ncclGetUniqueId()
        else:
            # construct an empty unique id
            self.unique_id = ncclUniqueId()

        if not isinstance(group, StatelessProcessGroup):
            tensor = torch.ByteTensor(list(self.unique_id.internal))
            ranks = dist.get_process_group_ranks(group)
            # arg `src` in `broadcast` is the global rank
            dist.broadcast(tensor, src=ranks[0], group=group)
            byte_list = tensor.tolist()
            for i, byte in enumerate(byte_list):
                self.unique_id.internal[i] = byte
        else:
            self.unique_id = group.broadcast_obj(self.unique_id, src=0)
        if isinstance(device, int):
            device = torch.device(f"cuda:{device}")
        elif isinstance(device, str):
            device = torch.device(device)
        # now `device` is a `torch.device` object
        assert isinstance(device, torch.device)
        self.device = device
        # nccl communicator and stream will use this device
        # `torch.cuda.device` is a context manager that changes the
        # current cuda device to the specified one
        with torch.cuda.device(device):
            self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
                self.world_size, self.unique_id, self.rank
            )
            self.stream = torch.cuda.Stream()

            # A small all_reduce for warmup.
            data = torch.zeros(1, device=device)
            self.all_reduce(data)
            self.stream.synchronize()
            del data

        # by default it is disabled, e.g. in profiling models and prefill phase.
        # to use it, use under `with obj.change_state(enable=True)`, usually
        # when we are using CUDA graph.
        self.disabled = True

    def all_reduce(
        self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, stream=None
    ):
        if self.disabled:
            return
        # nccl communicator created on a specific device
        # will only work on tensors on the same device
        # otherwise it will cause "illegal memory access"
        assert tensor.device == self.device, (
            f"this nccl communicator is created to work on {self.device}, "
            f"but the input tensor is on {tensor.device}"
        )
        if stream is None:
            stream = self.stream
        self.nccl.ncclAllReduce(
            buffer_type(tensor.data_ptr()),
            buffer_type(tensor.data_ptr()),
            tensor.numel(),
            ncclDataTypeEnum.from_torch(tensor.dtype),
            ncclRedOpTypeEnum.from_torch(op),
            self.comm,
            cudaStream_t(stream.cuda_stream),
        )

    def send(self, tensor: torch.Tensor, dst: int, stream=None):
        if self.disabled:
            return
        assert tensor.device == self.device, (
            f"this nccl communicator is created to work on {self.device}, "
            f"but the input tensor is on {tensor.device}"
        )
        if stream is None:
            stream = self.stream
        self.nccl.ncclSend(
            buffer_type(tensor.data_ptr()),
            tensor.numel(),
            ncclDataTypeEnum.from_torch(tensor.dtype),
            dst,
            self.comm,
            cudaStream_t(stream.cuda_stream),
        )

    def recv(self, tensor: torch.Tensor, src: int, stream=None):
        if self.disabled:
            return
        assert tensor.device == self.device, (
            f"this nccl communicator is created to work on {self.device}, "
            f"but the input tensor is on {tensor.device}"
        )
        if stream is None:
            stream = self.stream
        self.nccl.ncclRecv(
            buffer_type(tensor.data_ptr()),
            tensor.numel(),
            ncclDataTypeEnum.from_torch(tensor.dtype),
            src,
            self.comm,
            cudaStream_t(stream.cuda_stream),
        )

    @contextmanager
    def change_state(
        self, enable: Optional[bool] = None, stream: Optional[torch.cuda.Stream] = None
    ):
        """
        A context manager to change the state of the communicator.
        """
        if enable is None:
            # guess a default value when not specified
            enable = self.available

        if stream is None:
            stream = self.stream

        old_disable = self.disabled
        old_stream = self.stream

        self.stream = stream
        self.disabled = not enable
        yield

        self.disabled = old_disable
        self.stream = old_stream


================================================
FILE: archive/ktransformers/server/balance_serve/inference/distributed/pynccl_wrapper.py
================================================
# This file is a pure Python wrapper for the NCCL library.
# The main purpose is to use NCCL combined with CUDA graph.
# Before writing this script, we tried the following approach:
# 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself
#  often gets stuck when initializing the NCCL communicator.
# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce`
#  contains many other potential cuda APIs, that are not allowed during
#  capturing the CUDA graph. For further details, please check
# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ .
#
# Another rejected idea is to write a C/C++ binding for NCCL. It is usually
# doable, but we often encounter issues related with nccl versions, and need
# to switch between different versions of NCCL. See
# https://github.com/NVIDIA/nccl/issues/1234 for more details.
# A C/C++ binding is not flexible enough to handle this. It requires
# recompilation of the code every time we want to switch between different
# versions. This current implementation, with a **pure** Python wrapper, is
# more flexible. We can easily switch between different versions of NCCL by
# changing the environment variable `VLLM_NCCL_SO_PATH`, or the `so_file`
# variable in the code.

import ctypes
import platform
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

import torch
from torch.distributed import ReduceOp

from server.utils import find_nccl_library


# === export types and functions from nccl to Python ===
# for the original nccl definition, please check
# https://github.com/NVIDIA/nccl/blob/master/src/nccl.h.in

ncclResult_t = ctypes.c_int
ncclComm_t = ctypes.c_void_p


class ncclUniqueId(ctypes.Structure):
    _fields_ = [("internal", ctypes.c_byte * 128)]


cudaStream_t = ctypes.c_void_p
buffer_type = ctypes.c_void_p

ncclDataType_t = ctypes.c_int


class ncclDataTypeEnum:
    ncclInt8 = 0
    ncclChar = 0
    ncclUint8 = 1
    ncclInt32 = 2
    ncclInt = 2
    ncclUint32 = 3
    ncclInt64 = 4
    ncclUint64 = 5
    ncclFloat16 = 6
    ncclHalf = 6
    ncclFloat32 = 7
    ncclFloat = 7
    ncclFloat64 = 8
    ncclDouble = 8
    ncclBfloat16 = 9
    ncclNumTypes = 10

    @classmethod
    def from_torch(cls, dtype: torch.dtype) -> int:
        if dtype == torch.int8:
            return cls.ncclInt8
        if dtype == torch.uint8:
            return cls.ncclUint8
        if dtype == torch.int32:
            return cls.ncclInt32
        if dtype == torch.int64:
            return cls.ncclInt64
        if dtype == torch.float16:
            return cls.ncclFloat16
        if dtype == torch.float32:
            return cls.ncclFloat32
        if dtype == torch.float64:
            return cls.ncclFloat64
        if dtype == torch.bfloat16:
            return cls.ncclBfloat16
        raise ValueError(f"Unsupported dtype: {dtype}")


ncclRedOp_t = ctypes.c_int


class ncclRedOpTypeEnum:
    ncclSum = 0
    ncclProd = 1
    ncclMax = 2
    ncclMin = 3
    ncclAvg = 4
    ncclNumOps = 5

    @classmethod
    def from_torch(cls, op: ReduceOp) -> int:
        if op == ReduceOp.SUM:
            return cls.ncclSum
        if op == ReduceOp.PRODUCT:
            return cls.ncclProd
        if op == ReduceOp.MAX:
            return cls.ncclMax
        if op == ReduceOp.MIN:
            return cls.ncclMin
        if op == ReduceOp.AVG:
            return cls.ncclAvg
        raise ValueError(f"Unsupported op: {op}")


@dataclass
class Function:
    name: str
    restype: Any
    argtypes: List[Any]


class NCCLLibrary:
    exported_functions = [
        # const char* ncclGetErrorString(ncclResult_t result)
        Function("ncclGetErrorString", ctypes.c_char_p, [ncclResult_t]),
        # ncclResult_t  ncclGetVersion(int *version);
        Function("ncclGetVersion", ncclResult_t,
                 [ctypes.POINTER(ctypes.c_int)]),
        # ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
        Function("ncclGetUniqueId", ncclResult_t,
                 [ctypes.POINTER(ncclUniqueId)]),
        # ncclResult_t  ncclCommInitRank(
        #   ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
        # note that ncclComm_t is a pointer type, so the first argument
        # is a pointer to a pointer
        Function("ncclCommInitRank", ncclResult_t, [
            ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId,
            ctypes.c_int
        ]),
        # ncclResult_t  ncclAllReduce(
        #   const void* sendbuff, void* recvbuff, size_t count,
        #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
        #   cudaStream_t stream);
        # note that cudaStream_t is a pointer type, so the last argument
        # is a pointer
        Function("ncclAllReduce", ncclResult_t, [
            buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
            ncclRedOp_t, ncclComm_t, cudaStream_t
        ]),

        # ncclResult_t  ncclSend(
        #   const void* sendbuff, size_t count, ncclDataType_t datatype,
        #   int dest, ncclComm_t comm, cudaStream_t stream);
        Function("ncclSend", ncclResult_t, [
            buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int,
            ncclComm_t, cudaStream_t
        ]),

        # ncclResult_t  ncclRecv(
        #   void* recvbuff, size_t count, ncclDataType_t datatype,
        #   int src, ncclComm_t comm, cudaStream_t stream);
        Function("ncclRecv", ncclResult_t, [
            buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int,
            ncclComm_t, cudaStream_t
        ]),

        # be cautious! this is a collective call, it will block until all
        # processes in the communicator have called this function.
        # because Python object destruction can happen in random order,
        # it is better not to call it at all.
        # ncclResult_t  ncclCommDestroy(ncclComm_t comm);
        Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
    ]

    # class attribute to store the mapping from the path to the library
    # to avoid loading the same library multiple times
    path_to_library_cache: Dict[str, Any] = {}

    # class attribute to store the mapping from library path
    #  to the corresponding dictionary
    path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}

    def __init__(self, so_file: Optional[str] = None):

        so_file = so_file or find_nccl_library()

        try:
            if so_file not in NCCLLibrary.path_to_dict_mapping:
                lib = ctypes.CDLL(so_file)
                NCCLLibrary.path_to_library_cache[so_file] = lib
            self.lib = NCCLLibrary.path_to_library_cache[so_file]
        except Exception as e:
            print(
                "Failed to load NCCL library from %s ."
                "It is expected if you are not running on NVIDIA/AMD GPUs."
                "Otherwise, the nccl library might not exist, be corrupted "
                "or it does not support the current platform %s."
                "If you already have the library, please set the "
                "environment variable VLLM_NCCL_SO_PATH"
                " to point to the correct nccl library path.", so_file,
                platform.platform())
            raise e

        if so_file not in NCCLLibrary.path_to_dict_mapping:
            _funcs: Dict[str, Any] = {}
            for func in NCCLLibrary.exported_functions:
                f = getattr(self.lib, func.name)
                f.restype = func.restype
                f.argtypes = func.argtypes
                _funcs[func.name] = f
            NCCLLibrary.path_to_dict_mapping[so_file] = _funcs
        self._funcs = NCCLLibrary.path_to_dict_mapping[so_file]

    def ncclGetErrorString(self, result: ncclResult_t) -> str:
        return self._funcs["ncclGetErrorString"](result).decode("utf-8")

    def NCCL_CHECK(self, result: ncclResult_t) -> None:
        if result != 0:
            error_str = self.ncclGetErrorString(result)
            raise RuntimeError(f"NCCL error: {error_str}")

    def ncclGetVersion(self) -> str:
        version = ctypes.c_int()
        self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version)))
        version_str = str(version.value)
        # something like 21903 --> "2.19.3"
        major = version_str[0].lstrip("0")
        minor = version_str[1:3].lstrip("0")
        patch = version_str[3:].lstrip("0")
        return f"{major}.{minor}.{patch}"

    def ncclGetUniqueId(self) -> ncclUniqueId:
        unique_id = ncclUniqueId()
        self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](
            ctypes.byref(unique_id)))
        return unique_id

    def ncclCommInitRank(self, world_size: int, unique_id: ncclUniqueId,
                         rank: int) -> ncclComm_t:
        comm = ncclComm_t()
        self.NCCL_CHECK(self._funcs["ncclCommInitRank"](ctypes.byref(comm),
                                                        world_size, unique_id,
                                                        rank))
        return comm

    def ncclAllReduce(self, sendbuff: buffer_type, recvbuff: buffer_type,
                      count: int, datatype: int, op: int, comm: ncclComm_t,
                      stream: cudaStream_t) -> None:
        # `datatype` actually should be `ncclDataType_t`
        # and `op` should be `ncclRedOp_t`
        # both are aliases of `ctypes.c_int`
        # when we pass int to a function, it will be converted to `ctypes.c_int`
        # by ctypes automatically
        self.NCCL_CHECK(self._funcs["ncclAllReduce"](sendbuff, recvbuff, count,
                                                     datatype, op, comm,
                                                     stream))

    def ncclSend(self, sendbuff: buffer_type, count: int, datatype: int,
                 dest: int, comm: ncclComm_t, stream: cudaStream_t) -> None:
        self.NCCL_CHECK(self._funcs["ncclSend"](sendbuff, count, datatype,
                                                dest, comm, stream))

    def ncclRecv(self, recvbuff: buffer_type, count: int, datatype: int,
                 src: int, comm: ncclComm_t, stream: cudaStream_t) -> None:
        self.NCCL_CHECK(self._funcs["ncclRecv"](recvbuff, count, datatype, src,
                                                comm, stream))

    def ncclCommDestroy(self, comm: ncclComm_t) -> None:
        self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))


__all__ = [
    "NCCLLibrary", "ncclDataTypeEnum", "ncclRedOpTypeEnum", "ncclUniqueId",
    "ncclComm_t", "cudaStream_t", "buffer_type"
]


================================================
FILE: archive/ktransformers/server/balance_serve/inference/distributed/utils.py
================================================
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import dataclasses
import pickle
import time
from collections import deque
from typing import Any, Deque, Dict, Optional, Sequence, Tuple

import torch
from torch.distributed import TCPStore

import server.envs as envs


def ensure_divisibility(numerator, denominator):
    """Ensure that numerator is divisible by the denominator."""
    assert numerator % denominator == 0, "{} is not divisible by {}".format(
        numerator, denominator
    )


def divide(numerator, denominator):
    """Ensure that numerator is divisible by the denominator and return
    the division value."""
    ensure_divisibility(numerator, denominator)
    return numerator // denominator


def split_tensor_along_last_dim(
    tensor: torch.Tensor,
    num_partitions: int,
    contiguous_split_chunks: bool = False,
) -> Sequence[torch.Tensor]:
    """Split a tensor along its last dimension.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.

    Returns:
        A list of Tensors
    """
    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
    # Split.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    # NOTE: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)

    return tensor_list


def get_pp_indices(
    num_hidden_layers: int, pp_rank: int, pp_size: int
) -> Tuple[int, int]:
    """Try to evenly distribute layers across partitions.
    If the number of layers is not divisible by the number of partitions,
    the last partition will have the remaining layers.
    """
    partition_list_str = envs.VLLM_PP_LAYER_PARTITION
    if partition_list_str is not None:
        try:
            partitions = [int(layer) for layer in partition_list_str.split(",")]
        except ValueError as err:
            raise ValueError(
                "Invalid partition string: {}".format(partition_list_str)
            ) from err
        if len(partitions) != pp_size:
            raise ValueError(f"{len(partitions)=} does not match {pp_size=}.")
        if sum(partitions) != num_hidden_layers:
            raise ValueError(f"{sum(partitions)=} does not match {num_hidden_layers=}.")
        start_layer = sum(partitions[:pp_rank])
        end_layer = start_layer + partitions[pp_rank]
    else:
        layers_per_partition = num_hidden_layers // pp_size
        start_layer = pp_rank * layers_per_partition
        end_layer = start_layer + layers_per_partition

        if pp_rank == pp_size - 1:
            end_layer = num_hidden_layers

    return (start_layer, end_layer)


@dataclasses.dataclass
class StatelessProcessGroup:
    """A dataclass to hold a metadata store, and the rank, world_size of the
    group. Only use it to communicate metadata between processes.
    For data-plane communication, create NCCL-related objects.
    """

    rank: int
    world_size: int
    store: torch._C._distributed_c10d.Store
    data_expiration_seconds: int = 3600  # 1 hour

    # dst rank -> counter
    send_dst_counter: Dict[int, int] = dataclasses.field(default_factory=dict)
    # src rank -> counter
    recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict)
    broadcast_send_counter: int = 0
    broadcast_recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict)

    # A deque to store the data entries, with key and timestamp.
    entries: Deque[Tuple[str, float]] = dataclasses.field(default_factory=deque)

    def __post_init__(self):
        assert self.rank < self.world_size
        self.send_dst_counter = {i: 0 for i in range(self.world_size)}
        self.recv_src_counter = {i: 0 for i in range(self.world_size)}
        self.broadcast_recv_src_counter = {i: 0 for i in range(self.world_size)}

    def send_obj(self, obj: Any, dst: int):
        """Send an object to a destination rank."""
        self.expire_data()
        key = f"send_to/{dst}/{self.send_dst_counter[dst]}"
        self.store.set(key, pickle.dumps(obj))
        self.send_dst_counter[dst] += 1
        self.entries.append((key, time.time()))

    def expire_data(self):
        """Expire data that is older than `data_expiration_seconds` seconds."""
        while self.entries:
            # check the oldest entry
            key, timestamp = self.entries[0]
            if time.time() - timestamp > self.data_expiration_seconds:
                self.store.delete_key(key)
                self.entries.popleft()
            else:
                break

    def recv_obj(self, src: int) -> Any:
        """Receive an object from a source rank."""
        obj = pickle.loads(
            self.store.get(f"send_to/{self.rank}/{self.recv_src_counter[src]}")
        )
        self.recv_src_counter[src] += 1
        return obj

    def broadcast_obj(self, obj: Optional[Any], src: int) -> Any:
        """Broadcast an object from a source rank to all other ranks.
        It does not clean up after all ranks have received the object.
        Use it for limited times, e.g., for initialization.
        """
        if self.rank == src:
            self.expire_data()
            key = f"broadcast_from/{src}/" f"{self.broadcast_send_counter}"
            self.store.set(key, pickle.dumps(obj))
            self.broadcast_send_counter += 1
            self.entries.append((key, time.time()))
            return obj
        else:
            key = f"broadcast_from/{src}/" f"{self.broadcast_recv_src_counter[src]}"
            recv_obj = pickle.loads(self.store.get(key))
            self.broadcast_recv_src_counter[src] += 1
            return recv_obj

    def all_gather_obj(self, obj: Any) -> list[Any]:
        """All gather an object from all ranks."""
        gathered_objs = []
        for i in range(self.world_size):
            if i == self.rank:
                gathered_objs.append(obj)
                self.broadcast_obj(obj, src=self.rank)
            else:
                recv_obj = self.broadcast_obj(None, src=i)
                gathered_objs.append(recv_obj)
        return gathered_objs

    def barrier(self):
        """A barrier to synchronize all ranks."""
        for i in range(self.world_size):
            if i == self.rank:
                self.broadcast_obj(None, src=self.rank)
            else:
                self.broadcast_obj(None, src=i)

    @staticmethod
    def create(
        host: str,
        port: int,
        rank: int,
        world_size: int,
        data_expiration_seconds: int = 3600,
    ) -> "StatelessProcessGroup":
        """A replacement for `torch.distributed.init_process_group` that does not
        pollute the global state.

        If we have process A and process B called `torch.distributed.init_process_group`
        to form a group, and then we want to form another group with process A, B, C,
        D, it is not possible in PyTorch, because process A and process B have already
        formed a group, and process C and process D cannot join that group. This
        function is a workaround for this issue.

        `torch.distributed.init_process_group` is a global call, while this function
        is a stateless call. It will return a `StatelessProcessGroup` object that can be
        used for exchanging metadata. With this function, process A and process B
        can call `StatelessProcessGroup.create` to form a group, and then process A, B,
        C, and D can call `StatelessProcessGroup.create` to form another group.
        """  # noqa
        store = TCPStore(
            host_name=host,
            port=port,
            world_size=world_size,
            is_master=(rank == 0),
        )

        return StatelessProcessGroup(
            rank=rank,
            world_size=world_size,
            store=store,
            data_expiration_seconds=data_expiration_seconds,
        )


================================================
FILE: archive/ktransformers/server/balance_serve/inference/forward_batch.py
================================================
'''
Date: 2024-11-12 14:15:16
LastEditors: Xie Weiyu ervinxie@qq.com
LastEditTime: 2024-11-26 08:12:49
'''
import torch
try:
    import torch_npu
    use_torch_npu = torch_npu.npu.is_available()
except:
    use_torch_npu = False
from ktransformers.server.balance_serve.settings import sched_ext
from ktransformers.server.balance_serve.inference.query_manager import QueryManager, QueryInfo
from typing import Union
import time
from ktransformers.server.config.config import Config

class ForwardMiniBatchCombine:
    q_indptr: torch.Tensor
    kv_indptr: torch.Tensor
    kv_indices: torch.Tensor
    kv_last_page_len: torch.Tensor
    kv_len: torch.Tensor
    position_ids: torch.Tensor
    tokens: torch.Tensor
    batch_indices: torch.Tensor
    positions: torch.Tensor
    chunk_size: int
    decode_batch: int        
    is_last_prefill_chunk: bool
    logits_start: list

    temperatures: torch.Tensor
    top_ps: torch.Tensor

    def __init__(self, prefill_querys_info: list[QueryInfo], decode_querys_info: list[QueryInfo], prefill_s: list[int] = None, prefill_l: list[int] = None, device = torch.device('cuda'), page_size = 256):
        batch_decode = len(decode_querys_info)
        batch_prefill = len(prefill_querys_info)

        self.q_indptr = torch.tensor([0], device=device, dtype=torch.int32)
        self.kv_indptr = torch.tensor([0], device=device, dtype=torch.int32)
        self.kv_indices = torch.tensor([], device=device, dtype=torch.int32)
        self.kv_len = torch.tensor([], device=device, dtype=torch.int32)
        self.kv_last_page_len = torch.tensor([], device=device, dtype=torch.int32)
        self.position_ids = torch.tensor([], device=device, dtype=torch.int32)
        self.tokens = torch.tensor([], device=device, dtype=torch.int32)

        self.temperatures = torch.tensor([], device=device, dtype=torch.float32)
        self.top_ps = torch.tensor([], device=device, dtype=torch.float32)

        self.logits_start = []
        self.decode_batch = batch_decode
        self.num_tokens = batch_decode + sum(prefill_l)
        self.batch_size = batch_decode + batch_prefill
        
        for i, prefill_query_info in enumerate(prefill_querys_info):
            if prefill_query_info != None:
                prefill_kv_block_len = (prefill_query_info.active_position + prefill_l[i] + page_size - 1) // page_size if prefill_query_info is not None else 0
                # print(f"block_len: {prefill_kv_block_len}, page_size: {page_size}")
                self.q_indptr = torch.concat((self.q_indptr, torch.tensor([prefill_l[i] + self.q_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
                self.kv_indptr = torch.concat((self.kv_indptr, torch.tensor([prefill_kv_block_len + self.kv_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
                self.kv_indices = torch.concat((self.kv_indices, prefill_query_info.block_index[:prefill_kv_block_len]), dim=0)
                self.kv_last_page_len = torch.concat((self.kv_last_page_len, torch.tensor([(prefill_query_info.active_position + prefill_l[i]) % page_size if (prefill_query_info.active_position + prefill_l[i]) % page_size != 0 else page_size], device=device, dtype=torch.int32)), dim=0)
                self.kv_len = torch.concat((self.kv_len, torch.tensor([(prefill_query_info.active_position + prefill_l[i])], device=device, dtype=torch.int32)), dim=0)
                self.position_ids = torch.concat((self.position_ids, torch.arange(prefill_s[i], prefill_l[i] + prefill_s[i], device=device, dtype=torch.int32)), dim=0)
                self.tokens = torch.concat((self.tokens, prefill_query_info.query_tokens[prefill_s[i]:prefill_s[i] + prefill_l[i]]), dim=0)
                self.logits_start.append(prefill_l[i] - 1 if len(self.logits_start) == 0 else sum(prefill_l[:i+1])-1)

                self.temperatures = torch.concat((self.temperatures, torch.tensor([prefill_query_info.temperature], device=device, dtype=torch.float32)), dim=0)
                self.top_ps = torch.concat((self.top_ps, torch.tensor([prefill_query_info.top_p], device=device, dtype=torch.float32)), dim=0)

        for decode_query_info in decode_querys_info:
            decode_kv_block_len = (decode_query_info.active_position + 1 + page_size - 1) // page_size
            self.q_indptr = torch.concat((self.q_indptr, torch.tensor([1 + self.q_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
            self.kv_indptr = torch.concat((self.kv_indptr, torch.tensor([decode_kv_block_len+self.kv_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
            self.kv_indices = torch.concat((self.kv_indices, decode_query_info.block_index[:decode_kv_block_len]), dim=0)
            self.kv_last_page_len = torch.concat((self.kv_last_page_len, torch.tensor([(decode_query_info.active_position + 1) % page_size if (decode_query_info.active_position + 1) % page_size != 0 else page_size], device=device, dtype=torch.int32)), dim=0)
            self.kv_len = torch.concat((self.kv_len, torch.tensor([(decode_query_info.active_position + 1)], device=device, dtype=torch.int32)), dim=0)
            self.position_ids = torch.concat((self.position_ids, torch.arange(decode_query_info.active_position, decode_query_info.active_position + 1, device=device, dtype=torch.int32)), dim=0)
            if decode_query_info.active_position > 0:
                self.tokens = torch.concat((self.tokens, decode_query_info.query_tokens[decode_query_info.active_position:decode_query_info.active_position+1]), dim=0)
            else: 
                self.tokens = torch.concat((self.tokens, torch.tensor([0], device=device, dtype=torch.int32)), dim=0)
            self.logits_start.append(0 if len(self.logits_start) == 0 else self.logits_start[-1]+1)

            self.temperatures = torch.concat((self.temperatures, torch.tensor([decode_query_info.temperature], device=device, dtype=torch.float32)), dim=0)
            self.top_ps = torch.concat((self.top_ps, torch.tensor([decode_query_info.top_p], device=device, dtype=torch.float32)), dim=0)

        self.q_indptr = self.q_indptr.contiguous()
        self.kv_indptr = self.kv_indptr.contiguous()
        self.kv_indices = self.kv_indices.contiguous()
        self.kv_len = self.kv_len.contiguous()
        self.kv_last_page_len = self.kv_last_page_len.contiguous()
        self.position_ids = self.position_ids.contiguous()
        self.tokens = self.tokens.contiguous()

        self.bsz_tensor = torch.tensor([self.batch_size], device=device, dtype=torch.int32)

    def fill(self, prefill_querys_info: list[QueryInfo], decode_querys_info: list[QueryInfo], prefill_s: list[int] = None, prefill_l: list[int] = None, device = torch.device('cuda'), page_size = 256):
        batch_decode = len(decode_querys_info)
        batch_prefill = len(prefill_querys_info)

        self.q_indptr = torch.tensor([0], device=device, dtype=torch.int32)
        self.kv_indptr = torch.tensor([0], device=device, dtype=torch.int32)
        self.kv_indices = torch.tensor([], device=device, dtype=torch.int32)
        self.kv_len = torch.tensor([], device=device, dtype=torch.int32)
        self.kv_last_page_len = torch.tensor([], device=device, dtype=torch.int32)
        new_position_ids = torch.tensor([], device=device, dtype=torch.int32)
        new_tokens = torch.tensor([], device=device, dtype=torch.int32)

        self.temperatures = torch.tensor([], device=device, dtype=torch.float32)
        self.top_ps = torch.tensor([], device=device, dtype=torch.float32)

        self.logits_start = []
        self.decode_batch = batch_decode
        self.num_tokens = batch_decode + sum(prefill_l)
        self.batch_size = batch_decode + batch_prefill

        for i, prefill_query_info in enumerate(prefill_querys_info):
            prefill_kv_block_len = (prefill_query_info.active_position + prefill_l[i] + page_size - 1) // page_size if prefill_query_info is not None else 0
        # print(f"block_len: {prefill_kv_block_len}, page_size: {page_size}")
            self.q_indptr = torch.concat((self.q_indptr, torch.tensor([prefill_l[i] + self.q_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
            self.kv_indptr = torch.concat((self.kv_indptr, torch.tensor([prefill_kv_block_len + self.kv_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
            self.kv_indices = torch.concat((self.kv_indices, prefill_query_info.block_index[:prefill_kv_block_len]), dim=0)
            self.kv_last_page_len = torch.concat((self.kv_last_page_len, torch.tensor([(prefill_query_info.active_position + prefill_l[i]) % page_size if (prefill_query_info.active_position + prefill_l[i]) % page_size != 0 else page_size], device=device, dtype=torch.int32)), dim=0)
            self.kv_len = torch.concat((self.kv_len, torch.tensor([(prefill_query_info.active_position + prefill_l[i])], device=device, dtype=torch.int32)), dim=0)
            new_position_ids = torch.concat((new_position_ids, torch.arange(prefill_s[i], prefill_l[i] + prefill_s[i], device=device, dtype=torch.int32)), dim=0)
            new_tokens = torch.concat((new_tokens, prefill_query_info.query_tokens[prefill_s[i]:prefill_s[i] + prefill_l[i]]), dim=0)
            self.logits_start.append(prefill_l[i] - 1 if len(self.logits_start) == 0 else sum(prefill_l[:i+1])-1)

            self.temperatures = torch.concat((self.temperatures, torch.tensor([prefill_query_info.temperature], device=device, dtype=torch.float32)), dim=0)
            self.top_ps = torch.concat((self.top_ps, torch.tensor([prefill_query_info.top_p], device=device, dtype=torch.float32)), dim=0)


        for decode_query_info in decode_querys_info:
            decode_kv_block_len = (decode_query_info.active_position + 1 + page_size - 1) // page_size
            self.q_indptr = torch.concat((self.q_indptr, torch.tensor([1 + self.q_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
            self.kv_indptr = torch.concat((self.kv_indptr, torch.tensor([decode_kv_block_len+self.kv_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
            self.kv_indices = torch.concat((self.kv_indices, decode_query_info.block_index[:decode_kv_block_len]), dim=0)
            self.kv_last_page_len = torch.concat((self.kv_last_page_len, torch.tensor([(decode_query_info.active_position + 1) % page_size if (decode_query_info.active_position + 1) % page_size != 0 else page_size], device=device, dtype=torch.int32)), dim=0)
            self.kv_len = torch.concat((self.kv_len, torch.tensor([(decode_query_info.active_position + 1)], device=device, dtype=torch.int32)), dim=0)
            new_position_ids = torch.concat((new_position_ids, torch.arange(decode_query_info.active_position, decode_query_info.active_position + 1, device=device, dtype=torch.int32)), dim=0)
            if decode_query_info.active_position > 0:
                new_tokens = torch.concat((new_tokens, decode_query_info.query_tokens[decode_query_info.active_position:decode_query_info.active_position+1]), dim=0)
            else: 
                new_tokens = torch.concat((new_tokens, torch.tensor([0], device=device, dtype=torch.int32)), dim=0)
            self.logits_start.append(0 if len(self.logits_start) == 0 else self.logits_start[-1]+1)

            self.temperatures = torch.concat((self.temperatures, torch.tensor([decode_query_info.temperature], device=device, dtype=torch.float32)), dim=0)
            self.top_ps = torch.concat((self.top_ps, torch.tensor([decode_query_info.top_p], device=device, dtype=torch.float32)), dim=0)


        self.q_indptr = self.q_indptr.contiguous()
        self.kv_indptr = self.kv_indptr.contiguous()
        self.kv_indices = self.kv_indices.contiguous()
        self.kv_len = self.kv_len.contiguous()
        self.kv_last_page_len = self.kv_last_page_len.contiguous()

        self.bsz_tensor = torch.tensor([self.batch_size], device=device, dtype=torch.int32)
        
        # copy new_position_ids and new_tokens to self.position_ids and self.tokens
        # print("new_position_ids: ", new_position_ids)
        # self.print()
        self.position_ids[:new_position_ids.size(0)].copy_(new_position_ids)
        self.position_ids[new_position_ids.size(0):].zero_()
        self.tokens[:new_tokens.size(0)].copy_(new_tokens)

    def __str__(self):
        ret = ''
        ret += f'=====flash infer forward info:\n'
        ret += f'q_indptr: {self.q_indptr}, kv_indptr: {self.kv_indptr}, kv_indices: {self.kv_indices}\n'
        ret += f'kv_len: {self.kv_len}, kv_last_page_len: {self.kv_last_page_len}, bsz_tensor: {self.bsz_tensor}\n'
        ret += f'position_ids: {self.position_ids}, tokens: {self.tokens}\n'
        return ret


class ForwardMiniBatchSplit:
    # NPU 流程 prefill 和 decode 分开打包
    prefill_batch: int
    p_q_len: torch.Tensor               # (bsz)
    p_kv_len: torch.Tensor              # (bsz)
    p_position_ids: torch.Tensor        # (sum(q_len))
    p_tokens: torch.Tensor              # (sum(q_len))
    p_temperatures: torch.Tensor        # (bsz)
    p_top_ps: torch.Tensor              # (bsz)
    p_block_tables: torch.Tensor        # (bsz, max_page_num)
    p_logits_start: list

    decode_batch: int
    d_q_len: torch.Tensor
    d_kv_len: torch.Tensor
    d_position_ids: torch.Tensor
    d_tokens: torch.Tensor
    d_temperatures: torch.Tensor
    d_top_ps: torch.Tensor
    d_block_tables: torch.Tensor        # (bsz, max_page_num)
    d_logits_start: list

    chunk_size: int
    is_last_prefill_chunk: bool

    def __init__(
        self,
        prefill_querys_info: list[QueryInfo],
        decode_querys_info: list[QueryInfo],
        prefill_s: list[int] = None,
        prefill_l: list[int] = None,
        device=None,
        page_size: int = 256,
        max_page_num: int = 64,
        decode_padding_len: int = 1,
    ):
        # 统一 NPU 设备
        device = torch.device('npu')

        if prefill_s is None or prefill_l is None:
            raise ValueError(
                "[ForwardMiniBatchSplit.__init__] prefill_s / prefill_l 不能为空，chunk prefill 需要这两个参数"
            )

        # 过滤掉 None
        new_prefill_querys_info: list[QueryInfo] = [
            info for info in prefill_querys_info if info is not None
        ]
        batch_prefill = len(new_prefill_querys_info)
        batch_decode = len(decode_querys_info)

        self.prefill_batch = batch_prefill
        self.decode_batch = batch_decode
        self.batch_size = batch_prefill + batch_decode
        self.num_tokens = batch_decode * decode_padding_len + sum(prefill_l)

        self.chunk_size = prefill_l[0] if prefill_l else 0

        self.is_last_prefill_chunk = True
        for i, q in enumerate(new_prefill_querys_info):
            end_pos = prefill_s[i] + prefill_l[i]
            if end_pos < q.query_length:
                self.is_last_prefill_chunk = False
                break

        # ====================== Prefill 部分 ======================
        self.p_q_len = torch.tensor([], device=device, dtype=torch.int32)
        self.p_kv_len = torch.tensor([], device=device, dtype=torch.int32)
        self.p_position_ids = torch.tensor([], device=device, dtype=torch.int32)
        self.p_block_tables = -1 * torch.ones(
            [self.prefill_batch, max_page_num], device=device, dtype=torch.int32
        )
        self.p_tokens = torch.tensor([], device=device, dtype=torch.int32)

        self.p_temperatures = torch.tensor([], device=device, dtype=torch.float32)
        self.p_top_ps = torch.tensor([], device=device, dtype=torch.float32)
        self.p_logits_start: list[int] = []

        for i, prefill_query_info in enumerate(new_prefill_querys_info):
            qid = getattr(prefill_query_info, "id", -1)

            past_len = int(prefill_query_info.active_position)
            start = int(prefill_s[i])                            # current chunk's start position in query_tokens
            chunk_len = int(prefill_l[i])
            kv_len = past_len + chunk_len
            prefill_kv_block_len = (kv_len + page_size - 1) // page_size

            # Q length = current chunk length
            self.p_q_len = torch.concat(
                (
                    self.p_q_len,
                    torch.tensor([chunk_len], device=device, dtype=torch.int32),
                ),
                dim=0,
            )
            self.p_kv_len = torch.concat(
                (
                    self.p_kv_len,
                    torch.tensor([kv_len], device=device, dtype=torch.int32),
                ),
                dim=0,
            )

            self.p_block_tables[i, :prefill_kv_block_len] = prefill_query_info.block_index[
                :prefill_kv_block_len
            ]

            self.p_position_ids = torch.concat(
                (
                    self.p_position_ids,
                    torch.arange(
                        start,
                        start + chunk_len,
                        device=device,
                        dtype=torch.int32,
                    ),
                ),
                dim=0,
            )

            self.p_tokens = torch.concat(
                (
                    self.p_tokens,
                    prefill_query_info.query_tokens[start : start + chunk_len],
                ),
                dim=0,
            )

            self.p_logits_start.append(
                chunk_len - 1
                if len(self.p_logits_start) == 0
                else sum(prefill_l[: i + 1]) - 1
            )

            self.p_temperatures = torch.concat(
                (
                    self.p_temperatures,
                    torch.tensor(
                        [prefill_query_info.temperature],
                        device=device,
                        dtype=torch.float32,
                    ),
                ),
                dim=0,
            )
            self.p_top_ps = torch.concat(
                (
                    self.p_top_ps,
                    torch.tensor(
                        [prefill_query_info.top_p],
                        device=device,
                        dtype=torch.float32,
                    ),
                ),
                dim=0,
            )

        # ====================== Decode ======================
        self.d_q_len = torch.tensor([], device=device, dtype=torch.int32)
        self.d_kv_len = torch.tensor([], device=device, dtype=torch.int32)
        self.d_position_ids = torch.tensor([], device=device, dtype=torch.int32)
        self.d_block_tables = -1 * torch.ones(
            [self.decode_batch, max_page_num], device=device, dtype=torch.int32
        )
        self.d_tokens = torch.tensor([], device=device, dtype=torch.int32)

        self.d_temperatures = torch.tensor([], device=device, dtype=torch.float32)
        self.d_top_ps = torch.tensor([], device=device, dtype=torch.float32)
        self.d_logits_start: list[int] = []

        for i, decode_query_info in enumerate(decode_querys_info):
            qid = getattr(decode_query_info, "id", -1)
            past_len = int(decode_query_info.active_position)
            decode_kv_block_len = (past_len + decode_padding_len + page_size - 1) // page_size

            self.d_q_len = torch.concat(
                (
                    self.d_q_len,
                    torch.tensor(
                        [decode_padding_len], device=device, dtype=torch.int32
                    ),
                ),
                dim=0,
            )
            self.d_kv_len = torch.concat(
                (
                    self.d_kv_len,
                    torch.tensor(
                        [past_len + decode_padding_len],
                        device=device,
                        dtype=torch.int32,
                    ),
                ),
                dim=0,
            )

            self.d_block_tables[i, :decode_kv_block_len] = decode_query_info.block_index[
                :decode_kv_block_len
            ]

            self.d_position_ids = torch.concat(
                (
                    self.d_position_ids,
                    torch.arange(
                        past_len,
                        past_len + decode_padding_len,
                        device=device,
                        dtype=torch.int32,
                    ),
                ),
                dim=0,
            )

            if past_len > 0:
                self.d_tokens = torch.concat(
                    (
                        self.d_tokens,
                        decode_query_info.query_tokens[
                            past_len : past_len + decode_padding_len
                        ],
                    ),
                    dim=0,
                )
            else:
                self.d_tokens = torch.concat(
                    (
                        self.d_tokens,
                        torch.tensor(
                            [0] * decode_padding_len,
                            device=device,
                            dtype=torch.int32,
                        ),
                    ),
                    dim=0,
                )

            self.d_logits_start.append(
                0
                if len(self.d_logits_start) == 0
                else self.d_logits_start[-1] + decode_padding_len
            )

            self.d_temperatures = torch.concat(
                (
                    self.d_temperatures,
                    torch.tensor(
                        [decode_query_info.temperature],
                        device=device,
                        dtype=torch.float32,
                    ),
                ),
                dim=0,
            )
            self.d_top_ps = torch.concat(
                (
                    self.d_top_ps,
                    torch.tensor(
                        [decode_query_info.top_p],
                        device=device,
                        dtype=torch.float32,
                    ),
                ),
                dim=0,
            )

        self.p_q_len = self.p_q_len.contiguous()
        self.p_kv_len = self.p_kv_len.contiguous()
        self.p_block_tables = self.p_block_tables.contiguous()
        self.p_position_ids = self.p_position_ids.contiguous()
        self.p_tokens = self.p_tokens.contiguous()

        if self.decode_batch > 1:
            self.d_q_len = self.d_q_len.reshape(self.decode_batch, -1).contiguous()
            self.d_kv_len = self.d_kv_len.reshape(self.decode_batch, -1).contiguous()
            self.d_kv_len_list = self.d_kv_len.flatten().tolist()
            self.d_block_tables = self.d_block_tables.contiguous()
            self.d_position_ids = self.d_position_ids.reshape(self.decode_batch, -1).contiguous()
            self.d_tokens = self.d_tokens.reshape(self.decode_batch, -1).contiguous()
        else:
            self.d_q_len = self.d_q_len.contiguous()
            self.d_kv_len = self.d_kv_len.contiguous()
            self.d_kv_len_list = self.d_kv_len.flatten().tolist()
            self.d_block_tables = self.d_block_tables.contiguous()
            self.d_position_ids = self.d_position_ids.contiguous()
            self.d_tokens = self.d_tokens.contiguous()

        self.bsz_tensor = torch.tensor([self.batch_size], device=device, dtype=torch.int32)


    def fill(
        self,
        prefill_querys_info: list[QueryInfo],
        decode_querys_info: list[QueryInfo],
        prefill_s: list[int] = None,
        prefill_l: list[int] = None,
        decode_padding_len: int = 1,
        device=None,
        page_size: int = 256,
        max_page_num: int = 64,
    ):
        device = torch.device('npu')

        if prefill_s is None or prefill_l is None:
            raise ValueError(
                "[ForwardMiniBatchSplit.fill] prefill_s / prefill_l 不能为空，chunk prefill 需要这两个参数"
            )

        page_size = 128

        new_prefill_querys_info: list[QueryInfo] = [
            info for info in prefill_querys_info if info is not None
        ]
        batch_prefill = len(new_prefill_querys_info)
        batch_decode = len(decode_querys_info)

        self.prefill_batch = batch_prefill
        self.decode_batch = batch_decode
        self.batch_size = batch_prefill + batch_decode
        self.num_tokens = batch_decode * decode_padding_len + sum(prefill_l)

        self.chunk_size = prefill_l[0] if prefill_l else 0
        self.is_last_prefill_chunk = True
        for i, q in enumerate(new_prefill_querys_info):
            end_pos = prefill_s[i] + prefill_l[i]
            if end_pos < q.query_length:
                self.is_last_prefill_chunk = False
                break

        # ---------- Prefill ----------
        self.p_q_len = torch.tensor([], device=device, dtype=torch.int32)
        self.p_kv_len = torch.tensor([], device=device, dtype=torch.int32)
        new_p_position_ids = torch.tensor([], device=device, dtype=torch.int32)
        self.p_block_tables = torch.zeros(
            [self.prefill_batch, max_page_num], device=device, dtype=torch.int32
        )
        new_p_tokens = torch.tensor([], device=device, dtype=torch.int32)

        self.p_temperatures = torch.tensor([], device=device, dtype=torch.float32)
        self.p_top_ps = torch.tensor([], device=device, dtype=torch.float32)
        self.p_logits_start = []

        for i, prefill_query_info in enumerate(new_prefill_querys_info):
            qid = getattr(prefill_query_info, "id", -1)
            past_len = int(prefill_query_info.active_position)
            start = int(prefill_s[i])
            chunk_len = int(prefill_l[i])

            kv_len = past_len + chunk_len
            prefill_kv_block_len = (kv_len + page_size - 1) // page_size

            self.p_q_len = torch.concat(
                (
                    self.p_q_len,
                    torch.tensor([chunk_len], device=device, dtype=torch.int32),
                ),
                dim=0,
            )
            self.p_kv_len = torch.concat(
                (
                    self.p_kv_len,
                    torch.tensor([kv_len], device=device, dtype=torch.int32),
                ),
                dim=0,
            )
            self.p_block_tables[i, :prefill_kv_block_len] = prefill_query_info.block_index[
                :prefill_kv_block_len
            ]

            new_p_position_ids = torch.concat(
                (
                    new_p_position_ids,
                    torch.arange(
                        start,
                        start + chunk_len,
                        device=device,
                        dtype=torch.int32,
                    ),
                ),
                dim=0,
            )
            new_p_tokens = torch.concat(
                (
                    new_p_tokens,
                    prefill_query_info.query_tokens[start : start + chunk_len],
                ),
                dim=0,
            )

            self.p_logits_start.append(
                chunk_len - 1 if len(self.p_logits_start) == 0 else sum(prefill_l[: i + 1]) - 1
            )

            self.p_temperatures = torch.concat(
                (
                    self.p_temperatures,
                    torch.tensor(
                        [prefill_query_info.temperature],
                        device=device,
                        dtype=torch.float32,
                    ),
                ),
                dim=0,
            )
            self.p_top_ps = torch.concat(
                (
                    self.p_top_ps,
                    torch.tensor(
                        [prefill_query_info.top_p],
                        device=device,
                        dtype=torch.float32,
                    ),
                ),
                dim=0,
            )

        if new_p_position_ids.numel() > 0:
            self.p_position_ids = new_p_position_ids.contiguous()
        if new_p_tokens.numel() > 0:
            self.p_tokens = new_p_tokens.contiguous()

        # ---------- Decode ----------
        self.d_q_len = torch.zeros(
            [1] * self.decode_batch, device=device, dtype=torch.int32
        )
        self.d_kv_len = torch.tensor([], device=device, dtype=torch.int32)
        new_d_position_ids = torch.tensor([], device=device, dtype=torch.int32)
        new_d_block_tables = -1 * torch.ones(
            [self.decode_batch, max_page_num], device=device, dtype=torch.int32
        )
        new_d_tokens = torch.tensor([], device=device, dtype=torch.int32)

        self.d_logits_start = []
        self.d_temperatures = torch.tensor([], device=device, dtype=torch.float32)
        self.d_top_ps = torch.tensor([], device=device, dtype=torch.float32)

        for i, decode_query_info in enumerate(decode_querys_info):
            qid = getattr(decode_query_info, "id", -1)
            past_len = int(decode_query_info.active_position)
            decode_kv_block_len = (past_len + decode_padding_len + page_size - 1) // page_size

            self.d_kv_len = torch.concat(
                (
                    self.d_kv_len,
                    torch.tensor(
                        [past_len + decode_padding_len],
                        device=device,
                        dtype=torch.int32,
                    ),
                ),
                dim=0,
            )
            new_d_block_tables[i, :decode_kv_block_len] = decode_query_info.block_index[
                :decode_kv_block_len
            ]

            new_d_position_ids = torch.concat(
                (
                    new_d_position_ids,
                    torch.arange(
                        past_len,
                        past_len + decode_padding_len,
                        device=device,
                        dtype=torch.int32,
                    ),
                ),
                dim=0,
            )

            if past_len > 0:
                new_d_tokens = torch.concat(
                    (
                        new_d_tokens,
                        decode_query_info.query_tokens[
                            past_len : past_len + decode_padding_len
                        ],
                    ),
                    dim=0,
                )
            else:
                new_d_tokens = torch.concat(
                    (
                        new_d_tokens,
                        torch.tensor(
                            [0] * decode_padding_len,
                            device=device,
                            dtype=torch.int32,
                        ),
                    ),
                    dim=0,
                )

            self.d_logits_start.append(
                0
                if len(self.d_logits_start) == 0
                else self.d_logits_start[-1] + decode_padding_len
            )

            self.d_temperatures = torch.concat(
                (
                    self.d_temperatures,
                    torch.tensor(
                        [decode_query_info.temperature],
                        device=device,
                        dtype=torch.float32,
                    ),
                ),
                dim=0,
            )
            self.d_top_ps = torch.concat(
                (
                    self.d_top_ps,
                    torch.tensor(
                        [decode_query_info.top_p],
                        device=device,
                        dtype=torch.float32,
                    ),
                ),
                dim=0,
            )

            if len(decode_querys_info) > 1:
                self.d_position_ids[i].copy_(new_d_position_ids[i])
                self.d_tokens[i].copy_(new_d_tokens[i])
                self.d_block_tables[i].copy_(new_d_block_tables[i])
            else:
                self.d_position_ids[:new_d_position_ids.size(0)].copy_(new_d_position_ids)
                self.d_tokens[:new_d_tokens.size(0)].copy_(new_d_tokens)
                self.d_block_tables[0].copy_(new_d_block_tables[0])


        self.p_q_len = self.p_q_len.contiguous()
        self.p_kv_len = self.p_kv_len.contiguous()
        self.p_block_tables = self.p_block_tables.contiguous()

        self.d_q_len = self.d_q_len.contiguous()
        self.d_kv_len = self.d_kv_len.contiguous()
        self.d_kv_len_list = self.d_kv_len.flatten().tolist()

        self.bsz_tensor = torch.tensor([self.batch_size], device=device, dtype=torch.int32)


    def __str__(self):
        ret = ''
        ret += '=======Prefill forward info:\n'
        ret += f'batch: {self.prefill_batch}, qLen: {self.p_q_len}, kvLen: {self.p_kv_len}\n'
        ret += f'tokens: {self.p_tokens}, posIdx: {self.p_position_ids}, block_tables: {self.p_block_tables}\n'
        ret += '=======Decode forward info:\n'
        ret += f'batch: {self.decode_batch}, qLen: {self.d_q_len}, kvLen: {self.d_kv_len}\n'
        ret += f'tokens: {self.d_tokens}, posIdx: {self.d_position_ids}, block_tables: {self.d_block_tables}\n'
        ret += f'chunk_size={self.chunk_size}, is_last_prefill_chunk={self.is_last_prefill_chunk}\n'
        return ret


class ForwardBatchInput:

    forward_minibatchs: list[Union[ForwardMiniBatchSplit, ForwardMiniBatchCombine]]
    decode_mini_batches: list[Union[ForwardMiniBatchSplit, ForwardMiniBatchCombine]]
    batch_size: int
    minibatch: Union[ForwardMiniBatchSplit, ForwardMiniBatchCombine]

    def __init__(self, batch : sched_ext.BatchQueryTodo = None, query_manager: QueryManager = None, device=None, tokens: torch.Tensor = None):
        
        if batch is None:
            return


        prefill_minibatches = batch.prefill_mini_batches
        decode_mini_batches = [item for sublist in batch.decode_mini_batches for item in sublist]
        prefill_querys_info = []
        prefill_s = []
        prefill_l = []
        decode_querys_info = []
        self.batch_size = 1
        for (qid, s, l) in prefill_minibatches:
            prefill_querys_info.append(query_manager.query_map[qid])
            prefill_s.append(s)
            prefill_l.append(l)
        for decode_qid in decode_mini_batches:
            qinfo = query_manager.query_map[decode_qid]
            if qinfo.decode_start_time is None:
                qinfo.decode_start_time = time.time()
            decode_querys_info.append(qinfo)

        if use_torch_npu:
            minibatch = ForwardMiniBatchSplit(prefill_querys_info, decode_querys_info, prefill_s, prefill_l, device = query_manager.device, page_size = query_manager.page_size)
        else:
            minibatch = ForwardMiniBatchCombine(prefill_querys_info, decode_querys_info, prefill_s, prefill_l, device = query_manager.device, page_size = query_manager.page_size)
        self.minibatch = minibatch

    @classmethod
    def gen_max_forward_batch(
        cls,
        device=None,
        tokens: torch.Tensor = None,
        num_mini_batches: int = 1,
        max_seq_length: int = 1024, # TODO: add to yaml
        prefill_query_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size, # TODO: use config
        prefill_active_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size,
        gen_prefill: bool = True,
        decode_batch_size: int = Config().max_decode_batch_size,
        decode_query_length: int = 1,
        decode_active_position: torch.Tensor = None,
        page_size = 256,
        cuda_lens = 1
    ):
        instance = cls()
        
        instance.batch_size = num_mini_batches
        page_size = page_size
     
        prefill_query_info = []
        offset = 0
        if gen_prefill and prefill_query_length != 0:
            for i in range(Config().max_prefill_batch_size):
                prefill_query_info.append(QueryInfo(i, prefill_query_length, max_seq_length, page_size, device, offset=offset))
                offset += max_seq_length // page_size

        decode_querys_info = []
        for i in range(min(decode_batch_size, cuda_lens)):
            query_info = QueryInfo(i+Config().max_prefill_batch_size, decode_query_length, max_seq_length, page_size, device, is_prefill=False, offset=offset)
            offset += max_seq_length // page_size
            if tokens is not None:
                query_info.query_tokens[prefill_active_length:prefill_active_length + decode_query_length].copy_(tokens)            
            if decode_active_position is None:
                query_info.active_position = prefill_active_length
            else: 
                query_info.active_position = decode_active_position[i]

            decode_querys_info.append(query_info)
        
        if prefill_query_length * Config().max_prefill_batch_size + len(decode_querys_info) < cuda_lens:
            decode_querys_info.append(query_info)
        if use_torch_npu:
            instance.minibatch = ForwardMiniBatchSplit(prefill_query_info, decode_querys_info, [0, 0],
                                                [prefill_active_length for _ in range(Config().max_prefill_batch_size)],
                                                device, page_size, decode_padding_len=decode_query_length)
        else:
            instance.minibatch = ForwardMiniBatchCombine(prefill_query_info, decode_querys_info, [0, 0], [prefill_active_length for _ in range(Config().max_prefill_batch_size)], device, page_size)
        
        return instance


    def fill(self, batch : sched_ext.BatchQueryTodo = None, query_manager: QueryManager = None, page_size = 256):
        if batch is None:
            return
        prefill_minibatches = batch.prefill_mini_batches
        decode_mini_batches = [item for sublist in batch.decode_mini_batches for item in sublist]

        prefill_querys_info = []
        prefill_s = []
        prefill_l = []
        decode_querys_info = []
        self.batch_size = 1
        for (id, s, l) in prefill_minibatches:
            prefill_querys_info.append(query_manager.query_map[id])
            prefill_s.append(s)
            prefill_l.append(l)
        for decode_batch_idx in decode_mini_batches:
            if query_manager.query_map[decode_batch_idx].decode_start_time is None:
                query_manager.query_map[decode_batch_idx].decode_start_time =time.time()
            decode_querys_info.append(query_manager.query_map[decode_batch_idx])

        self.minibatch.fill(prefill_querys_info, decode_querys_info, prefill_s, prefill_l, device=query_manager.device, page_size=page_size)


class ForwardBatchOutput:
    logits: list[torch.Tensor]
    pre_hidden_states: list[torch.Tensor]
    num_batchs: int
    batch_sizes: list[int]
    generated_tokens_num: list[int]
    lm_start: list[int]
    
    temperatures: list[torch.Tensor]
    top_ps: list[torch.Tensor]

    def __init__(self):
        self.num_batchs = 0
        self.lm_start = []
        self.logits = []
        self.batch_sizes = []
        self.generated_tokens_num = []
        self.top_ps = []
        self.temperatures = []
        self.pre_hidden_states = []
        pass

    def merge(self, new_output):
        self.logits.extend(new_output.logits)
        self.num_batchs += new_output.num_batchs
        self.batch_sizes.extend(new_output.batch_sizes)
        self.generated_tokens_num.extend(new_output.generated_tokens_num)
        self.top_ps.extend(new_output.top_ps)
        self.temperatures.extend(new_output.temperatures)
        self.lm_start.extend(new_output.lm_start)
        self.pre_hidden_states.extend(new_output.pre_hidden_states)

    def __str__(self):
        logits_shape = [t.shape for t in self.logits]
        ret = ''
        ret += f'=======Combined output info:\n'
        ret += f'logits: {self.logits}\n'
        ret += f'logits(size): {logits_shape}, num_batchs: {self.num_batchs}, kvLen: {self.generated_tokens_num}\n'
        ret += f'top_ps: {self.top_ps}, temperatures: {self.temperatures}, pre_hidden_states num: {len(self.pre_hidden_states)}\n'
        if len(self.pre_hidden_states) != 0:
            for idx in range(len(self.pre_hidden_states)):
                ret += f'idx: {idx}, pre_hidden_states shape: {self.pre_hidden_states[idx].shape}\n'    
        return ret

================================================
FILE: archive/ktransformers/server/balance_serve/inference/model_runner.py
================================================
"""
Date: 2024-11-07 07:02:20
LastEditors: djw
LastEditTime: 2024-12-10 08:48:32
"""
import os.path
import threading

import torch
from torch import nn
import queue
import signal
import queue
from typing import AsyncIterable
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from contextlib import asynccontextmanager
from pydantic import BaseModel, Field
import asyncio
import multiprocessing
import time
import torch.multiprocessing as mp
import random
import torch.distributed as dist
import zmq
import copy
import tempfile
from ktransformers.server.balance_serve.inference.forward_batch import (
    ForwardBatchInput, ForwardBatchOutput, ForwardMiniBatchCombine, ForwardMiniBatchSplit)
from ktransformers.util import utils

from ktransformers.server.config.config import Config
from ktransformers.models.custom_modeling_deepseek_v3 import KDeepseekV3ForCausalLM
from ktransformers.models.custom_modeling_deepseek_v2 import KDeepseekV2ForCausalLM
from ktransformers.models.custom_modeling_qwen2_moe import KQwen2MoeForCausalLM
from ktransformers.models.custom_modeling_qwen3_moe import KQwen3MoeForCausalLM
from ktransformers.models.custom_modeling_smallthinker import KSmallThinkerForCausalLM
from ktransformers.models.custom_modeling_glm4_moe import KGlm4MoeForCausalLM
from ktransformers.models.custom_modeling_qwen3_next import KQwen3NextForCausalLM
from ktransformers.server.balance_serve.inference.query_manager import QueryManager
from ktransformers.server.balance_serve.settings import sched_ext

try:
    import torch_npu
    use_torch_npu = torch_npu.npu.is_available()
    from ktransformers.models.ascend.custom_ascend_modeling_deepseek_v3 import KNPUDeepseekV3ForCausalLM
    from ktransformers.models.ascend.custom_ascend_modeling_qwen3 import KNPUQwen3MoeForCausalLM
    from ktransformers.models.custom_cache import KVC2StaticCache, KVC2Qwen3Cache
except:
    use_torch_npu = False


def pad_num_tokens(num_tokens):
    return (num_tokens + 63) // 64 * 64

def deduplicate_and_sort(lst):
    return sorted(set(lst))
def generate_cuda_graphs(chunk_size: int) -> list:
    # 如果输入不符合要求，assert掉
    assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must <= 1024 or a multiple of 1024"
    base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size]

    if chunk_size <= 1024:
        return deduplicate_and_sort(base_list)

    multiples = [i for i in range(1024, chunk_size + 1, 1024)]

    return deduplicate_and_sort(base_list + multiples)
class ModelRunner:
    """A CudaGraphRunner runs the forward pass of a model with CUDA graph and torch.compile."""
    if not use_torch_npu:
        model: KDeepseekV3ForCausalLM  | KQwen2MoeForCausalLM | KQwen3MoeForCausalLM | KSmallThinkerForCausalLM | KGlm4MoeForCausalLM | KQwen3NextForCausalLM
    else:
        model: KNPUDeepseekV3ForCausalLM | KNPUQwen3MoeForCausalLM
        cache: KVC2StaticCache | KVC2Qwen3Cache
    input: ForwardBatchInput | list[ForwardBatchInput]
    output: ForwardBatchOutput
    

    def __init__(self, model = None, cache = None, device = None, use_cuda_graph = False, max_decode_batch_size = 1, max_chunk_size = 4096, num_mini_batches: int = 1, page_size = 256, block_num = 8):
        
        # 先注释掉
        self.model = model  # Compile and move model to the specified device
        if use_torch_npu:
            self.stream = torch.npu.Stream(device=device)
            self.stream_scope = torch.npu.stream
            self.input_decode = []
            max_batch_size = 1 if Config().max_batch_size <= 1 else Config().max_batch_size
            self.npu_graphs = sorted(set([i for i in range(1, max_batch_size + 1)]))
            self.model.stream = self.stream  # npu do not support multi stream like this
            if use_cuda_graph:
                torch_npu.npu._subscribe_report(self.stream)

            self.start_model_event = torch.npu.Event(enable_timing=True)
            self.end_model_event = torch.npu.Event(enable_timing=True)
        else:
            self.stream = torch.cuda.Stream(device=device)
            self.cuda_graphs = generate_cuda_graphs(Config().chunk_size)

            self.start_model_event = torch.cuda.Event(enable_timing=True)
            self.end_model_event = torch.cuda.Event(enable_timing=True)
 
        self.device = device
        self.input = None
        self.features_buf = None
        self.output = None
        self.graph_memory_pool = None
        self.cache = cache
        #TODO 删掉了一行 self.cuda_graphs = generate_cuda_graphs(Config().chunk_size) 是为何，这样下面不会影响GPU吗
        self.use_cuda_graph = use_cuda_graph
        self.debug = False

        self.model_time = 0
        self.page_size = page_size
        self.block_num = block_num

        if 'cuda' in device:
            self.graphs = [torch.cuda.CUDAGraph() for _ in range(len(self.cuda_graphs))]
            self.page_idx_buf = [torch.zeros([self.cuda_graphs[i]], dtype=torch.int32, device = self.device) for i in range(len(self.cuda_graphs))]
            self.page_offset_buf = [torch.zeros([self.cuda_graphs[i]], dtype=torch.int32, device = self.device) for i in range(len(self.cuda_graphs))]
        elif 'npu' in device:
            self.workspace = [None for _ in range(len(self.npu_graphs))]
            self.graphs = [torch.npu.NPUGraph() for _ in range(len(self.npu_graphs))]
            self.page_idx_buf = [torch.zeros((self.npu_graphs[i], 1), dtype=torch.int32, device = self.device) for i in range(len(self.npu_graphs))]
            self.page_offset_buf = [torch.zeros((self.npu_graphs[i], 1), dtype=torch.int32, device = self.device) for i in range(len(self.npu_graphs))]
        else:
            self.graphs, self.page_idx_buf, self.page_offset_buf = None, None, None
        self.num_mini_batches = num_mini_batches

        self.max_chunk_size = max_chunk_size

        self.bsz_tensor_buf = torch.empty((1, ),dtype=torch.int32, device=device)
        self.num_tokens_tensor_buf = torch.empty((1, ),dtype=torch.int32, device=device)

    def model_attn_plan(self, batch, cuda_graph_idx=0):
        if isinstance(self.model, KDeepseekV3ForCausalLM):
            self.model.flash_infer_attn_plan(batch, self.bsz_tensor_buf, self.num_tokens_tensor_buf,
                                             num_heads=self.model.config.num_attention_heads, head_dim_ckv=self.model.config.kv_lora_rank, 
                                             head_dim_kpe=self.model.config.qk_rope_head_dim, page_size=self.model.cache.page_size, causal=True,
                                             sm_scale=self.model.model.layers[0].self_attn.softmax_scale, q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16)
        elif isinstance(self.model, KQwen2MoeForCausalLM) or isinstance(self.model, KQwen3MoeForCausalLM) or isinstance(self.model, KSmallThinkerForCausalLM) or isinstance(self.model, KGlm4MoeForCausalLM) or isinstance(self.model, KQwen3NextForCausalLM):
            self.model.flash_infer_attn_plan(batch, self.bsz_tensor_buf, self.num_tokens_tensor_buf,
                                             num_q_heads=self.model.config.num_attention_heads, num_kv_heads=self.model.config.num_key_value_heads,
                                             head_dim=self.model.config.head_dim if hasattr(self.model.config, 'head_dim') else self.model.config.hidden_size // self.model.config.num_attention_heads, 
                                             page_size=self.model.cache.page_size, causal=True,
                                             q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16, cuda_graph_idx=cuda_graph_idx)
        else:
            assert False, "model type not supported"


    def warmup(self):

        def capture_graphs(cuda_graph_idx):
            with torch.cuda.graph(self.graphs[cuda_graph_idx], pool=self.graph_memory_pool, stream=self.stream):
                self.outputs_buf[cuda_graph_idx] = self.model(self.input[cuda_graph_idx], self.features_buf[cuda_graph_idx], self.bsz_tensor_buf, self.num_tokens_tensor_buf, self.page_idx_buf[cuda_graph_idx], self.page_offset_buf[cuda_graph_idx], cuda_graph_idx=cuda_graph_idx)   
            self.graph_memory_pool = self.graphs[cuda_graph_idx].pool()

        self.input = []
        self.features_buf = []
        self.outputs_buf = []
        self.bsz_tensor_buf = torch.tensor([0], dtype=torch.int32, device=self.device)
        self.num_tokens_tensor_buf = torch.tensor([0], dtype=torch.int32, device=self.device)
        for i in range(len(self.cuda_graphs)):
            prefill_query_length = (self.cuda_graphs[i] - Config().max_decode_batch_size) // Config().max_prefill_batch_size if self.cuda_graphs[i] > Config().max_decode_batch_size else 0  #@TODO only supprot 2 prefill batch
            self.input.append(ForwardBatchInput.gen_max_forward_batch(device=self.device, num_mini_batches = self.num_mini_batches, prefill_query_length=prefill_query_length, prefill_active_length=prefill_query_length, page_size=self.page_size, cuda_lens=self.cuda_graphs[i]))

            self.features_buf.append(self.model.batch_embeddings(self.input[i]))
            batch_size = self.input[i].minibatch.q_indptr.size(0)-1
            num_tokens = self.features_buf[i][0].size(0)
            print("capturing cuda graph", batch_size, num_tokens)

            if isinstance(self.model, KQwen2MoeForCausalLM) or isinstance(self.model, KQwen3MoeForCausalLM) or isinstance(self.model, KSmallThinkerForCausalLM) or isinstance(self.model, KGlm4MoeForCausalLM) or isinstance(self.model, KQwen3NextForCausalLM):
                self.model.init_wrapper(self.use_cuda_graph, self.device, num_tokens ,batch_size, self.block_num, i) # TODO: 1024 is a magic number(max_batch_tokens)

            self.bsz_tensor_buf[0] = batch_size
            self.num_tokens_tensor_buf[0] = num_tokens

            self.model_attn_plan(self.input[i], i)
        
            page_idx, page_offset = self.model.cache.get_page_table(self.input[i].minibatch.position_ids, self.input[i].minibatch.q_indptr, self.input[i].minibatch.kv_indptr, self.input[i].minibatch.kv_indices, self.num_tokens_tensor_buf)

            
            self.page_idx_buf[i][:num_tokens].copy_(page_idx[:num_tokens])
            self.page_offset_buf[i][:num_tokens].copy_(page_offset[:num_tokens])

            self.page_idx_buf[i][num_tokens:].fill_(self.model.cache.max_cache_len // self.model.cache.page_size -1) 
        
            self.outputs_buf.append(None)
        
            torch.cuda.synchronize()
            for warm_up_iters in range(11):
                with torch.cuda.stream(self.stream):
                    self.outputs_buf[i] = self.model(self.input[i], self.features_buf[i], self.bsz_tensor_buf, self.num_tokens_tensor_buf, self.page_idx_buf[i], self.page_offset_buf[i], cuda_graph_idx=i)
            torch.cuda.synchronize()

            self.outputs_buf[i].num_batchs = batch_size

            capture_graphs(i)

            with torch.cuda.stream(self.stream):
                self.graphs[i].replay()

            self.sync(calc_time=False)
            print(f"cuda_graph: {i+1}/{len(self.cuda_graphs)}, warmup finished.")

    def warmup_npu(self):
        # npu 当前使用PD分离
        # 当前只支持 decode 阶段的图下沉
        # 多batch 场景下只支持 1 2 3 4 5 6 7 8
        def capture_graphs(npu_graph_idx):
            utils._USE_NPU_GRAPH = True
            print("self.features_buf[npu_graph_idx] is ", self.features_buf[npu_graph_idx])
            with torch.npu.graph(self.graphs[npu_graph_idx], pool=self.graph_memory_pool, stream=self.stream, auto_dispatch_capture=True):
                self.outputs_buf[npu_graph_idx] = self.model(
                    self.input_decode[npu_graph_idx], 
                    self.features_buf[npu_graph_idx], 
                    self.cache, None, None, 
                    self.page_idx_buf[npu_graph_idx], 
                    self.page_offset_buf[npu_graph_idx], 
                    self.position_ids_buf[npu_graph_idx], 
                    self.block_tables_buf[npu_graph_idx], 
                    cuda_graph_idx=npu_graph_idx, 
                    is_prefill=False
                    )
            self.graph_memory_pool = self.graphs[npu_graph_idx].pool()
            utils._USE_NPU_GRAPH = False

        self.features_buf = []
        self.outputs_buf = []
        self.position_ids_buf = []
        self.block_tables_buf = []
        self.bsz_tensor_buf = torch.tensor([0], dtype=torch.int32, device=self.device)
        self.num_tokens_tensor_buf = torch.tensor([0], dtype=torch.int32, device=self.device)
        for i in range(len(self.npu_graphs)):
            prefill_query_length = (self.npu_graphs[i] - Config().max_decode_batch_size) // Config().max_prefill_batch_size if self.npu_graphs[i] > Config().max_decode_batch_size else 0  #@TODO only supprot 2 prefill batch
            self.input_decode.append(ForwardBatchInput.gen_max_forward_batch(device=self.device, num_mini_batches = self.num_mini_batches, decode_batch_size=self.npu_graphs[i], prefill_active_length=1, page_size=self.page_size, cuda_lens = self.npu_graphs[i]))
            self.features_buf.append(self.model.batch_embeddings(self.input_decode[i], device=self.device, is_prefill=False))

            batch_size = self.npu_graphs[i]
            num_tokens = batch_size
            self.bsz_tensor_buf[0] = batch_size
            self.num_tokens_tensor_buf[0] = num_tokens
            
            page_idx, page_offset = self.cache.get_page_table(self.input_decode[i].minibatch, self.num_tokens_tensor_buf, is_prefill=False)

            self.position_ids_buf.append(self.input_decode[i].minibatch.d_position_ids.clone())
            self.block_tables_buf.append(self.input_decode[i].minibatch.d_block_tables.clone())


            self.page_idx_buf[i][:num_tokens].copy_(page_idx[:num_tokens][0])
            page_offset = page_offset.view(self.page_offset_buf[i].size())
            self.page_offset_buf[i][:num_tokens].copy_(page_offset[:num_tokens])
            self.page_idx_buf[i][num_tokens:].fill_(self.cache.max_cache_len // self.cache.page_size -1)
            self.outputs_buf.append(None)

            torch.npu.synchronize()
            for warm_up_iters in range(11):
                with torch.npu.stream(self.stream):
                    self.outputs_buf[i] = self.model(self.input_decode[i], self.features_buf[i], self.cache, self.bsz_tensor_buf, self.num_tokens_tensor_buf, self.page_idx_buf[i], self.page_offset_buf[i], self.position_ids_buf[i], self.block_tables_buf[i], is_prefill=False)
            torch.npu.synchronize()
            capture_graphs(i)
            self.replay(i)
            self.sync(calc_time=False)
            print(f"npu_graph: {i+1}/{len(self.npu_graphs)}, warmup finished.")


    def run(self, batch: sched_ext.BatchQueryTodo = None, query_manager: QueryManager = None):
        with torch.cuda.stream(self.stream):

            batch_size = len(batch.prefill_mini_batches) # TODO: calc this
            num_tokens = 0
            for i in range(len(batch.decode_mini_batches)):
                batch_size += len(batch.decode_mini_batches[i])
                num_tokens += len(batch.decode_mini_batches[i])
                print(f'decode_batch_i: {len(batch.decode_mini_batches[i])},')

            for i in range(len(batch.prefill_mini_batches)):
                num_tokens += batch.prefill_mini_batches[i][2]
                print(f'prefill_batch_i: {batch.prefill_mini_batches[i][2]},')


            # cuda graph idx equal to min idx i in self.cuda_graphs, that self.cuda_graphs[i] > num_tokens
            cuda_graph_idx = next((i for i, token in enumerate(self.cuda_graphs) if token >= num_tokens), len(self.cuda_graphs))
            if not self.use_cuda_graph:
                cuda_graph_idx = 0
    
            if self.use_cuda_graph:
                self.input[cuda_graph_idx].fill(batch, query_manager, self.page_size)
            else:
                self.input = [ForwardBatchInput(batch=batch, query_manager=query_manager, device=self.device)]
        

            if self.use_cuda_graph:
                self.features = self.model.batch_embeddings(self.input[cuda_graph_idx], device=self.device)

            self.bsz_tensor_buf.copy_(batch_size)
            self.num_tokens_tensor_buf.copy_(torch.tensor([num_tokens], dtype=torch.int32, device=self.device))

            if self.use_cuda_graph:
                self.features_buf[cuda_graph_idx][0].copy_(self.features[0], non_blocking=True)

            self.model_attn_plan(self.input[cuda_graph_idx], cuda_graph_idx)
            self.start_model_event.record(self.stream)

            if self.use_cuda_graph:
                self.model.flash_infer_attn_plan(self.input[cuda_graph_idx], self.bsz_tensor_buf, self.num_tokens_tensor_buf,
                                            num_heads=self.model.config.num_attention_heads, head_dim_ckv=self.model.config.kv_lora_rank, 
                                                head_dim_kpe=self.model.config.qk_rope_head_dim, page_size=self.cache.page_size, causal=True,
                                                sm_scale=self.model.model.layers[0].self_attn.softmax_scale, q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16)
                self.start_model_event.record(self.stream)
                if use_torch_npu:
                    page_idx, page_offset = self.cache.get_page_table(self.input[cuda_graph_idx].minibatch, self.bsz_tensor_buf) #TODO csx minibatch
                    self.page_idx_buf[cuda_graph_idx][num_tokens:].fill_(self.cache.max_cache_len // self.cache.page_size - 1)
                else:
                    page_idx, page_offset = self.model.cache.get_page_table(self.input[cuda_graph_idx].minibatch.position_ids, self.input[cuda_graph_idx].minibatch.q_indptr, self.input[cuda_graph_idx].minibatch.kv_indptr, self.input[cuda_graph_idx].minibatch.kv_indices, self.num_tokens_tensor_buf)
                    self.page_idx_buf[cuda_graph_idx][num_tokens:].fill_(self.model.cache.max_cache_len // self.model.cache.page_size -1)

                self.page_idx_buf[cuda_graph_idx][:num_tokens].copy_(page_idx[:num_tokens])
                self.page_offset_buf[cuda_graph_idx][:num_tokens].copy_(page_offset[:num_tokens])
                self.replay(cuda_graph_idx)
                self.output = ForwardBatchOutput()
                
                self.output.top_ps.append(self.input[cuda_graph_idx].minibatch.top_ps)
                self.output.temperatures.append(self.input[cuda_graph_idx].minibatch.temperatures)
                self.output.logits.append(self.outputs_buf[cuda_graph_idx].logits[0][self.input[cuda_graph_idx].minibatch.logits_start].clone())

                self.end_model_event.record(self.stream)
            else:
                self.model.flash_infer_attn_plan(self.input, self.bsz_tensor_buf, self.num_tokens_tensor_buf,
                                            num_heads=self.model.config.num_attention_heads, head_dim_ckv=self.model.config.kv_lora_rank, 
                                                head_dim_kpe=self.model.config.qk_rope_head_dim, page_size=self.cache.page_size, causal=True,
                                                sm_scale=self.model.model.layers[0].self_attn.softmax_scale, q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16)
                self.start_model_event.record(self.stream)
                page_idx, page_offset = self.cache.get_page_table(self.input[cuda_graph_idx].minibatch, self.bsz_tensor_buf)

                self.output = self.model(self.input, self.features, self.bsz_tensor_buf, self.num_tokens_tensor_buf, page_idx, page_offset)
                self.output.logits[0] = self.output.logits[0][self.input.minibatch.logits_start]
                self.output.top_ps.append(self.input.minibatch.top_ps)
                self.output.temperatures.append(self.input.minibatch.temperatures)

                self.end_model_event.record(self.stream)

        if not self.use_cuda_graph:
            self.output.num_batchs = self.input.batch_size
        else:
            self.output.num_batchs = self.input[cuda_graph_idx].batch_size

    def run_split(self, batch: sched_ext.BatchQueryTodo = None, query_manager: QueryManager = None):
        """running without flashinfer and prefill & decode split infer"""
        def _run_infer_stage(is_prefill=True):
            if "npu" in self.device:
                cuda_graph_idx = batch_size_decode
            if is_prefill == False:
                if cuda_graph_idx != -1 and self.use_cuda_graph:
                    self.features = self.model.batch_embeddings(self.input_decode[cuda_graph_idx], device=self.device, is_prefill=is_prefill)
                else:
                    self.features = self.model.batch_embeddings(self.input, device=self.device, is_prefill=is_prefill)

                self.bsz_tensor_buf.copy_(batch_size_decode)

                if self.use_cuda_graph:
                    if cuda_graph_idx != -1:
                        self.features_buf[cuda_graph_idx].copy_(self.features)
                    else:
                        self.features_buf.copy_(self.features)
            else:
                self.features = self.model.batch_embeddings(self.input, device=self.device, is_prefill=is_prefill)
                self.bsz_tensor_buf.copy_(batch_size_decode)

            if cuda_graph_idx != -1 and self.use_cuda_graph and is_prefill == False:
                num_tokens = batch_size_decode + 1
                self.start_model_event.record(self.stream) if self.start_model_event else None
                page_idx, page_offset = self.cache.get_page_table(self.input_decode[cuda_graph_idx].minibatch, self.bsz_tensor_buf, is_prefill=is_prefill)
                self.position_ids_buf[cuda_graph_idx].copy_(self.input_tmp.minibatch.d_position_ids)
                self.block_tables_buf[cuda_graph_idx].copy_(self.input_tmp.minibatch.d_block_tables)
                self.page_idx_buf[cuda_graph_idx][:num_tokens].copy_(page_idx[:num_tokens])
                self.page_offset_buf[cuda_graph_idx][:num_tokens].copy_(page_offset[:num_tokens])
                self.page_idx_buf[cuda_graph_idx][num_tokens:].fill_(self.cache.max_cache_len // self.cache.page_size - 1)

                self.replay(cuda_graph_idx)
                new_output = ForwardBatchOutput()
                for i in range(num_tokens):
                    new_output.top_ps.append(self.input_decode[cuda_graph_idx].minibatch.d_top_ps[i])
                    new_output.temperatures.append(self.input_decode[cuda_graph_idx].minibatch.d_temperatures[i])
                    new_output.logits.append(self.outputs_buf[cuda_graph_idx].logits[i].clone())  # TODO support MTP
                self.end_model_event.record(self.stream) if self.start_model_event else None

                if self.output is None:
                    self.output = copy.deepcopy(new_output)
                else:
                    self.output.merge(new_output)

            else:
                self.start_model_event.record(self.stream) if self.start_model_event else None
                page_idx, page_offset = self.cache.get_page_table(self.input.minibatch, self.num_tokens_tensor_buf, is_prefill=is_prefill)
                new_output = self.model(self.input, self.features, self.cache, None, None, page_idx, page_offset, None, None, is_prefill=is_prefill)
                bsz = len(new_output.logits)
                if is_prefill:
                    for i in range(bsz):
                        new_output.logits[i] = new_output.logits[i][-1:, :]  # batched tensor do not need location
                        new_output.top_ps.append(self.input.minibatch.p_top_ps[i])
                        new_output.temperatures.append(self.input.minibatch.p_temperatures[i])
                else:
                    for i in range(bsz):
                        new_output.top_ps.append(self.input.minibatch.d_top_ps[i])
                        new_output.temperatures.append(self.input.minibatch.d_temperatures[i])

                if self.output is None:
                    self.output = copy.deepcopy(new_output)
                else:
                    self.output.merge(new_output)
                self.end_model_event.record(self.stream) if self.end_model_event else None

        with self.stream_scope(self.stream):

            batch_size = len(batch.prefill_mini_batches) # TODO: calc this
            num_d_tokens, num_p_tokens = 0, 0
            for i in range(len(batch.decode_mini_batches)):
                batch_size += len(batch.decode_mini_batches[i])
                num_d_tokens += len(batch.decode_mini_batches[i])
                if self.debug:
                    print(f'decode_batch_i: {len(batch.decode_mini_batches[i])}, token_num: {len(batch.decode_mini_batches[i])} ,batch_size: {batch_size}')

            for i in range(len(batch.prefill_mini_batches)):
                num_p_tokens += batch.prefill_mini_batches[i][2]
                if self.debug:
                    print(f'prefill_batch_i: {batch.prefill_mini_batches[i][2]}, token_num: {batch.prefill_mini_batches[i][2]}')

            # batch info holder both in graph mode & kernel mode
            self.input_tmp = ForwardBatchInput(batch=batch, query_manager=query_manager, device=self.device)
            batch_size_decode = self.input_tmp.minibatch.decode_batch - 1
            idx = self.input_tmp.minibatch.decode_batch - 1
            cuda_graph_idx = batch_size_decode
            self.output = None  # clear last step output

            if self.input_tmp.minibatch.decode_batch > 0:
                if self.use_cuda_graph and len(self.input_decode) > 0:
                    self.input_decode[idx].fill(batch, query_manager, self.page_size)
                else:
                    self.input = self.input_tmp
                    assert isinstance(self.input.minibatch, ForwardMiniBatchSplit), 'split batch input type must be ForwardMiniBatchSplit'
                    print(self.input.minibatch) if self.debug else None

            if self.input_tmp.minibatch.prefill_batch > 0:
                self.input = self.input_tmp
                assert isinstance(self.input.minibatch, ForwardMiniBatchSplit), 'split batch input type must be ForwardMiniBatchSplit'
                print(self.input.minibatch) if self.debug else None

            # ++++++++++++++++++++++++++++++++++++++++++ Prefill Stage ++++++++++++++++++++++++++++++++++++++++++++++++
            if self.input_tmp.minibatch.prefill_batch > 0:
                _run_infer_stage(is_prefill=True)
                self.output.num_batchs = self.input.minibatch.batch_size
            # ++++++++++++++++++++++++++++++++++++++++++ Decode Stage ++++++++++++++++++++++++++++++++++++++++++++++++
            if self.input_tmp.minibatch.decode_batch > 0:
                if self.use_cuda_graph:
                    _run_infer_stage(is_prefill=False)
                    self.output.num_batchs = self.input_decode[idx].minibatch.batch_size
                else:
                    _run_infer_stage(is_prefill=False)
                    self.output.num_batchs = self.input.minibatch.batch_size

            print(self.output) if self.debug else None

    def replay(self, cuda_graph_idx=-1):
        if use_torch_npu:
            thread = threading.Thread(target=self.graphs[cuda_graph_idx].update, kwargs={"cpu_update_input": [{"actual_seq_lengths_kv": self.input_decode[cuda_graph_idx].minibatch.d_kv_len_list}]})
            thread.start()
            torch_npu.npu.synchronize()

        with torch.cuda.stream(self.stream):
            if cuda_graph_idx != -1:
                self.graphs[cuda_graph_idx].replay()
            else:
                self.graphs.replay()


    def sync(self, calc_time = True):
        self.stream.synchronize()
        if calc_time:
            self.model_time = self.start_model_event.elapsed_time(self.end_model_event)  # In ms


def get_or_create_model_runner(model=None, cache=None, device=None, use_cuda_graph=None, page_size=None):
    from ktransformers.server.balance_serve.inference.config import model_runner_dict
    runner = model_runner_dict.get(device)
    if runner is None:
        print("[WARN] the new ModelRunner and deviceId is ", device)
        runner = ModelRunner(model, cache, device, use_cuda_graph, page_size)
        model_runner_dict[device] = runner
    return runner


================================================
FILE: archive/ktransformers/server/balance_serve/inference/query_manager.py
================================================
'''
Date: 2024-11-14 12:23:45
LastEditors: djw
LastEditTime: 2024-11-20 04:06:23
'''
import torch
from ktransformers.server.balance_serve.settings import sched_ext
import random
import time
from ktransformers.server.config.config import Config
from ktransformers.server.utils.serve_profiling import PROF_TIME_STAT

class QueryInfo:
    id: int
    active_position: int
    query_length: int
    is_prefill: int
    is_first_token: int
    block_index: torch.Tensor
    query_tokens: torch.Tensor
    stop_criteria: list[torch.Tensor]

    temperature: float
    top_p: float

    max_length: int

    pos_status: torch.Tensor
    probs: list[torch.Tensor]
    acc_position: int 

    def __init__(self, id, query_length: int, max_length: int, page_size: int, device: torch.device, is_prefill: bool = True, offset: int = 0, active_position: int = 0, temperature: float = 0.01, top_p: float = 1.0):
        self.id = id
        self.is_prefill = is_prefill
        self.is_first_token = False
        self.active_position = active_position
        self.max_length = max_length - 1
        self.query_tokens = torch.zeros((max_length + 2,), dtype=torch.int, device = device)
        self.stop_criteria = []
        self.block_index = torch.arange(offset, offset + (max_length + active_position + page_size - 1) // page_size, dtype=torch.int, device = device)
        self.query_length = query_length
        self.enqueue_time = time.time()
        self.decode_start_time = None
        self.speculative_token = {} # {position: (accept, token)}

        self.pos_status = torch.zeros((max_length + 2,), dtype=torch.int, device = device)
        self.probs = [None] * (max_length + 2)

        self.acc_tokens_num = 0
        self.rej_tokens_num = 0
        self.round = 0
        self.acc_length = 0
        self.acc_position = 0

        self.temperature = temperature
        self.top_p = top_p

    def check_stop(self):
        if self.active_position >= self.max_length - 2:
            if PROF_TIME_STAT.on:
                PROF_TIME_STAT.print_all()
                # PROF_TIME_STAT.reset_all()
            return True

        # 遍历每个停止条件
        for stop_tensor in self.stop_criteria:
            stop_len = len(stop_tensor)
            
            # 如果停止条件比 query_tokens 长，跳过
            if stop_len >= self.active_position:
                continue
            
            #print(f"stop_tensor: {stop_tensor}, stop_len: {stop_len}, active_position: {self.active_position}, query_token: {self.query_tokens[self.active_position - stop_len - 1:self.active_position - 1]}")

            if (torch.equal(self.query_tokens[self.active_position - stop_len - 1:self.active_position - 1], stop_tensor) and self.active_position) or self.max_length <= self.active_position + 3:
                self.life_time = time.time() - self.enqueue_time
                self.decode_duration_time = time.time() - self.decode_start_time
                self.decode_tps = (self.active_position -  self.query_length) / self.decode_duration_time
                print(f"prefill length: {self.query_length}, prefill time: {self.prefill_duration_time}, prefill tps {self.prefill_tps}, decode length: {self.active_position -  self.query_length}, decode time: {self.decode_duration_time}, decode tps {self.decode_tps}")
                
                if self.acc_tokens_num + self.rej_tokens_num != 0:
                    verify_counts = self.acc_tokens_num + self.rej_tokens_num
                    print(f"mtp accept rate: {self.acc_tokens_num}/{verify_counts} = {self.acc_tokens_num * 100 / verify_counts} %")
                if PROF_TIME_STAT.on:
                    PROF_TIME_STAT.print_all()
                    # PROF_TIME_STAT.reset_all()
                return True  # 找到匹配的停止条件
                
        
        return False  # 没有找到任何停止条件


    def print(self):
        print(f"active_position: {self.active_position}, query_length: {self.query_length}, is_prefill: {self.is_prefill}")
        print(f"block_index_shape: {self.block_index.shape}, query_tokens_shape: {self.query_tokens.shape}")
        print(f"query_tokens_shape: {self.query_tokens}, is_first_token: {self.is_first_token}" )
        print(f"pos_status: {self.pos_status}, acc_position: ", self.acc_position)
        print(f"probs: {self.probs}")


class QueryManager:

    max_length: int = 65536
    page_size: int = 256
    device: torch.device
    query_map : dict[int, QueryInfo]

    def __init__(self, max_length = 65536, page_size = 256, device = torch.device('cuda')):
        self.max_length = max_length
        self.page_size = page_size
        self.device = device
        self.query_map = {}

    def print(self, hint: str = ""):
        print(hint," query_manager: ", self.query_map)
        for key in self.query_map: 
            query_info = self.query_map[key]
            print(">>> query: ", key)
            print("query_info: ")
            query_info.print()

    def add_query(self, batch: sched_ext.BatchQueryTodo):

        for i in range(len(batch.query_ids)):
            id = batch.query_ids[i]
            if id not in self.query_map:
                print(f"add query id: {id}, batch.query_lengths: {batch.query_lengths[i]}, "
                      f"batch_query_tokens: {batch.query_tokens[i].shape}, "
                      f"batch.block_indexes: {batch.block_indexes[i]}")
                assert batch.query_tokens[i].size(0) < self.max_length, "query max length in batchquerytodo exceeds internal max_length"
                query_info = QueryInfo(id=id, query_length=batch.query_lengths[i], max_length=batch.query_tokens[i].size(0) + 1, page_size=self.page_size, device=self.device, temperature=batch.sample_options[i].temperature, top_p=batch.sample_options[i].top_p)
                query_info.query_tokens[:query_info.query_length].copy_(batch.query_tokens[i][:query_info.query_length].to(self.device))
                
                for stop_token_list in batch.stop_criteria[i]:
                    query_info.stop_criteria.append(torch.tensor(stop_token_list, dtype=torch.int, device = self.device))

                block_num = batch.block_indexes[i].size(0)
                query_info.block_index[:block_num].copy_(batch.block_indexes[i].to(self.device))

                self.query_map[id] = query_info
                
                prefill_mini_batches = batch.prefill_mini_batches
                for (prefill_id, s, l) in prefill_mini_batches:
                    if prefill_id == id:
                        self.query_map[prefill_id].active_position = s


    def update(self, batch: sched_ext.BatchQueryTodo) -> list[sched_ext.QueryUpdate]:
        query_updates = []

        prefill_mini_batches = batch.prefill_mini_batches

        for (id, s, l) in prefill_mini_batches:

            if id not in self.query_map:
                assert False, f"query id {id} not found in query_map"

            # update query_info
            query_info = self.query_map[id]
            query_info.active_position += l

            if query_info.active_position >= query_info.query_length and query_info.is_prefill:
                query_info.is_prefill = False
                query_info.is_first_token = True
                query_info.prefill_duration_time = time.time() - query_info.enqueue_time
                query_info.prefill_tps = query_info.query_length / query_info.prefill_duration_time
                

            # generate schedule query_update
            query_update = sched_ext.QueryUpdate()
            query_update.id = id
            query_update.ok = True
            query_update.is_prefill = query_info.is_prefill
            query_update.active_position = query_info.active_position
            # if(not query_info.is_prefill):
            query_updates.append(query_update)


        decode_mini_batches = batch.decode_mini_batches

        for ids in decode_mini_batches:
            for id in ids:
                if id not in self.query_map:
                    assert False, f"query id {id} not found in query_map"

                query_info = self.query_map[id]
                query_info.is_first_token = False
                query_info.active_position += 1

                query_update = sched_ext.QueryUpdate()
                query_update.id = id
                query_update.ok = True
                query_update.is_prefill = query_info.is_prefill

                query_update.decode_done = query_info.check_stop()

                query_update.active_position = query_info.active_position
                query_updates.append(query_update)

        return query_updates

================================================
FILE: archive/ktransformers/server/balance_serve/inference/sampling/penaltylib/__init__.py
================================================
from .orchestrator import BatchedPenalizerOrchestrator
from .penalizers.frequency_penalty import BatchedFrequencyPenalizer
from .penalizers.min_new_tokens import BatchedMinNewTokensPenalizer
from .penalizers.presence_penalty import BatchedPresencePenalizer
from .penalizers.repetition_penalty import BatchedRepetitionPenalizer

__all__ = [
    "BatchedFrequencyPenalizer",
    "BatchedMinNewTokensPenalizer",
    "BatchedPresencePenalizer",
    "BatchedRepetitionPenalizer",
    "BatchedPenalizerOrchestrator",
]


================================================
FILE: archive/ktransformers/server/balance_serve/inference/sampling/penaltylib/orchestrator.py
================================================
import abc
import dataclasses
import typing

import torch


@dataclasses.dataclass
class _ReqLike:
    origin_input_ids: typing.Union[torch.Tensor, typing.List[int]]


@dataclasses.dataclass
class _BatchLike:
    reqs: typing.List[_ReqLike]

    def batch_size(self):
        return len(self.reqs)


class BatchedPenalizerOrchestrator:
    batch: _BatchLike
    device: str
    vocab_size: int
    penalizers: typing.Dict[typing.Type["_BatchedPenalizer"], "_BatchedPenalizer"]

    def __init__(
        self,
        vocab_size: int,
        batch: _BatchLike,
        device: str,
        Penalizers: typing.Set[typing.Type["_BatchedPenalizer"]],
    ):
        self.vocab_size = vocab_size
        self.batch = batch
        self.device = device

        self.penalizers = {Penalizer: Penalizer(self) for Penalizer in Penalizers}

        is_required = False
        for penalizer in self.penalizers.values():
            pen_is_required = penalizer.prepare_if_required()
            is_required |= pen_is_required
        self.is_required = is_required

        if self.is_required:
            self.cumulate_input_tokens(
                input_ids=[req.origin_input_ids for req in self.reqs()]
            )

    def reqs(self):
        return self.batch.reqs

    def batch_size(self):
        return self.batch.batch_size()

    def cumulate_input_tokens(
        self,
        input_ids: typing.Union[
            typing.List[torch.Tensor], typing.List[typing.List[int]]
        ],
    ):
        """
        Feed the input tokens to the penalizers.

        Args:
            input_ids (typing.Union[typing.List[torch.Tensor], typing.List[typing.List[int]]]): The input tokens.
        """
        token_ids = _TokenIDs(orchestrator=self, token_ids=input_ids)

        for penalizer in self.penalizers.values():
            penalizer.cumulate_input_tokens(input_ids=token_ids)

    def cumulate_output_tokens(
        self,
        output_ids: typing.Union[
            typing.List[torch.Tensor], typing.List[typing.List[int]]
        ],
    ):
        """
        Feed the output tokens to the penalizers.

        Args:
            output_ids (typing.Union[typing.List[torch.Tensor], typing.List[typing.List[int]]]): The output tokens.
        """
        if not self.is_required:
            return

        token_ids = _TokenIDs(orchestrator=self, token_ids=output_ids)

        for penalizer in self.penalizers.values():
            penalizer.cumulate_output_tokens(output_ids=token_ids)

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        """
        Apply the penalizers to the logits.
        Note that it may apply the penalizers in-place.

        Args:
            logits (torch.Tensor): The logits to apply the penalizers to.

        Returns:
            torch.Tensor: The logits after applying the penalizers.
        """
        if not self.is_required:
            return

        for penalizer in self.penalizers.values():
            logits = penalizer.apply(logits)

        return logits

    def filter(
        self,
        indices_to_keep: typing.List[int],
        indices_tensor_to_keep: torch.Tensor = None,
    ):
        """
        Filter the penalizers based on the indices to keep in the batch.

        Args:
            indices_to_keep (typing.List[int]): List of indices to keep in the batch.
            indices_tensor_to_keep (torch.Tensor = None): Tensor of indices to keep in the batch. If not None, it will be used instead of converting indices_to_keep to a tensor.
        """
        if not self.is_required:
            return

        empty_indices = len(indices_to_keep) == 0

        is_required = False
        for penalizer in self.penalizers.values():
            tmp_is_required = penalizer.is_required()
            is_required = is_required or tmp_is_required
            if not tmp_is_required or empty_indices:
                penalizer.teardown()
            else:
                # create tensor index only when it's needed
                if indices_tensor_to_keep is None:
                    indices_tensor_to_keep = torch.tensor(
                        indices_to_keep, dtype=torch.int32, device=self.device
                    )

                penalizer.filter(
                    indices_to_keep=indices_to_keep,
                    indices_tensor_to_keep=indices_tensor_to_keep,
                )
        self.is_required = is_required

    def merge(self, their: "BatchedPenalizerOrchestrator"):
        """
        Merge the penalizers of another orchestrator into this one.

        Note that this function **must** be called _before_ self.batch.reqs is updated (filtered).
        Each unprepared penalizers would have to be prepared (creating tensors, etc.) first before merging.
        This step requires the original batch.reqs, before it gets merged with other batch.reqs.

        Args:
            their (BatchedPenalizerOrchestrator): The orchestrator to merge into this one.
        """
        if not self.is_required and not their.is_required:
            return

        self.is_required |= their.is_required
        for Penalizer, their_penalizer in their.penalizers.items():
            if Penalizer not in self.penalizers:
                raise ValueError(f"Penalizer {Penalizer} not found in self.penalizers")

            self.penalizers[Penalizer].merge(their_penalizer)


class _TokenIDs:
    """
    A class that wraps token IDs to provide additional utility functions to penalizers.

    Attributes:
        orchestrator (BatchedPenalizerOrchestrator): The orchestrator that this token IDs belong to.
        token_ids (typing.Union[torch.Tensor, typing.List[torch.Tensor]]): The token IDs.
        cached_counts (torch.Tensor): The cached occurrence count tensor.
    """

    orchestrator: BatchedPenalizerOrchestrator
    token_ids: typing.Union[torch.Tensor, typing.List[torch.Tensor]]
    cached_counts: torch.Tensor = None

    def __init__(
        self,
        orchestrator: BatchedPenalizerOrchestrator,
        token_ids: typing.Union[
            typing.List[torch.Tensor], typing.List[typing.List[int]]
        ],
    ):
        self.orchestrator = orchestrator

        if not isinstance(token_ids[0], torch.Tensor):
            token_ids = [
                torch.tensor(
                    data=ids, dtype=torch.int64, device=self.orchestrator.device
                )
                for ids in token_ids
            ]

        self.token_ids = token_ids

    def occurrence_count(self) -> torch.Tensor:
        """
        Returns a tensor of shape (batch_size, vocab_size) where each element is the number of times the corresponding token appears in the batch.

        Returns:
            torch.Tensor: The occurrence count tensor.
        """
        if self.cached_counts is not None:
            return self.cached_counts

        token_ids = self.token_ids

        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.unsqueeze(1)

            # needs to be long to be used as index in scatter_add
            if token_ids.dtype != torch.int64:
                token_ids = token_ids.to(torch.int64)

        padded_token_ids = torch.nn.utils.rnn.pad_sequence(
            sequences=token_ids,
            batch_first=True,
            padding_value=self.orchestrator.vocab_size,
        )

        self.cached_counts = torch.zeros(
            size=(self.orchestrator.batch_size(), self.orchestrator.vocab_size + 1),
            dtype=torch.int64,
            device=self.orchestrator.device,
        ).scatter_add_(
            dim=1,
            index=padded_token_ids,
            src=torch.ones_like(padded_token_ids),
        )[
            :, : self.orchestrator.vocab_size
        ]

        return self.cached_counts


class _BatchedPenalizer(abc.ABC):
    """
    An abstract class for a batched penalizer.
    """

    orchestrator: BatchedPenalizerOrchestrator
    _is_prepared: bool = False

    def __init__(self, orchestrator: BatchedPenalizerOrchestrator):
        self.orchestrator = orchestrator

    def is_prepared(self) -> bool:
        return self._is_prepared

    def is_required(self) -> bool:
        return self._is_required()

    def prepare(self):
        if not self.is_prepared():
            self._prepare()
            self._is_prepared = True

    def prepare_if_required(self):
        if self.is_required():
            self.prepare()
            return True
        else:
            return False

    def teardown(self):
        if self.is_prepared():
            self._teardown()
            self._is_prepared = False

    def cumulate_input_tokens(self, input_ids: _TokenIDs):
        if not self.is_prepared():
            return

        self._cumulate_input_tokens(input_ids=input_ids)

    def cumulate_output_tokens(self, output_ids: _TokenIDs):
        if not self.is_prepared():
            return

        self._cumulate_output_tokens(output_ids=output_ids)

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        if not self.is_prepared():
            return logits

        return self._apply(logits=logits)

    def filter(
        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
    ):
        if not self.is_prepared():
            return

        self._filter(
            indices_to_keep=indices_to_keep,
            indices_tensor_to_keep=indices_tensor_to_keep,
        )

    def merge(self, their: "_BatchedPenalizer"):
        if not self.is_prepared() and not their.is_prepared():
            return

        self.prepare()
        their.prepare()
        self._merge(their)

    @abc.abstractmethod
    def _is_required(self) -> bool:
        """
        Check if the penalizer is required to be prepared.
        """
        pass

    @abc.abstractmethod
    def _prepare(self):
        """
        Prepare the penalizer.
        Usually, this is where the penalizer initializes its tensors.
        """
        pass

    @abc.abstractmethod
    def _teardown(self):
        """
        Tear down the penalizer.
        Usually, this is where the penalizer frees its tensors.
        """
        pass

    @abc.abstractmethod
    def _cumulate_input_tokens(self, input_ids: _TokenIDs):
        """
        Cumulate the input tokens.
        Orchestrator will call this function to feed the input tokens to the penalizer.
        """
        pass

    @abc.abstractmethod
    def _cumulate_output_tokens(self, output_ids: _TokenIDs):
        """
        Cumulate the output tokens.
        Orchestrator will call this function to feed the output tokens to the penalizer.
        """
        pass

    @abc.abstractmethod
    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
        """
        Apply the penalizer to the logits.
        Penalizers can modify the logits in-place if needed.
        """
        pass

    @abc.abstractmethod
    def _filter(
        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
    ):
        """
        Filter the penalizer (tensors or underlying data) based on the indices to keep in the batch.
        """
        pass

    @abc.abstractmethod
    def _merge(self, their: "_BatchedPenalizer"):
        """
        Merge the penalizer with another penalizer.
        """
        pass


================================================
FILE: archive/ktransformers/server/balance_serve/inference/sampling/penaltylib/penalizers/frequency_penalty.py
================================================
import typing

import torch

from ..orchestrator import _BatchedPenalizer, _TokenIDs


class BatchedFrequencyPenalizer(_BatchedPenalizer):
    """
    Frequency penalizer penalizes tokens based on their frequency in the output.
    """

    frequency_penalties: torch.Tensor = None
    cumulated_frequency_penalties: torch.Tensor = None

    def _is_required(self) -> bool:
        return any(
            req.sampling_params.frequency_penalty != 0.0
            for req in self.orchestrator.reqs()
        )

    def _prepare(self):
        self.cumulated_frequency_penalties = (
            torch.tensor(
                data=[0.0 for _ in self.orchestrator.reqs()],
                dtype=torch.float32,
                device=self.orchestrator.device,
            )
            .unsqueeze_(1)
            .repeat(1, self.orchestrator.vocab_size)
        )

        self.frequency_penalties = (
            torch.tensor(
                data=[
                    req.sampling_params.frequency_penalty
                    for req in self.orchestrator.reqs()
                ],
                dtype=torch.float32,
                device=self.orchestrator.device,
            )
            .unsqueeze_(1)
            .expand_as(self.cumulated_frequency_penalties)
        )

    def _teardown(self):
        del self.frequency_penalties
        del self.cumulated_frequency_penalties

        self.frequency_penalties = None
        self.cumulated_frequency_penalties = None

    def _cumulate_input_tokens(self, input_ids: _TokenIDs):
        pass

    def _cumulate_output_tokens(self, output_ids: _TokenIDs):
        self.cumulated_frequency_penalties += (
            self.frequency_penalties * output_ids.occurrence_count()
        )

    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
        logits -= self.cumulated_frequency_penalties
        return logits

    def _filter(
        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
    ):
        self.frequency_penalties = self.frequency_penalties[indices_tensor_to_keep]
        self.cumulated_frequency_penalties = self.cumulated_frequency_penalties[
            indices_tensor_to_keep
        ]

    def _merge(self, their: "BatchedFrequencyPenalizer"):
        self.frequency_penalties = torch.cat(
            [self.frequency_penalties, their.frequency_penalties], dim=0
        )
        self.cumulated_frequency_penalties = torch.cat(
            [self.cumulated_frequency_penalties, their.cumulated_frequency_penalties],
            dim=0,
        )


================================================
FILE: archive/ktransformers/server/balance_serve/inference/sampling/penaltylib/penalizers/min_new_tokens.py
================================================
import typing

import torch

from ..orchestrator import _BatchedPenalizer, _TokenIDs


class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
    """
    Min new tokens penalizer penalizes tokens based on the length of the output.
    """

    min_new_tokens: torch.Tensor = None
    stop_token_penalties: torch.Tensor = None
    len_output_tokens: torch.Tensor = None

    def _is_required(self) -> bool:
        return any(
            req.sampling_params.min_new_tokens > 0 for req in self.orchestrator.reqs()
        )

    def _prepare(self):
        self.min_new_tokens = torch.tensor(
            data=[
                req.sampling_params.min_new_tokens for req in self.orchestrator.reqs()
            ],
            dtype=torch.int32,
            device=self.orchestrator.device,
        ).unsqueeze_(1)

        padded_stop_token_ids = torch.nn.utils.rnn.pad_sequence(
            sequences=[
                torch.tensor(
                    data=(
                        list(
                            (req.sampling_params.stop_token_ids or set())
                            | (req.tokenizer.additional_stop_token_ids or set())
                            | {req.tokenizer.eos_token_id}
                        )
                    ),
                    dtype=torch.int64,
                    device=self.orchestrator.device,
                )
                for req in self.orchestrator.reqs()
            ],
            batch_first=True,
            padding_value=self.orchestrator.vocab_size,
        )
        self.stop_token_penalties = torch.zeros(
            size=(self.orchestrator.batch_size(), self.orchestrator.vocab_size + 1),
            dtype=torch.float32,
            device=self.orchestrator.device,
        ).scatter_add_(
            dim=1,
            index=padded_stop_token_ids,
            src=torch.full_like(
                input=padded_stop_token_ids,
                dtype=torch.float32,
                fill_value=float("-inf"),
                device=self.orchestrator.device,
            ),
        )[
            :, : self.orchestrator.vocab_size
        ]

        self.len_output_tokens = torch.zeros(
            size=(self.orchestrator.batch_size(), 1),
            dtype=torch.int32,
            device=self.orchestrator.device,
        )

    def _teardown(self):
        del self.min_new_tokens
        del self.stop_token_penalties
        del self.len_output_tokens

        self.min_new_tokens = None
        self.stop_token_penalties = None
        self.len_output_tokens = None

    def _cumulate_input_tokens(self, input_ids: _TokenIDs):
        pass

    def _cumulate_output_tokens(self, output_ids: _TokenIDs):
        self.len_output_tokens += 1

    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
        mask = (self.len_output_tokens < self.min_new_tokens).expand_as(logits)
        logits[mask] += self.stop_token_penalties[mask]
        return logits

    def _filter(
        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
    ):
        self.min_new_tokens = self.min_new_tokens[indices_tensor_to_keep]
        self.stop_token_penalties = self.stop_token_penalties[indices_tensor_to_keep]
        self.len_output_tokens = self.len_output_tokens[indices_tensor_to_keep]

    def _merge(self, their: "BatchedMinNewTokensPenalizer"):
        self.min_new_tokens = torch.cat(
            [self.min_new_tokens, their.min_new_tokens], dim=0
        )
        self.stop_token_penalties = torch.cat(
            [self.stop_token_penalties, their.stop_token_penalties], dim=0
        )
        self.len_output_tokens = torch.cat(
            [self.len_output_tokens, their.len_output_tokens], dim=0
        )


================================================
FILE: archive/ktransformers/server/balance_serve/inference/sampling/penaltylib/penalizers/presence_penalty.py
================================================
import typing

import torch

from ..orchestrator import _BatchedPenalizer, _TokenIDs


class BatchedPresencePenalizer(_BatchedPenalizer):
    """
    Presence penalizer penalizes tokens based on their presence in the output.
    """

    presence_penalties: torch.Tensor = None
    cumulated_presence_penalties: torch.Tensor = None

    def _is_required(self) -> bool:
        return any(
            req.sampling_params.presence_penalty != 0.0
            for req in self.orchestrator.reqs()
        )

    def _prepare(self):
        self.cumulated_presence_penalties = (
            torch.tensor(
                data=[0.0 for _ in self.orchestrator.reqs()],
                dtype=torch.float32,
                device=self.orchestrator.device,
            )
            .unsqueeze_(1)
            .repeat(1, self.orchestrator.vocab_size)
        )

        self.presence_penalties = (
            torch.tensor(
                data=[
                    req.sampling_params.presence_penalty
                    for req in self.orchestrator.reqs()
                ],
                dtype=torch.float32,
                device=self.orchestrator.device,
            )
            .unsqueeze_(1)
            .expand_as(self.cumulated_presence_penalties)
        )

    def _teardown(self):
        del self.presence_penalties
        del self.cumulated_presence_penalties

        self.presence_penalties = None
        self.cumulated_presence_penalties = None

    def _cumulate_input_tokens(self, input_ids: _TokenIDs):
        pass

    def _cumulate_output_tokens(self, output_ids: _TokenIDs):
        mask = output_ids.occurrence_count() > 0
        self.cumulated_presence_penalties[mask] = self.presence_penalties[mask]

    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
        logits -= self.cumulated_presence_penalties
        return logits

    def _filter(
        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
    ):
        self.presence_penalties = self.presence_penalties[indices_tensor_to_keep]
        self.cumulated_presence_penalties = self.cumulated_presence_penalties[
            indices_tensor_to_keep
        ]

    def _merge(self, their: "BatchedPresencePenalizer"):
        self.presence_penalties = torch.cat(
            [self.presence_penalties, their.presence_penalties], dim=0
        )
        self.cumulated_presence_penalties = torch.cat(
            [self.cumulated_presence_penalties, their.cumulated_presence_penalties],
            dim=0,
        )


================================================
FILE: archive/ktransformers/server/balance_serve/inference/sampling/penaltylib/penalizers/repetition_penalty.py
================================================
import typing

import torch

from ..orchestrator import _BatchedPenalizer, _TokenIDs


class BatchedRepetitionPenalizer(_BatchedPenalizer):
    """
    Repetition penalizer penalizes tokens based on their repetition in the input and output.
    """

    repetition_penalties: torch.Tensor = None
    cumulated_repetition_penalties: torch.Tensor = None

    def _is_required(self) -> bool:
        return any(
            req.sampling_params.repetition_penalty != 1.0
            for req in self.orchestrator.reqs()
        )

    def _prepare(self):
        self.cumulated_repetition_penalties = (
            torch.tensor(
                data=[1.0 for _ in self.orchestrator.reqs()],
                dtype=torch.float32,
                device=self.orchestrator.device,
            )
            .unsqueeze_(1)
            .repeat(1, self.orchestrator.vocab_size)
        )

        self.repetition_penalties = (
            torch.tensor(
                data=[
                    req.sampling_params.repetition_penalty
                    for req in self.orchestrator.reqs()
                ],
                dtype=torch.float32,
                device=self.orchestrator.device,
            )
            .unsqueeze_(1)
            .expand_as(self.cumulated_repetition_penalties)
        )

    def _teardown(self):
        del self.repetition_penalties
        del self.cumulated_repetition_penalties

        self.repetition_penalties = None
        self.cumulated_repetition_penalties = None

    def _cumulate_input_tokens(self, input_ids: _TokenIDs):
        mask = input_ids.occurrence_count() > 0
        self.cumulated_repetition_penalties[mask] = self.repetition_penalties[mask]

    def _cumulate_output_tokens(self, output_ids: _TokenIDs):
        mask = output_ids.occurrence_count() > 0
        self.cumulated_repetition_penalties[mask] = self.repetition_penalties[mask]

    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
        return torch.where(
            logits > 0,
            logits / self.cumulated_repetition_penalties,
            logits * self.cumulated_repetition_penalties,
        )

    def _filter(
        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
    ):
        self.repetition_penalties = self.repetition_penalties[indices_tensor_to_keep]
        self.cumulated_repetition_penalties = self.cumulated_repetition_penalties[
            indices_tensor_to_keep
        ]

    def _merge(self, their: "BatchedRepetitionPenalizer"):
        self.repetition_penalties = torch.cat(
            [self.repetition_penalties, their.repetition_penalties], dim=0
        )
        self.cumulated_repetition_penalties = torch.cat(
            [self.cumulated_repetition_penalties, their.cumulated_repetition_penalties],
            dim=0,
        )


================================================
FILE: archive/ktransformers/server/balance_serve/inference/sampling/sampler.py
================================================
'''
Date: 2024-11-14 12:23:45
LastEditors: Xie Weiyu ervinxie@qq.com
LastEditTime: 2024-11-25 08:59:23
'''
import logging
import torch
from torch import nn
from transformers import GenerationConfig

from flashinfer.sampling import (
	min_p_sampling_from_probs,
	top_k_renorm_probs,
	top_k_top_p_sampling_from_logits,
	top_p_renorm_probs,
)

try:
    import torch_npu
    use_torch_npu = torch.npu.is_available()
except:
    use_torch_npu = False
logger = logging.getLogger(__name__)

class SamplingOptions():
	# Batched sampling params
	temperatures: torch.Tensor
	top_ps: torch.Tensor
	top_ks: torch.Tensor
	min_ps: torch.Tensor

	# All requests use greedy sampling
	is_all_greedy: bool

	# Dispatch in CUDA graph
	need_min_p_sampling: bool
	
	def __init__(self, bsz = 1, device = torch.device('cuda'), pretrained_config:GenerationConfig = None, temperatures: torch.Tensor = None, top_ps: torch.Tensor = None):
		if pretrained_config is None and temperatures is None:
			self.temperatures = torch.full((bsz, 1), 0, device=device, dtype=torch.float32)
			self.top_ps = torch.ones((bsz, 1), device=device, dtype=torch.float32)
			self.top_ks = torch.ones((bsz, 1), device=device, dtype=torch.float32)
			self.need_min_p_sampling = False
			self.is_all_greedy = True
		else:
			if temperatures is not None:
				self.temperatures = temperatures.unsqueeze(-1)
			else:
				self.temperatures = torch.full((bsz, 1), pretrained_config.temperature, device=device, dtype=torch.float32)
			
			if top_ps is not None:
				self.top_ps = top_ps.unsqueeze(-1)
			else:	
				self.top_ps = torch.full((bsz, 1), pretrained_config.top_p, device=device, dtype=torch.float32)
			self.top_ks = torch.full((bsz, 1), pretrained_config.top_k, device=device, dtype=torch.float32)
			self.need_min_p_sampling = False
			self.is_all_greedy = False

class Sampler(nn.Module):
	def __init__(self):
		super().__init__()
	
	def forward(
		self,
		logits: torch.Tensor,
		sampling_config: SamplingOptions = None,
	):
		if sampling_config == None:
			sampling_config = SamplingOptions()

		logits = logits.contiguous()
		origin_logits = logits.clone()
		if sampling_config.is_all_greedy or use_torch_npu:
			# Use torch.argmax if all requests use greedy sampling
			probs = torch.softmax(logits, dim=-1)
			batch_next_token_ids = torch.argmax(logits, -1)
		else:
			# Post process logits
			logits.div_(sampling_config.temperatures)
			max_top_k_round, batch_size = 32, logits.shape[0]
			if sampling_config.need_min_p_sampling:
				probs = torch.softmax(logits, dim=-1)
				logits = None
				del logits
				probs = top_k_renorm_probs(probs, sampling_config.top_ks)
				probs = top_p_renorm_probs(probs, sampling_config.top_ps)
				batch_next_token_ids = min_p_sampling_from_probs(
					probs, sampling_config.min_ps
				)
				temperature_0_idx = torch.where(sampling_config.temperatures == 0)[0]
				batch_next_token_ids[temperature_0_idx] = torch.argmax(origin_logits[temperature_0_idx], -1).to(torch.int32)
			else:
				# TODO: use different kernel when don't need top_k or top_p
				# @TODO get probs
				probs = logits
				batch_next_token_ids = top_k_top_p_sampling_from_logits(
					logits,
					sampling_config.top_ks,
					sampling_config.top_ps,
					filter_apply_order="joint",
				)
				temperature_0_idx = torch.where(sampling_config.temperatures == 0)[0]
				batch_next_token_ids[temperature_0_idx] = torch.argmax(origin_logits[temperature_0_idx], -1).to(torch.int32)
			
		return batch_next_token_ids.to(torch.int32), probs

================================================
FILE: archive/ktransformers/server/balance_serve/sched_rpc.py
================================================
from datetime import datetime
import os
from typing import Optional
import zmq
import pickle
import threading
import torch.multiprocessing as mp
import sys
current_file_path = os.path.abspath(__file__)
# sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
import pickle
import argparse
import torch
try:
    import torch_npu
    use_npu = torch.npu.is_available()
except:
    use_npu = False
from ktransformers.server.balance_serve.settings import sched_ext, create_sched_settings, create_sched_settings_qwen2moe, create_sched_settings_qwen3moe, create_sched_settings_glm4moe, create_sched_settings_smallthinker, create_sched_settings_qwen3next


if mp.get_start_method(allow_none=True) is None:
    print('set start method')
    mp.set_start_method('spawn')
else:
    print(f'start method already set to {mp.get_start_method(allow_none=True)}')


class SchedulerServer:
    def __init__(self, settings, main_args):
        # 创建 Scheduler 实例并初始化
        if use_npu:
            for device_id in settings.gpu_device_id:
                torch_npu.npu.set_device(f'npu:{device_id}')
        self.sched = sched_ext.create_scheduler(settings)
    
        # 初始化 ZeroMQ 上下文和套接字
        self.context = zmq.Context()
        self.frontend = self.context.socket(zmq.ROUTER)
        print(f"sched zmq rpc server on port {main_args.sched_port}")
        self.frontend.bind(f"tcp://*:{main_args.sched_port}") 

        # 创建内部的 DEALER 套接字，用于与工作线程通信
        self.backend = self.context.socket(zmq.DEALER)
        self.backend.bind("inproc://backend")

    # 启动调度器
    def run_scheduler(self):
        self.sched.run()

    # 停止调度器
    def stop_scheduler(self):
        self.sched.stop()

    # 处理客户端请求
    def start_proxy(self):
        # 使用 ZMQ 的内置代理，将前端请求分发给后端工作线程
        zmq.proxy(self.frontend, self.backend)

    # 工作线程处理请求
    def worker_routine(self):
        worker = self.context.socket(zmq.REP)
        worker.connect("inproc://backend")
        while True:
            try:
                # 接收客户端请求
                message = worker.recv()
                data = pickle.loads(message)

                method = data.get('method')
                params = data.get('params', {})
                # print(f"Received request: {method}")

                if method == 'add_query':
                    query_add = params.get('query')  # 直接是一个 QueryAdd 对象
                    # 添加查询
                    query_id = self.sched.add_query(query_add)
                    # 发送响应
                    response = {'status': 'ok', 'query_id': query_id}
                    worker.send(pickle.dumps(response))

                elif method == 'cancel_query':
                    query_id = params.get('query_id')
                    # 假设您的 Scheduler 类实现了 cancel 方法
                    self.sched.cancel(query_id)
                    response = {'status': 'ok'}
                    worker.send(pickle.dumps(response))

                elif method == 'update_last_batch':
                    updates = params.get('updates')  # 直接是一个列表，包含 QueryUpdate 对象

                    # 更新最后一个批次
                    batch_todo = self.sched.update_last_batch(updates)

                    # 直接发送 batch_todo 对象
                    response = {'status': 'ok', 'batch_todo': batch_todo}
                    # print (batch_todo.query_lengths, batch_todo.query_ids)
                    worker.send(pickle.dumps(response))

                elif method == 'get_inference_context':
                    inference_context = self.sched.get_inference_context()
                    data = {
                        "k_cache":inference_context.k_cache,
                        "v_cache":inference_context.v_cache
                    }
                    print(f"Serializing KVCache")
                    data["k_cache"] = [mp.reductions.reduce_tensor(t) for t in data['k_cache']]
                    data["v_cache"] = [mp.reductions.reduce_tensor(t) for t in data['v_cache']]
                    # print(data)
                    response = {'status': 'ok', 'inference_context': data}

                    worker.send(pickle.dumps(response))
                    # response['inference_context'].k_cache[0][0, 0, 0, 0, 0] = 1 
                    # print("k_cache update")

                else:
                    # 未知方法
                    response = {'status': 'error', 'message': 'Unknown method'}
                    worker.send(pickle.dumps(response))

            except Exception as e:
                # 处理异常并发送错误响应
                response = {'status': 'error', 'message': str(e)}
                worker.send(pickle.dumps(response))

    # 启动 RPC 服务
    def start_rpc_service(self):
        try:
            print("Scheduler RPC service is running...")

            # 在单独的线程中运行调度器
            threading.Thread(target=self.run_scheduler, daemon=True).start()

            # 启动工作线程
            for _ in range(10):  # 根据需要调整线程数
                threading.Thread(target=self.worker_routine, daemon=True).start()

            # 启动代理，开始监听请求
            self.start_proxy()

        except KeyboardInterrupt:
            print("Shutting down scheduler RPC service...")
            self.stop_rpc_service()

    # 停止 RPC 服务
    def stop_rpc_service(self):
        self.stop_scheduler()
        self.frontend.close()
        self.backend.close()
        self.context.term()

def start_server(settings, main_args):
    server = SchedulerServer(settings, main_args)
    server.start_rpc_service()


# Add async client for webserver
class SchedulerClient:
    def __init__(self, sched_port):
        address=f'tcp://localhost:{sched_port}'
        self.address = address
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REQ)
        self.socket.connect(self.address)
        print(f"Connected to server at {self.address}")
    
    def __del__(self):
        self.socket.close()
        self.context.term()
    
    def send_request(self, method, params=None):
        if params is None:
            params = {}
        request = {
            'method': method,
            'params': params
        }
        # print(f'send request {request}')
        self.socket.send(pickle.dumps(request))
        response = self.socket.recv()
        # print(response)
        response = pickle.loads(response)
        if response.get('status') == 'ok':
            return response
        else:
            raise Exception(f"Error from server: {response.get('message')}")
    
    def add_query(self, query):
        response = self.send_request('add_query', {'query': query})
        return response.get('query_id')
    
    def cancel_query(self, query_id):
        self.send_request('cancel_query', {'query_id': query_id})
    
    def update_last_batch(self, updates):
        response = self.send_request('update_last_batch', {'updates': updates})
        # print(f"update_last_batch response {response}")
        return response.get('batch_todo')
    
    def rebuild_inferece_context(self,response):
        data = response.get('inference_context')
        inference_context = sched_ext.InferenceContext()
        print('Rebuilding kvcache')
        inference_context.k_cache = [fn(*args) for fn,args in data['k_cache']]
        inference_context.v_cache = [fn(*args) for fn,args in data['v_cache']]
        return inference_context

    def get_inference_context_raw(self):
        response = self.send_request('get_inference_context')
        return response
       

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, required=True)
    args = parser.parse_args()
    with open(args.config, "rb") as f:
        main_args = pickle.load(f)
    if main_args.architectures == "Qwen2MoeForCausalLM": 
        settings = create_sched_settings_qwen2moe(main_args)
    elif main_args.architectures == "Qwen3MoeForCausalLM":
        settings = create_sched_settings_qwen3moe(main_args)
    elif main_args.architectures == "Glm4MoeForCausalLM":
        settings = create_sched_settings_glm4moe(main_args)
    elif main_args.architectures == "SmallThinkerForCausalLM":
        settings = create_sched_settings_smallthinker(main_args)
    elif main_args.architectures == "Qwen3NextForCausalLM":
        settings = create_sched_settings_qwen3next(main_args)
    else:
        settings = create_sched_settings(main_args)
    start_server(settings, main_args)


================================================
FILE: archive/ktransformers/server/balance_serve/settings.py
================================================
'''
Date: 2024-11-13 09:43:39
LastEditors: djw
LastEditTime: 2024-11-18 16:41:03
'''
import sys, os
import yaml, json
from time import sleep


import sched_ext
from transformers import AutoConfig

from ktransformers.models.configuration_qwen3_moe import Qwen3MoeConfig
from ktransformers.models.configuration_glm4_moe import Glm4MoeConfig
from ktransformers.models.configuration_smallthinker import SmallthinkerConfig
from ktransformers.models.configuration_qwen3_next import Qwen3NextConfig

def create_sched_settings(args):
    default_sample_options = sched_ext.SampleOptions()
    model_name = os.path.basename(os.path.normpath(args.model_dir))
    input_model_settings = sched_ext.ModelSettings()
    input_model_settings.model_path = args.model_dir
    input_model_settings.params_count = int(0)
    model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
    input_model_settings.layer_count = model_config.num_hidden_layers
    input_model_settings.num_k_heads = 1 # model_config["num_key_value_heads"]
    input_model_settings.k_head_dim = 576
    input_model_settings.bytes_per_params = 2
    input_model_settings.bytes_per_kv_cache_element = 2
    settings = sched_ext.Settings()
    settings.model_name = model_name
    settings.quant_type = "BF16"
    settings.model_settings = input_model_settings
    settings.page_size = args.page_size
    settings.gpu_device_count = args.tp # only full tp supported now
    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
    # settings.gpu_memory_size = args.cache_lens*576*2
    settings.gpu_memory_size = args.gpu_memory_size
    settings.memory_utilization_percentage = args.utilization_percentage
    max_batch_size = args.max_batch_size
    chunk_size = args.chunk_size

    max_decode_batch_size = max_batch_size - 2

    settings.max_batch_size = max_batch_size
    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
    settings.sample_options = default_sample_options
    settings.sched_metrics_port = args.sched_metrics_port
    settings.gpu_only = args.memory_gpu_only
    settings.use_self_defined_head_dim = True
    settings.self_defined_head_dim = 576
    settings.full_kv_cache_on_each_gpu = True
    settings.k_cache_on = True
    settings.v_cache_on = False

    settings.kvc2_root_path = args.kvc2_disk_path
    settings.kvc2_config_path = args.kvc2_config_dir
    settings.memory_pool_size_GB = args.cpu_memory_size_GB
    settings.evict_count = 40
    settings.kvc2_metrics_port = args.kvc2_metrics_port
    settings.load_from_disk = False
    settings.save_to_disk = True

    settings.strategy_name = args.sched_strategy

    settings.auto_derive()
    return settings


def create_sched_settings_qwen2moe(args):
    default_sample_options = sched_ext.SampleOptions()
    model_name = os.path.basename(os.path.normpath(args.model_dir))
    input_model_settings = sched_ext.ModelSettings()
    input_model_settings.model_path = args.model_dir
    input_model_settings.params_count = int(0)
    model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
    input_model_settings.layer_count = model_config.num_hidden_layers
    input_model_settings.num_k_heads = model_config.num_key_value_heads # model_config["num_key_value_heads"]
    input_model_settings.k_head_dim = 128
    input_model_settings.bytes_per_params = 2
    input_model_settings.bytes_per_kv_cache_element = 2
    settings = sched_ext.Settings()
    settings.model_name = model_name
    settings.quant_type = "BF16"
    settings.model_settings = input_model_settings
    settings.page_size = args.page_size
    settings.gpu_device_count = 1 # tp
    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
    # settings.gpu_memory_size = args.cache_lens*576*2
    settings.gpu_memory_size = args.gpu_memory_size
    settings.memory_utilization_percentage = args.utilization_percentage
    max_batch_size = args.max_batch_size
    chunk_size = args.chunk_size

    max_decode_batch_size = max_batch_size - 2

    settings.max_batch_size = max_batch_size
    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
    settings.sample_options = default_sample_options
    settings.sched_metrics_port = args.sched_metrics_port
    settings.gpu_only = args.memory_gpu_only
    settings.use_self_defined_head_dim = False
    settings.self_defined_head_dim = 576
    settings.full_kv_cache_on_each_gpu = True
    settings.k_cache_on = True
    settings.v_cache_on = True

    settings.kvc2_root_path = args.kvc2_disk_path
    settings.kvc2_config_path = args.kvc2_config_dir
    settings.memory_pool_size_GB = args.cpu_memory_size_GB
    settings.evict_count = 40
    settings.kvc2_metrics_port = args.kvc2_metrics_port
    settings.load_from_disk = False
    settings.save_to_disk = True


    settings.strategy_name = args.sched_strategy

    settings.auto_derive()
    return settings


def create_sched_settings_qwen3moe(args):
    default_sample_options = sched_ext.SampleOptions()
    model_name = os.path.basename(os.path.normpath(args.model_dir))
    input_model_settings = sched_ext.ModelSettings()
    input_model_settings.model_path = args.model_dir
    input_model_settings.params_count = int(0)
    model_config = Qwen3MoeConfig.from_pretrained(args.model_dir, trust_remote_code=True)
    input_model_settings.layer_count = model_config.num_hidden_layers
    input_model_settings.num_k_heads = model_config.num_key_value_heads # model_config["num_key_value_heads"]
    input_model_settings.k_head_dim = 128
    input_model_settings.bytes_per_params = 2
    input_model_settings.bytes_per_kv_cache_element = 2
    settings = sched_ext.Settings()
    settings.model_name = model_name
    settings.quant_type = "BF16"
    settings.model_settings = input_model_settings
    settings.page_size = args.page_size
    settings.gpu_device_count = 1 # tp
    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
    # settings.gpu_memory_size = args.cache_lens*576*2
    settings.gpu_memory_size = args.gpu_memory_size
    settings.memory_utilization_percentage = args.utilization_percentage
    max_batch_size = args.max_batch_size
    chunk_size = args.chunk_size

    max_decode_batch_size = max_batch_size - 2

    settings.max_batch_size = max_batch_size
    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
    settings.sample_options = default_sample_options
    settings.sched_metrics_port = args.sched_metrics_port
    settings.gpu_only = args.memory_gpu_only
    settings.use_self_defined_head_dim = False
    settings.self_defined_head_dim = 576
    settings.full_kv_cache_on_each_gpu = True
    settings.k_cache_on = True
    settings.v_cache_on = True

    settings.kvc2_root_path = args.kvc2_disk_path
    settings.kvc2_config_path = args.kvc2_config_dir
    settings.memory_pool_size_GB = args.cpu_memory_size_GB
    settings.evict_count = 40
    settings.kvc2_metrics_port = args.kvc2_metrics_port
    settings.load_from_disk = False
    settings.save_to_disk = True


    settings.strategy_name = args.sched_strategy

    settings.auto_derive()
    return settings

def create_sched_settings_glm4moe(args):
    default_sample_options = sched_ext.SampleOptions()
    model_name = os.path.basename(os.path.normpath(args.model_dir))
    input_model_settings = sched_ext.ModelSettings()
    input_model_settings.model_path = args.model_dir
    input_model_settings.params_count = int(0)
    model_config = Glm4MoeConfig.from_pretrained(args.model_dir, trust_remote_code=True)
    input_model_settings.layer_count = model_config.num_hidden_layers
    input_model_settings.num_k_heads = model_config.num_key_value_heads # model_config["num_key_value_heads"]
    input_model_settings.k_head_dim = 128
    input_model_settings.bytes_per_params = 2
    input_model_settings.bytes_per_kv_cache_element = 2
    settings = sched_ext.Settings()
    settings.model_name = model_name
    settings.quant_type = "BF16"
    settings.model_settings = input_model_settings
    settings.page_size = args.page_size
    settings.gpu_device_count = 1 # tp
    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
    # settings.gpu_memory_size = args.cache_lens*576*2
    settings.gpu_memory_size = args.gpu_memory_size
    settings.memory_utilization_percentage = args.utilization_percentage
    max_batch_size = args.max_batch_size
    chunk_size = args.chunk_size

    max_decode_batch_size = max_batch_size - 2

    settings.max_batch_size = max_batch_size
    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
    settings.sample_options = default_sample_options
    settings.sched_metrics_port = args.sched_metrics_port
    settings.gpu_only = args.memory_gpu_only
    settings.use_self_defined_head_dim = False
    settings.self_defined_head_dim = 576
    settings.full_kv_cache_on_each_gpu = True
    settings.k_cache_on = True
    settings.v_cache_on = True

    settings.kvc2_root_path = args.kvc2_disk_path
    settings.kvc2_config_path = args.kvc2_config_dir
    settings.memory_pool_size_GB = args.cpu_memory_size_GB
    settings.evict_count = 40
    settings.kvc2_metrics_port = args.kvc2_metrics_port
    settings.load_from_disk = False
    settings.save_to_disk = True


    settings.strategy_name = args.sched_strategy

    settings.auto_derive()
    return settings

def create_sched_settings_smallthinker(args):
    default_sample_options = sched_ext.SampleOptions()
    model_name = os.path.basename(os.path.normpath(args.model_dir))
    input_model_settings = sched_ext.ModelSettings()
    input_model_settings.model_path = args.model_dir
    input_model_settings.params_count = int(0)
    model_config = SmallthinkerConfig.from_pretrained(args.model_dir, trust_remote_code=True)
    input_model_settings.layer_count = model_config.num_hidden_layers
    input_model_settings.num_k_heads = model_config.num_key_value_heads # model_config["num_key_value_heads"]
    input_model_settings.k_head_dim = 128
    input_model_settings.bytes_per_params = 2
    input_model_settings.bytes_per_kv_cache_element = 2
    settings = sched_ext.Settings()
    settings.model_name = model_name
    settings.quant_type = "BF16"
    settings.model_settings = input_model_settings
    settings.page_size = args.page_size
    settings.gpu_device_count = 1 # tp
    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
    # settings.gpu_memory_size = args.cache_lens*576*2
    settings.gpu_memory_size = args.gpu_memory_size
    settings.memory_utilization_percentage = args.utilization_percentage
    max_batch_size = args.max_batch_size
    chunk_size = args.chunk_size

    max_decode_batch_size = max_batch_size - 2

    settings.max_batch_size = max_batch_size
    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
    settings.sample_options = default_sample_options
    settings.sched_metrics_port = args.sched_metrics_port
    settings.gpu_only = args.memory_gpu_only
    settings.use_self_defined_head_dim = False
    settings.self_defined_head_dim = 576
    settings.full_kv_cache_on_each_gpu = True
    settings.k_cache_on = True
    settings.v_cache_on = True

    settings.kvc2_root_path = args.kvc2_disk_path
    settings.kvc2_config_path = args.kvc2_config_dir
    settings.memory_pool_size_GB = args.cpu_memory_size_GB
    settings.evict_count = 40
    settings.kvc2_metrics_port = args.kvc2_metrics_port
    settings.load_from_disk = False
    settings.save_to_disk = True


    settings.strategy_name = args.sched_strategy

    settings.auto_derive()
    return settings

def create_sched_settings_qwen3next(args):
    default_sample_options = sched_ext.SampleOptions()
    model_name = os.path.basename(os.path.normpath(args.model_dir))
    input_model_settings = sched_ext.ModelSettings()
    input_model_settings.model_path = args.model_dir
    input_model_settings.params_count = int(0)
    model_config = Qwen3NextConfig.from_pretrained(args.model_dir, trust_remote_code=True)
    input_model_settings.layer_count = model_config.num_hidden_layers
    input_model_settings.num_k_heads = model_config.num_key_value_heads # model_config["num_key_value_heads"]
    input_model_settings.k_head_dim = 256
    input_model_settings.bytes_per_params = 2
    input_model_settings.bytes_per_kv_cache_element = 2
    settings = sched_ext.Settings()
    settings.model_name = model_name
    settings.quant_type = "BF16"
    settings.model_settings = input_model_settings
    settings.page_size = args.page_size
    settings.gpu_device_count = 1 # tp
    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
    # settings.gpu_memory_size = args.cache_lens*576*2
    settings.gpu_memory_size = args.gpu_memory_size
    settings.memory_utilization_percentage = args.utilization_percentage
    max_batch_size = args.max_batch_size
    chunk_size = args.chunk_size

    max_decode_batch_size = max_batch_size - 2

    settings.max_batch_size = max_batch_size
    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
    settings.sample_options = default_sample_options
    settings.sched_metrics_port = args.sched_metrics_port
    settings.gpu_only = args.memory_gpu_only
    settings.use_self_defined_head_dim = False
    settings.self_defined_head_dim = 576
    settings.full_kv_cache_on_each_gpu = True
    settings.k_cache_on = True
    settings.v_cache_on = True

    settings.kvc2_root_path = args.kvc2_disk_path
    settings.kvc2_config_path = args.kvc2_config_dir
    settings.memory_pool_size_GB = args.cpu_memory_size_GB
    settings.evict_count = 40
    settings.kvc2_metrics_port = args.kvc2_metrics_port
    settings.load_from_disk = False
    settings.save_to_disk = True


    settings.strategy_name = args.sched_strategy

    settings.auto_derive()
    return settings

================================================
FILE: archive/ktransformers/server/config/config.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : unicornchan
Date         : 2024-06-11 16:35:42
Version      : 1.0.0
LastEditors  : WuHao
LastEditTime : 2024-08-12 06:31:14
"""
import os
import shutil
import yaml
import psutil

from ktransformers.server.config.singleton import Singleton
from typing import Optional


class Config(metaclass=Singleton):
    """Singleton pattern Config class, used to get all configurations."""

    CONFIG_FILE_NAME = "config.yaml"

    @staticmethod
    def load() -> dict:
        """load config file

        Returns:
            dict: all configs
        """
        base_path: str = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        config_yaml: str = os.path.join(base_path, "configs", Config.CONFIG_FILE_NAME)

        user_path: str = os.path.expanduser("~")
        localstore_path: str = os.path.join(user_path, ".ktransformers")
        kvc2_config_dir = os.path.join(localstore_path, "kvc2")
        config_path: str = os.path.join(localstore_path, Config.CONFIG_FILE_NAME)
        if not os.path.exists(config_yaml):
            print(f"Can't find config file, {config_yaml}")
            exit(-1)
        if not os.path.exists(localstore_path):
            os.mkdir(localstore_path)
        if not os.path.exists(kvc2_config_dir):
            os.mkdir(kvc2_config_dir)
        if not os.path.exists(config_path):
            shutil.copyfile(config_yaml, config_path)
        with open(config_path, "r", encoding="utf-8") as fp:
            config = yaml.safe_load(fp)
        return config

    @staticmethod
    def to_path(path: str) -> str:
        """
        process file path
        """
        base_path = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        real_path = path if os.path.isabs(path) else os.path.join(base_path, path)
        return real_path

    def __init__(self):
        cfg = Config.load()
        self.base_path = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        self.user_path: str = os.path.expanduser("~")
        self.localstore_path: str = os.path.join(self.user_path, ".ktransformers")
        # log configs
        self.log_dir = os.path.join(self.localstore_path, cfg["log"]["dir"])
        if not os.path.exists(self.log_dir):
            os.mkdir(self.log_dir)
        self.log_file = cfg["log"]["file"]
        self.log_level = cfg["log"]["level"]
        self.backup_count = cfg["log"]["backup_count"]

        self.kvc2_config_dir = os.path.join(self.localstore_path, "kvc2")
        # server configs
        self.server: dict = cfg.get("server", {})
        self.server_ip = self.server.get("ip", "0.0.0.0")
        self.server_port = self.server.get("port", 9016)
        self.api_key = self.server.get("api_key", "")

        # db configs
        self.db_configs: dict = cfg.get("db", {})
        self.db_type = self.db_configs.get("type", "")
        self.db_host = self.localstore_path
        self.db_port = self.db_configs.get("port", "")
        self.db_name = self.db_configs.get("database", "")
        self.db_pool_size = self.db_configs.get("pool_size")
        self.db_database = self.db_configs.get("database", "")

        # user config
        self.user_config: dict = cfg.get("user", {})
        self.user_secret_key = self.user_config.get("secret_key", "")
        self.user_algorithm = self.user_config.get("algorithm", "")
        self.user_force_think = self.user_config.get("force_think", False)

        # model config
        self.model: dict = cfg.get("model", {})
        self.backend_type: str = self.model.get("type", "transformers")
        self.model_dir: str = self.model.get("path", "")
        # to make sure it consistent with previous version
        self.model_path: str = self.model_dir
        self.model_name: str = self.model.get("name", "")
        self.architectures: str = self.model.get("name", "")
        self.model_device: str = self.model.get("device", "cuda:0")
        self.gguf_path: Optional[str] = self.model.get("gguf_path", None)
        self.use_cuda_graph = self.model.get("use_cuda_graph", True)
        self.trust_remote_code = self.model.get("trust_remote_code", True)
        # self.model_cache_lens = self.model.get("cache_lens")
        self.optimize_config_path: Optional[str] = self.model.get(
            "optimize_config_path", None
        )
        
        self.max_new_tokens = self.model.get("max_new_tokens", 2000)
        self.json_mode = self.model.get("json_mode", False)
        self.healing = self.model.get("healing", False)
        self.ban_strings: Optional[list] = self.model.get("ban_strings", None)
        self.gpu_split: Optional[str] = self.model.get("gpu_split", None)
        self.length: Optional[int] = self.model.get("length", None)
        self.rope_scale: Optional[float] = self.model.get("rope_scale", None)
        self.rope_alpha: Optional[float] = self.model.get("rope_alpha", None)
        self.no_flash_attn = self.model.get("no_flash_attn", False)
        self.low_mem = self.model.get("low_mem", False)
        self.experts_per_token: Optional[int] = self.model.get("experts_per_token", None)
        self.load_q4 = self.model.get("load_q4", False)
        self.fast_safetensors = self.model.get("fast_safetensors", False)
        self.draft_model_dir: Optional[str] = self.model.get("draft_model_dir", None)
        self.no_draft_scale = self.model.get("no_draft_scale", False)
        self.modes = self.model.get("modes", False)
        self.mode = self.model.get("mode", "llama")
        self.username = self.model.get("username", "User")
        self.botname = self.model.get("botname", "Chatbort")
        self.system_prompt: Optional[str] = self.model.get("system_prompt", None)
        self.temperature = self.model.get("temperature", 0.95)
        self.smoothing_factor = self.model.get("smoothing_factor", 0.0)
        self.dynamic_temperature: Optional[str] = self.model.get("dynamic_temperature", None)
        self.top_k = self.model.get("top_k", 50)
        self.top_p = self.model.get("top_p", 0.8)
        self.top_a = self.model.get("top_a", 0.0)
        self.skew = self.model.get("skew", 0.0)
        self.typical = self.model.get("typical", 0.0)
        self.repetition_penalty = self.model.get("repetition_penalty", 1.01)
        self.frequency_penalty = self.model.get("frequency_penalty", 0.0)
        self.presence_penalty = self.model.get("presence_penalty", 0.0)
        self.response_chunk = self.model.get("response_chunk", 250)
        self.no_code_formatting = self.model.get("no_code_formatting", False)
        self.cache_8bit = self.model.get("cache_8bit", False)
        self.cache_q4 = self.model.get("cache_q4", True)
        self.ngram_decoding = self.model.get("ngram_decoding", False)
        self.print_timings = self.model.get("print_timings", False)
        self.amnesia = self.model.get("amnesia", False)
        self.batch_size = self.model.get("batch_size", 1)
        self.cache_lens = self.model.get("cache_lens", 4096)
        self.device = self.model.get("device", "cuda:2")

        # web config
        self.web: dict = cfg.get("web", {})
        self.web_cross_domain: bool = self.web.get("open_cross_domain", True)
        self.mount_web: bool = self.web.get("mount", False)

        # ext
        self.ext: dict = cfg.get("ext", {})
        self.cpu_infer = psutil.cpu_count(logical=False) - 3

        # file config
        self.local_store_configs: dict = cfg.get("local_store", {})
        self.file_upload_dir: str = os.path.join(
            self.localstore_path, self.local_store_configs.get("file_upload_dir", "")
        )
        self.assistant_store_dir: str = os.path.join(
            self.localstore_path, self.local_store_configs.get("assistant_store_dir", "")
        )

        # long context config
        self.long_context_config: dict = cfg.get("long_context", {})
        self.max_seq_len = self.long_context_config.get("max_seq_len", 32000)
        self.block_size = self.long_context_config.get("block_size", 128)
        self.local_windows_len = self.long_context_config.get("local_windows_len", 4096)
        self.second_select_num = self.long_context_config.get("second_select_num", 32)
        self.anchor_type = self.long_context_config.get("anchor_type", "DYNAMIC")
        self.kv_type = self.long_context_config.get("kv_type", "FP16")
        self.dense_layer_num = self.long_context_config.get("dense_layer_num", 2)
        self.anchor_num = self.long_context_config.get("anchor_num", 1)
        self.preselect_block = self.long_context_config.get("preselect_block", True)
        self.head_select_mode = self.long_context_config.get("head_select_mode", "SHARED")
        self.preselect_block_count = self.long_context_config.get("preselect_block_count", 32)
        self.layer_step = self.long_context_config.get("layer_step", 1)
        self.token_step = self.long_context_config.get("token_step", 100)

        # local chat
        self.local_chat_config: dict = cfg.get("local_chat", {})
        self.prompt_file = self.local_chat_config.get("prompt_file", None)

        # asyncserver
        self.sched_strategy = cfg["async_server"]["sched_strategy"]
        self.sched_port = cfg["async_server"]["sched_port"]
        self.sched_metrics_port = cfg["async_server"]["sched_metrics_port"]
        self.kvc2_metrics_port = cfg["async_server"]["kvc2_metrics_port"]
        self.max_batch_size = cfg["async_server"]["max_batch_size"]
        self.page_size = cfg["attn"]["page_size"]
        self.chunk_size = cfg["attn"]["chunk_size"]
        self.memory_gpu_only = cfg["kvc2"]["gpu_only"]
        self.cache_lens = ((self.cache_lens + self.page_size - 1) // self.page_size) * self.page_size
        self.gpu_memory_size = 2*576*61*self.cache_lens
        self.utilization_percentage = 1.0 #cfg["kvc2"]["utilization_percentage"]
        self.cpu_memory_size_GB = cfg["kvc2"]["cpu_memory_size_GB"]
        self.kvc2_disk_path = cfg["kvc2"]["disk_path"]
        # only support 2 prefill task
        self.max_prefill_batch_size = 2
        self.max_decode_batch_size = self.max_batch_size - self.max_prefill_batch_size 


================================================
FILE: archive/ktransformers/server/config/log.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : unicornchan
Date         : 2024-06-12 02:48:39
Version      : 1.0.0
LastEditors  : chenxl 
LastEditTime : 2024-07-27 01:55:50
'''

import codecs
import logging
import os
import re
import locale
from pathlib import Path
from logging.handlers import BaseRotatingHandler
import time
import colorlog

from ktransformers.server.config.config import Config


class DailyRotatingFileHandler(BaseRotatingHandler):
    """
    such as 'logging.TimeRotatingFileHandler', Additional features:
     - support multiprocess
     - support rotating daily
    """

    def __init__(self, filename, backupCount=0, encoding=None, delay=False, utc=False, **kwargs): # pylint: disable=unused-argument
        self.backup_count = backupCount
        self.utc = utc
        self.suffix = "%Y-%m-%d"
        self.base_log_path = Path(filename)
        if not os.path.exists(self.base_log_path.parent):
            os.makedirs(self.base_log_path.parent)
        self.base_filename = self.base_log_path.name
        self.current_filename = self._compute_fn()
        self.current_log_path = self.base_log_path.with_name(
            self.current_filename)
        BaseRotatingHandler.__init__(self, filename, 'a', encoding, delay)

    # pylint: disable=unused-argument, invalid-name
    def shouldRollover(self, record):
        """
        Determine whether to rotate the log. If the log filename corresponding to the current 
        time is not consistent with the currently opened log filename, then it is necessary
        to rotate the log
        Args:
            record: record is not used, as we are just comparing times, but it is needed so
        the method signatures are the same
        """
        if self.current_filename != self._compute_fn():
            return True
        return False

    def doRollover(self):
        """
        roll over
        """
        # close last log file
        if self.stream:
            self.stream.close()
            self.stream = None  # type: ignore

        # gen new log file name
        self.current_filename = self._compute_fn()
        self.current_log_path = self.base_log_path.with_name(
            self.current_filename)

        if not self.delay:
            self.stream = self._open() # type: ignore

        self.delete_expired_files()

    def _compute_fn(self):
        """
        gen log file name
        """
        return self.base_filename + "." + time.strftime(self.suffix, time.localtime())

    def _open(self):
        """
        open a new log file, create soft link
        """
        if self.encoding is None:
            stream = open(str(self.current_log_path), self.mode, encoding=locale.getpreferredencoding())
        else:
            stream = codecs.open(str(self.current_log_path), self.mode, self.encoding)

        if self.base_log_path.exists():
            try:
                if not self.base_log_path.is_symlink() or os.readlink(self.base_log_path) != self.current_filename:
                    os.remove(self.base_log_path)
            except OSError:
                pass

        try:
            os.symlink(self.current_filename, str(self.base_log_path))
        except OSError:
            pass
        return stream

    def delete_expired_files(self):
        """
        delete expired files every day
        """
        if self.backup_count <= 0:
            return

        file_names = os.listdir(str(self.base_log_path.parent))
        result = []
        prefix = self.base_filename + "."
        plen = len(prefix)
        for file_name in file_names:
            if file_name[:plen] == prefix:
                suffix = file_name[plen:]
                if re.match(r"^\d{4}-\d{2}-\d{2}(\.\w+)?$", suffix):
                    result.append(file_name)
        if len(result) < self.backup_count:
            result = []
        else:
            result.sort()
            result = result[:len(result) - self.backup_count]

        for file_name in result:
            os.remove(str(self.base_log_path.with_name(file_name)))


class Logger(object):
    """
    logger class
    """
    level_relations = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warn': logging.WARNING,
        'error': logging.ERROR,
        'crit': logging.CRITICAL
    }

    def __init__(self, level: str = 'info'):
        fmt = '%(asctime)s %(levelname)s %(pathname)s[%(lineno)d] %(funcName)s: %(message)s'
        cfg: Config = Config()
        filename: str = os.path.join(cfg.log_dir, cfg.log_file)
        backup_count: int = cfg.backup_count
        th = DailyRotatingFileHandler(filename=filename, when='MIDNIGHT', backupCount=backup_count, encoding="utf-8")
        th.setFormatter(logging.Formatter(fmt))


        color_fmt = (
            '%(log_color)s%(asctime)s %(levelname)s %(pathname)s[%(lineno)d]: %(message)s'
        )
        color_formatter = colorlog.ColoredFormatter(
            color_fmt,
            log_colors={
                'DEBUG': 'cyan',
                'INFO': 'green',
                'WARNING': 'yellow',
                'ERROR': 'red',
                'CRITICAL': 'bold_red'
            }
        )

        sh = logging.StreamHandler()
        sh.setFormatter(color_formatter)

        self.logger = logging.getLogger(filename)
        self.logger.setLevel(self.level_relations.get(level)) # type: ignore
        self.logger.addHandler(th)
        self.logger.addHandler(sh)


logger = Logger(level=Config().log_level).logger


================================================
FILE: archive/ktransformers/server/config/singleton.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  : Implement singleton
Author       : unicornchan
Date         : 2024-06-11 17:08:36
Version      : 1.0.0
LastEditors  : chenxl 
LastEditTime : 2024-07-27 01:55:56
'''
import abc

class Singleton(abc.ABCMeta, type):
    """_summary_

    Args:
        abc.ABCMeta: Provide a mechanism for defining abstract methods and properties,
            enforcing subclasses to implement these methods and properties.
        type: Inherit from 'type' to make 'Singleton' a metaclass,
            enabling the implementation of the Singleton
    """
    _instances = {}

    def __call__(cls, *args, **kwds):
        if cls not in cls._instances:
            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwds)
        return cls._instances[cls]

class AbstractSingleton(abc.ABC, metaclass=Singleton):
    """Provided an abstract Singleton base class, any class inheriting from
       this base class will automatically become a Singleton class.

    Args:
        abc.ABC: Abstract base class, it cannot be instantiated, only inherited. 
    """


================================================
FILE: archive/ktransformers/server/crud/__init__.py
================================================


================================================
FILE: archive/ktransformers/server/crud/assistants/__init__.py
================================================


================================================
FILE: archive/ktransformers/server/crud/assistants/assistants.py
================================================
from time import time
from typing import Optional,List
from uuid import uuid4

from ktransformers.server.models.assistants.assistants import Assistant
from ktransformers.server.schemas.assistants.assistants import AssistantCreate,AssistantObject,AssistantModify
from ktransformers.server.utils.sql_utils import SQLUtil
from ktransformers.server.config.log import logger
from ktransformers.server.schemas.base import Order


class AssistantDatabaseManager:
    def __init__(self) -> None:
        self.sql_util = SQLUtil()

    def create_assistant_object(self, assistant: AssistantCreate) -> AssistantObject:
        assistant = AssistantObject(
            **assistant.model_dump(mode='json'),
            id=str(uuid4()),
            object='assistant',
            created_at=int(time()),
        )
        return assistant

    def db_count_assistants(self) -> int:
        with self.sql_util.get_db() as db:
            return db.query(Assistant).count()

    def db_create_assistant(self, assistant: AssistantCreate):
        ass_obj = self.create_assistant_object(assistant)
        ass_obj.sync_db()
        return ass_obj

    def db_list_assistants(self, limit: Optional[int], order: Order) -> List[AssistantObject]:
        with self.sql_util.get_db() as db:
            query = db.query(Assistant).order_by(
                order.to_sqlalchemy_order()(Assistant.created_at))
            if limit is not None:
                db_assistants = query.limit(limit)
            else:
                db_assistants = query.all()
            return [AssistantObject.model_validate(a.__dict__) for a in db_assistants]

    def db_get_assistant_by_id(self, assistant_id: str) -> Optional[AssistantObject]:
        with self.sql_util.get_db() as db:
            db_assistant = db.query(Assistant).filter(
                Assistant.id == assistant_id).first()
            if db_assistant is None:
                logger.debug(f"no assistant with id {str}")
                return None
            return AssistantObject.model_validate(db_assistant.__dict__)

    def db_update_assistant_by_id(self, assistant_id: str, assistant: AssistantModify):
        with self.sql_util.get_db() as db:
            db_assistant = db.query(Assistant).filter(
                Assistant.id == assistant_id).first()
            self.sql_util.db_update_commit_refresh(db, db_assistant, assistant)
            return AssistantObject.model_validate(db_assistant.__dict__)

    def db_delete_assistant_by_id(self, assistant_id: str):
        with self.sql_util.get_db() as db:
            db_assistant = db.query(Assistant).filter(
                Assistant.id == assistant_id).first()
            db.delete(db_assistant)
            db.commit()


================================================
FILE: archive/ktransformers/server/crud/assistants/messages.py
================================================
from time import time
from typing import Optional
from uuid import uuid4

from ktransformers.server.models.assistants.messages import Message
from ktransformers.server.schemas.assistants.messages import MessageCore, MessageCreate,  MessageObject
from ktransformers.server.schemas.base import Order,ObjectID
from ktransformers.server.utils.sql_utils import SQLUtil

class MessageDatabaseManager:
    def __init__(self) -> None:
        self.sql_util = SQLUtil()

    @staticmethod
    def create_db_message_by_core(message: MessageCore):
        message_dict = message.model_dump(mode="json")
        return Message(**message_dict, id=str(uuid4()), created_at=int(time()))

    def create_db_message(self, message: MessageCreate):
        return MessageDatabaseManager.create_db_message_by_core(message.to_core())

    def db_add_message(self, message: Message):
        with self.sql_util.get_db() as db:
            db.add(message)
            self.sql_util.db_add_commit_refresh(db, message)

    def db_create_message(self, thread_id: str, message: MessageCreate, status: MessageObject.Status):
        db_message = self.create_db_message(message)
        db_message.status = status.value
        db_message.thread_id = thread_id
        self.db_add_message(db_message)
        return MessageObject.model_validate(db_message.__dict__)

    @staticmethod
    def create_message_object(thread_id: ObjectID, run_id: ObjectID, message: MessageCreate):
        core = message.to_core()
        return MessageObject(
            **core.model_dump(mode='json'),
            id=str(uuid4()),
            object='thread.message',
            created_at=int(time()),
            thread_id=thread_id,
            run_id=run_id,
            status=MessageObject.Status.in_progress,
        )

    def db_sync_message(self, message: MessageObject):
        db_message = Message(
            **message.model_dump(mode="json"),
        )
        with self.sql_util.get_db() as db:
            self.sql_util.db_merge_commit(db, db_message)

    def db_list_messages_of_thread(
            self, thread_id: str, limit: Optional[int] = None, order: Order = Order.DESC):

        # logger.debug(
        #     f"list messages of: {thread_id}, limit {limit}, order {order}")
        with self.sql_util.get_db() as db:
            query = (
                db.query(Message)
                .filter(Message.thread_id == thread_id)
                .order_by(order.to_sqlalchemy_order()(Message.created_at))
            )
            if limit is not None:
                messages = query.limit(limit)
            else:
                messages = query.all()
            message_list = [MessageObject.model_validate(m.__dict__) for m in messages]
        return message_list

    def db_get_message_by_id(self, thread_id: ObjectID, message_id: ObjectID) -> MessageObject:
        with self.sql_util.get_db() as db:
            message = db.query(Message).filter(
                Message.id == message_id).first()
        assert message.thread_id == thread_id
        message_info = MessageObject.model_validate(message.__dict__)
        return message_info

    def db_delete_message_by_id(self, thread_id: ObjectID, message_id: ObjectID):
        with self.sql_util.get_db() as db:
            message = db.query(Message).filter(
                Message.id == message_id).first()
            assert message.thread_id == thread_id
            db.delete(message)
            db.commit()


================================================
FILE: archive/ktransformers/server/crud/assistants/runs.py
================================================
from time import time
from uuid import uuid4

from ktransformers.server.models.assistants.runs import Run
from ktransformers.server.schemas.assistants.runs import RunCreate,RunObject
from ktransformers.server.schemas.base import ObjectID
from ktransformers.server.utils.sql_utils import SQLUtil


class RunsDatabaseManager:
    def __init__(self) -> None:
        self.sql_util = SQLUtil()

    def create_run_object(self, thread_id: ObjectID, run: RunCreate) -> RunObject:
        run_obj = RunObject(
            **run.model_dump(mode='json', exclude={"stream"}),
            id=str(uuid4()),
            object='run',
            created_at=int(time()),
            thread_id=thread_id,
            status=RunObject.Status.queued,
        )
        run_obj.set_compute_save(0)
        return run_obj

    def db_create_run(self, thread_id: str, run: RunCreate):
        db_run = Run(
            **run.model_dump(mode="json", exclude={"stream"}),
            id=str(uuid4()),
            created_at=int(time()),
            status="queued",
            thread_id=thread_id,
        )
        with self.sql_util.get_db() as db:
            self.sql_util.db_add_commit_refresh(db, db_run)
            run_obj = RunObject.model_validate(db_run.__dict__)
            run_obj.set_compute_save(0)
        return run_obj

    def db_sync_run(self, run: RunObject) -> None:
        db_run = Run(
            **run.model_dump(mode='json'),
        )
        with self.sql_util.get_db() as db:
            self.sql_util.db_merge_commit(db, db_run)

    def db_get_run(self, run_id: ObjectID) -> RunObject:
        with self.sql_util.get_db() as db:
            db_run = db.query(Run).filter(Run.id == run_id).first()
            return RunObject.model_validate(db_run.__dict__)


================================================
FILE: archive/ktransformers/server/crud/assistants/threads.py
================================================
from time import time
from typing import Optional,List
from uuid import uuid4

from ktransformers.server.models.assistants.messages import Message
from ktransformers.server.models.assistants.threads import Thread
from ktransformers.server.schemas.assistants.threads import ThreadCreate,ThreadObject
from ktransformers.server.schemas.base import ObjectID, Order
from ktransformers.server.schemas.conversation import ThreadPreview
from ktransformers.server.utils.sql_utils import SQLUtil
from ktransformers.server.crud.assistants.messages import MessageDatabaseManager
from ktransformers.server.config.log import logger
from ktransformers.server.crud.assistants.assistants import AssistantDatabaseManager

class ThreadsDatabaseManager:
    def __init__(self) -> None:
        self.sql_util = SQLUtil()
        self.message_manager = MessageDatabaseManager()
        self.assistant_maanager = AssistantDatabaseManager()

    def db_create_thread(self, thread: ThreadCreate):
        thread_id = str(uuid4())
        db_messages = []
        with self.sql_util.get_db() as db:
            if thread.messages is not None:
                logger.debug("Creating messages first for thread")
                for message in thread.messages:
                    db_message: Message = MessageDatabaseManager.create_db_message_by_core(
                        message)
                    db_message.role = "user"
                    db_message.thread_id = thread_id
                    db.add(db_message)
                    db_messages.append(db_message)

            db_thread = Thread(
                **thread.model_dump(exclude="messages"),
                id=str(uuid4()),
                created_at=int(time()),
                messages=db_messages,
            )

            self.sql_util.db_add_commit_refresh(db, db_thread)
            thread_obj = ThreadObject.model_validate(db_thread.__dict__)

            if 'assistant_id' in thread.meta_data:
#                assistant = self.assistant_maanager.db_get_assistant_by_id(thread.meta_data['assistant_id'], db)
                assistant = self.assistant_maanager.db_get_assistant_by_id(thread.meta_data['assistant_id'])
                logger.info(
                    f'Append this related thread to assistant {assistant.id}')
                assistant.append_related_threads([thread_obj.id])
                assistant.sync_db(db)
        return thread_obj

    def db_get_thread_by_id(self, thread_id: ObjectID):
        with self.sql_util.get_db() as db:
            db_thread = db.query(Thread).filter(Thread.id == thread_id).first()
            return ThreadObject.model_validate(db_thread.__dict__)

    def db_list_threads(self, limit: Optional[int], order: Order) -> List[ThreadObject]:
        with self.sql_util.get_db() as db:
            query = db.query(Thread).order_by(order.to_sqlalchemy_order()(
                Thread.created_at)).filter(~Thread.meta_data.contains('assistant_id'))

            if limit is not None:
                db_threads = query.limit(limit)
            else:
                db_threads = query.all()

            return [ThreadObject.model_validate(tool.__dict__) for tool in db_threads]

    def db_list_threads_preview(self, limit: Optional[int], order: Order) -> List[ThreadPreview]:
        threads = self.db_list_threads(limit, order)
        previews = []
        for thread in threads:
            messages = self.message_manager.db_list_messages_of_thread(
                thread.id, limit=2, order=Order.ASC)
            if len(messages) == 2:
                message = messages[0]
                assistant = self.assistant_maanager.db_get_assistant_by_id(
                    messages[1].assistant_id)
            else:
                message = None
                assistant = None
            previews.append(ThreadPreview(
                assistant=assistant, thread=thread, first_message=message))
        return previews

    def db_delete_thread_by_id(self, thread_id: ObjectID):
        with self.sql_util.get_db() as db:
            db_thread = db.query(Thread).filter(Thread.id == thread_id).first()
            db.delete(db_thread)
            # TODO delete related messages and runs and other stuff or just gc
            db.commit()


================================================
FILE: archive/ktransformers/server/exceptions.py
================================================
from fastapi import HTTPException, status


def db_exception():
    return HTTPException(
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        detail="DB Error",
    )


def not_implemented(what):
    return HTTPException(
        status_code=status.HTTP_501_NOT_IMPLEMENTED,
        detail=f"{what} not implemented",
    )


def internal_server_error(what):
    return HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"{what}")


def request_error(what):
    return HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"{what}")


================================================
FILE: archive/ktransformers/server/main.py
================================================
import asyncio
import os
import re
from uuid import uuid4

import torch
import torch.distributed
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
import uvicorn.logging
import uvicorn
import sys
import atexit
project_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
from fastapi.middleware.cors import CORSMiddleware
from ktransformers.server.args import ArgumentParser
from ktransformers.server.config.config import Config
from ktransformers.util import utils
from ktransformers.server.utils.create_interface import create_interface, GlobalInterface, get_thread_context_manager
from fastapi.openapi.utils import get_openapi
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from ktransformers.server.api import router, post_db_creation_operations
from ktransformers.server.utils.sql_utils import Base, SQLUtil
from ktransformers.server.config.log import logger
import subprocess
import tempfile

def mount_app_routes(mount_app: FastAPI):
    sql_util = SQLUtil()
    logger.info("Creating SQL tables")
    Base.metadata.create_all(bind=sql_util.sqlalchemy_engine)
    post_db_creation_operations()
    mount_app.include_router(router)


def create_app():
    cfg = Config()
    if(hasattr(GlobalInterface.interface, "lifespan")):
        app = FastAPI(lifespan=GlobalInterface.interface.lifespan)
    else:
        app = FastAPI()
    if Config().web_cross_domain:
        app.add_middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_credentials=True,
            allow_methods=["*"],
            allow_headers=["*"],
        )
    mount_app_routes(app)
    if cfg.mount_web:
        mount_index_routes(app)
    return app


def update_web_port(config_file: str):
    ip_port_pattern = (
        r"(localhost|((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)):[0-9]{1,5}"
    )
    with open(config_file, "r", encoding="utf-8") as f_cfg:
        web_config = f_cfg.read()
    ip_port = "localhost:" + str(Config().server_port)
    new_web_config = re.sub(ip_port_pattern, ip_port, web_config)
    with open(config_file, "w", encoding="utf-8") as f_cfg:
        f_cfg.write(new_web_config)


def mount_index_routes(app: FastAPI):
    project_dir = os.path.dirname(os.path.dirname(__file__))
    web_dir = os.path.join(project_dir, "website/dist")
    web_config_file = os.path.join(web_dir, "config.js")
    update_web_port(web_config_file)
    if os.path.exists(web_dir):
        app.mount("/web", StaticFiles(directory=web_dir), name="static")
    else:
        err_str = f"No website resources in {web_dir}, please complile the website by npm first"
        logger.error(err_str)
        print(err_str)
        exit(1)


def run_api(app, host, port, **kwargs):
    if kwargs.get("ssl_keyfile") and kwargs.get("ssl_certfile"):
        uvicorn.run(
            app,
            host=host,
            port=port,
            ssl_keyfile=kwargs.get("ssl_keyfile"),
            ssl_certfile=kwargs.get("ssl_certfile"),
        )
    else:
        uvicorn.run(app, host=host, port=port, log_level="debug")


def custom_openapi(app):
    if app.openapi_schema:
        return app.openapi_schema
    openapi_schema = get_openapi(
        title="ktransformers server",
        version="1.0.0",
        summary="This is a server that provides a RESTful API for ktransformers.",
        description="We provided chat completion and openai assistant interfaces.",
        routes=app.routes,
    )
    openapi_schema["info"]["x-logo"] = {"url": "https://kvcache.ai/media/icon_1.png"}
    app.openapi_schema = openapi_schema
    return app.openapi_schema


def verify_arg(args):
    nproc_per_node = int(os.getenv('LOCAL_WORLD_SIZE'))

    if args.batch_size not in [1, 2, 3, 4]:
        raise ValueError(f'argument batch_size should be in [1, 2, 3, 4], got {args.batch_size}')

    if nproc_per_node not in [1, 2]:
        raise ValueError(f'argument nproc_per_node should be in [1, 2], got {nproc_per_node}')

    if args.tp not in [1, 2]:
        raise ValueError(f'argument tp should be in [1, 2], got {args.tp}')

    if nproc_per_node != args.tp:
        raise ValueError(f'argument nproc_per_node should be equal to tp, got nproc_per_node is {nproc_per_node}, tp is {args.tp}')


def main():
    try:
        import torch_npu
        use_npu = torch.npu.is_available()
        torch.npu.config.allow_internal_format = True
    except:
        use_npu = False

    cfg = Config()

    arg_parser = ArgumentParser(cfg)

    args = arg_parser.parse_args()
    if use_npu:
        verify_arg(args)

        rank_id = int(os.environ["RANK"])
        args.device = args.device[:-1] + str(rank_id)
    create_interface(config=cfg, default_args=cfg, input_args=args)

    tp_size = args.tp
    world_size = int(os.getenv("WORLD_SIZE", '1'))
    if tp_size == world_size and tp_size > 1:
        if rank_id == 0:
            app = create_app()
            custom_openapi(app)
            run_api(
                app=app,
                host=args.host,
                port=args.port,
                ssl_keyfile=args.ssl_keyfile,
                ssl_certfile=args.ssl_certfile,
            )
        elif cfg.backend_type == 'ktransformers':
            while True:
                try:
                    context = get_thread_context_manager()
                    id = str(uuid4())
                    context.interface.sync_inference("", id, 1.0, 1.0)
                except Exception as e:
                    print(f"An error occurred: {e}")
                finally:
                    pass
    else:
        app = create_app()
        custom_openapi(app)

        run_api(
            app=app,
            host=args.host,
            port=args.port,
            ssl_keyfile=args.ssl_keyfile,
            ssl_certfile=args.ssl_certfile,
        )

if __name__ == "__main__":
    main()


================================================
FILE: archive/ktransformers/server/models/__init__.py
================================================


================================================
FILE: archive/ktransformers/server/models/assistants/__init__.py
================================================


================================================
FILE: archive/ktransformers/server/models/assistants/assistants.py
================================================
from sqlalchemy import JSON, Column, Float, Integer, String, Text
from sqlalchemy.orm import relationship

from ktransformers.server.utils.sql_utils import Base


class Assistant(Base):
    __tablename__ = "assistants"

    id = Column(String, primary_key=True, index=True)
    object = Column(String, default="assistant")
    created_at = Column(Integer)

    name = Column(String, nullable=True)
    description = Column(String, nullable=True)
    model = Column(String)
    instructions = Column(Text, nullable=True)
    tools = Column(JSON)
    tool_resources = Column(JSON)
    temperature = Column(Float, nullable=True)
    meta_data = Column(JSON, nullable=True)
    top_p = Column(Float, nullable=True)
    response_format = Column(JSON, default="auto")

    build_status = Column(JSON, nullable=True)

    runs = relationship("Run", back_populates="assistant")

    messages = relationship("Message", back_populates="assistant")


================================================
FILE: archive/ktransformers/server/models/assistants/messages.py
================================================
from sqlalchemy import JSON, Column, ForeignKey, Integer, String
from sqlalchemy.orm import relationship

from ktransformers.server.utils.sql_utils import Base


class Message(Base):
    __tablename__ = "messages"

    id = Column(String, primary_key=True, index=True)
    object = Column(String, default="thread.message")
    created_at = Column(Integer)

    thread_id = Column(String, ForeignKey("threads.id"))
    status = Column(String, default="in_progress")
    incomplete_details = Column(JSON, nullable=True)
    completed_at = Column(Integer, nullable=True)
    incomplete_at = Column(Integer, nullable=True)
    role = Column(JSON)
    content = Column(JSON)
    assistant_id = Column(String, ForeignKey("assistants.id"), nullable=True)
    run_id = Column(String, ForeignKey("runs.id"), nullable=True)
    attachments = Column(JSON, nullable=True)
    meta_data = Column(JSON, nullable=True)

    thread = relationship("Thread", back_populates="messages")
    assistant = relationship("Assistant", back_populates="messages")
    run = relationship("Run", back_populates="message")


================================================
FILE: archive/ktransformers/server/models/assistants/run_steps.py
================================================
from sqlalchemy import JSON, Column, ForeignKey, Integer, String
from sqlalchemy.orm import relationship

from ktransformers.server.utils.sql_utils import Base


class RunStep(Base):
    __tablename__ = "run_steps"
    # todo
    id = Column(String, primary_key=True, index=True)
    object = Column(String, default="thread.run.step")
    created_at = Column(Integer)

    assistant_id = Column(String, ForeignKey("assistants.id"))
    thread_id = Column(String, ForeignKey("threads.id"))
    run_id = Column(String, ForeignKey("runs.id"))
    type = Column(String)
    status = Column(String)
    step_details = Column(JSON)
    last_error = Column(JSON, nullable=True)
    expires_at = Column(Integer, nullable=True)
    cancelled_at = Column(Integer, nullable=True)
    failed_at = Column(Integer, nullable=True)
    completed_at = Column(Integer, nullable=True)

    meta_data = Column(JSON, nullable=True)
    usage = Column(JSON, nullable=True)

    assistant = relationship("Assistant", back_populates="run_steps")
    thread = relationship("Thread", back_populates="run_steps")
    run = relationship("Run", back_populates="run_steps")


================================================
FILE: archive/ktransformers/server/models/assistants/runs.py
================================================
from sqlalchemy import JSON, Column, Float, ForeignKey, Integer, String, Text
from sqlalchemy.orm import relationship

from ktransformers.server.utils.sql_utils import Base


class Run(Base):
    __tablename__ = "runs"

    id = Column(String, primary_key=True, index=True)
    object = Column(String, default="thread.run")
    created_at = Column(Integer)
    thread_id = Column(String, ForeignKey("threads.id"))
    assistant_id = Column(String, ForeignKey("assistants.id"))
    status = Column(String)
    required_action = Column(JSON, nullable=True)
    last_error = Column(JSON, nullable=True)
    expires_at = Column(Integer, nullable=True)
    started_at = Column(Integer, nullable=True)
    cancelled_at = Column(Integer, nullable=True)
    failed_at = Column(Integer, nullable=True)
    completed_at = Column(Integer, nullable=True)
    incomplete_details = Column(JSON, nullable=True)
    # get from assistant
    model = Column(String)
    instructions = Column(Text, nullable=True)
    tools = Column(JSON)
    meta_data = Column(JSON, nullable=True)
    usage = Column(JSON, nullable=True)
    temperature = Column(Float, nullable=True)
    top_p = Column(Float, nullable=True)
    max_propmp_tokens = Column(Integer, nullable=True)
    truncation_strategy = Column(JSON)
    tool_choice = Column(JSON)
    response_format = Column(JSON, default="auto")

    thread = relationship("Thread", back_populates="runs")
    assistant = relationship("Assistant", back_populates="runs")
    message = relationship("Message", back_populates="run")


================================================
FILE: archive/ktransformers/server/models/assistants/threads.py
================================================
from sqlalchemy import JSON, Column, Integer, String
from sqlalchemy.orm import relationship

from ktransformers.server.utils.sql_utils import Base


class Thread(Base):
    __tablename__ = "threads"

    id = Column(String, primary_key=True, index=True)
    object = Column(String, default="thread")
    created_at = Column(Integer)

    tool_resources = Column(JSON, nullable=True)
    meta_data = Column(JSON, nullable=True)

    runs = relationship("Run", back_populates="thread")
    messages = relationship("Message", back_populates="thread")


================================================
FILE: archive/ktransformers/server/requirements.txt
================================================
torch >= 2.3.0
transformers >= 4.51.3
fastapi >= 0.111.0
langchain >= 0.2.0
blessed >= 1.20.0
accelerate >= 0.31.0
sentencepiece >= 0.1.97
openai
setuptools
build
ninja
wheel
colorlog
fire
zmq
psutil

================================================
FILE: archive/ktransformers/server/schemas/__init__.py
================================================


================================================
FILE: archive/ktransformers/server/schemas/assistants/__init__.py
================================================


================================================
FILE: archive/ktransformers/server/schemas/assistants/assistants.py
================================================
from enum import Enum
from time import time
from typing import AsyncIterable, Callable, Dict, List, Optional, Union
from asyncio import Lock, Queue

from fastapi import logger
from pydantic import BaseModel, Field, PrivateAttr, field_validator, model_validator
import torch

from ktransformers.server.config.config import Config
from ktransformers.server.models.assistants.assistants import Assistant
from ktransformers.server.models.assistants.threads import Thread
from ktransformers.server.schemas.assistants.messages import Role
from ktransformers.server.schemas.assistants.runs import RunObject,RunStreamResponse,ObjectWithCreatedTime
from ktransformers.server.schemas.assistants.threads import ThreadObject
from ktransformers.server.schemas.base import Metadata,MetadataField,ObjectID
from ktransformers.server.schemas.assistants.tool import Tool,CodeInterpreter,FileSearch,RelatedThreads,FuntionTool,ToolResource,CodeInterpreterResource,FileSearchResource,RelatedThreadsResource,ToolType
from ktransformers.server.utils.sql_utils import SQLUtil


class AssistantBase(BaseModel):
    name: Optional[str] = Field(None,description='The name of the assistant.') 
    description: Optional[str] = Field(None,description='The description of the assistant.')
    instructions: Optional[str] = Field(None,description='Instructions which is added in front of the input of LLM') 
    tools: List[Tool] = Field([], max_length=128)

    @field_validator('tools', mode='before')
    def validate_tools(cls, value):
        re = []
        if not isinstance(value, list):
            raise ValueError('Invalid type for tools')

        for tool in value:
            if 'type' not in tool:
                raise ValueError('Invalid type for tools')
            if tool['type'] == 'code_interpreter':
                re.append(CodeInterpreter(**tool))
            elif tool['type'] == 'file_search':
                re.append(FileSearch(**tool))
            elif tool['type'] == 'related_threads':
                re.append(RelatedThreads(**tool))
            elif tool['type'] == 'function':
                re.append(FuntionTool(**tool))
            else:
                raise ValueError('Invalid type for tools')
        return re

    tool_resources: List[ToolResource] = Field([], max_length=128)

    @field_validator('tool_resources', mode='before')
    def validate_tool_resources(cls, value):
        re = []
        if not isinstance(value, list):
            raise ValueError('Invalid type for tool resources')

        for tool_re in value:
            if 'file_ids' in tool_re:
                re.append(CodeInterpreterResource(**tool_re))
            elif 'vector_stores' in tool_re:
                re.append(FileSearchResource(**tool_re))
            elif 'thread_ids' in tool_re:
                re.append(RelatedThreadsResource(**tool_re))
            else:
                raise ValueError('Invalid type for tool resources')
        return re

    meta_data: Metadata = MetadataField

    @model_validator(mode='before')
    def convert_meta_data(cls, values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values
    temperature: Optional[float] = Field(ge=0.0, le=2.0, default=1)
    top_p: Optional[float] = Field(ge=0.0, le=1.0, default=1)
    response_format: Union[str, Dict[str, str]] = "auto"


class AssistantCreate(AssistantBase):
    model: str


class AssistantBuildStatus(BaseModel):
    class Status(Enum):
        not_build = "not_build"
        in_queue = "in_queue"
        parsing = "parsing"
        prefilling = "prefilling"
        dumping = "dumping"
        completed = "completed"
        paused = "paused"

    _lock: Lock = PrivateAttr(default_factory=Lock)
    _queue: Optional[Queue] = PrivateAttr(None)

    status: Status = Field(default=Status.not_build)
    total_file_count: int = Field(default=0)
    parsed_file_count: int = Field(default=0)

    prefilling_current: int = Field(default=0)
    prefilling_total: int = Field(default=0)

    build_started_time: Optional[int] = Field(default=None)
    build_completed_time: Optional[int] = Field(default=None)

    # in megabytes
    assistant_usage: int = Field(default=0, description='')
    assistant_total_usage: int = Field(default=0)
    disk_free_space: int = Field(default=0)
    disk_total_space: int = Field(default=0)

    def to_stream_reply(self) -> str:
        return f"event: assistant.build.status\ndata: {self.model_dump_json()}\n\n"


class AssistantObject(AssistantBase, ObjectWithCreatedTime):
    model: Optional[str] = Field(
        default=Config().model_name)
    related_threads_objects: Optional[List] = Field(None, exclude=True)
    _encoded_instruction: Optional[torch.Tensor] = PrivateAttr(default=None)
    build_status: AssistantBuildStatus = Field(default=AssistantBuildStatus())

    def as_api_response(self):
        return self.model_dump(exclude={'build_status'})

    def get_related_threads_ids(self) -> List[ObjectID]:
        re = []
        for tool, tool_re in zip(self.tools, self.tool_resources):
            if tool.type == ToolType.RELATED_THREADS:
                re += tool_re.thread_ids or []
        return re

    def get_related_threads_objects(self) -> List:
        # raise NotImplementedError  # should be replaced
        sql_utils = SQLUtil()
        if self.related_threads_objects is None:
            with sql_utils.get_db() as db:
                db_threads = db.query(Thread).all()
            self.related_threads_objects = [tool for tool in [ThreadObject.model_validate(
                tool.__dict__) for tool in db_threads] if tool.is_related_threads and tool.meta_data['assistant_id'] == self.id]
            # logger.debug(
            #     f'Found {len(self.related_threads_objects)} related threads')
        return self.related_threads_objects

    def append_related_threads(self, thread_ids: List[ObjectID]):
        # logger.debug(f'{self.tools} {self.tool_resources}')
        for tool, tool_re in zip(self.tools, self.tool_resources):
            if tool.type == ToolType.RELATED_THREADS:
                tool_re.thread_ids += thread_ids
                return

        self.tools.append(RelatedThreads(type=ToolType.RELATED_THREADS))
        self.tool_resources.append(
            RelatedThreadsResource(thread_ids=thread_ids))

    async def update_build_status(self, events: AsyncIterable) -> AsyncIterable:
        async for event in events:
            # logger.debug(event)
            if isinstance(event, RunStreamResponse):
                if event.event == RunObject.Status.completed:
                    self.build_status.status = AssistantBuildStatus.Status.completed
                    self.build_status.build_completed_time = int(time())
                    self.sync_db()
                    yield self.build_status.model_copy()
            elif isinstance(event, dict):
                # logger.debug('dict')
                if 'stage' in event:
                    if event['stage'] == 'prefill':
                        self.build_status.status = AssistantBuildStatus.Status.prefilling
                        self.build_status.prefilling_current = event['curr_progress']
                        self.build_status.prefilling_total = event['max_progress']
                    if event['stage'] == 'parse':
                        self.build_status.status = AssistantBuildStatus.Status.parsing
                        self.build_status.parsed_file_count = event['curr_progress']
                        self.build_status.total_file_count = event['max_progress']
                    yield self.build_status.model_copy()

    def get_build_status(self) -> AssistantBuildStatus:
        return self.build_status
     
    
    def sync_db(self)->None:
        # raise NotImplementedError # should be replaced
        sql_utils = SQLUtil()
        db_assistant = Assistant(
            **self.model_dump(mode='json'),
        )
        with sql_utils.get_db() as db:
            sql_utils.db_merge_commit(db, db_assistant)
    
    def get_encoded_instruction(self,encode_fn:Callable)->torch.Tensor:
        if self._encoded_instruction is None:
            logger.info(f'encoding assistant instruction: {self.instructions}')
            self._encoded_instruction = encode_fn(self.instructions, Role.user)
        return self._encoded_instruction


class AssistantModify(AssistantBase):
    model: Optional[str] = None


# Non API Backend


================================================
FILE: archive/ktransformers/server/schemas/assistants/messages.py
================================================
from enum import Enum
from typing import ForwardRef, List, Optional, Union,Callable

import torch
from pydantic import BaseModel, PrivateAttr, model_validator

from ktransformers.server.exceptions import not_implemented
from ktransformers.server.config.log import logger
from ktransformers.server.models.assistants.messages import Message
from ktransformers.server.schemas.base import Metadata, MetadataField, ObjectWithCreatedTime
from ktransformers.server.schemas.assistants.tool import Field,CodeInterpreter,FileSearch
from ktransformers.server.utils.sql_utils import SQLUtil


class IncompleteDetails(BaseModel):
    reason: str


class ContentType(Enum):
    image_file = "image_file"
    image_url = "image_url"
    text = "text"


class ContentObject(BaseModel):
    type: ContentType


class ImageFile(BaseModel):
    file_id: str
    detail: str


class ImageFileObject(ContentObject):
    image_file: ImageFile


class ImageUrl(BaseModel):
    url: str
    detail: str


class ImageUrlObject(ContentObject):
    image_url: ImageUrl


class Annotation(BaseModel):
    todo: str


class Text(BaseModel):
    value: str
    annotations: List[Annotation] = Field(default=[])


class TextObject(ContentObject):
    text: Text
    delta_index: int = Field(default=0,exclude=True)
    special_tokens_on: bool = Field(default=False,exclude=True) 
    last_two: str= Field(default='',exclude=True)  

    def filter_append(self,text:str):     
        self.text.value+=text
        self.delta_index+=1
        return True  


Content = Union[ImageFileObject, ImageUrlObject, TextObject]


class Attachment(BaseModel):
    file_id: Optional[str] = Field(default=None)
    tools: Optional[List[Union[CodeInterpreter, FileSearch]]] = Field(default=None)


class Role(Enum):
    user = "user"
    assistant = "assistant"

    def is_user(self)->bool:
        return self == Role.user


class MessageCore(BaseModel):
    role: Role
    content: List[Content]
    attachments: Optional[List[Attachment]]
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values


class MessageBase(MessageCore):
    class Status(Enum):
        created = "created" # only used for stream
        in_progress = "in_progress"
        incomplete = "incomplete"
        completed = "completed"
    thread_id: str
    status: Status
    incomplete_details: Optional[IncompleteDetails] = None
    completed_at: Optional[int] = None
    incomplete_at: Optional[int] = None

    assistant_id: Optional[str] = None
    run_id: Optional[str]


MessageStreamResponse = ForwardRef('MessageStreamResponse')

class MessageObject(MessageBase, ObjectWithCreatedTime):
    _encoded_content: Optional[torch.Tensor] = PrivateAttr(default=None)
    

    def get_text_content(self) -> str:
        text_content = ""
        for content in self.content:
            if content.type == ContentType.text:
                text_content += content.text.value
            else:
                raise not_implemented("Content other than text")
        return text_content

    async def get_encoded_content(self,encode_fn:Callable):
        if self._encoded_content is None:
            logger.info(f'encoding {self.role.value} message({self.status.value}): {self.get_text_content()}')
            self._encoded_content = encode_fn(self.get_text_content(),self.role)

            for f in self.get_attached_files():
                logger.info(f'encoding file: {f.filename}')
                self._encoded_content = torch.cat([self._encoded_content, encode_fn(await f.get_str(),self.role)],dim=-1)
                yield None 

        yield self._encoded_content


    def get_attached_files(self):
        raise NotImplementedError # should be replaced 


    def append_message_delta(self,text:str):
        raise NotImplementedError # should be replaced 
    
    def sync_db(self):
        # raise NotImplementedError # should be replaced
        sql_utils = SQLUtil()
        db_message = Message(
            **self.model_dump(mode="json"),
        )
        with sql_utils.get_db() as db:
            sql_utils.db_merge_commit(db, db_message)
    

    def stream_response_with_event(self, event: MessageBase.Status) -> MessageStreamResponse:
        match event:
            case MessageObject.Status.created:
                self.status = MessageObject.Status.in_progress
            case _:
                self.status = event
        return MessageStreamResponse(message=self, event=event)
   

class MessageStreamResponse(BaseModel):
    message: MessageObject
    event: MessageObject.Status

    def to_stream_reply(self):
        return f"event: thread.message.{self.event.value}\ndata: {self.message.model_dump_json()}\n\n"


class MessageCreate(BaseModel):
    role: Role = Field(default=Role.user)
    content: Union[str | List[Content]]
    attachments: Optional[List[Attachment]] = None
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values

    def to_core(self) -> MessageCore:
        # logger.debug(f"Converting message create to core {self.model_dump()}")
        core = MessageCore(
            role=self.role,
            content=[],
            attachments=self.attachments,
            meta_data=self.meta_data,
        )
        if isinstance(self.content, str):
            core.content = [TextObject(type="text", text=Text(value=self.content, annotations=[]))]
        elif isinstance(self.content, list):
            core.content = self.content
        else:
            raise ValueError("Invalid content type")
        return core


class MessageModify(BaseModel):
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values


================================================
FILE: archive/ktransformers/server/schemas/assistants/runs.py
================================================
from enum import Enum
from typing import Dict, List, Optional, Union, ForwardRef

from pydantic import BaseModel, Field, model_validator

from ktransformers.server.models.assistants.runs import Run
from ktransformers.server.schemas.base import TODO, Metadata, MetadataField, ObjectWithCreatedTime
from ktransformers.server.schemas.assistants.threads import ThreadCreate
from ktransformers.server.schemas.assistants.tool import Tool, ToolResource
from ktransformers.server.utils.sql_utils import SQLUtil


class ToolCall(BaseModel):
    id: str
    type: str
    function: TODO


class SubmitToolOutputs(BaseModel):
    tool_calls: List[ToolCall]


class RequiredAction(BaseModel):
    type: str
    submit_tool_outputs: TODO


class LastError(BaseModel):
    code: str
    message: str


class IncompleteDetails(BaseModel):
    reason: str


class Usage(BaseModel):
    completion_tokens: int
    prompt_tokens: int
    total_tokens: int


class TruncationStrategy(BaseModel):
    type: str = "auto"
    last_message: Optional[int]


class ToolChoiceType(Enum):
    none = "none"
    auto = "auto"
    required = "required"


class RunBase(BaseModel):
    class Status(Enum):
        created = "created" # only stream event will have this created status
        queued = "queued"
        in_progress = "in_progress"
        requires_action = "requires_action"
        cancelling = "cancelling"
        cancelled = "cancelled"
        failed = "failed"
        completed = "completed"
        expired = "expired"


    thread_id: str
    assistant_id: str
    status: Status = Status.queued
    required_action: Optional[RequiredAction] = Field(None)
    last_error: Optional[LastError] = Field(None)
    expires_at: Optional[int]= Field(None)
    started_at: Optional[int] = Field(None)
    cancelled_at: Optional[int] = Field(None)
    failed_at: Optional[int] = Field(None)
    completed_at: Optional[int] = Field(None)
    incomplete_details: Optional[IncompleteDetails] = Field(None)
    model: Optional[str] = Field(None)
    instructions: Optional[str] = Field(None)
    tools: Optional[List[Tool]] = Field([])
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values
    
    def set_compute_save(self,save:int):
        self.meta_data['compute_save'] = str(save)


    usage: Optional[Usage] = Field(None)
    temperature: Optional[float] = Field(None)
    top_p: Optional[float]= Field(None)
    max_propmp_tokens: Optional[int]= Field(None)
    truncation_strategy: Optional[TruncationStrategy]= Field(None)
    tool_choice: Optional[Union[ToolChoiceType, dict]]= Field(None)
    response_format: Union[str, Dict[str, str]] = "auto"


RunStreamResponse = ForwardRef('RunStreamResponse')

class RunObject(RunBase, ObjectWithCreatedTime):
    def stream_response_with_event(self,event:RunBase.Status)->RunStreamResponse:
        match event:
            case RunBase.Status.created:
                self.status = RunBase.Status.queued
            case _:
                self.status = event
        return RunStreamResponse(run=self, event=event)
 
    
    def sync_db(self):
        # raise NotImplementedError # should be replaced in crud
        sql_utils = SQLUtil()
        db_run = Run(
            **self.model_dump(mode='json'),
        )
        with sql_utils.get_db() as db:
            sql_utils.db_merge_commit(db, db_run)
    
    def create_message_creation_step(self):
        raise NotImplementedError # should be replaced 
        

class RunStreamResponse(BaseModel):
    run: RunObject
    event: RunObject.Status
    def to_stream_reply(self):
        return f"event: thread.run.{self.event.value}\ndata: {self.run.model_dump_json()}\n\n"

class RunCreate(BaseModel):
    assistant_id: str
    model: Optional[str] = Field(default=None)
    instructions: Optional[str] = Field(default=None)
    # TODO: Add this
    # additional_instructions: Optional[str]
    # additional_messages: Optional[List[MessageCore]]
    tools: List[Tool] = Field(default=[])
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values
    temperature: Optional[float] = Field(default=None)
    top_p: Optional[float] = Field(default=None)
    stream: Optional[bool] = Field(default=None)
    max_propmp_tokens: Optional[int] = Field(default=None)
    # TODO: Add this
    # max_completion_tokens: Optional[int]
    truncation_strategy: Optional[TruncationStrategy] = Field(default=None)
    tool_choice: Optional[Union[ToolChoiceType, dict]] = Field(default=None)
    response_format: Union[str, Dict[str, str]] = Field(default="auto")


class RunThreadCreate(BaseModel):
    assistant_id: str
    thread: Optional[ThreadCreate]
    model: Optional[str]
    instructions: Optional[str]
    tools: List[Tool]
    tool_resources: List[ToolResource]
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values
    temperature: Optional[float]
    top_p: Optional[float]
    stream: Optional[bool]
    max_propmp_tokens: Optional[int]
    # TODO: Add this
    # max_completion_tokens: Optional[int]
    truncation_strategy: TruncationStrategy
    tool_choice: Union[ToolChoiceType, dict]
    response_format: Union[str, Dict[str, str]] = "auto"


class RunModify(BaseModel):
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values


class ToolOutput(BaseModel):
    tool_call_id: Optional[str]
    output: Optional[str]


class RunSubmit(BaseModel):
    tool_outputs: List[ToolOutput]
    stream: Optional[bool]


================================================
FILE: archive/ktransformers/server/schemas/assistants/streaming.py
================================================
import asyncio
from typing import AsyncIterable, List, Union

from fastapi import Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

from ktransformers.server.schemas.assistants.runs import RunStreamResponse
from ktransformers.server.schemas.endpoints.chat import ChatCompletionChunk
from ktransformers.server.config.log import logger
from ktransformers.server.schemas.base import Object
from ktransformers.server.schemas.assistants.messages import ContentType, ImageFileObject, ImageUrlObject, MessageObject, Text, TextObject


class TextObjectWithIndex(TextObject):
    index: int


class ImageFileObjectWithIndex(ImageFileObject):
    index: int


class ImageUrlObjectWithIndex(ImageUrlObject):
    index: int


ContentWithIndex = Union[TextObjectWithIndex,
                         ImageFileObjectWithIndex, ImageUrlObjectWithIndex]


class MessageDeltaImpl(BaseModel):
    # role: Optional[str]
    content: List[ContentWithIndex]


class MessageDelta(Object):
    delta: MessageDeltaImpl

    def to_stream_reply(self):
        return f"event: thread.message.delta\ndata: {self.model_dump_json()}\n\n"


def text_delta(index: int, text: str):
    return MessageDeltaImpl(content=[TextObjectWithIndex(index=index, type=ContentType.text, text=Text(value=text))])


def append_message_delta(self: MessageObject, text: str):

    if len(self.content) == 0:
        self.content.append(TextObject(type=ContentType.text,
                            text=Text(value=''), delta_index=0))

    text_object: TextObject = self.content[0]
    if text_object.filter_append(text):
        return MessageDelta(id=self.id, object="thread.message.delta", delta=text_delta(text_object.delta_index, text))
    else:
        return None


MessageObject.append_message_delta = append_message_delta


class RunStepDeltaImpl(BaseModel):
    pass


class RunStepDelta(Object):
    delta: RunStepDeltaImpl

    def to_stream_reply(self):
        return f"event: thread.run.step.delta\ndata: {self.model_dump_json()}\n\n"


class Done():
    def to_stream_reply(self):
        return f"data: [DONE]\n\n"


async def check_client_link(request: Request, async_events: AsyncIterable):
    async for event in async_events:
        if await request.is_disconnected():
            break
        yield event


async def add_done(async_events: AsyncIterable):
    async for event in async_events:
        yield event
    yield Done()


async def to_stream_reply(async_events: AsyncIterable):
    async for event in async_events:
        if isinstance(event, str):
            yield event
        else:
            yield event.to_stream_reply()


async def filter_api_event(async_events: AsyncIterable):
    async for event in async_events:
        if isinstance(event, MessageDelta) or isinstance(event, RunStepDelta) or isinstance(event, RunStreamResponse) or isinstance(event, Done):
            yield event


async def filter_chat_chunk(async_events: AsyncIterable):
    async for event in async_events:
        if isinstance(event, ChatCompletionChunk):
            yield event


async def filter_by_types(async_events: AsyncIterable, types: List):
    async for event in async_events:
        for type in types:
            if isinstance(event, type):
                yield event
                continue


def api_stream_response(request: Request, async_events: AsyncIterable):
    return StreamingResponse(check_client_link(request, to_stream_reply(add_done(filter_api_event(async_events)))), media_type="text/event-stream")


def chat_stream_response(request: Request, async_events: AsyncIterable):
    return StreamingResponse(check_client_link(request, to_stream_reply(add_done(filter_chat_chunk(async_events)))), media_type="text/event-stream")


def stream_response(request: Request, async_events: AsyncIterable):
    return StreamingResponse(check_client_link(request, to_stream_reply(add_done(async_events))), media_type="text/event-stream")


def check_link_response(request: Request, async_events: AsyncIterable):
    return StreamingResponse(check_client_link(request, async_events), media_type="text/event-stream")


def wrap_async_generator_into_queue(async_events: AsyncIterable) -> asyncio.Queue:
    queue = asyncio.Queue()

    async def inner():
        # logger.debug('run inner')
        async for event in async_events:
            # logger.debug(f'put: {event}')
            await queue.put(event)
            await asyncio.sleep(0)
        # logger.debug(f'put: None')
        await queue.put(None)
    asyncio.create_task(inner())
    return queue


async def unwrap_async_queue(queue: asyncio.Queue) -> AsyncIterable:
    while True:
        events = [await queue.get()]
        events.extend([queue.get_nowait() for _ in range(queue.qsize())])

        logger.debug(f'getting {len(events)} events')
        for event in events:
            if event is None:
                break
            yield event


async def unwrap_async_queue_slow(queue: asyncio.Queue) -> AsyncIterable:
    while True:
        event = await queue.get()
        # logger.debug(f'unwrap_async_queue {event}')
        if event is None:
            break
        yield event


================================================
FILE: archive/ktransformers/server/schemas/assistants/threads.py
================================================
from enum import Enum
from typing import List
from typing_extensions import Self 

from pydantic import BaseModel, Field, model_validator

from ktransformers.server.schemas.base import Metadata, MetadataField, ObjectWithCreatedTime
from ktransformers.server.schemas.assistants.tool import ToolResource
from ktransformers.server.schemas.assistants.messages import MessageCore


class ThreadBase(BaseModel):
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values

    tool_resources: List[ToolResource] = Field([], max_length=128)


class ThreadObject(ThreadBase, ObjectWithCreatedTime):
    is_related_threads:bool = Field(False,exclude=True)

    @model_validator(mode='after')
    def check_is_related_threads(self)->Self:
        # logger.debug(f'check thread {self.id} is related thread? by {self}')
        if 'assistant_id' in self.meta_data:
            self.is_related_threads = True
        return self

    class StreamEvent(Enum):
        created = 'created'

    def to_stream_reply(self,event:StreamEvent):
        return f"event: thread.{event.value}\ndata: {self.model_dump_json()}\n\n"
    

class ThreadCreate(ThreadBase):
    messages: List[MessageCore] = Field(default=[])


class ThreadModify(ThreadBase):
    pass


# other than OpenAI API


================================================
FILE: archive/ktransformers/server/schemas/assistants/tool.py
================================================
from enum import Enum
from typing import List, Optional, Union

from pydantic import BaseModel, Field

from ktransformers.server.schemas.base import ObjectID


class ToolType(str, Enum):
    CODE_INTERPRETER = "code_interpreter"
    FILE_SEARCH = "file_search"
    RELATED_THREADS = "related_threads"
    FUNCTION = "function"


class ToolBase(BaseModel):
    type: ToolType


class CodeInterpreter(ToolBase):
    pass


class FileSearch(ToolBase):
    pass


class RelatedThreads(ToolBase):
    pass


class FuntionTool(ToolBase):
    description: str
    name: str
    parameters: List[str]


Tool = Union[CodeInterpreter, FileSearch, RelatedThreads, FuntionTool]


class CodeInterpreterResource(BaseModel):
    file_ids: Optional[List[str]] = Field(default_factory=list, max_length=20)


class FileSearchResource(BaseModel):
    vector_store_ids: Optional[List[str]] = Field(default_factory=list, max_length=1)
    vector_stores: Optional[List[str]] = Field(default_factory=list, max_length=1)


class RelatedThreadsResource(BaseModel):
    thread_ids: List[ObjectID] = Field(default=[])


ToolResource = Union[CodeInterpreterResource,FileSearchResource,RelatedThreadsResource] 


================================================
FILE: archive/ktransformers/server/schemas/base.py
================================================
from enum import Enum
from typing import Dict

import sqlalchemy
from pydantic import BaseModel, ConfigDict, Field

TODO = BaseModel

ObjectID = str


class Object(BaseModel):
    id: ObjectID
    object: str

    model_config = ConfigDict(from_attributes=True)


# Pydantic Base Models
class ObjectWithCreatedTime(Object):
    created_at: int


class Order(str, Enum):
    ASC = "asc"
    DESC = "desc"

    def to_sqlalchemy_order(self):
        match self:
            case Order.ASC:
                return sqlalchemy.asc
            case Order.DESC:
                return sqlalchemy.desc


Metadata = Dict[str, str]
MetadataField: Metadata = Field({},max_length=16, alias="metadata")


class DeleteResponse(Object):
    deleted: bool = True

class OperationResponse(BaseModel):
    operation: str
    status: str


================================================
FILE: archive/ktransformers/server/schemas/conversation.py
================================================
from typing import Optional

from pydantic import BaseModel

from .assistants.assistants import AssistantObject
from .assistants.threads import ThreadObject
from .assistants.messages import MessageObject

class ThreadPreview(BaseModel):
    assistant: Optional[AssistantObject] = None
    thread: ThreadObject
    first_message: Optional[MessageObject] = None


================================================
FILE: archive/ktransformers/server/schemas/endpoints/chat.py
================================================
from typing import List, Optional, Union, Dict, Any
from typing_extensions import Literal
from enum import Enum
from pydantic import BaseModel, Field
from ktransformers.server.config.config import Config
from ktransformers.server.schemas.base import Object


from openai.types.chat.chat_completion_chunk import Choice

from uuid import uuid4

class CompletionUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    prompt_tokens_details: Optional[Dict[str, Any]] = None
    completion_tokens_details: Optional[Dict[str, Any]] = None
    prefill_time: Optional[float] = None
    decode_time: Optional[float] = None

class Role(Enum):
    system = 'system'
    user = 'user'
    assistant = 'assistant'
    tool = 'tool'
    function = 'function'

class Message(BaseModel):
    content: Optional[str] = None
    role: Role
    name: Optional[str] = None
    tool_calls: Optional[List[Dict[str, Any]]] = {}
    tool_call_id: Optional[str] = None
    
    def to_tokenizer_message(self):
        message = {'role': self.role.value}
        if self.content is not None:
            message['content'] = self.content
        if self.name is not None:
            message['name'] = self.name
        if self.tool_calls is not {}:
            message['tool_calls'] = self.tool_calls
        if self.tool_call_id is not None:
            message['tool_call_id'] = self.tool_call_id
        return message

class FunctionParameters(BaseModel):
    type: str = "object"
    properties: Dict[str, Any] = {}
    required: Optional[List[str]] = None

class FunctionDefinition(BaseModel):
    name: str
    description: Optional[str] = None
    parameters: FunctionParameters = Field(default_factory=FunctionParameters)

class ToolFunction(BaseModel):
    function: FunctionDefinition
    
class Tool(BaseModel):
    type: Literal["function"]
    function: FunctionDefinition

class ChatCompletionCreate(BaseModel):
    messages: List[Message]
    model: str
    stream: bool = False
    temperature: Optional[float] = Field(default=Config().temperature)
    top_p: Optional[float] = Field(default=Config().top_p)
    tools: Optional[List[Tool]] = None
    tool_choice: Optional[Union[str, Dict[str, Any]]] = None
    stream_options: Optional[Dict[str, Any]] = None
    frequency_penalty: float = 0
    presence_penalty: float = 0
    max_tokens: Optional[int] = Field(default=None)
    max_completion_tokens: Optional[int] = Field(default=None)
    return_speed: Optional[bool] = Field(default=False)
    def get_tokenizer_messages(self):
        return [m.to_tokenizer_message() for m in self.messages]

class ChatCompletionChunk(BaseModel):
    id: str
    choices: List[Choice]
    created: int
    model: str
    object: Literal["chat.completion.chunk"]
    service_tier: Optional[Literal["scale", "default"]] = None
    system_fingerprint: Optional[str] = None
    usage: Optional[CompletionUsage] = None

    def to_stream_reply(self):
        return f"data: {self.model_dump_json()}\n\n"

class RawUsage(BaseModel):
    tokenize_time: float
    prefill_time: float
    decode_time: float
    prefill_count: int
    decode_count: int

================================================
FILE: archive/ktransformers/server/schemas/legacy/__init__.py
================================================


================================================
FILE: archive/ktransformers/server/schemas/legacy/completions.py
================================================
from typing import List, Optional
from enum import Enum
from pydantic import BaseModel, Field
from ktransformers.server.config.config import Config
from ..base import Object

class CompletionCreate(BaseModel):
    model: str
    prompt: str | List[str]
    stream: bool = False
    temperature: Optional[float] = Field(default=Config().temperature)
    top_p: Optional[float] = Field(default=Config().top_p)
    max_tokens: Optional[int] = Field(default=None)
    max_completion_tokens: Optional[int] = Field(default=None)
    
    def get_tokenizer_messages(self):
        if isinstance(self.prompt,List):
            self.get_tokenizer_messages('\n'.join(self.prompt))
        return [{'content':self.prompt,'role':'user'}]


class FinishReason(Enum):
    stop = 'stop'
    length = 'length'

class Choice(BaseModel):
    index: int
    text: str
    logprobs: Optional[str] = None
    finish_reason: FinishReason = None


class CompletionObject(Object):
    created:int
    choices: List[Choice] = []
    model:str = 'not implmented'
    system_fingerprint:str = 'not implmented'
    usage: Optional[str] = None

    def set_token(self,token:str):
        if len(self.choices)==0:
            self.choices.append(Choice(index=0,text=''))
        self.choices[0].text = token    

    def append_token(self,token:str):
        if len(self.choices)==0:
            self.choices.append(Choice(index=0,text=''))
        self.choices[0].text += token

    def to_stream_reply(self):
        return f"data:{self.model_dump_json()}\n\n"


================================================
FILE: archive/ktransformers/server/utils/__init__.py
================================================


================================================
FILE: archive/ktransformers/server/utils/create_interface.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : qiyuxinlin
Date         : 2024-07-25 11:50:16
Version      : 1.0.0
LastEditors  : qiyuxinlin 
LastEditTime : 2024-07-25 12:54:48
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
from ktransformers.server.config.config import Config
from ktransformers.server.backend.args import ConfigArgs
from ktransformers.server.backend.context_manager import ThreadContextManager
from ktransformers.server.backend.interfaces.exllamav2 import ExllamaInterface
from ktransformers.server.backend.interfaces.transformers import TransformersInterface
from ktransformers.server.backend.interfaces.ktransformers import KTransformersInterface

def create_interface(config: Config, default_args: ConfigArgs, input_args=None):
    if config.backend_type=='transformers':
        from ktransformers.server.backend.interfaces.transformers import  TransformersInterface as BackendInterface
    elif config.backend_type == 'exllamav2':
        from ktransformers.server.backend.interfaces.exllamav2 import  ExllamaInterface as BackendInterface
    elif config.backend_type == 'ktransformers':
        from ktransformers.server.backend.interfaces.ktransformers import  KTransformersInterface as BackendInterface
    elif config.backend_type == 'balance_serve':
        from ktransformers.server.backend.interfaces.balance_serve import BalanceServeInterface as BackendInterface
    else:
        raise NotImplementedError(f'{config.backend_type} not implemented')
    if config.backend_type == 'ktransformers':
        GlobalInterface.interface = BackendInterface(default_args, input_args)
    elif config.backend_type == 'balance_serve':
        GlobalInterface.interface = BackendInterface(default_args, input_args)
    else:
        GlobalInterface.interface = BackendInterface(default_args)
    GlobalContextManager.context_manager = ThreadContextManager(GlobalInterface.interface)

class GlobalContextManager:
    context_manager: ThreadContextManager
class GlobalInterface:
    interface:  TransformersInterface | KTransformersInterface | ExllamaInterface 
    
def get_thread_context_manager() -> GlobalContextManager:
    return GlobalContextManager.context_manager
def get_interface() -> GlobalInterface:
    return GlobalInterface.interface

================================================
FILE: archive/ktransformers/server/utils/multi_timer.py
================================================
import time


def format_time(seconds):
    units = [
        ("hours", 3600),
        ("minutes", 60),
        ("seconds", 1),
        ("milliseconds", 1e-3),
        ("microseconds", 1e-6),
    ]

    for unit_name, unit_value in units:
        if seconds >= unit_value:
            time_value = seconds / unit_value
            return f"{time_value:.2f} {unit_name}"
    return "0 seconds"  # Handle case for 0 seconds


class Profiler:
    def __init__(self):
        self.timers = {}
        self.counters = {}

    def create_timer(self, name):
        self.timers[name] = {
            "start_time": None,
            "elapsed_time": 0,
            "running": False,
        }

    def start_timer(self, name):
        if name not in self.timers:
            raise ValueError(f"Timer '{name}' does not exist.")
        if self.timers[name]["running"]:
            raise ValueError(f"Timer '{name}' is already running.")
        self.timers[name]["start_time"] = time.time()
        self.timers[name]["running"] = True

    def pause_timer(self, name):
        if name not in self.timers:
            raise ValueError(f"Timer '{name}' does not exist.")
        if not self.timers[name]["running"]:
            raise ValueError(f"Timer '{name}' is not running.")
        self.timers[name]["elapsed_time"] += time.time() - self.timers[name]["start_time"]
        self.timers[name]["running"] = False

    def get_timer_sec(self, name):
        if name not in self.timers:
            raise ValueError(f"Timer '{name}' does not exist.")
        if self.timers[name]["running"]:
            current_time = self.timers[name]["elapsed_time"] + (time.time() - self.timers[name]["start_time"])
        else:
            current_time = self.timers[name]["elapsed_time"]
        return current_time

    def get_all_timers(self):
        all_timers = {}
        for name in self.timers:
            all_timers[name] = self.get_timer_sec(name)
        return all_timers

    def report_timer_string(self, name):
        return f"{name} elapsed time: {format_time(self.get_timer_sec(name))}"

    def create_and_start_timer(self, name):
        self.create_timer(name)
        self.start_timer(name)


    # Counter
    def inc(self,key:str,delta:int=1):
        self.counters[key] = self.counters.get(key,0) + delta

    def set_counter(self,key:str,to=0):
        self.counters[key] = to

    def get_counter(self,key:str):
        return self.counters.get(key,0)


================================================
FILE: archive/ktransformers/server/utils/serve_profiling.py
================================================
import re
import itertools
import time
import enum
import math
from enum import StrEnum

class ProfStatKey(StrEnum):
    ExpertsSummitCurrLayer = "ExpertsSummitCurrLayer"
    ExpertsSummitNextLayer = "ExpertsSummitNextLayer"
    ExpertsCPUForwardOne = "ExpertsCPUForwardOne"
    ExpertsCPUForwardTwo = "ExpertsCPUForwardTwo"
    CPUMoEKExpertsCallback = "CPUMoEKExpertsCallback"

class ProfTimeStat:
    def __init__(self):
        # open_status = os.environ["KT_PERF_STAT"] if "KT_PERF_STAT" in os.environ else "0"
        # if open_status == "0":
        #     self.on = False
        # else:
        #     self.on = True
        self.on = False
        self.prefill_stats = dict()
        self.decode_stats = dict()
        for key in ProfStatKey:
            self.prefill_stats[key] = ProfStatItem()
            self.decode_stats[key] = ProfStatItem()
        self.reset_all()

    def record_start_time(self):
        start_time = time.time_ns()
        return start_time

    def add_time_stat(self, key: ProfStatKey, time_ns, is_prefill):
        if not key:
            return
        # torch.cuda.synchronize()
        cost = time.time_ns() - time_ns
        if is_prefill:
            item = self.prefill_stats[key]
        else:
            item = self.decode_stats[key]
        item.add_item(cost)

    def print_all(self):
        # rank = f"[rank:{torch.distributed.get_rank()}]"
        rank = f"[rank:0]"
        msg = f"\n{rank} Prefill Time Stat\n"
        msg += rank + " {:27}{:>15}{:>15}{:>15}{:>15}{:>15}{:>15}{:>15}\n".format("", "min(ms)", "max(ms)", "avg(ms)", "count", "total(ms)", ">2ms", ">10ms")
        for key, value in self.prefill_stats.items():
            msg += rank + f" {key.value:<25}:{value.get_stat()}\n"
        msg += f"\n{rank} Decode Time Stat\n"
        msg += rank + " {:27}{:>15}{:>15}{:>15}{:>15}{:>15}{:>15}{:>15}\n".format("", "min(ms)", "max(ms)", "avg(ms)", "count", "total(ms)", ">2ms", ">10ms")
        for key, value in self.decode_stats.items():
            msg += rank + f" {key.value:<25}:{value.get_stat()}\n"
        print(msg)

    def reset_all(self):
        for _, value in self.prefill_stats.items():
            value.reset()
        for _, value in self.decode_stats.items():
            value.reset()


class ProfStatItem:
    def __init__(self):
        self.min_time = 100000000
        self.max_time = 0
        self.total_time_ns = 0
        self.count = 0
        self.err_time = []
        self.ms_count2 = 0
        self.ms_count10 = 0

    def add_item(self, cost_time_ns):
        self.count += 1
        self.total_time_ns += cost_time_ns
        self.min_time = min(self.min_time, cost_time_ns)
        self.max_time = max(self.max_time, cost_time_ns)
        if (cost_time_ns > 2000000):
        #   self.err_time.append(round(cost_time_ns / 1000 / 1000, 2))
          self.ms_count2 += 1
        if (cost_time_ns > 10000000):
        #   self.err_time.append(round(cost_time_ns / 1000 / 1000, 2))
          self.ms_count10 += 1
        # self.err_time.append(round(cost_time_ns / 1000 / 1000, 2))

    def reset(self):
        self.min_time = 100000000
        self.max_time = 0
        self.total_time_ns = 0
        self.count = 0

    def get_stat(self):
        min_time = self.min_time / 1000 / 1000
        max_time = self.max_time / 1000 / 1000
        if self.count != 0:
            avg_time = self.total_time_ns / self.count / 1000 / 1000
        else:
            avg_time = 0
        total = self.total_time_ns / 1000 / 1000
        # tmpstr = str(self.err_time)
        # print(f"\r\n err_time: {tmpstr} \r\n ")
        return f"{min_time:15.2f}{max_time:15.2f}{avg_time:15.2f}{self.count:15}{total:15.2f}{self.ms_count2:>15}{self.ms_count10:>15}"


PROF_TIME_STAT = ProfTimeStat()


================================================
FILE: archive/ktransformers/server/utils/sql_utils.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenxl
Date         : 2024-06-12 09:12:58
Version      : 1.0.0
LastEditors  : chenxl 
LastEditTime : 2024-07-27 01:56:04
'''

from urllib.parse import urlparse
import os
from contextlib import contextmanager
from sqlalchemy import create_engine
from sqlalchemy.orm import Session, sessionmaker, declarative_base

from ktransformers.server.config.config import Config
from ktransformers.server.config.singleton import Singleton
from ktransformers.server.config.log import logger
from ktransformers.server.exceptions import db_exception


Base = declarative_base()


class SQLUtil(metaclass=Singleton):
    """
    database connections init and management
    """
    sqlalchemy_engine = None
    session_local = None

    def __init__(self) -> None:
        self.cfg: Config = Config()
        if not self.sqlalchemy_engine:
            SQLUtil.init_engine(self.cfg)

    @contextmanager
    def get_db(self):
        """
        After you finish using the session, it's crucial to close it.
        """
        if not SQLUtil.sqlalchemy_engine:
            SQLUtil.init_engine(self.cfg)
        session = self.session_local()  # type: ignore pylint: disable=not-callable
        try:
            yield session
        finally:
            session.close()

    @staticmethod
    def init_engine(cfg: Config):
        """
        initial engine and session maker Factory
        """
        pool_size = cfg.db_pool_size
        if SQLUtil.sqlalchemy_engine is None:
            if cfg.db_type == "sqllite":
                db_url = SQLUtil.create_sqllite_url(cfg)
            else:
                logger.error("Unsupported database type %s", cfg.db_type)
                exit(-1)
            SQLUtil.sqlalchemy_engine = create_engine(
                db_url, connect_args={"check_same_thread": False}, pool_size=pool_size)
            SQLUtil.session_local = sessionmaker(
                autocommit=False, autoflush=False, bind=SQLUtil.sqlalchemy_engine)

    @staticmethod
    def create_sqllite_url(cfg):
        """
        create and validate SQLLite url
        """
        path: str = cfg.db_host
        database: str = cfg.db_database
        absolute_path: str = os.path.join(path, database)
        url = 'sqlite:///' + absolute_path
        try:
            result = urlparse(url)
            if all([result.scheme, result.path, result.scheme == 'sqlite']):
                return url
            else:
                logger.error("invalid sqllite url: %s", url)
                exit(-1)
        except ValueError:
            logger.error("invalid sqllite url: %s", url)
            exit(-1)

    def db_add_commit_refresh(self, session: Session, what):
        """
        add data to database
        """
        try:
            session.add(what)
            session.commit()
            session.refresh(what)
        except Exception as e:
            logger.exception("db commit error with data %s", str(what.__dict__))
            ex = db_exception()
            ex.detail = str(e)
            session.rollback()
            raise ex from e

    def db_merge_commit(self, session: Session, what):
        try:
            session.merge(what)
            session.commit()
        except Exception as e:
            ex = db_exception()
            ex.detail = str(e)
            logger.exception("db merge commit error with data %s", str(what.__dict__))
            session.rollback()
            raise ex from e

    def db_update_commit_refresh(self, session: Session, existing, what):
        what = what.model_dump(mode="json")
        try:
            for key in what.keys():
                if what[key] is not None:  # 检查b中的字段是否为None
                    setattr(existing, key, what[key])  # 更新a的字段
            session.commit()
            session.refresh(existing)
        except Exception as e:
            ex = db_exception()
            ex.detail = str(e)
            logger.exception("db update commit refresh error with data %s", str(what.__dict__))
            session.rollback()
            raise ex from e


================================================
FILE: archive/ktransformers/tests/.gitignore
================================================
results/

================================================
FILE: archive/ktransformers/tests/AIME_2024/eval_api.py
================================================
# adapt from https://github.com/abacaj/code-eval?tab=readme-ov-file
import argparse
import json
import os
import time
import requests
import tqdm

from evaluation import filter_answer
from prompts import instruct_prompt
import pandas as pd
from datasets import load_dataset
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'


def generate_text(api_url,question , model_name, stream=False, auth_token=None):
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
        # 添加 API Key
        'Authorization' : 'Bearer ' + auth_token if auth_token else ''
    }
    question = instruct_prompt(question)
    data = {
        "messages": [{"content": question, "role": "user"}],
        "model": model_name,
        "stream": stream,
        "temperature": 0.6,
        "max_tokens": 10240,
    }
    print(f"content: {question}")
    response = requests.post(api_url, headers=headers, json=data,verify=False)
    if response.status_code == 200:
        result = response.json()
        results = result.get('choices', [{}])[0].get('message', {}).get('content', '')
        return filter_answer(results)
    else:
        print(f"API Request failed with status code {response.status_code}")
        return None
def load_data(file_path):
        """
        Load data from a Parquet file into a list.
        Each record in the Parquet file should represent an individual record.
        """
        # 读取 Parquet 文件
        # dataset = load_dataset('parquet', data_files=file_path)
        data = []
        ds = load_dataset(file_path)
        df = pd.DataFrame(ds['train'])
        for _, row in df.iterrows():
            data.append(row.to_dict())
        return data

def get_score(pred, answer):
        """
        Calculate scores between the prediction and the answer.
        Uses ROUGE scores as the evaluation metric.
        :param pred: The predicted string.
        :param answer: The reference answer string.
        :return: A dictionary containing ROUGE scores.
        """
        if pred == answer:
            return 1
        # if we need to compare str with number, convert teh str to number
        try:
            pred = float(pred)
            answer = float(answer)
        except:
            pass
        if pred == answer:
            return 1
        return 0

def run_eval_api(
    api_url: str,
    model_name: str,
    out_path: str,
    format_tabs: bool = False,
    auth_token: str = None,
    problem_file: str = None,
    append: bool = False,
    skip: int = 0
):
  
    data = load_data(problem_file)
    pbar = tqdm.tqdm(total=len(data) * 1)
    pbar.update(skip)
    for i in range(len(data)):
        i = i+skip
        data_item = data[i]
        question = data_item['Problem']
        # Start the timer for this evaluation
        start_time = time.time()
        try:
            completion = generate_text(api_url, question, model_name, auth_token=auth_token)
            if completion is None:
                raise Exception(f"Failed to get prediction for {question}")
            answer = data_item['Answer']
            score = get_score(completion, answer)
            elapsed_time = time.time() - start_time
            result = {
                "index": i,
                "question_id": data_item["ID"],
                "answer": answer,
                "prediction": completion,
                "score": score,
                "time": elapsed_time
            }
            with open(out_path, "a" if append else "w") as f:
                f.write(json.dumps(result) + "\n")
            
        except Exception as e:
            print(f"Failed to get prediction for {question}")
            print(e)
            continue

        pbar.update(1)
    

def main(output_path, api_url, model_name, auth_token, format_tabs,problem_file, append,skip):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    run_eval_api(api_url, model_name, output_path, format_tabs, auth_token, problem_file,append,skip)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="API Generate Tester")
    parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")
    parser.add_argument("--model_name", type=str, default="Pro/deepseek-ai/DeepSeek-R1", help="Model Name")
    parser.add_argument("--out_path", type=str, default="results/api/eval_aime.jsonl", help="Output Path")
    parser.add_argument("--auth_token", type=str, default=None, help="Auth Token")
    parser.add_argument("--format_tabs", action="store_true", help="Format Tabs")
    parser.add_argument("--problem_file", type=str, default="Maxwell-Jia/AIME_2024", help="Evalset File")
    parser.add_argument("--no_append", action="store_false", help="Append to existing file")
    parser.add_argument("--skip", type=int, default=0, help="Skip some tasks")
    args = parser.parse_args()
    # api_url = "https://api.siliconflow.cn/v1/chat/completions"
    main(args.out_path, args.api_url, args.model_name, args.auth_token, args.format_tabs, args.problem_file, args.no_append, args.skip)

================================================
FILE: archive/ktransformers/tests/AIME_2024/evaluation.py
================================================
# reference: https://github.com/declare-lab/instruct-eval/blob/main/human_eval/main.py#L35
def filter_answer(completion: str) -> str:
    # the answer is the last part of the completion, it's a int64 number
    # get the last line
    completion = completion.strip().split("\n")[-1]
    # handle the $\\boxed{...}$ format
    if "$\\boxed{" in completion:
        return completion.split("}")[0].split("{")[-1]
    return completion.split()[-1]


================================================
FILE: archive/ktransformers/tests/AIME_2024/prompts.py
================================================
def instruct_prompt(prompt: str) -> str:
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nSolve the following math problem without any tests or explanation only one answer surrounede by '$\\boxed{{}}$'\n{prompt}\n\n### Response:"""


================================================
FILE: archive/ktransformers/tests/UT/test_kdeepseek_attention_w8a8a2serve_npu.py
================================================
import sys
import types

import torch
import torch.nn as nn
import pytest

torch_npu = pytest.importorskip("torch_npu")

from ktransformers.operators.ascend.ascend_attention import (
    KDeepseekV2AttentionW8A8A2Serve,
)
import ktransformers.operators.ascend.ascend_attention as attn_mod

class DummyConfig:
    def __init__(self, hidden_size=4, num_attention_heads=1):
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads


class DummyOrigAttn(nn.Module):
    def __init__(self, config=None, layer_idx=0):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx

        hidden_dim = config.hidden_size if config is not None else 4

        self.q_proj = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.kv_a_proj_with_mqa = None
        self.kv_a_layernorm = nn.LayerNorm(2)
        self.o_proj = None


class DummyDynamicQuantOps:
    def execute(self, inputs):
        x = inputs[0]
        return [x]


class DummyMatMulOps:
    def execute(self, inputs):
        x = inputs[0]
        return [x]


class DummyQuantProj(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.input_scale = torch.tensor(1.0, dtype=torch.float16)
        self.input_offset = torch.tensor(0.0, dtype=torch.float16)
        self.weight = nn.Parameter(torch.zeros(dim, dim, dtype=torch.float16))
        self.quant_bias = torch.zeros(dim, dtype=torch.float16)
        self.deq_scale = torch.tensor(1.0, dtype=torch.float16)


class DummyStaticCache:
    def __init__(self, page_size=16):
        self.page_size = page_size

    def get_usable_length(self, kv_seq_len, layer_idx):
        return 0

    def update(self, combined, layer_idx, cache_kwargs):
        return combined, None


class DummyNpuFusedAttention:
    def __call__(self, q, k, v, **kwargs):
        bsz, max_q_len, num_heads, dim = q.shape
        out = torch.zeros(
            bsz, max_q_len, num_heads, dim, dtype=q.dtype, device=q.device
        )
        softmax_lse = torch.zeros(1, dtype=q.dtype, device=q.device)
        return out, softmax_lse

    def out(self, q, k, v, workspace=None,
            query_rope=None, key_rope=None,
            num_heads=None, num_key_value_heads=None,
            input_layout=None, scale=None,
            antiquant_mode=None, antiquant_scale=None,
            block_table=None, block_size=None,
            actual_seq_lengths_kv=None,
            sparse_mode=None,
            out=None):
        attn_output, softmax_lse = out
        attn_output.zero_()
        softmax_lse.zero_()
        return attn_output, softmax_lse


class DummyOpsNpu:
    def npu_fused_infer_attention_score(self, q, k, v, **kwargs):
        bsz, num_heads, q_len, dim = q.shape
        out = torch.zeros(
            bsz, num_heads, q_len, dim, dtype=q.dtype, device=q.device
        )
        softmax_lse = torch.zeros(1, dtype=q.dtype, device=q.device)
        return out, softmax_lse

def fake_apply_rotary_pos_emb_fusion(q_pe, k_pe, cos, sin):
    return q_pe, k_pe

def build_attention_module(q_lora_rank=None):
    if hasattr(attn_mod, "get_tensor_parallel_size"):
        attn_mod.get_tensor_parallel_size = lambda: 1  # type: ignore

    config = DummyConfig(hidden_size=4, num_attention_heads=1)
    orig = DummyOrigAttn(config=config, layer_idx=0)

    attn = KDeepseekV2AttentionW8A8A2Serve(
        key="test",
        gguf_loader=None,
        config=config,
        orig_module=orig,
        prefill_device="npu",
        generate_device="npu",
    )

    hidden_dim = 4
    num_heads = 1
    qk_nope_head_dim = 2
    qk_rope_head_dim = 2
    q_head_dim = qk_nope_head_dim + qk_rope_head_dim  # 4
    kv_lora_rank = 2
    v_head_dim = 2

    attn.num_heads = num_heads
    attn.q_head_dim = q_head_dim
    attn.qk_nope_head_dim = qk_nope_head_dim
    attn.qk_rope_head_dim = qk_rope_head_dim
    attn.kv_lora_rank = kv_lora_rank
    attn.v_head_dim = v_head_dim
    attn.softmax_scale = 1.0
    attn.layer_idx = 0
    attn.sparse_mode = 0
    attn.q_lora_rank = q_lora_rank

    attn.elewise_quant = DummyDynamicQuantOps()
    attn.matmulDequant_operation = DummyMatMulOps()
    attn.matmulDequant_operation_aclnn = DummyMatMulOps()

    orig_mod = attn.orig_module

    if q_lora_rank is None:
        orig_mod.q_proj = nn.Linear(hidden_dim, num_heads * q_head_dim, bias=False)
        orig_mod.q_proj = orig_mod.q_proj.to(dtype=torch.float16)
    else:
        orig_mod.q_a_proj = DummyQuantProj(hidden_dim)
        orig_mod.q_b_proj = DummyQuantProj(hidden_dim)
        orig_mod.q_a_layernorm = nn.LayerNorm(hidden_dim)

    orig_mod.kv_a_proj_with_mqa = DummyQuantProj(hidden_dim)
    orig_mod.kv_a_layernorm = nn.LayerNorm(kv_lora_rank)

    orig_mod.o_proj = DummyQuantProj(num_heads * v_head_dim)

    attn.q_absorb = torch.randn(
        num_heads, qk_nope_head_dim, kv_lora_rank, dtype=torch.float16
    )
    attn.out_absorb = torch.randn(
        num_heads, kv_lora_rank, v_head_dim, dtype=torch.float16
    )
    def fake_rotary_emb(q_pe, position_ids):
        bsz, n_heads, q_len, dim = q_pe.shape
        cos = torch.ones(1, 1, q_len, dim, dtype=q_pe.dtype, device=q_pe.device)
        sin = torch.zeros(1, 1, q_len, dim, dtype=q_pe.dtype, device=q_pe.device)
        return cos, sin

    attn.rotary_emb = fake_rotary_emb

    return attn

@pytest.fixture(autouse=True)
def _patch_env(monkeypatch):
    if hasattr(attn_mod, "apply_rotary_pos_emb_fusion"):
        monkeypatch.setattr(
            attn_mod, "apply_rotary_pos_emb_fusion",
            fake_apply_rotary_pos_emb_fusion
        )

    if hasattr(attn_mod, "get_use_npu_graph"):
        monkeypatch.setattr(attn_mod, "get_use_npu_graph", lambda: False)

    if hasattr(attn_mod, "get_tensor_parallel_size"):
        monkeypatch.setattr(attn_mod, "get_tensor_parallel_size", lambda: 1)

    if hasattr(attn_mod, "get_tensor_parallel_group"):
        monkeypatch.setattr(attn_mod, "get_tensor_parallel_group", lambda: None)

    if hasattr(attn_mod, "get_current_device"):
        monkeypatch.setattr(attn_mod, "get_current_device", lambda: "cpu")

    # torch.distributed.barrier -> no-op
    if hasattr(torch, "distributed") and hasattr(torch.distributed, "barrier"):
        monkeypatch.setattr(
            torch.distributed, "barrier",
            lambda *args, **kwargs: None,
            raising=False,
        )

    dummy_op = DummyNpuFusedAttention()
    monkeypatch.setattr(
        torch_npu, "npu_fused_infer_attention_score",
        dummy_op, raising=False
    )

    def fake_get_workspace(q, k, v, **kwargs):
        return torch.empty(1, dtype=q.dtype, device=q.device)

    monkeypatch.setattr(
        torch_npu, "_npu_fused_infer_attention_score_get_max_workspace",
        fake_get_workspace, raising=False
    )

    monkeypatch.setattr(torch.ops, "npu", DummyOpsNpu(), raising=False)

    yield


# ==========================
#  测试用例
# ==========================

def test_print_callback_smoke():
    attn = build_attention_module()
    bsz, q_len, hidden_dim = 1, 3, 4
    hidden_states = torch.randn(bsz, q_len, hidden_dim)
    position_ids = torch.arange(q_len).unsqueeze(0)
    cache_position = torch.arange(q_len).unsqueeze(0)
    page_idx = torch.zeros(bsz, dtype=torch.int32)
    page_offset = torch.zeros(bsz, dtype=torch.int32)
    block_table = torch.zeros(bsz, 1, dtype=torch.int32)

    attn.print_callback(
        (hidden_states, position_ids, cache_position,
         page_idx, page_offset, block_table)
    )


def _common_inputs_prefill():
    bsz, q_len, hidden_dim = 1, 3, 4
    hidden_states = torch.randn(bsz, q_len, hidden_dim, dtype=torch.float16)
    attention_mask = torch.zeros(bsz, 1, q_len, q_len, dtype=torch.float32)
    position_ids = torch.arange(q_len).unsqueeze(0)
    cache_position = torch.arange(q_len).unsqueeze(0)
    page_idx = torch.zeros(bsz, dtype=torch.int32)
    page_offset = torch.zeros(bsz, dtype=torch.int32)
    block_table = torch.zeros(bsz, 1, dtype=torch.int32)
    past_key_value = DummyStaticCache(page_size=16)
    q_len_raw = torch.tensor([q_len], dtype=torch.int32)
    kv_len_raw = torch.tensor([q_len], dtype=torch.int32)

    return (
        hidden_states, attention_mask, position_ids, cache_position,
        page_idx, page_offset, block_table,
        past_key_value, q_len_raw, kv_len_raw
    )


def test_forward_prefill_with_mask():
    """
    is_prefill=True + attention_mask 不为 None + past_key_value 不为 None
    """
    attn = build_attention_module(q_lora_rank=None)

    (hidden_states, attention_mask, position_ids, cache_position,
     page_idx, page_offset, block_table,
     past_key_value, q_len_raw, kv_len_raw) = _common_inputs_prefill()

    outputs = attn.forward(
        hidden_states=hidden_states,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_value=past_key_value,
        output_attentions=False,
        use_cache=True,
        cache_position=cache_position,
        is_prefill=True,
        page_idx=page_idx,
        page_offset=page_offset,
        block_table=block_table,
        q_len_raw=q_len_raw,
        kv_len_raw=kv_len_raw,
        stream=None,
    )

    attn_output, attn_weights, new_cache = outputs
    assert attn_output.shape == (
        1,  # bsz
        3,  # q_len
        attn.num_heads * attn.v_head_dim,
    )
    assert attn_weights is None
    assert new_cache is past_key_value


def test_forward_prefill_without_mask_and_q_lora():
    """
    is_prefill=True + attention_mask=None + q_lora_rank 非 None 分支
    """
    attn = build_attention_module(q_lora_rank=1)

    (hidden_states, attention_mask, position_ids, cache_position,
     page_idx, page_offset, block_table,
     past_key_value, q_len_raw, kv_len_raw) = _common_inputs_prefill()

    outputs = attn.forward(
        hidden_states=hidden_states,
        attention_mask=None,
        position_ids=position_ids,
        past_key_value=past_key_value,
        output_attentions=False,
        use_cache=True,
        cache_position=cache_position,
        is_prefill=True,
        page_idx=None,
        page_offset=None,
        block_table=None,
        q_len_raw=q_len_raw,
        kv_len_raw=kv_len_raw,
        stream=None,
    )

    attn_output, attn_weights, new_cache = outputs
    assert attn_output.shape == (
        1,
        3,
        attn.num_heads * attn.v_head_dim,
    )
    assert attn_weights is None
    assert new_cache is past_key_value


def test_forward_decode_paged_path():
    """
    is_prefill=False + get_use_npu_graph=False
    => 走 forward_paged + torch.ops.npu.npu_fused_infer_attention_score 分支
    """
    attn = build_attention_module(q_lora_rank=None)

    bsz, q_len, hidden_dim = 1, 1, 4
    hidden_states = torch.randn(bsz, q_len, hidden_dim, dtype=torch.float16)
    position_ids = torch.arange(q_len).unsqueeze(0)
    cache_position = torch.arange(q_len).unsqueeze(0)
    past_key_value = DummyStaticCache(page_size=16)
    q_len_raw = torch.tensor([q_len], dtype=torch.int32)
    kv_len_raw = torch.tensor([q_len], dtype=torch.int32)
    block_table = torch.zeros(bsz, 1, dtype=torch.int32)

    outputs = attn.forward(
        hidden_states=hidden_states,
        attention_mask=None,
        position_ids=position_ids,
        past_key_value=past_key_value,
        output_attentions=False,
        use_cache=True,
        cache_position=cache_position,
        is_prefill=False,
        page_idx=None,
        page_offset=None,
        block_table=block_table,
        q_len_raw=q_len_raw,
        kv_len_raw=kv_len_raw,
        stream=None,
    )

    attn_output, attn_weights, new_cache = outputs
    assert attn_output.shape == (
        bsz,
        q_len,
        attn.num_heads * attn.v_head_dim,
    )
    assert attn_weights is None
    assert new_cache is past_key_value


def test_forward_prefill_layer_idx_none_raises():
    """
    覆盖: past_key_value 不为 None 且 layer_idx 为 None 的异常分支。
    """
    attn = build_attention_module(q_lora_rank=None)
    attn.layer_idx = None  # 手动破坏 layer_idx

    (hidden_states, attention_mask, position_ids, cache_position,
     page_idx, page_offset, block_table,
     past_key_value, q_len_raw, kv_len_raw) = _common_inputs_prefill()

    with pytest.raises(ValueError):
        attn.forward(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=False,
            use_cache=True,
            cache_position=cache_position,
            is_prefill=True,
            page_idx=page_idx,
            page_offset=page_offset,
            block_table=block_table,
            q_len_raw=q_len_raw,
            kv_len_raw=kv_len_raw,
            stream=None,
        )


def test_forward_prefill_attn_output_shape_mismatch_raises(monkeypatch):
    """
    覆盖: attn_output 形状不符合期望时的 ValueError 分支。
    """
    attn = build_attention_module(q_lora_rank=None)

    def bad_fused(q, k, v, **kwargs):
        bsz, max_q_len, num_heads, dim = q.shape
        # 刻意制造 num_heads+1，触发 size 检查不通过
        out = torch.zeros(
            bsz, max_q_len, num_heads + 1, attn.v_head_dim,
            dtype=q.dtype, device=q.device
        )
        lse = torch.zeros(1, dtype=q.dtype, device=q.device)
        return out, lse

    monkeypatch.setattr(
        torch_npu, "npu_fused_infer_attention_score",
        bad_fused, raising=False
    )

    (hidden_states, attention_mask, position_ids, cache_position,
     page_idx, page_offset, block_table,
     past_key_value, q_len_raw, kv_len_raw) = _common_inputs_prefill()

    with pytest.raises(ValueError):
        attn.forward(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=False,
            use_cache=True,
            cache_position=cache_position,
            is_prefill=True,
            page_idx=page_idx,
            page_offset=page_offset,
            block_table=block_table,
            q_len_raw=q_len_raw,
            kv_len_raw=kv_len_raw,
            stream=None,
        )


def test_forward_paged_use_npu_graph(monkeypatch):
    """
    覆盖: get_use_npu_graph() == True 的 graph 路径。
    """
    # 让 ascend_attention.get_use_npu_graph 返回 True
    monkeypatch.setattr(attn_mod, "get_use_npu_graph", lambda: True)

    # 伪造 model_runner 模块，满足 import ktransformers.server.balance_serve.inference.model_runner
    dummy_runner = type(
        "DummyRunner", (), {"__init__": lambda self: setattr(self, "workspace", [None] * 4)}
    )

    dummy_mr = types.SimpleNamespace(
        ModelRunner=dummy_runner,
        get_or_create_model_runner=lambda device=None: dummy_runner(),
    )

    sys.modules[
        "ktransformers.server.balance_serve.inference.model_runner"
    ] = dummy_mr

    attn = build_attention_module(q_lora_rank=None)

    bsz, q_len, hidden_dim = 1, 1, 4
    hidden_states = torch.randn(bsz, q_len, hidden_dim, dtype=torch.float16)
    position_ids = torch.arange(q_len).unsqueeze(0)
    cache_position = torch.arange(q_len).unsqueeze(0)
    past_key_value = DummyStaticCache(page_size=16)
    q_len_raw = torch.tensor([q_len], dtype=torch.int32)
    kv_len_raw = torch.tensor([q_len], dtype=torch.int32)
    block_table = torch.zeros(bsz, 1, dtype=torch.int32)

    outputs = attn.forward(
        hidden_states=hidden_states,
        attention_mask=None,
        position_ids=position_ids,
        past_key_value=past_key_value,
        output_attentions=False,
        use_cache=True,
        cache_position=cache_position,
        is_prefill=False,
        page_idx=None,
        page_offset=None,
        block_table=block_table,
        q_len_raw=q_len_raw,
        kv_len_raw=kv_len_raw,
        stream=None,
    )

    attn_output, attn_weights, new_cache = outputs
    assert attn_output.shape == (
        bsz,
        q_len,
        attn.num_heads * attn.v_head_dim,
    )
    assert attn_weights is None
    assert new_cache is past_key_value


================================================
FILE: archive/ktransformers/tests/UT/test_kdeepseek_ln_npu.py
================================================
import torch
import torch.nn as nn
import pytest

# 按你实际代码位置改路径：
from ktransformers.operators.ascend.ascend_layernorm import KDeepseekV3RMSNormW8A8
import ktransformers.util.utils as utils_mod

torch_npu = pytest.importorskip("torch_npu")


# ==========================
# Dummy 依赖
# ==========================

class DummyOrigModule(nn.Module):
    def __init__(self, hidden_size=4, variance_epsilon=1e-5):
        super().__init__()
        self.hidden_size = hidden_size
        self.variance_epsilon = variance_epsilon


class DummySafeTensorLoader:
    def __init__(self):
        self.tensors = {}
        self.load_calls = []

    def load_tensor(self, name: str):
        self.load_calls.append(name)
        return self.tensors[name]


class DummyGGUFLoader:
    def __init__(self, safetensor_loader: DummySafeTensorLoader):
        self.safetensor_loader = safetensor_loader


class DummyConfig:
    pass


class FakeRMSNorm:
    def __init__(self):
        self.last_args = None

    def __call__(self, hidden_states, weight, eps):
        self.last_args = (hidden_states, weight, eps)

        out = hidden_states * weight
        return (out,)


def build_rms_module(hidden_size=4, eps=1e-5, safetensor_loader=None):
    orig = DummyOrigModule(hidden_size=hidden_size, variance_epsilon=eps)
    if safetensor_loader is None:
        safetensor_loader = DummySafeTensorLoader()
    gguf_loader = DummyGGUFLoader(safetensor_loader)
    config = DummyConfig()
    module = KDeepseekV3RMSNormW8A8(
        key="rms",
        gguf_loader=gguf_loader,
        config=config,
        orig_module=orig,
        prefill_device="npu",
        generate_device="npu",
    )
    return module, safetensor_loader, orig

@pytest.fixture(autouse=True)
def patch_utils_and_npu(monkeypatch):
    monkeypatch.setattr(utils_mod, "get_current_device", lambda: "cpu", raising=False)

    fake = FakeRMSNorm()
    monkeypatch.setattr(torch_npu, "npu_rms_norm", fake, raising=False)

    import sys
    sys.modules[__name__]._fake_rms = fake

    yield

def get_fake_rms():
    import sys
    return sys.modules[__name__]._fake_rms

def test_forward_preserves_shape_and_dtype():
    hidden_size = 4
    module, _, orig = build_rms_module(hidden_size=hidden_size, eps=1e-6)

    x = torch.randn(2, 3, hidden_size, dtype=torch.float16)

    out = module(x)

    assert out.shape == x.shape
    assert out.dtype == x.dtype

    fake_rms = get_fake_rms()
    hs_arg, w_arg, eps_arg = fake_rms.last_args
    assert hs_arg is x
    assert w_arg is module.weight
    assert eps_arg == orig.variance_epsilon


def test_forward_with_bfloat16_dtype():
    hidden_size = 4
    module, _, _ = build_rms_module(hidden_size=hidden_size, eps=1e-6)

    x = torch.randn(1, 2, hidden_size, dtype=torch.bfloat16)
    out = module(x)

    assert out.shape == x.shape
    assert out.dtype == torch.bfloat16


def test_forward_uses_bias():
    hidden_size = 4
    module, _, _ = build_rms_module(hidden_size=hidden_size, eps=1e-6)

    module.weight.data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32)
    module.bias.data = torch.tensor([-1.0, 0.5, 0.0, 2.0], dtype=torch.float32)

    x = torch.arange(2 * 3 * hidden_size, dtype=torch.float16).view(2, 3, hidden_size)

    out = module(x)

    expected_rms = x.to(torch.float32) * module.weight
    expected = expected_rms + module.bias

    assert torch.allclose(out, expected.to(out.dtype))


def test_load_from_safetensor_loader():
    hidden_size = 4
    module, safe_loader, _ = build_rms_module(hidden_size=hidden_size, eps=1e-5)

    w_loaded = torch.arange(hidden_size, dtype=torch.float32)
    b_loaded = torch.full((hidden_size,), 3.0, dtype=torch.float32)

    safe_loader.tensors["rms.weight"] = w_loaded
    safe_loader.tensors["rms.bias"] = b_loaded

    module.load()

    assert torch.allclose(module.weight, w_loaded)
    assert torch.allclose(module.bias, b_loaded)

    assert safe_loader.load_calls == ["rms.weight", "rms.bias"]


def test_unload_sets_weight_and_bias_to_none_idempotent():
    module, _, _ = build_rms_module(hidden_size=4, eps=1e-5)

    assert module.weight is not None
    assert module.bias is not None

    module.unload()
    assert module.weight is None
    assert module.bias is None

    module.unload()
    assert module.weight is None
    assert module.bias is None


================================================
FILE: archive/ktransformers/tests/dequant_gpu.py
================================================
import os 
# os.environ["CUDA_VISIBLE_DEVICES"]="1,2"
# add path
import sys
current_path = os.path.abspath(os.path.dirname(__file__))
sys.path.append(current_path+"/../..")
import numpy as np
# from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
# from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
from ktransformers.util.custom_gguf import GGUFLoader
import torch
import KTransformersOps
torch.set_default_dtype(torch.bfloat16)
import time
from transformers import (
    AutoConfig,
)
import os
# CUDA_LAUNCH_BLOCKING=1
os.environ["CUDA_LAUNCH_BLOCKING"]="1"

gguf_config = GGUFLoader("/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m")
model_name = "/data/Qwen2-57B-A14B-Instruct"

# Q4k
key = "blk.1."
target = "attn_q.weight"

t1 = time.time()
q_weight_cpu = gguf_config.load_gguf_tensor(key+target, "cpu")
# q_weight_cpu = torch.from_numpy(q_weight_cpu)

t2 = time.time()
q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda:0")
t3 = time.time()
print()
allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu(), atol=1e-6)
print(f"Q4k {key+target}")
print("load gguf tensor from cpu cost: ", t2-t1)
print("load gguf tensor from gpu cost: ", t3-t2)
print("allclose: ", allclose)


# Q6k
key = "blk.0."
target = "ffn_down_exps.weight"

t1 = time.time()
q_weight_cpu = gguf_config.load_gguf_tensor(key+target, "cpu")
t2 = time.time()
q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda:0")
t3 = time.time()
print()
allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu().to(torch.float32), atol=1e-6)
print(f"Q6k {key+target}")
print("load gguf tensor from cpu cost: ", t2-t1)
print("load gguf tensor from gpu cost: ", t3-t2)
print("allclose: ", allclose)


================================================
FILE: archive/ktransformers/tests/dequant_gpu_t.py
================================================
import os 
os.environ["CUDA_VISIBLE_DEVICES"]="1"
# add path
import sys
sys.path.append("../..")
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
from ktransformers.util.custom_loader import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
import torch
import KTransformersOps
torch.set_default_dtype(torch.bfloat16)
import time
from transformers import (
    AutoConfig,
)

gguf_config = GGUFLoader("/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m")
model_name = "/data/Qwen2-57B-A14B-Instruct"
key = "blk.0."
target = "ffn_up_exps.weight"

data = gguf_config.get_mmap_tensor(key + target)

_, factors, offsets, qs1, qs2= dequantize_q4_k(data)
factors_cpu = torch.from_numpy(factors)
offsets_cpu = torch.from_numpy(offsets)
qs1_cpu = torch.from_numpy(qs1)
qs2_cpu = torch.from_numpy(qs2)


_, factors, offsets, qs1, qs2 = dequantize_q4_k_gpu(data)

print(torch.allclose(factors.cpu(), factors_cpu))
print(torch.allclose(offsets.cpu(), offsets_cpu))
print(torch.allclose(qs1.cpu(), qs1_cpu))
print(torch.allclose(qs2.cpu(), qs2_cpu))

================================================
FILE: archive/ktransformers/tests/function_call_test.py
================================================
from openai import OpenAI

def send_messages(messages):
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        tools=tools
    )
    return response.choices[0].message

client = OpenAI(
    api_key="placeholder",
    base_url="http://0.0.0.0:10002/v1",
)

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get weather of an location, the user shoud supply a location first",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    }
                },
                "required": ["location"]
            },
        }
    },
]

messages = [{"role": "user", "content": "How's the weather in Hangzhou?"}]
message = send_messages(messages)
print(f"User>\t {messages[0]['content']}")
print(message)
tool = message.tool_calls[0]
messages.append(message)

messages.append({"role": "tool", "tool_call_id": tool.id, "content": "24℃"})
message = send_messages(messages)
print(f"Model>\t {message.content}")

================================================
FILE: archive/ktransformers/tests/humaneval/eval_api.py
================================================
# adapt from https://github.com/abacaj/code-eval?tab=readme-ov-file
import argparse
import os
import requests
from human_eval.data import write_jsonl, read_problems
import tqdm

from evaluation import filter_code, fix_indents
from prompts import instruct_prompt

def generate_text(api_url,question , model_name, stream=False, auth_token=None):
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
        # 添加 API Key
        'Authorization' : 'Bearer ' + auth_token if auth_token else ''
    }
    question = instruct_prompt(question)
    data = {
        "messages": [{"content": question, "role": "user"}],
        "model": model_name,
        "stream": stream,
        "temperature": 0.6
    }
    print(f"content: {question}")
    response = requests.post(api_url, headers=headers, json=data,verify=False)
    if response.status_code == 200:
        result = response.json()
        results = result.get('choices', [{}])[0].get('message', {}).get('content', '')
        return [filter_code(fix_indents(results))]
    else:
        print(f"API Request failed with status code {response.status_code}")
        return None

def run_eval_api(
    api_url: str,
    model_name: str,
    out_path: str,
    format_tabs: bool = False,
    auth_token: str = None,
    problem_file: str = None,
    append: bool = False,
    skip: int = 0
):
    if(problem_file is None):
        problems = read_problems()
    else:
        problems = read_problems(problem_file)
    samples = []
    pbar = tqdm.tqdm(total=len(problems) * 1)
    pbar.update(skip)
    try:
        for task_id in problems:
            # skip some tasks
            if skip > 0:
                skip -= 1
                continue

            if format_tabs:
                prompt = problems[task_id]["prompt"].replace("    ", "\t")
            else:
                prompt = problems[task_id]["prompt"]
            completion = generate_text(api_url, prompt, model_name, auth_token=auth_token)
            # samples.append({"task_id": task_id, "completion": completion})
            for sample in completion:
                result = dict(
                    task_id=task_id,
                    completion=sample,
                )
                samples += [result]
                if append:
                    write_jsonl(out_path, [result],append=append)
            pbar.update(1)
        if not append:
            write_jsonl(out_path, samples,append=append)
    except Exception as e:
        if not append:
            write_jsonl(out_path, samples,append=append)
        print(f"Error: {e}")

def main(output_path, api_url, model_name, auth_token, format_tabs,problem_file, append,skip):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    run_eval_api(api_url, model_name, output_path, format_tabs, auth_token, problem_file,append,skip)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="API Generate Tester")
    #parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")
    parser.add_argument("--api_url", type=str, default="http://localhost:10002/v1/chat/completions", help="API URL")
    parser.add_argument("--model_name", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model Name")
    parser.add_argument("--out_path", type=str, default="results/api/eval_b.jsonl", help="Output Path")
    parser.add_argument("--auth_token", type=str, default=None, help="Auth Token")
    parser.add_argument("--format_tabs", action="store_true", help="Format Tabs")
    parser.add_argument("--problem_file", type=str, default=None, help="Evalset File")
    parser.add_argument("--no_append", action="store_false", help="Append to existing file")
    parser.add_argument("--skip", type=int, default=0, help="Skip first n problems")
    args = parser.parse_args()
    # api_url = "https://api.siliconflow.cn/v1/chat/completions"
    main(args.out_path, args.api_url, args.model_name, args.auth_token, args.format_tabs, args.problem_file, args.no_append,args.skip)

================================================
FILE: archive/ktransformers/tests/humaneval/evaluation.py
================================================
# reference: https://github.com/declare-lab/instruct-eval/blob/main/human_eval/main.py#L35
def filter_code(completion: str) -> str:
    # The program tends to overwrite, we only take the first function
    completion = completion.lstrip("\n")
    # we also remove ```python\n and ```
    completion = completion.replace("```python\n", "").replace("```", "")
    if 'if __name__ == "__main__":' in completion:
        completion = completion.split('if __name__ == "__main__":')[0]
    if "# Example usage" in completion:
        completion = completion.split("# Example usage")[0]
    return completion


def fix_indents(text: str) -> str:
    return text.replace("\t", "    ")


================================================
FILE: archive/ktransformers/tests/humaneval/prompts.py
================================================
def instruct_prompt(prompt: str) -> str:
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following Python code without any tests or explanation\n{prompt}\n\n### Response:"""


def standard_prompt(prompt: str) -> str:
    return f"""Complete the following Python code without any tests or explanation\n{prompt}"""


def write_prompt(prompt: str) -> str:
    return f"""Write a python program to complete the following code:\n{prompt}"""


def replit_glaive_prompt(prompt: str) -> str:
    return f"""Below is an instruction that describes a task, paired with an input that provides further context.\n Write a response that appropriately completes the request.\n\n ### Instruction:\nWrite a program to perform the given task.\n\n Input:\n{prompt}\n\n### Response:"""


================================================
FILE: archive/ktransformers/tests/mmlu_pro_test.py
================================================
import argparse
import random
import time
import json
import requests
import pandas as pd
from datasets import load_dataset

import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ['https_proxy'] = ''
os.environ['http_proxy'] = ''
hint = 'There is a single choice question. Answer the question by replying A, B, C, D, E, F, G, H, I, J. No other answers are accepted. Just the letter.'


class DataEvaluator:
    def __init__(self):
        # self.template_prompt = template_prompt
        self.data = []

    def load_data(self, file_path):
        """
        Load data from a Parquet file into a list.
        Each record in the Parquet file should represent an individual record.
        """
        # 读取 Parquet 文件
        # dataset = load_dataset('parquet', data_files=file_path)
        ds = load_dataset("TIGER-Lab/MMLU-Pro")
        df = pd.DataFrame(ds['test'])
        # print(ds)
        # # ds_1 =  ds['train']
        # ds_2 =  ds['validation']
        # ds_3 =  ds['test']
        # # 将数据集转换为 Pandas DataFrame
        # df_test = pd.DataFrame(ds['test'])
        # df_val = pd.DataFrame(ds['validation'])

        # for _, row in df.iterrows():
        #     self.data.append(row.to_dict())
        # df = pd.read_parquet(file_path)

        for _, row in df.iterrows():
            self.data.append(row.to_dict())

    def get_prompt(self, record):
        """
        Combine fields from a record with the template prompt to create a full prompt.
        :param record: Dictionary containing fields to populate the template.
        :return: A formatted prompt string.
        """
        # 查看ABCD。。。的选项
        options_str = "\n".join([f"{chr(65+i)}. {opt}" for i, opt in enumerate(record['options'])])
        prompt = hint + "\nQuestion: " + record['question'] + "\n" + options_str + "\nAnswer: '"
        return prompt
        
    def post_processing(self, text):
        """
        Perform post-processing on the prediction string.
        :param text: The raw prediction string.
        :return: Processed prediction string.
        """
        text = text.lstrip('\n').split('\n')[-1]
        return text[-1:]

    def score(self, pred, answers):
        """
        Calculate scores between the prediction and the answer.
        Uses ROUGE scores as the evaluation metric.
        :param pred: The predicted string.
        :param answer: The reference answer string.
        :return: A dictionary containing ROUGE scores.
        """
        for answer in answers:
            if pred == answer:
                return 1

        return 0

# Function to generate text using API
def generate_text(api_url, question, model_name, stream=False):
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
        # 添加 API Key
        'Authorization' : 'Bearer '
    }
    data = {
        "messages": [{"content": question, "role": "user"}],
        "model": model_name,
        "stream": stream,
        # "temperature": 0.0
    }
    
    print("POST data:", data)
    response = requests.post(api_url, headers=headers, json=data)
    
    if response.status_code == 200:
        result = response.json()
        return result.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
    else:
        print(f"API Request failed with status code {response.status_code}")
        return None

# Main function to handle multiple evaluations
def main(concurrent_requests, data_evaluator: DataEvaluator, result_file, log_file, api_url, model_name):
    start_total_time = time.time()

    total_score = 0

    results = []
    # 设置随机数种子
    random.seed(42)
    random.shuffle(data_evaluator.data)
    for i in range(min(concurrent_requests, len(data_evaluator.data))):
        # Randomly select a data item from data for each request
        data_item = data_evaluator.data[i]
        question = data_evaluator.get_prompt(data_item)
        # print(question)

        # Start the timer for this evaluation
        start_time = time.time()
        try:
            # Generate prediction using the API
            prediction = generate_text(api_url, question, model_name)

            if prediction is None:
                raise Exception(f"Failed to get prediction for {question}")

            answer = data_item['answer']
            # Compute score
            score = data_evaluator.score(data_evaluator.post_processing(prediction), answer)

            # Calculate the time taken
            elapsed_time = time.time() - start_time

            # Collect the result data
            result_data = {
                "question_id": data_item['question_id'],
                "answer": answer,
                "prediction": data_evaluator.post_processing(prediction),
                "score": score,
                "time": elapsed_time
            }

            # Write results to result.json with each field on a new line
            with open(result_file, 'a', encoding='utf-8') as f:
                json.dump(result_data, f, ensure_ascii=False, indent=4)
                f.write("\n")  # Ensure each JSON object is on a new line

            results.append(result_data)

            # Aggregate scores
            total_score += score

        except Exception as e:
            print(f"Error processing request {i}: {e}")

    # Calculate total time and throughput
    total_time = time.time() - start_total_time
    throughput = concurrent_requests / total_time

    # Log the total time, throughput, and average ROUGE scores
    with open(log_file, 'a', encoding='utf-8') as log_f:
        log_f.write(f"Total Time: {total_time:.2f} seconds\n")
        log_f.write(f"Throughput: {throughput:.2f} requests per second\n")
        log_f.write(f"Average Scores: {total_score / concurrent_requests}\n")
        log_f.write('-' * 40 + '\n')

    print(f"Results saved to {result_file}")
    print(f"Log saved to {log_file}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="API Generate Tester")
    parser.add_argument("--concurrent", type=int, default=1000, help="Number of concurrent evaluations")
    parser.add_argument("--file", type=str, default="TIGER-Lab/MMLU-Pro", help="Path to the mmlu.jsonl file")
    parser.add_argument("--result", type=str, default="./mmlu_result_pro.json", help="Path to save the result JSON file")
    parser.add_argument("--log", type=str, default="./mmlu_result_pro.log", help="Path to save the log file")
    parser.add_argument("--model", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model name or path")
    parser.add_argument("--api_url", type=str, default="http://localhost:15488/v1/chat/completions", help="API URL")
    # parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")

    args = parser.parse_args()

    # Load the data from the provided file
    # template_prompt = hint + "\nQuestion: {question}\nA. {options}\nB. {option_b}\nC. {option_c}\nD. {option_d}\nAnswer: '"
    # template_prompt_pro = hint + "\nQuestion: {question}\nA. {options[0]}\nB. {options[1]}\nC. {options[2]}\nD. {options[3]}\nE. {options[4]}\nF. {options[5]}\nG. \
        # {options[6]}\nH. {options[7]}\nI. {options[8]}\nJ. {options[9]}\nAnswer: '"


    # Load the data from the provided file
    data_evaluator = DataEvaluator()
    data_evaluator.load_data(args.file)

    # Run the main function with the specified number of concurrent evaluations
    main(args.concurrent, data_evaluator, args.result, args.log, args.api_url, args.model)

================================================
FILE: archive/ktransformers/tests/mmlu_test.py
================================================
import argparse
import random
import time
import json
import requests
import pandas as pd
from datasets import load_dataset

import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ['https_proxy'] = ''
os.environ['http_proxy'] = ''
hint = 'There is a single choice question. Answer the question by replying A, B, C, D. No other answers are accepted. Just the letter.'


class DataEvaluator:
    def __init__(self):
        # self.template_prompt = template_prompt
        self.data = []

    def load_data(self, file_path):
        """
        Load data from a Parquet file into a list.
        Each record in the Parquet file should represent an individual record.
        """
        # 读取 Parquet 文件
        # dataset = load_dataset('parquet', data_files=file_path)
        splits = {'test': 'all/test-00000-of-00001.parquet', 'validation': 'all/validation-00000-of-00001.parquet',
                  'dev': 'all/dev-00000-of-00001.parquet',
                  'auxiliary_train': 'all/auxiliary_train-00000-of-00001.parquet'}
        df = pd.read_parquet("hf://datasets/cais/mmlu/" + splits["test"])

        for _, row in df.iterrows():
            self.data.append(row.to_dict())

    def get_prompt(self, record):
        """
        Combine fields from a record with the template prompt to create a full prompt.
        :param record: Dictionary containing fields to populate the template.
        :return: A formatted prompt string.
        """
        # 查看ABCD。。。的选项
        options_str = "\n".join([f"{chr(65 + i)}. {opt}" for i, opt in enumerate(record['choices'])])
        prompt = hint + "\nQuestion: " + record['question'] + "\n" + options_str + "\nAnswer: '"
        return prompt
        
    def post_processing(self, text):
        """
        Perform post-processing on the prediction string.
        :param text: The raw prediction string.
        :return: Processed prediction string.
        """
        text = text.lstrip('\n').split('\n')[-1]
        return text[-1:]

    def score(self, pred, answers):
        """
        Calculate scores between the prediction and the answer.
        Uses ROUGE scores as the evaluation metric.
        :param pred: The predicted string.
        :param answer: The reference answer string.
        :return: A dictionary containing ROUGE scores.
        """
        for answer in answers:
            if pred == answer:
                return 1

        return 0

# Function to generate text using API
def generate_text(api_url, question, model_name, stream=False):
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
        # 添加 API Key
        'Authorization' : 'Bearer '
    }
    data = {
        "messages": [{"content": question, "role": "user"}],
        "model": model_name,
        "stream": stream,
        # "temperature": 0.0
    }
    
    print("POST data:", data)
    response = requests.post(api_url, headers=headers, json=data)
    
    if response.status_code == 200:
        result = response.json()
        return result.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
    else:
        print(f"API Request failed with status code {response.status_code}")
        return None

# Main function to handle multiple evaluations
def main(concurrent_requests, data_evaluator: DataEvaluator, result_file, log_file, api_url, model_name):
    start_total_time = time.time()

    total_score = 0

    results = []
   # 设置随机数种子
    random.seed(42)
    random.shuffle(data_evaluator.data)
    for i in range(min(concurrent_requests, len(data_evaluator.data))):
        # Randomly select a data item from data for each request
        data_item = data_evaluator.data[i]
        question = data_evaluator.get_prompt(data_item)
        # print(question)

        # Start the timer for this evaluation
        start_time = time.time()
        try:
            # Generate prediction using the API
            prediction = generate_text(api_url, question, model_name)

            if prediction is None:
                raise Exception(f"Failed to get prediction for {question}")

            answer = chr(data_item['answer'] + 65)
            # Compute score
            score = data_evaluator.score(data_evaluator.post_processing(prediction), answer)

            # Calculate the time taken
            elapsed_time = time.time() - start_time

            # Collect the result data
            result_data = {
                "question_id": i,
                "answer": answer,
                "prediction": data_evaluator.post_processing(prediction),
                "score": score,
                "time": elapsed_time
            }

            # Write results to result.json with each field on a new line
            with open(result_file, 'a', encoding='utf-8') as f:
                json.dump(result_data, f, ensure_ascii=False, indent=4)
                f.write("\n")  # Ensure each JSON object is on a new line

            results.append(result_data)

            # Aggregate scores
            total_score += score

        except Exception as e:
            print(f"Error processing request {i}: {e}")

    # Calculate total time and throughput
    total_time = time.time() - start_total_time
    throughput = concurrent_requests / total_time

    # Log the total time, throughput, and average ROUGE scores
    with open(log_file, 'a', encoding='utf-8') as log_f:
        log_f.write(f"Total Time: {total_time:.2f} seconds\n")
        log_f.write(f"Throughput: {throughput:.2f} requests per second\n")
        log_f.write(f"Average Scores: {total_score / concurrent_requests}\n")
        log_f.write('-' * 40 + '\n')

    print(f"Results saved to {result_file}")
    print(f"Log saved to {log_file}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="API Generate Tester")
    parser.add_argument("--concurrent", type=int, default=1000, help="Number of concurrent evaluations")
    parser.add_argument("--file", type=str, default="cais/mmlu", help="Path to the mmlu.jsonl file")
    parser.add_argument("--result", type=str, default="./mmlu_result_silicon.json", help="Path to save the result JSON file")
    parser.add_argument("--log", type=str, default="./mmlu_result_silicon.log", help="Path to save the log file")
    parser.add_argument("--model", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model name or path")
    parser.add_argument("--api_url", type=str, default="http://localhost:10003/v1/chat/completions", help="API URL")
    # parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")

    args = parser.parse_args()

    # Load the data from the provided file
    # template_prompt = hint + "\nQuestion: {question}\nA. {options}\nB. {option_b}\nC. {option_c}\nD. {option_d}\nAnswer: '"
    # template_prompt_pro = hint + "\nQuestion: {question}\nA. {options[0]}\nB. {options[1]}\nC. {options[2]}\nD. {options[3]}\nE. {options[4]}\nF. {options[5]}\nG. \
        # {options[6]}\nH. {options[7]}\nI. {options[8]}\nJ. {options[9]}\nAnswer: '"


    # Load the data from the provided file
    data_evaluator = DataEvaluator()
    data_evaluator.load_data(args.file)

    # Run the main function with the specified number of concurrent evaluations
    main(args.concurrent, data_evaluator, args.result, args.log, args.api_url, args.model)

================================================
FILE: archive/ktransformers/tests/mmlu_test_multi.py
================================================
import argparse
import random
import time
import json
import requests
import pandas as pd
from datasets import load_dataset
import os
import concurrent.futures
import threading
import re

os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ['https_proxy'] = ''
os.environ['http_proxy'] = ''
hint = 'There is a single choice question. Answer the question by replying A, B, C, D. No other answers are accepted. Just the letter.'


def extract_final_answer(text):
    """
    提取模型预测的最终选项（如 A/B/C/D）
    支持自然语言、多行、markdown、高亮、非末尾结论等格式
    """
    text = text.strip()

    # 1. 显式语句匹配（优先）
    explicit_patterns = [
        r'Answer:\s*([A-D])\b',
        r'Correct answer:\s*([A-D])\b',
        r'The correct answer is\s*\*?\*?\s*([A-D])\b',
        r'Answer is\s*([A-D])\b',
        r'Therefore,\s*answer is\s*([A-D])\b',
        r'Therefore,\s*the answer should be\s*(?:Option\s*)?([A-D])\b',
        r'The answer should be\s*(?:Option\s*)?([A-D])\b',
        r'Option\s+([A-D])\s+is correct',
    ]
    for pat in explicit_patterns:
        match = re.search(pat, text, re.IGNORECASE)
        if match:
            return match.group(1).upper()

    # 2. markdown 强调 **C**, **C. something**
    markdown_match = re.findall(r'\*\*\s*([A-D])[\.\s]?', text)
    if markdown_match:
        return markdown_match[-1].upper()

    # 3. 查找单引号中的 'C' 或 "C"
    quote_match = re.findall(r"['\"]([A-D])['\"]", text)
    if quote_match:
        return quote_match[-1].upper()

    # 4. 倒数几行是否以 "C." 或 "C" 开头
    lines = text.splitlines()
    for line in reversed(lines[-5:]):
        line = line.strip()
        match = re.match(r'^([A-D])([.\s]|$)', line)
        if match:
            return match.group(1).upper()
    
    # 再不行就返回 None
    return None
class DataEvaluator:
    def __init__(self):
        self.data = []

    def load_data(self, file_path):
        """
        从数据文件中加载数据，每条记录对应一个实例
        """
        splits = {'test': 'all/test-00000-of-00001.parquet', 'validation': 'all/validation-00000-of-00001.parquet',
                  'dev': 'all/dev-00000-of-00001.parquet',
                  'auxiliary_train': 'all/auxiliary_train-00000-of-00001.parquet'}
        df = pd.read_parquet("hf://datasets/cais/mmlu/" + splits["test"])
        for _, row in df.iterrows():
            self.data.append(row.to_dict())

    def get_prompt(self, record):
        """
        结合提示信息和记录数据生成完整的题目
        """
        options_str = "\n".join([f"{chr(65 + i)}. {opt}" for i, opt in enumerate(record['choices'])])
        prompt = hint + "\nQuestion: " + record['question'] + "\n" + options_str + "\nAnswer: '"
        return prompt

    def post_processing(self, text):
        """
        对生成的文本进行后处理，提取最终答案（只返回最后一个字符）
        """
        text = text.lstrip('\n').split('\n')[-1]
        return text[-1:]

    def score(self, pred, answer):
        """
        对比预测答案和正确答案，返回得分
        """
        if pred == answer:
            return 1
        return 0

def generate_text(api_url, question, model_name, stream=False):
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': 'Bearer '  # 如有需要，请填入 API Key
    }
    data = {
        "messages": [{"content": question, "role": "user"}],
        "model": model_name,
        "stream": stream,
    }
    print("POST data:", data)
    response = requests.post(api_url, headers=headers, json=data, timeout=5000000)
    if response.status_code == 200:
        result = response.json()
        return result.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
    else:
        print(f"API Request failed with status code {response.status_code}")
        return None

def main(concurrent_requests, data_evaluator: DataEvaluator, result_file, log_file, api_url, model_name):
    start_total_time = time.time()
    total_score = 0
    total_exact_score = 0
    results = []
    file_lock = threading.Lock()
    
    # 打乱数据顺序，并选择需要测试的实例数
    random.seed(42)
    random.shuffle(data_evaluator.data)
    data_subset = data_evaluator.data[:min(concurrent_requests, len(data_evaluator.data))]
    
    batch_size = 10  # 每批次最多 10 个实例

    def worker(index, data_item):
        nonlocal total_score
        nonlocal total_exact_score
        question = data_evaluator.get_prompt(data_item)
        start_time = time.time()
        try:
            prediction = generate_text(api_url, question, model_name)
            if prediction is None:
                raise Exception(f"Failed to get prediction for question: {question}")
            # 正确答案：将数字转换成字母（0->A, 1->B, 2->C, 3->D）
            answer = chr(data_item['answer'] + 65)
            processed_prediction = data_evaluator.post_processing(prediction)
            score = data_evaluator.score(processed_prediction, answer)
            exact_score = data_evaluator.score(extract_final_answer(prediction), answer)
            elapsed_time = time.time() - start_time
            result_data = {
                "question_id": index,
                "answer": answer,
                "prediction": processed_prediction,
                "full_prediction": prediction,
                "score": score,
                "exact_score": exact_score,
                "time": elapsed_time
            }
            # 写入结果时加锁保证线程安全
            with file_lock:
                with open(result_file, 'a', encoding='utf-8') as f:
                    json.dump(result_data, f, ensure_ascii=False, indent=4)
                    f.write("\n")
            return result_data
        except Exception as e:
            print(f"Error processing request {index}: {e}")
            return None

    # 按批次处理，每批最多 10 个任务
    for batch_start in range(0, len(data_subset), batch_size):
        batch = data_subset[batch_start: batch_start + batch_size]
        with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = [executor.submit(worker, batch_start + j, data_item) for j, data_item in enumerate(batch)]
            for future in concurrent.futures.as_completed(futures):
                res = future.result()
                if res is not None:
                    results.append(res)
                    total_score += res['score']
                    total_exact_score += res['exact_score']
    
    total_time = time.time() - start_total_time
    throughput = len(data_subset) / total_time if total_time > 0 else 0
    
    with open(log_file, 'a', encoding='utf-8') as log_f:
        log_f.write(f"Total Time: {total_time:.2f} seconds\n")
        log_f.write(f"Throughput: {throughput:.2f} requests per second\n")
        average_score = total_score / len(data_subset) if data_subset else 0
        log_f.write(f"Average Score: {average_score}\n")
        average_exact_score = total_exact_score / len(data_subset) if data_subset else 0
        log_f.write(f"Average Exact Score: {average_exact_score}\n")
        log_f.write('-' * 40 + '\n')
    
    print(f"Results saved to {result_file}")
    print(f"Log saved to {log_file}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="API Generate Tester")
    parser.add_argument("--concurrent", type=int, default=1000, help="需要测试的实例总数")
    parser.add_argument("--file", type=str, default="cais/mmlu", help="数据文件路径")
    parser.add_argument("--result", type=str, default="./mmlu_result_silicon.json", help="结果文件保存路径")
    parser.add_argument("--log", type=str, default="./mmlu_result_silicon.log", help="日志文件保存路径")
    parser.add_argument("--model", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="模型名称或路径")
    parser.add_argument("--api_url", type=str, default="http://localhost:10006/v1/chat/completions", help="API URL")

    args = parser.parse_args()
    
    data_evaluator = DataEvaluator()
    data_evaluator.load_data(args.file)
    
    main(args.concurrent, data_evaluator, args.result, args.log, args.api_url, args.model)

================================================
FILE: archive/ktransformers/tests/parse_cover_info.py
================================================
import os
import ast
import argparse
from coverage import Coverage


def main():
    parser = argparse.ArgumentParser(
        description="统计某个类在 .coverage 数据中的行覆盖率"
    )
    parser.add_argument(
        "--data-file",
        default=".coverage",
        help="coverage 数据文件路径（默认 ./.coverage）",
    )
    parser.add_argument(
        "--file",
        dest="file_pattern",
        default="ktransformers/operators/ascend/ascend_attention.py",
        help=(
            "要统计的源码文件路径（可用结尾匹配，默认 "
            "ktransformers/operators/ascend/ascend_attention.py）"
        ),
    )
    parser.add_argument(
        "--class",
        dest="class_name",
        default="KDeepseekV2AttentionW8A8A2Serve",
        help="要统计的类名（默认 KDeepseekV2AttentionW8A8A2Serve）",
    )

    args = parser.parse_args()

    if not os.path.exists(args.data_file):
        print(f"找不到 coverage 数据文件: {args.data_file}")
        raise SystemExit(1)

    cov = Coverage(data_file=args.data_file)
    cov.load()
    data = cov.get_data()

    file_pattern_norm = os.path.normpath(args.file_pattern)

    target_file = None
    for f in data.measured_files():
        f_norm = os.path.normpath(f)
        if f_norm.endswith(file_pattern_norm) or file_pattern_norm in f_norm:
            target_file = f
            break

    if not target_file:
        print(
            f"没有在 coverage 数据里找到匹配文件: {args.file_pattern}\n"
            f"实际记录的文件有:"
        )
        for f in data.measured_files():
            print("  ", f)
        raise SystemExit(1)

    print("使用的源码文件:", target_file)
    executed_lines = set(data.lines(target_file) or [])
    try:
        with open(target_file, "r", encoding="utf-8") as f:
            source_text = f.read()
    except OSError as e:
        print(f"无法打开源码文件 {target_file}: {e}")
        raise SystemExit(1)

    source_lines = source_text.splitlines()
    tree = ast.parse(source_text)

    class_start = None
    class_end = None

    for node in tree.body:
        if isinstance(node, ast.ClassDef) and node.name == args.class_name:
            class_start = node.lineno
            max_lineno = node.lineno
            for sub in ast.walk(node):
                ln = getattr(sub, "end_lineno", getattr(sub, "lineno", None))
                if ln is not None and ln > max_lineno:
                    max_lineno = ln
            class_end = max_lineno
            break

    if class_start is None:
        print(f"在源码 {target_file} 中没有找到类 {args.class_name}")
        raise SystemExit(1)

    print(
        f"类 {args.class_name} 行范围: {class_start} ~ {class_end}"
    )

    total = 0
    covered = 0
    missed_lines = []

    for lineno in range(class_start, class_end + 1):
        line = source_lines[lineno - 1].strip()
        # 跳过空行和纯注释
        if not line or line.startswith("#"):
            continue

        total += 1
        if lineno in executed_lines:
            covered += 1
        else:
            missed_lines.append(lineno)

    percent = (covered / total * 100) if total > 0 else 0.0

    print(
        f"类 {args.class_name} 覆盖: {covered}/{total} 行, 覆盖率 = {percent:.1f}%"
    )
    if missed_lines:
        print("未覆盖行号:", missed_lines)
    else:
        print("该类所有有效代码行均被覆盖")


if __name__ == "__main__":
    main()


================================================
FILE: archive/ktransformers/tests/score.py
================================================
import subprocess
import time
import requests
import sys
import os

def wait_for_server(base_url: str, timeout: int = None) -> None:
    start_time = time.time()
    while True:
        try:
            response = requests.get(
                f"{base_url}/v1/models",
                headers={"Authorization": "Bearer None"},
            )
            if response.status_code == 200:
                print("Server is ready.")
                break
        except requests.exceptions.RequestException:
            time.sleep(1)
            if timeout and time.time() - start_time > timeout:
                raise TimeoutError("Server did not become ready within timeout period")

server_cmd = [
    "numactl", "-N", "1", "-m", "1",
    "/home/qujing3/anaconda3/envs/ktransformers-dev/bin/ktransformers",
    "--model_path", "/home/qujing3/models/DeepSeek-R1-Q4_K_M/config",
    "--gguf_path", "/home/qujing3/models/DeepSeek-V3-GGUF/DeepSeek-V3-Q4_K_M",
    "--port", "10002",
    "--cpu_infer", "48",
    "--optimize_config_path", "ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml",
    "--max_new_tokens", "3000",
    "--cache_lens", "6000"
]

print("Starting ktransformers server...")
print(" ".join(server_cmd))
with open("/tmp/server_log.txt", "w") as f:
    server_process = subprocess.Popen(server_cmd, stdout=f, stderr=f, text=True)

try:
    wait_for_server("http://localhost:10002", timeout=600)

    eval_cmd = ["python", "ktransformers/tests/humaneval/eval_api.py"]
    print("Running eval_api.py...")
    print(f"Command: {' '.join(eval_cmd)}")
    
    env = os.environ.copy()
    env["PYTHONUNBUFFERED"] = "1"
    
    eval_process = subprocess.Popen(
        eval_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
        env=env,
        universal_newlines=True
    )
    
    import threading
    import queue
    
    def enqueue_output(out, queue):
        for line in iter(out.readline, ''):
            queue.put(line)
        out.close()
    
    stdout_queue = queue.Queue()
    stderr_queue = queue.Queue()
    
    stdout_thread = threading.Thread(target=enqueue_output, args=(eval_process.stdout, stdout_queue))
    stderr_thread = threading.Thread(target=enqueue_output, args=(eval_process.stderr, stderr_queue))
    
    stdout_thread.daemon = True
    stderr_thread.daemon = True
    stdout_thread.start()
    stderr_thread.start()
    
    while eval_process.poll() is None:
        try:
            line = stdout_queue.get_nowait()
            print(line, end='', flush=True)
        except queue.Empty:
            pass
            
        try:
            line = stderr_queue.get_nowait()
            print(line, end='', file=sys.stderr, flush=True)
        except queue.Empty:
            pass
        
        time.sleep(1)

    while not stdout_queue.empty():
        print(stdout_queue.get(), end='', flush=True)
    while not stderr_queue.empty():
        print(stderr_queue.get(), end='', file=sys.stderr, flush=True)
        
    eval_process.wait()
    print(f"eval_api.py completed with exit code: {eval_process.returncode}")

    evaluate_cmd = [
        "evaluate_functional_correctness",
        "ktransformers/tests/humaneval/results/api/eval_b.jsonl"
    ]
    print("Running evaluate_functional_correctness...")
    print(f"Command: {' '.join(evaluate_cmd)}")
    
    evaluate_process = subprocess.Popen(
        evaluate_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
        universal_newlines=True
    )
    
    for line in evaluate_process.stdout:
        print(line, end='', flush=True)
    for line in evaluate_process.stderr:
        print(line, end='', file=sys.stderr, flush=True)
        
    evaluate_process.wait()
    
    print(f"evaluate_functional_correctness completed with exit code: {evaluate_process.returncode}")
    if evaluate_process.returncode != 0:
        print(f"evaluate_functional_correctness exited with code {evaluate_process.returncode}")
        sys.exit(evaluate_process.returncode)

finally:
    print("Stopping ktransformers server...")
    server_process.terminate()
    try:
        server_process.wait(timeout=30)
    except subprocess.TimeoutExpired:
        print("Server did not terminate gracefully, forcing...")
        server_process.kill()

================================================
FILE: archive/ktransformers/tests/test_client.py
================================================
import asyncio
import json
import sys
import aiohttp
import argparse

prompt_list = [
    'Please elaborate on modern world history.',
    'Please introduce Harry Potter.',
    'I want to learn Python. Please give me some advice.',
    'Please tell me a joke '
]


async def fetch_event_stream(session, payload, request_id, stream):
    try:
        headers = {
            'accept': 'application/json',
            'Content-Type': 'application/json'
        }

        async with session.post(SERVER_URL, json=payload, headers=headers, timeout=50000) as response:
            print(f"Request {request_id}: Connected, status {response.status}")

            if response.status != 200:
                print(f"Request {request_id}: Error, status {response.status}")
                return

            output_text = ""

            if stream:
                async for line in response.content:
                    try:
                        decoded_line = line.decode("utf-8").strip()
                        if not decoded_line or not decoded_line.startswith("data: "):
                            continue

                        decoded_line = decoded_line[6:].strip()
                        if not decoded_line:
                            continue

                        response_data = json.loads(decoded_line)
                        choices = response_data.get("choices", [])
                        if not choices:
                            continue

                        delta = choices[0].get("delta", {})
                        token = delta.get("content", "")

                        if token:
                            output_text += token
                            sys.stdout.write(token)
                            sys.stdout.flush()

                        finish_reason = choices[0].get("finish_reason", None)
                        if finish_reason:
                            break

                    except json.JSONDecodeError as e:
                        print(f"\nRequest {request_id}: JSON Decode Error - {e}")
                    except IndexError:
                        print(f"\nRequest {request_id}: List Index Error - choices is empty")
                    except Exception as e:
                        print(f"\nRequest {request_id}: Error parsing stream - {e}")
            else:
                # 非 stream 模式下，一次性接收完整 json
                response_data = await response.json()
                choices = response_data.get("choices", [])
                if choices:
                    content = choices[0].get("message", {}).get("content", "")
                    print(f"Request {request_id} Output:\n{content}")
                    output_text += content

    except Exception as e:
        print(f"\nRequest {request_id}: Exception - {e}")

async def main(prompt_id, model, stream, max_tokens, temperature, top_p):
    async with aiohttp.ClientSession() as session:
        payload = {
            "messages": [
                {"role": "system", "content": ""},
                {"role": "user", "content": prompt_list[prompt_id]}
            ],
            "model": model,
            "stream": stream,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p
        }
        tasks = [fetch_event_stream(session, payload, prompt_id, stream)]
        await asyncio.gather(*tasks)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Event Stream Request Tester")
    parser.add_argument("--question_id", type=int, default=0)
    parser.add_argument("--model", type=str, default="DeepSeek-V3")
    parser.add_argument("--stream", type=bool, default=True)  
    parser.add_argument("--max_tokens", type=int, default=500)
    parser.add_argument("--temperature", type=float, default=0.8)
    parser.add_argument("--top_p", type=float, default=1)
    parser.add_argument("--api_url", type=str, default="http://localhost:10002/v1/chat/completions", help="API URL")

    args = parser.parse_args()
    SERVER_URL = args.api_url
    asyncio.run(main(args.question_id, args.model, args.stream, args.max_tokens, args.temperature, args.top_p))


================================================
FILE: archive/ktransformers/tests/test_prefix.py
================================================
import asyncio
import json
import sys
import aiohttp
import random
import argparse
import yaml
import os
import time
from time import sleep

decodesz = 128
# Server URL (replace with your server URL)
decodesz_list = [128]
prefill_speeds = []
decode_speeds = []

async def fetch_message_once(session, request_id, messages, max_tokens, model):
    try:
        payload = {
            "messages": messages,
            "model": model,
            "temperature": 0.3,
            "top_p": 1.0,
            "stream": True,
            "return_speed": True,
            "max_tokens": max_tokens,
        }

        headers = {
            'accept': 'application/json',
            'Content-Type': 'application/json'
        }

        async with session.post(SERVER_URL, json=payload, headers=headers, timeout=500000) as response:
            if response.status != 200:
                print(f"[Request {request_id}] Error: Status {response.status}")
                return None, None, None

            buffer = ""
            usage_info = None
            answer = ""

            async for line in response.content:
                decoded_line = line.decode("utf-8").strip()
                if not decoded_line or not decoded_line.startswith("data: "):
                    continue

                decoded_line = decoded_line[6:].strip()
                if not decoded_line:
                    continue

                response_data = json.loads(decoded_line)

                if "usage" in response_data:
                    usage_info = response_data["usage"]

                choices = response_data.get("choices", [])
                if not choices:
                    continue

                delta = choices[0].get("delta", {})
                token = delta.get("content", "")
                if token:
                    buffer += token
                    answer += token

                finish_reason = choices[0].get("finish_reason", None)
                if finish_reason:
                    break

            return answer.strip(), usage_info, buffer.strip()

    except Exception as e:
        print(f"[Request {request_id}] Exception: {e}")
        return None, None, None


async def multi_turn_conversation(session, request_id, rounds, max_tokens, model):
    prompt = ["介绍一下秦始皇", "秦始皇的成就有哪些", "秦始皇的历史影响", "介绍一下秦始皇的陵墓", "秦始皇的统一措施", "秦始皇的政治制度", "秦始皇的文化政策", "秦始皇的军事行动"]
    
    messages = [{"role": "system", "content": ""}]
    global prefill_speeds, decode_speeds

    for i in range(rounds):
        user_msg = f"这是第{i + 1}轮对话，请回答以下问题：{prompt[i % len(prompt)]}"
        messages.append({"role": "user", "content": user_msg})
        print(f"\n[Request {request_id}] >> User: {user_msg}")

        answer, usage_info, _ = await fetch_message_once(session, request_id, messages, max_tokens, model)
        if answer:
            messages.append({"role": "user", "content": answer})
            print(f"[Request {request_id}] << Assistant: {answer}")

        if usage_info:
            prefill_speed = usage_info["prompt_tokens"] / usage_info["prefill_time"]
            decode_speed = usage_info["completion_tokens"] / usage_info["decode_time"]
            prefill_speeds.append(prefill_speed)
            decode_speeds.append(decode_speed)
            print(f'[Request {request_id}] prefill speed: {prefill_speed}')
            print(f'[Request {request_id}] decode speed: {decode_speed}')


async def main(concurrent_requests, rounds, max_tokens, model):
    async with aiohttp.ClientSession() as session:
        tasks = [multi_turn_conversation(session, i, rounds, max_tokens, model) for i in range(concurrent_requests)]
        await asyncio.gather(*tasks)

    if prefill_speeds:
        import numpy as np
        print(f"\n=== Summary ===")
        print(f"Total concurrency: {concurrent_requests}")
        print(f"Avg prefill speed: {np.mean(prefill_speeds)}")
        print(f"Avg decode speed: {np.mean(decode_speeds)}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Event Stream Request Tester")
    parser.add_argument("--concurrent", type=int, default=1, help="Number of concurrent requests")
    parser.add_argument("--model", type=str, default="DeepSeek-V3", help="Model name")
    parser.add_argument("--prompt_lens", type=int, default=1024, help="prefill prompt lens, 1024 or 2048")
    parser.add_argument("--api_url", type=str, default="http://localhost:10002/v1/chat/completions", help="API URL")
    parser.add_argument("--max_tokens", type=int, default=50, help="max decode tokens")
    parser.add_argument("--rounds", type=int, default=8, help="Number of multi-turn rounds (before final query)")    
    
    args = parser.parse_args()
    SERVER_URL = args.api_url
    max_tokens = args.max_tokens
    model = args.model

    asyncio.run(main(args.concurrent, args.rounds, max_tokens, model))


================================================
FILE: archive/ktransformers/tests/test_pytorch_q8.py
================================================
import torch

# 定义一个包含线性层的浮点模型
class LinearModel(torch.nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = torch.nn.Linear(in_features, out_features)
    
    def forward(self, x):
        return self.linear(x)

# 创建浮点模型实例
in_features = 64
out_features = 128
model_fp32 = LinearModel(in_features, out_features)

# 创建量化模型实例
model_int8 = torch.ao.quantization.quantize_dynamic(
    model_fp32,          # 原始浮点模型
    {torch.nn.Linear},   # 要量化的层类型集合
    dtype=torch.qint8    # 量化的目标数据类型
)

# 测试模型
batch_size = 32
input_fp32 = torch.randn(1, batch_size, in_features)  # 生成随机输入数据
output_int8 = model_int8(input_fp32)               # 通过量化模型运行数据

# 打印输出形状验证
print(f"输入形状: {input_fp32.shape}")
print(f"输出形状: {output_int8.shape}")

# 比较原始模型和量化模型的输出
with torch.no_grad():
    output_fp32 = model_fp32(input_fp32)
    
print(f"FP32输出的前几个值: {output_fp32[0, :5]}")
print(f"INT8输出的前几个值: {output_int8[0, :5]}")

# 计算平均误差
error = torch.abs(output_fp32 - output_int8).mean().item()
print(f"平均绝对误差: {error}")

# 打印模型类型信息
print(f"量化前模型类型: {type(model_fp32.linear)}")
print(f"量化后模型类型: {type(model_int8.linear)}")

================================================
FILE: archive/ktransformers/tests/test_speed.py
================================================
import asyncio
import json
import sys
import aiohttp
import random
import argparse
import yaml
import os
import time
from time import sleep

decodesz = 128
# Server URL (replace with your server URL)
decodesz_list = [128]
prefill_speeds = []
decode_speeds = []
ktansformer_prompt1024="""Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. 
They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.Mr. Dursley was the director of a firm called Grunnings, which made drills. 
He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. 
Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. 
The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.
The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. 
They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. 
Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. 
The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. 
The Dursleys knew that the Potters had a small son, too, but they had never even seen him. 
This boy was another good reason for keeping the Potters away; they didn't want Dudley mixing with a child like that.When Mr. and Mrs. 
Dursley woke up on the dull, gray Tuesday our story starts, there was nothing about the cloudy sky outside to suggest that strange and mysterious things would soon be happening all over the country. 
Mr. Dursley hummed as he picked out his most boring tie for work, and Mrs. Dursley gossiped away happily as she wrestled a screaming Dudley into his high chair.None of them noticed a large, tawny owl flutter past the window.
At half past eight, Mr. Dursley picked up his briefcase, pecked Mrs. Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls.
“Little tyke,” chortled Mr. Dursley as he left the house. He got into his car and backed out of number four's drive.
It was on the corner of the street that he noticed the first sign of something peculiar — a cat reading a map. 
For a second, Mr. Dursley didn't realize what he had seen — then he jerked his head around to look again. 
There was a tabby cat standing on the corner of Privet Drive, but there wasn't a map in sight. 
What could he have been thinking of? It must have been a trick of the light. 
Mr. Dursley blinked and stared at the cat. It stared back. As Mr. Dursley drove around the corner and up the road, he watched the cat in his mirror. 
It was now reading the sign that said Privet Drive — no, looking at the sign; cats couldn't read maps or signs. 
Mr. Dursley gave himself a little shake and put the cat out of his mind. 
As he drove toward town he thought of nothing except a large order of drills he was hoping to get that day.
But on the edge of town, drills were driven out of his mind by something else. 
As he sat in the usual morning traffic jam, he couldn't help noticing that there seemed to be a lot of strangely dressed people about. 
People in cloaks. Mr. Dursley couldn't bear people who dressed in funny clothes — the getups you saw on young people! 
He supposed this was some stupid new fashion. He drummed his fingers on the steering wheel and his eyes fell on a huddle of these weirdos standing quite close by. 
They were whispering excitedly together. Mr. Dursley was enraged to see that a couple of them weren't young at all; why, that man had to be older than he was, and wearing an emerald-green cloak! 
The nerve of him! But then it struck Mr. Dursley that this was probably some silly stunt — these people were obviously collecting for something… yes, that would be it. 
The traffic moved on and a few minutes later, Mr. Dursley arrived in the Grunnings parking lot, his mind back on drills.
Mr. Dursley always sat with his back to the window in his office on the ninth floor."""
async def fetch_event_stream(session, request_id, prompt, max_tokens, model):
    try:
        payload = {
            "messages": [
                {"role": "system", "content": ""},
                {"role": "user", "content": prompt}
            ],
            "model": model,
            "temperature": 0.3,
            "top_p": 1.0,
            "stream": True,
            "return_speed": True,
            "max_tokens": max_tokens,
        }

        headers = {
            'accept': 'application/json',
            'Content-Type': 'application/json'
        }

        async with session.post(SERVER_URL, json=payload, headers=headers, timeout=500000) as response:
            if response.status != 200:
                print(f"[Request {request_id}] Error: Status {response.status}")
                return

            buffer = ""  
            total_tokens = 0
            decode_start_time = None
            decode_end_time = None
            usage_info = None  

            async for line in response.content:
                try:
                    decoded_line = line.decode("utf-8").strip()
                    if not decoded_line or not decoded_line.startswith("data: "):
                        continue

                    decoded_line = decoded_line[6:].strip()
                    if not decoded_line:
                        continue

                    response_data = json.loads(decoded_line)
                    
                    if "usage" in response_data:
                        usage_info = response_data["usage"]
                    
                    choices = response_data.get("choices", [])
                    if not choices:
                        continue

                    delta = choices[0].get("delta", {})
                    token = delta.get("content", "")

                    if token:
                        if decode_start_time is None:
                            decode_start_time = time.time()
                        buffer += token
                        total_tokens += 1
                        decode_end_time = time.time()

                        while "\n" in buffer:
                            line, buffer = buffer.split("\n", 1)
                            print(f"[Request {request_id}] {line}")

                    finish_reason = choices[0].get("finish_reason", None)
                    if finish_reason:
                        break

                except Exception as e:
                    print(f"[Request {request_id}] Stream Error: {e}")

            if buffer.strip():
                print(f"[Request {request_id}] {buffer.strip()}")

            if usage_info:
                if "prefill_time" in usage_info:
                    # print(f"[Request {request_id}] Usage:")
                    # for key, value in usage_info.items():
                    #     print(f"  {key}: {value}")
                    prefill_speed = usage_info["prompt_tokens"] / usage_info["prefill_time"]
                    decode_speed = usage_info["completion_tokens"] / usage_info["decode_time"]
                    prefill_speeds.append(prefill_speed)
                    decode_speeds.append(decode_speed)
                    print(f'[Request {request_id}] prefill speed: {prefill_speed}')
                    print(f'[Request {request_id}] decode speed: {decode_speed}')

    except Exception as e:
        print(f"[Request {request_id}] Exception: {e}")

async def main(concurrent_requests , prompt, max_tokens, model):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_event_stream(session, i , prompt, max_tokens, model) for i in range(concurrent_requests)]
        await asyncio.gather(*tasks)
    if len(prefill_speeds) != 0:
        import numpy as np
        print(f"concurrency: {len(prefill_speeds)}")
        print(f"total prefill speed: {np.sum(prefill_speeds)}\n total decode speed: {np.sum(decode_speeds)}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Event Stream Request Tester")
    parser.add_argument("--concurrent", type=int, default=1, help="Number of concurrent requests")
    parser.add_argument("--model", type=str, default="DeepSeek-V3", help="Model name")
    parser.add_argument("--prompt_lens", type=int, default=1024, help="prefill prompt lens, 1024 or 2048")
    parser.add_argument("--api_url", type=str, default="http://localhost:10002/v1/chat/completions", help="API URL")
    parser.add_argument("--max_tokens", type=int, default=500, help="max decode tokens")
    
    args = parser.parse_args()
    SERVER_URL = args.api_url
    max_tokens = args.max_tokens
    model = args.model
    if args.prompt_lens == 1024:
        prompt = ktansformer_prompt1024
    elif args.prompt_lens == 2048:
        prompt = ktansformer_prompt1024 * 2
    elif args.prompt_lens == 4096:
        prompt = ktansformer_prompt1024 * 4


    asyncio.run(main(args.concurrent, prompt, max_tokens, model))


================================================
FILE: archive/ktransformers/tests/triton_fp8gemm_test.py
================================================
import torch
import torch.nn.functional as F
from typing import Optional
import pytest
from typing import Tuple, Optional, Literal
import time
# use dir path
import os
import sys
sys.path.insert(0, "/home/azure/ktransformers")
print(sys.path)
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
from safetensors import safe_open

world_size = 1
rank = 0
block_size = 128
gemm_impl: Literal["bf16", "fp8"] = "bf16"
# Assuming `fp8_gemm`, `act_quant`, `weight_dequant` and other relevant functions are already defined

def test_fp8_gemm_vs_torch_matmul():
    # Test case 1: Create random matrices of size (M, K) and (K, N)
    M, K, N = 64, 128, 256  # Matrix dimensions
    x = torch.randn(M, K, dtype=torch.bfloat16, device='cuda')
    weight = torch.randn(N, K, dtype=torch.bfloat16, device='cuda')

    # Apply act_quant to both matrices
    x_quantized, scale_x = act_quant(x, block_size)
    weight_quantized, scale_w = act_quant(weight, block_size)
    
    # mk continous
    x_quantized = x_quantized.contiguous()
    weight_quantized = weight_quantized.contiguous()
    scale_x = scale_x.contiguous()
    scale_w = scale_w.contiguous()

    # Perform fp8_gemm using the quantized tensors
    result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight_quantized, scale_w)

    # Perform torch.matmul using the original floating point tensors
    result_torch_matmul = torch.matmul(x, weight.T)
    print(f'result_torch_matmul: {result_torch_matmul.shape}')
    print(f'result_fp8_gemm: {result_fp8_gemm.shape}')

    print(f"result_fp8_gemm:\n {result_fp8_gemm}")
    print(f"result_torch_matmul:\n {result_torch_matmul}")
    
def test_fp8_gemm_vs_torch_matmul_load():
    file_path = "/mnt/data/model/DeepSeek-V3/model-00001-of-000163.safetensors"
    with safe_open(file_path, framework="pt", device=0) as f:
        weight = f.get_tensor("model.layers.0.mlp.down_proj.weight")
        scale = f.get_tensor("model.layers.0.mlp.down_proj.weight_scale_inv")

    # weight_dequant
    weight_dequantized = weight_dequant(weight, scale)
    print(f"weight_dequantized: {weight_dequantized.shape}")
    N, K = weight_dequantized.shape
    M = 64
    x = torch.randn(2 ,M, K, dtype=torch.bfloat16, device='cuda')
    x_quantized, scale_x = act_quant(x, block_size)
    
    # Test case 1: quantized x matmal with undequantized weight
    result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
    print(f"result_fp8_gemm:\n {result_fp8_gemm}")
    print(f"dtype {result_fp8_gemm.dtype}")

    # Perform torch.matmul using the original floating point tensors
    result_torch_matmul = torch.matmul(x, weight_dequantized.to(torch.bfloat16).T)
    print(f"result_torch_matmul:\n {result_torch_matmul}")

def test_fp8_gemm_tplops():
    file_path = "/mnt/data/model/DeepSeek-V3/model-00001-of-000163.safetensors"
    with safe_open(file_path, framework="pt", device=0) as f:
        weight = f.get_tensor("model.layers.0.mlp.down_proj.weight")
        scale = f.get_tensor("model.layers.0.mlp.down_proj.weight_scale_inv")

    # weight_dequant
    weight_dequantized = weight_dequant(weight, scale)
    print(f"weight_dequantized: {weight_dequantized.shape}")
    N, K = weight_dequantized.shape
    M = 6400
    x = torch.randn(2 ,M, K, dtype=torch.bfloat16, device='cuda')
    # x_quantized, scale_x = act_quant(x, block_size)
    
    # Calculate time for 1000 fp8_gemm
    i = 10
    flops_per_gemm = 2 * M * N * K
    total_flops = i * flops_per_gemm
    
    x_quantized, scale_x = act_quant(x, block_size)
    result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
    x_quantized, scale_x = act_quant(x, block_size)
    result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)

    
    t0 = time.time()
    torch.cuda.synchronize()
    for i in range(i):
        x_quantized, scale_x = act_quant(x, block_size)
        result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
    torch.cuda.synchronize()
    t1 = time.time()
    
    total_time = t1 - t0
    tflops = total_flops / total_time / 1e12
    print(f"total_time: {total_time}")
    print(f"tflops: {tflops}")
    

if __name__ == "__main__":
    test_fp8_gemm_vs_torch_matmul()
    test_fp8_gemm_vs_torch_matmul_load()
    test_fp8_gemm_tplops()
    

================================================
FILE: archive/ktransformers/util/ascend/ascend_utils.py
================================================
# coding=utf-8
# Copyright (c) 2025. Huawei Technologies Co., Ltd. All rights reserved.
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from datetime import timedelta

import torch
import torch_npu
import torch.distributed as dist

_DATA_PARALLEL_SIZE = 0
_TENSOR_PARALLEL_SIZE = 0
_DATA_PARALLEL_GROUP = None
_TENSOR_PARALLEL_RANKS = None
_TENSOR_PARALLEL_GROUP = None
_DATA_PARALLEL_GROUP_GLOO = None
_DATA_PARALLEL_RANKS = None


def setup_model_parallel(distributed_timeout_minutes: int = 30, tp: int = 1):
    global _DATA_PARALLEL_SIZE
    global _DATA_PARALLEL_GROUP
    global _DATA_PARALLEL_RANKS
    global _TENSOR_PARALLEL_SIZE
    global _TENSOR_PARALLEL_RANKS
    global _TENSOR_PARALLEL_GROUP

    # os.environ["MASTER_ADDR"] = "localhost"
    # os.environ["MASTER_PORT"] = "12345"
    local_rank = int(os.getenv("LOCAL_RANK", '0'))
    world_size = int(os.getenv("WORLD_SIZE", '1'))
    torch_npu.npu.set_device(local_rank)
    tp_size = tp
    dp_size = world_size // tp_size
    _DATA_PARALLEL_SIZE = dp_size
    _TENSOR_PARALLEL_SIZE = tp_size

    torch.set_num_threads(8)
    timeout = timedelta(minutes=distributed_timeout_minutes)
    print(f"start to init process group ------rank is {local_rank}, world_size is {world_size}")
    torch.distributed.init_process_group(
        backend='hccl',
        world_size=world_size, rank=local_rank
    )
    print(f"init process group success ------rank is {local_rank}, world_size is {world_size}")

    rank = torch.distributed.get_rank()
    nccl_comm_cfgs = {}
    # DP 组由每隔 tp_size 的进程组成
    for dp_group_id in range(tp_size):
        ranks = list(range(dp_group_id, world_size, tp_size))
        dp_group = torch.distributed.new_group(
            ranks, timeout=timeout, pg_options=get_nccl_options('dp', nccl_comm_cfgs)
        )
        if rank in ranks:
            global _DATA_PARALLEL_GROUP
            _DATA_PARALLEL_GROUP = dp_group
            _DATA_PARALLEL_RANKS = ranks

    # TP 组由连续的 dp_size 个进程组成
    for tp_group_id in range(dp_size):
        start_rank = tp_group_id * tp_size
        end_rank = (tp_group_id + 1) * tp_size
        ranks = list(range(start_rank, end_rank))
        tp_group = torch.distributed.new_group(
            ranks, timeout=timeout, pg_options=get_nccl_options('tp', nccl_comm_cfgs)
        )
        if rank in ranks:
            global _TENSOR_PARALLEL_GROUP
            _TENSOR_PARALLEL_GROUP = tp_group
            _TENSOR_PARALLEL_RANKS = ranks
    # seed must be the same in all processes
    torch.manual_seed(1)
    return local_rank, world_size


def get_tensor_parallel_size():
    assert _TENSOR_PARALLEL_SIZE is not None, "tensor parallel size is not set"
    return _TENSOR_PARALLEL_SIZE


def get_tensor_parallel_group():
    assert _TENSOR_PARALLEL_GROUP is not None, "tensor parallel group is not initialized"
    return _TENSOR_PARALLEL_GROUP


def get_tensor_parallel_rank():
    assert _TENSOR_PARALLEL_RANKS is not None, "tensor parallel rank is not initialized"
    return _TENSOR_PARALLEL_RANKS


def get_data_parallel_size():
    assert _DATA_PARALLEL_SIZE is not None, "data parallel size is not initialized"
    return _DATA_PARALLEL_SIZE


def get_data_parallel_gloo():
    assert _DATA_PARALLEL_GROUP_GLOO is not None, "data parallel gloo group is not initialized"
    return _DATA_PARALLEL_GROUP_GLOO


def get_data_parallel_group():
    assert _DATA_PARALLEL_GROUP is not None, "data parallel group is not initialized"
    return _DATA_PARALLEL_GROUP


def get_data_parallel_rank():
    assert _DATA_PARALLEL_RANKS is not None, "data parallel rank is not initialized"
    return _DATA_PARALLEL_RANKS


def get_nccl_options(pg_name, nccl_comm_cfgs):
    if pg_name in nccl_comm_cfgs:
        nccl_options = torch.distributed.ProcessGroupNCCL.Options()
        nccl_options.config.cga_cluster_size = nccl_comm_cfgs[pg_name].get('cga_cluster_size', 4)
        nccl_options.config.max_ctas = nccl_comm_cfgs[pg_name].get('max_ctas', 32)
        nccl_options.config.min_ctas = nccl_comm_cfgs[pg_name].get('min_ctas', 1)
        return nccl_options
    else:
        return None


def get_safetensors_cut_weight(name: str, weights: torch.Tensor):
    translate_col_cut_tensors = ["ffn_down", "attn_output"]  # "kv_b_proj"
    translate_row_cut_tensors = ["ffn_gate", "ffn_up", "attn_q_b"]
    tp = get_tensor_parallel_size()
    if tp == 1 or weights.shape == torch.Size([1]):
        return weights
    rank = torch.distributed.get_rank()
    rank %= tp
    assert 0 <= rank < tp and tp > 0, f"unexpected {rank=}, {tp=}"
    if any(t in name for t in translate_col_cut_tensors):
        if weights.dim() == 1:
            return weights
        dim = weights.shape[-1]
        assert dim % tp == 0, f"unexpected division {dim=}, {tp=}"
        chunk_size = dim // tp
        output_weights = weights[:, rank * chunk_size:(rank + 1) * chunk_size]
        # print(f"col cut weights {name=} from {weights.shape=} to {output_weights.shape=}")
        return output_weights
    elif any(t in name for t in translate_row_cut_tensors):
        dim = weights.shape[0]
        assert dim % tp == 0, f"unexpected division {dim=}, {tp=}"
        chunk_size = dim // tp
        output_weights = weights[rank * chunk_size: (rank + 1) * chunk_size:]
        # print(f"row cut weights {name=} from {weights.shape=} to {output_weights.shape=}")
        return output_weights
    else:
        return weights


def get_absort_weight(model, config):
    if not dist.is_initialized():
        return
    local_rank = dist.get_rank()
    tp = get_tensor_parallel_size()
    local_rank %= tp
    tp_heads = config.num_attention_heads // tp
    for i in range(config.num_hidden_layers):
        attn = model.model.layers[i].self_attn
        if hasattr(attn, "q_absorb") and hasattr(attn, "out_absorb"):
            continue
        if not (hasattr(attn, "kv_b_proj")
                and hasattr(attn, "kv_lora_rank")
                and hasattr(attn, "qk_nope_head_dim")):
            continue

        kv_b_proj = attn.kv_b_proj.weight.view(config.num_attention_heads, -1, attn.kv_lora_rank)
        q_absorb = kv_b_proj[:, :attn.qk_nope_head_dim, :].clone()
        out_absorb = kv_b_proj[:, attn.qk_nope_head_dim:, :].clone()

        q_absorb = q_absorb[local_rank * tp_heads: (local_rank + 1) * tp_heads, :, :].contiguous()
        out_absorb = out_absorb[local_rank * tp_heads: (local_rank + 1) * tp_heads, :, :].contiguous()
        out_absorb = out_absorb.transpose(1, 2).contiguous()

        setattr(attn, "q_absorb", q_absorb)
        setattr(attn, "out_absorb", out_absorb)

        if hasattr(attn, "orig_module") and hasattr(attn.orig_module, "kv_b_proj"):
            del attn.orig_module.kv_b_proj
    dist.barrier(get_tensor_parallel_group())


def allredeuce_warpper(func):
    def wrapper(*args, **kwargs):
        orig_output = func(*args, **kwargs)
        if isinstance(orig_output, tuple):
            if get_tensor_parallel_size() > 1:
                org_dtype = orig_output[0].dtype
                if org_dtype == torch.bfloat16:
                    dist.all_reduce(orig_output[0].to(dtype=torch.float16), op=dist.ReduceOp.SUM,
                                    group=get_tensor_parallel_group())
                else:
                    dist.all_reduce(orig_output[0], op=dist.ReduceOp.SUM, group=get_tensor_parallel_group())
                if org_dtype == torch.bfloat16:
                    bf_orig_output = orig_output[0].to(dtype=org_dtype)
                else:
                    bf_orig_output = orig_output[0]
            else:
                bf_orig_output = orig_output[0]
            return (bf_orig_output,) + orig_output[1:]
        else:
            if get_tensor_parallel_size() > 1:
                org_dtype = orig_output.dtype
                if org_dtype == torch.bfloat16:
                    orig_output = orig_output.to(dtype=torch.float16)
                dist.all_reduce(orig_output, op=dist.ReduceOp.SUM, group=get_tensor_parallel_group())
                if org_dtype == torch.bfloat16:
                    orig_output = orig_output.to(dtype=org_dtype)
            return orig_output

    return wrapper

================================================
FILE: archive/ktransformers/util/cuda_graph_runner.py
================================================
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import torch
from typing import Dict

class CUDAGraphRunner:

    def __init__(self):
        self.graph = None
        self.input_buffers: Dict[str, torch.Tensor] = {}
        self.output_buffers: Dict[str, torch.Tensor] = {}

    def capture(
        self,
        model,
        cur_token,
        position_ids,
        cache_position,
        past_key_values,
        main_device,
        **kwargs,
    ) -> None:
        assert self.graph is None
        # Capture the graph.
        torch.cuda.synchronize()
        self.graph = torch.cuda.CUDAGraph()
        #self.graph.enable_debug_mode()
        self.model = model
        inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to(main_device)
        # torch.cuda.set_device can't set "cuda", must have a index
        if main_device == "cuda":
            main_device = "cuda:0"
        torch.cuda.set_device(main_device)
        self.main_device = main_device
        capture_stream = torch.cuda.Stream()
        with torch.cuda.graph(self.graph, stream = capture_stream):
            logits=model(inputs_embeds=inputs_embeds, 
                         position_ids=position_ids,
                         cache_position=cache_position,
                         past_key_values=past_key_values,
                         **kwargs)[0]
            capture_stream.wait_stream(torch.cuda.current_stream())
            torch.cuda.set_device(main_device)
            torch.cuda.set_stream(capture_stream)
        if past_key_values != None:    
            past_key_values.change_seq_length(-1)
        torch.cuda.synchronize(self.main_device)
        #self.graph.debug_dump("cuda_graph_hooked.dot")

        # Save the input and output buffers.
        self.input_buffers = {
            "inputs_embeds": inputs_embeds,
            "position_ids": position_ids,
            "cache_position": cache_position,
        }
        self.output_buffers = {"logits": logits}
        return

    def forward(
        self,
        cur_token,
        position_ids,
        cache_position,
    ) -> torch.Tensor:
        # Copy the input tensors to the input buffers.
        inputs_embeds = self.model.model.embed_tokens(cur_token.to("cpu"))
        self.input_buffers["inputs_embeds"].copy_(inputs_embeds)
        self.input_buffers["position_ids"].copy_(position_ids)
        self.input_buffers["cache_position"].copy_(cache_position)

        # Run the graph.
        #print("begin replay")
        #time.sleep(1)
        self.graph.replay()
        torch.cuda.synchronize(self.main_device)
        # Return the output tensor.
        return self.output_buffers["logits"]

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)


================================================
FILE: archive/ktransformers/util/custom_gguf.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : Azure-Tang, Boxin Zhang, chenht2022
Date         : 2024-07-26 08:48:54
Version      : 1.0.0
LastEditors  : kkk1nak0
LastEditTime : 2024-08-14 08:20:45
Adapted from https://github.com/99991/pygguf/blob/main/gguf.py
Copyright (c) 2023-2024 The ggml authors
Copyright (c) 2024 Thomas Germer
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
# copied from llama.cpp/gguf-py/gguf/constants.py to satisfy dependence of gguf
# GGUF specification
# https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
import struct
import warnings
import numpy as np
import re
import numpy.typing as npt
from typing import Sequence
import os
from enum import IntEnum
import torch

try:
    import torch_npu
    use_torch_npu = torch_npu.npu.is_available()
except:
    use_torch_npu = False

if not torch.xpu.is_available() and not use_torch_npu:
    import KTransformersOps

import ctypes
import math

class GGMLQuantizationType(IntEnum):
    F32     = 0
    F16     = 1
    Q4_0    = 2
    Q4_1    = 3
    Q5_0    = 6
    Q5_1    = 7
    Q8_0    = 8
    Q8_1    = 9
    Q2_K    = 10
    Q3_K    = 11
    Q4_K    = 12
    Q5_K    = 13
    Q6_K    = 14
    Q8_K    = 15
    IQ2_XXS = 16
    IQ2_XS  = 17
    IQ3_XXS = 18
    IQ1_S   = 19
    IQ4_NL  = 20
    IQ3_S   = 21
    IQ2_S   = 22
    IQ4_XS  = 23
    I8      = 24
    I16     = 25
    I32     = 26
    I64     = 27
    F64     = 28
    IQ1_M   = 29
    BF16    = 30

QK_K = 256
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
    GGMLQuantizationType.F32:     (1, 4),
    GGMLQuantizationType.F16:     (1, 2),
    GGMLQuantizationType.Q4_0:    (32, 2 + 16),
    GGMLQuantizationType.Q4_1:    (32, 2 + 2 + 16),
    GGMLQuantizationType.Q5_0:    (32, 2 + 4 + 16),
    GGMLQuantizationType.Q5_1:    (32, 2 + 2 + 4 + 16),
    GGMLQuantizationType.Q8_0:    (32, 2 + 32),
    GGMLQuantizationType.Q8_1:    (32, 4 + 4 + 32),
    GGMLQuantizationType.Q2_K:    (256, 2 + 2 + QK_K // 16 + QK_K // 4),
    GGMLQuantizationType.Q3_K:    (256, 2 + QK_K // 4 + QK_K // 8 + 12),
    GGMLQuantizationType.Q4_K:    (256, 2 + 2 + QK_K // 2 + 12),
    GGMLQuantizationType.Q5_K:    (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
    GGMLQuantizationType.Q6_K:    (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
    GGMLQuantizationType.Q8_K:    (256, 4 + QK_K + QK_K // 8),
    GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4),
    GGMLQuantizationType.IQ2_XS:  (256, 2 + QK_K // 4 + QK_K // 32),
    GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8),
    GGMLQuantizationType.IQ1_S:   (256, 2 + QK_K // 8 + QK_K // 16),
    GGMLQuantizationType.IQ4_NL:  (32, 2 + 16),
    GGMLQuantizationType.IQ3_S:   (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
    GGMLQuantizationType.IQ2_S:   (256, 2 + QK_K // 4 + QK_K // 16),
    GGMLQuantizationType.IQ4_XS:  (256, 2 + 2 + QK_K // 2 + QK_K // 64),
    GGMLQuantizationType.I8:      (1, 1),
    GGMLQuantizationType.I16:     (1, 2),
    GGMLQuantizationType.I32:     (1, 4),
    GGMLQuantizationType.I64:     (1, 8),
    GGMLQuantizationType.F64:     (1, 8),
    GGMLQuantizationType.IQ1_M:   (256, QK_K // 8 + QK_K // 16  + QK_K // 32),
    GGMLQuantizationType.BF16:    (1, 2),
}

# copied from llama.cpp/gguf-py/gguf/quants.py to avoid dependence of gguf
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
    block_size, type_size = GGML_QUANT_SIZES[quant_type]
    if shape[-1] % block_size != 0:
        raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
    return (*shape[:-1], shape[-1] // block_size * type_size)

GGML_TYPES = {
    "F32": 0,
    "F16": 1,
    "Q4_0": 2,
    "Q5_0": 6,
    "Q8_0": 8,
    "Q2_K": 10,
    "Q3_K": 11,
    "Q4_K": 12,
    "Q5_K": 13,
    "Q6_K": 14,
    "IQ4_XS": 23,
    "BF16": 30,
}

GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}

GGML_BLOCK_SIZES = {
    "F32": 4,
    "F16": 2,
    "BF16": 2,
    "Q4_0": 2 + 16,
    "Q5_0": 2 + 4 + 16,
    "Q8_0": 2 + 32,
    "Q2_K": 256 // 16 + 256 // 4 + 2 + 2,
    "Q3_K": 256 // 8 + 256 // 4 + 12 + 2,
    "Q4_K": 2 + 2 + 12 + 256 // 2,
    "Q5_K": 2 + 2 + 12 + 256 // 8 + 256 // 2,
    "Q6_K": 256 // 2 + 256 // 4 + 256 // 16 + 2,
    "IQ4_XS": 2 + 2 + 256 // 2 + 256 // 64,
    "FP8": 1,
}

GGML_ELEMENTS_PER_BLOCK = {
    "F32": 1,
    "F16": 1,
    "BF16": 1,
    "Q4_0": 32,
    "Q5_0": 32,
    "Q8_0": 32,
    "Q2_K": 256,
    "Q3_K": 256,
    "Q4_K": 256,
    "Q5_K": 256,
    "Q6_K": 256,
    "IQ4_XS": 256,
    "FP8": 1,
}

DATA_TYPES = {
    "uint8": 0,
    "int8": 1,
    "uint16": 2,
    "int16": 3,
    "uint32": 4,
    "int32": 5,
    "float32": 6,
    "bool": 7,
    "string": 8,
    "array": 9,
    "uint64": 10,
    "int64": 11,
    "float64": 12,
    "FP8": 13,
}

def read_value(f, data_type):
    if data_type == DATA_TYPES["string"]:
        length = struct.unpack("<Q", f.read(8))[0]
        return f.read(length).decode("utf-8")

    elif data_type == DATA_TYPES["bool"]:
        return bool(struct.unpack("<?", f.read(1))[0])

    elif data_type == DATA_TYPES["uint8"]:
        return struct.unpack("<B", f.read(1))[0]

    elif data_type == DATA_TYPES["int8"]:
        return struct.unpack("<b", f.read(1))[0]

    elif data_type == DATA_TYPES["uint16"]:
        return struct.unpack("<H", f.read(2))[0]

    elif data_type == DATA_TYPES["int16"]:
        return struct.unpack("<h", f.read(2))[0]

    elif data_type == DATA_TYPES["uint32"]:
        return struct.unpack("<I", f.read(4))[0]

    elif data_type == DATA_TYPES["int32"]:
        return struct.unpack("<i", f.read(4))[0]

    elif data_type == DATA_TYPES["float32"]:
        return struct.unpack("<f", f.read(4))[0]

    elif data_type == DATA_TYPES["uint64"]:
        return struct.unpack("<Q", f.read(8))[0]

    elif data_type == DATA_TYPES["int64"]:
        return struct.unpack("<q", f.read(8))[0]

    elif data_type == DATA_TYPES["float64"]:
        return struct.unpack("<d", f.read(8))[0]

    elif data_type == DATA_TYPES["array"]:
        elem_type, count = struct.unpack("<IQ", f.read(4 + 8))
        return [read_value(f, elem_type) for _ in range(count)]

    elif data_type == DATA_TYPES["FP8"]:
        return struct.unpack("<B", f.read(1))[0]

    else:
        raise NotImplementedError(f"Data type {data_type} not implemented")

def dequantize_q2_k(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1547
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L74
    block_size = GGML_BLOCK_SIZES["Q2_K"]
    num_blocks = len(data) // block_size

    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)

    dmin = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32)
    d = data_f16[:, -2].reshape(num_blocks, 1, 1).astype(np.float32)
    scales = data_u8[:, :16].reshape(num_blocks, 16, 1)
    qs = data_u8[:, 16:80].reshape(num_blocks, 64)

    tmp = np.stack([
        qs[:, 00:16] >> 0,
        qs[:, 16:32] >> 0,
        qs[:, 00:16] >> 2,
        qs[:, 16:32] >> 2,
        qs[:, 00:16] >> 4,
        qs[:, 16:32] >> 4,
        qs[:, 00:16] >> 6,
        qs[:, 16:32] >> 6,
        qs[:, 32:48] >> 0,
        qs[:, 48:64] >> 0,
        qs[:, 32:48] >> 2,
        qs[:, 48:64] >> 2,
        qs[:, 32:48] >> 4,
        qs[:, 48:64] >> 4,
        qs[:, 32:48] >> 6,
        qs[:, 48:64] >> 6,
    ], axis=1)

    return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4)

def dequantize_q2_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q2_K"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q2_K"]
    data = np.frombuffer(data, dtype=data.dtype)
    device = torch.device(device)
    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, 
    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_q2_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

def dequantize_q3_k(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1723C32-L1723C42
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L95
    block_size = GGML_BLOCK_SIZES["Q3_K"]
    num_blocks = len(data) // block_size

    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)

    d = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32)
    bits = np.unpackbits(data_u8[:, :32].reshape(num_blocks, 32, 1), axis=-1, bitorder="little")
    bits = 4 ^ (bits << 2)
    qs = data_u8[:, 32:32 + 64].astype(np.int16)
    a, b, c = data_u8[:, 96: 96 + 12].reshape(num_blocks, 3, 4).transpose(1, 0, 2)
    scales = np.zeros((num_blocks, 4, 4), dtype=np.uint8)
    scales[:, 0] = (a & 15) | ((c & 3) << 4)
    scales[:, 1] = (b & 15) | (((c >> 2) & 3) << 4)
    scales[:, 2] = (a >> 4) | (((c >> 4) & 3) << 4)
    scales[:, 3] = (b >> 4) | ((c >> 6) << 4)
    scales = scales.reshape(num_blocks, 16, 1).astype(np.int16)

    return d * (scales - 32) * np.stack([
        (((qs[:, 00:16] >> 0) & 3) - bits[:, :16, 0]),
        (((qs[:, 16:32] >> 0) & 3) - bits[:, 16:, 0]),
        (((qs[:, 00:16] >> 2) & 3) - bits[:, :16, 1]),
        (((qs[:, 16:32] >> 2) & 3) - bits[:, 16:, 1]),
        (((qs[:, 00:16] >> 4) & 3) - bits[:, :16, 2]),
        (((qs[:, 16:32] >> 4) & 3) - bits[:, 16:, 2]),
        (((qs[:, 00:16] >> 6) & 3) - bits[:, :16, 3]),
        (((qs[:, 16:32] >> 6) & 3) - bits[:, 16:, 3]),
        (((qs[:, 32:48] >> 0) & 3) - bits[:, :16, 4]),
        (((qs[:, 48:64] >> 0) & 3) - bits[:, 16:, 4]),
        (((qs[:, 32:48] >> 2) & 3) - bits[:, :16, 5]),
        (((qs[:, 48:64] >> 2) & 3) - bits[:, 16:, 5]),
        (((qs[:, 32:48] >> 4) & 3) - bits[:, :16, 6]),
        (((qs[:, 48:64] >> 4) & 3) - bits[:, 16:, 6]),
        (((qs[:, 32:48] >> 6) & 3) - bits[:, :16, 7]),
        (((qs[:, 48:64] >> 6) & 3) - bits[:, 16:, 7])
    ], axis=1)

def dequantize_q3_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q3_K"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q3_K"]
    data = np.frombuffer(data, dtype=data.dtype)
    device = torch.device(device)
    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, 
    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_q3_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

def dequantize_q4_k(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1929
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
    block_size = GGML_BLOCK_SIZES["Q4_K"]
    num_blocks = len(data) // block_size
    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
    # Casting to float32 because float16 is very slow on CPU
    scale_factors = data_f16[:, 0].reshape(num_blocks, 1, 1).astype(np.float32)
    scale_offsets = data_f16[:, 1].reshape(num_blocks, 1, 1).astype(np.float32)
    qs1 = data_u8[:, 4:16].reshape(num_blocks, 12, 1)
    qs2 = data_u8[:, 16:].reshape(num_blocks, 4, 32)
    # Dequantize scales and offsets (6 bits and 4 + 2 bits)
    factors = scale_factors * np.concatenate([qs1[:, 0:4] & 0b111111, (qs1[:, 8:] & 15) | ((qs1[:, 0:4] >> 6) << 4)], axis=1)
    offsets = scale_offsets * np.concatenate([qs1[:, 4:8] & 0b111111, (qs1[:, 8:] >> 4) | ((qs1[:, 4:8] >> 6) << 4)], axis=1)
    # Interleave low and high quantized bits
    qs2 = np.stack([qs2 & 0xf, qs2 >> 4], axis=2).reshape(num_blocks, 8, 32)
    # Dequantize final weights using scales and offsets
    return factors * qs2 - offsets

def dequantize_q4_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q4_K"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q4_K"]
    data = np.frombuffer(data, dtype=data.dtype)
    device = torch.device(device)
    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, 
    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_q4_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

def dequantize_q5_k(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2129
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L138
    block_size = GGML_BLOCK_SIZES["Q5_K"]
    num_blocks = len(data) // block_size

    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)

    d = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32)
    dmin = data_f16[:, 1].reshape(num_blocks, 1).astype(np.float32)
    scales = data_u8[:, 4:16].reshape(num_blocks, 12, 1)
    qh = data_u8[:, 16: 16 + 32].reshape(num_blocks, 32, 1)
    qs = data_u8[:, 48: 48 + 128].reshape(num_blocks, 4, 32)

    bits = np.unpackbits(qh, axis=-1, bitorder="little")

    qs_hi_4 = qs >> 4
    qs_lo_4 = qs & 15

    scales_lo_6 = scales[:, :8] & 63
    scales_hi_6 = scales[:, :8] >> 6
    scales_lo_4 = scales[:, 8:] & 15
    scales_hi_4 = scales[:, 8:] >> 4

    m1 = dmin * scales_lo_6[:, 4]
    m2 = dmin * scales_lo_6[:, 5]
    m3 = dmin * scales_lo_6[:, 6]
    m4 = dmin * scales_lo_6[:, 7]
    m5 = dmin * (scales_hi_4[:, 0] | (scales_hi_6[:, 4] << 4))
    m6 = dmin * (scales_hi_4[:, 1] | (scales_hi_6[:, 5] << 4))
    m7 = dmin * (scales_hi_4[:, 2] | (scales_hi_6[:, 6] << 4))
    m8 = dmin * (scales_hi_4[:, 3] | (scales_hi_6[:, 7] << 4))

    d1 = d * scales_lo_6[:, 0]
    d2 = d * scales_lo_6[:, 1]
    d3 = d * scales_lo_6[:, 2]
    d4 = d * scales_lo_6[:, 3]
    d5 = d * (scales_lo_4[:, 0] | (scales_hi_6[:, 0] << 4))
    d6 = d * (scales_lo_4[:, 1] | (scales_hi_6[:, 1] << 4))
    d7 = d * (scales_lo_4[:, 2] | (scales_hi_6[:, 2] << 4))
    d8 = d * (scales_lo_4[:, 3] | (scales_hi_6[:, 3] << 4))

    return np.concatenate([
        d1 * (qs_lo_4[:, 0] + (bits[:, :, 0] << 4)) - m1,
        d2 * (qs_hi_4[:, 0] + (bits[:, :, 1] << 4)) - m2,
        d3 * (qs_lo_4[:, 1] + (bits[:, :, 2] << 4)) - m3,
        d4 * (qs_hi_4[:, 1] + (bits[:, :, 3] << 4)) - m4,
        d5 * (qs_lo_4[:, 2] + (bits[:, :, 4] << 4)) - m5,
        d6 * (qs_hi_4[:, 2] + (bits[:, :, 5] << 4)) - m6,
        d7 * (qs_lo_4[:, 3] + (bits[:, :, 6] << 4)) - m7,
        d8 * (qs_hi_4[:, 3] + (bits[:, :, 7] << 4)) - m8,
    ], axis=1)

def dequantize_q5_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q5_K"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q5_K"]
    data = np.frombuffer(data, dtype=data.dtype)
    device = torch.device(device)
    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, 
    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_q5_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

def dequantize_q6_k(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2275
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L152
    block_size = GGML_BLOCK_SIZES["Q6_K"]
    num_blocks = len(data) // block_size

    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
    data_i8 = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, block_size)

    scales = data_f16[:, -1].reshape(num_blocks, 1).astype(np.float32)
    # TODO use uint8 and cast later?
    ql = data_u8[:, :128].astype(np.int16)
    qh = data_u8[:, 128:192].astype(np.int16)
    sc = data_i8[:, 192:208, np.newaxis].astype(np.float32)

    # Unpack bits, subtraction requires signed data type
    q1 = (ql[:,   :32 ] & 0xF) | (((qh[:, :32] >> 0) & 3) << 4) - 32
    q2 = (ql[:, 32:64 ] & 0xF) | (((qh[:, :32] >> 2) & 3) << 4) - 32
    q3 = (ql[:,   :32 ] >>  4) | (((qh[:, :32] >> 4) & 3) << 4) - 32
    q4 = (ql[:, 32:64 ] >>  4) | (((qh[:, :32] >> 6) & 3) << 4) - 32
    q5 = (ql[:, 64:96 ] & 0xF) | (((qh[:, 32:] >> 0) & 3) << 4) - 32
    q6 = (ql[:, 96:128] & 0xF) | (((qh[:, 32:] >> 2) & 3) << 4) - 32
    q7 = (ql[:, 64:96 ] >>  4) | (((qh[:, 32:] >> 4) & 3) << 4) - 32
    q8 = (ql[:, 96:128] >>  4) | (((qh[:, 32:] >> 6) & 3) << 4) - 32

    # Dequantize
    return scales * np.concatenate([
        sc[:,  0] * q1[:, :16],
        sc[:,  1] * q1[:, 16:],
        sc[:,  2] * q2[:, :16],
        sc[:,  3] * q2[:, 16:],
        sc[:,  4] * q3[:, :16],
        sc[:,  5] * q3[:, 16:],
        sc[:,  6] * q4[:, :16],
        sc[:,  7] * q4[:, 16:],
        sc[:,  8] * q5[:, :16],
        sc[:,  9] * q5[:, 16:],
        sc[:, 10] * q6[:, :16],
        sc[:, 11] * q6[:, 16:],
        sc[:, 12] * q7[:, :16],
        sc[:, 13] * q7[:, 16:],
        sc[:, 14] * q8[:, :16],
        sc[:, 15] * q8[:, 16:],
    ], axis=1) 

# @torch.jit.script
def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q6_K"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q6_K"]
    device = torch.device(device)
    num_blocks = len(data) // block_size
    data = np.frombuffer(data, dtype=data.dtype)
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_q6_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

kvalues_iq4nl = np.array([-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113], dtype=np.int8)

def dequantize_iq4_xs(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/21d3a308fcb7f31cb9beceaeebad4fb622f3c337/src/ggml-quants.c#L3568
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/21d3a308fcb7f31cb9beceaeebad4fb622f3c337/src/ggml-common.h#L393
    block_size = GGML_BLOCK_SIZES["IQ4_XS"]
    num_blocks = len(data) // block_size

    d = np.frombuffer(data, dtype=np.float16)[0::block_size//2].astype(np.float32).reshape(num_blocks, 1)
    scales_h = np.frombuffer(data, dtype=np.uint16)[1::block_size//2].reshape(num_blocks, 1)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)[:, 4:]
    scales_l = data_u8[:, :4].reshape(num_blocks, 4)
    qs = data_u8[:, 4:].reshape(num_blocks, block_size - 8)

    ls = np.zeros((num_blocks, QK_K // 32), dtype=np.int8)
    for ib in range(QK_K // 32):
        ls[:, ib] = ((scales_l[:, ib // 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h[:, 0] >> 2 * ib) & 3) << 4)

    dl = (d * (ls - 32)).reshape(num_blocks, -1, 1)

    qs_lo_4 = qs[:, :QK_K // 2].reshape(num_blocks, -1, 16) & 0xf
    qs_hi_4 = qs[:, :QK_K // 2].reshape(num_blocks, -1, 16) >> 4

    y = np.zeros((num_blocks, QK_K), dtype=np.float32)
    for ib in range(QK_K // 32):
        y[:, ib*32:(ib*32)+16] = dl[:, ib] * kvalues_iq4nl[qs_lo_4[:, ib]]
        y[:, (ib*32)+16:(ib*32)+32] = dl[:, ib] * kvalues_iq4nl[qs_hi_4[:, ib]]

    return y.flatten()

def dequantize_iq4_xs_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["IQ4_XS"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["IQ4_XS"]
    device = torch.device(device)
    num_blocks = len(data) // block_size
    data = np.frombuffer(data, dtype=data.dtype)
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_iq4_xs(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

def dequantize_q4_0(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1515
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L141
    num_blocks = len(data) // GGML_BLOCK_SIZES["Q4_0"]

    scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 8)[:, :1].astype(np.float32)
    qs = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 16)[:, 2:]

    return np.concatenate([
        scales * ((qs & 0xf).astype(np.int8) - 8),
        scales * ((qs >> 4).astype(np.int8) - 8),
    ], axis=1)

def dequantize_q4_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    raise NotImplementedError()

def dequantize_q5_0(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1556
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L161
    num_blocks = len(data) // GGML_BLOCK_SIZES["Q5_0"]

    scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 2 + 8)[:, :1].astype(np.float32)
    qh = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 4 + 16)[:, 2:2 + 4]
    qs = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 4 + 16)[:, 2 + 4:]

    bits = np.unpackbits(qh, axis=-1, bitorder="little")

    x0 = ((qs & 0xf).astype(np.int8) | (bits[:, :16] << 4)) - 16
    x1 = ((qs >> 4).astype(np.int8) | (bits[:, 16:] << 4)) - 16

    return np.concatenate([
        scales * x0,
        scales * x1,
    ], axis=1)

def dequantize_q5_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    raise NotImplementedError()

def dequantize_q8_0(data):
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
    num_blocks = len(data) // GGML_BLOCK_SIZES["Q8_0"]

    scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 16)[:, :1].astype(np.float32)
    qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:]
    return scales * qs

def dequantize_q8_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
    
    block_size = GGML_BLOCK_SIZES["Q8_0"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q8_0"]
    device = torch.device(device)
    data = np.frombuffer(data, dtype=data.dtype)
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_q8_0(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)


def dequantize_f32(data):
    return np.frombuffer(data, dtype=np.float32)

def dequantize_f32_gpu(data, device, target_dtype = torch.get_default_dtype()):
    data = np.frombuffer(data, dtype=np.float32)
    res = torch.from_numpy(data.copy())
    res_gpu = torch.empty_like(res, device=device, dtype=target_dtype)
    res_gpu.copy_(res)
    return res_gpu

def dequantize_f16(data):
    return np.frombuffer(data, dtype=np.float16)

def dequantize_f16_gpu(data, device, target_dtype = torch.get_default_dtype()):
    data = np.frombuffer(data, dtype=np.float16)
    res = torch.from_numpy(data.copy())
    res_gpu = torch.empty_like(res, device=device, dtype=target_dtype)
    res_gpu.copy_(res)
    return res_gpu

def dequantize_bf16_gpu(data, device, target_dtype = torch.get_default_dtype()):
    data = np.frombuffer(data, dtype=np.float16)
    res = torch.from_numpy(data.copy())
    res_gpu = torch.empty_like(res, device=device)
    res_gpu.copy_(res)
    return res_gpu

GGML_DEQUANTIZE = {
    "F32": dequantize_f32,
    "F16": dequantize_f16,
    "BF16": dequantize_f16,
    "Q4_0": dequantize_q4_0,
    "Q5_0": dequantize_q5_0,
    "Q8_0": dequantize_q8_0,
    "Q2_K": dequantize_q2_k,
    "Q3_K": dequantize_q3_k,
    "Q4_K": dequantize_q4_k,
    "Q5_K": dequantize_q5_k,
    "Q6_K": dequantize_q6_k,
    "IQ4_XS": dequantize_iq4_xs,
}

GGML_DEQUANTIZE_GPU = {
    "F32": dequantize_f32_gpu,
    "F16": dequantize_f16_gpu,
    "BF16": dequantize_bf16_gpu,
    "Q4_0": dequantize_q4_0_gpu,
    "Q5_0": dequantize_q5_0_gpu,
    "Q8_0": dequantize_q8_0_gpu,
    "Q2_K": dequantize_q2_k_gpu,
    "Q3_K": dequantize_q3_k_gpu,
    "Q4_K": dequantize_q4_k_gpu,
    "Q5_K": dequantize_q5_k_gpu,
    "Q6_K": dequantize_q6_k_gpu,
    "IQ4_XS": dequantize_iq4_xs_gpu,
}


def translate_name_to_gguf_mixtral(name):
    
    replacement_template = {
        "w1.weight": "ffn_gate",
        "w2.weight": "ffn_down",
        "w3.weight": "ffn_up"
    }  

    pattern = re.compile(r"model.layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)\.(w\d\.weight)")

    def replace_match(match):
        blk_id = match.group(1)
        expert_id = match.group(2)
        weight_type = match.group(3)
        if weight_type in replacement_template:
            return f"blk.{blk_id}.{replacement_template[weight_type]}.{expert_id}.weight"
        else:
            return match.group(0)

    new_name = re.sub(pattern, replace_match, name)
    
    return new_name

def translate_name_to_gguf(name):

    name = translate_name_to_gguf_mixtral(name)

    if ".ffn_gate_exp." in name:
        name = name.replace(".ffn_gate_exp.", ".ffn_gate_exps.")
    if ".ffn_up_exp." in name:
        name = name.replace(".ffn_up_exp.", ".ffn_up_exps.")
    if ".ffn_down_exp." in name:
        name = name.replace(".ffn_down_exp.", ".ffn_down_exps.")
    
    m = re.match(r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)", name)
    if m:
        layer, expert, proj = m.groups()
        if proj == "gate_proj":
            return f"blk.{layer}.{expert}.ffn_gate_exps"
        elif proj == "up_proj":
            return f"blk.{layer}.{expert}.ffn_up_exps"
        else:
            return f"blk.{layer}.{expert}.ffn_down_exps"

    m = re.match(r"blk\.(\d+)\.mlp\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)", name)
    if m:
        layer, expert, proj = m.groups()
        if proj == "gate_proj":
            return f"blk.{layer}.{expert}.ffn_gate_exps"
        elif proj == "up_proj":
            return f"blk.{layer}.{expert}.ffn_up_exps"
        else:
            return f"blk.{layer}.{expert}.ffn_down_exps"

    name = name.replace("lm_head.", "output.")
    name = name.replace("model.embed_tokens.", "token_embd.")
    name = name.replace("model.norm.", "output_norm.")
    
    name = name.replace("model.layers.", "blk.")
    name = name.replace(".input_layernorm", ".attn_norm")
    name = name.replace(".mlp.down_proj", ".ffn_down")
    name = name.replace(".mlp.gate_proj", ".ffn_gate")
    name = name.replace(".mlp.up_proj", ".ffn_up")
    name = name.replace(".post_attention_layernorm", ".ffn_norm")
    name = name.replace(".self_attn.q_proj", ".attn_q")
    name = name.replace(".self_attn.k_proj", ".attn_k")
    name = name.replace(".self_attn.v_proj", ".attn_v")
    name = name.replace(".self_attn.o_proj", ".attn_output")
    name = name.replace(".self_attn.qkv_proj", ".attn_qkv")
    name = name.replace(".self_attn.kv_a_proj_with_mqa", ".attn_kv_a_mqa")
    name = name.replace(".self_attn.kv_a_layernorm", ".attn_kv_a_norm")
    name = name.replace(".self_attn.kv_b_proj", ".attn_kv_b")
    name = name.replace(".self_attn.q_a_proj", ".attn_q_a")
    name = name.replace(".self_attn.q_a_layernorm", ".attn_q_a_norm")
    name = name.replace(".self_attn.q_b_proj", ".attn_q_b")

    name = name.replace(".self_attn.q_norm", ".attn_q_norm")
    name = name.replace(".self_attn.k_norm", ".attn_k_norm")
    
    name = name.replace(".shared_expert.", ".shared_experts.")
    name = name.replace(".shared_expert_", ".shared_experts_")
    name = name.replace(".gate_up_proj.", ".up_proj")
    
    name = name.replace(".mlp.shared_experts.down_proj", ".ffn_down_shexp")
    name = name.replace(".mlp.gate.e_score_correction_bias", ".exp_probs_b.bias")
    name = name.replace(".mlp.gate", ".ffn_gate_inp")
    name = name.replace(".mlp.shared_experts.gate_proj", ".ffn_gate_shexp")
    name = name.replace(".mlp.shared_experts.up_proj", ".ffn_up_shexp")
    name = name.replace(".mlp.shared_experts_gate", ".ffn_gate_inp_shexp")
    name = name.replace(".mlp.experts", "")

    name = name.replace(".mlp.experts.ffn_down_exps", ".ffn_down_exps")
    name = name.replace(".mlp.experts.ffn_gate_exps", ".ffn_gate_exps")
    name = name.replace(".mlp.experts.ffn_up_exps", ".ffn_up_exps")

    
    name = name.replace(".block_sparse_moe.gate.", ".ffn_gate_inp.")
    name = name.replace(".block_sparse_moe.experts", "")
    
    name = name.replace(".feed_forward.experts", "")
    name = name.replace(".feed_forward.router", ".ffn_gate_inp")
    name = name.replace(".feed_forward.shared_experts.down_proj", ".ffn_down_shexp")
    name = name.replace(".feed_forward.shared_experts.gate_proj", ".ffn_gate_shexp")
    name = name.replace(".feed_forward.shared_experts.up_proj", ".ffn_up_shexp")
    return name

if __name__ == '__main__':
    gguf_path = '/mnt/data/model/DeepSeek-Coder-V2-GGUF-WJH'
    loader = GGUFLoader(gguf_path)
    loader.load_gguf_tensor('token_embd.weight')


================================================
FILE: archive/ktransformers/util/custom_loader.py
================================================
import struct
import warnings
import numpy as np
import re
import numpy.typing as npt
from typing import Sequence
import os
from enum import IntEnum
import torch

try:
    import torch_npu
    use_torch_npu = torch_npu.npu.is_available()
except:
    use_torch_npu = False

if not torch.xpu.is_available() and not use_torch_npu:
    import KTransformersOps
from safetensors import safe_open

if not use_torch_npu:
    from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
from ktransformers.util.custom_gguf import *
from safetensors.torch import save_file
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, Union

class ModelLoader(ABC):
    """
    Abstract base class for model loaders.
    Defines the interface that all model loaders must implement.
    """
    tensor_file_map = {}
    @abstractmethod
    def has_tensor(cls, name: str):
        """
        Check if the tensor exists in the loader.
        
        Args:
            name: Name of the tensor to check
            
        Returns:
            bool: True if the tensor exists, False otherwise
        """
        pass

class SafeTensorLoader(ModelLoader):
    tensor_file_map: dict
    tensor_type_map: dict
    file_handle_map: dict
    tensor_device_map: dict
    
    def __init__(self, file_path: str):
        self.__load_tensor_file_map(file_path)

    def __load_tensor_file_map(self, file_path: str):
        # 处理传入路径，确保是文件夹路径
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Path not found: {file_path}")
        if os.path.isfile(file_path):
            folder_path = os.path.dirname(file_path)
        else:
            folder_path = file_path
        self.file_handle_map = {}
        self.tensor_file_map = {}
        self.tensor_type_map = {}
        self.tensor_device_map = {}

        found_safetensor = False
        for root, _, files in os.walk(folder_path):
            files = sorted(files)
            for file in files:
                if file.endswith(".safetensors"):
                    found_safetensor = True
                    file_path = os.path.join(root, file)
                    if file not in self.file_handle_map:
                        try:
                            handle = safe_open(file_path, framework="pt")
                            self.file_handle_map[file] = handle
                        except Exception as e:
                            print(f"Error opening Safetensor file {file_path}: {e}")
                            continue

                    f = self.file_handle_map.get(file)
                    if f is None:
                        continue
                    try:
                        for key in f.keys():
                            self.tensor_file_map[key] = file
                    except Exception as e:
                        print(f"Error reading Safetensor file {file_path}: {e}")

        # if not found_safetensor:
        #     raise FileNotFoundError(f"No Safetensor files found in {folder_path}")

    def load_tensor(self, key: str, device: str = "cpu"):
        if translate_name_to_gguf(key) in self.tensor_file_map:
            key = translate_name_to_gguf(key)
        elif key in self.tensor_file_map:
            pass
        else:
            raise KeyError(f"Key {key} not found in Safetensor files")
        file = self.tensor_file_map[key]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        if use_torch_npu:
            tensor = f.get_tensor(key).to(torch.float16)
        else:
            tensor = f.get_tensor(key)

        return tensor.to(device)

    def load_experts(self, key: str, device: str="cpu"):
        '''
        Load experts from safetensor
        key: the name of the experts
        device: the device to load the experts to
        return: dict, 
        {up: tensor, down: tensor, gate: tensor, up_type: int, down_type: int, gate_type: int}
        {xxx}_type: the type of the up tensor, corresponding to the ggml type
        '''
        if self.has_tensor(translate_name_to_gguf(key)+".ffn_gate_exps.weight"):
            # legacy branch for loading hybrid model
            base_key = translate_name_to_gguf(key)
            # Load experts from safetensor
            gate_key = f"{base_key}.ffn_gate_exps.weight"
            gate_type_key = f"{base_key}.ffn_gate_exps.ggml_type"
            up_key = f"{base_key}.ffn_up_exps.weight"
            up_type_key = f"{base_key}.ffn_up_exps.ggml_type"
            down_key = f"{base_key}.ffn_down_exps.weight"
            down_type_key = f"{base_key}.ffn_down_exps.ggml_type"
            gate_tensor = self.load_tensor(gate_key, device).numpy()
            up_tensor = self.load_tensor(up_key, device).numpy()
            down_tensor = self.load_tensor(down_key, device).numpy()
            gate_type = self.load_tensor(gate_type_key, device).item()
            up_type = self.load_tensor(up_type_key, device).item()
            down_type = self.load_tensor(down_type_key, device).item()

            return {
                "up": up_tensor,
                "gate": gate_tensor,
                "down": down_tensor,
                "up_type": up_type,
                "gate_type": gate_type,
                "down_type": down_type
            }

        else:
            # Load experts from safetensor
            base_key = key  # e.g. "model.layers.3.mlp.experts"
            experts_count = 0
            
            key_no_proj = False
            if self.has_tensor(f"{base_key}.{experts_count}.up.weight"):
                key_no_proj = True

            # First, count how many experts we have by checking for expert 0's up_proj
            while self.has_tensor(f"{base_key}.{experts_count}.up_proj.weight") or self.has_tensor(f"{base_key}.{experts_count}.up.weight"):
                experts_count += 1
            
            if experts_count == 0:
                raise ValueError(f"No experts found for key {base_key}")
            
            # Initialize empty lists to store tensors for each projection type
            up_projs = []
            gate_projs = []
            down_projs = []
            
            # Load all expert weights
            for expert_id in range(experts_count):

                if key_no_proj:
                    up_key = f"{base_key}.{expert_id}.up.weight"
                    gate_key = f"{base_key}.{expert_id}.gate.weight"
                    down_key = f"{base_key}.{expert_id}.down.weight"
                else:
                    up_key = f"{base_key}.{expert_id}.up_proj.weight"
                    gate_key = f"{base_key}.{expert_id}.gate_proj.weight"
                    down_key = f"{base_key}.{expert_id}.down_proj.weight"
                
                up_tensor = self.load_tensor(up_key, device)
                gate_tensor = self.load_tensor(gate_key, device)
                down_tensor = self.load_tensor(down_key, device)
                
                up_projs.append(up_tensor)
                gate_projs.append(gate_tensor)
                down_projs.append(down_tensor)
            
            # Stack the tensors along a new dimension
            up_tensor = torch.stack(up_projs, dim=0)
            gate_tensor = torch.stack(gate_projs, dim=0)
            down_tensor = torch.stack(down_projs, dim=0)
            
            # Get original dtype for GGML type determination
            orig_up_dtype = up_tensor.dtype
            orig_gate_dtype = gate_tensor.dtype
            orig_down_dtype = down_tensor.dtype
            
            # Convert to numpy with proper bfloat16 support
            up_numpy = up_tensor.view(torch.uint16).numpy()
            gate_numpy = gate_tensor.view(torch.uint16).numpy()
            down_numpy = down_tensor.view(torch.uint16).numpy()
            
            # Determine tensor data types for GGML conversion
            def get_ggml_type(dtype):
                if dtype == torch.float32:
                    return GGMLQuantizationType.F32
                elif dtype == torch.float16:
                    return GGMLQuantizationType.F16
                elif dtype == torch.bfloat16:
                    return GGMLQuantizationType.BF16
                else:
                    raise ValueError(f"Unsupported tensor dtype: {dtype}")
            
            return {
                "up": up_numpy,
                "gate": gate_numpy,
                "down": down_numpy,
                "up_type": get_ggml_type(orig_up_dtype),
                "gate_type": get_ggml_type(orig_gate_dtype),
                "down_type": get_ggml_type(orig_down_dtype)
            }
                
    def load_gate(self, key: str, device: str="cpu"):
        '''
        Load gate from safetensor
        key: the name of the gate
        device: the device to load the gate to
        return: dict, 
        {'weight': tensor, 'e_score_correction_bias': tensor}
        '''
        target = ["weight", "e_score_correction_bias"]
        res = {'weight': None, 'e_score_correction_bias': None}
        if self.has_tensor(translate_name_to_gguf(key)+".ffn_gate_exps.weight"):
            # legacy branch for loading hybrid model
            base_key = key
            for k in target:
                translated_key = translate_name_to_gguf(f"{base_key}.{k}")
                if self.has_tensor(translated_key):
                    tensor = self.load_tensor(translated_key, device)
                    res[k] = tensor
        else:
            # Load gate from safetensor
            base_key = key
            for k in target:
                if self.has_tensor(f"{base_key}.{k}"):
                    tensor = self.load_tensor(f"{base_key}.{k}", device)
                    res[k] = tensor
        return res

    def close_all_handles(self):
        for handle in self.file_handle_map.values():
            handle.close()
        self.file_handle_map.clear()

    def load_dequantized_tensor(self, key: str, device: str = "cpu"):
        if key in self.tensor_file_map and translate_name_to_gguf(key):
            pass
        elif translate_name_to_gguf(key) in self.tensor_file_map:
            key = translate_name_to_gguf(key)
        else:
            raise KeyError(f"Key {key} not found in Safetensor files")
        file = self.tensor_file_map[key]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(key).to(device)
        if key.endswith(".weight"):
            if key[:-7] + ".weight_scale_inv" in self.tensor_file_map:
                weight_scale_inv = f.get_tensor(key[:-7] + ".weight_scale_inv").to(device)
                tensor = weight_dequant(tensor, weight_scale_inv)
        return tensor.to(device)

    def has_tensor(self, name: str):
        return name in self.tensor_file_map or translate_name_to_gguf(name) in self.tensor_file_map

class GGUFLoader(ModelLoader):
    tensor_info: dict
    gguf_path: str
    tensor_file_map: dict # {tensor_name: tensor_file_path}
    gguf_file_meta: dict
    safetensor_loader: SafeTensorLoader
    def __init__(self, gguf_path: str, quantize: str = None):
        # Check dir exist
        if not os.path.exists(gguf_path):
            raise FileNotFoundError(f"GGUF dir not found: {gguf_path}")
        if os.path.isfile(gguf_path):
            gguf_path = os.path.dirname(gguf_path)

        self.safetensor_loader = None
        
        self.tensor_info = {}
        self.gguf_path = gguf_path
        self.tensor_file_map = {}
        self.file_data_map = {}
        self.gguf_file_meta = {}
        self.tensor_device_map = {}

        if use_torch_npu:
            if quantize == "w8a8_dynamic":
                safetensor_loader = W8A8SafeTensorLoader(gguf_path)
            else:
                safetensor_loader = SafeTensorLoader(gguf_path)
            if safetensor_loader.tensor_file_map:
                self.safetensor_loader = safetensor_loader
                return

        # Walk through all the .gguf files in the directory
        found_gguf = False
        for root, dirs, files in os.walk(gguf_path):
            for file in files:
                if file.endswith(".gguf"):
                    found_gguf = True
                    file_name = os.path.join(root, file)
                    with open(file_name, "rb") as f:
                        self.load_gguf(f)
                        if file_name not in self.file_data_map:
                            self.file_data_map[file_name] = np.memmap(file_name, mode = 'r')
        if not found_gguf:
            raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}")
                            
    def load_gguf(self, f):
        f.seek(0)
        assert f.read(4) == b'GGUF'
        values = struct.unpack("<IQQ", f.read(4+8+8))
        version, n_tensors, n_kv = values
        if version != 3:
            warnings.warn(f"Version {version} has never been tested, might not work")

        info = {}
        for _ in range(n_kv):
            name = read_value(f, DATA_TYPES["string"])

            data_type = struct.unpack("<I", f.read(4))[0]

            info[name] = read_value(f, data_type)

        tensor_info = {}
        for _ in range(n_tensors):
            name = read_value(f, DATA_TYPES["string"])
            shape_len = read_value(f, DATA_TYPES["uint32"])
            shape = [read_value(f, DATA_TYPES["uint64"]) for _ in range(shape_len)]
            ggml_type = read_value(f, DATA_TYPES["uint32"])
            bad_offset = read_value(f, DATA_TYPES["uint64"])
            n_elems = int(math.prod(shape))
            block_size, type_size = GGML_QUANT_SIZES[ggml_type]
            n_bytes = n_elems * type_size // block_size
            np_dims = tuple(reversed(shape))
        
            item_type: npt.DTypeLike
            if ggml_type == GGMLQuantizationType.F16:
                item_count = n_elems
                item_type = np.float16
            elif ggml_type == GGMLQuantizationType.F32:
                item_count = n_elems
                item_type = np.float32
            elif ggml_type == GGMLQuantizationType.F64:
                item_count = n_elems
                item_type = np.float64
            elif ggml_type == GGMLQuantizationType.I8:
                item_count = n_elems
                item_type = np.int8
            elif ggml_type == GGMLQuantizationType.I16:
                item_count = n_elems
                item_type = np.int16
            elif ggml_type == GGMLQuantizationType.I32:
                item_count = n_elems
                item_type = np.int32
            elif ggml_type == GGMLQuantizationType.I64:
                item_count = n_elems
                item_type = np.int64
            else:
                item_count = n_bytes
                item_type = np.uint8
                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)

            tensor_info[name] = {
                "ggml_type": ggml_type,
                "shape": shape,
                "bad_offset": bad_offset,
                "item_type": item_type,
                "item_count": item_count,
                "np_dims": np_dims
            }

        start = f.tell()
        # Alignment is 32 by default.
        # https://github.com/ggerganov/ggml/blob/e1daebbf9d38d510ba456c4d50b4500a73ac2b14/docs/gguf.md?plain=1#L253
        alignment = info.get("general.alignment", 32)

        # Inconveniently, the offset defined in gguf files is relative to the
        # end of the header and is unaligned.
        # We need to compute the absolute file offset ourselves instead.
        for t in tensor_info.values():
            offset = start + t["bad_offset"]
            offset += (alignment - offset % alignment) % alignment
            t["offset"] = offset
            
        for name in tensor_info:
            self.tensor_file_map[name] = f.name
        self.tensor_info.update(tensor_info)
        self.gguf_file_meta.update(info)
    
    def get_mmap_tensor(self, name):
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        mmap_data = self.file_data_map[ self.tensor_file_map[name] ]

        offset = t["offset"]
        item_type = t["item_type"]
        item_count = t["item_count"]
        itemsize = int(np.empty([], dtype = item_type).itemsize)
        return mmap_data[offset : offset + itemsize * item_count]

    def get_undequanted_tensor_and_ggml_type(self, name):
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        data = self.get_mmap_tensor(name)
        ggml_type = t["ggml_type"]
        data = torch.from_numpy(data)
        return data, ggml_type

    def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        shape = t["shape"]
        ggml_type = t["ggml_type"]
        if ggml_type not in GGML_NAMES:
            raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
        ggml_name = GGML_NAMES[ggml_type]

        # TODO: experts may fused in quant block, split it
        assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may fused in quant block, please use CPU dequant"

        blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name]
        block_size = GGML_BLOCK_SIZES[ggml_name]
        offset = expert_id * block_size * blocks_per_experts
        data = data[offset: offset + block_size * blocks_per_experts]

        if "cuda" in device.lower():
            values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype)
        else:
            values = GGML_DEQUANTIZE[ggml_name](data)
            values = torch.from_numpy(values.copy())

        if ggml_name == "BF16":
            values = values.view(torch.bfloat16)
        values = values.view(shape[-2::-1])

        return values

    def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor:
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        if target_dtype == None:
            target_dtype = torch.get_default_dtype()
        
        shape = t["shape"]
        ggml_type = t["ggml_type"]

        if ggml_type not in GGML_NAMES:
            raise NotImplementedError(f"ggml_type {ggml_type} not implemented")

        ggml_name = GGML_NAMES[ggml_type]

        data = self.get_mmap_tensor(name)

        block_size = GGML_BLOCK_SIZES[ggml_name]
        elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name]
        num_elements = int(np.prod(shape))
        num_blocks = num_elements // elements_per_block
        
        blocks_per_iter = 16384
        if num_blocks > blocks_per_iter: # dequant large tensor
            values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device)
            for i in range( (num_blocks + blocks_per_iter - 1) // blocks_per_iter):
                blocks_begin = i * blocks_per_iter
                blocks_end = min(blocks_begin + blocks_per_iter, num_blocks)
                if "cuda" in device.lower():
                    try:
                        cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin*block_size : blocks_end*block_size], device, target_dtype)
                    except:
                        cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size])
                        cur_values = torch.from_numpy(cur_values.copy()).to(device)
                else:
                    cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size])
                    cur_values = torch.from_numpy(cur_values.copy())
                
                cur_values = cur_values.view(-1, elements_per_block)
                if ggml_name == "BF16":
                    cur_values = cur_values.view(torch.bfloat16)
                values[blocks_begin : blocks_end] = cur_values
        else:
            if "cuda" in device.lower():
                values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
            else:
                np_values = np.copy(GGML_DEQUANTIZE[ggml_name](data))
                values = torch.from_numpy(np_values).to(device)
                del np_values

        if ggml_name == "BF16":
            values = values.view(torch.bfloat16)
            

        values = values.view(shape[::-1])
        if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
            n_head = self.gguf_file_meta['llama.attention.head_count']
            values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
            .swapaxes(1, 2)
            .reshape(values.shape))
        elif "attn_k" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
            n_head = self.gguf_file_meta['llama.attention.head_count_kv'] 
            values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
            .swapaxes(1, 2)
            .reshape(values.shape))
        return values
    def has_tensor(self, name: str):
        name = translate_name_to_gguf(name)
        return name in self.tensor_info

    def get_ggml_type(self, name: str):
        name = translate_name_to_gguf(name)
        if name not in self.tensor_info:
            raise KeyError(f"Key {name} not found in GGUF files")
        return self.tensor_info[name]["ggml_type"]
    
class ModelLoaderFactory:
    """
    Factory class for creating model loaders.
    Automatically detects the model format based on file extensions in the directory.
    """
    
    @staticmethod
    def create_loader(path: str):
        """
        Create a model loader for the given path by detecting the model format.
        The function checks for the presence of .safetensors or .gguf files
        in the specified path and creates the appropriate loader.
        
        Args:
            path: Path to the model directory or file
            
        Returns:
            An appropriate ModelLoader instance (SafeTensorLoader or GGUFLoader)
        
        Raises:
            FileNotFoundError: If no supported model files are found in the path
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")
            
        # Normalize to directory path if a file was provided
        if os.path.isfile(path):
            if path.endswith(".safetensors"):
                return SafeTensorLoader(path)
            elif path.endswith(".gguf"):
                return GGUFLoader(path)
            else:
                folder_path = os.path.dirname(path)
        else:
            folder_path = path
            
        # Check for safetensors files
        has_safetensors = False
        has_gguf = False
        
        for root, _, files in os.walk(folder_path):
            for file in files:
                if file.endswith(".safetensors"):
                    has_safetensors = True
                    break
                elif file.endswith(".gguf"):
                    has_gguf = True
                    break
            if has_safetensors or has_gguf:
                break
                
        # Create the appropriate loader based on detected file types
        # Prioritize SafeTensor over GGUF if both are present
        if has_safetensors:
            try:
                return SafeTensorLoader(folder_path)
            except Exception as e:
                print(f"Failed to create SafeTensorLoader: {e}")
                # Fall through to try GGUF if SafeTensor fails
                if not has_gguf:
                    raise
        
        if has_gguf:
            try:
                return GGUFLoader(folder_path)
            except Exception as e:
                print(f"Failed to create GGUFLoader: {e}")
                raise
        
        # No supported model files found
        raise FileNotFoundError(f"No .safetensors or .gguf files found in: {folder_path}")

class W8A8SafeTensorLoader(SafeTensorLoader):
    def load_tensor(self, key: str, device: str = "cpu"):
        if key not in self.tensor_file_map:
            raise KeyError(f"Key {key} not found in Safetensor files")
        file = self.tensor_file_map[key]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(key)
        if 'deq_scale' in key:
            tensor = torch.from_numpy(
                np.frombuffer(tensor.to(torch.float16).to(torch.float32).numpy().tobytes(), dtype=np.int32).astype(np.int64))
        if 'input_scale' in key:
            tensor = tensor.to(torch.float16)
        if "weight_scale" in key or "weight_offset" in key:
            if "ffn" in key:
                tensor = tensor.to(torch.float32)
            else:
                tensor = tensor.to(torch.float16)
        if 'input_offset' in key:
            tensor = tensor.to(torch.int8)
        if tensor.dtype == torch.bfloat16:
            tensor = tensor.to(torch.float16)
        return tensor.to(device)

    def load_dequantized_tensor(self, key: str, device: str = "cpu"):
        tensor = self.load_tensor(key, device)
        return tensor


================================================
FILE: archive/ktransformers/util/modeling_rope_utils.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import Optional, Tuple

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import is_torch_available, logging


logger = logging.get_logger(__name__)


if is_torch_available():
    import torch


def _compute_default_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        dim = int(head_dim * partial_rotary_factor)

    attention_factor = 1.0  # Unused in this type of RoPE

    # Compute the inverse frequencies
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
    return inv_freq, attention_factor


def _compute_linear_scaling_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        factor = rope_kwargs["factor"]
    elif config is not None:
        factor = config.rope_scaling["factor"]

    # Gets the default RoPE parameters
    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)

    # Then applies linear scaling to the frequencies.
    # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
    # applying scaling to the inverse frequencies is equivalent.
    inv_freq /= factor
    return inv_freq, attention_factor


def _compute_dynamic_ntk_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
        max_position_embeddings = rope_kwargs["max_position_embeddings"]
        factor = rope_kwargs["factor"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        dim = int(head_dim * partial_rotary_factor)
        max_position_embeddings = config.max_position_embeddings
        factor = config.rope_scaling["factor"]

    attention_factor = 1.0  # Unused in this type of RoPE

    # seq_len: default to max_position_embeddings, e.g. at init time
    seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings

    # Compute the inverse frequencies
    base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
    return inv_freq, attention_factor


def _compute_yarn_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://arxiv.org/abs/2309.00071)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # No need to keep BC with yarn, unreleased when this new pattern was created.
    if len(rope_kwargs) > 0:
        raise ValueError(
            f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
        )

    base = config.rope_theta
    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "qk_rope_head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)
    factor = config.rope_scaling["factor"]
    attention_factor = config.rope_scaling.get("attention_factor")
    mscale = config.rope_scaling.get("mscale")
    mscale_all_dim = config.rope_scaling.get("mscale_all_dim")

    # NOTE: DeekSeek-V3 (and potentially other models) modify `max_position_embeddings` and have a
    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
    # values to compute the default attention scaling factor, instead of using `factor`.
    if "original_max_position_embeddings" in config.rope_scaling:
        original_max_position_embeddings = config.rope_scaling["original_max_position_embeddings"]
        factor = config.max_position_embeddings / original_max_position_embeddings
    else:
        original_max_position_embeddings = config.max_position_embeddings

    def get_mscale(scale, mscale=1):
        if scale <= 1:
            return 1.0
        return 0.1 * mscale * math.log(scale) + 1.0

    # Sets the attention factor as suggested in the paper
    if attention_factor is None:
        if mscale and mscale_all_dim:
            attention_factor = float(get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dim))
        else:
            attention_factor = get_mscale(factor)

    # Optional config options
    # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
    beta_fast = config.rope_scaling.get("beta_fast") or 32
    beta_slow = config.rope_scaling.get("beta_slow") or 1

    # Compute the inverse frequencies
    def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
        """Inverse dimension formula to find the dimension based on the number of rotations"""
        return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

    def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
        """Find dimension range bounds based on rotations"""
        low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
        high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
        return max(low, 0), min(high, dim - 1)

    def linear_ramp_factor(min, max, dim):
        if min == max:
            max += 0.001  # Prevent singularity

        linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
        ramp_func = torch.clamp(linear_func, 0, 1)
        return ramp_func

    # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
    # to expand the possible context length. In other words, interpolation = apply scaling factor.
    pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
    inv_freq_extrapolation = 1.0 / pos_freqs
    inv_freq_interpolation = 1.0 / (factor * pos_freqs)

    low, high = find_correction_range(beta_fast, beta_slow, dim, base, original_max_position_embeddings)

    # Get n-dimensional rotational scaling corrected for extrapolation
    inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device)
    inv_freq = (
        inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
        + inv_freq_extrapolation * inv_freq_extrapolation_factor
    )
    return inv_freq, attention_factor


def _compute_longrope_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
    # No need to keep BC with longrope, unreleased when this new pattern was created.
    if len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got "
            f"{rope_kwargs}"
        )

    base = config.rope_theta
    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)
    long_factor = config.rope_scaling["long_factor"]
    short_factor = config.rope_scaling["short_factor"]
    factor = config.rope_scaling.get("factor")
    attention_factor = config.rope_scaling.get("attention_factor")

    # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
    # values to compute the default attention scaling factor, instead of using `factor`.
    if hasattr(config, "original_max_position_embeddings"):
        original_max_position_embeddings = config.original_max_position_embeddings
        factor = config.max_position_embeddings / config.original_max_position_embeddings
    else:
        original_max_position_embeddings = config.max_position_embeddings

    # Sets the attention factor as suggested in the paper
    if attention_factor is None:
        if factor <= 1.0:
            attention_factor = 1.0
        else:
            attention_factor = math.sqrt(1 + math.log(factor) / math.log(original_max_position_embeddings))

    # Compute the inverse frequencies -- scaled based on the target sequence length
    if seq_len and seq_len > original_max_position_embeddings:
        ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
    else:
        ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
    inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
    inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)

    return inv_freq, attention_factor


def _compute_llama3_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # Gets the default RoPE parameters
    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)

    factor = config.rope_scaling["factor"]  # `8` in the original implementation
    low_freq_factor = config.rope_scaling["low_freq_factor"]  # `1` in the original implementation
    high_freq_factor = config.rope_scaling["high_freq_factor"]  # `4` in the original implementation
    old_context_len = config.rope_scaling["original_max_position_embeddings"]  # `8192` in the original implementation

    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor

    wavelen = 2 * math.pi / inv_freq
    # wavelen < high_freq_wavelen: do nothing
    # wavelen > low_freq_wavelen: divide by factor
    inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
    # otherwise: interpolate between the two, using a smooth factor
    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
    is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
    inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)

    return inv_freq_llama, attention_factor


# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
# parameterizations, as long as the callable has the same signature.
ROPE_INIT_FUNCTIONS = {
    "default": _compute_default_rope_parameters,
    "linear": _compute_linear_scaling_rope_parameters,
    "dynamic": _compute_dynamic_ntk_parameters,
    "yarn": _compute_yarn_parameters,
    "longrope": _compute_longrope_parameters,
    "llama3": _compute_llama3_parameters,
}


def _check_received_keys(
    rope_type: str,
    received_keys: set,
    required_keys: set,
    optional_keys: Optional[set] = None,
    ignore_keys: Optional[set] = None,
):
    """Compare the received keys in `config.rope_scaling` against the expected and optional keys"""
    # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present
    if "type" in received_keys:
        received_keys -= {"type"}
        required_keys.add("rope_type")

    # Some models need to store model-specific keys, and we don't want to throw warning at them
    if ignore_keys is not None:
        received_keys -= ignore_keys

    missing_keys = required_keys - received_keys
    if missing_keys:
        raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}")

    if optional_keys is not None:
        unused_keys = received_keys - required_keys - optional_keys
    else:
        unused_keys = received_keys - required_keys
    if unused_keys:
        logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}")


def _validate_default_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)


def _validate_linear_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")


def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor"}
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    optional_keys = {"original_max_position_embeddings"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")


def _validate_yarn_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor"}
    optional_keys = {
        "attention_factor",
        "beta_fast",
        "beta_slow",
        "original_max_position_embeddings",
        "mscale",
        "mscale_all_dim",
    }
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    attention_factor = rope_scaling.get("attention_factor")
    if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0):
        logger.warning(
            f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
        )
    beta_fast = rope_scaling.get("beta_fast")
    if beta_fast is not None and not isinstance(beta_fast, float):
        logger.warning(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}")
    beta_slow = rope_scaling.get("beta_slow")
    if beta_slow is not None and not isinstance(beta_slow, float):
        logger.warning(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}")

    if (beta_fast or 32) < (beta_slow or 1):
        logger.warning(
            f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} "
            f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)"
        )


def _validate_longrope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "short_factor", "long_factor"}
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)

    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)

    short_factor = rope_scaling.get("short_factor")
    if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor):
        logger.warning(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}")
    if not len(short_factor) == dim // 2:
        logger.warning(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}")

    long_factor = rope_scaling.get("long_factor")
    if not isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor):
        logger.warning(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}")
    if not len(long_factor) == dim // 2:
        logger.warning(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}")

    # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over
    # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is
    # unique to longrope (= undesirable)
    if hasattr(config, "original_max_position_embeddings"):
        logger.warning_once(
            "This model has set a `original_max_position_embeddings` field, to be used together with "
            "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`"
            "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, "
            "as it is compatible with most model architectures."
        )
    else:
        factor = rope_scaling.get("factor")
        if factor is None:
            logger.warning("Missing required keys in `rope_scaling`: 'factor'")
        elif not isinstance(factor, float) or factor < 1.0:
            logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

        attention_factor = rope_scaling.get("attention_factor")
        if attention_factor is not None:
            if not isinstance(attention_factor, float) or attention_factor < 0.0:
                logger.warning(
                    f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
                )


def _validate_llama3_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    low_freq_factor = rope_scaling["low_freq_factor"]
    high_freq_factor = rope_scaling["high_freq_factor"]
    if low_freq_factor is None or not isinstance(low_freq_factor, float):
        logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}")
    if high_freq_factor is None or not isinstance(high_freq_factor, float):
        logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}")
    if high_freq_factor <= low_freq_factor:
        logger.warning(
            "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor="
            f"{high_freq_factor} and low_freq_factor={low_freq_factor}"
        )

    original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]
    if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int):
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be an integer, got "
            f"{original_max_position_embeddings}"
        )
    if original_max_position_embeddings >= config.max_position_embeddings:
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got "
            f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}"
        )


# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types.
ROPE_VALIDATION_FUNCTIONS = {
    "default": _validate_default_rope_parameters,
    "linear": _validate_linear_scaling_rope_parameters,
    "dynamic": _validate_dynamic_scaling_rope_parameters,
    "yarn": _validate_yarn_parameters,
    "longrope": _validate_longrope_parameters,
    "llama3": _validate_llama3_parameters,
}


def rope_config_validation(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    """
    Validate the RoPE config arguments, given a `PretrainedConfig` object
    """
    rope_scaling = getattr(config, "rope_scaling", None)  # not a default parameter in `PretrainedConfig`
    if rope_scaling is None:
        return

    # BC: "rope_type" was originally "type"
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default"))
    validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type)
    if validation_fn is not None:
        validation_fn(config, ignore_keys=ignore_keys)
    else:
        logger.warning(
            f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'"
        )

================================================
FILE: archive/ktransformers/util/npu_graph_runner.py
================================================
'''
Description :
Author      : Boxin Zhang
Version     : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
from typing import Dict

import threading
import torch
import torch_npu


class NPUGraphRunner:

    def __init__(self, deviceId):
        torch.npu.set_compile_mode(jit_compile=False)
        self.deviceId = deviceId
        self.input_buffers: Dict[str, torch.Tensor] = {}
        self.output_buffers: Dict[str, torch.Tensor] = {}
        self.past_key_value = None

    def init(self, batch_size, seq_length):
        self.graph = torch.npu.NPUGraph()
        self.main_stream = torch_npu.npu.Stream(device=self.deviceId)
        self.share_experts_stream = torch_npu.npu.Stream(device=self.deviceId)
        self.logits = torch.zeros((batch_size, seq_length, 7168), dtype=torch.float16).to(self.deviceId)  # deepseekV3 hidden_size
        self.workspace = None
        self.model_capture = True
        torch_npu.npu._subscribe_report(self.main_stream)

    def destroy(self):
        torch_npu.npu._unsubscribe_report(self.main_stream)
        del self.graph
        destory_runner(self.deviceId)

    def capture(
            self,
            model,
            cur_token,
            position_ids,
            cache_position,
            past_key_values,
            main_device,
            **kwargs,
    ) -> None:
        inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to(main_device)
        with torch.no_grad():
            with torch.npu.graph(self.graph, stream=self.main_stream, auto_dispatch_capture=True):
                logits = model(inputs_embeds=inputs_embeds,
                            position_ids=position_ids,
                            cache_position=cache_position,
                            past_key_values=past_key_values,
                            is_prefill=False,
                            **kwargs)
        self.input_buffers = {
            "inputs_embeds": inputs_embeds,
            "position_ids": position_ids,
            "cache_position": cache_position,
        }
        self.output_buffers = {
            "logits": logits,
        }

    def forward(
            self,
            inputs_embeds,
            position_ids,
            cache_position,
    ) -> torch.Tensor:
        thread = threading.Thread(target=self.graph.update, kwargs={"cpu_update_input": [{"actual_seq_lengths_kv": self.past_key_value.position}]})
        thread.start()

        self.input_buffers["inputs_embeds"].copy_(inputs_embeds)
        self.input_buffers["position_ids"].copy_(position_ids)
        self.input_buffers["cache_position"].copy_(cache_position)
        torch_npu.npu.synchronize()
        with torch_npu.npu.stream(self.main_stream):
            # Run the graph.
            self.graph.replay()
        thread.join()

        # Return the output tensor.
        return self.output_buffers["logits"]

    def launch_callback(self, func, data, block, stream):
        torch_npu.npu._launch_host_func(stream, func, data)

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

runner_dict = dict()

def check_runner(deviceId: int):
    runner = runner_dict.get(deviceId)
    if runner is None:
        return True
    else:
        return False

def destory_runner(deviceId: int):
    # print("the new NPUGraphRunner and deviceId is ", deviceId)
    runner = runner_dict.get(deviceId)
    if runner is not None:
        runner_dict[deviceId] = None

def get_or_create_runner(deviceId: int):
    runner = runner_dict.get(deviceId)
    if runner is None:
        runner = NPUGraphRunner(deviceId)
        runner_dict[deviceId] = runner
    return runner

================================================
FILE: archive/ktransformers/util/textstream.py
================================================
from typing import Any, List, Optional, Set
class TextStreamer:

    def __init__(self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, **decode_kwargs):
        self.tokenizer = tokenizer
        self.skip_prompt = skip_prompt
        self.decode_kwargs = decode_kwargs

        # variables used in the streaming process
        self.token_cache = []
        self.print_len = 0
        self.next_tokens_are_prompt = True

    def reset(self):
        self.token_cache = []
        self.print_len = 0

    def put(self, value)->Optional[str]:
        """
        Receives tokens, decodes them, and prints them to stdout as soon as they form entire words.
        """        
        if not isinstance(value,int):
            raise ValueError("TextStreamer only supports batch size 1, and int type input")


        if self.skip_prompt and self.next_tokens_are_prompt:
            self.next_tokens_are_prompt = False
            return None

        # Add the new token to the cache and decodes the entire thing.
        self.token_cache.append(value)
        text = self.tokenizer.decode(self.token_cache, skip_special_tokens=True,**self.decode_kwargs)

        # After the symbol for a new line, we flush the cache.
        if text.endswith("\n"):
            printable_text = text[self.print_len :]
            self.reset()
        # If the last token is a CJK character, we print the characters.
        elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
            printable_text = text[self.print_len :]
            self.print_len += len(printable_text)
        # Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
        # which may change with the subsequent token -- there are probably smarter ways to do this!)
        else:
            printable_text = text[self.print_len : text.rfind(" ") + 1]
            self.print_len += len(printable_text)
        return printable_text

    def end(self)->Optional[str]:
        """Flushes any remaining cache and prints a newline to stdout."""
        # Flush the cache, if it exists
        if len(self.token_cache) > 0:
            text = self.tokenizer.decode(self.token_cache, skip_special_tokens=True, **self.decode_kwargs)
            printable_text = text[self.print_len :]
            self.reset()
        else:
            printable_text = ""

        self.next_tokens_are_prompt = True
        return printable_text
   
    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)  #
            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
        ):  #
            return True

        return False

================================================
FILE: archive/ktransformers/util/utils.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : Boxin Zhang, Azure-Tang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import re
import sys
import threading

import torch
import torch.distributed as dist
from torch import nn
import itertools
import time
import enum
from transformers import (
    LogitsProcessorList,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
    MinPLogitsWarper,
    TypicalLogitsWarper,
    EpsilonLogitsWarper,
    EtaLogitsWarper,
)

from ktransformers.util.custom_loader import ModelLoaderFactory, ModelLoader, SafeTensorLoader, translate_name_to_gguf
from ktransformers.operators import base_operator
from ktransformers.models.custom_cache import StaticCache
from ktransformers.util.cuda_graph_runner import CUDAGraphRunner
from ktransformers.util.textstream import TextStreamer
if not torch.xpu.is_available():
    from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton
# from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton
import socket

warm_uped = False
CUR_DEVICE = None
W8A8_ENABLE = False
Q4_GGUF_LODER = None
_USE_NPU_GRAPH = False
_MAX_DECODE_PROFILE = 1
WARM_UP_SKIP_CNT = [1, 1]
_SPECULATE_STEP = 1

try:
    import torch_npu
    use_torch_npu = torch_npu.npu.is_available()
    from ktransformers.util.ascend.ascend_utils import get_tensor_parallel_size
except:
    use_torch_npu = False

def get_use_npu_graph():
    assert _USE_NPU_GRAPH is not None, "use npu graph is not setting"
    return _USE_NPU_GRAPH

from enum import StrEnum

class StatKey(StrEnum):
    Embedding = "Embedding"
    GraphCapture = "GraphCapture"
    GraphReplay = "GraphReplay"
    ExpertsForward1 = "ExpertsForward1"
    ExpertsForward2 = "ExpertsForward2"
    CPUExperts = "CPUExperts"
    GraphDestroy = "GraphDestroy"
    DecodeOneTokenPost = "DecodeOneTokenPost"
    DecodeOneToken = "DecodeOneToken"
    GraphInit = "GraphInit"

class TimeStat:
    def __init__(self):
        # open_status = os.environ["KT_PERF_STAT"] if "KT_PERF_STAT" in os.environ else "0"
        # if open_status == "0":
        #     self.on = False
        # else:
        #     self.on = True
        self.on = True
        self.prefill_stats = dict()
        self.decode_stats = dict()
        for key in StatKey:
            self.prefill_stats[key] = StatItem()
            self.decode_stats[key] = StatItem()
        self.reset_all()

    def record_start_time(self):
        start_time = time.time_ns()
        return start_time

    def add_time_stat(self, key: StatKey, time_ns, is_prefill):
        if not key:
            return
        # torch.cuda.synchronize()
        cost = time.time_ns() - time_ns
        if is_prefill:
            item = self.prefill_stats[key]
        else:
            item = self.decode_stats[key]
        item.add_item(cost)

    def print_all(self):
        # rank = f"[rank:{torch.distributed.get_rank()}]"
        rank = f"[rank:0]"
        msg = f"\n{rank} Prefill Time Stat\n"
        msg += rank + " {:27}{:>15}{:>15}{:>15}{:>15}{:>15}\n".format("", "min(ms)", "max(ms)", "avg(ms)", "count", "total(ms)")
        for key, value in self.prefill_stats.items():
            msg += rank + f" {key.value:<25}:{value.get_stat()}\n"
        msg += f"\n{rank} Decode Time Stat\n"
        msg += rank + " {:27}{:>15}{:>15}{:>15}{:>15}{:>15}\n".format("", "min(ms)", "max(ms)", "avg(ms)", "count", "total(ms)")
        for key, value in self.decode_stats.items():
            msg += rank + f" {key.value:<25}:{value.get_stat()}\n"
        print(msg)

    def reset_all(self):
        for _, value in self.prefill_stats.items():
            value.reset()
        for _, value in self.decode_stats.items():
            value.reset()


class StatItem:
    def __init__(self):
        self.min_time = 100000000
        self.max_time = 0
        self.total_time_ns = 0
        self.count = 0

    def add_item(self, cost_time_ns):
        self.count += 1
        self.total_time_ns += cost_time_ns
        self.min_time = min(self.min_time, cost_time_ns)
        self.max_time = max(self.max_time, cost_time_ns)

    def reset(self):
        self.min_time = 100000000
        self.max_time = 0
        self.total_time_ns = 0
        self.count = 0

    def get_stat(self):
        min_time = self.min_time / 1000 / 1000
        max_time = self.max_time / 1000 / 1000
        if self.count != 0:
            avg_time = self.total_time_ns / self.count / 1000 / 1000
        else:
            avg_time = 0
        total = self.total_time_ns / 1000 / 1000
        return f"{min_time:15.2f}{max_time:15.2f}{avg_time:15.2f}{self.count:15}{total:15.2f}"


timeStat = TimeStat()


def get_free_ports(n: int, continue_prot: list):
    sockets = []
    ports = []
    for _ in range(n):
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.bind(("", 0)) 
        port = s.getsockname()[1]
        if port in continue_prot:
            s.close()
            continue
        ports.append(port)
        sockets.append(s)
    for s in sockets:
        s.close()
    return ports

def get_current_device():
    if use_torch_npu:
        return f"npu:{torch.npu.current_device()}"
    else:
        return f"cuda:{torch.npu.current_device()}"

def get_compute_capability(device:torch.device = None):
    if use_torch_npu:
        return 0
    if torch.cuda.is_available():
        if device is None:
            num_gpus = torch.cuda.device_count()
            min_compute_capability_major = 100
            for gpu_id in range(num_gpus):
                gpu_props = torch.cuda.get_device_properties(gpu_id)
                min_compute_capability_major = min(min_compute_capability_major, gpu_props.major)
            return min_compute_capability_major
        else:
            return torch.cuda.get_device_properties(device)

def set_module(model, submodule_key, module):
    tokens = submodule_key.split('.')
    sub_tokens = tokens[:-1]
    cur_mod = model
    for s in sub_tokens:
        if hasattr(cur_mod, s):
            cur_mod = getattr(cur_mod, s)
        else: # nn.ModuleList or nn.ModuleList
            cur_mod=cur_mod[int(s)]
    if hasattr(cur_mod, tokens[-1]):
        setattr(cur_mod, tokens[-1], module)
    else: # nn.ModuleList or nn.ModuleList
        cur_mod[int(tokens[-1])] = module

def set_param(module: nn.Module, name: str, weights: torch.Tensor):
    
    param=nn.parameter.Parameter(weights, requires_grad=False)
    if isinstance(module, nn.Linear) and len(weights.shape)==1:
        param.unsqueeze_(0)
    setattr(module, name, param)

def get_device(gguf_module_key:str, device_map:dict):
    if gguf_module_key in device_map:
        return device_map[gguf_module_key]["generate_device"]
    else:
        return "cuda"

def get_all_used_cuda_device(device_map:dict):
    all_device_list = set()
    for key in device_map:
        all_device_list.add(device_map[key]["generate_device"]) if "generate_device" in device_map[key] else None
        all_device_list.add(device_map[key]["prefill_device"]) if "prefill_device" in device_map[key] else None
    if "cpu" in all_device_list:
        all_device_list.remove("cpu")
    if use_torch_npu:
        all_device_list = set([device.replace('cuda', 'npu') for device in all_device_list])
    all_device_list = list(all_device_list)
    return all_device_list

def load_cur_state_dict_npu(module: nn.Module, gguf_loader: ModelLoader, prefix: str = "", device="npu"):
    prefix = prefix.replace("orig_module.", "")
    persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
    local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
    local_state = {k: v for k, v in local_name_params if v is not None}
    for name, param in local_state.items():
        key = prefix + name
        translated_key = translate_name_to_gguf(key)
        # TODO: Merge all loader.
        # I know this is ugly but lets do it for now.
        if gguf_loader.safetensor_loader is not None:
            load_dequantized_tensor = gguf_loader.safetensor_loader.load_dequantized_tensor
            tensor_file_map = gguf_loader.safetensor_loader.tensor_file_map
        else:
            load_dequantized_tensor = gguf_loader.load_gguf_tensor
            tensor_file_map = gguf_loader.tensor_file_map
        
        if translated_key in tensor_file_map:
            target_dtype = torch.get_default_dtype()
            device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
            # Todo need fix
            device = "cpu" if "embd" in translated_key else get_current_device()
            print(f"loading layer {translated_key} to {device}")
            torch.cuda.empty_cache()
            weights = load_dequantized_tensor(translated_key, device=device).to(dtype=target_dtype)
            set_param(module, name, weights)
            del weights
        else:
            #print(load_config.tensor_file_map.keys())
            raise Exception(f"can't find {translated_key} in GGUF file!")

def load_cur_state_dict(module: nn.Module, gguf_loader: ModelLoader, prefix: str = "", device="cuda"):
    if use_torch_npu:
        load_cur_state_dict_npu(module, gguf_loader, prefix, device)
        return

    prefix = prefix.replace("orig_module.", "")
    persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
    local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
    local_state = {k: v for k, v in local_name_params if v is not None}
    for name, param in local_state.items():
        key = prefix + name
        translated_key = key
        
        # TODO: Merge all loader.
        # I know this is ugly but lets do it for now.
        if isinstance(gguf_loader, SafeTensorLoader):
            load_dequantized_tensor = gguf_loader.load_dequantized_tensor
        else:
            load_dequantized_tensor = gguf_loader.load_gguf_tensor
            tensor_file_map = gguf_loader.tensor_file_map
        
        if gguf_loader.has_tensor(translated_key) or "kv_b_proj" in translated_key:
            target_dtype = torch.get_default_dtype()
            device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
            print(f"loading {translated_key} to {device}")
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            elif torch.xpu.is_available():
                torch.xpu.empty_cache()
            if "kv_b_proj" in translated_key and not gguf_loader.has_tensor(translated_key):
                attn_k_b = load_dequantized_tensor(translated_key.replace("self_attn.kv_b_proj", "attn_k_b"), device=device).to(dtype=target_dtype)
                attn_k_b = attn_k_b.transpose(1, 2).contiguous()
                attn_v_b = load_dequantized_tensor(translated_key.replace("self_attn.kv_b_proj", "attn_v_b"), device=device).to(dtype=target_dtype)
                kv_b_proj = torch.cat((attn_k_b, attn_v_b), dim=1)
                kv_b_proj = kv_b_proj.contiguous() if kv_b_proj.ndim == 2 else kv_b_proj.flatten(0, 1).contiguous()
                set_param(module, name, kv_b_proj)
                del attn_k_b
                del attn_v_b
            else:
                weights = load_dequantized_tensor(translated_key, device=device).to(dtype=target_dtype)
                set_param(module, name, weights)
                del weights
        else:
            #print(load_config.tensor_file_map.keys())
            raise Exception(f"can't find {translated_key} in GGUF file!")

  
def sync_all_device(all_device_list):
    for device in all_device_list:
        if "cuda" in device.lower():
            torch.cuda.synchronize(device)
        elif "xpu" in device.lower():
            torch.xpu.synchronize(device)
        elif use_torch_npu:
            torch_npu.synchronize(device)
        else:
            raise RuntimeError("The device {} is not available".format(device))

torch_device_mapping ={"cuda": "cuda:0", "xpu": "xpu:0"}

def xpu_fp16_model(config):
    # This function is to check if we run this model on XPU with FP16 dtype
    if not torch.xpu.is_available():
        return False
    if config.architectures[0] == "DeepseekV3ForCausalLM":
        return True
    if config.architectures[0] == "Qwen3MoeForCausalLM" and config.hidden_size == 4096:
        # Qwen3-30B seems have precision issue with FP16
        # so we only use FP16 for Qwen3-235B now
        return True
    return False

def load_weights(module:nn.Module, gguf_loader:ModelLoader, prefix='', device="cuda"):
    #print(f"recursively loading weights {prefix}")
    if not isinstance(module, base_operator.BaseInjectedModule):
        load_cur_state_dict(module, gguf_loader, prefix, device=device)
        for name, child in module._modules.items():
            load_weights(child, gguf_loader, prefix+name+".", device=device)
    else:
        module.load()

def tf_logits_warper(generation_config):
        """
        This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsWarper`] instances
        used for multinomial sampling.
        """

        # instantiate warpers list
        warpers = LogitsProcessorList()

        # In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
        # better score (i.e. keep len(list(generation_config._eos_token_tensor)) + 1)
        if generation_config.num_beams > 1:
            if isinstance(generation_config._eos_token_tensor, list):
                min_tokens_to_keep = len(generation_config._eos_token_tensor) + 1
            elif isinstance(generation_config._eos_token_tensor, torch.Tensor):
                min_tokens_to_keep = generation_config._eos_token_tensor.shape[0] + 1
            else:
                min_tokens_to_keep = 2
        else:
            min_tokens_to_keep = 1

        # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
        # all samplers can be found in `generation_utils_samplers.py`
        if generation_config.temperature is not None and generation_config.temperature != 1.0:
            warpers.append(TemperatureLogitsWarper(generation_config.temperature))
        if generation_config.top_k is not None and generation_config.top_k != 0:
            warpers.append(TopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep))
        if generation_config.top_p is not None and generation_config.top_p < 1.0:
            warpers.append(TopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=min_tokens_to_keep))
        if generation_config.min_p is not None:
            # Applied after temperature scaling (see https://github.com/ggerganov/llama.cpp/pull/3841#issuecomment-2073826084)
            warpers.append(MinPLogitsWarper(min_p=generation_config.min_p, min_tokens_to_keep=min_tokens_to_keep))
        if generation_config.typical_p is not None and generation_config.typical_p < 1.0:
            warpers.append(
                TypicalLogitsWarper(mass=generation_config.typical_p, min_tokens_to_keep=min_tokens_to_keep)
            )
        if generation_config.epsilon_cutoff is not None and 0.0 < generation_config.epsilon_cutoff < 1.0:
            warpers.append(
                EpsilonLogitsWarper(epsilon=generation_config.epsilon_cutoff, min_tokens_to_keep=min_tokens_to_keep)
            )
        if generation_config.eta_cutoff is not None and 0.0 < generation_config.eta_cutoff < 1.0:
            warpers.append(
               EtaLogitsWarper(
                    epsilon=generation_config.eta_cutoff, min_tokens_to_keep=min_tokens_to_keep, device=device
                )
            )
        # `LogitNormalization` should always be the last logit processor, when present
        if generation_config.renormalize_logits is True:
            warpers.append(LogitNormalization())
        return warpers
def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cuda_graph: bool = True,
                         mode = 'normal', force_think: bool = False, chunk_size = 16384, use_flashinfer_mla = False,
                         num_heads = None, head_dim_ckv = None, head_dim_kpe = None, q_head_dim = None,
                         static_cache = None, draft_model=None, draft_cache=None):
    import os
    
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    torch._dynamo.config.suppress_errors = True
    batch_size, seq_length = inputs.shape
    device_map = model.gguf_loader.tensor_device_map
    if use_torch_npu:
        CUR_DEVICE = f"npu:{torch.npu.current_device()}"
        vocabulary_size = model.config.vocab_size
        topp = torch.tensor([[model.generation_config.top_p]], dtype=torch.float16).npu()
        topk = torch.tensor([[model.generation_config.top_k]], dtype=torch.int32).npu()
        temperature = torch.tensor([[model.generation_config.temperature]], dtype=torch.float16).npu()
        next_token_fake = torch.tensor([[1]], dtype=torch.int32).npu()
        next_token_probs = torch.tensor([[1.0]], dtype=torch.float16).npu()
        torch_device = torch.npu.current_device()
    else:
        torch_device = get_device('model.layers.0.self_attn', device_map)
        torch_device = torch_device_mapping[torch_device] if torch_device in torch_device_mapping else torch_device
    inputs = inputs.to(torch_device)
    all_cuda_device = get_all_used_cuda_device(device_map)

    tokens = []

    def decode_one_tokens_npu(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph: bool = True):
        if cuda_graph_runner is None:
            use_cuda_graph = False
        
        inputs_embeds = model.model.embed_tokens(cur_token.to('cpu')).to(torch_device)
        if use_cuda_graph:
            if cuda_graph_runner.model_capture:
                cuda_graph_runner.capture(model, cur_token, position_ids, cache_position, past_key_values, CUR_DEVICE, return_dict=False, use_cache=True)
                cuda_graph_runner.model_capture = False

            ret = cuda_graph_runner(inputs_embeds, position_ids, cache_position)
            logits = ret[0]
            next_token = torch.argmax(logits, dim=-1)
        else:
            torch_npu.npu.set_device(torch_device)
            logits = model(inputs_embeds=inputs_embeds,
                       position_ids=position_ids,
                       cache_position=cache_position,
                       past_key_values=past_key_values,
                       return_dict=False, use_cache=True, is_prefill=False)[0]
        if past_key_values != None:
            past_key_values.change_seq_length(1)

        if generation_config.do_sample:
            logits = logits / temperature
            torch.manual_seed(0)
            probs = logits.view(batch_size, vocabulary_size)
            sm = nn.Softmax(dim=-1)
            probs = sm(probs).half().npu()
            next_token = next_token_fake
            torch_npu._npu_topk_topp_sampling(probs, topk, topp, next_token, next_token_probs)
            next_token = next_token.squeeze(-1)
        else:
            next_token_scores = logits_warper(inputs, logits[:, -1, :])
            next_token = torch.argmax(next_token_scores, dim=-1)
        
        return next_token
            
    
    def decode_one_tokens(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph: bool = True):
        if use_torch_npu:
            return decode_one_tokens_npu(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph)
        if cuda_graph_runner is None:
            use_cuda_graph = False
        if use_cuda_graph:
            logits = cuda_graph_runner(cur_token, position_ids, cache_position)
        else:
            # custom_stream = torch.cuda.Stream()
            if torch.cuda.is_available():
                torch.cuda.set_device(torch_device)
            elif torch.xpu.is_available():
                torch.xpu.set_device(torch_device)
            elif use_torch_npu:
                torch_npu.set_device(torch_device)
            else:
                raise RuntimeError(f"The device: {torch_device} is not available")
            inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to(torch_device)
            # with torch.cuda.stream(custom_stream):
            logits=model(inputs_embeds=inputs_embeds,
                        position_ids=position_ids,
                        cache_position=cache_position,
                        past_key_values=past_key_values,
                        return_dict=False, use_cache=True)[0]
        if past_key_values != None and isinstance(past_key_values, StaticCache):
            past_key_values.change_seq_length(1)
        sync_all_device(all_cuda_device)
        next_token_scores = logits_warper(inputs, logits[:, -1, :])
        if generation_config.do_sample:
            probs = nn.functional.softmax(next_token_scores, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
        else:
            next_token = torch.argmax(next_token_scores, dim=-1)
        return next_token

    # TODO: use CUDA Graph for chunk prefill, may get small improvement
    def chunk_prefill(inputs, cache_position, past_key_values):
        if mode == "long_context":
            inputs_embeds = model.model.embed_tokens(inputs.to("cpu"))
        else:
            inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
            # inputs_embeds = torch_npu.npu_format_cast_(inputs_embeds, 29)
        if use_flashinfer_mla:
            MLAWrapperSingleton.update_buffer(past_key_values.max_pages)
            MLAWrapperSingleton.need_plan_all()

        ret = model(
            inputs_embeds = inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True, is_prefill=True
        )
        logits = ret[0][:,-1,:].unsqueeze(0).clone().to(torch_device)

        return logits

    def decode_wrapper(next_token, position_ids, cache_position, cuda_graph_runner, past_key_values, inputs, seq_length, prof=None):
        global warm_uped
        global _USE_NPU_GRAPH
        if use_cuda_graph:
            from ktransformers.util.npu_graph_runner import get_or_create_runner
            npu_graph_runner = get_or_create_runner(CUR_DEVICE)
            npu_graph_runner.init(batch_size, seq_length)
            
            with torch_npu.npu.stream(npu_graph_runner.main_stream):
                gen_num_tokens = 1
                while gen_num_tokens < max_new_tokens:
                    start_time = timeStat.record_start_time()
                    if use_flashinfer_mla:
                        MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,None,
                                                    num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size,
                                                    model.model.layers[0].self_attn.softmax_scale, torch.bfloat16, torch.bfloat16)
                    if gen_num_tokens == 1:
                        warm_uped = True
                        _USE_NPU_GRAPH = True
                        #np_graph_runner.capture(model, draft_model, next_token, torch.tensor(draft_token), position_ids, cache_position, past_key_values, draft_cache, torch_device, return_dict=False, use_cache=True)
                        cuda_graph_runner = npu_graph_runner
                    next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph)
                    next_token = next_token.to(torch_device)
                    inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
                    generated_ids[:, cache_position] = next_token.int()
                    tokens.append(int(next_token))
                    
                    seq_length += 1

                    if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>':
                        print(stream.end(), end="", flush=True)
                        break
                    else:
                        if torch.distributed.get_rank() % get_tensor_parallel_size() == 0:
                            print(stream.put(next_token.item()), end="", flush=True)

                    cache_position += 1
                    past_key_values.position[0] += 1
                    position_ids = cache_position.unsqueeze(0)
                    gen_num_tokens += 1
                    
                    if prof is not None:
                        prof.step()

                npu_graph_runner.destroy()
                _USE_NPU_GRAPH = False
        else:
            gen_num_tokens = 1
            while gen_num_tokens < max_new_tokens:
                if use_flashinfer_mla:
                    MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,None,
                                                num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size,
                                                model.model.layers[0].self_attn.softmax_scale, torch.bfloat16, torch.bfloat16)
                next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph)
                next_token = next_token.to(torch_device)
                inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
                generated_ids[:, cache_position] = next_token.int()
                tokens.append(int(next_token))
                seq_length += 1

                if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>':
                    print(stream.end(), end="", flush=True)
                    break
                else:
                    if torch.distributed.get_rank() % get_tensor_parallel_size() == 0:
                        print(stream.put(next_token.item()), end="", flush=True)

                cache_position += 1
                past_key_values.position[0] += 1
                position_ids = cache_position.unsqueeze(0)
                gen_num_tokens += 1

                if prof is not None:
                    prof.step()
        
        if prof is not None:
            prof.stop()
    
    if torch.cuda.is_available():
        torch.cuda.set_device(torch_device)
    elif torch.xpu.is_available():
        torch.xpu.set_device(torch_device)
    elif use_torch_npu:
        torch_npu.set_device(torch_device)
    else:
        raise RuntimeError(f"The device: {torch_device} is not available")

    with torch.no_grad():

        stream = TextStreamer(tokenizer)
        if torch.xpu.is_available():
            from ipex_llm.transformers.kv import DynamicUnbalancedFp8Cache, DynamicNormalCache
            if model.config.architectures[0] in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
                past_key_values = DynamicUnbalancedFp8Cache.from_legacy_cache(None)
            else:
                past_key_values = DynamicNormalCache.from_legacy_cache(None)
        elif use_torch_npu and static_cache:
            assert isinstance(static_cache, StaticCache), '[ERROR] static_cache format not equal to StaticCache'
            past_key_values = static_cache
            if past_key_values.max_batch_size < batch_size or past_key_values.max_cache_len < seq_length + max_new_tokens:
                print('[WARN] current staticCache size exceeded, try create new staticCache...')
                past_key_values = StaticCache(
                    config=model.config, max_batch_size=1, max_cache_len=seq_length + max_new_tokens, device=device_map, dtype=model.dtype
                )
            else:
                past_key_values.reset()
        elif mode != 'long_context':
            past_key_values = StaticCache(
                config = model.config, max_batch_size = 1, max_cache_len = seq_length + max_new_tokens, device = device_map, dtype = model.dtype
            )
        else:
            past_key_values = None

        generation_config, model_kwargs = model._prepare_generation_config(
            None, do_sample=False
            # change this to modify generate config
            #top_k=5, top_p=0.85, temperature=0.1
        )
        
        logits_warper = tf_logits_warper(generation_config)

        cache_position = torch.arange(seq_length, device=torch_device, dtype=torch.int32)
        if use_torch_npu:
            past_key_values.position[0] = seq_length + 1
        generated_ids = torch.zeros(
            batch_size, seq_length + max_new_tokens + 1, dtype=torch.int, device=torch_device
        )
        generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
        start_time = time.time()
        logits = None

        def prefill_wrapper(prof=None):
            nonlocal logits
            chunk_start = 0
            while chunk_start < seq_length:
                chunk_end = min(chunk_start + chunk_size, seq_length)
                if past_key_values != None:
                    past_key_values.cur_idx=cache_position[chunk_start:chunk_end]
                logits = chunk_prefill(inputs[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end], past_key_values)
                chunk_start += chunk_size
                if prof is not None:
                    prof.step()
            if prof is not None:
                prof.stop()
            if logits is None:
                raise ValueError('logits cannot be None')

        if use_torch_npu:
            global WARM_UP_SKIP_CNT
            prof_prefill = os.environ["PROF_PREFILL"] if "PROF_PREFILL" in os.environ else "0"
            if prof_prefill == "1" and WARM_UP_SKIP_CNT[0] <= 0:
                experimental_config = torch_npu.profiler._ExperimentalConfig(
                    aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization,
                    profiler_level=torch_npu.profiler.ProfilerLevel.Level1, l2_cache=False
                )
                with torch_npu.profiler.profile(
                        activities=[
                            torch_npu.profiler.ProfilerActivity.CPU,
                            torch_npu.profiler.ProfilerActivity.NPU
                        ],
                        schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=8, repeat=1, skip_first=0),
                        on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./prefill_prof"),
                        record_shapes=True,
                        profile_memory=True,
                        with_stack=False,
                        with_flops=False,
                        with_modules=False,
                        experimental_config=experimental_config) as prof:
                    prefill_wrapper(prof)
            else:
                prefill_wrapper()
            WARM_UP_SKIP_CNT[0] -= 1
        else:

            chunk_start = 0
            while chunk_start < seq_length:
                chunk_end = min(chunk_start + chunk_size, seq_length)
                if past_key_values != None:
                    past_key_values.cur_idx=cache_position[chunk_start:chunk_end]
                logits = chunk_prefill(inputs[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end], past_key_values)
                chunk_start += chunk_size

        next_token_scores = logits_warper(inputs, logits[:, -1, :])
        if generation_config.do_sample:
            probs = nn.functional.softmax(next_token_scores, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
        else:
            next_token = torch.argmax(next_token_scores, dim=-1)

        first_token_time = time.time() - start_time

        # print(f"------------------------------------- prefill next_token {next_token}  draft_token {draft_token} ")
        if use_flashinfer_mla:
            MLAWrapperSingleton.reset_buffer()

        prefill_count = seq_length
        prefill_time = first_token_time
        if use_torch_npu and torch.distributed.get_rank() % get_tensor_parallel_size() == 0:
            if force_think:
                print("<think>")
            print(stream.put(next_token.item()), end="", flush=True)
        elif not use_torch_npu:
            if force_think:
                print("<think>")
            print(stream.put(next_token.item()), end="", flush=True)

        generated_ids[:, seq_length] = next_token
        tokens.append(int(next_token))
        inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
        cache_position = torch.tensor([seq_length], device=torch_device, dtype=torch.int32)
        position_ids = cache_position.unsqueeze(0)
        seq_length += 1
        
        cuda_graph_runner = None
        
        start_time = time.time()

        if not use_torch_npu:
            for i in range(1, max_new_tokens):
                if use_flashinfer_mla:
                    MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,None,
                                                num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size,
                                                model.model.layers[0].self_attn.softmax_scale, torch.bfloat16, torch.bfloat16)
                global warm_uped
                if use_cuda_graph and ( (warm_uped == True and int(i) == 1) or (warm_uped == False and int(i) == 2) ):
                    warm_uped = True
                    cuda_graph_runner = CUDAGraphRunner()
                    cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, torch_device, return_dict=False, use_cache=True)
                next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph).to(torch_device)
                inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
                generated_ids[:, cache_position] = next_token.int()
                tokens.append(int(next_token))
                seq_length += 1
                
                if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>':
                    print(stream.end(), end="", flush=True)
                    break
                else:
                    print(stream.put(next_token.item()), end="", flush=True)
                cache_position += 1
                position_ids = cache_position.unsqueeze(0)
        else:
            prof_decode = os.environ["PROF_DECODE"] if "PROF_DECODE" in os.environ else "0"
            prof_ranks = os.environ["PROF_RANK"] if "PROF_RANK" in os.environ else "0"
            prof_ranks = [int(r.strip()) for r in prof_ranks.split(",")]
            if prof_decode == "1" and torch.distributed.get_rank() in prof_ranks and WARM_UP_SKIP_CNT[1] <= 0:
                experimental_config = torch_npu.profiler._ExperimentalConfig(
                    aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization,
                    profiler_level=torch_npu.profiler.ProfilerLevel.Level1, l2_cache=False
                )
                with torch_npu.profiler.profile(
                        activities=[
                            torch_npu.profiler.ProfilerActivity.CPU,
                            torch_npu.profiler.ProfilerActivity.NPU
                        ],
                        schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=_MAX_DECODE_PROFILE, repeat=1, skip_first=0),
                        on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./decode_prof"),
                        record_shapes=True,
                        profile_memory=True,
                        with_stack=False,
                        with_flops=False,
                        with_modules=False,
                        experimental_config=experimental_config) as prof:
                    decode_wrapper(next_token, position_ids, cache_position, cuda_graph_runner, past_key_values, inputs, seq_length, prof)
            else:
                decode_wrapper(next_token, position_ids, cache_position, cuda_graph_runner, past_key_values, inputs, seq_length)
            WARM_UP_SKIP_CNT[1] -= 1 

    total_time = time.time() - start_time
    tokens_generated = len(tokens)
    tokens_per_second = tokens_generated / total_time

    if not use_torch_npu:
        print("")

        print(f"prompt eval count:    {prefill_count} token(s)")
        print(f"prompt eval duration: {prefill_time}s")
        print(f"prompt eval rate:     {prefill_count/prefill_time} tokens/s")
        print(f"eval count:           {tokens_generated} token(s)")
        print(f"eval duration:        {total_time}s")
        print(f"eval rate:            {tokens_per_second} tokens/s")
    else:
        tp_size = get_tensor_parallel_size()
        if torch.distributed.get_rank() % tp_size == 0:
            rank = f"[rank:{torch.distributed.get_rank()}]"
            msg = f"\n{rank} Eval Time\n"
            msg += rank + f"prompt eval count:    {prefill_count} token(s)\n"
            msg += rank + f"prompt eval duration: {prefill_time:.9f}s\n"
            msg += rank + f"prompt eval rate:     {prefill_count/prefill_time:.9f} tokens/s\n"
            msg += rank + f"eval count:           {tokens_generated} token(s)\n"
            msg += rank + f"eval duration:        {total_time:.9f}s\n"
            msg += rank + f"eval rate:            {tokens_per_second:.9f} tokens/s\n"
            print(msg)

    return tokens

class InferenceState(enum.Enum):
    UNLOAD = 0
    PREFILL = 1
    GENERATE = 2
    RESTORE = 3


================================================
FILE: archive/ktransformers/util/vendors.py
================================================
from __future__ import annotations

from enum import IntEnum, auto
from typing import Optional, Union, List
import torch

class GPUVendor(IntEnum):
    NVIDIA = auto()
    AMD = auto()
    MooreThreads = auto()
    MetaX = auto()
    MUSA = auto()
    Unknown = auto()

class DeviceManager:
    """
    Device manager that provides a unified interface for handling different GPU vendors
    """
    def __init__(self):
        self.gpu_vendor = self._detect_gpu_vendor()
        self.available_devices = self._get_available_devices()
    
    def _detect_gpu_vendor(self) -> GPUVendor:
        """Detect GPU vendor type"""
        if not torch.cuda.is_available():
            # Check MUSA availability (assuming a musa module exists)
            try:
                import musa
                if musa.is_available():
                    return GPUVendor.MUSA
            except (ImportError, AttributeError):
                pass
            
            return GPUVendor.Unknown
        
        device_name = torch.cuda.get_device_name(0).lower()
        
        if any(name in device_name for name in ["nvidia", "geforce", "quadro", "tesla", "titan", "rtx", "gtx"]):
            return GPUVendor.NVIDIA
        elif any(name in device_name for name in ["amd", "radeon", "rx", "vega", "instinct", "firepro", "mi"]):
            return GPUVendor.AMD
        elif any(name in device_name for name in ["mthreads", "moore", "mtt"]):
            return GPUVendor.MooreThreads
        elif any(name in device_name for name in ["metax", "meta"]):
            return GPUVendor.MetaX
        elif "musa" in device_name:
            return GPUVendor.MUSA
        
        # Backend check
        try:
            if hasattr(torch.version, 'hip') and torch.version.hip is not None:
                return GPUVendor.AMD
            elif hasattr(torch.version, 'cuda') and torch.version.cuda is not None:
                return GPUVendor.NVIDIA
        except:
            pass
            
        return GPUVendor.Unknown
    
    def _get_available_devices(self) -> List[int]:
        """Get list of available device indices"""
        devices = []
        
        if self.gpu_vendor == GPUVendor.NVIDIA or self.gpu_vendor == GPUVendor.AMD:
            devices = list(range(torch.cuda.device_count()))
        elif self.gpu_vendor == GPUVendor.MUSA:
            try:
                import musa
                devices = list(range(musa.device_count()))
            except (ImportError, AttributeError):
                pass
            
        return devices
    
    def get_device_str(self, device_id: Union[int, str]) -> str:
        """
        Get device string for the given device ID
        
        Args:
            device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
            
        Returns:
            Device string representation (e.g., "cuda:0", "musa:1", "cpu")
        """
        if device_id == -1 or device_id == "cpu":
            return "cpu"
            
        if isinstance(device_id, int):
            if self.gpu_vendor == GPUVendor.NVIDIA or self.gpu_vendor == GPUVendor.AMD:
                if device_id < torch.cuda.device_count():
                    return f"cuda:{device_id}"
            elif self.gpu_vendor == GPUVendor.MUSA:
                try:
                    import musa
                    if device_id < musa.device_count():
                        return f"musa:{device_id}"
                except (ImportError, AttributeError):
                    pass
        
        return "cpu"
    
    def to_torch_device(self, device_id: Union[int, str] = 0) -> torch.device:
        """
        Convert device ID to torch.device object
        
        Args:
            device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
            
        Returns:
            torch.device object
        """
        device_str = self.get_device_str(device_id)
        
        # Handle MUSA device
        if device_str.startswith("musa:"):
            try:
                import musa
                index = int(device_str.split(":")[-1])
                return musa.device(index)
            except (ImportError, ValueError, AttributeError):
                return torch.device("cpu")
        
        # Standard PyTorch device
        return torch.device(device_str)
    
    def move_tensor_to_device(self, tensor: torch.Tensor, device_id: Union[int, str] = 0) -> torch.Tensor:
        """
        Move tensor to specified device
        
        Args:
            tensor: PyTorch tensor to move
            device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
            
        Returns:
            Tensor moved to the specified device
        """
        device = self.to_torch_device(device_id)
        return tensor.to(device)
    
    def is_available(self, index: int = 0) -> bool:
        """
        Check if device at specified index is available
        
        Args:
            index: Device index to check
            
        Returns:
            True if the device is available, False otherwise
        """
        if index < 0:
            return True  # CPU is always available
            
        return index in self.available_devices
    
    def get_all_devices(self) -> List[int]:
        """
        Get all available device indices
        
        Returns:
            List of available device indices (0, 1, 2, etc.)
        """
        return self.available_devices

# Create global device manager instance
device_manager = DeviceManager()

# Convenience functions
def get_device(device_id: Union[int, str] = 0) -> torch.device:
    """
    Get torch.device object for the specified device ID
    
    Args:
        device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
        
    Returns:
        torch.device object
    """
    return device_manager.to_torch_device(device_id)

def to_device(tensor: torch.Tensor, device_id: Union[int, str] = 0) -> torch.Tensor:
    """
    Move tensor to specified device
    
    Args:
        tensor: PyTorch tensor to move
        device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
        
    Returns:
        Tensor moved to the specified device
    """
    return device_manager.move_tensor_to_device(tensor, device_id)

# Get devices
cpu_device = get_device(-1)        # CPU using index -1
cpu_device2 = get_device("cpu")    # CPU using string "cpu"
gpu0 = get_device(0)               # First GPU

# Move tensors
x = torch.randn(3, 3)
x_gpu = to_device(x, 0)            # Move to first GPU
x_cpu1 = to_device(x, -1)          # Move to CPU using index -1
x_cpu2 = to_device(x, "cpu")       # Move to CPU using string "cpu"

================================================
FILE: archive/ktransformers/util/weight_loader.py
================================================
from abc import ABC, abstractmethod
import os
import torch
import numpy as np
from safetensors import safe_open
from typing import Dict, Any, Optional, Union

class ModelLoader(ABC):
    """
    Abstract base class for model loaders.
    Defines the interface that all model loaders must implement.
    """
    
    @abstractmethod
    def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
        """
        Load a tensor by name.
        
        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to
            
        Returns:
            The loaded tensor
        """
        pass
    
    @classmethod
    @abstractmethod
    def supports_format(cls, path: str) -> bool:
        """
        Check if this loader supports the given path format.
        
        Args:
            path: Path to check
            
        Returns:
            True if this loader supports the given path, False otherwise
        """
        pass


class SafeTensorLoader(ModelLoader):
    """
    Loader for SafeTensor format models.
    """
    
    def __init__(self, path: str):
        """
        Initialize the SafeTensor loader.
        
        Args:
            path: Path to the model directory or file
        """
        self.tensor_file_map = {}  # Maps tensor names to file paths
        self.file_handle_map = {}  # Maps file names to file handles
        self._load_tensor_file_map(path)
    
    def _load_tensor_file_map(self, path: str) -> None:
        """
        Load the tensor file map from the given path.
        
        Args:
            path: Path to the model directory or file
        """
        # Normalize path to directory
        if not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")
        if os.path.isfile(path):
            folder_path = os.path.dirname(path)
        else:
            folder_path = path

        found_safetensor = False
        for root, _, files in os.walk(folder_path):
            files = sorted(files)
            for file in files:
                if file.endswith(".safetensors"):
                    found_safetensor = True
                    file_path = os.path.join(root, file)
                    if file not in self.file_handle_map:
                        try:
                            handle = safe_open(file_path, framework="pt")
                            self.file_handle_map[file] = handle
                        except Exception as e:
                            print(f"Error opening Safetensor file {file_path}: {e}")
                            continue

                    f = self.file_handle_map.get(file)
                    if f is None:
                        continue
                    try:
                        for key in f.keys():
                            self.tensor_file_map[key] = file
                    except Exception as e:
                        print(f"Error reading Safetensor file {file_path}: {e}")

        if not found_safetensor:
            # Not raising an error here allows for the factory to try other loaders
            print(f"No Safetensor files found in {folder_path}")
    
    def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
        """
        Load a tensor by name.
        
        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to
            
        Returns:
            The loaded tensor
        """
        if name not in self.tensor_file_map:
            raise KeyError(f"Key {name} not found in Safetensor files")
        file = self.tensor_file_map[name]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(name)
        return tensor.to(device)
    
    def load_dequantized_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
        """
        Load and dequantize a tensor.
        
        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to
            
        Returns:
            The dequantized tensor
        """
        if name not in self.tensor_file_map:
            raise KeyError(f"Key {name} not found in Safetensor files")
        file = self.tensor_file_map[name]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(name).to(device)
        if name.endswith(".weight"):
            if name[:-7] + ".weight_scale_inv" in self.tensor_file_map:
                weight_scale_inv = f.get_tensor(name[:-7] + ".weight_scale_inv").to(device)
                # Assuming weight_dequant function is imported
                from ktransformers.ktransformers_ext.triton.fp8gemm import weight_dequant
                tensor = weight_dequant(tensor, weight_scale_inv)
        return tensor.to(device)
    
    def close_all_handles(self) -> None:
        """
        Close all file handles.
        """
        for handle in self.file_handle_map.values():
            handle.close()
        self.file_handle_map.clear()

    @classmethod
    def supports_format(cls, path: str) -> bool:
        """
        Check if this loader supports the given path format.
        
        Args:
            path: Path to check
            
        Returns:
            True if safetensor files are found in the path, False otherwise
        """
        # Normalize path to directory
        if not os.path.exists(path):
            return False
        if os.path.isfile(path):
            if path.endswith(".safetensors"):
                return True
            folder_path = os.path.dirname(path)
        else:
            folder_path = path
            
        # Check if any safetensor files exist in the folder
        for root, _, files in os.walk(folder_path):
            for file in files:
                if file.endswith(".safetensors"):
                    return True
        return False


class GGUFLoader(ModelLoader):
    """
    Loader for GGUF format models.
    """
    
    def __init__(self, path: str):
        """
        Initialize the GGUF loader.
        
        Args:
            path: Path to the model directory or file
        """
        # Check if path exists
        if not os.path.exists(path):
            raise FileNotFoundError(f"GGUF dir not found: {path}")
        if os.path.isfile(path):
            self.gguf_path = os.path.dirname(path)
        else:
            self.gguf_path = path
            
        self.tensor_info = {}  # Stores tensor metadata
        self.tensor_file_map = {}  # Maps tensor names to file paths
        self.file_data_map = {}  # Maps file paths to memory-mapped data
        self.gguf_file_meta = {}  # Stores GGUF metadata
        
        # For compatibility with the factory pattern
        self.safetensor_loader = None
        
        # Scan all GGUF files in the directory
        found_gguf = False
        for root, _, files in os.walk(self.gguf_path):
            for file in files:
                if file.endswith(".gguf"):
                    found_gguf = True
                    file_path = os.path.join(root, file)
                    with open(file_path, "rb") as f:
                        self._load_gguf(f)
                        if file_path not in self.file_data_map:
                            self.file_data_map[file_path] = np.memmap(file_path, mode='r')
        
        if not found_gguf:
            raise FileNotFoundError(f"Cannot find any .gguf files in: {self.gguf_path}")
    
    def _load_gguf(self, f) -> None:
        """
        Load GGUF file metadata and tensor info.
        
        Args:
            f: File handle of the GGUF file
        """
        # Implementation should follow the original GGUFLoader._load_gguf
        # This is a simplified version for illustration
        f.seek(0)
        assert f.read(4) == b'GGUF'
        
        # Read header
        values = struct.unpack("<IQQ", f.read(4+8+8))
        version, n_tensors, n_kv = values
        if version != 3:
            warnings.warn(f"Version {version} has never been tested, might not work")

        # Read key-value pairs
        info = {}
        for _ in range(n_kv):
            name = self._read_value(f, 8)  # DATA_TYPES["string"]
            data_type = struct.unpack("<I", f.read(4))[0]
            info[name] = self._read_value(f, data_type)

        # Read tensor info
        tensor_info = {}
        for _ in range(n_tensors):
            name = self._read_value(f, 8)  # DATA_TYPES["string"]
            shape_len = self._read_value(f, 4)  # DATA_TYPES["uint32"]
            shape = [self._read_value(f, 10) for _ in range(shape_len)]  # DATA_TYPES["uint64"]
            ggml_type = self._read_value(f, 4)  # DATA_TYPES["uint32"]
            offset = self._read_value(f, 10)  # DATA_TYPES["uint64"]
            
            # Additional tensor metadata would be calculated here
            # For brevity, we're omitting the detailed tensor metadata calculation
            tensor_info[name] = {
                "ggml_type": ggml_type,
                "shape": shape,
                "offset": offset,
                # ... other tensor metadata
            }
            
        start = f.tell()
        alignment = info.get("general.alignment", 32)
        
        # Calculate actual file offsets
        for t in tensor_info.values():
            offset = start + t["offset"]
            offset += (alignment - offset % alignment) % alignment
            t["offset"] = offset
            
        # Update file maps
        for name in tensor_info:
            self.tensor_file_map[name] = f.name
            
        self.tensor_info.update(tensor_info)
        self.gguf_file_meta.update(info)
    
    def _read_value(self, f, data_type) -> Any:
        """
        Read a value from the file according to its data type.
        
        Args:
            f: File handle
            data_type: Type of data to read
            
        Returns:
            The read value
        """
        # Simplified implementation
        # In a complete implementation, this would handle all data types
        if data_type == 8:  # DATA_TYPES["string"]
            length = struct.unpack("<Q", f.read(8))[0]
            return f.read(length).decode("utf-8")
        elif data_type == 4:  # DATA_TYPES["uint32"]
            return struct.unpack("<I", f.read(4))[0]
        elif data_type == 10:  # DATA_TYPES["uint64"]
            return struct.unpack("<Q", f.read(8))[0]
        # ... handling for other data types
        return None
    
    def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
        """
        Load a tensor by name.
        
        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to
            
        Returns:
            The loaded tensor
        """
        # This should call load_gguf_tensor with the appropriate parameters
        return self.load_gguf_tensor(name, device)
    
    def load_gguf_tensor(self, name: str, device: str = "cpu", target_dtype = None) -> torch.Tensor:
        """
        Load a GGUF tensor by name.
        
        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to
            target_dtype: Target data type for the tensor
            
        Returns:
            The loaded tensor
        """
        # Implementation would follow the original GGUFLoader.load_gguf_tensor
        # This is a placeholder for illustration
        if name not in self.tensor_info:
            raise KeyError(f"Tensor {name} not found")
            
        # Actual implementation would dequantize the tensor data
        # and return a torch.Tensor
        return torch.zeros(1, device=device)  # Placeholder
    
    @classmethod
    def supports_format(cls, path: str) -> bool:
        """
        Check if this loader supports the given path format.
        
        Args:
            path: Path to check
            
        Returns:
            True if GGUF files are found in the path, False otherwise
        """
        # Normalize path to directory
        if not os.path.exists(path):
            return False
        if os.path.isfile(path):
            return path.endswith(".gguf")
        
        # Check if any GGUF files exist in the folder
        for root, _, files in os.walk(path):
            for file in files:
                if file.endswith(".gguf"):
                    return True
        return False

================================================
FILE: archive/ktransformers/website/.browserslistrc
================================================
> 1%
last 2 versions
not dead
not ie 11


================================================
FILE: archive/ktransformers/website/.eslintrc.js
================================================
module.exports = {
  root: true,
  env: {
    node: true
  },
  'extends': [
    'plugin:vue/vue3-essential',
    'eslint:recommended',
    '@vue/typescript/recommended'
  ],
  parserOptions: {
    ecmaVersion: 2020
  },
  rules: {
    'no-console': process.env.NODE_ENV === 'production' ? 'warn' : 'off',
    'no-debugger': process.env.NODE_ENV === 'production' ? 'warn' : 'off'
  },
  overrides: [
    {
      files: [
        '**/__tests__/*.{j,t}s?(x)',
        '**/tests/unit/**/*.spec.{j,t}s?(x)'
      ],
      env: {
        jest: true
      }
    }
  ]
}


================================================
FILE: archive/ktransformers/website/.gitignore
================================================
.DS_Store
node_modules
/dist


# local env files
.env.local
.env.*.local

# Log files
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*

# Editor directories and files
.idea
.vscode
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?


================================================
FILE: archive/ktransformers/website/README.md
================================================
# 

## Project setup
```
npm install
```

### Compiles and hot-reloads for development
```
npm run serve
```

### Compiles and minifies for production
```
npm run build
```

### Run your unit tests
```
npm run test:unit
```

### Lints and fixes files
```
npm run lint
```

### Customize configuration
See [Configuration Reference](https://cli.vuejs.org/config/).


================================================
FILE: archive/ktransformers/website/config.d.ts
================================================
declare module '*.js' {
    const config: {
      apiUrl: string;
      port:number;
    };
    export { config };
  }

================================================
FILE: archive/ktransformers/website/jest.config.js
================================================
module.exports = {
  preset: '@vue/cli-plugin-unit-jest/presets/typescript'
}


================================================
FILE: archive/ktransformers/website/package.json
================================================
{
  "name": "",
  "version": "",
  "private": true,
  "scripts": {
    "serve": "vue-cli-service serve",
    "build": "vue-cli-service build",
    "test:unit": "vue-cli-service test:unit",
    "lint": "vue-cli-service lint"
  },
  "dependencies": {
    "@types/pdfjs-dist": "^2.10.378",
    "@types/websocket": "^1.0.10",
    "@vue/cli": "^5.0.8",
    "ant-design-vue": "^4.2.1",
    "apexcharts": "^3.49.1",
    "axios": "^1.7.0",
    "axios-extensions": "^3.1.6",
    "better-scroll": "^2.5.1",
    "element-plus": "^2.7.3",
    "marked": "^12.0.2",
    "marked-highlight": "^2.1.1",
    "pdf-lib": "^1.17.1",
    "pdfobject": "^2.3.0",
    "v-clipboard": "^3.0.0-next.1",
    "vue": "^3.4.27",
    "vue-i18n": "^9.13.1",
    "vue-pdf": "^4.3.0",
    "vue-router": "^4.0.3",
    "vue3-apexcharts": "^1.5.3",
    "vuex": "^4.0.0",
    "webpack": "^5.91.0",
    "webpack-cli": "^5.1.4",
    "websocket": "^1.0.35"
  },
  "devDependencies": {
    "@types/jest": "^27.0.1",
    "@types/pdfobject": "^2.2.5",
    "@typescript-eslint/eslint-plugin": "^5.4.0",
    "@typescript-eslint/parser": "^5.4.0",
    "@vue/cli-plugin-eslint": "~5.0.0",
    "@vue/cli-plugin-router": "~5.0.0",
    "@vue/cli-plugin-typescript": "~5.0.0",
    "@vue/cli-plugin-unit-jest": "~5.0.0",
    "@vue/cli-plugin-vuex": "~5.0.0",
    "@vue/cli-service": "~5.0.0",
    "@vue/eslint-config-typescript": "^9.1.0",
    "@vue/test-utils": "^2.0.0-0",
    "@vue/vue3-jest": "^27.0.0-alpha.1",
    "babel-jest": "^27.0.6",
    "eslint": "^7.32.0",
    "eslint-plugin-vue": "^8.0.3",
    "jest": "^27.0.5",
    "stylus": "^0.55.0",
    "stylus-loader": "^6.1.0",
    "ts-jest": "^27.0.4",
    "typescript": "~4.5.5"
  },
  "_id": "@",
  "readme": "ERROR: No README data found!"
}


================================================
FILE: archive/ktransformers/website/public/config.js
================================================
window.configWeb = {
    apiUrl: 'http://119.255.238.12:15670/v1',
    port: 8080,
  };

================================================
FILE: archive/ktransformers/website/public/css/reset.css
================================================
html, body, div, span, applet, object, iframe,
h1, h2, h3, h4, h5, h6, p, blockquote, pre,
a, abbr, acronym, address, big, cite, code,
del, dfn, em, img, ins, kbd, q, s, samp,
small, strike, strong, sub, sup, tt, var,
b, u, i, center,
dl, dt, dd, ol, ul, li,
fieldset, form, label, legend,textarea,
table, caption, tbody, tfoot, thead, tr, th, td,
article, aside, canvas, details, embed,
figure, figcaption, footer, header, hgroup,
menu, nav, output, ruby, section, summary,
time, mark, audio, video {
    margin: 0;
    padding: 0;
    border: 0;
    font-size: 100%;
    *font: inherit;
    font-family: Arial, Microsoft YaHei, SimHei, Tahoma, sans-serif !important;
    vertical-align: baseline;
}
/* HTML5 display-role reset for older browsers */
article, aside, details, figcaption, figure,
footer, header, hgroup, menu, nav, section {
    display: block;
}
body {
    line-height: 1;
    -webkit-text-size-adjust: 100%!important;
    margin: 0;
}
html,body {
    height: 100%;
    width: 100%;
    overflow: hidden;
}
ol, ul {
    list-style: none;
}
blockquote, q {
    quotes: none;
}
blockquote:before, blockquote:after,
q:before, q:after {
    content: '';
    content: none;
}
table {
    border-collapse: collapse;
    border-spacing: 0;
}

.clearfix:before,
.clearfix:after {
    content:"";
    display:table
}
.clearfix:after {
    clear:both
}

/*显示省略号*/
.ellipsis{
    overflow: hidden;
    text-overflow: ellipsis;
    white-space: nowrap;
}


================================================
FILE: archive/ktransformers/website/public/index.html
================================================
<!DOCTYPE html>
<html lang="">
  <head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=1.0,minimum-scale=1.0,user-scalable=no">
    <script src="./config.js"></script>
    <link rel="icon" href="./balck.ico" />
    <link type="text/css" rel="stylesheet" href="<%= BASE_URL %>/css/reset.css">
    <title>KTransformers</title>
  </head>
  <body onselectstart='return false' onselect='return false'>
    <noscript>
      <strong>We're sorry but <%= htmlWebpackPlugin.options.title %> doesn't work properly without JavaScript enabled. Please enable it to continue.</strong>
    </noscript>
    <div id="app"></div>
    <!-- built files will be auto injected -->
  </body>
</html>


================================================
FILE: archive/ktransformers/website/src/App.vue
================================================
<template>
  <div class="app-container" @contextmenu.prevent.stop="">
    <keep-alive>
      <router-view/>
    </keep-alive>
  </div>
</template>

<script setup lang="ts">
</script>

<style lang="stylus">
  @import "assets/iconfont/iconfont.css"
  #app
  .app-container
    width: 100%
    height: 100%
    position: relative
</style>

================================================
FILE: archive/ktransformers/website/src/api/api-client.ts
================================================
import axios, { AxiosInstance } from 'axios';
import {baseURL} from '@/conf/config';
const apiClient: AxiosInstance = axios.create({
    baseURL: baseURL,
    // baseURL: '/api',
    headers: {
        'Content-Type': 'application/json',
    },
    withCredentials: true,
});
export default apiClient;


================================================
FILE: archive/ktransformers/website/src/api/assistant.ts
================================================
import apiClient from './api-client';
import { IAssistant,IDeleteResult, IAssistantWithStatus } from '../utils/types';
function filterAndConvert(
    assistantsWithStatus: IAssistantWithStatus[],
    statusCondition: string
  ): IAssistant[] {
    return assistantsWithStatus
      .filter((assistant) => assistant.build_status.status === statusCondition)
      .map(({ build_status, ...rest }) => rest);
  }

interface IAssistantData {
    model: string;
    prefix_system_prompt?: string;
    suffix_system_prompt?: string;
    name?: string;
    description?: string;
    tools?: any[];
    tool_resources?: object;
    metadata?:{[key:string]:any}
    top_p?: number;
    temperature?: number;
    response_format?: string;
    instructions?: string;
}

export const createAssistant = async (data: IAssistantData): Promise<IAssistant> => {
    const assistant_data: {
        model: string;
        instructions?: string;
        name?: string;
        description?: string;
        tools?: any[];
        tool_resources?: object;
        metadata?:{[key:string]:any}
        top_p?: number;
        temperature?: number;
        response_format?: string;
    } = {
        model: data.model
    };

    if (data.prefix_system_prompt) {
        assistant_data.instructions = data.prefix_system_prompt;
    }
    if (data.suffix_system_prompt) {
        assistant_data.instructions = data.suffix_system_prompt;
    }
    if (data.name) {
        assistant_data.name = data.name;
    }
    if (data.description) {
        assistant_data.description = data.description;
    }
    if (data.tools) {
        assistant_data.tools = data.tools;
    }
    if (data.tool_resources) {
        assistant_data.tool_resources = data.tool_resources;
    }
    if (data.metadata) {
        assistant_data.metadata = data.metadata
    }
    if (typeof data.top_p !== 'undefined') {
        assistant_data.top_p = data.top_p;
    }
    if (typeof data.temperature !== 'undefined') {
        assistant_data.temperature = data.temperature;
    }
    if (data.response_format) {
        assistant_data.response_format = data.response_format;
    }
    if (data.instructions) {
        assistant_data.instructions = data.instructions;
    }
    console.log(assistant_data)
    const response = await apiClient.post<IAssistant>(
        '/assistants/',
        assistant_data
    );
    console.log("response", response)
    return response.data;
};


export const listAssistants = async (
    limit?: number,
    order?: string,
    after?: string,
    before?: string,
    run_id?: string,
): Promise<IAssistant[]> => {
    const params: {
        limit?: number,
        order?: string,
        after?: string,
        before?: string,
        run_id?: string
    } = {};

    if (typeof limit !== 'undefined') {
        params.limit = limit;
    }
    if (typeof order !== 'undefined') {
        params.order = order;
    }
    if (typeof after !== 'undefined') {
        params.after = after;
    }
    if (typeof before !== 'undefined') {
        params.before = before;
    }
    if (typeof run_id !== 'undefined') {
        params.run_id = run_id;
    }
    const response = await apiClient.get<IAssistantWithStatus[]>('/assistants/status', {
        params
    });
    let tmp = response.data
    let result = [] as IAssistant[]
    const filteredAssistants = filterAndConvert(tmp, 'completed');
    return filteredAssistants
};

export const getAssistant = async (
    assistant_id: string
): Promise<IAssistant> => {
    const response = await apiClient.get<IAssistant>(`/assistants/${assistant_id}`);
    return response.data;
}

export const deleteAssistant = async (
    assistant_id: string
): Promise<IDeleteResult> => {
    const response = await apiClient.delete<IDeleteResult>(`/assistants/${assistant_id}`);
    return response.data;
}

export const getRelatedThreadId = async (
    assistant_id: string
): Promise<string[]> => {
    const response = await apiClient.get<string[]>(`/assistants/${assistant_id}/related_thread`);
    return response.data;
}

export const listAssistantsWithStatus = async (
    limit?: number,
    order?: string,
    after?: string,
    before?: string,
    run_id?: string,
): Promise<IAssistantWithStatus[]> => {
    const params: {
        limit?: number,
        order?: string,
        after?: string,
        before?: string,
        run_id?: string
    } = {};

    if (typeof limit !== 'undefined') {
        params.limit = limit;
    }
    if (typeof order !== 'undefined') {
        params.order = order;
    }
    if (typeof after !== 'undefined') {
        params.after = after;
    }
    if (typeof before !== 'undefined') {
        params.before = before;
    }
    if (typeof run_id !== 'undefined') {
        params.run_id = run_id;
    }
    console.log(params)
    const response = await apiClient.get<IAssistantWithStatus[]>('/assistants/status', {
        params
    });

    return response.data;
};


================================================
FILE: archive/ktransformers/website/src/api/message.ts
================================================
import apiClient from './api-client';
import { IMessage,IDeleteResult } from '../utils/types';

export const createMessage = async (
    thread_id: string,
    content: string,
    role?: string,
    attachments?: any[],
    metadata?:{[key:string]:any}
): Promise<IMessage> => {
    const message_data: {
        content: string;
        role?: string;
        attachments?: any[];
        metadata?:{[key:string]:any}
    } = {
        content,
    };

    if (metadata) {
        message_data.metadata = metadata;
    }
    if (role) {
        message_data.role = role;
    }
    if (attachments) {
        message_data.attachments = attachments;
    }
    const response = await apiClient.post<IMessage>(`/threads/${thread_id}/messages`, message_data);
    return response.data;
};


export const listMessages = async (
    thread_id: string,
    limit?: number,
    order?: string,
    after?: string,
    before?: string,
    run_id?: string,
): Promise<IMessage[]> => {
    const params: {
        limit?: number,
        order?: string,
        after?: string,
        before?: string,
        run_id?: string
    } = {};

    if (typeof limit !== 'undefined') {
        params.limit = limit;
    }
    if (typeof order !== 'undefined') {
        params.order = order;
    }
    if (typeof after !== 'undefined') {
        params.after = after;
    }
    if (typeof before !== 'undefined') {
        params.before = before;
    }
    if (typeof run_id !== 'undefined') {
        params.run_id = run_id;
    }

    const response = await apiClient.get<IMessage[]>(`/threads/${thread_id}/messages`, {
        params
    });

    return response.data;
};
export const deleteMessage = async(thread_id:string, message_id:string): Promise<IDeleteResult> => {
    const response = await apiClient.delete<IDeleteResult>(`/threads/${thread_id}/messages/${message_id}`);
    return response.data;
}


================================================
FILE: archive/ktransformers/website/src/api/run.ts
================================================
import apiClient from './api-client';
import { IRun } from '../utils/types';
import {baseURL} from '@/conf/config';
interface IRunData {
    assistant_id: string;
    model?: string;
    instructions?: string;
    additional_instructions?: string;
    additional_messages?: any[];
    tools?: any[];
    metadata?: { [key: string]: any }
    temperature?: number;
    top_p?: number;
    stream?: boolean;
    max_prompt_tokens?: number;
    max_completion_tokens?: number;
    truncation_strategy?: object;
    tool_choice?: string;
    response_format?: string | object;
}


export async function* createRun(
    data: IRunData,
    thread_id: string
): AsyncGenerator<string> {
    const run_data = {
        ...data, 
        assistant_id: data.assistant_id, 
    };

    const response = await fetch(`${baseURL}/threads/${thread_id}/runs`, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(run_data),
    });

    if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`);
    }

    if (!response.body) {
        throw new Error('Response body is missing');
    }
    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';
    try {
        while (true) {
            const { done, value } = await reader.read();
            if (done) return;
            buffer += decoder.decode(value, { stream: true });

            let eventIndex = buffer.indexOf("\n\n");
            while (eventIndex !== -1) {
                const event = buffer.slice(0, eventIndex);
                buffer = buffer.slice(eventIndex + 2);
                if (event.startsWith("event: thread.run.created")) {
                    const dataIndex = event.indexOf("data: ");
                    if (dataIndex !== -1) {
                        const datads = event.slice(39, 75)
                        yield datads;
                    }
                } else if (event.startsWith("event: thread.message.delta")) {
                    const dataIndex = event.indexOf("data: ");
                    if (dataIndex !== -1) {
                        const data = JSON.parse(event.slice(dataIndex + 6));
                        yield data.delta.content[0].text.value || '';
                    }
                } else if (event.startsWith("event: done")) {
                    return;
                }

                eventIndex = buffer.indexOf("\n\n");
            }
        }
    } catch (e) {

        console.error('An error occurred while reading the response stream:', e);
        // throw e; 
        return e
    }
}
// 定义取消运行的函数
export async function cancelRun(threadId: string, runId: string){
    const run_data = {
        thread_id:threadId,
        run_id:runId,
    };
    try {
        const response = await fetch(`${baseURL}/threads/${threadId}/runs/${runId}/cancel`, {
            method: 'POST',
        });

        if (!response.ok) {
            throw new Error(`HTTP error! status: ${response.status}`);
        }

        return response;
    } catch (error) {
        console.error('An error occurred while cancelling the run:', error);
        throw error;
    }
}

================================================
FILE: archive/ktransformers/website/src/api/thread.ts
================================================
import apiClient from './api-client';
import { IThread, IMessage, IThreadAndMessageAndAssistant, IDeleteResult } from '../utils/types';
export const createThread = async (
    message?: IMessage,
    tool_resources?: object,
    metadata?: { [key: string]: any }
): Promise<IThread> => {
    const thread_data: { message?: object, metadata?: { [key: string]: any } } = {};
    if (message) {
        thread_data.message = message;
    }
    if (metadata) {
        thread_data.metadata = metadata;
    }
    const response = await apiClient.post<IThread>(
        '/threads',
        thread_data);
    return response.data;
};

export const listThreads = async (
    limit?: number,
    order?: string,
): Promise<IThreadAndMessageAndAssistant[]> => {
    const params: {
        limit?: number,
        order?: string,
    } = { limit, order };
    const response = await apiClient.get<IThreadAndMessageAndAssistant[]>('/threads', {
        params
    });

    return response.data;
};

export const deleteThread = async (
    thread_id: string
): Promise<IDeleteResult> => {
    const response = await apiClient.delete<IDeleteResult>(`/threads/${thread_id}`);
    return response.data;
}

export const getThread = async (
    thread_id: string
): Promise<IThread> => {
    const response = await apiClient.get<IThread>(`/threads/${thread_id}`);
    return response.data;
}

================================================
FILE: archive/ktransformers/website/src/assets/css/mixins.styl
================================================

/*Define color variables*/
$bg_gray_light_normal = #F9F9F9
$bg_gray_light_hover = #E8E8E8
$bg_gray_light_active = #E8E8E8

$border_gray_light_normal = rgba(0, 0, 0, .15)
$border_gray_light_hover = #8080FF

$gray_20 = #333333
$gray_40 = #585858
$gray_50 = #7F7F7F
$gray_60 = #9F9F9F
$gray_70 = #BFBFBF
$gray_80 = #DFDFDF
$gray_85 = #F2F2F2
$gray_90 = #F7F7F7

$gray = #53525B
$gray_dark = #42414a
$gray_hover = #121212
$gray_action = #6C757D

$primary = #409eff
$primary_hover = #428bca
$primary_middle = #9DDDF9
$primary_light = #D4F0FC

$cyan = #66CCCC
$cyan_hover = #46C2C2


/*Define common modules*/
$input-duration = .25s
input-border()
  -webkit-transition: border-color ease-in-out $input-duration,-webkit-box-shadow ease-in-out $input-duration
  -o-transition: border-color ease-in-out $input-duration,box-shadow ease-in-out $input-duration
  transition: border-color ease-in-out $input-duration,box-shadow ease-in-out $input-duration
input-focus()
  border-color: #66afe9
  outline: 0
  z-index: 100
  -webkit-box-shadow: inset 0 1px 1px rgba(0,0,0,.075),0 0 8px rgba(102,175,233,.6)
  box-shadow: inset 0 1px 1px rgba(0,0,0,.075),0 0 8px rgba(102,175,233,.6)


/*Define common class*/
.flex-column
  display: -webkit-box
  display: -webkit-flex
  display: flex
  box-sizing: border-box
  -webkit-box-orient: vertical
  -webkit-box-direction: normal
  -webkit-flex-direction: column
  flex-direction: column
  height: 100%

.flex-row
  position: relative
  display: -webkit-box
  display: -ms-flexbox
  display: flex
  box-sizing: border-box
  -webkit-box-align: center
  -ms-flex-align: center
  align-items: center

.flex-unit
  -webkit-box-flex: 1
  -ms-flex: 1
  flex: 1
  // overflow: hidden

.clearfix
  &:after
    clear: both
    content: "\20"
    display: block
    height: 0
    visibility: hidden

a,a:hover
  text-decoration:none

button:focus
  outline: none

.btn
  display: inline-block
  margin-bottom: 0
  padding:0px 15px
  font-size: 14px
  height: 34px
  line-height: 32px
  float: left /*去掉inline-block之间的空格*/
  font-weight: normal
  text-align: center
  white-space: nowrap
  vertical-align: middle
  cursor: pointer
  background-image: none
  border-radius: 3px
  -webkit-user-select: none
  -moz-user-select: none
  -ms-user-select: none
  -o-user-select: none
  user-select: none
  &:hover
    .dropdown-list
      display: block
  i
    font-size: 16px
  .text
    float: right
    margin-left: 3px

.btn-gray
  color: $gray_action
  background-color: #FFFFFF
  border: 1px solid $gray_action
  &:not(.is-disabled):hover
    color: #FFFFFF
    background-color: $gray_action
    border: 1px solid $gray_action

.btn-primary
  color: #FFFFFF
  background-color: $primary
  border: 1px solid $primary
  &:not(.is-disabled):hover
    color: #FFFFFF
    background-color: $primary_hover
    border: 1px solid $primary_hover

.chat-box
  position: relative
  .chat-input
    border: 1px solid $border_gray_light_normal
    height: 48px
    line-height: 48px
    font-size: 16px
    outline: 0
    box-sizing: border-box
    padding:0 30px0 20px
    color: #7F7F7F
    width: 800px
    border-radius: 12px
    position: relative
    &:focus
      input-focus()
  i
    position: absolute
    font-size: 26px
    right: 13px
    bottom:0px
    color: $border_gray_light_normal
    z-index: 100
    cursor: pointer
    &:hover
      color: $border_gray_light_hover


================================================
FILE: archive/ktransformers/website/src/assets/iconfont/demo.css
================================================
/* Logo 字体 */
@font-face {
  font-family: "iconfont logo";
  src: url('https://at.alicdn.com/t/font_985780_km7mi63cihi.eot?t=1545807318834');
  src: url('https://at.alicdn.com/t/font_985780_km7mi63cihi.eot?t=1545807318834#iefix') format('embedded-opentype'),
    url('https://at.alicdn.com/t/font_985780_km7mi63cihi.woff?t=1545807318834') format('woff'),
    url('https://at.alicdn.com/t/font_985780_km7mi63cihi.ttf?t=1545807318834') format('truetype'),
    url('https://at.alicdn.com/t/font_985780_km7mi63cihi.svg?t=1545807318834#iconfont') format('svg');
}

.logo {
  font-family: "iconfont logo";
  font-size: 160px;
  font-style: normal;
  -webkit-font-smoothing: antialiased;
  -moz-osx-font-smoothing: grayscale;
}

/* tabs */
.nav-tabs {
  position: relative;
}

.nav-tabs .nav-more {
  position: absolute;
  right: 0;
  bottom: 0;
  height: 42px;
  line-height: 42px;
  color: #666;
}

#tabs {
  border-bottom: 1px solid #eee;
}

#tabs li {
  cursor: pointer;
  width: 100px;
  height: 40px;
  line-height: 40px;
  text-align: center;
  font-size: 16px;
  border-bottom: 2px solid transparent;
  position: relative;
  z-index: 1;
  margin-bottom: -1px;
  color: #666;
}


#tabs .active {
  border-bottom-color: #f00;
  color: #222;
}

.tab-container .content {
  display: none;
}

/* 页面布局 */
.main {
  padding: 30px 100px;
  width: 960px;
  margin: 0 auto;
}

.main .logo {
  color: #333;
  text-align: left;
  margin-bottom: 30px;
  line-height: 1;
  height: 110px;
  margin-top: -50px;
  overflow: hidden;
  *zoom: 1;
}

.main .logo a {
  font-size: 160px;
  color: #333;
}

.helps {
  margin-top: 40px;
}

.helps pre {
  padding: 20px;
  margin: 10px 0;
  border: solid 1px #e7e1cd;
  background-color: #fffdef;
  overflow: auto;
}

.icon_lists {
  width: 100% !important;
  overflow: hidden;
  *zoom: 1;
}

.icon_lists li {
  width: 100px;
  margin-bottom: 10px;
  margin-right: 20px;
  text-align: center;
  list-style: none !important;
  cursor: default;
}

.icon_lists li .code-name {
  line-height: 1.2;
}

.icon_lists .icon {
  display: block;
  height: 100px;
  line-height: 100px;
  font-size: 42px;
  margin: 10px auto;
  color: #333;
  -webkit-transition: font-size 0.25s linear, width 0.25s linear;
  -moz-transition: font-size 0.25s linear, width 0.25s linear;
  transition: font-size 0.25s linear, width 0.25s linear;
}

.icon_lists .icon:hover {
  font-size: 100px;
}

.icon_lists .svg-icon {
  /* 通过设置 font-size 来改变图标大小 */
  width: 1em;
  /* 图标和文字相邻时，垂直对齐 */
  vertical-align: -0.15em;
  /* 通过设置 color 来改变 SVG 的颜色/fill */
  fill: currentColor;
  /* path 和 stroke 溢出 viewBox 部分在 IE 下会显示
      normalize.css 中也包含这行 */
  overflow: hidden;
}

.icon_lists li .name,
.icon_lists li .code-name {
  color: #666;
}

/* markdown 样式 */
.markdown {
  color: #666;
  font-size: 14px;
  line-height: 1.8;
}

.highlight {
  line-height: 1.5;
}

.markdown img {
  vertical-align: middle;
  max-width: 100%;
}

.markdown h1 {
  color: #404040;
  font-weight: 500;
  line-height: 40px;
  margin-bottom: 24px;
}

.markdown h2,
.markdown h3,
.markdown h4,
.markdown h5,
.markdown h6 {
  color: #404040;
  margin: 1.6em 0 0.6em 0;
  font-weight: 500;
  clear: both;
}

.markdown h1 {
  font-size: 28px;
}

.markdown h2 {
  font-size: 22px;
}

.markdown h3 {
  font-size: 16px;
}

.markdown h4 {
  font-size: 14px;
}

.markdown h5 {
  font-size: 12px;
}

.markdown h6 {
  font-size: 12px;
}

.markdown hr {
  height: 1px;
  border: 0;
  background: #e9e9e9;
  margin: 16px 0;
  clear: both;
}

.markdown p {
  margin: 1em 0;
}

.markdown>p,
.markdown>blockquote,
.markdown>.highlight,
.markdown>ol,
.markdown>ul {
  width: 80%;
}

.markdown ul>li {
  list-style: circle;
}

.markdown>ul li,
.markdown blockquote ul>li {
  margin-left: 20px;
  padding-left: 4px;
}

.markdown>ul li p,
.markdown>ol li p {
  margin: 0.6em 0;
}

.markdown ol>li {
  list-style: decimal;
}

.markdown>ol li,
.markdown blockquote ol>li {
  margin-left: 20px;
  padding-left: 4px;
}

.markdown code {
  margin: 0 3px;
  padding: 0 5px;
  background: #eee;
  border-radius: 3px;
}

.markdown strong,
.markdown b {
  font-weight: 600;
}

.markdown>table {
  border-collapse: collapse;
  border-spacing:0;
  empty-cells: show;
  border: 1px solid #e9e9e9;
  width: 95%;
  margin-bottom: 24px;
}

.markdown>table th {
  white-space: nowrap;
  color: #333;
  font-weight: 600;
}

.markdown>table th,
.markdown>table td {
  border: 1px solid #e9e9e9;
  padding: 8px 16px;
  text-align: left;
}

.markdown>table th {
  background: #F7F7F7;
}

.markdown blockquote {
  font-size: 90%;
  color: #999;
  border-left: 4px solid #e9e9e9;
  padding-left: 0.8em;
  margin: 1em 0;
}

.markdown blockquote p {
  margin: 0;
}

.markdown .anchor {
  opacity: 0;
  transition: opacity 0.3s ease;
  margin-left: 8px;
}

.markdown .waiting {
  color: #ccc;
}

.markdown h1:hover .anchor,
.markdown h2:hover .anchor,
.markdown h3:hover .anchor,
.markdown h4:hover .anchor,
.markdown h5:hover .anchor,
.markdown h6:hover .anchor {
  opacity: 1;
  display: inline-block;
}

.markdown>br,
.markdown>p>br {
  clear: both;
}


.hljs {
  display: block;
  background: white;
  padding: 0.5em;
  color: #333333;
  overflow-x: auto;
}

.hljs-comment,
.hljs-meta {
  color: #969896;
}

.hljs-string,
.hljs-variable,
.hljs-template-variable,
.hljs-strong,
.hljs-emphasis,
.hljs-quote {
  color: #df5000;
}

.hljs-keyword,
.hljs-selector-tag,
.hljs-type {
  color: #a71d5d;
}

.hljs-literal,
.hljs-symbol,
.hljs-bullet,
.hljs-attribute {
  color: #0086b3;
}

.hljs-section,
.hljs-name {
  color: #63a35c;
}

.hljs-tag {
  color: #333333;
}

.hljs-title,
.hljs-attr,
.hljs-selector-id,
.hljs-selector-class,
.hljs-selector-attr,
.hljs-selector-pseudo {
  color: #795da3;
}

.hljs-addition {
  color: #55a532;
  background-color: #eaffea;
}

.hljs-deletion {
  color: #bd2c00;
  background-color: #ffecec;
}

.hljs-link {
  text-decoration: underline;
}

/* 代码高亮 */
/* PrismJS 1.15.0
https://prismjs.com/download.html#themes=prism&languages=markup+css+clike+javascript */
/**
 * prism.js default theme for JavaScript, CSS and HTML
 * Based on dabblet (http://dabblet.com)
 * @author Lea Verou
 */
code[class*="language-"],
pre[class*="language-"] {
  color: black;
  background: none;
  text-shadow: 0 1px white;
  font-family: Consolas, Monaco, 'Andale Mono', 'Ubuntu Mono', monospace;
  text-align: left;
  white-space: pre;
  word-spacing: normal;
  word-break: normal;
  word-wrap: normal;
  line-height: 1.5;

  -moz-tab-size: 4;
  -o-tab-size: 4;
  tab-size: 4;

  -webkit-hyphens: none;
  -moz-hyphens: none;
  -ms-hyphens: none;
  hyphens: none;
}

pre[class*="language-"]::-moz-selection,
pre[class*="language-"] ::-moz-selection,
code[class*="language-"]::-moz-selection,
code[class*="language-"] ::-moz-selection {
  text-shadow: none;
  background: #b3d4fc;
}

pre[class*="language-"]::selection,
pre[class*="language-"] ::selection,
code[class*="language-"]::selection,
code[class*="language-"] ::selection {
  text-shadow: none;
  background: #b3d4fc;
}

@media print {

  code[class*="language-"],
  pre[class*="language-"] {
    text-shadow: none;
  }
}

/* Code blocks */
pre[class*="language-"] {
  padding: 1em;
  margin: .5em 0;
  overflow: auto;
}

:not(pre)>code[class*="language-"],
pre[class*="language-"] {
  background: #f5f2f0;
}

/* Inline code */
:not(pre)>code[class*="language-"] {
  padding: .1em;
  border-radius: .3em;
  white-space: normal;
}

.token.comment,
.token.prolog,
.token.doctype,
.token.cdata {
  color: slategray;
}

.token.punctuation {
  color: #999;
}

.namespace {
  opacity: .7;
}

.token.property,
.token.tag,
.token.boolean,
.token.number,
.token.constant,
.token.symbol,
.token.deleted {
  color: #905;
}

.token.selector,
.token.attr-name,
.token.string,
.token.char,
.token.builtin,
.token.inserted {
  color: #690;
}

.token.operator,
.token.entity,
.token.url,
.language-css .token.string,
.style .token.string {
  color: #9a6e3a;
  background: hsla(0, 0%, 100%, .5);
}

.token.atrule,
.token.attr-value,
.token.keyword {
  color: #07a;
}

.token.function,
.token.class-name {
  color: #DD4A68;
}

.token.regex,
.token.important,
.token.variable {
  color: #e90;
}

.token.important,
.token.bold {
  font-weight: bold;
}

.token.italic {
  font-style: italic;
}

.token.entity {
  cursor: help;
}


================================================
FILE: archive/ktransformers/website/src/assets/iconfont/demo_index.html
================================================
<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8"/>
  <title>iconfont Demo</title>
  <link rel="shortcut icon" href="//img.alicdn.com/imgextra/i4/O1CN01Z5paLz1O0zuCC7osS_!!6000000001644-55-tps-83-82.svg" type="image/x-icon"/>
  <link rel="icon" type="image/svg+xml" href="//img.alicdn.com/imgextra/i4/O1CN01Z5paLz1O0zuCC7osS_!!6000000001644-55-tps-83-82.svg"/>
  <link rel="stylesheet" href="https://g.alicdn.com/thx/cube/1.3.2/cube.min.css">
  <link rel="stylesheet" href="demo.css">
  <link rel="stylesheet" href="iconfont.css">
  <script src="iconfont.js"></script>
  <!-- jQuery -->
  <script src="https://a1.alicdn.com/oss/uploads/2018/12/26/7bfddb60-08e8-11e9-9b04-53e73bb6408b.js"></script>
  <!-- 代码高亮 -->
  <script src="https://a1.alicdn.com/oss/uploads/2018/12/26/a3f714d0-08e6-11e9-8a15-ebf944d7534c.js"></script>
  <style>
    .main .logo {
      margin-top: 0;
      height: auto;
    }

    .main .logo a {
      display: flex;
      align-items: center;
    }

    .main .logo .sub-title {
      margin-left: 0.5em;
      font-size: 22px;
      color: #fff;
      background: linear-gradient(-45deg, #3967FF, #B500FE);
      -webkit-background-clip: text;
      -webkit-text-fill-color: transparent;
    }
  </style>
</head>
<body>
  <div class="main">
    <h1 class="logo"><a href="https://www.iconfont.cn/" title="iconfont 首页" target="_blank">
      <img width="200" src="https://img.alicdn.com/imgextra/i3/O1CN01Mn65HV1FfSEzR6DKv_!!6000000000514-55-tps-228-59.svg">
      
    </a></h1>
    <div class="nav-tabs">
      <ul id="tabs" class="dib-box">
        <li class="dib active"><span>Unicode</span></li>
        <li class="dib"><span>Font class</span></li>
        <li class="dib"><span>Symbol</span></li>
      </ul>
      
      <a href="https://www.iconfont.cn/manage/index?manage_type=myprojects&projectId=4550268" target="_blank" class="nav-more">查看项目</a>
      
    </div>
    <div class="tab-container">
      <div class="content unicode" style="display: block;">
          <ul class="icon_lists dib-box">
          
            <li class="dib">
              <span class="icon iconfont">&#xe8b0;</span>
                <div class="name">复制</div>
                <div class="code-name">&amp;#xe8b0;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe85e;</span>
                <div class="name">箭头下</div>
                <div class="code-name">&amp;#xe85e;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe651;</span>
                <div class="name">进度</div>
                <div class="code-name">&amp;#xe651;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe617;</span>
                <div class="name">环形进度条</div>
                <div class="code-name">&amp;#xe617;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe779;</span>
                <div class="name">向左1</div>
                <div class="code-name">&amp;#xe779;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe608;</span>
                <div class="name">点</div>
                <div class="code-name">&amp;#xe608;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe7dd;</span>
                <div class="name">编辑</div>
                <div class="code-name">&amp;#xe7dd;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe614;</span>
                <div class="name">删除</div>
                <div class="code-name">&amp;#xe614;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe618;</span>
                <div class="name">上传</div>
                <div class="code-name">&amp;#xe618;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe621;</span>
                <div class="name">探索-选中</div>
                <div class="code-name">&amp;#xe621;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe657;</span>
                <div class="name">ellipsis</div>
                <div class="code-name">&amp;#xe657;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe60c;</span>
                <div class="name">发送</div>
                <div class="code-name">&amp;#xe60c;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe62d;</span>
                <div class="name">列表</div>
                <div class="code-name">&amp;#xe62d;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe639;</span>
                <div class="name">列表</div>
                <div class="code-name">&amp;#xe639;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe6bd;</span>
                <div class="name">重试</div>
                <div class="code-name">&amp;#xe6bd;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe826;</span>
                <div class="name">Fork 记录</div>
                <div class="code-name">&amp;#xe826;</div>
              </li>
          
          </ul>
          <div class="article markdown">
          <h2 id="unicode-">Unicode 引用</h2>
          <hr>

          <p>Unicode 是字体在网页端最原始的应用方式，特点是：</p>
          <ul>
            <li>支持按字体的方式去动态调整图标大小，颜色等等。</li>
            <li>默认情况下不支持多色，直接添加多色图标会自动去色。</li>
          </ul>
          <blockquote>
            <p>注意：新版 iconfont 支持两种方式引用多色图标：SVG symbol 引用方式和彩色字体图标模式。（使用彩色字体图标需要在「编辑项目」中开启「彩色」选项后并重新生成。）</p>
          </blockquote>
          <p>Unicode 使用步骤如下：</p>
          <h3 id="-font-face">第一步：拷贝项目下面生成的 <code>@font-face</code></h3>
<pre><code class="language-css"
>@font-face {
  font-family: 'iconfont';
  src: url('iconfont.woff2?t=1717950820214') format('woff2'),
       url('iconfont.woff?t=1717950820214') format('woff'),
       url('iconfont.ttf?t=1717950820214') format('truetype'),
       url('iconfont.svg?t=1717950820214#iconfont') format('svg');
}
</code></pre>
          <h3 id="-iconfont-">第二步：定义使用 iconfont 的样式</h3>
<pre><code class="language-css"
>.iconfont {
  font-family: "iconfont" !important;
  font-size: 16px;
  font-style: normal;
  -webkit-font-smoothing: antialiased;
  -moz-osx-font-smoothing: grayscale;
}
</code></pre>
          <h3 id="-">第三步：挑选相应图标并获取字体编码，应用于页面</h3>
<pre>
<code class="language-html"
>&lt;span class="iconfont"&gt;&amp;#x33;&lt;/span&gt;
</code></pre>
          <blockquote>
            <p>"iconfont" 是你项目下的 font-family。可以通过编辑项目查看，默认是 "iconfont"。</p>
          </blockquote>
          </div>
      </div>
      <div class="content font-class">
        <ul class="icon_lists dib-box">
          
          <li class="dib">
            <span class="icon iconfont icon-copy"></span>
            <div class="name">
              复制
            </div>
            <div class="code-name">.icon-copy
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-arrow-down"></span>
            <div class="name">
              箭头下
            </div>
            <div class="code-name">.icon-arrow-down
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-usage-progress"></span>
            <div class="name">
              进度
            </div>
            <div class="code-name">.icon-usage-progress
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-gen-progress"></span>
            <div class="name">
              环形进度条
            </div>
            <div class="code-name">.icon-gen-progress
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-back"></span>
            <div class="name">
              向左1
            </div>
            <div class="code-name">.icon-back
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-point"></span>
            <div class="name">
              点
            </div>
            <div class="code-name">.icon-point
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-edit"></span>
            <div class="name">
              编辑
            </div>
            <div class="code-name">.icon-edit
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-delete"></span>
            <div class="name">
              删除
            </div>
            <div class="code-name">.icon-delete
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-upload-1"></span>
            <div class="name">
              上传
            </div>
            <div class="code-name">.icon-upload-1
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-explore"></span>
            <div class="name">
              探索-选中
            </div>
            <div class="code-name">.icon-explore
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-ellipsis"></span>
            <div class="name">
              ellipsis
            </div>
            <div class="code-name">.icon-ellipsis
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-sent"></span>
            <div class="name">
              发送
            </div>
            <div class="code-name">.icon-sent
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-list-list"></span>
            <div class="name">
              列表
            </div>
            <div class="code-name">.icon-list-list
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-list-icon"></span>
            <div class="name">
              列表
            </div>
            <div class="code-name">.icon-list-icon
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-zhongshi"></span>
            <div class="name">
              重试
            </div>
            <div class="code-name">.icon-zhongshi
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-log"></span>
            <div class="name">
              Fork 记录
            </div>
            <div class="code-name">.icon-log
            </div>
          </li>
          
        </ul>
        <div class="article markdown">
        <h2 id="font-class-">font-class 引用</h2>
        <hr>

        <p>font-class 是 Unicode 使用方式的一种变种，主要是解决 Unicode 书写不直观，语意不明确的问题。</p>
        <p>与 Unicode 使用方式相比，具有如下特点：</p>
        <ul>
          <li>相比于 Unicode 语意明确，书写更直观。可以很容易分辨这个 icon 是什么。</li>
          <li>因为使用 class 来定义图标，所以当要替换图标时，只需要修改 class 里面的 Unicode 引用。</li>
        </ul>
        <p>使用步骤如下：</p>
        <h3 id="-fontclass-">第一步：引入项目下面生成的 fontclass 代码：</h3>
<pre><code class="language-html">&lt;link rel="stylesheet" href="./iconfont.css"&gt;
</code></pre>
        <h3 id="-">第二步：挑选相应图标并获取类名，应用于页面：</h3>
<pre><code class="language-html">&lt;span class="iconfont icon-xxx"&gt;&lt;/span&gt;
</code></pre>
        <blockquote>
          <p>"
            iconfont" 是你项目下的 font-family。可以通过编辑项目查看，默认是 "iconfont"。</p>
        </blockquote>
      </div>
      </div>
      <div class="content symbol">
          <ul class="icon_lists dib-box">
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-copy"></use>
                </svg>
                <div class="name">复制</div>
                <div class="code-name">#icon-copy</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-arrow-down"></use>
                </svg>
                <div class="name">箭头下</div>
                <div class="code-name">#icon-arrow-down</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-usage-progress"></use>
                </svg>
                <div class="name">进度</div>
                <div class="code-name">#icon-usage-progress</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-gen-progress"></use>
                </svg>
                <div class="name">环形进度条</div>
                <div class="code-name">#icon-gen-progress</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-back"></use>
                </svg>
                <div class="name">向左1</div>
                <div class="code-name">#icon-back</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-point"></use>
                </svg>
                <div class="name">点</div>
                <div class="code-name">#icon-point</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-edit"></use>
                </svg>
                <div class="name">编辑</div>
                <div class="code-name">#icon-edit</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-delete"></use>
                </svg>
                <div class="name">删除</div>
                <div class="code-name">#icon-delete</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-upload-1"></use>
                </svg>
                <div class="name">上传</div>
                <div class="code-name">#icon-upload-1</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-explore"></use>
                </svg>
                <div class="name">探索-选中</div>
                <div class="code-name">#icon-explore</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-ellipsis"></use>
                </svg>
                <div class="name">ellipsis</div>
                <div class="code-name">#icon-ellipsis</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-sent"></use>
                </svg>
                <div class="name">发送</div>
                <div class="code-name">#icon-sent</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-list-list"></use>
                </svg>
                <div class="name">列表</div>
                <div class="code-name">#icon-list-list</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-list-icon"></use>
                </svg>
                <div class="name">列表</div>
                <div class="code-name">#icon-list-icon</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-zhongshi"></use>
                </svg>
                <div class="name">重试</div>
                <div class="code-name">#icon-zhongshi</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-log"></use>
                </svg>
                <div class="name">Fork 记录</div>
                <div class="code-name">#icon-log</div>
            </li>
          
          </ul>
          <div class="article markdown">
          <h2 id="symbol-">Symbol 引用</h2>
          <hr>

          <p>这是一种全新的使用方式，应该说这才是未来的主流，也是平台目前推荐的用法。相关介绍可以参考这篇<a href="">文章</a>
            这种用法其实是做了一个 SVG 的集合，与另外两种相比具有如下特点：</p>
          <ul>
            <li>支持多色图标了，不再受单色限制。</li>
            <li>通过一些技巧，支持像字体那样，通过 <code>font-size</code>, <code>color</code> 来调整样式。</li>
            <li>兼容性较差，支持 IE9+，及现代浏览器。</li>
            <li>浏览器渲染 SVG 的性能一般，还不如 png。</li>
          </ul>
          <p>使用步骤如下：</p>
          <h3 id="-symbol-">第一步：引入项目下面生成的 symbol 代码：</h3>
<pre><code class="language-html">&lt;script src="./iconfont.js"&gt;&lt;/script&gt;
</code></pre>
          <h3 id="-css-">第二步：加入通用 CSS 代码（引入一次就行）：</h3>
<pre><code class="language-html">&lt;style&gt;
.icon {
  width: 1em;
  height: 1em;
  vertical-align: -0.15em;
  fill: currentColor;
  overflow: hidden;
}
&lt;/style&gt;
</code></pre>
          <h3 id="-">第三步：挑选相应图标并获取类名，应用于页面：</h3>
<pre><code class="language-html">&lt;svg class="icon" aria-hidden="true"&gt;
  &lt;use xlink:href="#icon-xxx"&gt;&lt;/use&gt;
&lt;/svg&gt;
</code></pre>
          </div>
      </div>

    </div>
  </div>
  <script>
  $(document).ready(function () {
      $('.tab-container .content:first').show()

      $('#tabs li').click(function (e) {
        var tabContent = $('.tab-container .content')
        var index = $(this).index()

        if ($(this).hasClass('active')) {
          return
        } else {
          $('#tabs li').removeClass('active')
          $(this).addClass('active')

          tabContent.hide().eq(index).fadeIn()
        }
      })
    })
  </script>
</body>
</html>


================================================
FILE: archive/ktransformers/website/src/assets/iconfont/iconfont.css
================================================
@font-face {
  font-family: "iconfont"; /* Project id 4550268 */
  src: url('iconfont.woff2?t=1717950820214') format('woff2'),
       url('iconfont.woff?t=1717950820214') format('woff'),
       url('iconfont.ttf?t=1717950820214') format('truetype'),
       url('iconfont.svg?t=1717950820214#iconfont') format('svg');
}

.iconfont {
  font-family: "iconfont" !important;
  font-size: 16px;
  font-style: normal;
  -webkit-font-smoothing: antialiased;
  -moz-osx-font-smoothing: grayscale;
}

.icon-copy:before {
  content: "\e8b0";
}

.icon-arrow-down:before {
  content: "\e85e";
}

.icon-usage-progress:before {
  content: "\e651";
}

.icon-gen-progress:before {
  content: "\e617";
}

.icon-back:before {
  content: "\e779";
}

.icon-point:before {
  content: "\e608";
}

.icon-edit:before {
  content: "\e7dd";
}

.icon-delete:before {
  content: "\e614";
}

.icon-upload-1:before {
  content: "\e618";
}

.icon-explore:before {
  content: "\e621";
}

.icon-ellipsis:before {
  content: "\e657";
}

.icon-sent:before {
  content: "\e60c";
}

.icon-list-list:before {
  content: "\e62d";
}

.icon-list-icon:before {
  content: "\e639";
}

.icon-zhongshi:before {
  content: "\e6bd";
}

.icon-log:before {
  content: "\e826";
}


================================================
FILE: archive/ktransformers/website/src/assets/iconfont/iconfont.js
================================================
window._iconfont_svg_string_4550268='<svg><symbol id="icon-copy" viewBox="0 0 1024 1024"><path d="M394.666667 106.666667h448a74.666667 74.666667 0 0 1 74.666666 74.666666v448a74.666667 74.666667 0 0 1-74.666666 74.666667H394.666667a74.666667 74.666667 0 0 1-74.666667-74.666667V181.333333a74.666667 74.666667 0 0 1 74.666667-74.666666z m0 64a10.666667 10.666667 0 0 0-10.666667 10.666666v448a10.666667 10.666667 0 0 0 10.666667 10.666667h448a10.666667 10.666667 0 0 0 10.666666-10.666667V181.333333a10.666667 10.666667 0 0 0-10.666666-10.666666H394.666667z m245.333333 597.333333a32 32 0 0 1 64 0v74.666667a74.666667 74.666667 0 0 1-74.666667 74.666666H181.333333a74.666667 74.666667 0 0 1-74.666666-74.666666V394.666667a74.666667 74.666667 0 0 1 74.666666-74.666667h74.666667a32 32 0 0 1 0 64h-74.666667a10.666667 10.666667 0 0 0-10.666666 10.666667v448a10.666667 10.666667 0 0 0 10.666666 10.666666h448a10.666667 10.666667 0 0 0 10.666667-10.666666v-74.666667z" fill="#000000" ></path></symbol><symbol id="icon-arrow-down" viewBox="0 0 1024 1024"><path d="M554.666667 690.005333l228.864-228.864 60.330666 60.330667L512 853.333333l-331.861333-331.861333 60.330666-60.330667L469.333333 690.005333V170.666667h85.333334v519.338666z"  ></path></symbol><symbol id="icon-usage-progress" viewBox="0 0 1024 1024"><path d="M512 125.098667A386.901333 386.901333 0 1 1 125.098667 512 386.901333 386.901333 0 0 1 512 125.098667z" fill="#ACE9C5" ></path><path d="M512 318.634667A193.365333 193.365333 0 1 1 318.634667 512 193.365333 193.365333 0 0 1 512 318.634667z" fill="#2BA866" ></path></symbol><symbol id="icon-gen-progress" viewBox="0 0 1024 1024"><path d="M692.004733 714.930578l96.018649 96.017519C715.492309 877.950022 618.525386 918.887417 512 918.887417c-104.225342 0-199.297978-39.187779-271.287664-103.631964l96.127152-96.126023C384.097201 759.135506 445.230905 783.258278 512 783.258278c69.07253 0 132.114084-25.817007 180.004733-68.3277z m-202.61185-609.200883L489.395143 241.670781C350.16053 253.157439 240.741722 369.800759 240.741722 512c0 66.767965 24.122773 127.900539 64.127717 175.160512l-96.126022 96.126022C144.299232 711.295717 105.112583 616.225342 105.112583 512c0-217.130949 170.07894-394.539514 384.2803-406.270305z m325.8637 134.984901C879.700768 312.702022 918.887417 407.774658 918.887417 512c0 101.921907-37.474331 195.091214-99.395814 266.479611l-96.270694-96.268432C760.774358 635.667779 783.258278 576.460009 783.258278 512c0-66.767965-24.122773-127.901669-64.128848-175.161642l96.127153-96.124892zM534.608247 105.728565c95.334852 5.221722 181.928406 43.261174 248.678287 103.013722l-96.127152 96.127152c-41.869845-35.444415-94.631841-58.422252-152.553395-63.199788l0.00226-135.941086z" fill="#448AFF" fill-opacity=".6" ></path><path d="M489.392883 105.729695L489.395143 241.670781C350.16053 253.157439 240.741722 369.800759 240.741722 512c0 66.767965 24.122773 127.900539 64.127717 175.160512l-96.126022 96.126022C144.299232 711.295717 105.112583 616.225342 105.112583 512c0-217.130949 170.07894-394.539514 384.2803-406.270305z" fill="#448AFF" ></path></symbol><symbol id="icon-back" viewBox="0 0 1024 1024"><path d="M671.968176 911.99957c-12.287381 0-24.576482-4.67206-33.951566-14.047144L286.048434 545.984249c-18.751888-18.719204-18.751888-49.12028 0-67.872168L638.016611 126.111222c18.751888-18.751888 49.12028-18.751888 67.872168 0 18.751888 18.719204 18.751888 49.12028 0 67.872168l-318.016611 318.047574L705.888778 830.047574c18.751888 18.751888 18.751888 49.12028 0 67.872168C696.544658 907.32751 684.255557 911.99957 671.968176 911.99957z" fill="#2c2c2c" ></path></symbol><symbol id="icon-point" viewBox="0 0 1024 1024"><path d="M512 307.2a204.86826667 204.86826667 0 0 1 0 409.6 204.8 204.8 0 0 1 0-409.6z" fill="" ></path></symbol><symbol id="icon-edit" viewBox="0 0 1024 1024"><path d="M899.072 125.44c-28.672-28.672-67.072-44.544-107.52-44.544s-78.848 15.872-107.52 44.544L251.392 558.08c-34.304 34.304-60.416 74.752-78.336 119.808L88.576 896c-4.608 11.264-1.536 24.064 7.168 32.768 5.632 5.632 13.824 9.216 21.504 9.216 3.584 0 7.68-0.512 11.264-2.048l218.624-84.48c45.056-17.408 85.504-44.032 119.808-78.336l351.744-351.744 80.896-80.896c58.88-59.392 58.88-155.648-0.512-215.04z m-475.648 604.16c-28.16 28.16-61.44 50.176-98.816 64.512l-153.6 59.392 59.392-153.6c14.336-37.376 35.84-70.656 64.512-98.816L625.152 271.36l128.512 128.512-330.24 329.728z m432.64-432.128l-58.88 58.88-128.512-128.512L727.552 168.96c16.896-16.896 39.936-26.624 64.512-26.624s47.104 9.216 64.512 26.624c34.816 35.328 34.816 92.672-0.512 128.512z" fill="#333333" ></path></symbol><symbol id="icon-delete" viewBox="0 0 1024 1024"><path d="M742.4 944H281.6c-49.4 0-89.6-43.1-89.6-96V368h64v480c0 17.3 11.7 32 25.6 32h460.8c13.9 0 25.6-14.7 25.6-32V368h64v480c0 52.9-40.2 96-89.6 96z"  ></path><path d="M384 368h64v416h-64zM592 368h64v416h-64zM64 224h896v64H64z"  ></path><path d="M768 288H256V160c0-52.9 43.1-96 96-96h320c52.9 0 96 43.1 96 96v128z m-448-64h384v-64c0-17.6-14.4-32-32-32H352c-17.6 0-32 14.4-32 32v64z"  ></path></symbol><symbol id="icon-upload-1" viewBox="0 0 1024 1024"><path d="M323.034074 291.934815l383.620741 0c9.481481 0 17.256296-8.533333 17.256296-18.962963 0-10.42963-7.68-18.962963-17.256296-18.962963L323.034074 254.008889c-9.481481 0-17.256296 8.533333-17.256296 18.962963C305.777778 283.496296 313.457778 291.934815 323.034074 291.934815z" fill="#272536" ></path><path d="M522.05037 328.628148c-1.232593-1.232593-2.844444-1.896296-4.740741-1.991111-1.706667-0.094815-3.318519-0.094815-5.025185 0-1.896296 0.094815-3.508148 0.758519-4.740741 1.991111L349.013333 487.253333c-3.887407 3.887407-1.896296 12.325926 4.456296 18.773333 6.447407 6.447407 14.791111 8.438519 18.773333 4.456296l125.060741-125.060741 0 367.122963c0 9.671111 7.86963 17.540741 17.540741 17.540741l0 0c9.671111 0 17.540741-7.86963 17.540741-17.540741L532.385185 385.327407l125.060741 125.060741c3.887407 3.887407 12.325926 1.896296 18.773333-4.456296 6.447407-6.447407 8.438519-14.791111 4.456296-18.773333L522.05037 328.628148z" fill="#272536" ></path></symbol><symbol id="icon-explore" viewBox="0 0 1024 1024"><path d="M926.352541 89.231277c-0.029676-7.432273-1.212618-13.651928-2.837628-19.264762-31.228235-8.264221-71.898517 1.24127-106.283652 17.927301-7.049556 3.41068-23.762193 13.583366-48.51597 28.643364-10.237155 6.250354-19.264762 11.739369-23.251563 14.002922-0.384763 0.224104-0.608867 0.63752-0.958838 0.861624-67.557652-41.147142-146.571217-65.327868-231.319389-65.327868-246.251474 0-446.569802 200.319351-446.569802 446.564685 0 82.554204 22.904663 159.683862 62.105476 226.062666-46.315862 71.387887-69.2809 122.93182-63.283302 157.863401 1.24127 7.144724 13.555737 8.28878 20.316721 8.28878 137.989771 0 453.393207-302.802444 492.628814-341.399507C751.64859 393.022235 926.449755 184.667883 926.352541 89.231277L926.352541 89.231277zM305.847292 611.014084c-43.956118 0-79.744205-35.757388-79.744205-79.743182 0-43.956118 35.789111-79.744205 79.744205-79.744205 43.956118 0 79.743182 35.789111 79.743182 79.744205C385.591497 575.256696 349.803409 611.014084 305.847292 611.014084L305.847292 611.014084zM446.19783 387.730719c-52.760644 0-95.694479-42.937928-95.694479-95.692433 0-52.760644 42.933835-95.694479 95.694479-95.694479 52.761668 0 95.694479 42.933835 95.694479 95.694479C541.892309 344.79279 498.958474 387.730719 446.19783 387.730719L446.19783 387.730719zM893.595486 279.9469c-66.889433 99.330286-172.055634 218.596623-276.967032 321.751005-28.551266 28.104081-201.624067 195.822944-346.982666 285.198507 0.12689-0.097214 0.223081-0.160659 0.349971-0.224104 70.049403 45.708018 153.491837 72.536037 243.189741 72.536037 246.246357 0 446.565708-200.318328 446.565708-446.570825C959.716416 427.317319 935.282934 347.82587 893.595486 279.9469L893.595486 279.9469zM638.54051 799.720957c-35.180244 0-63.793932-28.614711-63.793932-63.794955 0-35.184337 28.613688-63.799048 63.793932-63.799048 35.184337 0 63.793932 28.614711 63.793932 63.799048C702.334441 771.106246 673.724847 799.720957 638.54051 799.720957L638.54051 799.720957zM638.54051 799.720957" fill="#615CED" ></path></symbol><symbol id="icon-ellipsis" viewBox="0 0 1024 1024"><path d="M322.292 505.5m-66 0a66 66 0 1 0 132 0 66 66 0 1 0-132 0Z" fill="#272636" ></path><path d="M509.791 505.5m-66 0a66 66 0 1 0 132 0 66 66 0 1 0-132 0Z" fill="#272636" ></path><path d="M701.791 505.5m-66 0a66 66 0 1 0 132 0 66 66 0 1 0-132 0Z" fill="#272636" ></path></symbol><symbol id="icon-sent" viewBox="0 0 1024 1024"><path d="M998.976 554.3232C1031.232 539.6032 1031.328 515.7952 998.976 501.0432L122.88 101.3312C90.624 86.6112 64.448 103.5072 64.384 138.4832L64 426.9952 773.568 527.6672 64 628.3392 64.384 916.8832C64.448 952.1152 90.528 968.7872 122.88 954.0352L998.976 554.3232Z"  ></path></symbol><symbol id="icon-list-list" viewBox="0 0 1024 1024"><path d="M419.037 287.953h413.124c17.673 0 32-14.327 32-32s-14.327-32-32-32H419.037c-17.673 0-32 14.327-32 32s14.327 32 32 32zM419.028 543.17h411.608c17.673 0 32-14.327 32-32s-14.327-32-32-32H419.028c-17.673 0-32 14.327-32 32s14.327 32 32 32zM832.161 735.802H419.037c-17.673 0-32 14.327-32 32s14.327 32 32 32h413.124c17.673 0 32-14.327 32-32s-14.327-32-32-32z" fill="" ></path><path d="M256.037 255.953m-64 0a64 64 0 1 0 128 0 64 64 0 1 0-128 0Z" fill="" ></path><path d="M256.037 510.787m-64 0a64 64 0 1 0 128 0 64 64 0 1 0-128 0Z" fill="" ></path><path d="M256.037 767.621m-64 0a64 64 0 1 0 128 0 64 64 0 1 0-128 0Z" fill="" ></path></symbol><symbol id="icon-list-icon" viewBox="0 0 1024 1024"><path d="M841.6 489.6h-214.4c-48 0-86.4-38.4-86.4-86.4V188.8c0-48 38.4-86.4 86.4-86.4h214.4c48 0 86.4 38.4 86.4 86.4v214.4c0 48-38.4 86.4-86.4 86.4z m-211.2-320c-12.8 0-22.4 9.6-22.4 22.4v214.4c0 12.8 9.6 22.4 22.4 22.4h214.4c12.8 0 22.4-9.6 22.4-22.4V188.8c0-12.8-9.6-22.4-22.4-22.4h-214.4zM393.6 489.6H182.4c-48 0-86.4-38.4-86.4-86.4V188.8c0-48 38.4-86.4 86.4-86.4h214.4c48 0 86.4 38.4 86.4 86.4v214.4c-3.2 48-41.6 86.4-89.6 86.4z m-211.2-320c-12.8 0-22.4 9.6-22.4 19.2v214.4c0 12.8 9.6 22.4 22.4 22.4h214.4c12.8 0 22.4-9.6 22.4-22.4V188.8c0-12.8-9.6-22.4-22.4-22.4H182.4zM841.6 937.6h-214.4c-48 0-86.4-38.4-86.4-86.4v-214.4c0-48 38.4-86.4 86.4-86.4h214.4c48 0 86.4 38.4 86.4 86.4v214.4c0 48-38.4 86.4-86.4 86.4z m-211.2-323.2c-12.8 0-22.4 9.6-22.4 22.4v214.4c0 12.8 9.6 22.4 22.4 22.4h214.4c12.8 0 22.4-9.6 22.4-22.4v-214.4c0-12.8-9.6-22.4-22.4-22.4h-214.4zM393.6 937.6H182.4c-48 0-86.4-38.4-86.4-86.4v-214.4c0-48 38.4-86.4 86.4-86.4h214.4c48 0 86.4 38.4 86.4 86.4v214.4c-3.2 48-41.6 86.4-89.6 86.4zM182.4 614.4c-12.8 0-22.4 9.6-22.4 22.4v214.4c0 12.8 9.6 22.4 22.4 22.4h214.4c12.8 0 22.4-9.6 22.4-22.4v-214.4c0-12.8-9.6-22.4-22.4-22.4H182.4z" fill="#333333" ></path></symbol><symbol id="icon-zhongshi" viewBox="0 0 1024 1024"><path d="M973.53044 167.133265l-65.609003 50.468463A491.226376 491.226376 0 0 0 522.971405 33.282123C253.074841 33.282123 34.74388 247.370807 34.378166 512.220525c-0.365714 265.142289 218.550389 480.108685 488.593239 480.108686 211.016691 0 390.728306-131.291147 459.189873-315.245039a9.069695 9.069695 0 0 0-5.851416-11.775975l-65.82843-22.308523a9.435408 9.435408 0 0 0-11.775975 5.485702 392.48373 392.48373 0 0 1-92.525516 141.896839 402.650566 402.650566 0 0 1-282.915965 115.12661c-54.125598 0-106.495772-10.386263-155.793952-30.793077a398.627717 398.627717 0 0 1-212.845258-209.188123 383.779749 383.779749 0 0 1-31.451361-152.868244c0-53.1016 10.532549-104.374633 31.451361-152.868243 20.114243-46.738186 49.005609-88.795238 85.723245-124.85459a401.260854 401.260854 0 0 1 282.915965-115.12661c54.052456 0 106.422629 10.459406 155.720809 30.866219a398.627717 398.627717 0 0 1 159.52423 120.100314l-69.997565 53.686742a9.069695 9.069695 0 0 0 3.437707 16.091394l204.287562 49.151895c5.851416 1.316569 11.556547-2.998851 11.556547-8.777124l0.950855-206.554986a9.508551 9.508551 0 0 0-15.213681-7.167985z" fill="#000000" ></path></symbol><symbol id="icon-log" viewBox="0 0 1024 1024"><path d="M288 64c70.692 0 128 57.308 128 128 0 58.192-38.833 107.315-91.998 122.867L324 571.5h225c48.8 0 84.134-19.864 110.1-62.009 15.655-25.408 27.76-58.805 36.092-100.127C648.71 390.177 616 344.408 616 291c0-70.692 57.308-128 128-128 70.692 0 128 57.308 128 128 0 62.814-45.245 115.06-104.923 125.925-9.94 52.391-25.407 95.81-46.677 130.334-38.644 62.721-96.365 95.58-169.189 96.231l-2.211 0.01H324l0.002 65.633c52.52 15.363 91.052 63.486 91.98 120.75L416 832c0 70.692-57.308 128-128 128-70.692 0-128-57.308-128-128 0-58.193 38.833-107.315 91.999-122.868V314.868C198.833 299.315 160 250.193 160 192c0-70.692 57.308-128 128-128z" fill="#333333" ></path></symbol></svg>',function(l){var t=(t=document.getElementsByTagName("script"))[t.length-1],c=t.getAttribute("data-injectcss"),t=t.getAttribute("data-disable-injectsvg");if(!t){var i,o,e,a,h,n=function(t,c){c.parentNode.insertBefore(t,c)};if(c&&!l.__iconfont__svg__cssinject__){l.__iconfont__svg__cssinject__=!0;try{document.write("<style>.svgfont {display: inline-block;width: 1em;height: 1em;fill: currentColor;vertical-align: -0.1em;font-size:16px;}</style>")}catch(t){console&&console.log(t)}}i=function(){var t,c=document.createElement("div");c.innerHTML=l._iconfont_svg_string_4550268,(c=c.getElementsByTagName("svg")[0])&&(c.setAttribute("aria-hidden","true"),c.style.position="absolute",c.style.width=0,c.style.height=0,c.style.overflow="hidden",c=c,(t=document.body).firstChild?n(c,t.firstChild):t.appendChild(c))},document.addEventListener?~["complete","loaded","interactive"].indexOf(document.readyState)?setTimeout(i,0):(o=function(){document.removeEventListener("DOMContentLoaded",o,!1),i()},document.addEventListener("DOMContentLoaded",o,!1)):document.attachEvent&&(e=i,a=l.document,h=!1,d(),a.onreadystatechange=function(){"complete"==a.readyState&&(a.onreadystatechange=null,s())})}function s(){h||(h=!0,e())}function d(){try{a.documentElement.doScroll("left")}catch(t){return void setTimeout(d,50)}s()}}(window);

================================================
FILE: archive/ktransformers/website/src/assets/iconfont/iconfont.json
================================================
{
  "id": "4550268",
  "name": "Lexllama",
  "font_family": "iconfont",
  "css_prefix_text": "icon-",
  "description": "Lexllama开源项目使用",
  "glyphs": [
    {
      "icon_id": "11372665",
      "name": "复制",
      "font_class": "copy",
      "unicode": "e8b0",
      "unicode_decimal": 59568
    },
    {
      "icon_id": "34202237",
      "name": "箭头下",
      "font_class": "arrow-down",
      "unicode": "e85e",
      "unicode_decimal": 59486
    },
    {
      "icon_id": "7766233",
      "name": "进度",
      "font_class": "usage-progress",
      "unicode": "e651",
      "unicode_decimal": 58961
    },
    {
      "icon_id": "38865122",
      "name": "环形进度条",
      "font_class": "gen-progress",
      "unicode": "e617",
      "unicode_decimal": 58903
    },
    {
      "icon_id": "577406",
      "name": "向左1",
      "font_class": "back",
      "unicode": "e779",
      "unicode_decimal": 59257
    },
    {
      "icon_id": "1920286",
      "name": "点",
      "font_class": "point",
      "unicode": "e608",
      "unicode_decimal": 58888
    },
    {
      "icon_id": "8866967",
      "name": "编辑",
      "font_class": "edit",
      "unicode": "e7dd",
      "unicode_decimal": 59357
    },
    {
      "icon_id": "10199175",
      "name": "删除",
      "font_class": "delete",
      "unicode": "e614",
      "unicode_decimal": 58900
    },
    {
      "icon_id": "1010111",
      "name": "上传",
      "font_class": "upload-1",
      "unicode": "e618",
      "unicode_decimal": 58904
    },
    {
      "icon_id": "351773",
      "name": "探索-选中",
      "font_class": "explore",
      "unicode": "e621",
      "unicode_decimal": 58913
    },
    {
      "icon_id": "564941",
      "name": "ellipsis",
      "font_class": "ellipsis",
      "unicode": "e657",
      "unicode_decimal": 58967
    },
    {
      "icon_id": "1048859",
      "name": "发送",
      "font_class": "sent",
      "unicode": "e60c",
      "unicode_decimal": 58892
    },
    {
      "icon_id": "1304951",
      "name": "列表",
      "font_class": "list-list",
      "unicode": "e62d",
      "unicode_decimal": 58925
    },
    {
      "icon_id": "8676284",
      "name": "列表",
      "font_class": "list-icon",
      "unicode": "e639",
      "unicode_decimal": 58937
    },
    {
      "icon_id": "22290034",
      "name": "重试",
      "font_class": "zhongshi",
      "unicode": "e6bd",
      "unicode_decimal": 59069
    },
    {
      "icon_id": "22961085",
      "name": "Fork 记录",
      "font_class": "log",
      "unicode": "e826",
      "unicode_decimal": 59430
    }
  ]
}


================================================
FILE: archive/ktransformers/website/src/components/chat/index.vue
================================================
<template>
  <div class="chat-panel">
    <!-- <div class="chat-model">{{ activeAssistant?.model }}</div> -->
    <div class="chat-panel-inner flex-column">
      <div class="chat-init flex-unit flex-column" v-if="isNotChating">
        <div class="assistant-info flex-column flex-unit">
          <div class="avatar">
            <img src="../../../public/images/avatar.png" />
          </div>
          <div class="name">
            {{ activeAssistant.name }}
          </div>
          <div class="desc">
            {{ activeAssistant.description }}
          </div>
        </div>
      </div>
      <div class="chat-msg flex-unit" v-else>
        <ul>
          <li
            class="chat-msg-item flex-row"
            v-for="(msg, index) in localMessages"
            :key="index"
          >
            <div class="avatar" v-if="msg.role == 'user'">
              <img src="../../../public/images/user-filling.png" />
            </div>
            <div class="avatar" v-else>
              <img src="../../../public/images/avatar.png" />
            </div>
            <div class="msg flex-unit">
              <div class="title flex-row">
                <div class="name">{{ msg.role }}</div>
                <div class="time flex-row">
                  {{ timeFormat(msg.created_at) }}
                </div>
              </div>
              <div
                class="content"
                v-html="markedText(msg.content)"
                ref="content_Ref"
              ></div>
              <div class="copy-btn flex-row" v-show="msgBttnBoxShow[index]">
                <i
                  class="iconfont icon-copy"
                  @click="copy(createText(msg.content))"
                ></i>
              </div>
            </div>
          </li>
        </ul>
      </div>
      <div class="scroll-box" v-show="showScrollButton" @click="scrollToBottom">
        <i class="iconfont icon-arrow-down"></i>
      </div>
      <div class="chat-send">
        <div
          class="chat-box flex-row"
          :style="{ height: textareaHeight + 'px' }"
          ref="chatBox_Ref"
        >
          <button @click="StopOutput" class="stop-btn" v-show="isRunning">
            stop
          </button>
          <textarea
            name="chat-input"
            class="chat-input flex-unit"
            :placeholder="inputPlaceholder"
            v-model="inputQuestion"
            @keydown="keyBoardCommitQuestion"
            :disabled="inputDisabled"
            :style="{ height: textareaHeight + 'px' }"
            @input="handleInput"
            ref="textarea_ref"
            maxlength="2000"
            cols="20"
          ></textarea>
          <i class="iconfont icon-sent" @click="clickCommitQuestion"></i>
        </div>
      </div>
    </div>
  </div>
</template>

<script lang="ts">
import {
  defineComponent,
  nextTick,
  PropType,
  ref,
  watch,
  computed,
  onMounted,
} from "vue";
import { IThread, IMessageData, IAssistant } from "@/utils/types";
import { marked } from "marked";
import { createMessage } from "@/api/message";
import { createRun, cancelRun } from "@/api/run";
import { getAssistant } from "@/api/assistant";
import { createThread } from "@/api/thread";
import BScroll from "better-scroll";
import { useRouter, useRoute } from "vue-router";
import { useI18n } from "vue-i18n";
import { ElMessage } from "element-plus";
import { tr } from "element-plus/es/locale";
import copy from "@/utils/copy";
export default defineComponent({
  name: "ChatChat",
  props: {
    messages: {
      type: Array as PropType<IMessageData[]>,
      required: true,
    },
    chatInit: {
      type: Boolean,
      required: true,
    },
    activeAssistant: {
      type: Object as PropType<IAssistant>,
      required: true,
    },
    activeThread: {
      type: Object as PropType<IThread>,
      required: true,
    },
    inputDisabled: {
      type: Boolean,
      default: false,
    },
  },
  setup(props, context) {
    const { t } = useI18n();
    const router = useRouter();
    const route = useRoute();
    const localMessages = ref<IMessageData[]>([...props.messages]);
    const showScrollButton = ref(false);
    const messageScroll = ref<BScroll | null>(null);
    const inputQuestion = ref<string>("");
    const inputDisabled = ref(false);
    const msgBttnBoxShow = ref<boolean[]>([]);
    const answer = ref("");
    const activeThread = ref<IThread>({} as IThread);
    const activeAssistant = ref<IAssistant>({} as IAssistant);
    const isNotChating = ref(true);
    const isRunning = ref(false);
    const stopRunId = ref<string>("");
    const shouldContinueReceiving = ref(true);
    const textareaHeight = ref(48);
    const chatBox_Ref = ref();
    const textarea_ref = ref();
    const content_Ref = ref();
    // Boolean if go
    isNotChating.value = props.chatInit;
    activeThread.value = props.activeThread;
    activeAssistant.value = props.activeAssistant;
    watch(
      () => props.messages,
      (newMessages) => {
        localMessages.value = [...newMessages];
        msgBttnBoxShow.value = new Array(newMessages.length).fill(true);
      }
    );
    watch(
      () => props.inputDisabled,
      (newValue) => {
        inputDisabled.value = newValue;
      }
    );
    // Update scrollbars and scrolling events
    watch(
      () => localMessages.value,
      (newMessages) => {
        if (messageScroll.value) {
          scrollToTop();
          messageScroll.value.destroy();
          messageScroll.value = null;
        }
        if (!isNotChating.value) {
          nextTick(() => {
            messageScroll.value = new BScroll(".chat-msg", {
              click: true,
              mouseWheel: true,
              probeType: 3, //Only when set to 3 can the event of scrolling binding be triggered
            });
          });
        }
      },
      {
        immediate: true,
        deep: true,
      }
    );
    watch(
      () => messageScroll.value,
      (newValue) => {
        if (newValue) {
          messageScroll.value?.on("scroll", handleScroll);
          showScrollButton.value = false;
          scrollToBottom();
        }
      }
    );
    watch(
      () => props.chatInit,
      (newValue) => {
        isNotChating.value = newValue;
      }
    );
    watch(
      () => props.activeThread,
      (newValue) => {
        activeThread.value = newValue;
      }
    );
    watch(
      () => props.activeAssistant,
      (newValue) => {
        activeAssistant.value = newValue;
      }
    );

    const handleInput = (event:any) => {
      adjustHeight();
      const maxLength = 2000; 
      if (inputQuestion.value?.length > maxLength) {
        event.preventDefault(); 
        inputQuestion.value = inputQuestion.value.substring(0, maxLength); 
      }
    };
    const adjustHeight = () => {
      const currentScrollTop = textarea_ref.value.scrollTop;
      textarea_ref.value.style.height = textarea_ref.value.scrollHeight + "px";
      chatBox_Ref.value.style.height = textarea_ref.value.style.height;
      textarea_ref.value.scrollTop = currentScrollTop;
    };

    const inputPlaceholder = computed(() => {
      if (typeof activeAssistant.value.name != "undefined") {
        return replaceAssistant(t("chat.inputTip"), activeAssistant.value.name);
      } else {
        return t("chat.inputTip");
      }
    });
    // Block events
    const StopOutput = async () => {
      shouldContinueReceiving.value = false;
      try {
        const response = await cancelRun(
          activeThread.value.id,
          stopRunId.value
        );
        if (!response.ok) {
          console.error("Failed to cancel run");
        }
      } catch (error) {
        console.error("Failed to cancel run:", error);
      }
    };
    // dialogue
    const commitQuestion: () => void = async () => {
      const question = inputQuestion.value;
      // If it came in by clicking on assistants without clicking on thread, or through preview
      if (Object.keys(activeThread.value).length == 0) {
        try {
          let res = {} as IThread;
          // If you click thread and do not select assistant
          if (route.name == "preview") {
            let metadata = {
              hidden: "true",
            };
            res = await createThread(undefined, undefined, metadata);
          } else {
            res = await createThread();
          }
          activeThread.value = res;
        } catch (err) {
          console.error(err);
        }
      }
      //If you click thread and do not select assistant
      else if (Object.keys(activeAssistant.value).length == 0) {
        try {
          const messageOfAssistant = props.messages.find(
            (message) => message.role === "assistant"
          );
          if (messageOfAssistant && messageOfAssistant.assistant_id) {
            const res = await getAssistant(messageOfAssistant.assistant_id);
            activeAssistant.value = res;
          }
        } catch (err) {
          console.error(err);
        }
      }
      if (question) {
        inputQuestion.value = "";
        textareaHeight.value = 48;
        // inputDisabled.value = true;
        isNotChating.value = false;
        isRunning.value = true;
        await createMessage(activeThread.value.id, question)
          .then((res: any) => {})
          .catch((err: any) => {
            ElMessage({
              type: "warning",
              message: "Request error",
            });
            return;
          });
        // Current message queue insertion issue
        localMessages.value.push({
          role: "user",
          content: [
            { type: "text", text: { value: question }, annotatons: [] },
          ],
          created_at: Date.now() / 1000,
        });
        msgBttnBoxShow.value.push(true);
        // Insert answer into the current message queue
        localMessages.value.push({
          role: "assistant",
          content: [{ type: "text", text: { value: "" }, annotatons: [] }],
          created_at: Date.now() / 1000,
        });
        msgBttnBoxShow.value.push(false);
        try {
          const asyncGenerator = createRun(
            {
              assistant_id: activeAssistant.value.id,
              stream: true,
            },
            activeThread.value.id
          );
          for await (const word of asyncGenerator) {
            if (!shouldContinueReceiving.value) {
              break;
            }
            if (word.length == 36) {
              stopRunId.value = word;
              console.log(stopRunId.value);
            } else {
              answer.value += word;
              const index = localMessages.value.length - 1;
              localMessages.value[index].content[0].text.value += word;
              if (answer.value.length <= 3) {
                localMessages.value[index].created_at = Date.now() / 1000;
              }
            }
          }
        } catch (err) {
          console.error(err);
        }
        shouldContinueReceiving.value = true;
        answer.value = "";
        inputDisabled.value = false;
        msgBttnBoxShow.value[msgBttnBoxShow.value.length - 1] = true;
        scrollToBottom();
        isRunning.value = false;
        context.emit("updateAssistant", true);
        textarea_ref.value.focus();
      }
    };
    // Keyboard event stabilization
    const keyBoardCommitQuestion = (event: any) => {
      const question = inputQuestion.value?.trim();
      if (event.keyCode === 13) {
        event.preventDefault();

        const cursorPosition = event.target.selectionStart;
        if ((event.metaKey || event.ctrlKey) && question) {
          event.target.value =
            event.target.value.substring(0, cursorPosition) +
            "\n" +
            event.target.value.substring(cursorPosition);
          event.target.selectionStart = event.target.selectionEnd =
            cursorPosition + 1;
          adjustHeight();
          return;
        }
        if (!question) {
          ElMessage({
            message: "Please enter the content!",
            type: "warning",
            plain: true,
          });
          return;
        }
        if (!isRunning.value) {
          commitQuestion();
          inputQuestion.value = "";
        }
      }
    };
    const clickCommitQuestion = () => {
      if (!isRunning.value && inputQuestion.value?.trim() != "") {
        commitQuestion();
        return;
      }
      ElMessage({
        message: "Please enter the content!",
        type: "warning",
        plain: true,
      });
    };
    //Bottom scrolling
    const scrollToBottom = () => {
      //If messageScroll. value exists
      if (messageScroll.value) {
        //Call the scrollTo method of messageScroll. value and scroll to the bottom
        messageScroll.value.scrollTo(0, messageScroll.value?.maxScrollY, 800);
      }
    };
    // Top scrolling
    const scrollToTop = () => {
      if (messageScroll.value) {
        messageScroll.value.scrollTo(0, messageScroll.value?.minScrollY, 800);
      }
    };
    // Handling rolling events
    const handleScroll = (pos: any) => {
      if (messageScroll.value) {
        const distanceToBottom =
          messageScroll.value.y - messageScroll.value.maxScrollY;
        showScrollButton.value = distanceToBottom > 100;
      }
    };
    // Replace characters

    function replaceAssistant(input: string, newString: string) {
      return input.replace(/assistant/g, newString);
    }
    // Extract the markup text to convert the passed in object array into an HTML string parsed by market.js
    const markedText = (content: object[]) => {
      let context = "";
      for (const item of content) {
        if ((item as { type: string }).type === "text") {
          context += ((item as { text: object }).text as { value: string })
            .value;
        }
      }
      return marked.parse(context);
    };
    // Extract text content
    const createText = (content: object[]) => {
      let context = "";
      for (const item of content) {
        if ((item as { type: string }).type === "text") {
          context += ((item as { text: object }).text as { value: string })
            .value;
        }
      }
      return context;
    };
    // Time formatting
    const timeFormat = (timestamp: number | undefined) => {
      if (!timestamp) {
        return "";
      }
      const date = new Date(timestamp * 1000);
      // Obtain various time sections
      const year = date.getFullYear();
      const month = String(date.getMonth() + 1).padStart(2, "0"); // The month starts from 0 and needs to be increased by 1, with zeros added
      const day = String(date.getDate()).padStart(2, "0"); // Zero padding
      const hours = String(date.getHours()).padStart(2, "0"); // Zero padding
      const minutes = String(date.getMinutes()).padStart(2, "0"); // Zero padding
      const seconds = String(date.getSeconds()).padStart(2, "0"); // Zero padding
      // Format as "YYYY-MM-DD HH: mm: ss"
      const formattedDate = `${year}-${month}-${day} ${hours}:${minutes}:${seconds}`;
      return formattedDate;
    };
    onMounted(() => {
      adjustHeight();
    });
    return {
      inputQuestion,
      inputDisabled,
      msgBttnBoxShow,
      localMessages,
      textareaHeight,
      answer,
      StopOutput,
      isNotChating,
      handleInput,
      chatBox_Ref,
      adjustHeight,
      content_Ref,
      markedText,
      timeFormat,
      createText,
      inputPlaceholder,
      keyBoardCommitQuestion,
      clickCommitQuestion,
      messageScroll,
      showScrollButton,
      commitQuestion,
      scrollToBottom,
      scrollToTop,
      isRunning,
      copy,
      replaceAssistant,
      textarea_ref,
    };
  },
});
</script>

<style scoped lang="stylus">
@import '@/assets/css/mixins.styl';

.chat-panel {
  justify-content: center;
  display: flex;
  position: relative;
  height: 100%;

  .chat-model {
    font-size: 16px;
    font-weight: bold;
    position: absolute;
    top: 20px;
    left: 30px;
  }

  .chat-panel-inner {
    width: 920px;
    padding-top: 80px;
  }

  .chat-init {
    padding: 0 20px;

    .assistant-info {
      text-align: center;
      align-items: center;
      justify-content: center;

      .avatar img {
        width: 70px;
        height: 70px;
      }

      .name {
        margin: 40px 0;
        font-size: 20px;
        font-weight: bold;
      }

      .desc {
        color: $gray_40;
      }
    }

    .assistant-tips {
      margin-bottom: 80px;

      .tips-item {
        width: 44%;
        height: 70px;
        line-height: 70px;
        float: left;
        border: 1px solid $border_gray_light_normal;
        border-radius: 8px;
        margin-top: 10px;
        margin-bottom: 10px;
        padding: 0 20px;
        color: $gray_40;

        &:nth-child(odd) {
          margin-left: 4%;
          margin-right: 4%;
        }

        &:nth-child(even) {
          margin-right: 4%;
        }

        .tips-ops {
          display: none;
          width: 24px;
          height: 24px;
          line-height: 24px;
          border-radius: 4px;
          text-align: center;
          border: 1px solid $border_gray_light_normal;

          i {
            font-size: 20px;
          }
        }

        &:hover {
          cursor: pointer;
          background-color: $bg_gray_light_hover;

          .tips-ops {
            display: block;
            background-color: #FFFFFF;
          }
        }
      }
    }
  }

  .chat-msg {
    overflow-y: hidden;

    ul {
      li.chat-msg-item {
        margin-bottom: 40px;
        align-items: flex-start !important;
        // border: 1px solid;
        border-radius: 15px;
        padding: 20px;
        margin-right: 20px;
        background-color: #313344;
        box-shadow: 12.5px 12.5px 10px rgba(0, 0, 0, 0.035), 10px 10px 8px rgba(0, 0, 0, 0.07);

        .avatar {
          margin-right: 15px;
          width: 36px;
          height: 36px;

          img {
            width: 100%;
            height: 100%;
            border-radius: 25px;
          }
        }

        .msg {
          .title {
            display: flex;
            align-items: center;
            justify-content: space-between;
            margin-bottom: 12px;
            height: 36px;
            line-height: 24px;

            .time {
              justify-content: center;
              // margin-bottom: 12px;
              line-height: 20px;
              font-size: 14px;
              color: $gray_80;
            }

            .name {
              color: #edf2ea;
              font-size: 16px;
              font-weight: bold;
              margin-right: 15px;
            }

            .tips {
              font-size: 14px;
              color: $gray_50;
            }
          }

          .content {
            max-width: 829px;
            color: #edf2ea;
            font-size: 14px;
            line-height: 20px;
            word-wrap: break-word;
            margin-bottom: 12px;
          }

          .copy-btn {
            margin-top: 10px;
            justify-content: left;

            i {
              font-size: 20px;
              color: $gray_70;

              &:hover {
                cursor: pointer;
                color: $gray_50;

                .tips-ops {
                  display: block;
                  background-color: #FFFFFF;
                }
              }
            }
          }
        }
      }
    }
  }

  .chat-send {
    width: 900px;
    padding: 40px 0;
    position: relative;

    .chat-box {
      width: 100%;
      height: auto;
      min-height: 48px;
      max-height: 192px !important;
      border: none;
      border-radius: 15px;
      background: white;
      line-height: 48px;

      // overflow: hidden;
      .chat-input {
        height: auto;
        min-width: 900px;
        max-height: 192px !important;
        width: 100%;
        border: none;
        overflow-anchor: auto;
        overflow-x: hidden;
        overflow-y: auto;
        resize: none;
        background: white;
        display: inline-block;
      }

      .chat-input::-webkit-scrollbar {
        width: 10px;
      }

      .chat-input::-webkit-scrollbar-track {
        background-color: #f1f1f1;
      }

      .chat-input::-webkit-scrollbar-thumb {
        background-color: #888;
        border-radius: 5px;
      }

      .chat-input::-webkit-scrollbar-thumb:hover {
        background-color: #555;
      }

      .chat-input::-webkit-resizer {
        display: none;
      }

      .stop-btn {
        border: none;
        width: 60px;
        position: absolute;
        right: 50%;
        transform: translateX(50%);
        top: -40px;
        -webkit-border-radius: 50;
        -moz-border-radius: 50;
        border-radius: 50px;
        font-family: Arial;
        color: #ffffff;
        font-size: 16px;
        background: #cacdd1;
        padding: 10px 15px 10px 15px;
        text-decoration: none;
      }

      .stop-btn:hover {
        background: #8080e1;
        text-decoration: none;
        cursor: pointer;
      }
    }
  }
}

.scroll-box {
  position: absolute;
  bottom: 130px;
  right: 50%;
  transform: translateX(50%);
  margin: 0 auto;
  width: 32px;
  height: 32px;
  border-radius: 16px;
  border: 1px solid $gray_80;
  background-color: var(--el-bg-color-overlay);
  box-shadow: var(--el-box-shadow-lighter);
  text-align: center;
  line-height: 32px;
  color: #1989fa;

  i {
    font-size: 24px;
    color: $gray_60;
  }

  &:hover {
    cursor: pointer;
    background-color: $bg_gray_light_hover;

    i {
      color: $gray_50;
    }
  }
}
</style>

================================================
FILE: archive/ktransformers/website/src/conf/config.ts
================================================
declare global {
    interface Window {
      configWeb: {
        apiUrl: string;
        port: string;
       };
     }
  }

export const baseURL = window.configWeb.apiUrl;
export const basePort = window.configWeb.port;


================================================
FILE: archive/ktransformers/website/src/locals/en.js
================================================
// en.js
export default {
    home: {
        explore: 'Explore',
        language: 'Choose Language',
        english: 'English',
        chinese: 'Chinese',
        today: 'Today',
        previous:'Previous',
        withoutAssistantTip:'The KTransformers of this record has been deleted. The user can only view historical conversation information and cannot continue the conversation!',
        deleteThreadTip:'Deleting records will clear historical information~'
    },
    chat:{
        inputTip:"Send a message and chat with the KTransformers ~",
    },
    explore:{
        description: "Based on Lexllama, let’s create your own KTransformers~",
        configuring: "Configuring",
        completed: "Completed",
        assistantName: "Name",
        assistantDescription: "Description",
        assistantStatus: "Status",
        createAssistant: "Create New KTransformers",
        deleteAssistant: "Are you sure to delete this? After deleting the KTransformers, its KVCache will also be cleared simultaneously~",
    },
    config:{
        title:'Configure your KTransformers',
        fileTip:"Only support text, docx, .ppt, .pdf format.",
        reConfigTip:'Reconfig KTransformers needs to delete kvcache, please choose carefully',
        secletFile:'Select Files',
        outOfSize:'File size exceeds 10MB, please reselect',
        fileExist:'The file already exists, please reselect',
        createAssistant:'Assistant created successfully, click the build button to start building KVCache',
    },
    build:{
        title:'Building Logs',
        step1:'Parse uploded files',
        parsingFileStep1:'File upload and reception completed',
        parsingFileStep2:{
            parse:"Parsing",
            file:"file(s)",
            total:'total',
        },
        parsingFileStep3:'Prompt loaded, ready to generate KVCache',
        step2:'Generate KVCache',
        generateStep1:'Generate KVCache calculation plan',
        generateStep2:{
            calculate:"calculating",
            token:"tokens",
            total:'total',
        },
        generateStep3:'KVCache has been generated successfully',
        durationTime:'Duration:',
        remainTime:'Time left:',
        buildProgress:'Building Progress',
        storageUsage:'KVCache Storage Usage',
    }
}


================================================
FILE: archive/ktransformers/website/src/locals/index.js
================================================
// index.js
import { createI18n } from 'vue-i18n'
import zh from './zh'
import en from './en'

const messages = {
  en,
  zh,
}
const language = (navigator.language || 'en').toLocaleLowerCase() // 这是获取浏览器的语言
const i18n = createI18n({
  legacy: false, // you must set `false`, to use Compostion API
  locale: localStorage.getItem('lang') || language.split('-')[0] || 'en', // 首先从缓存里拿，没有的话就用浏览器语言，
  fallbackLocale: 'en', // 设置备用语言
  messages, 
})

export default i18n

================================================
FILE: archive/ktransformers/website/src/locals/zh.js
================================================
// zh.js
export default {
    home: {
        explore: '探索',
        language: '选择语言',
        english: '英语',
        chinese: '中文',
        today: '今天',
        previous:'历史',
        withoutAssistantTip:'本记录的KTransformers已被删除，用户只能查看历史对话信息而无法继续对话!',
        deleteThreadTip:'删除记录会清除历史信息哦～'
    },
    chat:{
        inputTip:"发送信息和 KTransformers 畅聊吧～",
    },
    explore:{
        description: "基于Lexllama，一起来创建你的专属KTransformers吧~",
        configuring: "配置中",
        completed: "完成",
        assistantName: "名称",
        assistantDescription: "描述",
        assistantStatus: "Status",
        createAssistant: "创建新的KTransformers",
        deleteAssistant: "是否确认删除KTransformers，删除KTransformers之后其KVCache也会被同步清理掉哦~",
    },
    config:{
        title:'配置你的KTransformers',
        fileTip:"仅支持上传文件格式为 .text, docx, .ppt, .pdf format.",
        secletFile:'选择文件',
        outOfSize:'文件大小超出10MB，请重新选择',
        fileExist:'文件已存在，请重新选择',
        createAssistant:'KTransformers创建成功，点击build按钮开始构建KVCache',
    },
    build:{
        title:'构建日志',
        step1:'解析上传文件',
        parsingFileStep1:'文件上传接收完成',
        parsingFileStep2:{
            parse:"正在解析第",
            file:"文件",
            total:'共',
        },
        parsingFileStep3:'Prompt装载完毕，准备生成KVCache',
        step2:'生成 KVCache',
        generateStep1:'生成KVCache计算计划',
        generateStep2:{
            calculate:"正在计算",
            token:"tokens",
            total:'共',
        },
        generateStep3:'KVCache已生成完成',
        durationTime:'持续时间：',
        remainTime:'剩余时间：',
        buildProgress:'构建进度',
        storageUsage:'存储使用：',
        
    }
}


================================================
FILE: archive/ktransformers/website/src/main.ts
================================================
import { createApp } from 'vue'
import App from './App.vue'
import router from './router'
import store from './store'
import ElementPlus from 'element-plus'
import 'element-plus/dist/index.css'
import VueApexCharts from "vue3-apexcharts"
import i18n from '@/locals'

const app = createApp(App)

app.use(ElementPlus)

app.use(i18n)
app.use(VueApexCharts)
app.use(store)
app.use(router)
app.mount('#app')


================================================
FILE: archive/ktransformers/website/src/router/index.ts
================================================
import { createRouter, createWebHashHistory, RouteRecordRaw, createWebHistory } from 'vue-router'
import HomeView from '@/views/home.vue'

const routes: Array<RouteRecordRaw> = [
  {
    path: '/',
    name: 'home',
    component: HomeView,
    redirect: '/chat',
    children: [{
      path: '/chat',
      name: '',
      component: () => import(/* webpackChunkName: "about" */ '../components/chat/index.vue')
    },]
  },

]

const router = createRouter({
  history: createWebHashHistory(),
  routes
})

export default router


================================================
FILE: archive/ktransformers/website/src/shims-vue.d.ts
================================================
/* eslint-disable */
declare module '*.vue' {
  import type { DefineComponent } from 'vue'
  const component: DefineComponent<{}, {}, any>
  export default component
  
}

declare module '@/locals'
declare module 'pdfobject';


================================================
FILE: archive/ktransformers/website/src/store/index.ts
================================================
import { createStore } from 'vuex'

export default createStore({
  state: {
  },
  getters: {
  },
  mutations: {
  },
  actions: {
  },
  modules: {
  }
})


================================================
FILE: archive/ktransformers/website/src/utils/copy.ts
================================================
import { ElMessage } from "element-plus";
const copy = (value: string) => {
  //Try using the navigator.clipboard.writeText method
  if (navigator.clipboard && window.isSecureContext) {
    navigator.clipboard.writeText(value)
      .then(() => {
        //Using ElMessage to Display Success Messages in Windows Systems
        if (navigator.appVersion.includes("Win")) {
          ElMessage({
            message: "内容复制成功!",
            type: "success",
            plain: true,
          });
        } else {
          //Using custom DOM elements to display success messages in macOS system
          showCopySuccessMessage();
        }
      })
      .catch(() => {
        //Using ElMessage to Display Failure Messages in Windows Systems
        if (navigator.appVersion.includes("Win")) {
          ElMessage({
            message: "内容复制失败!",
            type: "error",
            plain: true,
          });
        } else {
          //Using custom DOM elements to display failure messages in macOS system
          showCopyErrorMessage();
        }
      });
  } else {
    const textarea = document.createElement("textarea");
    textarea.value = value;
    document.body.appendChild(textarea);
    textarea.select();
    try {
      const successful = document.execCommand('copy');
      if (successful) {
        if (navigator.appVersion.includes("Win")) {
          ElMessage({
            message: "内容复制成功!",
            type: "success",
            plain: true,
          });
        } else {
          showCopySuccessMessage();
        }
      } else {
        if (navigator.appVersion.includes("Win")) {
          ElMessage({
            message: "内容复制失败!",
            type: "error",
            plain: true,
          });
        } else {
          showCopyErrorMessage();
        }
      }
    } catch (err) {
      if (navigator.appVersion.includes("Win")) {
        ElMessage({
          message: "内容复制失败!",
          type: "error",
          plain: true,
        });
      } else {
        showCopyErrorMessage();
      }
    }
    document.body.removeChild(textarea);
  }
};

function showCopySuccessMessage() {
  const messageElement = document.createElement('div');
  messageElement.textContent = '内容复制成功!';
  messageElement.style.position = 'fixed';
  messageElement.style.bottom = '10px';
  messageElement.style.left = '50%';
  messageElement.style.transform = 'translateX(-50%)';
  messageElement.style.padding = '10px';
  messageElement.style.backgroundColor = '#4CAF50';
  messageElement.style.color = 'white';
  messageElement.style.borderRadius = '15px';
  messageElement.style.zIndex = '1000';
  document.body.appendChild(messageElement);
  setTimeout(() => {
    document.body.removeChild(messageElement);
  }, 3000);
}

function showCopyErrorMessage() {
  const messageElement = document.createElement('div');
  messageElement.textContent = '内容复制失败!';
  messageElement.style.position = 'fixed';
  messageElement.style.bottom = '10px';
  messageElement.style.left = '50%';
  messageElement.style.transform = 'translateX(-50%)';
  messageElement.style.padding = '10px';
  messageElement.style.backgroundColor = '#F44336';
  messageElement.style.color = 'white';
  messageElement.style.borderRadius = '5px';
  messageElement.style.zIndex = '1000';
  document.body.appendChild(messageElement);
  setTimeout(() => {
    document.body.removeChild(messageElement);
  }, 3000);
}

export default copy;

================================================
FILE: archive/ktransformers/website/src/utils/types.ts
================================================
export interface IAssistant {
  id: string;
  object: string;
  created_at: number;
  name?: string;
  description?: string;
  model: string;
  instructions?: string;
  tools: any[];
  tool_resources?: object;
  metadata?:{[key:string]:any}
  top_p?: number;
  temperature?: number;
  response_format: string | object;
}

export interface IAssistantWithStatus {
  build_status:{status:string}
  id: string;
  object: string;
  created_at: number;
  name?: string;
  description?: string;
  model: string;
  instructions?: string;
  tools: any[];
  tool_resources?: object;
  metadata?:{[key:string]:any}
  top_p?: number;
  temperature?: number;
  response_format: string | object;
}

export interface IMessage {
  id: string;
  object: string;
  created_at: number;
  thread_id: string;
  status: string;
  incomplete_details?: object;
  completed_at?: number;
  incomplete_at?: number;
  role: string;
  content: any[];
  assistant_id?: string;
  run_id?: string;
  attachments?: any[];
  metadata:{[key:string]:any}
}

export interface IThread {
  id: string;
  object: string;
  created_at: number;
  tool_resources?: object;
  metadata?:{[key:string]:any}
}

export interface IRun {
  id: string;
  object: string;
  created_at: number;
  thread_id: string,
  assistant_id: string,
  status: string,
  required_action?: object,
  last_error?: object,
  expires_at?: number,
  started_at?: number,
  cancelled_at?: number,
  failed_at?: number,
  completed_at?: number,
  incomplete_details?: object,
  model: string,
  instructions: string,
  tools: any[],
  metadata: Map<string, string>,
  usage?: object,
  temperature?: number,
  top_p?: number,
  max_prompt_tokens?: number,
  max_completion_tokens?: number,
  truncation_strategy: object,
  tool_choice: string | object,
  response_format: string | object,
}

export interface IFile {
  id: string,
  bytes: number,
  created_at: number,
  filename: string,
  object: string,
  purpose: string,
}

export interface IMessageData {
  role: string;
  content: any[];
  created_at?: number;
  assistant_id?: string,
}

export interface IThreadAndMessageAndAssistant {

  thread: IThread;
  first_message: IMessage;
  assistant: IAssistantWithStatus
}
export interface IDeleteResult {
  id: string;
  object: string;
  deleted: boolean;
}
export interface IBuildData {
  parsed_file_count:number;
  total_file_count:number;
  prefilling_current:number;
  prefilling_total:number;
  build_completed_time:number;
  build_started_time:number;
  storage_total:number;
  storage_usage:number;
  status:string
}

================================================
FILE: archive/ktransformers/website/src/views/home.vue
================================================
<template>
  <div class="home flex-row">
    <nav class="left-panel flex-column">
      <div class="logo-box">
        <div class="logo flex-row">
          <img class="img" src="../../public/images/three.png" />
          <span class="text">{{ projectName }}</span>
        </div>
        <div class="version">{{ projectVersion }}</div>
      </div>
      <div class="divider"></div>
      <div class="assistant-box">
        <div class="assistant-list">
          <ul>
            <li
              class="assistant-item flex-row"
              v-for="(item, index) in assistantList"
              :key="index"
              @click="setActiveAssistant(item)"
            >
              <img src="../../public/images/avatar.png" />
              <span class="name flex-unit">{{ item.name }}</span>
              <i class="iconfont icon-edit"></i>
            </li>
          </ul>
        </div>
      </div>
      <div class="divider"></div>
      <!-- History area -->
      <div class="history-box flex-unit">
        <div class="">
          <div class="date">{{ $t("home.today") }}</div>
          <ul>
            <li
              v-for="(item, index) in todayThreads"
              :key="index"
              class="chat-item"
              :class="{ active: activeThreadIndex === index }"
              @click="setActiveThreadIndex(index)"
            >
              <div class="chat-abbr">
                {{ firstMessages[index] }}
              </div>
              <div class="chat-ops flex-row">
                <img src="../../public/images/avatar.png" />
                <div class="name flex-unit">
                  {{ assistantOfThread[index].name || "" }}
                </div>
                <i class="iconfont icon-delete" @click="delThread(index)"></i>
              </div>
            </li>
          </ul>
          <div class="date" v-if="previousThreads.length > 0">
            {{ $t("home.previous") }}
          </div>
          <ul>
            <li
              v-for="(item, index) in previousThreads"
              :key="index"
              class="chat-item"
              :class="{
                active: activeThreadIndex === index + todayThreads.length,
              }"
              @click="setActiveThreadIndex(index + todayThreads.length)"
            >
              <div class="chat-abbr">
                {{ firstMessages[index + todayThreads.length] }}
              </div>
              <div class="chat-ops flex-row">
                <img src="../../public/images/avatar.png" />
                <div class="name flex-unit">
                  {{
                    assistantOfThread[index + todayThreads.length].name || ""
                  }}
                </div>
                <i
                  class="iconfont icon-delete"
                  @click="delThread(index + todayThreads.length)"
                ></i>
              </div>
            </li>
          </ul>
        </div>
      </div>
      <div class="icon-box example-2">
        <div class="iconhub icon-content" @click="navigateToIconHub">
          <svg
            xmlns="http://www.w3.org/2000/svg"
            width="16"
            height="16"
            fill="currentColor"
            class="bi bi-github"
            viewBox="0 0 16 16"
            xml:space="preserve"
          >
            <path
              d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27s1.36.09 2 .27c1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.01 8.01 0 0 0 16 8c0-4.42-3.58-8-8-8"
              fill="currentColor"
            ></path>
          </svg>
          <div class="tooltip">GitHub</div>
        </div>
        <div class="iconlanguage" @click="changeLanguage">
          <svg
            v-if="!flag"
            t="1719306572024"
            class="icon"
            viewBox="0 0 1024 1024"
            version="1.1"
            xmlns="http://www.w3.org/2000/svg"
            p-id="16849"
            data-spm-anchor-id="a313x.search_index.0.i21.366e3a81tz0TYS"
            width="18"
            height="18"
          >
            <path
              d="M64.064 768V192H448.64v64H127.936v192h320v64h-320v192h320v64H64.064z m511.872 0V192h64l256 447.68V192h64v576h-64l-256-447.168V768h-64z"
              p-id="16850"
              data-spm-anchor-id="a313x.search_index.0.i22.366e3a81tz0TYS"
              class="selected"
              fill="#000000"
            ></path>
          </svg>
          <svg
            v-else
            t="1719306494614"
            class="icon"
            viewBox="0 0 1024 1024"
            version="1.1"
            xmlns="http://www.w3.org/2000/svg"
            p-id="12325"
            width="18"
            height="18"
          >
            <path
              d="M1023.488 831.552h-96l-265.472-451.904c-8.96-12.8-16-25.344-21.44-37.888H638.08c2.176 12.992 3.2 40.128 3.2 81.408v408.32L576 836.928V256h101.568l257.024 445.632c14.592 20.992 23.232 34.368 25.92 40.128h1.6c-2.688-16.512-4.032-44.8-4.032-84.736v-399.36L1024 256l-0.512 575.552zM435.008 804.224c-42.752 21.76-96.384 32.64-160.896 32.64-83.2 0-149.76-25.6-199.488-76.736C24.896 708.928 0 641.344 0 557.12c0-90.432 27.968-163.2 84.032-218.368C140.032 283.52 211.072 256 297.344 256c55.552 0 101.376 7.616 137.6 22.848v75.84a284.992 284.992 0 0 0-136.832-33.408c-64.768 0-117.504 20.864-158.208 62.592-40.768 41.728-61.184 98.048-61.184 168.96 0 67.2 19.008 120.576 57.024 160.128 38.016 39.552 87.744 59.328 149.248 59.328 57.536 0 107.52-12.544 150.016-37.76v69.696z"
              fill="#000000"
              p-id="12326"
              data-spm-anchor-id="a313x.search_index.0.i16.366e3a81tz0TYS"
              class="selected"
            ></path>
          </svg>
        </div>
      </div>
    </nav>
    <router-view v-slot="{ Component }" class="main-panel flex-unit">
      <component
        :is="Component"
        :chatInit="chatInit"
        :activeAssistant="activeAssistant"
        :activeThread="activeThread"
        :messages="allMessageInCurrentThread"
        :completedAssistant="assistantList"
        :inputDisabled="inputDisabled"
        @updateAssistant="handleUpdateAssistant"
      />
    </router-view>
  </div>
</template>

<script lang="ts">
import { defineComponent, ref, onMounted, computed, nextTick } from "vue";
import {
  IThread,
  IAssistant,
  IMessageData,
  IThreadAndMessageAndAssistant,
  IAssistantWithStatus,
} from "@/utils/types";
import { listThreads, deleteThread, getThread } from "@/api/thread";
import { ElMessage, ElMessageBox } from "element-plus";
import { listAssistants } from "@/api/assistant";
import { listMessages } from "@/api/message";
import { useRouter } from "vue-router";
import BScroll from "better-scroll";
import { useI18n } from "vue-i18n";

export default defineComponent({
  name: "HomeView",
  setup() {
    const assistantList = ref<IAssistant[]>([]);
    const threadsList = ref<IThread[]>([]);
    const firstMessages = ref<string[]>([]);
    const activeAssistant = ref({} as IAssistant);
    const assistantOfThread = ref<IAssistantWithStatus[]>([]);
    const threadAndMessages = ref<IThreadAndMessageAndAssistant[]>([]);
    const assistantScroll = ref<BScroll | null>(null);
    const historyScroll = ref<BScroll | null>(null);
    const router = useRouter();
    const { t, locale } = useI18n();
    const flag = ref(true);
    const changeLanguage = () => {
      if (flag.value) {
        locale.value = "zh";
        localStorage.setItem("lang", "zh");
        flag.value = false;
      } else {
        locale.value = "en";
        flag.value = true;
        localStorage.setItem("lang", "en");
      }
    };
    // Initialize data
    const initData = async () => {
      try {
        threadsList.value = [];
        firstMessages.value = [];
        assistantOfThread.value = [];

        const assistantsRes = await listAssistants();
        if (assistantsRes && assistantsRes.length > 0) {
          assistantList.value = assistantsRes;
          activeAssistant.value = assistantsRes[0];
        }

        const threadsRes = await listThreads(100);
        if (threadsRes) {
          threadAndMessages.value = threadsRes;
          for (let t of threadsRes) {
            if (t.thread && !t.thread.metadata?.hidden) {
              threadsList.value.push(t.thread);
              if (
                t.first_message &&
                t.first_message.content &&
                t.first_message.content.length > 0
              ) {
                firstMessages.value.push(t.first_message.content[0].text.value);
              } else {
                firstMessages.value.push("no message yet");
              }
              assistantOfThread.value.push(
                t.assistant || ({} as IAssistantWithStatus)
              );
            }
          }
        }

        assistantScroll.value = new BScroll(".assistant-list", {
          click: true,
          mouseWheel: true,
          scrollbar: {
            fade: true,
            interactive: true,
          },
        });

        historyScroll.value = new BScroll(".history-box", {
          click: true,
          mouseWheel: true,
          scrollbar: {
            fade: true,
            interactive: true,
          },
        });
      } catch (err) {
        console.error("Failed to initialize data:", err);
      }
    };
    const navigateToIconHub = () => {
      window.open("https://github.com/kvcache-ai/Lexllama");
    };
    const isEmptyObject = (obj: object): boolean => {
      //Determine if the object is empty
      return Object.keys(obj).length === 0;
    };
    //Jump route
    const navigateToExplore = () => {
      router.push("/explore");
    };
    const navigatorToChat = () => {
      router.push("/chat");
    };
    // Calculate date
    const todayThreads = computed(() => {
      const today = Math.floor(Date.now() / 1000);
      return threadsList.value.filter((thread) => {
        return today - thread.created_at <= 86400;
      });
    });
    const previousThreads = computed(() => {
      const today = Math.floor(Date.now() / 1000);
      return threadsList.value.filter((thread) => {
        return today - thread.created_at > 86400;
      });
    });

    onMounted(async () => {
      initData();
    });

    return {
      t,
      flag,
      assistantList,
      isEmptyObject,
      activeAssistant,
      navigateToExplore,
      navigatorToChat,
      threadsList,
      firstMessages,
      navigateToIconHub,
      assistantScroll,
      historyScroll,
      assistantOfThread,
      changeLanguage,
      initData,
      todayThreads,
      previousThreads,
    };
  },
  data() {
    return {
      projectName: "KTransformers",
      projectVersion: "v0.01",
      activeThreadIndex: -1,
      chatInit: true,
      activeThread: {} as IThread,
      allMessageInCurrentThread: [] as IMessageData[],
      inputDisabled: false,
      isSettingActiveThread: false,
      isDeletingThread: false,
      threadAndMessages: <IThreadAndMessageAndAssistant[]>[],
    };
  },
  methods: {
    setActiveAssistant(assistant: IAssistant) {
      this.chatInit = true;
      this.inputDisabled = false;
      this.activeThreadIndex = -1;
      this.activeAssistant = assistant;
      this.activeThread = {} as IThread;
      this.allMessageInCurrentThread = [];
      if (this.$route.path != "/chat") {
        this.navigatorToChat();
      }
    },
    async setActiveThreadIndex(index: number) {
      //If setting up an active thread, return directly
      if (this.isSettingActiveThread) {
        return;
      }
      this.isSettingActiveThread = true;
      this.activeThreadIndex = index;
      this.chatInit = false;
      this.inputDisabled = false;
      this.activeAssistant = {} as IAssistant;
      this.activeThread = this.threadsList[index];
      //If the assistant of the current thread is an empty object
      if (this.isEmptyObject(this.assistantOfThread[index])) {
        ElMessage({
          message: this.t("home.withoutAssistantTip"),
          type: "warning",
        });
        this.inputDisabled = true;
      }
      try {
        //Call asynchronous function to obtain the message list of the current thread
        const res = await listMessages(this.activeThread.id, 100, "asc");
        //Convert the obtained message list to the specified format and assign values to all messages of the current thread
        this.allMessageInCurrentThread = res.map((m) => ({
          role: m.role,
          content: m.content,
          assistant_id: m.assistant_id,
          created_at: m.created_at,
        }));
      } catch (err) {
        console.log(err);
      } finally {
        this.isSettingActiveThread = false;
      }
      if (this.$route.path != "/chat") {
        this.navigatorToChat();
      }
    },

    async delThread(index: number) {
      // If the thread is currently being deleted, return directly
      if (this.isDeletingThread) {
        return;
      }
      this.isDeletingThread = true;
      try {
        //Pop up a confirmation box and ask the user if they are sure to delete the thread
        await ElMessageBox.confirm(this.t("home.deleteThreadTip"), "Warning", {
          confirmButtonText: "OK",
          cancelButtonText: "Cancel",
          type: "warning",
        });

        const res = await deleteThread(this.threadsList[index].id);
        this.threadsList.splice(index, 1);
        this.firstMessages.splice(index, 1);
        this.assistantOfThread.splice(index, 1);
        // Jump to the first assistant or other suitable page
        this.setActiveAssistant(this.assistantList[0]);
        ElMessage({
          type: "success",
          message: "Delete completed",
        });
      } catch (err) {
        // Specific error handling, such as logging or displaying specific error messages to users
        console.error("Delete session failed:", err);
        ElMessage({
          type: "error",
          message: `Delete failed`, // Display specific error messages
        });
      } finally {
        this.isDeletingThread = false; //Ensure that the delete thread flag is reset no matter what
      }
    },
    // Handles the update of the assistant asynchronously.
    async handleUpdateAssistant(value: any) {
      await this.initData();
      if (this.activeThreadIndex != -1) {
        this.setActiveThreadIndex(this.activeThreadIndex);
      } else if (this.activeAssistant.id) {
        this.setActiveThreadIndex(0);
      } else {
        this.setActiveAssistant(this.assistantList[0]);
      }
    },
  },
});
</script>


<style lang="stylus" rel="stylesheet/stylus" scoped>
@import '../assets/css/mixins.styl';

.home {
  width: 100%;
  height: 100%;
  position: relative;
}

.left-panel {
  width: 320px;
  height: 100%;
  background-color: #363433;
  padding: 30px 30px;
  .logo-box {
    .logo {
      .img {
        width: 36px;
        height: 36px;
      }

      .text {
        font-size: 28px;
        font-weight: bold;
        margin-left: 10px;
        color: #edf2ea;
      }
    }

    .version {
      text-align: right;
      font-size: 14px;
      color: #bdbdbd;
    }
  }

  .divider {
    border-bottom: 1px solid #D7D7D7;
    width: 30%;
    margin: 30px auto;
  }

  .lang-box {
    position: relative;
    width: 100%;
    height: 30px;
    margin: auto;
    margin-bottom: 10px;

    .el-dropdown {
      font-size: 14px;
      position: absolute;
      top: 50%;
      left: 50%;
      transform: translate(-50%, -50%);
    }
  }

  .assistant-box {
    .assistant-list {
      min-height: 50px;
      max-height: 300px;
      overflow: hidden;
      position: relative;

      ul > li.assistant-item {
        padding: 8px 15px;
        color: #edf2ea;

        img {
          width: 32px;
          height: 32px;
        }

        .name {
          margin-left: 12px;
          font-size: 14px;
          color: #edf2ea;
        }

        i.iconfont {
          display: none;
          margin-left: 10px;
        }

        &:hover {
          background-color: $bg_gray_light_hover;
          cursor: pointer;
          border-radius: 4px;

          .name {
            color: #313433;
          }

          i.iconfont {
            display: block;
          }
        }
      }
    }

    .explore {
      position: relative;
      justify-content: center;
      display: flex;
      margin-top: 10px;

      .explore-btn {
        margin: 0 auto;
        padding: 0 20px;
        justify-content: center;
        height: 32px;
        line-height: 32px;
        background-color: #FFFFFF;
        border: 1px solid RGBA(0, 0, 0, 0.15);
        border-radius: 16px;

        i {
          color: #8080FF;
        }

        .text {
          color: #7F7F7F;
          margin-left: 4px;
        }

        &:hover {
          background-color: #FAFAFA;
          cursor: pointer;
        }
      }
    }
  }

  .history-box {
    position: relative;

    .date {
      font-size: 14px;
      color: #7F7F7F;
      margin: 8px 0;

      &:first-child {
        margin-top: 0;
      }
    }

    li.chat-item {
      padding: 12px 15px;
      cursor: pointer;
      background-color: #edf2ea;
      border-radius: 4px;
      margin-bottom: 10px;
      font-size: 16px;

      .chat-abbr {
        font-size: 14px;
        color: #313433;
        white-space: nowrap;
        overflow: hidden;
        text-overflow: ellipsis;
      }

      .chat-ops {
        display: flex;
        margin-top: 5px;

        img {
          width: 16px;
          height: 16px;
        }

        .name {
          font-size: 12px;
          color: #898989;
          margin-left: 8px;
        }

        i.iconfont {
          color: $gray_60;
        }
      }

      &:hover, &.active {
        transition: 0.3s all;
        cursor: pointer;
        background-color: #a2a79f;
        .chat-abbr {
          color: black;
        }

        .name, i.iconfont {
          color: black;
        }
      }
    }
  }

  .icon-box {
    width: 100%;
    display: flex;
    flex-direction: row;
    justify-content: flex-end;
    align-items: center;

    .iconhub {
      width: 32px;
      height: 24px;
      background: white;
      font-size: 30px;
      border: none;
      ovferflow: hidden;
      border-radius: 15%;
      display: flex;
      flex-direction: column;
      justify-content: center;
      align-items: center;
      color: #898989;
      transition: all 0.5s;
      cursor: pointer;
    }

    .iconhub:hover {
      background: #e5e5e5;
      text-decoration: none;
    }

    .iconlanguage {
      margin-left: 15px;
      width: 32px;
      height: 24px;
      background: white;
      font-size: 30px;
      border: none;
      ovferflow: hidden;
      border-radius: 15%;
      display: flex;
      flex-direction: column;
      justify-content: center;
      align-items: center;
      color: #898989;
      transition: all 0.5s;
      cursor: pointer;
    }

    .iconlanguage:hover {
      background: #e5e5e5;
      text-decoration: none;
    }
  }
}

ul {
  list-style: none;
}

.example-2 {
  display: flex;
  justify-content: center;
  align-items: center;
}

.example-2 .icon-content {
  margin: 0 10px;
  position: relative;
}

.example-2 .icon-content .tooltip {
  position: absolute;
  top: -30px;
  left: 50%;
  transform: translateX(-50%);
  color: #fff;
  padding: 6px 10px;
  border-radius: 5px;
  opacity: 0;
  visibility: hidden;
  font-size: 14px;
  transition: all 0.3s ease;
}

.example-2 .icon-content:hover .tooltip {
  opacity: 1;
  visibility: visible;
  top: -50px;
}

.main-panel {
  height: 100%;
  background-color: #f1f0ed;
}
</style>


================================================
FILE: archive/ktransformers/website/tests/unit/example.spec.ts
================================================
import { shallowMount } from '@vue/test-utils'
import HelloWorld from '@/components/HelloWorld.vue'

describe('HelloWorld.vue', () => {
  it('renders props.msg when passed', () => {
    const msg = 'new message'
    const wrapper = shallowMount(HelloWorld, {
      props: { msg }
    })
    expect(wrapper.text()).toMatch(msg)
  })
})


================================================
FILE: archive/ktransformers/website/tsconfig.json
================================================
{
  "compilerOptions": {
    "target": "es5",
    "module": "esnext",
    "strict": true,
    "jsx": "preserve",
    "importHelpers": true,
    "moduleResolution": "node",
    "skipLibCheck": true,
    "esModuleInterop": true,
    "allowSyntheticDefaultImports": true,
    "forceConsistentCasingInFileNames": true,
    "useDefineForClassFields": true,
    "sourceMap": true,
    "allowJs": true,
    "baseUrl": ".",
    "types": [
      "webpack-env",
      "jest"
    ],
    "paths": {
      "@/*": [
        "src/*"
      ]
    },
    "lib": [
      "esnext",
      "dom",
      "dom.iterable",
      "scripthost"
    ]
  },
  "include": [
    "src/**/*.ts",
    "src/**/*.tsx",
    "src/**/*.vue",
    "tests/**/*.ts",
    "tests/**/*.tsx",
    "config.d.ts"
  ],
 
  "exclude": [
    "node_modules"
  ]
}

================================================
FILE: archive/ktransformers/website/vue.config.js
================================================

module.exports = {
  // 配置 webpack-dev-server 行为。
  devServer: {
    open: false, // 编译后默认打开浏览器
    host: '0.0.0.0',  // 域名
    port: 8082,  // 端口
    https: false,  // 是否https
    proxy: {
        '/api': {
          target: 'http://localhost:9016/v1', // 你的后端服务器地址
          changeOrigin: true, // 是否允许跨域
          pathRewrite: {
            '/api': '' // 将 '/api' 前缀替换为空，如果你的后端不需要这个前缀
          }
        }
      }
},
publicPath: '/web/',  // 基本路径
outputDir: 'dist', // 构建时的输出目录
assetsDir: 'static', // 放置静态资源的目录
indexPath: 'index.html', // html 的输出路径
filenameHashing: true, // 文件名哈希值
lintOnSave: false, // 是否在保存的时候使用 `eslint-loader` 进行检查。

// 组件是如何被渲染到页面中的？ （ast：抽象语法树；vDom：虚拟DOM）
// template ---> ast ---> render ---> vDom ---> 真实的Dom ---> 页面
// runtime-only：将template在打包的时候，就已经编译为render函数
// runtime-compiler：在运行的时候才去编译template
runtimeCompiler: false,

transpileDependencies: [], // babel-loader 默认会跳过 node_modules 依赖。
productionSourceMap: false, // 是否为生产环境构建生成 source map

//调整内部的 webpack 配置
configureWebpack: () => {},

chainWebpack: () => {},
  
}

================================================
FILE: archive/merge_tensors/merge_safetensor_gguf.py
================================================
# this script targets to merge the fp8 safe tensor and the gguf quantized tensors.

import os
# insert the path of the project
import sys
# sys.path.insert(0, "/home/azure/ktransformers")
import argparse
import torch
from ktransformers.util.custom_loader import GGUFLoader, translate_name_to_gguf
from safetensors import safe_open
from safetensors.torch import save_file
import re
from collections import defaultdict

def read_safetensor_keys_from_folder(folder_path)->dict:
    """    
    :param folder_path: folder path
    :return: key_to_file_map
    """
    # check if the folder path is exist
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"GGUF dir not found: {folder_path}")
    if os.path.isfile(folder_path):
        folder_path = os.path.dirname(folder_path)
    
    key_to_file_map = {}

    found_safetensor = False
    for root, dirs, files in os.walk(folder_path):
        # sort files
        files = sorted(files)
        for file in files:
            if file.endswith(".safetensors"):
                found_safetensor = True
                file_path = os.path.join(root, file)
                try:
                    with safe_open(file_path, framework="pt") as f:
                        for key in f.keys():
                            if "model.layers.61" in key:
                                # skip MTP layer
                                continue
                            # try:
                            #     if int(key.split('.')[2]) > 4:
                            #         continue
                            # except:
                            #     pass
                            key_to_file_map[key] = file_path
                except Exception as e:
                    print(f"Error reading Safetensor file {file_path}: {e}")
    
    if not found_safetensor:
        raise FileNotFoundError(f"No Safetensor files found in {folder_path}")
    
    return key_to_file_map

tensor_from_gguf = [] # todo: add keys in gguf that should be used in the final tensor

def translate_name(name:str)->str:
    """
    :param name: name of the tensor
    :return: translated name
    """
    name = translate_name_to_gguf(name)
    name = name.replace(".up_proj.", ".ffn_up_exps.")
    name = name.replace(".down_proj.", ".ffn_down_exps.")
    name = name.replace(".gate_proj.", ".ffn_gate_exps.")
    name = name.replace(".ffn_gate_inp.e_score_correction_bias", ".exp_probs_b.bias") 
    return name
    

def combine_tensor_sources(safetensor_path:str, gguf_path:str):
    gguf_loader = GGUFLoader(gguf_path)
    gguf_tensor_file_map = gguf_loader.tensor_file_map
    safetensor_tensor_file_map = read_safetensor_keys_from_folder(safetensor_path)
    
    # build a map for the key to the tensor
    # according to the key, we can get the tensor from the file
    
    target_tensor_map = {}
    for key in safetensor_tensor_file_map.keys():
        # for all experts, we use the gguf tensor
        if ".mlp.experts." in key:
            if '.weight_scale_inv' in key:
                continue
            key = '.'.join(key.split('.')[:5]+key.split('.')[-2:])
            translated_key = translate_name(key)
            target_tensor_map[key] = gguf_tensor_file_map[translated_key]
            continue
        
        if any(target_key in key for target_key in tensor_from_gguf):
            target_tensor_map[key] = gguf_tensor_file_map[translate_name(key)]
        else:
            target_tensor_map[key] = safetensor_tensor_file_map[key]
    
    return target_tensor_map, gguf_loader

def write_combined_tensor(target_tensor_map: dict, output_path: str, gguf_loader: GGUFLoader):
    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)
    
    # Cache for safetensor file handles and GGUF loaders
    safetensors_cache = {}
    gguf_cache = {}
    
    # Group tensors by layer
    layer_groups = defaultdict(list)
    non_layer_keys = []
    layer_pattern = re.compile(r'\.layers\.(\d+)\.')
    
    for key in target_tensor_map:
        match = layer_pattern.search(key)
        if match:
            layer_num = int(match.group(1))
            layer_groups[layer_num].append(key)
        else:
            non_layer_keys.append(key)
    
    # Calculate total shards
    total_shards = len(layer_groups) + (1 if non_layer_keys else 0) - 1
    if total_shards == 0:
        raise ValueError("No tensors to save")
    
    shard_idx = 0
    
    # Save non-layer tensors to the first shard if they exist
    if non_layer_keys:
        tensors = {}
        for key in non_layer_keys:
            file_path = target_tensor_map[key]
            tensor = None
            ggml_type = None
            if file_path.endswith('.safetensors'):
                if file_path not in safetensors_cache:
                    safetensors_cache[file_path] = safe_open(file_path, framework='pt')
                f = safetensors_cache[file_path]
                tensor = f.get_tensor(key)
            elif file_path.endswith('.gguf'):
                gguf_name = translate_name(key)
                tensor, ggml_type = gguf_loader.get_undequanted_tensor_and_ggml_type(gguf_name)
            else:
                raise ValueError(f"Unsupported file format: {file_path}")
            tensors[translate_name(key)] = tensor
            if ggml_type:
                ggml_type = torch.tensor(ggml_type)
                ggml_key = translate_name(key)[:-7] + ".ggml_type" if translate_name(key).endswith(".weight") else translate_name(key) + ".ggml_type"
                tensors[ggml_key] = ggml_type
        
        output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors")
        print(f"Saving non-layer tensors to {output_file}")
        save_file(tensors, output_file)
        print(tensors.keys())

        shard_idx += 1
    
    # Save each layer's tensors to subsequent shards
    for layer_num in sorted(layer_groups.keys()):
        layer_keys = layer_groups[layer_num]
        tensors = {}
        for key in layer_keys:
            file_path = target_tensor_map[key]
            tensor = None
            ggml_type = None
            if file_path.endswith('.safetensors'):
                if file_path not in safetensors_cache:
                    safetensors_cache[file_path] = safe_open(file_path, framework='pt')
                f = safetensors_cache[file_path]
                tensor = f.get_tensor(key)
                tensor_info = tensor.shape
            elif file_path.endswith('.gguf'):
                gguf_name = translate_name(key)
                tensor, ggml_type = gguf_loader.get_undequanted_tensor_and_ggml_type(gguf_name)
                # tensor_info = gguf_loader.tensor_info[gguf_name]
                # ggml_type = gguf_loader.tensor_info[gguf_name]['ggml_type']
            else:
                raise ValueError(f"Unsupported file format: {file_path}")
            tensors[translate_name(key)] = tensor
            if ggml_type:
                ggml_type = torch.tensor(ggml_type)
                ggml_key = translate_name(key)[:-7] + ".ggml_type" if translate_name(key).endswith(".weight") else translate_name(key) + ".ggml_type"
                tensors[ggml_key] = ggml_type
        
        output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors")
        print(f"Saving layer {layer_num} to {output_file}")
        # print(tensors.keys())
        save_file(tensors, output_file)
        shard_idx += 1
    
    return
    
def main():
    # 创建命令行参数解析器
    parser = argparse.ArgumentParser(description="Read parameters from Safetensor and GGUF files")
    parser.add_argument("--safetensor_path", type=str, help="Path to the Safetensor file", default="/mnt/data/model/DeepSeek-V3")
    parser.add_argument("--gguf_path", type=str, help="Path to the GGUF file", default="/mnt/data/model/DeepseekV3-q4km-gguf")
    parser.add_argument("--output_path", type=str, help="Path to the output file", default="/mnt/data/model/ktrans-safetensors/DeepSeek-V3-q4km-fp8")
    
    # print all the arguments
    print("All the arguments:")
    print(parser.parse_args())
    
    # 解析命令行参数
    args = parser.parse_args()

    safetensor_path = args.safetensor_path
    gguf_path = args.gguf_path
    output_path = args.output_path
    
    target_tensor_map, gguf_loader = combine_tensor_sources(safetensor_path, gguf_path)
    write_combined_tensor(target_tensor_map, output_path, gguf_loader)
    
    return

if __name__ == "__main__":
    main()


================================================
FILE: archive/merge_tensors/merge_safetensor_gguf_for_qwen3.py
================================================
# coding=utf-8
# Copyright (c) 2025. Huawei Technologies Co., Ltd. All rights reserved.
# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import argparse
import torch
from ktransformers.util.custom_loader import GGUFLoader, translate_name_to_gguf
from safetensors import safe_open
from safetensors.torch import save_file
import re
from collections import defaultdict

def read_safetensor_keys_from_folder(folder_path) -> dict:
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Safetensors dir not found: {folder_path}")
    if os.path.isfile(folder_path):
        folder_path = os.path.dirname(folder_path)

    key_to_file_map = {}
    found_safetensor = False

    for root, dirs, files in os.walk(folder_path):
        files = sorted(files)
        for file in files:
            if not file.endswith(".safetensors"):
                continue
            found_safetensor = True
            file_path = os.path.join(root, file)
            try:
                with safe_open(file_path, framework="pt") as f:
                    for key in f.keys():
                        key_to_file_map[key] = file_path
            except Exception as e:
                print(f"Error reading Safetensor file {file_path}: {e}")

    if not found_safetensor:
        raise FileNotFoundError(f"No Safetensor files found in {folder_path}")

    return key_to_file_map


# 可选：如果你希望对某些非 MoE tensor 也用 GGUF，可以把关键子串填到下面这个列表里
tensor_from_gguf = []  # e.g. ["self_attn.q_proj.weight"]


def translate_name(name: str) -> str:
    name = translate_name_to_gguf(name)
    name = name.replace(".up_proj.", ".ffn_up_exps.")
    name = name.replace(".down_proj.", ".ffn_down_exps.")
    name = name.replace(".gate_proj.", ".ffn_gate_exps.")
    name = name.replace(".ffn_gate_inp.e_score_correction_bias", ".exp_probs_b.bias")
    return name


def combine_tensor_sources(safetensor_path: str, gguf_path: str):
    gguf_loader = GGUFLoader(gguf_path)
    gguf_tensor_file_map = gguf_loader.tensor_file_map
    safetensor_tensor_file_map = read_safetensor_keys_from_folder(safetensor_path)

    target_tensor_map = {}

    for key, st_file in safetensor_tensor_file_map.items():
        if ".mlp.experts." in key and key.endswith(".weight"):
            parts = key.split(".")
            if len(parts) < 8:
                raise ValueError(f"Unexpected MoE expert key format: {key}")
            norm_key = ".".join(parts[:5] + parts[-2:])

            gguf_name = translate_name(norm_key)
            if gguf_name not in gguf_tensor_file_map:
                raise KeyError(
                    f"[MoE] GGUF tensor not found for safetensors key {key} -> {gguf_name}"
                )
            target_tensor_map[norm_key] = gguf_tensor_file_map[gguf_name]
            continue
        if any(tag in key for tag in tensor_from_gguf):
            gguf_name = translate_name(key)
            if gguf_name not in gguf_tensor_file_map:
                raise KeyError(
                    f"[Non-MoE] GGUF tensor not found for safetensors key {key} -> {gguf_name}"
                )
            target_tensor_map[key] = gguf_tensor_file_map[gguf_name]
        else:
            target_tensor_map[key] = st_file

    return target_tensor_map, gguf_loader


def write_combined_tensor(target_tensor_map: dict, output_path: str, gguf_loader: GGUFLoader):
    os.makedirs(output_path, exist_ok=True)

    safetensors_cache = {}
    layer_groups = defaultdict(list)
    non_layer_keys = []
    layer_pattern = re.compile(r"\.layers\.(\d+)\.")

    for key in target_tensor_map:
        m = layer_pattern.search(key)
        if m:
            layer_num = int(m.group(1))
            layer_groups[layer_num].append(key)
        else:
            non_layer_keys.append(key)

    total_shards = len(layer_groups) + (1 if non_layer_keys else 0) - 1
    if total_shards <= 0:
        raise ValueError("No tensors to save")

    shard_idx = 0

    if non_layer_keys:
        tensors = {}
        for key in non_layer_keys:
            file_path = target_tensor_map[key]
            tensor = None
            ggml_type = None

            if file_path.endswith(".safetensors"):
                if file_path not in safetensors_cache:
                    safetensors_cache[file_path] = safe_open(file_path, framework="pt")
                f = safetensors_cache[file_path]
                tensor = f.get_tensor(key)
            elif file_path.endswith(".gguf"):
                gguf_name = translate_name(key)
                tensor, ggml_type = gguf_loader.get_undequanted_tensor_and_ggml_type(gguf_name)
            else:
                raise ValueError(f"Unsupported file format: {file_path}")

            out_key = translate_name(key)
            tensors[out_key] = tensor
            if ggml_type is not None:
                ggml_type = torch.tensor(ggml_type)
                if out_key.endswith(".weight"):
                    ggml_key = out_key[:-7] + ".ggml_type"
                else:
                    ggml_key = out_key + ".ggml_type"
                tensors[ggml_key] = ggml_type

        output_file = os.path.join(
            output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors"
        )
        print(f"[WRITE] Saving non-layer tensors to {output_file}")
        save_file(tensors, output_file)
        shard_idx += 1

    for layer_num in sorted(layer_groups.keys()):
        layer_keys = layer_groups[layer_num]
        tensors = {}

        for key in layer_keys:
            file_path = target_tensor_map[key]
            tensor = None
            ggml_type = None

            if file_path.endswith(".safetensors"):
                if file_path not in safetensors_cache:
                    safetensors_cache[file_path] = safe_open(file_path, framework="pt")
                f = safetensors_cache[file_path]
                tensor = f.get_tensor(key)
            elif file_path.endswith(".gguf"):
                gguf_name = translate_name(key)
                tensor, ggml_type = gguf_loader.get_undequanted_tensor_and_ggml_type(gguf_name)
            else:
                raise ValueError(f"Unsupported file format: {file_path}")

            out_key = translate_name(key)
            tensors[out_key] = tensor
            if ggml_type is not None:
                ggml_type = torch.tensor(ggml_type)
                if out_key.endswith(".weight"):
                    ggml_key = out_key[:-7] + ".ggml_type"
                else:
                    ggml_key = out_key + ".ggml_type"
                tensors[ggml_key] = ggml_type

        output_file = os.path.join(
            output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors"
        )
        print(f"[WRITE] Saving layer {layer_num} to {output_file}")
        save_file(tensors, output_file)
        shard_idx += 1


def main():
    parser = argparse.ArgumentParser(
        description="Merge FP8 safetensors and GGUF tensors for Qwen3-30B-A3B"
    )
    parser.add_argument(
        "--safetensor_path",
        type=str,
        help="Path to the FP8 Safetensor folder",
        default="/mnt/data/model/Qwen3-30B-A3B-FP8",
    )
    parser.add_argument(
        "--gguf_path",
        type=str,
        help="Path to the GGUF file or folder",
        default="/mnt/data/model/Qwen3-30B-A3B-GGUF",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        help="Path to the output safetensors folder",
        default="/mnt/data/model/ktrans-safetensors/Qwen3-30B-A3B-q4km-fp8",
    )

    args = parser.parse_args()

    print("[ARGS]", args)

    safetensor_path = args.safetensor_path
    gguf_path = args.gguf_path
    output_path = args.output_path

    target_tensor_map, gguf_loader = combine_tensor_sources(safetensor_path, gguf_path)
    write_combined_tensor(target_tensor_map, output_path, gguf_loader)


if __name__ == "__main__":
    main()


================================================
FILE: archive/pyproject.toml
================================================
[build-system]
requires = [
  "setuptools",
  "torch >= 2.3.0", 
  "ninja",
  "packaging",
  "cpufeature"
  ]
build-backend = "setuptools.build_meta"

[project]

name = "ktransformers"

dynamic = ["version"]

dependencies = [
  "torch >= 2.3.0",
  "transformers",
  "fastapi >= 0.111.0",
  "uvicorn >= 0.30.1",
  "langchain >= 0.2.0",
  "blessed >= 1.20.0",
  "accelerate >= 0.31.0",
  "sentencepiece >= 0.1.97",
  "setuptools",
  "ninja",
  "wheel",
  "colorlog",
  "build",
  "fire",
  "protobuf",
]

requires-python = ">=3.10"

authors = [
  {name = "KVCache.AI", email = "zhang.mingxing@outlook.com"}
]

maintainers = [
  {name = "james0zan", email = "zhang.mingxing@outlook.com"},
  {name = "awake", email = "awake@approaching.ai"},
  {name = "unicorn chan", email = "nl@approaching.ai"}
]

description = "KTransformers, pronounced as Quick Transformers, is designed to enhance your Transformers experience with advanced kernel optimizations and placement/parallelism strategies."

readme = "README.md"
license = {file = "LICENSE"}

keywords = ["ktransformers", "llm"]

classifiers = [
  "Development Status :: 4 - Beta",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12"
]

[project.urls]
Homepage = "https://kvcache.ai"
Repository = "https://github.com/kvcache-ai/ktransformers.git"
Issues = "https://github.com/kvcache-ai/ktransformers/issues"


[project.scripts]
ktransformers = "ktransformers.server.main:main"

[tool.setuptools.packages.find]
where = ["./", ]
include = ["ktransformers","ktransformers.*"]
[tool.black]
line-length = 120
preview = true
unstable = true


================================================
FILE: archive/requirements-local_chat.txt
================================================
fire
transformers
numpy
torch>=2.3.0
packaging
cpufeature; sys_platform == 'win32' or sys_platform == 'Windows'
protobuf
tiktoken
blobfile


================================================
FILE: archive/setup.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :
Author       : chenxl
Date         : 2024-07-27 16:15:27
Version      : 1.0.0
LastEditors  : chenxl
LastEditTime : 2024-08-14 16:36:19
Adapted from:
https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
Copyright (c) 2023, Tri Dao.
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''

import os
import sys
import re
import ast
from collections import deque
import subprocess
import select
import time
import platform
import shutil
from typing import List, Optional, Literal
import http.client
import urllib.request
import urllib.error
from pathlib import Path
from packaging.version import parse
import torch
import torch.version
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
from setuptools import setup, Extension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
try:
    from torch_musa.utils.simple_porting import SimplePorting
    from torch_musa.utils.musa_extension import BuildExtension, MUSAExtension, MUSA_HOME
except ImportError:
    MUSA_HOME=None
KTRANSFORMERS_BUILD_XPU = torch.xpu.is_available()


try:
    import torch_npu
    KTRANSFORMERS_BUILD_NPU = torch_npu.npu.is_available()
except:
    KTRANSFORMERS_BUILD_NPU = False

# 检测 DEV_BACKEND 环境变量
dev_backend = os.environ.get("DEV_BACKEND", "").lower()
if dev_backend == "xpu":
    triton_dep = [
        "pytorch-triton-xpu==3.3.0"
    ]
else:
    triton_dep = ["triton>=3.2"]

with_balance = os.environ.get("USE_BALANCE_SERVE", "0") == "1"

class CpuInstructInfo:
    CPU_INSTRUCT = os.getenv("CPU_INSTRUCT", "NATIVE")
    FANCY = "FANCY"
    AVX512 = "AVX512"
    AVX2 = "AVX2"
    CMAKE_NATIVE = "-DLLAMA_NATIVE=ON"
    CMAKE_FANCY = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON -DLLAMA_AVX512_FANCY_SIMD=ON"
    CMAKE_AVX512 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON"
    CMAKE_AVX2 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON"

class VersionInfo:
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    PACKAGE_NAME = "ktransformers"
    BASE_WHEEL_URL:str = (
        "https://github.com/kvcache-ai/ktransformers/releases/download/{tag_name}/{wheel_filename}"
    )
    FORCE_BUILD = os.getenv("KTRANSFORMERS_FORCE_BUILD", "FALSE") == "TRUE"

    def get_musa_bare_metal_version(self, musa_dir):
        raw_output = subprocess.run(
            [musa_dir + "/bin/mcc", "-v"], check=True,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout.decode("utf-8")
        output = raw_output.split()
        release_idx = output.index("version") + 1
        bare_metal_version = parse(output[release_idx].split(",")[0])
        musa_version = f"{bare_metal_version.major}{bare_metal_version.minor}"
        return musa_version

    def get_rocm_bare_metal_version(self, rocm_dir):
        """
        Get the ROCm version from the ROCm installation directory.

        Args:
            rocm_dir: Path to the ROCm installation directory

        Returns:
            A string representation of the ROCm version (e.g., "63" for ROCm 6.3)
        """
        try:
            # Try using rocm_agent_enumerator to get version info
            raw_output = subprocess.check_output(
                [rocm_dir + "/bin/rocminfo", "--version"],
                universal_newlines=True,
                stderr=subprocess.STDOUT)
            # Extract version number from output
            match = re.search(r'(\d+\.\d+)', raw_output)
            if match:
                version_str = match.group(1)
                version = parse(version_str)
                rocm_version = f"{version.major}{version.minor}"
                return rocm_version
        except (subprocess.CalledProcessError, FileNotFoundError):
            # If rocminfo --version fails, try alternative methods
            pass

        try:
            # Try reading version from release file
            with open(os.path.join(rocm_dir, "share/doc/hip/version.txt"), "r") as f:
                version_str = f.read().strip()
                version = parse(version_str)
                rocm_version = f"{version.major}{version.minor}"
                return rocm_version
        except (FileNotFoundError, IOError):
            pass

        # If all else fails, try to extract from directory name
        dir_name = os.path.basename(os.path.normpath(rocm_dir))
        match = re.search(r'rocm-(\d+\.\d+)', dir_name)
        if match:
            version_str = match.group(1)
            version = parse(version_str)
            rocm_version = f"{version.major}{version.minor}"
            return rocm_version

        # Fallback to extracting from hipcc version
        try:
            raw_output = subprocess.check_output(
                [rocm_dir + "/bin/hipcc", "--version"],
                universal_newlines=True,
                stderr=subprocess.STDOUT)
            match = re.search(r'HIP version: (\d+\.\d+)', raw_output)
            if match:
                version_str = match.group(1)
                version = parse(version_str)
                rocm_version = f"{version.major}{version.minor}"
                return rocm_version
        except (subprocess.CalledProcessError, FileNotFoundError):
            pass

        # If we still can't determine the version, raise an error
        raise ValueError(f"Could not determine ROCm version from directory: {rocm_dir}")

    def get_cuda_bare_metal_version(self, cuda_dir):
        raw_output = subprocess.check_output(
            [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
        output = raw_output.split()
        release_idx = output.index("release") + 1
        bare_metal_version = parse(output[release_idx].split(",")[0])
        cuda_version = f"{bare_metal_version.major}{bare_metal_version.minor}"
        return cuda_version

    def get_cuda_version_of_torch(self):
        if KTRANSFORMERS_BUILD_NPU:
            return 'aarch64'
        torch_cuda_version = parse(torch.version.cuda)
        cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}"
        return cuda_version

    def get_platform(self,):
        """
        Returns the platform name as used in wheel filenames.
        """
        if sys.platform.startswith("linux"):
            return f'linux_{platform.uname().machine}'
        elif sys.platform == "win32":
            return "win_amd64"
        else:
            raise ValueError("Unsupported platform: {}".format(sys.platform))

    def get_cpu_instruct(self,):
        if CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.FANCY:
            return "fancy"
        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX512:
            return "avx512"
        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX2:
            return "avx2"
        else:
            print("Using native cpu instruct")
        if sys.platform.startswith("linux"):
            if KTRANSFORMERS_BUILD_NPU:
                return 'aarch64'
            with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f:
                cpuinfo = cpu_f.read()
            flags_line = [line for line in cpuinfo.split(
                '\n') if line.startswith('flags')][0]
            flags = flags_line.split(':')[1].strip().split(' ')
            # fancy with AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI
            for flag in flags:
                if 'avx512bw' in flag:
                    return 'fancy'
            for flag in flags:
                if 'avx512' in flag:
                    return 'avx512'
            for flag in flags:
                if 'avx2' in flag:
                    return 'avx2'
            raise ValueError(
                "Unsupported cpu Instructions: {}".format(flags_line))
        elif sys.platform == "win32":
            from cpufeature.extension import CPUFeature

            if CPUFeature.get("AVX512bw", False):
                return 'fancy'
            if CPUFeature.get("AVX512f", False):
                return 'avx512'
            if CPUFeature.get("AVX2", False):
                return 'avx2'
            raise ValueError(
                "Unsupported cpu Instructions: {}".format(str(CPUFeature)))
        else:
            raise ValueError("Unsupported platform: {}".format(sys.platform))

    def get_torch_version(self,):
        torch_version_raw = parse(torch.__version__)
        torch_version = f"{torch_version_raw.major}{torch_version_raw.minor}"
        return torch_version

    def get_flash_version(self,):
        version_file = os.path.join(
            Path(VersionInfo.THIS_DIR), VersionInfo.PACKAGE_NAME, "__init__.py")
        with open(version_file, "r", encoding="utf-8") as f:
            version_match = re.search(
                r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE)
        flash_version = ast.literal_eval(version_match.group(1))
        return flash_version

    def get_package_version(self, full_version=False):
        flash_version = str(self.get_flash_version())
        torch_version = self.get_torch_version()
        cpu_instruct = self.get_cpu_instruct()
        backend_version = ""
        if CUDA_HOME is not None:
            backend_version = f"cu{self.get_cuda_bare_metal_version(CUDA_HOME)}"
        elif MUSA_HOME is not None:
            backend_version = f"mu{self.get_musa_bare_metal_version(MUSA_HOME)}"
        elif ROCM_HOME is not None:
            backend_version = f"rocm{self.get_rocm_bare_metal_version(ROCM_HOME)}"
        elif torch.xpu.is_available():
            backend_version = f"xpu"
        elif KTRANSFORMERS_BUILD_NPU:
            backend_version = f"npu{torch_npu.__version__}"
        else:
            raise ValueError("Unsupported backend: CUDA_HOME MUSA_HOME ROCM_HOME all not set and XPU is not available.")
        package_version = f"{flash_version}+{backend_version}torch{torch_version}{cpu_instruct}"
        if full_version:
            return package_version
        if not VersionInfo.FORCE_BUILD:
            return flash_version
        return package_version


class BuildWheelsCommand(_bdist_wheel):
    def get_wheel_name(self,):
        version_info = VersionInfo()
        package_version = version_info.get_package_version(full_version=True)
        flash_version = version_info.get_flash_version()
        python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
        wheel_filename = f"{VersionInfo.PACKAGE_NAME}-{package_version}-{python_version}-{python_version}-{version_info.get_platform()}.whl"
        wheel_url = VersionInfo.BASE_WHEEL_URL.format(tag_name=f"v{flash_version}", wheel_filename=wheel_filename)
        return wheel_filename, wheel_url


    def run(self):
        if VersionInfo.FORCE_BUILD:
            super().run()
            return
        wheel_filename, wheel_url = self.get_wheel_name()
        print("Guessing wheel URL: ", wheel_url)
        try:
            urllib.request.urlretrieve(wheel_url, wheel_filename)
            # Make the archive
            # Lifted from the root wheel processing command
            # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85
            if not os.path.exists(self.dist_dir):
                os.makedirs(self.dist_dir)

            impl_tag, abi_tag, plat_tag = self.get_tag()
            archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}"

            wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
            print("Raw wheel path", wheel_path)
            shutil.move(wheel_filename, wheel_path)
        except (urllib.error.HTTPError, urllib.error.URLError, http.client.RemoteDisconnected):
            print("Precompiled wheel not found. Building from source...")
            # If the wheel could not be downloaded, build from source
            super().run()


ANSI_ESCAPE = re.compile(
    r'\033[@-Z\\-_\[\]P]|\033\[[0-?]*[ -/]*[@-~]|\033][^\007\033]*\007|[\000-\037]'
)

def colored(text, color=None, bold=False):
    fmt = []
    if color== 'red':
        fmt.append('31')
    elif color == 'green':
        fmt.append('32')
    if bold:
        fmt.append('1')

    return f"\033[{';'.join(fmt)}m{text}\033[0m"


def split_line(text: str) -> List[str]:
    """Split text into lines based on terminal width."""
    term_width = shutil.get_terminal_size().columns or 80
    if not text.strip():
        return []
    # Split by explicit newlines and wrap long lines
    lines = []
    for line in text.split('\n'):
        while len(line) > term_width:
            lines.append(line[:term_width])
            line = line[term_width:]
        if line:
            lines.append(line)
    return lines


ANSI_ESCAPE = re.compile(
    r'\033[@-Z\\-_\[\]P]|\033\[[0-?]*[ -/]*[@-~]|\033][^\007\033]*\007|[\000-\037]'
)

def colored(text, color=None, bold=False):
    fmt = []
    if color== 'red':
        fmt.append('31')
    elif color == 'green':
        fmt.append('32')
    if bold:
        fmt.append('1')

    return f"\033[{';'.join(fmt)}m{text}\033[0m"


def split_line(text: str) -> List[str]:
    """Split text into lines based on terminal width."""
    term_width = shutil.get_terminal_size().columns or 80
    if not text.strip():
        return []
    # Split by explicit newlines and wrap long lines
    lines = []
    for line in text.split('\n'):
        while len(line) > term_width:
            lines.append(line[:term_width])
            line = line[term_width:]
        if line:
            lines.append(line)
    return lines


def run_command_with_live_tail(ext: str, command: List[str], output_lines: int = 20,
                               refresh_rate: float = 0.1, cwd: Optional[str] = None):
    """
    Execute a script-like command with real-time output of the last `output_lines` lines.

    - during execution: displays the last `output_lines` lines of output in real-time.
    - On success: Clears the displayed output.
    - On failure: Prints the full command output.

    Args:
        ext (str): the name of the native extension currently building.
        command (List[str]): The command to execute, as a list of arguments.
        output_lines (int, optional): Number of terminal lines to display during live output. Defaults to 20.
        refresh_rate (float, optional): Time in seconds between output refreshes. Defaults to 0.1.
        cwd (Optional[str], optional): Working directory to run the command in. Defaults to current directory.
    """
    # Dump all subprocess output without any buffering if stdout is not a terminal
    if not sys.stdout.isatty():
        return subprocess.run(command, cwd=cwd, check=True)
    # Start time for elapsed time calculation
    start = time.time()
    # Buffer for all output
    all_output = []
    write_buffer = deque(maxlen=output_lines)
    # Current number of lines from sub process displayed
    current_lines = 0

    # ANSI escape codes for terminal control
    CLEAR_LINE = '\033[K'
    MOVE_UP = '\033[1A'
    SAVE_CURSOR = '\0337'
    RESTORE_CURSOR = '\0338'
    CLEAR_REMAINING = '\033[J'

    def write_progress(status: Literal['RUNNING', 'SUCCEED', 'FAILED'] = 'RUNNING',
                       new_line: Optional[str] = None):
        """Update terminal display with latest output"""
        nonlocal current_lines, process
        sys.stdout.write(SAVE_CURSOR)
        sys.stdout.write(MOVE_UP * current_lines)
        banner = f"ext={ext} pid={process.pid} status={status.upper()} elapsed=({time.time()-start:.2f}S)\n"
        if status != 'FAILED':
            banner = colored(banner, 'green', bold=True)
        else:
            banner = colored(banner, 'red', bold=True)
        sys.stdout.write(CLEAR_LINE + banner)
        if new_line is not None:
            all_output.append(new_line)
            write_buffer.extend(split_line(ANSI_ESCAPE.sub('', new_line).rstrip()))
        elif status == 'RUNNING':
            sys.stdout.write(RESTORE_CURSOR)
            sys.stdout.flush()
            return

        sys.stdout.write(CLEAR_REMAINING)
        if status == 'RUNNING':
            current_lines = 1 + len(write_buffer)
            for text in write_buffer:
                sys.stdout.write(text + '\n')
        elif status == 'FAILED':
            for text in all_output:
                sys.stdout.write(text)
        sys.stdout.flush()

    # Start subprocess
    sys.stdout.write(colored(f'ext={ext} command={" ".join(str(c) for c in command)}\n', bold=True))
    sys.stdout.flush()
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        cwd=cwd,
        text=True,
        bufsize=1
    )

    try:
        write_progress()
        poll_obj = select.poll()
        poll_obj.register(process.stdout, select.POLLIN)
        while process.poll() is None:
            poll_result = poll_obj.poll(refresh_rate * 1000)
            if poll_result:
                write_progress(new_line=process.stdout.readline())
            else:
                write_progress()

        # Get any remaining output
        while True:
            line = process.stdout.readline()
            if not line:
                break
            write_progress(new_line=line)
    except BaseException as e:
        process.terminate()
        raise e
    finally:
        exit_code = process.wait()
        write_progress(status='SUCCEED' if exit_code == 0 else 'FAILED')


# Convert distutils Windows platform specifiers to CMake -A arguments
PLAT_TO_CMAKE = {
    "win32": "Win32",
    "win-amd64": "x64",
    "win-arm32": "ARM",
    "win-arm64": "ARM64",
}


class CMakeExtension(Extension):
    def __init__(self, name: str, sourcedir: str) -> None:
        super().__init__(name, sources=[])
        print(name, sourcedir)
        self.sourcedir = sourcedir

def get_cmake_abi_args(cmake_args):
    if torch.compiled_with_cxx11_abi():
        cmake_args.append("-D_GLIBCXX_USE_CXX11_ABI=1")
    else:
        cmake_args.append("-D_GLIBCXX_USE_CXX11_ABI=0")
    return cmake_args

class CMakeBuild(BuildExtension):

    def build_extension(self, ext) -> None:
        if not isinstance(ext, CMakeExtension):
            super().build_extension(ext)
            return
        ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)
        extdir = ext_fullpath.parent.resolve()

        # Using this requires trailing slash for auto-detection & inclusion of
        # auxiliary "native" libs

        debug = int(os.environ.get("DEBUG", 0)
                    ) if self.debug is None else self.debug
        cfg = "Debug" if debug else "Release"

        # CMake lets you override the generator - we need to check this.
        # Can be set with Conda-Build, for example.
        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")

        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
        # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code
        # from Python.
        cmake_args = [
            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}",
            f"-DPYTHON_EXECUTABLE={sys.executable}",
            f"-DCMAKE_BUILD_TYPE={cfg}",  # not used on MSVC, but no harm
        ]

        if CUDA_HOME is not None:
            cmake_args += ["-DKTRANSFORMERS_USE_CUDA=ON"]
        elif MUSA_HOME is not None:
            cmake_args += ["-DKTRANSFORMERS_USE_MUSA=ON"]
        elif ROCM_HOME is not None:
            cmake_args += ["-DKTRANSFORMERS_USE_ROCM=ON"]
        elif KTRANSFORMERS_BUILD_XPU:
            cmake_args += ["-DKTRANSFORMERS_USE_XPU=ON", "-DKTRANSFORMERS_USE_CUDA=OFF"]
        elif KTRANSFORMERS_BUILD_NPU:
            cmake_args += ["-DKTRANSFORMERS_USE_NPU=ON", "-DKTRANSFORMERS_USE_CUDA=OFF"]
        else:
            raise ValueError("Unsupported backend: CUDA_HOME, MUSA_HOME, and ROCM_HOME are not set and XPU is not available.")
        
        cmake_args = get_cmake_abi_args(cmake_args)
        # log cmake_args
        print("CMake args:", cmake_args)

        build_args = []
        if "CMAKE_ARGS" in os.environ:
            cmake_args += [
                item for item in os.environ["CMAKE_ARGS"].split(" ") if item]

        if CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.FANCY:
            cpu_args = CpuInstructInfo.CMAKE_FANCY
        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX512:
            cpu_args = CpuInstructInfo.CMAKE_AVX512
        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX2:
            cpu_args = CpuInstructInfo.CMAKE_AVX2
        else:
            cpu_args = CpuInstructInfo.CMAKE_NATIVE

        cmake_args += [
            item for item in cpu_args.split(" ") if item
        ]
        # In this example, we pass in the version to C++. You might not need to.
        cmake_args += [
            f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]
        if self.compiler.compiler_type != "msvc":
            if not cmake_generator or cmake_generator == "Ninja":
                pass
                # try:
                #     import ninja

                #     ninja_executable_path = Path(ninja.BIN_DIR) / "ninja"
                #     cmake_args += [
                #         "-GNinja",
                #         f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
                #     ]
                # except ImportError:
                #     pass

        else:
            # Single config generators are handled "normally"
            single_config = any(
                x in cmake_generator for x in {"NMake", "Ninja"})

            # CMake allows an arch-in-generator style for backward compatibility
            contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})
            if not single_config and not contains_arch and cmake_generator:
                cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]]

            # Multi-config generators have a different way to specify configs
            if not single_config:
                cmake_args += [
                    f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"
                ]
                build_args += ["--config", cfg]

        if sys.platform.startswith("darwin"):
            # Cross-compile support for macOS - respect ARCHFLAGS if set
            archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
            if archs:
                cmake_args += [
                    "-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]

        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            cpu_count = os.cpu_count()
            if cpu_count is None:
                cpu_count = 1
            if hasattr(self, "parallel") and self.parallel:
                build_args += [f"--parallel={self.parallel}"]
            else:
                build_args += [f"--parallel={cpu_count}"]
        print("CMake args:", cmake_args)
        build_temp = Path(ext.sourcedir) / "build"
        print("build_temp:", build_temp)

        if not build_temp.exists():
            build_temp.mkdir(parents=True)
        run_command_with_live_tail(ext.name,
            ["cmake", ext.sourcedir, *cmake_args], cwd=build_temp
        )
        run_command_with_live_tail(ext.name,
            ["cmake", "--build", build_temp, "--verbose", *build_args], cwd=build_temp
        )

if CUDA_HOME is not None or ROCM_HOME is not None:
    ops_module = CUDAExtension('KTransformersOps', [
        'csrc/ktransformers_ext/cuda/custom_gguf/dequant.cu',
        'csrc/ktransformers_ext/cuda/binding.cpp',
        'csrc/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
    ],
    extra_compile_args={
            'cxx': ['-O3', '-DKTRANSFORMERS_USE_CUDA'],
            'nvcc': [
                '-O3',
                # '--use_fast_math',
                '-Xcompiler', '-fPIC',
                '-DKTRANSFORMERS_USE_CUDA',
            ]
        }
    )
elif MUSA_HOME is not None:
    SimplePorting(cuda_dir_path="csrc/ktransformers_ext/cuda", mapping_rule={
        # Common rules
        "at::cuda": "at::musa",
        "#include <ATen/cuda/CUDAContext.h>": "#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"",
        "#include <c10/cuda/CUDAGuard.h>": "#include \"torch_musa/csrc/core/MUSAGuard.h\"",
        "nv_bfloat16": "mt_bfloat16",
        }).run()
    ops_module = MUSAExtension('KTransformersOps', [
        'csrc/ktransformers_ext/cuda_musa/custom_gguf/dequant.mu',
        'csrc/ktransformers_ext/cuda_musa/binding.cpp',
        # TODO: Add Marlin support for MUSA.
        # 'csrc/ktransformers_ext/cuda_musa/gptq_marlin/gptq_marlin.mu'
    ],
    extra_compile_args={
            'cxx': ['force_mcc'],
            'mcc': [
                '-O3',
                '-DKTRANSFORMERS_USE_MUSA',
                '-DTHRUST_IGNORE_CUB_VERSION_CHECK',
            ]
        }
    )
elif torch.xpu.is_available(): #XPUExtension is not available now.
    ops_module = None
elif KTRANSFORMERS_BUILD_NPU:
    pass
else:
    raise ValueError("Unsupported backend: CUDA_HOME ROCM_HOME MUSA_HOME are not set and XPU is not available.")

if not torch.xpu.is_available() and not KTRANSFORMERS_BUILD_NPU:
    ext_modules = [
        CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")),
        ops_module,
        CUDAExtension(
            'vLLMMarlin', [
                'csrc/custom_marlin/binding.cpp',
                'csrc/custom_marlin/gptq_marlin/gptq_marlin.cu',
                'csrc/custom_marlin/gptq_marlin/gptq_marlin_repack.cu',
            ],
            extra_compile_args={
                'cxx': ['-O3'],
                'nvcc': ['-O3', '-Xcompiler', '-fPIC'],
            },
        )
    ]
    if with_balance:
        print("using balance_serve")
        ext_modules.append(
            CMakeExtension("balance_serve", os.fspath(Path("").resolve()/ "csrc"/ "balance_serve"))
        )

    setup(
        name=VersionInfo.PACKAGE_NAME,
        version=VersionInfo().get_package_version(),
        install_requires=triton_dep,
        cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
        ext_modules=ext_modules
    )


elif torch.xpu.is_available():
    ext_modules = [
        CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")),
    ]
    setup(
        name=VersionInfo.PACKAGE_NAME,
        version=VersionInfo().get_package_version(),
        install_requires=triton_dep,
        cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
        ext_modules=ext_modules
    )

elif KTRANSFORMERS_BUILD_NPU:
    ext_modules = [
        CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")),
    ] 
    if with_balance:
        print("using balance_serve")
        ext_modules.append(
            CMakeExtension("balance_serve", os.fspath(Path("").resolve()/ "csrc"/ "balance_serve"))
        )

    setup(
        name=VersionInfo.PACKAGE_NAME,
        version=VersionInfo().get_package_version(),
        cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
        ext_modules=ext_modules
    )


================================================
FILE: archive/third_party/llamafile/README.md
================================================
The code in this folder is copied from [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile). Special thanks to the Mozilla-Ocho team.


================================================
FILE: archive/third_party/llamafile/bench.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/bench.h
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#pragma once

#include <stdio.h>

#include "micros.h"

#define BENCH(x)                                                                       \
    do {                                                                               \
        x;                                                                             \
        __asm__ volatile("" ::: "memory");                                             \
        long long start = micros();                                                    \
        for (int i = 0; i < ITERATIONS; ++i) {                                         \
            __asm__ volatile("" ::: "memory");                                         \
            x;                                                                         \
            __asm__ volatile("" ::: "memory");                                         \
        }                                                                              \
        printf("%9lld us %s\n", (micros() - start + ITERATIONS - 1) / ITERATIONS, #x); \
    } while (0)


================================================
FILE: archive/third_party/llamafile/flags.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#include "flags.h"

bool FLAG_precise = false;


================================================
FILE: archive/third_party/llamafile/flags.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#pragma once

extern bool FLAG_precise;


================================================
FILE: archive/third_party/llamafile/iqk_mul_mat.inc
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat.inc
// Copyrigth 2024 Iwan Kawrakow - Apache 2.0 Licens
// with additions from
// https://github.com/ikawrakow/ik_llama.cpp/blob/main/ggml/src/iqk/iqk_mul_mat.cpp
// Copyrigth 2024-2025 Iwan Kawrakow - MIT Licens
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp fenc=utf-8 :vi
//
// Copyright 2024 Iwan Kawrakow
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//
// Copyright (C) 2024-2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//

#if defined(KTRANSFORMERS_USE_NPU) && KTRANSFORMERS_USE_NPU
        // use ARM version
        #include "iqk_mul_mat_arm.inc"
#else
        // use x86 version
        #include "iqk_mul_mat_x86.inc"
#endif

================================================
FILE: archive/third_party/llamafile/iqk_mul_mat_amd_avx2.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_avx2.cpp
// Copyrigth 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#include "iqk_mul_mat.inc"
#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/iqk_mul_mat_amd_zen4.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_zen4.cpp
// Copyrigth 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define iqk_mul_mat iqk_mul_mat_zen4
#define iqk_mul_mat_moe iqk_mul_mat_moe_zen4
#include "iqk_mul_mat.inc"
#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/iqk_mul_mat_arm.inc
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat.inc
// Copyrigth 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp fenc=utf-8 :vi
//
// Copyright 2024 Iwan Kawrakow
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cstring>
#include <type_traits>
#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64)

#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "sgemm.h"

// For i-quants, I had to explicitely specify which
// functions to inline / not inline (at least for some
// of the functions), else performance would be significantly
// lower. This is worrysome as things can change with,
// e.g., a different compiler version or running on a different
// CPU.
#ifdef _MSC_VER
#define IQK_NOINLINE __declspec(noinline)
#define IQK_ALWAYS_INLINE inline
#else
#define IQK_NOINLINE __attribute__((__noinline__))
#define IQK_ALWAYS_INLINE __attribute__((always_inline))
#endif

#define GGML_COMMON_IMPL_C
#include "llama.cpp/ggml-common.h"

// clang-format off

// This matrix - vector and matrix - matrix multiplication implementation
// for legacy quants, k-quants and i-quants makes prompt processing 150-200%
// (legacy and k-quants) or 250-400% (i-quants) faster.
// compared to mainline llama.cpp (and llamafile).
// It provides implementations for ARM_NEON (all quants) and AVX2
// (all quants except sub-4 bit i-quants).
//
// Main idea is that unpacking the quants and the block scales to
// be ready for dot products with the corresponding Q8_Y quants
// takes time (here 'Y' stands for K, 0, or 1, depending on quantization type).
// Hence, if we are performing a QX x Q8_Y matrix matrix
// multiplication (as needed for prompt processing), we can get
// a significant speedup by reusing the unpacked QX quants and scales
// for multiplication with several Q8_K columns. We also achieve fewer
// loads from memory, which is the main purpose of tiling in general
// purpose matrix multiplication packages.

#include <utility>
#include <array>

#endif

constexpr ggml_type GGML_TYPE_Q8_0_X4 = static_cast<ggml_type>(98);
constexpr ggml_type GGML_TYPE_Q8_1_X4 = static_cast<ggml_type>(99);


namespace {
#define GEMV_Q4K
#define GEMV_Q6K
#define GEMM_Q4K_Q6K

typedef struct {
    int32_t i1;
    int32_t i2;
} mmid_row_mapping;

struct DataInfo {
    float       * s;
    const char  * cy;
    size_t        bs;
    size_t        by;
    int           cur_y = 0;
    int           ne11;
    const mmid_row_mapping * row_mapping = nullptr;
    size_t        bs2 = 0;

    inline const char * src1_row(int iy) const {
        if (!row_mapping) return cy + (cur_y + iy)*by;
        int i11 = row_mapping[cur_y + iy].i1 % ne11;
        int i12 = row_mapping[cur_y + iy].i2;
        return cy + (i11 + i12*ne11)*by;
    }

    inline void store(int ix, int iy, float result) const {
        *(dst_row(iy) + ix) = result;
        //dst_row(iy)[ix] = result;
    }
    inline float* ptr(int ix, int iy) const {
        return dst_row(iy) + ix;
    }
    inline float * dst_row(int iy) const {
        if (!row_mapping) return s + (cur_y + iy)*bs;
        int i12 = row_mapping[cur_y + iy].i2;
        int i1  = row_mapping[cur_y + iy].i1;
        int i2  = i12;
        return s + i1*bs + i2*bs2;
    }
};

/*
moonll 
change param for set_mul_mat 
add func16
*/

typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x);
typedef void (*mul_mat_t_v2)(int m, int n, int k, const void *vx, size_t bx, const DataInfo& info);

struct MulMat {
    std::array<mul_mat_t, 8> funcs = {};
    mul_mat_t func16 = nullptr;
    mul_mat_t_v2 funcs_v2;
    //inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
    IQK_NOINLINE void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
        constexpr int k_x_step = 64; // This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile size are small)

        if (func16 && nrc_y >= 16) {
            int n_step = (nrc_y - info.cur_y)/16;
            for (int ix = 0; ix < nrc_x; ix += k_x_step) {
                auto this_info = info;
                this_info.s += ix;
                int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
                for (int iy = 0; iy < n_step; ++iy) {
                    func16(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
                    this_info.cur_y += 16;
                }
            }
            info.cur_y += 16 * n_step;
            if (info.cur_y == nrc_y) return;
        }

        int n_step = (nrc_y - info.cur_y)/funcs.size();
        if (n_step > 0) {
            for (int ix = 0; ix < nrc_x; ix += k_x_step) {
                auto this_info = info;
                this_info.s += ix;
                int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
                for (int iy = 0; iy < n_step; ++iy) {
                    funcs.back()(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
                    this_info.cur_y += funcs.size();
                }
            }
            info.cur_y += funcs.size() * n_step;
        }
        int n_left = nrc_y - info.cur_y;
        if (n_left > 0) {
            funcs[n_left-1](n, vx, bx, info, nrc_x);
        }
    }
#if defined __x86_64__ || defined(_M_X64)
    static IQK_NOINLINE bool set_mul_mat(int typeA, int typeB,int ne00, MulMat& mm, int Ny);
#else
    IQK_NOINLINE void mul_mat_NxM_v2(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
        funcs_v2(nrc_x, nrc_y, n, vx, bx, info);
        return;
    }
    static IQK_NOINLINE bool set_mul_mat(int typeA, int ne00, MulMat& m, int& row_size_q8, int Ny);
#endif
private:
    template <typename Dequantizer> static IQK_NOINLINE void set_functions(MulMat& m);
};

inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) {
    const uint16_t * scales = (const uint16_t *)scales8;
    const uint32_t a0 = scales[0] | (scales[1] << 16);
    const uint32_t a1 = scales[2] | (scales[3] << 16);
    const uint32_t a2 = scales[4] | (scales[5] << 16);
    aux32[3] = ((a2 >> 4) & 0x0f0f0f0f) | ((a1 >> 2) & 0x30303030);
    aux32[1] = ((a2 >> 0) & 0x0f0f0f0f) | ((a0 >> 2) & 0x30303030);
    aux32[2] = a1 & 0x3f3f3f3f;
    aux32[0] = a0 & 0x3f3f3f3f;
}

/*
moonll
decoding tables
*/
#ifdef __AVX2__
static const uint64_t iq1s_grid_us[2048] = {
    0x0000000000000000, 0x0000000000000002, 0x0000000000000101, 0x0000000000000200,
    0x0000000000000202, 0x0000000000010001, 0x0000000000010101, 0x0000000000020000,
    0x0000000000020002, 0x0000000000020200, 0x0000000000020202, 0x0000000001000101,
    0x0000000001010001, 0x0000000001010100, 0x0000000001010102, 0x0000000001020101,
    0x0000000002000000, 0x0000000002000002, 0x0000000002000200, 0x0000000002000202,
    0x0000000002010101, 0x0000000002020000, 0x0000000002020002, 0x0000000002020200,
    0x0000000002020202, 0x0000000100000100, 0x0000000100000101, 0x0000000100010001,
    0x0000000100010100, 0x0000000100010102, 0x0000000100010201, 0x0000000100010202,
    0x0000000100020101, 0x0000000101000001, 0x0000000101000102, 0x0000000101000201,
    0x0000000101010002, 0x0000000101010101, 0x0000000101010202, 0x0000000101020001,
    0x0000000101020100, 0x0000000101020102, 0x0000000101020200, 0x0000000102000101,
    0x0000000102010001, 0x0000000102010100, 0x0000000102010102, 0x0000000102020101,
    0x0000000200000000, 0x0000000200000002, 0x0000000200000200, 0x0000000200000202,
    0x0000000200010101, 0x0000000200020000, 0x0000000200020002, 0x0000000200020200,
    0x0000000200020202, 0x0000000201000101, 0x0000000201010001, 0x0000000201010201,
    0x0000000201020100, 0x0000000201020201, 0x0000000202000000, 0x0000000202000002,
    0x0000000202000200, 0x0000000202000202, 0x0000000202010001, 0x0000000202010101,
    0x0000000202010201, 0x0000000202020000, 0x0000000202020002, 0x0000000202020200,
    0x0000000202020202, 0x0000010000010001, 0x0000010000010100, 0x0000010000010102,
    0x0000010000020101, 0x0000010001000001, 0x0000010001000201, 0x0000010001010101,
    0x0000010001010202, 0x0000010001020100, 0x0000010001020101, 0x0000010002010001,
    0x0000010002010201, 0x0000010002020101, 0x0000010100000001, 0x0000010100000100,
    0x0000010100000101, 0x0000010100000102, 0x0000010100010101, 0x0000010100010200,
    0x0000010100010202, 0x0000010100020201, 0x0000010101000000, 0x0000010101000101,
    0x0000010101000202, 0x0000010101010000, 0x0000010101010001, 0x0000010101010100,
    0x0000010101010101, 0x0000010101010102, 0x0000010101010201, 0x0000010101020000,
    0x0000010101020002, 0x0000010101020101, 0x0000010101020200, 0x0000010101020202,
    0x0000010102000001, 0x0000010102010001, 0x0000010102010101, 0x0000010102010200,
    0x0000010102010202, 0x0000010102020001, 0x0000010102020100, 0x0000010102020101,
    0x0000010102020102, 0x0000010102020201, 0x0000010200010100, 0x0000010200010201,
    0x0000010201000001, 0x0000010201000100, 0x0000010201010000, 0x0000010201010002,
    0x0000010201010101, 0x0000010201010200, 0x0000010201020000, 0x0000010201020001,
    0x0000010201020102, 0x0000010201020201, 0x0000010202000101, 0x0000010202010001,
    0x0000010202010100, 0x0000010202010201, 0x0000020000000000, 0x0000020000000002,
    0x0000020000000200, 0x0000020000000202, 0x0000020000010101, 0x0000020000020000,
    0x0000020000020002, 0x0000020000020200, 0x0000020000020202, 0x0000020001000101,
    0x0000020001010001, 0x0000020001010102, 0x0000020001020101, 0x0000020002000000,
    0x0000020002000002, 0x0000020002000200, 0x0000020002000202, 0x0000020002010101,
    0x0000020002020000, 0x0000020002020002, 0x0000020002020200, 0x0000020002020202,
    0x0000020100000101, 0x0000020100010001, 0x0000020100010100, 0x0000020100010201,
    0x0000020100020100, 0x0000020100020101, 0x0000020101000001, 0x0000020101010000,
    0x0000020101010001, 0x0000020101010101, 0x0000020101020001, 0x0000020101020100,
    0x0000020101020201, 0x0000020102010001, 0x0000020102010100, 0x0000020102010102,
    0x0000020102010201, 0x0000020102020101, 0x0000020200000000, 0x0000020200000002,
    0x0000020200000200, 0x0000020200000202, 0x0000020200010101, 0x0000020200020000,
    0x0000020200020002, 0x0000020200020200, 0x0000020200020202, 0x0000020201000101,
    0x0000020201010001, 0x0000020201010201, 0x0000020201020001, 0x0000020201020101,
    0x0000020202000000, 0x0000020202000002, 0x0000020202000101, 0x0000020202000200,
    0x0000020202000202, 0x0000020202010101, 0x0000020202020000, 0x0000020202020002,
    0x0000020202020200, 0x0000020202020202, 0x0001000000010000, 0x0001000000010001,
    0x0001000000010100, 0x0001000000010201, 0x0001000000020100, 0x0001000000020101,
    0x0001000001000001, 0x0001000001000100, 0x0001000001010000, 0x0001000001010101,
    0x0001000001010200, 0x0001000001020001, 0x0001000001020100, 0x0001000001020101,
    0x0001000001020201, 0x0001000002010001, 0x0001000002010100, 0x0001000002010102,
    0x0001000002020001, 0x0001000002020101, 0x0001000100000001, 0x0001000100000100,
    0x0001000100000102, 0x0001000100000201, 0x0001000100010000, 0x0001000100010002,
    0x0001000100010101, 0x0001000100010200, 0x0001000100020001, 0x0001000100020100,
    0x0001000100020201, 0x0001000101000101, 0x0001000101000202, 0x0001000101010000,
    0x0001000101010001, 0x0001000101010002, 0x0001000101010100, 0x0001000101010101,
    0x0001000101010102, 0x0001000101010201, 0x0001000101020000, 0x0001000101020101,
    0x0001000102000100, 0x0001000102010002, 0x0001000102010101, 0x0001000102020001,
    0x0001000102020100, 0x0001000200010001, 0x0001000200010100, 0x0001000200010102,
    0x0001000200020101, 0x0001000201000000, 0x0001000201000102, 0x0001000201000201,
    0x0001000201010002, 0x0001000201010101, 0x0001000201010200, 0x0001000201010202,
    0x0001000201020100, 0x0001000201020102, 0x0001000202000101, 0x0001000202010001,
    0x0001000202010100, 0x0001000202010102, 0x0001000202020101, 0x0001010000000001,
    0x0001010000000102, 0x0001010000000201, 0x0001010000010100, 0x0001010000010101,
    0x0001010000010200, 0x0001010000010201, 0x0001010000020001, 0x0001010000020102,
    0x0001010001000001, 0x0001010001000101, 0x0001010001000102, 0x0001010001000200,
    0x0001010001000202, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101,
    0x0001010001010102, 0x0001010001010201, 0x0001010001020002, 0x0001010001020101,
    0x0001010001020200, 0x0001010002000100, 0x0001010002000201, 0x0001010002010000,
    0x0001010002010100, 0x0001010002010101, 0x0001010002010200, 0x0001010002010201,
    0x0001010002010202, 0x0001010002020001, 0x0001010002020100, 0x0001010002020101,
    0x0001010002020201, 0x0001010100000002, 0x0001010100000101, 0x0001010100000202,
    0x0001010100010001, 0x0001010100010100, 0x0001010100010101, 0x0001010100010102,
    0x0001010100010201, 0x0001010100020000, 0x0001010100020002, 0x0001010100020101,
    0x0001010100020200, 0x0001010100020202, 0x0001010101000001, 0x0001010101000100,
    0x0001010101000101, 0x0001010101000102, 0x0001010101010001, 0x0001010101010002,
    0x0001010101010100, 0x0001010101010101, 0x0001010101010102, 0x0001010101010201,
    0x0001010101010202, 0x0001010101020001, 0x0001010101020100, 0x0001010101020101,
    0x0001010101020102, 0x0001010101020201, 0x0001010102000000, 0x0001010102000002,
    0x0001010102000100, 0x0001010102000101, 0x0001010102000200, 0x0001010102000202,
    0x0001010102010000, 0x0001010102010001, 0x0001010102010100, 0x0001010102010101,
    0x0001010102010102, 0x0001010102010201, 0x0001010102010202, 0x0001010102020000,
    0x0001010102020002, 0x0001010102020101, 0x0001010200000001, 0x0001010200000100,
    0x0001010200000101, 0x0001010200000102, 0x0001010200010101, 0x0001010200010102,
    0x0001010200010200, 0x0001010200010202, 0x0001010200020001, 0x0001010200020102,
    0x0001010201000000, 0x0001010201000002, 0x0001010201000100, 0x0001010201000101,
    0x0001010201000200, 0x0001010201000202, 0x0001010201010001, 0x0001010201010101,
    0x0001010201010102, 0x0001010201010200, 0x0001010201010201, 0x0001010201020001,
    0x0001010201020100, 0x0001010201020101, 0x0001010201020200, 0x0001010201020201,
    0x0001010201020202, 0x0001010202000102, 0x0001010202000202, 0x0001010202010002,
    0x0001010202010101, 0x0001010202020100, 0x0001010202020201, 0x0001020000010001,
    0x0001020000010102, 0x0001020000020101, 0x0001020001000001, 0x0001020001000100,
    0x0001020001000102, 0x0001020001000201, 0x0001020001010000, 0x0001020001010101,
    0x0001020001010200, 0x0001020001010202, 0x0001020001020000, 0x0001020001020001,
    0x0001020001020100, 0x0001020001020102, 0x0001020001020201, 0x0001020002000101,
    0x0001020002010001, 0x0001020002010100, 0x0001020002020101, 0x0001020100010000,
    0x0001020100010002, 0x0001020100010101, 0x0001020100010202, 0x0001020100020001,
    0x0001020100020101, 0x0001020101000002, 0x0001020101000100, 0x0001020101000101,
    0x0001020101000200, 0x0001020101010001, 0x0001020101010100, 0x0001020101010101,
    0x0001020101010102, 0x0001020101010201, 0x0001020101010202, 0x0001020101020000,
    0x0001020101020101, 0x0001020101020202, 0x0001020102000201, 0x0001020102010001,
    0x0001020102010002, 0x0001020102010101, 0x0001020102010200, 0x0001020102020001,
    0x0001020102020102, 0x0001020102020201, 0x0001020200000201, 0x0001020200010102,
    0x0001020200020100, 0x0001020200020102, 0x0001020201000100, 0x0001020201000102,
    0x0001020201000201, 0x0001020201010000, 0x0001020201010002, 0x0001020201010101,
    0x0001020201010200, 0x0001020201020001, 0x0001020201020102, 0x0001020201020201,
    0x0001020202000101, 0x0001020202010001, 0x0001020202010102, 0x0001020202010202,
    0x0002000000000000, 0x0002000000000002, 0x0002000000000200, 0x0002000000000202,
    0x0002000000010101, 0x0002000000020000, 0x0002000000020002, 0x0002000000020101,
    0x0002000000020200, 0x0002000000020202, 0x0002000001000101, 0x0002000001010001,
    0x0002000001010201, 0x0002000001020001, 0x0002000001020101, 0x0002000002000000,
    0x0002000002000002, 0x0002000002000200, 0x0002000002000202, 0x0002000002010101,
    0x0002000002020000, 0x0002000002020002, 0x0002000002020101, 0x0002000002020200,
    0x0002000002020202, 0x0002000100000101, 0x0002000100010001, 0x0002000100010100,
    0x0002000100010201, 0x0002000100020101, 0x0002000101000002, 0x0002000101000100,
    0x0002000101000201, 0x0002000101010101, 0x0002000101010200, 0x0002000101010202,
    0x0002000101020001, 0x0002000101020100, 0x0002000101020101, 0x0002000101020102,
    0x0002000102000101, 0x0002000102010000, 0x0002000102010102, 0x0002000102010201,
    0x0002000102020101, 0x0002000200000001, 0x0002000200000200, 0x0002000200000202,
    0x0002000200010001, 0x0002000200010101, 0x0002000200020000, 0x0002000200020002,
    0x0002000200020200, 0x0002000200020202, 0x0002000201000101, 0x0002000201010001,
    0x0002000201010102, 0x0002000201010201, 0x0002000201020101, 0x0002000202000001,
    0x0002000202000200, 0x0002000202000202, 0x0002000202010001, 0x0002000202010101,
    0x0002000202020000, 0x0002000202020002, 0x0002000202020200, 0x0002000202020202,
    0x0002010000000101, 0x0002010000010100, 0x0002010000010102, 0x0002010000010201,
    0x0002010000020101, 0x0002010001000100, 0x0002010001000101, 0x0002010001000102,
    0x0002010001000201, 0x0002010001010002, 0x0002010001010101, 0x0002010001010200,
    0x0002010001010202, 0x0002010001020102, 0x0002010002000101, 0x0002010002010001,
    0x0002010002010100, 0x0002010002010201, 0x0002010002020001, 0x0002010002020101,
    0x0002010100000201, 0x0002010100010101, 0x0002010100020001, 0x0002010100020201,
    0x0002010101000000, 0x0002010101000101, 0x0002010101000200, 0x0002010101010001,
    0x0002010101010100, 0x0002010101010101, 0x0002010101010201, 0x0002010101020002,
    0x0002010101020101, 0x0002010101020200, 0x0002010102000201, 0x0002010102010000,
    0x0002010102010100, 0x0002010102010101, 0x0002010102010200, 0x0002010102010202,
    0x0002010102020001, 0x0002010102020100, 0x0002010102020102, 0x0002010102020201,
    0x0002010200000101, 0x0002010200010000, 0x0002010200010002, 0x0002010200010201,
    0x0002010200020101, 0x0002010201000001, 0x0002010201000201, 0x0002010201010101,
    0x0002010201020000, 0x0002010201020001, 0x0002010201020201, 0x0002010202000100,
    0x0002010202000102, 0x0002010202010000, 0x0002010202010202, 0x0002020000000000,
    0x0002020000000002, 0x0002020000000200, 0x0002020000000202, 0x0002020000010101,
    0x0002020000020000, 0x0002020000020002, 0x0002020000020200, 0x0002020000020202,
    0x0002020001000101, 0x0002020001010001, 0x0002020001010100, 0x0002020001020101,
    0x0002020002000000, 0x0002020002000002, 0x0002020002000200, 0x0002020002000202,
    0x0002020002020000, 0x0002020002020002, 0x0002020002020200, 0x0002020002020202,
    0x0002020100000201, 0x0002020100010001, 0x0002020100010100, 0x0002020100010201,
    0x0002020100020101, 0x0002020101000102, 0x0002020101000201, 0x0002020101010002,
    0x0002020101010101, 0x0002020101020001, 0x0002020101020100, 0x0002020101020102,
    0x0002020101020201, 0x0002020102000101, 0x0002020102010000, 0x0002020102010102,
    0x0002020102010201, 0x0002020102020100, 0x0002020102020101, 0x0002020200000000,
    0x0002020200000002, 0x0002020200000200, 0x0002020200000202, 0x0002020200020000,
    0x0002020200020002, 0x0002020200020200, 0x0002020200020202, 0x0002020201000101,
    0x0002020201010001, 0x0002020201010102, 0x0002020201010201, 0x0002020201020101,
    0x0002020202000000, 0x0002020202000002, 0x0002020202000200, 0x0002020202000202,
    0x0002020202010101, 0x0002020202020000, 0x0002020202020002, 0x0002020202020200,
    0x0002020202020202, 0x0100000000000101, 0x0100000000010001, 0x0100000000010102,
    0x0100000000020101, 0x0100000001000201, 0x0100000001010002, 0x0100000001010101,
    0x0100000001010200, 0x0100000001010202, 0x0100000001020001, 0x0100000001020100,
    0x0100000001020102, 0x0100000002010100, 0x0100000002010201, 0x0100000002020001,
    0x0100000002020102, 0x0100000100000000, 0x0100000100000001, 0x0100000100000100,
    0x0100000100000102, 0x0100000100000201, 0x0100000100010002, 0x0100000100010101,
    0x0100000100010102, 0x0100000100010200, 0x0100000100010202, 0x0100000100020001,
    0x0100000100020102, 0x0100000100020201, 0x0100000101000101, 0x0100000101000200,
    0x0100000101000202, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101,
    0x0100000101010102, 0x0100000101010201, 0x0100000101010202, 0x0100000101020101,
    0x0100000101020200, 0x0100000101020202, 0x0100000102000001, 0x0100000102000100,
    0x0100000102000102, 0x0100000102010000, 0x0100000102010002, 0x0100000102010101,
    0x0100000102020000, 0x0100000102020001, 0x0100000102020002, 0x0100000200000101,
    0x0100000200010001, 0x0100000200010100, 0x0100000200010102, 0x0100000200020101,
    0x0100000201000001, 0x0100000201010002, 0x0100000201010101, 0x0100000201010202,
    0x0100000201020100, 0x0100000201020201, 0x0100000202000201, 0x0100000202010100,
    0x0100000202020101, 0x0100010000000001, 0x0100010000010101, 0x0100010000010201,
    0x0100010000020201, 0x0100010001000101, 0x0100010001000200, 0x0100010001000202,
    0x0100010001010001, 0x0100010001010100, 0x0100010001010101, 0x0100010001010102,
    0x0100010001020001, 0x0100010001020002, 0x0100010001020101, 0x0100010001020200,
    0x0100010001020202, 0x0100010002000001, 0x0100010002000102, 0x0100010002000201,
    0x0100010002010000, 0x0100010002010002, 0x0100010002010101, 0x0100010002020000,
    0x0100010002020001, 0x0100010002020201, 0x0100010100000001, 0x0100010100000002,
    0x0100010100000101, 0x0100010100000202, 0x0100010100010001, 0x0100010100010100,
    0x0100010100010101, 0x0100010100010102, 0x0100010100010201, 0x0100010100020000,
    0x0100010100020101, 0x0100010100020202, 0x0100010101000001, 0x0100010101000100,
    0x0100010101000101, 0x0100010101000102, 0x0100010101000201, 0x0100010101010000,
    0x0100010101010001, 0x0100010101010100, 0x0100010101010101, 0x0100010101010102,
    0x0100010101010200, 0x0100010101010201, 0x0100010101020001, 0x0100010101020100,
    0x0100010101020101, 0x0100010101020102, 0x0100010101020201, 0x0100010102000002,
    0x0100010102000100, 0x0100010102000101, 0x0100010102000200, 0x0100010102010001,
    0x0100010102010100, 0x0100010102010101, 0x0100010102010102, 0x0100010102010201,
    0x0100010102010202, 0x0100010102020101, 0x0100010102020200, 0x0100010102020202,
    0x0100010200000001, 0x0100010200000101, 0x0100010200000201, 0x0100010200010100,
    0x0100010200010101, 0x0100010200010200, 0x0100010200010202, 0x0100010200020001,
    0x0100010200020100, 0x0100010200020201, 0x0100010201000000, 0x0100010201000002,
    0x0100010201000101, 0x0100010201000200, 0x0100010201010000, 0x0100010201010001,
    0x0100010201010002, 0x0100010201010101, 0x0100010201010102, 0x0100010201010201,
    0x0100010201020002, 0x0100010201020101, 0x0100010201020200, 0x0100010202000001,
    0x0100010202000101, 0x0100010202000202, 0x0100010202010100, 0x0100010202010101,
    0x0100010202020001, 0x0100010202020100, 0x0100010202020102, 0x0100020000000101,
    0x0100020000010001, 0x0100020000010101, 0x0100020000010202, 0x0100020000020101,
    0x0100020001000002, 0x0100020001000201, 0x0100020001010000, 0x0100020001010101,
    0x0100020001010200, 0x0100020001020001, 0x0100020001020100, 0x0100020001020102,
    0x0100020001020201, 0x0100020002000101, 0x0100020002010001, 0x0100020002010100,
    0x0100020002010102, 0x0100020002010201, 0x0100020002020101, 0x0100020100000001,
    0x0100020100000101, 0x0100020100000102, 0x0100020100000202, 0x0100020100010000,
    0x0100020100010100, 0x0100020100010101, 0x0100020100010200, 0x0100020100020001,
    0x0100020100020100, 0x0100020100020102, 0x0100020101000000, 0x0100020101000101,
    0x0100020101000202, 0x0100020101010001, 0x0100020101010002, 0x0100020101010100,
    0x0100020101010101, 0x0100020101010102, 0x0100020101010201, 0x0100020101020000,
    0x0100020101020002, 0x0100020101020101, 0x0100020101020102, 0x0100020101020202,
    0x0100020102000102, 0x0100020102000201, 0x0100020102010002, 0x0100020102010101,
    0x0100020102010102, 0x0100020102010200, 0x0100020102020001, 0x0100020102020100,
    0x0100020102020102, 0x0100020102020201, 0x0100020200010102, 0x0100020201000100,
    0x0100020201000102, 0x0100020201000201, 0x0100020201010101, 0x0100020201010200,
    0x0100020201010202, 0x0100020201020100, 0x0100020201020201, 0x0100020202010100,
    0x0100020202020101, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101,
    0x0101000000000102, 0x0101000000000201, 0x0101000000010002, 0x0101000000010101,
    0x0101000000010202, 0x0101000000020001, 0x0101000000020100, 0x0101000000020201,
    0x0101000001000000, 0x0101000001000101, 0x0101000001000200, 0x0101000001010001,
    0x0101000001010100, 0x0101000001010101, 0x0101000001010102, 0x0101000001010201,
    0x0101000001020101, 0x0101000001020200, 0x0101000002000102, 0x0101000002000201,
    0x0101000002010101, 0x0101000002010200, 0x0101000002020000, 0x0101000002020001,
    0x0101000002020102, 0x0101000002020201, 0x0101000100000101, 0x0101000100000200,
    0x0101000100000201, 0x0101000100000202, 0x0101000100010001, 0x0101000100010100,
    0x0101000100010101, 0x0101000100010102, 0x0101000100010200, 0x0101000100010201,
    0x0101000100020000, 0x0101000100020101, 0x0101000100020102, 0x0101000100020200,
    0x0101000100020202, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101,
    0x0101000101000102, 0x0101000101000201, 0x0101000101010000, 0x0101000101010001,
    0x0101000101010002, 0x0101000101010100, 0x0101000101010101, 0x0101000101010102,
    0x0101000101010200, 0x0101000101010201, 0x0101000101010202, 0x0101000101020001,
    0x0101000101020100, 0x0101000101020101, 0x0101000101020102, 0x0101000101020201,
    0x0101000102000002, 0x0101000102000101, 0x0101000102010001, 0x0101000102010100,
    0x0101000102010101, 0x0101000102010102, 0x0101000102010201, 0x0101000102020000,
    0x0101000102020101, 0x0101000102020202, 0x0101000200000001, 0x0101000200000102,
    0x0101000200010002, 0x0101000200010101, 0x0101000200010202, 0x0101000200020001,
    0x0101000200020100, 0x0101000201000002, 0x0101000201000101, 0x0101000201000202,
    0x0101000201010001, 0x0101000201010100, 0x0101000201010101, 0x0101000201010102,
    0x0101000201010201, 0x0101000201020002, 0x0101000201020101, 0x0101000202000101,
    0x0101000202010000, 0x0101000202010002, 0x0101000202010101, 0x0101000202010201,
    0x0101000202010202, 0x0101000202020100, 0x0101010000000100, 0x0101010000000101,
    0x0101010000010001, 0x0101010000010100, 0x0101010000010101, 0x0101010000010102,
    0x0101010000010200, 0x0101010000010201, 0x0101010000020001, 0x0101010000020101,
    0x0101010000020200, 0x0101010000020202, 0x0101010001000001, 0x0101010001000100,
    0x0101010001000101, 0x0101010001000102, 0x0101010001000201, 0x0101010001000202,
    0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 0x0101010001010101,
    0x0101010001010102, 0x0101010001010200, 0x0101010001010201, 0x0101010001010202,
    0x0101010001020001, 0x0101010001020002, 0x0101010001020100, 0x0101010001020101,
    0x0101010001020102, 0x0101010001020201, 0x0101010002000000, 0x0101010002000200,
    0x0101010002000202, 0x0101010002010001, 0x0101010002010100, 0x0101010002010101,
    0x0101010002010102, 0x0101010002010201, 0x0101010002020001, 0x0101010002020100,
    0x0101010002020101, 0x0101010002020202, 0x0101010100000001, 0x0101010100000002,
    0x0101010100000100, 0x0101010100000101, 0x0101010100000102, 0x0101010100000201,
    0x0101010100010000, 0x0101010100010001, 0x0101010100010002, 0x0101010100010100,
    0x0101010100010101, 0x0101010100010102, 0x0101010100010201, 0x0101010100010202,
    0x0101010100020001, 0x0101010100020100, 0x0101010100020101, 0x0101010100020102,
    0x0101010100020201, 0x0101010101000000, 0x0101010101000001, 0x0101010101000002,
    0x0101010101000100, 0x0101010101000101, 0x0101010101000102, 0x0101010101000200,
    0x0101010101000201, 0x0101010101010000, 0x0101010101010001, 0x0101010101010002,
    0x0101010101010100, 0x0101010101010101, 0x0101010101010102, 0x0101010101010200,
    0x0101010101010201, 0x0101010101010202, 0x0101010101020000, 0x0101010101020001,
    0x0101010101020100, 0x0101010101020101, 0x0101010101020102, 0x0101010101020200,
    0x0101010101020201, 0x0101010101020202, 0x0101010102000001, 0x0101010102000100,
    0x0101010102000101, 0x0101010102000201, 0x0101010102000202, 0x0101010102010000,
    0x0101010102010001, 0x0101010102010100, 0x0101010102010101, 0x0101010102010102,
    0x0101010102010200, 0x0101010102010201, 0x0101010102020001, 0x0101010102020100,
    0x0101010102020101, 0x0101010102020102, 0x0101010102020201, 0x0101010200000000,
    0x0101010200000001, 0x0101010200000002, 0x0101010200000100, 0x0101010200000102,
    0x0101010200000200, 0x0101010200000201, 0x0101010200010001, 0x0101010200010100,
    0x0101010200010101, 0x0101010200010200, 0x0101010200010201, 0x0101010200020000,
    0x0101010200020001, 0x0101010200020002, 0x0101010200020100, 0x0101010200020101,
    0x0101010200020102, 0x0101010200020200, 0x0101010200020201, 0x0101010201000001,
    0x0101010201000101, 0x0101010201000102, 0x0101010201000200, 0x0101010201000201,
    0x0101010201000202, 0x0101010201010000, 0x0101010201010001, 0x0101010201010002,
    0x0101010201010100, 0x0101010201010101, 0x0101010201010102, 0x0101010201010200,
    0x0101010201010201, 0x0101010201010202, 0x0101010201020001, 0x0101010201020100,
    0x0101010201020101, 0x0101010201020201, 0x0101010202000002, 0x0101010202000101,
    0x0101010202000102, 0x0101010202000200, 0x0101010202000201, 0x0101010202000202,
    0x0101010202010001, 0x0101010202010101, 0x0101010202010202, 0x0101010202020002,
    0x0101010202020101, 0x0101010202020102, 0x0101010202020200, 0x0101010202020201,
    0x0101020000000100, 0x0101020000000101, 0x0101020000000102, 0x0101020000000201,
    0x0101020000010000, 0x0101020000010101, 0x0101020000010200, 0x0101020000020001,
    0x0101020000020202, 0x0101020001000101, 0x0101020001000200, 0x0101020001000202,
    0x0101020001010001, 0x0101020001010100, 0x0101020001010101, 0x0101020001010102,
    0x0101020001010200, 0x0101020001010201, 0x0101020001020000, 0x0101020001020002,
    0x0101020001020100, 0x0101020001020101, 0x0101020002000002, 0x0101020002000201,
    0x0101020002010000, 0x0101020002010002, 0x0101020002010101, 0x0101020002010200,
    0x0101020002020001, 0x0101020002020201, 0x0101020100000001, 0x0101020100000002,
    0x0101020100000101, 0x0101020100000202, 0x0101020100010001, 0x0101020100010100,
    0x0101020100010101, 0x0101020100010102, 0x0101020100010201, 0x0101020100020101,
    0x0101020101000001, 0x0101020101000100, 0x0101020101000101, 0x0101020101000102,
    0x0101020101000201, 0x0101020101010000, 0x0101020101010001, 0x0101020101010002,
    0x0101020101010100, 0x0101020101010101, 0x0101020101010102, 0x0101020101010200,
    0x0101020101010201, 0x0101020101010202, 0x0101020101020001, 0x0101020101020100,
    0x0101020101020101, 0x0101020101020102, 0x0101020101020201, 0x0101020102000001,
    0x0101020102000101, 0x0101020102000201, 0x0101020102010001, 0x0101020102010100,
    0x0101020102010101, 0x0101020102010102, 0x0101020102010200, 0x0101020102010201,
    0x0101020102020101, 0x0101020200000100, 0x0101020200000200, 0x0101020200010101,
    0x0101020200010202, 0x0101020200020000, 0x0101020200020101, 0x0101020200020102,
    0x0101020200020201, 0x0101020201000101, 0x0101020201000200, 0x0101020201000201,
    0x0101020201010001, 0x0101020201010101, 0x0101020201010102, 0x0101020201010200,
    0x0101020201010201, 0x0101020201020002, 0x0101020201020101, 0x0101020201020200,
    0x0101020201020202, 0x0101020202000001, 0x0101020202000202, 0x0101020202010002,
    0x0101020202010101, 0x0101020202010102, 0x0101020202010200, 0x0101020202010202,
    0x0101020202020001, 0x0102000000000101, 0x0102000000010100, 0x0102000000010102,
    0x0102000000010201, 0x0102000000020101, 0x0102000001000100, 0x0102000001010000,
    0x0102000001010101, 0x0102000001010102, 0x0102000001010200, 0x0102000001010202,
    0x0102000001020001, 0x0102000001020100, 0x0102000001020102, 0x0102000001020201,
    0x0102000002000001, 0x0102000002010102, 0x0102000002020101, 0x0102000100000001,
    0x0102000100000100, 0x0102000100000102, 0x0102000100000201, 0x0102000100010002,
    0x0102000100010101, 0x0102000100020001, 0x0102000100020002, 0x0102000100020102,
    0x0102000100020201, 0x0102000101000101, 0x0102000101000201, 0x0102000101010001,
    0x0102000101010101, 0x0102000101010102, 0x0102000101010201, 0x0102000101020101,
    0x0102000101020102, 0x0102000101020202, 0x0102000102000100, 0x0102000102000202,
    0x0102000102010002, 0x0102000102010101, 0x0102000102020001, 0x0102000102020102,
    0x0102000102020201, 0x0102000200010001, 0x0102000200010102, 0x0102000200010201,
    0x0102000201000000, 0x0102000201000001, 0x0102000201000102, 0x0102000201010101,
    0x0102000201010102, 0x0102000201010200, 0x0102000201020000, 0x0102000202000101,
    0x0102000202010001, 0x0102000202010102, 0x0102000202020101, 0x0102010000010001,
    0x0102010000010002, 0x0102010000010101, 0x0102010000010102, 0x0102010000010202,
    0x0102010000020001, 0x0102010000020102, 0x0102010000020201, 0x0102010001000000,
    0x0102010001000002, 0x0102010001000101, 0x0102010001000200, 0x0102010001000202,
    0x0102010001010001, 0x0102010001010100, 0x0102010001010101, 0x0102010001010102,
    0x0102010001010201, 0x0102010001010202, 0x0102010001020000, 0x0102010001020002,
    0x0102010001020101, 0x0102010002000100, 0x0102010002000101, 0x0102010002000201,
    0x0102010002010000, 0x0102010002010002, 0x0102010002010100, 0x0102010002010101,
    0x0102010002010102, 0x0102010002010200, 0x0102010002010202, 0x0102010002020001,
    0x0102010002020100, 0x0102010002020201, 0x0102010100000101, 0x0102010100000200,
    0x0102010100000202, 0x0102010100010001, 0x0102010100010101, 0x0102010100010102,
    0x0102010100010201, 0x0102010101000100, 0x0102010101000101, 0x0102010101000102,
    0x0102010101000201, 0x0102010101010000, 0x0102010101010001, 0x0102010101010100,
    0x0102010101010101, 0x0102010101010102, 0x0102010101010201, 0x0102010101020001,
    0x0102010101020100, 0x0102010101020101, 0x0102010101020102, 0x0102010101020201,
    0x0102010102000102, 0x0102010102000201, 0x0102010102000202, 0x0102010102010001,
    0x0102010102010101, 0x0102010102010102, 0x0102010102010201, 0x0102010102010202,
    0x0102010102020002, 0x0102010102020101, 0x0102010102020102, 0x0102010102020200,
    0x0102010200000002, 0x0102010200000201, 0x0102010200010101, 0x0102010200020000,
    0x0102010200020102, 0x0102010200020200, 0x0102010200020201, 0x0102010201000000,
    0x0102010201000101, 0x0102010201000200, 0x0102010201000202, 0x0102010201010001,
    0x0102010201010100, 0x0102010201010101, 0x0102010201010102, 0x0102010201010200,
    0x0102010201010202, 0x0102010201020000, 0x0102010201020101, 0x0102010201020200,
    0x0102010202000000, 0x0102010202000002, 0x0102010202000101, 0x0102010202000202,
    0x0102010202010100, 0x0102010202010102, 0x0102010202010200, 0x0102010202010201,
    0x0102010202020000, 0x0102010202020100, 0x0102010202020102, 0x0102010202020202,
    0x0102020000010102, 0x0102020000010201, 0x0102020000020101, 0x0102020001000001,
    0x0102020001010002, 0x0102020001010101, 0x0102020001010202, 0x0102020001020001,
    0x0102020001020201, 0x0102020002000101, 0x0102020002010001, 0x0102020002010200,
    0x0102020002020102, 0x0102020100000001, 0x0102020100000100, 0x0102020100010000,
    0x0102020100010101, 0x0102020100020001, 0x0102020100020100, 0x0102020100020102,
    0x0102020100020201, 0x0102020101000000, 0x0102020101000001, 0x0102020101000101,
    0x0102020101000102, 0x0102020101000200, 0x0102020101010001, 0x0102020101010100,
    0x0102020101010101, 0x0102020101010102, 0x0102020101010201, 0x0102020101020000,
    0x0102020101020101, 0x0102020101020202, 0x0102020102000002, 0x0102020102000100,
    0x0102020102000202, 0x0102020102010101, 0x0102020102020001, 0x0102020102020100,
    0x0102020102020101, 0x0102020102020201, 0x0102020200010001, 0x0102020200010102,
    0x0102020200010200, 0x0102020201000001, 0x0102020201000100, 0x0102020201000201,
    0x0102020201010000, 0x0102020201010101, 0x0102020201010200, 0x0102020201010202,
    0x0102020201020100, 0x0102020201020101, 0x0102020201020201, 0x0102020202000102,
    0x0102020202010100, 0x0102020202010200, 0x0102020202010202, 0x0102020202020102,
    0x0200000000000000, 0x0200000000000002, 0x0200000000000200, 0x0200000000000202,
    0x0200000000020000, 0x0200000000020002, 0x0200000000020200, 0x0200000000020202,
    0x0200000001000101, 0x0200000001010000, 0x0200000001010001, 0x0200000001010100,
    0x0200000001010102, 0x0200000001010201, 0x0200000001020101, 0x0200000002000000,
    0x0200000002000002, 0x0200000002000200, 0x0200000002000202, 0x0200000002010101,
    0x0200000002020000, 0x0200000002020002, 0x0200000002020200, 0x0200000002020202,
    0x0200000100000101, 0x0200000100010001, 0x0200000100010100, 0x0200000100010102,
    0x0200000100010201, 0x0200000100020101, 0x0200000101000001, 0x0200000101000100,
    0x0200000101000201, 0x0200000101010000, 0x0200000101010002, 0x0200000101010101,
    0x0200000101010102, 0x0200000101010200, 0x0200000101010201, 0x0200000101020100,
    0x0200000101020102, 0x0200000101020201, 0x0200000102000101, 0x0200000102000201,
    0x0200000102010100, 0x0200000102010102, 0x0200000102010201, 0x0200000102020101,
    0x0200000200000000, 0x0200000200000002, 0x0200000200000200, 0x0200000200000202,
    0x0200000200010101, 0x0200000200020000, 0x0200000200020002, 0x0200000200020200,
    0x0200000200020202, 0x0200000201010001, 0x0200000201010100, 0x0200000201010201,
    0x0200000201020101, 0x0200000202000000, 0x0200000202000002, 0x0200000202000200,
    0x0200000202000202, 0x0200000202010101, 0x0200000202020000, 0x0200000202020002,
    0x0200000202020200, 0x0200000202020202, 0x0200010000010100, 0x0200010000010201,
    0x0200010001000001, 0x0200010001000100, 0x0200010001010001, 0x0200010001010101,
    0x0200010001010202, 0x0200010001020001, 0x0200010001020100, 0x0200010001020201,
    0x0200010002010100, 0x0200010002010201, 0x0200010100000001, 0x0200010100000201,
    0x0200010100010002, 0x0200010100010101, 0x0200010100010202, 0x0200010100020102,
    0x0200010100020201, 0x0200010101000000, 0x0200010101000001, 0x0200010101000101,
    0x0200010101000200, 0x0200010101010001, 0x0200010101010100, 0x0200010101010101,
    0x0200010101010102, 0x0200010101010201, 0x0200010101010202, 0x0200010101020101,
    0x0200010101020102, 0x0200010101020200, 0x0200010101020202, 0x0200010102000001,
    0x0200010102000100, 0x0200010102000102, 0x0200010102000201, 0x0200010102010000,
    0x0200010102010002, 0x0200010102010101, 0x0200010102010200, 0x0200010102020102,
    0x0200010200010001, 0x0200010200010102, 0x0200010200010201, 0x0200010200020101,
    0x0200010201000001, 0x0200010201000100, 0x0200010201000201, 0x0200010201000202,
    0x0200010201010000, 0x0200010201010101, 0x0200010201010201, 0x0200010201010202,
    0x0200010201020001, 0x0200010201020102, 0x0200010201020202, 0x0200010202000101,
    0x0200010202010001, 0x0200010202010202, 0x0200010202020100, 0x0200020000000000,
    0x0200020000000002, 0x0200020000000200, 0x0200020000000202, 0x0200020000010101,
    0x0200020000020000, 0x0200020000020002, 0x0200020000020200, 0x0200020000020202,
    0x0200020001000001, 0x0200020001000101, 0x0200020001010001, 0x0200020001010100,
    0x0200020001010201, 0x0200020001020101, 0x0200020001020201, 0x0200020002000000,
    0x0200020002000002, 0x0200020002000200, 0x0200020002000202, 0x0200020002010101,
    0x0200020002020000, 0x0200020002020002, 0x0200020002020200, 0x0200020002020202,
    0x0200020100000101, 0x0200020100000102, 0x0200020100010001, 0x0200020100010100,
    0x0200020100010102, 0x0200020100020101, 0x0200020101000001, 0x0200020101000100,
    0x0200020101000102, 0x0200020101000201, 0x0200020101010000, 0x0200020101010002,
    0x0200020101010101, 0x0200020101010202, 0x0200020101020001, 0x0200020101020100,
    0x0200020102000101, 0x0200020102010102, 0x0200020102010201, 0x0200020102020101,
    0x0200020200000000, 0x0200020200000002, 0x0200020200000200, 0x0200020200000202,
    0x0200020200010101, 0x0200020200020000, 0x0200020200020002, 0x0200020200020200,
    0x0200020200020202, 0x0200020201000101, 0x0200020201010001, 0x0200020201010100,
    0x0200020201010102, 0x0200020202000000, 0x0200020202000002, 0x0200020202000200,
    0x0200020202000202, 0x0200020202010101, 0x0200020202020000, 0x0200020202020002,
    0x0200020202020200, 0x0200020202020202, 0x0201000000000101, 0x0201000000010001,
    0x0201000000010102, 0x0201000000010200, 0x0201000000010201, 0x0201000000020101,
    0x0201000001000001, 0x0201000001000102, 0x0201000001000201, 0x0201000001010101,
    0x0201000001010200, 0x0201000001010202, 0x0201000001020201, 0x0201000001020202,
    0x0201000002000101, 0x0201000002010001, 0x0201000002010100, 0x0201000002010102,
    0x0201000002010201, 0x0201000002020101, 0x0201000100000001, 0x0201000100000100,
    0x0201000100000102, 0x0201000100000201, 0x0201000100010000, 0x0201000100010101,
    0x0201000100010200, 0x0201000100010202, 0x0201000100020001, 0x0201000100020100,
    0x0201000100020102, 0x0201000100020201, 0x0201000101000000, 0x0201000101000101,
    0x0201000101010000, 0x0201000101010001, 0x0201000101010100, 0x0201000101010101,
    0x0201000101010102, 0x0201000101010201, 0x0201000101020002, 0x0201000101020101,
    0x0201000102000100, 0x0201000102000102, 0x0201000102010002, 0x0201000102010101,
    0x0201000102010200, 0x0201000102020001, 0x0201000102020100, 0x0201000102020102,
    0x0201000102020201, 0x0201000200000101, 0x0201000200010001, 0x0201000200010100,
    0x0201000200010201, 0x0201000200020101, 0x0201000201000100, 0x0201000201000102,
    0x0201000201000201, 0x0201000201010000, 0x0201000201010002, 0x0201000201010101,
    0x0201000201010200, 0x0201000201020102, 0x0201000201020201, 0x0201000202000101,
    0x0201000202010100, 0x0201000202010102, 0x0201000202020201, 0x0201010000000001,
    0x0201010000000100, 0x0201010000000102, 0x0201010000010000, 0x0201010000010101,
    0x0201010000010200, 0x0201010000020102, 0x0201010001000000, 0x0201010001000202,
    0x0201010001010001, 0x0201010001010100, 0x0201010001010101, 0x0201010001010102,
    0x0201010001010200, 0x0201010001010201, 0x0201010001020000, 0x0201010001020001,
    0x0201010001020002, 0x0201010001020101, 0x0201010002000100, 0x0201010002000102,
    0x0201010002010002, 0x0201010002010100, 0x0201010002010101, 0x0201010002010200,
    0x0201010002020001, 0x0201010002020201, 0x0201010100000000, 0x0201010100000101,
    0x0201010100000200, 0x0201010100000202, 0x0201010100010000, 0x0201010100010001,
    0x0201010100010100, 0x0201010100010101, 0x0201010100010102, 0x0201010100010201,
    0x0201010100020001, 0x0201010100020101, 0x0201010100020201, 0x0201010100020202,
    0x0201010101000001, 0x0201010101000100, 0x0201010101000101, 0x0201010101000102,
    0x0201010101000201, 0x0201010101010000, 0x0201010101010001, 0x0201010101010002,
    0x0201010101010100, 0x0201010101010101, 0x0201010101010102, 0x0201010101010200,
    0x0201010101010201, 0x0201010101010202, 0x0201010101020001, 0x0201010101020100,
    0x0201010101020101, 0x0201010101020102, 0x0201010101020201, 0x0201010102000001,
    0x0201010102000101, 0x0201010102000200, 0x0201010102010001, 0x0201010102010002,
    0x0201010102010100, 0x0201010102010101, 0x0201010102010102, 0x0201010102010201,
    0x0201010102010202, 0x0201010102020000, 0x0201010102020002, 0x0201010102020101,
    0x0201010102020200, 0x0201010102020202, 0x0201010200000001, 0x0201010200000100,
    0x0201010200010000, 0x0201010200010101, 0x0201010200010201, 0x0201010200020000,
    0x0201010200020102, 0x0201010200020201, 0x0201010201000101, 0x0201010201000200,
    0x0201010201000201, 0x0201010201010001, 0x0201010201010002, 0x0201010201010101,
    0x0201010201010102, 0x0201010201010201, 0x0201010201020101, 0x0201010201020200,
    0x0201010202000002, 0x0201010202000100, 0x0201010202000201, 0x0201010202000202,
    0x0201010202010002, 0x0201010202010100, 0x0201010202010101, 0x0201010202020100,
    0x0201010202020102, 0x0201010202020201, 0x0201020000000101, 0x0201020000010102,
    0x0201020000010201, 0x0201020000020101, 0x0201020001000001, 0x0201020001000102,
    0x0201020001010000, 0x0201020001010002, 0x0201020001010101, 0x0201020001010102,
    0x0201020001010202, 0x0201020001020100, 0x0201020001020101, 0x0201020002000101,
    0x0201020002010001, 0x0201020002010102, 0x0201020002010201, 0x0201020002020101,
    0x0201020100000100, 0x0201020100000102, 0x0201020100000201, 0x0201020100010000,
    0x0201020100010002, 0x0201020100010101, 0x0201020100010200, 0x0201020100010202,
    0x0201020100020000, 0x0201020100020001, 0x0201020100020100, 0x0201020100020102,
    0x0201020101000000, 0x0201020101000002, 0x0201020101000101, 0x0201020101000200,
    0x0201020101000202, 0x0201020101010001, 0x0201020101010100, 0x0201020101010101,
    0x0201020101010102, 0x0201020101010201, 0x0201020101020002, 0x0201020101020101,
    0x0201020101020102, 0x0201020101020202, 0x0201020102000001, 0x0201020102000100,
    0x0201020102010000, 0x0201020102010002, 0x0201020102010101, 0x0201020102010202,
    0x0201020102020001, 0x0201020102020102, 0x0201020200000101, 0x0201020200010101,
    0x0201020200020101, 0x0201020201000100, 0x0201020201000102, 0x0201020201000201,
    0x0201020201010000, 0x0201020201010101, 0x0201020201010200, 0x0201020201020001,
    0x0201020202000101, 0x0201020202010001, 0x0201020202010100, 0x0201020202010101,
    0x0201020202010102, 0x0202000000000000, 0x0202000000000002, 0x0202000000000200,
    0x0202000000000202, 0x0202000000010101, 0x0202000000020000, 0x0202000000020002,
    0x0202000000020200, 0x0202000000020202, 0x0202000001000101, 0x0202000001010001,
    0x0202000001010100, 0x0202000001010102, 0x0202000001010201, 0x0202000002000000,
    0x0202000002000002, 0x0202000002000200, 0x0202000002000202, 0x0202000002010101,
    0x0202000002020000, 0x0202000002020002, 0x0202000002020200, 0x0202000002020202,
    0x0202000100000101, 0x0202000100000201, 0x0202000100010001, 0x0202000100010100,
    0x0202000100010102, 0x0202000100010201, 0x0202000100010202, 0x0202000101000102,
    0x0202000101000201, 0x0202000101010001, 0x0202000101010101, 0x0202000101010200,
    0x0202000101010202, 0x0202000101020001, 0x0202000101020100, 0x0202000102000101,
    0x0202000102010000, 0x0202000102010002, 0x0202000102010102, 0x0202000102010201,
    0x0202000200000002, 0x0202000200000200, 0x0202000200000202, 0x0202000200010000,
    0x0202000200010201, 0x0202000200020002, 0x0202000200020200, 0x0202000200020202,
    0x0202000201000101, 0x0202000201010001, 0x0202000201010102, 0x0202000201010201,
    0x0202000201020101, 0x0202000202000000, 0x0202000202000002, 0x0202000202000200,
    0x0202000202000202, 0x0202000202010101, 0x0202000202020000, 0x0202000202020002,
    0x0202000202020200, 0x0202000202020202, 0x0202010000010201, 0x0202010000020101,
    0x0202010001000001, 0x0202010001000100, 0x0202010001010000, 0x0202010001010100,
    0x0202010001010101, 0x0202010001010200, 0x0202010001010202, 0x0202010001020001,
    0x0202010001020101, 0x0202010001020102, 0x0202010001020200, 0x0202010001020201,
    0x0202010002000101, 0x0202010100000102, 0x0202010100000201, 0x0202010100010000,
    0x0202010100010002, 0x0202010100010101, 0x0202010100010200, 0x0202010100020102,
    0x0202010100020201, 0x0202010101000002, 0x0202010101000101, 0x0202010101010001,
    0x0202010101010100, 0x0202010101010101, 0x0202010101010102, 0x0202010101010201,
    0x0202010101020101, 0x0202010101020202, 0x0202010102000001, 0x0202010102000100,
    0x0202010102000101, 0x0202010102000102, 0x0202010102000201, 0x0202010102010002,
    0x0202010102010101, 0x0202010102010200, 0x0202010200000101, 0x0202010200010001,
    0x0202010200010102, 0x0202010200010202, 0x0202010200020001, 0x0202010200020101,
    0x0202010201000100, 0x0202010201000102, 0x0202010201000202, 0x0202010201010002,
    0x0202010201010101, 0x0202010201010102, 0x0202010201010200, 0x0202010201020000,
    0x0202010201020002, 0x0202010202000102, 0x0202010202010000, 0x0202010202010101,
    0x0202010202010102, 0x0202010202010201, 0x0202010202020001, 0x0202010202020100,
    0x0202010202020102, 0x0202020000000000, 0x0202020000000002, 0x0202020000000200,
    0x0202020000000202, 0x0202020000020000, 0x0202020000020002, 0x0202020000020200,
    0x0202020000020202, 0x0202020001010001, 0x0202020001010100, 0x0202020001010102,
    0x0202020001010201, 0x0202020002000000, 0x0202020002000002, 0x0202020002000200,
    0x0202020002000202, 0x0202020002010101, 0x0202020002020000, 0x0202020002020002,
    0x0202020002020200, 0x0202020002020202, 0x0202020100000101, 0x0202020100010100,
    0x0202020100010201, 0x0202020100020001, 0x0202020100020101, 0x0202020101000001,
    0x0202020101010000, 0x0202020101010101, 0x0202020101010202, 0x0202020101020001,
    0x0202020101020102, 0x0202020101020201, 0x0202020102010000, 0x0202020102010102,
    0x0202020200000000, 0x0202020200000002, 0x0202020200000200, 0x0202020200000202,
    0x0202020200020000, 0x0202020200020002, 0x0202020200020200, 0x0202020200020202,
    0x0202020201010001, 0x0202020201010100, 0x0202020201010102, 0x0202020202000000,
    0x0202020202000002, 0x0202020202000200, 0x0202020202000202, 0x0202020202010101,
    0x0202020202020000, 0x0202020202020002, 0x0202020202020200, 0x0202020202020202,
};
#else
static const uint32_t iq1s_grid_us[2048] = {
    0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
    0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
    0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
    0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
    0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
    0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
    0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
    0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
    0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
    0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
    0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
    0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
    0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
    0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
    0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
    0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
    0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
    0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
    0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
    0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
    0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
    0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
    0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
    0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
    0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
    0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
    0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
    0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
    0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
    0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
    0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
    0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
    0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
    0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
    0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
    0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
    0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
    0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
    0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
    0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
    0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
    0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
    0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
    0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
    0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
    0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
    0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
    0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
    0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
    0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
    0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
    0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
    0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
    0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
    0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
    0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
    0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
    0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
    0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
    0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
    0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
    0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
    0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
    0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
    0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
    0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
    0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
    0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
    0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
    0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
    0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
    0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
    0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
    0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
    0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
    0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
    0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
    0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
    0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
    0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
    0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
    0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
    0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
    0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
    0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
    0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
    0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
    0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
    0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
    0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
    0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
    0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
    0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
    0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
    0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
    0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
    0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
    0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
    0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
    0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
    0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
    0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
    0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
    0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
    0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
    0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
    0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
    0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
    0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
    0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
    0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
    0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
    0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
    0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
    0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
    0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
    0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
    0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
    0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
    0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
    0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
    0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
    0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
    0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
    0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
    0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
    0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
    0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
    0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
    0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
    0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
    0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
    0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
    0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
    0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
    0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
    0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
    0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
    0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
    0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
    0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
    0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
    0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
    0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
    0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
    0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
    0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
    0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
    0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
    0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
    0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
    0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
    0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
    0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
    0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
    0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
    0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
    0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
    0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
    0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
    0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
    0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
    0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
    0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
    0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
    0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
    0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
    0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
    0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
    0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
    0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
    0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
    0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
    0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
    0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
    0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
    0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
    0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
    0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
    0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
    0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
    0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
    0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
    0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
    0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
    0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
    0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
    0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
    0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
    0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
    0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
    0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
    0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
    0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
    0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
    0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
    0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
    0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
    0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
    0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
    0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
    0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
    0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
    0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
    0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
    0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
    0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
    0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
    0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
    0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
    0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
    0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
    0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
    0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
    0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
    0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
    0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
    0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
    0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
    0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
    0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
    0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
    0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
    0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
    0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
    0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
    0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
    0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
    0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
    0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
    0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
    0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
    0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
    0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
    0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
    0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
    0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
    0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
    0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
    0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
    0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
    0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
    0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
    0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
    0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
    0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
    0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
    0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
    0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
    0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
    0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
    0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
    0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
    0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
    0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
    0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
};
#endif

#ifndef HAVE_FANCY_SIMD
const uint64_t keven_signs[128] = {
    0x0101010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0x010101010101ffff,
    0xff01010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0xff01010101ffffff,
    0xff010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0xff010101ff01ffff,
    0x01010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0x01010101ffffffff,
    0xff0101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0xff0101ff0101ffff,
    0x010101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0x010101ff01ffffff,
    0x010101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0x010101ffff01ffff,
    0xff0101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0xff0101ffffffffff,
    0xff01ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0xff01ff010101ffff,
    0x0101ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0x0101ff0101ffffff,
    0x0101ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0x0101ff01ff01ffff,
    0xff01ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0xff01ff01ffffffff,
    0x0101ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0x0101ffff0101ffff,
    0xff01ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0xff01ffff01ffffff,
    0xff01ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0xff01ffffff01ffff,
    0x0101ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0x0101ffffffffffff,
    0xffff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0xffff01010101ffff,
    0x01ff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0x01ff010101ffffff,
    0x01ff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0x01ff0101ff01ffff,
    0xffff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0xffff0101ffffffff,
    0x01ff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0x01ff01ff0101ffff,
    0xffff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0xffff01ff01ffffff,
    0xffff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0xffff01ffff01ffff,
    0x01ff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0x01ff01ffffffffff,
    0x01ffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0x01ffff010101ffff,
    0xffffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0xffffff0101ffffff,
    0xffffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0xffffff01ff01ffff,
    0x01ffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0x01ffff01ffffffff,
    0xffffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0xffffffff0101ffff,
    0x01ffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0x01ffffff01ffffff,
    0x01ffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0x01ffffffff01ffff,
    0xffffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0xffffffffffffffff,
};
#endif

}

/* moonll change mulmat
add typeB and strideB
}*/

bool iqk_mul_mat(long Nx, long Ny, long ne00,
    int typeA, const void * A, long strideA,
    int typeB, const void * B, long strideB,
    float * C, long stride_C, int ith, int nth) {

        MulMat mm;
#if defined __x86_64__ || defined(_M_X64)
        if (!MulMat::set_mul_mat(typeA, typeB, (int)ne00, mm, Ny)) {
            return false;
        }
#else
        int row_size_q8;
        if (!MulMat::set_mul_mat(typeA, (int)ne00, mm, row_size_q8, Ny)) {
            return false;
        }
#endif


        size_t row_size_qx = strideA*ggml_type_size(ggml_type(typeA));
        size_t row_size_qy = strideB*ggml_type_size(ggml_type(typeB));
      
        
        auto nrc_x = (Nx + nth - 1)/nth;
        auto first_x = ith*nrc_x;
        if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;

        DataInfo info{C + first_x, (const char *)B, (size_t)stride_C, row_size_qy, 0, 1, nullptr, 0};
#ifdef __ARM_NEON
#ifdef GEMM_Q4K_Q6K
        if (Ny >= 8 && (typeA == GGML_TYPE_Q4_K || typeA == GGML_TYPE_Q6_K)) {
            mm.mul_mat_NxM_v2(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
        } else
#endif
#endif
        {
            mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
        }

        return true;
}


bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const void * A, const void * B,
        float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) {
    const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping;
    assert(row_mapping != nullptr);

    MulMat mm;
    int row_size_q8;
    /* moonll

    if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
        return false;
    }*/
    int row_size_qx = ggml_row_size((ggml_type)typeA, ne00);
    int nrc_x = (Nx + nth - 1)/nth;
    int first_x = ith*nrc_x;
    if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;
    DataInfo info{C + first_x, (const char *)B, nb1/sizeof(float), (size_t)row_size_q8, 0, ne11, row_mapping, nb2/sizeof(float)};
    mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
    return true;
}

#if defined __x86_64__ || defined(_M_X64)

#if defined HAVE_FANCY_SIMD
    #undef HAVE_FANCY_SIMD
#endif
#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
    #define HAVE_FANCY_SIMD
#endif
//#define HAVE_FANCY_SIMD

namespace {

inline float hsum_float_4(__m128 x) {
    x = _mm_add_ps(x, _mm_movehl_ps(x, x));
    x = _mm_add_ss(x, _mm_movehdup_ps(x));
    return _mm_cvtss_f32(x);
}
inline float hsum_float_8(__m256 x) {
    return hsum_float_4(_mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1)));
}

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)


template <int nrc, typename block_q8 = block_q8_K> struct Q8 {

    constexpr static int nrc_y = nrc;

    Q8(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
    }

#ifdef HAVE_FANCY_SIMD
    inline __m512i load_quants64(int iy, int i, int j) const { return _mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); }
#endif
    inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); }
    inline __m256i load_bsums(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].bsums); }
    inline float scale(int iy, int i) const { return y[iy][i].d; }

    const block_q8 * y[nrc_y];
};

// Handles q4_K and q5_K scales/mins
struct Scales8K {
    template <typename Q8>
    inline __m256i process_mins_and_scales(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) {
        make_q4_scales(data, utmp);
        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
        const __m128i mins128 = _mm256_extracti128_si256(mins_and_scales, 1);
        accum_mins(mins128, q8, i, c, accd);
        const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
        return MM256_SET_M128I(sc128, sc128);
    }
#ifdef HAVE_FANCY_SIMD
    template <typename Q8>
    inline __m512i process_mins_and_scales_64(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) {
        auto scales = process_mins_and_scales(data, c, i, q8, accd);
        return _mm512_inserti32x8(_mm512_castsi256_si512(scales), scales, 1);
    }
#endif
    template <typename Q8>
    inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
        const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0]));
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i q8s = q8.load_bsums(iy, i);
            const __m256i prod = _mm256_madd_epi16(mins, q8s);
            accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
        }
    }
#ifdef HAVE_FANCY_SIMD
    const __m512i shuffles512[2] = {
        _mm512_set_epi64(0x0706070607060706, 0x0302030203020302, 0x0706070607060706, 0x0302030203020302,
                         0x0504050405040504, 0x0100010001000100, 0x0504050405040504, 0x0100010001000100),
        _mm512_set_epi64(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a,
                         0x0d0c0d0c0d0c0d0c, 0x0908090809080908, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
    };
#endif
    const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100),
                                 _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)};

    uint32_t utmp[4];
};

template <typename Q8>
inline void process_mins_16(const __m256i& all_scales, const Q8& q8, int i, float d, __m256 * accm) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        const __m256i prod  = _mm256_madd_epi16(all_scales, q8.load_bsums(iy, i));
        accm[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d * q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accm[iy]);
    }
}
inline void prepare_scales_16(const __m256i& all_scales, __m256i * scales) {
    const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
    const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
    scales[0] = MM256_SET_M128I(l_scales, l_scales);
    scales[1] = MM256_SET_M128I(h_scales, h_scales);
}

struct ScaleQ3 {
    inline __m128i make_scales(const uint16_t * s8) const {
        const uint16_t * scales16 = (const uint16_t *)s8;
        uint32_t aux0 = scales16[0] | (scales16[1] << 16);
        uint32_t aux1 = scales16[2] | (scales16[3] << 16);
        uint32_t aux2 = scales16[4] | (scales16[5] << 16);
        __m128i scales128 = _mm_set_epi32(
            ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030),
            ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030),
             (aux1       & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030),
             (aux0       & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030));
        return _mm_add_epi8(scales128, m32);
    }
    const __m128i m32 = _mm_set1_epi8(-32);
};

struct ScaleIQ4XS {
    inline __m128i make_scales(const uint32_t scales_l, const uint16_t scales_h) {
        uint32_t tmp32 = scales_h | (scales_h << 14);
        const __m128i sh = _mm_slli_epi16(_mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(tmp32), hshift), hmask), 4);
        const __m128i sl = _mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(scales_l), lshift), lmask);
        return _mm_add_epi16(_mm_or_si128(sh, _mm_cvtepi8_epi16(_mm_shuffle_epi8(sl, lshuffle))), m32);
    }
    const __m128i hshift = _mm_set_epi32(12, 8, 4, 0);
    const __m128i lshift = _mm_set_epi32(4, 0, 4, 0);
    const __m128i hmask  = _mm_set1_epi16(0x03);
    const __m128i lmask  = _mm_set1_epi8(0xf);
    const __m128i lshuffle = _mm_set_epi32(0x07030602, 0x05010400, 0x07030602, 0x05010400);
    const __m128i m32 = _mm_set1_epi16(-32);
};

struct Scales8KBase {
    template <typename Q8>
    inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
        const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0]));
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i q8s = q8.load_bsums(iy, i);
            const __m256i prod = _mm256_madd_epi16(mins, q8s);
            accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
        }
    }
    inline __m256i shuffle(__m128i mins) const {
        return MM256_SET_M128I(_mm_shuffle_epi8(mins, shuffles[1]), _mm_shuffle_epi8(mins, shuffles[0]));
    }
    const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100),
                                 _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)};
};

template <typename Block>
struct BaseDequantizer {
    BaseDequantizer(const void * vx, size_t bx) : vx(vx), bx(bx) {}
    inline void new_row(int ix) {
        x = (const Block *)((const char *)vx + bx*ix);
    }

    const void *  vx;
    size_t        bx;
    const Block * x;

    float d;
};

__m128i inline load_iq4nl_values_128() {
    static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
    return _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
}

__m256i inline load_iq4nl_values_256() {
    auto val128 = load_iq4nl_values_128();
    return MM256_SET_M128I(val128, val128);
}

#ifdef HAVE_FANCY_SIMD
//====================================== Zen4 ==================================================

struct BlockPermuter {
    const __m512i permute1 = _mm512_set_epi64(11, 10,  9,  8, 3, 2, 1, 0);
    const __m512i permute2 = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4);
};

struct Q4Bits {
    inline void prepare(const uint8_t * q4) {
        auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0);
        auto tmp1 = _mm512_and_si512(q4bits, ml);
        auto tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
        values[0] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2);
        values[1] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2);
        q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1);
        tmp1 = _mm512_and_si512(q4bits, ml);
        tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
        values[2] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2);
        values[3] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2);
    }
    inline void prepare64(const uint8_t * q4) {
        auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0);
        values[0] = _mm512_and_si512(q4bits, ml);
        values[1] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
        q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1);
        values[2] = _mm512_and_si512(q4bits, ml);
        values[3] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
    }
    __m512i values[4];
    const __m512i ml = _mm512_set1_epi8(0xf);
    BlockPermuter perm;
};

struct Q2Bits {
    inline void prepare(const uint8_t * q2) {

        auto q2bits = _mm512_loadu_si512((const __m512i*)q2);
        auto tmp = _mm512_srli_epi16(q2bits, 2);

        values[0] = _mm512_permutex2var_epi64(q2bits, perm.permute1, tmp);
        values[2] = _mm512_permutex2var_epi64(q2bits, perm.permute2, tmp);
        values[1] = _mm512_and_si512(_mm512_srli_epi16(values[0], 4), ml);
        values[3] = _mm512_and_si512(_mm512_srli_epi16(values[2], 4), ml);
        values[0] = _mm512_and_si512(values[0], ml);
        values[2] = _mm512_and_si512(values[2], ml);
    }
    __m512i values[4];
    const __m512i ml = _mm512_set1_epi8(0x03);
    BlockPermuter perm;
};

struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
        scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
    }

    Q4Bits bits;
    Scales8K s8k;
};

/*
moonll DequantizerIQ4XS
*/

__m512i inline load_iq4nl_values_512() {
    auto val256 = load_iq4nl_values_256();
    return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1);
}

struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
    DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_512()) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        prepare(x[i].qs);
        auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h);
        s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
        auto scales256 = MM256_SET_M128I(scales128, scales128);
        auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1);
        scales[0] = _mm512_shuffle_epi8(all_scales, shuffles[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, shuffles[1]);
        scales[2] = _mm512_shuffle_epi8(all_scales, shuffles[2]);
        scales[3] = _mm512_shuffle_epi8(all_scales, shuffles[3]);
    }
    inline void prepare(const uint8_t * q4) {
        bits.prepare64(q4);
        // We now have in bits.valuse[0]: 0...15, 32...47, 64...79, 96...111
        //                bits.valuse[1]: 16..31, 48...63, 80...95, 112..127
        //                etc.
        auto tmp = _mm512_permutex2var_epi64(bits.values[0], permute1, bits.values[1]);
        bits.values[1] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[0], permute2, bits.values[1]));
        bits.values[0] = _mm512_shuffle_epi8(values, tmp);
        tmp = _mm512_permutex2var_epi64(bits.values[2], permute1, bits.values[3]);
        bits.values[3] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[2], permute2, bits.values[3]));
        bits.values[2] = _mm512_shuffle_epi8(values, tmp);
    }

    Q4Bits bits;
    Scales8KBase s8k;
    ScaleIQ4XS siq4;
    const __m512i values;
    const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2,  9,  8, 1, 0);
    const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
    const __m512i shuffles[4] = {
        _mm512_inserti32x8(_mm512_set1_epi16(0x0100), _mm256_set1_epi16(0x0302), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0504), _mm256_set1_epi16(0x0706), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0908), _mm256_set1_epi16(0x0b0a), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0d0c), _mm256_set1_epi16(0x0f0e), 1),
    };
};

struct HighBit5 {
    inline void apply(const uint8_t * h, Q4Bits& bits) {
        auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
        auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(hbits, mh));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
    }
    const __m512i mh = _mm512_set1_epi8(0x10);
};

struct HighBit3 {
    inline void apply(const uint8_t * h, Q2Bits& bits) {
        auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
        auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(hbits, mh));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 4), mh));
    }
    const __m512i mh = _mm512_set1_epi8(0x04);
};

struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        hbits.apply(x[i].qh, bits);
        auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
        scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
    }

    Q4Bits bits;
    HighBit5 hbits;
    Scales8K s8k;
};

struct Scale16 {
    inline void make_scales(const __m128i& scales8, __m512i * scales) const {
        auto all_scales8 = MM256_SET_M128I(scales8, scales8);
        auto scales1 = _mm256_shuffle_epi8(all_scales8, shuffle1);
        auto scales2 = _mm256_shuffle_epi8(all_scales8, shuffle2);
        scales[0] = _mm512_cvtepi8_epi16(scales1);
        scales[1] = _mm512_cvtepi8_epi16(scales2);
    }
    template <typename Q8>
    inline void process_mins_and_scales(int i, float c, const __m128i& mins8, const __m128i& scales8,
        const Q8& q8, __m256 * accm, __m512i * scales) const {
        process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, c, accm);
        make_scales(scales8, scales);
    }
    const __m256i shuffle1 = _mm256_set_epi32(0x07070707, 0x03030303, 0x06060606, 0x02020202,
                                              0x05050505, 0x01010101, 0x04040404, 0x00000000);
    const __m256i shuffle2 = _mm256_set_epi32(0x0f0f0f0f, 0x0b0b0b0b, 0x0e0e0e0e, 0x0a0a0a0a,
                                              0x0d0d0d0d, 0x09090909, 0x0c0c0c0c, 0x08080808);
};

struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
        sc16.process_mins_and_scales(i, -GGML_FP16_TO_FP32(x[i].dmin), mins8, scales8, q8, accm, scales);
    }

    Q2Bits bits;
    Scale16 sc16;
    const __m128i m4 = _mm_set1_epi8(0xf);

};

struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        hbits.apply(x[i].hmask, bits);
        auto scales128 = sc3.make_scales((const uint16_t *)x[i].scales);
        sc16.process_mins_and_scales(i, -4.f*d, scales128, scales128, q8, accm, scales);
    }

    Q2Bits bits;
    HighBit3 hbits;
    ScaleQ3 sc3;
    Scale16 sc16;
    const __m128i m4  = _mm_set1_epi8(0xf);
    const __m128i m32 = _mm_set1_epi8(-32);
};

struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare64(x[i].ql);
        add_high_bits(x[i].qh, bits);
        auto scales128 = _mm_loadu_si128((const __m128i *)x[i].scales);
        sc16.process_mins_and_scales(i, -32.f*d, scales128, scales128, q8, accm, scales);
    }

    inline void add_high_bits(const uint8_t * qh, Q4Bits& bits) const {
        auto hbits = _mm512_loadu_si512((const __m512i *)qh);
        auto tmp1 = _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh);
        auto tmp2 = _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2));
        tmp1 = _mm512_and_si512(hbits, mh);
        tmp2 = _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh);
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2));
    }

    Q4Bits bits;
    HighBit3 hbits;
    Scale16 sc16;

    const __m512i mh = _mm512_set1_epi8(0x30);

};

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accm[nrc_y];
    __m512  accd[nrc_y];
    __m512i scales[2];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
        for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accm, scales);

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants(iy, i, 0));
                const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants(iy, i, 1));
                const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants(iy, i, 2));
                const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants(iy, i, 3));
                auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
                sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
                accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
            info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
        }

    }
}
template <typename Q8>
inline void compute_block(int iy, int i, float d, const Q8& q8, const __m512i * values, const __m512i * scales, __m512 * accd) {
    const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[0], q8.load_quants64(iy, i, 0));
    const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[1], q8.load_quants64(iy, i, 1));
    const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[2], q8.load_quants64(iy, i, 2));
    const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[3], q8.load_quants64(iy, i, 3));
    auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
    sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
    accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
}

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accm[nrc_y];
    __m512  accd[nrc_y];
    __m512i scales[2];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
        for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accm, scales);

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants64(iy, i, 0));
                const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants64(iy, i, 1));
                const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants64(iy, i, 2));
                const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants64(iy, i, 3));
                auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
                sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
                accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
            info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
        }

    }
}

template <typename Dequantizer, int nrc_y>
static void mul_mat_iqX_k_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accm[nrc_y];
    __m512  accd[nrc_y];
    __m512i scales[4];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
        for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accm, scales);

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m512i p1 = _mm512_maddubs_epi16(deq.bits.values[0], q8.load_quants64(iy, i, 0));
                const __m512i p2 = _mm512_maddubs_epi16(deq.bits.values[1], q8.load_quants64(iy, i, 1));
                const __m512i p3 = _mm512_maddubs_epi16(deq.bits.values[2], q8.load_quants64(iy, i, 2));
                const __m512i p4 = _mm512_maddubs_epi16(deq.bits.values[3], q8.load_quants64(iy, i, 3));
                auto sumi = _mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_setzero_si512(),
                                    p1, scales[0]), p2, scales[1]), p3, scales[2]), p4, scales[3]);
                accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
            info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
        }

    }
}

template <typename Dequantizer>
static void mul_mat_qX_K_q8_K_AVX512_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    constexpr int k_nx = 2;

    Q8<1> q8(info);

    Dequantizer deq1(vx, bx);
    Dequantizer deq2(vx, bx);

    Dequantizer * deq[k_nx];
    deq[0] = &deq1;
    deq[1] = &deq2;

    __m512i scales[2*k_nx];

    for (int ix = 0; ix < nrc_x; ++ix) {

        auto accd = _mm512_setzero_ps();
        auto accm = _mm256_setzero_ps();

        for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_row(ix);

        for (int i = 0; i < nb/k_nx; ++i) {

            for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_block(k_nx*i+kx, q8, &accm, scales+2*kx);

            for (int kx = 0; kx < k_nx; ++kx) {
                compute_block(0, k_nx*i+kx, deq[kx]->d, q8, deq[kx]->bits.values, scales+2*kx, &accd);
            }

        }
        if (2*(nb/2) < nb) {
            int i0 = 2*(nb/2);
            deq[0]->new_block(i0, q8, &accm, scales);
            compute_block(0, i0, deq[0]->d, q8, deq[0]->bits.values, scales, &accd);
        }

        auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd), _mm512_extractf32x8_ps(accd, 1));
        info.store(ix, 0, hsum_float_8(_mm256_add_ps(accm, sum256)));
    }
}

#else
// ===================================== Vanilla AVX2 =====================================

struct Q4Bits {
    inline void prepare(const uint8_t * q4, int j) {
        auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0);
        values[0] = _mm256_and_si256(q4bits, ml);
        values[1] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
        q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1);
        values[2] = _mm256_and_si256(q4bits, ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
    }
    inline void prepare64(const uint8_t * q4, int j) {
        auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0);
        values[0] = _mm256_and_si256(q4bits, ml);
        values[2] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
        q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1);
        values[1] = _mm256_and_si256(q4bits, ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
    }
    inline void prepare16(const uint8_t * q4, int j) {
        values[0] = dequant16(q4 + 64*j +  0);
        values[1] = dequant16(q4 + 64*j + 16);
        values[2] = dequant16(q4 + 64*j + 32);
        values[3] = dequant16(q4 + 64*j + 48);
    }
    inline __m256i dequant16(const uint8_t * qs) const {
        const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
        const __m256i aux256 = MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128);
        return _mm256_and_si256(ml, aux256);
    };
    __m256i values[4];
    const __m256i ml = _mm256_set1_epi8(0xf);
};

struct Q2Bits {
    inline void prepare(const uint8_t * q2, int j) {
        auto q2bits = _mm256_loadu_si256((const __m256i *)q2 + j);
        values[0] = _mm256_and_si256(q2bits, ml);
        values[1] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), ml);
        values[2] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), ml);
    }
    __m256i values[4];
    const __m256i ml = _mm256_set1_epi8(0x03);
};

struct HighBit5 {
    inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); }
    inline void apply(Q4Bits& bits, bool do_shift) {
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 3), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh));
        if (do_shift) {
            hbits = _mm256_srli_epi16(hbits, 4);
        }
    }
    const __m256i mh = _mm256_set1_epi8(0x10);
    __m256i hbits;
};

struct HighBit3 {
    inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); }
    inline void apply(Q2Bits& bits, bool do_shift) {
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 1), mh));
        if (do_shift) {
            hbits = _mm256_srli_epi16(hbits, 4);
        }
    }
    const __m256i mh = _mm256_set1_epi8(0x04);
    __m256i hbits;
};


/*
template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
    if (j == 0) {
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
            sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4));
        }
    } else {
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
        }
    }
}*/

struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
    }

    Q4Bits bits;
    Scales8K s8k;
};

struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
    DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_values()) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h);
        s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
        return MM256_SET_M128I(scales128, scales128);
    }
    inline void prepare(int i, int j) {
        bits.prepare16(x[i].qs, j);
        bits.values[0] = _mm256_shuffle_epi8(values, bits.values[0]);
        bits.values[1] = _mm256_shuffle_epi8(values, bits.values[1]);
        bits.values[2] = _mm256_shuffle_epi8(values, bits.values[2]);
        bits.values[3] = _mm256_shuffle_epi8(values, bits.values[3]);
    }

    static __m256i load_values() {
        static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
        auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
        return MM256_SET_M128I(val128, val128);
    }

    Q4Bits bits;
    Scales8K s8k;
    ScaleIQ4XS siq4;
    const __m256i values;
};

struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        hbits.load(x[i].qh);
        return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
        hbits.apply(bits, j == 0);
    }

    Q4Bits  bits;
    HighBit5 hbits;
    Scales8K s8k;
};

template <typename Q8>
inline void process_mins_and_scales_16(const __m128i& scales128, const Q8& q8, int i, float d,
    __m256 * accm, __m256i * scales) {
    const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
    process_mins_16(all_scales, q8, i, d, accm);
    prepare_scales_16(all_scales, scales);
}

struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}

    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        hbits.load(x[i].hmask);
        process_mins_and_scales_16(sc3.make_scales((const uint16_t *)x[i].scales), q8, i, -4.f*d, accm, scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
        hbits.apply(bits, j == 0);
    }

    Q2Bits  bits;
    HighBit3 hbits;
    ScaleQ3 sc3;

    const __m128i m32 = _mm_set1_epi8(-32);
};

struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}

    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
        process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, -GGML_FP16_TO_FP32(x[i].dmin), accm);
        prepare_scales_16(_mm256_cvtepi8_epi16(scales8), scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
    }

    Q2Bits  bits;

    const __m128i m4 = _mm_set1_epi8(0xf);
};

struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        process_mins_and_scales_16(_mm_loadu_si128((const __m128i *)x[i].scales), q8, i, -32.f*d, accm, scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare64(x[i].ql, j);
        auto hbits = _mm256_loadu_si256((const __m256i *)x[i].qh + j);
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 2), mh));
    }

    Q4Bits  bits;
    const __m256i mh = _mm256_set1_epi8(0x30);
};

inline __m256i get_scale_shuffle_8(int i);

inline void set_scales_8(const __m256i& all_scales, int j, __m256i* scales);

inline __m256i get_scale_shuffle_16(int i);

inline void set_scales_16(const __m256i& all_scales, __m256i* scales);


template <typename Dequantizer, int nrc_y>
static void mul_mat_qY_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%QK_K == 0);
    const int nb = n/QK_K;

    Q8<nrc_y> q8(info);

    __m256i all_scales[2];
    __m256i scales[4];
    __m256  accd[nrc_y];

    Dequantizer deq(vx, bx);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accd, all_scales);

            __m256i sumi[nrc_y];

            for (int j = 0; j < QK_K/128; ++j) {
                deq.prepare(i, j);
                set_scales_16(all_scales[j], scales);
                multiply_add(deq.bits, scales, j, i, q8, sumi);
            }

            for (int iy = 0; iy < nrc_y; ++iy) {
                accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(iy, i)), _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }

    }

}

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accd[nrc_y];
    __m256i scales[4];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            auto all_scales = deq.new_block(i, q8, accd);

            __m256i sumi[nrc_y];

            for (int j = 0; j < QK_K/128; ++j) {

                deq.prepare(i, j);

                set_scales_8(all_scales, j, scales);

                multiply_add(deq.bits, scales, j, i, q8, sumi);

            }

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i));
                accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }

    }
}
#endif  // Zen4 or vanilla AVX2


//
// ============================== Legacy quants
//

struct DotHelper {
    const __m256i m1 = _mm256_set1_epi16(1);
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
    inline __m256i dot(__m256i x, __m256i y) const {
        return _mm256_dpbusd_epi32(_mm256_setzero_si256(), x, y);
    }
#else
    inline __m256i dot(__m256i x, __m256i y) const {
        return _mm256_madd_epi16(m1, _mm256_maddubs_epi16(x, y));
    }
#endif
};

struct SignedDot {
    DotHelper helper;
    inline __m256i compute(__m256i x, __m256i y) const {
        return helper.dot(_mm256_sign_epi8(x, x), _mm256_sign_epi8(y, x));
    }
};
struct UnsignedDot {
    DotHelper helper;
    inline __m256i compute(__m256i x, __m256i y) const {
        return helper.dot(x, y);
    }
};
template <typename Q8, typename Dot> struct Sum4 {
    Dot dot;
    inline __m256i compute(const __m256i * qx, const Q8 * y) const {
        const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
        const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
        const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
        const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
        const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1));    // 0,0, 1,1, 0,0, 1,1
        const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3));    // 2,2, 3,3, 2,2, 3,3
        return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3
    }
};

struct Sum4_Q8 {
    SignedDot dot;
    static inline __m256i add1(__m256i a, __m256i b) {
        return _mm256_add_epi32(_mm256_unpacklo_epi32(a, b), _mm256_unpackhi_epi32(a, b));
    }
    static inline __m256i add2(__m256i a, __m256i b) {
        return _mm256_add_epi32(_mm256_unpacklo_epi64(a, b), _mm256_unpackhi_epi64(a, b));
    }
    inline __m256i compute(const __m256i * qx, const block_q8_0 * y) const {
        const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
        const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
        const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
        const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
        const __m256i p01 = add1(p0, p1);  // 0,1, 0,1, 0,1, 0,1
        const __m256i p23 = add1(p2, p3);  // 2,3, 2,3, 2,3, 2,3
        return add2(p01, p23); // returns 0,1,2,3, 0,1,2,3
    }
};

struct ScaleHelperQ_0 {
    ggml_half scales8[4];
    template <typename Q>
    inline __m128 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
        return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
    }
    template <typename Q>
    inline __m128 prepare4(__m128 other_scales, const Q * y) {
        return _mm_mul_ps(other_scales, prepare4<Q>(y));
    }
    template <typename Q> inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); }
    template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
};
template <int min_value>
struct ScaleHelperQ_0_1 {
    ggml_half scales8[4];
    template <typename Q>
    inline __m256 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
        auto s4 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
        return _mm256_set_m128(_mm_mul_ps(s4, min), s4);
    }
    template <typename Q>
    inline __m256 prepare4(__m256 other_scales, const Q * y) {
        return _mm_mul256_ps(other_scales, prepare4<Q>(y));
    }
    template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
        float d = GGML_FP16_TO_FP32(y->d);
        return std::make_pair(d, -d*float(min_value));
    }
    std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
    }
    const __m128 min = _mm_set1_ps(float(-min_value));
};

struct ScaleHelperQ_1 {
    uint32_t scales8[4];
    const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100);

    template <typename Q>
    inline __m256 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) {
            // it is slightly faster to directly dereference (const uint32 *)&y[j].d, but some compilers
            // complain that this breaks strict-aliasing rules.
            memcpy(scales8 + j, &y[j].d, sizeof(uint32_t));
        }
        return _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)scales8), shuffle));
    }

    template <typename Q>
    inline __m256 prepare4(__m256 other_scales, const Q * y) {
        return _mm256_mul_ps(other_scales, prepare4<Q>(y));
    }

    template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
        return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m));
    }
    template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m));
    }
    std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
    }
};

struct MinusType0 {
    inline __m256 compute(__m128 d, int) const { return _mm256_set_m128(d, d); }
    inline float compute(float d, int) const { return d; }
    inline float result(__m256 acc, int) const { return hsum_float_8(acc); }
};

template <int nrc_y> struct MinusType1 {
    __m128 accm[nrc_y];
    MinusType1() { for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm_setzero_ps(); }
    inline __m256 compute(__m256 dm, int iy) {
        const __m128 d = _mm256_castps256_ps128(dm);
        const __m128 m = _mm256_extractf128_ps(dm, 1);
        accm[iy] = _mm_add_ps(accm[iy], m);
        return _mm256_set_m128(d, d);
    }
    inline float compute(const std::pair<float, float>& dm, int iy) {
        accm[iy] = _mm_add_ps(accm[iy], _mm_set1_ps(dm.second*0.25f));
        return dm.first;
    }
    inline float result(__m256 acc, int iy) const {
        const __m128 sum = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
        return hsum_float_4(_mm_add_ps(sum, accm[iy]));
    }
};

template <typename Minus, int nrc_y, bool is_multiple_of_4> struct AccumT {
    __m256 acc[nrc_y];
    Minus accm;
    AccumT() {  for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = _mm256_setzero_ps(); }
    template <typename Unpacker, typename Scales, typename Sum, typename Q8>
    inline void compute(int nb, Unpacker& unp, Scales& scales, Sum& sum, const Q8 ** y, const DataInfo& info, int ix) {
        auto qx = unp.quants();
        __m256 dall[nrc_y];
        for (int i = 0; i < nb/4; ++i) {
            auto other_scales = unp.set_block_4(i);
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto s12 = scales.prepare4(other_scales, y[iy] + 4*i);
                dall[iy] = accm.compute(s12, iy);
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto pall = sum.compute(qx, y[iy] + 4*i);
                acc[iy] = _mm256_fmadd_ps(dall[iy], _mm256_cvtepi32_ps(pall), acc[iy]);
            }
        }
        if (!is_multiple_of_4) {
            for (int i = 4*(nb/4); i < nb; ++i) {
                auto other_scales = unp.set_block(i);
                for (int iy = 0; iy < nrc_y; ++iy) {
                    auto s12 = scales.prepare1(other_scales, y[iy] + i);
                    auto d = accm.compute(s12, iy);
                    const __m256i p0 = sum.dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[iy][i].qs));
                    acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(p0), acc[iy]);
                }
            }
        }
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, accm.result(acc[iy], iy));
            //s[iy*bs] = accm.result(acc[iy], iy);
        }
    }
};

template <int nrc_y, bool is_multiple_of_4>
using AccumType0 = AccumT<MinusType0, nrc_y, is_multiple_of_4>;

template <int nrc_y, bool is_multiple_of_4>
using AccumType1 = AccumT<MinusType1<nrc_y>, nrc_y, is_multiple_of_4>;

using Sum4Type0 = Sum4<block_q8_0, SignedDot>;
using Sum4Type1 = Sum4<block_q8_1, UnsignedDot>;

template <typename Unpacker, typename Sum4Type, typename AccumType, typename Scales, typename Q8, int nrc_y>
void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) {
    Unpacker unp(vx, bx);
    Sum4Type sum4;
    Scales scales;
    for (int ix = 0; ix < nrc_x; ++ix) {
        unp.set_row(ix);
        AccumType accum;
        accum.compute(nb, unp, scales, sum4, y, info, ix);
    }
}

template <typename Unpacker, int nrc_y>
void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_0> q8(info);
    int nb = n/Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, true>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, false>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}

template <typename Unpacker, int nrc_y>
void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_1> q8(info);
    int nb = n/Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, true>, ScaleHelperQ_1, block_q8_1, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, false>, ScaleHelperQ_1, block_q8_1, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}

struct Dequantizer4bit {
    const __m256i m4 = _mm256_set1_epi8(0xf);
    inline __m256i dequant(const uint8_t * qs) const {
        const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
        return _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), m4);
    }
};

struct Q8_0_Dequantizer {
    inline __m256i dequant(const block_q8_0 * x) const {
        return _mm256_loadu_si256((const __m256i *)x->qs);
    }
};

struct Q8_0_1_Dequantizer {
    inline __m256i dequant(const block_q8_0 * x) const {
        return _mm256_add_epi8(_mm256_set1_epi8(127), _mm256_loadu_si256((const __m256i *)x->qs));
    }
};

struct Q4_0_Dequantizer {
    Dequantizer4bit b4;
    const __m256i m8 = _mm256_set1_epi8(-8);
    inline __m256i dequant(const block_q4_0 * x) const {
        return _mm256_add_epi8(b4.dequant(x->qs), m8);
    }
};

struct Q4_1_Dequantizer {
    Dequantizer4bit b4;
    inline __m256i dequant(const block_q4_1 * x) const {
        return b4.dequant(x->qs);
    }
};

struct HBitDequantizer {
    const __m256i shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
    const __m256i mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
    const __m256i minus1 = _mm256_set1_epi64x(-1);
    inline __m256i to_bytes(const uint8_t * bits) const {
        // Note: Data in all ggml quants is at least 2-byte aligned.
        // => we can cast to uint16_t and use or on two consecutive entries
        // which is faster than memcpy
        const uint16_t * aux16 = (const uint16_t *)bits;
        const uint32_t aux32 = aux16[0] | (aux16[1] << 16);
        //uint32_t aux32; memcpy(&aux32, bits, sizeof(uint32_t));
        __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(aux32), shuffle);
        bytes = _mm256_or_si256(bytes, mask);
        return _mm256_cmpeq_epi8(bytes, minus1);
    }
};

struct Q5_0_Dequantizer {
    Dequantizer4bit b4;
    HBitDequantizer hbit;
    const __m256i mh = _mm256_set1_epi8((char)0xF0);
    inline __m256i dequant(const block_q5_0 * x) const {
        const __m256i vqh = _mm256_andnot_si256(hbit.to_bytes(x->qh), mh);
        return _mm256_or_si256(b4.dequant(x->qs), vqh);
    }
};

struct Q5_1_Dequantizer {
    Dequantizer4bit b4;
    HBitDequantizer hbit;
    const __m256i mh = _mm256_set1_epi8(0x10);
    inline __m256i dequant(const block_q5_1 * x) const {
        const __m256i vqh = _mm256_and_si256(hbit.to_bytes(x->qh), mh);
        return _mm256_or_si256(b4.dequant(x->qs), vqh);
    }
};

template <typename Q, typename Scales, typename Dequantizer>
struct Q_Unpacker {
    Q_Unpacker(const void * vx, size_t bx) : cx_0((const char *)vx), x((const Q*)cx_0), bx(bx) {}

    const char * cx_0;
    const Q    * x;
    size_t       bx;

    Scales scales;
    Dequantizer deq;

    __m256i qx[4];

    inline const __m256i* quants() const { return qx; }

    inline void set_row(int ix) { x = (const Q*)(cx_0 + ix*bx); }

    inline auto set_block_4(int i) {
        for (int j = 0; j < 4; ++j) {
            qx[j] = deq.dequant(x + 4*i + j);
        }
        return scales.prepare4(x + 4*i);
    }
    inline auto set_block(int i) {
        qx[0] = deq.dequant(x + i);
        return scales.prepare1(x + i);
    }
};

struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_Dequantizer> {
    Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_0; }
};
struct Q8_0_1_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0_1<127>, Q8_0_1_Dequantizer> {
    Q8_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
//    using Sum4T = Sum4TypeQ81;
    inline static int block_size() { return QK8_0; }
};
struct Q4_0_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0, Q4_0_Dequantizer> {
    Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_0; }
};
struct Q5_0_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0, Q5_0_Dequantizer> {
    Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK5_0; }
};
struct Q4_1_Unpacker final : public Q_Unpacker<block_q4_1, ScaleHelperQ_1, Q4_1_Dequantizer> {
    Q4_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_1; }
};
struct Q5_1_Unpacker final : public Q_Unpacker<block_q5_1, ScaleHelperQ_1, Q5_1_Dequantizer> {
    Q5_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_1; }
};

template <int nrc_y>
void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%Q8_0_Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_0> q8(info);
    int nb = n/Q8_0_Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Q8_0_Unpacker, Sum4_Q8, AccumType0<nrc_y, true>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Q8_0_Unpacker, Sum4_Q8, AccumType0<nrc_y, false>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}


/*
moonll
add some structs for DequantizerIQ2XXS
SimpleBits
EvenSignHelper
*/
struct SimpleBits {
    __m256i values[4];
};

// fix for #829: 添加对 AVX512VPOPCNTDQ 的检测
#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__)
#define HAVE_AVX512_POPCNT 1
#else
#define HAVE_AVX512_POPCNT 0
#endif

struct EvenSignHelper {
    #if defined HAVE_FANCY_SIMD
    // #pragma message("Using AVX512VPOPCNTDQ in even sign helper")
        union sbits_t {
            __m128i vec;
            __mmask32 mask[4];
        };
        IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const {
            aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask);
            
            // fix for #829: 兼容Intel Cascade Lake架构的CPU，如果不支持AVX512VPOPCNTDQ扩展，则使用替代实现
            #if HAVE_AVX512_POPCNT
                auto pcnt = _mm256_popcnt_epi32(aux);
                
            #else
                // 提供替代实现，使用标准的位计数方法
                __m256i pcnt;
                int* pcnt_ptr = reinterpret_cast<int*>(&pcnt);
                int* aux_ptr = reinterpret_cast<int*>(&aux); // 直接获取 aux 的地址，避免不必要的复制
                
                #pragma unroll 8  // 提示编译器展开循环，提高 SIMD 计算吞吐量
                for (int i = 0; i < 8; i++) {
                    pcnt_ptr[i] = __builtin_popcount(aux_ptr[i]); // 使用编译器内置 popcount
                }
            #endif
            
            sbits_t sbits;
            sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
            values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]);
            values[1] = _mm256_mask_sub_epi8(values[1], sbits.mask[1], _mm256_setzero_si256(), values[1]);
            //auto sign_bits = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
            //const __mmask32 * m32 = (const __mmask32 *)&sign_bits;
            //values[0] = _mm256_mask_sub_epi8(values[0], m32[0], _mm256_setzero_si256(), values[0]);
            //values[1] = _mm256_mask_sub_epi8(values[1], m32[1], _mm256_setzero_si256(), values[1]);
        }
        const __m256i shifts = _mm256_set_epi32(21, 14, 7, 0, 21, 14, 7, 0);
        const __m256i mask   = _mm256_set1_epi32(127);
        const __m256i mone   = _mm256_set1_epi32(1);
    #else
        inline void sign_value(uint32_t aux32, __m256i& value) const {
            auto signs = _mm256_set_epi64x(keven_signs[(aux32 >> 21) & 127], keven_signs[(aux32 >> 14) & 127],
                                           keven_signs[(aux32 >>  7) & 127], keven_signs[(aux32 >>  0) & 127]);
            value = _mm256_sign_epi8(value, signs);
        }
    #endif
};

/*
moonll ad multiply_add for mul_mat_qX_K_q8_K_IQ_1
add func
get_scale_shuffle_8
get_scale_shuffle_16
set_scales_16
*/

inline __m256i get_scale_shuffle_8(int i) {
    return _mm256_set1_epi16((2*i) | ((2*i+1) << 8));
}

inline void set_scales_8(const __m256i& all_scales, int j, __m256i * scales) {
    scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+0));
    scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+1));
    scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+2));
    scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+3));
}


inline __m256i get_scale_shuffle_16(int i) {
    static const uint8_t k_shuffle[128] = {
         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
    };
    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
}

inline void set_scales_16(const __m256i& all_scales, __m256i * scales) {
    scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(0));
    scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(1));
    scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(2));
    scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(3));
}


template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
    if (j == 0) {
#ifdef HAVE_FANCY_SIMD
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            sumi[iy] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
        }
#else
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
            sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4));
        }
#endif
    } else {
#ifdef HAVE_FANCY_SIMD
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
        }
#else
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
        }
#endif
    }
}

/*
moonll ad multiply_add_1 for mul_mat_qX_K_q8_K_IQ_1
add func
set_scales_8_iq
set_scales_16_iq

add MUL_MAT
mul_mat_qX_K_q8_K_IQ_1
mul_mat_qX_K_q8_K_IQ_N
mul_mat_qX_K_q8_K_IQ
*/

template <typename Bits>
inline void multiply_add_1(int j, const Bits& bits, const __m256i * scales, const __m256i * q8, __m256i * sumi) {
    if (j == 0) {
#ifdef HAVE_FANCY_SIMD
        auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]);
        auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]);
        auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]);
        auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]);
        sumi[0] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_packs_epi32(p1, p2));
        sumi[1] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[1], _mm256_packs_epi32(p3, p4));
#else
        const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0]));
        const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1]));
        const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2]));
        const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3]));
        sumi[0] = _mm256_add_epi32(p1, p3);
        sumi[1] = _mm256_add_epi32(p2, p4);
#endif
    } else {
#ifdef HAVE_FANCY_SIMD
        auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]);
        auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]);
        auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]);
        auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]);
        sumi[0] = _mm256_dpwssd_epi32(sumi[0], scales[0], _mm256_packs_epi32(p1, p2));
        sumi[1] = _mm256_dpwssd_epi32(sumi[1], scales[1], _mm256_packs_epi32(p3, p4));
#else
        const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0]));
        const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1]));
        const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2]));
        const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3]));
        sumi[0] = _mm256_add_epi32(sumi[0], _mm256_add_epi32(p1, p3));
        sumi[1] = _mm256_add_epi32(sumi[1], _mm256_add_epi32(p2, p4));
#endif
    }
}


inline void set_scales_8_iq(int j, const __m256i& all_scales, __m256i * scales) {
    //#ifdef HAVE_FANCY_SIMD
        auto shuffle = j == 0 ? _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100)
                              : _mm256_set_epi64x(0x0b0a0b0a0b0a0b0a, 0x0908090809080908, 0x0b0a0b0a0b0a0b0a, 0x0908090809080908);
        scales[0] = _mm256_shuffle_epi8(all_scales, shuffle);
        scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(4)));
    //#else
    //    set_scales_8(all_scales, j, scales);
    //#endif
    }
    
inline void set_scales_16_iq(const __m256i& all_scales, __m256i * scales) {
    #ifdef HAVE_FANCY_SIMD
        auto shuffle = _mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100);
        scales[0] = _mm256_shuffle_epi8(all_scales, shuffle);
        scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(8)));
    #else
        set_scales_16(all_scales, scales);
    #endif
    }
    
template <typename Dequantizer>
static void mul_mat_qX_K_q8_K_IQ_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
        const int nb = n / QK_K;
        Q8<1> q8(info);
        Dequantizer deq(vx, bx);
        __m256i scales[2];
        __m256i q8_quants[4];
        for (int ix = 0; ix < nrc_x; ++ix) {
    
            __m256 accd = _mm256_setzero_ps();
            deq.new_row(ix);
    
            for (int i = 0; i < nb; ++i) {
    
                __m256i sumi[2], all_scales[Dequantizer::num_blocks/8];
                deq.new_block(i, all_scales);
    
                for (int j = 0; j < QK_K/128; ++j) {
                    deq.prepare(i, j, q8, q8_quants);
                    if constexpr (Dequantizer::num_blocks == 8) {
                        set_scales_8_iq(j, all_scales[0], scales);
                    } else {
                        set_scales_16_iq(all_scales[j], scales);
                    }
                    multiply_add_1(j, deq.bits, scales, q8_quants, sumi);
                }
                accd = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(0, i)), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi[0], sumi[1])), accd);
            }
    
            info.store(ix, 0, hsum_float_8(accd));
        }
    }


template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_IQ_N(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    const int nb = n / QK_K;
    Q8<nrc_y> q8(info);
    Dequantizer deq(vx, bx);
    __m256i scales[4];
    __m256  accd[nrc_y];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            __m256i sumi[nrc_y], all_scales[Dequantizer::num_blocks/8];
            //for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = _mm256_setzero_si256();
            __m256i mins;
            float dmin = deq.new_block(i, all_scales, mins);
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto bsums = q8.load_bsums(iy, i);
                auto prod  = _mm256_madd_epi16(mins, bsums);
                accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(dmin*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
            }

            for (int j = 0; j < QK_K/128; ++j) {
                deq.prepare(i, j);
                if constexpr (Dequantizer::num_blocks == 8) {
                    set_scales_8(all_scales[0], j, scales);
                } else {
                    set_scales_16(all_scales[j], scales);
                }
                //multiply_add_iq(deq.bits, scales, j, i, q8, sumi);
                multiply_add(deq.bits, scales, j, i, q8, sumi);
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i));
                accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }
        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }
    }
}

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
#ifdef HAVE_FANCY_SIMD
    if constexpr (nrc_y == 1) {
        mul_mat_qX_K_q8_K_IQ_1<Dequantizer>(n, vx, bx, info, nrc_x);
    } else {
        mul_mat_qX_K_q8_K_IQ_N<Dequantizer, nrc_y>(n, vx, bx, info, nrc_x);
    }
#else
    mul_mat_qX_K_q8_K_IQ_N<Dequantizer, nrc_y>(n, vx, bx, info, nrc_x);
#endif
}

/*
moonll iq1s
core func for iq1s mul_mat_iq1_s_q8_K

*/

template <int nrc_y>
static void mul_mat_iq1_s_q8_K(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    GGML_ASSERT(n%QK_K == 0);
    Q8<nrc_y, block_q8_K> q8(info);
    __m256i qx[8];
    __m256i scales[4];
    __m256  acc[nrc_y] = {};
    auto delta_mask = _mm_set1_epi16(-32768); // to avoid stupid overflow warnings when using 0x8000
    __m256i shuffle0 = _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100);
    for (int ix = 0; ix < nrc_x; ++ix) {
        auto iq1s = (const block_iq1_s *)((const char *)vx + ix*bx);
        for (int ibl = 0; ibl < n/QK_K; ++ibl) {
            float d = GGML_FP16_TO_FP32(iq1s[ibl].d);
            auto qhb = _mm_loadu_si128((const __m128i *)iq1s[ibl].qh);
            auto scales128 = _mm_and_si128(_mm_srli_epi16(qhb, 12), _mm_set1_epi16(7));
            scales128 = _mm_add_epi16(_mm_slli_epi16(scales128, 1), _mm_set1_epi16(1));
#ifdef HAVE_FANCY_SIMD
            auto mask = _mm_cmpeq_epi16_mask(_mm_and_si128(qhb, delta_mask), delta_mask);
            auto deltas128 = _mm_mask_blend_epi16(mask, _mm_set1_epi16(-7), _mm_set1_epi16(-9));
#else
            auto mask = _mm_cmpeq_epi16(_mm_and_si128(qhb, delta_mask), delta_mask);
            auto deltas128 = _mm_or_si128(_mm_and_si128(mask, _mm_set1_epi16(-9)), _mm_andnot_si128(mask, _mm_set1_epi16(-7)));
#endif
            deltas128 = _mm_mullo_epi16(scales128, deltas128);
            scales128 = _mm_slli_epi16(scales128, 3);
            auto deltas_l = _mm_unpacklo_epi16(deltas128, deltas128);
            auto deltas_h = _mm_unpackhi_epi16(deltas128, deltas128);
            auto deltas = MM256_SET_M128I(deltas_h, deltas_l); // blocks 0,0, 1,1, 2,2, ..., 7,7
            auto all_scales = MM256_SET_M128I(scales128, scales128);
            auto shuffle = shuffle0;
            for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
                scales[ib64] = _mm256_shuffle_epi8(all_scales, shuffle);
                shuffle = _mm256_add_epi8(shuffle, _mm256_set1_epi8(4));
            }
            const uint8_t  * qs = iq1s[ibl].qs;
            const uint16_t * qh = iq1s[ibl].qh;
            for (int ib = 0; ib < QK_K/32; ib += 2) {
                qx[ib+0] = _mm256_set_epi64x(iq1s_grid_us[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid_us[qs[2] | ((qh[ib+0] << 2) & 0x700)],
                                             iq1s_grid_us[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid_us[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
                qx[ib+1] = _mm256_set_epi64x(iq1s_grid_us[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid_us[qs[6] | ((qh[ib+1] << 2) & 0x700)],
                                             iq1s_grid_us[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid_us[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
                qs += 8;
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto bsums = q8.load_bsums(iy, ibl);
                auto sumi = _mm256_setzero_si256();
                for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
                    auto qy1 = q8.load_quants(iy, ibl, 2*ib64+0);
                    auto qy2 = q8.load_quants(iy, ibl, 2*ib64+1);
#ifdef HAVE_FANCY_SIMD
                    auto dot1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+0], qy1);
                    auto dot2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+1], qy2);
                    sumi = _mm256_dpwssd_epi32(sumi, scales[ib64], _mm256_packs_epi32(dot1, dot2));
#else
                    auto dot1 = _mm256_maddubs_epi16(qx[2*ib64+0], qy1);
                    auto dot2 = _mm256_maddubs_epi16(qx[2*ib64+1], qy2);
                    auto dot  = _mm256_add_epi16(_mm256_unpacklo_epi64(dot1, dot2), _mm256_unpackhi_epi64(dot1, dot2));
                    sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(scales[ib64], dot));
#endif
                }
#ifdef HAVE_FANCY_SIMD
                sumi = _mm256_dpwssd_epi32(sumi, bsums, deltas);
#else
                sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(bsums, deltas));
#endif
                acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d*q8.scale(iy, ibl)), _mm256_cvtepi32_ps(sumi), acc[iy]);
            }
        }
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, 0.125f*hsum_float_8(acc[iy]));
            acc[iy] = _mm256_setzero_ps();
        }
    }
}

/*
moonll iq1s
DequantizerIQ2XXS
DequantizerIQ2XXS is important Dequantizer for DequantizerIQ1_S
*/

struct DequantizerIQ2XXS final : public BaseDequantizer<block_iq2_xxs> {
    DequantizerIQ2XXS(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}

    constexpr static int num_blocks = 8;

    union Data {
        __m256i vec;
        uint32_t val[8];
    };

    inline __m128i load_scales(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        const uint16_t * a16 = (const uint16_t *)x[i].qs;
        auto scales = _mm_srli_epi16(_mm_set_epi16(a16[31], a16[27], a16[23], a16[19], a16[15], a16[11], a16[7], a16[3]), 12);
        return _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi16(1));
    }

    inline void new_block(int i, __m256i * scales) {
        auto sc16 = load_scales(i);
        scales[0] = MM256_SET_M128I(sc16, sc16);
    }
    inline float new_block(int i, __m256i * scales, __m256i& mins) {
        auto sc16 = load_scales(i);
        mins = scb.shuffle(sc16);
        scales[0] = MM256_SET_M128I(sc16, sc16);
        return -d*minv;
    }

    inline static void make4(const uint32_t * aux32, __m256i * values) {
        const uint8_t * aux8 = (const uint8_t *)aux32;
        values[0] = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[ 1]], iq2xxs_grid[aux8[ 0]]);
        values[1] = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[ 9]], iq2xxs_grid[aux8[ 8]]);
        values[2] = _mm256_set_epi64x(iq2xxs_grid[aux8[19]], iq2xxs_grid[aux8[18]], iq2xxs_grid[aux8[17]], iq2xxs_grid[aux8[16]]);
        values[3] = _mm256_set_epi64x(iq2xxs_grid[aux8[27]], iq2xxs_grid[aux8[26]], iq2xxs_grid[aux8[25]], iq2xxs_grid[aux8[24]]);
    }

    IQK_ALWAYS_INLINE void sign_values(const uint32_t * aux32, __m256i * values) const {
#ifdef HAVE_FANCY_SIMD
        esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[3]), _mm_set1_epi32(aux32[1])), values+0);
        esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[7]), _mm_set1_epi32(aux32[5])), values+2);
#else
        esh.sign_value(aux32[1], values[0]);
        esh.sign_value(aux32[3], values[1]);
        esh.sign_value(aux32[5], values[2]);
        esh.sign_value(aux32[7], values[3]);
#endif
    }
    inline void make4_signed(const uint32_t * aux32, const __m256i& min_value, __m256i * values) const {
        make4(aux32, values);
        sign_values(aux32, values);
        for (int k = 0; k < 4; ++k) values[k] = _mm256_add_epi8(values[k], min_value);
    }
    inline void make4(const uint32_t * aux32, __m256i * values, __m256i * q8) const {
        make4(aux32, values);
        sign_values(aux32, q8);
    }
    inline void prepare(int i, int j) {
        Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j);
        make4_signed(data.val, min_value, bits.values);
    }
    inline void prepare(int i, int j, const Q8<1>& q8, __m256i * q8_quants) {
        for (int k = 0; k < 4; ++k) q8_quants[k] = q8.load_quants(0, i, 4*j+k);
        Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j);
        make4(data.val, bits.values, q8_quants);
    }

    constexpr static int minv = 43;
    SimpleBits bits;
    Scales8KBase scb;
    EvenSignHelper esh;
    const __m256i min_value = _mm256_set1_epi8(minv);
    const __m256i shuffle = _mm256_set_epi32(7, 5, 3, 1, 7, 5, 3, 1);
};

/*
moonll
add Q8_0_Unpacker && DequantizerIQ2XXS support
add func mul_mat_qX_K_q8_K_IQ
*/

template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
    if constexpr (std::is_same_v<Dequantizer, Q4_0_Unpacker> || std::is_same_v<Dequantizer, Q5_0_Unpacker> ||
        std::is_same_v<Dequantizer, Q8_0_Unpacker>) {
            m.funcs[0] = mul_mat_qX_0_q8_0_T<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qX_0_q8_0_T<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_0_q8_0_T<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_0_q8_0_T<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_0_q8_0_T<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_0_q8_0_T<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_0_q8_0_T<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_0_q8_0_T<Dequantizer, 8>;
        }
        else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker>|| std::is_same_v<Dequantizer, Q8_0_1_Unpacker>) {
            m.funcs[0] = mul_mat_qX_1_q8_1_T<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qX_1_q8_1_T<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_1_q8_1_T<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_1_q8_1_T<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_1_q8_1_T<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_1_q8_1_T<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_1_q8_1_T<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_1_q8_1_T<Dequantizer, 8>;
        }
        else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2XXS>) {
            m.funcs[0] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 8>;
            }
            else {
#ifdef HAVE_FANCY_SIMD
            if constexpr (std::is_same_v<Dequantizer, DequantizerIQ4XS>) {
            m.funcs[0] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 1>;
            m.funcs[1] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 2>;
            m.funcs[2] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 3>;
            m.funcs[3] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 4>;
            m.funcs[4] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 5>;
            m.funcs[5] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 6>;
            m.funcs[6] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 7>;
            m.funcs[7] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 8>;
            } else {
            m.funcs[0] = mul_mat_qX_K_q8_K_AVX512_1<Dequantizer>;
            m.funcs[1] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 8>;
            }
#else
            if constexpr (std::is_same_v<Dequantizer, DequantizerQ2K> ||
                          std::is_same_v<Dequantizer, DequantizerQ3K> ||
                          std::is_same_v<Dequantizer, DequantizerQ6K>) {
                m.funcs[0] = mul_mat_qY_K_q8_K_T<Dequantizer, 1>;
                m.funcs[1] = mul_mat_qY_K_q8_K_T<Dequantizer, 2>;
                m.funcs[2] = mul_mat_qY_K_q8_K_T<Dequantizer, 3>;
                m.funcs[3] = mul_mat_qY_K_q8_K_T<Dequantizer, 4>;
                m.funcs[4] = mul_mat_qY_K_q8_K_T<Dequantizer, 5>;
                m.funcs[5] = mul_mat_qY_K_q8_K_T<Dequantizer, 6>;
                m.funcs[6] = mul_mat_qY_K_q8_K_T<Dequantizer, 7>;
                m.funcs[7] = mul_mat_qY_K_q8_K_T<Dequantizer, 8>;
            } else {
                m.funcs[0] = mul_mat_qX_K_q8_K_T<Dequantizer, 1>;
                m.funcs[1] = mul_mat_qX_K_q8_K_T<Dequantizer, 2>;
                m.funcs[2] = mul_mat_qX_K_q8_K_T<Dequantizer, 3>;
                m.funcs[3] = mul_mat_qX_K_q8_K_T<Dequantizer, 4>;
                m.funcs[4] = mul_mat_qX_K_q8_K_T<Dequantizer, 5>;
                m.funcs[5] = mul_mat_qX_K_q8_K_T<Dequantizer, 6>;
                m.funcs[6] = mul_mat_qX_K_q8_K_T<Dequantizer, 7>;
                m.funcs[7] = mul_mat_qX_K_q8_K_T<Dequantizer, 8>;
            }
#endif
        }
}

struct QFBase {
    #ifdef __AVX512F__
        constexpr static int k_step = 16;
        using Data = __m512;
        using Acc  = __m512;
        static inline Data load(const ggml_half * x) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)x)); }
        static inline Data load(const float * x) { return _mm512_loadu_ps(x); }
        static inline Data load(const ggml_bf16_t * x) {
            return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)x)), 16));
        }
        static inline Acc acc(Acc prev, const Data& y, const Data& x) {
            return _mm512_fmadd_ps(y, x, prev);
        }
        static inline Acc acc_first(const Data& y, const Data& x) {
            return _mm512_mul_ps(y, x);
        }
        static inline Acc add(Acc x, Acc y) { return _mm512_add_ps(x, y); }
        static inline float hsum(Acc acc) {
            return _mm512_reduce_add_ps(acc);
        }
        template <typename Float>
        static inline Data load4Floats(const Float * x) {
            return _mm512_insertf32x4(_mm512_setzero_ps(), load128(x), 0);
        }
        static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) {
            acc = _mm512_fmadd_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00), acc);
            acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc);
            acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc);
            acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc);
            return acc;
        }
        static inline Acc acc_r4_first(const Data * xv, const Data& yv) {
            auto acc = _mm512_mul_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00));
            acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc);
            acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc);
            acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc);
            return acc;
        }
        static inline __m128 hsum_r4(Acc acc) {
            auto sum1 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 0), _mm512_extractf32x4_ps(acc, 1));
            auto sum2 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 2), _mm512_extractf32x4_ps(acc, 3));
            return _mm_add_ps(sum1, sum2);
        }
    #else
        constexpr static int k_step = 8;
        using Data = __m256;
        using Acc  = __m256;
        static inline Data load(const ggml_half * x) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)x)); }
        static inline Data load(const float * x) { return _mm256_loadu_ps(x); }
        static inline Data load(const ggml_bf16_t * x) {
            return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)x)), 16));
        }
        static inline Acc acc(Acc prev, const Data& y, const Data& x) {
            return _mm256_fmadd_ps(y, x, prev);
        }
        static inline Acc add(Acc x, Acc y) { return _mm256_add_ps(x, y); }
        static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) {
            acc = _mm256_fmadd_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00), acc);
            acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc);
            acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc);
            acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc);
            return acc;
        }
        static inline Acc acc_r4_first(const Data * xv, const Data& yv) {
            auto acc = _mm256_mul_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00));
            acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc);
            acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc);
            acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc);
            return acc;
        }
        static inline Acc acc_first(const Data& y, const Data& x) {
            return _mm256_mul_ps(y, x);
        }
        static inline float hsum(Acc acc) {
            return hsum_float_8(acc);
        }
        static inline __m128 hsum_r4(Acc acc) {
            return _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
        }
        template <typename Float>
        static inline Data load4Floats(const Float * x) {
            return _mm256_insertf128_ps(_mm256_setzero_ps(), load128(x), 0);
        }
    #endif
        static inline __m128 load128(const ggml_half * x) { return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x)); }
        static inline __m128 load128(const float * x) { return _mm_loadu_ps(x); }
        static inline __m128 load128(const ggml_bf16_t * x) {
            return _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*)x)), 16));
        }
    };
    template <typename Float, int nrc_in> struct QFT final : public QFBase {
        constexpr static int nrc = nrc_in;
        QFT(const DataInfo& info) {
            for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)info.src1_row(iy);
        }
        QFT(const char * cx, size_t bx) {
            for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)(cx + iy*bx);
        }
        IQK_ALWAYS_INLINE Data load1(int iy, int i) const { return load(y[iy] + k_step*i); }
        IQK_ALWAYS_INLINE Data load_tail(int iy, int i) const { return load4Floats(y[iy] + 4*i); }
        IQK_ALWAYS_INLINE void load_r4(int ix, int i, Data * xv) const {
            xv[0] = load1(ix+0, i);
            xv[1] = load1(ix+1, i);
            xv[2] = load1(ix+2, i);
            xv[3] = load1(ix+3, i);
    #ifdef __AVX512F__
            auto t0 = _mm512_unpacklo_ps(xv[0], xv[1]);
            auto t1 = _mm512_unpacklo_ps(xv[2], xv[3]);
            auto t2 = _mm512_unpackhi_ps(xv[0], xv[1]);
            auto t3 = _mm512_unpackhi_ps(xv[2], xv[3]);
            xv[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1)));
            xv[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1)));
            xv[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3)));
            xv[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3)));
    #else
            auto t0 = _mm256_unpacklo_ps(xv[0], xv[1]);
            auto t1 = _mm256_unpacklo_ps(xv[2], xv[3]);
            auto t2 = _mm256_unpackhi_ps(xv[0], xv[1]);
            auto t3 = _mm256_unpackhi_ps(xv[2], xv[3]);
            xv[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1)));
            xv[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1)));
            xv[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3)));
            xv[3] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3)));
    #endif
        }
        const Float * y[nrc];
    };
    

template <typename Qy, typename Qx>
IQK_NOINLINE void mul_mat_Qx_Qy_MxN(int n, const char * cx, size_t bx, int ix0, const DataInfo& info) {
    int nb = n/QFBase::k_step;
    int nb4 = n/4;
    Qy y(info);
    Qx x(cx + ix0*bx, bx);
    QFBase::Data xv[Qx::nrc];
    QFBase::Acc  acc[Qx::nrc*Qy::nrc];
    auto yv = y.load1(0, 0);
    for (int ix = 0; ix < Qx::nrc; ++ix) {
        xv[ix] = x.load1(ix, 0);
        acc[ix] = QFBase::acc_first(yv, xv[ix]);
    }
    for (int iy = 1; iy < Qy::nrc; ++iy) {
        yv = y.load1(iy, 0);
        for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc_first(yv, xv[ix]);
    }
    for (int i = 1; i < nb; ++i) {
        yv = y.load1(0, i);
        for (int ix = 0; ix < Qx::nrc; ++ix) {
            xv[ix] = x.load1(ix, i);
            acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]);
        }
        for (int iy = 1; iy < Qy::nrc; ++iy) {
            yv = y.load1(iy, i);
            for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
        }
    }
    for (int i = (QFBase::k_step/4)*nb; i < nb4; ++i) {
        yv = y.load_tail(0, i);
        for (int ix = 0; ix < Qx::nrc; ++ix) {
            xv[ix] = x.load_tail(ix, i);
            acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]);
        }
        for (int iy = 1; iy < Qy::nrc; ++iy) {
            yv = y.load_tail(iy, i);
            for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
        }
    }
    for (int iy = 0; iy < Qy::nrc; ++iy) for (int ix = 0; ix < Qx::nrc; ++ix) info.store(ix0+ix, iy, QFBase::hsum(acc[Qx::nrc*iy+ix]));
}
// This will handle any of f16 x f32, f32 x f16, f16 x f16, f32 x f32, with computations done
// in f32 (i.e., f16 is first converted to f32). It is easy to extend to computations done in
// f16, but I don't have a CPU capable of f16 vector arithmetic, so not doing it for now.
template <int nrc_y, typename FloatX, typename FloatY>
void mul_mat_fX_fY_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    const char * cx = (const char *)vx;
    // TBD if we want this
    //if constexpr (nrc_y == 1) {
    //    constexpr int k_nx = 2;
    //    for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
    //        mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, k_nx>>(n, cx, bx, ix*k_nx, info);
    //    }
    //    if (int lastx = k_nx*(nrc_x/k_nx); lastx < nrc_x) {
    //        int nx = nrc_x - lastx;
    //        switch (nx) {
    //            case 1: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info); break;
    //            case 2: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, lastx, info); break;
    //            case 3: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, lastx, info); break;
    //        }
    //        //mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info);
    //    }
    //    return;
    //}
#ifdef __AVX512F__
    constexpr int k_nx = 5;
#else
    constexpr int k_nx = nrc_y == 1 ? 4 : 2;
#endif
    for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
        mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, k_nx>>(n, cx, bx, ix*k_nx, info);
    }
    int last_x = k_nx*(nrc_x/k_nx);
    if (last_x == nrc_x) return;
    int nx = nrc_x - last_x;
#ifdef __AVX512F__
    switch (nx) {
        case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
        case 2: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, last_x, info); break;
        case 3: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, last_x, info); break;
        case 4: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 4>>(n, cx, bx, last_x, info); break;
    }
#else
    if constexpr (nrc_y == 1) {
        switch (nx) {
            case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
            case 2: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, last_x, info); break;
            case 3: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, last_x, info); break;
        }
    } else {
        switch (nx) {
            case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
        }
    }
#endif
}

template <typename FloatX, typename FloatY>
void set_mul_mat_f(MulMat& mm) {
    for (auto& f : mm.funcs) f = nullptr;
    mm.funcs[0] = mul_mat_fX_fY_T<1, FloatX, FloatY>;
    mm.funcs[1] = mul_mat_fX_fY_T<2, FloatX, FloatY>;
    mm.funcs[2] = mul_mat_fX_fY_T<3, FloatX, FloatY>;
    mm.funcs[3] = mul_mat_fX_fY_T<4, FloatX, FloatY>;
    mm.funcs[4] = mul_mat_fX_fY_T<5, FloatX, FloatY>;
#ifndef __AVX512F__
    mm.funcs[5] = mul_mat_fX_fY_T<6, FloatX, FloatY>;
#endif
}


/*
moonll
add typeb TO compare return not expected type of weight matrix
add IQ2XSS
add IQ1_S
add GGML_TYPE_IQ4_XS
*/

bool MulMat::set_mul_mat(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
    (void)Ny;

        auto expected_typeB = GGML_TYPE_Q8_K;
    switch (typeA) {
        case GGML_TYPE_Q2_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ2K>(mm);
            break;
        case GGML_TYPE_Q3_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ3K>(mm);
            break;
        case GGML_TYPE_Q4_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ4K>(mm);
            break;
        case GGML_TYPE_Q5_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ5K>(mm);
            break;
        case GGML_TYPE_Q6_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ6K>(mm);
            break;
        case GGML_TYPE_IQ4_XS:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerIQ4XS>(mm);
            break;
        case GGML_TYPE_IQ2_XXS:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerIQ2XXS>(mm);
            break;
        case GGML_TYPE_Q4_0:
            assert (ne00 % QK4_0 == 0);
            MulMat::set_functions<Q4_0_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_0;
            break;
        case GGML_TYPE_Q4_1:
            assert (ne00 % QK4_1 == 0);
            MulMat::set_functions<Q4_1_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_1_X4;
            break;
        case GGML_TYPE_Q5_0:
            assert (ne00 % QK5_0 == 0);
            MulMat::set_functions<Q5_0_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_0;
            break;
        case GGML_TYPE_Q5_1:
            assert (ne00 % QK5_1 == 0);
            MulMat::set_functions<Q5_1_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_1_X4;
            break;
        case GGML_TYPE_Q8_0:
            assert (ne00 % QK8_0 == 0);
#ifdef HAVE_FANCY_SIMD
            MulMat::set_functions<Q8_0_1_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_1_X4;
#else
            MulMat::set_functions<Q8_0_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_0_X4;
#endif
            break;
        case GGML_TYPE_IQ1_S:
            mm.funcs[0] = mul_mat_iq1_s_q8_K<1>;
            mm.funcs[1] = mul_mat_iq1_s_q8_K<2>;
            mm.funcs[2] = mul_mat_iq1_s_q8_K<3>;
            mm.funcs[3] = mul_mat_iq1_s_q8_K<4>;
            mm.funcs[4] = mul_mat_iq1_s_q8_K<5>;
            mm.funcs[5] = mul_mat_iq1_s_q8_K<6>;
            mm.funcs[6] = mul_mat_iq1_s_q8_K<7>;
            mm.funcs[7] = mul_mat_iq1_s_q8_K<8>;
        #ifdef HAVE_FANCY_SIMD
             mm.func16 = mul_mat_iq1_s_q8_K<16>;
        #endif
       // row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
              expected_typeB = GGML_TYPE_Q8_K;
            break;

        default:
        {
            printf("case:%d",typeA);
            return false;
        }
            
    }


    return ggml_type(typeB) == expected_typeB;

}

} // namespace

/*
iq1_s is not support for arm
*/
#else   // __aarch64__
#include <arm_neon.h>

namespace {

template <int nrc, typename block_q8 = block_q8_K> struct Q8 {

    constexpr static int nrc_y = nrc;

    Q8(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
    }

    inline int8x16_t load_quants_16(int iy, int i, int j) const { return vld1q_s8(y[iy][i].qs + 16*j); }
    inline int8x16x2_t load_quants(int iy, int i, int j) const { return vld1q_s8_x2(y[iy][i].qs + 32*j); }
    inline int8x16x4_t load_quants_64(int iy, int i, int j) const { return vld1q_s8_x4(y[iy][i].qs + 64*j); }
    inline int16x8x2_t load_bsums(int iy, int i) const { return vld1q_s16_x2(y[iy][i].bsums); }
    inline int16x8_t load_bsums8(int iy, int i) const {
        auto q8s = vld1q_s16_x2(y[iy][i].bsums);
        return vpaddq_s16(q8s.val[0], q8s.val[1]);
    }
    inline float scale(int iy, int i) const { return y[iy][i].d; }

    const block_q8 * y[nrc_y];
};

template <typename block_q>
struct BaseDequantizer {
    BaseDequantizer(const void * vx, size_t bx, int nrc) : vx(vx), x(nullptr), bx(bx), nrc(nrc) {}
    inline void new_row(int ix) { x = (const block_q *)((const char *)vx + ix*bx); }
    const void * vx;
    const block_q * x;
    const size_t bx;
    const int nrc;
};

struct Q4bits {
    const uint8x16_t m4b = vdupq_n_u8(0xf);
    uint8x16x4_t b1, b2;
    inline void prepare4(uint8x16x4_t& b, const uint8x16_t * val) const {
        b.val[0] = vandq_u8(val[0], m4b);
        b.val[2] = vshrq_n_u8(val[0], 4);
        b.val[1] = vandq_u8(val[1], m4b);
        b.val[3] = vshrq_n_u8(val[1], 4);
    }
    inline void prepare4_16(uint8x16x4_t& b, const uint8x16_t * val) const {
        b.val[0] = vandq_u8(val[0], m4b);
        b.val[1] = vshrq_n_u8(val[0], 4);
        b.val[2] = vandq_u8(val[1], m4b);
        b.val[3] = vshrq_n_u8(val[1], 4);
    }
    inline void prepare(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x2(qs);
        prepare4(b1, q4bits.val);
        q4bits = vld1q_u8_x2(qs+32);
        prepare4(b2, q4bits.val);
    }
    inline void prepare_v2(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        prepare4(b1, q4bits.val+0);
        prepare4(b2, q4bits.val+2);
    }
    inline void prepare64(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        b1.val[0] = vandq_u8(q4bits.val[0], m4b);
        b1.val[1] = vandq_u8(q4bits.val[1], m4b);
        b1.val[2] = vandq_u8(q4bits.val[2], m4b);
        b1.val[3] = vandq_u8(q4bits.val[3], m4b);
        b2.val[0] = vshrq_n_u8(q4bits.val[0], 4);
        b2.val[1] = vshrq_n_u8(q4bits.val[1], 4);
        b2.val[2] = vshrq_n_u8(q4bits.val[2], 4);
        b2.val[3] = vshrq_n_u8(q4bits.val[3], 4);
    }
    inline void prepare16(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x2(qs);
        prepare4_16(b1, q4bits.val);
        q4bits = vld1q_u8_x2(qs+32);
        prepare4_16(b2, q4bits.val);
    }
    inline void prepare16_v2(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        prepare4_16(b1, q4bits.val+0);
        prepare4_16(b2, q4bits.val+2);
    }
};

struct Scales8 {
    uint32_t utmp[4];
    const uint8_t * sc8 = (const uint8_t *)utmp;
    template <typename Q8, typename Qx>
    inline int32x4x2_t process_scales_mins(const Qx& x, const Q8& q8, int i, float32x4_t * acc) {
        make_q4_scales(x.scales, utmp);
        int16x8_t mins = vmovl_s8(vld1_s8((const int8_t *)sc8 + 8));
        accum_mins_8(mins, q8, acc, i, -GGML_FP16_TO_FP32(x.dmin));

        uint8x8_t scales8 = vld1_u8(sc8);
        uint16x8_t scales16 = vmovl_u8(scales8);
        int32x4x2_t scales = {vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales16))),
                              vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales16)))};
        return scales;
    }
};


struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return s8.process_scales_mins(x[i], q8, i, acc);
    }
    inline void prepare(int i, int j) {
        if (nrc == 1) bits.prepare_v2(x[i].qs+64*j);
        else bits.prepare(x[i].qs+64*j);
    }

    Q4bits bits;
    Scales8 s8;

    float d;
};


struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return process_scales_mins_16(vld1q_s8(x[i].scales), q8, acc, i, -32.f*d);
    }
    inline void prepare(int i, int j) {

        auto hbits = vld1q_u8_x2(x[i].qh + 32*j);

        bits.prepare64(x[i].ql+64*j);
        bits.b1.val[0] = vorrq_u8(bits.b1.val[0], vandq_u8(vshlq_n_u8(hbits.val[0], 4), mhb));
        bits.b1.val[1] = vorrq_u8(bits.b1.val[1], vandq_u8(vshlq_n_u8(hbits.val[1], 4), mhb));
        bits.b1.val[2] = vorrq_u8(bits.b1.val[2], vandq_u8(vshlq_n_u8(hbits.val[0], 2), mhb));
        bits.b1.val[3] = vorrq_u8(bits.b1.val[3], vandq_u8(vshlq_n_u8(hbits.val[1], 2), mhb));

        bits.b2.val[0] = vorrq_u8(bits.b2.val[0], vandq_u8(hbits.val[0], mhb));
        bits.b2.val[1] = vorrq_u8(bits.b2.val[1], vandq_u8(hbits.val[1], mhb));
        bits.b2.val[2] = vorrq_u8(bits.b2.val[2], vandq_u8(vshrq_n_u8(hbits.val[0], 2), mhb));
        bits.b2.val[3] = vorrq_u8(bits.b2.val[3], vandq_u8(vshrq_n_u8(hbits.val[1], 2), mhb));

    }

    Q4bits bits;

    const uint8x16_t mhb = vdupq_n_u8(0x30);

    float d;
};

template <typename Dequantizer>
struct BlockQxK {
    inline BlockQxK(const int maxn, const int maxk): maxn(maxn), maxk(maxk) {
        values = (int8_t*)aligned_alloc(256, maxn * maxk * sizeof(int8_t));
        scales = (int*)aligned_alloc(256,    maxn * maxk / SS * sizeof(int));
        ds     = (float*)aligned_alloc(256,  maxn * maxk / QK * sizeof(int));
        if constexpr (NeedSum) {
            dmins = (float*)aligned_alloc(256, maxn * maxk / QK * sizeof(int));
            scalems = (int16_t*)aligned_alloc(256, maxn * maxk / SS * sizeof(int16_t));
        }
    }
    inline ~BlockQxK() {
        free(values);
        free(scales);
        free(ds);
        if constexpr (NeedSum) {
            free(dmins);
            free(scalems);
        }
    }
    inline int FromDequantizer(const void * vx, size_t bx, int idx, int n_, int k_) {
        n = n_;
        k = k_;
        bn = n / BS;
        bk = k / QK;

        Dequantizer deq(vx, bx, 1);
        for (int i = 0; i < n; i += BS) {
            for (int j = 0; j < BS; j ++) {
                deq.new_row(j + i + idx);
                for (int x = 0; x < bk; x ++) {
                    {
                        int8x16_t base = NeedSum ? vdupq_n_s8(0) : vdupq_n_s8(32);
                        int32_t *dst = (int32_t*)(values + i*k + j*4 + x*QK*BS);
                        deq.prepare(x, 0);
                        int8x16_t v0 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[0]), base);
                        int8x16_t v1 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[1]), base);
                        int8x16_t v2 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[2]), base);
                        int8x16_t v3 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[3]), base);
                        *(dst + (0 + 0*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 0);
                        *(dst + (1 + 0*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 1);
                        *(dst + (2 + 0*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 2);
                        *(dst + (3 + 0*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 3);
                        *(dst + (0 + 1*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 0);
                        *(dst + (1 + 1*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 1);
                        *(dst + (2 + 1*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 2);
                        *(dst + (3 + 1*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 3);
                        *(dst + (0 + 2*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 0);
                        *(dst + (1 + 2*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 1);
                        *(dst + (2 + 2*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 2);
                        *(dst + (3 + 2*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 3);
                        *(dst + (0 + 3*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 0);
                        *(dst + (1 + 3*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 1);
                        *(dst + (2 + 3*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 2);
                        *(dst + (3 + 3*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 3);
                        v0 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[0]), base);
                        v1 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[1]), base);
                        v2 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[2]), base);
                        v3 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[3]), base);
                        *(dst + (0 + 0*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 0);
                        *(dst + (1 + 0*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 1);
                        *(dst + (2 + 0*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 2);
                        *(dst + (3 + 0*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 3);
                        *(dst + (0 + 1*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 0);
                        *(dst + (1 + 1*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 1);
                        *(dst + (2 + 1*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 2);
                        *(dst + (3 + 1*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 3);
                        *(dst + (0 + 2*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 0);
                        *(dst + (1 + 2*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 1);
                        *(dst + (2 + 2*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 2);
                        *(dst + (3 + 2*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 3);
                        *(dst + (0 + 3*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 0);
                        *(dst + (1 + 3*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 1);
                        *(dst + (2 + 3*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 2);
                        *(dst + (3 + 3*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 3);
                        deq.prepare(x, 1);
                        v0 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[0]), base);
                        v1 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[1]), base);
                        v2 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[2]), base);
                        v3 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[3]), base);
                        *(dst + (0 + 0*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 0);
                        *(dst + (1 + 0*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 1);
                        *(dst + (2 + 0*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 2);
                        *(dst + (3 + 0*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 3);
                        *(dst + (0 + 1*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 0);
                        *(dst + (1 + 1*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 1);
                        *(dst + (2 + 1*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 2);
                        *(dst + (3 + 1*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 3);
                        *(dst + (0 + 2*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 0);
                        *(dst + (1 + 2*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 1);
                        *(dst + (2 + 2*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 2);
                        *(dst + (3 + 2*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 3);
                        *(dst + (0 + 3*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 0);
                        *(dst + (1 + 3*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 1);
                        *(dst + (2 + 3*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 2);
                        *(dst + (3 + 3*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 3);
                        v0 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[0]), base);
                        v1 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[1]), base);
                        v2 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[2]), base);
                        v3 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[3]), base);
                        *(dst + (0 + 0*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 0);
                        *(dst + (1 + 0*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 1);
                        *(dst + (2 + 0*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 2);
                        *(dst + (3 + 0*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 3);
                        *(dst + (0 + 1*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 0);
                        *(dst + (1 + 1*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 1);
                        *(dst + (2 + 1*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 2);
                        *(dst + (3 + 1*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 3);
                        *(dst + (0 + 2*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 0);
                        *(dst + (1 + 2*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 1);
                        *(dst + (2 + 2*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 2);
                        *(dst + (3 + 2*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 3);
                        *(dst + (0 + 3*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 0);
                        *(dst + (1 + 3*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 1);
                        *(dst + (2 + 3*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 2);
                        *(dst + (3 + 3*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 3);
                    }
                    if constexpr (std::is_same_v<DequantizerQ6K, Dequantizer>)
                    {
                        int32_t *dst = (int32_t*)(scales + i*(k/SS) + j + x*QK/SS*BS);
                        int8x16_t ss = vld1q_s8(deq.x[x].scales);
                        int16x8_t s16_0 = vmovl_s8(vget_low_s8(ss));
                        int16x8_t s16_1 = vmovl_s8(vget_high_s8(ss));
                        int32x4_t s32_0 = vmovl_s16(vget_low_s16(s16_0));
                        int32x4_t s32_1 = vmovl_s16(vget_high_s16(s16_0));
                        int32x4_t s32_2 = vmovl_s16(vget_low_s16(s16_1));
                        int32x4_t s32_3 = vmovl_s16(vget_high_s16(s16_1));
                        *(dst + (0+0*4)*BS) = vgetq_lane_s32(s32_0, 0);
                        *(dst + (1+0*4)*BS) = vgetq_lane_s32(s32_0, 1);
                        *(dst + (2+0*4)*BS) = vgetq_lane_s32(s32_0, 2);
                        *(dst + (3+0*4)*BS) = vgetq_lane_s32(s32_0, 3);
                        *(dst + (0+1*4)*BS) = vgetq_lane_s32(s32_1, 0);
                        *(dst + (1+1*4)*BS) = vgetq_lane_s32(s32_1, 1);
                        *(dst + (2+1*4)*BS) = vgetq_lane_s32(s32_1, 2);
                        *(dst + (3+1*4)*BS) = vgetq_lane_s32(s32_1, 3);
                        *(dst + (0+2*4)*BS) = vgetq_lane_s32(s32_2, 0);
                        *(dst + (1+2*4)*BS) = vgetq_lane_s32(s32_2, 1);
                        *(dst + (2+2*4)*BS) = vgetq_lane_s32(s32_2, 2);
                        *(dst + (3+2*4)*BS) = vgetq_lane_s32(s32_2, 3);
                        *(dst + (0+3*4)*BS) = vgetq_lane_s32(s32_3, 0);
                        *(dst + (1+3*4)*BS) = vgetq_lane_s32(s32_3, 1);
                        *(dst + (2+3*4)*BS) = vgetq_lane_s32(s32_3, 2);
                        *(dst + (3+3*4)*BS) = vgetq_lane_s32(s32_3, 3);
                    }
                    if constexpr (std::is_same_v<DequantizerQ4K, Dequantizer>)
                    {
                        int32_t *dst = (int32_t*)(scales + i*(k/SS) + j + x*QK/SS*BS);
                        int16_t *dst2 = (int16_t*)(scalems + i*(k/SS) + j + x*QK/SS*BS);
                        uint32_t utmp[4];
                        const uint8_t * sc8 = (const uint8_t *)utmp;
                        make_q4_scales(deq.x[x].scales, utmp);
                        int8x16_t ss = vld1q_s8((const int8_t *)sc8);
                        int16x8_t scale = vmovl_s8(vget_low_s8(ss));
                        int16x8_t scale_min = vmovl_high_s8(ss);
                        int32x4_t s32_0 = vmovl_s16(vget_low_s16(scale));
                        int32x4_t s32_1 = vmovl_s16(vget_high_s16(scale));
                        *(dst + (0+0*4)*BS) = vgetq_lane_s32(s32_0, 0);
                        *(dst + (1+0*4)*BS) = vgetq_lane_s32(s32_0, 1);
                        *(dst + (2+0*4)*BS) = vgetq_lane_s32(s32_0, 2);
                        *(dst + (3+0*4)*BS) = vgetq_lane_s32(s32_0, 3);
                        *(dst + (0+1*4)*BS) = vgetq_lane_s32(s32_1, 0);
                        *(dst + (1+1*4)*BS) = vgetq_lane_s32(s32_1, 1);
                        *(dst + (2+1*4)*BS) = vgetq_lane_s32(s32_1, 2);
                        *(dst + (3+1*4)*BS) = vgetq_lane_s32(s32_1, 3);
                        *(dst2 + 0*BS) = vgetq_lane_s16(scale_min, 0);
                        *(dst2 + 1*BS) = vgetq_lane_s16(scale_min, 1);
                        *(dst2 + 2*BS) = vgetq_lane_s16(scale_min, 2);
                        *(dst2 + 3*BS) = vgetq_lane_s16(scale_min, 3);
                        *(dst2 + 4*BS) = vgetq_lane_s16(scale_min, 4);
                        *(dst2 + 5*BS) = vgetq_lane_s16(scale_min, 5);
                        *(dst2 + 6*BS) = vgetq_lane_s16(scale_min, 6);
                        *(dst2 + 7*BS) = vgetq_lane_s16(scale_min, 7);
                    }
                    {
                        float *dst = ds + i*bk + j + x*BS;
                        *dst = GGML_FP16_TO_FP32(deq.x[x].d);
                    }
                    if constexpr (std::is_same_v<DequantizerQ4K, Dequantizer>)
                    {
                        float *dst = dmins + i*bk + j + x*BS;
                        *dst = - GGML_FP16_TO_FP32(deq.x[x].dmin);
                    }
                }
            }
        }
        return 0;
    }

    int8_t *values;     // [bn][k/4][BS][4]
    int    *scales;     // [bn][k/SS][BS]
    float  *ds;         // [bn][bk][BS]
    float  *dmins;      // [bn][bk][BS]
    int16_t *scalems;   // [bn][k/SS][BS]

    static constexpr int BS = 8;
    static constexpr int QK = 256;
    static constexpr int SS = std::is_same_v<Dequantizer, DequantizerQ6K> ? 16 : 32;
    static constexpr int NeedSum = std::is_same_v<Dequantizer, DequantizerQ6K> ? 0 : 1;
    const int maxn;
    const int maxk;
    int n;
    int k;
    int bn;
    int bk;
};

template <typename Dequantizer, int BN>
IQK_NOINLINE void matmul_v2_kernel(const Dequantizer *a, const block_q8_K *y[BN], const DataInfo &info, int idx, int idy) {
    constexpr int BS = a->BS;
    constexpr int QK = a->QK;
    constexpr int SS = a->SS;
    for (int s = 0; s < a->n; s += BS) {
        float32x4_t cc[BN][BS/4];
        for (int i = 0; i < BN; i ++) {
            for (int j = 0; j < BS/4; j ++) {
                cc[i][j] = vdupq_n_f32(0);
            }
        }
        const int8_t *a_ptr = a->values + s*a->k;
        const int8_t *b_ptr[BN];
        for (int k = 0; k < a->bk; k ++) {
            for (int i = 0; i < BN; i ++) {
                b_ptr[i] = y[i][k].qs;
            }
            int32x4_t cci[BN][BS/4];
            if constexpr (BN == 4 && SS == 16) {
                int64_t length = QK/SS;
                auto ap = a_ptr;
                auto sp = a->scales + s*a->k/SS + (k*QK/SS)*BS;
                // asm volatile (
                asm volatile (
                    " eor    %[c00].16b, %[c00].16b, %[c00].16b \n"
                    " eor    %[c10].16b, %[c10].16b, %[c10].16b \n"
                    " eor    %[c20].16b, %[c20].16b, %[c20].16b \n"
                    " eor    %[c30].16b, %[c30].16b, %[c30].16b \n"
                    " eor    %[c01].16b, %[c01].16b, %[c01].16b \n"
                    " eor    %[c11].16b, %[c11].16b, %[c11].16b \n"
                    " eor    %[c21].16b, %[c21].16b, %[c21].16b \n"
                    " eor    %[c31].16b, %[c31].16b, %[c31].16b \n"
                    " loop_%=: \n"
                    " subs   %[len], %[len], #1 \n"
                    " ld1    {v12.16b}, [%[bp0]], #16 \n"
                    " ld1    {v13.16b}, [%[bp1]], #16 \n"
                    " ld1    {v14.16b}, [%[bp2]], #16 \n"
                    " ld1    {v15.16b}, [%[bp3]], #16 \n"
                    " prfm   pldl1strm, [%[ap], #256] \n"
                    " ld1    {v8.16b},  [%[ap]], #16 \n"
                    " ld1    {v9.16b},  [%[ap]], #16 \n"
                    " eor    v0.16b, v0.16b, v0.16b \n"
                    " eor    v1.16b, v1.16b, v1.16b \n"
                    " eor    v2.16b, v2.16b, v2.16b \n"
                    " eor    v3.16b, v3.16b, v3.16b \n"
                    " eor    v4.16b, v4.16b, v4.16b \n"
                    " eor    v5.16b, v5.16b, v5.16b \n"
                    " eor    v6.16b, v6.16b, v6.16b \n"
                    " eor    v7.16b, v7.16b, v7.16b \n"
                    " ld1    {v10.16b}, [%[ap]], #16 \n"
                    " ld1    {v11.16b}, [%[ap]], #16 \n"
                    " sdot   v0.4s, v8.16b,  v12.4b[0] \n"
                    " sdot   v1.4s, v8.16b,  v13.4b[0] \n"
                    " sdot   v2.4s, v8.16b,  v14.4b[0] \n"
                    " sdot   v3.4s, v8.16b,  v15.4b[0] \n"
                    " sdot   v4.4s, v9.16b,  v12.4b[0] \n"
                    " sdot   v5.4s, v9.16b,  v13.4b[0] \n"
                    " sdot   v6.4s, v9.16b,  v14.4b[0] \n"
                    " sdot   v7.4s, v9.16b,  v15.4b[0] \n"
                    " prfm   pldl1strm, [%[ap], #256] \n"
                    " ld1    {v8.16b},  [%[ap]], #16 \n"
                    " ld1    {v9.16b},  [%[ap]], #16 \n"
                    " sdot   v0.4s, v10.16b, v12.4b[1] \n"
                    " sdot   v1.4s, v10.16b, v13.4b[1] \n"
                    " sdot   v2.4s, v10.16b, v14.4b[1] \n"
                    " sdot   v3.4s, v10.16b, v15.4b[1] \n"
                    " sdot   v4.4s, v11.16b, v12.4b[1] \n"
                    " sdot   v5.4s, v11.16b, v13.4b[1] \n"
                    " sdot   v6.4s, v11.16b, v14.4b[1] \n"
                    " sdot   v7.4s, v11.16b, v15.4b[1] \n"
                    " ld1    {v10.16b}, [%[ap]], #16 \n"
                    " ld1    {v11.16b}, [%[ap]], #16 \n"
                    " sdot   v0.4s, v8.16b,  v12.4b[2] \n"
                    " sdot   v1.4s, v8.16b,  v13.4b[2] \n"
                    " sdot   v2.4s, v8.16b,  v14.4b[2] \n"
                    " sdot   v3.4s, v8.16b,  v15.4b[2] \n"
                    " sdot   v4.4s, v9.16b,  v12.4b[2] \n"
                    " sdot   v5.4s, v9.16b,  v13.4b[2] \n"
                    " sdot   v6.4s, v9.16b,  v14.4b[2] \n"
                    " sdot   v7.4s, v9.16b,  v15.4b[2] \n"
                    " ld1    {v8.4s}, [%[sp]], #16 \n"
                    " ld1    {v9.4s}, [%[sp]], #16 \n"
                    " sdot   v0.4s, v10.16b, v12.4b[3] \n"
                    " sdot   v1.4s, v10.16b, v13.4b[3] \n"
                    " sdot   v2.4s, v10.16b, v14.4b[3] \n"
                    " sdot   v3.4s, v10.16b, v15.4b[3] \n"
                    " sdot   v4.4s, v11.16b, v12.4b[3] \n"
                    " sdot   v5.4s, v11.16b, v13.4b[3] \n"
                    " sdot   v6.4s, v11.16b, v14.4b[3] \n"
                    " sdot   v7.4s, v11.16b, v15.4b[3] \n"
                    " mla    %[c00].4s, v0.4s, v8.4s \n"
                    " mla    %[c10].4s, v1.4s, v8.4s \n"
                    " mla    %[c20].4s, v2.4s, v8.4s \n"
                    " mla    %[c30].4s, v3.4s, v8.4s \n"
                    " mla    %[c01].4s, v4.4s, v9.4s \n"
                    " mla    %[c11].4s, v5.4s, v9.4s \n"
                    " mla    %[c21].4s, v6.4s, v9.4s \n"
                    " mla    %[c31].4s, v7.4s, v9.4s \n"
                    " bne    loop_%= \n"
                    " exit_%=:\n"
                    : [len]    "+r" (length)
                    , [ap]     "+r" (ap)
                    , [bp0]    "+r" (b_ptr[0])
                    , [bp1]    "+r" (b_ptr[1])
                    , [bp2]    "+r" (b_ptr[2])
                    , [bp3]    "+r" (b_ptr[3])
                    , [sp]     "+r" (sp)
                    , [c00]    "+w" (cci[0][0])
                    , [c10]    "+w" (cci[1][0])
                    , [c20]    "+w" (cci[2][0])
                    , [c30]    "+w" (cci[3][0])
                    , [c01]    "+w" (cci[0][1])
                    , [c11]    "+w" (cci[1][1])
                    , [c21]    "+w" (cci[2][1])
                    , [c31]    "+w" (cci[3][1])
                    :
                    : "v0",  "v1",  "v2",  "v3"
                    , "v4",  "v5",  "v6",  "v7"
                    , "v8",  "v9",  "v10", "v11"
                    , "v12", "v13", "v14", "v15"
                    , "memory", "cc"
                );
                a_ptr += BS * QK;
            } else if (BN == 4 && SS == 32) {
                int64_t length = QK/SS;
                auto ap = a_ptr;
                auto sp = a->scales + s*a->k/SS + (k*QK/SS)*BS;
                // asm volatile (
                asm volatile (
                    " eor    %[c00].16b, %[c00].16b, %[c00].16b \n"
                    " eor    %[c10].16b, %[c10].16b, %[c10].16b \n"
                    " eor    %[c20].16b, %[c20].16b, %[c20].16b \n"
                    " eor    %[c30].16b, %[c30].16b, %[c30].16b \n"
                    " eor    %[c01].16b, %[c01].16b, %[c01].16b \n"
                    " eor    %[c11].16b, %[c11].16b, %[c11].16b \n"
                    " eor    %[c21].16b, %[c21].16b, %[c21].16b \n"
                    " eor    %[c31].16b, %[c31].16b, %[c31].16b \n"
                    " loop_%=: \n"
                    " subs   %[len], %[len], #1 \n"
                    " ld1    {v12.16b}, [%[bp0]], #16 \n"
                    " ld1    {v13.16b}, [%[bp1]], #16 \n"
                    " ld1    {v14.16b}, [%[bp2]], #16 \n"
                    " ld1    {v15.16b}, [%[bp3]], #16 \n"
                    " prfm   pldl1strm, [%[ap], #256] \n"
                    " ld1    {v8.16b},  [%[ap]], #16 \n"
                    " ld1    {v9.16b},  [%[ap]], #16 \n"
                    " eor    v0.16b, v0.16b, v0.16b \n"
                    " eor    v1.16b, v1.16b, v1.16b \n"
                    " eor    v2.16b, v2.16b, v2.16b \n"
                    " eor    v3.16b, v3.16b, v3.16b \n"
                    " eor    v4.16b, v4.16b, v4.16b \n"
                    " eor    v5.16b, v5.16b, v5.16b \n"
                    " eor    v6.16b, v6.16b, v6.16b \n"
                    " eor    v7.16b, v7.16b, v7.16b \n"
                    " ld1    {v10.16b}, [%[ap]], #16 \n"
                    " ld1    {v11.16b}, [%[ap]], #16 \n"
                    " sdot   v0.4s, v8.16b,  v12.4b[0] \n"
                    " sdot   v1.4s, v8.16b,  v13.4b[0] \n"
                    " sdot   v2.4s, v8.16b,  v14.4b[0] \n"
                    " sdot   v3.4s, v8.16b,  v15.4b[0] \n"
                    " sdot   v4.4s, v9.16b,  v12.4b[0] \n"
                    " sdot   v5.4s, v9.16b,  v13.4b[0] \n"
                    " sdot   v6.4s, v9.16b,  v14.4b[0] \n"
                    " sdot   v7.4s, v9.16b,  v15.4b[0] \n"
                    " prfm   pldl1strm, [%[ap], #256] \n"
                    " ld1    {v8.16b},  [%[ap]], #16 \n"
                    " ld1    {v9.16b},  [%[ap]], #16 \n"
                    " sdot   v0.4s, v10.16b, v12.4b[1] \n"
                    " sdot   v1.4s, v10.16b, v13.4b[1] \n"
                    " sdot   v2.4s, v10.16b, v14.4b[1] \n"
                    " sdot   v3.4s, v10.16b, v15.4b[1] \n"
                    " sdot   v4.4s, v11.16b, v12.4b[1] \n"
                    " sdot   v5.4s, v11.16b, v13.4b[1] \n"
                    " sdot   v6.4s, v11.16b, v14.4b[1] \n"
                    " sdot   v7.4s, v11.16b, v15.4b[1] \n"
                    " ld1    {v10.16b}, [%[ap]], #16 \n"
                    " ld1    {v11.16b}, [%[ap]], #16 \n"
                    " sdot   v0.4s, v8.16b,  v12.4b[2] \n"
                    " sdot   v1.4s, v8.16b,  v13.4b[2] \n"
                    " sdot   v2.4s, v8.16b,  v14.4b[2] \n"
                    " sdot   v3.4s, v8.16b,  v15.4b[2] \n"
                    " sdot   v4.4s, v9.16b,  v12.4b[2] \n"
                    " sdot   v5.4s, v9.16b,  v13.4b[2] \n"
                    " sdot   v6.4s, v9.16b,  v14.4b[2] \n"
                    " sdot   v7.4s, v9.16b,  v15.4b[2] \n"
                    " prfm   pldl1strm, [%[ap], #256] \n"
                    " ld1    {v8.16b},  [%[ap]], #16 \n"
                    " ld1    {v9.16b},  [%[ap]], #16 \n"
                    " sdot   v0.4s, v10.16b, v12.4b[3] \n"
                    " sdot   v1.4s, v10.16b, v13.4b[3] \n"
                    " sdot   v2.4s, v10.16b, v14.4b[3] \n"
                    " sdot   v3.4s, v10.16b, v15.4b[3] \n"
                    " sdot   v4.4s, v11.16b, v12.4b[3] \n"
                    " sdot   v5.4s, v11.16b, v13.4b[3] \n"
                    " sdot   v6.4s, v11.16b, v14.4b[3] \n"
                    " sdot   v7.4s, v11.16b, v15.4b[3] \n"
                    " ld1    {v10.16b}, [%[ap]], #16 \n"
                    " ld1    {v11.16b}, [%[ap]], #16 \n"
                    " ld1    {v12.16b}, [%[bp0]], #16 \n"
                    " ld1    {v13.16b}, [%[bp1]], #16 \n"
                    " ld1    {v14.16b}, [%[bp2]], #16 \n"
                    " ld1    {v15.16b}, [%[bp3]], #16 \n"
                    " sdot   v0.4s, v8.16b,  v12.4b[0] \n"
                    " sdot   v1.4s, v8.16b,  v13.4b[0] \n"
                    " sdot   v2.4s, v8.16b,  v14.4b[0] \n"
                    " sdot   v3.4s, v8.16b,  v15.4b[0] \n"
                    " sdot   v4.4s, v9.16b,  v12.4b[0] \n"
                    " sdot   v5.4s, v9.16b,  v13.4b[0] \n"
                    " sdot   v6.4s, v9.16b,  v14.4b[0] \n"
                    " sdot   v7.4s, v9.16b,  v15.4b[0] \n"
                    " prfm   pldl1strm, [%[ap], #256] \n"
                    " ld1    {v8.16b},  [%[ap]], #16 \n"
                    " ld1    {v9.16b},  [%[ap]], #16 \n"
                    " sdot   v0.4s, v10.16b, v12.4b[1] \n"
                    " sdot   v1.4s, v10.16b, v13.4b[1] \n"
                    " sdot   v2.4s, v10.16b, v14.4b[1] \n"
                    " sdot   v3.4s, v10.16b, v15.4b[1] \n"
                    " sdot   v4.4s, v11.16b, v12.4b[1] \n"
                    " sdot   v5.4s, v11.16b, v13.4b[1] \n"
                    " sdot   v6.4s, v11.16b, v14.4b[1] \n"
                    " sdot   v7.4s, v11.16b, v15.4b[1] \n"
                    " ld1    {v10.16b}, [%[ap]], #16 \n"
                    " ld1    {v11.16b}, [%[ap]], #16 \n"
                    " sdot   v0.4s, v8.16b,  v12.4b[2] \n"
                    " sdot   v1.4s, v8.16b,  v13.4b[2] \n"
                    " sdot   v2.4s, v8.16b,  v14.4b[2] \n"
                    " sdot   v3.4s, v8.16b,  v15.4b[2] \n"
                    " sdot   v4.4s, v9.16b,  v12.4b[2] \n"
                    " sdot   v5.4s, v9.16b,  v13.4b[2] \n"
                    " sdot   v6.4s, v9.16b,  v14.4b[2] \n"
                    " sdot   v7.4s, v9.16b,  v15.4b[2] \n"
                    " ld1    {v8.4s}, [%[sp]], #16 \n"
                    " ld1    {v9.4s}, [%[sp]], #16 \n"
                    " sdot   v0.4s, v10.16b, v12.4b[3] \n"
                    " sdot   v1.4s, v10.16b, v13.4b[3] \n"
                    " sdot   v2.4s, v10.16b, v14.4b[3] \n"
                    " sdot   v3.4s, v10.16b, v15.4b[3] \n"
                    " sdot   v4.4s, v11.16b, v12.4b[3] \n"
                    " sdot   v5.4s, v11.16b, v13.4b[3] \n"
                    " sdot   v6.4s, v11.16b, v14.4b[3] \n"
                    " sdot   v7.4s, v11.16b, v15.4b[3] \n"
                    " mla    %[c00].4s, v0.4s, v8.4s \n"
                    " mla    %[c10].4s, v1.4s, v8.4s \n"
                    " mla    %[c20].4s, v2.4s, v8.4s \n"
                    " mla    %[c30].4s, v3.4s, v8.4s \n"
                    " mla    %[c01].4s, v4.4s, v9.4s \n"
                    " mla    %[c11].4s, v5.4s, v9.4s \n"
                    " mla    %[c21].4s, v6.4s, v9.4s \n"
                    " mla    %[c31].4s, v7.4s, v9.4s \n"
                    " bne    loop_%= \n"
                    " exit_%=:\n"
                    : [len]    "+r" (length)
                    , [ap]     "+r" (ap)
                    , [bp0]    "+r" (b_ptr[0])
                    , [bp1]    "+r" (b_ptr[1])
                    , [bp2]    "+r" (b_ptr[2])
                    , [bp3]    "+r" (b_ptr[3])
                    , [sp]     "+r" (sp)
                    , [c00]    "+w" (cci[0][0])
                    , [c10]    "+w" (cci[1][0])
                    , [c20]    "+w" (cci[2][0])
                    , [c30]    "+w" (cci[3][0])
                    , [c01]    "+w" (cci[0][1])
                    , [c11]    "+w" (cci[1][1])
                    , [c21]    "+w" (cci[2][1])
                    , [c31]    "+w" (cci[3][1])
                    :
                    : "v0",  "v1",  "v2",  "v3"
                    , "v4",  "v5",  "v6",  "v7"
                    , "v8",  "v9",  "v10", "v11"
                    , "v12", "v13", "v14", "v15"
                    , "memory", "cc"
                );
                a_ptr += BS * QK;
            } else
            {
                for (int i = 0; i < BN; i ++) {
                    for (int j = 0; j < BS/4; j ++) {
                        cci[i][j] = vdupq_n_s32(0);
                    }
                }
                for (int k0 = 0; k0 < QK/SS; k0 ++) {
                    int32x4_t ccv[BN][BS/4];
                    for (int i = 0; i < BN; i ++) {
                        for (int j = 0; j < BS/4; j ++) {
                            ccv[i][j] = vdupq_n_s32(0);
                        }
                    }
                    #pragma unroll
                    for (int k2 = 0; k2 < SS; k2 += 16) {
                        const int OFFSET = 256;
                        __builtin_prefetch((a_ptr + OFFSET + 0*64), 0, 0);
                        __builtin_prefetch((a_ptr + OFFSET + 1*64), 0, 0);

                        int8x16_t bb[BN];
                        int8x16_t aa[BS/4];
                        for (int i = 0; i < BN; i ++) {
                            bb[i] = vld1q_s8(b_ptr[i]); b_ptr[i] += 16;
                        }
                        for (int k1 = 0; k1 < 4; k1 ++) {
                            for (int i = 0; i < BS/4; i ++) {
                                aa[i] = vld1q_s8(a_ptr); a_ptr += 16;
                            }
                            for (int i = 0; i < BN; i ++) {
                                for (int j = 0; j < BS/4; j ++) {
                                    ccv[i][j] = vdotq_laneq_s32(ccv[i][j], aa[j], bb[i], k1);
                                }
                            }
                        }
                    }
                    int32x4_t scal[BS/4];
                    for (int i = 0; i < BS/4; i ++) {
                        scal[i] = vld1q_s32(a->scales + s*a->k/SS + (k*QK/SS+k0)*BS + i*4);
                    }
                    for (int i = 0; i < BN; i ++) {
                        for (int j = 0; j < BS/4; j ++) {
                            cci[i][j] = vmlaq_s32(cci[i][j], ccv[i][j], scal[j]);
                        }
                    }
                }
            }
            float32x4_t scalf[BS/4];
            for (int i = 0; i < BS/4; i ++) {
                scalf[i] = vld1q_f32(a->ds + s*a->bk + k*BS + i*4);
            }
            for (int i = 0; i < BN; i ++) {
                for (int j = 0; j < BS/4; j ++) {
                    cc[i][j] = vfmaq_f32(cc[i][j], vcvtq_f32_s32(cci[i][j]), vmulq_n_f32(scalf[j], y[i][k].d));
                }
            }
        }
        if constexpr (a->NeedSum) {
            const int16_t *a_ptr = a->scalems + s*a->k/SS;
            const int16_t *b_ptr[BN];
            for (int k = 0; k < a->bk; k ++) {
                for (int i = 0; i < BN; i ++) {
                    b_ptr[i] = y[i][k].bsums;
                }
                int32x4_t cci[BN][BS/4];
                for (int i = 0; i < BN; i ++) {
                    for (int j = 0; j < BS/4; j ++) {
                        cci[i][j] = vdupq_n_s32(0);
                    }
                }
                for (int k0 = 0; k0 < QK/SS/4; k0 ++) {
                    int16x8_t bb[BN];
                    int16x8_t aa[BS/8];
                    for (int i = 0; i < BN; i ++) {
                        bb[i] = vld1q_s16(b_ptr[i]); b_ptr[i] += 8;
                    }
                    for (int k1 = 0; k1 < 4; k1 ++) {
                        for (int i = 0; i < BS/8; i ++) {
                            aa[i] = vld1q_s16(a_ptr); a_ptr += 8;
                        }
                        for (int i = 0; i < BN; i ++) {
                            for (int j = 0; j < BS/8; j ++) {
                                cci[i][2*j+0] = vmlal_laneq_s16(cci[i][2*j+0], vget_low_s16(aa[j]), bb[i], 2*k1+0);
                                cci[i][2*j+1] = vmlal_high_laneq_s16(cci[i][2*j+1], aa[j], bb[i], 2*k1+0);
                                cci[i][2*j+0] = vmlal_laneq_s16(cci[i][2*j+0], vget_low_s16(aa[j]), bb[i], 2*k1+1);
                                cci[i][2*j+1] = vmlal_high_laneq_s16(cci[i][2*j+1], aa[j], bb[i], 2*k1+1);
                            }
                        }
                    }
                }
                float32x4_t scalf[BS/4];
                for (int i = 0; i < BS/4; i ++) {
                    scalf[i] = vld1q_f32(a->dmins + s*a->bk + k*BS + i*4);
                }
                for (int i = 0; i < BN; i ++) {
                    for (int j = 0; j < BS/4; j ++) {
                        cc[i][j] = vfmaq_f32(cc[i][j], vcvtq_f32_s32(cci[i][j]), vmulq_n_f32(scalf[j], y[i][k].d));
                    }
                }
            }
        }
        for (int i = 0; i < BN; i ++) {
            for (int j = 0; j < BS/4; j ++) {
                vst1q_f32(info.ptr(j*4+s+idx, i), cc[i][j]);
            }
        }
    }
    return;
}

template <typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_T_v2(int m, int n, int k, const void * vx, size_t bx, const DataInfo& info) {
    constexpr int m_step = 64;
    constexpr int n_step = 4;
    assert(m%m_step == 0);
    int n2 = n - (n%n_step);
    int left = n%n_step;
    BlockQxK<Dequantizer> xx(m_step, k);
    for (int i = 0; i < m; i += m_step) {
        auto this_info = info;
        int bm = (m - i) < m_step ? (m - i) : m_step;
        xx.FromDequantizer(vx, bx, i, bm, k);
        for (int j = 0; j < n2; j += n_step) {
            Q8<n_step, block_q8_K> q8(this_info);
            matmul_v2_kernel<BlockQxK<Dequantizer>, n_step>(&xx, q8.y, this_info, i, j);
            this_info.cur_y += n_step;
        }
        if (left) {
            switch (left) {
                case 1:
                {
                    Q8<1, block_q8_K> q8(this_info);
                    matmul_v2_kernel<BlockQxK<Dequantizer>, 1>(&xx, q8.y, this_info, i, n2);
                    this_info.cur_y += 1;
                    break;
                }
                case 2:
                {
                    Q8<2, block_q8_K> q8(this_info);
                    matmul_v2_kernel<BlockQxK<Dequantizer>, 2>(&xx, q8.y, this_info, i, n2);
                    this_info.cur_y += 2;
                    break;
                }
                case 3:
                {
                    Q8<3, block_q8_K> q8(this_info);
                    matmul_v2_kernel<BlockQxK<Dequantizer>, 3>(&xx, q8.y, this_info, i, n2);
                    this_info.cur_y += 3;
                    break;
                }
            }
        }
    }
    return;
}

template <typename Q8>
IQK_ALWAYS_INLINE void compute_8_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8,
        const int32x4x2_t& scales, int iy, int i, int j, int32x4_t& sumi) {
    auto mzero = vdupq_n_s32(0);
    const int8x16_t * qs_1 = (const int8x16_t *)qx_1.val;
    const int8x16_t * qs_2 = (const int8x16_t *)qx_2.val;

    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[0], q8b_1.val[0]), qs_1[1], q8b_1.val[1]); // block 1
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[2], q8b_2.val[0]), qs_1[3], q8b_2.val[1]); // block 2
    auto p12 = vpaddq_s32(p1, p2);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[0], q8b_3.val[0]), qs_2[1], q8b_3.val[1]); // block 3
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[2], q8b_4.val[0]), qs_2[3], q8b_4.val[1]); // block 4
    auto p34 = vpaddq_s32(p3, p4);

    auto pall = vpaddq_s32(p12, p34);
    sumi = vmlaq_s32(sumi, scales.val[j], pall);
}

template <typename Q8>
IQK_ALWAYS_INLINE void compute_8_blocks(const int8x16_t * qx, const Q8& q8,
        const int32x4_t& scales, int iy, int i, int j, int32x4_t& sumi) {
    auto mzero = vdupq_n_s32(0);

    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[0], q8b_1.val[0]), qx[1], q8b_1.val[1]); // block 1
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[2], q8b_2.val[0]), qx[3], q8b_2.val[1]); // block 2
    auto p12 = vpaddq_s32(p1, p2);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[4], q8b_3.val[0]), qx[5], q8b_3.val[1]); // block 3
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[6], q8b_4.val[0]), qx[7], q8b_4.val[1]); // block 4
    auto p34 = vpaddq_s32(p3, p4);

    auto pall = vpaddq_s32(p12, p34);
    sumi = vmlaq_s32(sumi, scales, pall);
}

template <typename Q8>
IQK_ALWAYS_INLINE void compute_16_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8,
        const int32x4x4_t& scales, int iy, int i, int j, int32x4_t& sumi) {

    auto mzero = vdupq_n_s32(0);
    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[0]), q8b_1.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[1]), q8b_1.val[1])); // blocks 0, 0, 1, 1,
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[2]), q8b_2.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[3]), q8b_2.val[1])); // blocks 3, 3, 4, 4,
    auto p12 = vpaddq_s32(p1, p2); // blocks 0, 1, 2, 3
    sumi = vmlaq_s32(sumi, scales.val[2*j+0], p12);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[0]), q8b_3.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[1]), q8b_3.val[1])); // block 4, 4, 5, 5,
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[2]), q8b_4.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[3]), q8b_4.val[1])); // block 6, 6, 7, 7,
    auto p34 = vpaddq_s32(p3, p4); // blocks 4, 5, 6, 7
    sumi = vmlaq_s32(sumi, scales.val[2*j+1], p34);
}

template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);

    Dequantizer deq(vx, bx, nrc_y);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[nrc_y];
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        for (int i = 0; i < nb; ++i) {

            int32x4_t sumi[nrc_y];
            for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0);

            if constexpr (Dequantizer::num_blocks() == 8) {
                auto scales = deq.new_block(i);
                deq.prepare(i, 0);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                deq.prepare(i, 1);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
            }
            else if constexpr (Dequantizer::num_blocks() == 16) {
                auto scales = deq.new_block(i);
                deq.prepare(i, 0);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                deq.prepare(i, 1);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
            }
            else {
                GGML_ASSERT(false);
            }
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i)));
            }
        }
#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

template <typename Q8>
inline void accum_mins_8(const int16x8_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto q8s = q8.load_bsums8(iy, i);
        int32x4_t b1 = vmull_s16(vget_low_s16(mins), vget_low_s16(q8s));
        int32x4_t b2 = vmull_s16(vget_high_s16(mins), vget_high_s16(q8s));
        float32x4_t prod = vcvtq_f32_s32(vaddq_s32(b1, b2));
        acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i)));
    }
}

template <typename Q8>
inline void accum_mins_16(const int16x8x2_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto q8s = q8.load_bsums(iy, i);
        int32x4_t b1 = vmull_s16(vget_low_s16 (mins.val[0]), vget_low_s16 (q8s.val[0]));
        int32x4_t b2 = vmull_s16(vget_high_s16(mins.val[0]), vget_high_s16(q8s.val[0]));
        int32x4_t b3 = vmull_s16(vget_low_s16 (mins.val[1]), vget_low_s16 (q8s.val[1]));
        int32x4_t b4 = vmull_s16(vget_high_s16(mins.val[1]), vget_high_s16(q8s.val[1]));
        float32x4_t prod = vcvtq_f32_s32(vaddq_s32(vaddq_s32(b1, b2), vaddq_s32(b3, b4)));
        acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i)));
    }
}

struct Q2bits {
    const uint8x16_t m4b = vdupq_n_u8(0x03);
    uint8x16x4_t b1, b2;
    inline void prepare(const uint8_t * qs) {
        auto q2bits = vld1q_u8_x2(qs);
        b1.val[0] = vandq_u8(q2bits.val[0], m4b);
        b1.val[1] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b1.val[2] = vandq_u8(q2bits.val[0], m4b);
        b1.val[3] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b2.val[0] = vandq_u8(q2bits.val[0], m4b);
        b2.val[1] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b2.val[2] = vandq_u8(q2bits.val[0], m4b);
        b2.val[3] = vandq_u8(q2bits.val[1], m4b);
    }
};

struct HighBit5 {
    const uint8x16_t mhb = vdupq_n_u8(0x10);
    uint8x16x2_t bits;
    inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) {
        b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 4), mhb));
        b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 4), mhb));
        b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 3), mhb));
        b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 3), mhb));

        b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb));
        b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb));
        b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb));
        b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb));

        if (do_shift) {
            bits.val[0] = vshrq_n_u8(bits.val[0], 4);
            bits.val[1] = vshrq_n_u8(bits.val[1], 4);
        }
    }
};

struct HighBit3 {
    const uint8x16_t mhb = vdupq_n_u8(0x04);
    uint8x16x2_t bits;
    inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) {
        b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb));
        b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb));
        b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb));
        b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb));

        b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(bits.val[0], mhb));
        b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(bits.val[1], mhb));
        b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshrq_n_u8(bits.val[0], 1), mhb));
        b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshrq_n_u8(bits.val[1], 1), mhb));

        if (do_shift) {
            bits.val[0] = vshrq_n_u8(bits.val[0], 4);
            bits.val[1] = vshrq_n_u8(bits.val[1], 4);
        }
    }
};

struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        h.bits = vld1q_u8_x2(x[i].qh);
        return s8.process_scales_mins(x[i], q8, i, acc);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+64*j);
        h.apply(bits.b1, bits.b2, j == 0);
    }

    Q4bits bits;
    HighBit5 h;
    Scales8 s8;

    uint8x16x2_t hbits;

    float d;
};

inline int32x4x4_t make_wider(const int16x8x2_t& scales16) {
    int32x4x4_t scales = {
        vmovl_s16(vget_low_s16 (scales16.val[0])),
        vmovl_s16(vget_high_s16(scales16.val[0])),
        vmovl_s16(vget_low_s16 (scales16.val[1])),
        vmovl_s16(vget_high_s16(scales16.val[1])),
    };
    return scales;
}

template <typename Q8>
inline int32x4x4_t process_scales_mins_16(const int8x16_t& scales8, const Q8& q8, float32x4_t * acc, int i, float c) {
    int16x8x2_t scales16;
    scales16.val[0] = vmovl_s8(vget_low_s8(scales8));
    scales16.val[1] = vmovl_s8(vget_high_s8(scales8));
    accum_mins_16(scales16, q8, acc, i, c);
    return make_wider(scales16);
}

struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        h.bits = vld1q_u8_x2(x[i].hmask);
        const uint16_t * sc16 = (const uint16_t *)x[i].scales;
        uint32_t aux0 = sc16[0] | (sc16[1] << 16);
        uint32_t aux1 = sc16[2] | (sc16[3] << 16);
        uint32_t aux2 = sc16[4] | (sc16[5] << 16);
        aux32[0] =  (aux0       & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030);
        aux32[1] =  (aux1       & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030);
        aux32[2] = ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030);
        aux32[3] = ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030);
        return process_scales_mins_16(vaddq_s8(vld1q_s8((const int8_t *)aux32), vdupq_n_s8(-32)), q8, acc, i, -4.f*d);
    }

    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+32*j);
        h.apply(bits.b1, bits.b2, j == 0);
    }

    uint32_t aux32[4];

    Q2bits bits;

    HighBit3 h;

    float d;
};

struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return true; }

    template <typename Q8>
    inline void process_scales(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        auto scales_and_mins = vld1q_u8(x[i].scales);
        auto mins8 = vreinterpretq_s8_u8(vshrq_n_u8(scales_and_mins, 4));
        int16x8x2_t scales16;
        scales16.val[0] = vmovl_s8(vget_low_s8(mins8));
        scales16.val[1] = vmovl_s8(vget_high_s8(mins8));
        accum_mins_16(scales16, q8, acc, i, -GGML_FP16_TO_FP32(x[i].dmin));

        scales8 = vandq_u8(scales_and_mins, vdupq_n_u8(0xf));
    }

    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        process_scales(i, q8, acc);
        int16x8x2_t scales16;
        scales16.val[0] = vmovl_s8(vget_low_s8(vreinterpretq_s8_u8(scales8)));
        scales16.val[1] = vmovl_s8(vget_high_s8(vreinterpretq_s8_u8(scales8)));
        return make_wider(scales16);
    }

    template <typename Q8>
    inline void compute(const Q8& q8, int i, int j, int32x4_t * sumi) {
        auto m1 = vdupq_n_u8(1);
        auto shuffle = vdupq_n_u8(8*j);
        bits.b1.val[0] = vmulq_u8(bits.b1.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[1] = vmulq_u8(bits.b1.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[2] = vmulq_u8(bits.b1.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[3] = vmulq_u8(bits.b1.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[0] = vmulq_u8(bits.b2.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[1] = vmulq_u8(bits.b2.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[2] = vmulq_u8(bits.b2.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[3] = vmulq_u8(bits.b2.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[0]), q8b_1.val[0]),
                    vreinterpretq_s8_u8(bits.b1.val[1]), q8b_1.val[1]);

            auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[2]), q8b_2.val[0]),
                    vreinterpretq_s8_u8(bits.b1.val[3]), q8b_2.val[1]);

            auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[0]), q8b_3.val[0]),
                    vreinterpretq_s8_u8(bits.b2.val[1]), q8b_3.val[1]);

            auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[2]), q8b_4.val[0]),
                    vreinterpretq_s8_u8(bits.b2.val[3]), q8b_4.val[1]);
        }
    }

    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+32*j);
    }

    uint32_t aux32[4];

    uint8x16_t scales8;

    Q2bits bits;

    float d;
};

IQK_ALWAYS_INLINE void fusion_mul_mat_qX_K_q8_K_T_y1_d6k(
    float32x4_t &acc,
    const uint8_t *x_ql, // [128] 4bit
    const uint8_t *x_qh, // [64] 2bit
    const int8_t *x_scale, // [16] 8bit
    float x_d,
    const int8_t *y_qs, // [256] 8bit
    const int16_t *y_bsums, // [16] 16bit
    float y_d)
{
    float c0 = x_d * y_d;
    float c1 = -32.0f * c0;
    const int OFFSET = 1024;
    __builtin_prefetch((x_ql + OFFSET + 0*64), 0, 0);
    __builtin_prefetch((x_ql + OFFSET + 1*64), 0, 0);
    __builtin_prefetch((x_ql + OFFSET + 2*64), 0, 0);

    int16x8_t scale16_0, scale16_1;
    {
        int8x16_t tmp = vld1q_s8(x_scale);
        scale16_0 = vmovl_s8(vget_low_s8(tmp));
        scale16_1 = vmovl_high_s8(tmp);
    }
    {
        int16x8_t q8s0 = vld1q_s16(y_bsums + 0);
        int16x8_t q8s1 = vld1q_s16(y_bsums + 8);
        int32x4_t b0 = vmull_s16(vget_low_s16(scale16_0), vget_low_s16(q8s0));
        b0 = vmlal_high_s16(b0, scale16_0, q8s0);
        b0 = vmlal_s16(b0, vget_low_s16(scale16_1), vget_low_s16(q8s1));
        b0 = vmlal_high_s16(b0, scale16_1, q8s1);
        acc = vfmaq_n_f32(acc, vcvtq_f32_s32(b0), c1);
    }
    uint8x16_t x0, x1, x2, x3, x4, x5, x6, x7;
    int32x4_t sumi = vdupq_n_s32(0);
    {
        const uint8x16_t m0 = vdupq_n_u8(0x3f);
        const uint8x16_t m1 = vdupq_n_u8(0x30);
        const uint8x16_t m2 = vdupq_n_u8(0x0f);
        x0 = vld1q_u8(x_ql + 0*16 + 0*64);
        x1 = vld1q_u8(x_ql + 1*16 + 0*64);
        x2 = vld1q_u8(x_ql + 2*16 + 0*64);
        x3 = vld1q_u8(x_ql + 3*16 + 0*64);
        uint8x16_t hbits0 = vld1q_u8(x_qh + 0*16 + 0*32);
        uint8x16_t hbits1 = vld1q_u8(x_qh + 1*16 + 0*32);
        x4 = vandq_u8(hbits0, m0);
        x4 = vsriq_n_u8(x4, x0, 4);
        x5 = vandq_u8(hbits1, m0);
        x5 = vsriq_n_u8(x5, x1, 4);
        x6 = vshrq_n_u8(hbits0, 2);
        x6 = vsriq_n_u8(x6, x2, 4);
        x7 = vshrq_n_u8(hbits1, 2);
        x7 = vsriq_n_u8(x7, x3, 4);
        x0 = vsliq_n_u8(x0, hbits0, 4);
        x0 = vandq_u8(x0, m0);
        x1 = vsliq_n_u8(x1, hbits1, 4);
        x1 = vandq_u8(x1, m0);
        hbits0 = vshlq_n_u8(hbits0, 2);
        hbits0 = vandq_u8(hbits0, m1);
        x2 = vandq_u8(x2, m2);
        x2 = vorrq_u8(x2, hbits0);
        hbits1 = vshlq_n_u8(hbits1, 2);
        hbits1 = vandq_u8(hbits1, m1);
        x3 = vandq_u8(x3, m2);
        x3 = vorrq_u8(x3, hbits1);
    }
    {
        int8x16_t base = vdupq_n_s8(32);
        int8x16_t y0 = vld1q_s8(y_qs + 0*16 + 0*128);
        int8x16_t y1 = vld1q_s8(y_qs + 1*16 + 0*128);
        int8x16_t y2 = vld1q_s8(y_qs + 2*16 + 0*128);
        int8x16_t y3 = vld1q_s8(y_qs + 3*16 + 0*128);
        int8x16_t y4 = vld1q_s8(y_qs + 4*16 + 0*128);
        int8x16_t y5 = vld1q_s8(y_qs + 5*16 + 0*128);
        int8x16_t y6 = vld1q_s8(y_qs + 6*16 + 0*128);
        int8x16_t y7 = vld1q_s8(y_qs + 7*16 + 0*128);
        int32x4_t p00 = vdupq_n_s32(0);
        int32x4_t p01 = vdupq_n_s32(0);
        int32x4_t p10 = vdupq_n_s32(0);
        int32x4_t p11 = vdupq_n_s32(0);
        int32x4_t p20 = vdupq_n_s32(0);
        int32x4_t p21 = vdupq_n_s32(0);
        int32x4_t p30 = vdupq_n_s32(0);
        int32x4_t p31 = vdupq_n_s32(0);
        p00 = vdotq_s32(p00, vreinterpretq_s8_u8(x0), y0);
        p01 = vdotq_s32(p01, vreinterpretq_s8_u8(x1), y1);
        p10 = vdotq_s32(p10, vreinterpretq_s8_u8(x2), y2);
        p11 = vdotq_s32(p11, vreinterpretq_s8_u8(x3), y3);
        p20 = vdotq_s32(p20, vreinterpretq_s8_u8(x4), y4);
        p21 = vdotq_s32(p21, vreinterpretq_s8_u8(x5), y5);
        p30 = vdotq_s32(p30, vreinterpretq_s8_u8(x6), y6);
        p31 = vdotq_s32(p31, vreinterpretq_s8_u8(x7), y7);
        // p00 = vdotq_s32(p00, vsubq_s8(vreinterpretq_s8_u8(x0), base), y0);
        // p01 = vdotq_s32(p01, vsubq_s8(vreinterpretq_s8_u8(x1), base), y1);
        // p10 = vdotq_s32(p10, vsubq_s8(vreinterpretq_s8_u8(x2), base), y2);
        // p11 = vdotq_s32(p11, vsubq_s8(vreinterpretq_s8_u8(x3), base), y3);
        // p20 = vdotq_s32(p20, vsubq_s8(vreinterpretq_s8_u8(x4), base), y4);
        // p21 = vdotq_s32(p21, vsubq_s8(vreinterpretq_s8_u8(x5), base), y5);
        // p30 = vdotq_s32(p30, vsubq_s8(vreinterpretq_s8_u8(x6), base), y6);
        // p31 = vdotq_s32(p31, vsubq_s8(vreinterpretq_s8_u8(x7), base), y7);
        p00 = vpaddq_s32(p00, p01);
        p10 = vpaddq_s32(p10, p11);
        p20 = vpaddq_s32(p20, p21);
        p30 = vpaddq_s32(p30, p31);
        p00 = vpaddq_s32(p00, p10);
        p20 = vpaddq_s32(p20, p30);
        sumi = vmlaq_s32(sumi, vmovl_s16(vget_low_s16(scale16_0)), p00);
        sumi = vmlaq_s32(sumi, vmovl_high_s16(scale16_0), p20);
    }
    {
        const uint8x16_t m0 = vdupq_n_u8(0x3f);
        const uint8x16_t m1 = vdupq_n_u8(0x30);
        const uint8x16_t m2 = vdupq_n_u8(0x0f);
        x0 = vld1q_u8(x_ql + 0*16 + 1*64);
        x1 = vld1q_u8(x_ql + 1*16 + 1*64);
        x2 = vld1q_u8(x_ql + 2*16 + 1*64);
        x3 = vld1q_u8(x_ql + 3*16 + 1*64);
        uint8x16_t hbits0 = vld1q_u8(x_qh + 0*16 + 1*32);
        uint8x16_t hbits1 = vld1q_u8(x_qh + 1*16 + 1*32);
        x4 = vandq_u8(hbits0, m0);
        x4 = vsriq_n_u8(x4, x0, 4);
        x5 = vandq_u8(hbits1, m0);
        x5 = vsriq_n_u8(x5, x1, 4);
        x6 = vshrq_n_u8(hbits0, 2);
        x6 = vsriq_n_u8(x6, x2, 4);
        x7 = vshrq_n_u8(hbits1, 2);
        x7 = vsriq_n_u8(x7, x3, 4);
        x0 = vsliq_n_u8(x0, hbits0, 4);
        x0 = vandq_u8(x0, m0);
        x1 = vsliq_n_u8(x1, hbits1, 4);
        x1 = vandq_u8(x1, m0);
        hbits0 = vshlq_n_u8(hbits0, 2);
        hbits0 = vandq_u8(hbits0, m1);
        x2 = vandq_u8(x2, m2);
        x2 = vorrq_u8(x2, hbits0);
        hbits1 = vshlq_n_u8(hbits1, 2);
        hbits1 = vandq_u8(hbits1, m1);
        x3 = vandq_u8(x3, m2);
        x3 = vorrq_u8(x3, hbits1);
    }
    {
        int8x16_t base = vdupq_n_s8(32);
        int8x16_t y0 = vld1q_s8(y_qs + 0*16 + 1*128);
        int8x16_t y1 = vld1q_s8(y_qs + 1*16 + 1*128);
        int8x16_t y2 = vld1q_s8(y_qs + 2*16 + 1*128);
        int8x16_t y3 = vld1q_s8(y_qs + 3*16 + 1*128);
        int8x16_t y4 = vld1q_s8(y_qs + 4*16 + 1*128);
        int8x16_t y5 = vld1q_s8(y_qs + 5*16 + 1*128);
        int8x16_t y6 = vld1q_s8(y_qs + 6*16 + 1*128);
        int8x16_t y7 = vld1q_s8(y_qs + 7*16 + 1*128);
        int32x4_t p00 = vdupq_n_s32(0);
        int32x4_t p01 = vdupq_n_s32(0);
        int32x4_t p10 = vdupq_n_s32(0);
        int32x4_t p11 = vdupq_n_s32(0);
        int32x4_t p20 = vdupq_n_s32(0);
        int32x4_t p21 = vdupq_n_s32(0);
        int32x4_t p30 = vdupq_n_s32(0);
        int32x4_t p31 = vdupq_n_s32(0);
        p00 = vdotq_s32(p00, vreinterpretq_s8_u8(x0), y0);
        p01 = vdotq_s32(p01, vreinterpretq_s8_u8(x1), y1);
        p10 = vdotq_s32(p10, vreinterpretq_s8_u8(x2), y2);
        p11 = vdotq_s32(p11, vreinterpretq_s8_u8(x3), y3);
        p20 = vdotq_s32(p20, vreinterpretq_s8_u8(x4), y4);
        p21 = vdotq_s32(p21, vreinterpretq_s8_u8(x5), y5);
        p30 = vdotq_s32(p30, vreinterpretq_s8_u8(x6), y6);
        p31 = vdotq_s32(p31, vreinterpretq_s8_u8(x7), y7);
        // p00 = vdotq_s32(p00, vsubq_s8(vreinterpretq_s8_u8(x0), base), y0);
        // p01 = vdotq_s32(p01, vsubq_s8(vreinterpretq_s8_u8(x1), base), y1);
        // p10 = vdotq_s32(p10, vsubq_s8(vreinterpretq_s8_u8(x2), base), y2);
        // p11 = vdotq_s32(p11, vsubq_s8(vreinterpretq_s8_u8(x3), base), y3);
        // p20 = vdotq_s32(p20, vsubq_s8(vreinterpretq_s8_u8(x4), base), y4);
        // p21 = vdotq_s32(p21, vsubq_s8(vreinterpretq_s8_u8(x5), base), y5);
        // p30 = vdotq_s32(p30, vsubq_s8(vreinterpretq_s8_u8(x6), base), y6);
        // p31 = vdotq_s32(p31, vsubq_s8(vreinterpretq_s8_u8(x7), base), y7);
        p00 = vpaddq_s32(p00, p01);
        p10 = vpaddq_s32(p10, p11);
        p20 = vpaddq_s32(p20, p21);
        p30 = vpaddq_s32(p30, p31);
        p00 = vpaddq_s32(p00, p10);
        p20 = vpaddq_s32(p20, p30);
        sumi = vmlaq_s32(sumi, vmovl_s16(vget_low_s16(scale16_1)), p00);
        sumi = vmlaq_s32(sumi, vmovl_high_s16(scale16_1), p20);
    }
    {
        acc = vfmaq_n_f32(acc, vcvtq_f32_s32(sumi), c0);
    }
    return;
}

IQK_ALWAYS_INLINE void fusion_mul_mat_qX_K_q8_K_T_y1_d4k(
    float32x4_t &acc,
    const uint8_t *x_scale, // [12] 8*2*6bits
    const uint8_t *x_qs, // [128] 256*4bits
    float x_d,
    float x_dmin,
    const int8_t *y_qs, // [256] 8bit
    const int16_t *y_bsums, // [16] 16bit
    float y_d)
{
    float c0 = x_d * y_d;
    float c1 = -x_dmin * y_d;
    const int OFFSET = 1024;
    __builtin_prefetch((x_scale + OFFSET + 0*64), 0, 0);
    __builtin_prefetch((x_scale + OFFSET + 1*64), 0, 0);

    int16x8_t scale_min;
    int16x8_t scale;
    {
        uint32_t utmp[4];
        const uint8_t * sc8 = (const uint8_t *)utmp;
        make_q4_scales(x_scale, utmp);
        int8x16_t ss = vld1q_s8((const int8_t *)sc8);
        scale = vmovl_s8(vget_low_s8(ss));
        scale_min = vmovl_high_s8(ss);
    }
    {
        int16x8_t q8s0 = vld1q_s16(y_bsums + 0);
        int16x8_t q8s1 = vld1q_s16(y_bsums + 8);
        q8s0 = vpaddq_s16(q8s0, q8s1);
        int32x4_t b0 = vmull_s16(vget_low_s16(scale_min), vget_low_s16(q8s0));
        b0 = vmlal_high_s16(b0, scale_min, q8s0);
        acc = vfmaq_n_f32(acc, vcvtq_f32_s32(b0), c1);
    }
    int32x4_t sumi = vdupq_n_s32(0);
    const uint8x16_t m4b = vdupq_n_u8(0x0f);
    uint8x16_t x0, x1, x2, x3, x4, x5, x6, x7;
    {
        x0 = vld1q_u8(x_qs + 0*16 + 0*64);
        x1 = vld1q_u8(x_qs + 1*16 + 0*64);
        x4 = vld1q_u8(x_qs + 2*16 + 0*64);
        x5 = vld1q_u8(x_qs + 3*16 + 0*64);
        x2 = vshrq_n_u8(x0, 4);
        x3 = vshrq_n_u8(x1, 4);
        x6 = vshrq_n_u8(x4, 4);
        x7 = vshrq_n_u8(x5, 4);
        x0 = vandq_u8(x0, m4b);
        x1 = vandq_u8(x1, m4b);
        x4 = vandq_u8(x4, m4b);
        x5 = vandq_u8(x5, m4b);
    }
    {
        int8x16_t y0 = vld1q_s8(y_qs + 0*16 + 0*128);
        int8x16_t y1 = vld1q_s8(y_qs + 1*16 + 0*128);
        int8x16_t y2 = vld1q_s8(y_qs + 2*16 + 0*128);
        int8x16_t y3 = vld1q_s8(y_qs + 3*16 + 0*128);
        int8x16_t y4 = vld1q_s8(y_qs + 4*16 + 0*128);
        int8x16_t y5 = vld1q_s8(y_qs + 5*16 + 0*128);
        int8x16_t y6 = vld1q_s8(y_qs + 6*16 + 0*128);
        int8x16_t y7 = vld1q_s8(y_qs + 7*16 + 0*128);
        int32x4_t p0 = vdupq_n_s32(0);
        int32x4_t p1 = vdupq_n_s32(0);
        int32x4_t p2 = vdupq_n_s32(0);
        int32x4_t p3 = vdupq_n_s32(0);
        p0 = vdotq_s32(p0, vreinterpretq_s8_u8(x0), y0);
        p1 = vdotq_s32(p1, vreinterpretq_s8_u8(x2), y2);
        p2 = vdotq_s32(p2, vreinterpretq_s8_u8(x4), y4);
        p3 = vdotq_s32(p3, vreinterpretq_s8_u8(x6), y6);
        p0 = vdotq_s32(p0, vreinterpretq_s8_u8(x1), y1);
        p1 = vdotq_s32(p1, vreinterpretq_s8_u8(x3), y3);
        p2 = vdotq_s32(p2, vreinterpretq_s8_u8(x5), y5);
        p3 = vdotq_s32(p3, vreinterpretq_s8_u8(x7), y7);
        p0 = vpaddq_s32(p0, p1);
        p2 = vpaddq_s32(p2, p3);
        p0 = vpaddq_s32(p0, p2);
        sumi = vmlaq_s32(sumi, vmovl_s16(vget_low_s16(scale)), p0);
    }
    {
        x0 = vld1q_u8(x_qs + 0*16 + 1*64);
        x1 = vld1q_u8(x_qs + 1*16 + 1*64);
        x4 = vld1q_u8(x_qs + 2*16 + 1*64);
        x5 = vld1q_u8(x_qs + 3*16 + 1*64);
        x2 = vshrq_n_u8(x0, 4);
        x3 = vshrq_n_u8(x1, 4);
        x6 = vshrq_n_u8(x4, 4);
        x7 = vshrq_n_u8(x5, 4);
        x0 = vandq_u8(x0, m4b);
        x1 = vandq_u8(x1, m4b);
        x4 = vandq_u8(x4, m4b);
        x5 = vandq_u8(x5, m4b);
    }
    {
        int8x16_t y0 = vld1q_s8(y_qs + 0*16 + 1*128);
        int8x16_t y1 = vld1q_s8(y_qs + 1*16 + 1*128);
        int8x16_t y2 = vld1q_s8(y_qs + 2*16 + 1*128);
        int8x16_t y3 = vld1q_s8(y_qs + 3*16 + 1*128);
        int8x16_t y4 = vld1q_s8(y_qs + 4*16 + 1*128);
        int8x16_t y5 = vld1q_s8(y_qs + 5*16 + 1*128);
        int8x16_t y6 = vld1q_s8(y_qs + 6*16 + 1*128);
        int8x16_t y7 = vld1q_s8(y_qs + 7*16 + 1*128);
        int32x4_t p0 = vdupq_n_s32(0);
        int32x4_t p1 = vdupq_n_s32(0);
        int32x4_t p2 = vdupq_n_s32(0);
        int32x4_t p3 = vdupq_n_s32(0);
        p0 = vdotq_s32(p0, vreinterpretq_s8_u8(x0), y0);
        p1 = vdotq_s32(p1, vreinterpretq_s8_u8(x2), y2);
        p2 = vdotq_s32(p2, vreinterpretq_s8_u8(x4), y4);
        p3 = vdotq_s32(p3, vreinterpretq_s8_u8(x6), y6);
        p0 = vdotq_s32(p0, vreinterpretq_s8_u8(x1), y1);
        p1 = vdotq_s32(p1, vreinterpretq_s8_u8(x3), y3);
        p2 = vdotq_s32(p2, vreinterpretq_s8_u8(x5), y5);
        p3 = vdotq_s32(p3, vreinterpretq_s8_u8(x7), y7);
        p0 = vpaddq_s32(p0, p1);
        p2 = vpaddq_s32(p2, p3);
        p0 = vpaddq_s32(p0, p2);
        sumi = vmlaq_s32(sumi, vmovl_high_s16(scale), p0);
    }
    {
        acc = vfmaq_n_f32(acc, vcvtq_f32_s32(sumi), c0);
    }
}

template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);

    Dequantizer deq(vx, bx, nrc_y);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[nrc_y];
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

//#pragma GCC unroll 4
        for (int i = 0; i < nb; ++i) {
#ifdef GEMV_Q4K
            if constexpr (nrc_y == 1 && std::is_same<Dequantizer, DequantizerQ6K>::value) {
                fusion_mul_mat_qX_K_q8_K_T_y1_d6k(
                    acc[0],
                    deq.x[i].ql,
                    deq.x[i].qh,
                    deq.x[i].scales,
                    GGML_FP16_TO_FP32(deq.x[i].d),
                    q8.y[0][i].qs,
                    q8.y[0][i].bsums,
                    q8.y[0][i].d);
            } else
#endif
#ifdef GEMV_Q6K
            if constexpr (nrc_y == 1 && std::is_same<Dequantizer, DequantizerQ4K>::value) {
                fusion_mul_mat_qX_K_q8_K_T_y1_d4k(
                    acc[0],
                    deq.x[i].scales,
                    deq.x[i].qs,
                    GGML_FP16_TO_FP32(deq.x[i].d),
                    GGML_FP16_TO_FP32(deq.x[i].dmin),
                    q8.y[0][i].qs,
                    q8.y[0][i].bsums,
                    q8.y[0][i].d);
            } else
#endif
            {
                int32x4_t sumi[nrc_y];
                for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0);

                if constexpr (nrc_y > 1 && Dequantizer::should_scale_quants()) {
                    deq.process_scales(i, q8, acc);
                    deq.prepare(i, 0);
                    deq.compute(q8, i, 0, sumi);
                    deq.prepare(i, 1);
                    deq.compute(q8, i, 1, sumi);
                } else {
                    if constexpr (Dequantizer::num_blocks() == 8) {
                        auto scales = deq.new_block(i, q8, acc);
                        deq.prepare(i, 0);
#pragma GCC unroll 8
                        for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                        deq.prepare(i, 1);
#pragma GCC unroll 8
                        for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
                    }
                    else if constexpr (Dequantizer::num_blocks() == 16) {
                        auto scales = deq.new_block(i, q8, acc);
                        deq.prepare(i, 0);
#pragma GCC unroll 8
                        for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                        deq.prepare(i, 1);
#pragma GCC unroll 8
                        for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
                    }
                    else {
                        GGML_ASSERT(false);
                    }
                }

#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) {
                    acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i)));
                }
            }

#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                info.store(ix, iy, vaddvq_f32(acc[iy]));
            }
        }
    }
}

// ============================= i-quants

struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {

    static int8x16_t load_values() {
        static const int8_t iq4nl_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
        return vld1q_s8(iq4nl_values);
    }

    DequantizerIQ4XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc), values(load_values()) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    inline void new_row(int ix) { x = (const block_iq4_xs *)((const char *)vx + bx*ix); }

    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        (void)q8;
        (void)acc;
        d = GGML_FP16_TO_FP32(x[i].d);
        const uint16_t scales_h = x[i].scales_h;
        const uint16_t * scales_l = (const uint16_t *)x[i].scales_l;
        aux32[0] = scales_l[0] | (scales_l[1] << 16);
        aux32[1] = aux32[0] >> 4;
        // scl is ordered as 0, 2, 4, 6, 1, 3, 5, 7
        uint8x8_t scl8 = vand_u8(vld1_u8((const uint8_t *)aux32), vdup_n_u8(0xf));
        uint16_t * aux16 = (uint16_t *)aux32;
        aux16[0] = scales_h << 4; aux16[1] = scales_h << 2; aux16[2] = scales_h; aux16[3] = scales_h >> 2;
        // sch is ordered as 0, 4, 1, 5, 2, 6, 3, 7
        uint8x8_t sch8 = vand_u8(vld1_u8((const uint8_t *)aux16), vdup_n_u8(0x30));
        int8x8_t scales8 = vadd_s8(vreinterpret_s8_u8(vorr_u8(scl8, vtbl1_u8(sch8, vreinterpret_u8_u32(hshuff)))), vdup_n_s8(-32));
        // shuffle 0, 2, 4, 6, 1, 3, 5, 7 -> 0, 1, 2, 3, 4, 5, 6, 7
        scales8 = vtbl1_s8(scales8, vreinterpret_s8_u32(hshuff));
        int16x8_t scales16 = vmovl_s8(scales8);
        int32x4x2_t scales = {vmovl_s16(vget_low_s16(scales16)), vmovl_s16(vget_high_s16(scales16))};
        return scales;
    }
    inline void prepare(int i, int j) {
        bits.prepare16(x[i].qs+64*j);
        for (int k = 0; k < 4; ++k) {
            bits.b1.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b1.val[k]));
            bits.b2.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b2.val[k]));
        }
    }

    Q4bits bits;
    const int8x16_t values;
    uint32_t aux32[2];

    constexpr static uint32x2_t hshuff = {0x05010400, 0x07030602};

    float d;
};

struct SimpleBits {
    uint8x16x4_t b1;
    uint8x16x4_t b2;
};

IQK_ALWAYS_INLINE int32x4x2_t prepare_scales_8(const uint32x4_t& v1, const uint32x4_t& v2) {
    int32x4x2_t scales;
    auto one = vdupq_n_u32(1);
    scales.val[0] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v1, 28), 1));
    scales.val[1] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v2, 28), 1));
    return scales;
}

inline void apply_signs_2(uint8x16_t * b, const uint64_t * signs, uint32_t sidx) {
    auto s1 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >> 0) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >> 7) & 127))));
    auto s2 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >>14) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >>21) & 127))));
    b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s1));
    b[1] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[1]), s2));
}

IQK_ALWAYS_INLINE int32x4_t prepare_scales_8(const uint32x4_t& v1) {
    return vreinterpretq_s32_u32(vsliq_n_u32(vdupq_n_u32(1), vshrq_n_u32(v1, 28), 1));
}

struct DequantizerIQ2XXS final : public BaseDequantizer<block_iq2_xxs> {
    DequantizerIQ2XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    IQK_ALWAYS_INLINE float new_block(int i) const { return 0.125f * GGML_FP16_TO_FP32(x[i].d); }

    inline int32x4_t unpack(int i, int j, uint8x16_t * q) const {
        auto data = vld1q_u32_x2((const uint32_t *)(x[i].qs + 16*j));
        prepare_all(data, q);
        return prepare_scales_8(vuzp2q_u32(data.val[0], data.val[1]));
    }

private:

    static inline void prepare2(uint8x16_t * b, const uint32_t * bits, const uint64_t * signs) {
        const uint8_t * idx = (const uint8_t *)bits;
        b[0] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[0]], iq2xxs_grid[idx[1]]});
        b[1] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[2]], iq2xxs_grid[idx[3]]});
        apply_signs_2(b, signs, bits[1]);
    }

    inline static void prepare_all(const uint32x4x2_t& data, uint8x16_t * quants) {
        const uint32_t * q2 = (const uint32_t *)data.val;
        prepare2(quants+0, q2+0, keven_signs);
        prepare2(quants+2, q2+2, keven_signs);
        prepare2(quants+4, q2+4, keven_signs);
        prepare2(quants+6, q2+6, keven_signs);
    }
};

inline int32x4x4_t prepare_4bit_scales16(const uint8_t * sc) {
    auto aux = vld1_u8(sc);
    auto scales_l = vand_u8(aux, vdup_n_u8(0xf));
    auto scales_h = vshr_n_u8(aux, 4);
    auto aux1 = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h));

    auto scales8 = vreinterpretq_s8_u8(vorrq_u8(vshlq_n_u8(aux1, 1), vdupq_n_u8(1)));
    int16x8x2_t scales16 = { vmovl_s8(vget_low_s8(scales8)), vmovl_s8(vget_high_s8(scales8)) };
    return make_wider(scales16);
}

struct DequantizerIQ2XS final : public BaseDequantizer<block_iq2_xs> {
    DequantizerIQ2XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x4_t new_block(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        prepare_internal(i, 0);
        return prepare_4bit_scales16(x[i].scales);
    }

    inline void prepare(int i, int j) {
        if (j == 1) prepare_internal(i, 1);
    }

private:

    static void make2(const uint16_t * qs, uint8x16_t * b) {
        auto v1 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[0] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[1] & 511))));
        auto v2 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[2] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[3] & 511))));
        auto s1 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[0] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[1] >> 9))));
        auto s2 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[2] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[3] >> 9))));
        b[0] = vreinterpretq_u8_s8(vmulq_s8(v1, s1));
        b[1] = vreinterpretq_u8_s8(vmulq_s8(v2, s2));
    }

    inline static void make4(const uint16_t * qs, uint8x16_t * b) {
        make2(qs + 0, b + 0);
        make2(qs + 4, b + 2);
    }

    IQK_ALWAYS_INLINE void prepare_internal(int i, int j) {
        make4(x[i].qs + 16*j + 0, bits.b1.val);
        make4(x[i].qs + 16*j + 8, bits.b2.val);
    }

};

// So, I hate to include this table, but with the GCC 12.3 compiler
// bundled in the Cosmopolitan tools, loading the unpacked sign bytes
// from this table using the packed 8 sign bits as index is faster than
// using the standard trick of vceqq_u8(vandq_u8(bits, mask), mask) to
// expand the bits to bytes.
static const uint64_t kall_signs[256] = {
    0x0101010101010101, 0x01010101010101ff, 0x010101010101ff01, 0x010101010101ffff,
    0x0101010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0x0101010101ffffff,
    0x01010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0x01010101ff01ffff,
    0x01010101ffff0101, 0x01010101ffff01ff, 0x01010101ffffff01, 0x01010101ffffffff,
    0x010101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0x010101ff0101ffff,
    0x010101ff01ff0101, 0x010101ff01ff01ff, 0x010101ff01ffff01, 0x010101ff01ffffff,
    0x010101ffff010101, 0x010101ffff0101ff, 0x010101ffff01ff01, 0x010101ffff01ffff,
    0x010101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0x010101ffffffffff,
    0x0101ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0x0101ff010101ffff,
    0x0101ff0101ff0101, 0x0101ff0101ff01ff, 0x0101ff0101ffff01, 0x0101ff0101ffffff,
    0x0101ff01ff010101, 0x0101ff01ff0101ff, 0x0101ff01ff01ff01, 0x0101ff01ff01ffff,
    0x0101ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0x0101ff01ffffffff,
    0x0101ffff01010101, 0x0101ffff010101ff, 0x0101ffff0101ff01, 0x0101ffff0101ffff,
    0x0101ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0x0101ffff01ffffff,
    0x0101ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0x0101ffffff01ffff,
    0x0101ffffffff0101, 0x0101ffffffff01ff, 0x0101ffffffffff01, 0x0101ffffffffffff,
    0x01ff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0x01ff01010101ffff,
    0x01ff010101ff0101, 0x01ff010101ff01ff, 0x01ff010101ffff01, 0x01ff010101ffffff,
    0x01ff0101ff010101, 0x01ff0101ff0101ff, 0x01ff0101ff01ff01, 0x01ff0101ff01ffff,
    0x01ff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0x01ff0101ffffffff,
    0x01ff01ff01010101, 0x01ff01ff010101ff, 0x01ff01ff0101ff01, 0x01ff01ff0101ffff,
    0x01ff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0x01ff01ff01ffffff,
    0x01ff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0x01ff01ffff01ffff,
    0x01ff01ffffff0101, 0x01ff01ffffff01ff, 0x01ff01ffffffff01, 0x01ff01ffffffffff,
    0x01ffff0101010101, 0x01ffff01010101ff, 0x01ffff010101ff01, 0x01ffff010101ffff,
    0x01ffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0x01ffff0101ffffff,
    0x01ffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0x01ffff01ff01ffff,
    0x01ffff01ffff0101, 0x01ffff01ffff01ff, 0x01ffff01ffffff01, 0x01ffff01ffffffff,
    0x01ffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0x01ffffff0101ffff,
    0x01ffffff01ff0101, 0x01ffffff01ff01ff, 0x01ffffff01ffff01, 0x01ffffff01ffffff,
    0x01ffffffff010101, 0x01ffffffff0101ff, 0x01ffffffff01ff01, 0x01ffffffff01ffff,
    0x01ffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0x01ffffffffffffff,
    0xff01010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0xff0101010101ffff,
    0xff01010101ff0101, 0xff01010101ff01ff, 0xff01010101ffff01, 0xff01010101ffffff,
    0xff010101ff010101, 0xff010101ff0101ff, 0xff010101ff01ff01, 0xff010101ff01ffff,
    0xff010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0xff010101ffffffff,
    0xff0101ff01010101, 0xff0101ff010101ff, 0xff0101ff0101ff01, 0xff0101ff0101ffff,
    0xff0101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0xff0101ff01ffffff,
    0xff0101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0xff0101ffff01ffff,
    0xff0101ffffff0101, 0xff0101ffffff01ff, 0xff0101ffffffff01, 0xff0101ffffffffff,
    0xff01ff0101010101, 0xff01ff01010101ff, 0xff01ff010101ff01, 0xff01ff010101ffff,
    0xff01ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0xff01ff0101ffffff,
    0xff01ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0xff01ff01ff01ffff,
    0xff01ff01ffff0101, 0xff01ff01ffff01ff, 0xff01ff01ffffff01, 0xff01ff01ffffffff,
    0xff01ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0xff01ffff0101ffff,
    0xff01ffff01ff0101, 0xff01ffff01ff01ff, 0xff01ffff01ffff01, 0xff01ffff01ffffff,
    0xff01ffffff010101, 0xff01ffffff0101ff, 0xff01ffffff01ff01, 0xff01ffffff01ffff,
    0xff01ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0xff01ffffffffffff,
    0xffff010101010101, 0xffff0101010101ff, 0xffff01010101ff01, 0xffff01010101ffff,
    0xffff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0xffff010101ffffff,
    0xffff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0xffff0101ff01ffff,
    0xffff0101ffff0101, 0xffff0101ffff01ff, 0xffff0101ffffff01, 0xffff0101ffffffff,
    0xffff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0xffff01ff0101ffff,
    0xffff01ff01ff0101, 0xffff01ff01ff01ff, 0xffff01ff01ffff01, 0xffff01ff01ffffff,
    0xffff01ffff010101, 0xffff01ffff0101ff, 0xffff01ffff01ff01, 0xffff01ffff01ffff,
    0xffff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0xffff01ffffffffff,
    0xffffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0xffffff010101ffff,
    0xffffff0101ff0101, 0xffffff0101ff01ff, 0xffffff0101ffff01, 0xffffff0101ffffff,
    0xffffff01ff010101, 0xffffff01ff0101ff, 0xffffff01ff01ff01, 0xffffff01ff01ffff,
    0xffffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0xffffff01ffffffff,
    0xffffffff01010101, 0xffffffff010101ff, 0xffffffff0101ff01, 0xffffffff0101ffff,
    0xffffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0xffffffff01ffffff,
    0xffffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0xffffffffff01ffff,
    0xffffffffffff0101, 0xffffffffffff01ff, 0xffffffffffffff01, 0xffffffffffffffff,
};

struct SignHelper {

    IQK_ALWAYS_INLINE void apply_signs_1x(uint8x16_t * b, const uint8_t * sign_bits) const {
        auto s = vreinterpretq_s8_u64(uint64x2_t{kall_signs[sign_bits[0]], kall_signs[sign_bits[1]]});
        // Normally we would expect this to be faster, but it isn't.
        // auto aux = vcombine_u8(vdup_n_u8(sign_bits[0]), vdup_n_u8(sign_bits[1]));
        // auto s = vreinterpretq_s8_u8(vorrq_u8(vceqq_u8(vandq_u8(aux, smask), smask), m1));
        b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s));
    }

    // We would need these two if we weren't loading from the unpacked sign table.
    //const uint8x16_t smask = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201));
    //const uint8x16_t m1    = vdupq_n_u8(1);
};

struct DequantizerIQ2S final : public BaseDequantizer<block_iq2_s> {
    DequantizerIQ2S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x4_t new_block(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        prepare_internal(i, 0, bits);
        return prepare_4bit_scales16(x[i].scales);
    }

    inline void prepare(int i, int j) {
        if (j == 1) prepare_internal(i, 1, bits);
    }

private:

    static void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh, uint8x16_t * b) {
        uint32_t aux32[2];
        const uint16_t * aux16 = (const uint16_t *)aux32;
        for (int k = 0; k < 2; ++k) {
            aux32[1] = (qh[k] << 4) | (qh[k] << 18);
            aux32[0] = (aux32[1] << 4) & 0x03000300;
            aux32[1] &= 0x03000300;
            b[2*k+0] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+0] | aux16[0]))),
                                   vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+1] | aux16[1]))));
            b[2*k+1] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+2] | aux16[2]))),
                                   vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+3] | aux16[3]))));
            sh.apply_signs_1x(b+2*k+0, sign_bits); sign_bits += 2;
            sh.apply_signs_1x(b+2*k+1, sign_bits); sign_bits += 2;
        }
    }

    void prepare_internal(int i, int j, SimpleBits& sb) {

        const auto * qs = x[i].qs + 16*j;
        const auto * qh = x[i].qh + 4*j;
        const auto * sign_bits = qs + QK_K/8;

        make4(sh, sign_bits+0, qs+0, qh+0, sb.b1.val);
        make4(sh, sign_bits+8, qs+8, qh+2, sb.b2.val);
    }

    SignHelper sh;
};

struct DequantizerIQ3XXS final : public BaseDequantizer<block_iq3_xxs> {
    DequantizerIQ3XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    IQK_ALWAYS_INLINE float new_block(int i) const { return 0.25f * GGML_FP16_TO_FP32(x[i].d); }

    inline int32x4_t unpack(int i, int j, uint8x16_t * q) const {
        auto q3data = vld1q_u8_x2(x[i].qs + 32*j);
        auto gas = vld1q_u32((const uint32_t *)(x[i].qs + QK_K/4 + 16*j));
        prepare_block((const uint8_t *)q3data.val, (const uint32_t *)&gas, q);
        return prepare_scales_8(gas);
    }

private:

    inline static void make2(const uint8_t * q3, const uint32_t sidx, uint8x16_t * b) {
        b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[0]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[3]]});
        b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[4]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[7]]});
        apply_signs_2(b, keven_signs, sidx);
    }
    inline static void prepare_block(const uint8_t * q3, const uint32_t * signs, uint8x16_t * quants) {
        make2(q3+ 0, signs[0], quants + 0);
        make2(q3+ 8, signs[1], quants + 2);
        make2(q3+16, signs[2], quants + 4);
        make2(q3+24, signs[3], quants + 6);
    }
};

struct DequantizerIQ3S final : public BaseDequantizer<block_iq3_s> {
    DequantizerIQ3S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x2_t new_block(int i) {
        d = GGML_FP16_TO_FP32(x[i].d);
        uint32_t scales32[2];
        auto qs = vld1q_u8_x2(x[i].qs);
        auto signs = vld1q_u8(x[i].signs);

        prepare_block((const uint8_t *)qs.val, x[i].qh, (const uint8_t *)&signs);

        std::memcpy(scales32, x[i].scales, 4);
        scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
        scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
        auto scales8 = vld1_u8((const uint8_t *)scales32); // 0, 2, 4, 6, 1, 3, 5, 7
        scales8 = vtbl1_u8(scales8, vreinterpret_u8_u64(vdup_n_u64(0x0703060205010400)));
        auto scales16 = vreinterpretq_s16_u16(vmovl_u8(scales8));
        int32x4x2_t scales;
        scales.val[0] = vmovl_s16(vget_low_s16(scales16));
        scales.val[1] = vmovl_s16(vget_high_s16(scales16));
        return scales;
    }

    inline void prepare(int i, int j) {
        if (j == 1) {
            auto qs = vld1q_u8_x2(x[i].qs + 32);
            auto signs = vld1q_u8(x[i].signs + 16);
            prepare_block((const uint8_t *)qs.val, x[i].qh + 4, (const uint8_t *)&signs);
        }
    }

private:

    static inline void make2(const SignHelper& sh, const uint8_t * sign_bits, const uint16x8_t& idx_l, uint8_t qh,
            const int16x8_t& hshift, uint8x16_t * b) {
        auto vindex = vorrq_u16(idx_l, vandq_u16(vshlq_u16(vdupq_n_u16(qh), hshift), vdupq_n_u16(256)));
        const uint16_t * idx = (const uint16_t *)&vindex;
        b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[0]], iq3s_grid[idx[1]], iq3s_grid[idx[2]], iq3s_grid[idx[3]]});
        sh.apply_signs_1x(b+0, sign_bits+0);
        b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[4]], iq3s_grid[idx[5]], iq3s_grid[idx[6]], iq3s_grid[idx[7]]});
        sh.apply_signs_1x(b+1, sign_bits+2);
    }
    static inline void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh,
            const int16x8_t& hshift, uint8x16_t * b) {
        auto idx_l = vld1q_u8(qs);
        make2(sh, sign_bits+0, vmovl_u8(vget_low_u8 (idx_l)), qh[0], hshift, b+0);
        make2(sh, sign_bits+4, vmovl_u8(vget_high_u8(idx_l)), qh[1], hshift, b+2);
    }

    static int16x8_t load_shift() {
        static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
        return vld1q_s16(k_shift);
    }

    inline void prepare_block(const uint8_t * qs, const uint8_t * qh, const uint8_t * sign_bits) {
        auto signs = vld1q_u8(sign_bits);
        auto s = (const uint8_t *)&signs;
        make4(sh, s + 0, qs+ 0, qh+0, hshift, bits.b1.val);
        make4(sh, s + 8, qs+16, qh+2, hshift, bits.b2.val);
    }

    SignHelper sh;
    const int16x8_t hshift = load_shift();

};

template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_IQXXS(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);
    Dequantizer deq(vx, bx, nrc_y);
    uint8x16_t  qx[8];
    int32x4_t   sumi[nrc_y];
    float32x4_t acc[nrc_y];

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        for (int i = 0; i < nb; ++i) {
            float d = deq.new_block(i);
            auto scales = deq.unpack(i, 0, qx);
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                sumi[iy] = vdupq_n_s32(0);
                compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 0, sumi[iy]);
            }
            scales = deq.unpack(i, 1, qx);
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 1, sumi[iy]);
                acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*q8.scale(iy, i)), vcvtq_f32_s32(sumi[iy]));
            }
        }
#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

// =========================================== Legacy quants

template <typename Block>
inline float16x4_t load_scales_q0(const Block * x, ggml_half * aux) {
    for (int k = 0; k < 4; ++k) aux[k] = x[k].d;
    return vld1_f16((const float16_t *)aux);
}

template <typename Block>
inline float16x8_t load_scales_q1(const Block * x, ggml_half * aux) {
    if constexpr (std::is_same_v<Block, block_q8_1>) {
        for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].s; }
    } else {
        for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].m; }
    }
    return vld1q_f16((const float16_t *)aux);
}

struct Q4LegacyBits {
    template <typename Block>
    inline void prepare(const Block * x) {
        for (int i = 0; i < 4; ++i) {
            auto q4bits = vld1q_u8(x[i].qs);
            b[2*i+0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b));
            b[2*i+1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4));
        }
    }
    inline void prepare1(const uint8_t * qs, int8x16_t * q) const {
        auto q4bits = vld1q_u8(qs);
        q[0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b));
        q[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4));
    }
    inline void prepare1(const uint8_t * qs) {
        prepare1(qs, b);
    }
    const uint8x16_t m4b = vdupq_n_u8(0xf);
    int8x16_t b[8];
};

// One would think this commented out version would do better than the one below
// because it offers more opportunities to execute instructions in parallel.
// Instead, it runs significantly slower. Why? If the compiler is running out of vector registers
// cannot it just do the sequential version below on its own?
//inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) {
//    const auto q8b_1 = vld1q_s8_x2(qs + 0);
//    auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b_1.val[0]), b[1], q8b_1.val[1]);
//    const auto q8b_2 = vld1q_s8_x2(qs + 32);
//    auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b_2.val[0]), b[3], q8b_2.val[1]);
//    auto p1234 = vpaddq_s32(p12, p34);
//    const auto q8b_3 = vld1q_s8_x2(qs + 64);
//    auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b_3.val[0]), b[5], q8b_3.val[1]);
//    const auto q8b_4 = vld1q_s8_x2(qs + 96);
//    auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b_4.val[0]), b[7], q8b_4.val[1]);
//    return vpaddq_s32(p1234, vpaddq_s32(p56, p78));
//}

inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) {
    auto q8b = vld1q_s8_x2(qs + 0);
    auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b.val[0]), b[1], q8b.val[1]);
    q8b = vld1q_s8_x2(qs + 32);
    auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b.val[0]), b[3], q8b.val[1]);
    auto p1234 = vpaddq_s32(p12, p34);
    q8b = vld1q_s8_x2(qs + 64);
    auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b.val[0]), b[5], q8b.val[1]);
    q8b = vld1q_s8_x2(qs + 96);
    auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b.val[0]), b[7], q8b.val[1]);
    return vpaddq_s32(p1234, vpaddq_s32(p56, p78));
}

typedef struct {
    ggml_half d[4];
    int8_t qs[4*QK8_0];
} block_q8_0_x4;
static_assert(sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0), "wrong q8_0_x4 block size/padding");

template <int nrc> struct Q80 {

    constexpr static int nrc_y = nrc;

    Q80(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_0 *)info.src1_row(iy);
    }

    inline const int8_t * quant_data(int iy, int i) const {
        const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
        return y4->qs;
    }

    inline float16x4_t load_scales(int iy, int i) const {
        const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
        return vld1_f16((const float16_t *)y4->d);
    }

    template <typename Dequantizer>
    inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * /*acc*/) const {
        auto qx_scales = deq.new_block(i);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8_scales = load_scales(iy, i);
            sc16[iy] = vmul_f16(qx_scales, q8_scales);
        }
    }

    template <typename Dequantizer>
    inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const {
        deq.prepare1(i);
        float d = GGML_FP16_TO_FP32(deq.x[i].d);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8b = vld1q_s8_x2(y[iy][i].qs);
            auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]);
            acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p));
        }
    }

    const block_q8_0 * y[nrc_y];
};

typedef struct {
    ggml_half d[8];
    int8_t qs[4*QK8_1];
} block_q8_1_x4;
static_assert(sizeof(block_q8_1_x4) == 4*sizeof(block_q8_1), "wrong q8_1_x4 block size/padding");

template <int nrc> struct Q81 {

    constexpr static int nrc_y = nrc;

    Q81(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_1 *)info.src1_row(iy);
    }

    inline const int8_t * quant_data(int iy, int i) const {
        const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
        return y4->qs;
    }

    inline float16x8_t load_scales(int iy, int i) const {
        const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
        return vld1q_f16((const float16_t *)y4->d);
    }

    template <typename Dequantizer>
    inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * acc) const {
        auto qx_scales = deq.new_block(i);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8_scales = load_scales(iy, i);
            auto m = vmul_f16(vget_high_f16(qx_scales), vget_high_f16(q8_scales));
            acc[iy] = vaddq_f32(acc[iy], vcvt_f32_f16(m));
            sc16[iy] = vmul_f16(vget_low_f16(qx_scales), vget_low_f16(q8_scales));
        }
    }

    template <typename Dequantizer>
    inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const {
        deq.prepare1(i);
        float d = GGML_FP16_TO_FP32(deq.x[i].d), m = 0.25f*GGML_FP16_TO_FP32(deq.x[i].m);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8b = vld1q_s8_x2(y[iy][i].qs);
            auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]);
            acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p));
            acc[iy] = vaddq_f32(acc[iy], vdupq_n_f32(m*GGML_FP16_TO_FP32(y[iy][i].s)));
        }
    }

    const block_q8_1 * y[nrc_y];
};

template <typename block_q>
struct BaseLegacyDequantizer {

    BaseLegacyDequantizer(const void * vx, size_t bx) : vx(vx), x(nullptr), bx(bx) {}

    inline void new_row(int ix) { x = (const block_q *)((const char *)vx + bx*ix); }

    Q4LegacyBits bits;

    const void * vx;
    const block_q * x;
    size_t bx;
};

struct DequantizerQ40 final : public BaseLegacyDequantizer<block_q4_0> {

    DequantizerQ40(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        q[0] = vaddq_s8(q[0], m8);
        q[1] = vaddq_s8(q[1], m8);
    }
    inline void prepare1(int i) {
        prepare1(i, bits.b);
    }

    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vld1_f16((const float16_t *)aux);
    }

    const int8x16_t m8 = vdupq_n_s8(-8);
    //ggml_half aux[4];
};

struct DequantizerQ41 : public BaseLegacyDequantizer<block_q4_1> {

    DequantizerQ41(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i) {
        bits.prepare1(x[i].qs);
    }

    inline float16x8_t new_block(int i) {
        uint32_t aux32[4];
        const uint32_t * s32 = (const uint32_t *)&x[4*i].d;
        for (int k = 0; k < 4; ++k) {
            aux32[k] = *s32; s32 += sizeof(block_q4_1)/4;
            bits.prepare1(x[4*i+k].qs, bits.b + 2*k);
        }
        return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle)));
    }
    // Leaving this commented out attempt to be reminded that I already tried this.
    // It has basically the same performance as the version above.
    //inline float16x8_t new_block(int i) {
    //    uint32x4_t scales = {};
    //    const block_q4_1 * xi = x + 4*i;
    //    const uint32_t * s32 = (const uint32_t *)&xi->d;
    //    scales = vsetq_lane_u32(*s32, scales, 0); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[0].qs, bits.b + 0);
    //    scales = vsetq_lane_u32(*s32, scales, 1); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[1].qs, bits.b + 2);
    //    scales = vsetq_lane_u32(*s32, scales, 2); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[2].qs, bits.b + 4);
    //    scales = vsetq_lane_u32(*s32, scales, 3);
    //    bits.prepare1(xi[3].qs, bits.b + 6);
    //    return vreinterpretq_f16_u8(vqtbl1q_u8(vreinterpretq_u8_u32(scales), vreinterpretq_u8_u64(shuffle)));
    //}

    const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302};
};

struct HighBit5Legacy {
    inline uint8x16_t to_bytes(const uint8_t * qh) const {
        uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle);
        return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vreinterpretq_u8_u64(mask));
    }
    inline uint8x16_t to_negated_bytes(const uint8_t * qh) const {
        uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle);
        return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vdupq_n_u8(0));
    }
    const uint64x2_t mask = vdupq_n_u64(0x8040201008040201);
    const uint8x16_t shuffle = vcombine_u8(vdup_n_u8(0), vdup_n_u8(1));
};

struct DequantizerQ50 final : public BaseLegacyDequantizer<block_q5_0> {

    DequantizerQ50(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        auto qh = x[i].qh;
        q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_negated_bytes(qh+0))));
        q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_negated_bytes(qh+2))));
    }
    inline void prepare1(int i) {
        prepare1(i, bits.b);
    }

    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vld1_f16((const float16_t *)aux);
    }

    HighBit5Legacy hbits;

    const uint8x16_t mh = vdupq_n_u8(0xf0);

};

struct DequantizerQ80 final : public BaseLegacyDequantizer<block_q8_0> {

    DequantizerQ80(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i) {
        bits.b[0] = vld1q_s8(x[i].qs);
        bits.b[1] = vld1q_s8(x[i].qs+16);
    }

    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            bits.b[2*k+0] = vld1q_s8(x[4*i+k].qs);
            bits.b[2*k+1] = vld1q_s8(x[4*i+k].qs+16);
        }
        return vld1_f16((const float16_t *)aux);
    }

};

struct DequantizerQ51 final : public BaseLegacyDequantizer<block_q5_1> {

    DequantizerQ51(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        auto qh = x[i].qh;
        q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_bytes(qh+0))));
        q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_bytes(qh+2))));
    }
    inline void prepare1(int i) {
        bits.prepare1(x[i].qs, bits.b);
    }

    inline float16x8_t new_block(int i) {
        uint32_t aux32[4];
        const uint32_t * s32 = (const uint32_t *)&x[4*i].d;
        for (int k = 0; k < 4; ++k) {
            aux32[k] = *s32; s32 += sizeof(block_q5_1)/4;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle)));
    }

    HighBit5Legacy hbits;

    const uint8x16_t mh = vdupq_n_u8(0x10);
    const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302};

};

template <typename Dequantizer, typename Q8>
inline void sum_4(int i, Dequantizer& deq, const Q8& q8, const float16x4_t * sc16, float32x4_t * acc) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto pall = sum_4_blocks(deq.bits.b, q8.quant_data(iy, i));
        auto scale = vcvt_f32_f16(sc16[iy]);
        acc[iy] = vmlaq_f32(acc[iy], scale, vcvtq_f32_s32(pall));
    }
}

template <typename Dequantizer, typename Q8>
inline void mul_mat_qX_Y_q8_Y(int n, Dequantizer& deq, Q8& q8, const DataInfo& info, int nrc_x) {
    const int nb = n / QK4_1;

    float16x4_t sc16[Q8::nrc_y];

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[Q8::nrc_y];
        for (int iy = 0; iy < Q8::nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        for (int i = 0; i < nb/4; ++i) {
            q8.process_scales(i, deq, sc16, acc);
            sum_4(i, deq, q8, sc16, acc);
        }
        for (int i = 4*(nb/4); i < nb; ++i) {
            q8.process_1_block(i, deq, acc);
        }

        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

template <typename Dequantizer, typename Q8>
inline void mul_mat_qX_Y_q8_Y_1(int n, Dequantizer& deq1, Dequantizer& deq2, Q8& q8, const DataInfo& info, int nrc_x) {
    const int nb = n / QK4_1;

    float16x4_t sc16[2];

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq1.new_row(ix);
        deq2.new_row(ix);

        float32x4_t acc[2] = { vdupq_n_f32(0.f), vdupq_n_f32(0.f) };

        for (int i = 0; i < nb/8; ++i) {
            q8.process_scales(2*i+0, deq1, sc16+0, acc+0);
            q8.process_scales(2*i+1, deq2, sc16+1, acc+1);
            sum_4(2*i+0, deq1, q8, sc16+0, acc+0);
            sum_4(2*i+1, deq2, q8, sc16+1, acc+1);
        }
        for (int i = 2*(nb/8); i < nb/4; ++i) {
            q8.process_scales(i, deq1, sc16, acc);
            sum_4(i, deq1, q8, sc16, acc);
        }
        for (int i = 4*(nb/4); i < nb; ++i) {
            q8.process_1_block(i, deq1, acc);
        }

        info.store(ix, 0, vaddvq_f32(vaddq_f32(acc[0], acc[1])));
    }
}

template <typename Dequantizer, int nrc_y>
static void IQK_NOINLINE mul_mat_qX_1_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Q81<nrc_y> q8(info);
    if constexpr (nrc_y == 1) {
        Dequantizer deq1(vx, bx), deq2(vx, bx);
        mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
    } else {
        Dequantizer deq(vx, bx);
        mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x);
    }
}

template <typename Dequantizer, int nrc_y>
static void IQK_NOINLINE mul_mat_qX_0_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Q80<nrc_y> q8(info);
    if constexpr (nrc_y == 1) {
        Dequantizer deq1(vx, bx), deq2(vx, bx);
        mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
    } else {
        Dequantizer deq(vx, bx);
        mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x);
    }
}

template <typename Dequantizer>
static void IQK_NOINLINE mul_mat_qX_1_q8_1_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Dequantizer deq1(vx, bx), deq2(vx, bx);
    Q81<1> q8(info);
    mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
}

template <typename Dequantizer>
static void IQK_NOINLINE mul_mat_qX_0_q8_0_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Dequantizer deq1(vx, bx), deq2(vx, bx);
    Q80<1> q8(info);
    mul_mat_qX_Y_q8_Y(n, deq1, deq2, q8, info, nrc_x);
}

template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
    if constexpr (std::is_same_v<Dequantizer, DequantizerQ40> || std::is_same_v<Dequantizer, DequantizerQ50> ||
                  std::is_same_v<Dequantizer, DequantizerQ80>) {
        m.funcs[0] = mul_mat_qX_0_q8_0<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_0_q8_0<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_0_q8_0<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_0_q8_0<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_0_q8_0<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_0_q8_0<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_0_q8_0<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_0_q8_0<Dequantizer, 8>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerQ41> || std::is_same_v<Dequantizer, DequantizerQ51>) {
        m.funcs[0] = mul_mat_qX_1_q8_1<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_1_q8_1<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_1_q8_1<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_1_q8_1<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_1_q8_1<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_1_q8_1<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_1_q8_1<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_1_q8_1<Dequantizer, 8>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2XXS> || std::is_same_v<Dequantizer, DequantizerIQ3XXS>) {
        m.funcs[0] = mul_mat_qX_K_q8_K_IQXXS<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_IQXXS<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_IQXXS<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_IQXXS<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_IQXXS<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_IQXXS<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_IQXXS<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_IQXXS<8, Dequantizer>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2S> ||
                       std::is_same_v<Dequantizer, DequantizerIQ3S> ||
                       std::is_same_v<Dequantizer, DequantizerIQ2XS>) {
        m.funcs[0] = mul_mat_qX_K_q8_K_IQ<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_IQ<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_IQ<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_IQ<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_IQ<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_IQ<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_IQ<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_IQ<8, Dequantizer>;
    }
    else {
        m.funcs[0] = mul_mat_qX_K_q8_K_T<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_T<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_T<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_T<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_T<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_T<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_T<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_T<8, Dequantizer>;
        m.funcs_v2 = mul_mat_qX_K_q8_K_T_v2<Dequantizer>;
    }
}

bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& m, int& row_size_q8, int Ny) {
    row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);

    (void)Ny;
    // Uncommenting out this would disable iqk_mul_mat for matrix x vector multiplications.
    //if (Ny == 1 && (typeA == GGML_TYPE_IQ2_XXS || typeA == GGML_TYPE_IQ2_XS || typeA == GGML_TYPE_IQ2_S ||
    //                typeA == GGML_TYPE_IQ3_XXS || typeA == GGML_TYPE_IQ3_S)) return false;

    switch (typeA) {
        case GGML_TYPE_Q2_K:
            MulMat::set_functions<DequantizerQ2K>(m);
            break;
        case GGML_TYPE_Q3_K:
            MulMat::set_functions<DequantizerQ3K>(m);
            break;
        case GGML_TYPE_Q4_K:
            MulMat::set_functions<DequantizerQ4K>(m);
            break;
        case GGML_TYPE_Q5_K:
            MulMat::set_functions<DequantizerQ5K>(m);
            break;
        case GGML_TYPE_Q6_K:
            MulMat::set_functions<DequantizerQ6K>(m);
            break;
        case GGML_TYPE_IQ4_XS:
            MulMat::set_functions<DequantizerIQ4XS>(m);
            break;
        case GGML_TYPE_IQ3_S:
            MulMat::set_functions<DequantizerIQ3S>(m);
            break;
        case GGML_TYPE_IQ3_XXS:
            MulMat::set_functions<DequantizerIQ3XXS>(m);
            break;
        case GGML_TYPE_IQ2_S:
            MulMat::set_functions<DequantizerIQ2S>(m);
            break;
        case GGML_TYPE_IQ2_XS:
            MulMat::set_functions<DequantizerIQ2XS>(m);
            break;
        case GGML_TYPE_IQ2_XXS:
            MulMat::set_functions<DequantizerIQ2XXS>(m);
            break;
        case GGML_TYPE_Q4_0:
            MulMat::set_functions<DequantizerQ40>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q4_1:
            MulMat::set_functions<DequantizerQ41>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;
        case GGML_TYPE_Q5_0:
            MulMat::set_functions<DequantizerQ50>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q5_1:
            MulMat::set_functions<DequantizerQ51>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;
        case GGML_TYPE_Q8_0:
            MulMat::set_functions<DequantizerQ80>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        default:
            return false;
    }
    return true;
}

}

#endif // __x86_64__ or __aarch64__


================================================
FILE: archive/third_party/llamafile/iqk_mul_mat_arm82.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_arm82.cpp
// Copyrigth 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#ifdef __aarch64__
#define iqk_mul_mat iqk_mul_mat_arm82
#define iqk_mul_mat_moe iqk_mul_mat_moe_arm82
#include "iqk_mul_mat.inc"
#endif  // __aarch64__


================================================
FILE: archive/third_party/llamafile/iqk_mul_mat_x86.inc
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat.inc
// Copyrigth 2024 Iwan Kawrakow - Apache 2.0 Licens
// with additions from
// https://github.com/ikawrakow/ik_llama.cpp/blob/main/ggml/src/iqk/iqk_mul_mat.cpp
// Copyrigth 2024-2025 Iwan Kawrakow - MIT Licens
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp fenc=utf-8 :vi
//
// Copyright 2024 Iwan Kawrakow
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//
// Copyright (C) 2024-2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//

#include <cstring>
#include <type_traits>
#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64)

#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "sgemm.h"

// For i-quants, I had to explicitely specify which
// functions to inline / not inline (at least for some
// of the functions), else performance would be significantly
// lower. This is worrysome as things can change with,
// e.g., a different compiler version or running on a different
// CPU.
#ifdef _MSC_VER
#define IQK_NOINLINE __declspec(noinline)
#define IQK_ALWAYS_INLINE inline
#else
#define IQK_NOINLINE __attribute__((__noinline__))
#define IQK_ALWAYS_INLINE __attribute__((always_inline))
#endif

#define GGML_COMMON_IMPL_C
#include "llama.cpp/ggml-common.h"

// clang-format off

// This matrix - vector and matrix - matrix multiplication implementation
// for legacy quants, k-quants and i-quants makes prompt processing 150-200%
// (legacy and k-quants) or 250-400% (i-quants) faster.
// compared to mainline llama.cpp (and llamafile).
// It provides implementations for ARM_NEON (all quants) and AVX2
// (all quants except sub-4 bit i-quants).
//
// Main idea is that unpacking the quants and the block scales to
// be ready for dot products with the corresponding Q8_Y quants
// takes time (here 'Y' stands for K, 0, or 1, depending on quantization type).
// Hence, if we are performing a QX x Q8_Y matrix matrix
// multiplication (as needed for prompt processing), we can get
// a significant speedup by reusing the unpacked QX quants and scales
// for multiplication with several Q8_K columns. We also achieve fewer
// loads from memory, which is the main purpose of tiling in general
// purpose matrix multiplication packages.

#include <utility>
#include <array>

#endif

constexpr ggml_type GGML_TYPE_Q8_0_X4 = static_cast<ggml_type>(98);
constexpr ggml_type GGML_TYPE_Q8_1_X4 = static_cast<ggml_type>(99);


namespace {

typedef struct {
    int32_t i1;
    int32_t i2;
} mmid_row_mapping;

struct DataInfo {
    float       * s;
    const char  * cy;
    size_t        bs;
    size_t        by;
    int           cur_y = 0;
    int           ne11;
    const mmid_row_mapping * row_mapping = nullptr;
    size_t        bs2 = 0;

    inline const char * src1_row(int iy) const {
        if (!row_mapping) return cy + (cur_y + iy)*by;
        int i11 = row_mapping[cur_y + iy].i1 % ne11;
        int i12 = row_mapping[cur_y + iy].i2;
        return cy + (i11 + i12*ne11)*by;
    }

    inline void store(int ix, int iy, float result) const {
        *(dst_row(iy) + ix) = result;
        //dst_row(iy)[ix] = result;
    }
    inline float * dst_row(int iy) const {
        if (!row_mapping) return s + (cur_y + iy)*bs;
        int i12 = row_mapping[cur_y + iy].i2;
        int i1  = row_mapping[cur_y + iy].i1;
        int i2  = i12;
        return s + i1*bs + i2*bs2;
    }
};

/*
moonll 
change param for set_mul_mat 
add func16
*/

typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x);

struct MulMat {
    std::array<mul_mat_t, 8> funcs = {};
    mul_mat_t func16 = nullptr;
    //inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
    IQK_NOINLINE void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
        constexpr int k_x_step = 64; // This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile size are small)

        // copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L162
        // MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
        if (func16 && nrc_y >= 16) {
            int n_step = (nrc_y - info.cur_y)/16;
            for (int ix = 0; ix < nrc_x; ix += k_x_step) {
                auto this_info = info;
                this_info.s += ix;
                int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
                for (int iy = 0; iy < n_step; ++iy) {
                    func16(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
                    this_info.cur_y += 16;
                }
            }
            info.cur_y += 16 * n_step;
            if (info.cur_y == nrc_y) return;
        }
        // end copy

        int n_step = (nrc_y - info.cur_y)/funcs.size();
        if (n_step > 0) {
            for (int ix = 0; ix < nrc_x; ix += k_x_step) {
                auto this_info = info;
                this_info.s += ix;
                int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
                for (int iy = 0; iy < n_step; ++iy) {
                    funcs.back()(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
                    this_info.cur_y += funcs.size();
                }
            }
            info.cur_y += funcs.size() * n_step;
        }
        int n_left = nrc_y - info.cur_y;
        if (n_left > 0) {
            funcs[n_left-1](n, vx, bx, info, nrc_x);
        }
    }
    static IQK_NOINLINE bool set_mul_mat(int typeA, int typeB,int ne00, MulMat& mm, int Ny);
private:
    template <typename Dequantizer> static IQK_NOINLINE void set_functions(MulMat& m);
};

inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) {
    const uint16_t * scales = (const uint16_t *)scales8;
    const uint32_t a0 = scales[0] | (scales[1] << 16);
    const uint32_t a1 = scales[2] | (scales[3] << 16);
    const uint32_t a2 = scales[4] | (scales[5] << 16);
    aux32[3] = ((a2 >> 4) & 0x0f0f0f0f) | ((a1 >> 2) & 0x30303030);
    aux32[1] = ((a2 >> 0) & 0x0f0f0f0f) | ((a0 >> 2) & 0x30303030);
    aux32[2] = a1 & 0x3f3f3f3f;
    aux32[0] = a0 & 0x3f3f3f3f;
}

/*
moonll
decoding tables
*/
// copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L570
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
#ifdef __AVX2__
static const uint64_t iq1s_grid_us[2048] = {
    0x0000000000000000, 0x0000000000000002, 0x0000000000000101, 0x0000000000000200,
    0x0000000000000202, 0x0000000000010001, 0x0000000000010101, 0x0000000000020000,
    0x0000000000020002, 0x0000000000020200, 0x0000000000020202, 0x0000000001000101,
    0x0000000001010001, 0x0000000001010100, 0x0000000001010102, 0x0000000001020101,
    0x0000000002000000, 0x0000000002000002, 0x0000000002000200, 0x0000000002000202,
    0x0000000002010101, 0x0000000002020000, 0x0000000002020002, 0x0000000002020200,
    0x0000000002020202, 0x0000000100000100, 0x0000000100000101, 0x0000000100010001,
    0x0000000100010100, 0x0000000100010102, 0x0000000100010201, 0x0000000100010202,
    0x0000000100020101, 0x0000000101000001, 0x0000000101000102, 0x0000000101000201,
    0x0000000101010002, 0x0000000101010101, 0x0000000101010202, 0x0000000101020001,
    0x0000000101020100, 0x0000000101020102, 0x0000000101020200, 0x0000000102000101,
    0x0000000102010001, 0x0000000102010100, 0x0000000102010102, 0x0000000102020101,
    0x0000000200000000, 0x0000000200000002, 0x0000000200000200, 0x0000000200000202,
    0x0000000200010101, 0x0000000200020000, 0x0000000200020002, 0x0000000200020200,
    0x0000000200020202, 0x0000000201000101, 0x0000000201010001, 0x0000000201010201,
    0x0000000201020100, 0x0000000201020201, 0x0000000202000000, 0x0000000202000002,
    0x0000000202000200, 0x0000000202000202, 0x0000000202010001, 0x0000000202010101,
    0x0000000202010201, 0x0000000202020000, 0x0000000202020002, 0x0000000202020200,
    0x0000000202020202, 0x0000010000010001, 0x0000010000010100, 0x0000010000010102,
    0x0000010000020101, 0x0000010001000001, 0x0000010001000201, 0x0000010001010101,
    0x0000010001010202, 0x0000010001020100, 0x0000010001020101, 0x0000010002010001,
    0x0000010002010201, 0x0000010002020101, 0x0000010100000001, 0x0000010100000100,
    0x0000010100000101, 0x0000010100000102, 0x0000010100010101, 0x0000010100010200,
    0x0000010100010202, 0x0000010100020201, 0x0000010101000000, 0x0000010101000101,
    0x0000010101000202, 0x0000010101010000, 0x0000010101010001, 0x0000010101010100,
    0x0000010101010101, 0x0000010101010102, 0x0000010101010201, 0x0000010101020000,
    0x0000010101020002, 0x0000010101020101, 0x0000010101020200, 0x0000010101020202,
    0x0000010102000001, 0x0000010102010001, 0x0000010102010101, 0x0000010102010200,
    0x0000010102010202, 0x0000010102020001, 0x0000010102020100, 0x0000010102020101,
    0x0000010102020102, 0x0000010102020201, 0x0000010200010100, 0x0000010200010201,
    0x0000010201000001, 0x0000010201000100, 0x0000010201010000, 0x0000010201010002,
    0x0000010201010101, 0x0000010201010200, 0x0000010201020000, 0x0000010201020001,
    0x0000010201020102, 0x0000010201020201, 0x0000010202000101, 0x0000010202010001,
    0x0000010202010100, 0x0000010202010201, 0x0000020000000000, 0x0000020000000002,
    0x0000020000000200, 0x0000020000000202, 0x0000020000010101, 0x0000020000020000,
    0x0000020000020002, 0x0000020000020200, 0x0000020000020202, 0x0000020001000101,
    0x0000020001010001, 0x0000020001010102, 0x0000020001020101, 0x0000020002000000,
    0x0000020002000002, 0x0000020002000200, 0x0000020002000202, 0x0000020002010101,
    0x0000020002020000, 0x0000020002020002, 0x0000020002020200, 0x0000020002020202,
    0x0000020100000101, 0x0000020100010001, 0x0000020100010100, 0x0000020100010201,
    0x0000020100020100, 0x0000020100020101, 0x0000020101000001, 0x0000020101010000,
    0x0000020101010001, 0x0000020101010101, 0x0000020101020001, 0x0000020101020100,
    0x0000020101020201, 0x0000020102010001, 0x0000020102010100, 0x0000020102010102,
    0x0000020102010201, 0x0000020102020101, 0x0000020200000000, 0x0000020200000002,
    0x0000020200000200, 0x0000020200000202, 0x0000020200010101, 0x0000020200020000,
    0x0000020200020002, 0x0000020200020200, 0x0000020200020202, 0x0000020201000101,
    0x0000020201010001, 0x0000020201010201, 0x0000020201020001, 0x0000020201020101,
    0x0000020202000000, 0x0000020202000002, 0x0000020202000101, 0x0000020202000200,
    0x0000020202000202, 0x0000020202010101, 0x0000020202020000, 0x0000020202020002,
    0x0000020202020200, 0x0000020202020202, 0x0001000000010000, 0x0001000000010001,
    0x0001000000010100, 0x0001000000010201, 0x0001000000020100, 0x0001000000020101,
    0x0001000001000001, 0x0001000001000100, 0x0001000001010000, 0x0001000001010101,
    0x0001000001010200, 0x0001000001020001, 0x0001000001020100, 0x0001000001020101,
    0x0001000001020201, 0x0001000002010001, 0x0001000002010100, 0x0001000002010102,
    0x0001000002020001, 0x0001000002020101, 0x0001000100000001, 0x0001000100000100,
    0x0001000100000102, 0x0001000100000201, 0x0001000100010000, 0x0001000100010002,
    0x0001000100010101, 0x0001000100010200, 0x0001000100020001, 0x0001000100020100,
    0x0001000100020201, 0x0001000101000101, 0x0001000101000202, 0x0001000101010000,
    0x0001000101010001, 0x0001000101010002, 0x0001000101010100, 0x0001000101010101,
    0x0001000101010102, 0x0001000101010201, 0x0001000101020000, 0x0001000101020101,
    0x0001000102000100, 0x0001000102010002, 0x0001000102010101, 0x0001000102020001,
    0x0001000102020100, 0x0001000200010001, 0x0001000200010100, 0x0001000200010102,
    0x0001000200020101, 0x0001000201000000, 0x0001000201000102, 0x0001000201000201,
    0x0001000201010002, 0x0001000201010101, 0x0001000201010200, 0x0001000201010202,
    0x0001000201020100, 0x0001000201020102, 0x0001000202000101, 0x0001000202010001,
    0x0001000202010100, 0x0001000202010102, 0x0001000202020101, 0x0001010000000001,
    0x0001010000000102, 0x0001010000000201, 0x0001010000010100, 0x0001010000010101,
    0x0001010000010200, 0x0001010000010201, 0x0001010000020001, 0x0001010000020102,
    0x0001010001000001, 0x0001010001000101, 0x0001010001000102, 0x0001010001000200,
    0x0001010001000202, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101,
    0x0001010001010102, 0x0001010001010201, 0x0001010001020002, 0x0001010001020101,
    0x0001010001020200, 0x0001010002000100, 0x0001010002000201, 0x0001010002010000,
    0x0001010002010100, 0x0001010002010101, 0x0001010002010200, 0x0001010002010201,
    0x0001010002010202, 0x0001010002020001, 0x0001010002020100, 0x0001010002020101,
    0x0001010002020201, 0x0001010100000002, 0x0001010100000101, 0x0001010100000202,
    0x0001010100010001, 0x0001010100010100, 0x0001010100010101, 0x0001010100010102,
    0x0001010100010201, 0x0001010100020000, 0x0001010100020002, 0x0001010100020101,
    0x0001010100020200, 0x0001010100020202, 0x0001010101000001, 0x0001010101000100,
    0x0001010101000101, 0x0001010101000102, 0x0001010101010001, 0x0001010101010002,
    0x0001010101010100, 0x0001010101010101, 0x0001010101010102, 0x0001010101010201,
    0x0001010101010202, 0x0001010101020001, 0x0001010101020100, 0x0001010101020101,
    0x0001010101020102, 0x0001010101020201, 0x0001010102000000, 0x0001010102000002,
    0x0001010102000100, 0x0001010102000101, 0x0001010102000200, 0x0001010102000202,
    0x0001010102010000, 0x0001010102010001, 0x0001010102010100, 0x0001010102010101,
    0x0001010102010102, 0x0001010102010201, 0x0001010102010202, 0x0001010102020000,
    0x0001010102020002, 0x0001010102020101, 0x0001010200000001, 0x0001010200000100,
    0x0001010200000101, 0x0001010200000102, 0x0001010200010101, 0x0001010200010102,
    0x0001010200010200, 0x0001010200010202, 0x0001010200020001, 0x0001010200020102,
    0x0001010201000000, 0x0001010201000002, 0x0001010201000100, 0x0001010201000101,
    0x0001010201000200, 0x0001010201000202, 0x0001010201010001, 0x0001010201010101,
    0x0001010201010102, 0x0001010201010200, 0x0001010201010201, 0x0001010201020001,
    0x0001010201020100, 0x0001010201020101, 0x0001010201020200, 0x0001010201020201,
    0x0001010201020202, 0x0001010202000102, 0x0001010202000202, 0x0001010202010002,
    0x0001010202010101, 0x0001010202020100, 0x0001010202020201, 0x0001020000010001,
    0x0001020000010102, 0x0001020000020101, 0x0001020001000001, 0x0001020001000100,
    0x0001020001000102, 0x0001020001000201, 0x0001020001010000, 0x0001020001010101,
    0x0001020001010200, 0x0001020001010202, 0x0001020001020000, 0x0001020001020001,
    0x0001020001020100, 0x0001020001020102, 0x0001020001020201, 0x0001020002000101,
    0x0001020002010001, 0x0001020002010100, 0x0001020002020101, 0x0001020100010000,
    0x0001020100010002, 0x0001020100010101, 0x0001020100010202, 0x0001020100020001,
    0x0001020100020101, 0x0001020101000002, 0x0001020101000100, 0x0001020101000101,
    0x0001020101000200, 0x0001020101010001, 0x0001020101010100, 0x0001020101010101,
    0x0001020101010102, 0x0001020101010201, 0x0001020101010202, 0x0001020101020000,
    0x0001020101020101, 0x0001020101020202, 0x0001020102000201, 0x0001020102010001,
    0x0001020102010002, 0x0001020102010101, 0x0001020102010200, 0x0001020102020001,
    0x0001020102020102, 0x0001020102020201, 0x0001020200000201, 0x0001020200010102,
    0x0001020200020100, 0x0001020200020102, 0x0001020201000100, 0x0001020201000102,
    0x0001020201000201, 0x0001020201010000, 0x0001020201010002, 0x0001020201010101,
    0x0001020201010200, 0x0001020201020001, 0x0001020201020102, 0x0001020201020201,
    0x0001020202000101, 0x0001020202010001, 0x0001020202010102, 0x0001020202010202,
    0x0002000000000000, 0x0002000000000002, 0x0002000000000200, 0x0002000000000202,
    0x0002000000010101, 0x0002000000020000, 0x0002000000020002, 0x0002000000020101,
    0x0002000000020200, 0x0002000000020202, 0x0002000001000101, 0x0002000001010001,
    0x0002000001010201, 0x0002000001020001, 0x0002000001020101, 0x0002000002000000,
    0x0002000002000002, 0x0002000002000200, 0x0002000002000202, 0x0002000002010101,
    0x0002000002020000, 0x0002000002020002, 0x0002000002020101, 0x0002000002020200,
    0x0002000002020202, 0x0002000100000101, 0x0002000100010001, 0x0002000100010100,
    0x0002000100010201, 0x0002000100020101, 0x0002000101000002, 0x0002000101000100,
    0x0002000101000201, 0x0002000101010101, 0x0002000101010200, 0x0002000101010202,
    0x0002000101020001, 0x0002000101020100, 0x0002000101020101, 0x0002000101020102,
    0x0002000102000101, 0x0002000102010000, 0x0002000102010102, 0x0002000102010201,
    0x0002000102020101, 0x0002000200000001, 0x0002000200000200, 0x0002000200000202,
    0x0002000200010001, 0x0002000200010101, 0x0002000200020000, 0x0002000200020002,
    0x0002000200020200, 0x0002000200020202, 0x0002000201000101, 0x0002000201010001,
    0x0002000201010102, 0x0002000201010201, 0x0002000201020101, 0x0002000202000001,
    0x0002000202000200, 0x0002000202000202, 0x0002000202010001, 0x0002000202010101,
    0x0002000202020000, 0x0002000202020002, 0x0002000202020200, 0x0002000202020202,
    0x0002010000000101, 0x0002010000010100, 0x0002010000010102, 0x0002010000010201,
    0x0002010000020101, 0x0002010001000100, 0x0002010001000101, 0x0002010001000102,
    0x0002010001000201, 0x0002010001010002, 0x0002010001010101, 0x0002010001010200,
    0x0002010001010202, 0x0002010001020102, 0x0002010002000101, 0x0002010002010001,
    0x0002010002010100, 0x0002010002010201, 0x0002010002020001, 0x0002010002020101,
    0x0002010100000201, 0x0002010100010101, 0x0002010100020001, 0x0002010100020201,
    0x0002010101000000, 0x0002010101000101, 0x0002010101000200, 0x0002010101010001,
    0x0002010101010100, 0x0002010101010101, 0x0002010101010201, 0x0002010101020002,
    0x0002010101020101, 0x0002010101020200, 0x0002010102000201, 0x0002010102010000,
    0x0002010102010100, 0x0002010102010101, 0x0002010102010200, 0x0002010102010202,
    0x0002010102020001, 0x0002010102020100, 0x0002010102020102, 0x0002010102020201,
    0x0002010200000101, 0x0002010200010000, 0x0002010200010002, 0x0002010200010201,
    0x0002010200020101, 0x0002010201000001, 0x0002010201000201, 0x0002010201010101,
    0x0002010201020000, 0x0002010201020001, 0x0002010201020201, 0x0002010202000100,
    0x0002010202000102, 0x0002010202010000, 0x0002010202010202, 0x0002020000000000,
    0x0002020000000002, 0x0002020000000200, 0x0002020000000202, 0x0002020000010101,
    0x0002020000020000, 0x0002020000020002, 0x0002020000020200, 0x0002020000020202,
    0x0002020001000101, 0x0002020001010001, 0x0002020001010100, 0x0002020001020101,
    0x0002020002000000, 0x0002020002000002, 0x0002020002000200, 0x0002020002000202,
    0x0002020002020000, 0x0002020002020002, 0x0002020002020200, 0x0002020002020202,
    0x0002020100000201, 0x0002020100010001, 0x0002020100010100, 0x0002020100010201,
    0x0002020100020101, 0x0002020101000102, 0x0002020101000201, 0x0002020101010002,
    0x0002020101010101, 0x0002020101020001, 0x0002020101020100, 0x0002020101020102,
    0x0002020101020201, 0x0002020102000101, 0x0002020102010000, 0x0002020102010102,
    0x0002020102010201, 0x0002020102020100, 0x0002020102020101, 0x0002020200000000,
    0x0002020200000002, 0x0002020200000200, 0x0002020200000202, 0x0002020200020000,
    0x0002020200020002, 0x0002020200020200, 0x0002020200020202, 0x0002020201000101,
    0x0002020201010001, 0x0002020201010102, 0x0002020201010201, 0x0002020201020101,
    0x0002020202000000, 0x0002020202000002, 0x0002020202000200, 0x0002020202000202,
    0x0002020202010101, 0x0002020202020000, 0x0002020202020002, 0x0002020202020200,
    0x0002020202020202, 0x0100000000000101, 0x0100000000010001, 0x0100000000010102,
    0x0100000000020101, 0x0100000001000201, 0x0100000001010002, 0x0100000001010101,
    0x0100000001010200, 0x0100000001010202, 0x0100000001020001, 0x0100000001020100,
    0x0100000001020102, 0x0100000002010100, 0x0100000002010201, 0x0100000002020001,
    0x0100000002020102, 0x0100000100000000, 0x0100000100000001, 0x0100000100000100,
    0x0100000100000102, 0x0100000100000201, 0x0100000100010002, 0x0100000100010101,
    0x0100000100010102, 0x0100000100010200, 0x0100000100010202, 0x0100000100020001,
    0x0100000100020102, 0x0100000100020201, 0x0100000101000101, 0x0100000101000200,
    0x0100000101000202, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101,
    0x0100000101010102, 0x0100000101010201, 0x0100000101010202, 0x0100000101020101,
    0x0100000101020200, 0x0100000101020202, 0x0100000102000001, 0x0100000102000100,
    0x0100000102000102, 0x0100000102010000, 0x0100000102010002, 0x0100000102010101,
    0x0100000102020000, 0x0100000102020001, 0x0100000102020002, 0x0100000200000101,
    0x0100000200010001, 0x0100000200010100, 0x0100000200010102, 0x0100000200020101,
    0x0100000201000001, 0x0100000201010002, 0x0100000201010101, 0x0100000201010202,
    0x0100000201020100, 0x0100000201020201, 0x0100000202000201, 0x0100000202010100,
    0x0100000202020101, 0x0100010000000001, 0x0100010000010101, 0x0100010000010201,
    0x0100010000020201, 0x0100010001000101, 0x0100010001000200, 0x0100010001000202,
    0x0100010001010001, 0x0100010001010100, 0x0100010001010101, 0x0100010001010102,
    0x0100010001020001, 0x0100010001020002, 0x0100010001020101, 0x0100010001020200,
    0x0100010001020202, 0x0100010002000001, 0x0100010002000102, 0x0100010002000201,
    0x0100010002010000, 0x0100010002010002, 0x0100010002010101, 0x0100010002020000,
    0x0100010002020001, 0x0100010002020201, 0x0100010100000001, 0x0100010100000002,
    0x0100010100000101, 0x0100010100000202, 0x0100010100010001, 0x0100010100010100,
    0x0100010100010101, 0x0100010100010102, 0x0100010100010201, 0x0100010100020000,
    0x0100010100020101, 0x0100010100020202, 0x0100010101000001, 0x0100010101000100,
    0x0100010101000101, 0x0100010101000102, 0x0100010101000201, 0x0100010101010000,
    0x0100010101010001, 0x0100010101010100, 0x0100010101010101, 0x0100010101010102,
    0x0100010101010200, 0x0100010101010201, 0x0100010101020001, 0x0100010101020100,
    0x0100010101020101, 0x0100010101020102, 0x0100010101020201, 0x0100010102000002,
    0x0100010102000100, 0x0100010102000101, 0x0100010102000200, 0x0100010102010001,
    0x0100010102010100, 0x0100010102010101, 0x0100010102010102, 0x0100010102010201,
    0x0100010102010202, 0x0100010102020101, 0x0100010102020200, 0x0100010102020202,
    0x0100010200000001, 0x0100010200000101, 0x0100010200000201, 0x0100010200010100,
    0x0100010200010101, 0x0100010200010200, 0x0100010200010202, 0x0100010200020001,
    0x0100010200020100, 0x0100010200020201, 0x0100010201000000, 0x0100010201000002,
    0x0100010201000101, 0x0100010201000200, 0x0100010201010000, 0x0100010201010001,
    0x0100010201010002, 0x0100010201010101, 0x0100010201010102, 0x0100010201010201,
    0x0100010201020002, 0x0100010201020101, 0x0100010201020200, 0x0100010202000001,
    0x0100010202000101, 0x0100010202000202, 0x0100010202010100, 0x0100010202010101,
    0x0100010202020001, 0x0100010202020100, 0x0100010202020102, 0x0100020000000101,
    0x0100020000010001, 0x0100020000010101, 0x0100020000010202, 0x0100020000020101,
    0x0100020001000002, 0x0100020001000201, 0x0100020001010000, 0x0100020001010101,
    0x0100020001010200, 0x0100020001020001, 0x0100020001020100, 0x0100020001020102,
    0x0100020001020201, 0x0100020002000101, 0x0100020002010001, 0x0100020002010100,
    0x0100020002010102, 0x0100020002010201, 0x0100020002020101, 0x0100020100000001,
    0x0100020100000101, 0x0100020100000102, 0x0100020100000202, 0x0100020100010000,
    0x0100020100010100, 0x0100020100010101, 0x0100020100010200, 0x0100020100020001,
    0x0100020100020100, 0x0100020100020102, 0x0100020101000000, 0x0100020101000101,
    0x0100020101000202, 0x0100020101010001, 0x0100020101010002, 0x0100020101010100,
    0x0100020101010101, 0x0100020101010102, 0x0100020101010201, 0x0100020101020000,
    0x0100020101020002, 0x0100020101020101, 0x0100020101020102, 0x0100020101020202,
    0x0100020102000102, 0x0100020102000201, 0x0100020102010002, 0x0100020102010101,
    0x0100020102010102, 0x0100020102010200, 0x0100020102020001, 0x0100020102020100,
    0x0100020102020102, 0x0100020102020201, 0x0100020200010102, 0x0100020201000100,
    0x0100020201000102, 0x0100020201000201, 0x0100020201010101, 0x0100020201010200,
    0x0100020201010202, 0x0100020201020100, 0x0100020201020201, 0x0100020202010100,
    0x0100020202020101, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101,
    0x0101000000000102, 0x0101000000000201, 0x0101000000010002, 0x0101000000010101,
    0x0101000000010202, 0x0101000000020001, 0x0101000000020100, 0x0101000000020201,
    0x0101000001000000, 0x0101000001000101, 0x0101000001000200, 0x0101000001010001,
    0x0101000001010100, 0x0101000001010101, 0x0101000001010102, 0x0101000001010201,
    0x0101000001020101, 0x0101000001020200, 0x0101000002000102, 0x0101000002000201,
    0x0101000002010101, 0x0101000002010200, 0x0101000002020000, 0x0101000002020001,
    0x0101000002020102, 0x0101000002020201, 0x0101000100000101, 0x0101000100000200,
    0x0101000100000201, 0x0101000100000202, 0x0101000100010001, 0x0101000100010100,
    0x0101000100010101, 0x0101000100010102, 0x0101000100010200, 0x0101000100010201,
    0x0101000100020000, 0x0101000100020101, 0x0101000100020102, 0x0101000100020200,
    0x0101000100020202, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101,
    0x0101000101000102, 0x0101000101000201, 0x0101000101010000, 0x0101000101010001,
    0x0101000101010002, 0x0101000101010100, 0x0101000101010101, 0x0101000101010102,
    0x0101000101010200, 0x0101000101010201, 0x0101000101010202, 0x0101000101020001,
    0x0101000101020100, 0x0101000101020101, 0x0101000101020102, 0x0101000101020201,
    0x0101000102000002, 0x0101000102000101, 0x0101000102010001, 0x0101000102010100,
    0x0101000102010101, 0x0101000102010102, 0x0101000102010201, 0x0101000102020000,
    0x0101000102020101, 0x0101000102020202, 0x0101000200000001, 0x0101000200000102,
    0x0101000200010002, 0x0101000200010101, 0x0101000200010202, 0x0101000200020001,
    0x0101000200020100, 0x0101000201000002, 0x0101000201000101, 0x0101000201000202,
    0x0101000201010001, 0x0101000201010100, 0x0101000201010101, 0x0101000201010102,
    0x0101000201010201, 0x0101000201020002, 0x0101000201020101, 0x0101000202000101,
    0x0101000202010000, 0x0101000202010002, 0x0101000202010101, 0x0101000202010201,
    0x0101000202010202, 0x0101000202020100, 0x0101010000000100, 0x0101010000000101,
    0x0101010000010001, 0x0101010000010100, 0x0101010000010101, 0x0101010000010102,
    0x0101010000010200, 0x0101010000010201, 0x0101010000020001, 0x0101010000020101,
    0x0101010000020200, 0x0101010000020202, 0x0101010001000001, 0x0101010001000100,
    0x0101010001000101, 0x0101010001000102, 0x0101010001000201, 0x0101010001000202,
    0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 0x0101010001010101,
    0x0101010001010102, 0x0101010001010200, 0x0101010001010201, 0x0101010001010202,
    0x0101010001020001, 0x0101010001020002, 0x0101010001020100, 0x0101010001020101,
    0x0101010001020102, 0x0101010001020201, 0x0101010002000000, 0x0101010002000200,
    0x0101010002000202, 0x0101010002010001, 0x0101010002010100, 0x0101010002010101,
    0x0101010002010102, 0x0101010002010201, 0x0101010002020001, 0x0101010002020100,
    0x0101010002020101, 0x0101010002020202, 0x0101010100000001, 0x0101010100000002,
    0x0101010100000100, 0x0101010100000101, 0x0101010100000102, 0x0101010100000201,
    0x0101010100010000, 0x0101010100010001, 0x0101010100010002, 0x0101010100010100,
    0x0101010100010101, 0x0101010100010102, 0x0101010100010201, 0x0101010100010202,
    0x0101010100020001, 0x0101010100020100, 0x0101010100020101, 0x0101010100020102,
    0x0101010100020201, 0x0101010101000000, 0x0101010101000001, 0x0101010101000002,
    0x0101010101000100, 0x0101010101000101, 0x0101010101000102, 0x0101010101000200,
    0x0101010101000201, 0x0101010101010000, 0x0101010101010001, 0x0101010101010002,
    0x0101010101010100, 0x0101010101010101, 0x0101010101010102, 0x0101010101010200,
    0x0101010101010201, 0x0101010101010202, 0x0101010101020000, 0x0101010101020001,
    0x0101010101020100, 0x0101010101020101, 0x0101010101020102, 0x0101010101020200,
    0x0101010101020201, 0x0101010101020202, 0x0101010102000001, 0x0101010102000100,
    0x0101010102000101, 0x0101010102000201, 0x0101010102000202, 0x0101010102010000,
    0x0101010102010001, 0x0101010102010100, 0x0101010102010101, 0x0101010102010102,
    0x0101010102010200, 0x0101010102010201, 0x0101010102020001, 0x0101010102020100,
    0x0101010102020101, 0x0101010102020102, 0x0101010102020201, 0x0101010200000000,
    0x0101010200000001, 0x0101010200000002, 0x0101010200000100, 0x0101010200000102,
    0x0101010200000200, 0x0101010200000201, 0x0101010200010001, 0x0101010200010100,
    0x0101010200010101, 0x0101010200010200, 0x0101010200010201, 0x0101010200020000,
    0x0101010200020001, 0x0101010200020002, 0x0101010200020100, 0x0101010200020101,
    0x0101010200020102, 0x0101010200020200, 0x0101010200020201, 0x0101010201000001,
    0x0101010201000101, 0x0101010201000102, 0x0101010201000200, 0x0101010201000201,
    0x0101010201000202, 0x0101010201010000, 0x0101010201010001, 0x0101010201010002,
    0x0101010201010100, 0x0101010201010101, 0x0101010201010102, 0x0101010201010200,
    0x0101010201010201, 0x0101010201010202, 0x0101010201020001, 0x0101010201020100,
    0x0101010201020101, 0x0101010201020201, 0x0101010202000002, 0x0101010202000101,
    0x0101010202000102, 0x0101010202000200, 0x0101010202000201, 0x0101010202000202,
    0x0101010202010001, 0x0101010202010101, 0x0101010202010202, 0x0101010202020002,
    0x0101010202020101, 0x0101010202020102, 0x0101010202020200, 0x0101010202020201,
    0x0101020000000100, 0x0101020000000101, 0x0101020000000102, 0x0101020000000201,
    0x0101020000010000, 0x0101020000010101, 0x0101020000010200, 0x0101020000020001,
    0x0101020000020202, 0x0101020001000101, 0x0101020001000200, 0x0101020001000202,
    0x0101020001010001, 0x0101020001010100, 0x0101020001010101, 0x0101020001010102,
    0x0101020001010200, 0x0101020001010201, 0x0101020001020000, 0x0101020001020002,
    0x0101020001020100, 0x0101020001020101, 0x0101020002000002, 0x0101020002000201,
    0x0101020002010000, 0x0101020002010002, 0x0101020002010101, 0x0101020002010200,
    0x0101020002020001, 0x0101020002020201, 0x0101020100000001, 0x0101020100000002,
    0x0101020100000101, 0x0101020100000202, 0x0101020100010001, 0x0101020100010100,
    0x0101020100010101, 0x0101020100010102, 0x0101020100010201, 0x0101020100020101,
    0x0101020101000001, 0x0101020101000100, 0x0101020101000101, 0x0101020101000102,
    0x0101020101000201, 0x0101020101010000, 0x0101020101010001, 0x0101020101010002,
    0x0101020101010100, 0x0101020101010101, 0x0101020101010102, 0x0101020101010200,
    0x0101020101010201, 0x0101020101010202, 0x0101020101020001, 0x0101020101020100,
    0x0101020101020101, 0x0101020101020102, 0x0101020101020201, 0x0101020102000001,
    0x0101020102000101, 0x0101020102000201, 0x0101020102010001, 0x0101020102010100,
    0x0101020102010101, 0x0101020102010102, 0x0101020102010200, 0x0101020102010201,
    0x0101020102020101, 0x0101020200000100, 0x0101020200000200, 0x0101020200010101,
    0x0101020200010202, 0x0101020200020000, 0x0101020200020101, 0x0101020200020102,
    0x0101020200020201, 0x0101020201000101, 0x0101020201000200, 0x0101020201000201,
    0x0101020201010001, 0x0101020201010101, 0x0101020201010102, 0x0101020201010200,
    0x0101020201010201, 0x0101020201020002, 0x0101020201020101, 0x0101020201020200,
    0x0101020201020202, 0x0101020202000001, 0x0101020202000202, 0x0101020202010002,
    0x0101020202010101, 0x0101020202010102, 0x0101020202010200, 0x0101020202010202,
    0x0101020202020001, 0x0102000000000101, 0x0102000000010100, 0x0102000000010102,
    0x0102000000010201, 0x0102000000020101, 0x0102000001000100, 0x0102000001010000,
    0x0102000001010101, 0x0102000001010102, 0x0102000001010200, 0x0102000001010202,
    0x0102000001020001, 0x0102000001020100, 0x0102000001020102, 0x0102000001020201,
    0x0102000002000001, 0x0102000002010102, 0x0102000002020101, 0x0102000100000001,
    0x0102000100000100, 0x0102000100000102, 0x0102000100000201, 0x0102000100010002,
    0x0102000100010101, 0x0102000100020001, 0x0102000100020002, 0x0102000100020102,
    0x0102000100020201, 0x0102000101000101, 0x0102000101000201, 0x0102000101010001,
    0x0102000101010101, 0x0102000101010102, 0x0102000101010201, 0x0102000101020101,
    0x0102000101020102, 0x0102000101020202, 0x0102000102000100, 0x0102000102000202,
    0x0102000102010002, 0x0102000102010101, 0x0102000102020001, 0x0102000102020102,
    0x0102000102020201, 0x0102000200010001, 0x0102000200010102, 0x0102000200010201,
    0x0102000201000000, 0x0102000201000001, 0x0102000201000102, 0x0102000201010101,
    0x0102000201010102, 0x0102000201010200, 0x0102000201020000, 0x0102000202000101,
    0x0102000202010001, 0x0102000202010102, 0x0102000202020101, 0x0102010000010001,
    0x0102010000010002, 0x0102010000010101, 0x0102010000010102, 0x0102010000010202,
    0x0102010000020001, 0x0102010000020102, 0x0102010000020201, 0x0102010001000000,
    0x0102010001000002, 0x0102010001000101, 0x0102010001000200, 0x0102010001000202,
    0x0102010001010001, 0x0102010001010100, 0x0102010001010101, 0x0102010001010102,
    0x0102010001010201, 0x0102010001010202, 0x0102010001020000, 0x0102010001020002,
    0x0102010001020101, 0x0102010002000100, 0x0102010002000101, 0x0102010002000201,
    0x0102010002010000, 0x0102010002010002, 0x0102010002010100, 0x0102010002010101,
    0x0102010002010102, 0x0102010002010200, 0x0102010002010202, 0x0102010002020001,
    0x0102010002020100, 0x0102010002020201, 0x0102010100000101, 0x0102010100000200,
    0x0102010100000202, 0x0102010100010001, 0x0102010100010101, 0x0102010100010102,
    0x0102010100010201, 0x0102010101000100, 0x0102010101000101, 0x0102010101000102,
    0x0102010101000201, 0x0102010101010000, 0x0102010101010001, 0x0102010101010100,
    0x0102010101010101, 0x0102010101010102, 0x0102010101010201, 0x0102010101020001,
    0x0102010101020100, 0x0102010101020101, 0x0102010101020102, 0x0102010101020201,
    0x0102010102000102, 0x0102010102000201, 0x0102010102000202, 0x0102010102010001,
    0x0102010102010101, 0x0102010102010102, 0x0102010102010201, 0x0102010102010202,
    0x0102010102020002, 0x0102010102020101, 0x0102010102020102, 0x0102010102020200,
    0x0102010200000002, 0x0102010200000201, 0x0102010200010101, 0x0102010200020000,
    0x0102010200020102, 0x0102010200020200, 0x0102010200020201, 0x0102010201000000,
    0x0102010201000101, 0x0102010201000200, 0x0102010201000202, 0x0102010201010001,
    0x0102010201010100, 0x0102010201010101, 0x0102010201010102, 0x0102010201010200,
    0x0102010201010202, 0x0102010201020000, 0x0102010201020101, 0x0102010201020200,
    0x0102010202000000, 0x0102010202000002, 0x0102010202000101, 0x0102010202000202,
    0x0102010202010100, 0x0102010202010102, 0x0102010202010200, 0x0102010202010201,
    0x0102010202020000, 0x0102010202020100, 0x0102010202020102, 0x0102010202020202,
    0x0102020000010102, 0x0102020000010201, 0x0102020000020101, 0x0102020001000001,
    0x0102020001010002, 0x0102020001010101, 0x0102020001010202, 0x0102020001020001,
    0x0102020001020201, 0x0102020002000101, 0x0102020002010001, 0x0102020002010200,
    0x0102020002020102, 0x0102020100000001, 0x0102020100000100, 0x0102020100010000,
    0x0102020100010101, 0x0102020100020001, 0x0102020100020100, 0x0102020100020102,
    0x0102020100020201, 0x0102020101000000, 0x0102020101000001, 0x0102020101000101,
    0x0102020101000102, 0x0102020101000200, 0x0102020101010001, 0x0102020101010100,
    0x0102020101010101, 0x0102020101010102, 0x0102020101010201, 0x0102020101020000,
    0x0102020101020101, 0x0102020101020202, 0x0102020102000002, 0x0102020102000100,
    0x0102020102000202, 0x0102020102010101, 0x0102020102020001, 0x0102020102020100,
    0x0102020102020101, 0x0102020102020201, 0x0102020200010001, 0x0102020200010102,
    0x0102020200010200, 0x0102020201000001, 0x0102020201000100, 0x0102020201000201,
    0x0102020201010000, 0x0102020201010101, 0x0102020201010200, 0x0102020201010202,
    0x0102020201020100, 0x0102020201020101, 0x0102020201020201, 0x0102020202000102,
    0x0102020202010100, 0x0102020202010200, 0x0102020202010202, 0x0102020202020102,
    0x0200000000000000, 0x0200000000000002, 0x0200000000000200, 0x0200000000000202,
    0x0200000000020000, 0x0200000000020002, 0x0200000000020200, 0x0200000000020202,
    0x0200000001000101, 0x0200000001010000, 0x0200000001010001, 0x0200000001010100,
    0x0200000001010102, 0x0200000001010201, 0x0200000001020101, 0x0200000002000000,
    0x0200000002000002, 0x0200000002000200, 0x0200000002000202, 0x0200000002010101,
    0x0200000002020000, 0x0200000002020002, 0x0200000002020200, 0x0200000002020202,
    0x0200000100000101, 0x0200000100010001, 0x0200000100010100, 0x0200000100010102,
    0x0200000100010201, 0x0200000100020101, 0x0200000101000001, 0x0200000101000100,
    0x0200000101000201, 0x0200000101010000, 0x0200000101010002, 0x0200000101010101,
    0x0200000101010102, 0x0200000101010200, 0x0200000101010201, 0x0200000101020100,
    0x0200000101020102, 0x0200000101020201, 0x0200000102000101, 0x0200000102000201,
    0x0200000102010100, 0x0200000102010102, 0x0200000102010201, 0x0200000102020101,
    0x0200000200000000, 0x0200000200000002, 0x0200000200000200, 0x0200000200000202,
    0x0200000200010101, 0x0200000200020000, 0x0200000200020002, 0x0200000200020200,
    0x0200000200020202, 0x0200000201010001, 0x0200000201010100, 0x0200000201010201,
    0x0200000201020101, 0x0200000202000000, 0x0200000202000002, 0x0200000202000200,
    0x0200000202000202, 0x0200000202010101, 0x0200000202020000, 0x0200000202020002,
    0x0200000202020200, 0x0200000202020202, 0x0200010000010100, 0x0200010000010201,
    0x0200010001000001, 0x0200010001000100, 0x0200010001010001, 0x0200010001010101,
    0x0200010001010202, 0x0200010001020001, 0x0200010001020100, 0x0200010001020201,
    0x0200010002010100, 0x0200010002010201, 0x0200010100000001, 0x0200010100000201,
    0x0200010100010002, 0x0200010100010101, 0x0200010100010202, 0x0200010100020102,
    0x0200010100020201, 0x0200010101000000, 0x0200010101000001, 0x0200010101000101,
    0x0200010101000200, 0x0200010101010001, 0x0200010101010100, 0x0200010101010101,
    0x0200010101010102, 0x0200010101010201, 0x0200010101010202, 0x0200010101020101,
    0x0200010101020102, 0x0200010101020200, 0x0200010101020202, 0x0200010102000001,
    0x0200010102000100, 0x0200010102000102, 0x0200010102000201, 0x0200010102010000,
    0x0200010102010002, 0x0200010102010101, 0x0200010102010200, 0x0200010102020102,
    0x0200010200010001, 0x0200010200010102, 0x0200010200010201, 0x0200010200020101,
    0x0200010201000001, 0x0200010201000100, 0x0200010201000201, 0x0200010201000202,
    0x0200010201010000, 0x0200010201010101, 0x0200010201010201, 0x0200010201010202,
    0x0200010201020001, 0x0200010201020102, 0x0200010201020202, 0x0200010202000101,
    0x0200010202010001, 0x0200010202010202, 0x0200010202020100, 0x0200020000000000,
    0x0200020000000002, 0x0200020000000200, 0x0200020000000202, 0x0200020000010101,
    0x0200020000020000, 0x0200020000020002, 0x0200020000020200, 0x0200020000020202,
    0x0200020001000001, 0x0200020001000101, 0x0200020001010001, 0x0200020001010100,
    0x0200020001010201, 0x0200020001020101, 0x0200020001020201, 0x0200020002000000,
    0x0200020002000002, 0x0200020002000200, 0x0200020002000202, 0x0200020002010101,
    0x0200020002020000, 0x0200020002020002, 0x0200020002020200, 0x0200020002020202,
    0x0200020100000101, 0x0200020100000102, 0x0200020100010001, 0x0200020100010100,
    0x0200020100010102, 0x0200020100020101, 0x0200020101000001, 0x0200020101000100,
    0x0200020101000102, 0x0200020101000201, 0x0200020101010000, 0x0200020101010002,
    0x0200020101010101, 0x0200020101010202, 0x0200020101020001, 0x0200020101020100,
    0x0200020102000101, 0x0200020102010102, 0x0200020102010201, 0x0200020102020101,
    0x0200020200000000, 0x0200020200000002, 0x0200020200000200, 0x0200020200000202,
    0x0200020200010101, 0x0200020200020000, 0x0200020200020002, 0x0200020200020200,
    0x0200020200020202, 0x0200020201000101, 0x0200020201010001, 0x0200020201010100,
    0x0200020201010102, 0x0200020202000000, 0x0200020202000002, 0x0200020202000200,
    0x0200020202000202, 0x0200020202010101, 0x0200020202020000, 0x0200020202020002,
    0x0200020202020200, 0x0200020202020202, 0x0201000000000101, 0x0201000000010001,
    0x0201000000010102, 0x0201000000010200, 0x0201000000010201, 0x0201000000020101,
    0x0201000001000001, 0x0201000001000102, 0x0201000001000201, 0x0201000001010101,
    0x0201000001010200, 0x0201000001010202, 0x0201000001020201, 0x0201000001020202,
    0x0201000002000101, 0x0201000002010001, 0x0201000002010100, 0x0201000002010102,
    0x0201000002010201, 0x0201000002020101, 0x0201000100000001, 0x0201000100000100,
    0x0201000100000102, 0x0201000100000201, 0x0201000100010000, 0x0201000100010101,
    0x0201000100010200, 0x0201000100010202, 0x0201000100020001, 0x0201000100020100,
    0x0201000100020102, 0x0201000100020201, 0x0201000101000000, 0x0201000101000101,
    0x0201000101010000, 0x0201000101010001, 0x0201000101010100, 0x0201000101010101,
    0x0201000101010102, 0x0201000101010201, 0x0201000101020002, 0x0201000101020101,
    0x0201000102000100, 0x0201000102000102, 0x0201000102010002, 0x0201000102010101,
    0x0201000102010200, 0x0201000102020001, 0x0201000102020100, 0x0201000102020102,
    0x0201000102020201, 0x0201000200000101, 0x0201000200010001, 0x0201000200010100,
    0x0201000200010201, 0x0201000200020101, 0x0201000201000100, 0x0201000201000102,
    0x0201000201000201, 0x0201000201010000, 0x0201000201010002, 0x0201000201010101,
    0x0201000201010200, 0x0201000201020102, 0x0201000201020201, 0x0201000202000101,
    0x0201000202010100, 0x0201000202010102, 0x0201000202020201, 0x0201010000000001,
    0x0201010000000100, 0x0201010000000102, 0x0201010000010000, 0x0201010000010101,
    0x0201010000010200, 0x0201010000020102, 0x0201010001000000, 0x0201010001000202,
    0x0201010001010001, 0x0201010001010100, 0x0201010001010101, 0x0201010001010102,
    0x0201010001010200, 0x0201010001010201, 0x0201010001020000, 0x0201010001020001,
    0x0201010001020002, 0x0201010001020101, 0x0201010002000100, 0x0201010002000102,
    0x0201010002010002, 0x0201010002010100, 0x0201010002010101, 0x0201010002010200,
    0x0201010002020001, 0x0201010002020201, 0x0201010100000000, 0x0201010100000101,
    0x0201010100000200, 0x0201010100000202, 0x0201010100010000, 0x0201010100010001,
    0x0201010100010100, 0x0201010100010101, 0x0201010100010102, 0x0201010100010201,
    0x0201010100020001, 0x0201010100020101, 0x0201010100020201, 0x0201010100020202,
    0x0201010101000001, 0x0201010101000100, 0x0201010101000101, 0x0201010101000102,
    0x0201010101000201, 0x0201010101010000, 0x0201010101010001, 0x0201010101010002,
    0x0201010101010100, 0x0201010101010101, 0x0201010101010102, 0x0201010101010200,
    0x0201010101010201, 0x0201010101010202, 0x0201010101020001, 0x0201010101020100,
    0x0201010101020101, 0x0201010101020102, 0x0201010101020201, 0x0201010102000001,
    0x0201010102000101, 0x0201010102000200, 0x0201010102010001, 0x0201010102010002,
    0x0201010102010100, 0x0201010102010101, 0x0201010102010102, 0x0201010102010201,
    0x0201010102010202, 0x0201010102020000, 0x0201010102020002, 0x0201010102020101,
    0x0201010102020200, 0x0201010102020202, 0x0201010200000001, 0x0201010200000100,
    0x0201010200010000, 0x0201010200010101, 0x0201010200010201, 0x0201010200020000,
    0x0201010200020102, 0x0201010200020201, 0x0201010201000101, 0x0201010201000200,
    0x0201010201000201, 0x0201010201010001, 0x0201010201010002, 0x0201010201010101,
    0x0201010201010102, 0x0201010201010201, 0x0201010201020101, 0x0201010201020200,
    0x0201010202000002, 0x0201010202000100, 0x0201010202000201, 0x0201010202000202,
    0x0201010202010002, 0x0201010202010100, 0x0201010202010101, 0x0201010202020100,
    0x0201010202020102, 0x0201010202020201, 0x0201020000000101, 0x0201020000010102,
    0x0201020000010201, 0x0201020000020101, 0x0201020001000001, 0x0201020001000102,
    0x0201020001010000, 0x0201020001010002, 0x0201020001010101, 0x0201020001010102,
    0x0201020001010202, 0x0201020001020100, 0x0201020001020101, 0x0201020002000101,
    0x0201020002010001, 0x0201020002010102, 0x0201020002010201, 0x0201020002020101,
    0x0201020100000100, 0x0201020100000102, 0x0201020100000201, 0x0201020100010000,
    0x0201020100010002, 0x0201020100010101, 0x0201020100010200, 0x0201020100010202,
    0x0201020100020000, 0x0201020100020001, 0x0201020100020100, 0x0201020100020102,
    0x0201020101000000, 0x0201020101000002, 0x0201020101000101, 0x0201020101000200,
    0x0201020101000202, 0x0201020101010001, 0x0201020101010100, 0x0201020101010101,
    0x0201020101010102, 0x0201020101010201, 0x0201020101020002, 0x0201020101020101,
    0x0201020101020102, 0x0201020101020202, 0x0201020102000001, 0x0201020102000100,
    0x0201020102010000, 0x0201020102010002, 0x0201020102010101, 0x0201020102010202,
    0x0201020102020001, 0x0201020102020102, 0x0201020200000101, 0x0201020200010101,
    0x0201020200020101, 0x0201020201000100, 0x0201020201000102, 0x0201020201000201,
    0x0201020201010000, 0x0201020201010101, 0x0201020201010200, 0x0201020201020001,
    0x0201020202000101, 0x0201020202010001, 0x0201020202010100, 0x0201020202010101,
    0x0201020202010102, 0x0202000000000000, 0x0202000000000002, 0x0202000000000200,
    0x0202000000000202, 0x0202000000010101, 0x0202000000020000, 0x0202000000020002,
    0x0202000000020200, 0x0202000000020202, 0x0202000001000101, 0x0202000001010001,
    0x0202000001010100, 0x0202000001010102, 0x0202000001010201, 0x0202000002000000,
    0x0202000002000002, 0x0202000002000200, 0x0202000002000202, 0x0202000002010101,
    0x0202000002020000, 0x0202000002020002, 0x0202000002020200, 0x0202000002020202,
    0x0202000100000101, 0x0202000100000201, 0x0202000100010001, 0x0202000100010100,
    0x0202000100010102, 0x0202000100010201, 0x0202000100010202, 0x0202000101000102,
    0x0202000101000201, 0x0202000101010001, 0x0202000101010101, 0x0202000101010200,
    0x0202000101010202, 0x0202000101020001, 0x0202000101020100, 0x0202000102000101,
    0x0202000102010000, 0x0202000102010002, 0x0202000102010102, 0x0202000102010201,
    0x0202000200000002, 0x0202000200000200, 0x0202000200000202, 0x0202000200010000,
    0x0202000200010201, 0x0202000200020002, 0x0202000200020200, 0x0202000200020202,
    0x0202000201000101, 0x0202000201010001, 0x0202000201010102, 0x0202000201010201,
    0x0202000201020101, 0x0202000202000000, 0x0202000202000002, 0x0202000202000200,
    0x0202000202000202, 0x0202000202010101, 0x0202000202020000, 0x0202000202020002,
    0x0202000202020200, 0x0202000202020202, 0x0202010000010201, 0x0202010000020101,
    0x0202010001000001, 0x0202010001000100, 0x0202010001010000, 0x0202010001010100,
    0x0202010001010101, 0x0202010001010200, 0x0202010001010202, 0x0202010001020001,
    0x0202010001020101, 0x0202010001020102, 0x0202010001020200, 0x0202010001020201,
    0x0202010002000101, 0x0202010100000102, 0x0202010100000201, 0x0202010100010000,
    0x0202010100010002, 0x0202010100010101, 0x0202010100010200, 0x0202010100020102,
    0x0202010100020201, 0x0202010101000002, 0x0202010101000101, 0x0202010101010001,
    0x0202010101010100, 0x0202010101010101, 0x0202010101010102, 0x0202010101010201,
    0x0202010101020101, 0x0202010101020202, 0x0202010102000001, 0x0202010102000100,
    0x0202010102000101, 0x0202010102000102, 0x0202010102000201, 0x0202010102010002,
    0x0202010102010101, 0x0202010102010200, 0x0202010200000101, 0x0202010200010001,
    0x0202010200010102, 0x0202010200010202, 0x0202010200020001, 0x0202010200020101,
    0x0202010201000100, 0x0202010201000102, 0x0202010201000202, 0x0202010201010002,
    0x0202010201010101, 0x0202010201010102, 0x0202010201010200, 0x0202010201020000,
    0x0202010201020002, 0x0202010202000102, 0x0202010202010000, 0x0202010202010101,
    0x0202010202010102, 0x0202010202010201, 0x0202010202020001, 0x0202010202020100,
    0x0202010202020102, 0x0202020000000000, 0x0202020000000002, 0x0202020000000200,
    0x0202020000000202, 0x0202020000020000, 0x0202020000020002, 0x0202020000020200,
    0x0202020000020202, 0x0202020001010001, 0x0202020001010100, 0x0202020001010102,
    0x0202020001010201, 0x0202020002000000, 0x0202020002000002, 0x0202020002000200,
    0x0202020002000202, 0x0202020002010101, 0x0202020002020000, 0x0202020002020002,
    0x0202020002020200, 0x0202020002020202, 0x0202020100000101, 0x0202020100010100,
    0x0202020100010201, 0x0202020100020001, 0x0202020100020101, 0x0202020101000001,
    0x0202020101010000, 0x0202020101010101, 0x0202020101010202, 0x0202020101020001,
    0x0202020101020102, 0x0202020101020201, 0x0202020102010000, 0x0202020102010102,
    0x0202020200000000, 0x0202020200000002, 0x0202020200000200, 0x0202020200000202,
    0x0202020200020000, 0x0202020200020002, 0x0202020200020200, 0x0202020200020202,
    0x0202020201010001, 0x0202020201010100, 0x0202020201010102, 0x0202020202000000,
    0x0202020202000002, 0x0202020202000200, 0x0202020202000202, 0x0202020202010101,
    0x0202020202020000, 0x0202020202020002, 0x0202020202020200, 0x0202020202020202,
};
#else
static const uint32_t iq1s_grid_us[2048] = {
    0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
    0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
    0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
    0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
    0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
    0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
    0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
    0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
    0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
    0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
    0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
    0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
    0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
    0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
    0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
    0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
    0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
    0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
    0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
    0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
    0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
    0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
    0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
    0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
    0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
    0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
    0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
    0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
    0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
    0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
    0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
    0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
    0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
    0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
    0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
    0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
    0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
    0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
    0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
    0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
    0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
    0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
    0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
    0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
    0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
    0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
    0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
    0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
    0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
    0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
    0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
    0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
    0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
    0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
    0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
    0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
    0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
    0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
    0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
    0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
    0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
    0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
    0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
    0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
    0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
    0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
    0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
    0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
    0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
    0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
    0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
    0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
    0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
    0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
    0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
    0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
    0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
    0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
    0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
    0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
    0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
    0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
    0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
    0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
    0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
    0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
    0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
    0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
    0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
    0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
    0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
    0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
    0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
    0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
    0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
    0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
    0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
    0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
    0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
    0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
    0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
    0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
    0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
    0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
    0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
    0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
    0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
    0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
    0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
    0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
    0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
    0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
    0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
    0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
    0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
    0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
    0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
    0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
    0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
    0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
    0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
    0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
    0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
    0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
    0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
    0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
    0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
    0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
    0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
    0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
    0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
    0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
    0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
    0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
    0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
    0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
    0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
    0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
    0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
    0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
    0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
    0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
    0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
    0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
    0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
    0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
    0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
    0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
    0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
    0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
    0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
    0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
    0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
    0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
    0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
    0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
    0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
    0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
    0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
    0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
    0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
    0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
    0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
    0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
    0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
    0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
    0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
    0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
    0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
    0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
    0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
    0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
    0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
    0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
    0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
    0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
    0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
    0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
    0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
    0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
    0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
    0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
    0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
    0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
    0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
    0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
    0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
    0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
    0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
    0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
    0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
    0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
    0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
    0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
    0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
    0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
    0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
    0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
    0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
    0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
    0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
    0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
    0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
    0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
    0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
    0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
    0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
    0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
    0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
    0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
    0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
    0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
    0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
    0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
    0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
    0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
    0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
    0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
    0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
    0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
    0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
    0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
    0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
    0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
    0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
    0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
    0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
    0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
    0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
    0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
    0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
    0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
    0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
    0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
    0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
    0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
    0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
    0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
    0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
    0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
    0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
    0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
    0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
    0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
    0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
    0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
    0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
    0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
    0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
    0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
    0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
    0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
    0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
    0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
    0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
    0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
};
#endif
// end copy https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L570

#ifndef HAVE_FANCY_SIMD
const uint64_t keven_signs[128] = {
    0x0101010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0x010101010101ffff,
    0xff01010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0xff01010101ffffff,
    0xff010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0xff010101ff01ffff,
    0x01010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0x01010101ffffffff,
    0xff0101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0xff0101ff0101ffff,
    0x010101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0x010101ff01ffffff,
    0x010101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0x010101ffff01ffff,
    0xff0101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0xff0101ffffffffff,
    0xff01ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0xff01ff010101ffff,
    0x0101ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0x0101ff0101ffffff,
    0x0101ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0x0101ff01ff01ffff,
    0xff01ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0xff01ff01ffffffff,
    0x0101ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0x0101ffff0101ffff,
    0xff01ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0xff01ffff01ffffff,
    0xff01ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0xff01ffffff01ffff,
    0x0101ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0x0101ffffffffffff,
    0xffff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0xffff01010101ffff,
    0x01ff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0x01ff010101ffffff,
    0x01ff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0x01ff0101ff01ffff,
    0xffff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0xffff0101ffffffff,
    0x01ff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0x01ff01ff0101ffff,
    0xffff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0xffff01ff01ffffff,
    0xffff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0xffff01ffff01ffff,
    0x01ff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0x01ff01ffffffffff,
    0x01ffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0x01ffff010101ffff,
    0xffffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0xffffff0101ffffff,
    0xffffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0xffffff01ff01ffff,
    0x01ffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0x01ffff01ffffffff,
    0xffffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0xffffffff0101ffff,
    0x01ffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0x01ffffff01ffffff,
    0x01ffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0x01ffffffff01ffff,
    0xffffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0xffffffffffffffff,
};
#endif

}

/* moonll change mulmat
add typeB and strideB
}*/

// Adapted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L406
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
bool iqk_mul_mat(long Nx, long Ny, long ne00,
    int typeA, const void * A, long strideA,
    int typeB, const void * B, long strideB,
    float * C, long stride_C, int ith, int nth) {

        MulMat mm;
    
        if (!MulMat::set_mul_mat(typeA, typeB, ne00, mm, Ny)) {
            return false;
        }

        size_t row_size_qx = strideA*ggml_type_size(ggml_type(typeA));
        size_t row_size_qy = strideB*ggml_type_size(ggml_type(typeB));
      
        
        auto nrc_x = (Nx + nth - 1)/nth;
        auto first_x = ith*nrc_x;
        if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;

        DataInfo info{C + first_x, (const char *)B, (size_t)stride_C, row_size_qy, 0, 1, nullptr, 0};

        mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);

        return true;
}
// end adapted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L406


bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const void * A, const void * B,
        float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) {
    const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping;
    assert(row_mapping != nullptr);

    MulMat mm;
    int row_size_q8;
    /* moonll

    if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
        return false;
    }*/
    int row_size_qx = ggml_row_size((ggml_type)typeA, ne00);
    int nrc_x = (Nx + nth - 1)/nth;
    int first_x = ith*nrc_x;
    if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;
    DataInfo info{C + first_x, (const char *)B, nb1/sizeof(float), (size_t)row_size_q8, 0, ne11, row_mapping, nb2/sizeof(float)};
    mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
    return true;
}

#if defined __x86_64__ || defined(_M_X64)

#if defined HAVE_FANCY_SIMD
    #undef HAVE_FANCY_SIMD
#endif
#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
    #define HAVE_FANCY_SIMD
#endif
//#define HAVE_FANCY_SIMD

namespace {

inline float hsum_float_4(__m128 x) {
    x = _mm_add_ps(x, _mm_movehl_ps(x, x));
    x = _mm_add_ss(x, _mm_movehdup_ps(x));
    return _mm_cvtss_f32(x);
}
inline float hsum_float_8(__m256 x) {
    return hsum_float_4(_mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1)));
}

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)


template <int nrc, typename block_q8 = block_q8_K> struct Q8 {

    constexpr static int nrc_y = nrc;

    Q8(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
    }

#ifdef HAVE_FANCY_SIMD
    inline __m512i load_quants64(int iy, int i, int j) const { return _mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); }
#endif
    inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); }
    inline __m256i load_bsums(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].bsums); }
    inline float scale(int iy, int i) const { return y[iy][i].d; }

    const block_q8 * y[nrc_y];
};

// Handles q4_K and q5_K scales/mins
struct Scales8K {
    template <typename Q8>
    inline __m256i process_mins_and_scales(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) {
        make_q4_scales(data, utmp);
        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
        const __m128i mins128 = _mm256_extracti128_si256(mins_and_scales, 1);
        accum_mins(mins128, q8, i, c, accd);
        const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
        return MM256_SET_M128I(sc128, sc128);
    }
#ifdef HAVE_FANCY_SIMD
    template <typename Q8>
    inline __m512i process_mins_and_scales_64(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) {
        auto scales = process_mins_and_scales(data, c, i, q8, accd);
        return _mm512_inserti32x8(_mm512_castsi256_si512(scales), scales, 1);
    }
#endif
    template <typename Q8>
    inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
        const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0]));
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i q8s = q8.load_bsums(iy, i);
            const __m256i prod = _mm256_madd_epi16(mins, q8s);
            accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
        }
    }
#ifdef HAVE_FANCY_SIMD
    const __m512i shuffles512[2] = {
        _mm512_set_epi64(0x0706070607060706, 0x0302030203020302, 0x0706070607060706, 0x0302030203020302,
                         0x0504050405040504, 0x0100010001000100, 0x0504050405040504, 0x0100010001000100),
        _mm512_set_epi64(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a,
                         0x0d0c0d0c0d0c0d0c, 0x0908090809080908, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
    };
#endif
    const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100),
                                 _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)};

    uint32_t utmp[4];
};

template <typename Q8>
inline void process_mins_16(const __m256i& all_scales, const Q8& q8, int i, float d, __m256 * accm) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        const __m256i prod  = _mm256_madd_epi16(all_scales, q8.load_bsums(iy, i));
        accm[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d * q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accm[iy]);
    }
}
inline void prepare_scales_16(const __m256i& all_scales, __m256i * scales) {
    const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
    const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
    scales[0] = MM256_SET_M128I(l_scales, l_scales);
    scales[1] = MM256_SET_M128I(h_scales, h_scales);
}

struct ScaleQ3 {
    inline __m128i make_scales(const uint16_t * s8) const {
        const uint16_t * scales16 = (const uint16_t *)s8;
        uint32_t aux0 = scales16[0] | (scales16[1] << 16);
        uint32_t aux1 = scales16[2] | (scales16[3] << 16);
        uint32_t aux2 = scales16[4] | (scales16[5] << 16);
        __m128i scales128 = _mm_set_epi32(
            ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030),
            ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030),
             (aux1       & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030),
             (aux0       & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030));
        return _mm_add_epi8(scales128, m32);
    }
    const __m128i m32 = _mm_set1_epi8(-32);
};

struct ScaleIQ4XS {
    inline __m128i make_scales(const uint32_t scales_l, const uint16_t scales_h) {
        uint32_t tmp32 = scales_h | (scales_h << 14);
        const __m128i sh = _mm_slli_epi16(_mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(tmp32), hshift), hmask), 4);
        const __m128i sl = _mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(scales_l), lshift), lmask);
        return _mm_add_epi16(_mm_or_si128(sh, _mm_cvtepi8_epi16(_mm_shuffle_epi8(sl, lshuffle))), m32);
    }
    const __m128i hshift = _mm_set_epi32(12, 8, 4, 0);
    const __m128i lshift = _mm_set_epi32(4, 0, 4, 0);
    const __m128i hmask  = _mm_set1_epi16(0x03);
    const __m128i lmask  = _mm_set1_epi8(0xf);
    const __m128i lshuffle = _mm_set_epi32(0x07030602, 0x05010400, 0x07030602, 0x05010400);
    const __m128i m32 = _mm_set1_epi16(-32);
};

// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1455
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
struct Scales8KBase {
    template <typename Q8>
    inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
        const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0]));
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i q8s = q8.load_bsums(iy, i);
            const __m256i prod = _mm256_madd_epi16(mins, q8s);
            accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
        }
    }
    inline __m256i shuffle(__m128i mins) const {
        return MM256_SET_M128I(_mm_shuffle_epi8(mins, shuffles[1]), _mm_shuffle_epi8(mins, shuffles[0]));
    }
    const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100),
                                 _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)};
};
// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1455

template <typename Block>
struct BaseDequantizer {
    BaseDequantizer(const void * vx, size_t bx) : vx(vx), bx(bx) {}
    inline void new_row(int ix) {
        x = (const Block *)((const char *)vx + bx*ix);
    }

    const void *  vx;
    size_t        bx;
    const Block * x;

    float d;
};

// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1698
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
__m128i inline load_iq4nl_values_128() {
    static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
    return _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
}

__m256i inline load_iq4nl_values_256() {
    auto val128 = load_iq4nl_values_128();
    return MM256_SET_M128I(val128, val128);
}
// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1698

#ifdef HAVE_FANCY_SIMD
//====================================== Zen4 ==================================================

struct BlockPermuter {
    const __m512i permute1 = _mm512_set_epi64(11, 10,  9,  8, 3, 2, 1, 0);
    const __m512i permute2 = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4);
};

struct Q4Bits {
    inline void prepare(const uint8_t * q4) {
        auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0);
        auto tmp1 = _mm512_and_si512(q4bits, ml);
        auto tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
        values[0] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2);
        values[1] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2);
        q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1);
        tmp1 = _mm512_and_si512(q4bits, ml);
        tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
        values[2] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2);
        values[3] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2);
    }
    inline void prepare64(const uint8_t * q4) {
        auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0);
        values[0] = _mm512_and_si512(q4bits, ml);
        values[1] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
        q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1);
        values[2] = _mm512_and_si512(q4bits, ml);
        values[3] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
    }
    __m512i values[4];
    const __m512i ml = _mm512_set1_epi8(0xf);
    BlockPermuter perm;
};

struct Q2Bits {
    inline void prepare(const uint8_t * q2) {

        auto q2bits = _mm512_loadu_si512((const __m512i*)q2);
        auto tmp = _mm512_srli_epi16(q2bits, 2);

        values[0] = _mm512_permutex2var_epi64(q2bits, perm.permute1, tmp);
        values[2] = _mm512_permutex2var_epi64(q2bits, perm.permute2, tmp);
        values[1] = _mm512_and_si512(_mm512_srli_epi16(values[0], 4), ml);
        values[3] = _mm512_and_si512(_mm512_srli_epi16(values[2], 4), ml);
        values[0] = _mm512_and_si512(values[0], ml);
        values[2] = _mm512_and_si512(values[2], ml);
    }
    __m512i values[4];
    const __m512i ml = _mm512_set1_epi8(0x03);
    BlockPermuter perm;
};

struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
        scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
    }

    Q4Bits bits;
    Scales8K s8k;
};

/*
moonll DequantizerIQ4XS
*/

// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1775
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
__m512i inline load_iq4nl_values_512() {
    auto val256 = load_iq4nl_values_256();
    return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1);
}
// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1775

// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1781
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
    // Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1782
    DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_512()) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        prepare(x[i].qs);
        auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h);
        s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
        auto scales256 = MM256_SET_M128I(scales128, scales128);
        auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1);
        scales[0] = _mm512_shuffle_epi8(all_scales, shuffles[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, shuffles[1]);
        scales[2] = _mm512_shuffle_epi8(all_scales, shuffles[2]);
        scales[3] = _mm512_shuffle_epi8(all_scales, shuffles[3]);
    }
    inline void prepare(const uint8_t * q4) {
        bits.prepare64(q4);
        // We now have in bits.valuse[0]: 0...15, 32...47, 64...79, 96...111
        //                bits.valuse[1]: 16..31, 48...63, 80...95, 112..127
        //                etc.
        auto tmp = _mm512_permutex2var_epi64(bits.values[0], permute1, bits.values[1]);
        bits.values[1] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[0], permute2, bits.values[1]));
        bits.values[0] = _mm512_shuffle_epi8(values, tmp);
        tmp = _mm512_permutex2var_epi64(bits.values[2], permute1, bits.values[3]);
        bits.values[3] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[2], permute2, bits.values[3]));
        bits.values[2] = _mm512_shuffle_epi8(values, tmp);
    }

    Q4Bits bits;
    Scales8KBase s8k;
    ScaleIQ4XS siq4;
    const __m512i values;
    const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2,  9,  8, 1, 0);
    const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
    const __m512i shuffles[4] = {
        _mm512_inserti32x8(_mm512_set1_epi16(0x0100), _mm256_set1_epi16(0x0302), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0504), _mm256_set1_epi16(0x0706), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0908), _mm256_set1_epi16(0x0b0a), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0d0c), _mm256_set1_epi16(0x0f0e), 1),
    };
};
// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1781

struct HighBit5 {
    inline void apply(const uint8_t * h, Q4Bits& bits) {
        auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
        auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(hbits, mh));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
    }
    const __m512i mh = _mm512_set1_epi8(0x10);
};

struct HighBit3 {
    inline void apply(const uint8_t * h, Q2Bits& bits) {
        auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
        auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(hbits, mh));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 4), mh));
    }
    const __m512i mh = _mm512_set1_epi8(0x04);
};

struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        hbits.apply(x[i].qh, bits);
        auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
        scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
    }

    Q4Bits bits;
    HighBit5 hbits;
    Scales8K s8k;
};

struct Scale16 {
    inline void make_scales(const __m128i& scales8, __m512i * scales) const {
        auto all_scales8 = MM256_SET_M128I(scales8, scales8);
        auto scales1 = _mm256_shuffle_epi8(all_scales8, shuffle1);
        auto scales2 = _mm256_shuffle_epi8(all_scales8, shuffle2);
        scales[0] = _mm512_cvtepi8_epi16(scales1);
        scales[1] = _mm512_cvtepi8_epi16(scales2);
    }
    template <typename Q8>
    inline void process_mins_and_scales(int i, float c, const __m128i& mins8, const __m128i& scales8,
        const Q8& q8, __m256 * accm, __m512i * scales) const {
        process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, c, accm);
        make_scales(scales8, scales);
    }
    const __m256i shuffle1 = _mm256_set_epi32(0x07070707, 0x03030303, 0x06060606, 0x02020202,
                                              0x05050505, 0x01010101, 0x04040404, 0x00000000);
    const __m256i shuffle2 = _mm256_set_epi32(0x0f0f0f0f, 0x0b0b0b0b, 0x0e0e0e0e, 0x0a0a0a0a,
                                              0x0d0d0d0d, 0x09090909, 0x0c0c0c0c, 0x08080808);
};

struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
        sc16.process_mins_and_scales(i, -GGML_FP16_TO_FP32(x[i].dmin), mins8, scales8, q8, accm, scales);
    }

    Q2Bits bits;
    Scale16 sc16;
    const __m128i m4 = _mm_set1_epi8(0xf);

};

struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        hbits.apply(x[i].hmask, bits);
        auto scales128 = sc3.make_scales((const uint16_t *)x[i].scales);
        sc16.process_mins_and_scales(i, -4.f*d, scales128, scales128, q8, accm, scales);
    }

    Q2Bits bits;
    HighBit3 hbits;
    ScaleQ3 sc3;
    Scale16 sc16;
    const __m128i m4  = _mm_set1_epi8(0xf);
    const __m128i m32 = _mm_set1_epi8(-32);
};

struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare64(x[i].ql);
        add_high_bits(x[i].qh, bits);
        auto scales128 = _mm_loadu_si128((const __m128i *)x[i].scales);
        sc16.process_mins_and_scales(i, -32.f*d, scales128, scales128, q8, accm, scales);
    }

    inline void add_high_bits(const uint8_t * qh, Q4Bits& bits) const {
        auto hbits = _mm512_loadu_si512((const __m512i *)qh);
        auto tmp1 = _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh);
        auto tmp2 = _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2));
        tmp1 = _mm512_and_si512(hbits, mh);
        tmp2 = _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh);
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2));
    }

    Q4Bits bits;
    HighBit3 hbits;
    Scale16 sc16;

    const __m512i mh = _mm512_set1_epi8(0x30);

};

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accm[nrc_y];
    __m512  accd[nrc_y];
    __m512i scales[2];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
        for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accm, scales);

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants(iy, i, 0));
                const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants(iy, i, 1));
                const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants(iy, i, 2));
                const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants(iy, i, 3));
                auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
                sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
                accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
            info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
        }

    }
}
// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L2408
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
template <typename Q8>
inline void compute_block(int iy, int i, float d, const Q8& q8, const __m512i * values, const __m512i * scales, __m512 * accd) {
    const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[0], q8.load_quants64(iy, i, 0));
    const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[1], q8.load_quants64(iy, i, 1));
    const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[2], q8.load_quants64(iy, i, 2));
    const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[3], q8.load_quants64(iy, i, 3));
    auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
    sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
    accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
}

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accm[nrc_y];
    __m512  accd[nrc_y];
    __m512i scales[2];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
        for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accm, scales);

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants64(iy, i, 0));
                const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants64(iy, i, 1));
                const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants64(iy, i, 2));
                const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants64(iy, i, 3));
                auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
                sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
                accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
            info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
        }

    }
}

template <typename Dequantizer, int nrc_y>
static void mul_mat_iqX_k_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accm[nrc_y];
    __m512  accd[nrc_y];
    __m512i scales[4];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
        for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accm, scales);

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m512i p1 = _mm512_maddubs_epi16(deq.bits.values[0], q8.load_quants64(iy, i, 0));
                const __m512i p2 = _mm512_maddubs_epi16(deq.bits.values[1], q8.load_quants64(iy, i, 1));
                const __m512i p3 = _mm512_maddubs_epi16(deq.bits.values[2], q8.load_quants64(iy, i, 2));
                const __m512i p4 = _mm512_maddubs_epi16(deq.bits.values[3], q8.load_quants64(iy, i, 3));
                auto sumi = _mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_setzero_si512(),
                                    p1, scales[0]), p2, scales[1]), p3, scales[2]), p4, scales[3]);
                accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
            info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
        }

    }
}

template <typename Dequantizer>
static void mul_mat_qX_K_q8_K_AVX512_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    constexpr int k_nx = 2;

    Q8<1> q8(info);

    Dequantizer deq1(vx, bx);
    Dequantizer deq2(vx, bx);

    Dequantizer * deq[k_nx];
    deq[0] = &deq1;
    deq[1] = &deq2;

    __m512i scales[2*k_nx];

    for (int ix = 0; ix < nrc_x; ++ix) {

        auto accd = _mm512_setzero_ps();
        auto accm = _mm256_setzero_ps();

        for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_row(ix);

        for (int i = 0; i < nb/k_nx; ++i) {

            for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_block(k_nx*i+kx, q8, &accm, scales+2*kx);

            for (int kx = 0; kx < k_nx; ++kx) {
                compute_block(0, k_nx*i+kx, deq[kx]->d, q8, deq[kx]->bits.values, scales+2*kx, &accd);
            }

        }
        if (2*(nb/2) < nb) {
            int i0 = 2*(nb/2);
            deq[0]->new_block(i0, q8, &accm, scales);
            compute_block(0, i0, deq[0]->d, q8, deq[0]->bits.values, scales, &accd);
        }

        auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd), _mm512_extractf32x8_ps(accd, 1));
        info.store(ix, 0, hsum_float_8(_mm256_add_ps(accm, sum256)));
    }
}
// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L2408

#else
// ===================================== Vanilla AVX2 =====================================

struct Q4Bits {
    inline void prepare(const uint8_t * q4, int j) {
        auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0);
        values[0] = _mm256_and_si256(q4bits, ml);
        values[1] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
        q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1);
        values[2] = _mm256_and_si256(q4bits, ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
    }
    inline void prepare64(const uint8_t * q4, int j) {
        auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0);
        values[0] = _mm256_and_si256(q4bits, ml);
        values[2] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
        q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1);
        values[1] = _mm256_and_si256(q4bits, ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
    }
    inline void prepare16(const uint8_t * q4, int j) {
        values[0] = dequant16(q4 + 64*j +  0);
        values[1] = dequant16(q4 + 64*j + 16);
        values[2] = dequant16(q4 + 64*j + 32);
        values[3] = dequant16(q4 + 64*j + 48);
    }
    inline __m256i dequant16(const uint8_t * qs) const {
        const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
        const __m256i aux256 = MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128);
        return _mm256_and_si256(ml, aux256);
    };
    __m256i values[4];
    const __m256i ml = _mm256_set1_epi8(0xf);
};

struct Q2Bits {
    inline void prepare(const uint8_t * q2, int j) {
        auto q2bits = _mm256_loadu_si256((const __m256i *)q2 + j);
        values[0] = _mm256_and_si256(q2bits, ml);
        values[1] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), ml);
        values[2] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), ml);
    }
    __m256i values[4];
    const __m256i ml = _mm256_set1_epi8(0x03);
};

struct HighBit5 {
    inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); }
    inline void apply(Q4Bits& bits, bool do_shift) {
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 3), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh));
        if (do_shift) {
            hbits = _mm256_srli_epi16(hbits, 4);
        }
    }
    const __m256i mh = _mm256_set1_epi8(0x10);
    __m256i hbits;
};

struct HighBit3 {
    inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); }
    inline void apply(Q2Bits& bits, bool do_shift) {
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 1), mh));
        if (do_shift) {
            hbits = _mm256_srli_epi16(hbits, 4);
        }
    }
    const __m256i mh = _mm256_set1_epi8(0x04);
    __m256i hbits;
};


/*
template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
    if (j == 0) {
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
            sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4));
        }
    } else {
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
        }
    }
}*/

struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
    }

    Q4Bits bits;
    Scales8K s8k;
};

struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
    DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_values()) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h);
        s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
        return MM256_SET_M128I(scales128, scales128);
    }
    inline void prepare(int i, int j) {
        bits.prepare16(x[i].qs, j);
        bits.values[0] = _mm256_shuffle_epi8(values, bits.values[0]);
        bits.values[1] = _mm256_shuffle_epi8(values, bits.values[1]);
        bits.values[2] = _mm256_shuffle_epi8(values, bits.values[2]);
        bits.values[3] = _mm256_shuffle_epi8(values, bits.values[3]);
    }

    static __m256i load_values() {
        static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
        auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
        return MM256_SET_M128I(val128, val128);
    }

    Q4Bits bits;
    Scales8K s8k;
    ScaleIQ4XS siq4;
    const __m256i values;
};

struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        hbits.load(x[i].qh);
        return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
        hbits.apply(bits, j == 0);
    }

    Q4Bits  bits;
    HighBit5 hbits;
    Scales8K s8k;
};

template <typename Q8>
inline void process_mins_and_scales_16(const __m128i& scales128, const Q8& q8, int i, float d,
    __m256 * accm, __m256i * scales) {
    const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
    process_mins_16(all_scales, q8, i, d, accm);
    prepare_scales_16(all_scales, scales);
}

struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}

    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        hbits.load(x[i].hmask);
        process_mins_and_scales_16(sc3.make_scales((const uint16_t *)x[i].scales), q8, i, -4.f*d, accm, scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
        hbits.apply(bits, j == 0);
    }

    Q2Bits  bits;
    HighBit3 hbits;
    ScaleQ3 sc3;

    const __m128i m32 = _mm_set1_epi8(-32);
};

struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}

    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
        process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, -GGML_FP16_TO_FP32(x[i].dmin), accm);
        prepare_scales_16(_mm256_cvtepi8_epi16(scales8), scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
    }

    Q2Bits  bits;

    const __m128i m4 = _mm_set1_epi8(0xf);
};

struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        process_mins_and_scales_16(_mm_loadu_si128((const __m128i *)x[i].scales), q8, i, -32.f*d, accm, scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare64(x[i].ql, j);
        auto hbits = _mm256_loadu_si256((const __m256i *)x[i].qh + j);
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 2), mh));
    }

    Q4Bits  bits;
    const __m256i mh = _mm256_set1_epi8(0x30);
};


inline __m256i get_scale_shuffle_8(int i);

inline void set_scales_8(const __m256i& all_scales, int j, __m256i* scales);

inline __m256i get_scale_shuffle_16(int i);

inline void set_scales_16(const __m256i& all_scales, __m256i* scales);


template <typename Dequantizer, int nrc_y>
static void mul_mat_qY_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%QK_K == 0);
    const int nb = n/QK_K;

    Q8<nrc_y> q8(info);

    __m256i all_scales[2];
    __m256i scales[4];
    __m256  accd[nrc_y];

    Dequantizer deq(vx, bx);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accd, all_scales);

            __m256i sumi[nrc_y];

            for (int j = 0; j < QK_K/128; ++j) {
                deq.prepare(i, j);
                set_scales_16(all_scales[j], scales);
                multiply_add(deq.bits, scales, j, i, q8, sumi);
            }

            for (int iy = 0; iy < nrc_y; ++iy) {
                accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(iy, i)), _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }

    }

}

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accd[nrc_y];
    __m256i scales[4];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            auto all_scales = deq.new_block(i, q8, accd);

            __m256i sumi[nrc_y];

            for (int j = 0; j < QK_K/128; ++j) {

                deq.prepare(i, j);

                set_scales_8(all_scales, j, scales);

                multiply_add(deq.bits, scales, j, i, q8, sumi);

            }

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i));
                accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }

    }
}
#endif  // Zen4 or vanilla AVX2


//
// ============================== Legacy quants
//

struct DotHelper {
    const __m256i m1 = _mm256_set1_epi16(1);
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
    inline __m256i dot(__m256i x, __m256i y) const {
        return _mm256_dpbusd_epi32(_mm256_setzero_si256(), x, y);
    }
#else
    inline __m256i dot(__m256i x, __m256i y) const {
        return _mm256_madd_epi16(m1, _mm256_maddubs_epi16(x, y));
    }
#endif
};

struct SignedDot {
    DotHelper helper;
    inline __m256i compute(__m256i x, __m256i y) const {
        return helper.dot(_mm256_sign_epi8(x, x), _mm256_sign_epi8(y, x));
    }
};
struct UnsignedDot {
    DotHelper helper;
    inline __m256i compute(__m256i x, __m256i y) const {
        return helper.dot(x, y);
    }
};
template <typename Q8, typename Dot> struct Sum4 {
    Dot dot;
    inline __m256i compute(const __m256i * qx, const Q8 * y) const {
        const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
        const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
        const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
        const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
        const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1));    // 0,0, 1,1, 0,0, 1,1
        const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3));    // 2,2, 3,3, 2,2, 3,3
        return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3
    }
};

struct Sum4_Q8 {
    SignedDot dot;
    static inline __m256i add1(__m256i a, __m256i b) {
        return _mm256_add_epi32(_mm256_unpacklo_epi32(a, b), _mm256_unpackhi_epi32(a, b));
    }
    static inline __m256i add2(__m256i a, __m256i b) {
        return _mm256_add_epi32(_mm256_unpacklo_epi64(a, b), _mm256_unpackhi_epi64(a, b));
    }
    inline __m256i compute(const __m256i * qx, const block_q8_0 * y) const {
        const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
        const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
        const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
        const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
        const __m256i p01 = add1(p0, p1);  // 0,1, 0,1, 0,1, 0,1
        const __m256i p23 = add1(p2, p3);  // 2,3, 2,3, 2,3, 2,3
        return add2(p01, p23); // returns 0,1,2,3, 0,1,2,3
    }
};

struct ScaleHelperQ_0 {
    ggml_half scales8[4];
    template <typename Q>
    inline __m128 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
        return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
    }
    template <typename Q>
    inline __m128 prepare4(__m128 other_scales, const Q * y) {
        return _mm_mul_ps(other_scales, prepare4<Q>(y));
    }
    template <typename Q> inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); }
    template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
};
// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8187
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
template <int min_value>
struct ScaleHelperQ_0_1 {
    ggml_half scales8[4];
    template <typename Q>
    inline __m256 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
        auto s4 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
        return _mm256_set_m128(_mm_mul_ps(s4, min), s4);
    }
    template <typename Q>
    inline __m256 prepare4(__m256 other_scales, const Q * y) {
        return _mm_mul256_ps(other_scales, prepare4<Q>(y));
    }
    template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
        float d = GGML_FP16_TO_FP32(y->d);
        return std::make_pair(d, -d*float(min_value));
    }
    std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
    }
    const __m128 min = _mm_set1_ps(float(-min_value));
};
// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8187

struct ScaleHelperQ_1 {
    uint32_t scales8[4];
    const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100);

    template <typename Q>
    inline __m256 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) {
            // it is slightly faster to directly dereference (const uint32 *)&y[j].d, but some compilers
            // complain that this breaks strict-aliasing rules.
            memcpy(scales8 + j, &y[j].d, sizeof(uint32_t));
        }
        return _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)scales8), shuffle));
    }

    template <typename Q>
    inline __m256 prepare4(__m256 other_scales, const Q * y) {
        return _mm256_mul_ps(other_scales, prepare4<Q>(y));
    }

    template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
        return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m));
    }
    template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m));
    }
    std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
    }
};

struct MinusType0 {
    inline __m256 compute(__m128 d, int) const { return _mm256_set_m128(d, d); }
    inline float compute(float d, int) const { return d; }
    inline float result(__m256 acc, int) const { return hsum_float_8(acc); }
};

template <int nrc_y> struct MinusType1 {
    __m128 accm[nrc_y];
    MinusType1() { for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm_setzero_ps(); }
    inline __m256 compute(__m256 dm, int iy) {
        const __m128 d = _mm256_castps256_ps128(dm);
        const __m128 m = _mm256_extractf128_ps(dm, 1);
        accm[iy] = _mm_add_ps(accm[iy], m);
        return _mm256_set_m128(d, d);
    }
    inline float compute(const std::pair<float, float>& dm, int iy) {
        accm[iy] = _mm_add_ps(accm[iy], _mm_set1_ps(dm.second*0.25f));
        return dm.first;
    }
    inline float result(__m256 acc, int iy) const {
        const __m128 sum = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
        return hsum_float_4(_mm_add_ps(sum, accm[iy]));
    }
};

template <typename Minus, int nrc_y, bool is_multiple_of_4> struct AccumT {
    __m256 acc[nrc_y];
    Minus accm;
    AccumT() {  for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = _mm256_setzero_ps(); }
    template <typename Unpacker, typename Scales, typename Sum, typename Q8>
    inline void compute(int nb, Unpacker& unp, Scales& scales, Sum& sum, const Q8 ** y, const DataInfo& info, int ix) {
        auto qx = unp.quants();
        __m256 dall[nrc_y];
        for (int i = 0; i < nb/4; ++i) {
            auto other_scales = unp.set_block_4(i);
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto s12 = scales.prepare4(other_scales, y[iy] + 4*i);
                dall[iy] = accm.compute(s12, iy);
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto pall = sum.compute(qx, y[iy] + 4*i);
                acc[iy] = _mm256_fmadd_ps(dall[iy], _mm256_cvtepi32_ps(pall), acc[iy]);
            }
        }
        if (!is_multiple_of_4) {
            for (int i = 4*(nb/4); i < nb; ++i) {
                auto other_scales = unp.set_block(i);
                for (int iy = 0; iy < nrc_y; ++iy) {
                    auto s12 = scales.prepare1(other_scales, y[iy] + i);
                    auto d = accm.compute(s12, iy);
                    const __m256i p0 = sum.dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[iy][i].qs));
                    acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(p0), acc[iy]);
                }
            }
        }
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, accm.result(acc[iy], iy));
            //s[iy*bs] = accm.result(acc[iy], iy);
        }
    }
};

template <int nrc_y, bool is_multiple_of_4>
using AccumType0 = AccumT<MinusType0, nrc_y, is_multiple_of_4>;

template <int nrc_y, bool is_multiple_of_4>
using AccumType1 = AccumT<MinusType1<nrc_y>, nrc_y, is_multiple_of_4>;

using Sum4Type0 = Sum4<block_q8_0, SignedDot>;
using Sum4Type1 = Sum4<block_q8_1, UnsignedDot>;

template <typename Unpacker, typename Sum4Type, typename AccumType, typename Scales, typename Q8, int nrc_y>
void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) {
    Unpacker unp(vx, bx);
    Sum4Type sum4;
    Scales scales;
    for (int ix = 0; ix < nrc_x; ++ix) {
        unp.set_row(ix);
        AccumType accum;
        accum.compute(nb, unp, scales, sum4, y, info, ix);
    }
}

template <typename Unpacker, int nrc_y>
void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_0> q8(info);
    int nb = n/Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, true>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, false>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}

template <typename Unpacker, int nrc_y>
void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_1> q8(info);
    int nb = n/Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, true>, ScaleHelperQ_1, block_q8_1, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, false>, ScaleHelperQ_1, block_q8_1, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}

struct Dequantizer4bit {
    const __m256i m4 = _mm256_set1_epi8(0xf);
    inline __m256i dequant(const uint8_t * qs) const {
        const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
        return _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), m4);
    }
};

struct Q8_0_Dequantizer {
    inline __m256i dequant(const block_q8_0 * x) const {
        return _mm256_loadu_si256((const __m256i *)x->qs);
    }
};

// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8455
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
struct Q8_0_1_Dequantizer {
    inline __m256i dequant(const block_q8_0 * x) const {
        return _mm256_add_epi8(_mm256_set1_epi8(127), _mm256_loadu_si256((const __m256i *)x->qs));
    }
};
// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8455

struct Q4_0_Dequantizer {
    Dequantizer4bit b4;
    const __m256i m8 = _mm256_set1_epi8(-8);
    inline __m256i dequant(const block_q4_0 * x) const {
        return _mm256_add_epi8(b4.dequant(x->qs), m8);
    }
};

struct Q4_1_Dequantizer {
    Dequantizer4bit b4;
    inline __m256i dequant(const block_q4_1 * x) const {
        return b4.dequant(x->qs);
    }
};

struct HBitDequantizer {
    const __m256i shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
    const __m256i mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
    const __m256i minus1 = _mm256_set1_epi64x(-1);
    inline __m256i to_bytes(const uint8_t * bits) const {
        // Note: Data in all ggml quants is at least 2-byte aligned.
        // => we can cast to uint16_t and use or on two consecutive entries
        // which is faster than memcpy
        const uint16_t * aux16 = (const uint16_t *)bits;
        const uint32_t aux32 = aux16[0] | (aux16[1] << 16);
        //uint32_t aux32; memcpy(&aux32, bits, sizeof(uint32_t));
        __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(aux32), shuffle);
        bytes = _mm256_or_si256(bytes, mask);
        return _mm256_cmpeq_epi8(bytes, minus1);
    }
};

struct Q5_0_Dequantizer {
    Dequantizer4bit b4;
    HBitDequantizer hbit;
    const __m256i mh = _mm256_set1_epi8((char)0xF0);
    inline __m256i dequant(const block_q5_0 * x) const {
        const __m256i vqh = _mm256_andnot_si256(hbit.to_bytes(x->qh), mh);
        return _mm256_or_si256(b4.dequant(x->qs), vqh);
    }
};

struct Q5_1_Dequantizer {
    Dequantizer4bit b4;
    HBitDequantizer hbit;
    const __m256i mh = _mm256_set1_epi8(0x10);
    inline __m256i dequant(const block_q5_1 * x) const {
        const __m256i vqh = _mm256_and_si256(hbit.to_bytes(x->qh), mh);
        return _mm256_or_si256(b4.dequant(x->qs), vqh);
    }
};

template <typename Q, typename Scales, typename Dequantizer>
struct Q_Unpacker {
    Q_Unpacker(const void * vx, size_t bx) : cx_0((const char *)vx), x((const Q*)cx_0), bx(bx) {}

    const char * cx_0;
    const Q    * x;
    size_t       bx;

    Scales scales;
    Dequantizer deq;

    __m256i qx[4];

    inline const __m256i* quants() const { return qx; }

    inline void set_row(int ix) { x = (const Q*)(cx_0 + ix*bx); }

    inline auto set_block_4(int i) {
        for (int j = 0; j < 4; ++j) {
            qx[j] = deq.dequant(x + 4*i + j);
        }
        return scales.prepare4(x + 4*i);
    }
    inline auto set_block(int i) {
        qx[0] = deq.dequant(x + i);
        return scales.prepare1(x + i);
    }
};

struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_Dequantizer> {
    Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_0; }
};
// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8574
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
struct Q8_0_1_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0_1<127>, Q8_0_1_Dequantizer> {
    Q8_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
//    using Sum4T = Sum4TypeQ81;
    inline static int block_size() { return QK8_0; }
};
// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8574
struct Q4_0_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0, Q4_0_Dequantizer> {
    Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_0; }
};
struct Q5_0_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0, Q5_0_Dequantizer> {
    Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK5_0; }
};
struct Q4_1_Unpacker final : public Q_Unpacker<block_q4_1, ScaleHelperQ_1, Q4_1_Dequantizer> {
    Q4_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_1; }
};
struct Q5_1_Unpacker final : public Q_Unpacker<block_q5_1, ScaleHelperQ_1, Q5_1_Dequantizer> {
    Q5_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_1; }
};

template <int nrc_y>
void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%Q8_0_Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_0> q8(info);
    int nb = n/Q8_0_Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Q8_0_Unpacker, Sum4_Q8, AccumType0<nrc_y, true>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Q8_0_Unpacker, Sum4_Q8, AccumType0<nrc_y, false>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}


/*
moonll
add some structs for DequantizerIQ2XXS
SimpleBits
EvenSignHelper
*/
struct SimpleBits {
    __m256i values[4];
};

// fix for #829: Add checks of AVX512VPOPCNTDQ
#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__)
#define HAVE_AVX512_POPCNT 1
#else
#define HAVE_AVX512_POPCNT 0
#endif

// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7736
// with the addition of a branch that handles a missing _mm256_popcnt_epi32 instruction
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
struct EvenSignHelper {
    #if defined HAVE_FANCY_SIMD
    // #pragma message("Using AVX512VPOPCNTDQ in even sign helper")
        union sbits_t {
            __m128i vec;
            __mmask32 mask[4];
        };
        IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const {
            aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask);
            
            // fix for #829: Compatibility with processors using Intel Cascade Lake architecture
            // If AVX512VPOPCNTDQ extension is not supported, use alternative implementation
            #if HAVE_AVX512_POPCNT
                auto pcnt = _mm256_popcnt_epi32(aux);
                
            #else
                // Alternative implementation: Using standard bit counting method
                __m256i pcnt;
                int* pcnt_ptr = reinterpret_cast<int*>(&pcnt);
                int* aux_ptr = reinterpret_cast<int*>(&aux); // Get address of aux directly, avoid unnecessary copies
                
                #pragma unroll 8  // Hint compiler to unroll loops, increasing throughput of SIMD computing
                for (int i = 0; i < 8; i++) {
                    pcnt_ptr[i] = __builtin_popcount(aux_ptr[i]); // Use compiler builtin popcount
                }
            #endif
            
            sbits_t sbits;
            sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
            values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]);
            values[1] = _mm256_mask_sub_epi8(values[1], sbits.mask[1], _mm256_setzero_si256(), values[1]);
            //auto sign_bits = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
            //const __mmask32 * m32 = (const __mmask32 *)&sign_bits;
            //values[0] = _mm256_mask_sub_epi8(values[0], m32[0], _mm256_setzero_si256(), values[0]);
            //values[1] = _mm256_mask_sub_epi8(values[1], m32[1], _mm256_setzero_si256(), values[1]);
        }
        const __m256i shifts = _mm256_set_epi32(21, 14, 7, 0, 21, 14, 7, 0);
        const __m256i mask   = _mm256_set1_epi32(127);
        const __m256i mone   = _mm256_set1_epi32(1);
    #else
        inline void sign_value(uint32_t aux32, __m256i& value) const {
            auto signs = _mm256_set_epi64x(keven_signs[(aux32 >> 21) & 127], keven_signs[(aux32 >> 14) & 127],
                                           keven_signs[(aux32 >>  7) & 127], keven_signs[(aux32 >>  0) & 127]);
            value = _mm256_sign_epi8(value, signs);
        }
    #endif
};

/*
moonll ad multiply_add for mul_mat_qX_K_q8_K_IQ_1
add func
get_scale_shuffle_8
get_scale_shuffle_16
set_scales_16
*/

// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1578
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
inline __m256i get_scale_shuffle_8(int i) {
    return _mm256_set1_epi16((2*i) | ((2*i+1) << 8));
}

inline void set_scales_8(const __m256i& all_scales, int j, __m256i * scales) {
    scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+0));
    scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+1));
    scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+2));
    scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+3));
}


inline __m256i get_scale_shuffle_16(int i) {
    static const uint8_t k_shuffle[128] = {
         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
    };
    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
}

inline void set_scales_16(const __m256i& all_scales, __m256i * scales) {
    scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(0));
    scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(1));
    scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(2));
    scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(3));
}

template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
    if (j == 0) {
#ifdef HAVE_FANCY_SIMD
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            sumi[iy] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
        }
#else
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
            sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4));
        }
#endif
    } else {
#ifdef HAVE_FANCY_SIMD
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
        }
#else
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
        }
#endif
    }
}

/*
moonll ad multiply_add_1 for mul_mat_qX_K_q8_K_IQ_1
add func
set_scales_8_iq
set_scales_16_iq

add MUL_MAT
mul_mat_qX_K_q8_K_IQ_1
mul_mat_qX_K_q8_K_IQ_N
mul_mat_qX_K_q8_K_IQ
*/

template <typename Bits>
inline void multiply_add_1(int j, const Bits& bits, const __m256i * scales, const __m256i * q8, __m256i * sumi) {
    if (j == 0) {
#ifdef HAVE_FANCY_SIMD
        auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]);
        auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]);
        auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]);
        auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]);
        sumi[0] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_packs_epi32(p1, p2));
        sumi[1] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[1], _mm256_packs_epi32(p3, p4));
#else
        const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0]));
        const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1]));
        const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2]));
        const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3]));
        sumi[0] = _mm256_add_epi32(p1, p3);
        sumi[1] = _mm256_add_epi32(p2, p4);
#endif
    } else {
#ifdef HAVE_FANCY_SIMD
        auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]);
        auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]);
        auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]);
        auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]);
        sumi[0] = _mm256_dpwssd_epi32(sumi[0], scales[0], _mm256_packs_epi32(p1, p2));
        sumi[1] = _mm256_dpwssd_epi32(sumi[1], scales[1], _mm256_packs_epi32(p3, p4));
#else
        const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0]));
        const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1]));
        const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2]));
        const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3]));
        sumi[0] = _mm256_add_epi32(sumi[0], _mm256_add_epi32(p1, p3));
        sumi[1] = _mm256_add_epi32(sumi[1], _mm256_add_epi32(p2, p4));
#endif
    }
}
// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1578


// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7278
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
inline void set_scales_8_iq(int j, const __m256i& all_scales, __m256i * scales) {
    //#ifdef HAVE_FANCY_SIMD
        auto shuffle = j == 0 ? _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100)
                              : _mm256_set_epi64x(0x0b0a0b0a0b0a0b0a, 0x0908090809080908, 0x0b0a0b0a0b0a0b0a, 0x0908090809080908);
        scales[0] = _mm256_shuffle_epi8(all_scales, shuffle);
        scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(4)));
    //#else
    //    set_scales_8(all_scales, j, scales);
    //#endif
    }
    
inline void set_scales_16_iq(const __m256i& all_scales, __m256i * scales) {
    #ifdef HAVE_FANCY_SIMD
        auto shuffle = _mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100);
        scales[0] = _mm256_shuffle_epi8(all_scales, shuffle);
        scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(8)));
    #else
        set_scales_16(all_scales, scales);
    #endif
    }
// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7278
    
// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7299
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
template <typename Dequantizer>
static void mul_mat_qX_K_q8_K_IQ_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
        const int nb = n / QK_K;
        Q8<1> q8(info);
        Dequantizer deq(vx, bx);
        __m256i scales[2];
        __m256i q8_quants[4];
        for (int ix = 0; ix < nrc_x; ++ix) {
    
            __m256 accd = _mm256_setzero_ps();
            deq.new_row(ix);
    
            for (int i = 0; i < nb; ++i) {
    
                __m256i sumi[2], all_scales[Dequantizer::num_blocks/8];
                deq.new_block(i, all_scales);
    
                for (int j = 0; j < QK_K/128; ++j) {
                    deq.prepare(i, j, q8, q8_quants);
                    if constexpr (Dequantizer::num_blocks == 8) {
                        set_scales_8_iq(j, all_scales[0], scales);
                    } else {
                        set_scales_16_iq(all_scales[j], scales);
                    }
                    multiply_add_1(j, deq.bits, scales, q8_quants, sumi);
                }
                accd = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(0, i)), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi[0], sumi[1])), accd);
            }
    
            info.store(ix, 0, hsum_float_8(accd));
        }
    }


template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_IQ_N(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    const int nb = n / QK_K;
    Q8<nrc_y> q8(info);
    Dequantizer deq(vx, bx);
    __m256i scales[4];
    __m256  accd[nrc_y];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            __m256i sumi[nrc_y], all_scales[Dequantizer::num_blocks/8];
            //for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = _mm256_setzero_si256();
            __m256i mins;
            float dmin = deq.new_block(i, all_scales, mins);
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto bsums = q8.load_bsums(iy, i);
                auto prod  = _mm256_madd_epi16(mins, bsums);
                accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(dmin*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
            }

            for (int j = 0; j < QK_K/128; ++j) {
                deq.prepare(i, j);
                if constexpr (Dequantizer::num_blocks == 8) {
                    set_scales_8(all_scales[0], j, scales);
                } else {
                    set_scales_16(all_scales[j], scales);
                }
                //multiply_add_iq(deq.bits, scales, j, i, q8, sumi);
                multiply_add(deq.bits, scales, j, i, q8, sumi);
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i));
                accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }
        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }
    }
}

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
#ifdef HAVE_FANCY_SIMD
    if constexpr (nrc_y == 1) {
        mul_mat_qX_K_q8_K_IQ_1<Dequantizer>(n, vx, bx, info, nrc_x);
    } else {
        mul_mat_qX_K_q8_K_IQ_N<Dequantizer, nrc_y>(n, vx, bx, info, nrc_x);
    }
#else
    mul_mat_qX_K_q8_K_IQ_N<Dequantizer, nrc_y>(n, vx, bx, info, nrc_x);
#endif
}
// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7299

/*
moonll iq1s
core func for iq1s mul_mat_iq1_s_q8_K

*/
// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L3813
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
template <int nrc_y>
static void mul_mat_iq1_s_q8_K(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    GGML_ASSERT(n%QK_K == 0);
    Q8<nrc_y, block_q8_K> q8(info);
    __m256i qx[8];
    __m256i scales[4];
    __m256  acc[nrc_y] = {};
    auto delta_mask = _mm_set1_epi16(-32768); // to avoid stupid overflow warnings when using 0x8000
    __m256i shuffle0 = _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100);
    for (int ix = 0; ix < nrc_x; ++ix) {
        auto iq1s = (const block_iq1_s *)((const char *)vx + ix*bx);
        for (int ibl = 0; ibl < n/QK_K; ++ibl) {
            float d = GGML_FP16_TO_FP32(iq1s[ibl].d);
            auto qhb = _mm_loadu_si128((const __m128i *)iq1s[ibl].qh);
            auto scales128 = _mm_and_si128(_mm_srli_epi16(qhb, 12), _mm_set1_epi16(7));
            scales128 = _mm_add_epi16(_mm_slli_epi16(scales128, 1), _mm_set1_epi16(1));
#ifdef HAVE_FANCY_SIMD
            auto mask = _mm_cmpeq_epi16_mask(_mm_and_si128(qhb, delta_mask), delta_mask);
            auto deltas128 = _mm_mask_blend_epi16(mask, _mm_set1_epi16(-7), _mm_set1_epi16(-9));
#else
            auto mask = _mm_cmpeq_epi16(_mm_and_si128(qhb, delta_mask), delta_mask);
            auto deltas128 = _mm_or_si128(_mm_and_si128(mask, _mm_set1_epi16(-9)), _mm_andnot_si128(mask, _mm_set1_epi16(-7)));
#endif
            deltas128 = _mm_mullo_epi16(scales128, deltas128);
            scales128 = _mm_slli_epi16(scales128, 3);
            auto deltas_l = _mm_unpacklo_epi16(deltas128, deltas128);
            auto deltas_h = _mm_unpackhi_epi16(deltas128, deltas128);
            auto deltas = MM256_SET_M128I(deltas_h, deltas_l); // blocks 0,0, 1,1, 2,2, ..., 7,7
            auto all_scales = MM256_SET_M128I(scales128, scales128);
            auto shuffle = shuffle0;
            for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
                scales[ib64] = _mm256_shuffle_epi8(all_scales, shuffle);
                shuffle = _mm256_add_epi8(shuffle, _mm256_set1_epi8(4));
            }
            const uint8_t  * qs = iq1s[ibl].qs;
            const uint16_t * qh = iq1s[ibl].qh;
            for (int ib = 0; ib < QK_K/32; ib += 2) {
                qx[ib+0] = _mm256_set_epi64x(iq1s_grid_us[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid_us[qs[2] | ((qh[ib+0] << 2) & 0x700)],
                                             iq1s_grid_us[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid_us[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
                qx[ib+1] = _mm256_set_epi64x(iq1s_grid_us[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid_us[qs[6] | ((qh[ib+1] << 2) & 0x700)],
                                             iq1s_grid_us[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid_us[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
                qs += 8;
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto bsums = q8.load_bsums(iy, ibl);
                auto sumi = _mm256_setzero_si256();
                for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
                    auto qy1 = q8.load_quants(iy, ibl, 2*ib64+0);
                    auto qy2 = q8.load_quants(iy, ibl, 2*ib64+1);
#ifdef HAVE_FANCY_SIMD
                    auto dot1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+0], qy1);
                    auto dot2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+1], qy2);
                    sumi = _mm256_dpwssd_epi32(sumi, scales[ib64], _mm256_packs_epi32(dot1, dot2));
#else
                    auto dot1 = _mm256_maddubs_epi16(qx[2*ib64+0], qy1);
                    auto dot2 = _mm256_maddubs_epi16(qx[2*ib64+1], qy2);
                    auto dot  = _mm256_add_epi16(_mm256_unpacklo_epi64(dot1, dot2), _mm256_unpackhi_epi64(dot1, dot2));
                    sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(scales[ib64], dot));
#endif
                }
#ifdef HAVE_FANCY_SIMD
                sumi = _mm256_dpwssd_epi32(sumi, bsums, deltas);
#else
                sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(bsums, deltas));
#endif
                acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d*q8.scale(iy, ibl)), _mm256_cvtepi32_ps(sumi), acc[iy]);
            }
        }
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, 0.125f*hsum_float_8(acc[iy]));
            acc[iy] = _mm256_setzero_ps();
        }
    }
}
// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L3813

/*
moonll iq1s
DequantizerIQ2XXS
DequantizerIQ2XXS is important Dequantizer for DequantizerIQ1_S
*/

// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8035
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
struct DequantizerIQ2XXS final : public BaseDequantizer<block_iq2_xxs> {
    DequantizerIQ2XXS(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}

    constexpr static int num_blocks = 8;

    union Data {
        __m256i vec;
        uint32_t val[8];
    };

    inline __m128i load_scales(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        const uint16_t * a16 = (const uint16_t *)x[i].qs;
        auto scales = _mm_srli_epi16(_mm_set_epi16(a16[31], a16[27], a16[23], a16[19], a16[15], a16[11], a16[7], a16[3]), 12);
        return _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi16(1));
    }

    inline void new_block(int i, __m256i * scales) {
        auto sc16 = load_scales(i);
        scales[0] = MM256_SET_M128I(sc16, sc16);
    }
    inline float new_block(int i, __m256i * scales, __m256i& mins) {
        auto sc16 = load_scales(i);
        mins = scb.shuffle(sc16);
        scales[0] = MM256_SET_M128I(sc16, sc16);
        return -d*minv;
    }

    inline static void make4(const uint32_t * aux32, __m256i * values) {
        const uint8_t * aux8 = (const uint8_t *)aux32;
        values[0] = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[ 1]], iq2xxs_grid[aux8[ 0]]);
        values[1] = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[ 9]], iq2xxs_grid[aux8[ 8]]);
        values[2] = _mm256_set_epi64x(iq2xxs_grid[aux8[19]], iq2xxs_grid[aux8[18]], iq2xxs_grid[aux8[17]], iq2xxs_grid[aux8[16]]);
        values[3] = _mm256_set_epi64x(iq2xxs_grid[aux8[27]], iq2xxs_grid[aux8[26]], iq2xxs_grid[aux8[25]], iq2xxs_grid[aux8[24]]);
    }

    IQK_ALWAYS_INLINE void sign_values(const uint32_t * aux32, __m256i * values) const {
#ifdef HAVE_FANCY_SIMD
        esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[3]), _mm_set1_epi32(aux32[1])), values+0);
        esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[7]), _mm_set1_epi32(aux32[5])), values+2);
#else
        esh.sign_value(aux32[1], values[0]);
        esh.sign_value(aux32[3], values[1]);
        esh.sign_value(aux32[5], values[2]);
        esh.sign_value(aux32[7], values[3]);
#endif
    }
    inline void make4_signed(const uint32_t * aux32, const __m256i& min_value, __m256i * values) const {
        make4(aux32, values);
        sign_values(aux32, values);
        for (int k = 0; k < 4; ++k) values[k] = _mm256_add_epi8(values[k], min_value);
    }
    inline void make4(const uint32_t * aux32, __m256i * values, __m256i * q8) const {
        make4(aux32, values);
        sign_values(aux32, q8);
    }
    inline void prepare(int i, int j) {
        Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j);
        make4_signed(data.val, min_value, bits.values);
    }
    inline void prepare(int i, int j, const Q8<1>& q8, __m256i * q8_quants) {
        for (int k = 0; k < 4; ++k) q8_quants[k] = q8.load_quants(0, i, 4*j+k);
        Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j);
        make4(data.val, bits.values, q8_quants);
    }

    constexpr static int minv = 43;
    SimpleBits bits;
    Scales8KBase scb;
    EvenSignHelper esh;
    const __m256i min_value = _mm256_set1_epi8(minv);
    const __m256i shuffle = _mm256_set_epi32(7, 5, 3, 1, 7, 5, 3, 1);
};

/*
moonll
add Q8_0_Unpacker && DequantizerIQ2XXS support
add func mul_mat_qX_K_q8_K_IQ
*/

// Copied/adapted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L9092
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
    if constexpr (std::is_same_v<Dequantizer, Q4_0_Unpacker> || std::is_same_v<Dequantizer, Q5_0_Unpacker> ||
        std::is_same_v<Dequantizer, Q8_0_Unpacker>) {
            m.funcs[0] = mul_mat_qX_0_q8_0_T<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qX_0_q8_0_T<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_0_q8_0_T<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_0_q8_0_T<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_0_q8_0_T<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_0_q8_0_T<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_0_q8_0_T<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_0_q8_0_T<Dequantizer, 8>;
        }
        else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker>|| std::is_same_v<Dequantizer, Q8_0_1_Unpacker>) {
            m.funcs[0] = mul_mat_qX_1_q8_1_T<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qX_1_q8_1_T<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_1_q8_1_T<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_1_q8_1_T<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_1_q8_1_T<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_1_q8_1_T<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_1_q8_1_T<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_1_q8_1_T<Dequantizer, 8>;
        }
        else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2XXS>) {
            m.funcs[0] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 8>;
            }
            else {
#ifdef HAVE_FANCY_SIMD
            if constexpr (std::is_same_v<Dequantizer, DequantizerIQ4XS>) {
            m.funcs[0] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 1>;
            m.funcs[1] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 2>;
            m.funcs[2] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 3>;
            m.funcs[3] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 4>;
            m.funcs[4] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 5>;
            m.funcs[5] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 6>;
            m.funcs[6] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 7>;
            m.funcs[7] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 8>;
            } else {
            m.funcs[0] = mul_mat_qX_K_q8_K_AVX512_1<Dequantizer>;
            m.funcs[1] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 8>;
            }
#else
            if constexpr (std::is_same_v<Dequantizer, DequantizerQ2K> ||
                          std::is_same_v<Dequantizer, DequantizerQ3K> ||
                          std::is_same_v<Dequantizer, DequantizerQ6K>) {
                m.funcs[0] = mul_mat_qY_K_q8_K_T<Dequantizer, 1>;
                m.funcs[1] = mul_mat_qY_K_q8_K_T<Dequantizer, 2>;
                m.funcs[2] = mul_mat_qY_K_q8_K_T<Dequantizer, 3>;
                m.funcs[3] = mul_mat_qY_K_q8_K_T<Dequantizer, 4>;
                m.funcs[4] = mul_mat_qY_K_q8_K_T<Dequantizer, 5>;
                m.funcs[5] = mul_mat_qY_K_q8_K_T<Dequantizer, 6>;
                m.funcs[6] = mul_mat_qY_K_q8_K_T<Dequantizer, 7>;
                m.funcs[7] = mul_mat_qY_K_q8_K_T<Dequantizer, 8>;
            } else {
                m.funcs[0] = mul_mat_qX_K_q8_K_T<Dequantizer, 1>;
                m.funcs[1] = mul_mat_qX_K_q8_K_T<Dequantizer, 2>;
                m.funcs[2] = mul_mat_qX_K_q8_K_T<Dequantizer, 3>;
                m.funcs[3] = mul_mat_qX_K_q8_K_T<Dequantizer, 4>;
                m.funcs[4] = mul_mat_qX_K_q8_K_T<Dequantizer, 5>;
                m.funcs[5] = mul_mat_qX_K_q8_K_T<Dequantizer, 6>;
                m.funcs[6] = mul_mat_qX_K_q8_K_T<Dequantizer, 7>;
                m.funcs[7] = mul_mat_qX_K_q8_K_T<Dequantizer, 8>;
            }
#endif
        }
}
// end copied/adapted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L9092

// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8622
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
struct QFBase {
    #ifdef __AVX512F__
        constexpr static int k_step = 16;
        using Data = __m512;
        using Acc  = __m512;
        static inline Data load(const ggml_half * x) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)x)); }
        static inline Data load(const float * x) { return _mm512_loadu_ps(x); }
        static inline Data load(const ggml_bf16_t * x) {
            return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)x)), 16));
        }
        static inline Acc acc(Acc prev, const Data& y, const Data& x) {
            return _mm512_fmadd_ps(y, x, prev);
        }
        static inline Acc acc_first(const Data& y, const Data& x) {
            return _mm512_mul_ps(y, x);
        }
        static inline Acc add(Acc x, Acc y) { return _mm512_add_ps(x, y); }
        static inline float hsum(Acc acc) {
            return _mm512_reduce_add_ps(acc);
        }
        template <typename Float>
        static inline Data load4Floats(const Float * x) {
            return _mm512_insertf32x4(_mm512_setzero_ps(), load128(x), 0);
        }
        static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) {
            acc = _mm512_fmadd_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00), acc);
            acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc);
            acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc);
            acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc);
            return acc;
        }
        static inline Acc acc_r4_first(const Data * xv, const Data& yv) {
            auto acc = _mm512_mul_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00));
            acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc);
            acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc);
            acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc);
            return acc;
        }
        static inline __m128 hsum_r4(Acc acc) {
            auto sum1 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 0), _mm512_extractf32x4_ps(acc, 1));
            auto sum2 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 2), _mm512_extractf32x4_ps(acc, 3));
            return _mm_add_ps(sum1, sum2);
        }
    #else
        constexpr static int k_step = 8;
        using Data = __m256;
        using Acc  = __m256;
        static inline Data load(const ggml_half * x) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)x)); }
        static inline Data load(const float * x) { return _mm256_loadu_ps(x); }
        static inline Data load(const ggml_bf16_t * x) {
            return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)x)), 16));
        }
        static inline Acc acc(Acc prev, const Data& y, const Data& x) {
            return _mm256_fmadd_ps(y, x, prev);
        }
        static inline Acc add(Acc x, Acc y) { return _mm256_add_ps(x, y); }
        static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) {
            acc = _mm256_fmadd_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00), acc);
            acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc);
            acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc);
            acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc);
            return acc;
        }
        static inline Acc acc_r4_first(const Data * xv, const Data& yv) {
            auto acc = _mm256_mul_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00));
            acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc);
            acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc);
            acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc);
            return acc;
        }
        static inline Acc acc_first(const Data& y, const Data& x) {
            return _mm256_mul_ps(y, x);
        }
        static inline float hsum(Acc acc) {
            return hsum_float_8(acc);
        }
        static inline __m128 hsum_r4(Acc acc) {
            return _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
        }
        template <typename Float>
        static inline Data load4Floats(const Float * x) {
            return _mm256_insertf128_ps(_mm256_setzero_ps(), load128(x), 0);
        }
    #endif
        static inline __m128 load128(const ggml_half * x) { return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x)); }
        static inline __m128 load128(const float * x) { return _mm_loadu_ps(x); }
        static inline __m128 load128(const ggml_bf16_t * x) {
            return _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*)x)), 16));
        }
    };
    template <typename Float, int nrc_in> struct QFT final : public QFBase {
        constexpr static int nrc = nrc_in;
        QFT(const DataInfo& info) {
            for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)info.src1_row(iy);
        }
        QFT(const char * cx, size_t bx) {
            for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)(cx + iy*bx);
        }
        IQK_ALWAYS_INLINE Data load1(int iy, int i) const { return load(y[iy] + k_step*i); }
        IQK_ALWAYS_INLINE Data load_tail(int iy, int i) const { return load4Floats(y[iy] + 4*i); }
        IQK_ALWAYS_INLINE void load_r4(int ix, int i, Data * xv) const {
            xv[0] = load1(ix+0, i);
            xv[1] = load1(ix+1, i);
            xv[2] = load1(ix+2, i);
            xv[3] = load1(ix+3, i);
    #ifdef __AVX512F__
            auto t0 = _mm512_unpacklo_ps(xv[0], xv[1]);
            auto t1 = _mm512_unpacklo_ps(xv[2], xv[3]);
            auto t2 = _mm512_unpackhi_ps(xv[0], xv[1]);
            auto t3 = _mm512_unpackhi_ps(xv[2], xv[3]);
            xv[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1)));
            xv[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1)));
            xv[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3)));
            xv[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3)));
    #else
            auto t0 = _mm256_unpacklo_ps(xv[0], xv[1]);
            auto t1 = _mm256_unpacklo_ps(xv[2], xv[3]);
            auto t2 = _mm256_unpackhi_ps(xv[0], xv[1]);
            auto t3 = _mm256_unpackhi_ps(xv[2], xv[3]);
            xv[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1)));
            xv[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1)));
            xv[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3)));
            xv[3] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3)));
    #endif
        }
        const Float * y[nrc];
    };
    

template <typename Qy, typename Qx>
IQK_NOINLINE void mul_mat_Qx_Qy_MxN(int n, const char * cx, size_t bx, int ix0, const DataInfo& info) {
    int nb = n/QFBase::k_step;
    int nb4 = n/4;
    Qy y(info);
    Qx x(cx + ix0*bx, bx);
    QFBase::Data xv[Qx::nrc];
    QFBase::Acc  acc[Qx::nrc*Qy::nrc];
    auto yv = y.load1(0, 0);
    for (int ix = 0; ix < Qx::nrc; ++ix) {
        xv[ix] = x.load1(ix, 0);
        acc[ix] = QFBase::acc_first(yv, xv[ix]);
    }
    for (int iy = 1; iy < Qy::nrc; ++iy) {
        yv = y.load1(iy, 0);
        for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc_first(yv, xv[ix]);
    }
    for (int i = 1; i < nb; ++i) {
        yv = y.load1(0, i);
        for (int ix = 0; ix < Qx::nrc; ++ix) {
            xv[ix] = x.load1(ix, i);
            acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]);
        }
        for (int iy = 1; iy < Qy::nrc; ++iy) {
            yv = y.load1(iy, i);
            for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
        }
    }
    for (int i = (QFBase::k_step/4)*nb; i < nb4; ++i) {
        yv = y.load_tail(0, i);
        for (int ix = 0; ix < Qx::nrc; ++ix) {
            xv[ix] = x.load_tail(ix, i);
            acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]);
        }
        for (int iy = 1; iy < Qy::nrc; ++iy) {
            yv = y.load_tail(iy, i);
            for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
        }
    }
    for (int iy = 0; iy < Qy::nrc; ++iy) for (int ix = 0; ix < Qx::nrc; ++ix) info.store(ix0+ix, iy, QFBase::hsum(acc[Qx::nrc*iy+ix]));
}
// This will handle any of f16 x f32, f32 x f16, f16 x f16, f32 x f32, with computations done
// in f32 (i.e., f16 is first converted to f32). It is easy to extend to computations done in
// f16, but I don't have a CPU capable of f16 vector arithmetic, so not doing it for now.
template <int nrc_y, typename FloatX, typename FloatY>
void mul_mat_fX_fY_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    const char * cx = (const char *)vx;
    // TBD if we want this
    //if constexpr (nrc_y == 1) {
    //    constexpr int k_nx = 2;
    //    for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
    //        mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, k_nx>>(n, cx, bx, ix*k_nx, info);
    //    }
    //    if (int lastx = k_nx*(nrc_x/k_nx); lastx < nrc_x) {
    //        int nx = nrc_x - lastx;
    //        switch (nx) {
    //            case 1: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info); break;
    //            case 2: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, lastx, info); break;
    //            case 3: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, lastx, info); break;
    //        }
    //        //mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info);
    //    }
    //    return;
    //}
#ifdef __AVX512F__
    constexpr int k_nx = 5;
#else
    constexpr int k_nx = nrc_y == 1 ? 4 : 2;
#endif
    for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
        mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, k_nx>>(n, cx, bx, ix*k_nx, info);
    }
    int last_x = k_nx*(nrc_x/k_nx);
    if (last_x == nrc_x) return;
    int nx = nrc_x - last_x;
#ifdef __AVX512F__
    switch (nx) {
        case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
        case 2: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, last_x, info); break;
        case 3: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, last_x, info); break;
        case 4: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 4>>(n, cx, bx, last_x, info); break;
    }
#else
    if constexpr (nrc_y == 1) {
        switch (nx) {
            case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
            case 2: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, last_x, info); break;
            case 3: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, last_x, info); break;
        }
    } else {
        switch (nx) {
            case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
        }
    }
#endif
}

template <typename FloatX, typename FloatY>
void set_mul_mat_f(MulMat& mm) {
    for (auto& f : mm.funcs) f = nullptr;
    mm.funcs[0] = mul_mat_fX_fY_T<1, FloatX, FloatY>;
    mm.funcs[1] = mul_mat_fX_fY_T<2, FloatX, FloatY>;
    mm.funcs[2] = mul_mat_fX_fY_T<3, FloatX, FloatY>;
    mm.funcs[3] = mul_mat_fX_fY_T<4, FloatX, FloatY>;
    mm.funcs[4] = mul_mat_fX_fY_T<5, FloatX, FloatY>;
#ifndef __AVX512F__
    mm.funcs[5] = mul_mat_fX_fY_T<6, FloatX, FloatY>;
#endif
}
// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8622

/*
moonll
add typeb TO compare return not expected type of weight matrix
add IQ2XSS
add IQ1_S
add GGML_TYPE_IQ4_XS
*/

// Modifications extracted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L9231
// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
bool MulMat::set_mul_mat(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
    (void)Ny;

        auto expected_typeB = GGML_TYPE_Q8_K;
    switch (typeA) {
        case GGML_TYPE_Q2_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ2K>(mm);
            break;
        case GGML_TYPE_Q3_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ3K>(mm);
            break;
        case GGML_TYPE_Q4_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ4K>(mm);
            break;
        case GGML_TYPE_Q5_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ5K>(mm);
            break;
        case GGML_TYPE_Q6_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ6K>(mm);
            break;
        case GGML_TYPE_IQ4_XS:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerIQ4XS>(mm);
            break;
        case GGML_TYPE_IQ2_XXS:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerIQ2XXS>(mm);
            break;
        case GGML_TYPE_Q4_0:
            assert (ne00 % QK4_0 == 0);
            MulMat::set_functions<Q4_0_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_0;
            break;
        case GGML_TYPE_Q4_1:
            assert (ne00 % QK4_1 == 0);
            MulMat::set_functions<Q4_1_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_1_X4;
            break;
        case GGML_TYPE_Q5_0:
            assert (ne00 % QK5_0 == 0);
            MulMat::set_functions<Q5_0_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_0;
            break;
        case GGML_TYPE_Q5_1:
            assert (ne00 % QK5_1 == 0);
            MulMat::set_functions<Q5_1_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_1_X4;
            break;
        case GGML_TYPE_Q8_0:
            assert (ne00 % QK8_0 == 0);
#ifdef HAVE_FANCY_SIMD
            MulMat::set_functions<Q8_0_1_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_1_X4;
#else
            MulMat::set_functions<Q8_0_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_0_X4;
#endif
            break;
        case GGML_TYPE_IQ1_S:
            mm.funcs[0] = mul_mat_iq1_s_q8_K<1>;
            mm.funcs[1] = mul_mat_iq1_s_q8_K<2>;
            mm.funcs[2] = mul_mat_iq1_s_q8_K<3>;
            mm.funcs[3] = mul_mat_iq1_s_q8_K<4>;
            mm.funcs[4] = mul_mat_iq1_s_q8_K<5>;
            mm.funcs[5] = mul_mat_iq1_s_q8_K<6>;
            mm.funcs[6] = mul_mat_iq1_s_q8_K<7>;
            mm.funcs[7] = mul_mat_iq1_s_q8_K<8>;
        #ifdef HAVE_FANCY_SIMD
             mm.func16 = mul_mat_iq1_s_q8_K<16>;
        #endif
       // row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
              expected_typeB = GGML_TYPE_Q8_K;
            break;

        default:
        {
            // printf("case:%d",typeA);
            return false;
        }
            
    }


    return ggml_type(typeB) == expected_typeB;

}
// end extracted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L9231

} // namespace

/*
iq1_s is not support for arm
*/
#else   // __aarch64__

namespace {

template <int nrc, typename block_q8 = block_q8_K> struct Q8 {

    constexpr static int nrc_y = nrc;

    Q8(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
    }

    inline int8x16_t load_quants_16(int iy, int i, int j) const { return vld1q_s8(y[iy][i].qs + 16*j); }
    inline int8x16x2_t load_quants(int iy, int i, int j) const { return vld1q_s8_x2(y[iy][i].qs + 32*j); }
    inline int8x16x4_t load_quants_64(int iy, int i, int j) const { return vld1q_s8_x4(y[iy][i].qs + 64*j); }
    inline int16x8x2_t load_bsums(int iy, int i) const { return vld1q_s16_x2(y[iy][i].bsums); }
    inline int16x8_t load_bsums8(int iy, int i) const {
        auto q8s = vld1q_s16_x2(y[iy][i].bsums);
        return vpaddq_s16(q8s.val[0], q8s.val[1]);
    }
    inline float scale(int iy, int i) const { return y[iy][i].d; }

    const block_q8 * y[nrc_y];
};

template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);

    Dequantizer deq(vx, bx, nrc_y);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[nrc_y];
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

//#pragma GCC unroll 4
        for (int i = 0; i < nb; ++i) {

            int32x4_t sumi[nrc_y];
            for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0);

            if constexpr (nrc_y > 1 && Dequantizer::should_scale_quants()) {
                deq.process_scales(i, q8, acc);
                deq.prepare(i, 0);
                deq.compute(q8, i, 0, sumi);
                deq.prepare(i, 1);
                deq.compute(q8, i, 1, sumi);
            } else {
                if constexpr (Dequantizer::num_blocks() == 8) {
                    auto scales = deq.new_block(i, q8, acc);
                    deq.prepare(i, 0);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                    deq.prepare(i, 1);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
                }
                else if constexpr (Dequantizer::num_blocks() == 16) {
                    auto scales = deq.new_block(i, q8, acc);
                    deq.prepare(i, 0);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                    deq.prepare(i, 1);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
                }
                else {
                    GGML_ASSERT(false);
                }
            }

#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i)));
            }
        }

#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}
template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);

    Dequantizer deq(vx, bx, nrc_y);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[nrc_y];
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        for (int i = 0; i < nb; ++i) {

            int32x4_t sumi[nrc_y];
            for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0);

            if constexpr (Dequantizer::num_blocks() == 8) {
                auto scales = deq.new_block(i);
                deq.prepare(i, 0);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                deq.prepare(i, 1);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
            }
            else if constexpr (Dequantizer::num_blocks() == 16) {
                auto scales = deq.new_block(i);
                deq.prepare(i, 0);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                deq.prepare(i, 1);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
            }
            else {
                GGML_ASSERT(false);
            }
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i)));
            }
        }
#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

template <typename Q8>
IQK_ALWAYS_INLINE void compute_8_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8,
        const int32x4x2_t& scales, int iy, int i, int j, int32x4_t& sumi) {
    auto mzero = vdupq_n_s32(0);
    const int8x16_t * qs_1 = (const int8x16_t *)qx_1.val;
    const int8x16_t * qs_2 = (const int8x16_t *)qx_2.val;

    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[0], q8b_1.val[0]), qs_1[1], q8b_1.val[1]); // block 1
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[2], q8b_2.val[0]), qs_1[3], q8b_2.val[1]); // block 2
    auto p12 = vpaddq_s32(p1, p2);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[0], q8b_3.val[0]), qs_2[1], q8b_3.val[1]); // block 3
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[2], q8b_4.val[0]), qs_2[3], q8b_4.val[1]); // block 4
    auto p34 = vpaddq_s32(p3, p4);

    auto pall = vpaddq_s32(p12, p34);
    sumi = vmlaq_s32(sumi, scales.val[j], pall);
}
template <typename Q8>
IQK_ALWAYS_INLINE void compute_8_blocks(const int8x16_t * qx, const Q8& q8,
        const int32x4_t& scales, int iy, int i, int j, int32x4_t& sumi) {
    auto mzero = vdupq_n_s32(0);

    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[0], q8b_1.val[0]), qx[1], q8b_1.val[1]); // block 1
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[2], q8b_2.val[0]), qx[3], q8b_2.val[1]); // block 2
    auto p12 = vpaddq_s32(p1, p2);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[4], q8b_3.val[0]), qx[5], q8b_3.val[1]); // block 3
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[6], q8b_4.val[0]), qx[7], q8b_4.val[1]); // block 4
    auto p34 = vpaddq_s32(p3, p4);

    auto pall = vpaddq_s32(p12, p34);
    sumi = vmlaq_s32(sumi, scales, pall);
}

template <typename Q8>
IQK_ALWAYS_INLINE void compute_16_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8,
        const int32x4x4_t& scales, int iy, int i, int j, int32x4_t& sumi) {

    auto mzero = vdupq_n_s32(0);
    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[0]), q8b_1.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[1]), q8b_1.val[1])); // blocks 0, 0, 1, 1,
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[2]), q8b_2.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[3]), q8b_2.val[1])); // blocks 3, 3, 4, 4,
    auto p12 = vpaddq_s32(p1, p2); // blocks 0, 1, 2, 3
    sumi = vmlaq_s32(sumi, scales.val[2*j+0], p12);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[0]), q8b_3.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[1]), q8b_3.val[1])); // block 4, 4, 5, 5,
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[2]), q8b_4.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[3]), q8b_4.val[1])); // block 6, 6, 7, 7,
    auto p34 = vpaddq_s32(p3, p4); // blocks 4, 5, 6, 7
    sumi = vmlaq_s32(sumi, scales.val[2*j+1], p34);
}

template <typename Q8>
inline void accum_mins_8(const int16x8_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto q8s = q8.load_bsums8(iy, i);
        int32x4_t b1 = vmull_s16(vget_low_s16(mins), vget_low_s16(q8s));
        int32x4_t b2 = vmull_s16(vget_high_s16(mins), vget_high_s16(q8s));
        float32x4_t prod = vcvtq_f32_s32(vaddq_s32(b1, b2));
        acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i)));
    }
}
template <typename Q8>
inline void accum_mins_16(const int16x8x2_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto q8s = q8.load_bsums(iy, i);
        int32x4_t b1 = vmull_s16(vget_low_s16 (mins.val[0]), vget_low_s16 (q8s.val[0]));
        int32x4_t b2 = vmull_s16(vget_high_s16(mins.val[0]), vget_high_s16(q8s.val[0]));
        int32x4_t b3 = vmull_s16(vget_low_s16 (mins.val[1]), vget_low_s16 (q8s.val[1]));
        int32x4_t b4 = vmull_s16(vget_high_s16(mins.val[1]), vget_high_s16(q8s.val[1]));
        float32x4_t prod = vcvtq_f32_s32(vaddq_s32(vaddq_s32(b1, b2), vaddq_s32(b3, b4)));
        acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i)));
    }
}

struct Scales8 {
    uint32_t utmp[4];
    const uint8_t * sc8 = (const uint8_t *)utmp;
    template <typename Q8, typename Qx>
    inline int32x4x2_t process_scales_mins(const Qx& x, const Q8& q8, int i, float32x4_t * acc) {
        make_q4_scales(x.scales, utmp);
        int16x8_t mins = vmovl_s8(vld1_s8((const int8_t *)sc8 + 8));
        accum_mins_8(mins, q8, acc, i, -GGML_FP16_TO_FP32(x.dmin));

        uint8x8_t scales8 = vld1_u8(sc8);
        uint16x8_t scales16 = vmovl_u8(scales8);
        int32x4x2_t scales = {vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales16))),
                              vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales16)))};
        return scales;
    }
};

struct Q4bits {
    const uint8x16_t m4b = vdupq_n_u8(0xf);
    uint8x16x4_t b1, b2;
    inline void prepare4(uint8x16x4_t& b, const uint8x16_t * val) const {
        b.val[0] = vandq_u8(val[0], m4b);
        b.val[2] = vshrq_n_u8(val[0], 4);
        b.val[1] = vandq_u8(val[1], m4b);
        b.val[3] = vshrq_n_u8(val[1], 4);
    }
    inline void prepare4_16(uint8x16x4_t& b, const uint8x16_t * val) const {
        b.val[0] = vandq_u8(val[0], m4b);
        b.val[1] = vshrq_n_u8(val[0], 4);
        b.val[2] = vandq_u8(val[1], m4b);
        b.val[3] = vshrq_n_u8(val[1], 4);
    }
    inline void prepare(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x2(qs);
        prepare4(b1, q4bits.val);
        q4bits = vld1q_u8_x2(qs+32);
        prepare4(b2, q4bits.val);
    }
    inline void prepare_v2(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        prepare4(b1, q4bits.val+0);
        prepare4(b2, q4bits.val+2);
    }
    inline void prepare64(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        b1.val[0] = vandq_u8(q4bits.val[0], m4b);
        b1.val[1] = vandq_u8(q4bits.val[1], m4b);
        b1.val[2] = vandq_u8(q4bits.val[2], m4b);
        b1.val[3] = vandq_u8(q4bits.val[3], m4b);
        b2.val[0] = vshrq_n_u8(q4bits.val[0], 4);
        b2.val[1] = vshrq_n_u8(q4bits.val[1], 4);
        b2.val[2] = vshrq_n_u8(q4bits.val[2], 4);
        b2.val[3] = vshrq_n_u8(q4bits.val[3], 4);
    }
    inline void prepare16(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x2(qs);
        prepare4_16(b1, q4bits.val);
        q4bits = vld1q_u8_x2(qs+32);
        prepare4_16(b2, q4bits.val);
    }
    inline void prepare16_v2(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        prepare4_16(b1, q4bits.val+0);
        prepare4_16(b2, q4bits.val+2);
    }
};

struct Q2bits {
    const uint8x16_t m4b = vdupq_n_u8(0x03);
    uint8x16x4_t b1, b2;
    inline void prepare(const uint8_t * qs) {
        auto q2bits = vld1q_u8_x2(qs);
        b1.val[0] = vandq_u8(q2bits.val[0], m4b);
        b1.val[1] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b1.val[2] = vandq_u8(q2bits.val[0], m4b);
        b1.val[3] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b2.val[0] = vandq_u8(q2bits.val[0], m4b);
        b2.val[1] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b2.val[2] = vandq_u8(q2bits.val[0], m4b);
        b2.val[3] = vandq_u8(q2bits.val[1], m4b);
    }
};

template <typename block_q>
struct BaseDequantizer {
    BaseDequantizer(const void * vx, size_t bx, int nrc) : vx(vx), x(nullptr), bx(bx), nrc(nrc) {}
    inline void new_row(int ix) { x = (const block_q *)((const char *)vx + ix*bx); }
    const void * vx;
    const block_q * x;
    const size_t bx;
    const int nrc;
};

struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return s8.process_scales_mins(x[i], q8, i, acc);
    }
    inline void prepare(int i, int j) {
        if (nrc == 1) bits.prepare_v2(x[i].qs+64*j);
        else bits.prepare(x[i].qs+64*j);
    }

    Q4bits bits;
    Scales8 s8;

    float d;
};

struct HighBit5 {
    const uint8x16_t mhb = vdupq_n_u8(0x10);
    uint8x16x2_t bits;
    inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) {
        b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 4), mhb));
        b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 4), mhb));
        b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 3), mhb));
        b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 3), mhb));

        b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb));
        b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb));
        b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb));
        b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb));

        if (do_shift) {
            bits.val[0] = vshrq_n_u8(bits.val[0], 4);
            bits.val[1] = vshrq_n_u8(bits.val[1], 4);
        }
    }
};

struct HighBit3 {
    const uint8x16_t mhb = vdupq_n_u8(0x04);
    uint8x16x2_t bits;
    inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) {
        b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb));
        b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb));
        b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb));
        b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb));

        b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(bits.val[0], mhb));
        b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(bits.val[1], mhb));
        b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshrq_n_u8(bits.val[0], 1), mhb));
        b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshrq_n_u8(bits.val[1], 1), mhb));

        if (do_shift) {
            bits.val[0] = vshrq_n_u8(bits.val[0], 4);
            bits.val[1] = vshrq_n_u8(bits.val[1], 4);
        }
    }
};

struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        h.bits = vld1q_u8_x2(x[i].qh);
        return s8.process_scales_mins(x[i], q8, i, acc);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+64*j);
        h.apply(bits.b1, bits.b2, j == 0);
    }

    Q4bits bits;
    HighBit5 h;
    Scales8 s8;

    uint8x16x2_t hbits;

    float d;
};

inline int32x4x4_t make_wider(const int16x8x2_t& scales16) {
    int32x4x4_t scales = {
        vmovl_s16(vget_low_s16 (scales16.val[0])),
        vmovl_s16(vget_high_s16(scales16.val[0])),
        vmovl_s16(vget_low_s16 (scales16.val[1])),
        vmovl_s16(vget_high_s16(scales16.val[1])),
    };
    return scales;
}

template <typename Q8>
inline int32x4x4_t process_scales_mins_16(const int8x16_t& scales8, const Q8& q8, float32x4_t * acc, int i, float c) {
    int16x8x2_t scales16;
    scales16.val[0] = vmovl_s8(vget_low_s8(scales8));
    scales16.val[1] = vmovl_s8(vget_high_s8(scales8));
    accum_mins_16(scales16, q8, acc, i, c);
    return make_wider(scales16);
}

struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return process_scales_mins_16(vld1q_s8(x[i].scales), q8, acc, i, -32.f*d);
    }
    inline void prepare(int i, int j) {

        auto hbits = vld1q_u8_x2(x[i].qh + 32*j);

        bits.prepare64(x[i].ql+64*j);
        bits.b1.val[0] = vorrq_u8(bits.b1.val[0], vandq_u8(vshlq_n_u8(hbits.val[0], 4), mhb));
        bits.b1.val[1] = vorrq_u8(bits.b1.val[1], vandq_u8(vshlq_n_u8(hbits.val[1], 4), mhb));
        bits.b1.val[2] = vorrq_u8(bits.b1.val[2], vandq_u8(vshlq_n_u8(hbits.val[0], 2), mhb));
        bits.b1.val[3] = vorrq_u8(bits.b1.val[3], vandq_u8(vshlq_n_u8(hbits.val[1], 2), mhb));

        bits.b2.val[0] = vorrq_u8(bits.b2.val[0], vandq_u8(hbits.val[0], mhb));
        bits.b2.val[1] = vorrq_u8(bits.b2.val[1], vandq_u8(hbits.val[1], mhb));
        bits.b2.val[2] = vorrq_u8(bits.b2.val[2], vandq_u8(vshrq_n_u8(hbits.val[0], 2), mhb));
        bits.b2.val[3] = vorrq_u8(bits.b2.val[3], vandq_u8(vshrq_n_u8(hbits.val[1], 2), mhb));

    }

    Q4bits bits;

    const uint8x16_t mhb = vdupq_n_u8(0x30);

    float d;
};

struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        h.bits = vld1q_u8_x2(x[i].hmask);
        const uint16_t * sc16 = (const uint16_t *)x[i].scales;
        uint32_t aux0 = sc16[0] | (sc16[1] << 16);
        uint32_t aux1 = sc16[2] | (sc16[3] << 16);
        uint32_t aux2 = sc16[4] | (sc16[5] << 16);
        aux32[0] =  (aux0       & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030);
        aux32[1] =  (aux1       & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030);
        aux32[2] = ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030);
        aux32[3] = ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030);
        return process_scales_mins_16(vaddq_s8(vld1q_s8((const int8_t *)aux32), vdupq_n_s8(-32)), q8, acc, i, -4.f*d);
    }

    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+32*j);
        h.apply(bits.b1, bits.b2, j == 0);
    }

    uint32_t aux32[4];

    Q2bits bits;

    HighBit3 h;

    float d;
};

struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return true; }

    template <typename Q8>
    inline void process_scales(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        auto scales_and_mins = vld1q_u8(x[i].scales);
        auto mins8 = vreinterpretq_s8_u8(vshrq_n_u8(scales_and_mins, 4));
        int16x8x2_t scales16;
        scales16.val[0] = vmovl_s8(vget_low_s8(mins8));
        scales16.val[1] = vmovl_s8(vget_high_s8(mins8));
        accum_mins_16(scales16, q8, acc, i, -GGML_FP16_TO_FP32(x[i].dmin));

        scales8 = vandq_u8(scales_and_mins, vdupq_n_u8(0xf));
    }

    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        process_scales(i, q8, acc);
        int16x8x2_t scales16;
        scales16.val[0] = vmovl_s8(vget_low_s8(vreinterpretq_s8_u8(scales8)));
        scales16.val[1] = vmovl_s8(vget_high_s8(vreinterpretq_s8_u8(scales8)));
        return make_wider(scales16);
    }

    template <typename Q8>
    inline void compute(const Q8& q8, int i, int j, int32x4_t * sumi) {
        auto m1 = vdupq_n_u8(1);
        auto shuffle = vdupq_n_u8(8*j);
        bits.b1.val[0] = vmulq_u8(bits.b1.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[1] = vmulq_u8(bits.b1.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[2] = vmulq_u8(bits.b1.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[3] = vmulq_u8(bits.b1.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[0] = vmulq_u8(bits.b2.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[1] = vmulq_u8(bits.b2.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[2] = vmulq_u8(bits.b2.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[3] = vmulq_u8(bits.b2.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[0]), q8b_1.val[0]),
                    vreinterpretq_s8_u8(bits.b1.val[1]), q8b_1.val[1]);

            auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[2]), q8b_2.val[0]),
                    vreinterpretq_s8_u8(bits.b1.val[3]), q8b_2.val[1]);

            auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[0]), q8b_3.val[0]),
                    vreinterpretq_s8_u8(bits.b2.val[1]), q8b_3.val[1]);

            auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[2]), q8b_4.val[0]),
                    vreinterpretq_s8_u8(bits.b2.val[3]), q8b_4.val[1]);
        }
    }

    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+32*j);
    }

    uint32_t aux32[4];

    uint8x16_t scales8;

    Q2bits bits;

    float d;
};

// ============================= i-quants

struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {

    static int8x16_t load_values() {
        static const int8_t iq4nl_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
        return vld1q_s8(iq4nl_values);
    }

    DequantizerIQ4XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc), values(load_values()) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    inline void new_row(int ix) { x = (const block_iq4_xs *)((const char *)vx + bx*ix); }

    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        (void)q8;
        (void)acc;
        d = GGML_FP16_TO_FP32(x[i].d);
        const uint16_t scales_h = x[i].scales_h;
        const uint16_t * scales_l = (const uint16_t *)x[i].scales_l;
        aux32[0] = scales_l[0] | (scales_l[1] << 16);
        aux32[1] = aux32[0] >> 4;
        // scl is ordered as 0, 2, 4, 6, 1, 3, 5, 7
        uint8x8_t scl8 = vand_u8(vld1_u8((const uint8_t *)aux32), vdup_n_u8(0xf));
        uint16_t * aux16 = (uint16_t *)aux32;
        aux16[0] = scales_h << 4; aux16[1] = scales_h << 2; aux16[2] = scales_h; aux16[3] = scales_h >> 2;
        // sch is ordered as 0, 4, 1, 5, 2, 6, 3, 7
        uint8x8_t sch8 = vand_u8(vld1_u8((const uint8_t *)aux16), vdup_n_u8(0x30));
        int8x8_t scales8 = vadd_s8(vreinterpret_s8_u8(vorr_u8(scl8, vtbl1_u8(sch8, vreinterpret_u8_u32(hshuff)))), vdup_n_s8(-32));
        // shuffle 0, 2, 4, 6, 1, 3, 5, 7 -> 0, 1, 2, 3, 4, 5, 6, 7
        scales8 = vtbl1_s8(scales8, vreinterpret_s8_u32(hshuff));
        int16x8_t scales16 = vmovl_s8(scales8);
        int32x4x2_t scales = {vmovl_s16(vget_low_s16(scales16)), vmovl_s16(vget_high_s16(scales16))};
        return scales;
    }
    inline void prepare(int i, int j) {
        bits.prepare16(x[i].qs+64*j);
        for (int k = 0; k < 4; ++k) {
            bits.b1.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b1.val[k]));
            bits.b2.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b2.val[k]));
        }
    }

    Q4bits bits;
    const int8x16_t values;
    uint32_t aux32[2];

    constexpr static uint32x2_t hshuff = {0x05010400, 0x07030602};

    float d;
};

struct SimpleBits {
    uint8x16x4_t b1;
    uint8x16x4_t b2;
};

IQK_ALWAYS_INLINE int32x4x2_t prepare_scales_8(const uint32x4_t& v1, const uint32x4_t& v2) {
    int32x4x2_t scales;
    auto one = vdupq_n_u32(1);
    scales.val[0] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v1, 28), 1));
    scales.val[1] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v2, 28), 1));
    return scales;
}

inline void apply_signs_2(uint8x16_t * b, const uint64_t * signs, uint32_t sidx) {
    auto s1 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >> 0) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >> 7) & 127))));
    auto s2 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >>14) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >>21) & 127))));
    b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s1));
    b[1] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[1]), s2));
}

IQK_ALWAYS_INLINE int32x4_t prepare_scales_8(const uint32x4_t& v1) {
    return vreinterpretq_s32_u32(vsliq_n_u32(vdupq_n_u32(1), vshrq_n_u32(v1, 28), 1));
}

struct DequantizerIQ2XXS final : public BaseDequantizer<block_iq2_xxs> {
    DequantizerIQ2XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    IQK_ALWAYS_INLINE float new_block(int i) const { return 0.125f * GGML_FP16_TO_FP32(x[i].d); }

    inline int32x4_t unpack(int i, int j, uint8x16_t * q) const {
        auto data = vld1q_u32_x2((const uint32_t *)(x[i].qs + 16*j));
        prepare_all(data, q);
        return prepare_scales_8(vuzp2q_u32(data.val[0], data.val[1]));
    }

private:

    static inline void prepare2(uint8x16_t * b, const uint32_t * bits, const uint64_t * signs) {
        const uint8_t * idx = (const uint8_t *)bits;
        b[0] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[0]], iq2xxs_grid[idx[1]]});
        b[1] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[2]], iq2xxs_grid[idx[3]]});
        apply_signs_2(b, signs, bits[1]);
    }

    inline static void prepare_all(const uint32x4x2_t& data, uint8x16_t * quants) {
        const uint32_t * q2 = (const uint32_t *)data.val;
        prepare2(quants+0, q2+0, keven_signs);
        prepare2(quants+2, q2+2, keven_signs);
        prepare2(quants+4, q2+4, keven_signs);
        prepare2(quants+6, q2+6, keven_signs);
    }
};

inline int32x4x4_t prepare_4bit_scales16(const uint8_t * sc) {
    auto aux = vld1_u8(sc);
    auto scales_l = vand_u8(aux, vdup_n_u8(0xf));
    auto scales_h = vshr_n_u8(aux, 4);
    auto aux1 = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h));

    auto scales8 = vreinterpretq_s8_u8(vorrq_u8(vshlq_n_u8(aux1, 1), vdupq_n_u8(1)));
    int16x8x2_t scales16 = { vmovl_s8(vget_low_s8(scales8)), vmovl_s8(vget_high_s8(scales8)) };
    return make_wider(scales16);
}

struct DequantizerIQ2XS final : public BaseDequantizer<block_iq2_xs> {
    DequantizerIQ2XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x4_t new_block(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        prepare_internal(i, 0);
        return prepare_4bit_scales16(x[i].scales);
    }

    inline void prepare(int i, int j) {
        if (j == 1) prepare_internal(i, 1);
    }

private:

    static void make2(const uint16_t * qs, uint8x16_t * b) {
        auto v1 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[0] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[1] & 511))));
        auto v2 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[2] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[3] & 511))));
        auto s1 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[0] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[1] >> 9))));
        auto s2 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[2] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[3] >> 9))));
        b[0] = vreinterpretq_u8_s8(vmulq_s8(v1, s1));
        b[1] = vreinterpretq_u8_s8(vmulq_s8(v2, s2));
    }

    inline static void make4(const uint16_t * qs, uint8x16_t * b) {
        make2(qs + 0, b + 0);
        make2(qs + 4, b + 2);
    }

    IQK_ALWAYS_INLINE void prepare_internal(int i, int j) {
        make4(x[i].qs + 16*j + 0, bits.b1.val);
        make4(x[i].qs + 16*j + 8, bits.b2.val);
    }

};

// So, I hate to include this table, but with the GCC 12.3 compiler
// bundled in the Cosmopolitan tools, loading the unpacked sign bytes
// from this table using the packed 8 sign bits as index is faster than
// using the standard trick of vceqq_u8(vandq_u8(bits, mask), mask) to
// expand the bits to bytes.
static const uint64_t kall_signs[256] = {
    0x0101010101010101, 0x01010101010101ff, 0x010101010101ff01, 0x010101010101ffff,
    0x0101010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0x0101010101ffffff,
    0x01010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0x01010101ff01ffff,
    0x01010101ffff0101, 0x01010101ffff01ff, 0x01010101ffffff01, 0x01010101ffffffff,
    0x010101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0x010101ff0101ffff,
    0x010101ff01ff0101, 0x010101ff01ff01ff, 0x010101ff01ffff01, 0x010101ff01ffffff,
    0x010101ffff010101, 0x010101ffff0101ff, 0x010101ffff01ff01, 0x010101ffff01ffff,
    0x010101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0x010101ffffffffff,
    0x0101ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0x0101ff010101ffff,
    0x0101ff0101ff0101, 0x0101ff0101ff01ff, 0x0101ff0101ffff01, 0x0101ff0101ffffff,
    0x0101ff01ff010101, 0x0101ff01ff0101ff, 0x0101ff01ff01ff01, 0x0101ff01ff01ffff,
    0x0101ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0x0101ff01ffffffff,
    0x0101ffff01010101, 0x0101ffff010101ff, 0x0101ffff0101ff01, 0x0101ffff0101ffff,
    0x0101ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0x0101ffff01ffffff,
    0x0101ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0x0101ffffff01ffff,
    0x0101ffffffff0101, 0x0101ffffffff01ff, 0x0101ffffffffff01, 0x0101ffffffffffff,
    0x01ff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0x01ff01010101ffff,
    0x01ff010101ff0101, 0x01ff010101ff01ff, 0x01ff010101ffff01, 0x01ff010101ffffff,
    0x01ff0101ff010101, 0x01ff0101ff0101ff, 0x01ff0101ff01ff01, 0x01ff0101ff01ffff,
    0x01ff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0x01ff0101ffffffff,
    0x01ff01ff01010101, 0x01ff01ff010101ff, 0x01ff01ff0101ff01, 0x01ff01ff0101ffff,
    0x01ff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0x01ff01ff01ffffff,
    0x01ff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0x01ff01ffff01ffff,
    0x01ff01ffffff0101, 0x01ff01ffffff01ff, 0x01ff01ffffffff01, 0x01ff01ffffffffff,
    0x01ffff0101010101, 0x01ffff01010101ff, 0x01ffff010101ff01, 0x01ffff010101ffff,
    0x01ffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0x01ffff0101ffffff,
    0x01ffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0x01ffff01ff01ffff,
    0x01ffff01ffff0101, 0x01ffff01ffff01ff, 0x01ffff01ffffff01, 0x01ffff01ffffffff,
    0x01ffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0x01ffffff0101ffff,
    0x01ffffff01ff0101, 0x01ffffff01ff01ff, 0x01ffffff01ffff01, 0x01ffffff01ffffff,
    0x01ffffffff010101, 0x01ffffffff0101ff, 0x01ffffffff01ff01, 0x01ffffffff01ffff,
    0x01ffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0x01ffffffffffffff,
    0xff01010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0xff0101010101ffff,
    0xff01010101ff0101, 0xff01010101ff01ff, 0xff01010101ffff01, 0xff01010101ffffff,
    0xff010101ff010101, 0xff010101ff0101ff, 0xff010101ff01ff01, 0xff010101ff01ffff,
    0xff010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0xff010101ffffffff,
    0xff0101ff01010101, 0xff0101ff010101ff, 0xff0101ff0101ff01, 0xff0101ff0101ffff,
    0xff0101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0xff0101ff01ffffff,
    0xff0101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0xff0101ffff01ffff,
    0xff0101ffffff0101, 0xff0101ffffff01ff, 0xff0101ffffffff01, 0xff0101ffffffffff,
    0xff01ff0101010101, 0xff01ff01010101ff, 0xff01ff010101ff01, 0xff01ff010101ffff,
    0xff01ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0xff01ff0101ffffff,
    0xff01ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0xff01ff01ff01ffff,
    0xff01ff01ffff0101, 0xff01ff01ffff01ff, 0xff01ff01ffffff01, 0xff01ff01ffffffff,
    0xff01ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0xff01ffff0101ffff,
    0xff01ffff01ff0101, 0xff01ffff01ff01ff, 0xff01ffff01ffff01, 0xff01ffff01ffffff,
    0xff01ffffff010101, 0xff01ffffff0101ff, 0xff01ffffff01ff01, 0xff01ffffff01ffff,
    0xff01ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0xff01ffffffffffff,
    0xffff010101010101, 0xffff0101010101ff, 0xffff01010101ff01, 0xffff01010101ffff,
    0xffff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0xffff010101ffffff,
    0xffff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0xffff0101ff01ffff,
    0xffff0101ffff0101, 0xffff0101ffff01ff, 0xffff0101ffffff01, 0xffff0101ffffffff,
    0xffff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0xffff01ff0101ffff,
    0xffff01ff01ff0101, 0xffff01ff01ff01ff, 0xffff01ff01ffff01, 0xffff01ff01ffffff,
    0xffff01ffff010101, 0xffff01ffff0101ff, 0xffff01ffff01ff01, 0xffff01ffff01ffff,
    0xffff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0xffff01ffffffffff,
    0xffffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0xffffff010101ffff,
    0xffffff0101ff0101, 0xffffff0101ff01ff, 0xffffff0101ffff01, 0xffffff0101ffffff,
    0xffffff01ff010101, 0xffffff01ff0101ff, 0xffffff01ff01ff01, 0xffffff01ff01ffff,
    0xffffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0xffffff01ffffffff,
    0xffffffff01010101, 0xffffffff010101ff, 0xffffffff0101ff01, 0xffffffff0101ffff,
    0xffffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0xffffffff01ffffff,
    0xffffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0xffffffffff01ffff,
    0xffffffffffff0101, 0xffffffffffff01ff, 0xffffffffffffff01, 0xffffffffffffffff,
};

struct SignHelper {

    IQK_ALWAYS_INLINE void apply_signs_1x(uint8x16_t * b, const uint8_t * sign_bits) const {
        auto s = vreinterpretq_s8_u64(uint64x2_t{kall_signs[sign_bits[0]], kall_signs[sign_bits[1]]});
        // Normally we would expect this to be faster, but it isn't.
        // auto aux = vcombine_u8(vdup_n_u8(sign_bits[0]), vdup_n_u8(sign_bits[1]));
        // auto s = vreinterpretq_s8_u8(vorrq_u8(vceqq_u8(vandq_u8(aux, smask), smask), m1));
        b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s));
    }

    // We would need these two if we weren't loading from the unpacked sign table.
    //const uint8x16_t smask = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201));
    //const uint8x16_t m1    = vdupq_n_u8(1);
};

struct DequantizerIQ2S final : public BaseDequantizer<block_iq2_s> {
    DequantizerIQ2S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x4_t new_block(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        prepare_internal(i, 0, bits);
        return prepare_4bit_scales16(x[i].scales);
    }

    inline void prepare(int i, int j) {
        if (j == 1) prepare_internal(i, 1, bits);
    }

private:

    static void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh, uint8x16_t * b) {
        uint32_t aux32[2];
        const uint16_t * aux16 = (const uint16_t *)aux32;
        for (int k = 0; k < 2; ++k) {
            aux32[1] = (qh[k] << 4) | (qh[k] << 18);
            aux32[0] = (aux32[1] << 4) & 0x03000300;
            aux32[1] &= 0x03000300;
            b[2*k+0] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+0] | aux16[0]))),
                                   vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+1] | aux16[1]))));
            b[2*k+1] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+2] | aux16[2]))),
                                   vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+3] | aux16[3]))));
            sh.apply_signs_1x(b+2*k+0, sign_bits); sign_bits += 2;
            sh.apply_signs_1x(b+2*k+1, sign_bits); sign_bits += 2;
        }
    }

    void prepare_internal(int i, int j, SimpleBits& sb) {

        const auto * qs = x[i].qs + 16*j;
        const auto * qh = x[i].qh + 4*j;
        const auto * sign_bits = qs + QK_K/8;

        make4(sh, sign_bits+0, qs+0, qh+0, sb.b1.val);
        make4(sh, sign_bits+8, qs+8, qh+2, sb.b2.val);
    }

    SignHelper sh;
};

struct DequantizerIQ3XXS final : public BaseDequantizer<block_iq3_xxs> {
    DequantizerIQ3XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    IQK_ALWAYS_INLINE float new_block(int i) const { return 0.25f * GGML_FP16_TO_FP32(x[i].d); }

    inline int32x4_t unpack(int i, int j, uint8x16_t * q) const {
        auto q3data = vld1q_u8_x2(x[i].qs + 32*j);
        auto gas = vld1q_u32((const uint32_t *)(x[i].qs + QK_K/4 + 16*j));
        prepare_block((const uint8_t *)q3data.val, (const uint32_t *)&gas, q);
        return prepare_scales_8(gas);
    }

private:

    inline static void make2(const uint8_t * q3, const uint32_t sidx, uint8x16_t * b) {
        b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[0]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[3]]});
        b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[4]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[7]]});
        apply_signs_2(b, keven_signs, sidx);
    }
    inline static void prepare_block(const uint8_t * q3, const uint32_t * signs, uint8x16_t * quants) {
        make2(q3+ 0, signs[0], quants + 0);
        make2(q3+ 8, signs[1], quants + 2);
        make2(q3+16, signs[2], quants + 4);
        make2(q3+24, signs[3], quants + 6);
    }
};

struct DequantizerIQ3S final : public BaseDequantizer<block_iq3_s> {
    DequantizerIQ3S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x2_t new_block(int i) {
        d = GGML_FP16_TO_FP32(x[i].d);
        uint32_t scales32[2];
        auto qs = vld1q_u8_x2(x[i].qs);
        auto signs = vld1q_u8(x[i].signs);

        prepare_block((const uint8_t *)qs.val, x[i].qh, (const uint8_t *)&signs);

        std::memcpy(scales32, x[i].scales, 4);
        scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
        scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
        auto scales8 = vld1_u8((const uint8_t *)scales32); // 0, 2, 4, 6, 1, 3, 5, 7
        scales8 = vtbl1_u8(scales8, vreinterpret_u8_u64(vdup_n_u64(0x0703060205010400)));
        auto scales16 = vreinterpretq_s16_u16(vmovl_u8(scales8));
        int32x4x2_t scales;
        scales.val[0] = vmovl_s16(vget_low_s16(scales16));
        scales.val[1] = vmovl_s16(vget_high_s16(scales16));
        return scales;
    }

    inline void prepare(int i, int j) {
        if (j == 1) {
            auto qs = vld1q_u8_x2(x[i].qs + 32);
            auto signs = vld1q_u8(x[i].signs + 16);
            prepare_block((const uint8_t *)qs.val, x[i].qh + 4, (const uint8_t *)&signs);
        }
    }

private:

    static inline void make2(const SignHelper& sh, const uint8_t * sign_bits, const uint16x8_t& idx_l, uint8_t qh,
            const int16x8_t& hshift, uint8x16_t * b) {
        auto vindex = vorrq_u16(idx_l, vandq_u16(vshlq_u16(vdupq_n_u16(qh), hshift), vdupq_n_u16(256)));
        const uint16_t * idx = (const uint16_t *)&vindex;
        b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[0]], iq3s_grid[idx[1]], iq3s_grid[idx[2]], iq3s_grid[idx[3]]});
        sh.apply_signs_1x(b+0, sign_bits+0);
        b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[4]], iq3s_grid[idx[5]], iq3s_grid[idx[6]], iq3s_grid[idx[7]]});
        sh.apply_signs_1x(b+1, sign_bits+2);
    }
    static inline void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh,
            const int16x8_t& hshift, uint8x16_t * b) {
        auto idx_l = vld1q_u8(qs);
        make2(sh, sign_bits+0, vmovl_u8(vget_low_u8 (idx_l)), qh[0], hshift, b+0);
        make2(sh, sign_bits+4, vmovl_u8(vget_high_u8(idx_l)), qh[1], hshift, b+2);
    }

    static int16x8_t load_shift() {
        static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
        return vld1q_s16(k_shift);
    }

    inline void prepare_block(const uint8_t * qs, const uint8_t * qh, const uint8_t * sign_bits) {
        auto signs = vld1q_u8(sign_bits);
        auto s = (const uint8_t *)&signs;
        make4(sh, s + 0, qs+ 0, qh+0, hshift, bits.b1.val);
        make4(sh, s + 8, qs+16, qh+2, hshift, bits.b2.val);
    }

    SignHelper sh;
    const int16x8_t hshift = load_shift();

};

template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_IQXXS(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);
    Dequantizer deq(vx, bx, nrc_y);
    uint8x16_t  qx[8];
    int32x4_t   sumi[nrc_y];
    float32x4_t acc[nrc_y];

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        for (int i = 0; i < nb; ++i) {
            float d = deq.new_block(i);
            auto scales = deq.unpack(i, 0, qx);
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                sumi[iy] = vdupq_n_s32(0);
                compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 0, sumi[iy]);
            }
            scales = deq.unpack(i, 1, qx);
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 1, sumi[iy]);
                acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*q8.scale(iy, i)), vcvtq_f32_s32(sumi[iy]));
            }
        }
#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

// =========================================== Legacy quants

template <typename Block>
inline float16x4_t load_scales_q0(const Block * x, ggml_half * aux) {
    for (int k = 0; k < 4; ++k) aux[k] = x[k].d;
    return vld1_f16((const float16_t *)aux);
}

template <typename Block>
inline float16x8_t load_scales_q1(const Block * x, ggml_half * aux) {
    if constexpr (std::is_same_v<Block, block_q8_1>) {
        for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].s; }
    } else {
        for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].m; }
    }
    return vld1q_f16((const float16_t *)aux);
}

struct Q4LegacyBits {
    template <typename Block>
    inline void prepare(const Block * x) {
        for (int i = 0; i < 4; ++i) {
            auto q4bits = vld1q_u8(x[i].qs);
            b[2*i+0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b));
            b[2*i+1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4));
        }
    }
    inline void prepare1(const uint8_t * qs, int8x16_t * q) const {
        auto q4bits = vld1q_u8(qs);
        q[0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b));
        q[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4));
    }
    inline void prepare1(const uint8_t * qs) {
        prepare1(qs, b);
    }
    const uint8x16_t m4b = vdupq_n_u8(0xf);
    int8x16_t b[8];
};

// One would think this commented out version would do better than the one below
// because it offers more opportunities to execute instructions in parallel.
// Instead, it runs significantly slower. Why? If the compiler is running out of vector registers
// cannot it just do the sequential version below on its own?
//inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) {
//    const auto q8b_1 = vld1q_s8_x2(qs + 0);
//    auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b_1.val[0]), b[1], q8b_1.val[1]);
//    const auto q8b_2 = vld1q_s8_x2(qs + 32);
//    auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b_2.val[0]), b[3], q8b_2.val[1]);
//    auto p1234 = vpaddq_s32(p12, p34);
//    const auto q8b_3 = vld1q_s8_x2(qs + 64);
//    auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b_3.val[0]), b[5], q8b_3.val[1]);
//    const auto q8b_4 = vld1q_s8_x2(qs + 96);
//    auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b_4.val[0]), b[7], q8b_4.val[1]);
//    return vpaddq_s32(p1234, vpaddq_s32(p56, p78));
//}

inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) {
    auto q8b = vld1q_s8_x2(qs + 0);
    auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b.val[0]), b[1], q8b.val[1]);
    q8b = vld1q_s8_x2(qs + 32);
    auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b.val[0]), b[3], q8b.val[1]);
    auto p1234 = vpaddq_s32(p12, p34);
    q8b = vld1q_s8_x2(qs + 64);
    auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b.val[0]), b[5], q8b.val[1]);
    q8b = vld1q_s8_x2(qs + 96);
    auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b.val[0]), b[7], q8b.val[1]);
    return vpaddq_s32(p1234, vpaddq_s32(p56, p78));
}

template <int nrc> struct Q80 {

    constexpr static int nrc_y = nrc;

    Q80(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_0 *)info.src1_row(iy);
    }

    inline const int8_t * quant_data(int iy, int i) const {
        const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
        return y4->qs;
    }

    inline float16x4_t load_scales(int iy, int i) const {
        const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
        return vld1_f16((const float16_t *)y4->d);
    }

    template <typename Dequantizer>
    inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * /*acc*/) const {
        auto qx_scales = deq.new_block(i);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8_scales = load_scales(iy, i);
            sc16[iy] = vmul_f16(qx_scales, q8_scales);
        }
    }

    template <typename Dequantizer>
    inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const {
        deq.prepare1(i);
        float d = GGML_FP16_TO_FP32(deq.x[i].d);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8b = vld1q_s8_x2(y[iy][i].qs);
            auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]);
            acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p));
        }
    }

    const block_q8_0 * y[nrc_y];
};

template <int nrc> struct Q81 {

    constexpr static int nrc_y = nrc;

    Q81(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_1 *)info.src1_row(iy);
    }

    inline const int8_t * quant_data(int iy, int i) const {
        const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
        return y4->qs;
    }

    inline float16x8_t load_scales(int iy, int i) const {
        const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
        return vld1q_f16((const float16_t *)y4->d);
    }

    template <typename Dequantizer>
    inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * acc) const {
        auto qx_scales = deq.new_block(i);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8_scales = load_scales(iy, i);
            auto m = vmul_f16(vget_high_f16(qx_scales), vget_high_f16(q8_scales));
            acc[iy] = vaddq_f32(acc[iy], vcvt_f32_f16(m));
            sc16[iy] = vmul_f16(vget_low_f16(qx_scales), vget_low_f16(q8_scales));
        }
    }

    template <typename Dequantizer>
    inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const {
        deq.prepare1(i);
        float d = GGML_FP16_TO_FP32(deq.x[i].d), m = 0.25f*GGML_FP16_TO_FP32(deq.x[i].m);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8b = vld1q_s8_x2(y[iy][i].qs);
            auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]);
            acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p));
            acc[iy] = vaddq_f32(acc[iy], vdupq_n_f32(m*GGML_FP16_TO_FP32(y[iy][i].s)));
        }
    }

    const block_q8_1 * y[nrc_y];
};

template <typename block_q>
struct BaseLegacyDequantizer {

    BaseLegacyDequantizer(const void * vx, size_t bx) : vx(vx), x(nullptr), bx(bx) {}

    inline void new_row(int ix) { x = (const block_q *)((const char *)vx + bx*ix); }

    Q4LegacyBits bits;

    const void * vx;
    const block_q * x;
    size_t bx;
};

struct DequantizerQ40 final : public BaseLegacyDequantizer<block_q4_0> {

    DequantizerQ40(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        q[0] = vaddq_s8(q[0], m8);
        q[1] = vaddq_s8(q[1], m8);
    }
    inline void prepare1(int i) {
        prepare1(i, bits.b);
    }

    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vld1_f16((const float16_t *)aux);
    }

    const int8x16_t m8 = vdupq_n_s8(-8);
    //ggml_half aux[4];
};

struct DequantizerQ41 : public BaseLegacyDequantizer<block_q4_1> {

    DequantizerQ41(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i) {
        bits.prepare1(x[i].qs);
    }

    inline float16x8_t new_block(int i) {
        uint32_t aux32[4];
        const uint32_t * s32 = (const uint32_t *)&x[4*i].d;
        for (int k = 0; k < 4; ++k) {
            aux32[k] = *s32; s32 += sizeof(block_q4_1)/4;
            bits.prepare1(x[4*i+k].qs, bits.b + 2*k);
        }
        return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle)));
    }
    // Leaving this commented out attempt to be reminded that I already tried this.
    // It has basically the same performance as the version above.
    //inline float16x8_t new_block(int i) {
    //    uint32x4_t scales = {};
    //    const block_q4_1 * xi = x + 4*i;
    //    const uint32_t * s32 = (const uint32_t *)&xi->d;
    //    scales = vsetq_lane_u32(*s32, scales, 0); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[0].qs, bits.b + 0);
    //    scales = vsetq_lane_u32(*s32, scales, 1); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[1].qs, bits.b + 2);
    //    scales = vsetq_lane_u32(*s32, scales, 2); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[2].qs, bits.b + 4);
    //    scales = vsetq_lane_u32(*s32, scales, 3);
    //    bits.prepare1(xi[3].qs, bits.b + 6);
    //    return vreinterpretq_f16_u8(vqtbl1q_u8(vreinterpretq_u8_u32(scales), vreinterpretq_u8_u64(shuffle)));
    //}

    const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302};
};

struct HighBit5Legacy {
    inline uint8x16_t to_bytes(const uint8_t * qh) const {
        uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle);
        return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vreinterpretq_u8_u64(mask));
    }
    inline uint8x16_t to_negated_bytes(const uint8_t * qh) const {
        uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle);
        return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vdupq_n_u8(0));
    }
    const uint64x2_t mask = vdupq_n_u64(0x8040201008040201);
    const uint8x16_t shuffle = vcombine_u8(vdup_n_u8(0), vdup_n_u8(1));
};

struct DequantizerQ50 final : public BaseLegacyDequantizer<block_q5_0> {

    DequantizerQ50(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        auto qh = x[i].qh;
        q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_negated_bytes(qh+0))));
        q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_negated_bytes(qh+2))));
    }
    inline void prepare1(int i) {
        prepare1(i, bits.b);
    }

    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vld1_f16((const float16_t *)aux);
    }

    HighBit5Legacy hbits;

    const uint8x16_t mh = vdupq_n_u8(0xf0);

};

struct DequantizerQ80 final : public BaseLegacyDequantizer<block_q8_0> {

    DequantizerQ80(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i) {
        bits.b[0] = vld1q_s8(x[i].qs);
        bits.b[1] = vld1q_s8(x[i].qs+16);
    }

    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            bits.b[2*k+0] = vld1q_s8(x[4*i+k].qs);
            bits.b[2*k+1] = vld1q_s8(x[4*i+k].qs+16);
        }
        return vld1_f16((const float16_t *)aux);
    }

};

struct DequantizerQ51 final : public BaseLegacyDequantizer<block_q5_1> {

    DequantizerQ51(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        auto qh = x[i].qh;
        q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_bytes(qh+0))));
        q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_bytes(qh+2))));
    }
    inline void prepare1(int i) {
        bits.prepare1(x[i].qs, bits.b);
    }

    inline float16x8_t new_block(int i) {
        uint32_t aux32[4];
        const uint32_t * s32 = (const uint32_t *)&x[4*i].d;
        for (int k = 0; k < 4; ++k) {
            aux32[k] = *s32; s32 += sizeof(block_q5_1)/4;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle)));
    }

    HighBit5Legacy hbits;

    const uint8x16_t mh = vdupq_n_u8(0x10);
    const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302};

};

template <typename Dequantizer, typename Q8>
inline void sum_4(int i, Dequantizer& deq, const Q8& q8, const float16x4_t * sc16, float32x4_t * acc) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto pall = sum_4_blocks(deq.bits.b, q8.quant_data(iy, i));
        auto scale = vcvt_f32_f16(sc16[iy]);
        acc[iy] = vmlaq_f32(acc[iy], scale, vcvtq_f32_s32(pall));
    }
}

template <typename Dequantizer, typename Q8>
inline void mul_mat_qX_Y_q8_Y(int n, Dequantizer& deq, Q8& q8, const DataInfo& info, int nrc_x) {
    const int nb = n / QK4_1;

    float16x4_t sc16[Q8::nrc_y];

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[Q8::nrc_y];
        for (int iy = 0; iy < Q8::nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        for (int i = 0; i < nb/4; ++i) {
            q8.process_scales(i, deq, sc16, acc);
            sum_4(i, deq, q8, sc16, acc);
        }
        for (int i = 4*(nb/4); i < nb; ++i) {
            q8.process_1_block(i, deq, acc);
        }

        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

template <typename Dequantizer, typename Q8>
inline void mul_mat_qX_Y_q8_Y_1(int n, Dequantizer& deq1, Dequantizer& deq2, Q8& q8, const DataInfo& info, int nrc_x) {
    const int nb = n / QK4_1;

    float16x4_t sc16[2];

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq1.new_row(ix);
        deq2.new_row(ix);

        float32x4_t acc[2] = { vdupq_n_f32(0.f), vdupq_n_f32(0.f) };

        for (int i = 0; i < nb/8; ++i) {
            q8.process_scales(2*i+0, deq1, sc16+0, acc+0);
            q8.process_scales(2*i+1, deq2, sc16+1, acc+1);
            sum_4(2*i+0, deq1, q8, sc16+0, acc+0);
            sum_4(2*i+1, deq2, q8, sc16+1, acc+1);
        }
        for (int i = 2*(nb/8); i < nb/4; ++i) {
            q8.process_scales(i, deq1, sc16, acc);
            sum_4(i, deq1, q8, sc16, acc);
        }
        for (int i = 4*(nb/4); i < nb; ++i) {
            q8.process_1_block(i, deq1, acc);
        }

        info.store(ix, 0, vaddvq_f32(vaddq_f32(acc[0], acc[1])));
    }
}

template <typename Dequantizer, int nrc_y>
static void IQK_NOINLINE mul_mat_qX_1_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Q81<nrc_y> q8(info);
    if constexpr (nrc_y == 1) {
        Dequantizer deq1(vx, bx), deq2(vx, bx);
        mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
    } else {
        Dequantizer deq(vx, bx);
        mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x);
    }
}

template <typename Dequantizer, int nrc_y>
static void IQK_NOINLINE mul_mat_qX_0_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Q80<nrc_y> q8(info);
    if constexpr (nrc_y == 1) {
        Dequantizer deq1(vx, bx), deq2(vx, bx);
        mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
    } else {
        Dequantizer deq(vx, bx);
        mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x);
    }
}

template <typename Dequantizer>
static void IQK_NOINLINE mul_mat_qX_1_q8_1_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Dequantizer deq1(vx, bx), deq2(vx, bx);
    Q81<1> q8(info);
    mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
}

template <typename Dequantizer>
static void IQK_NOINLINE mul_mat_qX_0_q8_0_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Dequantizer deq1(vx, bx), deq2(vx, bx);
    Q80<1> q8(info);
    mul_mat_qX_Y_q8_Y(n, deq1, deq2, q8, info, nrc_x);
}

template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
    if constexpr (std::is_same_v<Dequantizer, DequantizerQ40> || std::is_same_v<Dequantizer, DequantizerQ50> ||
                  std::is_same_v<Dequantizer, DequantizerQ80>) {
        m.funcs[0] = mul_mat_qX_0_q8_0<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_0_q8_0<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_0_q8_0<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_0_q8_0<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_0_q8_0<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_0_q8_0<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_0_q8_0<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_0_q8_0<Dequantizer, 8>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerQ41> || std::is_same_v<Dequantizer, DequantizerQ51>) {
        m.funcs[0] = mul_mat_qX_1_q8_1<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_1_q8_1<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_1_q8_1<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_1_q8_1<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_1_q8_1<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_1_q8_1<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_1_q8_1<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_1_q8_1<Dequantizer, 8>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2XXS> || std::is_same_v<Dequantizer, DequantizerIQ3XXS>) {
        m.funcs[0] = mul_mat_qX_K_q8_K_IQXXS<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_IQXXS<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_IQXXS<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_IQXXS<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_IQXXS<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_IQXXS<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_IQXXS<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_IQXXS<8, Dequantizer>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2S> ||
                       std::is_same_v<Dequantizer, DequantizerIQ3S> ||
                       std::is_same_v<Dequantizer, DequantizerIQ2XS>) {
        m.funcs[0] = mul_mat_qX_K_q8_K_IQ<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_IQ<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_IQ<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_IQ<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_IQ<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_IQ<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_IQ<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_IQ<8, Dequantizer>;
    }
    else {
        m.funcs[0] = mul_mat_qX_K_q8_K_T<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_T<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_T<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_T<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_T<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_T<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_T<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_T<8, Dequantizer>;
    }
}

bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& m, int& row_size_q8, int Ny) {
    row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);

    (void)Ny;
    // Uncommenting out this would disable iqk_mul_mat for matrix x vector multiplications.
    //if (Ny == 1 && (typeA == GGML_TYPE_IQ2_XXS || typeA == GGML_TYPE_IQ2_XS || typeA == GGML_TYPE_IQ2_S ||
    //                typeA == GGML_TYPE_IQ3_XXS || typeA == GGML_TYPE_IQ3_S)) return false;

    switch (typeA) {
        case GGML_TYPE_Q2_K:
            MulMat::set_functions<DequantizerQ2K>(m);
            break;
        case GGML_TYPE_Q3_K:
            MulMat::set_functions<DequantizerQ3K>(m);
            break;
        case GGML_TYPE_Q4_K:
            MulMat::set_functions<DequantizerQ4K>(m);
            break;
        case GGML_TYPE_Q5_K:
            MulMat::set_functions<DequantizerQ5K>(m);
            break;
        case GGML_TYPE_Q6_K:
            MulMat::set_functions<DequantizerQ6K>(m);
            break;
        case GGML_TYPE_IQ4_XS:
            MulMat::set_functions<DequantizerIQ4XS>(m);
            break;
        case GGML_TYPE_IQ3_S:
            MulMat::set_functions<DequantizerIQ3S>(m);
            break;
        case GGML_TYPE_IQ3_XXS:
            MulMat::set_functions<DequantizerIQ3XXS>(m);
            break;
        case GGML_TYPE_IQ2_S:
            MulMat::set_functions<DequantizerIQ2S>(m);
            break;
        case GGML_TYPE_IQ2_XS:
            MulMat::set_functions<DequantizerIQ2XS>(m);
            break;
        case GGML_TYPE_IQ2_XXS:
            MulMat::set_functions<DequantizerIQ2XXS>(m);
            break;
        case GGML_TYPE_Q4_0:
            MulMat::set_functions<DequantizerQ40>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q4_1:
            MulMat::set_functions<DequantizerQ41>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;
        case GGML_TYPE_Q5_0:
            MulMat::set_functions<DequantizerQ50>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q5_1:
            MulMat::set_functions<DequantizerQ51>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;
        case GGML_TYPE_Q8_0:
            MulMat::set_functions<DequantizerQ80>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        default:
            return false;
    }
    return true;
}

}

#endif // __x86_64__ or __aarch64__


================================================
FILE: archive/third_party/llamafile/macros.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/macros.h
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#pragma once

#define MIN(X, Y) ((Y) > (X) ? (X) : (Y))
#define MAX(X, Y) ((Y) < (X) ? (X) : (Y))
#define CEIL_DIV(M, N) (((M) + (N) - 1) / (N))
#define ROUNDUP(X, K) (((X) + (K) - 1) & -(K))
#define ARRAYLEN(A) ((sizeof(A) / sizeof(*(A))) / ((unsigned)!(sizeof(A) % sizeof(*(A)))))


================================================
FILE: archive/third_party/llamafile/micros.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/micros.h
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#pragma once

#include <ctime>

#ifndef _WIN32
#include <unistd.h>
#else
#include <windows.h>
#endif

#ifdef _WIN32
static long long GetQueryPerformanceFrequency() {
    LARGE_INTEGER t;
    QueryPerformanceFrequency(&t);
    return t.QuadPart;
}
static long long GetQueryPerformanceCounter() {
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return t.QuadPart;
}
#endif

static long long micros(void) {
#ifndef _WIN32
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return ts.tv_sec * 1000000 + (ts.tv_nsec + 999) / 1000;
#else
    static long long timer_freq = GetQueryPerformanceFrequency();
    static long long timer_start = GetQueryPerformanceCounter();
    return ((GetQueryPerformanceCounter() - timer_start) * 1000000) / timer_freq;
#endif
}


================================================
FILE: archive/third_party/llamafile/numba.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/numba.h
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#pragma once

inline int rand32(void) {
    static unsigned long long lcg = 1;
    lcg *= 6364136223846793005;
    lcg += 1442695040888963407;
    return lcg >> 32;
}

inline int popcount(unsigned x) {
    x = x - ((x >> 1) & 0x55555555);
    x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
    x = (x + (x >> 4)) & 0x0F0F0F0F;
    x = (x + (x >> 16));
    return (x + (x >> 8)) & 0x0000003F;
}

inline int hamming(int x, int y) {
    return popcount(x ^ y);
}

inline float float01(unsigned x) {  // (0,1)
    return 1.f / 8388608 * ((x >> 9) + .5f);
}

inline float numba(void) {  // (-10,10)
    return float01(rand32()) * 2.f - 1.f;
}

template <typename T>
void randomize(T* A, int n) {
    for (int i = 0; i < n; ++i)
        A[i] = numba();
}

template <typename T>
void randomize(int m, int n, T* A, int lda) {
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
            A[lda * j + i] = numba();
}

template <typename T, typename U>
void broadcast(T* A, int n, U x) {
    for (int i = 0; i < n; ++i)
        A[i] = x;
}

template <typename T, typename U>
void broadcast(int m, int n, T* A, int lda, U x) {
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
            A[lda * j + i] = x;
}


================================================
FILE: archive/third_party/llamafile/sgemm.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#if defined(KTRANSFORMERS_USE_NPU) && KTRANSFORMERS_USE_NPU
        // use ARM version
        #include "sgemm_arm.cpp"
#else
        // use x86 version
        #include "sgemm_x86.cpp"
#endif

================================================
FILE: archive/third_party/llamafile/sgemm.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.h
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#pragma once
#include <stdbool.h>
#include <cstddef>
#ifdef __cplusplus
extern "C" {
#endif

struct ggml_tensor;
struct ggml_compute_params;
/*moonll old
add more params typeb...
*/


bool iqk_mul_mat(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);
bool iqk_mul_mat_zen4(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);
bool iqk_mul_mat_arm82(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);


bool iqk_mul_mat_moe(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
bool iqk_mul_mat_moe_zen4(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
bool iqk_mul_mat_moe_arm82(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);

bool llamafile_sgemm(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_mixmul(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
size_t llamafile_mixmul_needs(const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*);

bool llamafile_sgemm_unsupported(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_avx(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_fma(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_avx2(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_avxvnni(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_avx512f(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_zen4(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_arm80(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_arm82(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);

bool llamafile_mixmul_unsupported(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_avx(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_fma(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_avx2(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_avxvnni(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_avx512f(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_zen4(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_arm80(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_arm82(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_iqk(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);

#ifdef __cplusplus
}
#endif


================================================
FILE: archive/third_party/llamafile/sgemm_arm.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "sgemm.h"
// #include <cosmo.h>
// #include <cpuid.h>
// #include <libc/sysv/consts/hwcap.h>
#include <stdio.h>
// #include <sys/auxv.h>
#include <cassert>
// #include "llamafile.h"

static const struct GemmFuncs {
    bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
    bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
    bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
    // typeof(llamafile_sgemm)* sgemm;
    // typeof(llamafile_mixmul)* mixmul;
    // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
    GemmFuncs() {
#if defined(__x86_64__) || defined(_M_X64)
        // if (X86_HAVE(AVX)) {
        //     if (X86_HAVE(FMA)) {
        //         if (X86_HAVE(AVX2)) {
        //             if (X86_HAVE(AVX512F)) {
        //                 if (X86_HAVE(AVX512VL) &&     //
        //                     X86_HAVE(AVX512BW) &&     //
        //                     X86_HAVE(AVX512DQ) &&     //
        //                     X86_HAVE(AVX512_VNNI) &&  //
        //                     X86_HAVE(AVX512_BF16)) {
        //                     // AMD Zen4+ (2023-)
        //                     sgemm = llamafile_sgemm_amd_zen4;
        //                     mixmul = llamafile_mixmul_amd_zen4;
        //                     iqk_mixmul = iqk_mul_mat_moe_zen4;
        //                 } else {
        //                     // Intel Xeon Skylake+ (2015-)
        //                     sgemm = llamafile_sgemm_amd_avx512f;
        //                     mixmul = llamafile_mixmul_amd_avx512f;
        //                     iqk_mixmul = iqk_mul_mat_moe;
        //                 }
        //             } else if (X86_HAVE(AVXVNNI)) {
        //                 // Intel Alderlake (2021-)
        //                 sgemm = llamafile_sgemm_amd_avxvnni;
        //                 mixmul = llamafile_mixmul_amd_avxvnni;
        //                 iqk_mixmul = iqk_mul_mat_moe;
        //             } else {
        //                 // Intel Haswell/Broadwell/Skylake (2013-2020)
        //                 // AMD Excavator (2015-2022)
        //                 sgemm = llamafile_sgemm_amd_avx2;
        //                 mixmul = llamafile_mixmul_amd_avx2;
        //                 if (X86_HAVE(F16C))
        //                     iqk_mixmul = iqk_mul_mat_moe;
        //             }
        //         } else {
        //             // AMD Piledriver (2011-2014)
        //             sgemm = llamafile_sgemm_amd_fma;
        //             mixmul = llamafile_mixmul_amd_fma;
        //             if (X86_HAVE(F16C))
        //                 iqk_mixmul = iqk_mul_mat_moe;
        //         }
        //     } else {
        //         // Intel Sandybridge/Ivybridge (2010-2012)
        //         // AMD Bulldozer (2011)
        //         sgemm = llamafile_sgemm_amd_avx;
        //         mixmul = llamafile_mixmul_amd_avx;
        //     }
        // } else {
        //     // AMD K8/Barcelona (2003-2010)
        //     // Intel Core/Nehalem (2006-2009)
        //     sgemm = llamafile_sgemm_unsupported;
        //     mixmul = llamafile_mixmul_unsupported;
        // }

#if defined(__AVX__)
#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX2__)
#if defined(__AVX512F__)
#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
        // AMD Zen4+ (2023-)
        sgemm = llamafile_sgemm_amd_zen4;
        mixmul = llamafile_mixmul_amd_zen4;
        iqk_mixmul = iqk_mul_mat_moe_zen4;
#else
        // Intel Xeon Skylake+ (2015-)
        sgemm = llamafile_sgemm_amd_avx512f;
        mixmul = llamafile_mixmul_amd_avx512f;
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#elif defined(__AVXVNNI__)
        // Intel Alderlake (2021-)
        sgemm = llamafile_sgemm_amd_avxvnni;
        mixmul = llamafile_mixmul_amd_avxvnni;
        iqk_mixmul = iqk_mul_mat_moe;
#else
        // Intel Haswell/Broadwell/Skylake (2013-2020)
        // AMD Excavator (2015-2022)
        sgemm = llamafile_sgemm_amd_avx2;
        mixmul = llamafile_mixmul_amd_avx2;
#if defined(__F16C__)
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
        // AMD Piledriver (2011-2014)
        sgemm = llamafile_sgemm_amd_fma;
        mixmul = llamafile_mixmul_amd_fma;
#if defined(__F16C__)
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
        // Intel Sandybridge/Ivybridge (2010-2012)
        // AMD Bulldozer (2011)
        sgemm = llamafile_sgemm_amd_avx;
        mixmul = llamafile_mixmul_amd_avx;
#endif
#else
        // AMD K8/Barcelona (2003-2010)
        // Intel Core/Nehalem (2006-2009)
        sgemm = llamafile_sgemm_unsupported;
        mixmul = llamafile_mixmul_unsupported;
#endif

#elif defined(__aarch64__)
//        long hwcap = getauxval(AT_HWCAP);
//        if ((hwcap & HWCAP_FPHP) &&     // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1)
//            (hwcap & HWCAP_ASIMDHP) &&  // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1)
//            (hwcap & HWCAP_ASIMDDP)) {  // dotprod isa (ID_AA64ISAR0_EL1.DP == 1)
//            // e.g. Apple M1, Raspberry Pi 5
//            sgemm = llamafile_sgemm_arm82;
//            mixmul = llamafile_mixmul_arm82;
//            iqk_mixmul = iqk_mul_mat_moe_arm82;
//        } else {
            // ARM64 baseline ISA
            sgemm = llamafile_sgemm_arm80;
            mixmul = llamafile_mixmul_arm80;
//        }
#else
        sgemm = llamafile_sgemm_unsupported;
        mixmul = llamafile_mixmul_unsupported;
#endif
    }
} funcs;

/**
 * Performs optimized matrix multiplication on CPU.
 *
 * This subroutine may compute C = Aᵀ * B with column major ordering.
 * Despite its name, this isn't a generalized implementation. Work is
 * only performed when a handwritten kernel is written and available.
 * Otherwise the caller should fall back to a general matmul routine.
 *
 * @param m is rows in `A` and `C`
 * @param n is cols in `B` and `C`
 * @param k is cols in `A` and rows in `B`
 * @param A is first input matrix (always transposed)
 * @param lda is row stride of `A`
 * @param B is second input matrix (never transposed)
 * @param ldb is row stride of `B`
 * @param C is input/output array of output matrices
 * @param ldc is row stride of `C`
 * @param ith is thread id (must be less than `nth`)
 * @param nth is number of threads (must be greater than zero)
 * @param task is GGML task type
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @param precision may be used to control the internal compute type
 * @return true if this function was able to service the matmul request
 */
bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, task, Atype, Btype, Ctype,
                       precision);
}

/**
 * Performs "mixture of experts" tensor multiplication on CPU.
 */
bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) {
    return funcs.mixmul(params, weights, thought, plan, result);
}

bool llamafile_mixmul_iqk(long Nx, long Ny, long ne00, int ne11, int typeA, const void* A, const void* B, float* C, long nb1, long nb2, const void* vrow_mapping, int ith, int nth) {
    return funcs.iqk_mixmul(Nx, Ny, ne00, ne11, typeA, A, B, C, nb1, nb2, vrow_mapping, ith, nth);
}


================================================
FILE: archive/third_party/llamafile/sgemm_x86.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "sgemm.h"
// #include <cosmo.h>
// #include <cpuid.h>
// #include <libc/sysv/consts/hwcap.h>
#include <stdio.h>
// #include <sys/auxv.h>
#include <cassert>
// #include "llamafile.h"

static const struct GemmFuncs {
    bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
    bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
    bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
    // typeof(llamafile_sgemm)* sgemm;
    // typeof(llamafile_mixmul)* mixmul;
    // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
    GemmFuncs() {
#if defined(__x86_64__) || defined(_M_X64)
        // if (X86_HAVE(AVX)) {
        //     if (X86_HAVE(FMA)) {
        //         if (X86_HAVE(AVX2)) {
        //             if (X86_HAVE(AVX512F)) {
        //                 if (X86_HAVE(AVX512VL) &&     //
        //                     X86_HAVE(AVX512BW) &&     //
        //                     X86_HAVE(AVX512DQ) &&     //
        //                     X86_HAVE(AVX512_VNNI) &&  //
        //                     X86_HAVE(AVX512_BF16)) {
        //                     // AMD Zen4+ (2023-)
        //                     sgemm = llamafile_sgemm_amd_zen4;
        //                     mixmul = llamafile_mixmul_amd_zen4;
        //                     iqk_mixmul = iqk_mul_mat_moe_zen4;
        //                 } else {
        //                     // Intel Xeon Skylake+ (2015-)
        //                     sgemm = llamafile_sgemm_amd_avx512f;
        //                     mixmul = llamafile_mixmul_amd_avx512f;
        //                     iqk_mixmul = iqk_mul_mat_moe;
        //                 }
        //             } else if (X86_HAVE(AVXVNNI)) {
        //                 // Intel Alderlake (2021-)
        //                 sgemm = llamafile_sgemm_amd_avxvnni;
        //                 mixmul = llamafile_mixmul_amd_avxvnni;
        //                 iqk_mixmul = iqk_mul_mat_moe;
        //             } else {
        //                 // Intel Haswell/Broadwell/Skylake (2013-2020)
        //                 // AMD Excavator (2015-2022)
        //                 sgemm = llamafile_sgemm_amd_avx2;
        //                 mixmul = llamafile_mixmul_amd_avx2;
        //                 if (X86_HAVE(F16C))
        //                     iqk_mixmul = iqk_mul_mat_moe;
        //             }
        //         } else {
        //             // AMD Piledriver (2011-2014)
        //             sgemm = llamafile_sgemm_amd_fma;
        //             mixmul = llamafile_mixmul_amd_fma;
        //             if (X86_HAVE(F16C))
        //                 iqk_mixmul = iqk_mul_mat_moe;
        //         }
        //     } else {
        //         // Intel Sandybridge/Ivybridge (2010-2012)
        //         // AMD Bulldozer (2011)
        //         sgemm = llamafile_sgemm_amd_avx;
        //         mixmul = llamafile_mixmul_amd_avx;
        //     }
        // } else {
        //     // AMD K8/Barcelona (2003-2010)
        //     // Intel Core/Nehalem (2006-2009)
        //     sgemm = llamafile_sgemm_unsupported;
        //     mixmul = llamafile_mixmul_unsupported;
        // }

#if defined(__AVX__)
#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX2__)
#if defined(__AVX512F__)
#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
        // AMD Zen4+ (2023-)
        sgemm = llamafile_sgemm_amd_zen4;
        mixmul = llamafile_mixmul_amd_zen4;
        iqk_mixmul = iqk_mul_mat_moe_zen4;
#else
        // Intel Xeon Skylake+ (2015-)
        sgemm = llamafile_sgemm_amd_avx512f;
        mixmul = llamafile_mixmul_amd_avx512f;
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#elif defined(__AVXVNNI__)
        // Intel Alderlake (2021-)
        sgemm = llamafile_sgemm_amd_avxvnni;
        mixmul = llamafile_mixmul_amd_avxvnni;
        iqk_mixmul = iqk_mul_mat_moe;
#else
        // Intel Haswell/Broadwell/Skylake (2013-2020)
        // AMD Excavator (2015-2022)
        sgemm = llamafile_sgemm_amd_avx2;
        mixmul = llamafile_mixmul_amd_avx2;
#if defined(__F16C__)
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
        // AMD Piledriver (2011-2014)
        sgemm = llamafile_sgemm_amd_fma;
        mixmul = llamafile_mixmul_amd_fma;
#if defined(__F16C__)
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
        // Intel Sandybridge/Ivybridge (2010-2012)
        // AMD Bulldozer (2011)
        sgemm = llamafile_sgemm_amd_avx;
        mixmul = llamafile_mixmul_amd_avx;
#endif
#else
        // AMD K8/Barcelona (2003-2010)
        // Intel Core/Nehalem (2006-2009)
        sgemm = llamafile_sgemm_unsupported;
        mixmul = llamafile_mixmul_unsupported;
#endif

#elif defined(__aarch64__)
        long hwcap = getauxval(AT_HWCAP);
        if ((hwcap & HWCAP_FPHP) &&     // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1)
            (hwcap & HWCAP_ASIMDHP) &&  // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1)
            (hwcap & HWCAP_ASIMDDP)) {  // dotprod isa (ID_AA64ISAR0_EL1.DP == 1)
            // e.g. Apple M1, Raspberry Pi 5
            sgemm = llamafile_sgemm_arm82;
            mixmul = llamafile_mixmul_arm82;
            iqk_mixmul = iqk_mul_mat_moe_arm82;
        } else {
            // ARM64 baseline ISA
            sgemm = llamafile_sgemm_arm80;
            mixmul = llamafile_mixmul_arm80;
        }
#else
        sgemm = llamafile_sgemm_unsupported;
        mixmul = llamafile_mixmul_unsupported;
#endif
    }
} funcs;

/**
 * Performs optimized matrix multiplication on CPU.
 *
 * This subroutine may compute C = Aᵀ * B with column major ordering.
 * Despite its name, this isn't a generalized implementation. Work is
 * only performed when a handwritten kernel is written and available.
 * Otherwise the caller should fall back to a general matmul routine.
 *
 * @param m is rows in `A` and `C`
 * @param n is cols in `B` and `C`
 * @param k is cols in `A` and rows in `B`
 * @param A is first input matrix (always transposed)
 * @param lda is row stride of `A`
 * @param B is second input matrix (never transposed)
 * @param ldb is row stride of `B`
 * @param C is input/output array of output matrices
 * @param ldc is row stride of `C`
 * @param ith is thread id (must be less than `nth`)
 * @param nth is number of threads (must be greater than zero)
 * @param task is GGML task type
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @param precision may be used to control the internal compute type
 * @return true if this function was able to service the matmul request
 */
bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, task, Atype, Btype, Ctype,
                       precision);
}

/**
 * Performs "mixture of experts" tensor multiplication on CPU.
 */
bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) {
    return funcs.mixmul(params, weights, thought, plan, result);
}

bool llamafile_mixmul_iqk(long Nx, long Ny, long ne00, int ne11, int typeA, const void* A, const void* B, float* C, long nb1, long nb2, const void* vrow_mapping, int ith, int nth) {
    return funcs.iqk_mixmul(Nx, Ny, ne00, ne11, typeA, A, B, C, nb1, nb2, vrow_mapping, ith, nth);
}


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu.h
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
//
//                                ██████╗ ██╗   █████╗ ██████╗
//         ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║  ██╔══██╗██╔═══╝
//         ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║  ███████║██████╗
//           ██║  ██║██▀███║╚███╔╝██╔══██╗██║  ██╔══██║╔═══██║
//           ██║  ██║██║ ██║ ███║ ██████╔╝████╗██║  ██║██████║
//           ╚═╝  ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝  ╚═╝╚═════╝
//
//                   BASIC LINEAR ALGEBRA SUBPROGRAMS
//
//
// This file implements multithreaded CPU matrix multiplication for the
// common contiguous use case C = Aᵀ * B. These kernels are designed to
// have excellent performance[1] for matrices that fit in the CPU cache
// without imposing any overhead such as cache filling or malloc calls.
//
// This implementation does not guarantee any upper bound with rounding
// errors, which grow along with k. Our goal's to maximally exploit the
// hardware for performance, and then use whatever resources remain for
// improving numerical accuracy.
//
// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].

#pragma once

#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
// #include "log.h"
#include "flags.h"
#include "sgemm.h"
// #include <cosmo.h>

#pragma GCC diagnostic ignored "-Wpedantic"
#pragma GCC diagnostic ignored "-Wignored-attributes"

#define ROW_ALIGN 64
#define MATRIX_ALIGN 4096
#define MAX_ALIGN 4096

#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE __attribute__((__noinline__))
#endif

#if defined(__ARM_NEON) || defined(__AVX512F__)
#define VECTOR_REGISTERS 32
#else
#define VECTOR_REGISTERS 16
#endif

#if 0
#define NOT_SUPPORTED tinyBLAS_not_supported(__FILE__, __LINE__)
#else
#define NOT_SUPPORTED false
#endif
#define WANT_QUANTIZATION false

namespace {

bool tinyBLAS_not_supported(const char* file, int line) {
    // tinylogf("%s:%d: tinyBLAS not supported\n", file, line);
    return false;
}

inline float unhalf(ggml_fp16_t d) {
    return GGML_FP16_TO_FP32(d);
}
inline float unhalf(ggml_bf16_t d) {
    return GGML_BF16_TO_FP32(d);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// MATRIX MEMORY INDEXING

#define NCA 1
#define NCB 2
#define NCC 4

#define INDEX(A, lda, j, i) (CONFIG & NC##A ? ((T##A**)A)[j] + i : A + lda * (j) + i)

////////////////////////////////////////////////////////////////////////////////////////////////////
// GGML TYPE TRAITS

template <typename T>
struct ggml_type_trait;
template <>
struct ggml_type_trait<float> {
    static constexpr ggml_type id = GGML_TYPE_F32;
};
template <>
struct ggml_type_trait<ggml_bf16_t> {
    static constexpr ggml_type id = GGML_TYPE_BF16;
};
template <>
struct ggml_type_trait<ggml_fp16_t> {
    static constexpr ggml_type id = GGML_TYPE_F16;
};
template <>
struct ggml_type_trait<block_q8_0> {
    static constexpr ggml_type id = GGML_TYPE_Q8_0;
};

////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED ARITHMETIC OPERATIONS

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline __m128 add(__m128 x, __m128 y) {
    return _mm_add_ps(x, y);
}
inline __m128 sub(__m128 x, __m128 y) {
    return _mm_sub_ps(x, y);
}
inline __m128 mul(__m128 x, __m128 y) {
    return _mm_mul_ps(x, y);
}
#endif  // __SSE__

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline __m256 add(__m256 x, __m256 y) {
    return _mm256_add_ps(x, y);
}
inline __m256 sub(__m256 x, __m256 y) {
    return _mm256_sub_ps(x, y);
}
inline __m256 mul(__m256 x, __m256 y) {
    return _mm256_mul_ps(x, y);
}
#endif  // __AVX__

#if defined(__AVX512F__)
inline __m512 add(__m512 x, __m512 y) {
    return _mm512_add_ps(x, y);
}
inline __m512 sub(__m512 x, __m512 y) {
    return _mm512_sub_ps(x, y);
}
inline __m512 mul(__m512 x, __m512 y) {
    return _mm512_mul_ps(x, y);
}
#endif  // __AVX512F__

#if defined(__ARM_NEON)
inline float32x4_t add(float32x4_t x, float32x4_t y) {
    return vaddq_f32(x, y);
}
inline float32x4_t sub(float32x4_t x, float32x4_t y) {
    return vsubq_f32(x, y);
}
inline float32x4_t mul(float32x4_t x, float32x4_t y) {
    return vmulq_f32(x, y);
}
#endif  // __ARM_NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
inline float16x8_t add(float16x8_t x, float16x8_t y) {
    return vaddq_f16(x, y);
}
inline float16x8_t sub(float16x8_t x, float16x8_t y) {
    return vsubq_f16(x, y);
}
inline float16x8_t mul(float16x8_t x, float16x8_t y) {
    return vmulq_f16(x, y);
}
#endif  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED FUSED MULTIPLY ADD

/**
 * Computes a * b + c.
 */
template <typename T, typename U>
inline U madd(T a, T b, U c) {
    return add(mul(a, b), c);
}

/**
 * Computes a * b + c with error correction.
 *
 * @see W. Kahan, "Further remarks on reducing truncation errors,"
 *    Communications of the ACM, vol. 8, no. 1, p. 40, Jan. 1965,
 *    doi: 10.1145/363707.363723.
 */
template <typename T, typename U>
inline U madder(T a, T b, U c, U* e) {
    U y = sub(mul(a, b), *e);
    U t = add(c, y);
    *e = sub(sub(t, c), y);
    return t;
}

#ifdef __ARM_NEON
inline float32x4_t badder(float32x4_t a, float b, float32x4_t c, float32x4_t* e) {
    float32x4_t y = sub(vmulq_n_f32(a, b), *e);
    float32x4_t t = add(c, y);
    *e = sub(sub(t, c), y);
    return t;
}
#endif

#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template <>
inline __m256 madd(__m256 a, __m256 b, __m256 c) {
    return _mm256_fmadd_ps(a, b, c);
}
#endif
#if defined(__AVX512F__)
template <>
inline __m512 madd(__m512 a, __m512 b, __m512 c) {
    return _mm512_fmadd_ps(a, b, c);
}
#endif
#endif

#if defined(__ARM_FEATURE_FMA)
template <>
inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
    return vfmaq_f32(c, a, b);
}
#if 0  // todo: this specialization chops gcc 12.3 performance in half
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) && 0
template <>
inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
    return vfmaq_f16(c, b, a);
}
#endif
#endif
#endif

#if defined(__AVX512BF16__)
template <>
inline __m512 madd(__m512bh x, __m512bh y, __m512 z) {
    return _mm512_dpbf16_ps(z, x, y);
}
template <>
inline __m512 madder(__m512bh x, __m512bh y, __m512 z, __m512* _) {
    return _mm512_dpbf16_ps(z, x, y);
}
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED HORIZONTAL SUM

#if defined(__ARM_NEON)
inline float hsum(float32x4_t x) {
    return vaddvq_f32(x);
}
#endif  // __ARM_NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
inline float hsum(float16x8_t x) {
    // todo: this works great on clang but it produces terrible code on gcc 12.3
    return vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(x)), vcvt_f32_f16(vget_high_f16(x))));
}
#endif  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline float hsum(__m128 x) {
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
    x = _mm_add_ps(x, _mm_movehl_ps(x, x));
    x = _mm_add_ss(x, _mm_movehdup_ps(x));
#else
    __m128 t;
    t = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1));
    x = _mm_add_ps(x, t);
    t = _mm_movehl_ps(t, x);
    x = _mm_add_ss(x, t);
#endif
    return _mm_cvtss_f32(x);
}
#endif

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline float hsum(__m256 x) {
    return hsum(_mm_add_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x)));
}
#endif  // __AVX__

#if defined(__AVX512F__)
inline float hsum(__m512 x) {
    return _mm512_reduce_add_ps(x);
}
#endif  // __AVX512F__

////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED MEMORY LOADING

template <typename T, typename U>
T load(const U*);

template <>
inline float load(const float* p) {
    return *p;
}
template <>
inline float load(const ggml_fp16_t* p) {
    return unhalf(*p);
}
template <>
inline float load(const ggml_bf16_t* p) {
    return unhalf(*p);
}

#if defined(__ARM_NEON)
template <>
inline float32x4_t load(const float* p) {
    return vld1q_f32(p);
}
template <>
inline float32x4_t load(const ggml_bf16_t* p) {
    return vreinterpretq_f32_u32(vshll_n_u16(vld1_u16((const unsigned short*)p), 16));
}
#if !defined(_MSC_VER)
template <>
inline float16x8_t load(const ggml_fp16_t* p) {
    return vld1q_f16((const float16_t*)p);
}
template <>
inline float32x4_t load(const ggml_fp16_t* p) {
    return vcvt_f32_f16(vld1_f16((const float16_t*)p));
}
#endif  // _MSC_VER
#endif  // __ARM_NEON

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template <>
inline __m128 load(const float* p) {
    return _mm_loadu_ps(p);
}
#endif  // __SSE__

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template <>
inline __m256 load(const float* p) {
    return _mm256_loadu_ps(p);
}
#endif  // __AVX__

#if defined(__AVX2__) || defined(__AVX512F__)
template <>
inline __m256 load(const ggml_bf16_t* p) {
    return _mm256_castsi256_ps(
        _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)p)), 16));
}
#endif  // __AVX2__

#if defined(__F16C__)
template <>
inline __m256 load(const ggml_fp16_t* p) {
    return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)p));
}
#endif  // __F16C__

#if defined(__AVX512F__)
template <>
inline __m512 load(const float* p) {
    return _mm512_loadu_ps(p);
}
template <>
inline __m512 load(const ggml_fp16_t* p) {
    return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)p));
}
template <>
inline __m512 load(const ggml_bf16_t* p) {
    return _mm512_castsi512_ps(
        _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)p)), 16));
}
#endif  // __AVX512F__

#if defined(__AVX512BF16__)
template <>
inline __m512bh load(const ggml_bf16_t* p) {
    return (__m512bh)_mm512_loadu_ps((const float*)p);
}
template <>
inline __m512bh load(const float* p) {
    return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
}
#endif  // __AVX512BF16__

////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT OUTPUT STREAMING

inline void store(float* p, float f) {
    *p = f;
}

inline void store(ggml_fp16_t* p, float f) {
    *p = GGML_FP32_TO_FP16(f);
}

inline void store(ggml_bf16_t* p, float f) {
    *p = GGML_FP32_TO_BF16(f);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION

template <int CONFIG, int KN, typename D, typename V, typename TA, typename TB, typename TC>
class tinyBLAS {
   public:
    tinyBLAS(long k, const TA* A, long lda, const TB* B, long ldb, TC* C, long ldc, int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    void matmul(long m, long n, int task) {
        if (task == GGML_TASK_TYPE_COMPUTE)
            mnpack(0, m, 0, n);
    }

   private:
    NOINLINE void mnpack(long m0, long m, long n0, long n) {
        long mc, nc, mp, np;

#if VECTOR_REGISTERS == 32
        if (!FLAG_precise) {
            switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
                case 0x55:
                    mc = 5;
                    nc = 5;
                    gemm<5, 5, false>(m0, m, n0, n);
                    break;
                case 0x54:
                case 0x53:
                case 0x52:
                case 0x45:
                case 0x44:
                case 0x43:
                case 0x42:
                case 0x35:
                case 0x34:
                case 0x33:
                case 0x32:
                case 0x25:
                case 0x24:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, false>(m0, m, n0, n);
                    break;
                case 0x51:
                case 0x41:
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, false>(m0, m, n0, n);
                    break;
                case 0x15:
                case 0x14:
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, false>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, false>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else {
            switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 3)) {
                case 0x43:
                    mc = 4;
                    nc = 3;
                    gemm<4, 3, true>(m0, m, n0, n);
                    break;
                case 0x42:
                case 0x33:
                case 0x32:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, true>(m0, m, n0, n);
                    break;
                case 0x41:
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, true>(m0, m, n0, n);
                    break;
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, true>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, true>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        }
#endif

#if VECTOR_REGISTERS == 16
        if (!FLAG_precise) {
            switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 3)) {
                case 0x43:
                    mc = 4;
                    nc = 3;
                    gemm<4, 3, false>(m0, m, n0, n);
                    break;
                case 0x42:
                case 0x33:
                case 0x32:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, false>(m0, m, n0, n);
                    break;
                case 0x41:
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, false>(m0, m, n0, n);
                    break;
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, false>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, false>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else {
            switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 2)) {
                case 0x32:
                    mc = 3;
                    nc = 2;
                    gemm<3, 2, true>(m0, m, n0, n);
                    break;
                case 0x23:
                    mc = 2;
                    nc = 3;
                    gemm<2, 3, true>(m0, m, n0, n);
                    break;
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, true>(m0, m, n0, n);
                    break;
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, true>(m0, m, n0, n);
                    break;
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, true>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, true>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        }
#endif

        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

    template <int RM, int RN, int PRECISE>
    NOINLINE void gemm(long m0, long m, long n0, long n) {
        long ytiles = RM > 1 ? (m - m0) / RM : 1;
        long xtiles = RN > 1 ? (n - n0) / RN : 1;
        long tiles = xtiles * ytiles;
        long duty = (tiles + nth - 1) / nth;
        long start = duty * ith;
        long end = start + duty;
        if (end > tiles)
            end = tiles;
        for (long job = start; job < end; ++job) {
            long ii = m0 + job / xtiles * RM;
            long jj = n0 + job % xtiles * RN;
            D Cv[RN][RM] = {};
            D Ce[RN][RM] = {};
            for (long l = 0; l < k; l += KN)
#pragma GCC unroll 100
                for (int j = 0; j < RN; ++j)
#pragma GCC unroll 100
                    for (int i = 0; i < RM; ++i)
                        if (PRECISE)
                            Cv[j][i] = madder(load<V>(INDEX(A, lda, ii + i, l)),  //
                                              load<V>(INDEX(B, ldb, jj + j, l)),  //
                                              Cv[j][i], &Ce[j][i]);
                        else
                            Cv[j][i] = madd(load<V>(INDEX(A, lda, ii + i, l)),  //
                                            load<V>(INDEX(B, ldb, jj + j, l)),  //
                                            Cv[j][i]);
#pragma GCC unroll 100
            for (int j = 0; j < RN; ++j)
#pragma GCC unroll 100
                for (int i = 0; i < RM; ++i)
                    store(INDEX(C, ldc, jj + j, ii + i), hsum(Cv[j][i]));
        }
    }

    const TA* const A;
    const TB* const B;
    TC* const C;
    const long k;
    const long lda;
    const long ldb;
    const long ldc;
    const int ith;
    const int nth;
};

//////////////////////////////////////////////////////////////////////////////////////////
// QUANT ZERO MATRIX MULTIPLICATION

#if defined(__ARM_FEATURE_DOTPROD)
template <int CONFIG, typename TA, typename TB, typename TC>
class tinyBLAS_Q0_ARM {
   public:
    tinyBLAS_Q0_ARM(long k, const TA* A, long lda, const TB* B, long ldb, TC* C, long ldc, int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    void matmul(long m, long n, int task) {
        if (task == GGML_TASK_TYPE_COMPUTE)
            mnpack(0, m, 0, n);
    }

   private:
    NOINLINE void mnpack(long m0, long m, long n0, long n) {
        long mc, nc, mp, np;

        if (!FLAG_precise) {
            switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3)) {
                case 0x33:
                    mc = 3;
                    nc = 3;
                    gemm<3, 3, false>(m0, m, n0, n);
                    break;
                case 0x32:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, false>(m0, m, n0, n);
                    break;
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, false>(m0, m, n0, n);
                    break;
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, false>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, false>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else {
            switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3)) {
                case 0x33:
                    mc = 3;
                    nc = 3;
                    gemm<3, 3, true>(m0, m, n0, n);
                    break;
                case 0x32:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, true>(m0, m, n0, n);
                    break;
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, true>(m0, m, n0, n);
                    break;
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, true>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, true>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        }

        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

    template <int RM, int RN, int PRECISE>
    NOINLINE void gemm(long m0, long m, long n0, long n) {
        long ytiles = RM > 1 ? (m - m0) / RM : 1;
        long xtiles = RN > 1 ? (n - n0) / RN : 1;
        long tiles = xtiles * ytiles;
        long duty = (tiles + nth - 1) / nth;
        long start = duty * ith;
        long end = start + duty;
        if (end > tiles)
            end = tiles;
        for (long job = start; job < end; ++job) {
            long ii = m0 + job / xtiles * RM;
            long jj = n0 + job % xtiles * RN;
            float32x4_t Cv[RN][RM] = {};
            float32x4_t Ce[RN][RM] = {};
            for (int l = 0; l < k; ++l)
#pragma GCC unroll 100
                for (int j = 0; j < RN; ++j)
#pragma GCC unroll 100
                    for (int i = 0; i < RM; ++i) {
                        float32x4_t a = vcvtq_f32_s32(vdotq_s32(
                            vdotq_s32(vdupq_n_s32(0), load_lo(INDEX(A, lda, ii + i, l)),
                                      load_lo(INDEX(B, ldb, jj + j, l))),
                            load_hi(INDEX(A, lda, ii + i, l)), load_hi(INDEX(B, ldb, jj + j, l))));
                        float b = unhalf(INDEX(A, lda, ii + i, l)->d) *
                                  unhalf(INDEX(B, ldb, jj + j, l)->d);
                        if (PRECISE)
                            Cv[j][i] = badder(a, b, Cv[j][i], &Ce[j][i]);
                        else
                            Cv[j][i] = vmlaq_n_f32(Cv[j][i], a, b);
                    }
#pragma GCC unroll 100
            for (int j = 0; j < RN; ++j)
#pragma GCC unroll 100
                for (int i = 0; i < RM; ++i)
                    store(INDEX(C, ldc, jj + j, ii + i), hsum(Cv[j][i]));
        }
    }

    inline int8x16_t load_lo(const block_q8_0* b) {
        return vld1q_s8(b->qs);
    }

    inline int8x16_t load_hi(const block_q8_0* b) {
        return vld1q_s8(b->qs + 16);
    }

    inline int8x16_t load_lo(const block_q4_0* b) {
        return vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vld1q_u8(b->qs), vdupq_n_u8(0x0f))),
                        vdupq_n_s8(0x8));
    }

    inline int8x16_t load_hi(const block_q4_0* b) {
        return vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(vld1q_u8(b->qs), 4)), vdupq_n_s8(0x8));
    }

    const TA* const A;
    const TB* const B;
    TC* const C;
    const long k;
    const long lda;
    const long ldb;
    const long ldc;
    const int ith;
    const int nth;
};
#endif  // __ARM_FEATURE_DOTPROD

#if defined(__AVX2__) || defined(__AVX512F__)
template <int CONFIG, typename TA, typename TB, typename TC>
class tinyBLAS_Q0_AVX2 {
   public:
    tinyBLAS_Q0_AVX2(long k, const TA* A, long lda, const TB* B, long ldb, TC* C, long ldc, int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    void matmul(long m, long n, int task) {
        if (task == GGML_TASK_TYPE_COMPUTE)
            mnpack(0, m, 0, n);
    }

   private:
    void mnpack(long m0, long m, long n0, long n) {
        long mc, nc, mp, np;

#if VECTOR_REGISTERS == 32
        if (!FLAG_precise) {
            switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3)) {
                case 0x33:
                    mc = 3;
                    nc = 3;
                    gemm<3, 3, false>(m0, m, n0, n);
                    break;
                case 0x32:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, false>(m0, m, n0, n);
                    break;
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, true>(m0, m, n0, n);
                    break;
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, true>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, true>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else {
            switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3)) {
                case 0x33:
                    mc = 3;
                    nc = 3;
                    gemm<3, 3, true>(m0, m, n0, n);
                    break;
                case 0x32:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, true>(m0, m, n0, n);
                    break;
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, true>(m0, m, n0, n);
                    break;
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, true>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, true>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        }
#endif

#if VECTOR_REGISTERS == 16
        if (!FLAG_precise) {
            switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 2)) {
                case 0x32:
                    mc = 3;
                    nc = 2;
                    gemm<3, 2, false>(m0, m, n0, n);
                    break;
                case 0x23:
                    mc = 2;
                    nc = 3;
                    gemm<2, 3, false>(m0, m, n0, n);
                    break;
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, false>(m0, m, n0, n);
                    break;
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, false>(m0, m, n0, n);
                    break;
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, false>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, false>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else {
            switch ((MIN(m - m0, 2) << 4) | MIN(n - n0, 1)) {
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, true>(m0, m, n0, n);
                    break;
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, true>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, true>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        }
#endif

        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

    template <int RM, int RN, int PRECISE>
    NOINLINE void gemm(long m0, long m, long n0, long n) {
        long ytiles = RM > 1 ? (m - m0) / RM : 1;
        long xtiles = RN > 1 ? (n - n0) / RN : 1;
        long tiles = xtiles * ytiles;
        long duty = (tiles + nth - 1) / nth;
        long start = duty * ith;
        long end = start + duty;
        if (end > tiles)
            end = tiles;
        for (long job = start; job < end; ++job) {
            long ii = m0 + job / xtiles * RM;
            long jj = n0 + job % xtiles * RN;
            __m256 Cv[RN][RM] = {};
            __m256 Ce[RN][RM] = {};
            for (long l = 0; l < k; ++l)
#pragma GCC unroll 100
                for (int j = 0; j < RN; ++j)
#pragma GCC unroll 100
                    for (int i = 0; i < RM; ++i) {
                        __m256 a = _mm256_set1_ps(unhalf(INDEX(A, lda, ii + i, l)->d) *
                                                  unhalf(INDEX(B, ldb, jj + j, l)->d));
                        __m256 b = updot(_mm256_sign_epi8(load(INDEX(A, lda, ii + i, l)),
                                                          load(INDEX(A, lda, ii + i, l))),
                                         _mm256_sign_epi8(load(INDEX(B, ldb, jj + j, l)),
                                                          load(INDEX(A, lda, ii + i, l))));
                        if (PRECISE)
                            Cv[j][i] = madder(a, b, Cv[j][i], &Ce[j][i]);
                        else
                            Cv[j][i] = madd(a, b, Cv[j][i]);
                    }
#pragma GCC unroll 100
            for (int j = 0; j < RN; ++j)
#pragma GCC unroll 100
                for (int i = 0; i < RM; ++i)
                    store(INDEX(C, ldc, jj + j, ii + i), hsum(Cv[j][i]));
        }
    }

    inline __m256i load(const block_q8_0* b) {
        return _mm256_loadu_si256((const __m256i*)b->qs);
    }

    inline __m256i load(const block_q4_0* b) {
        __m128i x = _mm_loadu_si128((const __m128i*)b->qs);
        return _mm256_sub_epi8(_mm256_and_si256(_mm256_set1_epi8(15),
                                                _mm256_insertf128_si256(_mm256_castsi128_si256(x),
                                                                        _mm_srli_epi16(x, 4), 1)),
                               _mm256_set1_epi8(8));
    }

    inline __m256 updot(__m256i u, __m256i s) {
        __m256i res;
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
        res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
#else
        res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
#endif
        return _mm256_cvtepi32_ps(res);
    }

    const TA* const A;
    const TB* const B;
    TC* const C;
    const long k;
    const long lda;
    const long ldb;
    const long ldc;
    const int ith;
    const int nth;
};
#endif  // __AVX2__

}  // namespace


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_mixmul.inc
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul.inc
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "tinyblas_cpu.h"

//
//
//                                ██████╗ ██╗   █████╗ ██████╗
//         ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║  ██╔══██╗██╔═══╝
//         ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║  ███████║██████╗
//           ██║  ██║██▀███║╚███╔╝██╔══██╗██║  ██╔══██║╔═══██║
//           ██║  ██║██║ ██║ ███║ ██████╔╝████╗██║  ██║██████║
//           ╚═╝  ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝  ╚═╝╚═════╝
//
//               MIXTURE OF EXPERTS TENSOR MULTIPLICATION
//
//
// SHAPES
//
//   - weights [cols, rows, experts]
//   - thought [cols, tasks, tokens] w/ tasks ≤ thinkers
//   - result  [rows, thinkers, tokens] w/ thinkers ≤ experts
//   - plan    [thinkers, tokens] w/ i32 < experts
//
// DEFINITION
//
//   for thinker in range(thinkers):
//     for token in range(tokens):
//       for row in range(rows):
//         c = 0
//         for col in range(cols):
//           expert = plan[token][thinker]
//           a = weights[expert][row][col]
//           b = thought[token][thinker % tasks][col]
//           c += a * b
//         result[token][thinker][row] = c
//
// REGULARITIES
//
//   - tokens can be odd
//   - thinkers is usually 2
//   - tasks is usually 1 or 2
//   - cols should be a multiple of 64
//   - rows should be a multiple of 64
//   - experts is usually 8 but could be 60
//   - tokens is always 1 for token generation
//   - tokens can be huge for prompt processing
//
// EXAMPLE
//
//   mixtral 8x7b w/ 217 token prompt
//
//           |  ne*0 ne*1 ne*2 ne*3 | nb*0    nb*1      nb*2       nb*3 | type
//   =========================================================================
//   weights | 16384 6144    8    1 |   18  0x2400 0x3600000 0x1b000000 | q4_0
//   thought | 16384    2  217    1 |    4 0x10000   0x20000  0x1b20000 | f32
//   result  |  6144    2  217    1 |    4  0x6000    0xc000   0xa2c000 | f32
//   plan    |     2  217    1    1 |    4    0x20    0x1b20     0x1b20 | i32
//

namespace {

class MixMul {
   public:
    MixMul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result)
        : params(params),
          weights(weights),
          thought(thought),
          plan(plan),
          result(result),
          rows(weights->ne[1]),
          cols(weights->ne[0]),
          experts(weights->ne[2]),
          thinkers(plan->ne[0]),
          tasks(thought->ne[1]),
          tokens(thought->ne[2]),
          ldq((cols * 2 + ROW_ALIGN - 1) & -ROW_ALIGN),
          wdata_((char*)(((uintptr_t)params->wdata + MAX_ALIGN - 1) & -MAX_ALIGN)),
          allocated_(0) {
    }

    bool allocate_shared_memory() {
        if (!(quantized_thought_ = allocate<char>(MATRIX_ALIGN, tokens * tasks * ldq)))
            return false;
        if (!(rowptr_result_ = allocate<uintptr_t>(ROW_ALIGN, experts * tokens * thinkers)))
            return false;
        if (!(rowptr_thought_ = allocate<uintptr_t>(ROW_ALIGN, experts * tokens * thinkers)))
            return false;
        if (!(rowptr_count_ = allocate<long>(sizeof(long), experts)))
            return false;
        return true;
    }

    size_t get_allocated_bytes() {
        return (wdata_ - (char*)params->wdata) + allocated_;
    }

    bool mixmul() {
        // invariants
        assert(tasks <= thinkers);
        assert(thinkers <= experts);
        assert(tokens == plan->ne[1]);
        assert(rows == result->ne[0]);
        assert(cols == thought->ne[0]);
        assert(tokens == result->ne[2]);
        assert(thinkers == result->ne[1]);

        // dimensionality
        assert(plan->ne[2] == 1);
        assert(plan->ne[3] == 1);
        assert(result->ne[3] == 1);
        assert(weights->ne[3] == 1);
        assert(thought->ne[3] == 1);

        // miscellaneous
        assert(params->nth > 0);
        assert(params->ith < params->nth);
        assert(plan->type == GGML_TYPE_I32);

        // check nb01 is convertible to lda
        if (weights->nb[1] % ggml_type_size(weights->type))
            return false;

        // no support for column strides
        if (result->nb[0] != ggml_type_size(result->type))
            return false;
        if (thought->nb[0] != ggml_type_size(thought->type))
            return false;
        if (weights->nb[0] != ggml_type_size(weights->type))
            return false;

        // supported output types
        switch (result->type) {
            case GGML_TYPE_F32:
                return mixmuler<float>();
            default:
                return false;
        }
    }

   private:
    template <typename TC>
    bool mixmuler() {
        switch (weights->type) {
            case GGML_TYPE_F32:
                if (thought->type != GGML_TYPE_F32)
                    return false;
#if defined(__AVX512F__)
                return mixmat<16, 1, tinyBLAS<NCB | NCC, 16, __m512, __m512, float, float, TC>, float,
                              float, TC>();
#elif defined(__AVX__) || defined(__AVX2__)
                return mixmat<8, 1, tinyBLAS<NCB | NCC, 8, __m256, __m256, float, float, TC>, float,
                              float, TC>();
#elif defined(__SSE__)
                return mixmat<4, 1, tinyBLAS<NCB | NCC, 4, __m128, __m128, float, float, TC>, float,
                              float, TC>();
#elif defined(__ARM_NEON)
                return mixmat<4, 1, tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, float, float, TC>,
                              float, float, TC>();
#else
                return false;
#endif

            case GGML_TYPE_BF16:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_BF16)
                    return false;
#if defined(__AVX512BF16__)
                if (!FLAG_precise) {
                    return mixmat<
                        32, 1, tinyBLAS<NCB | NCC, 32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, TC>,
                        ggml_bf16_t, ggml_bf16_t, TC>();
                } else {
                    return mixmat<16, 1,
                                  tinyBLAS<NCB | NCC, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC>,
                                  ggml_bf16_t, ggml_bf16_t, TC>();
                }
#elif defined(__AVX512F__)
                return mixmat<16, 1,
                              tinyBLAS<NCB | NCC, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC>,
                              ggml_bf16_t, ggml_bf16_t, TC>();
#elif defined(__AVX2__)
                return mixmat<8, 1,
                              tinyBLAS<NCB | NCC, 8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, TC>,
                              ggml_bf16_t, ggml_bf16_t, TC>();
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
                return mixmat<
                    4, 1,
                    tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, ggml_bf16_t, ggml_bf16_t, TC>,
                    ggml_bf16_t, ggml_bf16_t, TC>();
#else
                return false;
#endif

            case GGML_TYPE_F16:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_F16)
                    return false;
#if defined(__AVX512F__)
                return mixmat<16, 1,
                              tinyBLAS<NCB | NCC, 16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, TC>,
                              ggml_fp16_t, ggml_fp16_t, TC>();
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
                // if (X86_CHECK(F16C)) {
                return mixmat<8, 1,
                              tinyBLAS<NCB | NCC, 8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, TC>,
                              ggml_fp16_t, ggml_fp16_t, TC>();
                // } else {
                //     return false;
                // }
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
                if (result->op_params[0] == GGML_PREC_F32) {
                    return mixmat<
                        4, 1,
                        tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, TC>,
                        ggml_fp16_t, ggml_fp16_t, TC>();
                } else {
                    return mixmat<
                        8, 1,
                        tinyBLAS<NCB | NCC, 8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, TC>,
                        ggml_fp16_t, ggml_fp16_t, TC>();
                }
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
                return mixmat<
                    4, 1,
                    tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, TC>,
                    ggml_fp16_t, ggml_fp16_t, TC>();
#else
                return false;
#endif

            case GGML_TYPE_Q4_0:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_Q8_0)
                    return false;
#if defined(__AVX2__) || defined(__AVX512F__)
                return mixmat<32, 32, tinyBLAS_Q0_AVX2<NCB | NCC, block_q4_0, block_q8_0, TC>,
                              block_q4_0, block_q8_0, TC>();
#elif defined(__ARM_FEATURE_DOTPROD)
                return mixmat<32, 32, tinyBLAS_Q0_ARM<NCB | NCC, block_q4_0, block_q8_0, TC>,
                              block_q4_0, block_q8_0, TC>();
#else
                return false;
#endif

            case GGML_TYPE_Q8_0:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_Q8_0)
                    return false;
#if defined(__AVX2__) || defined(__AVX512F__)
                return mixmat<32, 32, tinyBLAS_Q0_AVX2<NCB | NCC, block_q8_0, block_q8_0, TC>,
                              block_q8_0, block_q8_0, TC>();
#elif defined(__ARM_FEATURE_DOTPROD)
                return mixmat<32, 32, tinyBLAS_Q0_ARM<NCB | NCC, block_q8_0, block_q8_0, TC>,
                              block_q8_0, block_q8_0, TC>();
#else
                return false;
#endif

            default:
                return false;
        }
    }

    template <int KN, int BS, typename BLAS, typename TA, typename TB, typename TC>
    bool mixmat() {
        if (cols % KN)
            return false;
        switch (params->type) {
            case GGML_TASK_TYPE_INIT:
                if (thought->type != ggml_type_trait<TB>::id)
                    quantize_thought(ggml_type_trait<TB>::id);
                build_row_pointers(ggml_type_trait<TB>::id);
                return true;
            case GGML_TASK_TYPE_COMPUTE:
                assert(!(cols % BS));
                assert(!(weights->nb[1] % sizeof(TA)));
                for (int expert = 0; expert < experts; ++expert) {
                    BLAS tb{cols / BS,
                            (const TA*)((const char*)weights->data + expert * weights->nb[2]),
                            (long)(weights->nb[1] / sizeof(TA)),
                            (const TB*)(rowptr_thought_ + expert * tokens * thinkers),
                            0,
                            (TC*)(rowptr_result_ + expert * tokens * thinkers),
                            0,
                            params->ith,
                            params->nth};
                    tb.matmul(rows, rowptr_count_[expert], GGML_TASK_TYPE_COMPUTE);
                }
                return true;
            default:
                return true;
        }
    }

    void build_row_pointers(ggml_type vec_dot_type) {
        for (int expert = params->ith; expert < experts; expert += params->nth) {
            long count = 0;
            for (long token = 0; token < tokens; ++token)
                for (int thinker = 0; thinker < thinkers; ++thinker)
                    if (expert == *(const int32_t*)((const char*)plan->data +
                                                    token * plan->nb[1] + thinker * plan->nb[0])) {
                        long row = count++;
                        long idx = expert * thinkers * tokens + row;
                        rowptr_result_[idx] =
                            (uintptr_t)((char*)result->data + token * result->nb[2] +
                                        thinker * result->nb[1]);
                        if (thought->type == vec_dot_type)
                            rowptr_thought_[idx] =
                                (uintptr_t)((char*)thought->data + token * thought->nb[2] +
                                            thinker % tasks * thought->nb[1]);
                        else
                            rowptr_thought_[idx] =
                                (uintptr_t)((char*)quantized_thought_ + token * tasks * ldq +
                                            thinker % tasks * ldq);
                    }
            rowptr_count_[expert] = count;
        }
    }

    void quantize_thought(ggml_type vec_dot_type) {
        long chore = 0;
        for (long token = 0; token < tokens; ++token)
            for (int task = 0; task < tasks; ++task)
                if (chore++ % params->nth == params->ith)
                    quantize_row(quantized_thought_ + token * tasks * ldq + task * ldq,
                                 (const float*)((const char*)thought->data +
                                                token * thought->nb[2] + task * thought->nb[1]),
                                 vec_dot_type);
    }

    void quantize_row(void* dst, const float* src, ggml_type type) {
        assert((long)ggml_row_size(type, cols) <= ldq);
        switch (type) {
            case GGML_TYPE_F16:
                ggml_fp32_to_fp16_row(src, (ggml_fp16_t*)dst, cols);
                break;
            case GGML_TYPE_BF16:
                ggml_fp32_to_bf16_row(src, (ggml_bf16_t*)dst, cols);
                break;
            case GGML_TYPE_Q8_0:
                quantize_row_q8_0((const float*)src, (block_q8_0*)dst, cols);
                break;
            default:
                GGML_UNREACHABLE();
        }
    }

    template <typename T>
    T* allocate(size_t align, size_t elems) {
        T* res = nullptr;
        size_t need = sizeof(T) * elems;
        size_t base = allocated_;
        base += align - 1;
        base &= -align;
        size_t toto = base + need;
        if (toto >= allocated_ && toto <= params->wsize) {
            res = (T*)(wdata_ + base);
            allocated_ = toto;
        }
        return res;
    }

    const ggml_compute_params* const params;
    const ggml_tensor* const weights;
    const ggml_tensor* const thought;
    const ggml_tensor* const plan;
    ggml_tensor* const result;
    const long rows;
    const long cols;
    const int experts;
    const int thinkers;
    const int tasks;
    const long tokens;
    const long ldq;

    // variables
    char* const wdata_;
    size_t allocated_;

    // shared memory
    long* rowptr_count_ /*[experts]*/;
    char* quantized_thought_ /*[tokens][tasks][cols][2]*/;
    uintptr_t* rowptr_result_ /*[experts][tokens*thinkers]*/;
    uintptr_t* rowptr_thought_ /*[experts][tokens*thinkers]*/;
};

}  // namespace

/**
 * Performs "mixture of experts" tensor multiplication on CPU.
 */
bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) {
    MixMul mm{params, weights, thought, plan, result};
    return mm.allocate_shared_memory() && mm.mixmul();
}


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_avx
#include "tinyblas_cpu_mixmul.inc"

/**
 * Returns number of shared memory bytes llamafile_mixmul() needs.
 */
size_t llamafile_mixmul_needs(const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan) {
    ggml_compute_params params{};
    params.wsize = 0x7ffff000;
    params.wdata = (void*)0x1000;
    MixMul mm{&params, weights, thought, plan, 0};
    if (mm.allocate_shared_memory())
        return mm.get_allocated_bytes();
    else
        return 0;
}

#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_avx2
#include "tinyblas_cpu_mixmul.inc"
#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_avx512f
#include "tinyblas_cpu_mixmul.inc"
#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_avxvnni
#include "tinyblas_cpu_mixmul.inc"
#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_fma
#include "tinyblas_cpu_mixmul.inc"
#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_zen4
#include "tinyblas_cpu_mixmul.inc"
#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_mixmul_arm80.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_arm80.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#ifdef __aarch64__
#define llamafile_mixmul llamafile_mixmul_arm80
#include "tinyblas_cpu_mixmul.inc"

/**
 * Returns number of shared memory bytes llamafile_mixmul() needs.
 */
size_t llamafile_mixmul_needs(const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan) {
    ggml_compute_params params{};
    params.wsize = 0x7ffff000;
    params.wdata = (void*)0x1000;
    MixMul mm{&params, weights, thought, plan, 0};
    if (mm.allocate_shared_memory())
        return mm.get_allocated_bytes();
    else
        return 0;
}

#endif  // __aarch64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_mixmul_arm82.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_arm82.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#ifdef __aarch64__
#define llamafile_mixmul llamafile_mixmul_arm82
#include "tinyblas_cpu_mixmul.inc"
#endif  // __aarch64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_sgemm.inc
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm.inc
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
//
//                                ██████╗ ██╗   █████╗ ██████╗
//         ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║  ██╔══██╗██╔═══╝
//         ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║  ███████║██████╗
//           ██║  ██║██▀███║╚███╔╝██╔══██╗██║  ██╔══██║╔═══██║
//           ██║  ██║██║ ██║ ███║ ██████╔╝████╗██║  ██║██████║
//           ╚═╝  ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝  ╚═╝╚═════╝
//
//                   BASIC LINEAR ALGEBRA SUBPROGRAMS
//
//
// This file implements multithreaded CPU matrix multiplication for the
// common contiguous use case C = Aᵀ * B. These kernels are designed to
// have excellent performance[1] for matrices that fit in the CPU cache
// without imposing any overhead such as cache filling or malloc calls.
//
// This implementation does not guarantee any upper bound with rounding
// errors, which grow along with k. Our goal's to maximally exploit the
// hardware for performance, and then use whatever resources remain for
// improving numerical accuracy.
//
// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].

#if defined(KTRANSFORMERS_USE_NPU) && KTRANSFORMERS_USE_NPU
        // use ARM version
        #include "tinyblas_cpu_sgemm_arm.inc"
#else
        // use x86 version
        #include "tinyblas_cpu_sgemm_x86.inc"
#endif

================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_avx
#include "tinyblas_cpu_sgemm.inc"
#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_avx2
#include "tinyblas_cpu_sgemm.inc"
#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_avx512f
#include "tinyblas_cpu_sgemm.inc"
#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_avxvnni
#include "tinyblas_cpu_sgemm.inc"
#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_fma
#include "tinyblas_cpu_sgemm.inc"
#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_zen4
#define iqk_mul_mat iqk_mul_mat_zen4
#include "tinyblas_cpu_sgemm.inc"
#endif  // __x86_64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_sgemm_arm.inc
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm.inc
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "tinyblas_cpu.h"
#include <arm_neon.h>
#include <ostream>
#include <iostream>
//
//
//                                ██████╗ ██╗   █████╗ ██████╗
//         ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║  ██╔══██╗██╔═══╝
//         ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║  ███████║██████╗
//           ██║  ██║██▀███║╚███╔╝██╔══██╗██║  ██╔══██║╔═══██║
//           ██║  ██║██║ ██║ ███║ ██████╔╝████╗██║  ██║██████║
//           ╚═╝  ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝  ╚═╝╚═════╝
//
//                   BASIC LINEAR ALGEBRA SUBPROGRAMS
//
//
// This file implements multithreaded CPU matrix multiplication for the
// common contiguous use case C = Aᵀ * B. These kernels are designed to
// have excellent performance[1] for matrices that fit in the CPU cache
// without imposing any overhead such as cache filling or malloc calls.
//
// This implementation does not guarantee any upper bound with rounding
// errors, which grow along with k. Our goal's to maximally exploit the
// hardware for performance, and then use whatever resources remain for
// improving numerical accuracy.
//
// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].

namespace {

template <typename TC>
void SgemmHelperN1Neon2(long m, long n, long k, const float16_t* A, long lda, const float16_t* B, long ldb,
                        TC* C, long ldc, int ith, int nth) {
    // A m * k    B n * k    c n * m
    const long NVL = 8;
    long kk = k / (NVL * 4);
    kk = kk * (NVL * 4);
    long length = (m / nth) + (ith < (m % nth) ? 1 : 0);
    long startRow = ith * (m / nth) + (ith < (m % nth) ? ith : (m % nth));
    long endRow = startRow + length;
    for (long i = startRow; i < endRow; i ++) {
        const float16_t* tA = A + i * lda;
        float32x4_t c0 = vdupq_n_f32(0);
        float32x4_t c1 = vdupq_n_f32(0);
        float32x4_t c2 = vdupq_n_f32(0);
        float32x4_t c3 = vdupq_n_f32(0);
        float32x4_t c4 = vdupq_n_f32(0);
        float32x4_t c5 = vdupq_n_f32(0);
        float32x4_t c6 = vdupq_n_f32(0);
        float32x4_t c7 = vdupq_n_f32(0);
        for (long j = 0; j < kk; j += NVL * 4) {
            __builtin_prefetch(tA + 192, 0, 0);
            float16x8_t a0 = vld1q_f16(tA + j);
            float16x8_t b0 = vld1q_f16(B + j);
            c0 = vfmlalq_low_f16(c0, a0, b0);
            c1 = vfmlalq_high_f16(c1, a0, b0);
            float16x8_t a1 = vld1q_f16(tA + j + NVL);
            float16x8_t b1 = vld1q_f16(B + j + NVL);
            c2 = vfmlalq_low_f16(c2, a1, b1);
            c3 = vfmlalq_high_f16(c3, a1, b1);
            float16x8_t a2 = vld1q_f16(tA + j + NVL * 2);
            float16x8_t b2 = vld1q_f16(B + j + NVL * 2);
            c4 = vfmlalq_low_f16(c4, a2, b2);
            c5 = vfmlalq_high_f16(c5, a2, b2);
            float16x8_t a3 = vld1q_f16(tA + j + NVL * 3);
            float16x8_t b3 = vld1q_f16(B + j + NVL * 3);
            c6 = vfmlalq_low_f16(c6, a3, b3);
            c7 = vfmlalq_high_f16(c7, a3, b3);
        }
        if (k - kk >= NVL * 2) {
            float16x8_t a0 = vld1q_f16(tA + kk);
            float16x8_t b0 = vld1q_f16(B + kk);
            c0 = vfmlalq_low_f16(c0, a0, b0);
            c1 = vfmlalq_high_f16(c1, a0, b0);
            float16x8_t a1 = vld1q_f16(tA + kk + NVL);
            float16x8_t b1 = vld1q_f16(B + kk + NVL);
            c2 = vfmlalq_low_f16(c2, a1, b1);
            c3 = vfmlalq_high_f16(c3, a1, b1);
            kk += NVL * 2;
        }
        if (k - kk >= NVL) {
            float16x8_t a = vld1q_f16(tA + kk);
            float16x8_t b = vld1q_f16(B + kk);
            c0 = vfmlalq_low_f16(c0, a, b);
            c1 = vfmlalq_high_f16(c1, a, b);
            kk += NVL;
        }
        TC sum = 0.0f;
        for (long j = kk; j < k; j ++) {
            sum += (float32_t)tA[j] * (float32_t)B[j];
        }
        c0 = vaddq_f32(c0, c1);
        c2 = vaddq_f32(c2, c3);
        c4 = vaddq_f32(c4, c5);
        c6 = vaddq_f32(c6, c7);
        c0 = vaddq_f32(c0, c2);
        c4 = vaddq_f32(c4, c6);
        sum += vaddvq_f32(c0) + vaddvq_f32(c4);
        C[i] = sum;
    }
    return;
}

template <typename TC>
void SgemmHelperN1(long m, long n, long k, const ggml_fp16_t* A_, long lda, const ggml_fp16_t* B_, long ldb,
                   TC* C, long ldc, int ith, int nth) {
    // A m * k    B n * k    c n * m
    float16_t *A = (float16_t*)A_;
    float16_t *B = (float16_t*)B_;
    long rowsPerThread = m / nth;
    long startRow = ith * rowsPerThread;
    long endRow = (ith == nth - 1) ? m : startRow + rowsPerThread;
    for (long i = startRow; i < endRow; i ++) {
        TC sum = 0.0f;
        for (long j = 0; j < k; j ++) {
            sum += (float32_t)A[i * lda + j] * (float32_t)B[j];
        }
        C[i] = sum;
    }
    return;
}

template <typename TC>
bool llamafile_sgemm_impl(long m, long n, long k, const void* A, long lda, const void* B, long ldb, TC* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    // std::cout << "tinyBLAS tinyBLAS NOT_SUPPORTED FP16  55, n: " << n << ", m: " << m << ", k: " << k << ", FLAG_precise: " << FLAG_precise << "\n"<<std::endl;
    switch (Atype) {
        case GGML_TYPE_F32: {
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
#if defined(__AVX512F__)
            if (k % 16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__AVX__) || defined(__AVX2__)
            if (k % 8)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_NEON)
            if (k % 4)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_BF16: {
#if defined(__AVX512BF16__)
            if (k % 32)
                return NOT_SUPPORTED;
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_BF16)
                return NOT_SUPPORTED;
            if (!FLAG_precise) {
                tinyBLAS<0, 32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            } else {
                tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
#elif defined(__AVX512F__)
            if (k % 16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__AVX2__)
            if (k % 8)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
            if (k % 4)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_F16: {
#if defined(__AVX512F__)
            if (k % 16)
                return NOT_SUPPORTED;
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_F16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
            // if (X86_CHECK(F16C)) {
            if (k % 8)
                return NOT_SUPPORTED;
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_F16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
            // } else {
            //     return NOT_SUPPORTED;
            // }
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
            if (n < 2 && !FLAG_precise) {
                // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec?
                if (Btype == GGML_TYPE_F16 && task == GGML_TASK_TYPE_COMPUTE) {
                    SgemmHelperN1Neon2<TC>(m, n, k, (const float16_t*)A, lda, (const float16_t*)B, ldb, C, ldc, ith, nth);
                    // SgemmHelperN1<TC>(m, n, k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth);
                    return true;
                }
                return NOT_SUPPORTED;
            }
            if (precision == GGML_PREC_F32) {
                if (k % 4)
                    return NOT_SUPPORTED;
                if (Btype != GGML_TYPE_F32)
                    return NOT_SUPPORTED;
                tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            } else {
                if (k % 8)
                    return NOT_SUPPORTED;
                if (Btype == GGML_TYPE_F32)
                    return WANT_QUANTIZATION;
                if (Btype != GGML_TYPE_F16)
                    return NOT_SUPPORTED;
                tinyBLAS<0, 8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
            if (n < 2 && !FLAG_precise) {
                // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec?
                // printf("tinyBLAS tinyBLAS NOT_SUPPORTED FP16 225, m: %ld, n: %ld, k: %ld\n", m, n, k);
                if (Btype == GGML_TYPE_F16 && task == GGML_TASK_TYPE_COMPUTE) {
                    SgemmHelperN1Neon2<TC>(m, n, k, (const float16_t*)A, lda, (const float16_t*)B, ldb, C, ldc, ith, nth);
                    // SgemmHelperN1<TC>(m, n, k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth);
                    return true;
                }
                std::cout << "tinyBLAS tinyBLAS NOT_SUPPORTED FP16 231, n: " << n << ", m: " << m << ", k: " << m << ", FLAG_precise: " << FLAG_precise << "\n"<<std::endl;
                return NOT_SUPPORTED;
            }
            if (k % 4) {
                // std::cout << "tinyBLAS tinyBLAS NOT_SUPPORTED FP16  215" <<std::endl;
                return NOT_SUPPORTED;
            }
            if (Btype != GGML_TYPE_F32) {
                // std::cout << "tinyBLAS tinyBLAS NOT_SUPPORTED FP16  218" <<std::endl;
                return NOT_SUPPORTED;
            }
            // std::cout << "tinyBLAS tinyBLAS true FP16" <<std::endl;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            // std::cout << "tinyBLAS tinyBLAS NOT_SUPPORTED FP16" <<std::endl;
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_Q8_0: {
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_Q8_0)
                return NOT_SUPPORTED;
#if defined(__AVX2__) || defined(__AVX512F__)
            tinyBLAS_Q0_AVX2<0, block_q8_0, block_q8_0, TC> tb{
                k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_FEATURE_DOTPROD)
            tinyBLAS_Q0_ARM<0, block_q8_0, block_q8_0, TC> tb{
                k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_Q4_0: {
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_Q8_0)
                return NOT_SUPPORTED;
#if defined(__AVX2__) || defined(__AVX512F__)
            tinyBLAS_Q0_AVX2<0, block_q4_0, block_q8_0, TC> tb{
                k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_FEATURE_DOTPROD)
            tinyBLAS_Q0_ARM<0, block_q4_0, block_q8_0, TC> tb{
                k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        default:
            return NOT_SUPPORTED;
    }

    (void)m;
    (void)n;
    (void)k;
    (void)A;
    (void)lda;
    (void)B;
    (void)ldb;
    (void)C;
    (void)ldc;
    (void)ith;
    (void)nth;
    (void)Atype;
    (void)Btype;
    (void)precision;
}

}  // namespace

/**
 * Performs optimized matrix multiplication on CPU.
 *
 * This subroutine may compute C = Aᵀ * B with column major ordering.
 * Despite its name, this isn't a generalized implementation. Work is
 * only performed when a handwritten kernel is written and available.
 * Otherwise the caller should fall back to a general matmul routine.
 *
 * For example, for single-threaded single-precision GEMM you can say
 *
 *     llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1,
 *                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32,
 *                     GGML_PREC_DEFAULT);
 *
 * @param m is rows in `A` and `C`
 * @param n is cols in `B` and `C`
 * @param k is cols in `A` and rows in `B`
 * @param A is first input matrix (always transposed)
 * @param lda is row stride of `A`
 * @param B is second input matrix (never transposed)
 * @param ldb is row stride of `B`
 * @param C is input/output array of output matrices
 * @param ldc is row stride of `C`
 * @param ith is thread id (must be less than `nth`)
 * @param nth is number of threads (must be greater than zero)
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @param precision may be used to control the internal compute type
 * @return true if this function was able to service the matmul request
 */
bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    assert(m >= 0);
    assert(n >= 0);
    assert(k >= 0);
    assert(lda >= k);
    assert(ldb >= k);
    assert(ldc >= m);
    assert(nth > 0);
    assert(ith < nth);

#if QK_K == 256
#if defined(__x86_64__) || defined(_M_X64)
#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))))
    // if (X86_CHECK(AVX2) && X86_CHECK(FMA)) {
    if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32){
        if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }
    if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) {
        // assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32);
        assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32));
        if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }
    // }
#endif
#elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER
    if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
        if (iqk_mul_mat(m, n, k * QK_K, Atype, A, k, Btype, B, k, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }
    if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) {
        // assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32);
        assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32));
        if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, k, Btype, B, k, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }
#endif
#endif

    switch (Ctype) {
        case GGML_TYPE_F32:
            return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (float*)C, ldc, ith, nth, task, Atype,
                                        Btype, Ctype, precision);
        default:
            return NOT_SUPPORTED;
    }
}


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_sgemm_arm80.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_arm80.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#ifdef __aarch64__
#define llamafile_sgemm llamafile_sgemm_arm80
#include "tinyblas_cpu_sgemm.inc"
#endif  // __aarch64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_sgemm_arm82.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_arm82.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#ifdef __aarch64__
#define llamafile_sgemm llamafile_sgemm_arm82
#define iqk_mul_mat iqk_mul_mat_arm82
#include "tinyblas_cpu_sgemm.inc"
#endif  // __aarch64__


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_sgemm_x86.inc
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm.inc
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "tinyblas_cpu.h"

//
//
//                                ██████╗ ██╗   █████╗ ██████╗
//         ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║  ██╔══██╗██╔═══╝
//         ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║  ███████║██████╗
//           ██║  ██║██▀███║╚███╔╝██╔══██╗██║  ██╔══██║╔═══██║
//           ██║  ██║██║ ██║ ███║ ██████╔╝████╗██║  ██║██████║
//           ╚═╝  ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝  ╚═╝╚═════╝
//
//                   BASIC LINEAR ALGEBRA SUBPROGRAMS
//
//
// This file implements multithreaded CPU matrix multiplication for the
// common contiguous use case C = Aᵀ * B. These kernels are designed to
// have excellent performance[1] for matrices that fit in the CPU cache
// without imposing any overhead such as cache filling or malloc calls.
//
// This implementation does not guarantee any upper bound with rounding
// errors, which grow along with k. Our goal's to maximally exploit the
// hardware for performance, and then use whatever resources remain for
// improving numerical accuracy.
//
// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].

namespace {

template <typename TC>
bool llamafile_sgemm_impl(long m, long n, long k, const void* A, long lda, const void* B, long ldb, TC* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    switch (Atype) {
        case GGML_TYPE_F32: {
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
#if defined(__AVX512F__)
            if (k % 16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__AVX__) || defined(__AVX2__)
            if (k % 8)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_NEON)
            if (k % 4)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_BF16: {
#if defined(__AVX512BF16__)
            if (k % 32)
                return NOT_SUPPORTED;
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_BF16)
                return NOT_SUPPORTED;
            if (!FLAG_precise) {
                tinyBLAS<0, 32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            } else {
                tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
#elif defined(__AVX512F__)
            if (k % 16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__AVX2__)
            if (k % 8)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
            if (k % 4)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_F16: {
#if defined(__AVX512F__)
            if (k % 16)
                return NOT_SUPPORTED;
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_F16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
            // if (X86_CHECK(F16C)) {
            if (k % 8)
                return NOT_SUPPORTED;
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_F16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
            // } else {
            //     return NOT_SUPPORTED;
            // }
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
            if (n < 2 && !FLAG_precise)
                // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec?
                return NOT_SUPPORTED;
            if (precision == GGML_PREC_F32) {
                if (k % 4)
                    return NOT_SUPPORTED;
                if (Btype != GGML_TYPE_F32)
                    return NOT_SUPPORTED;
                tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            } else {
                if (k % 8)
                    return NOT_SUPPORTED;
                if (Btype == GGML_TYPE_F32)
                    return WANT_QUANTIZATION;
                if (Btype != GGML_TYPE_F16)
                    return NOT_SUPPORTED;
                tinyBLAS<0, 8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
            if (n < 2 && !FLAG_precise)
                // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec?
                return NOT_SUPPORTED;
            if (k % 4)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_Q8_0: {
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_Q8_0)
                return NOT_SUPPORTED;
#if defined(__AVX2__) || defined(__AVX512F__)
            tinyBLAS_Q0_AVX2<0, block_q8_0, block_q8_0, TC> tb{
                k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_FEATURE_DOTPROD)
            tinyBLAS_Q0_ARM<0, block_q8_0, block_q8_0, TC> tb{
                k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_Q4_0: {
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_Q8_0)
                return NOT_SUPPORTED;
#if defined(__AVX2__) || defined(__AVX512F__)
            tinyBLAS_Q0_AVX2<0, block_q4_0, block_q8_0, TC> tb{
                k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_FEATURE_DOTPROD)
            tinyBLAS_Q0_ARM<0, block_q4_0, block_q8_0, TC> tb{
                k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        default:
            return NOT_SUPPORTED;
    }

    (void)m;
    (void)n;
    (void)k;
    (void)A;
    (void)lda;
    (void)B;
    (void)ldb;
    (void)C;
    (void)ldc;
    (void)ith;
    (void)nth;
    (void)Atype;
    (void)Btype;
    (void)precision;
}

}  // namespace

/**
 * Performs optimized matrix multiplication on CPU.
 *
 * This subroutine may compute C = Aᵀ * B with column major ordering.
 * Despite its name, this isn't a generalized implementation. Work is
 * only performed when a handwritten kernel is written and available.
 * Otherwise the caller should fall back to a general matmul routine.
 *
 * For example, for single-threaded single-precision GEMM you can say
 *
 *     llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1,
 *                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32,
 *                     GGML_PREC_DEFAULT);
 *
 * @param m is rows in `A` and `C`
 * @param n is cols in `B` and `C`
 * @param k is cols in `A` and rows in `B`
 * @param A is first input matrix (always transposed)
 * @param lda is row stride of `A`
 * @param B is second input matrix (never transposed)
 * @param ldb is row stride of `B`
 * @param C is input/output array of output matrices
 * @param ldc is row stride of `C`
 * @param ith is thread id (must be less than `nth`)
 * @param nth is number of threads (must be greater than zero)
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @param precision may be used to control the internal compute type
 * @return true if this function was able to service the matmul request
 */
bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    assert(m >= 0);
    assert(n >= 0);
    assert(k >= 0);
    assert(lda >= k);
    assert(ldb >= k);
    assert(ldc >= m);
    assert(nth > 0);
    assert(ith < nth);

#if QK_K == 256
#if defined(__x86_64__) || defined(_M_X64)
#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))))
    /* 
    moonll
    more Btype accept
    }*/

    if (Ctype == GGML_TYPE_F32){
        if (iqk_mul_mat(m, n, k * ggml_blck_size(ggml_type(Atype)), Atype, A,lda,Btype, B,ldb, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }

#endif
#elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER
    if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
        if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }
    if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) {
        // assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32);
        assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32));
        if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }
#endif
#endif

    switch (Ctype) {
        case GGML_TYPE_F32:
            return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (float*)C, ldc, ith, nth, task, Atype,
                                        Btype, Ctype, precision);
        default:
            return NOT_SUPPORTED;
    }
}


================================================
FILE: archive/third_party/llamafile/tinyblas_cpu_unsupported.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_unsupported.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "sgemm.h"

bool llamafile_sgemm_unsupported(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    return false;
}

bool llamafile_mixmul_unsupported(const struct ggml_compute_params* params,
                                  const struct ggml_tensor* weights,
                                  const struct ggml_tensor* thought,
                                  const struct ggml_tensor* plan,
                                  struct ggml_tensor* result) {
    return false;
}

bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int) {
    return false;
}


================================================
FILE: archive/third_party/nlohmann/json.hpp
================================================
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT

/****************************************************************************\
 * Note on documentation: The source files contain links to the online      *
 * documentation of the public API at https://json.nlohmann.me. This URL    *
 * contains the most recent documentation and should also be applicable to  *
 * previous versions; documentation for deprecated functions is not         *
 * removed, but marked deprecated. See "Generate documentation" section in  *
 * file docs/README.md.                                                     *
\****************************************************************************/

#ifndef INCLUDE_NLOHMANN_JSON_HPP_
#define INCLUDE_NLOHMANN_JSON_HPP_

#include <algorithm> // all_of, find, for_each
#include <cstddef> // nullptr_t, ptrdiff_t, size_t
#include <functional> // hash, less
#include <initializer_list> // initializer_list
#ifndef JSON_NO_IO
    #include <iosfwd> // istream, ostream
#endif  // JSON_NO_IO
#include <iterator> // random_access_iterator_tag
#include <memory> // unique_ptr
#include <string> // string, stoi, to_string
#include <utility> // declval, forward, move, pair, swap
#include <vector> // vector

// #include <nlohmann/adl_serializer.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <utility>

// #include <nlohmann/detail/abi_macros.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


// This file contains all macro definitions affecting or depending on the ABI

#ifndef JSON_SKIP_LIBRARY_VERSION_CHECK
    #if defined(NLOHMANN_JSON_VERSION_MAJOR) && defined(NLOHMANN_JSON_VERSION_MINOR) && defined(NLOHMANN_JSON_VERSION_PATCH)
        #if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 3
            #warning "Already included a different version of the library!"
        #endif
    #endif
#endif

#define NLOHMANN_JSON_VERSION_MAJOR 3   // NOLINT(modernize-macro-to-enum)
#define NLOHMANN_JSON_VERSION_MINOR 11  // NOLINT(modernize-macro-to-enum)
#define NLOHMANN_JSON_VERSION_PATCH 3   // NOLINT(modernize-macro-to-enum)

#ifndef JSON_DIAGNOSTICS
    #define JSON_DIAGNOSTICS 0
#endif

#ifndef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
    #define JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON 0
#endif

#if JSON_DIAGNOSTICS
    #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS _diag
#else
    #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS
#endif

#if JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
    #define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON _ldvcmp
#else
    #define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON
#endif

#ifndef NLOHMANN_JSON_NAMESPACE_NO_VERSION
    #define NLOHMANN_JSON_NAMESPACE_NO_VERSION 0
#endif

// Construct the namespace ABI tags component
#define NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b) json_abi ## a ## b
#define NLOHMANN_JSON_ABI_TAGS_CONCAT(a, b) \
    NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b)

#define NLOHMANN_JSON_ABI_TAGS                                       \
    NLOHMANN_JSON_ABI_TAGS_CONCAT(                                   \
            NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS,                       \
            NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON)

// Construct the namespace version component
#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch) \
    _v ## major ## _ ## minor ## _ ## patch
#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(major, minor, patch) \
    NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch)

#if NLOHMANN_JSON_NAMESPACE_NO_VERSION
#define NLOHMANN_JSON_NAMESPACE_VERSION
#else
#define NLOHMANN_JSON_NAMESPACE_VERSION                                 \
    NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(NLOHMANN_JSON_VERSION_MAJOR, \
                                           NLOHMANN_JSON_VERSION_MINOR, \
                                           NLOHMANN_JSON_VERSION_PATCH)
#endif

// Combine namespace components
#define NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b) a ## b
#define NLOHMANN_JSON_NAMESPACE_CONCAT(a, b) \
    NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b)

#ifndef NLOHMANN_JSON_NAMESPACE
#define NLOHMANN_JSON_NAMESPACE               \
    nlohmann::NLOHMANN_JSON_NAMESPACE_CONCAT( \
            NLOHMANN_JSON_ABI_TAGS,           \
            NLOHMANN_JSON_NAMESPACE_VERSION)
#endif

#ifndef NLOHMANN_JSON_NAMESPACE_BEGIN
#define NLOHMANN_JSON_NAMESPACE_BEGIN                \
    namespace nlohmann                               \
    {                                                \
    inline namespace NLOHMANN_JSON_NAMESPACE_CONCAT( \
                NLOHMANN_JSON_ABI_TAGS,              \
                NLOHMANN_JSON_NAMESPACE_VERSION)     \
    {
#endif

#ifndef NLOHMANN_JSON_NAMESPACE_END
#define NLOHMANN_JSON_NAMESPACE_END                                     \
    }  /* namespace (inline namespace) NOLINT(readability/namespace) */ \
    }  // namespace nlohmann
#endif

// #include <nlohmann/detail/conversions/from_json.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <algorithm> // transform
#include <array> // array
#include <forward_list> // forward_list
#include <iterator> // inserter, front_inserter, end
#include <map> // map
#include <string> // string
#include <tuple> // tuple, make_tuple
#include <type_traits> // is_arithmetic, is_same, is_enum, underlying_type, is_convertible
#include <unordered_map> // unordered_map
#include <utility> // pair, declval
#include <valarray> // valarray

// #include <nlohmann/detail/exceptions.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <cstddef> // nullptr_t
#include <exception> // exception
#if JSON_DIAGNOSTICS
    #include <numeric> // accumulate
#endif
#include <stdexcept> // runtime_error
#include <string> // to_string
#include <vector> // vector

// #include <nlohmann/detail/value_t.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <array> // array
#include <cstddef> // size_t
#include <cstdint> // uint8_t
#include <string> // string

// #include <nlohmann/detail/macro_scope.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <utility> // declval, pair
// #include <nlohmann/detail/meta/detected.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <type_traits>

// #include <nlohmann/detail/meta/void_t.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


// #include <nlohmann/detail/abi_macros.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

template<typename ...Ts> struct make_void
{
    using type = void;
};
template<typename ...Ts> using void_t = typename make_void<Ts...>::type;

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

// https://en.cppreference.com/w/cpp/experimental/is_detected
struct nonesuch
{
    nonesuch() = delete;
    ~nonesuch() = delete;
    nonesuch(nonesuch const&) = delete;
    nonesuch(nonesuch const&&) = delete;
    void operator=(nonesuch const&) = delete;
    void operator=(nonesuch&&) = delete;
};

template<class Default,
         class AlwaysVoid,
         template<class...> class Op,
         class... Args>
struct detector
{
    using value_t = std::false_type;
    using type = Default;
};

template<class Default, template<class...> class Op, class... Args>
struct detector<Default, void_t<Op<Args...>>, Op, Args...>
{
    using value_t = std::true_type;
    using type = Op<Args...>;
};

template<template<class...> class Op, class... Args>
using is_detected = typename detector<nonesuch, void, Op, Args...>::value_t;

template<template<class...> class Op, class... Args>
struct is_detected_lazy : is_detected<Op, Args...> { };

template<template<class...> class Op, class... Args>
using detected_t = typename detector<nonesuch, void, Op, Args...>::type;

template<class Default, template<class...> class Op, class... Args>
using detected_or = detector<Default, void, Op, Args...>;

template<class Default, template<class...> class Op, class... Args>
using detected_or_t = typename detected_or<Default, Op, Args...>::type;

template<class Expected, template<class...> class Op, class... Args>
using is_detected_exact = std::is_same<Expected, detected_t<Op, Args...>>;

template<class To, template<class...> class Op, class... Args>
using is_detected_convertible =
    std::is_convertible<detected_t<Op, Args...>, To>;

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/thirdparty/hedley/hedley.hpp>


//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-FileCopyrightText: 2016-2021 Evan Nemerson <evan@nemerson.com>
// SPDX-License-Identifier: MIT

/* Hedley - https://nemequ.github.io/hedley
 * Created by Evan Nemerson <evan@nemerson.com>
 */

#if !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < 15)
#if defined(JSON_HEDLEY_VERSION)
    #undef JSON_HEDLEY_VERSION
#endif
#define JSON_HEDLEY_VERSION 15

#if defined(JSON_HEDLEY_STRINGIFY_EX)
    #undef JSON_HEDLEY_STRINGIFY_EX
#endif
#define JSON_HEDLEY_STRINGIFY_EX(x) #x

#if defined(JSON_HEDLEY_STRINGIFY)
    #undef JSON_HEDLEY_STRINGIFY
#endif
#define JSON_HEDLEY_STRINGIFY(x) JSON_HEDLEY_STRINGIFY_EX(x)

#if defined(JSON_HEDLEY_CONCAT_EX)
    #undef JSON_HEDLEY_CONCAT_EX
#endif
#define JSON_HEDLEY_CONCAT_EX(a,b) a##b

#if defined(JSON_HEDLEY_CONCAT)
    #undef JSON_HEDLEY_CONCAT
#endif
#define JSON_HEDLEY_CONCAT(a,b) JSON_HEDLEY_CONCAT_EX(a,b)

#if defined(JSON_HEDLEY_CONCAT3_EX)
    #undef JSON_HEDLEY_CONCAT3_EX
#endif
#define JSON_HEDLEY_CONCAT3_EX(a,b,c) a##b##c

#if defined(JSON_HEDLEY_CONCAT3)
    #undef JSON_HEDLEY_CONCAT3
#endif
#define JSON_HEDLEY_CONCAT3(a,b,c) JSON_HEDLEY_CONCAT3_EX(a,b,c)

#if defined(JSON_HEDLEY_VERSION_ENCODE)
    #undef JSON_HEDLEY_VERSION_ENCODE
#endif
#define JSON_HEDLEY_VERSION_ENCODE(major,minor,revision) (((major) * 1000000) + ((minor) * 1000) + (revision))

#if defined(JSON_HEDLEY_VERSION_DECODE_MAJOR)
    #undef JSON_HEDLEY_VERSION_DECODE_MAJOR
#endif
#define JSON_HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000)

#if defined(JSON_HEDLEY_VERSION_DECODE_MINOR)
    #undef JSON_HEDLEY_VERSION_DECODE_MINOR
#endif
#define JSON_HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000)

#if defined(JSON_HEDLEY_VERSION_DECODE_REVISION)
    #undef JSON_HEDLEY_VERSION_DECODE_REVISION
#endif
#define JSON_HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000)

#if defined(JSON_HEDLEY_GNUC_VERSION)
    #undef JSON_HEDLEY_GNUC_VERSION
#endif
#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__)
    #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)
#elif defined(__GNUC__)
    #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0)
#endif

#if defined(JSON_HEDLEY_GNUC_VERSION_CHECK)
    #undef JSON_HEDLEY_GNUC_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_GNUC_VERSION)
    #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GNUC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_MSVC_VERSION)
    #undef JSON_HEDLEY_MSVC_VERSION
#endif
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL)
    #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100)
#elif defined(_MSC_FULL_VER) && !defined(__ICL)
    #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10)
#elif defined(_MSC_VER) && !defined(__ICL)
    #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0)
#endif

#if defined(JSON_HEDLEY_MSVC_VERSION_CHECK)
    #undef JSON_HEDLEY_MSVC_VERSION_CHECK
#endif
#if !defined(JSON_HEDLEY_MSVC_VERSION)
    #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0)
#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
    #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch)))
#elif defined(_MSC_VER) && (_MSC_VER >= 1200)
    #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch)))
#else
    #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= ((major * 100) + (minor)))
#endif

#if defined(JSON_HEDLEY_INTEL_VERSION)
    #undef JSON_HEDLEY_INTEL_VERSION
#endif
#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && !defined(__ICL)
    #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE)
#elif defined(__INTEL_COMPILER) && !defined(__ICL)
    #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0)
#endif

#if defined(JSON_HEDLEY_INTEL_VERSION_CHECK)
    #undef JSON_HEDLEY_INTEL_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_INTEL_VERSION)
    #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_INTEL_CL_VERSION)
    #undef JSON_HEDLEY_INTEL_CL_VERSION
#endif
#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && defined(__ICL)
    #define JSON_HEDLEY_INTEL_CL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0)
#endif

#if defined(JSON_HEDLEY_INTEL_CL_VERSION_CHECK)
    #undef JSON_HEDLEY_INTEL_CL_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_INTEL_CL_VERSION)
    #define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_CL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_PGI_VERSION)
    #undef JSON_HEDLEY_PGI_VERSION
#endif
#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__)
    #define JSON_HEDLEY_PGI_VERSION JSON_HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__)
#endif

#if defined(JSON_HEDLEY_PGI_VERSION_CHECK)
    #undef JSON_HEDLEY_PGI_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_PGI_VERSION)
    #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PGI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_SUNPRO_VERSION)
    #undef JSON_HEDLEY_SUNPRO_VERSION
#endif
#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000)
    #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10)
#elif defined(__SUNPRO_C)
    #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf)
#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000)
    #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10)
#elif defined(__SUNPRO_CC)
    #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf)
#endif

#if defined(JSON_HEDLEY_SUNPRO_VERSION_CHECK)
    #undef JSON_HEDLEY_SUNPRO_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_SUNPRO_VERSION)
    #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_SUNPRO_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION)
    #undef JSON_HEDLEY_EMSCRIPTEN_VERSION
#endif
#if defined(__EMSCRIPTEN__)
    #define JSON_HEDLEY_EMSCRIPTEN_VERSION JSON_HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__)
#endif

#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK)
    #undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION)
    #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_EMSCRIPTEN_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_ARM_VERSION)
    #undef JSON_HEDLEY_ARM_VERSION
#endif
#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION)
    #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100)
#elif defined(__CC_ARM) && defined(__ARMCC_VERSION)
    #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100)
#endif

#if defined(JSON_HEDLEY_ARM_VERSION_CHECK)
    #undef JSON_HEDLEY_ARM_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_ARM_VERSION)
    #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_ARM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_IBM_VERSION)
    #undef JSON_HEDLEY_IBM_VERSION
#endif
#if defined(__ibmxl__)
    #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__)
#elif defined(__xlC__) && defined(__xlC_ver__)
    #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff)
#elif defined(__xlC__)
    #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0)
#endif

#if defined(JSON_HEDLEY_IBM_VERSION_CHECK)
    #undef JSON_HEDLEY_IBM_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_IBM_VERSION)
    #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IBM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_TI_VERSION)
    #undef JSON_HEDLEY_TI_VERSION
#endif
#if \
    defined(__TI_COMPILER_VERSION__) && \
    ( \
      defined(__TMS470__) || defined(__TI_ARM__) || \
      defined(__MSP430__) || \
      defined(__TMS320C2000__) \
    )
#if (__TI_COMPILER_VERSION__ >= 16000000)
    #define JSON_HEDLEY_TI_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif
#endif

#if defined(JSON_HEDLEY_TI_VERSION_CHECK)
    #undef JSON_HEDLEY_TI_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_TI_VERSION)
    #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_TI_CL2000_VERSION)
    #undef JSON_HEDLEY_TI_CL2000_VERSION
#endif
#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__)
    #define JSON_HEDLEY_TI_CL2000_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif

#if defined(JSON_HEDLEY_TI_CL2000_VERSION_CHECK)
    #undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_TI_CL2000_VERSION)
    #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL2000_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_TI_CL430_VERSION)
    #undef JSON_HEDLEY_TI_CL430_VERSION
#endif
#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__)
    #define JSON_HEDLEY_TI_CL430_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif

#if defined(JSON_HEDLEY_TI_CL430_VERSION_CHECK)
    #undef JSON_HEDLEY_TI_CL430_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_TI_CL430_VERSION)
    #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL430_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_TI_ARMCL_VERSION)
    #undef JSON_HEDLEY_TI_ARMCL_VERSION
#endif
#if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__))
    #define JSON_HEDLEY_TI_ARMCL_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif

#if defined(JSON_HEDLEY_TI_ARMCL_VERSION_CHECK)
    #undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_TI_ARMCL_VERSION)
    #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_ARMCL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_TI_CL6X_VERSION)
    #undef JSON_HEDLEY_TI_CL6X_VERSION
#endif
#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__)
    #define JSON_HEDLEY_TI_CL6X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif

#if defined(JSON_HEDLEY_TI_CL6X_VERSION_CHECK)
    #undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_TI_CL6X_VERSION)
    #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL6X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_TI_CL7X_VERSION)
    #undef JSON_HEDLEY_TI_CL7X_VERSION
#endif
#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__)
    #define JSON_HEDLEY_TI_CL7X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif

#if defined(JSON_HEDLEY_TI_CL7X_VERSION_CHECK)
    #undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_TI_CL7X_VERSION)
    #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL7X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_TI_CLPRU_VERSION)
    #undef JSON_HEDLEY_TI_CLPRU_VERSION
#endif
#if defined(__TI_COMPILER_VERSION__) && defined(__PRU__)
    #define JSON_HEDLEY_TI_CLPRU_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif

#if defined(JSON_HEDLEY_TI_CLPRU_VERSION_CHECK)
    #undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_TI_CLPRU_VERSION)
    #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CLPRU_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_CRAY_VERSION)
    #undef JSON_HEDLEY_CRAY_VERSION
#endif
#if defined(_CRAYC)
    #if defined(_RELEASE_PATCHLEVEL)
        #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL)
    #else
        #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0)
    #endif
#endif

#if defined(JSON_HEDLEY_CRAY_VERSION_CHECK)
    #undef JSON_HEDLEY_CRAY_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_CRAY_VERSION)
    #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_CRAY_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_IAR_VERSION)
    #undef JSON_HEDLEY_IAR_VERSION
#endif
#if defined(__IAR_SYSTEMS_ICC__)
    #if __VER__ > 1000
        #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000))
    #else
        #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0)
    #endif
#endif

#if defined(JSON_HEDLEY_IAR_VERSION_CHECK)
    #undef JSON_HEDLEY_IAR_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_IAR_VERSION)
    #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IAR_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_TINYC_VERSION)
    #undef JSON_HEDLEY_TINYC_VERSION
#endif
#if defined(__TINYC__)
    #define JSON_HEDLEY_TINYC_VERSION JSON_HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100)
#endif

#if defined(JSON_HEDLEY_TINYC_VERSION_CHECK)
    #undef JSON_HEDLEY_TINYC_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_TINYC_VERSION)
    #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TINYC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_DMC_VERSION)
    #undef JSON_HEDLEY_DMC_VERSION
#endif
#if defined(__DMC__)
    #define JSON_HEDLEY_DMC_VERSION JSON_HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf)
#endif

#if defined(JSON_HEDLEY_DMC_VERSION_CHECK)
    #undef JSON_HEDLEY_DMC_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_DMC_VERSION)
    #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_DMC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_COMPCERT_VERSION)
    #undef JSON_HEDLEY_COMPCERT_VERSION
#endif
#if defined(__COMPCERT_VERSION__)
    #define JSON_HEDLEY_COMPCERT_VERSION JSON_HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100)
#endif

#if defined(JSON_HEDLEY_COMPCERT_VERSION_CHECK)
    #undef JSON_HEDLEY_COMPCERT_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_COMPCERT_VERSION)
    #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_COMPCERT_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_PELLES_VERSION)
    #undef JSON_HEDLEY_PELLES_VERSION
#endif
#if defined(__POCC__)
    #define JSON_HEDLEY_PELLES_VERSION JSON_HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0)
#endif

#if defined(JSON_HEDLEY_PELLES_VERSION_CHECK)
    #undef JSON_HEDLEY_PELLES_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_PELLES_VERSION)
    #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PELLES_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_MCST_LCC_VERSION)
    #undef JSON_HEDLEY_MCST_LCC_VERSION
#endif
#if defined(__LCC__) && defined(__LCC_MINOR__)
    #define JSON_HEDLEY_MCST_LCC_VERSION JSON_HEDLEY_VERSION_ENCODE(__LCC__ / 100, __LCC__ % 100, __LCC_MINOR__)
#endif

#if defined(JSON_HEDLEY_MCST_LCC_VERSION_CHECK)
    #undef JSON_HEDLEY_MCST_LCC_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_MCST_LCC_VERSION)
    #define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_MCST_LCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_GCC_VERSION)
    #undef JSON_HEDLEY_GCC_VERSION
#endif
#if \
    defined(JSON_HEDLEY_GNUC_VERSION) && \
    !defined(__clang__) && \
    !defined(JSON_HEDLEY_INTEL_VERSION) && \
    !defined(JSON_HEDLEY_PGI_VERSION) && \
    !defined(JSON_HEDLEY_ARM_VERSION) && \
    !defined(JSON_HEDLEY_CRAY_VERSION) && \
    !defined(JSON_HEDLEY_TI_VERSION) && \
    !defined(JSON_HEDLEY_TI_ARMCL_VERSION) && \
    !defined(JSON_HEDLEY_TI_CL430_VERSION) && \
    !defined(JSON_HEDLEY_TI_CL2000_VERSION) && \
    !defined(JSON_HEDLEY_TI_CL6X_VERSION) && \
    !defined(JSON_HEDLEY_TI_CL7X_VERSION) && \
    !defined(JSON_HEDLEY_TI_CLPRU_VERSION) && \
    !defined(__COMPCERT__) && \
    !defined(JSON_HEDLEY_MCST_LCC_VERSION)
    #define JSON_HEDLEY_GCC_VERSION JSON_HEDLEY_GNUC_VERSION
#endif

#if defined(JSON_HEDLEY_GCC_VERSION_CHECK)
    #undef JSON_HEDLEY_GCC_VERSION_CHECK
#endif
#if defined(JSON_HEDLEY_GCC_VERSION)
    #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#else
    #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0)
#endif

#if defined(JSON_HEDLEY_HAS_ATTRIBUTE)
    #undef JSON_HEDLEY_HAS_ATTRIBUTE
#endif
#if \
  defined(__has_attribute) && \
  ( \
    (!defined(JSON_HEDLEY_IAR_VERSION) || JSON_HEDLEY_IAR_VERSION_CHECK(8,5,9)) \
  )
#  define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute)
#else
#  define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) (0)
#endif

#if defined(JSON_HEDLEY_GNUC_HAS_ATTRIBUTE)
    #undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE
#endif
#if defined(__has_attribute)
    #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute)
#else
    #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_GCC_HAS_ATTRIBUTE)
    #undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE
#endif
#if defined(__has_attribute)
    #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute)
#else
    #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE)
    #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE
#endif
#if \
    defined(__has_cpp_attribute) && \
    defined(__cplusplus) && \
    (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0))
    #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute)
#else
    #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0)
#endif

#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS)
    #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS
#endif
#if !defined(__cplusplus) || !defined(__has_cpp_attribute)
    #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0)
#elif \
    !defined(JSON_HEDLEY_PGI_VERSION) && \
    !defined(JSON_HEDLEY_IAR_VERSION) && \
    (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \
    (!defined(JSON_HEDLEY_MSVC_VERSION) || JSON_HEDLEY_MSVC_VERSION_CHECK(19,20,0))
    #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute)
#else
    #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0)
#endif

#if defined(JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE)
    #undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE
#endif
#if defined(__has_cpp_attribute) && defined(__cplusplus)
    #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute)
#else
    #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE)
    #undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE
#endif
#if defined(__has_cpp_attribute) && defined(__cplusplus)
    #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute)
#else
    #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_HAS_BUILTIN)
    #undef JSON_HEDLEY_HAS_BUILTIN
#endif
#if defined(__has_builtin)
    #define JSON_HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin)
#else
    #define JSON_HEDLEY_HAS_BUILTIN(builtin) (0)
#endif

#if defined(JSON_HEDLEY_GNUC_HAS_BUILTIN)
    #undef JSON_HEDLEY_GNUC_HAS_BUILTIN
#endif
#if defined(__has_builtin)
    #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin)
#else
    #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_GCC_HAS_BUILTIN)
    #undef JSON_HEDLEY_GCC_HAS_BUILTIN
#endif
#if defined(__has_builtin)
    #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin)
#else
    #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_HAS_FEATURE)
    #undef JSON_HEDLEY_HAS_FEATURE
#endif
#if defined(__has_feature)
    #define JSON_HEDLEY_HAS_FEATURE(feature) __has_feature(feature)
#else
    #define JSON_HEDLEY_HAS_FEATURE(feature) (0)
#endif

#if defined(JSON_HEDLEY_GNUC_HAS_FEATURE)
    #undef JSON_HEDLEY_GNUC_HAS_FEATURE
#endif
#if defined(__has_feature)
    #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature)
#else
    #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_GCC_HAS_FEATURE)
    #undef JSON_HEDLEY_GCC_HAS_FEATURE
#endif
#if defined(__has_feature)
    #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature)
#else
    #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_HAS_EXTENSION)
    #undef JSON_HEDLEY_HAS_EXTENSION
#endif
#if defined(__has_extension)
    #define JSON_HEDLEY_HAS_EXTENSION(extension) __has_extension(extension)
#else
    #define JSON_HEDLEY_HAS_EXTENSION(extension) (0)
#endif

#if defined(JSON_HEDLEY_GNUC_HAS_EXTENSION)
    #undef JSON_HEDLEY_GNUC_HAS_EXTENSION
#endif
#if defined(__has_extension)
    #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension)
#else
    #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_GCC_HAS_EXTENSION)
    #undef JSON_HEDLEY_GCC_HAS_EXTENSION
#endif
#if defined(__has_extension)
    #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension)
#else
    #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE)
    #undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE
#endif
#if defined(__has_declspec_attribute)
    #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute)
#else
    #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0)
#endif

#if defined(JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE)
    #undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE
#endif
#if defined(__has_declspec_attribute)
    #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute)
#else
    #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE)
    #undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE
#endif
#if defined(__has_declspec_attribute)
    #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute)
#else
    #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_HAS_WARNING)
    #undef JSON_HEDLEY_HAS_WARNING
#endif
#if defined(__has_warning)
    #define JSON_HEDLEY_HAS_WARNING(warning) __has_warning(warning)
#else
    #define JSON_HEDLEY_HAS_WARNING(warning) (0)
#endif

#if defined(JSON_HEDLEY_GNUC_HAS_WARNING)
    #undef JSON_HEDLEY_GNUC_HAS_WARNING
#endif
#if defined(__has_warning)
    #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning)
#else
    #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_GCC_HAS_WARNING)
    #undef JSON_HEDLEY_GCC_HAS_WARNING
#endif
#if defined(__has_warning)
    #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning)
#else
    #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

#if \
    (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
    defined(__clang__) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \
    JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \
    JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \
    (JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR))
    #define JSON_HEDLEY_PRAGMA(value) _Pragma(#value)
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0)
    #define JSON_HEDLEY_PRAGMA(value) __pragma(value)
#else
    #define JSON_HEDLEY_PRAGMA(value)
#endif

#if defined(JSON_HEDLEY_DIAGNOSTIC_PUSH)
    #undef JSON_HEDLEY_DIAGNOSTIC_PUSH
#endif
#if defined(JSON_HEDLEY_DIAGNOSTIC_POP)
    #undef JSON_HEDLEY_DIAGNOSTIC_POP
#endif
#if defined(__clang__)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop")
#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
#elif \
    JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push))
    #define JSON_HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop))
#elif JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("push")
    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("pop")
#elif \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push")
    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop")
#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
#else
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH
    #define JSON_HEDLEY_DIAGNOSTIC_POP
#endif

/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for
   HEDLEY INTERNAL USE ONLY.  API subject to change without notice. */
#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_)
    #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_
#endif
#if defined(__cplusplus)
#  if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat")
#    if JSON_HEDLEY_HAS_WARNING("-Wc++17-extensions")
#      if JSON_HEDLEY_HAS_WARNING("-Wc++1z-extensions")
#        define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
    _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \
    _Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \
    xpr \
    JSON_HEDLEY_DIAGNOSTIC_POP
#      else
#        define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
    _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \
    xpr \
    JSON_HEDLEY_DIAGNOSTIC_POP
#      endif
#    else
#      define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
    xpr \
    JSON_HEDLEY_DIAGNOSTIC_POP
#    endif
#  endif
#endif
#if !defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x
#endif

#if defined(JSON_HEDLEY_CONST_CAST)
    #undef JSON_HEDLEY_CONST_CAST
#endif
#if defined(__cplusplus)
#  define JSON_HEDLEY_CONST_CAST(T, expr) (const_cast<T>(expr))
#elif \
  JSON_HEDLEY_HAS_WARNING("-Wcast-qual") || \
  JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) || \
  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
#  define JSON_HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \
        JSON_HEDLEY_DIAGNOSTIC_PUSH \
        JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
        ((T) (expr)); \
        JSON_HEDLEY_DIAGNOSTIC_POP \
    }))
#else
#  define JSON_HEDLEY_CONST_CAST(T, expr) ((T) (expr))
#endif

#if defined(JSON_HEDLEY_REINTERPRET_CAST)
    #undef JSON_HEDLEY_REINTERPRET_CAST
#endif
#if defined(__cplusplus)
    #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast<T>(expr))
#else
    #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr))
#endif

#if defined(JSON_HEDLEY_STATIC_CAST)
    #undef JSON_HEDLEY_STATIC_CAST
#endif
#if defined(__cplusplus)
    #define JSON_HEDLEY_STATIC_CAST(T, expr) (static_cast<T>(expr))
#else
    #define JSON_HEDLEY_STATIC_CAST(T, expr) ((T) (expr))
#endif

#if defined(JSON_HEDLEY_CPP_CAST)
    #undef JSON_HEDLEY_CPP_CAST
#endif
#if defined(__cplusplus)
#  if JSON_HEDLEY_HAS_WARNING("-Wold-style-cast")
#    define JSON_HEDLEY_CPP_CAST(T, expr) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \
    ((T) (expr)) \
    JSON_HEDLEY_DIAGNOSTIC_POP
#  elif JSON_HEDLEY_IAR_VERSION_CHECK(8,3,0)
#    define JSON_HEDLEY_CPP_CAST(T, expr) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    _Pragma("diag_suppress=Pe137") \
    JSON_HEDLEY_DIAGNOSTIC_POP
#  else
#    define JSON_HEDLEY_CPP_CAST(T, expr) ((T) (expr))
#  endif
#else
#  define JSON_HEDLEY_CPP_CAST(T, expr) (expr)
#endif

#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED)
    #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wdeprecated-declarations")
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"")
#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)")
#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:1478 1786))
#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1216,1444,1445")
#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444")
#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996))
#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444")
#elif \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718")
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)")
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)")
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215")
#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)")
#else
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
#endif

#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS)
    #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas")
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"")
#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)")
#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:161))
#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675")
#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"")
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068))
#elif \
    JSON_HEDLEY_TI_VERSION_CHECK(16,9,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163")
#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163")
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161")
#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 161")
#else
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
#endif

#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES)
    #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wunknown-attributes")
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"")
#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
#elif JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)")
#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:1292))
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030))
#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097,1098")
#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097")
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)")
#elif \
    JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173")
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097")
#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097")
#else
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
#endif

#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL)
    #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wcast-qual")
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("clang diagnostic ignored \"-Wcast-qual\"")
#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("warning(disable:2203 2331)")
#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("GCC diagnostic ignored \"-Wcast-qual\"")
#else
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
#endif

#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION)
    #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wunused-function")
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("clang diagnostic ignored \"-Wunused-function\"")
#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("GCC diagnostic ignored \"-Wunused-function\"")
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(1,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION __pragma(warning(disable:4505))
#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("diag_suppress 3142")
#else
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
#endif

#if defined(JSON_HEDLEY_DEPRECATED)
    #undef JSON_HEDLEY_DEPRECATED
#endif
#if defined(JSON_HEDLEY_DEPRECATED_FOR)
    #undef JSON_HEDLEY_DEPRECATED_FOR
#endif
#if \
    JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since))
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement))
#elif \
    (JSON_HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) && !defined(JSON_HEDLEY_IAR_VERSION)) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \
    JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since)))
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement)))
#elif defined(__cplusplus) && (__cplusplus >= 201402L)
    #define JSON_HEDLEY_DEPRECATED(since) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]])
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]])
#elif \
    JSON_HEDLEY_HAS_ATTRIBUTE(deprecated) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
    JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0)
    #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__))
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__))
#elif \
    JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
    JSON_HEDLEY_PELLES_VERSION_CHECK(6,50,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated)
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated)
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
    #define JSON_HEDLEY_DEPRECATED(since) _Pragma("deprecated")
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated")
#else
    #define JSON_HEDLEY_DEPRECATED(since)
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement)
#endif

#if defined(JSON_HEDLEY_UNAVAILABLE)
    #undef JSON_HEDLEY_UNAVAILABLE
#endif
#if \
    JSON_HEDLEY_HAS_ATTRIBUTE(warning) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since)))
#else
    #define JSON_HEDLEY_UNAVAILABLE(available_since)
#endif

#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT)
    #undef JSON_HEDLEY_WARN_UNUSED_RESULT
#endif
#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT_MSG)
    #undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG
#endif
#if \
    JSON_HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \
    JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__))
#elif (JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L)
    #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]])
#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard)
    #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
#elif defined(_Check_return_) /* SAL */
    #define JSON_HEDLEY_WARN_UNUSED_RESULT _Check_return_
    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_
#else
    #define JSON_HEDLEY_WARN_UNUSED_RESULT
    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg)
#endif

#if defined(JSON_HEDLEY_SENTINEL)
    #undef JSON_HEDLEY_SENTINEL
#endif
#if \
    JSON_HEDLEY_HAS_ATTRIBUTE(sentinel) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position)))
#else
    #define JSON_HEDLEY_SENTINEL(position)
#endif

#if defined(JSON_HEDLEY_NO_RETURN)
    #undef JSON_HEDLEY_NO_RETURN
#endif
#if JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
    #define JSON_HEDLEY_NO_RETURN __noreturn
#elif \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__))
#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
    #define JSON_HEDLEY_NO_RETURN _Noreturn
#elif defined(__cplusplus) && (__cplusplus >= 201103L)
    #define JSON_HEDLEY_NO_RETURN JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]])
#elif \
    JSON_HEDLEY_HAS_ATTRIBUTE(noreturn) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,2,0) || \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0)
    #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__))
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
    #define JSON_HEDLEY_NO_RETURN _Pragma("does_not_return")
#elif \
    JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_NO_RETURN __declspec(noreturn)
#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus)
    #define JSON_HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;")
#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0)
    #define JSON_HEDLEY_NO_RETURN __attribute((noreturn))
#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0)
    #define JSON_HEDLEY_NO_RETURN __declspec(noreturn)
#else
    #define JSON_HEDLEY_NO_RETURN
#endif

#if defined(JSON_HEDLEY_NO_ESCAPE)
    #undef JSON_HEDLEY_NO_ESCAPE
#endif
#if JSON_HEDLEY_HAS_ATTRIBUTE(noescape)
    #define JSON_HEDLEY_NO_ESCAPE __attribute__((__noescape__))
#else
    #define JSON_HEDLEY_NO_ESCAPE
#endif

#if defined(JSON_HEDLEY_UNREACHABLE)
    #undef JSON_HEDLEY_UNREACHABLE
#endif
#if defined(JSON_HEDLEY_UNREACHABLE_RETURN)
    #undef JSON_HEDLEY_UNREACHABLE_RETURN
#endif
#if defined(JSON_HEDLEY_ASSUME)
    #undef JSON_HEDLEY_ASSUME
#endif
#if \
    JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_ASSUME(expr) __assume(expr)
#elif JSON_HEDLEY_HAS_BUILTIN(__builtin_assume)
    #define JSON_HEDLEY_ASSUME(expr) __builtin_assume(expr)
#elif \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0)
    #if defined(__cplusplus)
        #define JSON_HEDLEY_ASSUME(expr) std::_nassert(expr)
    #else
        #define JSON_HEDLEY_ASSUME(expr) _nassert(expr)
    #endif
#endif
#if \
    (JSON_HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(JSON_HEDLEY_ARM_VERSION))) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \
    JSON_HEDLEY_PGI_VERSION_CHECK(18,10,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(13,1,5) || \
    JSON_HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_UNREACHABLE() __builtin_unreachable()
#elif defined(JSON_HEDLEY_ASSUME)
    #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0)
#endif
#if !defined(JSON_HEDLEY_ASSUME)
    #if defined(JSON_HEDLEY_UNREACHABLE)
        #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, ((expr) ? 1 : (JSON_HEDLEY_UNREACHABLE(), 1)))
    #else
        #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, expr)
    #endif
#endif
#if defined(JSON_HEDLEY_UNREACHABLE)
    #if  \
        JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
        JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0)
        #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (JSON_HEDLEY_STATIC_CAST(void, JSON_HEDLEY_ASSUME(0)), (value))
    #else
        #define JSON_HEDLEY_UNREACHABLE_RETURN(value) JSON_HEDLEY_UNREACHABLE()
    #endif
#else
    #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (value)
#endif
#if !defined(JSON_HEDLEY_UNREACHABLE)
    #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0)
#endif

JSON_HEDLEY_DIAGNOSTIC_PUSH
#if JSON_HEDLEY_HAS_WARNING("-Wpedantic")
    #pragma clang diagnostic ignored "-Wpedantic"
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus)
    #pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
#endif
#if JSON_HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0)
    #if defined(__clang__)
        #pragma clang diagnostic ignored "-Wvariadic-macros"
    #elif defined(JSON_HEDLEY_GCC_VERSION)
        #pragma GCC diagnostic ignored "-Wvariadic-macros"
    #endif
#endif
#if defined(JSON_HEDLEY_NON_NULL)
    #undef JSON_HEDLEY_NON_NULL
#endif
#if \
    JSON_HEDLEY_HAS_ATTRIBUTE(nonnull) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0)
    #define JSON_HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__)))
#else
    #define JSON_HEDLEY_NON_NULL(...)
#endif
JSON_HEDLEY_DIAGNOSTIC_POP

#if defined(JSON_HEDLEY_PRINTF_FORMAT)
    #undef JSON_HEDLEY_PRINTF_FORMAT
#endif
#if defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO)
    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check)))
#elif defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO)
    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check)))
#elif \
    JSON_HEDLEY_HAS_ATTRIBUTE(format) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check)))
#elif JSON_HEDLEY_PELLES_VERSION_CHECK(6,0,0)
    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check))
#else
    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check)
#endif

#if defined(JSON_HEDLEY_CONSTEXPR)
    #undef JSON_HEDLEY_CONSTEXPR
#endif
#if defined(__cplusplus)
    #if __cplusplus >= 201103L
        #define JSON_HEDLEY_CONSTEXPR JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr)
    #endif
#endif
#if !defined(JSON_HEDLEY_CONSTEXPR)
    #define JSON_HEDLEY_CONSTEXPR
#endif

#if defined(JSON_HEDLEY_PREDICT)
    #undef JSON_HEDLEY_PREDICT
#endif
#if defined(JSON_HEDLEY_LIKELY)
    #undef JSON_HEDLEY_LIKELY
#endif
#if defined(JSON_HEDLEY_UNLIKELY)
    #undef JSON_HEDLEY_UNLIKELY
#endif
#if defined(JSON_HEDLEY_UNPREDICTABLE)
    #undef JSON_HEDLEY_UNPREDICTABLE
#endif
#if JSON_HEDLEY_HAS_BUILTIN(__builtin_unpredictable)
    #define JSON_HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr))
#endif
#if \
  (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && !defined(JSON_HEDLEY_PGI_VERSION)) || \
  JSON_HEDLEY_GCC_VERSION_CHECK(9,0,0) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#  define JSON_HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability(  (expr), (value), (probability))
#  define JSON_HEDLEY_PREDICT_TRUE(expr, probability)   __builtin_expect_with_probability(!!(expr),    1   , (probability))
#  define JSON_HEDLEY_PREDICT_FALSE(expr, probability)  __builtin_expect_with_probability(!!(expr),    0   , (probability))
#  define JSON_HEDLEY_LIKELY(expr)                      __builtin_expect                 (!!(expr),    1                  )
#  define JSON_HEDLEY_UNLIKELY(expr)                    __builtin_expect                 (!!(expr),    0                  )
#elif \
  (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \
  JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \
  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
  (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \
  JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
  JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
  JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
  JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \
  JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \
  JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \
  JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
  JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
  JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
  JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \
  JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#  define JSON_HEDLEY_PREDICT(expr, expected, probability) \
    (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (JSON_HEDLEY_STATIC_CAST(void, expected), (expr)))
#  define JSON_HEDLEY_PREDICT_TRUE(expr, probability) \
    (__extension__ ({ \
        double hedley_probability_ = (probability); \
        ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \
    }))
#  define JSON_HEDLEY_PREDICT_FALSE(expr, probability) \
    (__extension__ ({ \
        double hedley_probability_ = (probability); \
        ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \
    }))
#  define JSON_HEDLEY_LIKELY(expr)   __builtin_expect(!!(expr), 1)
#  define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
#else
#  define JSON_HEDLEY_PREDICT(expr, expected, probability) (JSON_HEDLEY_STATIC_CAST(void, expected), (expr))
#  define JSON_HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr))
#  define JSON_HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr))
#  define JSON_HEDLEY_LIKELY(expr) (!!(expr))
#  define JSON_HEDLEY_UNLIKELY(expr) (!!(expr))
#endif
#if !defined(JSON_HEDLEY_UNPREDICTABLE)
    #define JSON_HEDLEY_UNPREDICTABLE(expr) JSON_HEDLEY_PREDICT(expr, 1, 0.5)
#endif

#if defined(JSON_HEDLEY_MALLOC)
    #undef JSON_HEDLEY_MALLOC
#endif
#if \
    JSON_HEDLEY_HAS_ATTRIBUTE(malloc) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_MALLOC __attribute__((__malloc__))
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
    #define JSON_HEDLEY_MALLOC _Pragma("returns_new_memory")
#elif \
    JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_MALLOC __declspec(restrict)
#else
    #define JSON_HEDLEY_MALLOC
#endif

#if defined(JSON_HEDLEY_PURE)
    #undef JSON_HEDLEY_PURE
#endif
#if \
  JSON_HEDLEY_HAS_ATTRIBUTE(pure) || \
  JSON_HEDLEY_GCC_VERSION_CHECK(2,96,0) || \
  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
  JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
  JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
  JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
  JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
  (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
  (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
  (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
  (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
  JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
  JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
  JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#  define JSON_HEDLEY_PURE __attribute__((__pure__))
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
#  define JSON_HEDLEY_PURE _Pragma("does_not_write_global_data")
#elif defined(__cplusplus) && \
    ( \
      JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \
      JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \
      JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \
    )
#  define JSON_HEDLEY_PURE _Pragma("FUNC_IS_PURE;")
#else
#  define JSON_HEDLEY_PURE
#endif

#if defined(JSON_HEDLEY_CONST)
    #undef JSON_HEDLEY_CONST
#endif
#if \
    JSON_HEDLEY_HAS_ATTRIBUTE(const) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(2,5,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_CONST __attribute__((__const__))
#elif \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
    #define JSON_HEDLEY_CONST _Pragma("no_side_effect")
#else
    #define JSON_HEDLEY_CONST JSON_HEDLEY_PURE
#endif

#if defined(JSON_HEDLEY_RESTRICT)
    #undef JSON_HEDLEY_RESTRICT
#endif
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus)
    #define JSON_HEDLEY_RESTRICT restrict
#elif \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
    JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
    JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \
    JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \
    defined(__clang__) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_RESTRICT __restrict
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus)
    #define JSON_HEDLEY_RESTRICT _Restrict
#else
    #define JSON_HEDLEY_RESTRICT
#endif

#if defined(JSON_HEDLEY_INLINE)
    #undef JSON_HEDLEY_INLINE
#endif
#if \
    (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
    (defined(__cplusplus) && (__cplusplus >= 199711L))
    #define JSON_HEDLEY_INLINE inline
#elif \
    defined(JSON_HEDLEY_GCC_VERSION) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(6,2,0)
    #define JSON_HEDLEY_INLINE __inline__
#elif \
    JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_INLINE __inline
#else
    #define JSON_HEDLEY_INLINE
#endif

#if defined(JSON_HEDLEY_ALWAYS_INLINE)
    #undef JSON_HEDLEY_ALWAYS_INLINE
#endif
#if \
  JSON_HEDLEY_HAS_ATTRIBUTE(always_inline) || \
  JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
  JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
  JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
  JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
  JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
  (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
  (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
  (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
  (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
  JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
  JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
  JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0)
#  define JSON_HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) JSON_HEDLEY_INLINE
#elif \
  JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \
  JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
#  define JSON_HEDLEY_ALWAYS_INLINE __forceinline
#elif defined(__cplusplus) && \
    ( \
      JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
      JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
      JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
      JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
      JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
      JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \
    )
#  define JSON_HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;")
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
#  define JSON_HEDLEY_ALWAYS_INLINE _Pragma("inline=forced")
#else
#  define JSON_HEDLEY_ALWAYS_INLINE JSON_HEDLEY_INLINE
#endif

#if defined(JSON_HEDLEY_NEVER_INLINE)
    #undef JSON_HEDLEY_NEVER_INLINE
#endif
#if \
    JSON_HEDLEY_HAS_ATTRIBUTE(noinline) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
    JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0)
    #define JSON_HEDLEY_NEVER_INLINE __attribute__((__noinline__))
#elif \
    JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline)
#elif JSON_HEDLEY_PGI_VERSION_CHECK(10,2,0)
    #define JSON_HEDLEY_NEVER_INLINE _Pragma("noinline")
#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus)
    #define JSON_HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;")
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
    #define JSON_HEDLEY_NEVER_INLINE _Pragma("inline=never")
#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0)
    #define JSON_HEDLEY_NEVER_INLINE __attribute((noinline))
#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0)
    #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline)
#else
    #define JSON_HEDLEY_NEVER_INLINE
#endif

#if defined(JSON_HEDLEY_PRIVATE)
    #undef JSON_HEDLEY_PRIVATE
#endif
#if defined(JSON_HEDLEY_PUBLIC)
    #undef JSON_HEDLEY_PUBLIC
#endif
#if defined(JSON_HEDLEY_IMPORT)
    #undef JSON_HEDLEY_IMPORT
#endif
#if defined(_WIN32) || defined(__CYGWIN__)
#  define JSON_HEDLEY_PRIVATE
#  define JSON_HEDLEY_PUBLIC   __declspec(dllexport)
#  define JSON_HEDLEY_IMPORT   __declspec(dllimport)
#else
#  if \
    JSON_HEDLEY_HAS_ATTRIBUTE(visibility) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
    ( \
      defined(__TI_EABI__) && \
      ( \
        (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
        JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \
      ) \
    ) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#    define JSON_HEDLEY_PRIVATE __attribute__((__visibility__("hidden")))
#    define JSON_HEDLEY_PUBLIC  __attribute__((__visibility__("default")))
#  else
#    define JSON_HEDLEY_PRIVATE
#    define JSON_HEDLEY_PUBLIC
#  endif
#  define JSON_HEDLEY_IMPORT    extern
#endif

#if defined(JSON_HEDLEY_NO_THROW)
    #undef JSON_HEDLEY_NO_THROW
#endif
#if \
    JSON_HEDLEY_HAS_ATTRIBUTE(nothrow) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_NO_THROW __attribute__((__nothrow__))
#elif \
    JSON_HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0)
    #define JSON_HEDLEY_NO_THROW __declspec(nothrow)
#else
    #define JSON_HEDLEY_NO_THROW
#endif

#if defined(JSON_HEDLEY_FALL_THROUGH)
    #undef JSON_HEDLEY_FALL_THROUGH
#endif
#if \
    JSON_HEDLEY_HAS_ATTRIBUTE(fallthrough) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(7,0,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_FALL_THROUGH __attribute__((__fallthrough__))
#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough)
    #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]])
#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough)
    #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]])
#elif defined(__fallthrough) /* SAL */
    #define JSON_HEDLEY_FALL_THROUGH __fallthrough
#else
    #define JSON_HEDLEY_FALL_THROUGH
#endif

#if defined(JSON_HEDLEY_RETURNS_NON_NULL)
    #undef JSON_HEDLEY_RETURNS_NON_NULL
#endif
#if \
    JSON_HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__))
#elif defined(_Ret_notnull_) /* SAL */
    #define JSON_HEDLEY_RETURNS_NON_NULL _Ret_notnull_
#else
    #define JSON_HEDLEY_RETURNS_NON_NULL
#endif

#if defined(JSON_HEDLEY_ARRAY_PARAM)
    #undef JSON_HEDLEY_ARRAY_PARAM
#endif
#if \
    defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
    !defined(__STDC_NO_VLA__) && \
    !defined(__cplusplus) && \
    !defined(JSON_HEDLEY_PGI_VERSION) && \
    !defined(JSON_HEDLEY_TINYC_VERSION)
    #define JSON_HEDLEY_ARRAY_PARAM(name) (name)
#else
    #define JSON_HEDLEY_ARRAY_PARAM(name)
#endif

#if defined(JSON_HEDLEY_IS_CONSTANT)
    #undef JSON_HEDLEY_IS_CONSTANT
#endif
#if defined(JSON_HEDLEY_REQUIRE_CONSTEXPR)
    #undef JSON_HEDLEY_REQUIRE_CONSTEXPR
#endif
/* JSON_HEDLEY_IS_CONSTEXPR_ is for
   HEDLEY INTERNAL USE ONLY.  API subject to change without notice. */
#if defined(JSON_HEDLEY_IS_CONSTEXPR_)
    #undef JSON_HEDLEY_IS_CONSTEXPR_
#endif
#if \
    JSON_HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
    (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \
    JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr)
#endif
#if !defined(__cplusplus)
#  if \
       JSON_HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \
       JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
       JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
       JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
       JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
       JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \
       JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,24)
#if defined(__INTPTR_TYPE__)
    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*)
#else
    #include <stdint.h>
    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*)
#endif
#  elif \
       ( \
          defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
          !defined(JSON_HEDLEY_SUNPRO_VERSION) && \
          !defined(JSON_HEDLEY_PGI_VERSION) && \
          !defined(JSON_HEDLEY_IAR_VERSION)) || \
       (JSON_HEDLEY_HAS_EXTENSION(c_generic_selections) && !defined(JSON_HEDLEY_IAR_VERSION)) || \
       JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \
       JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \
       JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \
       JSON_HEDLEY_ARM_VERSION_CHECK(5,3,0)
#if defined(__INTPTR_TYPE__)
    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0)
#else
    #include <stdint.h>
    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) * 0) : (int*) 0), int*: 1, void*: 0)
#endif
#  elif \
       defined(JSON_HEDLEY_GCC_VERSION) || \
       defined(JSON_HEDLEY_INTEL_VERSION) || \
       defined(JSON_HEDLEY_TINYC_VERSION) || \
       defined(JSON_HEDLEY_TI_ARMCL_VERSION) || \
       JSON_HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \
       defined(JSON_HEDLEY_TI_CL2000_VERSION) || \
       defined(JSON_HEDLEY_TI_CL6X_VERSION) || \
       defined(JSON_HEDLEY_TI_CL7X_VERSION) || \
       defined(JSON_HEDLEY_TI_CLPRU_VERSION) || \
       defined(__clang__)
#    define JSON_HEDLEY_IS_CONSTEXPR_(expr) ( \
        sizeof(void) != \
        sizeof(*( \
                  1 ? \
                  ((void*) ((expr) * 0L) ) : \
((struct { char v[sizeof(void) * 2]; } *) 1) \
                ) \
              ) \
                                            )
#  endif
#endif
#if defined(JSON_HEDLEY_IS_CONSTEXPR_)
    #if !defined(JSON_HEDLEY_IS_CONSTANT)
        #define JSON_HEDLEY_IS_CONSTANT(expr) JSON_HEDLEY_IS_CONSTEXPR_(expr)
    #endif
    #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (JSON_HEDLEY_IS_CONSTEXPR_(expr) ? (expr) : (-1))
#else
    #if !defined(JSON_HEDLEY_IS_CONSTANT)
        #define JSON_HEDLEY_IS_CONSTANT(expr) (0)
    #endif
    #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (expr)
#endif

#if defined(JSON_HEDLEY_BEGIN_C_DECLS)
    #undef JSON_HEDLEY_BEGIN_C_DECLS
#endif
#if defined(JSON_HEDLEY_END_C_DECLS)
    #undef JSON_HEDLEY_END_C_DECLS
#endif
#if defined(JSON_HEDLEY_C_DECL)
    #undef JSON_HEDLEY_C_DECL
#endif
#if defined(__cplusplus)
    #define JSON_HEDLEY_BEGIN_C_DECLS extern "C" {
    #define JSON_HEDLEY_END_C_DECLS }
    #define JSON_HEDLEY_C_DECL extern "C"
#else
    #define JSON_HEDLEY_BEGIN_C_DECLS
    #define JSON_HEDLEY_END_C_DECLS
    #define JSON_HEDLEY_C_DECL
#endif

#if defined(JSON_HEDLEY_STATIC_ASSERT)
    #undef JSON_HEDLEY_STATIC_ASSERT
#endif
#if \
  !defined(__cplusplus) && ( \
      (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
      (JSON_HEDLEY_HAS_FEATURE(c_static_assert) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \
      JSON_HEDLEY_GCC_VERSION_CHECK(6,0,0) || \
      JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
      defined(_Static_assert) \
    )
#  define JSON_HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message)
#elif \
  (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
  JSON_HEDLEY_MSVC_VERSION_CHECK(16,0,0) || \
  JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
#  define JSON_HEDLEY_STATIC_ASSERT(expr, message) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message))
#else
#  define JSON_HEDLEY_STATIC_ASSERT(expr, message)
#endif

#if defined(JSON_HEDLEY_NULL)
    #undef JSON_HEDLEY_NULL
#endif
#if defined(__cplusplus)
    #if __cplusplus >= 201103L
        #define JSON_HEDLEY_NULL JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr)
    #elif defined(NULL)
        #define JSON_HEDLEY_NULL NULL
    #else
        #define JSON_HEDLEY_NULL JSON_HEDLEY_STATIC_CAST(void*, 0)
    #endif
#elif defined(NULL)
    #define JSON_HEDLEY_NULL NULL
#else
    #define JSON_HEDLEY_NULL ((void*) 0)
#endif

#if defined(JSON_HEDLEY_MESSAGE)
    #undef JSON_HEDLEY_MESSAGE
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas")
#  define JSON_HEDLEY_MESSAGE(msg) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
    JSON_HEDLEY_PRAGMA(message msg) \
    JSON_HEDLEY_DIAGNOSTIC_POP
#elif \
  JSON_HEDLEY_GCC_VERSION_CHECK(4,4,0) || \
  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
#  define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message msg)
#elif JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0)
#  define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(_CRI message msg)
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
#  define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg))
#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,0,0)
#  define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg))
#else
#  define JSON_HEDLEY_MESSAGE(msg)
#endif

#if defined(JSON_HEDLEY_WARNING)
    #undef JSON_HEDLEY_WARNING
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas")
#  define JSON_HEDLEY_WARNING(msg) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
    JSON_HEDLEY_PRAGMA(clang warning msg) \
    JSON_HEDLEY_DIAGNOSTIC_POP
#elif \
  JSON_HEDLEY_GCC_VERSION_CHECK(4,8,0) || \
  JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \
  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
#  define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(GCC warning msg)
#elif \
  JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \
  JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
#  define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(message(msg))
#else
#  define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_MESSAGE(msg)
#endif

#if defined(JSON_HEDLEY_REQUIRE)
    #undef JSON_HEDLEY_REQUIRE
#endif
#if defined(JSON_HEDLEY_REQUIRE_MSG)
    #undef JSON_HEDLEY_REQUIRE_MSG
#endif
#if JSON_HEDLEY_HAS_ATTRIBUTE(diagnose_if)
#  if JSON_HEDLEY_HAS_WARNING("-Wgcc-compat")
#    define JSON_HEDLEY_REQUIRE(expr) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \
    __attribute__((diagnose_if(!(expr), #expr, "error"))) \
    JSON_HEDLEY_DIAGNOSTIC_POP
#    define JSON_HEDLEY_REQUIRE_MSG(expr,msg) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \
    __attribute__((diagnose_if(!(expr), msg, "error"))) \
    JSON_HEDLEY_DIAGNOSTIC_POP
#  else
#    define JSON_HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error")))
#    define JSON_HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error")))
#  endif
#else
#  define JSON_HEDLEY_REQUIRE(expr)
#  define JSON_HEDLEY_REQUIRE_MSG(expr,msg)
#endif

#if defined(JSON_HEDLEY_FLAGS)
    #undef JSON_HEDLEY_FLAGS
#endif
#if JSON_HEDLEY_HAS_ATTRIBUTE(flag_enum) && (!defined(__cplusplus) || JSON_HEDLEY_HAS_WARNING("-Wbitfield-enum-conversion"))
    #define JSON_HEDLEY_FLAGS __attribute__((__flag_enum__))
#else
    #define JSON_HEDLEY_FLAGS
#endif

#if defined(JSON_HEDLEY_FLAGS_CAST)
    #undef JSON_HEDLEY_FLAGS_CAST
#endif
#if JSON_HEDLEY_INTEL_VERSION_CHECK(19,0,0)
#  define JSON_HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \
        JSON_HEDLEY_DIAGNOSTIC_PUSH \
        _Pragma("warning(disable:188)") \
        ((T) (expr)); \
        JSON_HEDLEY_DIAGNOSTIC_POP \
    }))
#else
#  define JSON_HEDLEY_FLAGS_CAST(T, expr) JSON_HEDLEY_STATIC_CAST(T, expr)
#endif

#if defined(JSON_HEDLEY_EMPTY_BASES)
    #undef JSON_HEDLEY_EMPTY_BASES
#endif
#if \
    (JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !JSON_HEDLEY_MSVC_VERSION_CHECK(20,0,0)) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_EMPTY_BASES __declspec(empty_bases)
#else
    #define JSON_HEDLEY_EMPTY_BASES
#endif

/* Remaining macros are deprecated. */

#if defined(JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK)
    #undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK
#endif
#if defined(__clang__)
    #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0)
#else
    #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

#if defined(JSON_HEDLEY_CLANG_HAS_ATTRIBUTE)
    #undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE
#endif
#define JSON_HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_ATTRIBUTE(attribute)

#if defined(JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE)
    #undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE
#endif
#define JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute)

#if defined(JSON_HEDLEY_CLANG_HAS_BUILTIN)
    #undef JSON_HEDLEY_CLANG_HAS_BUILTIN
#endif
#define JSON_HEDLEY_CLANG_HAS_BUILTIN(builtin) JSON_HEDLEY_HAS_BUILTIN(builtin)

#if defined(JSON_HEDLEY_CLANG_HAS_FEATURE)
    #undef JSON_HEDLEY_CLANG_HAS_FEATURE
#endif
#define JSON_HEDLEY_CLANG_HAS_FEATURE(feature) JSON_HEDLEY_HAS_FEATURE(feature)

#if defined(JSON_HEDLEY_CLANG_HAS_EXTENSION)
    #undef JSON_HEDLEY_CLANG_HAS_EXTENSION
#endif
#define JSON_HEDLEY_CLANG_HAS_EXTENSION(extension) JSON_HEDLEY_HAS_EXTENSION(extension)

#if defined(JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE)
    #undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE
#endif
#define JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute)

#if defined(JSON_HEDLEY_CLANG_HAS_WARNING)
    #undef JSON_HEDLEY_CLANG_HAS_WARNING
#endif
#define JSON_HEDLEY_CLANG_HAS_WARNING(warning) JSON_HEDLEY_HAS_WARNING(warning)

#endif /* !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < X) */


// This file contains all internal macro definitions (except those affecting ABI)
// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them

// #include <nlohmann/detail/abi_macros.hpp>


// exclude unsupported compilers
#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK)
    #if defined(__clang__)
        #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400
            #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers"
        #endif
    #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER))
        #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800
            #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers"
        #endif
    #endif
#endif

// C++ language standard detection
// if the user manually specified the used c++ version this is skipped
#if !defined(JSON_HAS_CPP_20) && !defined(JSON_HAS_CPP_17) && !defined(JSON_HAS_CPP_14) && !defined(JSON_HAS_CPP_11)
    #if (defined(__cplusplus) && __cplusplus >= 202002L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
        #define JSON_HAS_CPP_20
        #define JSON_HAS_CPP_17
        #define JSON_HAS_CPP_14
    #elif (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464
        #define JSON_HAS_CPP_17
        #define JSON_HAS_CPP_14
    #elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1)
        #define JSON_HAS_CPP_14
    #endif
    // the cpp 11 flag is always specified because it is the minimal required version
    #define JSON_HAS_CPP_11
#endif

#ifdef __has_include
    #if __has_include(<version>)
        #include <version>
    #endif
#endif

#if !defined(JSON_HAS_FILESYSTEM) && !defined(JSON_HAS_EXPERIMENTAL_FILESYSTEM)
    #ifdef JSON_HAS_CPP_17
        #if defined(__cpp_lib_filesystem)
            #define JSON_HAS_FILESYSTEM 1
        #elif defined(__cpp_lib_experimental_filesystem)
            #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 1
        #elif !defined(__has_include)
            #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 1
        #elif __has_include(<filesystem>)
            #define JSON_HAS_FILESYSTEM 1
        #elif __has_include(<experimental/filesystem>)
            #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 1
        #endif

        // std::filesystem does not work on MinGW GCC 8: https://sourceforge.net/p/mingw-w64/bugs/737/
        #if defined(__MINGW32__) && defined(__GNUC__) && __GNUC__ == 8
            #undef JSON_HAS_FILESYSTEM
            #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM
        #endif

        // no filesystem support before GCC 8: https://en.cppreference.com/w/cpp/compiler_support
        #if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 8
            #undef JSON_HAS_FILESYSTEM
            #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM
        #endif

        // no filesystem support before Clang 7: https://en.cppreference.com/w/cpp/compiler_support
        #if defined(__clang_major__) && __clang_major__ < 7
            #undef JSON_HAS_FILESYSTEM
            #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM
        #endif

        // no filesystem support before MSVC 19.14: https://en.cppreference.com/w/cpp/compiler_support
        #if defined(_MSC_VER) && _MSC_VER < 1914
            #undef JSON_HAS_FILESYSTEM
            #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM
        #endif

        // no filesystem support before iOS 13
        #if defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 130000
            #undef JSON_HAS_FILESYSTEM
            #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM
        #endif

        // no filesystem support before macOS Catalina
        #if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 101500
            #undef JSON_HAS_FILESYSTEM
            #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM
        #endif
    #endif
#endif

#ifndef JSON_HAS_EXPERIMENTAL_FILESYSTEM
    #define JSON_HAS_EXPERIMENTAL_FILESYSTEM 0
#endif

#ifndef JSON_HAS_FILESYSTEM
    #define JSON_HAS_FILESYSTEM 0
#endif

#ifndef JSON_HAS_THREE_WAY_COMPARISON
    #if defined(__cpp_impl_three_way_comparison) && __cpp_impl_three_way_comparison >= 201907L \
        && defined(__cpp_lib_three_way_comparison) && __cpp_lib_three_way_comparison >= 201907L
        #define JSON_HAS_THREE_WAY_COMPARISON 1
    #else
        #define JSON_HAS_THREE_WAY_COMPARISON 0
    #endif
#endif

#ifndef JSON_HAS_RANGES
    // ranges header shipping in GCC 11.1.0 (released 2021-04-27) has syntax error
    #if defined(__GLIBCXX__) && __GLIBCXX__ == 20210427
        #define JSON_HAS_RANGES 0
    #elif defined(__cpp_lib_ranges)
        #define JSON_HAS_RANGES 1
    #else
        #define JSON_HAS_RANGES 0
    #endif
#endif

#ifndef JSON_HAS_STATIC_RTTI
    #if !defined(_HAS_STATIC_RTTI) || _HAS_STATIC_RTTI != 0
        #define JSON_HAS_STATIC_RTTI 1
    #else
        #define JSON_HAS_STATIC_RTTI 0
    #endif
#endif

#ifdef JSON_HAS_CPP_17
    #define JSON_INLINE_VARIABLE inline
#else
    #define JSON_INLINE_VARIABLE
#endif

#if JSON_HEDLEY_HAS_ATTRIBUTE(no_unique_address)
    #define JSON_NO_UNIQUE_ADDRESS [[no_unique_address]]
#else
    #define JSON_NO_UNIQUE_ADDRESS
#endif

// disable documentation warnings on clang
#if defined(__clang__)
    #pragma clang diagnostic push
    #pragma clang diagnostic ignored "-Wdocumentation"
    #pragma clang diagnostic ignored "-Wdocumentation-unknown-command"
#endif

// allow disabling exceptions
#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION)
    #define JSON_THROW(exception) throw exception
    #define JSON_TRY try
    #define JSON_CATCH(exception) catch(exception)
    #define JSON_INTERNAL_CATCH(exception) catch(exception)
#else
    #include <cstdlib>
    #define JSON_THROW(exception) std::abort()
    #define JSON_TRY if(true)
    #define JSON_CATCH(exception) if(false)
    #define JSON_INTERNAL_CATCH(exception) if(false)
#endif

// override exception macros
#if defined(JSON_THROW_USER)
    #undef JSON_THROW
    #define JSON_THROW JSON_THROW_USER
#endif
#if defined(JSON_TRY_USER)
    #undef JSON_TRY
    #define JSON_TRY JSON_TRY_USER
#endif
#if defined(JSON_CATCH_USER)
    #undef JSON_CATCH
    #define JSON_CATCH JSON_CATCH_USER
    #undef JSON_INTERNAL_CATCH
    #define JSON_INTERNAL_CATCH JSON_CATCH_USER
#endif
#if defined(JSON_INTERNAL_CATCH_USER)
    #undef JSON_INTERNAL_CATCH
    #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER
#endif

// allow overriding assert
#if !defined(JSON_ASSERT)
    #include <cassert> // assert
    #define JSON_ASSERT(x) assert(x)
#endif

// allow to access some private functions (needed by the test suite)
#if defined(JSON_TESTS_PRIVATE)
    #define JSON_PRIVATE_UNLESS_TESTED public
#else
    #define JSON_PRIVATE_UNLESS_TESTED private
#endif

/*!
@brief macro to briefly define a mapping between an enum and JSON
@def NLOHMANN_JSON_SERIALIZE_ENUM
@since version 3.4.0
*/
#define NLOHMANN_JSON_SERIALIZE_ENUM(ENUM_TYPE, ...)                                            \
    template<typename BasicJsonType>                                                            \
    inline void to_json(BasicJsonType& j, const ENUM_TYPE& e)                                   \
    {                                                                                           \
        static_assert(std::is_enum<ENUM_TYPE>::value, #ENUM_TYPE " must be an enum!");          \
        static const std::pair<ENUM_TYPE, BasicJsonType> m[] = __VA_ARGS__;                     \
        auto it = std::find_if(std::begin(m), std::end(m),                                      \
                               [e](const std::pair<ENUM_TYPE, BasicJsonType>& ej_pair) -> bool  \
        {                                                                                       \
            return ej_pair.first == e;                                                          \
        });                                                                                     \
        j = ((it != std::end(m)) ? it : std::begin(m))->second;                                 \
    }                                                                                           \
    template<typename BasicJsonType>                                                            \
    inline void from_json(const BasicJsonType& j, ENUM_TYPE& e)                                 \
    {                                                                                           \
        static_assert(std::is_enum<ENUM_TYPE>::value, #ENUM_TYPE " must be an enum!");          \
        static const std::pair<ENUM_TYPE, BasicJsonType> m[] = __VA_ARGS__;                     \
        auto it = std::find_if(std::begin(m), std::end(m),                                      \
                               [&j](const std::pair<ENUM_TYPE, BasicJsonType>& ej_pair) -> bool \
        {                                                                                       \
            return ej_pair.second == j;                                                         \
        });                                                                                     \
        e = ((it != std::end(m)) ? it : std::begin(m))->first;                                  \
    }

// Ugly macros to avoid uglier copy-paste when specializing basic_json. They
// may be removed in the future once the class is split.

#define NLOHMANN_BASIC_JSON_TPL_DECLARATION                                \
    template<template<typename, typename, typename...> class ObjectType,   \
             template<typename, typename...> class ArrayType,              \
             class StringType, class BooleanType, class NumberIntegerType, \
             class NumberUnsignedType, class NumberFloatType,              \
             template<typename> class AllocatorType,                       \
             template<typename, typename = void> class JSONSerializer,     \
             class BinaryType,                                             \
             class CustomBaseClass>

#define NLOHMANN_BASIC_JSON_TPL                                            \
    basic_json<ObjectType, ArrayType, StringType, BooleanType,             \
    NumberIntegerType, NumberUnsignedType, NumberFloatType,                \
    AllocatorType, JSONSerializer, BinaryType, CustomBaseClass>

// Macros to simplify conversion from/to types

#define NLOHMANN_JSON_EXPAND( x ) x
#define NLOHMANN_JSON_GET_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, _61, _62, _63, _64, NAME,...) NAME
#define NLOHMANN_JSON_PASTE(...) NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_GET_MACRO(__VA_ARGS__, \
        NLOHMANN_JSON_PASTE64, \
        NLOHMANN_JSON_PASTE63, \
        NLOHMANN_JSON_PASTE62, \
        NLOHMANN_JSON_PASTE61, \
        NLOHMANN_JSON_PASTE60, \
        NLOHMANN_JSON_PASTE59, \
        NLOHMANN_JSON_PASTE58, \
        NLOHMANN_JSON_PASTE57, \
        NLOHMANN_JSON_PASTE56, \
        NLOHMANN_JSON_PASTE55, \
        NLOHMANN_JSON_PASTE54, \
        NLOHMANN_JSON_PASTE53, \
        NLOHMANN_JSON_PASTE52, \
        NLOHMANN_JSON_PASTE51, \
        NLOHMANN_JSON_PASTE50, \
        NLOHMANN_JSON_PASTE49, \
        NLOHMANN_JSON_PASTE48, \
        NLOHMANN_JSON_PASTE47, \
        NLOHMANN_JSON_PASTE46, \
        NLOHMANN_JSON_PASTE45, \
        NLOHMANN_JSON_PASTE44, \
        NLOHMANN_JSON_PASTE43, \
        NLOHMANN_JSON_PASTE42, \
        NLOHMANN_JSON_PASTE41, \
        NLOHMANN_JSON_PASTE40, \
        NLOHMANN_JSON_PASTE39, \
        NLOHMANN_JSON_PASTE38, \
        NLOHMANN_JSON_PASTE37, \
        NLOHMANN_JSON_PASTE36, \
        NLOHMANN_JSON_PASTE35, \
        NLOHMANN_JSON_PASTE34, \
        NLOHMANN_JSON_PASTE33, \
        NLOHMANN_JSON_PASTE32, \
        NLOHMANN_JSON_PASTE31, \
        NLOHMANN_JSON_PASTE30, \
        NLOHMANN_JSON_PASTE29, \
        NLOHMANN_JSON_PASTE28, \
        NLOHMANN_JSON_PASTE27, \
        NLOHMANN_JSON_PASTE26, \
        NLOHMANN_JSON_PASTE25, \
        NLOHMANN_JSON_PASTE24, \
        NLOHMANN_JSON_PASTE23, \
        NLOHMANN_JSON_PASTE22, \
        NLOHMANN_JSON_PASTE21, \
        NLOHMANN_JSON_PASTE20, \
        NLOHMANN_JSON_PASTE19, \
        NLOHMANN_JSON_PASTE18, \
        NLOHMANN_JSON_PASTE17, \
        NLOHMANN_JSON_PASTE16, \
        NLOHMANN_JSON_PASTE15, \
        NLOHMANN_JSON_PASTE14, \
        NLOHMANN_JSON_PASTE13, \
        NLOHMANN_JSON_PASTE12, \
        NLOHMANN_JSON_PASTE11, \
        NLOHMANN_JSON_PASTE10, \
        NLOHMANN_JSON_PASTE9, \
        NLOHMANN_JSON_PASTE8, \
        NLOHMANN_JSON_PASTE7, \
        NLOHMANN_JSON_PASTE6, \
        NLOHMANN_JSON_PASTE5, \
        NLOHMANN_JSON_PASTE4, \
        NLOHMANN_JSON_PASTE3, \
        NLOHMANN_JSON_PASTE2, \
        NLOHMANN_JSON_PASTE1)(__VA_ARGS__))
#define NLOHMANN_JSON_PASTE2(func, v1) func(v1)
#define NLOHMANN_JSON_PASTE3(func, v1, v2) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE2(func, v2)
#define NLOHMANN_JSON_PASTE4(func, v1, v2, v3) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE3(func, v2, v3)
#define NLOHMANN_JSON_PASTE5(func, v1, v2, v3, v4) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE4(func, v2, v3, v4)
#define NLOHMANN_JSON_PASTE6(func, v1, v2, v3, v4, v5) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE5(func, v2, v3, v4, v5)
#define NLOHMANN_JSON_PASTE7(func, v1, v2, v3, v4, v5, v6) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE6(func, v2, v3, v4, v5, v6)
#define NLOHMANN_JSON_PASTE8(func, v1, v2, v3, v4, v5, v6, v7) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE7(func, v2, v3, v4, v5, v6, v7)
#define NLOHMANN_JSON_PASTE9(func, v1, v2, v3, v4, v5, v6, v7, v8) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE8(func, v2, v3, v4, v5, v6, v7, v8)
#define NLOHMANN_JSON_PASTE10(func, v1, v2, v3, v4, v5, v6, v7, v8, v9) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE9(func, v2, v3, v4, v5, v6, v7, v8, v9)
#define NLOHMANN_JSON_PASTE11(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE10(func, v2, v3, v4, v5, v6, v7, v8, v9, v10)
#define NLOHMANN_JSON_PASTE12(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE11(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11)
#define NLOHMANN_JSON_PASTE13(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE12(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12)
#define NLOHMANN_JSON_PASTE14(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE13(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13)
#define NLOHMANN_JSON_PASTE15(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE14(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14)
#define NLOHMANN_JSON_PASTE16(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE15(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)
#define NLOHMANN_JSON_PASTE17(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE16(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16)
#define NLOHMANN_JSON_PASTE18(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE17(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17)
#define NLOHMANN_JSON_PASTE19(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE18(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18)
#define NLOHMANN_JSON_PASTE20(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE19(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19)
#define NLOHMANN_JSON_PASTE21(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE20(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20)
#define NLOHMANN_JSON_PASTE22(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE21(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21)
#define NLOHMANN_JSON_PASTE23(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE22(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22)
#define NLOHMANN_JSON_PASTE24(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE23(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23)
#define NLOHMANN_JSON_PASTE25(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE24(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24)
#define NLOHMANN_JSON_PASTE26(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE25(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25)
#define NLOHMANN_JSON_PASTE27(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE26(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26)
#define NLOHMANN_JSON_PASTE28(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE27(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27)
#define NLOHMANN_JSON_PASTE29(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE28(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28)
#define NLOHMANN_JSON_PASTE30(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE29(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29)
#define NLOHMANN_JSON_PASTE31(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE30(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30)
#define NLOHMANN_JSON_PASTE32(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE31(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31)
#define NLOHMANN_JSON_PASTE33(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE32(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32)
#define NLOHMANN_JSON_PASTE34(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE33(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33)
#define NLOHMANN_JSON_PASTE35(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE34(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34)
#define NLOHMANN_JSON_PASTE36(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE35(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35)
#define NLOHMANN_JSON_PASTE37(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE36(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36)
#define NLOHMANN_JSON_PASTE38(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE37(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37)
#define NLOHMANN_JSON_PASTE39(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE38(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38)
#define NLOHMANN_JSON_PASTE40(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE39(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39)
#define NLOHMANN_JSON_PASTE41(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE40(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40)
#define NLOHMANN_JSON_PASTE42(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE41(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41)
#define NLOHMANN_JSON_PASTE43(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE42(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42)
#define NLOHMANN_JSON_PASTE44(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE43(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43)
#define NLOHMANN_JSON_PASTE45(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE44(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44)
#define NLOHMANN_JSON_PASTE46(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE45(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45)
#define NLOHMANN_JSON_PASTE47(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE46(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46)
#define NLOHMANN_JSON_PASTE48(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE47(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47)
#define NLOHMANN_JSON_PASTE49(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE48(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48)
#define NLOHMANN_JSON_PASTE50(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE49(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49)
#define NLOHMANN_JSON_PASTE51(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE50(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50)
#define NLOHMANN_JSON_PASTE52(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE51(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51)
#define NLOHMANN_JSON_PASTE53(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE52(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52)
#define NLOHMANN_JSON_PASTE54(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE53(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53)
#define NLOHMANN_JSON_PASTE55(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE54(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54)
#define NLOHMANN_JSON_PASTE56(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE55(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55)
#define NLOHMANN_JSON_PASTE57(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE56(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56)
#define NLOHMANN_JSON_PASTE58(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE57(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57)
#define NLOHMANN_JSON_PASTE59(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE58(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58)
#define NLOHMANN_JSON_PASTE60(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE59(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59)
#define NLOHMANN_JSON_PASTE61(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE60(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60)
#define NLOHMANN_JSON_PASTE62(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE61(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61)
#define NLOHMANN_JSON_PASTE63(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE62(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62)
#define NLOHMANN_JSON_PASTE64(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE63(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63)

#define NLOHMANN_JSON_TO(v1) nlohmann_json_j[#v1] = nlohmann_json_t.v1;
#define NLOHMANN_JSON_FROM(v1) nlohmann_json_j.at(#v1).get_to(nlohmann_json_t.v1);
#define NLOHMANN_JSON_FROM_WITH_DEFAULT(v1) nlohmann_json_t.v1 = nlohmann_json_j.value(#v1, nlohmann_json_default_obj.v1);

/*!
@brief macro
@def NLOHMANN_DEFINE_TYPE_INTRUSIVE
@since version 3.9.0
*/
#define NLOHMANN_DEFINE_TYPE_INTRUSIVE(Type, ...)  \
    friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
    friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) }

#define NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(Type, ...)  \
    friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
    friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) }

#define NLOHMANN_DEFINE_TYPE_INTRUSIVE_ONLY_SERIALIZE(Type, ...)  \
    friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) }

/*!
@brief macro
@def NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE
@since version 3.9.0
*/
#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Type, ...)  \
    inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
    inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) }

#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_ONLY_SERIALIZE(Type, ...)  \
    inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) }

#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_WITH_DEFAULT(Type, ...)  \
    inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
    inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) }

/*!
@brief macro
@def NLOHMANN_DEFINE_DERIVED_TYPE_INTRUSIVE
@since version 3.11.x
*/
#define NLOHMANN_DEFINE_DERIVED_TYPE_INTRUSIVE(Type, BaseType, ...)  \
    friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast<const BaseType &>(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
    friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { nlohmann::from_json(nlohmann_json_j, static_cast<BaseType&>(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) }

#define NLOHMANN_DEFINE_DERIVED_TYPE_INTRUSIVE_WITH_DEFAULT(Type, BaseType, ...)  \
    friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast<const BaseType&>(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
    friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { nlohmann::from_json(nlohmann_json_j, static_cast<BaseType&>(nlohmann_json_t)); const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) }

/*!
@brief macro
@def NLOHMANN_DEFINE_DERIVED_TYPE_NON_INTRUSIVE
@since version 3.11.x
*/
#define NLOHMANN_DEFINE_DERIVED_TYPE_NON_INTRUSIVE(Type, BaseType, ...)  \
    inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast<const BaseType &>(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
    inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { nlohmann::from_json(nlohmann_json_j, static_cast<BaseType&>(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) }

#define NLOHMANN_DEFINE_DERIVED_TYPE_NON_INTRUSIVE_WITH_DEFAULT(Type, BaseType, ...)  \
    inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { nlohmann::to_json(nlohmann_json_j, static_cast<const BaseType &>(nlohmann_json_t)); NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
    inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { nlohmann::from_json(nlohmann_json_j, static_cast<BaseType&>(nlohmann_json_t)); const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) }


// inspired from https://stackoverflow.com/a/26745591
// allows to call any std function as if (e.g. with begin):
// using std::begin; begin(x);
//
// it allows using the detected idiom to retrieve the return type
// of such an expression
#define NLOHMANN_CAN_CALL_STD_FUNC_IMPL(std_name)                                 \
    namespace detail {                                                            \
    using std::std_name;                                                          \
    \
    template<typename... T>                                                       \
    using result_of_##std_name = decltype(std_name(std::declval<T>()...));        \
    }                                                                             \
    \
    namespace detail2 {                                                           \
    struct std_name##_tag                                                         \
    {                                                                             \
    };                                                                            \
    \
    template<typename... T>                                                       \
    std_name##_tag std_name(T&&...);                                              \
    \
    template<typename... T>                                                       \
    using result_of_##std_name = decltype(std_name(std::declval<T>()...));        \
    \
    template<typename... T>                                                       \
    struct would_call_std_##std_name                                              \
    {                                                                             \
        static constexpr auto const value = ::nlohmann::detail::                  \
                                            is_detected_exact<std_name##_tag, result_of_##std_name, T...>::value; \
    };                                                                            \
    } /* namespace detail2 */ \
    \
    template<typename... T>                                                       \
    struct would_call_std_##std_name : detail2::would_call_std_##std_name<T...>   \
    {                                                                             \
    }

#ifndef JSON_USE_IMPLICIT_CONVERSIONS
    #define JSON_USE_IMPLICIT_CONVERSIONS 1
#endif

#if JSON_USE_IMPLICIT_CONVERSIONS
    #define JSON_EXPLICIT
#else
    #define JSON_EXPLICIT explicit
#endif

#ifndef JSON_DISABLE_ENUM_SERIALIZATION
    #define JSON_DISABLE_ENUM_SERIALIZATION 0
#endif

#ifndef JSON_USE_GLOBAL_UDLS
    #define JSON_USE_GLOBAL_UDLS 1
#endif

#if JSON_HAS_THREE_WAY_COMPARISON
    #include <compare> // partial_ordering
#endif

NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

///////////////////////////
// JSON type enumeration //
///////////////////////////

/*!
@brief the JSON type enumeration

This enumeration collects the different JSON types. It is internally used to
distinguish the stored values, and the functions @ref basic_json::is_null(),
@ref basic_json::is_object(), @ref basic_json::is_array(),
@ref basic_json::is_string(), @ref basic_json::is_boolean(),
@ref basic_json::is_number() (with @ref basic_json::is_number_integer(),
@ref basic_json::is_number_unsigned(), and @ref basic_json::is_number_float()),
@ref basic_json::is_discarded(), @ref basic_json::is_primitive(), and
@ref basic_json::is_structured() rely on it.

@note There are three enumeration entries (number_integer, number_unsigned, and
number_float), because the library distinguishes these three types for numbers:
@ref basic_json::number_unsigned_t is used for unsigned integers,
@ref basic_json::number_integer_t is used for signed integers, and
@ref basic_json::number_float_t is used for floating-point numbers or to
approximate integers which do not fit in the limits of their respective type.

@sa see @ref basic_json::basic_json(const value_t value_type) -- create a JSON
value with the default value for a given type

@since version 1.0.0
*/
enum class value_t : std::uint8_t
{
    null,             ///< null value
    object,           ///< object (unordered set of name/value pairs)
    array,            ///< array (ordered collection of values)
    string,           ///< string value
    boolean,          ///< boolean value
    number_integer,   ///< number value (signed integer)
    number_unsigned,  ///< number value (unsigned integer)
    number_float,     ///< number value (floating-point)
    binary,           ///< binary array (ordered collection of bytes)
    discarded         ///< discarded by the parser callback function
};

/*!
@brief comparison operator for JSON types

Returns an ordering that is similar to Python:
- order: null < boolean < number < object < array < string < binary
- furthermore, each type is not smaller than itself
- discarded values are not comparable
- binary is represented as a b"" string in python and directly comparable to a
  string; however, making a binary array directly comparable with a string would
  be surprising behavior in a JSON file.

@since version 1.0.0
*/
#if JSON_HAS_THREE_WAY_COMPARISON
    inline std::partial_ordering operator<=>(const value_t lhs, const value_t rhs) noexcept // *NOPAD*
#else
    inline bool operator<(const value_t lhs, const value_t rhs) noexcept
#endif
{
    static constexpr std::array<std::uint8_t, 9> order = {{
            0 /* null */, 3 /* object */, 4 /* array */, 5 /* string */,
            1 /* boolean */, 2 /* integer */, 2 /* unsigned */, 2 /* float */,
            6 /* binary */
        }
    };

    const auto l_index = static_cast<std::size_t>(lhs);
    const auto r_index = static_cast<std::size_t>(rhs);
#if JSON_HAS_THREE_WAY_COMPARISON
    if (l_index < order.size() && r_index < order.size())
    {
        return order[l_index] <=> order[r_index]; // *NOPAD*
    }
    return std::partial_ordering::unordered;
#else
    return l_index < order.size() && r_index < order.size() && order[l_index] < order[r_index];
#endif
}

// GCC selects the built-in operator< over an operator rewritten from
// a user-defined spaceship operator
// Clang, MSVC, and ICC select the rewritten candidate
// (see GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105200)
#if JSON_HAS_THREE_WAY_COMPARISON && defined(__GNUC__)
inline bool operator<(const value_t lhs, const value_t rhs) noexcept
{
    return std::is_lt(lhs <=> rhs); // *NOPAD*
}
#endif

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/string_escape.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


// #include <nlohmann/detail/abi_macros.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

/*!
@brief replace all occurrences of a substring by another string

@param[in,out] s  the string to manipulate; changed so that all
               occurrences of @a f are replaced with @a t
@param[in]     f  the substring to replace with @a t
@param[in]     t  the string to replace @a f

@pre The search string @a f must not be empty. **This precondition is
enforced with an assertion.**

@since version 2.0.0
*/
template<typename StringType>
inline void replace_substring(StringType& s, const StringType& f,
                              const StringType& t)
{
    JSON_ASSERT(!f.empty());
    for (auto pos = s.find(f);                // find first occurrence of f
            pos != StringType::npos;          // make sure f was found
            s.replace(pos, f.size(), t),      // replace with t, and
            pos = s.find(f, pos + t.size()))  // find next occurrence of f
    {}
}

/*!
 * @brief string escaping as described in RFC 6901 (Sect. 4)
 * @param[in] s string to escape
 * @return    escaped string
 *
 * Note the order of escaping "~" to "~0" and "/" to "~1" is important.
 */
template<typename StringType>
inline StringType escape(StringType s)
{
    replace_substring(s, StringType{"~"}, StringType{"~0"});
    replace_substring(s, StringType{"/"}, StringType{"~1"});
    return s;
}

/*!
 * @brief string unescaping as described in RFC 6901 (Sect. 4)
 * @param[in] s string to unescape
 * @return    unescaped string
 *
 * Note the order of escaping "~1" to "/" and "~0" to "~" is important.
 */
template<typename StringType>
static void unescape(StringType& s)
{
    replace_substring(s, StringType{"~1"}, StringType{"/"});
    replace_substring(s, StringType{"~0"}, StringType{"~"});
}

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/input/position_t.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <cstddef> // size_t

// #include <nlohmann/detail/abi_macros.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

/// struct to capture the start position of the current token
struct position_t
{
    /// the total number of characters read
    std::size_t chars_read_total = 0;
    /// the number of characters read in the current line
    std::size_t chars_read_current_line = 0;
    /// the number of lines read
    std::size_t lines_read = 0;

    /// conversion to size_t to preserve SAX interface
    constexpr operator size_t() const
    {
        return chars_read_total;
    }
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/cpp_future.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-FileCopyrightText: 2018 The Abseil Authors
// SPDX-License-Identifier: MIT


#include <array> // array
#include <cstddef> // size_t
#include <type_traits> // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type
#include <utility> // index_sequence, make_index_sequence, index_sequence_for

// #include <nlohmann/detail/macro_scope.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

template<typename T>
using uncvref_t = typename std::remove_cv<typename std::remove_reference<T>::type>::type;

#ifdef JSON_HAS_CPP_14

// the following utilities are natively available in C++14
using std::enable_if_t;
using std::index_sequence;
using std::make_index_sequence;
using std::index_sequence_for;

#else

// alias templates to reduce boilerplate
template<bool B, typename T = void>
using enable_if_t = typename std::enable_if<B, T>::type;

// The following code is taken from https://github.com/abseil/abseil-cpp/blob/10cb35e459f5ecca5b2ff107635da0bfa41011b4/absl/utility/utility.h
// which is part of Google Abseil (https://github.com/abseil/abseil-cpp), licensed under the Apache License 2.0.

//// START OF CODE FROM GOOGLE ABSEIL

// integer_sequence
//
// Class template representing a compile-time integer sequence. An instantiation
// of `integer_sequence<T, Ints...>` has a sequence of integers encoded in its
// type through its template arguments (which is a common need when
// working with C++11 variadic templates). `absl::integer_sequence` is designed
// to be a drop-in replacement for C++14's `std::integer_sequence`.
//
// Example:
//
//   template< class T, T... Ints >
//   void user_function(integer_sequence<T, Ints...>);
//
//   int main()
//   {
//     // user_function's `T` will be deduced to `int` and `Ints...`
//     // will be deduced to `0, 1, 2, 3, 4`.
//     user_function(make_integer_sequence<int, 5>());
//   }
template <typename T, T... Ints>
struct integer_sequence
{
    using value_type = T;
    static constexpr std::size_t size() noexcept
    {
        return sizeof...(Ints);
    }
};

// index_sequence
//
// A helper template for an `integer_sequence` of `size_t`,
// `absl::index_sequence` is designed to be a drop-in replacement for C++14's
// `std::index_sequence`.
template <size_t... Ints>
using index_sequence = integer_sequence<size_t, Ints...>;

namespace utility_internal
{

template <typename Seq, size_t SeqSize, size_t Rem>
struct Extend;

// Note that SeqSize == sizeof...(Ints). It's passed explicitly for efficiency.
template <typename T, T... Ints, size_t SeqSize>
struct Extend<integer_sequence<T, Ints...>, SeqSize, 0>
{
    using type = integer_sequence < T, Ints..., (Ints + SeqSize)... >;
};

template <typename T, T... Ints, size_t SeqSize>
struct Extend<integer_sequence<T, Ints...>, SeqSize, 1>
{
    using type = integer_sequence < T, Ints..., (Ints + SeqSize)..., 2 * SeqSize >;
};

// Recursion helper for 'make_integer_sequence<T, N>'.
// 'Gen<T, N>::type' is an alias for 'integer_sequence<T, 0, 1, ... N-1>'.
template <typename T, size_t N>
struct Gen
{
    using type =
        typename Extend < typename Gen < T, N / 2 >::type, N / 2, N % 2 >::type;
};

template <typename T>
struct Gen<T, 0>
{
    using type = integer_sequence<T>;
};

}  // namespace utility_internal

// Compile-time sequences of integers

// make_integer_sequence
//
// This template alias is equivalent to
// `integer_sequence<int, 0, 1, ..., N-1>`, and is designed to be a drop-in
// replacement for C++14's `std::make_integer_sequence`.
template <typename T, T N>
using make_integer_sequence = typename utility_internal::Gen<T, N>::type;

// make_index_sequence
//
// This template alias is equivalent to `index_sequence<0, 1, ..., N-1>`,
// and is designed to be a drop-in replacement for C++14's
// `std::make_index_sequence`.
template <size_t N>
using make_index_sequence = make_integer_sequence<size_t, N>;

// index_sequence_for
//
// Converts a typename pack into an index sequence of the same length, and
// is designed to be a drop-in replacement for C++14's
// `std::index_sequence_for()`
template <typename... Ts>
using index_sequence_for = make_index_sequence<sizeof...(Ts)>;

//// END OF CODE FROM GOOGLE ABSEIL

#endif

// dispatch utility (taken from ranges-v3)
template<unsigned N> struct priority_tag : priority_tag < N - 1 > {};
template<> struct priority_tag<0> {};

// taken from ranges-v3
template<typename T>
struct static_const
{
    static JSON_INLINE_VARIABLE constexpr T value{};
};

#ifndef JSON_HAS_CPP_17
    template<typename T>
    constexpr T static_const<T>::value;
#endif

template<typename T, typename... Args>
inline constexpr std::array<T, sizeof...(Args)> make_array(Args&& ... args)
{
    return std::array<T, sizeof...(Args)> {{static_cast<T>(std::forward<Args>(args))...}};
}

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/meta/type_traits.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <limits> // numeric_limits
#include <type_traits> // false_type, is_constructible, is_integral, is_same, true_type
#include <utility> // declval
#include <tuple> // tuple
#include <string> // char_traits

// #include <nlohmann/detail/iterators/iterator_traits.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <iterator> // random_access_iterator_tag

// #include <nlohmann/detail/abi_macros.hpp>

// #include <nlohmann/detail/meta/void_t.hpp>

// #include <nlohmann/detail/meta/cpp_future.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

template<typename It, typename = void>
struct iterator_types {};

template<typename It>
struct iterator_types <
    It,
    void_t<typename It::difference_type, typename It::value_type, typename It::pointer,
    typename It::reference, typename It::iterator_category >>
{
    using difference_type = typename It::difference_type;
    using value_type = typename It::value_type;
    using pointer = typename It::pointer;
    using reference = typename It::reference;
    using iterator_category = typename It::iterator_category;
};

// This is required as some compilers implement std::iterator_traits in a way that
// doesn't work with SFINAE. See https://github.com/nlohmann/json/issues/1341.
template<typename T, typename = void>
struct iterator_traits
{
};

template<typename T>
struct iterator_traits < T, enable_if_t < !std::is_pointer<T>::value >>
            : iterator_types<T>
{
};

template<typename T>
struct iterator_traits<T*, enable_if_t<std::is_object<T>::value>>
{
    using iterator_category = std::random_access_iterator_tag;
    using value_type = T;
    using difference_type = ptrdiff_t;
    using pointer = T*;
    using reference = T&;
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/call_std/begin.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


// #include <nlohmann/detail/macro_scope.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN

NLOHMANN_CAN_CALL_STD_FUNC_IMPL(begin);

NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/meta/call_std/end.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


// #include <nlohmann/detail/macro_scope.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN

NLOHMANN_CAN_CALL_STD_FUNC_IMPL(end);

NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/meta/cpp_future.hpp>

// #include <nlohmann/detail/meta/detected.hpp>

// #include <nlohmann/json_fwd.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT

#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_
    #define INCLUDE_NLOHMANN_JSON_FWD_HPP_

    #include <cstdint> // int64_t, uint64_t
    #include <map> // map
    #include <memory> // allocator
    #include <string> // string
    #include <vector> // vector

    // #include <nlohmann/detail/abi_macros.hpp>


    /*!
    @brief namespace for Niels Lohmann
    @see https://github.com/nlohmann
    @since version 1.0.0
    */
    NLOHMANN_JSON_NAMESPACE_BEGIN

    /*!
    @brief default JSONSerializer template argument

    This serializer ignores the template arguments and uses ADL
    ([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl))
    for serialization.
    */
    template<typename T = void, typename SFINAE = void>
    struct adl_serializer;

    /// a class to store JSON values
    /// @sa https://json.nlohmann.me/api/basic_json/
    template<template<typename U, typename V, typename... Args> class ObjectType =
    std::map,
    template<typename U, typename... Args> class ArrayType = std::vector,
    class StringType = std::string, class BooleanType = bool,
    class NumberIntegerType = std::int64_t,
    class NumberUnsignedType = std::uint64_t,
    class NumberFloatType = double,
    template<typename U> class AllocatorType = std::allocator,
    template<typename T, typename SFINAE = void> class JSONSerializer =
    adl_serializer,
    class BinaryType = std::vector<std::uint8_t>, // cppcheck-suppress syntaxError
    class CustomBaseClass = void>
    class basic_json;

    /// @brief JSON Pointer defines a string syntax for identifying a specific value within a JSON document
    /// @sa https://json.nlohmann.me/api/json_pointer/
    template<typename RefStringType>
    class json_pointer;

    /*!
    @brief default specialization
    @sa https://json.nlohmann.me/api/json/
    */
    using json = basic_json<>;

    /// @brief a minimal map-like container that preserves insertion order
    /// @sa https://json.nlohmann.me/api/ordered_map/
    template<class Key, class T, class IgnoredLess, class Allocator>
    struct ordered_map;

    /// @brief specialization that maintains the insertion order of object keys
    /// @sa https://json.nlohmann.me/api/ordered_json/
    using ordered_json = basic_json<nlohmann::ordered_map>;

    NLOHMANN_JSON_NAMESPACE_END

#endif  // INCLUDE_NLOHMANN_JSON_FWD_HPP_


NLOHMANN_JSON_NAMESPACE_BEGIN
/*!
@brief detail namespace with internal helper functions

This namespace collects functions that should not be exposed,
implementations of some @ref basic_json methods, and meta-programming helpers.

@since version 2.1.0
*/
namespace detail
{

/////////////
// helpers //
/////////////

// Note to maintainers:
//
// Every trait in this file expects a non CV-qualified type.
// The only exceptions are in the 'aliases for detected' section
// (i.e. those of the form: decltype(T::member_function(std::declval<T>())))
//
// In this case, T has to be properly CV-qualified to constraint the function arguments
// (e.g. to_json(BasicJsonType&, const T&))

template<typename> struct is_basic_json : std::false_type {};

NLOHMANN_BASIC_JSON_TPL_DECLARATION
struct is_basic_json<NLOHMANN_BASIC_JSON_TPL> : std::true_type {};

// used by exceptions create() member functions
// true_type for pointer to possibly cv-qualified basic_json or std::nullptr_t
// false_type otherwise
template<typename BasicJsonContext>
struct is_basic_json_context :
    std::integral_constant < bool,
    is_basic_json<typename std::remove_cv<typename std::remove_pointer<BasicJsonContext>::type>::type>::value
    || std::is_same<BasicJsonContext, std::nullptr_t>::value >
{};

//////////////////////
// json_ref helpers //
//////////////////////

template<typename>
class json_ref;

template<typename>
struct is_json_ref : std::false_type {};

template<typename T>
struct is_json_ref<json_ref<T>> : std::true_type {};

//////////////////////////
// aliases for detected //
//////////////////////////

template<typename T>
using mapped_type_t = typename T::mapped_type;

template<typename T>
using key_type_t = typename T::key_type;

template<typename T>
using value_type_t = typename T::value_type;

template<typename T>
using difference_type_t = typename T::difference_type;

template<typename T>
using pointer_t = typename T::pointer;

template<typename T>
using reference_t = typename T::reference;

template<typename T>
using iterator_category_t = typename T::iterator_category;

template<typename T, typename... Args>
using to_json_function = decltype(T::to_json(std::declval<Args>()...));

template<typename T, typename... Args>
using from_json_function = decltype(T::from_json(std::declval<Args>()...));

template<typename T, typename U>
using get_template_function = decltype(std::declval<T>().template get<U>());

// trait checking if JSONSerializer<T>::from_json(json const&, udt&) exists
template<typename BasicJsonType, typename T, typename = void>
struct has_from_json : std::false_type {};

// trait checking if j.get<T> is valid
// use this trait instead of std::is_constructible or std::is_convertible,
// both rely on, or make use of implicit conversions, and thus fail when T
// has several constructors/operator= (see https://github.com/nlohmann/json/issues/958)
template <typename BasicJsonType, typename T>
struct is_getable
{
    static constexpr bool value = is_detected<get_template_function, const BasicJsonType&, T>::value;
};

template<typename BasicJsonType, typename T>
struct has_from_json < BasicJsonType, T, enable_if_t < !is_basic_json<T>::value >>
{
    using serializer = typename BasicJsonType::template json_serializer<T, void>;

    static constexpr bool value =
        is_detected_exact<void, from_json_function, serializer,
        const BasicJsonType&, T&>::value;
};

// This trait checks if JSONSerializer<T>::from_json(json const&) exists
// this overload is used for non-default-constructible user-defined-types
template<typename BasicJsonType, typename T, typename = void>
struct has_non_default_from_json : std::false_type {};

template<typename BasicJsonType, typename T>
struct has_non_default_from_json < BasicJsonType, T, enable_if_t < !is_basic_json<T>::value >>
{
    using serializer = typename BasicJsonType::template json_serializer<T, void>;

    static constexpr bool value =
        is_detected_exact<T, from_json_function, serializer,
        const BasicJsonType&>::value;
};

// This trait checks if BasicJsonType::json_serializer<T>::to_json exists
// Do not evaluate the trait when T is a basic_json type, to avoid template instantiation infinite recursion.
template<typename BasicJsonType, typename T, typename = void>
struct has_to_json : std::false_type {};

template<typename BasicJsonType, typename T>
struct has_to_json < BasicJsonType, T, enable_if_t < !is_basic_json<T>::value >>
{
    using serializer = typename BasicJsonType::template json_serializer<T, void>;

    static constexpr bool value =
        is_detected_exact<void, to_json_function, serializer, BasicJsonType&,
        T>::value;
};

template<typename T>
using detect_key_compare = typename T::key_compare;

template<typename T>
struct has_key_compare : std::integral_constant<bool, is_detected<detect_key_compare, T>::value> {};

// obtains the actual object key comparator
template<typename BasicJsonType>
struct actual_object_comparator
{
    using object_t = typename BasicJsonType::object_t;
    using object_comparator_t = typename BasicJsonType::default_object_comparator_t;
    using type = typename std::conditional < has_key_compare<object_t>::value,
          typename object_t::key_compare, object_comparator_t>::type;
};

template<typename BasicJsonType>
using actual_object_comparator_t = typename actual_object_comparator<BasicJsonType>::type;

/////////////////
// char_traits //
/////////////////

// Primary template of char_traits calls std char_traits
template<typename T>
struct char_traits : std::char_traits<T>
{};

// Explicitly define char traits for unsigned char since it is not standard
template<>
struct char_traits<unsigned char> : std::char_traits<char>
{
    using char_type = unsigned char;
    using int_type = uint64_t;

    // Redefine to_int_type function
    static int_type to_int_type(char_type c) noexcept
    {
        return static_cast<int_type>(c);
    }

    static char_type to_char_type(int_type i) noexcept
    {
        return static_cast<char_type>(i);
    }

    static constexpr int_type eof() noexcept
    {
        return static_cast<int_type>(EOF);
    }
};

// Explicitly define char traits for signed char since it is not standard
template<>
struct char_traits<signed char> : std::char_traits<char>
{
    using char_type = signed char;
    using int_type = uint64_t;

    // Redefine to_int_type function
    static int_type to_int_type(char_type c) noexcept
    {
        return static_cast<int_type>(c);
    }

    static char_type to_char_type(int_type i) noexcept
    {
        return static_cast<char_type>(i);
    }

    static constexpr int_type eof() noexcept
    {
        return static_cast<int_type>(EOF);
    }
};

///////////////////
// is_ functions //
///////////////////

// https://en.cppreference.com/w/cpp/types/conjunction
template<class...> struct conjunction : std::true_type { };
template<class B> struct conjunction<B> : B { };
template<class B, class... Bn>
struct conjunction<B, Bn...>
: std::conditional<static_cast<bool>(B::value), conjunction<Bn...>, B>::type {};

// https://en.cppreference.com/w/cpp/types/negation
template<class B> struct negation : std::integral_constant < bool, !B::value > { };

// Reimplementation of is_constructible and is_default_constructible, due to them being broken for
// std::pair and std::tuple until LWG 2367 fix (see https://cplusplus.github.io/LWG/lwg-defects.html#2367).
// This causes compile errors in e.g. clang 3.5 or gcc 4.9.
template <typename T>
struct is_default_constructible : std::is_default_constructible<T> {};

template <typename T1, typename T2>
struct is_default_constructible<std::pair<T1, T2>>
            : conjunction<is_default_constructible<T1>, is_default_constructible<T2>> {};

template <typename T1, typename T2>
struct is_default_constructible<const std::pair<T1, T2>>
            : conjunction<is_default_constructible<T1>, is_default_constructible<T2>> {};

template <typename... Ts>
struct is_default_constructible<std::tuple<Ts...>>
            : conjunction<is_default_constructible<Ts>...> {};

template <typename... Ts>
struct is_default_constructible<const std::tuple<Ts...>>
            : conjunction<is_default_constructible<Ts>...> {};

template <typename T, typename... Args>
struct is_constructible : std::is_constructible<T, Args...> {};

template <typename T1, typename T2>
struct is_constructible<std::pair<T1, T2>> : is_default_constructible<std::pair<T1, T2>> {};

template <typename T1, typename T2>
struct is_constructible<const std::pair<T1, T2>> : is_default_constructible<const std::pair<T1, T2>> {};

template <typename... Ts>
struct is_constructible<std::tuple<Ts...>> : is_default_constructible<std::tuple<Ts...>> {};

template <typename... Ts>
struct is_constructible<const std::tuple<Ts...>> : is_default_constructible<const std::tuple<Ts...>> {};

template<typename T, typename = void>
struct is_iterator_traits : std::false_type {};

template<typename T>
struct is_iterator_traits<iterator_traits<T>>
{
  private:
    using traits = iterator_traits<T>;

  public:
    static constexpr auto value =
        is_detected<value_type_t, traits>::value &&
        is_detected<difference_type_t, traits>::value &&
        is_detected<pointer_t, traits>::value &&
        is_detected<iterator_category_t, traits>::value &&
        is_detected<reference_t, traits>::value;
};

template<typename T>
struct is_range
{
  private:
    using t_ref = typename std::add_lvalue_reference<T>::type;

    using iterator = detected_t<result_of_begin, t_ref>;
    using sentinel = detected_t<result_of_end, t_ref>;

    // to be 100% correct, it should use https://en.cppreference.com/w/cpp/iterator/input_or_output_iterator
    // and https://en.cppreference.com/w/cpp/iterator/sentinel_for
    // but reimplementing these would be too much work, as a lot of other concepts are used underneath
    static constexpr auto is_iterator_begin =
        is_iterator_traits<iterator_traits<iterator>>::value;

  public:
    static constexpr bool value = !std::is_same<iterator, nonesuch>::value && !std::is_same<sentinel, nonesuch>::value && is_iterator_begin;
};

template<typename R>
using iterator_t = enable_if_t<is_range<R>::value, result_of_begin<decltype(std::declval<R&>())>>;

template<typename T>
using range_value_t = value_type_t<iterator_traits<iterator_t<T>>>;

// The following implementation of is_complete_type is taken from
// https://blogs.msdn.microsoft.com/vcblog/2015/12/02/partial-support-for-expression-sfinae-in-vs-2015-update-1/
// and is written by Xiang Fan who agreed to using it in this library.

template<typename T, typename = void>
struct is_complete_type : std::false_type {};

template<typename T>
struct is_complete_type<T, decltype(void(sizeof(T)))> : std::true_type {};

template<typename BasicJsonType, typename CompatibleObjectType,
         typename = void>
struct is_compatible_object_type_impl : std::false_type {};

template<typename BasicJsonType, typename CompatibleObjectType>
struct is_compatible_object_type_impl <
    BasicJsonType, CompatibleObjectType,
    enable_if_t < is_detected<mapped_type_t, CompatibleObjectType>::value&&
    is_detected<key_type_t, CompatibleObjectType>::value >>
{
    using object_t = typename BasicJsonType::object_t;

    // macOS's is_constructible does not play well with nonesuch...
    static constexpr bool value =
        is_constructible<typename object_t::key_type,
        typename CompatibleObjectType::key_type>::value &&
        is_constructible<typename object_t::mapped_type,
        typename CompatibleObjectType::mapped_type>::value;
};

template<typename BasicJsonType, typename CompatibleObjectType>
struct is_compatible_object_type
    : is_compatible_object_type_impl<BasicJsonType, CompatibleObjectType> {};

template<typename BasicJsonType, typename ConstructibleObjectType,
         typename = void>
struct is_constructible_object_type_impl : std::false_type {};

template<typename BasicJsonType, typename ConstructibleObjectType>
struct is_constructible_object_type_impl <
    BasicJsonType, ConstructibleObjectType,
    enable_if_t < is_detected<mapped_type_t, ConstructibleObjectType>::value&&
    is_detected<key_type_t, ConstructibleObjectType>::value >>
{
    using object_t = typename BasicJsonType::object_t;

    static constexpr bool value =
        (is_default_constructible<ConstructibleObjectType>::value &&
         (std::is_move_assignable<ConstructibleObjectType>::value ||
          std::is_copy_assignable<ConstructibleObjectType>::value) &&
         (is_constructible<typename ConstructibleObjectType::key_type,
          typename object_t::key_type>::value &&
          std::is_same <
          typename object_t::mapped_type,
          typename ConstructibleObjectType::mapped_type >::value)) ||
        (has_from_json<BasicJsonType,
         typename ConstructibleObjectType::mapped_type>::value ||
         has_non_default_from_json <
         BasicJsonType,
         typename ConstructibleObjectType::mapped_type >::value);
};

template<typename BasicJsonType, typename ConstructibleObjectType>
struct is_constructible_object_type
    : is_constructible_object_type_impl<BasicJsonType,
      ConstructibleObjectType> {};

template<typename BasicJsonType, typename CompatibleStringType>
struct is_compatible_string_type
{
    static constexpr auto value =
        is_constructible<typename BasicJsonType::string_t, CompatibleStringType>::value;
};

template<typename BasicJsonType, typename ConstructibleStringType>
struct is_constructible_string_type
{
    // launder type through decltype() to fix compilation failure on ICPC
#ifdef __INTEL_COMPILER
    using laundered_type = decltype(std::declval<ConstructibleStringType>());
#else
    using laundered_type = ConstructibleStringType;
#endif

    static constexpr auto value =
        conjunction <
        is_constructible<laundered_type, typename BasicJsonType::string_t>,
        is_detected_exact<typename BasicJsonType::string_t::value_type,
        value_type_t, laundered_type >>::value;
};

template<typename BasicJsonType, typename CompatibleArrayType, typename = void>
struct is_compatible_array_type_impl : std::false_type {};

template<typename BasicJsonType, typename CompatibleArrayType>
struct is_compatible_array_type_impl <
    BasicJsonType, CompatibleArrayType,
    enable_if_t <
    is_detected<iterator_t, CompatibleArrayType>::value&&
    is_iterator_traits<iterator_traits<detected_t<iterator_t, CompatibleArrayType>>>::value&&
// special case for types like std::filesystem::path whose iterator's value_type are themselves
// c.f. https://github.com/nlohmann/json/pull/3073
    !std::is_same<CompatibleArrayType, detected_t<range_value_t, CompatibleArrayType>>::value >>
{
    static constexpr bool value =
        is_constructible<BasicJsonType,
        range_value_t<CompatibleArrayType>>::value;
};

template<typename BasicJsonType, typename CompatibleArrayType>
struct is_compatible_array_type
    : is_compatible_array_type_impl<BasicJsonType, CompatibleArrayType> {};

template<typename BasicJsonType, typename ConstructibleArrayType, typename = void>
struct is_constructible_array_type_impl : std::false_type {};

template<typename BasicJsonType, typename ConstructibleArrayType>
struct is_constructible_array_type_impl <
    BasicJsonType, ConstructibleArrayType,
    enable_if_t<std::is_same<ConstructibleArrayType,
    typename BasicJsonType::value_type>::value >>
            : std::true_type {};

template<typename BasicJsonType, typename ConstructibleArrayType>
struct is_constructible_array_type_impl <
    BasicJsonType, ConstructibleArrayType,
    enable_if_t < !std::is_same<ConstructibleArrayType,
    typename BasicJsonType::value_type>::value&&
    !is_compatible_string_type<BasicJsonType, ConstructibleArrayType>::value&&
    is_default_constructible<ConstructibleArrayType>::value&&
(std::is_move_assignable<ConstructibleArrayType>::value ||
 std::is_copy_assignable<ConstructibleArrayType>::value)&&
is_detected<iterator_t, ConstructibleArrayType>::value&&
is_iterator_traits<iterator_traits<detected_t<iterator_t, ConstructibleArrayType>>>::value&&
is_detected<range_value_t, ConstructibleArrayType>::value&&
// special case for types like std::filesystem::path whose iterator's value_type are themselves
// c.f. https://github.com/nlohmann/json/pull/3073
!std::is_same<ConstructibleArrayType, detected_t<range_value_t, ConstructibleArrayType>>::value&&
        is_complete_type <
        detected_t<range_value_t, ConstructibleArrayType >>::value >>
{
    using value_type = range_value_t<ConstructibleArrayType>;

    static constexpr bool value =
        std::is_same<value_type,
        typename BasicJsonType::array_t::value_type>::value ||
        has_from_json<BasicJsonType,
        value_type>::value ||
        has_non_default_from_json <
        BasicJsonType,
        value_type >::value;
};

template<typename BasicJsonType, typename ConstructibleArrayType>
struct is_constructible_array_type
    : is_constructible_array_type_impl<BasicJsonType, ConstructibleArrayType> {};

template<typename RealIntegerType, typename CompatibleNumberIntegerType,
         typename = void>
struct is_compatible_integer_type_impl : std::false_type {};

template<typename RealIntegerType, typename CompatibleNumberIntegerType>
struct is_compatible_integer_type_impl <
    RealIntegerType, CompatibleNumberIntegerType,
    enable_if_t < std::is_integral<RealIntegerType>::value&&
    std::is_integral<CompatibleNumberIntegerType>::value&&
    !std::is_same<bool, CompatibleNumberIntegerType>::value >>
{
    // is there an assert somewhere on overflows?
    using RealLimits = std::numeric_limits<RealIntegerType>;
    using CompatibleLimits = std::numeric_limits<CompatibleNumberIntegerType>;

    static constexpr auto value =
        is_constructible<RealIntegerType,
        CompatibleNumberIntegerType>::value &&
        CompatibleLimits::is_integer &&
        RealLimits::is_signed == CompatibleLimits::is_signed;
};

template<typename RealIntegerType, typename CompatibleNumberIntegerType>
struct is_compatible_integer_type
    : is_compatible_integer_type_impl<RealIntegerType,
      CompatibleNumberIntegerType> {};

template<typename BasicJsonType, typename CompatibleType, typename = void>
struct is_compatible_type_impl: std::false_type {};

template<typename BasicJsonType, typename CompatibleType>
struct is_compatible_type_impl <
    BasicJsonType, CompatibleType,
    enable_if_t<is_complete_type<CompatibleType>::value >>
{
    static constexpr bool value =
        has_to_json<BasicJsonType, CompatibleType>::value;
};

template<typename BasicJsonType, typename CompatibleType>
struct is_compatible_type
    : is_compatible_type_impl<BasicJsonType, CompatibleType> {};

template<typename T1, typename T2>
struct is_constructible_tuple : std::false_type {};

template<typename T1, typename... Args>
struct is_constructible_tuple<T1, std::tuple<Args...>> : conjunction<is_constructible<T1, Args>...> {};

template<typename BasicJsonType, typename T>
struct is_json_iterator_of : std::false_type {};

template<typename BasicJsonType>
struct is_json_iterator_of<BasicJsonType, typename BasicJsonType::iterator> : std::true_type {};

template<typename BasicJsonType>
struct is_json_iterator_of<BasicJsonType, typename BasicJsonType::const_iterator> : std::true_type
{};

// checks if a given type T is a template specialization of Primary
template<template <typename...> class Primary, typename T>
struct is_specialization_of : std::false_type {};

template<template <typename...> class Primary, typename... Args>
struct is_specialization_of<Primary, Primary<Args...>> : std::true_type {};

template<typename T>
using is_json_pointer = is_specialization_of<::nlohmann::json_pointer, uncvref_t<T>>;

// checks if A and B are comparable using Compare functor
template<typename Compare, typename A, typename B, typename = void>
struct is_comparable : std::false_type {};

template<typename Compare, typename A, typename B>
struct is_comparable<Compare, A, B, void_t<
decltype(std::declval<Compare>()(std::declval<A>(), std::declval<B>())),
decltype(std::declval<Compare>()(std::declval<B>(), std::declval<A>()))
>> : std::true_type {};

template<typename T>
using detect_is_transparent = typename T::is_transparent;

// type trait to check if KeyType can be used as object key (without a BasicJsonType)
// see is_usable_as_basic_json_key_type below
template<typename Comparator, typename ObjectKeyType, typename KeyTypeCVRef, bool RequireTransparentComparator = true,
         bool ExcludeObjectKeyType = RequireTransparentComparator, typename KeyType = uncvref_t<KeyTypeCVRef>>
using is_usable_as_key_type = typename std::conditional <
                              is_comparable<Comparator, ObjectKeyType, KeyTypeCVRef>::value
                              && !(ExcludeObjectKeyType && std::is_same<KeyType,
                                   ObjectKeyType>::value)
                              && (!RequireTransparentComparator
                                  || is_detected <detect_is_transparent, Comparator>::value)
                              && !is_json_pointer<KeyType>::value,
                              std::true_type,
                              std::false_type >::type;

// type trait to check if KeyType can be used as object key
// true if:
//   - KeyType is comparable with BasicJsonType::object_t::key_type
//   - if ExcludeObjectKeyType is true, KeyType is not BasicJsonType::object_t::key_type
//   - the comparator is transparent or RequireTransparentComparator is false
//   - KeyType is not a JSON iterator or json_pointer
template<typename BasicJsonType, typename KeyTypeCVRef, bool RequireTransparentComparator = true,
         bool ExcludeObjectKeyType = RequireTransparentComparator, typename KeyType = uncvref_t<KeyTypeCVRef>>
using is_usable_as_basic_json_key_type = typename std::conditional <
        is_usable_as_key_type<typename BasicJsonType::object_comparator_t,
        typename BasicJsonType::object_t::key_type, KeyTypeCVRef,
        RequireTransparentComparator, ExcludeObjectKeyType>::value
        && !is_json_iterator_of<BasicJsonType, KeyType>::value,
        std::true_type,
        std::false_type >::type;

template<typename ObjectType, typename KeyType>
using detect_erase_with_key_type = decltype(std::declval<ObjectType&>().erase(std::declval<KeyType>()));

// type trait to check if object_t has an erase() member functions accepting KeyType
template<typename BasicJsonType, typename KeyType>
using has_erase_with_key_type = typename std::conditional <
                                is_detected <
                                detect_erase_with_key_type,
                                typename BasicJsonType::object_t, KeyType >::value,
                                std::true_type,
                                std::false_type >::type;

// a naive helper to check if a type is an ordered_map (exploits the fact that
// ordered_map inherits capacity() from std::vector)
template <typename T>
struct is_ordered_map
{
    using one = char;

    struct two
    {
        char x[2]; // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
    };

    template <typename C> static one test( decltype(&C::capacity) ) ;
    template <typename C> static two test(...);

    enum { value = sizeof(test<T>(nullptr)) == sizeof(char) }; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
};

// to avoid useless casts (see https://github.com/nlohmann/json/issues/2893#issuecomment-889152324)
template < typename T, typename U, enable_if_t < !std::is_same<T, U>::value, int > = 0 >
T conditional_static_cast(U value)
{
    return static_cast<T>(value);
}

template<typename T, typename U, enable_if_t<std::is_same<T, U>::value, int> = 0>
T conditional_static_cast(U value)
{
    return value;
}

template<typename... Types>
using all_integral = conjunction<std::is_integral<Types>...>;

template<typename... Types>
using all_signed = conjunction<std::is_signed<Types>...>;

template<typename... Types>
using all_unsigned = conjunction<std::is_unsigned<Types>...>;

// there's a disjunction trait in another PR; replace when merged
template<typename... Types>
using same_sign = std::integral_constant < bool,
      all_signed<Types...>::value || all_unsigned<Types...>::value >;

template<typename OfType, typename T>
using never_out_of_range = std::integral_constant < bool,
      (std::is_signed<OfType>::value && (sizeof(T) < sizeof(OfType)))
      || (same_sign<OfType, T>::value && sizeof(OfType) == sizeof(T)) >;

template<typename OfType, typename T,
         bool OfTypeSigned = std::is_signed<OfType>::value,
         bool TSigned = std::is_signed<T>::value>
struct value_in_range_of_impl2;

template<typename OfType, typename T>
struct value_in_range_of_impl2<OfType, T, false, false>
{
    static constexpr bool test(T val)
    {
        using CommonType = typename std::common_type<OfType, T>::type;
        return static_cast<CommonType>(val) <= static_cast<CommonType>((std::numeric_limits<OfType>::max)());
    }
};

template<typename OfType, typename T>
struct value_in_range_of_impl2<OfType, T, true, false>
{
    static constexpr bool test(T val)
    {
        using CommonType = typename std::common_type<OfType, T>::type;
        return static_cast<CommonType>(val) <= static_cast<CommonType>((std::numeric_limits<OfType>::max)());
    }
};

template<typename OfType, typename T>
struct value_in_range_of_impl2<OfType, T, false, true>
{
    static constexpr bool test(T val)
    {
        using CommonType = typename std::common_type<OfType, T>::type;
        return val >= 0 && static_cast<CommonType>(val) <= static_cast<CommonType>((std::numeric_limits<OfType>::max)());
    }
};

template<typename OfType, typename T>
struct value_in_range_of_impl2<OfType, T, true, true>
{
    static constexpr bool test(T val)
    {
        using CommonType = typename std::common_type<OfType, T>::type;
        return static_cast<CommonType>(val) >= static_cast<CommonType>((std::numeric_limits<OfType>::min)())
               && static_cast<CommonType>(val) <= static_cast<CommonType>((std::numeric_limits<OfType>::max)());
    }
};

template<typename OfType, typename T,
         bool NeverOutOfRange = never_out_of_range<OfType, T>::value,
         typename = detail::enable_if_t<all_integral<OfType, T>::value>>
struct value_in_range_of_impl1;

template<typename OfType, typename T>
struct value_in_range_of_impl1<OfType, T, false>
{
    static constexpr bool test(T val)
    {
        return value_in_range_of_impl2<OfType, T>::test(val);
    }
};

template<typename OfType, typename T>
struct value_in_range_of_impl1<OfType, T, true>
{
    static constexpr bool test(T /*val*/)
    {
        return true;
    }
};

template<typename OfType, typename T>
inline constexpr bool value_in_range_of(T val)
{
    return value_in_range_of_impl1<OfType, T>::test(val);
}

template<bool Value>
using bool_constant = std::integral_constant<bool, Value>;

///////////////////////////////////////////////////////////////////////////////
// is_c_string
///////////////////////////////////////////////////////////////////////////////

namespace impl
{

template<typename T>
inline constexpr bool is_c_string()
{
    using TUnExt = typename std::remove_extent<T>::type;
    using TUnCVExt = typename std::remove_cv<TUnExt>::type;
    using TUnPtr = typename std::remove_pointer<T>::type;
    using TUnCVPtr = typename std::remove_cv<TUnPtr>::type;
    return
        (std::is_array<T>::value && std::is_same<TUnCVExt, char>::value)
        || (std::is_pointer<T>::value && std::is_same<TUnCVPtr, char>::value);
}

}  // namespace impl

// checks whether T is a [cv] char */[cv] char[] C string
template<typename T>
struct is_c_string : bool_constant<impl::is_c_string<T>()> {};

template<typename T>
using is_c_string_uncvref = is_c_string<uncvref_t<T>>;

///////////////////////////////////////////////////////////////////////////////
// is_transparent
///////////////////////////////////////////////////////////////////////////////

namespace impl
{

template<typename T>
inline constexpr bool is_transparent()
{
    return is_detected<detect_is_transparent, T>::value;
}

}  // namespace impl

// checks whether T has a member named is_transparent
template<typename T>
struct is_transparent : bool_constant<impl::is_transparent<T>()> {};

///////////////////////////////////////////////////////////////////////////////

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/string_concat.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <cstring> // strlen
#include <string> // string
#include <utility> // forward

// #include <nlohmann/detail/meta/cpp_future.hpp>

// #include <nlohmann/detail/meta/detected.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

inline std::size_t concat_length()
{
    return 0;
}

template<typename... Args>
inline std::size_t concat_length(const char* cstr, const Args& ... rest);

template<typename StringType, typename... Args>
inline std::size_t concat_length(const StringType& str, const Args& ... rest);

template<typename... Args>
inline std::size_t concat_length(const char /*c*/, const Args& ... rest)
{
    return 1 + concat_length(rest...);
}

template<typename... Args>
inline std::size_t concat_length(const char* cstr, const Args& ... rest)
{
    // cppcheck-suppress ignoredReturnValue
    return ::strlen(cstr) + concat_length(rest...);
}

template<typename StringType, typename... Args>
inline std::size_t concat_length(const StringType& str, const Args& ... rest)
{
    return str.size() + concat_length(rest...);
}

template<typename OutStringType>
inline void concat_into(OutStringType& /*out*/)
{}

template<typename StringType, typename Arg>
using string_can_append = decltype(std::declval<StringType&>().append(std::declval < Arg && > ()));

template<typename StringType, typename Arg>
using detect_string_can_append = is_detected<string_can_append, StringType, Arg>;

template<typename StringType, typename Arg>
using string_can_append_op = decltype(std::declval<StringType&>() += std::declval < Arg && > ());

template<typename StringType, typename Arg>
using detect_string_can_append_op = is_detected<string_can_append_op, StringType, Arg>;

template<typename StringType, typename Arg>
using string_can_append_iter = decltype(std::declval<StringType&>().append(std::declval<const Arg&>().begin(), std::declval<const Arg&>().end()));

template<typename StringType, typename Arg>
using detect_string_can_append_iter = is_detected<string_can_append_iter, StringType, Arg>;

template<typename StringType, typename Arg>
using string_can_append_data = decltype(std::declval<StringType&>().append(std::declval<const Arg&>().data(), std::declval<const Arg&>().size()));

template<typename StringType, typename Arg>
using detect_string_can_append_data = is_detected<string_can_append_data, StringType, Arg>;

template < typename OutStringType, typename Arg, typename... Args,
           enable_if_t < !detect_string_can_append<OutStringType, Arg>::value
                         && detect_string_can_append_op<OutStringType, Arg>::value, int > = 0 >
inline void concat_into(OutStringType& out, Arg && arg, Args && ... rest);

template < typename OutStringType, typename Arg, typename... Args,
           enable_if_t < !detect_string_can_append<OutStringType, Arg>::value
                         && !detect_string_can_append_op<OutStringType, Arg>::value
                         && detect_string_can_append_iter<OutStringType, Arg>::value, int > = 0 >
inline void concat_into(OutStringType& out, const Arg& arg, Args && ... rest);

template < typename OutStringType, typename Arg, typename... Args,
           enable_if_t < !detect_string_can_append<OutStringType, Arg>::value
                         && !detect_string_can_append_op<OutStringType, Arg>::value
                         && !detect_string_can_append_iter<OutStringType, Arg>::value
                         && detect_string_can_append_data<OutStringType, Arg>::value, int > = 0 >
inline void concat_into(OutStringType& out, const Arg& arg, Args && ... rest);

template<typename OutStringType, typename Arg, typename... Args,
         enable_if_t<detect_string_can_append<OutStringType, Arg>::value, int> = 0>
inline void concat_into(OutStringType& out, Arg && arg, Args && ... rest)
{
    out.append(std::forward<Arg>(arg));
    concat_into(out, std::forward<Args>(rest)...);
}

template < typename OutStringType, typename Arg, typename... Args,
           enable_if_t < !detect_string_can_append<OutStringType, Arg>::value
                         && detect_string_can_append_op<OutStringType, Arg>::value, int > >
inline void concat_into(OutStringType& out, Arg&& arg, Args&& ... rest)
{
    out += std::forward<Arg>(arg);
    concat_into(out, std::forward<Args>(rest)...);
}

template < typename OutStringType, typename Arg, typename... Args,
           enable_if_t < !detect_string_can_append<OutStringType, Arg>::value
                         && !detect_string_can_append_op<OutStringType, Arg>::value
                         && detect_string_can_append_iter<OutStringType, Arg>::value, int > >
inline void concat_into(OutStringType& out, const Arg& arg, Args&& ... rest)
{
    out.append(arg.begin(), arg.end());
    concat_into(out, std::forward<Args>(rest)...);
}

template < typename OutStringType, typename Arg, typename... Args,
           enable_if_t < !detect_string_can_append<OutStringType, Arg>::value
                         && !detect_string_can_append_op<OutStringType, Arg>::value
                         && !detect_string_can_append_iter<OutStringType, Arg>::value
                         && detect_string_can_append_data<OutStringType, Arg>::value, int > >
inline void concat_into(OutStringType& out, const Arg& arg, Args&& ... rest)
{
    out.append(arg.data(), arg.size());
    concat_into(out, std::forward<Args>(rest)...);
}

template<typename OutStringType = std::string, typename... Args>
inline OutStringType concat(Args && ... args)
{
    OutStringType str;
    str.reserve(concat_length(args...));
    concat_into(str, std::forward<Args>(args)...);
    return str;
}

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

////////////////
// exceptions //
////////////////

/// @brief general exception of the @ref basic_json class
/// @sa https://json.nlohmann.me/api/basic_json/exception/
class exception : public std::exception
{
  public:
    /// returns the explanatory string
    const char* what() const noexcept override
    {
        return m.what();
    }

    /// the id of the exception
    const int id; // NOLINT(cppcoreguidelines-non-private-member-variables-in-classes)

  protected:
    JSON_HEDLEY_NON_NULL(3)
    exception(int id_, const char* what_arg) : id(id_), m(what_arg) {} // NOLINT(bugprone-throw-keyword-missing)

    static std::string name(const std::string& ename, int id_)
    {
        return concat("[json.exception.", ename, '.', std::to_string(id_), "] ");
    }

    static std::string diagnostics(std::nullptr_t /*leaf_element*/)
    {
        return "";
    }

    template<typename BasicJsonType>
    static std::string diagnostics(const BasicJsonType* leaf_element)
    {
#if JSON_DIAGNOSTICS
        std::vector<std::string> tokens;
        for (const auto* current = leaf_element; current != nullptr && current->m_parent != nullptr; current = current->m_parent)
        {
            switch (current->m_parent->type())
            {
                case value_t::array:
                {
                    for (std::size_t i = 0; i < current->m_parent->m_data.m_value.array->size(); ++i)
                    {
                        if (&current->m_parent->m_data.m_value.array->operator[](i) == current)
                        {
                            tokens.emplace_back(std::to_string(i));
                            break;
                        }
                    }
                    break;
                }

                case value_t::object:
                {
                    for (const auto& element : *current->m_parent->m_data.m_value.object)
                    {
                        if (&element.second == current)
                        {
                            tokens.emplace_back(element.first.c_str());
                            break;
                        }
                    }
                    break;
                }

                case value_t::null: // LCOV_EXCL_LINE
                case value_t::string: // LCOV_EXCL_LINE
                case value_t::boolean: // LCOV_EXCL_LINE
                case value_t::number_integer: // LCOV_EXCL_LINE
                case value_t::number_unsigned: // LCOV_EXCL_LINE
                case value_t::number_float: // LCOV_EXCL_LINE
                case value_t::binary: // LCOV_EXCL_LINE
                case value_t::discarded: // LCOV_EXCL_LINE
                default:   // LCOV_EXCL_LINE
                    break; // LCOV_EXCL_LINE
            }
        }

        if (tokens.empty())
        {
            return "";
        }

        auto str = std::accumulate(tokens.rbegin(), tokens.rend(), std::string{},
                                   [](const std::string & a, const std::string & b)
        {
            return concat(a, '/', detail::escape(b));
        });
        return concat('(', str, ") ");
#else
        static_cast<void>(leaf_element);
        return "";
#endif
    }

  private:
    /// an exception object as storage for error messages
    std::runtime_error m;
};

/// @brief exception indicating a parse error
/// @sa https://json.nlohmann.me/api/basic_json/parse_error/
class parse_error : public exception
{
  public:
    /*!
    @brief create a parse error exception
    @param[in] id_       the id of the exception
    @param[in] pos       the position where the error occurred (or with
                         chars_read_total=0 if the position cannot be
                         determined)
    @param[in] what_arg  the explanatory string
    @return parse_error object
    */
    template<typename BasicJsonContext, enable_if_t<is_basic_json_context<BasicJsonContext>::value, int> = 0>
    static parse_error create(int id_, const position_t& pos, const std::string& what_arg, BasicJsonContext context)
    {
        const std::string w = concat(exception::name("parse_error", id_), "parse error",
                                     position_string(pos), ": ", exception::diagnostics(context), what_arg);
        return {id_, pos.chars_read_total, w.c_str()};
    }

    template<typename BasicJsonContext, enable_if_t<is_basic_json_context<BasicJsonContext>::value, int> = 0>
    static parse_error create(int id_, std::size_t byte_, const std::string& what_arg, BasicJsonContext context)
    {
        const std::string w = concat(exception::name("parse_error", id_), "parse error",
                                     (byte_ != 0 ? (concat(" at byte ", std::to_string(byte_))) : ""),
                                     ": ", exception::diagnostics(context), what_arg);
        return {id_, byte_, w.c_str()};
    }

    /*!
    @brief byte index of the parse error

    The byte index of the last read character in the input file.

    @note For an input with n bytes, 1 is the index of the first character and
          n+1 is the index of the terminating null byte or the end of file.
          This also holds true when reading a byte vector (CBOR or MessagePack).
    */
    const std::size_t byte;

  private:
    parse_error(int id_, std::size_t byte_, const char* what_arg)
        : exception(id_, what_arg), byte(byte_) {}

    static std::string position_string(const position_t& pos)
    {
        return concat(" at line ", std::to_string(pos.lines_read + 1),
                      ", column ", std::to_string(pos.chars_read_current_line));
    }
};

/// @brief exception indicating errors with iterators
/// @sa https://json.nlohmann.me/api/basic_json/invalid_iterator/
class invalid_iterator : public exception
{
  public:
    template<typename BasicJsonContext, enable_if_t<is_basic_json_context<BasicJsonContext>::value, int> = 0>
    static invalid_iterator create(int id_, const std::string& what_arg, BasicJsonContext context)
    {
        const std::string w = concat(exception::name("invalid_iterator", id_), exception::diagnostics(context), what_arg);
        return {id_, w.c_str()};
    }

  private:
    JSON_HEDLEY_NON_NULL(3)
    invalid_iterator(int id_, const char* what_arg)
        : exception(id_, what_arg) {}
};

/// @brief exception indicating executing a member function with a wrong type
/// @sa https://json.nlohmann.me/api/basic_json/type_error/
class type_error : public exception
{
  public:
    template<typename BasicJsonContext, enable_if_t<is_basic_json_context<BasicJsonContext>::value, int> = 0>
    static type_error create(int id_, const std::string& what_arg, BasicJsonContext context)
    {
        const std::string w = concat(exception::name("type_error", id_), exception::diagnostics(context), what_arg);
        return {id_, w.c_str()};
    }

  private:
    JSON_HEDLEY_NON_NULL(3)
    type_error(int id_, const char* what_arg) : exception(id_, what_arg) {}
};

/// @brief exception indicating access out of the defined range
/// @sa https://json.nlohmann.me/api/basic_json/out_of_range/
class out_of_range : public exception
{
  public:
    template<typename BasicJsonContext, enable_if_t<is_basic_json_context<BasicJsonContext>::value, int> = 0>
    static out_of_range create(int id_, const std::string& what_arg, BasicJsonContext context)
    {
        const std::string w = concat(exception::name("out_of_range", id_), exception::diagnostics(context), what_arg);
        return {id_, w.c_str()};
    }

  private:
    JSON_HEDLEY_NON_NULL(3)
    out_of_range(int id_, const char* what_arg) : exception(id_, what_arg) {}
};

/// @brief exception indicating other library errors
/// @sa https://json.nlohmann.me/api/basic_json/other_error/
class other_error : public exception
{
  public:
    template<typename BasicJsonContext, enable_if_t<is_basic_json_context<BasicJsonContext>::value, int> = 0>
    static other_error create(int id_, const std::string& what_arg, BasicJsonContext context)
    {
        const std::string w = concat(exception::name("other_error", id_), exception::diagnostics(context), what_arg);
        return {id_, w.c_str()};
    }

  private:
    JSON_HEDLEY_NON_NULL(3)
    other_error(int id_, const char* what_arg) : exception(id_, what_arg) {}
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/cpp_future.hpp>

// #include <nlohmann/detail/meta/identity_tag.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


// #include <nlohmann/detail/abi_macros.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

// dispatching helper struct
template <class T> struct identity_tag {};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/meta/std_fs.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


// #include <nlohmann/detail/macro_scope.hpp>


#if JSON_HAS_EXPERIMENTAL_FILESYSTEM
#include <experimental/filesystem>
NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{
namespace std_fs = std::experimental::filesystem;
}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END
#elif JSON_HAS_FILESYSTEM
#include <filesystem>
NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{
namespace std_fs = std::filesystem;
}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END
#endif

// #include <nlohmann/detail/meta/type_traits.hpp>

// #include <nlohmann/detail/string_concat.hpp>

// #include <nlohmann/detail/value_t.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

template<typename BasicJsonType>
inline void from_json(const BasicJsonType& j, typename std::nullptr_t& n)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_null()))
    {
        JSON_THROW(type_error::create(302, concat("type must be null, but is ", j.type_name()), &j));
    }
    n = nullptr;
}

// overloads for basic_json template parameters
template < typename BasicJsonType, typename ArithmeticType,
           enable_if_t < std::is_arithmetic<ArithmeticType>::value&&
                         !std::is_same<ArithmeticType, typename BasicJsonType::boolean_t>::value,
                         int > = 0 >
void get_arithmetic_value(const BasicJsonType& j, ArithmeticType& val)
{
    switch (static_cast<value_t>(j))
    {
        case value_t::number_unsigned:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_unsigned_t*>());
            break;
        }
        case value_t::number_integer:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_integer_t*>());
            break;
        }
        case value_t::number_float:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_float_t*>());
            break;
        }

        case value_t::null:
        case value_t::object:
        case value_t::array:
        case value_t::string:
        case value_t::boolean:
        case value_t::binary:
        case value_t::discarded:
        default:
            JSON_THROW(type_error::create(302, concat("type must be number, but is ", j.type_name()), &j));
    }
}

template<typename BasicJsonType>
inline void from_json(const BasicJsonType& j, typename BasicJsonType::boolean_t& b)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_boolean()))
    {
        JSON_THROW(type_error::create(302, concat("type must be boolean, but is ", j.type_name()), &j));
    }
    b = *j.template get_ptr<const typename BasicJsonType::boolean_t*>();
}

template<typename BasicJsonType>
inline void from_json(const BasicJsonType& j, typename BasicJsonType::string_t& s)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_string()))
    {
        JSON_THROW(type_error::create(302, concat("type must be string, but is ", j.type_name()), &j));
    }
    s = *j.template get_ptr<const typename BasicJsonType::string_t*>();
}

template <
    typename BasicJsonType, typename StringType,
    enable_if_t <
        std::is_assignable<StringType&, const typename BasicJsonType::string_t>::value
        && is_detected_exact<typename BasicJsonType::string_t::value_type, value_type_t, StringType>::value
        && !std::is_same<typename BasicJsonType::string_t, StringType>::value
        && !is_json_ref<StringType>::value, int > = 0 >
inline void from_json(const BasicJsonType& j, StringType& s)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_string()))
    {
        JSON_THROW(type_error::create(302, concat("type must be string, but is ", j.type_name()), &j));
    }

    s = *j.template get_ptr<const typename BasicJsonType::string_t*>();
}

template<typename BasicJsonType>
inline void from_json(const BasicJsonType& j, typename BasicJsonType::number_float_t& val)
{
    get_arithmetic_value(j, val);
}

template<typename BasicJsonType>
inline void from_json(const BasicJsonType& j, typename BasicJsonType::number_unsigned_t& val)
{
    get_arithmetic_value(j, val);
}

template<typename BasicJsonType>
inline void from_json(const BasicJsonType& j, typename BasicJsonType::number_integer_t& val)
{
    get_arithmetic_value(j, val);
}

#if !JSON_DISABLE_ENUM_SERIALIZATION
template<typename BasicJsonType, typename EnumType,
         enable_if_t<std::is_enum<EnumType>::value, int> = 0>
inline void from_json(const BasicJsonType& j, EnumType& e)
{
    typename std::underlying_type<EnumType>::type val;
    get_arithmetic_value(j, val);
    e = static_cast<EnumType>(val);
}
#endif  // JSON_DISABLE_ENUM_SERIALIZATION

// forward_list doesn't have an insert method
template<typename BasicJsonType, typename T, typename Allocator,
         enable_if_t<is_getable<BasicJsonType, T>::value, int> = 0>
inline void from_json(const BasicJsonType& j, std::forward_list<T, Allocator>& l)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
    {
        JSON_THROW(type_error::create(302, concat("type must be array, but is ", j.type_name()), &j));
    }
    l.clear();
    std::transform(j.rbegin(), j.rend(),
                   std::front_inserter(l), [](const BasicJsonType & i)
    {
        return i.template get<T>();
    });
}

// valarray doesn't have an insert method
template<typename BasicJsonType, typename T,
         enable_if_t<is_getable<BasicJsonType, T>::value, int> = 0>
inline void from_json(const BasicJsonType& j, std::valarray<T>& l)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
    {
        JSON_THROW(type_error::create(302, concat("type must be array, but is ", j.type_name()), &j));
    }
    l.resize(j.size());
    std::transform(j.begin(), j.end(), std::begin(l),
                   [](const BasicJsonType & elem)
    {
        return elem.template get<T>();
    });
}

template<typename BasicJsonType, typename T, std::size_t N>
auto from_json(const BasicJsonType& j, T (&arr)[N])  // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
-> decltype(j.template get<T>(), void())
{
    for (std::size_t i = 0; i < N; ++i)
    {
        arr[i] = j.at(i).template get<T>();
    }
}

template<typename BasicJsonType>
inline void from_json_array_impl(const BasicJsonType& j, typename BasicJsonType::array_t& arr, priority_tag<3> /*unused*/)
{
    arr = *j.template get_ptr<const typename BasicJsonType::array_t*>();
}

template<typename BasicJsonType, typename T, std::size_t N>
auto from_json_array_impl(const BasicJsonType& j, std::array<T, N>& arr,
                          priority_tag<2> /*unused*/)
-> decltype(j.template get<T>(), void())
{
    for (std::size_t i = 0; i < N; ++i)
    {
        arr[i] = j.at(i).template get<T>();
    }
}

template<typename BasicJsonType, typename ConstructibleArrayType,
         enable_if_t<
             std::is_assignable<ConstructibleArrayType&, ConstructibleArrayType>::value,
             int> = 0>
auto from_json_array_impl(const BasicJsonType& j, ConstructibleArrayType& arr, priority_tag<1> /*unused*/)
-> decltype(
    arr.reserve(std::declval<typename ConstructibleArrayType::size_type>()),
    j.template get<typename ConstructibleArrayType::value_type>(),
    void())
{
    using std::end;

    ConstructibleArrayType ret;
    ret.reserve(j.size());
    std::transform(j.begin(), j.end(),
                   std::inserter(ret, end(ret)), [](const BasicJsonType & i)
    {
        // get<BasicJsonType>() returns *this, this won't call a from_json
        // method when value_type is BasicJsonType
        return i.template get<typename ConstructibleArrayType::value_type>();
    });
    arr = std::move(ret);
}

template<typename BasicJsonType, typename ConstructibleArrayType,
         enable_if_t<
             std::is_assignable<ConstructibleArrayType&, ConstructibleArrayType>::value,
             int> = 0>
inline void from_json_array_impl(const BasicJsonType& j, ConstructibleArrayType& arr,
                                 priority_tag<0> /*unused*/)
{
    using std::end;

    ConstructibleArrayType ret;
    std::transform(
        j.begin(), j.end(), std::inserter(ret, end(ret)),
        [](const BasicJsonType & i)
    {
        // get<BasicJsonType>() returns *this, this won't call a from_json
        // method when value_type is BasicJsonType
        return i.template get<typename ConstructibleArrayType::value_type>();
    });
    arr = std::move(ret);
}

template < typename BasicJsonType, typename ConstructibleArrayType,
           enable_if_t <
               is_constructible_array_type<BasicJsonType, ConstructibleArrayType>::value&&
               !is_constructible_object_type<BasicJsonType, ConstructibleArrayType>::value&&
               !is_constructible_string_type<BasicJsonType, ConstructibleArrayType>::value&&
               !std::is_same<ConstructibleArrayType, typename BasicJsonType::binary_t>::value&&
               !is_basic_json<ConstructibleArrayType>::value,
               int > = 0 >
auto from_json(const BasicJsonType& j, ConstructibleArrayType& arr)
-> decltype(from_json_array_impl(j, arr, priority_tag<3> {}),
j.template get<typename ConstructibleArrayType::value_type>(),
void())
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
    {
        JSON_THROW(type_error::create(302, concat("type must be array, but is ", j.type_name()), &j));
    }

    from_json_array_impl(j, arr, priority_tag<3> {});
}

template < typename BasicJsonType, typename T, std::size_t... Idx >
std::array<T, sizeof...(Idx)> from_json_inplace_array_impl(BasicJsonType&& j,
        identity_tag<std::array<T, sizeof...(Idx)>> /*unused*/, index_sequence<Idx...> /*unused*/)
{
    return { { std::forward<BasicJsonType>(j).at(Idx).template get<T>()... } };
}

template < typename BasicJsonType, typename T, std::size_t N >
auto from_json(BasicJsonType&& j, identity_tag<std::array<T, N>> tag)
-> decltype(from_json_inplace_array_impl(std::forward<BasicJsonType>(j), tag, make_index_sequence<N> {}))
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
    {
        JSON_THROW(type_error::create(302, concat("type must be array, but is ", j.type_name()), &j));
    }

    return from_json_inplace_array_impl(std::forward<BasicJsonType>(j), tag, make_index_sequence<N> {});
}

template<typename BasicJsonType>
inline void from_json(const BasicJsonType& j, typename BasicJsonType::binary_t& bin)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_binary()))
    {
        JSON_THROW(type_error::create(302, concat("type must be binary, but is ", j.type_name()), &j));
    }

    bin = *j.template get_ptr<const typename BasicJsonType::binary_t*>();
}

template<typename BasicJsonType, typename ConstructibleObjectType,
         enable_if_t<is_constructible_object_type<BasicJsonType, ConstructibleObjectType>::value, int> = 0>
inline void from_json(const BasicJsonType& j, ConstructibleObjectType& obj)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_object()))
    {
        JSON_THROW(type_error::create(302, concat("type must be object, but is ", j.type_name()), &j));
    }

    ConstructibleObjectType ret;
    const auto* inner_object = j.template get_ptr<const typename BasicJsonType::object_t*>();
    using value_type = typename ConstructibleObjectType::value_type;
    std::transform(
        inner_object->begin(), inner_object->end(),
        std::inserter(ret, ret.begin()),
        [](typename BasicJsonType::object_t::value_type const & p)
    {
        return value_type(p.first, p.second.template get<typename ConstructibleObjectType::mapped_type>());
    });
    obj = std::move(ret);
}

// overload for arithmetic types, not chosen for basic_json template arguments
// (BooleanType, etc..); note: Is it really necessary to provide explicit
// overloads for boolean_t etc. in case of a custom BooleanType which is not
// an arithmetic type?
template < typename BasicJsonType, typename ArithmeticType,
           enable_if_t <
               std::is_arithmetic<ArithmeticType>::value&&
               !std::is_same<ArithmeticType, typename BasicJsonType::number_unsigned_t>::value&&
               !std::is_same<ArithmeticType, typename BasicJsonType::number_integer_t>::value&&
               !std::is_same<ArithmeticType, typename BasicJsonType::number_float_t>::value&&
               !std::is_same<ArithmeticType, typename BasicJsonType::boolean_t>::value,
               int > = 0 >
inline void from_json(const BasicJsonType& j, ArithmeticType& val)
{
    switch (static_cast<value_t>(j))
    {
        case value_t::number_unsigned:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_unsigned_t*>());
            break;
        }
        case value_t::number_integer:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_integer_t*>());
            break;
        }
        case value_t::number_float:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_float_t*>());
            break;
        }
        case value_t::boolean:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::boolean_t*>());
            break;
        }

        case value_t::null:
        case value_t::object:
        case value_t::array:
        case value_t::string:
        case value_t::binary:
        case value_t::discarded:
        default:
            JSON_THROW(type_error::create(302, concat("type must be number, but is ", j.type_name()), &j));
    }
}

template<typename BasicJsonType, typename... Args, std::size_t... Idx>
std::tuple<Args...> from_json_tuple_impl_base(BasicJsonType&& j, index_sequence<Idx...> /*unused*/)
{
    return std::make_tuple(std::forward<BasicJsonType>(j).at(Idx).template get<Args>()...);
}

template < typename BasicJsonType, class A1, class A2 >
std::pair<A1, A2> from_json_tuple_impl(BasicJsonType&& j, identity_tag<std::pair<A1, A2>> /*unused*/, priority_tag<0> /*unused*/)
{
    return {std::forward<BasicJsonType>(j).at(0).template get<A1>(),
            std::forward<BasicJsonType>(j).at(1).template get<A2>()};
}

template<typename BasicJsonType, typename A1, typename A2>
inline void from_json_tuple_impl(BasicJsonType&& j, std::pair<A1, A2>& p, priority_tag<1> /*unused*/)
{
    p = from_json_tuple_impl(std::forward<BasicJsonType>(j), identity_tag<std::pair<A1, A2>> {}, priority_tag<0> {});
}

template<typename BasicJsonType, typename... Args>
std::tuple<Args...> from_json_tuple_impl(BasicJsonType&& j, identity_tag<std::tuple<Args...>> /*unused*/, priority_tag<2> /*unused*/)
{
    return from_json_tuple_impl_base<BasicJsonType, Args...>(std::forward<BasicJsonType>(j), index_sequence_for<Args...> {});
}

template<typename BasicJsonType, typename... Args>
inline void from_json_tuple_impl(BasicJsonType&& j, std::tuple<Args...>& t, priority_tag<3> /*unused*/)
{
    t = from_json_tuple_impl_base<BasicJsonType, Args...>(std::forward<BasicJsonType>(j), index_sequence_for<Args...> {});
}

template<typename BasicJsonType, typename TupleRelated>
auto from_json(BasicJsonType&& j, TupleRelated&& t)
-> decltype(from_json_tuple_impl(std::forward<BasicJsonType>(j), std::forward<TupleRelated>(t), priority_tag<3> {}))
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
    {
        JSON_THROW(type_error::create(302, concat("type must be array, but is ", j.type_name()), &j));
    }

    return from_json_tuple_impl(std::forward<BasicJsonType>(j), std::forward<TupleRelated>(t), priority_tag<3> {});
}

template < typename BasicJsonType, typename Key, typename Value, typename Compare, typename Allocator,
           typename = enable_if_t < !std::is_constructible <
                                        typename BasicJsonType::string_t, Key >::value >>
inline void from_json(const BasicJsonType& j, std::map<Key, Value, Compare, Allocator>& m)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
    {
        JSON_THROW(type_error::create(302, concat("type must be array, but is ", j.type_name()), &j));
    }
    m.clear();
    for (const auto& p : j)
    {
        if (JSON_HEDLEY_UNLIKELY(!p.is_array()))
        {
            JSON_THROW(type_error::create(302, concat("type must be array, but is ", p.type_name()), &j));
        }
        m.emplace(p.at(0).template get<Key>(), p.at(1).template get<Value>());
    }
}

template < typename BasicJsonType, typename Key, typename Value, typename Hash, typename KeyEqual, typename Allocator,
           typename = enable_if_t < !std::is_constructible <
                                        typename BasicJsonType::string_t, Key >::value >>
inline void from_json(const BasicJsonType& j, std::unordered_map<Key, Value, Hash, KeyEqual, Allocator>& m)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
    {
        JSON_THROW(type_error::create(302, concat("type must be array, but is ", j.type_name()), &j));
    }
    m.clear();
    for (const auto& p : j)
    {
        if (JSON_HEDLEY_UNLIKELY(!p.is_array()))
        {
            JSON_THROW(type_error::create(302, concat("type must be array, but is ", p.type_name()), &j));
        }
        m.emplace(p.at(0).template get<Key>(), p.at(1).template get<Value>());
    }
}

#if JSON_HAS_FILESYSTEM || JSON_HAS_EXPERIMENTAL_FILESYSTEM
template<typename BasicJsonType>
inline void from_json(const BasicJsonType& j, std_fs::path& p)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_string()))
    {
        JSON_THROW(type_error::create(302, concat("type must be string, but is ", j.type_name()), &j));
    }
    p = *j.template get_ptr<const typename BasicJsonType::string_t*>();
}
#endif

struct from_json_fn
{
    template<typename BasicJsonType, typename T>
    auto operator()(const BasicJsonType& j, T&& val) const
    noexcept(noexcept(from_json(j, std::forward<T>(val))))
    -> decltype(from_json(j, std::forward<T>(val)))
    {
        return from_json(j, std::forward<T>(val));
    }
};

}  // namespace detail

#ifndef JSON_HAS_CPP_17
/// namespace to hold default `from_json` function
/// to see why this is required:
/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4381.html
namespace // NOLINT(cert-dcl59-cpp,fuchsia-header-anon-namespaces,google-build-namespaces)
{
#endif
JSON_INLINE_VARIABLE constexpr const auto& from_json = // NOLINT(misc-definitions-in-headers)
    detail::static_const<detail::from_json_fn>::value;
#ifndef JSON_HAS_CPP_17
}  // namespace
#endif

NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/conversions/to_json.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <algorithm> // copy
#include <iterator> // begin, end
#include <string> // string
#include <tuple> // tuple, get
#include <type_traits> // is_same, is_constructible, is_floating_point, is_enum, underlying_type
#include <utility> // move, forward, declval, pair
#include <valarray> // valarray
#include <vector> // vector

// #include <nlohmann/detail/iterators/iteration_proxy.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <cstddef> // size_t
#include <iterator> // input_iterator_tag
#include <string> // string, to_string
#include <tuple> // tuple_size, get, tuple_element
#include <utility> // move

#if JSON_HAS_RANGES
    #include <ranges> // enable_borrowed_range
#endif

// #include <nlohmann/detail/abi_macros.hpp>

// #include <nlohmann/detail/meta/type_traits.hpp>

// #include <nlohmann/detail/value_t.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

template<typename string_type>
void int_to_string( string_type& target, std::size_t value )
{
    // For ADL
    using std::to_string;
    target = to_string(value);
}
template<typename IteratorType> class iteration_proxy_value
{
  public:
    using difference_type = std::ptrdiff_t;
    using value_type = iteration_proxy_value;
    using pointer = value_type *;
    using reference = value_type &;
    using iterator_category = std::input_iterator_tag;
    using string_type = typename std::remove_cv< typename std::remove_reference<decltype( std::declval<IteratorType>().key() ) >::type >::type;

  private:
    /// the iterator
    IteratorType anchor{};
    /// an index for arrays (used to create key names)
    std::size_t array_index = 0;
    /// last stringified array index
    mutable std::size_t array_index_last = 0;
    /// a string representation of the array index
    mutable string_type array_index_str = "0";
    /// an empty string (to return a reference for primitive values)
    string_type empty_str{};

  public:
    explicit iteration_proxy_value() = default;
    explicit iteration_proxy_value(IteratorType it, std::size_t array_index_ = 0)
    noexcept(std::is_nothrow_move_constructible<IteratorType>::value
             && std::is_nothrow_default_constructible<string_type>::value)
        : anchor(std::move(it))
        , array_index(array_index_)
    {}

    iteration_proxy_value(iteration_proxy_value const&) = default;
    iteration_proxy_value& operator=(iteration_proxy_value const&) = default;
    // older GCCs are a bit fussy and require explicit noexcept specifiers on defaulted functions
    iteration_proxy_value(iteration_proxy_value&&)
    noexcept(std::is_nothrow_move_constructible<IteratorType>::value
             && std::is_nothrow_move_constructible<string_type>::value) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor,cppcoreguidelines-noexcept-move-operations)
    iteration_proxy_value& operator=(iteration_proxy_value&&)
    noexcept(std::is_nothrow_move_assignable<IteratorType>::value
             && std::is_nothrow_move_assignable<string_type>::value) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor,cppcoreguidelines-noexcept-move-operations)
    ~iteration_proxy_value() = default;

    /// dereference operator (needed for range-based for)
    const iteration_proxy_value& operator*() const
    {
        return *this;
    }

    /// increment operator (needed for range-based for)
    iteration_proxy_value& operator++()
    {
        ++anchor;
        ++array_index;

        return *this;
    }

    iteration_proxy_value operator++(int)& // NOLINT(cert-dcl21-cpp)
    {
        auto tmp = iteration_proxy_value(anchor, array_index);
        ++anchor;
        ++array_index;
        return tmp;
    }

    /// equality operator (needed for InputIterator)
    bool operator==(const iteration_proxy_value& o) const
    {
        return anchor == o.anchor;
    }

    /// inequality operator (needed for range-based for)
    bool operator!=(const iteration_proxy_value& o) const
    {
        return anchor != o.anchor;
    }

    /// return key of the iterator
    const string_type& key() const
    {
        JSON_ASSERT(anchor.m_object != nullptr);

        switch (anchor.m_object->type())
        {
            // use integer array index as key
            case value_t::array:
            {
                if (array_index != array_index_last)
                {
                    int_to_string( array_index_str, array_index );
                    array_index_last = array_index;
                }
                return array_index_str;
            }

            // use key from the object
            case value_t::object:
                return anchor.key();

            // use an empty key for all primitive types
            case value_t::null:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
                return empty_str;
        }
    }

    /// return value of the iterator
    typename IteratorType::reference value() const
    {
        return anchor.value();
    }
};

/// proxy class for the items() function
template<typename IteratorType> class iteration_proxy
{
  private:
    /// the container to iterate
    typename IteratorType::pointer container = nullptr;

  public:
    explicit iteration_proxy() = default;

    /// construct iteration proxy from a container
    explicit iteration_proxy(typename IteratorType::reference cont) noexcept
        : container(&cont) {}

    iteration_proxy(iteration_proxy const&) = default;
    iteration_proxy& operator=(iteration_proxy const&) = default;
    iteration_proxy(iteration_proxy&&) noexcept = default;
    iteration_proxy& operator=(iteration_proxy&&) noexcept = default;
    ~iteration_proxy() = default;

    /// return iterator begin (needed for range-based for)
    iteration_proxy_value<IteratorType> begin() const noexcept
    {
        return iteration_proxy_value<IteratorType>(container->begin());
    }

    /// return iterator end (needed for range-based for)
    iteration_proxy_value<IteratorType> end() const noexcept
    {
        return iteration_proxy_value<IteratorType>(container->end());
    }
};

// Structured Bindings Support
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
// And see https://github.com/nlohmann/json/pull/1391
template<std::size_t N, typename IteratorType, enable_if_t<N == 0, int> = 0>
auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.key())
{
    return i.key();
}
// Structured Bindings Support
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
// And see https://github.com/nlohmann/json/pull/1391
template<std::size_t N, typename IteratorType, enable_if_t<N == 1, int> = 0>
auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.value())
{
    return i.value();
}

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// The Addition to the STD Namespace is required to add
// Structured Bindings Support to the iteration_proxy_value class
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
// And see https://github.com/nlohmann/json/pull/1391
namespace std
{

#if defined(__clang__)
    // Fix: https://github.com/nlohmann/json/issues/1401
    #pragma clang diagnostic push
    #pragma clang diagnostic ignored "-Wmismatched-tags"
#endif
template<typename IteratorType>
class tuple_size<::nlohmann::detail::iteration_proxy_value<IteratorType>> // NOLINT(cert-dcl58-cpp)
            : public std::integral_constant<std::size_t, 2> {};

template<std::size_t N, typename IteratorType>
class tuple_element<N, ::nlohmann::detail::iteration_proxy_value<IteratorType >> // NOLINT(cert-dcl58-cpp)
{
  public:
    using type = decltype(
                     get<N>(std::declval <
                            ::nlohmann::detail::iteration_proxy_value<IteratorType >> ()));
};
#if defined(__clang__)
    #pragma clang diagnostic pop
#endif

}  // namespace std

#if JSON_HAS_RANGES
    template <typename IteratorType>
    inline constexpr bool ::std::ranges::enable_borrowed_range<::nlohmann::detail::iteration_proxy<IteratorType>> = true;
#endif

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/cpp_future.hpp>

// #include <nlohmann/detail/meta/std_fs.hpp>

// #include <nlohmann/detail/meta/type_traits.hpp>

// #include <nlohmann/detail/value_t.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

//////////////////
// constructors //
//////////////////

/*
 * Note all external_constructor<>::construct functions need to call
 * j.m_data.m_value.destroy(j.m_data.m_type) to avoid a memory leak in case j contains an
 * allocated value (e.g., a string). See bug issue
 * https://github.com/nlohmann/json/issues/2865 for more information.
 */

template<value_t> struct external_constructor;

template<>
struct external_constructor<value_t::boolean>
{
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::boolean_t b) noexcept
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::boolean;
        j.m_data.m_value = b;
        j.assert_invariant();
    }
};

template<>
struct external_constructor<value_t::string>
{
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, const typename BasicJsonType::string_t& s)
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::string;
        j.m_data.m_value = s;
        j.assert_invariant();
    }

    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::string_t&& s)
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::string;
        j.m_data.m_value = std::move(s);
        j.assert_invariant();
    }

    template < typename BasicJsonType, typename CompatibleStringType,
               enable_if_t < !std::is_same<CompatibleStringType, typename BasicJsonType::string_t>::value,
                             int > = 0 >
    static void construct(BasicJsonType& j, const CompatibleStringType& str)
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::string;
        j.m_data.m_value.string = j.template create<typename BasicJsonType::string_t>(str);
        j.assert_invariant();
    }
};

template<>
struct external_constructor<value_t::binary>
{
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, const typename BasicJsonType::binary_t& b)
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::binary;
        j.m_data.m_value = typename BasicJsonType::binary_t(b);
        j.assert_invariant();
    }

    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::binary_t&& b)
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::binary;
        j.m_data.m_value = typename BasicJsonType::binary_t(std::move(b));
        j.assert_invariant();
    }
};

template<>
struct external_constructor<value_t::number_float>
{
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::number_float_t val) noexcept
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::number_float;
        j.m_data.m_value = val;
        j.assert_invariant();
    }
};

template<>
struct external_constructor<value_t::number_unsigned>
{
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::number_unsigned_t val) noexcept
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::number_unsigned;
        j.m_data.m_value = val;
        j.assert_invariant();
    }
};

template<>
struct external_constructor<value_t::number_integer>
{
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::number_integer_t val) noexcept
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::number_integer;
        j.m_data.m_value = val;
        j.assert_invariant();
    }
};

template<>
struct external_constructor<value_t::array>
{
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, const typename BasicJsonType::array_t& arr)
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::array;
        j.m_data.m_value = arr;
        j.set_parents();
        j.assert_invariant();
    }

    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::array_t&& arr)
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::array;
        j.m_data.m_value = std::move(arr);
        j.set_parents();
        j.assert_invariant();
    }

    template < typename BasicJsonType, typename CompatibleArrayType,
               enable_if_t < !std::is_same<CompatibleArrayType, typename BasicJsonType::array_t>::value,
                             int > = 0 >
    static void construct(BasicJsonType& j, const CompatibleArrayType& arr)
    {
        using std::begin;
        using std::end;

        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::array;
        j.m_data.m_value.array = j.template create<typename BasicJsonType::array_t>(begin(arr), end(arr));
        j.set_parents();
        j.assert_invariant();
    }

    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, const std::vector<bool>& arr)
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::array;
        j.m_data.m_value = value_t::array;
        j.m_data.m_value.array->reserve(arr.size());
        for (const bool x : arr)
        {
            j.m_data.m_value.array->push_back(x);
            j.set_parent(j.m_data.m_value.array->back());
        }
        j.assert_invariant();
    }

    template<typename BasicJsonType, typename T,
             enable_if_t<std::is_convertible<T, BasicJsonType>::value, int> = 0>
    static void construct(BasicJsonType& j, const std::valarray<T>& arr)
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::array;
        j.m_data.m_value = value_t::array;
        j.m_data.m_value.array->resize(arr.size());
        if (arr.size() > 0)
        {
            std::copy(std::begin(arr), std::end(arr), j.m_data.m_value.array->begin());
        }
        j.set_parents();
        j.assert_invariant();
    }
};

template<>
struct external_constructor<value_t::object>
{
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, const typename BasicJsonType::object_t& obj)
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::object;
        j.m_data.m_value = obj;
        j.set_parents();
        j.assert_invariant();
    }

    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::object_t&& obj)
    {
        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::object;
        j.m_data.m_value = std::move(obj);
        j.set_parents();
        j.assert_invariant();
    }

    template < typename BasicJsonType, typename CompatibleObjectType,
               enable_if_t < !std::is_same<CompatibleObjectType, typename BasicJsonType::object_t>::value, int > = 0 >
    static void construct(BasicJsonType& j, const CompatibleObjectType& obj)
    {
        using std::begin;
        using std::end;

        j.m_data.m_value.destroy(j.m_data.m_type);
        j.m_data.m_type = value_t::object;
        j.m_data.m_value.object = j.template create<typename BasicJsonType::object_t>(begin(obj), end(obj));
        j.set_parents();
        j.assert_invariant();
    }
};

/////////////
// to_json //
/////////////

template<typename BasicJsonType, typename T,
         enable_if_t<std::is_same<T, typename BasicJsonType::boolean_t>::value, int> = 0>
inline void to_json(BasicJsonType& j, T b) noexcept
{
    external_constructor<value_t::boolean>::construct(j, b);
}

template < typename BasicJsonType, typename BoolRef,
           enable_if_t <
               ((std::is_same<std::vector<bool>::reference, BoolRef>::value
                 && !std::is_same <std::vector<bool>::reference, typename BasicJsonType::boolean_t&>::value)
                || (std::is_same<std::vector<bool>::const_reference, BoolRef>::value
                    && !std::is_same <detail::uncvref_t<std::vector<bool>::const_reference>,
                                      typename BasicJsonType::boolean_t >::value))
               && std::is_convertible<const BoolRef&, typename BasicJsonType::boolean_t>::value, int > = 0 >
inline void to_json(BasicJsonType& j, const BoolRef& b) noexcept
{
    external_constructor<value_t::boolean>::construct(j, static_cast<typename BasicJsonType::boolean_t>(b));
}

template<typename BasicJsonType, typename CompatibleString,
         enable_if_t<std::is_constructible<typename BasicJsonType::string_t, CompatibleString>::value, int> = 0>
inline void to_json(BasicJsonType& j, const CompatibleString& s)
{
    external_constructor<value_t::string>::construct(j, s);
}

template<typename BasicJsonType>
inline void to_json(BasicJsonType& j, typename BasicJsonType::string_t&& s)
{
    external_constructor<value_t::string>::construct(j, std::move(s));
}

template<typename BasicJsonType, typename FloatType,
         enable_if_t<std::is_floating_point<FloatType>::value, int> = 0>
inline void to_json(BasicJsonType& j, FloatType val) noexcept
{
    external_constructor<value_t::number_float>::construct(j, static_cast<typename BasicJsonType::number_float_t>(val));
}

template<typename BasicJsonType, typename CompatibleNumberUnsignedType,
         enable_if_t<is_compatible_integer_type<typename BasicJsonType::number_unsigned_t, CompatibleNumberUnsignedType>::value, int> = 0>
inline void to_json(BasicJsonType& j, CompatibleNumberUnsignedType val) noexcept
{
    external_constructor<value_t::number_unsigned>::construct(j, static_cast<typename BasicJsonType::number_unsigned_t>(val));
}

template<typename BasicJsonType, typename CompatibleNumberIntegerType,
         enable_if_t<is_compatible_integer_type<typename BasicJsonType::number_integer_t, CompatibleNumberIntegerType>::value, int> = 0>
inline void to_json(BasicJsonType& j, CompatibleNumberIntegerType val) noexcept
{
    external_constructor<value_t::number_integer>::construct(j, static_cast<typename BasicJsonType::number_integer_t>(val));
}

#if !JSON_DISABLE_ENUM_SERIALIZATION
template<typename BasicJsonType, typename EnumType,
         enable_if_t<std::is_enum<EnumType>::value, int> = 0>
inline void to_json(BasicJsonType& j, EnumType e) noexcept
{
    using underlying_type = typename std::underlying_type<EnumType>::type;
    static constexpr value_t integral_value_t = std::is_unsigned<underlying_type>::value ? value_t::number_unsigned : value_t::number_integer;
    external_constructor<integral_value_t>::construct(j, static_cast<underlying_type>(e));
}
#endif  // JSON_DISABLE_ENUM_SERIALIZATION

template<typename BasicJsonType>
inline void to_json(BasicJsonType& j, const std::vector<bool>& e)
{
    external_constructor<value_t::array>::construct(j, e);
}

template < typename BasicJsonType, typename CompatibleArrayType,
           enable_if_t < is_compatible_array_type<BasicJsonType,
                         CompatibleArrayType>::value&&
                         !is_compatible_object_type<BasicJsonType, CompatibleArrayType>::value&&
                         !is_compatible_string_type<BasicJsonType, CompatibleArrayType>::value&&
                         !std::is_same<typename BasicJsonType::binary_t, CompatibleArrayType>::value&&
                         !is_basic_json<CompatibleArrayType>::value,
                         int > = 0 >
inline void to_json(BasicJsonType& j, const CompatibleArrayType& arr)
{
    external_constructor<value_t::array>::construct(j, arr);
}

template<typename BasicJsonType>
inline void to_json(BasicJsonType& j, const typename BasicJsonType::binary_t& bin)
{
    external_constructor<value_t::binary>::construct(j, bin);
}

template<typename BasicJsonType, typename T,
         enable_if_t<std::is_convertible<T, BasicJsonType>::value, int> = 0>
inline void to_json(BasicJsonType& j, const std::valarray<T>& arr)
{
    external_constructor<value_t::array>::construct(j, std::move(arr));
}

template<typename BasicJsonType>
inline void to_json(BasicJsonType& j, typename BasicJsonType::array_t&& arr)
{
    external_constructor<value_t::array>::construct(j, std::move(arr));
}

template < typename BasicJsonType, typename CompatibleObjectType,
           enable_if_t < is_compatible_object_type<BasicJsonType, CompatibleObjectType>::value&& !is_basic_json<CompatibleObjectType>::value, int > = 0 >
inline void to_json(BasicJsonType& j, const CompatibleObjectType& obj)
{
    external_constructor<value_t::object>::construct(j, obj);
}

template<typename BasicJsonType>
inline void to_json(BasicJsonType& j, typename BasicJsonType::object_t&& obj)
{
    external_constructor<value_t::object>::construct(j, std::move(obj));
}

template <
    typename BasicJsonType, typename T, std::size_t N,
    enable_if_t < !std::is_constructible<typename BasicJsonType::string_t,
                  const T(&)[N]>::value, // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
                  int > = 0 >
inline void to_json(BasicJsonType& j, const T(&arr)[N]) // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
{
    external_constructor<value_t::array>::construct(j, arr);
}

template < typename BasicJsonType, typename T1, typename T2, enable_if_t < std::is_constructible<BasicJsonType, T1>::value&& std::is_constructible<BasicJsonType, T2>::value, int > = 0 >
inline void to_json(BasicJsonType& j, const std::pair<T1, T2>& p)
{
    j = { p.first, p.second };
}

// for https://github.com/nlohmann/json/pull/1134
template<typename BasicJsonType, typename T,
         enable_if_t<std::is_same<T, iteration_proxy_value<typename BasicJsonType::iterator>>::value, int> = 0>
inline void to_json(BasicJsonType& j, const T& b)
{
    j = { {b.key(), b.value()} };
}

template<typename BasicJsonType, typename Tuple, std::size_t... Idx>
inline void to_json_tuple_impl(BasicJsonType& j, const Tuple& t, index_sequence<Idx...> /*unused*/)
{
    j = { std::get<Idx>(t)... };
}

template<typename BasicJsonType, typename T, enable_if_t<is_constructible_tuple<BasicJsonType, T>::value, int > = 0>
inline void to_json(BasicJsonType& j, const T& t)
{
    to_json_tuple_impl(j, t, make_index_sequence<std::tuple_size<T>::value> {});
}

#if JSON_HAS_FILESYSTEM || JSON_HAS_EXPERIMENTAL_FILESYSTEM
template<typename BasicJsonType>
inline void to_json(BasicJsonType& j, const std_fs::path& p)
{
    j = p.string();
}
#endif

struct to_json_fn
{
    template<typename BasicJsonType, typename T>
    auto operator()(BasicJsonType& j, T&& val) const noexcept(noexcept(to_json(j, std::forward<T>(val))))
    -> decltype(to_json(j, std::forward<T>(val)), void())
    {
        return to_json(j, std::forward<T>(val));
    }
};
}  // namespace detail

#ifndef JSON_HAS_CPP_17
/// namespace to hold default `to_json` function
/// to see why this is required:
/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4381.html
namespace // NOLINT(cert-dcl59-cpp,fuchsia-header-anon-namespaces,google-build-namespaces)
{
#endif
JSON_INLINE_VARIABLE constexpr const auto& to_json = // NOLINT(misc-definitions-in-headers)
    detail::static_const<detail::to_json_fn>::value;
#ifndef JSON_HAS_CPP_17
}  // namespace
#endif

NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/meta/identity_tag.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN

/// @sa https://json.nlohmann.me/api/adl_serializer/
template<typename ValueType, typename>
struct adl_serializer
{
    /// @brief convert a JSON value to any value type
    /// @sa https://json.nlohmann.me/api/adl_serializer/from_json/
    template<typename BasicJsonType, typename TargetType = ValueType>
    static auto from_json(BasicJsonType && j, TargetType& val) noexcept(
        noexcept(::nlohmann::from_json(std::forward<BasicJsonType>(j), val)))
    -> decltype(::nlohmann::from_json(std::forward<BasicJsonType>(j), val), void())
    {
        ::nlohmann::from_json(std::forward<BasicJsonType>(j), val);
    }

    /// @brief convert a JSON value to any value type
    /// @sa https://json.nlohmann.me/api/adl_serializer/from_json/
    template<typename BasicJsonType, typename TargetType = ValueType>
    static auto from_json(BasicJsonType && j) noexcept(
    noexcept(::nlohmann::from_json(std::forward<BasicJsonType>(j), detail::identity_tag<TargetType> {})))
    -> decltype(::nlohmann::from_json(std::forward<BasicJsonType>(j), detail::identity_tag<TargetType> {}))
    {
        return ::nlohmann::from_json(std::forward<BasicJsonType>(j), detail::identity_tag<TargetType> {});
    }

    /// @brief convert any value type to a JSON value
    /// @sa https://json.nlohmann.me/api/adl_serializer/to_json/
    template<typename BasicJsonType, typename TargetType = ValueType>
    static auto to_json(BasicJsonType& j, TargetType && val) noexcept(
        noexcept(::nlohmann::to_json(j, std::forward<TargetType>(val))))
    -> decltype(::nlohmann::to_json(j, std::forward<TargetType>(val)), void())
    {
        ::nlohmann::to_json(j, std::forward<TargetType>(val));
    }
};

NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/byte_container_with_subtype.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <cstdint> // uint8_t, uint64_t
#include <tuple> // tie
#include <utility> // move

// #include <nlohmann/detail/abi_macros.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN

/// @brief an internal type for a backed binary type
/// @sa https://json.nlohmann.me/api/byte_container_with_subtype/
template<typename BinaryType>
class byte_container_with_subtype : public BinaryType
{
  public:
    using container_type = BinaryType;
    using subtype_type = std::uint64_t;

    /// @sa https://json.nlohmann.me/api/byte_container_with_subtype/byte_container_with_subtype/
    byte_container_with_subtype() noexcept(noexcept(container_type()))
        : container_type()
    {}

    /// @sa https://json.nlohmann.me/api/byte_container_with_subtype/byte_container_with_subtype/
    byte_container_with_subtype(const container_type& b) noexcept(noexcept(container_type(b)))
        : container_type(b)
    {}

    /// @sa https://json.nlohmann.me/api/byte_container_with_subtype/byte_container_with_subtype/
    byte_container_with_subtype(container_type&& b) noexcept(noexcept(container_type(std::move(b))))
        : container_type(std::move(b))
    {}

    /// @sa https://json.nlohmann.me/api/byte_container_with_subtype/byte_container_with_subtype/
    byte_container_with_subtype(const container_type& b, subtype_type subtype_) noexcept(noexcept(container_type(b)))
        : container_type(b)
        , m_subtype(subtype_)
        , m_has_subtype(true)
    {}

    /// @sa https://json.nlohmann.me/api/byte_container_with_subtype/byte_container_with_subtype/
    byte_container_with_subtype(container_type&& b, subtype_type subtype_) noexcept(noexcept(container_type(std::move(b))))
        : container_type(std::move(b))
        , m_subtype(subtype_)
        , m_has_subtype(true)
    {}

    bool operator==(const byte_container_with_subtype& rhs) const
    {
        return std::tie(static_cast<const BinaryType&>(*this), m_subtype, m_has_subtype) ==
               std::tie(static_cast<const BinaryType&>(rhs), rhs.m_subtype, rhs.m_has_subtype);
    }

    bool operator!=(const byte_container_with_subtype& rhs) const
    {
        return !(rhs == *this);
    }

    /// @brief sets the binary subtype
    /// @sa https://json.nlohmann.me/api/byte_container_with_subtype/set_subtype/
    void set_subtype(subtype_type subtype_) noexcept
    {
        m_subtype = subtype_;
        m_has_subtype = true;
    }

    /// @brief return the binary subtype
    /// @sa https://json.nlohmann.me/api/byte_container_with_subtype/subtype/
    constexpr subtype_type subtype() const noexcept
    {
        return m_has_subtype ? m_subtype : static_cast<subtype_type>(-1);
    }

    /// @brief return whether the value has a subtype
    /// @sa https://json.nlohmann.me/api/byte_container_with_subtype/has_subtype/
    constexpr bool has_subtype() const noexcept
    {
        return m_has_subtype;
    }

    /// @brief clears the binary subtype
    /// @sa https://json.nlohmann.me/api/byte_container_with_subtype/clear_subtype/
    void clear_subtype() noexcept
    {
        m_subtype = 0;
        m_has_subtype = false;
    }

  private:
    subtype_type m_subtype = 0;
    bool m_has_subtype = false;
};

NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/conversions/from_json.hpp>

// #include <nlohmann/detail/conversions/to_json.hpp>

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/hash.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <cstdint> // uint8_t
#include <cstddef> // size_t
#include <functional> // hash

// #include <nlohmann/detail/abi_macros.hpp>

// #include <nlohmann/detail/value_t.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

// boost::hash_combine
inline std::size_t combine(std::size_t seed, std::size_t h) noexcept
{
    seed ^= h + 0x9e3779b9 + (seed << 6U) + (seed >> 2U);
    return seed;
}

/*!
@brief hash a JSON value

The hash function tries to rely on std::hash where possible. Furthermore, the
type of the JSON value is taken into account to have different hash values for
null, 0, 0U, and false, etc.

@tparam BasicJsonType basic_json specialization
@param j JSON value to hash
@return hash value of j
*/
template<typename BasicJsonType>
std::size_t hash(const BasicJsonType& j)
{
    using string_t = typename BasicJsonType::string_t;
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;

    const auto type = static_cast<std::size_t>(j.type());
    switch (j.type())
    {
        case BasicJsonType::value_t::null:
        case BasicJsonType::value_t::discarded:
        {
            return combine(type, 0);
        }

        case BasicJsonType::value_t::object:
        {
            auto seed = combine(type, j.size());
            for (const auto& element : j.items())
            {
                const auto h = std::hash<string_t> {}(element.key());
                seed = combine(seed, h);
                seed = combine(seed, hash(element.value()));
            }
            return seed;
        }

        case BasicJsonType::value_t::array:
        {
            auto seed = combine(type, j.size());
            for (const auto& element : j)
            {
                seed = combine(seed, hash(element));
            }
            return seed;
        }

        case BasicJsonType::value_t::string:
        {
            const auto h = std::hash<string_t> {}(j.template get_ref<const string_t&>());
            return combine(type, h);
        }

        case BasicJsonType::value_t::boolean:
        {
            const auto h = std::hash<bool> {}(j.template get<bool>());
            return combine(type, h);
        }

        case BasicJsonType::value_t::number_integer:
        {
            const auto h = std::hash<number_integer_t> {}(j.template get<number_integer_t>());
            return combine(type, h);
        }

        case BasicJsonType::value_t::number_unsigned:
        {
            const auto h = std::hash<number_unsigned_t> {}(j.template get<number_unsigned_t>());
            return combine(type, h);
        }

        case BasicJsonType::value_t::number_float:
        {
            const auto h = std::hash<number_float_t> {}(j.template get<number_float_t>());
            return combine(type, h);
        }

        case BasicJsonType::value_t::binary:
        {
            auto seed = combine(type, j.get_binary().size());
            const auto h = std::hash<bool> {}(j.get_binary().has_subtype());
            seed = combine(seed, h);
            seed = combine(seed, static_cast<std::size_t>(j.get_binary().subtype()));
            for (const auto byte : j.get_binary())
            {
                seed = combine(seed, std::hash<std::uint8_t> {}(byte));
            }
            return seed;
        }

        default:                   // LCOV_EXCL_LINE
            JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
            return 0;              // LCOV_EXCL_LINE
    }
}

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/input/binary_reader.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <algorithm> // generate_n
#include <array> // array
#include <cmath> // ldexp
#include <cstddef> // size_t
#include <cstdint> // uint8_t, uint16_t, uint32_t, uint64_t
#include <cstdio> // snprintf
#include <cstring> // memcpy
#include <iterator> // back_inserter
#include <limits> // numeric_limits
#include <string> // char_traits, string
#include <utility> // make_pair, move
#include <vector> // vector

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/input/input_adapters.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <array> // array
#include <cstddef> // size_t
#include <cstring> // strlen
#include <iterator> // begin, end, iterator_traits, random_access_iterator_tag, distance, next
#include <memory> // shared_ptr, make_shared, addressof
#include <numeric> // accumulate
#include <string> // string, char_traits
#include <type_traits> // enable_if, is_base_of, is_pointer, is_integral, remove_pointer
#include <utility> // pair, declval

#ifndef JSON_NO_IO
    #include <cstdio>   // FILE *
    #include <istream>  // istream
#endif                  // JSON_NO_IO

// #include <nlohmann/detail/iterators/iterator_traits.hpp>

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/type_traits.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

/// the supported input formats
enum class input_format_t { json, cbor, msgpack, ubjson, bson, bjdata };

////////////////////
// input adapters //
////////////////////

#ifndef JSON_NO_IO
/*!
Input adapter for stdio file access. This adapter read only 1 byte and do not use any
 buffer. This adapter is a very low level adapter.
*/
class file_input_adapter
{
  public:
    using char_type = char;

    JSON_HEDLEY_NON_NULL(2)
    explicit file_input_adapter(std::FILE* f) noexcept
        : m_file(f)
    {
        JSON_ASSERT(m_file != nullptr);
    }

    // make class move-only
    file_input_adapter(const file_input_adapter&) = delete;
    file_input_adapter(file_input_adapter&&) noexcept = default;
    file_input_adapter& operator=(const file_input_adapter&) = delete;
    file_input_adapter& operator=(file_input_adapter&&) = delete;
    ~file_input_adapter() = default;

    std::char_traits<char>::int_type get_character() noexcept
    {
        return std::fgetc(m_file);
    }

  private:
    /// the file pointer to read from
    std::FILE* m_file;
};

/*!
Input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at
beginning of input. Does not support changing the underlying std::streambuf
in mid-input. Maintains underlying std::istream and std::streambuf to support
subsequent use of standard std::istream operations to process any input
characters following those used in parsing the JSON input.  Clears the
std::istream flags; any input errors (e.g., EOF) will be detected by the first
subsequent call for input from the std::istream.
*/
class input_stream_adapter
{
  public:
    using char_type = char;

    ~input_stream_adapter()
    {
        // clear stream flags; we use underlying streambuf I/O, do not
        // maintain ifstream flags, except eof
        if (is != nullptr)
        {
            is->clear(is->rdstate() & std::ios::eofbit);
        }
    }

    explicit input_stream_adapter(std::istream& i)
        : is(&i), sb(i.rdbuf())
    {}

    // delete because of pointer members
    input_stream_adapter(const input_stream_adapter&) = delete;
    input_stream_adapter& operator=(input_stream_adapter&) = delete;
    input_stream_adapter& operator=(input_stream_adapter&&) = delete;

    input_stream_adapter(input_stream_adapter&& rhs) noexcept
        : is(rhs.is), sb(rhs.sb)
    {
        rhs.is = nullptr;
        rhs.sb = nullptr;
    }

    // std::istream/std::streambuf use std::char_traits<char>::to_int_type, to
    // ensure that std::char_traits<char>::eof() and the character 0xFF do not
    // end up as the same value, e.g. 0xFFFFFFFF.
    std::char_traits<char>::int_type get_character()
    {
        auto res = sb->sbumpc();
        // set eof manually, as we don't use the istream interface.
        if (JSON_HEDLEY_UNLIKELY(res == std::char_traits<char>::eof()))
        {
            is->clear(is->rdstate() | std::ios::eofbit);
        }
        return res;
    }

  private:
    /// the associated input stream
    std::istream* is = nullptr;
    std::streambuf* sb = nullptr;
};
#endif  // JSON_NO_IO

// General-purpose iterator-based adapter. It might not be as fast as
// theoretically possible for some containers, but it is extremely versatile.
template<typename IteratorType>
class iterator_input_adapter
{
  public:
    using char_type = typename std::iterator_traits<IteratorType>::value_type;

    iterator_input_adapter(IteratorType first, IteratorType last)
        : current(std::move(first)), end(std::move(last))
    {}

    typename char_traits<char_type>::int_type get_character()
    {
        if (JSON_HEDLEY_LIKELY(current != end))
        {
            auto result = char_traits<char_type>::to_int_type(*current);
            std::advance(current, 1);
            return result;
        }

        return char_traits<char_type>::eof();
    }

  private:
    IteratorType current;
    IteratorType end;

    template<typename BaseInputAdapter, size_t T>
    friend struct wide_string_input_helper;

    bool empty() const
    {
        return current == end;
    }
};

template<typename BaseInputAdapter, size_t T>
struct wide_string_input_helper;

template<typename BaseInputAdapter>
struct wide_string_input_helper<BaseInputAdapter, 4>
{
    // UTF-32
    static void fill_buffer(BaseInputAdapter& input,
                            std::array<std::char_traits<char>::int_type, 4>& utf8_bytes,
                            size_t& utf8_bytes_index,
                            size_t& utf8_bytes_filled)
    {
        utf8_bytes_index = 0;

        if (JSON_HEDLEY_UNLIKELY(input.empty()))
        {
            utf8_bytes[0] = std::char_traits<char>::eof();
            utf8_bytes_filled = 1;
        }
        else
        {
            // get the current character
            const auto wc = input.get_character();

            // UTF-32 to UTF-8 encoding
            if (wc < 0x80)
            {
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
                utf8_bytes_filled = 1;
            }
            else if (wc <= 0x7FF)
            {
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xC0u | ((static_cast<unsigned int>(wc) >> 6u) & 0x1Fu));
                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
                utf8_bytes_filled = 2;
            }
            else if (wc <= 0xFFFF)
            {
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xE0u | ((static_cast<unsigned int>(wc) >> 12u) & 0x0Fu));
                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 6u) & 0x3Fu));
                utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
                utf8_bytes_filled = 3;
            }
            else if (wc <= 0x10FFFF)
            {
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xF0u | ((static_cast<unsigned int>(wc) >> 18u) & 0x07u));
                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 12u) & 0x3Fu));
                utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 6u) & 0x3Fu));
                utf8_bytes[3] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
                utf8_bytes_filled = 4;
            }
            else
            {
                // unknown character
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
                utf8_bytes_filled = 1;
            }
        }
    }
};

template<typename BaseInputAdapter>
struct wide_string_input_helper<BaseInputAdapter, 2>
{
    // UTF-16
    static void fill_buffer(BaseInputAdapter& input,
                            std::array<std::char_traits<char>::int_type, 4>& utf8_bytes,
                            size_t& utf8_bytes_index,
                            size_t& utf8_bytes_filled)
    {
        utf8_bytes_index = 0;

        if (JSON_HEDLEY_UNLIKELY(input.empty()))
        {
            utf8_bytes[0] = std::char_traits<char>::eof();
            utf8_bytes_filled = 1;
        }
        else
        {
            // get the current character
            const auto wc = input.get_character();

            // UTF-16 to UTF-8 encoding
            if (wc < 0x80)
            {
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
                utf8_bytes_filled = 1;
            }
            else if (wc <= 0x7FF)
            {
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xC0u | ((static_cast<unsigned int>(wc) >> 6u)));
                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
                utf8_bytes_filled = 2;
            }
            else if (0xD800 > wc || wc >= 0xE000)
            {
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xE0u | ((static_cast<unsigned int>(wc) >> 12u)));
                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 6u) & 0x3Fu));
                utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
                utf8_bytes_filled = 3;
            }
            else
            {
                if (JSON_HEDLEY_UNLIKELY(!input.empty()))
                {
                    const auto wc2 = static_cast<unsigned int>(input.get_character());
                    const auto charcode = 0x10000u + (((static_cast<unsigned int>(wc) & 0x3FFu) << 10u) | (wc2 & 0x3FFu));
                    utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xF0u | (charcode >> 18u));
                    utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((charcode >> 12u) & 0x3Fu));
                    utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | ((charcode >> 6u) & 0x3Fu));
                    utf8_bytes[3] = static_cast<std::char_traits<char>::int_type>(0x80u | (charcode & 0x3Fu));
                    utf8_bytes_filled = 4;
                }
                else
                {
                    utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
                    utf8_bytes_filled = 1;
                }
            }
        }
    }
};

// Wraps another input adapter to convert wide character types into individual bytes.
template<typename BaseInputAdapter, typename WideCharType>
class wide_string_input_adapter
{
  public:
    using char_type = char;

    wide_string_input_adapter(BaseInputAdapter base)
        : base_adapter(base) {}

    typename std::char_traits<char>::int_type get_character() noexcept
    {
        // check if buffer needs to be filled
        if (utf8_bytes_index == utf8_bytes_filled)
        {
            fill_buffer<sizeof(WideCharType)>();

            JSON_ASSERT(utf8_bytes_filled > 0);
            JSON_ASSERT(utf8_bytes_index == 0);
        }

        // use buffer
        JSON_ASSERT(utf8_bytes_filled > 0);
        JSON_ASSERT(utf8_bytes_index < utf8_bytes_filled);
        return utf8_bytes[utf8_bytes_index++];
    }

  private:
    BaseInputAdapter base_adapter;

    template<size_t T>
    void fill_buffer()
    {
        wide_string_input_helper<BaseInputAdapter, T>::fill_buffer(base_adapter, utf8_bytes, utf8_bytes_index, utf8_bytes_filled);
    }

    /// a buffer for UTF-8 bytes
    std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};

    /// index to the utf8_codes array for the next valid byte
    std::size_t utf8_bytes_index = 0;
    /// number of valid bytes in the utf8_codes array
    std::size_t utf8_bytes_filled = 0;
};

template<typename IteratorType, typename Enable = void>
struct iterator_input_adapter_factory
{
    using iterator_type = IteratorType;
    using char_type = typename std::iterator_traits<iterator_type>::value_type;
    using adapter_type = iterator_input_adapter<iterator_type>;

    static adapter_type create(IteratorType first, IteratorType last)
    {
        return adapter_type(std::move(first), std::move(last));
    }
};

template<typename T>
struct is_iterator_of_multibyte
{
    using value_type = typename std::iterator_traits<T>::value_type;
    enum
    {
        value = sizeof(value_type) > 1
    };
};

template<typename IteratorType>
struct iterator_input_adapter_factory<IteratorType, enable_if_t<is_iterator_of_multibyte<IteratorType>::value>>
{
    using iterator_type = IteratorType;
    using char_type = typename std::iterator_traits<iterator_type>::value_type;
    using base_adapter_type = iterator_input_adapter<iterator_type>;
    using adapter_type = wide_string_input_adapter<base_adapter_type, char_type>;

    static adapter_type create(IteratorType first, IteratorType last)
    {
        return adapter_type(base_adapter_type(std::move(first), std::move(last)));
    }
};

// General purpose iterator-based input
template<typename IteratorType>
typename iterator_input_adapter_factory<IteratorType>::adapter_type input_adapter(IteratorType first, IteratorType last)
{
    using factory_type = iterator_input_adapter_factory<IteratorType>;
    return factory_type::create(first, last);
}

// Convenience shorthand from container to iterator
// Enables ADL on begin(container) and end(container)
// Encloses the using declarations in namespace for not to leak them to outside scope

namespace container_input_adapter_factory_impl
{

using std::begin;
using std::end;

template<typename ContainerType, typename Enable = void>
struct container_input_adapter_factory {};

template<typename ContainerType>
struct container_input_adapter_factory< ContainerType,
       void_t<decltype(begin(std::declval<ContainerType>()), end(std::declval<ContainerType>()))>>
       {
           using adapter_type = decltype(input_adapter(begin(std::declval<ContainerType>()), end(std::declval<ContainerType>())));

           static adapter_type create(const ContainerType& container)
{
    return input_adapter(begin(container), end(container));
}
       };

}  // namespace container_input_adapter_factory_impl

template<typename ContainerType>
typename container_input_adapter_factory_impl::container_input_adapter_factory<ContainerType>::adapter_type input_adapter(const ContainerType& container)
{
    return container_input_adapter_factory_impl::container_input_adapter_factory<ContainerType>::create(container);
}

#ifndef JSON_NO_IO
// Special cases with fast paths
inline file_input_adapter input_adapter(std::FILE* file)
{
    return file_input_adapter(file);
}

inline input_stream_adapter input_adapter(std::istream& stream)
{
    return input_stream_adapter(stream);
}

inline input_stream_adapter input_adapter(std::istream&& stream)
{
    return input_stream_adapter(stream);
}
#endif  // JSON_NO_IO

using contiguous_bytes_input_adapter = decltype(input_adapter(std::declval<const char*>(), std::declval<const char*>()));

// Null-delimited strings, and the like.
template < typename CharT,
           typename std::enable_if <
               std::is_pointer<CharT>::value&&
               !std::is_array<CharT>::value&&
               std::is_integral<typename std::remove_pointer<CharT>::type>::value&&
               sizeof(typename std::remove_pointer<CharT>::type) == 1,
               int >::type = 0 >
contiguous_bytes_input_adapter input_adapter(CharT b)
{
    auto length = std::strlen(reinterpret_cast<const char*>(b));
    const auto* ptr = reinterpret_cast<const char*>(b);
    return input_adapter(ptr, ptr + length);
}

template<typename T, std::size_t N>
auto input_adapter(T (&array)[N]) -> decltype(input_adapter(array, array + N)) // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
{
    return input_adapter(array, array + N);
}

// This class only handles inputs of input_buffer_adapter type.
// It's required so that expressions like {ptr, len} can be implicitly cast
// to the correct adapter.
class span_input_adapter
{
  public:
    template < typename CharT,
               typename std::enable_if <
                   std::is_pointer<CharT>::value&&
                   std::is_integral<typename std::remove_pointer<CharT>::type>::value&&
                   sizeof(typename std::remove_pointer<CharT>::type) == 1,
                   int >::type = 0 >
    span_input_adapter(CharT b, std::size_t l)
        : ia(reinterpret_cast<const char*>(b), reinterpret_cast<const char*>(b) + l) {}

    template<class IteratorType,
             typename std::enable_if<
                 std::is_same<typename iterator_traits<IteratorType>::iterator_category, std::random_access_iterator_tag>::value,
                 int>::type = 0>
    span_input_adapter(IteratorType first, IteratorType last)
        : ia(input_adapter(first, last)) {}

    contiguous_bytes_input_adapter&& get()
    {
        return std::move(ia); // NOLINT(hicpp-move-const-arg,performance-move-const-arg)
    }

  private:
    contiguous_bytes_input_adapter ia;
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/input/json_sax.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <cstddef>
#include <string> // string
#include <utility> // move
#include <vector> // vector

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/string_concat.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN

/*!
@brief SAX interface

This class describes the SAX interface used by @ref nlohmann::json::sax_parse.
Each function is called in different situations while the input is parsed. The
boolean return value informs the parser whether to continue processing the
input.
*/
template<typename BasicJsonType>
struct json_sax
{
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;

    /*!
    @brief a null value was read
    @return whether parsing should proceed
    */
    virtual bool null() = 0;

    /*!
    @brief a boolean value was read
    @param[in] val  boolean value
    @return whether parsing should proceed
    */
    virtual bool boolean(bool val) = 0;

    /*!
    @brief an integer number was read
    @param[in] val  integer value
    @return whether parsing should proceed
    */
    virtual bool number_integer(number_integer_t val) = 0;

    /*!
    @brief an unsigned integer number was read
    @param[in] val  unsigned integer value
    @return whether parsing should proceed
    */
    virtual bool number_unsigned(number_unsigned_t val) = 0;

    /*!
    @brief a floating-point number was read
    @param[in] val  floating-point value
    @param[in] s    raw token value
    @return whether parsing should proceed
    */
    virtual bool number_float(number_float_t val, const string_t& s) = 0;

    /*!
    @brief a string value was read
    @param[in] val  string value
    @return whether parsing should proceed
    @note It is safe to move the passed string value.
    */
    virtual bool string(string_t& val) = 0;

    /*!
    @brief a binary value was read
    @param[in] val  binary value
    @return whether parsing should proceed
    @note It is safe to move the passed binary value.
    */
    virtual bool binary(binary_t& val) = 0;

    /*!
    @brief the beginning of an object was read
    @param[in] elements  number of object elements or -1 if unknown
    @return whether parsing should proceed
    @note binary formats may report the number of elements
    */
    virtual bool start_object(std::size_t elements) = 0;

    /*!
    @brief an object key was read
    @param[in] val  object key
    @return whether parsing should proceed
    @note It is safe to move the passed string.
    */
    virtual bool key(string_t& val) = 0;

    /*!
    @brief the end of an object was read
    @return whether parsing should proceed
    */
    virtual bool end_object() = 0;

    /*!
    @brief the beginning of an array was read
    @param[in] elements  number of array elements or -1 if unknown
    @return whether parsing should proceed
    @note binary formats may report the number of elements
    */
    virtual bool start_array(std::size_t elements) = 0;

    /*!
    @brief the end of an array was read
    @return whether parsing should proceed
    */
    virtual bool end_array() = 0;

    /*!
    @brief a parse error occurred
    @param[in] position    the position in the input where the error occurs
    @param[in] last_token  the last read token
    @param[in] ex          an exception object describing the error
    @return whether parsing should proceed (must return false)
    */
    virtual bool parse_error(std::size_t position,
                             const std::string& last_token,
                             const detail::exception& ex) = 0;

    json_sax() = default;
    json_sax(const json_sax&) = default;
    json_sax(json_sax&&) noexcept = default;
    json_sax& operator=(const json_sax&) = default;
    json_sax& operator=(json_sax&&) noexcept = default;
    virtual ~json_sax() = default;
};

namespace detail
{
/*!
@brief SAX implementation to create a JSON value from SAX events

This class implements the @ref json_sax interface and processes the SAX events
to create a JSON value which makes it basically a DOM parser. The structure or
hierarchy of the JSON value is managed by the stack `ref_stack` which contains
a pointer to the respective array or object for each recursion depth.

After successful parsing, the value that is passed by reference to the
constructor contains the parsed value.

@tparam BasicJsonType  the JSON type
*/
template<typename BasicJsonType>
class json_sax_dom_parser
{
  public:
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;

    /*!
    @param[in,out] r  reference to a JSON value that is manipulated while
                       parsing
    @param[in] allow_exceptions_  whether parse errors yield exceptions
    */
    explicit json_sax_dom_parser(BasicJsonType& r, const bool allow_exceptions_ = true)
        : root(r), allow_exceptions(allow_exceptions_)
    {}

    // make class move-only
    json_sax_dom_parser(const json_sax_dom_parser&) = delete;
    json_sax_dom_parser(json_sax_dom_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    json_sax_dom_parser& operator=(const json_sax_dom_parser&) = delete;
    json_sax_dom_parser& operator=(json_sax_dom_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    ~json_sax_dom_parser() = default;

    bool null()
    {
        handle_value(nullptr);
        return true;
    }

    bool boolean(bool val)
    {
        handle_value(val);
        return true;
    }

    bool number_integer(number_integer_t val)
    {
        handle_value(val);
        return true;
    }

    bool number_unsigned(number_unsigned_t val)
    {
        handle_value(val);
        return true;
    }

    bool number_float(number_float_t val, const string_t& /*unused*/)
    {
        handle_value(val);
        return true;
    }

    bool string(string_t& val)
    {
        handle_value(val);
        return true;
    }

    bool binary(binary_t& val)
    {
        handle_value(std::move(val));
        return true;
    }

    bool start_object(std::size_t len)
    {
        ref_stack.push_back(handle_value(BasicJsonType::value_t::object));

        if (JSON_HEDLEY_UNLIKELY(len != static_cast<std::size_t>(-1) && len > ref_stack.back()->max_size()))
        {
            JSON_THROW(out_of_range::create(408, concat("excessive object size: ", std::to_string(len)), ref_stack.back()));
        }

        return true;
    }

    bool key(string_t& val)
    {
        JSON_ASSERT(!ref_stack.empty());
        JSON_ASSERT(ref_stack.back()->is_object());

        // add null at given key and store the reference for later
        object_element = &(ref_stack.back()->m_data.m_value.object->operator[](val));
        return true;
    }

    bool end_object()
    {
        JSON_ASSERT(!ref_stack.empty());
        JSON_ASSERT(ref_stack.back()->is_object());

        ref_stack.back()->set_parents();
        ref_stack.pop_back();
        return true;
    }

    bool start_array(std::size_t len)
    {
        ref_stack.push_back(handle_value(BasicJsonType::value_t::array));

        if (JSON_HEDLEY_UNLIKELY(len != static_cast<std::size_t>(-1) && len > ref_stack.back()->max_size()))
        {
            JSON_THROW(out_of_range::create(408, concat("excessive array size: ", std::to_string(len)), ref_stack.back()));
        }

        return true;
    }

    bool end_array()
    {
        JSON_ASSERT(!ref_stack.empty());
        JSON_ASSERT(ref_stack.back()->is_array());

        ref_stack.back()->set_parents();
        ref_stack.pop_back();
        return true;
    }

    template<class Exception>
    bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/,
                     const Exception& ex)
    {
        errored = true;
        static_cast<void>(ex);
        if (allow_exceptions)
        {
            JSON_THROW(ex);
        }
        return false;
    }

    constexpr bool is_errored() const
    {
        return errored;
    }

  private:
    /*!
    @invariant If the ref stack is empty, then the passed value will be the new
               root.
    @invariant If the ref stack contains a value, then it is an array or an
               object to which we can add elements
    */
    template<typename Value>
    JSON_HEDLEY_RETURNS_NON_NULL
    BasicJsonType* handle_value(Value&& v)
    {
        if (ref_stack.empty())
        {
            root = BasicJsonType(std::forward<Value>(v));
            return &root;
        }

        JSON_ASSERT(ref_stack.back()->is_array() || ref_stack.back()->is_object());

        if (ref_stack.back()->is_array())
        {
            ref_stack.back()->m_data.m_value.array->emplace_back(std::forward<Value>(v));
            return &(ref_stack.back()->m_data.m_value.array->back());
        }

        JSON_ASSERT(ref_stack.back()->is_object());
        JSON_ASSERT(object_element);
        *object_element = BasicJsonType(std::forward<Value>(v));
        return object_element;
    }

    /// the parsed JSON value
    BasicJsonType& root;
    /// stack to model hierarchy of values
    std::vector<BasicJsonType*> ref_stack {};
    /// helper to hold the reference for the next object element
    BasicJsonType* object_element = nullptr;
    /// whether a syntax error occurred
    bool errored = false;
    /// whether to throw exceptions in case of errors
    const bool allow_exceptions = true;
};

template<typename BasicJsonType>
class json_sax_dom_callback_parser
{
  public:
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;
    using parser_callback_t = typename BasicJsonType::parser_callback_t;
    using parse_event_t = typename BasicJsonType::parse_event_t;

    json_sax_dom_callback_parser(BasicJsonType& r,
                                 const parser_callback_t cb,
                                 const bool allow_exceptions_ = true)
        : root(r), callback(cb), allow_exceptions(allow_exceptions_)
    {
        keep_stack.push_back(true);
    }

    // make class move-only
    json_sax_dom_callback_parser(const json_sax_dom_callback_parser&) = delete;
    json_sax_dom_callback_parser(json_sax_dom_callback_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    json_sax_dom_callback_parser& operator=(const json_sax_dom_callback_parser&) = delete;
    json_sax_dom_callback_parser& operator=(json_sax_dom_callback_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    ~json_sax_dom_callback_parser() = default;

    bool null()
    {
        handle_value(nullptr);
        return true;
    }

    bool boolean(bool val)
    {
        handle_value(val);
        return true;
    }

    bool number_integer(number_integer_t val)
    {
        handle_value(val);
        return true;
    }

    bool number_unsigned(number_unsigned_t val)
    {
        handle_value(val);
        return true;
    }

    bool number_float(number_float_t val, const string_t& /*unused*/)
    {
        handle_value(val);
        return true;
    }

    bool string(string_t& val)
    {
        handle_value(val);
        return true;
    }

    bool binary(binary_t& val)
    {
        handle_value(std::move(val));
        return true;
    }

    bool start_object(std::size_t len)
    {
        // check callback for object start
        const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::object_start, discarded);
        keep_stack.push_back(keep);

        auto val = handle_value(BasicJsonType::value_t::object, true);
        ref_stack.push_back(val.second);

        // check object limit
        if (ref_stack.back() && JSON_HEDLEY_UNLIKELY(len != static_cast<std::size_t>(-1) && len > ref_stack.back()->max_size()))
        {
            JSON_THROW(out_of_range::create(408, concat("excessive object size: ", std::to_string(len)), ref_stack.back()));
        }

        return true;
    }

    bool key(string_t& val)
    {
        BasicJsonType k = BasicJsonType(val);

        // check callback for key
        const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::key, k);
        key_keep_stack.push_back(keep);

        // add discarded value at given key and store the reference for later
        if (keep && ref_stack.back())
        {
            object_element = &(ref_stack.back()->m_data.m_value.object->operator[](val) = discarded);
        }

        return true;
    }

    bool end_object()
    {
        if (ref_stack.back())
        {
            if (!callback(static_cast<int>(ref_stack.size()) - 1, parse_event_t::object_end, *ref_stack.back()))
            {
                // discard object
                *ref_stack.back() = discarded;
            }
            else
            {
                ref_stack.back()->set_parents();
            }
        }

        JSON_ASSERT(!ref_stack.empty());
        JSON_ASSERT(!keep_stack.empty());
        ref_stack.pop_back();
        keep_stack.pop_back();

        if (!ref_stack.empty() && ref_stack.back() && ref_stack.back()->is_structured())
        {
            // remove discarded value
            for (auto it = ref_stack.back()->begin(); it != ref_stack.back()->end(); ++it)
            {
                if (it->is_discarded())
                {
                    ref_stack.back()->erase(it);
                    break;
                }
            }
        }

        return true;
    }

    bool start_array(std::size_t len)
    {
        const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::array_start, discarded);
        keep_stack.push_back(keep);

        auto val = handle_value(BasicJsonType::value_t::array, true);
        ref_stack.push_back(val.second);

        // check array limit
        if (ref_stack.back() && JSON_HEDLEY_UNLIKELY(len != static_cast<std::size_t>(-1) && len > ref_stack.back()->max_size()))
        {
            JSON_THROW(out_of_range::create(408, concat("excessive array size: ", std::to_string(len)), ref_stack.back()));
        }

        return true;
    }

    bool end_array()
    {
        bool keep = true;

        if (ref_stack.back())
        {
            keep = callback(static_cast<int>(ref_stack.size()) - 1, parse_event_t::array_end, *ref_stack.back());
            if (keep)
            {
                ref_stack.back()->set_parents();
            }
            else
            {
                // discard array
                *ref_stack.back() = discarded;
            }
        }

        JSON_ASSERT(!ref_stack.empty());
        JSON_ASSERT(!keep_stack.empty());
        ref_stack.pop_back();
        keep_stack.pop_back();

        // remove discarded value
        if (!keep && !ref_stack.empty() && ref_stack.back()->is_array())
        {
            ref_stack.back()->m_data.m_value.array->pop_back();
        }

        return true;
    }

    template<class Exception>
    bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/,
                     const Exception& ex)
    {
        errored = true;
        static_cast<void>(ex);
        if (allow_exceptions)
        {
            JSON_THROW(ex);
        }
        return false;
    }

    constexpr bool is_errored() const
    {
        return errored;
    }

  private:
    /*!
    @param[in] v  value to add to the JSON value we build during parsing
    @param[in] skip_callback  whether we should skip calling the callback
               function; this is required after start_array() and
               start_object() SAX events, because otherwise we would call the
               callback function with an empty array or object, respectively.

    @invariant If the ref stack is empty, then the passed value will be the new
               root.
    @invariant If the ref stack contains a value, then it is an array or an
               object to which we can add elements

    @return pair of boolean (whether value should be kept) and pointer (to the
            passed value in the ref_stack hierarchy; nullptr if not kept)
    */
    template<typename Value>
    std::pair<bool, BasicJsonType*> handle_value(Value&& v, const bool skip_callback = false)
    {
        JSON_ASSERT(!keep_stack.empty());

        // do not handle this value if we know it would be added to a discarded
        // container
        if (!keep_stack.back())
        {
            return {false, nullptr};
        }

        // create value
        auto value = BasicJsonType(std::forward<Value>(v));

        // check callback
        const bool keep = skip_callback || callback(static_cast<int>(ref_stack.size()), parse_event_t::value, value);

        // do not handle this value if we just learnt it shall be discarded
        if (!keep)
        {
            return {false, nullptr};
        }

        if (ref_stack.empty())
        {
            root = std::move(value);
            return {true, & root};
        }

        // skip this value if we already decided to skip the parent
        // (https://github.com/nlohmann/json/issues/971#issuecomment-413678360)
        if (!ref_stack.back())
        {
            return {false, nullptr};
        }

        // we now only expect arrays and objects
        JSON_ASSERT(ref_stack.back()->is_array() || ref_stack.back()->is_object());

        // array
        if (ref_stack.back()->is_array())
        {
            ref_stack.back()->m_data.m_value.array->emplace_back(std::move(value));
            return {true, & (ref_stack.back()->m_data.m_value.array->back())};
        }

        // object
        JSON_ASSERT(ref_stack.back()->is_object());
        // check if we should store an element for the current key
        JSON_ASSERT(!key_keep_stack.empty());
        const bool store_element = key_keep_stack.back();
        key_keep_stack.pop_back();

        if (!store_element)
        {
            return {false, nullptr};
        }

        JSON_ASSERT(object_element);
        *object_element = std::move(value);
        return {true, object_element};
    }

    /// the parsed JSON value
    BasicJsonType& root;
    /// stack to model hierarchy of values
    std::vector<BasicJsonType*> ref_stack {};
    /// stack to manage which values to keep
    std::vector<bool> keep_stack {}; // NOLINT(readability-redundant-member-init)
    /// stack to manage which object keys to keep
    std::vector<bool> key_keep_stack {}; // NOLINT(readability-redundant-member-init)
    /// helper to hold the reference for the next object element
    BasicJsonType* object_element = nullptr;
    /// whether a syntax error occurred
    bool errored = false;
    /// callback function
    const parser_callback_t callback = nullptr;
    /// whether to throw exceptions in case of errors
    const bool allow_exceptions = true;
    /// a discarded value for the callback
    BasicJsonType discarded = BasicJsonType::value_t::discarded;
};

template<typename BasicJsonType>
class json_sax_acceptor
{
  public:
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;

    bool null()
    {
        return true;
    }

    bool boolean(bool /*unused*/)
    {
        return true;
    }

    bool number_integer(number_integer_t /*unused*/)
    {
        return true;
    }

    bool number_unsigned(number_unsigned_t /*unused*/)
    {
        return true;
    }

    bool number_float(number_float_t /*unused*/, const string_t& /*unused*/)
    {
        return true;
    }

    bool string(string_t& /*unused*/)
    {
        return true;
    }

    bool binary(binary_t& /*unused*/)
    {
        return true;
    }

    bool start_object(std::size_t /*unused*/ = static_cast<std::size_t>(-1))
    {
        return true;
    }

    bool key(string_t& /*unused*/)
    {
        return true;
    }

    bool end_object()
    {
        return true;
    }

    bool start_array(std::size_t /*unused*/ = static_cast<std::size_t>(-1))
    {
        return true;
    }

    bool end_array()
    {
        return true;
    }

    bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const detail::exception& /*unused*/)
    {
        return false;
    }
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/input/lexer.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <array> // array
#include <clocale> // localeconv
#include <cstddef> // size_t
#include <cstdio> // snprintf
#include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
#include <initializer_list> // initializer_list
#include <string> // char_traits, string
#include <utility> // move
#include <vector> // vector

// #include <nlohmann/detail/input/input_adapters.hpp>

// #include <nlohmann/detail/input/position_t.hpp>

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/type_traits.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

///////////
// lexer //
///////////

template<typename BasicJsonType>
class lexer_base
{
  public:
    /// token types for the parser
    enum class token_type
    {
        uninitialized,    ///< indicating the scanner is uninitialized
        literal_true,     ///< the `true` literal
        literal_false,    ///< the `false` literal
        literal_null,     ///< the `null` literal
        value_string,     ///< a string -- use get_string() for actual value
        value_unsigned,   ///< an unsigned integer -- use get_number_unsigned() for actual value
        value_integer,    ///< a signed integer -- use get_number_integer() for actual value
        value_float,      ///< an floating point number -- use get_number_float() for actual value
        begin_array,      ///< the character for array begin `[`
        begin_object,     ///< the character for object begin `{`
        end_array,        ///< the character for array end `]`
        end_object,       ///< the character for object end `}`
        name_separator,   ///< the name separator `:`
        value_separator,  ///< the value separator `,`
        parse_error,      ///< indicating a parse error
        end_of_input,     ///< indicating the end of the input buffer
        literal_or_value  ///< a literal or the begin of a value (only for diagnostics)
    };

    /// return name of values of type token_type (only used for errors)
    JSON_HEDLEY_RETURNS_NON_NULL
    JSON_HEDLEY_CONST
    static const char* token_type_name(const token_type t) noexcept
    {
        switch (t)
        {
            case token_type::uninitialized:
                return "<uninitialized>";
            case token_type::literal_true:
                return "true literal";
            case token_type::literal_false:
                return "false literal";
            case token_type::literal_null:
                return "null literal";
            case token_type::value_string:
                return "string literal";
            case token_type::value_unsigned:
            case token_type::value_integer:
            case token_type::value_float:
                return "number literal";
            case token_type::begin_array:
                return "'['";
            case token_type::begin_object:
                return "'{'";
            case token_type::end_array:
                return "']'";
            case token_type::end_object:
                return "'}'";
            case token_type::name_separator:
                return "':'";
            case token_type::value_separator:
                return "','";
            case token_type::parse_error:
                return "<parse error>";
            case token_type::end_of_input:
                return "end of input";
            case token_type::literal_or_value:
                return "'[', '{', or a literal";
            // LCOV_EXCL_START
            default: // catch non-enum values
                return "unknown token";
                // LCOV_EXCL_STOP
        }
    }
};
/*!
@brief lexical analysis

This class organizes the lexical analysis during JSON deserialization.
*/
template<typename BasicJsonType, typename InputAdapterType>
class lexer : public lexer_base<BasicJsonType>
{
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using char_type = typename InputAdapterType::char_type;
    using char_int_type = typename char_traits<char_type>::int_type;

  public:
    using token_type = typename lexer_base<BasicJsonType>::token_type;

    explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) noexcept
        : ia(std::move(adapter))
        , ignore_comments(ignore_comments_)
        , decimal_point_char(static_cast<char_int_type>(get_decimal_point()))
    {}

    // delete because of pointer members
    lexer(const lexer&) = delete;
    lexer(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    lexer& operator=(lexer&) = delete;
    lexer& operator=(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    ~lexer() = default;

  private:
    /////////////////////
    // locales
    /////////////////////

    /// return the locale-dependent decimal point
    JSON_HEDLEY_PURE
    static char get_decimal_point() noexcept
    {
        const auto* loc = localeconv();
        JSON_ASSERT(loc != nullptr);
        return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point);
    }

    /////////////////////
    // scan functions
    /////////////////////

    /*!
    @brief get codepoint from 4 hex characters following `\u`

    For input "\u c1 c2 c3 c4" the codepoint is:
      (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
    = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)

    Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
    must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
    conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
    between the ASCII value of the character and the desired integer value.

    @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
            non-hex character)
    */
    int get_codepoint()
    {
        // this function only makes sense after reading `\u`
        JSON_ASSERT(current == 'u');
        int codepoint = 0;

        const auto factors = { 12u, 8u, 4u, 0u };
        for (const auto factor : factors)
        {
            get();

            if (current >= '0' && current <= '9')
            {
                codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor);
            }
            else if (current >= 'A' && current <= 'F')
            {
                codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor);
            }
            else if (current >= 'a' && current <= 'f')
            {
                codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor);
            }
            else
            {
                return -1;
            }
        }

        JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF);
        return codepoint;
    }

    /*!
    @brief check if the next byte(s) are inside a given range

    Adds the current byte and, for each passed range, reads a new byte and
    checks if it is inside the range. If a violation was detected, set up an
    error message and return false. Otherwise, return true.

    @param[in] ranges  list of integers; interpreted as list of pairs of
                       inclusive lower and upper bound, respectively

    @pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
         1, 2, or 3 pairs. This precondition is enforced by an assertion.

    @return true if and only if no range violation was detected
    */
    bool next_byte_in_range(std::initializer_list<char_int_type> ranges)
    {
        JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6);
        add(current);

        for (auto range = ranges.begin(); range != ranges.end(); ++range)
        {
            get();
            if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) // NOLINT(bugprone-inc-dec-in-conditions)
            {
                add(current);
            }
            else
            {
                error_message = "invalid string: ill-formed UTF-8 byte";
                return false;
            }
        }

        return true;
    }

    /*!
    @brief scan a string literal

    This function scans a string according to Sect. 7 of RFC 8259. While
    scanning, bytes are escaped and copied into buffer token_buffer. Then the
    function returns successfully, token_buffer is *not* null-terminated (as it
    may contain \0 bytes), and token_buffer.size() is the number of bytes in the
    string.

    @return token_type::value_string if string could be successfully scanned,
            token_type::parse_error otherwise

    @note In case of errors, variable error_message contains a textual
          description.
    */
    token_type scan_string()
    {
        // reset token_buffer (ignore opening quote)
        reset();

        // we entered the function by reading an open quote
        JSON_ASSERT(current == '\"');

        while (true)
        {
            // get next character
            switch (get())
            {
                // end of file while parsing string
                case char_traits<char_type>::eof():
                {
                    error_message = "invalid string: missing closing quote";
                    return token_type::parse_error;
                }

                // closing quote
                case '\"':
                {
                    return token_type::value_string;
                }

                // escapes
                case '\\':
                {
                    switch (get())
                    {
                        // quotation mark
                        case '\"':
                            add('\"');
                            break;
                        // reverse solidus
                        case '\\':
                            add('\\');
                            break;
                        // solidus
                        case '/':
                            add('/');
                            break;
                        // backspace
                        case 'b':
                            add('\b');
                            break;
                        // form feed
                        case 'f':
                            add('\f');
                            break;
                        // line feed
                        case 'n':
                            add('\n');
                            break;
                        // carriage return
                        case 'r':
                            add('\r');
                            break;
                        // tab
                        case 't':
                            add('\t');
                            break;

                        // unicode escapes
                        case 'u':
                        {
                            const int codepoint1 = get_codepoint();
                            int codepoint = codepoint1; // start with codepoint1

                            if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
                            {
                                error_message = "invalid string: '\\u' must be followed by 4 hex digits";
                                return token_type::parse_error;
                            }

                            // check if code point is a high surrogate
                            if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
                            {
                                // expect next \uxxxx entry
                                if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u'))
                                {
                                    const int codepoint2 = get_codepoint();

                                    if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
                                    {
                                        error_message = "invalid string: '\\u' must be followed by 4 hex digits";
                                        return token_type::parse_error;
                                    }

                                    // check if codepoint2 is a low surrogate
                                    if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
                                    {
                                        // overwrite codepoint
                                        codepoint = static_cast<int>(
                                                        // high surrogate occupies the most significant 22 bits
                                                        (static_cast<unsigned int>(codepoint1) << 10u)
                                                        // low surrogate occupies the least significant 15 bits
                                                        + static_cast<unsigned int>(codepoint2)
                                                        // there is still the 0xD800, 0xDC00 and 0x10000 noise
                                                        // in the result, so we have to subtract with:
                                                        // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
                                                        - 0x35FDC00u);
                                    }
                                    else
                                    {
                                        error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
                                        return token_type::parse_error;
                                    }
                                }
                                else
                                {
                                    error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
                                    return token_type::parse_error;
                                }
                            }
                            else
                            {
                                if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))
                                {
                                    error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
                                    return token_type::parse_error;
                                }
                            }

                            // result of the above calculation yields a proper codepoint
                            JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);

                            // translate codepoint into bytes
                            if (codepoint < 0x80)
                            {
                                // 1-byte characters: 0xxxxxxx (ASCII)
                                add(static_cast<char_int_type>(codepoint));
                            }
                            else if (codepoint <= 0x7FF)
                            {
                                // 2-byte characters: 110xxxxx 10xxxxxx
                                add(static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
                                add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
                            }
                            else if (codepoint <= 0xFFFF)
                            {
                                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
                                add(static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
                                add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
                                add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
                            }
                            else
                            {
                                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                                add(static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
                                add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
                                add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
                                add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
                            }

                            break;
                        }

                        // other characters after escape
                        default:
                            error_message = "invalid string: forbidden character after backslash";
                            return token_type::parse_error;
                    }

                    break;
                }

                // invalid control characters
                case 0x00:
                {
                    error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
                    return token_type::parse_error;
                }

                case 0x01:
                {
                    error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
                    return token_type::parse_error;
                }

                case 0x02:
                {
                    error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
                    return token_type::parse_error;
                }

                case 0x03:
                {
                    error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
                    return token_type::parse_error;
                }

                case 0x04:
                {
                    error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
                    return token_type::parse_error;
                }

                case 0x05:
                {
                    error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
                    return token_type::parse_error;
                }

                case 0x06:
                {
                    error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
                    return token_type::parse_error;
                }

                case 0x07:
                {
                    error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
                    return token_type::parse_error;
                }

                case 0x08:
                {
                    error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
                    return token_type::parse_error;
                }

                case 0x09:
                {
                    error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
                    return token_type::parse_error;
                }

                case 0x0A:
                {
                    error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
                    return token_type::parse_error;
                }

                case 0x0B:
                {
                    error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
                    return token_type::parse_error;
                }

                case 0x0C:
                {
                    error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
                    return token_type::parse_error;
                }

                case 0x0D:
                {
                    error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
                    return token_type::parse_error;
                }

                case 0x0E:
                {
                    error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
                    return token_type::parse_error;
                }

                case 0x0F:
                {
                    error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
                    return token_type::parse_error;
                }

                case 0x10:
                {
                    error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
                    return token_type::parse_error;
                }

                case 0x11:
                {
                    error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
                    return token_type::parse_error;
                }

                case 0x12:
                {
                    error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
                    return token_type::parse_error;
                }

                case 0x13:
                {
                    error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
                    return token_type::parse_error;
                }

                case 0x14:
                {
                    error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
                    return token_type::parse_error;
                }

                case 0x15:
                {
                    error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
                    return token_type::parse_error;
                }

                case 0x16:
                {
                    error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
                    return token_type::parse_error;
                }

                case 0x17:
                {
                    error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
                    return token_type::parse_error;
                }

                case 0x18:
                {
                    error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
                    return token_type::parse_error;
                }

                case 0x19:
                {
                    error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
                    return token_type::parse_error;
                }

                case 0x1A:
                {
                    error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
                    return token_type::parse_error;
                }

                case 0x1B:
                {
                    error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
                    return token_type::parse_error;
                }

                case 0x1C:
                {
                    error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
                    return token_type::parse_error;
                }

                case 0x1D:
                {
                    error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
                    return token_type::parse_error;
                }

                case 0x1E:
                {
                    error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
                    return token_type::parse_error;
                }

                case 0x1F:
                {
                    error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
                    return token_type::parse_error;
                }

                // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace))
                case 0x20:
                case 0x21:
                case 0x23:
                case 0x24:
                case 0x25:
                case 0x26:
                case 0x27:
                case 0x28:
                case 0x29:
                case 0x2A:
                case 0x2B:
                case 0x2C:
                case 0x2D:
                case 0x2E:
                case 0x2F:
                case 0x30:
                case 0x31:
                case 0x32:
                case 0x33:
                case 0x34:
                case 0x35:
                case 0x36:
                case 0x37:
                case 0x38:
                case 0x39:
                case 0x3A:
                case 0x3B:
                case 0x3C:
                case 0x3D:
                case 0x3E:
                case 0x3F:
                case 0x40:
                case 0x41:
                case 0x42:
                case 0x43:
                case 0x44:
                case 0x45:
                case 0x46:
                case 0x47:
                case 0x48:
                case 0x49:
                case 0x4A:
                case 0x4B:
                case 0x4C:
                case 0x4D:
                case 0x4E:
                case 0x4F:
                case 0x50:
                case 0x51:
                case 0x52:
                case 0x53:
                case 0x54:
                case 0x55:
                case 0x56:
                case 0x57:
                case 0x58:
                case 0x59:
                case 0x5A:
                case 0x5B:
                case 0x5D:
                case 0x5E:
                case 0x5F:
                case 0x60:
                case 0x61:
                case 0x62:
                case 0x63:
                case 0x64:
                case 0x65:
                case 0x66:
                case 0x67:
                case 0x68:
                case 0x69:
                case 0x6A:
                case 0x6B:
                case 0x6C:
                case 0x6D:
                case 0x6E:
                case 0x6F:
                case 0x70:
                case 0x71:
                case 0x72:
                case 0x73:
                case 0x74:
                case 0x75:
                case 0x76:
                case 0x77:
                case 0x78:
                case 0x79:
                case 0x7A:
                case 0x7B:
                case 0x7C:
                case 0x7D:
                case 0x7E:
                case 0x7F:
                {
                    add(current);
                    break;
                }

                // U+0080..U+07FF: bytes C2..DF 80..BF
                case 0xC2:
                case 0xC3:
                case 0xC4:
                case 0xC5:
                case 0xC6:
                case 0xC7:
                case 0xC8:
                case 0xC9:
                case 0xCA:
                case 0xCB:
                case 0xCC:
                case 0xCD:
                case 0xCE:
                case 0xCF:
                case 0xD0:
                case 0xD1:
                case 0xD2:
                case 0xD3:
                case 0xD4:
                case 0xD5:
                case 0xD6:
                case 0xD7:
                case 0xD8:
                case 0xD9:
                case 0xDA:
                case 0xDB:
                case 0xDC:
                case 0xDD:
                case 0xDE:
                case 0xDF:
                {
                    if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
                case 0xE0:
                {
                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
                // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
                case 0xE1:
                case 0xE2:
                case 0xE3:
                case 0xE4:
                case 0xE5:
                case 0xE6:
                case 0xE7:
                case 0xE8:
                case 0xE9:
                case 0xEA:
                case 0xEB:
                case 0xEC:
                case 0xEE:
                case 0xEF:
                {
                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // U+D000..U+D7FF: bytes ED 80..9F 80..BF
                case 0xED:
                {
                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
                case 0xF0:
                {
                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
                case 0xF1:
                case 0xF2:
                case 0xF3:
                {
                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
                case 0xF4:
                {
                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // remaining bytes (80..C1 and F5..FF) are ill-formed
                default:
                {
                    error_message = "invalid string: ill-formed UTF-8 byte";
                    return token_type::parse_error;
                }
            }
        }
    }

    /*!
     * @brief scan a comment
     * @return whether comment could be scanned successfully
     */
    bool scan_comment()
    {
        switch (get())
        {
            // single-line comments skip input until a newline or EOF is read
            case '/':
            {
                while (true)
                {
                    switch (get())
                    {
                        case '\n':
                        case '\r':
                        case char_traits<char_type>::eof():
                        case '\0':
                            return true;

                        default:
                            break;
                    }
                }
            }

            // multi-line comments skip input until */ is read
            case '*':
            {
                while (true)
                {
                    switch (get())
                    {
                        case char_traits<char_type>::eof():
                        case '\0':
                        {
                            error_message = "invalid comment; missing closing '*/'";
                            return false;
                        }

                        case '*':
                        {
                            switch (get())
                            {
                                case '/':
                                    return true;

                                default:
                                {
                                    unget();
                                    continue;
                                }
                            }
                        }

                        default:
                            continue;
                    }
                }
            }

            // unexpected character after reading '/'
            default:
            {
                error_message = "invalid comment; expecting '/' or '*' after '/'";
                return false;
            }
        }
    }

    JSON_HEDLEY_NON_NULL(2)
    static void strtof(float& f, const char* str, char** endptr) noexcept
    {
        f = std::strtof(str, endptr);
    }

    JSON_HEDLEY_NON_NULL(2)
    static void strtof(double& f, const char* str, char** endptr) noexcept
    {
        f = std::strtod(str, endptr);
    }

    JSON_HEDLEY_NON_NULL(2)
    static void strtof(long double& f, const char* str, char** endptr) noexcept
    {
        f = std::strtold(str, endptr);
    }

    /*!
    @brief scan a number literal

    This function scans a string according to Sect. 6 of RFC 8259.

    The function is realized with a deterministic finite state machine derived
    from the grammar described in RFC 8259. Starting in state "init", the
    input is read and used to determined the next state. Only state "done"
    accepts the number. State "error" is a trap state to model errors. In the
    table below, "anything" means any character but the ones listed before.

    state    | 0        | 1-9      | e E      | +       | -       | .        | anything
    ---------|----------|----------|----------|---------|---------|----------|-----------
    init     | zero     | any1     | [error]  | [error] | minus   | [error]  | [error]
    minus    | zero     | any1     | [error]  | [error] | [error] | [error]  | [error]
    zero     | done     | done     | exponent | done    | done    | decimal1 | done
    any1     | any1     | any1     | exponent | done    | done    | decimal1 | done
    decimal1 | decimal2 | decimal2 | [error]  | [error] | [error] | [error]  | [error]
    decimal2 | decimal2 | decimal2 | exponent | done    | done    | done     | done
    exponent | any2     | any2     | [error]  | sign    | sign    | [error]  | [error]
    sign     | any2     | any2     | [error]  | [error] | [error] | [error]  | [error]
    any2     | any2     | any2     | done     | done    | done    | done     | done

    The state machine is realized with one label per state (prefixed with
    "scan_number_") and `goto` statements between them. The state machine
    contains cycles, but any cycle can be left when EOF is read. Therefore,
    the function is guaranteed to terminate.

    During scanning, the read bytes are stored in token_buffer. This string is
    then converted to a signed integer, an unsigned integer, or a
    floating-point number.

    @return token_type::value_unsigned, token_type::value_integer, or
            token_type::value_float if number could be successfully scanned,
            token_type::parse_error otherwise

    @note The scanner is independent of the current locale. Internally, the
          locale's decimal point is used instead of `.` to work with the
          locale-dependent converters.
    */
    token_type scan_number()  // lgtm [cpp/use-of-goto] `goto` is used in this function to implement the number-parsing state machine described above. By design, any finite input will eventually reach the "done" state or return token_type::parse_error. In each intermediate state, 1 byte of the input is appended to the token_buffer vector, and only the already initialized variables token_buffer, number_type, and error_message are manipulated.
    {
        // reset token_buffer to store the number's bytes
        reset();

        // the type of the parsed number; initially set to unsigned; will be
        // changed if minus sign, decimal point or exponent is read
        token_type number_type = token_type::value_unsigned;

        // state (init): we just found out we need to scan a number
        switch (current)
        {
            case '-':
            {
                add(current);
                goto scan_number_minus;
            }

            case '0':
            {
                add(current);
                goto scan_number_zero;
            }

            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any1;
            }

            // all other characters are rejected outside scan_number()
            default:            // LCOV_EXCL_LINE
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
        }

scan_number_minus:
        // state: we just parsed a leading minus sign
        number_type = token_type::value_integer;
        switch (get())
        {
            case '0':
            {
                add(current);
                goto scan_number_zero;
            }

            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any1;
            }

            default:
            {
                error_message = "invalid number; expected digit after '-'";
                return token_type::parse_error;
            }
        }

scan_number_zero:
        // state: we just parse a zero (maybe with a leading minus sign)
        switch (get())
        {
            case '.':
            {
                add(decimal_point_char);
                goto scan_number_decimal1;
            }

            case 'e':
            case 'E':
            {
                add(current);
                goto scan_number_exponent;
            }

            default:
                goto scan_number_done;
        }

scan_number_any1:
        // state: we just parsed a number 0-9 (maybe with a leading minus sign)
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any1;
            }

            case '.':
            {
                add(decimal_point_char);
                goto scan_number_decimal1;
            }

            case 'e':
            case 'E':
            {
                add(current);
                goto scan_number_exponent;
            }

            default:
                goto scan_number_done;
        }

scan_number_decimal1:
        // state: we just parsed a decimal point
        number_type = token_type::value_float;
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_decimal2;
            }

            default:
            {
                error_message = "invalid number; expected digit after '.'";
                return token_type::parse_error;
            }
        }

scan_number_decimal2:
        // we just parsed at least one number after a decimal point
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_decimal2;
            }

            case 'e':
            case 'E':
            {
                add(current);
                goto scan_number_exponent;
            }

            default:
                goto scan_number_done;
        }

scan_number_exponent:
        // we just parsed an exponent
        number_type = token_type::value_float;
        switch (get())
        {
            case '+':
            case '-':
            {
                add(current);
                goto scan_number_sign;
            }

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any2;
            }

            default:
            {
                error_message =
                    "invalid number; expected '+', '-', or digit after exponent";
                return token_type::parse_error;
            }
        }

scan_number_sign:
        // we just parsed an exponent sign
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any2;
            }

            default:
            {
                error_message = "invalid number; expected digit after exponent sign";
                return token_type::parse_error;
            }
        }

scan_number_any2:
        // we just parsed a number after the exponent or exponent sign
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any2;
            }

            default:
                goto scan_number_done;
        }

scan_number_done:
        // unget the character after the number (we only read it to know that
        // we are done scanning a number)
        unget();

        char* endptr = nullptr; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
        errno = 0;

        // try to parse integers first and fall back to floats
        if (number_type == token_type::value_unsigned)
        {
            const auto x = std::strtoull(token_buffer.data(), &endptr, 10);

            // we checked the number format before
            JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());

            if (errno == 0)
            {
                value_unsigned = static_cast<number_unsigned_t>(x);
                if (value_unsigned == x)
                {
                    return token_type::value_unsigned;
                }
            }
        }
        else if (number_type == token_type::value_integer)
        {
            const auto x = std::strtoll(token_buffer.data(), &endptr, 10);

            // we checked the number format before
            JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());

            if (errno == 0)
            {
                value_integer = static_cast<number_integer_t>(x);
                if (value_integer == x)
                {
                    return token_type::value_integer;
                }
            }
        }

        // this code is reached if we parse a floating-point number or if an
        // integer conversion above failed
        strtof(value_float, token_buffer.data(), &endptr);

        // we checked the number format before
        JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());

        return token_type::value_float;
    }

    /*!
    @param[in] literal_text  the literal text to expect
    @param[in] length        the length of the passed literal text
    @param[in] return_type   the token type to return on success
    */
    JSON_HEDLEY_NON_NULL(2)
    token_type scan_literal(const char_type* literal_text, const std::size_t length,
                            token_type return_type)
    {
        JSON_ASSERT(char_traits<char_type>::to_char_type(current) == literal_text[0]);
        for (std::size_t i = 1; i < length; ++i)
        {
            if (JSON_HEDLEY_UNLIKELY(char_traits<char_type>::to_char_type(get()) != literal_text[i]))
            {
                error_message = "invalid literal";
                return token_type::parse_error;
            }
        }
        return return_type;
    }

    /////////////////////
    // input management
    /////////////////////

    /// reset token_buffer; current character is beginning of token
    void reset() noexcept
    {
        token_buffer.clear();
        token_string.clear();
        token_string.push_back(char_traits<char_type>::to_char_type(current));
    }

    /*
    @brief get next character from the input

    This function provides the interface to the used input adapter. It does
    not throw in case the input reached EOF, but returns a
    `char_traits<char>::eof()` in that case.  Stores the scanned characters
    for use in error messages.

    @return character read from the input
    */
    char_int_type get()
    {
        ++position.chars_read_total;
        ++position.chars_read_current_line;

        if (next_unget)
        {
            // just reset the next_unget variable and work with current
            next_unget = false;
        }
        else
        {
            current = ia.get_character();
        }

        if (JSON_HEDLEY_LIKELY(current != char_traits<char_type>::eof()))
        {
            token_string.push_back(char_traits<char_type>::to_char_type(current));
        }

        if (current == '\n')
        {
            ++position.lines_read;
            position.chars_read_current_line = 0;
        }

        return current;
    }

    /*!
    @brief unget current character (read it again on next get)

    We implement unget by setting variable next_unget to true. The input is not
    changed - we just simulate ungetting by modifying chars_read_total,
    chars_read_current_line, and token_string. The next call to get() will
    behave as if the unget character is read again.
    */
    void unget()
    {
        next_unget = true;

        --position.chars_read_total;

        // in case we "unget" a newline, we have to also decrement the lines_read
        if (position.chars_read_current_line == 0)
        {
            if (position.lines_read > 0)
            {
                --position.lines_read;
            }
        }
        else
        {
            --position.chars_read_current_line;
        }

        if (JSON_HEDLEY_LIKELY(current != char_traits<char_type>::eof()))
        {
            JSON_ASSERT(!token_string.empty());
            token_string.pop_back();
        }
    }

    /// add a character to token_buffer
    void add(char_int_type c)
    {
        token_buffer.push_back(static_cast<typename string_t::value_type>(c));
    }

  public:
    /////////////////////
    // value getters
    /////////////////////

    /// return integer value
    constexpr number_integer_t get_number_integer() const noexcept
    {
        return value_integer;
    }

    /// return unsigned integer value
    constexpr number_unsigned_t get_number_unsigned() const noexcept
    {
        return value_unsigned;
    }

    /// return floating-point value
    constexpr number_float_t get_number_float() const noexcept
    {
        return value_float;
    }

    /// return current string value (implicitly resets the token; useful only once)
    string_t& get_string()
    {
        return token_buffer;
    }

    /////////////////////
    // diagnostics
    /////////////////////

    /// return position of last read token
    constexpr position_t get_position() const noexcept
    {
        return position;
    }

    /// return the last read token (for errors only).  Will never contain EOF
    /// (an arbitrary value that is not a valid char value, often -1), because
    /// 255 may legitimately occur.  May contain NUL, which should be escaped.
    std::string get_token_string() const
    {
        // escape control characters
        std::string result;
        for (const auto c : token_string)
        {
            if (static_cast<unsigned char>(c) <= '\x1F')
            {
                // escape control characters
                std::array<char, 9> cs{{}};
                static_cast<void>((std::snprintf)(cs.data(), cs.size(), "<U+%.4X>", static_cast<unsigned char>(c))); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
                result += cs.data();
            }
            else
            {
                // add character as is
                result.push_back(static_cast<std::string::value_type>(c));
            }
        }

        return result;
    }

    /// return syntax error message
    JSON_HEDLEY_RETURNS_NON_NULL
    constexpr const char* get_error_message() const noexcept
    {
        return error_message;
    }

    /////////////////////
    // actual scanner
    /////////////////////

    /*!
    @brief skip the UTF-8 byte order mark
    @return true iff there is no BOM or the correct BOM has been skipped
    */
    bool skip_bom()
    {
        if (get() == 0xEF)
        {
            // check if we completely parse the BOM
            return get() == 0xBB && get() == 0xBF;
        }

        // the first character is not the beginning of the BOM; unget it to
        // process is later
        unget();
        return true;
    }

    void skip_whitespace()
    {
        do
        {
            get();
        }
        while (current == ' ' || current == '\t' || current == '\n' || current == '\r');
    }

    token_type scan()
    {
        // initially, skip the BOM
        if (position.chars_read_total == 0 && !skip_bom())
        {
            error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
            return token_type::parse_error;
        }

        // read next character and ignore whitespace
        skip_whitespace();

        // ignore comments
        while (ignore_comments && current == '/')
        {
            if (!scan_comment())
            {
                return token_type::parse_error;
            }

            // skip following whitespace
            skip_whitespace();
        }

        switch (current)
        {
            // structural characters
            case '[':
                return token_type::begin_array;
            case ']':
                return token_type::end_array;
            case '{':
                return token_type::begin_object;
            case '}':
                return token_type::end_object;
            case ':':
                return token_type::name_separator;
            case ',':
                return token_type::value_separator;

            // literals
            case 't':
            {
                std::array<char_type, 4> true_literal = {{static_cast<char_type>('t'), static_cast<char_type>('r'), static_cast<char_type>('u'), static_cast<char_type>('e')}};
                return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true);
            }
            case 'f':
            {
                std::array<char_type, 5> false_literal = {{static_cast<char_type>('f'), static_cast<char_type>('a'), static_cast<char_type>('l'), static_cast<char_type>('s'), static_cast<char_type>('e')}};
                return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false);
            }
            case 'n':
            {
                std::array<char_type, 4> null_literal = {{static_cast<char_type>('n'), static_cast<char_type>('u'), static_cast<char_type>('l'), static_cast<char_type>('l')}};
                return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null);
            }

            // string
            case '\"':
                return scan_string();

            // number
            case '-':
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                return scan_number();

            // end of input (the null byte is needed when parsing from
            // string literals)
            case '\0':
            case char_traits<char_type>::eof():
                return token_type::end_of_input;

            // error
            default:
                error_message = "invalid literal";
                return token_type::parse_error;
        }
    }

  private:
    /// input adapter
    InputAdapterType ia;

    /// whether comments should be ignored (true) or signaled as errors (false)
    const bool ignore_comments = false;

    /// the current character
    char_int_type current = char_traits<char_type>::eof();

    /// whether the next get() call should just return current
    bool next_unget = false;

    /// the start position of the current token
    position_t position {};

    /// raw input token string (for error messages)
    std::vector<char_type> token_string {};

    /// buffer for variable-length tokens (numbers, strings)
    string_t token_buffer {};

    /// a description of occurred lexer errors
    const char* error_message = "";

    // number values
    number_integer_t value_integer = 0;
    number_unsigned_t value_unsigned = 0;
    number_float_t value_float = 0;

    /// the decimal point
    const char_int_type decimal_point_char = '.';
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/is_sax.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <cstdint> // size_t
#include <utility> // declval
#include <string> // string

// #include <nlohmann/detail/abi_macros.hpp>

// #include <nlohmann/detail/meta/detected.hpp>

// #include <nlohmann/detail/meta/type_traits.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

template<typename T>
using null_function_t = decltype(std::declval<T&>().null());

template<typename T>
using boolean_function_t =
    decltype(std::declval<T&>().boolean(std::declval<bool>()));

template<typename T, typename Integer>
using number_integer_function_t =
    decltype(std::declval<T&>().number_integer(std::declval<Integer>()));

template<typename T, typename Unsigned>
using number_unsigned_function_t =
    decltype(std::declval<T&>().number_unsigned(std::declval<Unsigned>()));

template<typename T, typename Float, typename String>
using number_float_function_t = decltype(std::declval<T&>().number_float(
                                    std::declval<Float>(), std::declval<const String&>()));

template<typename T, typename String>
using string_function_t =
    decltype(std::declval<T&>().string(std::declval<String&>()));

template<typename T, typename Binary>
using binary_function_t =
    decltype(std::declval<T&>().binary(std::declval<Binary&>()));

template<typename T>
using start_object_function_t =
    decltype(std::declval<T&>().start_object(std::declval<std::size_t>()));

template<typename T, typename String>
using key_function_t =
    decltype(std::declval<T&>().key(std::declval<String&>()));

template<typename T>
using end_object_function_t = decltype(std::declval<T&>().end_object());

template<typename T>
using start_array_function_t =
    decltype(std::declval<T&>().start_array(std::declval<std::size_t>()));

template<typename T>
using end_array_function_t = decltype(std::declval<T&>().end_array());

template<typename T, typename Exception>
using parse_error_function_t = decltype(std::declval<T&>().parse_error(
        std::declval<std::size_t>(), std::declval<const std::string&>(),
        std::declval<const Exception&>()));

template<typename SAX, typename BasicJsonType>
struct is_sax
{
  private:
    static_assert(is_basic_json<BasicJsonType>::value,
                  "BasicJsonType must be of type basic_json<...>");

    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;
    using exception_t = typename BasicJsonType::exception;

  public:
    static constexpr bool value =
        is_detected_exact<bool, null_function_t, SAX>::value &&
        is_detected_exact<bool, boolean_function_t, SAX>::value &&
        is_detected_exact<bool, number_integer_function_t, SAX, number_integer_t>::value &&
        is_detected_exact<bool, number_unsigned_function_t, SAX, number_unsigned_t>::value &&
        is_detected_exact<bool, number_float_function_t, SAX, number_float_t, string_t>::value &&
        is_detected_exact<bool, string_function_t, SAX, string_t>::value &&
        is_detected_exact<bool, binary_function_t, SAX, binary_t>::value &&
        is_detected_exact<bool, start_object_function_t, SAX>::value &&
        is_detected_exact<bool, key_function_t, SAX, string_t>::value &&
        is_detected_exact<bool, end_object_function_t, SAX>::value &&
        is_detected_exact<bool, start_array_function_t, SAX>::value &&
        is_detected_exact<bool, end_array_function_t, SAX>::value &&
        is_detected_exact<bool, parse_error_function_t, SAX, exception_t>::value;
};

template<typename SAX, typename BasicJsonType>
struct is_sax_static_asserts
{
  private:
    static_assert(is_basic_json<BasicJsonType>::value,
                  "BasicJsonType must be of type basic_json<...>");

    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;
    using exception_t = typename BasicJsonType::exception;

  public:
    static_assert(is_detected_exact<bool, null_function_t, SAX>::value,
                  "Missing/invalid function: bool null()");
    static_assert(is_detected_exact<bool, boolean_function_t, SAX>::value,
                  "Missing/invalid function: bool boolean(bool)");
    static_assert(is_detected_exact<bool, boolean_function_t, SAX>::value,
                  "Missing/invalid function: bool boolean(bool)");
    static_assert(
        is_detected_exact<bool, number_integer_function_t, SAX,
        number_integer_t>::value,
        "Missing/invalid function: bool number_integer(number_integer_t)");
    static_assert(
        is_detected_exact<bool, number_unsigned_function_t, SAX,
        number_unsigned_t>::value,
        "Missing/invalid function: bool number_unsigned(number_unsigned_t)");
    static_assert(is_detected_exact<bool, number_float_function_t, SAX,
                  number_float_t, string_t>::value,
                  "Missing/invalid function: bool number_float(number_float_t, const string_t&)");
    static_assert(
        is_detected_exact<bool, string_function_t, SAX, string_t>::value,
        "Missing/invalid function: bool string(string_t&)");
    static_assert(
        is_detected_exact<bool, binary_function_t, SAX, binary_t>::value,
        "Missing/invalid function: bool binary(binary_t&)");
    static_assert(is_detected_exact<bool, start_object_function_t, SAX>::value,
                  "Missing/invalid function: bool start_object(std::size_t)");
    static_assert(is_detected_exact<bool, key_function_t, SAX, string_t>::value,
                  "Missing/invalid function: bool key(string_t&)");
    static_assert(is_detected_exact<bool, end_object_function_t, SAX>::value,
                  "Missing/invalid function: bool end_object()");
    static_assert(is_detected_exact<bool, start_array_function_t, SAX>::value,
                  "Missing/invalid function: bool start_array(std::size_t)");
    static_assert(is_detected_exact<bool, end_array_function_t, SAX>::value,
                  "Missing/invalid function: bool end_array()");
    static_assert(
        is_detected_exact<bool, parse_error_function_t, SAX, exception_t>::value,
        "Missing/invalid function: bool parse_error(std::size_t, const "
        "std::string&, const exception&)");
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/meta/type_traits.hpp>

// #include <nlohmann/detail/string_concat.hpp>

// #include <nlohmann/detail/value_t.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

/// how to treat CBOR tags
enum class cbor_tag_handler_t
{
    error,   ///< throw a parse_error exception in case of a tag
    ignore,  ///< ignore tags
    store    ///< store tags as binary type
};

/*!
@brief determine system byte order

@return true if and only if system's byte order is little endian

@note from https://stackoverflow.com/a/1001328/266378
*/
static inline bool little_endianness(int num = 1) noexcept
{
    return *reinterpret_cast<char*>(&num) == 1;
}

///////////////////
// binary reader //
///////////////////

/*!
@brief deserialization of CBOR, MessagePack, and UBJSON values
*/
template<typename BasicJsonType, typename InputAdapterType, typename SAX = json_sax_dom_parser<BasicJsonType>>
class binary_reader
{
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;
    using json_sax_t = SAX;
    using char_type = typename InputAdapterType::char_type;
    using char_int_type = typename char_traits<char_type>::int_type;

  public:
    /*!
    @brief create a binary reader

    @param[in] adapter  input adapter to read from
    */
    explicit binary_reader(InputAdapterType&& adapter, const input_format_t format = input_format_t::json) noexcept : ia(std::move(adapter)), input_format(format)
    {
        (void)detail::is_sax_static_asserts<SAX, BasicJsonType> {};
    }

    // make class move-only
    binary_reader(const binary_reader&) = delete;
    binary_reader(binary_reader&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    binary_reader& operator=(const binary_reader&) = delete;
    binary_reader& operator=(binary_reader&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    ~binary_reader() = default;

    /*!
    @param[in] format  the binary format to parse
    @param[in] sax_    a SAX event processor
    @param[in] strict  whether to expect the input to be consumed completed
    @param[in] tag_handler  how to treat CBOR tags

    @return whether parsing was successful
    */
    JSON_HEDLEY_NON_NULL(3)
    bool sax_parse(const input_format_t format,
                   json_sax_t* sax_,
                   const bool strict = true,
                   const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
    {
        sax = sax_;
        bool result = false;

        switch (format)
        {
            case input_format_t::bson:
                result = parse_bson_internal();
                break;

            case input_format_t::cbor:
                result = parse_cbor_internal(true, tag_handler);
                break;

            case input_format_t::msgpack:
                result = parse_msgpack_internal();
                break;

            case input_format_t::ubjson:
            case input_format_t::bjdata:
                result = parse_ubjson_internal();
                break;

            case input_format_t::json: // LCOV_EXCL_LINE
            default:            // LCOV_EXCL_LINE
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
        }

        // strict mode: next byte must be EOF
        if (result && strict)
        {
            if (input_format == input_format_t::ubjson || input_format == input_format_t::bjdata)
            {
                get_ignore_noop();
            }
            else
            {
                get();
            }

            if (JSON_HEDLEY_UNLIKELY(current != char_traits<char_type>::eof()))
            {
                return sax->parse_error(chars_read, get_token_string(), parse_error::create(110, chars_read,
                                        exception_message(input_format, concat("expected end of input; last byte: 0x", get_token_string()), "value"), nullptr));
            }
        }

        return result;
    }

  private:
    //////////
    // BSON //
    //////////

    /*!
    @brief Reads in a BSON-object and passes it to the SAX-parser.
    @return whether a valid BSON-value was passed to the SAX parser
    */
    bool parse_bson_internal()
    {
        std::int32_t document_size{};
        get_number<std::int32_t, true>(input_format_t::bson, document_size);

        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(static_cast<std::size_t>(-1))))
        {
            return false;
        }

        if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_list(/*is_array*/false)))
        {
            return false;
        }

        return sax->end_object();
    }

    /*!
    @brief Parses a C-style string from the BSON input.
    @param[in,out] result  A reference to the string variable where the read
                            string is to be stored.
    @return `true` if the \x00-byte indicating the end of the string was
             encountered before the EOF; false` indicates an unexpected EOF.
    */
    bool get_bson_cstr(string_t& result)
    {
        auto out = std::back_inserter(result);
        while (true)
        {
            get();
            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::bson, "cstring")))
            {
                return false;
            }
            if (current == 0x00)
            {
                return true;
            }
            *out++ = static_cast<typename string_t::value_type>(current);
        }
    }

    /*!
    @brief Parses a zero-terminated string of length @a len from the BSON
           input.
    @param[in] len  The length (including the zero-byte at the end) of the
                    string to be read.
    @param[in,out] result  A reference to the string variable where the read
                            string is to be stored.
    @tparam NumberType The type of the length @a len
    @pre len >= 1
    @return `true` if the string was successfully parsed
    */
    template<typename NumberType>
    bool get_bson_string(const NumberType len, string_t& result)
    {
        if (JSON_HEDLEY_UNLIKELY(len < 1))
        {
            auto last_token = get_token_string();
            return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read,
                                    exception_message(input_format_t::bson, concat("string length must be at least 1, is ", std::to_string(len)), "string"), nullptr));
        }

        return get_string(input_format_t::bson, len - static_cast<NumberType>(1), result) && get() != char_traits<char_type>::eof();
    }

    /*!
    @brief Parses a byte array input of length @a len from the BSON input.
    @param[in] len  The length of the byte array to be read.
    @param[in,out] result  A reference to the binary variable where the read
                            array is to be stored.
    @tparam NumberType The type of the length @a len
    @pre len >= 0
    @return `true` if the byte array was successfully parsed
    */
    template<typename NumberType>
    bool get_bson_binary(const NumberType len, binary_t& result)
    {
        if (JSON_HEDLEY_UNLIKELY(len < 0))
        {
            auto last_token = get_token_string();
            return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read,
                                    exception_message(input_format_t::bson, concat("byte array length cannot be negative, is ", std::to_string(len)), "binary"), nullptr));
        }

        // All BSON binary values have a subtype
        std::uint8_t subtype{};
        get_number<std::uint8_t>(input_format_t::bson, subtype);
        result.set_subtype(subtype);

        return get_binary(input_format_t::bson, len, result);
    }

    /*!
    @brief Read a BSON document element of the given @a element_type.
    @param[in] element_type The BSON element type, c.f. http://bsonspec.org/spec.html
    @param[in] element_type_parse_position The position in the input stream,
               where the `element_type` was read.
    @warning Not all BSON element types are supported yet. An unsupported
             @a element_type will give rise to a parse_error.114:
             Unsupported BSON record type 0x...
    @return whether a valid BSON-object/array was passed to the SAX parser
    */
    bool parse_bson_element_internal(const char_int_type element_type,
                                     const std::size_t element_type_parse_position)
    {
        switch (element_type)
        {
            case 0x01: // double
            {
                double number{};
                return get_number<double, true>(input_format_t::bson, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            case 0x02: // string
            {
                std::int32_t len{};
                string_t value;
                return get_number<std::int32_t, true>(input_format_t::bson, len) && get_bson_string(len, value) && sax->string(value);
            }

            case 0x03: // object
            {
                return parse_bson_internal();
            }

            case 0x04: // array
            {
                return parse_bson_array();
            }

            case 0x05: // binary
            {
                std::int32_t len{};
                binary_t value;
                return get_number<std::int32_t, true>(input_format_t::bson, len) && get_bson_binary(len, value) && sax->binary(value);
            }

            case 0x08: // boolean
            {
                return sax->boolean(get() != 0);
            }

            case 0x0A: // null
            {
                return sax->null();
            }

            case 0x10: // int32
            {
                std::int32_t value{};
                return get_number<std::int32_t, true>(input_format_t::bson, value) && sax->number_integer(value);
            }

            case 0x12: // int64
            {
                std::int64_t value{};
                return get_number<std::int64_t, true>(input_format_t::bson, value) && sax->number_integer(value);
            }

            default: // anything else not supported (yet)
            {
                std::array<char, 3> cr{{}};
                static_cast<void>((std::snprintf)(cr.data(), cr.size(), "%.2hhX", static_cast<unsigned char>(element_type))); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
                const std::string cr_str{cr.data()};
                return sax->parse_error(element_type_parse_position, cr_str,
                                        parse_error::create(114, element_type_parse_position, concat("Unsupported BSON record type 0x", cr_str), nullptr));
            }
        }
    }

    /*!
    @brief Read a BSON element list (as specified in the BSON-spec)

    The same binary layout is used for objects and arrays, hence it must be
    indicated with the argument @a is_array which one is expected
    (true --> array, false --> object).

    @param[in] is_array Determines if the element list being read is to be
                        treated as an object (@a is_array == false), or as an
                        array (@a is_array == true).
    @return whether a valid BSON-object/array was passed to the SAX parser
    */
    bool parse_bson_element_list(const bool is_array)
    {
        string_t key;

        while (auto element_type = get())
        {
            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::bson, "element list")))
            {
                return false;
            }

            const std::size_t element_type_parse_position = chars_read;
            if (JSON_HEDLEY_UNLIKELY(!get_bson_cstr(key)))
            {
                return false;
            }

            if (!is_array && !sax->key(key))
            {
                return false;
            }

            if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_internal(element_type, element_type_parse_position)))
            {
                return false;
            }

            // get_bson_cstr only appends
            key.clear();
        }

        return true;
    }

    /*!
    @brief Reads an array from the BSON input and passes it to the SAX-parser.
    @return whether a valid BSON-array was passed to the SAX parser
    */
    bool parse_bson_array()
    {
        std::int32_t document_size{};
        get_number<std::int32_t, true>(input_format_t::bson, document_size);

        if (JSON_HEDLEY_UNLIKELY(!sax->start_array(static_cast<std::size_t>(-1))))
        {
            return false;
        }

        if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_list(/*is_array*/true)))
        {
            return false;
        }

        return sax->end_array();
    }

    //////////
    // CBOR //
    //////////

    /*!
    @param[in] get_char  whether a new character should be retrieved from the
                         input (true) or whether the last read character should
                         be considered instead (false)
    @param[in] tag_handler how CBOR tags should be treated

    @return whether a valid CBOR value was passed to the SAX parser
    */
    bool parse_cbor_internal(const bool get_char,
                             const cbor_tag_handler_t tag_handler)
    {
        switch (get_char ? get() : current)
        {
            // EOF
            case char_traits<char_type>::eof():
                return unexpect_eof(input_format_t::cbor, "value");

            // Integer 0x00..0x17 (0..23)
            case 0x00:
            case 0x01:
            case 0x02:
            case 0x03:
            case 0x04:
            case 0x05:
            case 0x06:
            case 0x07:
            case 0x08:
            case 0x09:
            case 0x0A:
            case 0x0B:
            case 0x0C:
            case 0x0D:
            case 0x0E:
            case 0x0F:
            case 0x10:
            case 0x11:
            case 0x12:
            case 0x13:
            case 0x14:
            case 0x15:
            case 0x16:
            case 0x17:
                return sax->number_unsigned(static_cast<number_unsigned_t>(current));

            case 0x18: // Unsigned integer (one-byte uint8_t follows)
            {
                std::uint8_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
            }

            case 0x19: // Unsigned integer (two-byte uint16_t follows)
            {
                std::uint16_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
            }

            case 0x1A: // Unsigned integer (four-byte uint32_t follows)
            {
                std::uint32_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
            }

            case 0x1B: // Unsigned integer (eight-byte uint64_t follows)
            {
                std::uint64_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
            }

            // Negative integer -1-0x00..-1-0x17 (-1..-24)
            case 0x20:
            case 0x21:
            case 0x22:
            case 0x23:
            case 0x24:
            case 0x25:
            case 0x26:
            case 0x27:
            case 0x28:
            case 0x29:
            case 0x2A:
            case 0x2B:
            case 0x2C:
            case 0x2D:
            case 0x2E:
            case 0x2F:
            case 0x30:
            case 0x31:
            case 0x32:
            case 0x33:
            case 0x34:
            case 0x35:
            case 0x36:
            case 0x37:
                return sax->number_integer(static_cast<std::int8_t>(0x20 - 1 - current));

            case 0x38: // Negative integer (one-byte uint8_t follows)
            {
                std::uint8_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1) - number);
            }

            case 0x39: // Negative integer -1-n (two-byte uint16_t follows)
            {
                std::uint16_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1) - number);
            }

            case 0x3A: // Negative integer -1-n (four-byte uint32_t follows)
            {
                std::uint32_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1) - number);
            }

            case 0x3B: // Negative integer -1-n (eight-byte uint64_t follows)
            {
                std::uint64_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1)
                        - static_cast<number_integer_t>(number));
            }

            // Binary data (0x00..0x17 bytes follow)
            case 0x40:
            case 0x41:
            case 0x42:
            case 0x43:
            case 0x44:
            case 0x45:
            case 0x46:
            case 0x47:
            case 0x48:
            case 0x49:
            case 0x4A:
            case 0x4B:
            case 0x4C:
            case 0x4D:
            case 0x4E:
            case 0x4F:
            case 0x50:
            case 0x51:
            case 0x52:
            case 0x53:
            case 0x54:
            case 0x55:
            case 0x56:
            case 0x57:
            case 0x58: // Binary data (one-byte uint8_t for n follows)
            case 0x59: // Binary data (two-byte uint16_t for n follow)
            case 0x5A: // Binary data (four-byte uint32_t for n follow)
            case 0x5B: // Binary data (eight-byte uint64_t for n follow)
            case 0x5F: // Binary data (indefinite length)
            {
                binary_t b;
                return get_cbor_binary(b) && sax->binary(b);
            }

            // UTF-8 string (0x00..0x17 bytes follow)
            case 0x60:
            case 0x61:
            case 0x62:
            case 0x63:
            case 0x64:
            case 0x65:
            case 0x66:
            case 0x67:
            case 0x68:
            case 0x69:
            case 0x6A:
            case 0x6B:
            case 0x6C:
            case 0x6D:
            case 0x6E:
            case 0x6F:
            case 0x70:
            case 0x71:
            case 0x72:
            case 0x73:
            case 0x74:
            case 0x75:
            case 0x76:
            case 0x77:
            case 0x78: // UTF-8 string (one-byte uint8_t for n follows)
            case 0x79: // UTF-8 string (two-byte uint16_t for n follow)
            case 0x7A: // UTF-8 string (four-byte uint32_t for n follow)
            case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow)
            case 0x7F: // UTF-8 string (indefinite length)
            {
                string_t s;
                return get_cbor_string(s) && sax->string(s);
            }

            // array (0x00..0x17 data items follow)
            case 0x80:
            case 0x81:
            case 0x82:
            case 0x83:
            case 0x84:
            case 0x85:
            case 0x86:
            case 0x87:
            case 0x88:
            case 0x89:
            case 0x8A:
            case 0x8B:
            case 0x8C:
            case 0x8D:
            case 0x8E:
            case 0x8F:
            case 0x90:
            case 0x91:
            case 0x92:
            case 0x93:
            case 0x94:
            case 0x95:
            case 0x96:
            case 0x97:
                return get_cbor_array(
                           conditional_static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x1Fu), tag_handler);

            case 0x98: // array (one-byte uint8_t for n follows)
            {
                std::uint8_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast<std::size_t>(len), tag_handler);
            }

            case 0x99: // array (two-byte uint16_t for n follow)
            {
                std::uint16_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast<std::size_t>(len), tag_handler);
            }

            case 0x9A: // array (four-byte uint32_t for n follow)
            {
                std::uint32_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_array(conditional_static_cast<std::size_t>(len), tag_handler);
            }

            case 0x9B: // array (eight-byte uint64_t for n follow)
            {
                std::uint64_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_array(conditional_static_cast<std::size_t>(len), tag_handler);
            }

            case 0x9F: // array (indefinite length)
                return get_cbor_array(static_cast<std::size_t>(-1), tag_handler);

            // map (0x00..0x17 pairs of data items follow)
            case 0xA0:
            case 0xA1:
            case 0xA2:
            case 0xA3:
            case 0xA4:
            case 0xA5:
            case 0xA6:
            case 0xA7:
            case 0xA8:
            case 0xA9:
            case 0xAA:
            case 0xAB:
            case 0xAC:
            case 0xAD:
            case 0xAE:
            case 0xAF:
            case 0xB0:
            case 0xB1:
            case 0xB2:
            case 0xB3:
            case 0xB4:
            case 0xB5:
            case 0xB6:
            case 0xB7:
                return get_cbor_object(conditional_static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x1Fu), tag_handler);

            case 0xB8: // map (one-byte uint8_t for n follows)
            {
                std::uint8_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast<std::size_t>(len), tag_handler);
            }

            case 0xB9: // map (two-byte uint16_t for n follow)
            {
                std::uint16_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast<std::size_t>(len), tag_handler);
            }

            case 0xBA: // map (four-byte uint32_t for n follow)
            {
                std::uint32_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_object(conditional_static_cast<std::size_t>(len), tag_handler);
            }

            case 0xBB: // map (eight-byte uint64_t for n follow)
            {
                std::uint64_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_object(conditional_static_cast<std::size_t>(len), tag_handler);
            }

            case 0xBF: // map (indefinite length)
                return get_cbor_object(static_cast<std::size_t>(-1), tag_handler);

            case 0xC6: // tagged item
            case 0xC7:
            case 0xC8:
            case 0xC9:
            case 0xCA:
            case 0xCB:
            case 0xCC:
            case 0xCD:
            case 0xCE:
            case 0xCF:
            case 0xD0:
            case 0xD1:
            case 0xD2:
            case 0xD3:
            case 0xD4:
            case 0xD8: // tagged item (1 bytes follow)
            case 0xD9: // tagged item (2 bytes follow)
            case 0xDA: // tagged item (4 bytes follow)
            case 0xDB: // tagged item (8 bytes follow)
            {
                switch (tag_handler)
                {
                    case cbor_tag_handler_t::error:
                    {
                        auto last_token = get_token_string();
                        return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read,
                                                exception_message(input_format_t::cbor, concat("invalid byte: 0x", last_token), "value"), nullptr));
                    }

                    case cbor_tag_handler_t::ignore:
                    {
                        // ignore binary subtype
                        switch (current)
                        {
                            case 0xD8:
                            {
                                std::uint8_t subtype_to_ignore{};
                                get_number(input_format_t::cbor, subtype_to_ignore);
                                break;
                            }
                            case 0xD9:
                            {
                                std::uint16_t subtype_to_ignore{};
                                get_number(input_format_t::cbor, subtype_to_ignore);
                                break;
                            }
                            case 0xDA:
                            {
                                std::uint32_t subtype_to_ignore{};
                                get_number(input_format_t::cbor, subtype_to_ignore);
                                break;
                            }
                            case 0xDB:
                            {
                                std::uint64_t subtype_to_ignore{};
                                get_number(input_format_t::cbor, subtype_to_ignore);
                                break;
                            }
                            default:
                                break;
                        }
                        return parse_cbor_internal(true, tag_handler);
                    }

                    case cbor_tag_handler_t::store:
                    {
                        binary_t b;
                        // use binary subtype and store in binary container
                        switch (current)
                        {
                            case 0xD8:
                            {
                                std::uint8_t subtype{};
                                get_number(input_format_t::cbor, subtype);
                                b.set_subtype(detail::conditional_static_cast<typename binary_t::subtype_type>(subtype));
                                break;
                            }
                            case 0xD9:
                            {
                                std::uint16_t subtype{};
                                get_number(input_format_t::cbor, subtype);
                                b.set_subtype(detail::conditional_static_cast<typename binary_t::subtype_type>(subtype));
                                break;
                            }
                            case 0xDA:
                            {
                                std::uint32_t subtype{};
                                get_number(input_format_t::cbor, subtype);
                                b.set_subtype(detail::conditional_static_cast<typename binary_t::subtype_type>(subtype));
                                break;
                            }
                            case 0xDB:
                            {
                                std::uint64_t subtype{};
                                get_number(input_format_t::cbor, subtype);
                                b.set_subtype(detail::conditional_static_cast<typename binary_t::subtype_type>(subtype));
                                break;
                            }
                            default:
                                return parse_cbor_internal(true, tag_handler);
                        }
                        get();
                        return get_cbor_binary(b) && sax->binary(b);
                    }

                    default:                 // LCOV_EXCL_LINE
                        JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
                        return false;        // LCOV_EXCL_LINE
                }
            }

            case 0xF4: // false
                return sax->boolean(false);

            case 0xF5: // true
                return sax->boolean(true);

            case 0xF6: // null
                return sax->null();

            case 0xF9: // Half-Precision Float (two-byte IEEE 754)
            {
                const auto byte1_raw = get();
                if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number")))
                {
                    return false;
                }
                const auto byte2_raw = get();
                if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number")))
                {
                    return false;
                }

                const auto byte1 = static_cast<unsigned char>(byte1_raw);
                const auto byte2 = static_cast<unsigned char>(byte2_raw);

                // code from RFC 7049, Appendix D, Figure 3:
                // As half-precision floating-point numbers were only added
                // to IEEE 754 in 2008, today's programming platforms often
                // still only have limited support for them. It is very
                // easy to include at least decoding support for them even
                // without such support. An example of a small decoder for
                // half-precision floating-point numbers in the C language
                // is shown in Fig. 3.
                const auto half = static_cast<unsigned int>((byte1 << 8u) + byte2);
                const double val = [&half]
                {
                    const int exp = (half >> 10u) & 0x1Fu;
                    const unsigned int mant = half & 0x3FFu;
                    JSON_ASSERT(0 <= exp&& exp <= 32);
                    JSON_ASSERT(mant <= 1024);
                    switch (exp)
                    {
                        case 0:
                            return std::ldexp(mant, -24);
                        case 31:
                            return (mant == 0)
                            ? std::numeric_limits<double>::infinity()
                            : std::numeric_limits<double>::quiet_NaN();
                        default:
                            return std::ldexp(mant + 1024, exp - 25);
                    }
                }();
                return sax->number_float((half & 0x8000u) != 0
                                         ? static_cast<number_float_t>(-val)
                                         : static_cast<number_float_t>(val), "");
            }

            case 0xFA: // Single-Precision Float (four-byte IEEE 754)
            {
                float number{};
                return get_number(input_format_t::cbor, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            case 0xFB: // Double-Precision Float (eight-byte IEEE 754)
            {
                double number{};
                return get_number(input_format_t::cbor, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            default: // anything else (0xFF is handled inside the other types)
            {
                auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read,
                                        exception_message(input_format_t::cbor, concat("invalid byte: 0x", last_token), "value"), nullptr));
            }
        }
    }

    /*!
    @brief reads a CBOR string

    This function first reads starting bytes to determine the expected
    string length and then copies this number of bytes into a string.
    Additionally, CBOR's strings with indefinite lengths are supported.

    @param[out] result  created string

    @return whether string creation completed
    */
    bool get_cbor_string(string_t& result)
    {
        if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "string")))
        {
            return false;
        }

        switch (current)
        {
            // UTF-8 string (0x00..0x17 bytes follow)
            case 0x60:
            case 0x61:
            case 0x62:
            case 0x63:
            case 0x64:
            case 0x65:
            case 0x66:
            case 0x67:
            case 0x68:
            case 0x69:
            case 0x6A:
            case 0x6B:
            case 0x6C:
            case 0x6D:
            case 0x6E:
            case 0x6F:
            case 0x70:
            case 0x71:
            case 0x72:
            case 0x73:
            case 0x74:
            case 0x75:
            case 0x76:
            case 0x77:
            {
                return get_string(input_format_t::cbor, static_cast<unsigned int>(current) & 0x1Fu, result);
            }

            case 0x78: // UTF-8 string (one-byte uint8_t for n follows)
            {
                std::uint8_t len{};
                return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result);
            }

            case 0x79: // UTF-8 string (two-byte uint16_t for n follow)
            {
                std::uint16_t len{};
                return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result);
            }

            case 0x7A: // UTF-8 string (four-byte uint32_t for n follow)
            {
                std::uint32_t len{};
                return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result);
            }

            case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow)
            {
                std::uint64_t len{};
                return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result);
            }

            case 0x7F: // UTF-8 string (indefinite length)
            {
                while (get() != 0xFF)
                {
                    string_t chunk;
                    if (!get_cbor_string(chunk))
                    {
                        return false;
                    }
                    result.append(chunk);
                }
                return true;
            }

            default:
            {
                auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read,
                                        exception_message(input_format_t::cbor, concat("expected length specification (0x60-0x7B) or indefinite string type (0x7F); last byte: 0x", last_token), "string"), nullptr));
            }
        }
    }

    /*!
    @brief reads a CBOR byte array

    This function first reads starting bytes to determine the expected
    byte array length and then copies this number of bytes into the byte array.
    Additionally, CBOR's byte arrays with indefinite lengths are supported.

    @param[out] result  created byte array

    @return whether byte array creation completed
    */
    bool get_cbor_binary(binary_t& result)
    {
        if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "binary")))
        {
            return false;
        }

        switch (current)
        {
            // Binary data (0x00..0x17 bytes follow)
            case 0x40:
            case 0x41:
            case 0x42:
            case 0x43:
            case 0x44:
            case 0x45:
            case 0x46:
            case 0x47:
            case 0x48:
            case 0x49:
            case 0x4A:
            case 0x4B:
            case 0x4C:
            case 0x4D:
            case 0x4E:
            case 0x4F:
            case 0x50:
            case 0x51:
            case 0x52:
            case 0x53:
            case 0x54:
            case 0x55:
            case 0x56:
            case 0x57:
            {
                return get_binary(input_format_t::cbor, static_cast<unsigned int>(current) & 0x1Fu, result);
            }

            case 0x58: // Binary data (one-byte uint8_t for n follows)
            {
                std::uint8_t len{};
                return get_number(input_format_t::cbor, len) &&
                       get_binary(input_format_t::cbor, len, result);
            }

            case 0x59: // Binary data (two-byte uint16_t for n follow)
            {
                std::uint16_t len{};
                return get_number(input_format_t::cbor, len) &&
                       get_binary(input_format_t::cbor, len, result);
            }

            case 0x5A: // Binary data (four-byte uint32_t for n follow)
            {
                std::uint32_t len{};
                return get_number(input_format_t::cbor, len) &&
                       get_binary(input_format_t::cbor, len, result);
            }

            case 0x5B: // Binary data (eight-byte uint64_t for n follow)
            {
                std::uint64_t len{};
                return get_number(input_format_t::cbor, len) &&
                       get_binary(input_format_t::cbor, len, result);
            }

            case 0x5F: // Binary data (indefinite length)
            {
                while (get() != 0xFF)
                {
                    binary_t chunk;
                    if (!get_cbor_binary(chunk))
                    {
                        return false;
                    }
                    result.insert(result.end(), chunk.begin(), chunk.end());
                }
                return true;
            }

            default:
            {
                auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read,
                                        exception_message(input_format_t::cbor, concat("expected length specification (0x40-0x5B) or indefinite binary array type (0x5F); last byte: 0x", last_token), "binary"), nullptr));
            }
        }
    }

    /*!
    @param[in] len  the length of the array or static_cast<std::size_t>(-1) for an
                    array of indefinite size
    @param[in] tag_handler how CBOR tags should be treated
    @return whether array creation completed
    */
    bool get_cbor_array(const std::size_t len,
                        const cbor_tag_handler_t tag_handler)
    {
        if (JSON_HEDLEY_UNLIKELY(!sax->start_array(len)))
        {
            return false;
        }

        if (len != static_cast<std::size_t>(-1))
        {
            for (std::size_t i = 0; i < len; ++i)
            {
                if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler)))
                {
                    return false;
                }
            }
        }
        else
        {
            while (get() != 0xFF)
            {
                if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(false, tag_handler)))
                {
                    return false;
                }
            }
        }

        return sax->end_array();
    }

    /*!
    @param[in] len  the length of the object or static_cast<std::size_t>(-1) for an
                    object of indefinite size
    @param[in] tag_handler how CBOR tags should be treated
    @return whether object creation completed
    */
    bool get_cbor_object(const std::size_t len,
                         const cbor_tag_handler_t tag_handler)
    {
        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(len)))
        {
            return false;
        }

        if (len != 0)
        {
            string_t key;
            if (len != static_cast<std::size_t>(-1))
            {
                for (std::size_t i = 0; i < len; ++i)
                {
                    get();
                    if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key)))
                    {
                        return false;
                    }

                    if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler)))
                    {
                        return false;
                    }
                    key.clear();
                }
            }
            else
            {
                while (get() != 0xFF)
                {
                    if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key)))
                    {
                        return false;
                    }

                    if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler)))
                    {
                        return false;
                    }
                    key.clear();
                }
            }
        }

        return sax->end_object();
    }

    /////////////
    // MsgPack //
    /////////////

    /*!
    @return whether a valid MessagePack value was passed to the SAX parser
    */
    bool parse_msgpack_internal()
    {
        switch (get())
        {
            // EOF
            case char_traits<char_type>::eof():
                return unexpect_eof(input_format_t::msgpack, "value");

            // positive fixint
            case 0x00:
            case 0x01:
            case 0x02:
            case 0x03:
            case 0x04:
            case 0x05:
            case 0x06:
            case 0x07:
            case 0x08:
            case 0x09:
            case 0x0A:
            case 0x0B:
            case 0x0C:
            case 0x0D:
            case 0x0E:
            case 0x0F:
            case 0x10:
            case 0x11:
            case 0x12:
            case 0x13:
            case 0x14:
            case 0x15:
            case 0x16:
            case 0x17:
            case 0x18:
            case 0x19:
            case 0x1A:
            case 0x1B:
            case 0x1C:
            case 0x1D:
            case 0x1E:
            case 0x1F:
            case 0x20:
            case 0x21:
            case 0x22:
            case 0x23:
            case 0x24:
            case 0x25:
            case 0x26:
            case 0x27:
            case 0x28:
            case 0x29:
            case 0x2A:
            case 0x2B:
            case 0x2C:
            case 0x2D:
            case 0x2E:
            case 0x2F:
            case 0x30:
            case 0x31:
            case 0x32:
            case 0x33:
            case 0x34:
            case 0x35:
            case 0x36:
            case 0x37:
            case 0x38:
            case 0x39:
            case 0x3A:
            case 0x3B:
            case 0x3C:
            case 0x3D:
            case 0x3E:
            case 0x3F:
            case 0x40:
            case 0x41:
            case 0x42:
            case 0x43:
            case 0x44:
            case 0x45:
            case 0x46:
            case 0x47:
            case 0x48:
            case 0x49:
            case 0x4A:
            case 0x4B:
            case 0x4C:
            case 0x4D:
            case 0x4E:
            case 0x4F:
            case 0x50:
            case 0x51:
            case 0x52:
            case 0x53:
            case 0x54:
            case 0x55:
            case 0x56:
            case 0x57:
            case 0x58:
            case 0x59:
            case 0x5A:
            case 0x5B:
            case 0x5C:
            case 0x5D:
            case 0x5E:
            case 0x5F:
            case 0x60:
            case 0x61:
            case 0x62:
            case 0x63:
            case 0x64:
            case 0x65:
            case 0x66:
            case 0x67:
            case 0x68:
            case 0x69:
            case 0x6A:
            case 0x6B:
            case 0x6C:
            case 0x6D:
            case 0x6E:
            case 0x6F:
            case 0x70:
            case 0x71:
            case 0x72:
            case 0x73:
            case 0x74:
            case 0x75:
            case 0x76:
            case 0x77:
            case 0x78:
            case 0x79:
            case 0x7A:
            case 0x7B:
            case 0x7C:
            case 0x7D:
            case 0x7E:
            case 0x7F:
                return sax->number_unsigned(static_cast<number_unsigned_t>(current));

            // fixmap
            case 0x80:
            case 0x81:
            case 0x82:
            case 0x83:
            case 0x84:
            case 0x85:
            case 0x86:
            case 0x87:
            case 0x88:
            case 0x89:
            case 0x8A:
            case 0x8B:
            case 0x8C:
            case 0x8D:
            case 0x8E:
            case 0x8F:
                return get_msgpack_object(conditional_static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x0Fu));

            // fixarray
            case 0x90:
            case 0x91:
            case 0x92:
            case 0x93:
            case 0x94:
            case 0x95:
            case 0x96:
            case 0x97:
            case 0x98:
            case 0x99:
            case 0x9A:
            case 0x9B:
            case 0x9C:
            case 0x9D:
            case 0x9E:
            case 0x9F:
                return get_msgpack_array(conditional_static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x0Fu));

            // fixstr
            case 0xA0:
            case 0xA1:
            case 0xA2:
            case 0xA3:
            case 0xA4:
            case 0xA5:
            case 0xA6:
            case 0xA7:
            case 0xA8:
            case 0xA9:
            case 0xAA:
            case 0xAB:
            case 0xAC:
            case 0xAD:
            case 0xAE:
            case 0xAF:
            case 0xB0:
            case 0xB1:
            case 0xB2:
            case 0xB3:
            case 0xB4:
            case 0xB5:
            case 0xB6:
            case 0xB7:
            case 0xB8:
            case 0xB9:
            case 0xBA:
            case 0xBB:
            case 0xBC:
            case 0xBD:
            case 0xBE:
            case 0xBF:
            case 0xD9: // str 8
            case 0xDA: // str 16
            case 0xDB: // str 32
            {
                string_t s;
                return get_msgpack_string(s) && sax->string(s);
            }

            case 0xC0: // nil
                return sax->null();

            case 0xC2: // false
                return sax->boolean(false);

            case 0xC3: // true
                return sax->boolean(true);

            case 0xC4: // bin 8
            case 0xC5: // bin 16
            case 0xC6: // bin 32
            case 0xC7: // ext 8
            case 0xC8: // ext 16
            case 0xC9: // ext 32
            case 0xD4: // fixext 1
            case 0xD5: // fixext 2
            case 0xD6: // fixext 4
            case 0xD7: // fixext 8
            case 0xD8: // fixext 16
            {
                binary_t b;
                return get_msgpack_binary(b) && sax->binary(b);
            }

            case 0xCA: // float 32
            {
                float number{};
                return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            case 0xCB: // float 64
            {
                double number{};
                return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            case 0xCC: // uint 8
            {
                std::uint8_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
            }

            case 0xCD: // uint 16
            {
                std::uint16_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
            }

            case 0xCE: // uint 32
            {
                std::uint32_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
            }

            case 0xCF: // uint 64
            {
                std::uint64_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
            }

            case 0xD0: // int 8
            {
                std::int8_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
            }

            case 0xD1: // int 16
            {
                std::int16_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
            }

            case 0xD2: // int 32
            {
                std::int32_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
            }

            case 0xD3: // int 64
            {
                std::int64_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
            }

            case 0xDC: // array 16
            {
                std::uint16_t len{};
                return get_number(input_format_t::msgpack, len) && get_msgpack_array(static_cast<std::size_t>(len));
            }

            case 0xDD: // array 32
            {
                std::uint32_t len{};
                return get_number(input_format_t::msgpack, len) && get_msgpack_array(conditional_static_cast<std::size_t>(len));
            }

            case 0xDE: // map 16
            {
                std::uint16_t len{};
                return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast<std::size_t>(len));
            }

            case 0xDF: // map 32
            {
                std::uint32_t len{};
                return get_number(input_format_t::msgpack, len) && get_msgpack_object(conditional_static_cast<std::size_t>(len));
            }

            // negative fixint
            case 0xE0:
            case 0xE1:
            case 0xE2:
            case 0xE3:
            case 0xE4:
            case 0xE5:
            case 0xE6:
            case 0xE7:
            case 0xE8:
            case 0xE9:
            case 0xEA:
            case 0xEB:
            case 0xEC:
            case 0xED:
            case 0xEE:
            case 0xEF:
            case 0xF0:
            case 0xF1:
            case 0xF2:
            case 0xF3:
            case 0xF4:
            case 0xF5:
            case 0xF6:
            case 0xF7:
            case 0xF8:
            case 0xF9:
            case 0xFA:
            case 0xFB:
            case 0xFC:
            case 0xFD:
            case 0xFE:
            case 0xFF:
                return sax->number_integer(static_cast<std::int8_t>(current));

            default: // anything else
            {
                auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read,
                                        exception_message(input_format_t::msgpack, concat("invalid byte: 0x", last_token), "value"), nullptr));
            }
        }
    }

    /*!
    @brief reads a MessagePack string

    This function first reads starting bytes to determine the expected
    string length and then copies this number of bytes into a string.

    @param[out] result  created string

    @return whether string creation completed
    */
    bool get_msgpack_string(string_t& result)
    {
        if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::msgpack, "string")))
        {
            return false;
        }

        switch (current)
        {
            // fixstr
            case 0xA0:
            case 0xA1:
            case 0xA2:
            case 0xA3:
            case 0xA4:
            case 0xA5:
            case 0xA6:
            case 0xA7:
            case 0xA8:
            case 0xA9:
            case 0xAA:
            case 0xAB:
            case 0xAC:
            case 0xAD:
            case 0xAE:
            case 0xAF:
            case 0xB0:
            case 0xB1:
            case 0xB2:
            case 0xB3:
            case 0xB4:
            case 0xB5:
            case 0xB6:
            case 0xB7:
            case 0xB8:
            case 0xB9:
            case 0xBA:
            case 0xBB:
            case 0xBC:
            case 0xBD:
            case 0xBE:
            case 0xBF:
            {
                return get_string(input_format_t::msgpack, static_cast<unsigned int>(current) & 0x1Fu, result);
            }

            case 0xD9: // str 8
            {
                std::uint8_t len{};
                return get_number(input_format_t::msgpack, len) && get_string(input_format_t::msgpack, len, result);
            }

            case 0xDA: // str 16
            {
                std::uint16_t len{};
                return get_number(input_format_t::msgpack, len) && get_string(input_format_t::msgpack, len, result);
            }

            case 0xDB: // str 32
            {
                std::uint32_t len{};
                return get_number(input_format_t::msgpack, len) && get_string(input_format_t::msgpack, len, result);
            }

            default:
            {
                auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read,
                                        exception_message(input_format_t::msgpack, concat("expected length specification (0xA0-0xBF, 0xD9-0xDB); last byte: 0x", last_token), "string"), nullptr));
            }
        }
    }

    /*!
    @brief reads a MessagePack byte array

    This function first reads starting bytes to determine the expected
    byte array length and then copies this number of bytes into a byte array.

    @param[out] result  created byte array

    @return whether byte array creation completed
    */
    bool get_msgpack_binary(binary_t& result)
    {
        // helper function to set the subtype
        auto assign_and_return_true = [&result](std::int8_t subtype)
        {
            result.set_subtype(static_cast<std::uint8_t>(subtype));
            return true;
        };

        switch (current)
        {
            case 0xC4: // bin 8
            {
                std::uint8_t len{};
                return get_number(input_format_t::msgpack, len) &&
                       get_binary(input_format_t::msgpack, len, result);
            }

            case 0xC5: // bin 16
            {
                std::uint16_t len{};
                return get_number(input_format_t::msgpack, len) &&
                       get_binary(input_format_t::msgpack, len, result);
            }

            case 0xC6: // bin 32
            {
                std::uint32_t len{};
                return get_number(input_format_t::msgpack, len) &&
                       get_binary(input_format_t::msgpack, len, result);
            }

            case 0xC7: // ext 8
            {
                std::uint8_t len{};
                std::int8_t subtype{};
                return get_number(input_format_t::msgpack, len) &&
                       get_number(input_format_t::msgpack, subtype) &&
                       get_binary(input_format_t::msgpack, len, result) &&
                       assign_and_return_true(subtype);
            }

            case 0xC8: // ext 16
            {
                std::uint16_t len{};
                std::int8_t subtype{};
                return get_number(input_format_t::msgpack, len) &&
                       get_number(input_format_t::msgpack, subtype) &&
                       get_binary(input_format_t::msgpack, len, result) &&
                       assign_and_return_true(subtype);
            }

            case 0xC9: // ext 32
            {
                std::uint32_t len{};
                std::int8_t subtype{};
                return get_number(input_format_t::msgpack, len) &&
                       get_number(input_format_t::msgpack, subtype) &&
                       get_binary(input_format_t::msgpack, len, result) &&
                       assign_and_return_true(subtype);
            }

            case 0xD4: // fixext 1
            {
                std::int8_t subtype{};
                return get_number(input_format_t::msgpack, subtype) &&
                       get_binary(input_format_t::msgpack, 1, result) &&
                       assign_and_return_true(subtype);
            }

            case 0xD5: // fixext 2
            {
                std::int8_t subtype{};
                return get_number(input_format_t::msgpack, subtype) &&
                       get_binary(input_format_t::msgpack, 2, result) &&
                       assign_and_return_true(subtype);
            }

            case 0xD6: // fixext 4
            {
                std::int8_t subtype{};
                return get_number(input_format_t::msgpack, subtype) &&
                       get_binary(input_format_t::msgpack, 4, result) &&
                       assign_and_return_true(subtype);
            }

            case 0xD7: // fixext 8
            {
                std::int8_t subtype{};
                return get_number(input_format_t::msgpack, subtype) &&
                       get_binary(input_format_t::msgpack, 8, result) &&
                       assign_and_return_true(subtype);
            }

            case 0xD8: // fixext 16
            {
                std::int8_t subtype{};
                return get_number(input_format_t::msgpack, subtype) &&
                       get_binary(input_format_t::msgpack, 16, result) &&
                       assign_and_return_true(subtype);
            }

            default:           // LCOV_EXCL_LINE
                return false;  // LCOV_EXCL_LINE
        }
    }

    /*!
    @param[in] len  the length of the array
    @return whether array creation completed
    */
    bool get_msgpack_array(const std::size_t len)
    {
        if (JSON_HEDLEY_UNLIKELY(!sax->start_array(len)))
        {
            return false;
        }

        for (std::size_t i = 0; i < len; ++i)
        {
            if (JSON_HEDLEY_UNLIKELY(!parse_msgpack_internal()))
            {
                return false;
            }
        }

        return sax->end_array();
    }

    /*!
    @param[in] len  the length of the object
    @return whether object creation completed
    */
    bool get_msgpack_object(const std::size_t len)
    {
        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(len)))
        {
            return false;
        }

        string_t key;
        for (std::size_t i = 0; i < len; ++i)
        {
            get();
            if (JSON_HEDLEY_UNLIKELY(!get_msgpack_string(key) || !sax->key(key)))
            {
                return false;
            }

            if (JSON_HEDLEY_UNLIKELY(!parse_msgpack_internal()))
            {
                return false;
            }
            key.clear();
        }

        return sax->end_object();
    }

    ////////////
    // UBJSON //
    ////////////

    /*!
    @param[in] get_char  whether a new character should be retrieved from the
                         input (true, default) or whether the last read
                         character should be considered instead

    @return whether a valid UBJSON value was passed to the SAX parser
    */
    bool parse_ubjson_internal(const bool get_char = true)
    {
        return get_ubjson_value(get_char ? get_ignore_noop() : current);
    }

    /*!
    @brief reads a UBJSON string

    This function is either called after reading the 'S' byte explicitly
    indicating a string, or in case of an object key where the 'S' byte can be
    left out.

    @param[out] result   created string
    @param[in] get_char  whether a new character should be retrieved from the
                         input (true, default) or whether the last read
                         character should be considered instead

    @return whether string creation completed
    */
    bool get_ubjson_string(string_t& result, const bool get_char = true)
    {
        if (get_char)
        {
            get();  // TODO(niels): may we ignore N here?
        }

        if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "value")))
        {
            return false;
        }

        switch (current)
        {
            case 'U':
            {
                std::uint8_t len{};
                return get_number(input_format, len) && get_string(input_format, len, result);
            }

            case 'i':
            {
                std::int8_t len{};
                return get_number(input_format, len) && get_string(input_format, len, result);
            }

            case 'I':
            {
                std::int16_t len{};
                return get_number(input_format, len) && get_string(input_format, len, result);
            }

            case 'l':
            {
                std::int32_t len{};
                return get_number(input_format, len) && get_string(input_format, len, result);
            }

            case 'L':
            {
                std::int64_t len{};
                return get_number(input_format, len) && get_string(input_format, len, result);
            }

            case 'u':
            {
                if (input_format != input_format_t::bjdata)
                {
                    break;
                }
                std::uint16_t len{};
                return get_number(input_format, len) && get_string(input_format, len, result);
            }

            case 'm':
            {
                if (input_format != input_format_t::bjdata)
                {
                    break;
                }
                std::uint32_t len{};
                return get_number(input_format, len) && get_string(input_format, len, result);
            }

            case 'M':
            {
                if (input_format != input_format_t::bjdata)
                {
                    break;
                }
                std::uint64_t len{};
                return get_number(input_format, len) && get_string(input_format, len, result);
            }

            default:
                break;
        }
        auto last_token = get_token_string();
        std::string message;

        if (input_format != input_format_t::bjdata)
        {
            message = "expected length type specification (U, i, I, l, L); last byte: 0x" + last_token;
        }
        else
        {
            message = "expected length type specification (U, i, u, I, m, l, M, L); last byte: 0x" + last_token;
        }
        return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format, message, "string"), nullptr));
    }

    /*!
    @param[out] dim  an integer vector storing the ND array dimensions
    @return whether reading ND array size vector is successful
    */
    bool get_ubjson_ndarray_size(std::vector<size_t>& dim)
    {
        std::pair<std::size_t, char_int_type> size_and_type;
        size_t dimlen = 0;
        bool no_ndarray = true;

        if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type, no_ndarray)))
        {
            return false;
        }

        if (size_and_type.first != npos)
        {
            if (size_and_type.second != 0)
            {
                if (size_and_type.second != 'N')
                {
                    for (std::size_t i = 0; i < size_and_type.first; ++i)
                    {
                        if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_value(dimlen, no_ndarray, size_and_type.second)))
                        {
                            return false;
                        }
                        dim.push_back(dimlen);
                    }
                }
            }
            else
            {
                for (std::size_t i = 0; i < size_and_type.first; ++i)
                {
                    if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_value(dimlen, no_ndarray)))
                    {
                        return false;
                    }
                    dim.push_back(dimlen);
                }
            }
        }
        else
        {
            while (current != ']')
            {
                if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_value(dimlen, no_ndarray, current)))
                {
                    return false;
                }
                dim.push_back(dimlen);
                get_ignore_noop();
            }
        }
        return true;
    }

    /*!
    @param[out] result  determined size
    @param[in,out] is_ndarray  for input, `true` means already inside an ndarray vector
                               or ndarray dimension is not allowed; `false` means ndarray
                               is allowed; for output, `true` means an ndarray is found;
                               is_ndarray can only return `true` when its initial value
                               is `false`
    @param[in] prefix  type marker if already read, otherwise set to 0

    @return whether size determination completed
    */
    bool get_ubjson_size_value(std::size_t& result, bool& is_ndarray, char_int_type prefix = 0)
    {
        if (prefix == 0)
        {
            prefix = get_ignore_noop();
        }

        switch (prefix)
        {
            case 'U':
            {
                std::uint8_t number{};
                if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
                {
                    return false;
                }
                result = static_cast<std::size_t>(number);
                return true;
            }

            case 'i':
            {
                std::int8_t number{};
                if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
                {
                    return false;
                }
                if (number < 0)
                {
                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read,
                                            exception_message(input_format, "count in an optimized container must be positive", "size"), nullptr));
                }
                result = static_cast<std::size_t>(number); // NOLINT(bugprone-signed-char-misuse,cert-str34-c): number is not a char
                return true;
            }

            case 'I':
            {
                std::int16_t number{};
                if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
                {
                    return false;
                }
                if (number < 0)
                {
                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read,
                                            exception_message(input_format, "count in an optimized container must be positive", "size"), nullptr));
                }
                result = static_cast<std::size_t>(number);
                return true;
            }

            case 'l':
            {
                std::int32_t number{};
                if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
                {
                    return false;
                }
                if (number < 0)
                {
                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read,
                                            exception_message(input_format, "count in an optimized container must be positive", "size"), nullptr));
                }
                result = static_cast<std::size_t>(number);
                return true;
            }

            case 'L':
            {
                std::int64_t number{};
                if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
                {
                    return false;
                }
                if (number < 0)
                {
                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read,
                                            exception_message(input_format, "count in an optimized container must be positive", "size"), nullptr));
                }
                if (!value_in_range_of<std::size_t>(number))
                {
                    return sax->parse_error(chars_read, get_token_string(), out_of_range::create(408,
                                            exception_message(input_format, "integer value overflow", "size"), nullptr));
                }
                result = static_cast<std::size_t>(number);
                return true;
            }

            case 'u':
            {
                if (input_format != input_format_t::bjdata)
                {
                    break;
                }
                std::uint16_t number{};
                if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
                {
                    return false;
                }
                result = static_cast<std::size_t>(number);
                return true;
            }

            case 'm':
            {
                if (input_format != input_format_t::bjdata)
                {
                    break;
                }
                std::uint32_t number{};
                if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
                {
                    return false;
                }
                result = conditional_static_cast<std::size_t>(number);
                return true;
            }

            case 'M':
            {
                if (input_format != input_format_t::bjdata)
                {
                    break;
                }
                std::uint64_t number{};
                if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
                {
                    return false;
                }
                if (!value_in_range_of<std::size_t>(number))
                {
                    return sax->parse_error(chars_read, get_token_string(), out_of_range::create(408,
                                            exception_message(input_format, "integer value overflow", "size"), nullptr));
                }
                result = detail::conditional_static_cast<std::size_t>(number);
                return true;
            }

            case '[':
            {
                if (input_format != input_format_t::bjdata)
                {
                    break;
                }
                if (is_ndarray) // ndarray dimensional vector can only contain integers, and can not embed another array
                {
                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimensional vector is not allowed", "size"), nullptr));
                }
                std::vector<size_t> dim;
                if (JSON_HEDLEY_UNLIKELY(!get_ubjson_ndarray_size(dim)))
                {
                    return false;
                }
                if (dim.size() == 1 || (dim.size() == 2 && dim.at(0) == 1)) // return normal array size if 1D row vector
                {
                    result = dim.at(dim.size() - 1);
                    return true;
                }
                if (!dim.empty())  // if ndarray, convert to an object in JData annotated array format
                {
                    for (auto i : dim) // test if any dimension in an ndarray is 0, if so, return a 1D empty container
                    {
                        if ( i == 0 )
                        {
                            result = 0;
                            return true;
                        }
                    }

                    string_t key = "_ArraySize_";
                    if (JSON_HEDLEY_UNLIKELY(!sax->start_object(3) || !sax->key(key) || !sax->start_array(dim.size())))
                    {
                        return false;
                    }
                    result = 1;
                    for (auto i : dim)
                    {
                        result *= i;
                        if (result == 0 || result == npos) // because dim elements shall not have zeros, result = 0 means overflow happened; it also can't be npos as it is used to initialize size in get_ubjson_size_type()
                        {
                            return sax->parse_error(chars_read, get_token_string(), out_of_range::create(408, exception_message(input_format, "excessive ndarray size caused overflow", "size"), nullptr));
                        }
                        if (JSON_HEDLEY_UNLIKELY(!sax->number_unsigned(static_cast<number_unsigned_t>(i))))
                        {
                            return false;
                        }
                    }
                    is_ndarray = true;
                    return sax->end_array();
                }
                result = 0;
                return true;
            }

            default:
                break;
        }
        auto last_token = get_token_string();
        std::string message;

        if (input_format != input_format_t::bjdata)
        {
            message = "expected length type specification (U, i, I, l, L) after '#'; last byte: 0x" + last_token;
        }
        else
        {
            message = "expected length type specification (U, i, u, I, m, l, M, L) after '#'; last byte: 0x" + last_token;
        }
        return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format, message, "size"), nullptr));
    }

    /*!
    @brief determine the type and size for a container

    In the optimized UBJSON format, a type and a size can be provided to allow
    for a more compact representation.

    @param[out] result  pair of the size and the type
    @param[in] inside_ndarray  whether the parser is parsing an ND array dimensional vector

    @return whether pair creation completed
    */
    bool get_ubjson_size_type(std::pair<std::size_t, char_int_type>& result, bool inside_ndarray = false)
    {
        result.first = npos; // size
        result.second = 0; // type
        bool is_ndarray = false;

        get_ignore_noop();

        if (current == '$')
        {
            result.second = get();  // must not ignore 'N', because 'N' maybe the type
            if (input_format == input_format_t::bjdata
                    && JSON_HEDLEY_UNLIKELY(std::binary_search(bjd_optimized_type_markers.begin(), bjd_optimized_type_markers.end(), result.second)))
            {
                auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read,
                                        exception_message(input_format, concat("marker 0x", last_token, " is not a permitted optimized array type"), "type"), nullptr));
            }

            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "type")))
            {
                return false;
            }

            get_ignore_noop();
            if (JSON_HEDLEY_UNLIKELY(current != '#'))
            {
                if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "value")))
                {
                    return false;
                }
                auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read,
                                        exception_message(input_format, concat("expected '#' after type information; last byte: 0x", last_token), "size"), nullptr));
            }

            const bool is_error = get_ubjson_size_value(result.first, is_ndarray);
            if (input_format == input_format_t::bjdata && is_ndarray)
            {
                if (inside_ndarray)
                {
                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(112, chars_read,
                                            exception_message(input_format, "ndarray can not be recursive", "size"), nullptr));
                }
                result.second |= (1 << 8); // use bit 8 to indicate ndarray, all UBJSON and BJData markers should be ASCII letters
            }
            return is_error;
        }

        if (current == '#')
        {
            const bool is_error = get_ubjson_size_value(result.first, is_ndarray);
            if (input_format == input_format_t::bjdata && is_ndarray)
            {
                return sax->parse_error(chars_read, get_token_string(), parse_error::create(112, chars_read,
                                        exception_message(input_format, "ndarray requires both type and size", "size"), nullptr));
            }
            return is_error;
        }

        return true;
    }

    /*!
    @param prefix  the previously read or set type prefix
    @return whether value creation completed
    */
    bool get_ubjson_value(const char_int_type prefix)
    {
        switch (prefix)
        {
            case char_traits<char_type>::eof():  // EOF
                return unexpect_eof(input_format, "value");

            case 'T':  // true
                return sax->boolean(true);
            case 'F':  // false
                return sax->boolean(false);

            case 'Z':  // null
                return sax->null();

            case 'U':
            {
                std::uint8_t number{};
                return get_number(input_format, number) && sax->number_unsigned(number);
            }

            case 'i':
            {
                std::int8_t number{};
                return get_number(input_format, number) && sax->number_integer(number);
            }

            case 'I':
            {
                std::int16_t number{};
                return get_number(input_format, number) && sax->number_integer(number);
            }

            case 'l':
            {
                std::int32_t number{};
                return get_number(input_format, number) && sax->number_integer(number);
            }

            case 'L':
            {
                std::int64_t number{};
                return get_number(input_format, number) && sax->number_integer(number);
            }

            case 'u':
            {
                if (input_format != input_format_t::bjdata)
                {
                    break;
                }
                std::uint16_t number{};
                return get_number(input_format, number) && sax->number_unsigned(number);
            }

            case 'm':
            {
                if (input_format != input_format_t::bjdata)
                {
                    break;
                }
                std::uint32_t number{};
                return get_number(input_format, number) && sax->number_unsigned(number);
            }

            case 'M':
            {
                if (input_format != input_format_t::bjdata)
                {
                    break;
                }
                std::uint64_t number{};
                return get_number(input_format, number) && sax->number_unsigned(number);
            }

            case 'h':
            {
                if (input_format != input_format_t::bjdata)
                {
                    break;
                }
                const auto byte1_raw = get();
                if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "number")))
                {
                    return false;
                }
                const auto byte2_raw = get();
                if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "number")))
                {
                    return false;
                }

                const auto byte1 = static_cast<unsigned char>(byte1_raw);
                const auto byte2 = static_cast<unsigned char>(byte2_raw);

                // code from RFC 7049, Appendix D, Figure 3:
                // As half-precision floating-point numbers were only added
                // to IEEE 754 in 2008, today's programming platforms often
                // still only have limited support for them. It is very
                // easy to include at least decoding support for them even
                // without such support. An example of a small decoder for
                // half-precision floating-point numbers in the C language
                // is shown in Fig. 3.
                const auto half = static_cast<unsigned int>((byte2 << 8u) + byte1);
                const double val = [&half]
                {
                    const int exp = (half >> 10u) & 0x1Fu;
                    const unsigned int mant = half & 0x3FFu;
                    JSON_ASSERT(0 <= exp&& exp <= 32);
                    JSON_ASSERT(mant <= 1024);
                    switch (exp)
                    {
                        case 0:
                            return std::ldexp(mant, -24);
                        case 31:
                            return (mant == 0)
                            ? std::numeric_limits<double>::infinity()
                            : std::numeric_limits<double>::quiet_NaN();
                        default:
                            return std::ldexp(mant + 1024, exp - 25);
                    }
                }();
                return sax->number_float((half & 0x8000u) != 0
                                         ? static_cast<number_float_t>(-val)
                                         : static_cast<number_float_t>(val), "");
            }

            case 'd':
            {
                float number{};
                return get_number(input_format, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            case 'D':
            {
                double number{};
                return get_number(input_format, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            case 'H':
            {
                return get_ubjson_high_precision_number();
            }

            case 'C':  // char
            {
                get();
                if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "char")))
                {
                    return false;
                }
                if (JSON_HEDLEY_UNLIKELY(current > 127))
                {
                    auto last_token = get_token_string();
                    return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read,
                                            exception_message(input_format, concat("byte after 'C' must be in range 0x00..0x7F; last byte: 0x", last_token), "char"), nullptr));
                }
                string_t s(1, static_cast<typename string_t::value_type>(current));
                return sax->string(s);
            }

            case 'S':  // string
            {
                string_t s;
                return get_ubjson_string(s) && sax->string(s);
            }

            case '[':  // array
                return get_ubjson_array();

            case '{':  // object
                return get_ubjson_object();

            default: // anything else
                break;
        }
        auto last_token = get_token_string();
        return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format, "invalid byte: 0x" + last_token, "value"), nullptr));
    }

    /*!
    @return whether array creation completed
    */
    bool get_ubjson_array()
    {
        std::pair<std::size_t, char_int_type> size_and_type;
        if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type)))
        {
            return false;
        }

        // if bit-8 of size_and_type.second is set to 1, encode bjdata ndarray as an object in JData annotated array format (https://github.com/NeuroJSON/jdata):
        // {"_ArrayType_" : "typeid", "_ArraySize_" : [n1, n2, ...], "_ArrayData_" : [v1, v2, ...]}

        if (input_format == input_format_t::bjdata && size_and_type.first != npos && (size_and_type.second & (1 << 8)) != 0)
        {
            size_and_type.second &= ~(static_cast<char_int_type>(1) << 8);  // use bit 8 to indicate ndarray, here we remove the bit to restore the type marker
            auto it = std::lower_bound(bjd_types_map.begin(), bjd_types_map.end(), size_and_type.second, [](const bjd_type & p, char_int_type t)
            {
                return p.first < t;
            });
            string_t key = "_ArrayType_";
            if (JSON_HEDLEY_UNLIKELY(it == bjd_types_map.end() || it->first != size_and_type.second))
            {
                auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read,
                                        exception_message(input_format, "invalid byte: 0x" + last_token, "type"), nullptr));
            }

            string_t type = it->second; // sax->string() takes a reference
            if (JSON_HEDLEY_UNLIKELY(!sax->key(key) || !sax->string(type)))
            {
                return false;
            }

            if (size_and_type.second == 'C')
            {
                size_and_type.second = 'U';
            }

            key = "_ArrayData_";
            if (JSON_HEDLEY_UNLIKELY(!sax->key(key) || !sax->start_array(size_and_type.first) ))
            {
                return false;
            }

            for (std::size_t i = 0; i < size_and_type.first; ++i)
            {
                if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second)))
                {
                    return false;
                }
            }

            return (sax->end_array() && sax->end_object());
        }

        if (size_and_type.first != npos)
        {
            if (JSON_HEDLEY_UNLIKELY(!sax->start_array(size_and_type.first)))
            {
                return false;
            }

            if (size_and_type.second != 0)
            {
                if (size_and_type.second != 'N')
                {
                    for (std::size_t i = 0; i < size_and_type.first; ++i)
                    {
                        if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second)))
                        {
                            return false;
                        }
                    }
                }
            }
            else
            {
                for (std::size_t i = 0; i < size_and_type.first; ++i)
                {
                    if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal()))
                    {
                        return false;
                    }
                }
            }
        }
        else
        {
            if (JSON_HEDLEY_UNLIKELY(!sax->start_array(static_cast<std::size_t>(-1))))
            {
                return false;
            }

            while (current != ']')
            {
                if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal(false)))
                {
                    return false;
                }
                get_ignore_noop();
            }
        }

        return sax->end_array();
    }

    /*!
    @return whether object creation completed
    */
    bool get_ubjson_object()
    {
        std::pair<std::size_t, char_int_type> size_and_type;
        if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type)))
        {
            return false;
        }

        // do not accept ND-array size in objects in BJData
        if (input_format == input_format_t::bjdata && size_and_type.first != npos && (size_and_type.second & (1 << 8)) != 0)
        {
            auto last_token = get_token_string();
            return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read,
                                    exception_message(input_format, "BJData object does not support ND-array size in optimized format", "object"), nullptr));
        }

        string_t key;
        if (size_and_type.first != npos)
        {
            if (JSON_HEDLEY_UNLIKELY(!sax->start_object(size_and_type.first)))
            {
                return false;
            }

            if (size_and_type.second != 0)
            {
                for (std::size_t i = 0; i < size_and_type.first; ++i)
                {
                    if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key)))
                    {
                        return false;
                    }
                    if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second)))
                    {
                        return false;
                    }
                    key.clear();
                }
            }
            else
            {
                for (std::size_t i = 0; i < size_and_type.first; ++i)
                {
                    if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key)))
                    {
                        return false;
                    }
                    if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal()))
                    {
                        return false;
                    }
                    key.clear();
                }
            }
        }
        else
        {
            if (JSON_HEDLEY_UNLIKELY(!sax->start_object(static_cast<std::size_t>(-1))))
            {
                return false;
            }

            while (current != '}')
            {
                if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key, false) || !sax->key(key)))
                {
                    return false;
                }
                if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal()))
                {
                    return false;
                }
                get_ignore_noop();
                key.clear();
            }
        }

        return sax->end_object();
    }

    // Note, no reader for UBJSON binary types is implemented because they do
    // not exist

    bool get_ubjson_high_precision_number()
    {
        // get size of following number string
        std::size_t size{};
        bool no_ndarray = true;
        auto res = get_ubjson_size_value(size, no_ndarray);
        if (JSON_HEDLEY_UNLIKELY(!res))
        {
            return res;
        }

        // get number string
        std::vector<char> number_vector;
        for (std::size_t i = 0; i < size; ++i)
        {
            get();
            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "number")))
            {
                return false;
            }
            number_vector.push_back(static_cast<char>(current));
        }

        // parse number string
        using ia_type = decltype(detail::input_adapter(number_vector));
        auto number_lexer = detail::lexer<BasicJsonType, ia_type>(detail::input_adapter(number_vector), false);
        const auto result_number = number_lexer.scan();
        const auto number_string = number_lexer.get_token_string();
        const auto result_remainder = number_lexer.scan();

        using token_type = typename detail::lexer_base<BasicJsonType>::token_type;

        if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input))
        {
            return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read,
                                    exception_message(input_format, concat("invalid number text: ", number_lexer.get_token_string()), "high-precision number"), nullptr));
        }

        switch (result_number)
        {
            case token_type::value_integer:
                return sax->number_integer(number_lexer.get_number_integer());
            case token_type::value_unsigned:
                return sax->number_unsigned(number_lexer.get_number_unsigned());
            case token_type::value_float:
                return sax->number_float(number_lexer.get_number_float(), std::move(number_string));
            case token_type::uninitialized:
            case token_type::literal_true:
            case token_type::literal_false:
            case token_type::literal_null:
            case token_type::value_string:
            case token_type::begin_array:
            case token_type::begin_object:
            case token_type::end_array:
            case token_type::end_object:
            case token_type::name_separator:
            case token_type::value_separator:
            case token_type::parse_error:
            case token_type::end_of_input:
            case token_type::literal_or_value:
            default:
                return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read,
                                        exception_message(input_format, concat("invalid number text: ", number_lexer.get_token_string()), "high-precision number"), nullptr));
        }
    }

    ///////////////////////
    // Utility functions //
    ///////////////////////

    /*!
    @brief get next character from the input

    This function provides the interface to the used input adapter. It does
    not throw in case the input reached EOF, but returns a -'ve valued
    `char_traits<char_type>::eof()` in that case.

    @return character read from the input
    */
    char_int_type get()
    {
        ++chars_read;
        return current = ia.get_character();
    }

    /*!
    @return character read from the input after ignoring all 'N' entries
    */
    char_int_type get_ignore_noop()
    {
        do
        {
            get();
        }
        while (current == 'N');

        return current;
    }

    /*
    @brief read a number from the input

    @tparam NumberType the type of the number
    @param[in] format   the current format (for diagnostics)
    @param[out] result  number of type @a NumberType

    @return whether conversion completed

    @note This function needs to respect the system's endianness, because
          bytes in CBOR, MessagePack, and UBJSON are stored in network order
          (big endian) and therefore need reordering on little endian systems.
          On the other hand, BSON and BJData use little endian and should reorder
          on big endian systems.
    */
    template<typename NumberType, bool InputIsLittleEndian = false>
    bool get_number(const input_format_t format, NumberType& result)
    {
        // step 1: read input into array with system's byte order
        std::array<std::uint8_t, sizeof(NumberType)> vec{};
        for (std::size_t i = 0; i < sizeof(NumberType); ++i)
        {
            get();
            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "number")))
            {
                return false;
            }

            // reverse byte order prior to conversion if necessary
            if (is_little_endian != (InputIsLittleEndian || format == input_format_t::bjdata))
            {
                vec[sizeof(NumberType) - i - 1] = static_cast<std::uint8_t>(current);
            }
            else
            {
                vec[i] = static_cast<std::uint8_t>(current); // LCOV_EXCL_LINE
            }
        }

        // step 2: convert array into number of type T and return
        std::memcpy(&result, vec.data(), sizeof(NumberType));
        return true;
    }

    /*!
    @brief create a string by reading characters from the input

    @tparam NumberType the type of the number
    @param[in] format the current format (for diagnostics)
    @param[in] len number of characters to read
    @param[out] result string created by reading @a len bytes

    @return whether string creation completed

    @note We can not reserve @a len bytes for the result, because @a len
          may be too large. Usually, @ref unexpect_eof() detects the end of
          the input before we run out of string memory.
    */
    template<typename NumberType>
    bool get_string(const input_format_t format,
                    const NumberType len,
                    string_t& result)
    {
        bool success = true;
        for (NumberType i = 0; i < len; i++)
        {
            get();
            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "string")))
            {
                success = false;
                break;
            }
            result.push_back(static_cast<typename string_t::value_type>(current));
        }
        return success;
    }

    /*!
    @brief create a byte array by reading bytes from the input

    @tparam NumberType the type of the number
    @param[in] format the current format (for diagnostics)
    @param[in] len number of bytes to read
    @param[out] result byte array created by reading @a len bytes

    @return whether byte array creation completed

    @note We can not reserve @a len bytes for the result, because @a len
          may be too large. Usually, @ref unexpect_eof() detects the end of
          the input before we run out of memory.
    */
    template<typename NumberType>
    bool get_binary(const input_format_t format,
                    const NumberType len,
                    binary_t& result)
    {
        bool success = true;
        for (NumberType i = 0; i < len; i++)
        {
            get();
            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "binary")))
            {
                success = false;
                break;
            }
            result.push_back(static_cast<std::uint8_t>(current));
        }
        return success;
    }

    /*!
    @param[in] format   the current format (for diagnostics)
    @param[in] context  further context information (for diagnostics)
    @return whether the last read character is not EOF
    */
    JSON_HEDLEY_NON_NULL(3)
    bool unexpect_eof(const input_format_t format, const char* context) const
    {
        if (JSON_HEDLEY_UNLIKELY(current == char_traits<char_type>::eof()))
        {
            return sax->parse_error(chars_read, "<end of file>",
                                    parse_error::create(110, chars_read, exception_message(format, "unexpected end of input", context), nullptr));
        }
        return true;
    }

    /*!
    @return a string representation of the last read byte
    */
    std::string get_token_string() const
    {
        std::array<char, 3> cr{{}};
        static_cast<void>((std::snprintf)(cr.data(), cr.size(), "%.2hhX", static_cast<unsigned char>(current))); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
        return std::string{cr.data()};
    }

    /*!
    @param[in] format   the current format
    @param[in] detail   a detailed error message
    @param[in] context  further context information
    @return a message string to use in the parse_error exceptions
    */
    std::string exception_message(const input_format_t format,
                                  const std::string& detail,
                                  const std::string& context) const
    {
        std::string error_msg = "syntax error while parsing ";

        switch (format)
        {
            case input_format_t::cbor:
                error_msg += "CBOR";
                break;

            case input_format_t::msgpack:
                error_msg += "MessagePack";
                break;

            case input_format_t::ubjson:
                error_msg += "UBJSON";
                break;

            case input_format_t::bson:
                error_msg += "BSON";
                break;

            case input_format_t::bjdata:
                error_msg += "BJData";
                break;

            case input_format_t::json: // LCOV_EXCL_LINE
            default:            // LCOV_EXCL_LINE
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
        }

        return concat(error_msg, ' ', context, ": ", detail);
    }

  private:
    static JSON_INLINE_VARIABLE constexpr std::size_t npos = static_cast<std::size_t>(-1);

    /// input adapter
    InputAdapterType ia;

    /// the current character
    char_int_type current = char_traits<char_type>::eof();

    /// the number of characters read
    std::size_t chars_read = 0;

    /// whether we can assume little endianness
    const bool is_little_endian = little_endianness();

    /// input format
    const input_format_t input_format = input_format_t::json;

    /// the SAX parser
    json_sax_t* sax = nullptr;

    // excluded markers in bjdata optimized type
#define JSON_BINARY_READER_MAKE_BJD_OPTIMIZED_TYPE_MARKERS_ \
    make_array<char_int_type>('F', 'H', 'N', 'S', 'T', 'Z', '[', '{')

#define JSON_BINARY_READER_MAKE_BJD_TYPES_MAP_ \
    make_array<bjd_type>(                      \
    bjd_type{'C', "char"},                     \
    bjd_type{'D', "double"},                   \
    bjd_type{'I', "int16"},                    \
    bjd_type{'L', "int64"},                    \
    bjd_type{'M', "uint64"},                   \
    bjd_type{'U', "uint8"},                    \
    bjd_type{'d', "single"},                   \
    bjd_type{'i', "int8"},                     \
    bjd_type{'l', "int32"},                    \
    bjd_type{'m', "uint32"},                   \
    bjd_type{'u', "uint16"})

  JSON_PRIVATE_UNLESS_TESTED:
    // lookup tables
    // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
    const decltype(JSON_BINARY_READER_MAKE_BJD_OPTIMIZED_TYPE_MARKERS_) bjd_optimized_type_markers =
        JSON_BINARY_READER_MAKE_BJD_OPTIMIZED_TYPE_MARKERS_;

    using bjd_type = std::pair<char_int_type, string_t>;
    // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
    const decltype(JSON_BINARY_READER_MAKE_BJD_TYPES_MAP_) bjd_types_map =
        JSON_BINARY_READER_MAKE_BJD_TYPES_MAP_;

#undef JSON_BINARY_READER_MAKE_BJD_OPTIMIZED_TYPE_MARKERS_
#undef JSON_BINARY_READER_MAKE_BJD_TYPES_MAP_
};

#ifndef JSON_HAS_CPP_17
    template<typename BasicJsonType, typename InputAdapterType, typename SAX>
    constexpr std::size_t binary_reader<BasicJsonType, InputAdapterType, SAX>::npos;
#endif

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/input/input_adapters.hpp>

// #include <nlohmann/detail/input/lexer.hpp>

// #include <nlohmann/detail/input/parser.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <cmath> // isfinite
#include <cstdint> // uint8_t
#include <functional> // function
#include <string> // string
#include <utility> // move
#include <vector> // vector

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/input/input_adapters.hpp>

// #include <nlohmann/detail/input/json_sax.hpp>

// #include <nlohmann/detail/input/lexer.hpp>

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/is_sax.hpp>

// #include <nlohmann/detail/string_concat.hpp>

// #include <nlohmann/detail/value_t.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{
////////////
// parser //
////////////

enum class parse_event_t : std::uint8_t
{
    /// the parser read `{` and started to process a JSON object
    object_start,
    /// the parser read `}` and finished processing a JSON object
    object_end,
    /// the parser read `[` and started to process a JSON array
    array_start,
    /// the parser read `]` and finished processing a JSON array
    array_end,
    /// the parser read a key of a value in an object
    key,
    /// the parser finished reading a JSON value
    value
};

template<typename BasicJsonType>
using parser_callback_t =
    std::function<bool(int /*depth*/, parse_event_t /*event*/, BasicJsonType& /*parsed*/)>;

/*!
@brief syntax analysis

This class implements a recursive descent parser.
*/
template<typename BasicJsonType, typename InputAdapterType>
class parser
{
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using lexer_t = lexer<BasicJsonType, InputAdapterType>;
    using token_type = typename lexer_t::token_type;

  public:
    /// a parser reading from an input adapter
    explicit parser(InputAdapterType&& adapter,
                    const parser_callback_t<BasicJsonType> cb = nullptr,
                    const bool allow_exceptions_ = true,
                    const bool skip_comments = false)
        : callback(cb)
        , m_lexer(std::move(adapter), skip_comments)
        , allow_exceptions(allow_exceptions_)
    {
        // read first token
        get_token();
    }

    /*!
    @brief public parser interface

    @param[in] strict      whether to expect the last token to be EOF
    @param[in,out] result  parsed JSON value

    @throw parse_error.101 in case of an unexpected token
    @throw parse_error.102 if to_unicode fails or surrogate error
    @throw parse_error.103 if to_unicode fails
    */
    void parse(const bool strict, BasicJsonType& result)
    {
        if (callback)
        {
            json_sax_dom_callback_parser<BasicJsonType> sdp(result, callback, allow_exceptions);
            sax_parse_internal(&sdp);

            // in strict mode, input must be completely read
            if (strict && (get_token() != token_type::end_of_input))
            {
                sdp.parse_error(m_lexer.get_position(),
                                m_lexer.get_token_string(),
                                parse_error::create(101, m_lexer.get_position(),
                                                    exception_message(token_type::end_of_input, "value"), nullptr));
            }

            // in case of an error, return discarded value
            if (sdp.is_errored())
            {
                result = value_t::discarded;
                return;
            }

            // set top-level value to null if it was discarded by the callback
            // function
            if (result.is_discarded())
            {
                result = nullptr;
            }
        }
        else
        {
            json_sax_dom_parser<BasicJsonType> sdp(result, allow_exceptions);
            sax_parse_internal(&sdp);

            // in strict mode, input must be completely read
            if (strict && (get_token() != token_type::end_of_input))
            {
                sdp.parse_error(m_lexer.get_position(),
                                m_lexer.get_token_string(),
                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input, "value"), nullptr));
            }

            // in case of an error, return discarded value
            if (sdp.is_errored())
            {
                result = value_t::discarded;
                return;
            }
        }

        result.assert_invariant();
    }

    /*!
    @brief public accept interface

    @param[in] strict  whether to expect the last token to be EOF
    @return whether the input is a proper JSON text
    */
    bool accept(const bool strict = true)
    {
        json_sax_acceptor<BasicJsonType> sax_acceptor;
        return sax_parse(&sax_acceptor, strict);
    }

    template<typename SAX>
    JSON_HEDLEY_NON_NULL(2)
    bool sax_parse(SAX* sax, const bool strict = true)
    {
        (void)detail::is_sax_static_asserts<SAX, BasicJsonType> {};
        const bool result = sax_parse_internal(sax);

        // strict mode: next byte must be EOF
        if (result && strict && (get_token() != token_type::end_of_input))
        {
            return sax->parse_error(m_lexer.get_position(),
                                    m_lexer.get_token_string(),
                                    parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input, "value"), nullptr));
        }

        return result;
    }

  private:
    template<typename SAX>
    JSON_HEDLEY_NON_NULL(2)
    bool sax_parse_internal(SAX* sax)
    {
        // stack to remember the hierarchy of structured values we are parsing
        // true = array; false = object
        std::vector<bool> states;
        // value to avoid a goto (see comment where set to true)
        bool skip_to_state_evaluation = false;

        while (true)
        {
            if (!skip_to_state_evaluation)
            {
                // invariant: get_token() was called before each iteration
                switch (last_token)
                {
                    case token_type::begin_object:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(static_cast<std::size_t>(-1))))
                        {
                            return false;
                        }

                        // closing } -> we are done
                        if (get_token() == token_type::end_object)
                        {
                            if (JSON_HEDLEY_UNLIKELY(!sax->end_object()))
                            {
                                return false;
                            }
                            break;
                        }

                        // parse key
                        if (JSON_HEDLEY_UNLIKELY(last_token != token_type::value_string))
                        {
                            return sax->parse_error(m_lexer.get_position(),
                                                    m_lexer.get_token_string(),
                                                    parse_error::create(101, m_lexer.get_position(), exception_message(token_type::value_string, "object key"), nullptr));
                        }
                        if (JSON_HEDLEY_UNLIKELY(!sax->key(m_lexer.get_string())))
                        {
                            return false;
                        }

                        // parse separator (:)
                        if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator))
                        {
                            return sax->parse_error(m_lexer.get_position(),
                                                    m_lexer.get_token_string(),
                                                    parse_error::create(101, m_lexer.get_position(), exception_message(token_type::name_separator, "object separator"), nullptr));
                        }

                        // remember we are now inside an object
                        states.push_back(false);

                        // parse values
                        get_token();
                        continue;
                    }

                    case token_type::begin_array:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->start_array(static_cast<std::size_t>(-1))))
                        {
                            return false;
                        }

                        // closing ] -> we are done
                        if (get_token() == token_type::end_array)
                        {
                            if (JSON_HEDLEY_UNLIKELY(!sax->end_array()))
                            {
                                return false;
                            }
                            break;
                        }

                        // remember we are now inside an array
                        states.push_back(true);

                        // parse values (no need to call get_token)
                        continue;
                    }

                    case token_type::value_float:
                    {
                        const auto res = m_lexer.get_number_float();

                        if (JSON_HEDLEY_UNLIKELY(!std::isfinite(res)))
                        {
                            return sax->parse_error(m_lexer.get_position(),
                                                    m_lexer.get_token_string(),
                                                    out_of_range::create(406, concat("number overflow parsing '", m_lexer.get_token_string(), '\''), nullptr));
                        }

                        if (JSON_HEDLEY_UNLIKELY(!sax->number_float(res, m_lexer.get_string())))
                        {
                            return false;
                        }

                        break;
                    }

                    case token_type::literal_false:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->boolean(false)))
                        {
                            return false;
                        }
                        break;
                    }

                    case token_type::literal_null:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->null()))
                        {
                            return false;
                        }
                        break;
                    }

                    case token_type::literal_true:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->boolean(true)))
                        {
                            return false;
                        }
                        break;
                    }

                    case token_type::value_integer:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->number_integer(m_lexer.get_number_integer())))
                        {
                            return false;
                        }
                        break;
                    }

                    case token_type::value_string:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->string(m_lexer.get_string())))
                        {
                            return false;
                        }
                        break;
                    }

                    case token_type::value_unsigned:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->number_unsigned(m_lexer.get_number_unsigned())))
                        {
                            return false;
                        }
                        break;
                    }

                    case token_type::parse_error:
                    {
                        // using "uninitialized" to avoid "expected" message
                        return sax->parse_error(m_lexer.get_position(),
                                                m_lexer.get_token_string(),
                                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::uninitialized, "value"), nullptr));
                    }
                    case token_type::end_of_input:
                    {
                        if (JSON_HEDLEY_UNLIKELY(m_lexer.get_position().chars_read_total == 1))
                        {
                            return sax->parse_error(m_lexer.get_position(),
                                                    m_lexer.get_token_string(),
                                                    parse_error::create(101, m_lexer.get_position(),
                                                            "attempting to parse an empty input; check that your input string or stream contains the expected JSON", nullptr));
                        }

                        return sax->parse_error(m_lexer.get_position(),
                                                m_lexer.get_token_string(),
                                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::literal_or_value, "value"), nullptr));
                    }
                    case token_type::uninitialized:
                    case token_type::end_array:
                    case token_type::end_object:
                    case token_type::name_separator:
                    case token_type::value_separator:
                    case token_type::literal_or_value:
                    default: // the last token was unexpected
                    {
                        return sax->parse_error(m_lexer.get_position(),
                                                m_lexer.get_token_string(),
                                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::literal_or_value, "value"), nullptr));
                    }
                }
            }
            else
            {
                skip_to_state_evaluation = false;
            }

            // we reached this line after we successfully parsed a value
            if (states.empty())
            {
                // empty stack: we reached the end of the hierarchy: done
                return true;
            }

            if (states.back())  // array
            {
                // comma -> next value
                if (get_token() == token_type::value_separator)
                {
                    // parse a new value
                    get_token();
                    continue;
                }

                // closing ]
                if (JSON_HEDLEY_LIKELY(last_token == token_type::end_array))
                {
                    if (JSON_HEDLEY_UNLIKELY(!sax->end_array()))
                    {
                        return false;
                    }

                    // We are done with this array. Before we can parse a
                    // new value, we need to evaluate the new state first.
                    // By setting skip_to_state_evaluation to false, we
                    // are effectively jumping to the beginning of this if.
                    JSON_ASSERT(!states.empty());
                    states.pop_back();
                    skip_to_state_evaluation = true;
                    continue;
                }

                return sax->parse_error(m_lexer.get_position(),
                                        m_lexer.get_token_string(),
                                        parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_array, "array"), nullptr));
            }

            // states.back() is false -> object

            // comma -> next value
            if (get_token() == token_type::value_separator)
            {
                // parse key
                if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::value_string))
                {
                    return sax->parse_error(m_lexer.get_position(),
                                            m_lexer.get_token_string(),
                                            parse_error::create(101, m_lexer.get_position(), exception_message(token_type::value_string, "object key"), nullptr));
                }

                if (JSON_HEDLEY_UNLIKELY(!sax->key(m_lexer.get_string())))
                {
                    return false;
                }

                // parse separator (:)
                if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator))
                {
                    return sax->parse_error(m_lexer.get_position(),
                                            m_lexer.get_token_string(),
                                            parse_error::create(101, m_lexer.get_position(), exception_message(token_type::name_separator, "object separator"), nullptr));
                }

                // parse values
                get_token();
                continue;
            }

            // closing }
            if (JSON_HEDLEY_LIKELY(last_token == token_type::end_object))
            {
                if (JSON_HEDLEY_UNLIKELY(!sax->end_object()))
                {
                    return false;
                }

                // We are done with this object. Before we can parse a
                // new value, we need to evaluate the new state first.
                // By setting skip_to_state_evaluation to false, we
                // are effectively jumping to the beginning of this if.
                JSON_ASSERT(!states.empty());
                states.pop_back();
                skip_to_state_evaluation = true;
                continue;
            }

            return sax->parse_error(m_lexer.get_position(),
                                    m_lexer.get_token_string(),
                                    parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_object, "object"), nullptr));
        }
    }

    /// get next token from lexer
    token_type get_token()
    {
        return last_token = m_lexer.scan();
    }

    std::string exception_message(const token_type expected, const std::string& context)
    {
        std::string error_msg = "syntax error ";

        if (!context.empty())
        {
            error_msg += concat("while parsing ", context, ' ');
        }

        error_msg += "- ";

        if (last_token == token_type::parse_error)
        {
            error_msg += concat(m_lexer.get_error_message(), "; last read: '",
                                m_lexer.get_token_string(), '\'');
        }
        else
        {
            error_msg += concat("unexpected ", lexer_t::token_type_name(last_token));
        }

        if (expected != token_type::uninitialized)
        {
            error_msg += concat("; expected ", lexer_t::token_type_name(expected));
        }

        return error_msg;
    }

  private:
    /// callback function
    const parser_callback_t<BasicJsonType> callback = nullptr;
    /// the type of the last read token
    token_type last_token = token_type::uninitialized;
    /// the lexer
    lexer_t m_lexer;
    /// whether to throw exceptions in case of errors
    const bool allow_exceptions = true;
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/iterators/internal_iterator.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


// #include <nlohmann/detail/abi_macros.hpp>

// #include <nlohmann/detail/iterators/primitive_iterator.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <cstddef> // ptrdiff_t
#include <limits>  // numeric_limits

// #include <nlohmann/detail/macro_scope.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

/*
@brief an iterator for primitive JSON types

This class models an iterator for primitive JSON types (boolean, number,
string). It's only purpose is to allow the iterator/const_iterator classes
to "iterate" over primitive values. Internally, the iterator is modeled by
a `difference_type` variable. Value begin_value (`0`) models the begin,
end_value (`1`) models past the end.
*/
class primitive_iterator_t
{
  private:
    using difference_type = std::ptrdiff_t;
    static constexpr difference_type begin_value = 0;
    static constexpr difference_type end_value = begin_value + 1;

  JSON_PRIVATE_UNLESS_TESTED:
    /// iterator as signed integer type
    difference_type m_it = (std::numeric_limits<std::ptrdiff_t>::min)();

  public:
    constexpr difference_type get_value() const noexcept
    {
        return m_it;
    }

    /// set iterator to a defined beginning
    void set_begin() noexcept
    {
        m_it = begin_value;
    }

    /// set iterator to a defined past the end
    void set_end() noexcept
    {
        m_it = end_value;
    }

    /// return whether the iterator can be dereferenced
    constexpr bool is_begin() const noexcept
    {
        return m_it == begin_value;
    }

    /// return whether the iterator is at end
    constexpr bool is_end() const noexcept
    {
        return m_it == end_value;
    }

    friend constexpr bool operator==(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
    {
        return lhs.m_it == rhs.m_it;
    }

    friend constexpr bool operator<(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
    {
        return lhs.m_it < rhs.m_it;
    }

    primitive_iterator_t operator+(difference_type n) noexcept
    {
        auto result = *this;
        result += n;
        return result;
    }

    friend constexpr difference_type operator-(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
    {
        return lhs.m_it - rhs.m_it;
    }

    primitive_iterator_t& operator++() noexcept
    {
        ++m_it;
        return *this;
    }

    primitive_iterator_t operator++(int)& noexcept // NOLINT(cert-dcl21-cpp)
    {
        auto result = *this;
        ++m_it;
        return result;
    }

    primitive_iterator_t& operator--() noexcept
    {
        --m_it;
        return *this;
    }

    primitive_iterator_t operator--(int)& noexcept // NOLINT(cert-dcl21-cpp)
    {
        auto result = *this;
        --m_it;
        return result;
    }

    primitive_iterator_t& operator+=(difference_type n) noexcept
    {
        m_it += n;
        return *this;
    }

    primitive_iterator_t& operator-=(difference_type n) noexcept
    {
        m_it -= n;
        return *this;
    }
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

/*!
@brief an iterator value

@note This structure could easily be a union, but MSVC currently does not allow
unions members with complex constructors, see https://github.com/nlohmann/json/pull/105.
*/
template<typename BasicJsonType> struct internal_iterator
{
    /// iterator for JSON objects
    typename BasicJsonType::object_t::iterator object_iterator {};
    /// iterator for JSON arrays
    typename BasicJsonType::array_t::iterator array_iterator {};
    /// generic iterator for all other types
    primitive_iterator_t primitive_iterator {};
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/iterators/iter_impl.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <iterator> // iterator, random_access_iterator_tag, bidirectional_iterator_tag, advance, next
#include <type_traits> // conditional, is_const, remove_const

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/iterators/internal_iterator.hpp>

// #include <nlohmann/detail/iterators/primitive_iterator.hpp>

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/cpp_future.hpp>

// #include <nlohmann/detail/meta/type_traits.hpp>

// #include <nlohmann/detail/value_t.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

// forward declare, to be able to friend it later on
template<typename IteratorType> class iteration_proxy;
template<typename IteratorType> class iteration_proxy_value;

/*!
@brief a template for a bidirectional iterator for the @ref basic_json class
This class implements a both iterators (iterator and const_iterator) for the
@ref basic_json class.
@note An iterator is called *initialized* when a pointer to a JSON value has
      been set (e.g., by a constructor or a copy assignment). If the iterator is
      default-constructed, it is *uninitialized* and most methods are undefined.
      **The library uses assertions to detect calls on uninitialized iterators.**
@requirement The class satisfies the following concept requirements:
-
[BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator):
  The iterator that can be moved can be moved in both directions (i.e.
  incremented and decremented).
@since version 1.0.0, simplified in version 2.0.9, change to bidirectional
       iterators in version 3.0.0 (see https://github.com/nlohmann/json/issues/593)
*/
template<typename BasicJsonType>
class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-special-member-functions)
{
    /// the iterator with BasicJsonType of different const-ness
    using other_iter_impl = iter_impl<typename std::conditional<std::is_const<BasicJsonType>::value, typename std::remove_const<BasicJsonType>::type, const BasicJsonType>::type>;
    /// allow basic_json to access private members
    friend other_iter_impl;
    friend BasicJsonType;
    friend iteration_proxy<iter_impl>;
    friend iteration_proxy_value<iter_impl>;

    using object_t = typename BasicJsonType::object_t;
    using array_t = typename BasicJsonType::array_t;
    // make sure BasicJsonType is basic_json or const basic_json
    static_assert(is_basic_json<typename std::remove_const<BasicJsonType>::type>::value,
                  "iter_impl only accepts (const) basic_json");
    // superficial check for the LegacyBidirectionalIterator named requirement
    static_assert(std::is_base_of<std::bidirectional_iterator_tag, std::bidirectional_iterator_tag>::value
                  &&  std::is_base_of<std::bidirectional_iterator_tag, typename std::iterator_traits<typename array_t::iterator>::iterator_category>::value,
                  "basic_json iterator assumes array and object type iterators satisfy the LegacyBidirectionalIterator named requirement.");

  public:
    /// The std::iterator class template (used as a base class to provide typedefs) is deprecated in C++17.
    /// The C++ Standard has never required user-defined iterators to derive from std::iterator.
    /// A user-defined iterator should provide publicly accessible typedefs named
    /// iterator_category, value_type, difference_type, pointer, and reference.
    /// Note that value_type is required to be non-const, even for constant iterators.
    using iterator_category = std::bidirectional_iterator_tag;

    /// the type of the values when the iterator is dereferenced
    using value_type = typename BasicJsonType::value_type;
    /// a type to represent differences between iterators
    using difference_type = typename BasicJsonType::difference_type;
    /// defines a pointer to the type iterated over (value_type)
    using pointer = typename std::conditional<std::is_const<BasicJsonType>::value,
          typename BasicJsonType::const_pointer,
          typename BasicJsonType::pointer>::type;
    /// defines a reference to the type iterated over (value_type)
    using reference =
        typename std::conditional<std::is_const<BasicJsonType>::value,
        typename BasicJsonType::const_reference,
        typename BasicJsonType::reference>::type;

    iter_impl() = default;
    ~iter_impl() = default;
    iter_impl(iter_impl&&) noexcept = default;
    iter_impl& operator=(iter_impl&&) noexcept = default;

    /*!
    @brief constructor for a given JSON instance
    @param[in] object  pointer to a JSON object for this iterator
    @pre object != nullptr
    @post The iterator is initialized; i.e. `m_object != nullptr`.
    */
    explicit iter_impl(pointer object) noexcept : m_object(object)
    {
        JSON_ASSERT(m_object != nullptr);

        switch (m_object->m_data.m_type)
        {
            case value_t::object:
            {
                m_it.object_iterator = typename object_t::iterator();
                break;
            }

            case value_t::array:
            {
                m_it.array_iterator = typename array_t::iterator();
                break;
            }

            case value_t::null:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                m_it.primitive_iterator = primitive_iterator_t();
                break;
            }
        }
    }

    /*!
    @note The conventional copy constructor and copy assignment are implicitly
          defined. Combined with the following converting constructor and
          assignment, they support: (1) copy from iterator to iterator, (2)
          copy from const iterator to const iterator, and (3) conversion from
          iterator to const iterator. However conversion from const iterator
          to iterator is not defined.
    */

    /*!
    @brief const copy constructor
    @param[in] other const iterator to copy from
    @note This copy constructor had to be defined explicitly to circumvent a bug
          occurring on msvc v19.0 compiler (VS 2015) debug build. For more
          information refer to: https://github.com/nlohmann/json/issues/1608
    */
    iter_impl(const iter_impl<const BasicJsonType>& other) noexcept
        : m_object(other.m_object), m_it(other.m_it)
    {}

    /*!
    @brief converting assignment
    @param[in] other const iterator to copy from
    @return const/non-const iterator
    @note It is not checked whether @a other is initialized.
    */
    iter_impl& operator=(const iter_impl<const BasicJsonType>& other) noexcept
    {
        if (&other != this)
        {
            m_object = other.m_object;
            m_it = other.m_it;
        }
        return *this;
    }

    /*!
    @brief converting constructor
    @param[in] other  non-const iterator to copy from
    @note It is not checked whether @a other is initialized.
    */
    iter_impl(const iter_impl<typename std::remove_const<BasicJsonType>::type>& other) noexcept
        : m_object(other.m_object), m_it(other.m_it)
    {}

    /*!
    @brief converting assignment
    @param[in] other  non-const iterator to copy from
    @return const/non-const iterator
    @note It is not checked whether @a other is initialized.
    */
    iter_impl& operator=(const iter_impl<typename std::remove_const<BasicJsonType>::type>& other) noexcept // NOLINT(cert-oop54-cpp)
    {
        m_object = other.m_object;
        m_it = other.m_it;
        return *this;
    }

  JSON_PRIVATE_UNLESS_TESTED:
    /*!
    @brief set the iterator to the first value
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    void set_begin() noexcept
    {
        JSON_ASSERT(m_object != nullptr);

        switch (m_object->m_data.m_type)
        {
            case value_t::object:
            {
                m_it.object_iterator = m_object->m_data.m_value.object->begin();
                break;
            }

            case value_t::array:
            {
                m_it.array_iterator = m_object->m_data.m_value.array->begin();
                break;
            }

            case value_t::null:
            {
                // set to end so begin()==end() is true: null is empty
                m_it.primitive_iterator.set_end();
                break;
            }

            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                m_it.primitive_iterator.set_begin();
                break;
            }
        }
    }

    /*!
    @brief set the iterator past the last value
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    void set_end() noexcept
    {
        JSON_ASSERT(m_object != nullptr);

        switch (m_object->m_data.m_type)
        {
            case value_t::object:
            {
                m_it.object_iterator = m_object->m_data.m_value.object->end();
                break;
            }

            case value_t::array:
            {
                m_it.array_iterator = m_object->m_data.m_value.array->end();
                break;
            }

            case value_t::null:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                m_it.primitive_iterator.set_end();
                break;
            }
        }
    }

  public:
    /*!
    @brief return a reference to the value pointed to by the iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    reference operator*() const
    {
        JSON_ASSERT(m_object != nullptr);

        switch (m_object->m_data.m_type)
        {
            case value_t::object:
            {
                JSON_ASSERT(m_it.object_iterator != m_object->m_data.m_value.object->end());
                return m_it.object_iterator->second;
            }

            case value_t::array:
            {
                JSON_ASSERT(m_it.array_iterator != m_object->m_data.m_value.array->end());
                return *m_it.array_iterator;
            }

            case value_t::null:
                JSON_THROW(invalid_iterator::create(214, "cannot get value", m_object));

            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.is_begin()))
                {
                    return *m_object;
                }

                JSON_THROW(invalid_iterator::create(214, "cannot get value", m_object));
            }
        }
    }

    /*!
    @brief dereference the iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    pointer operator->() const
    {
        JSON_ASSERT(m_object != nullptr);

        switch (m_object->m_data.m_type)
        {
            case value_t::object:
            {
                JSON_ASSERT(m_it.object_iterator != m_object->m_data.m_value.object->end());
                return &(m_it.object_iterator->second);
            }

            case value_t::array:
            {
                JSON_ASSERT(m_it.array_iterator != m_object->m_data.m_value.array->end());
                return &*m_it.array_iterator;
            }

            case value_t::null:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.is_begin()))
                {
                    return m_object;
                }

                JSON_THROW(invalid_iterator::create(214, "cannot get value", m_object));
            }
        }
    }

    /*!
    @brief post-increment (it++)
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl operator++(int)& // NOLINT(cert-dcl21-cpp)
    {
        auto result = *this;
        ++(*this);
        return result;
    }

    /*!
    @brief pre-increment (++it)
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl& operator++()
    {
        JSON_ASSERT(m_object != nullptr);

        switch (m_object->m_data.m_type)
        {
            case value_t::object:
            {
                std::advance(m_it.object_iterator, 1);
                break;
            }

            case value_t::array:
            {
                std::advance(m_it.array_iterator, 1);
                break;
            }

            case value_t::null:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                ++m_it.primitive_iterator;
                break;
            }
        }

        return *this;
    }

    /*!
    @brief post-decrement (it--)
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl operator--(int)& // NOLINT(cert-dcl21-cpp)
    {
        auto result = *this;
        --(*this);
        return result;
    }

    /*!
    @brief pre-decrement (--it)
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl& operator--()
    {
        JSON_ASSERT(m_object != nullptr);

        switch (m_object->m_data.m_type)
        {
            case value_t::object:
            {
                std::advance(m_it.object_iterator, -1);
                break;
            }

            case value_t::array:
            {
                std::advance(m_it.array_iterator, -1);
                break;
            }

            case value_t::null:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                --m_it.primitive_iterator;
                break;
            }
        }

        return *this;
    }

    /*!
    @brief comparison: equal
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    template < typename IterImpl, detail::enable_if_t < (std::is_same<IterImpl, iter_impl>::value || std::is_same<IterImpl, other_iter_impl>::value), std::nullptr_t > = nullptr >
    bool operator==(const IterImpl& other) const
    {
        // if objects are not the same, the comparison is undefined
        if (JSON_HEDLEY_UNLIKELY(m_object != other.m_object))
        {
            JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers", m_object));
        }

        JSON_ASSERT(m_object != nullptr);

        switch (m_object->m_data.m_type)
        {
            case value_t::object:
                return (m_it.object_iterator == other.m_it.object_iterator);

            case value_t::array:
                return (m_it.array_iterator == other.m_it.array_iterator);

            case value_t::null:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
                return (m_it.primitive_iterator == other.m_it.primitive_iterator);
        }
    }

    /*!
    @brief comparison: not equal
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    template < typename IterImpl, detail::enable_if_t < (std::is_same<IterImpl, iter_impl>::value || std::is_same<IterImpl, other_iter_impl>::value), std::nullptr_t > = nullptr >
    bool operator!=(const IterImpl& other) const
    {
        return !operator==(other);
    }

    /*!
    @brief comparison: smaller
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    bool operator<(const iter_impl& other) const
    {
        // if objects are not the same, the comparison is undefined
        if (JSON_HEDLEY_UNLIKELY(m_object != other.m_object))
        {
            JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers", m_object));
        }

        JSON_ASSERT(m_object != nullptr);

        switch (m_object->m_data.m_type)
        {
            case value_t::object:
                JSON_THROW(invalid_iterator::create(213, "cannot compare order of object iterators", m_object));

            case value_t::array:
                return (m_it.array_iterator < other.m_it.array_iterator);

            case value_t::null:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
                return (m_it.primitive_iterator < other.m_it.primitive_iterator);
        }
    }

    /*!
    @brief comparison: less than or equal
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    bool operator<=(const iter_impl& other) const
    {
        return !other.operator < (*this);
    }

    /*!
    @brief comparison: greater than
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    bool operator>(const iter_impl& other) const
    {
        return !operator<=(other);
    }

    /*!
    @brief comparison: greater than or equal
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    bool operator>=(const iter_impl& other) const
    {
        return !operator<(other);
    }

    /*!
    @brief add to iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl& operator+=(difference_type i)
    {
        JSON_ASSERT(m_object != nullptr);

        switch (m_object->m_data.m_type)
        {
            case value_t::object:
                JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", m_object));

            case value_t::array:
            {
                std::advance(m_it.array_iterator, i);
                break;
            }

            case value_t::null:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                m_it.primitive_iterator += i;
                break;
            }
        }

        return *this;
    }

    /*!
    @brief subtract from iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl& operator-=(difference_type i)
    {
        return operator+=(-i);
    }

    /*!
    @brief add to iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl operator+(difference_type i) const
    {
        auto result = *this;
        result += i;
        return result;
    }

    /*!
    @brief addition of distance and iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    friend iter_impl operator+(difference_type i, const iter_impl& it)
    {
        auto result = it;
        result += i;
        return result;
    }

    /*!
    @brief subtract from iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl operator-(difference_type i) const
    {
        auto result = *this;
        result -= i;
        return result;
    }

    /*!
    @brief return difference
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    difference_type operator-(const iter_impl& other) const
    {
        JSON_ASSERT(m_object != nullptr);

        switch (m_object->m_data.m_type)
        {
            case value_t::object:
                JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", m_object));

            case value_t::array:
                return m_it.array_iterator - other.m_it.array_iterator;

            case value_t::null:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
                return m_it.primitive_iterator - other.m_it.primitive_iterator;
        }
    }

    /*!
    @brief access to successor
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    reference operator[](difference_type n) const
    {
        JSON_ASSERT(m_object != nullptr);

        switch (m_object->m_data.m_type)
        {
            case value_t::object:
                JSON_THROW(invalid_iterator::create(208, "cannot use operator[] for object iterators", m_object));

            case value_t::array:
                return *std::next(m_it.array_iterator, n);

            case value_t::null:
                JSON_THROW(invalid_iterator::create(214, "cannot get value", m_object));

            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.get_value() == -n))
                {
                    return *m_object;
                }

                JSON_THROW(invalid_iterator::create(214, "cannot get value", m_object));
            }
        }
    }

    /*!
    @brief return the key of an object iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    const typename object_t::key_type& key() const
    {
        JSON_ASSERT(m_object != nullptr);

        if (JSON_HEDLEY_LIKELY(m_object->is_object()))
        {
            return m_it.object_iterator->first;
        }

        JSON_THROW(invalid_iterator::create(207, "cannot use key() for non-object iterators", m_object));
    }

    /*!
    @brief return the value of an iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    reference value() const
    {
        return operator*();
    }

  JSON_PRIVATE_UNLESS_TESTED:
    /// associated JSON instance
    pointer m_object = nullptr;
    /// the actual iterator of the associated instance
    internal_iterator<typename std::remove_const<BasicJsonType>::type> m_it {};
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/iterators/iteration_proxy.hpp>

// #include <nlohmann/detail/iterators/json_reverse_iterator.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <cstddef> // ptrdiff_t
#include <iterator> // reverse_iterator
#include <utility> // declval

// #include <nlohmann/detail/abi_macros.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

//////////////////////
// reverse_iterator //
//////////////////////

/*!
@brief a template for a reverse iterator class

@tparam Base the base iterator type to reverse. Valid types are @ref
iterator (to create @ref reverse_iterator) and @ref const_iterator (to
create @ref const_reverse_iterator).

@requirement The class satisfies the following concept requirements:
-
[BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator):
  The iterator that can be moved can be moved in both directions (i.e.
  incremented and decremented).
- [OutputIterator](https://en.cppreference.com/w/cpp/named_req/OutputIterator):
  It is possible to write to the pointed-to element (only if @a Base is
  @ref iterator).

@since version 1.0.0
*/
template<typename Base>
class json_reverse_iterator : public std::reverse_iterator<Base>
{
  public:
    using difference_type = std::ptrdiff_t;
    /// shortcut to the reverse iterator adapter
    using base_iterator = std::reverse_iterator<Base>;
    /// the reference type for the pointed-to element
    using reference = typename Base::reference;

    /// create reverse iterator from iterator
    explicit json_reverse_iterator(const typename base_iterator::iterator_type& it) noexcept
        : base_iterator(it) {}

    /// create reverse iterator from base class
    explicit json_reverse_iterator(const base_iterator& it) noexcept : base_iterator(it) {}

    /// post-increment (it++)
    json_reverse_iterator operator++(int)& // NOLINT(cert-dcl21-cpp)
    {
        return static_cast<json_reverse_iterator>(base_iterator::operator++(1));
    }

    /// pre-increment (++it)
    json_reverse_iterator& operator++()
    {
        return static_cast<json_reverse_iterator&>(base_iterator::operator++());
    }

    /// post-decrement (it--)
    json_reverse_iterator operator--(int)& // NOLINT(cert-dcl21-cpp)
    {
        return static_cast<json_reverse_iterator>(base_iterator::operator--(1));
    }

    /// pre-decrement (--it)
    json_reverse_iterator& operator--()
    {
        return static_cast<json_reverse_iterator&>(base_iterator::operator--());
    }

    /// add to iterator
    json_reverse_iterator& operator+=(difference_type i)
    {
        return static_cast<json_reverse_iterator&>(base_iterator::operator+=(i));
    }

    /// add to iterator
    json_reverse_iterator operator+(difference_type i) const
    {
        return static_cast<json_reverse_iterator>(base_iterator::operator+(i));
    }

    /// subtract from iterator
    json_reverse_iterator operator-(difference_type i) const
    {
        return static_cast<json_reverse_iterator>(base_iterator::operator-(i));
    }

    /// return difference
    difference_type operator-(const json_reverse_iterator& other) const
    {
        return base_iterator(*this) - base_iterator(other);
    }

    /// access to successor
    reference operator[](difference_type n) const
    {
        return *(this->operator+(n));
    }

    /// return the key of an object iterator
    auto key() const -> decltype(std::declval<Base>().key())
    {
        auto it = --this->base();
        return it.key();
    }

    /// return the value of an iterator
    reference value() const
    {
        auto it = --this->base();
        return it.operator * ();
    }
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/iterators/primitive_iterator.hpp>

// #include <nlohmann/detail/json_custom_base_class.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <type_traits> // conditional, is_same

// #include <nlohmann/detail/abi_macros.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

/*!
@brief Default base class of the @ref basic_json class.

So that the correct implementations of the copy / move ctors / assign operators
of @ref basic_json do not require complex case distinctions
(no base class / custom base class used as customization point),
@ref basic_json always has a base class.
By default, this class is used because it is empty and thus has no effect
on the behavior of @ref basic_json.
*/
struct json_default_base {};

template<class T>
using json_base_class = typename std::conditional <
                        std::is_same<T, void>::value,
                        json_default_base,
                        T
                        >::type;

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/json_pointer.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <algorithm> // all_of
#include <cctype> // isdigit
#include <cerrno> // errno, ERANGE
#include <cstdlib> // strtoull
#ifndef JSON_NO_IO
    #include <iosfwd> // ostream
#endif  // JSON_NO_IO
#include <limits> // max
#include <numeric> // accumulate
#include <string> // string
#include <utility> // move
#include <vector> // vector

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/string_concat.hpp>

// #include <nlohmann/detail/string_escape.hpp>

// #include <nlohmann/detail/value_t.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN

/// @brief JSON Pointer defines a string syntax for identifying a specific value within a JSON document
/// @sa https://json.nlohmann.me/api/json_pointer/
template<typename RefStringType>
class json_pointer
{
    // allow basic_json to access private members
    NLOHMANN_BASIC_JSON_TPL_DECLARATION
    friend class basic_json;

    template<typename>
    friend class json_pointer;

    template<typename T>
    struct string_t_helper
    {
        using type = T;
    };

    NLOHMANN_BASIC_JSON_TPL_DECLARATION
    struct string_t_helper<NLOHMANN_BASIC_JSON_TPL>
    {
        using type = StringType;
    };

  public:
    // for backwards compatibility accept BasicJsonType
    using string_t = typename string_t_helper<RefStringType>::type;

    /// @brief create JSON pointer
    /// @sa https://json.nlohmann.me/api/json_pointer/json_pointer/
    explicit json_pointer(const string_t& s = "")
        : reference_tokens(split(s))
    {}

    /// @brief return a string representation of the JSON pointer
    /// @sa https://json.nlohmann.me/api/json_pointer/to_string/
    string_t to_string() const
    {
        return std::accumulate(reference_tokens.begin(), reference_tokens.end(),
                               string_t{},
                               [](const string_t& a, const string_t& b)
        {
            return detail::concat(a, '/', detail::escape(b));
        });
    }

    /// @brief return a string representation of the JSON pointer
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_string/
    JSON_HEDLEY_DEPRECATED_FOR(3.11.0, to_string())
    operator string_t() const
    {
        return to_string();
    }

#ifndef JSON_NO_IO
    /// @brief write string representation of the JSON pointer to stream
    /// @sa https://json.nlohmann.me/api/basic_json/operator_ltlt/
    friend std::ostream& operator<<(std::ostream& o, const json_pointer& ptr)
    {
        o << ptr.to_string();
        return o;
    }
#endif

    /// @brief append another JSON pointer at the end of this JSON pointer
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_slasheq/
    json_pointer& operator/=(const json_pointer& ptr)
    {
        reference_tokens.insert(reference_tokens.end(),
                                ptr.reference_tokens.begin(),
                                ptr.reference_tokens.end());
        return *this;
    }

    /// @brief append an unescaped reference token at the end of this JSON pointer
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_slasheq/
    json_pointer& operator/=(string_t token)
    {
        push_back(std::move(token));
        return *this;
    }

    /// @brief append an array index at the end of this JSON pointer
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_slasheq/
    json_pointer& operator/=(std::size_t array_idx)
    {
        return *this /= std::to_string(array_idx);
    }

    /// @brief create a new JSON pointer by appending the right JSON pointer at the end of the left JSON pointer
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_slash/
    friend json_pointer operator/(const json_pointer& lhs,
                                  const json_pointer& rhs)
    {
        return json_pointer(lhs) /= rhs;
    }

    /// @brief create a new JSON pointer by appending the unescaped token at the end of the JSON pointer
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_slash/
    friend json_pointer operator/(const json_pointer& lhs, string_t token) // NOLINT(performance-unnecessary-value-param)
    {
        return json_pointer(lhs) /= std::move(token);
    }

    /// @brief create a new JSON pointer by appending the array-index-token at the end of the JSON pointer
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_slash/
    friend json_pointer operator/(const json_pointer& lhs, std::size_t array_idx)
    {
        return json_pointer(lhs) /= array_idx;
    }

    /// @brief returns the parent of this JSON pointer
    /// @sa https://json.nlohmann.me/api/json_pointer/parent_pointer/
    json_pointer parent_pointer() const
    {
        if (empty())
        {
            return *this;
        }

        json_pointer res = *this;
        res.pop_back();
        return res;
    }

    /// @brief remove last reference token
    /// @sa https://json.nlohmann.me/api/json_pointer/pop_back/
    void pop_back()
    {
        if (JSON_HEDLEY_UNLIKELY(empty()))
        {
            JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent", nullptr));
        }

        reference_tokens.pop_back();
    }

    /// @brief return last reference token
    /// @sa https://json.nlohmann.me/api/json_pointer/back/
    const string_t& back() const
    {
        if (JSON_HEDLEY_UNLIKELY(empty()))
        {
            JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent", nullptr));
        }

        return reference_tokens.back();
    }

    /// @brief append an unescaped token at the end of the reference pointer
    /// @sa https://json.nlohmann.me/api/json_pointer/push_back/
    void push_back(const string_t& token)
    {
        reference_tokens.push_back(token);
    }

    /// @brief append an unescaped token at the end of the reference pointer
    /// @sa https://json.nlohmann.me/api/json_pointer/push_back/
    void push_back(string_t&& token)
    {
        reference_tokens.push_back(std::move(token));
    }

    /// @brief return whether pointer points to the root document
    /// @sa https://json.nlohmann.me/api/json_pointer/empty/
    bool empty() const noexcept
    {
        return reference_tokens.empty();
    }

  private:
    /*!
    @param[in] s  reference token to be converted into an array index

    @return integer representation of @a s

    @throw parse_error.106  if an array index begins with '0'
    @throw parse_error.109  if an array index begins not with a digit
    @throw out_of_range.404 if string @a s could not be converted to an integer
    @throw out_of_range.410 if an array index exceeds size_type
    */
    template<typename BasicJsonType>
    static typename BasicJsonType::size_type array_index(const string_t& s)
    {
        using size_type = typename BasicJsonType::size_type;

        // error condition (cf. RFC 6901, Sect. 4)
        if (JSON_HEDLEY_UNLIKELY(s.size() > 1 && s[0] == '0'))
        {
            JSON_THROW(detail::parse_error::create(106, 0, detail::concat("array index '", s, "' must not begin with '0'"), nullptr));
        }

        // error condition (cf. RFC 6901, Sect. 4)
        if (JSON_HEDLEY_UNLIKELY(s.size() > 1 && !(s[0] >= '1' && s[0] <= '9')))
        {
            JSON_THROW(detail::parse_error::create(109, 0, detail::concat("array index '", s, "' is not a number"), nullptr));
        }

        const char* p = s.c_str();
        char* p_end = nullptr;
        errno = 0; // strtoull doesn't reset errno
        const unsigned long long res = std::strtoull(p, &p_end, 10); // NOLINT(runtime/int)
        if (p == p_end // invalid input or empty string
                || errno == ERANGE // out of range
                || JSON_HEDLEY_UNLIKELY(static_cast<std::size_t>(p_end - p) != s.size())) // incomplete read
        {
            JSON_THROW(detail::out_of_range::create(404, detail::concat("unresolved reference token '", s, "'"), nullptr));
        }

        // only triggered on special platforms (like 32bit), see also
        // https://github.com/nlohmann/json/pull/2203
        if (res >= static_cast<unsigned long long>((std::numeric_limits<size_type>::max)()))  // NOLINT(runtime/int)
        {
            JSON_THROW(detail::out_of_range::create(410, detail::concat("array index ", s, " exceeds size_type"), nullptr));   // LCOV_EXCL_LINE
        }

        return static_cast<size_type>(res);
    }

  JSON_PRIVATE_UNLESS_TESTED:
    json_pointer top() const
    {
        if (JSON_HEDLEY_UNLIKELY(empty()))
        {
            JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent", nullptr));
        }

        json_pointer result = *this;
        result.reference_tokens = {reference_tokens[0]};
        return result;
    }

  private:
    /*!
    @brief create and return a reference to the pointed to value

    @complexity Linear in the number of reference tokens.

    @throw parse_error.109 if array index is not a number
    @throw type_error.313 if value cannot be unflattened
    */
    template<typename BasicJsonType>
    BasicJsonType& get_and_create(BasicJsonType& j) const
    {
        auto* result = &j;

        // in case no reference tokens exist, return a reference to the JSON value
        // j which will be overwritten by a primitive value
        for (const auto& reference_token : reference_tokens)
        {
            switch (result->type())
            {
                case detail::value_t::null:
                {
                    if (reference_token == "0")
                    {
                        // start a new array if reference token is 0
                        result = &result->operator[](0);
                    }
                    else
                    {
                        // start a new object otherwise
                        result = &result->operator[](reference_token);
                    }
                    break;
                }

                case detail::value_t::object:
                {
                    // create an entry in the object
                    result = &result->operator[](reference_token);
                    break;
                }

                case detail::value_t::array:
                {
                    // create an entry in the array
                    result = &result->operator[](array_index<BasicJsonType>(reference_token));
                    break;
                }

                /*
                The following code is only reached if there exists a reference
                token _and_ the current value is primitive. In this case, we have
                an error situation, because primitive values may only occur as
                single value; that is, with an empty list of reference tokens.
                */
                case detail::value_t::string:
                case detail::value_t::boolean:
                case detail::value_t::number_integer:
                case detail::value_t::number_unsigned:
                case detail::value_t::number_float:
                case detail::value_t::binary:
                case detail::value_t::discarded:
                default:
                    JSON_THROW(detail::type_error::create(313, "invalid value to unflatten", &j));
            }
        }

        return *result;
    }

    /*!
    @brief return a reference to the pointed to value

    @note This version does not throw if a value is not present, but tries to
          create nested values instead. For instance, calling this function
          with pointer `"/this/that"` on a null value is equivalent to calling
          `operator[]("this").operator[]("that")` on that value, effectively
          changing the null value to an object.

    @param[in] ptr  a JSON value

    @return reference to the JSON value pointed to by the JSON pointer

    @complexity Linear in the length of the JSON pointer.

    @throw parse_error.106   if an array index begins with '0'
    @throw parse_error.109   if an array index was not a number
    @throw out_of_range.404  if the JSON pointer can not be resolved
    */
    template<typename BasicJsonType>
    BasicJsonType& get_unchecked(BasicJsonType* ptr) const
    {
        for (const auto& reference_token : reference_tokens)
        {
            // convert null values to arrays or objects before continuing
            if (ptr->is_null())
            {
                // check if reference token is a number
                const bool nums =
                    std::all_of(reference_token.begin(), reference_token.end(),
                                [](const unsigned char x)
                {
                    return std::isdigit(x);
                });

                // change value to array for numbers or "-" or to object otherwise
                *ptr = (nums || reference_token == "-")
                       ? detail::value_t::array
                       : detail::value_t::object;
            }

            switch (ptr->type())
            {
                case detail::value_t::object:
                {
                    // use unchecked object access
                    ptr = &ptr->operator[](reference_token);
                    break;
                }

                case detail::value_t::array:
                {
                    if (reference_token == "-")
                    {
                        // explicitly treat "-" as index beyond the end
                        ptr = &ptr->operator[](ptr->m_data.m_value.array->size());
                    }
                    else
                    {
                        // convert array index to number; unchecked access
                        ptr = &ptr->operator[](array_index<BasicJsonType>(reference_token));
                    }
                    break;
                }

                case detail::value_t::null:
                case detail::value_t::string:
                case detail::value_t::boolean:
                case detail::value_t::number_integer:
                case detail::value_t::number_unsigned:
                case detail::value_t::number_float:
                case detail::value_t::binary:
                case detail::value_t::discarded:
                default:
                    JSON_THROW(detail::out_of_range::create(404, detail::concat("unresolved reference token '", reference_token, "'"), ptr));
            }
        }

        return *ptr;
    }

    /*!
    @throw parse_error.106   if an array index begins with '0'
    @throw parse_error.109   if an array index was not a number
    @throw out_of_range.402  if the array index '-' is used
    @throw out_of_range.404  if the JSON pointer can not be resolved
    */
    template<typename BasicJsonType>
    BasicJsonType& get_checked(BasicJsonType* ptr) const
    {
        for (const auto& reference_token : reference_tokens)
        {
            switch (ptr->type())
            {
                case detail::value_t::object:
                {
                    // note: at performs range check
                    ptr = &ptr->at(reference_token);
                    break;
                }

                case detail::value_t::array:
                {
                    if (JSON_HEDLEY_UNLIKELY(reference_token == "-"))
                    {
                        // "-" always fails the range check
                        JSON_THROW(detail::out_of_range::create(402, detail::concat(
                                "array index '-' (", std::to_string(ptr->m_data.m_value.array->size()),
                                ") is out of range"), ptr));
                    }

                    // note: at performs range check
                    ptr = &ptr->at(array_index<BasicJsonType>(reference_token));
                    break;
                }

                case detail::value_t::null:
                case detail::value_t::string:
                case detail::value_t::boolean:
                case detail::value_t::number_integer:
                case detail::value_t::number_unsigned:
                case detail::value_t::number_float:
                case detail::value_t::binary:
                case detail::value_t::discarded:
                default:
                    JSON_THROW(detail::out_of_range::create(404, detail::concat("unresolved reference token '", reference_token, "'"), ptr));
            }
        }

        return *ptr;
    }

    /*!
    @brief return a const reference to the pointed to value

    @param[in] ptr  a JSON value

    @return const reference to the JSON value pointed to by the JSON
    pointer

    @throw parse_error.106   if an array index begins with '0'
    @throw parse_error.109   if an array index was not a number
    @throw out_of_range.402  if the array index '-' is used
    @throw out_of_range.404  if the JSON pointer can not be resolved
    */
    template<typename BasicJsonType>
    const BasicJsonType& get_unchecked(const BasicJsonType* ptr) const
    {
        for (const auto& reference_token : reference_tokens)
        {
            switch (ptr->type())
            {
                case detail::value_t::object:
                {
                    // use unchecked object access
                    ptr = &ptr->operator[](reference_token);
                    break;
                }

                case detail::value_t::array:
                {
                    if (JSON_HEDLEY_UNLIKELY(reference_token == "-"))
                    {
                        // "-" cannot be used for const access
                        JSON_THROW(detail::out_of_range::create(402, detail::concat("array index '-' (", std::to_string(ptr->m_data.m_value.array->size()), ") is out of range"), ptr));
                    }

                    // use unchecked array access
                    ptr = &ptr->operator[](array_index<BasicJsonType>(reference_token));
                    break;
                }

                case detail::value_t::null:
                case detail::value_t::string:
                case detail::value_t::boolean:
                case detail::value_t::number_integer:
                case detail::value_t::number_unsigned:
                case detail::value_t::number_float:
                case detail::value_t::binary:
                case detail::value_t::discarded:
                default:
                    JSON_THROW(detail::out_of_range::create(404, detail::concat("unresolved reference token '", reference_token, "'"), ptr));
            }
        }

        return *ptr;
    }

    /*!
    @throw parse_error.106   if an array index begins with '0'
    @throw parse_error.109   if an array index was not a number
    @throw out_of_range.402  if the array index '-' is used
    @throw out_of_range.404  if the JSON pointer can not be resolved
    */
    template<typename BasicJsonType>
    const BasicJsonType& get_checked(const BasicJsonType* ptr) const
    {
        for (const auto& reference_token : reference_tokens)
        {
            switch (ptr->type())
            {
                case detail::value_t::object:
                {
                    // note: at performs range check
                    ptr = &ptr->at(reference_token);
                    break;
                }

                case detail::value_t::array:
                {
                    if (JSON_HEDLEY_UNLIKELY(reference_token == "-"))
                    {
                        // "-" always fails the range check
                        JSON_THROW(detail::out_of_range::create(402, detail::concat(
                                "array index '-' (", std::to_string(ptr->m_data.m_value.array->size()),
                                ") is out of range"), ptr));
                    }

                    // note: at performs range check
                    ptr = &ptr->at(array_index<BasicJsonType>(reference_token));
                    break;
                }

                case detail::value_t::null:
                case detail::value_t::string:
                case detail::value_t::boolean:
                case detail::value_t::number_integer:
                case detail::value_t::number_unsigned:
                case detail::value_t::number_float:
                case detail::value_t::binary:
                case detail::value_t::discarded:
                default:
                    JSON_THROW(detail::out_of_range::create(404, detail::concat("unresolved reference token '", reference_token, "'"), ptr));
            }
        }

        return *ptr;
    }

    /*!
    @throw parse_error.106   if an array index begins with '0'
    @throw parse_error.109   if an array index was not a number
    */
    template<typename BasicJsonType>
    bool contains(const BasicJsonType* ptr) const
    {
        for (const auto& reference_token : reference_tokens)
        {
            switch (ptr->type())
            {
                case detail::value_t::object:
                {
                    if (!ptr->contains(reference_token))
                    {
                        // we did not find the key in the object
                        return false;
                    }

                    ptr = &ptr->operator[](reference_token);
                    break;
                }

                case detail::value_t::array:
                {
                    if (JSON_HEDLEY_UNLIKELY(reference_token == "-"))
                    {
                        // "-" always fails the range check
                        return false;
                    }
                    if (JSON_HEDLEY_UNLIKELY(reference_token.size() == 1 && !("0" <= reference_token && reference_token <= "9")))
                    {
                        // invalid char
                        return false;
                    }
                    if (JSON_HEDLEY_UNLIKELY(reference_token.size() > 1))
                    {
                        if (JSON_HEDLEY_UNLIKELY(!('1' <= reference_token[0] && reference_token[0] <= '9')))
                        {
                            // first char should be between '1' and '9'
                            return false;
                        }
                        for (std::size_t i = 1; i < reference_token.size(); i++)
                        {
                            if (JSON_HEDLEY_UNLIKELY(!('0' <= reference_token[i] && reference_token[i] <= '9')))
                            {
                                // other char should be between '0' and '9'
                                return false;
                            }
                        }
                    }

                    const auto idx = array_index<BasicJsonType>(reference_token);
                    if (idx >= ptr->size())
                    {
                        // index out of range
                        return false;
                    }

                    ptr = &ptr->operator[](idx);
                    break;
                }

                case detail::value_t::null:
                case detail::value_t::string:
                case detail::value_t::boolean:
                case detail::value_t::number_integer:
                case detail::value_t::number_unsigned:
                case detail::value_t::number_float:
                case detail::value_t::binary:
                case detail::value_t::discarded:
                default:
                {
                    // we do not expect primitive values if there is still a
                    // reference token to process
                    return false;
                }
            }
        }

        // no reference token left means we found a primitive value
        return true;
    }

    /*!
    @brief split the string input to reference tokens

    @note This function is only called by the json_pointer constructor.
          All exceptions below are documented there.

    @throw parse_error.107  if the pointer is not empty or begins with '/'
    @throw parse_error.108  if character '~' is not followed by '0' or '1'
    */
    static std::vector<string_t> split(const string_t& reference_string)
    {
        std::vector<string_t> result;

        // special case: empty reference string -> no reference tokens
        if (reference_string.empty())
        {
            return result;
        }

        // check if nonempty reference string begins with slash
        if (JSON_HEDLEY_UNLIKELY(reference_string[0] != '/'))
        {
            JSON_THROW(detail::parse_error::create(107, 1, detail::concat("JSON pointer must be empty or begin with '/' - was: '", reference_string, "'"), nullptr));
        }

        // extract the reference tokens:
        // - slash: position of the last read slash (or end of string)
        // - start: position after the previous slash
        for (
            // search for the first slash after the first character
            std::size_t slash = reference_string.find_first_of('/', 1),
            // set the beginning of the first reference token
            start = 1;
            // we can stop if start == 0 (if slash == string_t::npos)
            start != 0;
            // set the beginning of the next reference token
            // (will eventually be 0 if slash == string_t::npos)
            start = (slash == string_t::npos) ? 0 : slash + 1,
            // find next slash
            slash = reference_string.find_first_of('/', start))
        {
            // use the text between the beginning of the reference token
            // (start) and the last slash (slash).
            auto reference_token = reference_string.substr(start, slash - start);

            // check reference tokens are properly escaped
            for (std::size_t pos = reference_token.find_first_of('~');
                    pos != string_t::npos;
                    pos = reference_token.find_first_of('~', pos + 1))
            {
                JSON_ASSERT(reference_token[pos] == '~');

                // ~ must be followed by 0 or 1
                if (JSON_HEDLEY_UNLIKELY(pos == reference_token.size() - 1 ||
                                         (reference_token[pos + 1] != '0' &&
                                          reference_token[pos + 1] != '1')))
                {
                    JSON_THROW(detail::parse_error::create(108, 0, "escape character '~' must be followed with '0' or '1'", nullptr));
                }
            }

            // finally, store the reference token
            detail::unescape(reference_token);
            result.push_back(reference_token);
        }

        return result;
    }

  private:
    /*!
    @param[in] reference_string  the reference string to the current value
    @param[in] value             the value to consider
    @param[in,out] result        the result object to insert values to

    @note Empty objects or arrays are flattened to `null`.
    */
    template<typename BasicJsonType>
    static void flatten(const string_t& reference_string,
                        const BasicJsonType& value,
                        BasicJsonType& result)
    {
        switch (value.type())
        {
            case detail::value_t::array:
            {
                if (value.m_data.m_value.array->empty())
                {
                    // flatten empty array as null
                    result[reference_string] = nullptr;
                }
                else
                {
                    // iterate array and use index as reference string
                    for (std::size_t i = 0; i < value.m_data.m_value.array->size(); ++i)
                    {
                        flatten(detail::concat(reference_string, '/', std::to_string(i)),
                                value.m_data.m_value.array->operator[](i), result);
                    }
                }
                break;
            }

            case detail::value_t::object:
            {
                if (value.m_data.m_value.object->empty())
                {
                    // flatten empty object as null
                    result[reference_string] = nullptr;
                }
                else
                {
                    // iterate object and use keys as reference string
                    for (const auto& element : *value.m_data.m_value.object)
                    {
                        flatten(detail::concat(reference_string, '/', detail::escape(element.first)), element.second, result);
                    }
                }
                break;
            }

            case detail::value_t::null:
            case detail::value_t::string:
            case detail::value_t::boolean:
            case detail::value_t::number_integer:
            case detail::value_t::number_unsigned:
            case detail::value_t::number_float:
            case detail::value_t::binary:
            case detail::value_t::discarded:
            default:
            {
                // add primitive value with its reference string
                result[reference_string] = value;
                break;
            }
        }
    }

    /*!
    @param[in] value  flattened JSON

    @return unflattened JSON

    @throw parse_error.109 if array index is not a number
    @throw type_error.314  if value is not an object
    @throw type_error.315  if object values are not primitive
    @throw type_error.313  if value cannot be unflattened
    */
    template<typename BasicJsonType>
    static BasicJsonType
    unflatten(const BasicJsonType& value)
    {
        if (JSON_HEDLEY_UNLIKELY(!value.is_object()))
        {
            JSON_THROW(detail::type_error::create(314, "only objects can be unflattened", &value));
        }

        BasicJsonType result;

        // iterate the JSON object values
        for (const auto& element : *value.m_data.m_value.object)
        {
            if (JSON_HEDLEY_UNLIKELY(!element.second.is_primitive()))
            {
                JSON_THROW(detail::type_error::create(315, "values in object must be primitive", &element.second));
            }

            // assign value to reference pointed to by JSON pointer; Note that if
            // the JSON pointer is "" (i.e., points to the whole value), function
            // get_and_create returns a reference to result itself. An assignment
            // will then create a primitive value.
            json_pointer(element.first).get_and_create(result) = element.second;
        }

        return result;
    }

    // can't use conversion operator because of ambiguity
    json_pointer<string_t> convert() const&
    {
        json_pointer<string_t> result;
        result.reference_tokens = reference_tokens;
        return result;
    }

    json_pointer<string_t> convert()&&
    {
        json_pointer<string_t> result;
        result.reference_tokens = std::move(reference_tokens);
        return result;
    }

  public:
#if JSON_HAS_THREE_WAY_COMPARISON
    /// @brief compares two JSON pointers for equality
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_eq/
    template<typename RefStringTypeRhs>
    bool operator==(const json_pointer<RefStringTypeRhs>& rhs) const noexcept
    {
        return reference_tokens == rhs.reference_tokens;
    }

    /// @brief compares JSON pointer and string for equality
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_eq/
    JSON_HEDLEY_DEPRECATED_FOR(3.11.2, operator==(json_pointer))
    bool operator==(const string_t& rhs) const
    {
        return *this == json_pointer(rhs);
    }

    /// @brief 3-way compares two JSON pointers
    template<typename RefStringTypeRhs>
    std::strong_ordering operator<=>(const json_pointer<RefStringTypeRhs>& rhs) const noexcept // *NOPAD*
    {
        return  reference_tokens <=> rhs.reference_tokens; // *NOPAD*
    }
#else
    /// @brief compares two JSON pointers for equality
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_eq/
    template<typename RefStringTypeLhs, typename RefStringTypeRhs>
    // NOLINTNEXTLINE(readability-redundant-declaration)
    friend bool operator==(const json_pointer<RefStringTypeLhs>& lhs,
                           const json_pointer<RefStringTypeRhs>& rhs) noexcept;

    /// @brief compares JSON pointer and string for equality
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_eq/
    template<typename RefStringTypeLhs, typename StringType>
    // NOLINTNEXTLINE(readability-redundant-declaration)
    friend bool operator==(const json_pointer<RefStringTypeLhs>& lhs,
                           const StringType& rhs);

    /// @brief compares string and JSON pointer for equality
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_eq/
    template<typename RefStringTypeRhs, typename StringType>
    // NOLINTNEXTLINE(readability-redundant-declaration)
    friend bool operator==(const StringType& lhs,
                           const json_pointer<RefStringTypeRhs>& rhs);

    /// @brief compares two JSON pointers for inequality
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_ne/
    template<typename RefStringTypeLhs, typename RefStringTypeRhs>
    // NOLINTNEXTLINE(readability-redundant-declaration)
    friend bool operator!=(const json_pointer<RefStringTypeLhs>& lhs,
                           const json_pointer<RefStringTypeRhs>& rhs) noexcept;

    /// @brief compares JSON pointer and string for inequality
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_ne/
    template<typename RefStringTypeLhs, typename StringType>
    // NOLINTNEXTLINE(readability-redundant-declaration)
    friend bool operator!=(const json_pointer<RefStringTypeLhs>& lhs,
                           const StringType& rhs);

    /// @brief compares string and JSON pointer for inequality
    /// @sa https://json.nlohmann.me/api/json_pointer/operator_ne/
    template<typename RefStringTypeRhs, typename StringType>
    // NOLINTNEXTLINE(readability-redundant-declaration)
    friend bool operator!=(const StringType& lhs,
                           const json_pointer<RefStringTypeRhs>& rhs);

    /// @brief compares two JSON pointer for less-than
    template<typename RefStringTypeLhs, typename RefStringTypeRhs>
    // NOLINTNEXTLINE(readability-redundant-declaration)
    friend bool operator<(const json_pointer<RefStringTypeLhs>& lhs,
                          const json_pointer<RefStringTypeRhs>& rhs) noexcept;
#endif

  private:
    /// the reference tokens
    std::vector<string_t> reference_tokens;
};

#if !JSON_HAS_THREE_WAY_COMPARISON
// functions cannot be defined inside class due to ODR violations
template<typename RefStringTypeLhs, typename RefStringTypeRhs>
inline bool operator==(const json_pointer<RefStringTypeLhs>& lhs,
                       const json_pointer<RefStringTypeRhs>& rhs) noexcept
{
    return lhs.reference_tokens == rhs.reference_tokens;
}

template<typename RefStringTypeLhs,
         typename StringType = typename json_pointer<RefStringTypeLhs>::string_t>
JSON_HEDLEY_DEPRECATED_FOR(3.11.2, operator==(json_pointer, json_pointer))
inline bool operator==(const json_pointer<RefStringTypeLhs>& lhs,
                       const StringType& rhs)
{
    return lhs == json_pointer<RefStringTypeLhs>(rhs);
}

template<typename RefStringTypeRhs,
         typename StringType = typename json_pointer<RefStringTypeRhs>::string_t>
JSON_HEDLEY_DEPRECATED_FOR(3.11.2, operator==(json_pointer, json_pointer))
inline bool operator==(const StringType& lhs,
                       const json_pointer<RefStringTypeRhs>& rhs)
{
    return json_pointer<RefStringTypeRhs>(lhs) == rhs;
}

template<typename RefStringTypeLhs, typename RefStringTypeRhs>
inline bool operator!=(const json_pointer<RefStringTypeLhs>& lhs,
                       const json_pointer<RefStringTypeRhs>& rhs) noexcept
{
    return !(lhs == rhs);
}

template<typename RefStringTypeLhs,
         typename StringType = typename json_pointer<RefStringTypeLhs>::string_t>
JSON_HEDLEY_DEPRECATED_FOR(3.11.2, operator!=(json_pointer, json_pointer))
inline bool operator!=(const json_pointer<RefStringTypeLhs>& lhs,
                       const StringType& rhs)
{
    return !(lhs == rhs);
}

template<typename RefStringTypeRhs,
         typename StringType = typename json_pointer<RefStringTypeRhs>::string_t>
JSON_HEDLEY_DEPRECATED_FOR(3.11.2, operator!=(json_pointer, json_pointer))
inline bool operator!=(const StringType& lhs,
                       const json_pointer<RefStringTypeRhs>& rhs)
{
    return !(lhs == rhs);
}

template<typename RefStringTypeLhs, typename RefStringTypeRhs>
inline bool operator<(const json_pointer<RefStringTypeLhs>& lhs,
                      const json_pointer<RefStringTypeRhs>& rhs) noexcept
{
    return lhs.reference_tokens < rhs.reference_tokens;
}
#endif

NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/json_ref.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <initializer_list>
#include <utility>

// #include <nlohmann/detail/abi_macros.hpp>

// #include <nlohmann/detail/meta/type_traits.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

template<typename BasicJsonType>
class json_ref
{
  public:
    using value_type = BasicJsonType;

    json_ref(value_type&& value)
        : owned_value(std::move(value))
    {}

    json_ref(const value_type& value)
        : value_ref(&value)
    {}

    json_ref(std::initializer_list<json_ref> init)
        : owned_value(init)
    {}

    template <
        class... Args,
        enable_if_t<std::is_constructible<value_type, Args...>::value, int> = 0 >
    json_ref(Args && ... args)
        : owned_value(std::forward<Args>(args)...)
    {}

    // class should be movable only
    json_ref(json_ref&&) noexcept = default;
    json_ref(const json_ref&) = delete;
    json_ref& operator=(const json_ref&) = delete;
    json_ref& operator=(json_ref&&) = delete;
    ~json_ref() = default;

    value_type moved_or_copied() const
    {
        if (value_ref == nullptr)
        {
            return std::move(owned_value);
        }
        return *value_ref;
    }

    value_type const& operator*() const
    {
        return value_ref ? *value_ref : owned_value;
    }

    value_type const* operator->() const
    {
        return &** this;
    }

  private:
    mutable value_type owned_value = nullptr;
    value_type const* value_ref = nullptr;
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/string_concat.hpp>

// #include <nlohmann/detail/string_escape.hpp>

// #include <nlohmann/detail/meta/cpp_future.hpp>

// #include <nlohmann/detail/meta/type_traits.hpp>

// #include <nlohmann/detail/output/binary_writer.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <algorithm> // reverse
#include <array> // array
#include <map> // map
#include <cmath> // isnan, isinf
#include <cstdint> // uint8_t, uint16_t, uint32_t, uint64_t
#include <cstring> // memcpy
#include <limits> // numeric_limits
#include <string> // string
#include <utility> // move
#include <vector> // vector

// #include <nlohmann/detail/input/binary_reader.hpp>

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/output/output_adapters.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <algorithm> // copy
#include <cstddef> // size_t
#include <iterator> // back_inserter
#include <memory> // shared_ptr, make_shared
#include <string> // basic_string
#include <vector> // vector

#ifndef JSON_NO_IO
    #include <ios>      // streamsize
    #include <ostream>  // basic_ostream
#endif  // JSON_NO_IO

// #include <nlohmann/detail/macro_scope.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

/// abstract output adapter interface
template<typename CharType> struct output_adapter_protocol
{
    virtual void write_character(CharType c) = 0;
    virtual void write_characters(const CharType* s, std::size_t length) = 0;
    virtual ~output_adapter_protocol() = default;

    output_adapter_protocol() = default;
    output_adapter_protocol(const output_adapter_protocol&) = default;
    output_adapter_protocol(output_adapter_protocol&&) noexcept = default;
    output_adapter_protocol& operator=(const output_adapter_protocol&) = default;
    output_adapter_protocol& operator=(output_adapter_protocol&&) noexcept = default;
};

/// a type to simplify interfaces
template<typename CharType>
using output_adapter_t = std::shared_ptr<output_adapter_protocol<CharType>>;

/// output adapter for byte vectors
template<typename CharType, typename AllocatorType = std::allocator<CharType>>
class output_vector_adapter : public output_adapter_protocol<CharType>
{
  public:
    explicit output_vector_adapter(std::vector<CharType, AllocatorType>& vec) noexcept
        : v(vec)
    {}

    void write_character(CharType c) override
    {
        v.push_back(c);
    }

    JSON_HEDLEY_NON_NULL(2)
    void write_characters(const CharType* s, std::size_t length) override
    {
        v.insert(v.end(), s, s + length);
    }

  private:
    std::vector<CharType, AllocatorType>& v;
};

#ifndef JSON_NO_IO
/// output adapter for output streams
template<typename CharType>
class output_stream_adapter : public output_adapter_protocol<CharType>
{
  public:
    explicit output_stream_adapter(std::basic_ostream<CharType>& s) noexcept
        : stream(s)
    {}

    void write_character(CharType c) override
    {
        stream.put(c);
    }

    JSON_HEDLEY_NON_NULL(2)
    void write_characters(const CharType* s, std::size_t length) override
    {
        stream.write(s, static_cast<std::streamsize>(length));
    }

  private:
    std::basic_ostream<CharType>& stream;
};
#endif  // JSON_NO_IO

/// output adapter for basic_string
template<typename CharType, typename StringType = std::basic_string<CharType>>
class output_string_adapter : public output_adapter_protocol<CharType>
{
  public:
    explicit output_string_adapter(StringType& s) noexcept
        : str(s)
    {}

    void write_character(CharType c) override
    {
        str.push_back(c);
    }

    JSON_HEDLEY_NON_NULL(2)
    void write_characters(const CharType* s, std::size_t length) override
    {
        str.append(s, length);
    }

  private:
    StringType& str;
};

template<typename CharType, typename StringType = std::basic_string<CharType>>
class output_adapter
{
  public:
    template<typename AllocatorType = std::allocator<CharType>>
    output_adapter(std::vector<CharType, AllocatorType>& vec)
        : oa(std::make_shared<output_vector_adapter<CharType, AllocatorType>>(vec)) {}

#ifndef JSON_NO_IO
    output_adapter(std::basic_ostream<CharType>& s)
        : oa(std::make_shared<output_stream_adapter<CharType>>(s)) {}
#endif  // JSON_NO_IO

    output_adapter(StringType& s)
        : oa(std::make_shared<output_string_adapter<CharType, StringType>>(s)) {}

    operator output_adapter_t<CharType>()
    {
        return oa;
    }

  private:
    output_adapter_t<CharType> oa = nullptr;
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/string_concat.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

///////////////////
// binary writer //
///////////////////

/*!
@brief serialization to CBOR and MessagePack values
*/
template<typename BasicJsonType, typename CharType>
class binary_writer
{
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;
    using number_float_t = typename BasicJsonType::number_float_t;

  public:
    /*!
    @brief create a binary writer

    @param[in] adapter  output adapter to write to
    */
    explicit binary_writer(output_adapter_t<CharType> adapter) : oa(std::move(adapter))
    {
        JSON_ASSERT(oa);
    }

    /*!
    @param[in] j  JSON value to serialize
    @pre       j.type() == value_t::object
    */
    void write_bson(const BasicJsonType& j)
    {
        switch (j.type())
        {
            case value_t::object:
            {
                write_bson_object(*j.m_data.m_value.object);
                break;
            }

            case value_t::null:
            case value_t::array:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                JSON_THROW(type_error::create(317, concat("to serialize to BSON, top-level type must be object, but is ", j.type_name()), &j));
            }
        }
    }

    /*!
    @param[in] j  JSON value to serialize
    */
    void write_cbor(const BasicJsonType& j)
    {
        switch (j.type())
        {
            case value_t::null:
            {
                oa->write_character(to_char_type(0xF6));
                break;
            }

            case value_t::boolean:
            {
                oa->write_character(j.m_data.m_value.boolean
                                    ? to_char_type(0xF5)
                                    : to_char_type(0xF4));
                break;
            }

            case value_t::number_integer:
            {
                if (j.m_data.m_value.number_integer >= 0)
                {
                    // CBOR does not differentiate between positive signed
                    // integers and unsigned integers. Therefore, we used the
                    // code from the value_t::number_unsigned case here.
                    if (j.m_data.m_value.number_integer <= 0x17)
                    {
                        write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_integer));
                    }
                    else if (j.m_data.m_value.number_integer <= (std::numeric_limits<std::uint8_t>::max)())
                    {
                        oa->write_character(to_char_type(0x18));
                        write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_integer));
                    }
                    else if (j.m_data.m_value.number_integer <= (std::numeric_limits<std::uint16_t>::max)())
                    {
                        oa->write_character(to_char_type(0x19));
                        write_number(static_cast<std::uint16_t>(j.m_data.m_value.number_integer));
                    }
                    else if (j.m_data.m_value.number_integer <= (std::numeric_limits<std::uint32_t>::max)())
                    {
                        oa->write_character(to_char_type(0x1A));
                        write_number(static_cast<std::uint32_t>(j.m_data.m_value.number_integer));
                    }
                    else
                    {
                        oa->write_character(to_char_type(0x1B));
                        write_number(static_cast<std::uint64_t>(j.m_data.m_value.number_integer));
                    }
                }
                else
                {
                    // The conversions below encode the sign in the first
                    // byte, and the value is converted to a positive number.
                    const auto positive_number = -1 - j.m_data.m_value.number_integer;
                    if (j.m_data.m_value.number_integer >= -24)
                    {
                        write_number(static_cast<std::uint8_t>(0x20 + positive_number));
                    }
                    else if (positive_number <= (std::numeric_limits<std::uint8_t>::max)())
                    {
                        oa->write_character(to_char_type(0x38));
                        write_number(static_cast<std::uint8_t>(positive_number));
                    }
                    else if (positive_number <= (std::numeric_limits<std::uint16_t>::max)())
                    {
                        oa->write_character(to_char_type(0x39));
                        write_number(static_cast<std::uint16_t>(positive_number));
                    }
                    else if (positive_number <= (std::numeric_limits<std::uint32_t>::max)())
                    {
                        oa->write_character(to_char_type(0x3A));
                        write_number(static_cast<std::uint32_t>(positive_number));
                    }
                    else
                    {
                        oa->write_character(to_char_type(0x3B));
                        write_number(static_cast<std::uint64_t>(positive_number));
                    }
                }
                break;
            }

            case value_t::number_unsigned:
            {
                if (j.m_data.m_value.number_unsigned <= 0x17)
                {
                    write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_unsigned));
                }
                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    oa->write_character(to_char_type(0x18));
                    write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_unsigned));
                }
                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    oa->write_character(to_char_type(0x19));
                    write_number(static_cast<std::uint16_t>(j.m_data.m_value.number_unsigned));
                }
                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    oa->write_character(to_char_type(0x1A));
                    write_number(static_cast<std::uint32_t>(j.m_data.m_value.number_unsigned));
                }
                else
                {
                    oa->write_character(to_char_type(0x1B));
                    write_number(static_cast<std::uint64_t>(j.m_data.m_value.number_unsigned));
                }
                break;
            }

            case value_t::number_float:
            {
                if (std::isnan(j.m_data.m_value.number_float))
                {
                    // NaN is 0xf97e00 in CBOR
                    oa->write_character(to_char_type(0xF9));
                    oa->write_character(to_char_type(0x7E));
                    oa->write_character(to_char_type(0x00));
                }
                else if (std::isinf(j.m_data.m_value.number_float))
                {
                    // Infinity is 0xf97c00, -Infinity is 0xf9fc00
                    oa->write_character(to_char_type(0xf9));
                    oa->write_character(j.m_data.m_value.number_float > 0 ? to_char_type(0x7C) : to_char_type(0xFC));
                    oa->write_character(to_char_type(0x00));
                }
                else
                {
                    write_compact_float(j.m_data.m_value.number_float, detail::input_format_t::cbor);
                }
                break;
            }

            case value_t::string:
            {
                // step 1: write control byte and the string length
                const auto N = j.m_data.m_value.string->size();
                if (N <= 0x17)
                {
                    write_number(static_cast<std::uint8_t>(0x60 + N));
                }
                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    oa->write_character(to_char_type(0x78));
                    write_number(static_cast<std::uint8_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    oa->write_character(to_char_type(0x79));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    oa->write_character(to_char_type(0x7A));
                    write_number(static_cast<std::uint32_t>(N));
                }
                // LCOV_EXCL_START
                else if (N <= (std::numeric_limits<std::uint64_t>::max)())
                {
                    oa->write_character(to_char_type(0x7B));
                    write_number(static_cast<std::uint64_t>(N));
                }
                // LCOV_EXCL_STOP

                // step 2: write the string
                oa->write_characters(
                    reinterpret_cast<const CharType*>(j.m_data.m_value.string->c_str()),
                    j.m_data.m_value.string->size());
                break;
            }

            case value_t::array:
            {
                // step 1: write control byte and the array size
                const auto N = j.m_data.m_value.array->size();
                if (N <= 0x17)
                {
                    write_number(static_cast<std::uint8_t>(0x80 + N));
                }
                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    oa->write_character(to_char_type(0x98));
                    write_number(static_cast<std::uint8_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    oa->write_character(to_char_type(0x99));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    oa->write_character(to_char_type(0x9A));
                    write_number(static_cast<std::uint32_t>(N));
                }
                // LCOV_EXCL_START
                else if (N <= (std::numeric_limits<std::uint64_t>::max)())
                {
                    oa->write_character(to_char_type(0x9B));
                    write_number(static_cast<std::uint64_t>(N));
                }
                // LCOV_EXCL_STOP

                // step 2: write each element
                for (const auto& el : *j.m_data.m_value.array)
                {
                    write_cbor(el);
                }
                break;
            }

            case value_t::binary:
            {
                if (j.m_data.m_value.binary->has_subtype())
                {
                    if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits<std::uint8_t>::max)())
                    {
                        write_number(static_cast<std::uint8_t>(0xd8));
                        write_number(static_cast<std::uint8_t>(j.m_data.m_value.binary->subtype()));
                    }
                    else if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits<std::uint16_t>::max)())
                    {
                        write_number(static_cast<std::uint8_t>(0xd9));
                        write_number(static_cast<std::uint16_t>(j.m_data.m_value.binary->subtype()));
                    }
                    else if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits<std::uint32_t>::max)())
                    {
                        write_number(static_cast<std::uint8_t>(0xda));
                        write_number(static_cast<std::uint32_t>(j.m_data.m_value.binary->subtype()));
                    }
                    else if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits<std::uint64_t>::max)())
                    {
                        write_number(static_cast<std::uint8_t>(0xdb));
                        write_number(static_cast<std::uint64_t>(j.m_data.m_value.binary->subtype()));
                    }
                }

                // step 1: write control byte and the binary array size
                const auto N = j.m_data.m_value.binary->size();
                if (N <= 0x17)
                {
                    write_number(static_cast<std::uint8_t>(0x40 + N));
                }
                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    oa->write_character(to_char_type(0x58));
                    write_number(static_cast<std::uint8_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    oa->write_character(to_char_type(0x59));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    oa->write_character(to_char_type(0x5A));
                    write_number(static_cast<std::uint32_t>(N));
                }
                // LCOV_EXCL_START
                else if (N <= (std::numeric_limits<std::uint64_t>::max)())
                {
                    oa->write_character(to_char_type(0x5B));
                    write_number(static_cast<std::uint64_t>(N));
                }
                // LCOV_EXCL_STOP

                // step 2: write each element
                oa->write_characters(
                    reinterpret_cast<const CharType*>(j.m_data.m_value.binary->data()),
                    N);

                break;
            }

            case value_t::object:
            {
                // step 1: write control byte and the object size
                const auto N = j.m_data.m_value.object->size();
                if (N <= 0x17)
                {
                    write_number(static_cast<std::uint8_t>(0xA0 + N));
                }
                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    oa->write_character(to_char_type(0xB8));
                    write_number(static_cast<std::uint8_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    oa->write_character(to_char_type(0xB9));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    oa->write_character(to_char_type(0xBA));
                    write_number(static_cast<std::uint32_t>(N));
                }
                // LCOV_EXCL_START
                else if (N <= (std::numeric_limits<std::uint64_t>::max)())
                {
                    oa->write_character(to_char_type(0xBB));
                    write_number(static_cast<std::uint64_t>(N));
                }
                // LCOV_EXCL_STOP

                // step 2: write each element
                for (const auto& el : *j.m_data.m_value.object)
                {
                    write_cbor(el.first);
                    write_cbor(el.second);
                }
                break;
            }

            case value_t::discarded:
            default:
                break;
        }
    }

    /*!
    @param[in] j  JSON value to serialize
    */
    void write_msgpack(const BasicJsonType& j)
    {
        switch (j.type())
        {
            case value_t::null: // nil
            {
                oa->write_character(to_char_type(0xC0));
                break;
            }

            case value_t::boolean: // true and false
            {
                oa->write_character(j.m_data.m_value.boolean
                                    ? to_char_type(0xC3)
                                    : to_char_type(0xC2));
                break;
            }

            case value_t::number_integer:
            {
                if (j.m_data.m_value.number_integer >= 0)
                {
                    // MessagePack does not differentiate between positive
                    // signed integers and unsigned integers. Therefore, we used
                    // the code from the value_t::number_unsigned case here.
                    if (j.m_data.m_value.number_unsigned < 128)
                    {
                        // positive fixnum
                        write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_integer));
                    }
                    else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
                    {
                        // uint 8
                        oa->write_character(to_char_type(0xCC));
                        write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_integer));
                    }
                    else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
                    {
                        // uint 16
                        oa->write_character(to_char_type(0xCD));
                        write_number(static_cast<std::uint16_t>(j.m_data.m_value.number_integer));
                    }
                    else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
                    {
                        // uint 32
                        oa->write_character(to_char_type(0xCE));
                        write_number(static_cast<std::uint32_t>(j.m_data.m_value.number_integer));
                    }
                    else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
                    {
                        // uint 64
                        oa->write_character(to_char_type(0xCF));
                        write_number(static_cast<std::uint64_t>(j.m_data.m_value.number_integer));
                    }
                }
                else
                {
                    if (j.m_data.m_value.number_integer >= -32)
                    {
                        // negative fixnum
                        write_number(static_cast<std::int8_t>(j.m_data.m_value.number_integer));
                    }
                    else if (j.m_data.m_value.number_integer >= (std::numeric_limits<std::int8_t>::min)() &&
                             j.m_data.m_value.number_integer <= (std::numeric_limits<std::int8_t>::max)())
                    {
                        // int 8
                        oa->write_character(to_char_type(0xD0));
                        write_number(static_cast<std::int8_t>(j.m_data.m_value.number_integer));
                    }
                    else if (j.m_data.m_value.number_integer >= (std::numeric_limits<std::int16_t>::min)() &&
                             j.m_data.m_value.number_integer <= (std::numeric_limits<std::int16_t>::max)())
                    {
                        // int 16
                        oa->write_character(to_char_type(0xD1));
                        write_number(static_cast<std::int16_t>(j.m_data.m_value.number_integer));
                    }
                    else if (j.m_data.m_value.number_integer >= (std::numeric_limits<std::int32_t>::min)() &&
                             j.m_data.m_value.number_integer <= (std::numeric_limits<std::int32_t>::max)())
                    {
                        // int 32
                        oa->write_character(to_char_type(0xD2));
                        write_number(static_cast<std::int32_t>(j.m_data.m_value.number_integer));
                    }
                    else if (j.m_data.m_value.number_integer >= (std::numeric_limits<std::int64_t>::min)() &&
                             j.m_data.m_value.number_integer <= (std::numeric_limits<std::int64_t>::max)())
                    {
                        // int 64
                        oa->write_character(to_char_type(0xD3));
                        write_number(static_cast<std::int64_t>(j.m_data.m_value.number_integer));
                    }
                }
                break;
            }

            case value_t::number_unsigned:
            {
                if (j.m_data.m_value.number_unsigned < 128)
                {
                    // positive fixnum
                    write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_integer));
                }
                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    // uint 8
                    oa->write_character(to_char_type(0xCC));
                    write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_integer));
                }
                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    // uint 16
                    oa->write_character(to_char_type(0xCD));
                    write_number(static_cast<std::uint16_t>(j.m_data.m_value.number_integer));
                }
                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    // uint 32
                    oa->write_character(to_char_type(0xCE));
                    write_number(static_cast<std::uint32_t>(j.m_data.m_value.number_integer));
                }
                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
                {
                    // uint 64
                    oa->write_character(to_char_type(0xCF));
                    write_number(static_cast<std::uint64_t>(j.m_data.m_value.number_integer));
                }
                break;
            }

            case value_t::number_float:
            {
                write_compact_float(j.m_data.m_value.number_float, detail::input_format_t::msgpack);
                break;
            }

            case value_t::string:
            {
                // step 1: write control byte and the string length
                const auto N = j.m_data.m_value.string->size();
                if (N <= 31)
                {
                    // fixstr
                    write_number(static_cast<std::uint8_t>(0xA0 | N));
                }
                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    // str 8
                    oa->write_character(to_char_type(0xD9));
                    write_number(static_cast<std::uint8_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    // str 16
                    oa->write_character(to_char_type(0xDA));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    // str 32
                    oa->write_character(to_char_type(0xDB));
                    write_number(static_cast<std::uint32_t>(N));
                }

                // step 2: write the string
                oa->write_characters(
                    reinterpret_cast<const CharType*>(j.m_data.m_value.string->c_str()),
                    j.m_data.m_value.string->size());
                break;
            }

            case value_t::array:
            {
                // step 1: write control byte and the array size
                const auto N = j.m_data.m_value.array->size();
                if (N <= 15)
                {
                    // fixarray
                    write_number(static_cast<std::uint8_t>(0x90 | N));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    // array 16
                    oa->write_character(to_char_type(0xDC));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    // array 32
                    oa->write_character(to_char_type(0xDD));
                    write_number(static_cast<std::uint32_t>(N));
                }

                // step 2: write each element
                for (const auto& el : *j.m_data.m_value.array)
                {
                    write_msgpack(el);
                }
                break;
            }

            case value_t::binary:
            {
                // step 0: determine if the binary type has a set subtype to
                // determine whether or not to use the ext or fixext types
                const bool use_ext = j.m_data.m_value.binary->has_subtype();

                // step 1: write control byte and the byte string length
                const auto N = j.m_data.m_value.binary->size();
                if (N <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    std::uint8_t output_type{};
                    bool fixed = true;
                    if (use_ext)
                    {
                        switch (N)
                        {
                            case 1:
                                output_type = 0xD4; // fixext 1
                                break;
                            case 2:
                                output_type = 0xD5; // fixext 2
                                break;
                            case 4:
                                output_type = 0xD6; // fixext 4
                                break;
                            case 8:
                                output_type = 0xD7; // fixext 8
                                break;
                            case 16:
                                output_type = 0xD8; // fixext 16
                                break;
                            default:
                                output_type = 0xC7; // ext 8
                                fixed = false;
                                break;
                        }

                    }
                    else
                    {
                        output_type = 0xC4; // bin 8
                        fixed = false;
                    }

                    oa->write_character(to_char_type(output_type));
                    if (!fixed)
                    {
                        write_number(static_cast<std::uint8_t>(N));
                    }
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    const std::uint8_t output_type = use_ext
                                                     ? 0xC8 // ext 16
                                                     : 0xC5; // bin 16

                    oa->write_character(to_char_type(output_type));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    const std::uint8_t output_type = use_ext
                                                     ? 0xC9 // ext 32
                                                     : 0xC6; // bin 32

                    oa->write_character(to_char_type(output_type));
                    write_number(static_cast<std::uint32_t>(N));
                }

                // step 1.5: if this is an ext type, write the subtype
                if (use_ext)
                {
                    write_number(static_cast<std::int8_t>(j.m_data.m_value.binary->subtype()));
                }

                // step 2: write the byte string
                oa->write_characters(
                    reinterpret_cast<const CharType*>(j.m_data.m_value.binary->data()),
                    N);

                break;
            }

            case value_t::object:
            {
                // step 1: write control byte and the object size
                const auto N = j.m_data.m_value.object->size();
                if (N <= 15)
                {
                    // fixmap
                    write_number(static_cast<std::uint8_t>(0x80 | (N & 0xF)));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    // map 16
                    oa->write_character(to_char_type(0xDE));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    // map 32
                    oa->write_character(to_char_type(0xDF));
                    write_number(static_cast<std::uint32_t>(N));
                }

                // step 2: write each element
                for (const auto& el : *j.m_data.m_value.object)
                {
                    write_msgpack(el.first);
                    write_msgpack(el.second);
                }
                break;
            }

            case value_t::discarded:
            default:
                break;
        }
    }

    /*!
    @param[in] j  JSON value to serialize
    @param[in] use_count   whether to use '#' prefixes (optimized format)
    @param[in] use_type    whether to use '$' prefixes (optimized format)
    @param[in] add_prefix  whether prefixes need to be used for this value
    @param[in] use_bjdata  whether write in BJData format, default is false
    */
    void write_ubjson(const BasicJsonType& j, const bool use_count,
                      const bool use_type, const bool add_prefix = true,
                      const bool use_bjdata = false)
    {
        switch (j.type())
        {
            case value_t::null:
            {
                if (add_prefix)
                {
                    oa->write_character(to_char_type('Z'));
                }
                break;
            }

            case value_t::boolean:
            {
                if (add_prefix)
                {
                    oa->write_character(j.m_data.m_value.boolean
                                        ? to_char_type('T')
                                        : to_char_type('F'));
                }
                break;
            }

            case value_t::number_integer:
            {
                write_number_with_ubjson_prefix(j.m_data.m_value.number_integer, add_prefix, use_bjdata);
                break;
            }

            case value_t::number_unsigned:
            {
                write_number_with_ubjson_prefix(j.m_data.m_value.number_unsigned, add_prefix, use_bjdata);
                break;
            }

            case value_t::number_float:
            {
                write_number_with_ubjson_prefix(j.m_data.m_value.number_float, add_prefix, use_bjdata);
                break;
            }

            case value_t::string:
            {
                if (add_prefix)
                {
                    oa->write_character(to_char_type('S'));
                }
                write_number_with_ubjson_prefix(j.m_data.m_value.string->size(), true, use_bjdata);
                oa->write_characters(
                    reinterpret_cast<const CharType*>(j.m_data.m_value.string->c_str()),
                    j.m_data.m_value.string->size());
                break;
            }

            case value_t::array:
            {
                if (add_prefix)
                {
                    oa->write_character(to_char_type('['));
                }

                bool prefix_required = true;
                if (use_type && !j.m_data.m_value.array->empty())
                {
                    JSON_ASSERT(use_count);
                    const CharType first_prefix = ubjson_prefix(j.front(), use_bjdata);
                    const bool same_prefix = std::all_of(j.begin() + 1, j.end(),
                                                         [this, first_prefix, use_bjdata](const BasicJsonType & v)
                    {
                        return ubjson_prefix(v, use_bjdata) == first_prefix;
                    });

                    std::vector<CharType> bjdx = {'[', '{', 'S', 'H', 'T', 'F', 'N', 'Z'}; // excluded markers in bjdata optimized type

                    if (same_prefix && !(use_bjdata && std::find(bjdx.begin(), bjdx.end(), first_prefix) != bjdx.end()))
                    {
                        prefix_required = false;
                        oa->write_character(to_char_type('$'));
                        oa->write_character(first_prefix);
                    }
                }

                if (use_count)
                {
                    oa->write_character(to_char_type('#'));
                    write_number_with_ubjson_prefix(j.m_data.m_value.array->size(), true, use_bjdata);
                }

                for (const auto& el : *j.m_data.m_value.array)
                {
                    write_ubjson(el, use_count, use_type, prefix_required, use_bjdata);
                }

                if (!use_count)
                {
                    oa->write_character(to_char_type(']'));
                }

                break;
            }

            case value_t::binary:
            {
                if (add_prefix)
                {
                    oa->write_character(to_char_type('['));
                }

                if (use_type && !j.m_data.m_value.binary->empty())
                {
                    JSON_ASSERT(use_count);
                    oa->write_character(to_char_type('$'));
                    oa->write_character('U');
                }

                if (use_count)
                {
                    oa->write_character(to_char_type('#'));
                    write_number_with_ubjson_prefix(j.m_data.m_value.binary->size(), true, use_bjdata);
                }

                if (use_type)
                {
                    oa->write_characters(
                        reinterpret_cast<const CharType*>(j.m_data.m_value.binary->data()),
                        j.m_data.m_value.binary->size());
                }
                else
                {
                    for (size_t i = 0; i < j.m_data.m_value.binary->size(); ++i)
                    {
                        oa->write_character(to_char_type('U'));
                        oa->write_character(j.m_data.m_value.binary->data()[i]);
                    }
                }

                if (!use_count)
                {
                    oa->write_character(to_char_type(']'));
                }

                break;
            }

            case value_t::object:
            {
                if (use_bjdata && j.m_data.m_value.object->size() == 3 && j.m_data.m_value.object->find("_ArrayType_") != j.m_data.m_value.object->end() && j.m_data.m_value.object->find("_ArraySize_") != j.m_data.m_value.object->end() && j.m_data.m_value.object->find("_ArrayData_") != j.m_data.m_value.object->end())
                {
                    if (!write_bjdata_ndarray(*j.m_data.m_value.object, use_count, use_type))  // decode bjdata ndarray in the JData format (https://github.com/NeuroJSON/jdata)
                    {
                        break;
                    }
                }

                if (add_prefix)
                {
                    oa->write_character(to_char_type('{'));
                }

                bool prefix_required = true;
                if (use_type && !j.m_data.m_value.object->empty())
                {
                    JSON_ASSERT(use_count);
                    const CharType first_prefix = ubjson_prefix(j.front(), use_bjdata);
                    const bool same_prefix = std::all_of(j.begin(), j.end(),
                                                         [this, first_prefix, use_bjdata](const BasicJsonType & v)
                    {
                        return ubjson_prefix(v, use_bjdata) == first_prefix;
                    });

                    std::vector<CharType> bjdx = {'[', '{', 'S', 'H', 'T', 'F', 'N', 'Z'}; // excluded markers in bjdata optimized type

                    if (same_prefix && !(use_bjdata && std::find(bjdx.begin(), bjdx.end(), first_prefix) != bjdx.end()))
                    {
                        prefix_required = false;
                        oa->write_character(to_char_type('$'));
                        oa->write_character(first_prefix);
                    }
                }

                if (use_count)
                {
                    oa->write_character(to_char_type('#'));
                    write_number_with_ubjson_prefix(j.m_data.m_value.object->size(), true, use_bjdata);
                }

                for (const auto& el : *j.m_data.m_value.object)
                {
                    write_number_with_ubjson_prefix(el.first.size(), true, use_bjdata);
                    oa->write_characters(
                        reinterpret_cast<const CharType*>(el.first.c_str()),
                        el.first.size());
                    write_ubjson(el.second, use_count, use_type, prefix_required, use_bjdata);
                }

                if (!use_count)
                {
                    oa->write_character(to_char_type('}'));
                }

                break;
            }

            case value_t::discarded:
            default:
                break;
        }
    }

  private:
    //////////
    // BSON //
    //////////

    /*!
    @return The size of a BSON document entry header, including the id marker
            and the entry name size (and its null-terminator).
    */
    static std::size_t calc_bson_entry_header_size(const string_t& name, const BasicJsonType& j)
    {
        const auto it = name.find(static_cast<typename string_t::value_type>(0));
        if (JSON_HEDLEY_UNLIKELY(it != BasicJsonType::string_t::npos))
        {
            JSON_THROW(out_of_range::create(409, concat("BSON key cannot contain code point U+0000 (at byte ", std::to_string(it), ")"), &j));
            static_cast<void>(j);
        }

        return /*id*/ 1ul + name.size() + /*zero-terminator*/1u;
    }

    /*!
    @brief Writes the given @a element_type and @a name to the output adapter
    */
    void write_bson_entry_header(const string_t& name,
                                 const std::uint8_t element_type)
    {
        oa->write_character(to_char_type(element_type)); // boolean
        oa->write_characters(
            reinterpret_cast<const CharType*>(name.c_str()),
            name.size() + 1u);
    }

    /*!
    @brief Writes a BSON element with key @a name and boolean value @a value
    */
    void write_bson_boolean(const string_t& name,
                            const bool value)
    {
        write_bson_entry_header(name, 0x08);
        oa->write_character(value ? to_char_type(0x01) : to_char_type(0x00));
    }

    /*!
    @brief Writes a BSON element with key @a name and double value @a value
    */
    void write_bson_double(const string_t& name,
                           const double value)
    {
        write_bson_entry_header(name, 0x01);
        write_number<double>(value, true);
    }

    /*!
    @return The size of the BSON-encoded string in @a value
    */
    static std::size_t calc_bson_string_size(const string_t& value)
    {
        return sizeof(std::int32_t) + value.size() + 1ul;
    }

    /*!
    @brief Writes a BSON element with key @a name and string value @a value
    */
    void write_bson_string(const string_t& name,
                           const string_t& value)
    {
        write_bson_entry_header(name, 0x02);

        write_number<std::int32_t>(static_cast<std::int32_t>(value.size() + 1ul), true);
        oa->write_characters(
            reinterpret_cast<const CharType*>(value.c_str()),
            value.size() + 1);
    }

    /*!
    @brief Writes a BSON element with key @a name and null value
    */
    void write_bson_null(const string_t& name)
    {
        write_bson_entry_header(name, 0x0A);
    }

    /*!
    @return The size of the BSON-encoded integer @a value
    */
    static std::size_t calc_bson_integer_size(const std::int64_t value)
    {
        return (std::numeric_limits<std::int32_t>::min)() <= value && value <= (std::numeric_limits<std::int32_t>::max)()
               ? sizeof(std::int32_t)
               : sizeof(std::int64_t);
    }

    /*!
    @brief Writes a BSON element with key @a name and integer @a value
    */
    void write_bson_integer(const string_t& name,
                            const std::int64_t value)
    {
        if ((std::numeric_limits<std::int32_t>::min)() <= value && value <= (std::numeric_limits<std::int32_t>::max)())
        {
            write_bson_entry_header(name, 0x10); // int32
            write_number<std::int32_t>(static_cast<std::int32_t>(value), true);
        }
        else
        {
            write_bson_entry_header(name, 0x12); // int64
            write_number<std::int64_t>(static_cast<std::int64_t>(value), true);
        }
    }

    /*!
    @return The size of the BSON-encoded unsigned integer in @a j
    */
    static constexpr std::size_t calc_bson_unsigned_size(const std::uint64_t value) noexcept
    {
        return (value <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
               ? sizeof(std::int32_t)
               : sizeof(std::int64_t);
    }

    /*!
    @brief Writes a BSON element with key @a name and unsigned @a value
    */
    void write_bson_unsigned(const string_t& name,
                             const BasicJsonType& j)
    {
        if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
        {
            write_bson_entry_header(name, 0x10 /* int32 */);
            write_number<std::int32_t>(static_cast<std::int32_t>(j.m_data.m_value.number_unsigned), true);
        }
        else if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
        {
            write_bson_entry_header(name, 0x12 /* int64 */);
            write_number<std::int64_t>(static_cast<std::int64_t>(j.m_data.m_value.number_unsigned), true);
        }
        else
        {
            JSON_THROW(out_of_range::create(407, concat("integer number ", std::to_string(j.m_data.m_value.number_unsigned), " cannot be represented by BSON as it does not fit int64"), &j));
        }
    }

    /*!
    @brief Writes a BSON element with key @a name and object @a value
    */
    void write_bson_object_entry(const string_t& name,
                                 const typename BasicJsonType::object_t& value)
    {
        write_bson_entry_header(name, 0x03); // object
        write_bson_object(value);
    }

    /*!
    @return The size of the BSON-encoded array @a value
    */
    static std::size_t calc_bson_array_size(const typename BasicJsonType::array_t& value)
    {
        std::size_t array_index = 0ul;

        const std::size_t embedded_document_size = std::accumulate(std::begin(value), std::end(value), static_cast<std::size_t>(0), [&array_index](std::size_t result, const typename BasicJsonType::array_t::value_type & el)
        {
            return result + calc_bson_element_size(std::to_string(array_index++), el);
        });

        return sizeof(std::int32_t) + embedded_document_size + 1ul;
    }

    /*!
    @return The size of the BSON-encoded binary array @a value
    */
    static std::size_t calc_bson_binary_size(const typename BasicJsonType::binary_t& value)
    {
        return sizeof(std::int32_t) + value.size() + 1ul;
    }

    /*!
    @brief Writes a BSON element with key @a name and array @a value
    */
    void write_bson_array(const string_t& name,
                          const typename BasicJsonType::array_t& value)
    {
        write_bson_entry_header(name, 0x04); // array
        write_number<std::int32_t>(static_cast<std::int32_t>(calc_bson_array_size(value)), true);

        std::size_t array_index = 0ul;

        for (const auto& el : value)
        {
            write_bson_element(std::to_string(array_index++), el);
        }

        oa->write_character(to_char_type(0x00));
    }

    /*!
    @brief Writes a BSON element with key @a name and binary value @a value
    */
    void write_bson_binary(const string_t& name,
                           const binary_t& value)
    {
        write_bson_entry_header(name, 0x05);

        write_number<std::int32_t>(static_cast<std::int32_t>(value.size()), true);
        write_number(value.has_subtype() ? static_cast<std::uint8_t>(value.subtype()) : static_cast<std::uint8_t>(0x00));

        oa->write_characters(reinterpret_cast<const CharType*>(value.data()), value.size());
    }

    /*!
    @brief Calculates the size necessary to serialize the JSON value @a j with its @a name
    @return The calculated size for the BSON document entry for @a j with the given @a name.
    */
    static std::size_t calc_bson_element_size(const string_t& name,
            const BasicJsonType& j)
    {
        const auto header_size = calc_bson_entry_header_size(name, j);
        switch (j.type())
        {
            case value_t::object:
                return header_size + calc_bson_object_size(*j.m_data.m_value.object);

            case value_t::array:
                return header_size + calc_bson_array_size(*j.m_data.m_value.array);

            case value_t::binary:
                return header_size + calc_bson_binary_size(*j.m_data.m_value.binary);

            case value_t::boolean:
                return header_size + 1ul;

            case value_t::number_float:
                return header_size + 8ul;

            case value_t::number_integer:
                return header_size + calc_bson_integer_size(j.m_data.m_value.number_integer);

            case value_t::number_unsigned:
                return header_size + calc_bson_unsigned_size(j.m_data.m_value.number_unsigned);

            case value_t::string:
                return header_size + calc_bson_string_size(*j.m_data.m_value.string);

            case value_t::null:
                return header_size + 0ul;

            // LCOV_EXCL_START
            case value_t::discarded:
            default:
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert)
                return 0ul;
                // LCOV_EXCL_STOP
        }
    }

    /*!
    @brief Serializes the JSON value @a j to BSON and associates it with the
           key @a name.
    @param name The name to associate with the JSON entity @a j within the
                current BSON document
    */
    void write_bson_element(const string_t& name,
                            const BasicJsonType& j)
    {
        switch (j.type())
        {
            case value_t::object:
                return write_bson_object_entry(name, *j.m_data.m_value.object);

            case value_t::array:
                return write_bson_array(name, *j.m_data.m_value.array);

            case value_t::binary:
                return write_bson_binary(name, *j.m_data.m_value.binary);

            case value_t::boolean:
                return write_bson_boolean(name, j.m_data.m_value.boolean);

            case value_t::number_float:
                return write_bson_double(name, j.m_data.m_value.number_float);

            case value_t::number_integer:
                return write_bson_integer(name, j.m_data.m_value.number_integer);

            case value_t::number_unsigned:
                return write_bson_unsigned(name, j);

            case value_t::string:
                return write_bson_string(name, *j.m_data.m_value.string);

            case value_t::null:
                return write_bson_null(name);

            // LCOV_EXCL_START
            case value_t::discarded:
            default:
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert)
                return;
                // LCOV_EXCL_STOP
        }
    }

    /*!
    @brief Calculates the size of the BSON serialization of the given
           JSON-object @a j.
    @param[in] value  JSON value to serialize
    @pre       value.type() == value_t::object
    */
    static std::size_t calc_bson_object_size(const typename BasicJsonType::object_t& value)
    {
        const std::size_t document_size = std::accumulate(value.begin(), value.end(), static_cast<std::size_t>(0),
                                          [](size_t result, const typename BasicJsonType::object_t::value_type & el)
        {
            return result += calc_bson_element_size(el.first, el.second);
        });

        return sizeof(std::int32_t) + document_size + 1ul;
    }

    /*!
    @param[in] value  JSON value to serialize
    @pre       value.type() == value_t::object
    */
    void write_bson_object(const typename BasicJsonType::object_t& value)
    {
        write_number<std::int32_t>(static_cast<std::int32_t>(calc_bson_object_size(value)), true);

        for (const auto& el : value)
        {
            write_bson_element(el.first, el.second);
        }

        oa->write_character(to_char_type(0x00));
    }

    //////////
    // CBOR //
    //////////

    static constexpr CharType get_cbor_float_prefix(float /*unused*/)
    {
        return to_char_type(0xFA);  // Single-Precision Float
    }

    static constexpr CharType get_cbor_float_prefix(double /*unused*/)
    {
        return to_char_type(0xFB);  // Double-Precision Float
    }

    /////////////
    // MsgPack //
    /////////////

    static constexpr CharType get_msgpack_float_prefix(float /*unused*/)
    {
        return to_char_type(0xCA);  // float 32
    }

    static constexpr CharType get_msgpack_float_prefix(double /*unused*/)
    {
        return to_char_type(0xCB);  // float 64
    }

    ////////////
    // UBJSON //
    ////////////

    // UBJSON: write number (floating point)
    template<typename NumberType, typename std::enable_if<
                 std::is_floating_point<NumberType>::value, int>::type = 0>
    void write_number_with_ubjson_prefix(const NumberType n,
                                         const bool add_prefix,
                                         const bool use_bjdata)
    {
        if (add_prefix)
        {
            oa->write_character(get_ubjson_float_prefix(n));
        }
        write_number(n, use_bjdata);
    }

    // UBJSON: write number (unsigned integer)
    template<typename NumberType, typename std::enable_if<
                 std::is_unsigned<NumberType>::value, int>::type = 0>
    void write_number_with_ubjson_prefix(const NumberType n,
                                         const bool add_prefix,
                                         const bool use_bjdata)
    {
        if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int8_t>::max)()))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('i'));  // int8
            }
            write_number(static_cast<std::uint8_t>(n), use_bjdata);
        }
        else if (n <= (std::numeric_limits<std::uint8_t>::max)())
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('U'));  // uint8
            }
            write_number(static_cast<std::uint8_t>(n), use_bjdata);
        }
        else if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int16_t>::max)()))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('I'));  // int16
            }
            write_number(static_cast<std::int16_t>(n), use_bjdata);
        }
        else if (use_bjdata && n <= static_cast<uint64_t>((std::numeric_limits<uint16_t>::max)()))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('u'));  // uint16 - bjdata only
            }
            write_number(static_cast<std::uint16_t>(n), use_bjdata);
        }
        else if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('l'));  // int32
            }
            write_number(static_cast<std::int32_t>(n), use_bjdata);
        }
        else if (use_bjdata && n <= static_cast<uint64_t>((std::numeric_limits<uint32_t>::max)()))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('m'));  // uint32 - bjdata only
            }
            write_number(static_cast<std::uint32_t>(n), use_bjdata);
        }
        else if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('L'));  // int64
            }
            write_number(static_cast<std::int64_t>(n), use_bjdata);
        }
        else if (use_bjdata && n <= (std::numeric_limits<uint64_t>::max)())
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('M'));  // uint64 - bjdata only
            }
            write_number(static_cast<std::uint64_t>(n), use_bjdata);
        }
        else
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('H'));  // high-precision number
            }

            const auto number = BasicJsonType(n).dump();
            write_number_with_ubjson_prefix(number.size(), true, use_bjdata);
            for (std::size_t i = 0; i < number.size(); ++i)
            {
                oa->write_character(to_char_type(static_cast<std::uint8_t>(number[i])));
            }
        }
    }

    // UBJSON: write number (signed integer)
    template < typename NumberType, typename std::enable_if <
                   std::is_signed<NumberType>::value&&
                   !std::is_floating_point<NumberType>::value, int >::type = 0 >
    void write_number_with_ubjson_prefix(const NumberType n,
                                         const bool add_prefix,
                                         const bool use_bjdata)
    {
        if ((std::numeric_limits<std::int8_t>::min)() <= n && n <= (std::numeric_limits<std::int8_t>::max)())
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('i'));  // int8
            }
            write_number(static_cast<std::int8_t>(n), use_bjdata);
        }
        else if (static_cast<std::int64_t>((std::numeric_limits<std::uint8_t>::min)()) <= n && n <= static_cast<std::int64_t>((std::numeric_limits<std::uint8_t>::max)()))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('U'));  // uint8
            }
            write_number(static_cast<std::uint8_t>(n), use_bjdata);
        }
        else if ((std::numeric_limits<std::int16_t>::min)() <= n && n <= (std::numeric_limits<std::int16_t>::max)())
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('I'));  // int16
            }
            write_number(static_cast<std::int16_t>(n), use_bjdata);
        }
        else if (use_bjdata && (static_cast<std::int64_t>((std::numeric_limits<std::uint16_t>::min)()) <= n && n <= static_cast<std::int64_t>((std::numeric_limits<std::uint16_t>::max)())))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('u'));  // uint16 - bjdata only
            }
            write_number(static_cast<uint16_t>(n), use_bjdata);
        }
        else if ((std::numeric_limits<std::int32_t>::min)() <= n && n <= (std::numeric_limits<std::int32_t>::max)())
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('l'));  // int32
            }
            write_number(static_cast<std::int32_t>(n), use_bjdata);
        }
        else if (use_bjdata && (static_cast<std::int64_t>((std::numeric_limits<std::uint32_t>::min)()) <= n && n <= static_cast<std::int64_t>((std::numeric_limits<std::uint32_t>::max)())))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('m'));  // uint32 - bjdata only
            }
            write_number(static_cast<uint32_t>(n), use_bjdata);
        }
        else if ((std::numeric_limits<std::int64_t>::min)() <= n && n <= (std::numeric_limits<std::int64_t>::max)())
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('L'));  // int64
            }
            write_number(static_cast<std::int64_t>(n), use_bjdata);
        }
        // LCOV_EXCL_START
        else
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('H'));  // high-precision number
            }

            const auto number = BasicJsonType(n).dump();
            write_number_with_ubjson_prefix(number.size(), true, use_bjdata);
            for (std::size_t i = 0; i < number.size(); ++i)
            {
                oa->write_character(to_char_type(static_cast<std::uint8_t>(number[i])));
            }
        }
        // LCOV_EXCL_STOP
    }

    /*!
    @brief determine the type prefix of container values
    */
    CharType ubjson_prefix(const BasicJsonType& j, const bool use_bjdata) const noexcept
    {
        switch (j.type())
        {
            case value_t::null:
                return 'Z';

            case value_t::boolean:
                return j.m_data.m_value.boolean ? 'T' : 'F';

            case value_t::number_integer:
            {
                if ((std::numeric_limits<std::int8_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::int8_t>::max)())
                {
                    return 'i';
                }
                if ((std::numeric_limits<std::uint8_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    return 'U';
                }
                if ((std::numeric_limits<std::int16_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::int16_t>::max)())
                {
                    return 'I';
                }
                if (use_bjdata && ((std::numeric_limits<std::uint16_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::uint16_t>::max)()))
                {
                    return 'u';
                }
                if ((std::numeric_limits<std::int32_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::int32_t>::max)())
                {
                    return 'l';
                }
                if (use_bjdata && ((std::numeric_limits<std::uint32_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::uint32_t>::max)()))
                {
                    return 'm';
                }
                if ((std::numeric_limits<std::int64_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::int64_t>::max)())
                {
                    return 'L';
                }
                // anything else is treated as high-precision number
                return 'H'; // LCOV_EXCL_LINE
            }

            case value_t::number_unsigned:
            {
                if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int8_t>::max)()))
                {
                    return 'i';
                }
                if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::uint8_t>::max)()))
                {
                    return 'U';
                }
                if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int16_t>::max)()))
                {
                    return 'I';
                }
                if (use_bjdata && j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::uint16_t>::max)()))
                {
                    return 'u';
                }
                if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
                {
                    return 'l';
                }
                if (use_bjdata && j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::uint32_t>::max)()))
                {
                    return 'm';
                }
                if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
                {
                    return 'L';
                }
                if (use_bjdata && j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
                {
                    return 'M';
                }
                // anything else is treated as high-precision number
                return 'H'; // LCOV_EXCL_LINE
            }

            case value_t::number_float:
                return get_ubjson_float_prefix(j.m_data.m_value.number_float);

            case value_t::string:
                return 'S';

            case value_t::array: // fallthrough
            case value_t::binary:
                return '[';

            case value_t::object:
                return '{';

            case value_t::discarded:
            default:  // discarded values
                return 'N';
        }
    }

    static constexpr CharType get_ubjson_float_prefix(float /*unused*/)
    {
        return 'd';  // float 32
    }

    static constexpr CharType get_ubjson_float_prefix(double /*unused*/)
    {
        return 'D';  // float 64
    }

    /*!
    @return false if the object is successfully converted to a bjdata ndarray, true if the type or size is invalid
    */
    bool write_bjdata_ndarray(const typename BasicJsonType::object_t& value, const bool use_count, const bool use_type)
    {
        std::map<string_t, CharType> bjdtype = {{"uint8", 'U'},  {"int8", 'i'},  {"uint16", 'u'}, {"int16", 'I'},
            {"uint32", 'm'}, {"int32", 'l'}, {"uint64", 'M'}, {"int64", 'L'}, {"single", 'd'}, {"double", 'D'}, {"char", 'C'}
        };

        string_t key = "_ArrayType_";
        auto it = bjdtype.find(static_cast<string_t>(value.at(key)));
        if (it == bjdtype.end())
        {
            return true;
        }
        CharType dtype = it->second;

        key = "_ArraySize_";
        std::size_t len = (value.at(key).empty() ? 0 : 1);
        for (const auto& el : value.at(key))
        {
            len *= static_cast<std::size_t>(el.m_data.m_value.number_unsigned);
        }

        key = "_ArrayData_";
        if (value.at(key).size() != len)
        {
            return true;
        }

        oa->write_character('[');
        oa->write_character('$');
        oa->write_character(dtype);
        oa->write_character('#');

        key = "_ArraySize_";
        write_ubjson(value.at(key), use_count, use_type, true,  true);

        key = "_ArrayData_";
        if (dtype == 'U' || dtype == 'C')
        {
            for (const auto& el : value.at(key))
            {
                write_number(static_cast<std::uint8_t>(el.m_data.m_value.number_unsigned), true);
            }
        }
        else if (dtype == 'i')
        {
            for (const auto& el : value.at(key))
            {
                write_number(static_cast<std::int8_t>(el.m_data.m_value.number_integer), true);
            }
        }
        else if (dtype == 'u')
        {
            for (const auto& el : value.at(key))
            {
                write_number(static_cast<std::uint16_t>(el.m_data.m_value.number_unsigned), true);
            }
        }
        else if (dtype == 'I')
        {
            for (const auto& el : value.at(key))
            {
                write_number(static_cast<std::int16_t>(el.m_data.m_value.number_integer), true);
            }
        }
        else if (dtype == 'm')
        {
            for (const auto& el : value.at(key))
            {
                write_number(static_cast<std::uint32_t>(el.m_data.m_value.number_unsigned), true);
            }
        }
        else if (dtype == 'l')
        {
            for (const auto& el : value.at(key))
            {
                write_number(static_cast<std::int32_t>(el.m_data.m_value.number_integer), true);
            }
        }
        else if (dtype == 'M')
        {
            for (const auto& el : value.at(key))
            {
                write_number(static_cast<std::uint64_t>(el.m_data.m_value.number_unsigned), true);
            }
        }
        else if (dtype == 'L')
        {
            for (const auto& el : value.at(key))
            {
                write_number(static_cast<std::int64_t>(el.m_data.m_value.number_integer), true);
            }
        }
        else if (dtype == 'd')
        {
            for (const auto& el : value.at(key))
            {
                write_number(static_cast<float>(el.m_data.m_value.number_float), true);
            }
        }
        else if (dtype == 'D')
        {
            for (const auto& el : value.at(key))
            {
                write_number(static_cast<double>(el.m_data.m_value.number_float), true);
            }
        }
        return false;
    }

    ///////////////////////
    // Utility functions //
    ///////////////////////

    /*
    @brief write a number to output input
    @param[in] n number of type @a NumberType
    @param[in] OutputIsLittleEndian Set to true if output data is
                                 required to be little endian
    @tparam NumberType the type of the number

    @note This function needs to respect the system's endianness, because bytes
          in CBOR, MessagePack, and UBJSON are stored in network order (big
          endian) and therefore need reordering on little endian systems.
          On the other hand, BSON and BJData use little endian and should reorder
          on big endian systems.
    */
    template<typename NumberType>
    void write_number(const NumberType n, const bool OutputIsLittleEndian = false)
    {
        // step 1: write number to array of length NumberType
        std::array<CharType, sizeof(NumberType)> vec{};
        std::memcpy(vec.data(), &n, sizeof(NumberType));

        // step 2: write array to output (with possible reordering)
        if (is_little_endian != OutputIsLittleEndian)
        {
            // reverse byte order prior to conversion if necessary
            std::reverse(vec.begin(), vec.end());
        }

        oa->write_characters(vec.data(), sizeof(NumberType));
    }

    void write_compact_float(const number_float_t n, detail::input_format_t format)
    {
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
        if (static_cast<double>(n) >= static_cast<double>(std::numeric_limits<float>::lowest()) &&
                static_cast<double>(n) <= static_cast<double>((std::numeric_limits<float>::max)()) &&
                static_cast<double>(static_cast<float>(n)) == static_cast<double>(n))
        {
            oa->write_character(format == detail::input_format_t::cbor
                                ? get_cbor_float_prefix(static_cast<float>(n))
                                : get_msgpack_float_prefix(static_cast<float>(n)));
            write_number(static_cast<float>(n));
        }
        else
        {
            oa->write_character(format == detail::input_format_t::cbor
                                ? get_cbor_float_prefix(n)
                                : get_msgpack_float_prefix(n));
            write_number(n);
        }
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
    }

  public:
    // The following to_char_type functions are implement the conversion
    // between uint8_t and CharType. In case CharType is not unsigned,
    // such a conversion is required to allow values greater than 128.
    // See <https://github.com/nlohmann/json/issues/1286> for a discussion.
    template < typename C = CharType,
               enable_if_t < std::is_signed<C>::value && std::is_signed<char>::value > * = nullptr >
    static constexpr CharType to_char_type(std::uint8_t x) noexcept
    {
        return *reinterpret_cast<char*>(&x);
    }

    template < typename C = CharType,
               enable_if_t < std::is_signed<C>::value && std::is_unsigned<char>::value > * = nullptr >
    static CharType to_char_type(std::uint8_t x) noexcept
    {
        static_assert(sizeof(std::uint8_t) == sizeof(CharType), "size of CharType must be equal to std::uint8_t");
        static_assert(std::is_trivial<CharType>::value, "CharType must be trivial");
        CharType result;
        std::memcpy(&result, &x, sizeof(x));
        return result;
    }

    template<typename C = CharType,
             enable_if_t<std::is_unsigned<C>::value>* = nullptr>
    static constexpr CharType to_char_type(std::uint8_t x) noexcept
    {
        return x;
    }

    template < typename InputCharType, typename C = CharType,
               enable_if_t <
                   std::is_signed<C>::value &&
                   std::is_signed<char>::value &&
                   std::is_same<char, typename std::remove_cv<InputCharType>::type>::value
                   > * = nullptr >
    static constexpr CharType to_char_type(InputCharType x) noexcept
    {
        return x;
    }

  private:
    /// whether we can assume little endianness
    const bool is_little_endian = little_endianness();

    /// the output
    output_adapter_t<CharType> oa = nullptr;
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/output/output_adapters.hpp>

// #include <nlohmann/detail/output/serializer.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2008-2009 Björn Hoehrmann <bjoern@hoehrmann.de>
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <algorithm> // reverse, remove, fill, find, none_of
#include <array> // array
#include <clocale> // localeconv, lconv
#include <cmath> // labs, isfinite, isnan, signbit
#include <cstddef> // size_t, ptrdiff_t
#include <cstdint> // uint8_t
#include <cstdio> // snprintf
#include <limits> // numeric_limits
#include <string> // string, char_traits
#include <iomanip> // setfill, setw
#include <type_traits> // is_same
#include <utility> // move

// #include <nlohmann/detail/conversions/to_chars.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2009 Florian Loitsch <https://florian.loitsch.com/>
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <array> // array
#include <cmath>   // signbit, isfinite
#include <cstdint> // intN_t, uintN_t
#include <cstring> // memcpy, memmove
#include <limits> // numeric_limits
#include <type_traits> // conditional

// #include <nlohmann/detail/macro_scope.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

/*!
@brief implements the Grisu2 algorithm for binary to decimal floating-point
conversion.

This implementation is a slightly modified version of the reference
implementation which may be obtained from
http://florian.loitsch.com/publications (bench.tar.gz).

The code is distributed under the MIT license, Copyright (c) 2009 Florian Loitsch.

For a detailed description of the algorithm see:

[1] Loitsch, "Printing Floating-Point Numbers Quickly and Accurately with
    Integers", Proceedings of the ACM SIGPLAN 2010 Conference on Programming
    Language Design and Implementation, PLDI 2010
[2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and Accurately",
    Proceedings of the ACM SIGPLAN 1996 Conference on Programming Language
    Design and Implementation, PLDI 1996
*/
namespace dtoa_impl
{

template<typename Target, typename Source>
Target reinterpret_bits(const Source source)
{
    static_assert(sizeof(Target) == sizeof(Source), "size mismatch");

    Target target;
    std::memcpy(&target, &source, sizeof(Source));
    return target;
}

struct diyfp // f * 2^e
{
    static constexpr int kPrecision = 64; // = q

    std::uint64_t f = 0;
    int e = 0;

    constexpr diyfp(std::uint64_t f_, int e_) noexcept : f(f_), e(e_) {}

    /*!
    @brief returns x - y
    @pre x.e == y.e and x.f >= y.f
    */
    static diyfp sub(const diyfp& x, const diyfp& y) noexcept
    {
        JSON_ASSERT(x.e == y.e);
        JSON_ASSERT(x.f >= y.f);

        return {x.f - y.f, x.e};
    }

    /*!
    @brief returns x * y
    @note The result is rounded. (Only the upper q bits are returned.)
    */
    static diyfp mul(const diyfp& x, const diyfp& y) noexcept
    {
        static_assert(kPrecision == 64, "internal error");

        // Computes:
        //  f = round((x.f * y.f) / 2^q)
        //  e = x.e + y.e + q

        // Emulate the 64-bit * 64-bit multiplication:
        //
        // p = u * v
        //   = (u_lo + 2^32 u_hi) (v_lo + 2^32 v_hi)
        //   = (u_lo v_lo         ) + 2^32 ((u_lo v_hi         ) + (u_hi v_lo         )) + 2^64 (u_hi v_hi         )
        //   = (p0                ) + 2^32 ((p1                ) + (p2                )) + 2^64 (p3                )
        //   = (p0_lo + 2^32 p0_hi) + 2^32 ((p1_lo + 2^32 p1_hi) + (p2_lo + 2^32 p2_hi)) + 2^64 (p3                )
        //   = (p0_lo             ) + 2^32 (p0_hi + p1_lo + p2_lo                      ) + 2^64 (p1_hi + p2_hi + p3)
        //   = (p0_lo             ) + 2^32 (Q                                          ) + 2^64 (H                 )
        //   = (p0_lo             ) + 2^32 (Q_lo + 2^32 Q_hi                           ) + 2^64 (H                 )
        //
        // (Since Q might be larger than 2^32 - 1)
        //
        //   = (p0_lo + 2^32 Q_lo) + 2^64 (Q_hi + H)
        //
        // (Q_hi + H does not overflow a 64-bit int)
        //
        //   = p_lo + 2^64 p_hi

        const std::uint64_t u_lo = x.f & 0xFFFFFFFFu;
        const std::uint64_t u_hi = x.f >> 32u;
        const std::uint64_t v_lo = y.f & 0xFFFFFFFFu;
        const std::uint64_t v_hi = y.f >> 32u;

        const std::uint64_t p0 = u_lo * v_lo;
        const std::uint64_t p1 = u_lo * v_hi;
        const std::uint64_t p2 = u_hi * v_lo;
        const std::uint64_t p3 = u_hi * v_hi;

        const std::uint64_t p0_hi = p0 >> 32u;
        const std::uint64_t p1_lo = p1 & 0xFFFFFFFFu;
        const std::uint64_t p1_hi = p1 >> 32u;
        const std::uint64_t p2_lo = p2 & 0xFFFFFFFFu;
        const std::uint64_t p2_hi = p2 >> 32u;

        std::uint64_t Q = p0_hi + p1_lo + p2_lo;

        // The full product might now be computed as
        //
        // p_hi = p3 + p2_hi + p1_hi + (Q >> 32)
        // p_lo = p0_lo + (Q << 32)
        //
        // But in this particular case here, the full p_lo is not required.
        // Effectively we only need to add the highest bit in p_lo to p_hi (and
        // Q_hi + 1 does not overflow).

        Q += std::uint64_t{1} << (64u - 32u - 1u); // round, ties up

        const std::uint64_t h = p3 + p2_hi + p1_hi + (Q >> 32u);

        return {h, x.e + y.e + 64};
    }

    /*!
    @brief normalize x such that the significand is >= 2^(q-1)
    @pre x.f != 0
    */
    static diyfp normalize(diyfp x) noexcept
    {
        JSON_ASSERT(x.f != 0);

        while ((x.f >> 63u) == 0)
        {
            x.f <<= 1u;
            x.e--;
        }

        return x;
    }

    /*!
    @brief normalize x such that the result has the exponent E
    @pre e >= x.e and the upper e - x.e bits of x.f must be zero.
    */
    static diyfp normalize_to(const diyfp& x, const int target_exponent) noexcept
    {
        const int delta = x.e - target_exponent;

        JSON_ASSERT(delta >= 0);
        JSON_ASSERT(((x.f << delta) >> delta) == x.f);

        return {x.f << delta, target_exponent};
    }
};

struct boundaries
{
    diyfp w;
    diyfp minus;
    diyfp plus;
};

/*!
Compute the (normalized) diyfp representing the input number 'value' and its
boundaries.

@pre value must be finite and positive
*/
template<typename FloatType>
boundaries compute_boundaries(FloatType value)
{
    JSON_ASSERT(std::isfinite(value));
    JSON_ASSERT(value > 0);

    // Convert the IEEE representation into a diyfp.
    //
    // If v is denormal:
    //      value = 0.F * 2^(1 - bias) = (          F) * 2^(1 - bias - (p-1))
    // If v is normalized:
    //      value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1))

    static_assert(std::numeric_limits<FloatType>::is_iec559,
                  "internal error: dtoa_short requires an IEEE-754 floating-point implementation");

    constexpr int      kPrecision = std::numeric_limits<FloatType>::digits; // = p (includes the hidden bit)
    constexpr int      kBias      = std::numeric_limits<FloatType>::max_exponent - 1 + (kPrecision - 1);
    constexpr int      kMinExp    = 1 - kBias;
    constexpr std::uint64_t kHiddenBit = std::uint64_t{1} << (kPrecision - 1); // = 2^(p-1)

    using bits_type = typename std::conditional<kPrecision == 24, std::uint32_t, std::uint64_t >::type;

    const auto bits = static_cast<std::uint64_t>(reinterpret_bits<bits_type>(value));
    const std::uint64_t E = bits >> (kPrecision - 1);
    const std::uint64_t F = bits & (kHiddenBit - 1);

    const bool is_denormal = E == 0;
    const diyfp v = is_denormal
                    ? diyfp(F, kMinExp)
                    : diyfp(F + kHiddenBit, static_cast<int>(E) - kBias);

    // Compute the boundaries m- and m+ of the floating-point value
    // v = f * 2^e.
    //
    // Determine v- and v+, the floating-point predecessor and successor if v,
    // respectively.
    //
    //      v- = v - 2^e        if f != 2^(p-1) or e == e_min                (A)
    //         = v - 2^(e-1)    if f == 2^(p-1) and e > e_min                (B)
    //
    //      v+ = v + 2^e
    //
    // Let m- = (v- + v) / 2 and m+ = (v + v+) / 2. All real numbers _strictly_
    // between m- and m+ round to v, regardless of how the input rounding
    // algorithm breaks ties.
    //
    //      ---+-------------+-------------+-------------+-------------+---  (A)
    //         v-            m-            v             m+            v+
    //
    //      -----------------+------+------+-------------+-------------+---  (B)
    //                       v-     m-     v             m+            v+

    const bool lower_boundary_is_closer = F == 0 && E > 1;
    const diyfp m_plus = diyfp(2 * v.f + 1, v.e - 1);
    const diyfp m_minus = lower_boundary_is_closer
                          ? diyfp(4 * v.f - 1, v.e - 2)  // (B)
                          : diyfp(2 * v.f - 1, v.e - 1); // (A)

    // Determine the normalized w+ = m+.
    const diyfp w_plus = diyfp::normalize(m_plus);

    // Determine w- = m- such that e_(w-) = e_(w+).
    const diyfp w_minus = diyfp::normalize_to(m_minus, w_plus.e);

    return {diyfp::normalize(v), w_minus, w_plus};
}

// Given normalized diyfp w, Grisu needs to find a (normalized) cached
// power-of-ten c, such that the exponent of the product c * w = f * 2^e lies
// within a certain range [alpha, gamma] (Definition 3.2 from [1])
//
//      alpha <= e = e_c + e_w + q <= gamma
//
// or
//
//      f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q
//                          <= f_c * f_w * 2^gamma
//
// Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies
//
//      2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma
//
// or
//
//      2^(q - 2 + alpha) <= c * w < 2^(q + gamma)
//
// The choice of (alpha,gamma) determines the size of the table and the form of
// the digit generation procedure. Using (alpha,gamma)=(-60,-32) works out well
// in practice:
//
// The idea is to cut the number c * w = f * 2^e into two parts, which can be
// processed independently: An integral part p1, and a fractional part p2:
//
//      f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e
//              = (f div 2^-e) + (f mod 2^-e) * 2^e
//              = p1 + p2 * 2^e
//
// The conversion of p1 into decimal form requires a series of divisions and
// modulos by (a power of) 10. These operations are faster for 32-bit than for
// 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be
// achieved by choosing
//
//      -e >= 32   or   e <= -32 := gamma
//
// In order to convert the fractional part
//
//      p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ...
//
// into decimal form, the fraction is repeatedly multiplied by 10 and the digits
// d[-i] are extracted in order:
//
//      (10 * p2) div 2^-e = d[-1]
//      (10 * p2) mod 2^-e = d[-2] / 10^1 + ...
//
// The multiplication by 10 must not overflow. It is sufficient to choose
//
//      10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64.
//
// Since p2 = f mod 2^-e < 2^-e,
//
//      -e <= 60   or   e >= -60 := alpha

constexpr int kAlpha = -60;
constexpr int kGamma = -32;

struct cached_power // c = f * 2^e ~= 10^k
{
    std::uint64_t f;
    int e;
    int k;
};

/*!
For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached
power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c
satisfies (Definition 3.2 from [1])

     alpha <= e_c + e + q <= gamma.
*/
inline cached_power get_cached_power_for_binary_exponent(int e)
{
    // Now
    //
    //      alpha <= e_c + e + q <= gamma                                    (1)
    //      ==> f_c * 2^alpha <= c * 2^e * 2^q
    //
    // and since the c's are normalized, 2^(q-1) <= f_c,
    //
    //      ==> 2^(q - 1 + alpha) <= c * 2^(e + q)
    //      ==> 2^(alpha - e - 1) <= c
    //
    // If c were an exact power of ten, i.e. c = 10^k, one may determine k as
    //
    //      k = ceil( log_10( 2^(alpha - e - 1) ) )
    //        = ceil( (alpha - e - 1) * log_10(2) )
    //
    // From the paper:
    // "In theory the result of the procedure could be wrong since c is rounded,
    //  and the computation itself is approximated [...]. In practice, however,
    //  this simple function is sufficient."
    //
    // For IEEE double precision floating-point numbers converted into
    // normalized diyfp's w = f * 2^e, with q = 64,
    //
    //      e >= -1022      (min IEEE exponent)
    //           -52        (p - 1)
    //           -52        (p - 1, possibly normalize denormal IEEE numbers)
    //           -11        (normalize the diyfp)
    //         = -1137
    //
    // and
    //
    //      e <= +1023      (max IEEE exponent)
    //           -52        (p - 1)
    //           -11        (normalize the diyfp)
    //         = 960
    //
    // This binary exponent range [-1137,960] results in a decimal exponent
    // range [-307,324]. One does not need to store a cached power for each
    // k in this range. For each such k it suffices to find a cached power
    // such that the exponent of the product lies in [alpha,gamma].
    // This implies that the difference of the decimal exponents of adjacent
    // table entries must be less than or equal to
    //
    //      floor( (gamma - alpha) * log_10(2) ) = 8.
    //
    // (A smaller distance gamma-alpha would require a larger table.)

    // NB:
    // Actually this function returns c, such that -60 <= e_c + e + 64 <= -34.

    constexpr int kCachedPowersMinDecExp = -300;
    constexpr int kCachedPowersDecStep = 8;

    static constexpr std::array<cached_power, 79> kCachedPowers =
    {
        {
            { 0xAB70FE17C79AC6CA, -1060, -300 },
            { 0xFF77B1FCBEBCDC4F, -1034, -292 },
            { 0xBE5691EF416BD60C, -1007, -284 },
            { 0x8DD01FAD907FFC3C,  -980, -276 },
            { 0xD3515C2831559A83,  -954, -268 },
            { 0x9D71AC8FADA6C9B5,  -927, -260 },
            { 0xEA9C227723EE8BCB,  -901, -252 },
            { 0xAECC49914078536D,  -874, -244 },
            { 0x823C12795DB6CE57,  -847, -236 },
            { 0xC21094364DFB5637,  -821, -228 },
            { 0x9096EA6F3848984F,  -794, -220 },
            { 0xD77485CB25823AC7,  -768, -212 },
            { 0xA086CFCD97BF97F4,  -741, -204 },
            { 0xEF340A98172AACE5,  -715, -196 },
            { 0xB23867FB2A35B28E,  -688, -188 },
            { 0x84C8D4DFD2C63F3B,  -661, -180 },
            { 0xC5DD44271AD3CDBA,  -635, -172 },
            { 0x936B9FCEBB25C996,  -608, -164 },
            { 0xDBAC6C247D62A584,  -582, -156 },
            { 0xA3AB66580D5FDAF6,  -555, -148 },
            { 0xF3E2F893DEC3F126,  -529, -140 },
            { 0xB5B5ADA8AAFF80B8,  -502, -132 },
            { 0x87625F056C7C4A8B,  -475, -124 },
            { 0xC9BCFF6034C13053,  -449, -116 },
            { 0x964E858C91BA2655,  -422, -108 },
            { 0xDFF9772470297EBD,  -396, -100 },
            { 0xA6DFBD9FB8E5B88F,  -369,  -92 },
            { 0xF8A95FCF88747D94,  -343,  -84 },
            { 0xB94470938FA89BCF,  -316,  -76 },
            { 0x8A08F0F8BF0F156B,  -289,  -68 },
            { 0xCDB02555653131B6,  -263,  -60 },
            { 0x993FE2C6D07B7FAC,  -236,  -52 },
            { 0xE45C10C42A2B3B06,  -210,  -44 },
            { 0xAA242499697392D3,  -183,  -36 },
            { 0xFD87B5F28300CA0E,  -157,  -28 },
            { 0xBCE5086492111AEB,  -130,  -20 },
            { 0x8CBCCC096F5088CC,  -103,  -12 },
            { 0xD1B71758E219652C,   -77,   -4 },
            { 0x9C40000000000000,   -50,    4 },
            { 0xE8D4A51000000000,   -24,   12 },
            { 0xAD78EBC5AC620000,     3,   20 },
            { 0x813F3978F8940984,    30,   28 },
            { 0xC097CE7BC90715B3,    56,   36 },
            { 0x8F7E32CE7BEA5C70,    83,   44 },
            { 0xD5D238A4ABE98068,   109,   52 },
            { 0x9F4F2726179A2245,   136,   60 },
            { 0xED63A231D4C4FB27,   162,   68 },
            { 0xB0DE65388CC8ADA8,   189,   76 },
            { 0x83C7088E1AAB65DB,   216,   84 },
            { 0xC45D1DF942711D9A,   242,   92 },
            { 0x924D692CA61BE758,   269,  100 },
            { 0xDA01EE641A708DEA,   295,  108 },
            { 0xA26DA3999AEF774A,   322,  116 },
            { 0xF209787BB47D6B85,   348,  124 },
            { 0xB454E4A179DD1877,   375,  132 },
            { 0x865B86925B9BC5C2,   402,  140 },
            { 0xC83553C5C8965D3D,   428,  148 },
            { 0x952AB45CFA97A0B3,   455,  156 },
            { 0xDE469FBD99A05FE3,   481,  164 },
            { 0xA59BC234DB398C25,   508,  172 },
            { 0xF6C69A72A3989F5C,   534,  180 },
            { 0xB7DCBF5354E9BECE,   561,  188 },
            { 0x88FCF317F22241E2,   588,  196 },
            { 0xCC20CE9BD35C78A5,   614,  204 },
            { 0x98165AF37B2153DF,   641,  212 },
            { 0xE2A0B5DC971F303A,   667,  220 },
            { 0xA8D9D1535CE3B396,   694,  228 },
            { 0xFB9B7CD9A4A7443C,   720,  236 },
            { 0xBB764C4CA7A44410,   747,  244 },
            { 0x8BAB8EEFB6409C1A,   774,  252 },
            { 0xD01FEF10A657842C,   800,  260 },
            { 0x9B10A4E5E9913129,   827,  268 },
            { 0xE7109BFBA19C0C9D,   853,  276 },
            { 0xAC2820D9623BF429,   880,  284 },
            { 0x80444B5E7AA7CF85,   907,  292 },
            { 0xBF21E44003ACDD2D,   933,  300 },
            { 0x8E679C2F5E44FF8F,   960,  308 },
            { 0xD433179D9C8CB841,   986,  316 },
            { 0x9E19DB92B4E31BA9,  1013,  324 },
        }
    };

    // This computation gives exactly the same results for k as
    //      k = ceil((kAlpha - e - 1) * 0.30102999566398114)
    // for |e| <= 1500, but doesn't require floating-point operations.
    // NB: log_10(2) ~= 78913 / 2^18
    JSON_ASSERT(e >= -1500);
    JSON_ASSERT(e <=  1500);
    const int f = kAlpha - e - 1;
    const int k = (f * 78913) / (1 << 18) + static_cast<int>(f > 0);

    const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) / kCachedPowersDecStep;
    JSON_ASSERT(index >= 0);
    JSON_ASSERT(static_cast<std::size_t>(index) < kCachedPowers.size());

    const cached_power cached = kCachedPowers[static_cast<std::size_t>(index)];
    JSON_ASSERT(kAlpha <= cached.e + e + 64);
    JSON_ASSERT(kGamma >= cached.e + e + 64);

    return cached;
}

/*!
For n != 0, returns k, such that pow10 := 10^(k-1) <= n < 10^k.
For n == 0, returns 1 and sets pow10 := 1.
*/
inline int find_largest_pow10(const std::uint32_t n, std::uint32_t& pow10)
{
    // LCOV_EXCL_START
    if (n >= 1000000000)
    {
        pow10 = 1000000000;
        return 10;
    }
    // LCOV_EXCL_STOP
    if (n >= 100000000)
    {
        pow10 = 100000000;
        return  9;
    }
    if (n >= 10000000)
    {
        pow10 = 10000000;
        return  8;
    }
    if (n >= 1000000)
    {
        pow10 = 1000000;
        return  7;
    }
    if (n >= 100000)
    {
        pow10 = 100000;
        return  6;
    }
    if (n >= 10000)
    {
        pow10 = 10000;
        return  5;
    }
    if (n >= 1000)
    {
        pow10 = 1000;
        return  4;
    }
    if (n >= 100)
    {
        pow10 = 100;
        return  3;
    }
    if (n >= 10)
    {
        pow10 = 10;
        return  2;
    }

    pow10 = 1;
    return 1;
}

inline void grisu2_round(char* buf, int len, std::uint64_t dist, std::uint64_t delta,
                         std::uint64_t rest, std::uint64_t ten_k)
{
    JSON_ASSERT(len >= 1);
    JSON_ASSERT(dist <= delta);
    JSON_ASSERT(rest <= delta);
    JSON_ASSERT(ten_k > 0);

    //               <--------------------------- delta ---->
    //                                  <---- dist --------->
    // --------------[------------------+-------------------]--------------
    //               M-                 w                   M+
    //
    //                                  ten_k
    //                                <------>
    //                                       <---- rest ---->
    // --------------[------------------+----+--------------]--------------
    //                                  w    V
    //                                       = buf * 10^k
    //
    // ten_k represents a unit-in-the-last-place in the decimal representation
    // stored in buf.
    // Decrement buf by ten_k while this takes buf closer to w.

    // The tests are written in this order to avoid overflow in unsigned
    // integer arithmetic.

    while (rest < dist
            && delta - rest >= ten_k
            && (rest + ten_k < dist || dist - rest > rest + ten_k - dist))
    {
        JSON_ASSERT(buf[len - 1] != '0');
        buf[len - 1]--;
        rest += ten_k;
    }
}

/*!
Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+.
M- and M+ must be normalized and share the same exponent -60 <= e <= -32.
*/
inline void grisu2_digit_gen(char* buffer, int& length, int& decimal_exponent,
                             diyfp M_minus, diyfp w, diyfp M_plus)
{
    static_assert(kAlpha >= -60, "internal error");
    static_assert(kGamma <= -32, "internal error");

    // Generates the digits (and the exponent) of a decimal floating-point
    // number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's
    // w, M- and M+ share the same exponent e, which satisfies alpha <= e <= gamma.
    //
    //               <--------------------------- delta ---->
    //                                  <---- dist --------->
    // --------------[------------------+-------------------]--------------
    //               M-                 w                   M+
    //
    // Grisu2 generates the digits of M+ from left to right and stops as soon as
    // V is in [M-,M+].

    JSON_ASSERT(M_plus.e >= kAlpha);
    JSON_ASSERT(M_plus.e <= kGamma);

    std::uint64_t delta = diyfp::sub(M_plus, M_minus).f; // (significand of (M+ - M-), implicit exponent is e)
    std::uint64_t dist  = diyfp::sub(M_plus, w      ).f; // (significand of (M+ - w ), implicit exponent is e)

    // Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0):
    //
    //      M+ = f * 2^e
    //         = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e
    //         = ((p1        ) * 2^-e + (p2        )) * 2^e
    //         = p1 + p2 * 2^e

    const diyfp one(std::uint64_t{1} << -M_plus.e, M_plus.e);

    auto p1 = static_cast<std::uint32_t>(M_plus.f >> -one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.)
    std::uint64_t p2 = M_plus.f & (one.f - 1);                    // p2 = f mod 2^-e

    // 1)
    //
    // Generate the digits of the integral part p1 = d[n-1]...d[1]d[0]

    JSON_ASSERT(p1 > 0);

    std::uint32_t pow10{};
    const int k = find_largest_pow10(p1, pow10);

    //      10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1)
    //
    //      p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1))
    //         = (d[k-1]         ) * 10^(k-1) + (p1 mod 10^(k-1))
    //
    //      M+ = p1                                             + p2 * 2^e
    //         = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1))          + p2 * 2^e
    //         = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e
    //         = d[k-1] * 10^(k-1) + (                         rest) * 2^e
    //
    // Now generate the digits d[n] of p1 from left to right (n = k-1,...,0)
    //
    //      p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0]
    //
    // but stop as soon as
    //
    //      rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e

    int n = k;
    while (n > 0)
    {
        // Invariants:
        //      M+ = buffer * 10^n + (p1 + p2 * 2^e)    (buffer = 0 for n = k)
        //      pow10 = 10^(n-1) <= p1 < 10^n
        //
        const std::uint32_t d = p1 / pow10;  // d = p1 div 10^(n-1)
        const std::uint32_t r = p1 % pow10;  // r = p1 mod 10^(n-1)
        //
        //      M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e
        //         = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e)
        //
        JSON_ASSERT(d <= 9);
        buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
        //
        //      M+ = buffer * 10^(n-1) + (r + p2 * 2^e)
        //
        p1 = r;
        n--;
        //
        //      M+ = buffer * 10^n + (p1 + p2 * 2^e)
        //      pow10 = 10^n
        //

        // Now check if enough digits have been generated.
        // Compute
        //
        //      p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e
        //
        // Note:
        // Since rest and delta share the same exponent e, it suffices to
        // compare the significands.
        const std::uint64_t rest = (std::uint64_t{p1} << -one.e) + p2;
        if (rest <= delta)
        {
            // V = buffer * 10^n, with M- <= V <= M+.

            decimal_exponent += n;

            // We may now just stop. But instead look if the buffer could be
            // decremented to bring V closer to w.
            //
            // pow10 = 10^n is now 1 ulp in the decimal representation V.
            // The rounding procedure works with diyfp's with an implicit
            // exponent of e.
            //
            //      10^n = (10^n * 2^-e) * 2^e = ulp * 2^e
            //
            const std::uint64_t ten_n = std::uint64_t{pow10} << -one.e;
            grisu2_round(buffer, length, dist, delta, rest, ten_n);

            return;
        }

        pow10 /= 10;
        //
        //      pow10 = 10^(n-1) <= p1 < 10^n
        // Invariants restored.
    }

    // 2)
    //
    // The digits of the integral part have been generated:
    //
    //      M+ = d[k-1]...d[1]d[0] + p2 * 2^e
    //         = buffer            + p2 * 2^e
    //
    // Now generate the digits of the fractional part p2 * 2^e.
    //
    // Note:
    // No decimal point is generated: the exponent is adjusted instead.
    //
    // p2 actually represents the fraction
    //
    //      p2 * 2^e
    //          = p2 / 2^-e
    //          = d[-1] / 10^1 + d[-2] / 10^2 + ...
    //
    // Now generate the digits d[-m] of p1 from left to right (m = 1,2,...)
    //
    //      p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m
    //                      + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...)
    //
    // using
    //
    //      10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e)
    //                = (                   d) * 2^-e + (                   r)
    //
    // or
    //      10^m * p2 * 2^e = d + r * 2^e
    //
    // i.e.
    //
    //      M+ = buffer + p2 * 2^e
    //         = buffer + 10^-m * (d + r * 2^e)
    //         = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e
    //
    // and stop as soon as 10^-m * r * 2^e <= delta * 2^e

    JSON_ASSERT(p2 > delta);

    int m = 0;
    for (;;)
    {
        // Invariant:
        //      M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...) * 2^e
        //         = buffer * 10^-m + 10^-m * (p2                                 ) * 2^e
        //         = buffer * 10^-m + 10^-m * (1/10 * (10 * p2)                   ) * 2^e
        //         = buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e + (10*p2 mod 2^-e)) * 2^e
        //
        JSON_ASSERT(p2 <= (std::numeric_limits<std::uint64_t>::max)() / 10);
        p2 *= 10;
        const std::uint64_t d = p2 >> -one.e;     // d = (10 * p2) div 2^-e
        const std::uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e
        //
        //      M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r) * 2^e
        //         = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e))
        //         = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e
        //
        JSON_ASSERT(d <= 9);
        buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
        //
        //      M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e
        //
        p2 = r;
        m++;
        //
        //      M+ = buffer * 10^-m + 10^-m * p2 * 2^e
        // Invariant restored.

        // Check if enough digits have been generated.
        //
        //      10^-m * p2 * 2^e <= delta * 2^e
        //              p2 * 2^e <= 10^m * delta * 2^e
        //                    p2 <= 10^m * delta
        delta *= 10;
        dist  *= 10;
        if (p2 <= delta)
        {
            break;
        }
    }

    // V = buffer * 10^-m, with M- <= V <= M+.

    decimal_exponent -= m;

    // 1 ulp in the decimal representation is now 10^-m.
    // Since delta and dist are now scaled by 10^m, we need to do the
    // same with ulp in order to keep the units in sync.
    //
    //      10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e
    //
    const std::uint64_t ten_m = one.f;
    grisu2_round(buffer, length, dist, delta, p2, ten_m);

    // By construction this algorithm generates the shortest possible decimal
    // number (Loitsch, Theorem 6.2) which rounds back to w.
    // For an input number of precision p, at least
    //
    //      N = 1 + ceil(p * log_10(2))
    //
    // decimal digits are sufficient to identify all binary floating-point
    // numbers (Matula, "In-and-Out conversions").
    // This implies that the algorithm does not produce more than N decimal
    // digits.
    //
    //      N = 17 for p = 53 (IEEE double precision)
    //      N = 9  for p = 24 (IEEE single precision)
}

/*!
v = buf * 10^decimal_exponent
len is the length of the buffer (number of decimal digits)
The buffer must be large enough, i.e. >= max_digits10.
*/
JSON_HEDLEY_NON_NULL(1)
inline void grisu2(char* buf, int& len, int& decimal_exponent,
                   diyfp m_minus, diyfp v, diyfp m_plus)
{
    JSON_ASSERT(m_plus.e == m_minus.e);
    JSON_ASSERT(m_plus.e == v.e);

    //  --------(-----------------------+-----------------------)--------    (A)
    //          m-                      v                       m+
    //
    //  --------------------(-----------+-----------------------)--------    (B)
    //                      m-          v                       m+
    //
    // First scale v (and m- and m+) such that the exponent is in the range
    // [alpha, gamma].

    const cached_power cached = get_cached_power_for_binary_exponent(m_plus.e);

    const diyfp c_minus_k(cached.f, cached.e); // = c ~= 10^-k

    // The exponent of the products is = v.e + c_minus_k.e + q and is in the range [alpha,gamma]
    const diyfp w       = diyfp::mul(v,       c_minus_k);
    const diyfp w_minus = diyfp::mul(m_minus, c_minus_k);
    const diyfp w_plus  = diyfp::mul(m_plus,  c_minus_k);

    //  ----(---+---)---------------(---+---)---------------(---+---)----
    //          w-                      w                       w+
    //          = c*m-                  = c*v                   = c*m+
    //
    // diyfp::mul rounds its result and c_minus_k is approximated too. w, w- and
    // w+ are now off by a small amount.
    // In fact:
    //
    //      w - v * 10^k < 1 ulp
    //
    // To account for this inaccuracy, add resp. subtract 1 ulp.
    //
    //  --------+---[---------------(---+---)---------------]---+--------
    //          w-  M-                  w                   M+  w+
    //
    // Now any number in [M-, M+] (bounds included) will round to w when input,
    // regardless of how the input rounding algorithm breaks ties.
    //
    // And digit_gen generates the shortest possible such number in [M-, M+].
    // Note that this does not mean that Grisu2 always generates the shortest
    // possible number in the interval (m-, m+).
    const diyfp M_minus(w_minus.f + 1, w_minus.e);
    const diyfp M_plus (w_plus.f  - 1, w_plus.e );

    decimal_exponent = -cached.k; // = -(-k) = k

    grisu2_digit_gen(buf, len, decimal_exponent, M_minus, w, M_plus);
}

/*!
v = buf * 10^decimal_exponent
len is the length of the buffer (number of decimal digits)
The buffer must be large enough, i.e. >= max_digits10.
*/
template<typename FloatType>
JSON_HEDLEY_NON_NULL(1)
void grisu2(char* buf, int& len, int& decimal_exponent, FloatType value)
{
    static_assert(diyfp::kPrecision >= std::numeric_limits<FloatType>::digits + 3,
                  "internal error: not enough precision");

    JSON_ASSERT(std::isfinite(value));
    JSON_ASSERT(value > 0);

    // If the neighbors (and boundaries) of 'value' are always computed for double-precision
    // numbers, all float's can be recovered using strtod (and strtof). However, the resulting
    // decimal representations are not exactly "short".
    //
    // The documentation for 'std::to_chars' (https://en.cppreference.com/w/cpp/utility/to_chars)
    // says "value is converted to a string as if by std::sprintf in the default ("C") locale"
    // and since sprintf promotes floats to doubles, I think this is exactly what 'std::to_chars'
    // does.
    // On the other hand, the documentation for 'std::to_chars' requires that "parsing the
    // representation using the corresponding std::from_chars function recovers value exactly". That
    // indicates that single precision floating-point numbers should be recovered using
    // 'std::strtof'.
    //
    // NB: If the neighbors are computed for single-precision numbers, there is a single float
    //     (7.0385307e-26f) which can't be recovered using strtod. The resulting double precision
    //     value is off by 1 ulp.
#if 0 // NOLINT(readability-avoid-unconditional-preprocessor-if)
    const boundaries w = compute_boundaries(static_cast<double>(value));
#else
    const boundaries w = compute_boundaries(value);
#endif

    grisu2(buf, len, decimal_exponent, w.minus, w.w, w.plus);
}

/*!
@brief appends a decimal representation of e to buf
@return a pointer to the element following the exponent.
@pre -1000 < e < 1000
*/
JSON_HEDLEY_NON_NULL(1)
JSON_HEDLEY_RETURNS_NON_NULL
inline char* append_exponent(char* buf, int e)
{
    JSON_ASSERT(e > -1000);
    JSON_ASSERT(e <  1000);

    if (e < 0)
    {
        e = -e;
        *buf++ = '-';
    }
    else
    {
        *buf++ = '+';
    }

    auto k = static_cast<std::uint32_t>(e);
    if (k < 10)
    {
        // Always print at least two digits in the exponent.
        // This is for compatibility with printf("%g").
        *buf++ = '0';
        *buf++ = static_cast<char>('0' + k);
    }
    else if (k < 100)
    {
        *buf++ = static_cast<char>('0' + k / 10);
        k %= 10;
        *buf++ = static_cast<char>('0' + k);
    }
    else
    {
        *buf++ = static_cast<char>('0' + k / 100);
        k %= 100;
        *buf++ = static_cast<char>('0' + k / 10);
        k %= 10;
        *buf++ = static_cast<char>('0' + k);
    }

    return buf;
}

/*!
@brief prettify v = buf * 10^decimal_exponent

If v is in the range [10^min_exp, 10^max_exp) it will be printed in fixed-point
notation. Otherwise it will be printed in exponential notation.

@pre min_exp < 0
@pre max_exp > 0
*/
JSON_HEDLEY_NON_NULL(1)
JSON_HEDLEY_RETURNS_NON_NULL
inline char* format_buffer(char* buf, int len, int decimal_exponent,
                           int min_exp, int max_exp)
{
    JSON_ASSERT(min_exp < 0);
    JSON_ASSERT(max_exp > 0);

    const int k = len;
    const int n = len + decimal_exponent;

    // v = buf * 10^(n-k)
    // k is the length of the buffer (number of decimal digits)
    // n is the position of the decimal point relative to the start of the buffer.

    if (k <= n && n <= max_exp)
    {
        // digits[000]
        // len <= max_exp + 2

        std::memset(buf + k, '0', static_cast<size_t>(n) - static_cast<size_t>(k));
        // Make it look like a floating-point number (#362, #378)
        buf[n + 0] = '.';
        buf[n + 1] = '0';
        return buf + (static_cast<size_t>(n) + 2);
    }

    if (0 < n && n <= max_exp)
    {
        // dig.its
        // len <= max_digits10 + 1

        JSON_ASSERT(k > n);

        std::memmove(buf + (static_cast<size_t>(n) + 1), buf + n, static_cast<size_t>(k) - static_cast<size_t>(n));
        buf[n] = '.';
        return buf + (static_cast<size_t>(k) + 1U);
    }

    if (min_exp < n && n <= 0)
    {
        // 0.[000]digits
        // len <= 2 + (-min_exp - 1) + max_digits10

        std::memmove(buf + (2 + static_cast<size_t>(-n)), buf, static_cast<size_t>(k));
        buf[0] = '0';
        buf[1] = '.';
        std::memset(buf + 2, '0', static_cast<size_t>(-n));
        return buf + (2U + static_cast<size_t>(-n) + static_cast<size_t>(k));
    }

    if (k == 1)
    {
        // dE+123
        // len <= 1 + 5

        buf += 1;
    }
    else
    {
        // d.igitsE+123
        // len <= max_digits10 + 1 + 5

        std::memmove(buf + 2, buf + 1, static_cast<size_t>(k) - 1);
        buf[1] = '.';
        buf += 1 + static_cast<size_t>(k);
    }

    *buf++ = 'e';
    return append_exponent(buf, n - 1);
}

}  // namespace dtoa_impl

/*!
@brief generates a decimal representation of the floating-point number value in [first, last).

The format of the resulting decimal representation is similar to printf's %g
format. Returns an iterator pointing past-the-end of the decimal representation.

@note The input number must be finite, i.e. NaN's and Inf's are not supported.
@note The buffer must be large enough.
@note The result is NOT null-terminated.
*/
template<typename FloatType>
JSON_HEDLEY_NON_NULL(1, 2)
JSON_HEDLEY_RETURNS_NON_NULL
char* to_chars(char* first, const char* last, FloatType value)
{
    static_cast<void>(last); // maybe unused - fix warning
    JSON_ASSERT(std::isfinite(value));

    // Use signbit(value) instead of (value < 0) since signbit works for -0.
    if (std::signbit(value))
    {
        value = -value;
        *first++ = '-';
    }

#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
    if (value == 0) // +-0
    {
        *first++ = '0';
        // Make it look like a floating-point number (#362, #378)
        *first++ = '.';
        *first++ = '0';
        return first;
    }
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif

    JSON_ASSERT(last - first >= std::numeric_limits<FloatType>::max_digits10);

    // Compute v = buffer * 10^decimal_exponent.
    // The decimal digits are stored in the buffer, which needs to be interpreted
    // as an unsigned decimal integer.
    // len is the length of the buffer, i.e. the number of decimal digits.
    int len = 0;
    int decimal_exponent = 0;
    dtoa_impl::grisu2(first, len, decimal_exponent, value);

    JSON_ASSERT(len <= std::numeric_limits<FloatType>::max_digits10);

    // Format the buffer like printf("%.*g", prec, value)
    constexpr int kMinExp = -4;
    // Use digits10 here to increase compatibility with version 2.
    constexpr int kMaxExp = std::numeric_limits<FloatType>::digits10;

    JSON_ASSERT(last - first >= kMaxExp + 2);
    JSON_ASSERT(last - first >= 2 + (-kMinExp - 1) + std::numeric_limits<FloatType>::max_digits10);
    JSON_ASSERT(last - first >= std::numeric_limits<FloatType>::max_digits10 + 6);

    return dtoa_impl::format_buffer(first, len, decimal_exponent, kMinExp, kMaxExp);
}

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/cpp_future.hpp>

// #include <nlohmann/detail/output/binary_writer.hpp>

// #include <nlohmann/detail/output/output_adapters.hpp>

// #include <nlohmann/detail/string_concat.hpp>

// #include <nlohmann/detail/value_t.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{

///////////////////
// serialization //
///////////////////

/// how to treat decoding errors
enum class error_handler_t
{
    strict,  ///< throw a type_error exception in case of invalid UTF-8
    replace, ///< replace invalid UTF-8 sequences with U+FFFD
    ignore   ///< ignore invalid UTF-8 sequences
};

template<typename BasicJsonType>
class serializer
{
    using string_t = typename BasicJsonType::string_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using binary_char_t = typename BasicJsonType::binary_t::value_type;
    static constexpr std::uint8_t UTF8_ACCEPT = 0;
    static constexpr std::uint8_t UTF8_REJECT = 1;

  public:
    /*!
    @param[in] s  output stream to serialize to
    @param[in] ichar  indentation character to use
    @param[in] error_handler_  how to react on decoding errors
    */
    serializer(output_adapter_t<char> s, const char ichar,
               error_handler_t error_handler_ = error_handler_t::strict)
        : o(std::move(s))
        , loc(std::localeconv())
        , thousands_sep(loc->thousands_sep == nullptr ? '\0' : std::char_traits<char>::to_char_type(* (loc->thousands_sep)))
        , decimal_point(loc->decimal_point == nullptr ? '\0' : std::char_traits<char>::to_char_type(* (loc->decimal_point)))
        , indent_char(ichar)
        , indent_string(512, indent_char)
        , error_handler(error_handler_)
    {}

    // delete because of pointer members
    serializer(const serializer&) = delete;
    serializer& operator=(const serializer&) = delete;
    serializer(serializer&&) = delete;
    serializer& operator=(serializer&&) = delete;
    ~serializer() = default;

    /*!
    @brief internal implementation of the serialization function

    This function is called by the public member function dump and organizes
    the serialization internally. The indentation level is propagated as
    additional parameter. In case of arrays and objects, the function is
    called recursively.

    - strings and object keys are escaped using `escape_string()`
    - integer numbers are converted implicitly via `operator<<`
    - floating-point numbers are converted to a string using `"%g"` format
    - binary values are serialized as objects containing the subtype and the
      byte array

    @param[in] val               value to serialize
    @param[in] pretty_print      whether the output shall be pretty-printed
    @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters
    in the output are escaped with `\uXXXX` sequences, and the result consists
    of ASCII characters only.
    @param[in] indent_step       the indent level
    @param[in] current_indent    the current indent level (only used internally)
    */
    void dump(const BasicJsonType& val,
              const bool pretty_print,
              const bool ensure_ascii,
              const unsigned int indent_step,
              const unsigned int current_indent = 0)
    {
        switch (val.m_data.m_type)
        {
            case value_t::object:
            {
                if (val.m_data.m_value.object->empty())
                {
                    o->write_characters("{}", 2);
                    return;
                }

                if (pretty_print)
                {
                    o->write_characters("{\n", 2);

                    // variable to hold indentation for recursive calls
                    const auto new_indent = current_indent + indent_step;
                    if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent))
                    {
                        indent_string.resize(indent_string.size() * 2, ' ');
                    }

                    // first n-1 elements
                    auto i = val.m_data.m_value.object->cbegin();
                    for (std::size_t cnt = 0; cnt < val.m_data.m_value.object->size() - 1; ++cnt, ++i)
                    {
                        o->write_characters(indent_string.c_str(), new_indent);
                        o->write_character('\"');
                        dump_escaped(i->first, ensure_ascii);
                        o->write_characters("\": ", 3);
                        dump(i->second, true, ensure_ascii, indent_step, new_indent);
                        o->write_characters(",\n", 2);
                    }

                    // last element
                    JSON_ASSERT(i != val.m_data.m_value.object->cend());
                    JSON_ASSERT(std::next(i) == val.m_data.m_value.object->cend());
                    o->write_characters(indent_string.c_str(), new_indent);
                    o->write_character('\"');
                    dump_escaped(i->first, ensure_ascii);
                    o->write_characters("\": ", 3);
                    dump(i->second, true, ensure_ascii, indent_step, new_indent);

                    o->write_character('\n');
                    o->write_characters(indent_string.c_str(), current_indent);
                    o->write_character('}');
                }
                else
                {
                    o->write_character('{');

                    // first n-1 elements
                    auto i = val.m_data.m_value.object->cbegin();
                    for (std::size_t cnt = 0; cnt < val.m_data.m_value.object->size() - 1; ++cnt, ++i)
                    {
                        o->write_character('\"');
                        dump_escaped(i->first, ensure_ascii);
                        o->write_characters("\":", 2);
                        dump(i->second, false, ensure_ascii, indent_step, current_indent);
                        o->write_character(',');
                    }

                    // last element
                    JSON_ASSERT(i != val.m_data.m_value.object->cend());
                    JSON_ASSERT(std::next(i) == val.m_data.m_value.object->cend());
                    o->write_character('\"');
                    dump_escaped(i->first, ensure_ascii);
                    o->write_characters("\":", 2);
                    dump(i->second, false, ensure_ascii, indent_step, current_indent);

                    o->write_character('}');
                }

                return;
            }

            case value_t::array:
            {
                if (val.m_data.m_value.array->empty())
                {
                    o->write_characters("[]", 2);
                    return;
                }

                if (pretty_print)
                {
                    o->write_characters("[\n", 2);

                    // variable to hold indentation for recursive calls
                    const auto new_indent = current_indent + indent_step;
                    if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent))
                    {
                        indent_string.resize(indent_string.size() * 2, ' ');
                    }

                    // first n-1 elements
                    for (auto i = val.m_data.m_value.array->cbegin();
                            i != val.m_data.m_value.array->cend() - 1; ++i)
                    {
                        o->write_characters(indent_string.c_str(), new_indent);
                        dump(*i, true, ensure_ascii, indent_step, new_indent);
                        o->write_characters(",\n", 2);
                    }

                    // last element
                    JSON_ASSERT(!val.m_data.m_value.array->empty());
                    o->write_characters(indent_string.c_str(), new_indent);
                    dump(val.m_data.m_value.array->back(), true, ensure_ascii, indent_step, new_indent);

                    o->write_character('\n');
                    o->write_characters(indent_string.c_str(), current_indent);
                    o->write_character(']');
                }
                else
                {
                    o->write_character('[');

                    // first n-1 elements
                    for (auto i = val.m_data.m_value.array->cbegin();
                            i != val.m_data.m_value.array->cend() - 1; ++i)
                    {
                        dump(*i, false, ensure_ascii, indent_step, current_indent);
                        o->write_character(',');
                    }

                    // last element
                    JSON_ASSERT(!val.m_data.m_value.array->empty());
                    dump(val.m_data.m_value.array->back(), false, ensure_ascii, indent_step, current_indent);

                    o->write_character(']');
                }

                return;
            }

            case value_t::string:
            {
                o->write_character('\"');
                dump_escaped(*val.m_data.m_value.string, ensure_ascii);
                o->write_character('\"');
                return;
            }

            case value_t::binary:
            {
                if (pretty_print)
                {
                    o->write_characters("{\n", 2);

                    // variable to hold indentation for recursive calls
                    const auto new_indent = current_indent + indent_step;
                    if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent))
                    {
                        indent_string.resize(indent_string.size() * 2, ' ');
                    }

                    o->write_characters(indent_string.c_str(), new_indent);

                    o->write_characters("\"bytes\": [", 10);

                    if (!val.m_data.m_value.binary->empty())
                    {
                        for (auto i = val.m_data.m_value.binary->cbegin();
                                i != val.m_data.m_value.binary->cend() - 1; ++i)
                        {
                            dump_integer(*i);
                            o->write_characters(", ", 2);
                        }
                        dump_integer(val.m_data.m_value.binary->back());
                    }

                    o->write_characters("],\n", 3);
                    o->write_characters(indent_string.c_str(), new_indent);

                    o->write_characters("\"subtype\": ", 11);
                    if (val.m_data.m_value.binary->has_subtype())
                    {
                        dump_integer(val.m_data.m_value.binary->subtype());
                    }
                    else
                    {
                        o->write_characters("null", 4);
                    }
                    o->write_character('\n');
                    o->write_characters(indent_string.c_str(), current_indent);
                    o->write_character('}');
                }
                else
                {
                    o->write_characters("{\"bytes\":[", 10);

                    if (!val.m_data.m_value.binary->empty())
                    {
                        for (auto i = val.m_data.m_value.binary->cbegin();
                                i != val.m_data.m_value.binary->cend() - 1; ++i)
                        {
                            dump_integer(*i);
                            o->write_character(',');
                        }
                        dump_integer(val.m_data.m_value.binary->back());
                    }

                    o->write_characters("],\"subtype\":", 12);
                    if (val.m_data.m_value.binary->has_subtype())
                    {
                        dump_integer(val.m_data.m_value.binary->subtype());
                        o->write_character('}');
                    }
                    else
                    {
                        o->write_characters("null}", 5);
                    }
                }
                return;
            }

            case value_t::boolean:
            {
                if (val.m_data.m_value.boolean)
                {
                    o->write_characters("true", 4);
                }
                else
                {
                    o->write_characters("false", 5);
                }
                return;
            }

            case value_t::number_integer:
            {
                dump_integer(val.m_data.m_value.number_integer);
                return;
            }

            case value_t::number_unsigned:
            {
                dump_integer(val.m_data.m_value.number_unsigned);
                return;
            }

            case value_t::number_float:
            {
                dump_float(val.m_data.m_value.number_float);
                return;
            }

            case value_t::discarded:
            {
                o->write_characters("<discarded>", 11);
                return;
            }

            case value_t::null:
            {
                o->write_characters("null", 4);
                return;
            }

            default:            // LCOV_EXCL_LINE
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
        }
    }

  JSON_PRIVATE_UNLESS_TESTED:
    /*!
    @brief dump escaped string

    Escape a string by replacing certain special characters by a sequence of an
    escape character (backslash) and another character and other control
    characters by a sequence of "\u" followed by a four-digit hex
    representation. The escaped string is written to output stream @a o.

    @param[in] s  the string to escape
    @param[in] ensure_ascii  whether to escape non-ASCII characters with
                             \uXXXX sequences

    @complexity Linear in the length of string @a s.
    */
    void dump_escaped(const string_t& s, const bool ensure_ascii)
    {
        std::uint32_t codepoint{};
        std::uint8_t state = UTF8_ACCEPT;
        std::size_t bytes = 0;  // number of bytes written to string_buffer

        // number of bytes written at the point of the last valid byte
        std::size_t bytes_after_last_accept = 0;
        std::size_t undumped_chars = 0;

        for (std::size_t i = 0; i < s.size(); ++i)
        {
            const auto byte = static_cast<std::uint8_t>(s[i]);

            switch (decode(state, codepoint, byte))
            {
                case UTF8_ACCEPT:  // decode found a new code point
                {
                    switch (codepoint)
                    {
                        case 0x08: // backspace
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = 'b';
                            break;
                        }

                        case 0x09: // horizontal tab
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = 't';
                            break;
                        }

                        case 0x0A: // newline
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = 'n';
                            break;
                        }

                        case 0x0C: // formfeed
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = 'f';
                            break;
                        }

                        case 0x0D: // carriage return
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = 'r';
                            break;
                        }

                        case 0x22: // quotation mark
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = '\"';
                            break;
                        }

                        case 0x5C: // reverse solidus
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = '\\';
                            break;
                        }

                        default:
                        {
                            // escape control characters (0x00..0x1F) or, if
                            // ensure_ascii parameter is used, non-ASCII characters
                            if ((codepoint <= 0x1F) || (ensure_ascii && (codepoint >= 0x7F)))
                            {
                                if (codepoint <= 0xFFFF)
                                {
                                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
                                    static_cast<void>((std::snprintf)(string_buffer.data() + bytes, 7, "\\u%04x",
                                                                      static_cast<std::uint16_t>(codepoint)));
                                    bytes += 6;
                                }
                                else
                                {
                                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
                                    static_cast<void>((std::snprintf)(string_buffer.data() + bytes, 13, "\\u%04x\\u%04x",
                                                                      static_cast<std::uint16_t>(0xD7C0u + (codepoint >> 10u)),
                                                                      static_cast<std::uint16_t>(0xDC00u + (codepoint & 0x3FFu))));
                                    bytes += 12;
                                }
                            }
                            else
                            {
                                // copy byte to buffer (all previous bytes
                                // been copied have in default case above)
                                string_buffer[bytes++] = s[i];
                            }
                            break;
                        }
                    }

                    // write buffer and reset index; there must be 13 bytes
                    // left, as this is the maximal number of bytes to be
                    // written ("\uxxxx\uxxxx\0") for one code point
                    if (string_buffer.size() - bytes < 13)
                    {
                        o->write_characters(string_buffer.data(), bytes);
                        bytes = 0;
                    }

                    // remember the byte position of this accept
                    bytes_after_last_accept = bytes;
                    undumped_chars = 0;
                    break;
                }

                case UTF8_REJECT:  // decode found invalid UTF-8 byte
                {
                    switch (error_handler)
                    {
                        case error_handler_t::strict:
                        {
                            JSON_THROW(type_error::create(316, concat("invalid UTF-8 byte at index ", std::to_string(i), ": 0x", hex_bytes(byte | 0)), nullptr));
                        }

                        case error_handler_t::ignore:
                        case error_handler_t::replace:
                        {
                            // in case we saw this character the first time, we
                            // would like to read it again, because the byte
                            // may be OK for itself, but just not OK for the
                            // previous sequence
                            if (undumped_chars > 0)
                            {
                                --i;
                            }

                            // reset length buffer to the last accepted index;
                            // thus removing/ignoring the invalid characters
                            bytes = bytes_after_last_accept;

                            if (error_handler == error_handler_t::replace)
                            {
                                // add a replacement character
                                if (ensure_ascii)
                                {
                                    string_buffer[bytes++] = '\\';
                                    string_buffer[bytes++] = 'u';
                                    string_buffer[bytes++] = 'f';
                                    string_buffer[bytes++] = 'f';
                                    string_buffer[bytes++] = 'f';
                                    string_buffer[bytes++] = 'd';
                                }
                                else
                                {
                                    string_buffer[bytes++] = detail::binary_writer<BasicJsonType, char>::to_char_type('\xEF');
                                    string_buffer[bytes++] = detail::binary_writer<BasicJsonType, char>::to_char_type('\xBF');
                                    string_buffer[bytes++] = detail::binary_writer<BasicJsonType, char>::to_char_type('\xBD');
                                }

                                // write buffer and reset index; there must be 13 bytes
                                // left, as this is the maximal number of bytes to be
                                // written ("\uxxxx\uxxxx\0") for one code point
                                if (string_buffer.size() - bytes < 13)
                                {
                                    o->write_characters(string_buffer.data(), bytes);
                                    bytes = 0;
                                }

                                bytes_after_last_accept = bytes;
                            }

                            undumped_chars = 0;

                            // continue processing the string
                            state = UTF8_ACCEPT;
                            break;
                        }

                        default:            // LCOV_EXCL_LINE
                            JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
                    }
                    break;
                }

                default:  // decode found yet incomplete multi-byte code point
                {
                    if (!ensure_ascii)
                    {
                        // code point will not be escaped - copy byte to buffer
                        string_buffer[bytes++] = s[i];
                    }
                    ++undumped_chars;
                    break;
                }
            }
        }

        // we finished processing the string
        if (JSON_HEDLEY_LIKELY(state == UTF8_ACCEPT))
        {
            // write buffer
            if (bytes > 0)
            {
                o->write_characters(string_buffer.data(), bytes);
            }
        }
        else
        {
            // we finish reading, but do not accept: string was incomplete
            switch (error_handler)
            {
                case error_handler_t::strict:
                {
                    JSON_THROW(type_error::create(316, concat("incomplete UTF-8 string; last byte: 0x", hex_bytes(static_cast<std::uint8_t>(s.back() | 0))), nullptr));
                }

                case error_handler_t::ignore:
                {
                    // write all accepted bytes
                    o->write_characters(string_buffer.data(), bytes_after_last_accept);
                    break;
                }

                case error_handler_t::replace:
                {
                    // write all accepted bytes
                    o->write_characters(string_buffer.data(), bytes_after_last_accept);
                    // add a replacement character
                    if (ensure_ascii)
                    {
                        o->write_characters("\\ufffd", 6);
                    }
                    else
                    {
                        o->write_characters("\xEF\xBF\xBD", 3);
                    }
                    break;
                }

                default:            // LCOV_EXCL_LINE
                    JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
            }
        }
    }

  private:
    /*!
    @brief count digits

    Count the number of decimal (base 10) digits for an input unsigned integer.

    @param[in] x  unsigned integer number to count its digits
    @return    number of decimal digits
    */
    inline unsigned int count_digits(number_unsigned_t x) noexcept
    {
        unsigned int n_digits = 1;
        for (;;)
        {
            if (x < 10)
            {
                return n_digits;
            }
            if (x < 100)
            {
                return n_digits + 1;
            }
            if (x < 1000)
            {
                return n_digits + 2;
            }
            if (x < 10000)
            {
                return n_digits + 3;
            }
            x = x / 10000u;
            n_digits += 4;
        }
    }

    /*!
     * @brief convert a byte to a uppercase hex representation
     * @param[in] byte byte to represent
     * @return representation ("00".."FF")
     */
    static std::string hex_bytes(std::uint8_t byte)
    {
        std::string result = "FF";
        constexpr const char* nibble_to_hex = "0123456789ABCDEF";
        result[0] = nibble_to_hex[byte / 16];
        result[1] = nibble_to_hex[byte % 16];
        return result;
    }

    // templates to avoid warnings about useless casts
    template <typename NumberType, enable_if_t<std::is_signed<NumberType>::value, int> = 0>
    bool is_negative_number(NumberType x)
    {
        return x < 0;
    }

    template < typename NumberType, enable_if_t <std::is_unsigned<NumberType>::value, int > = 0 >
    bool is_negative_number(NumberType /*unused*/)
    {
        return false;
    }

    /*!
    @brief dump an integer

    Dump a given integer to output stream @a o. Works internally with
    @a number_buffer.

    @param[in] x  integer number (signed or unsigned) to dump
    @tparam NumberType either @a number_integer_t or @a number_unsigned_t
    */
    template < typename NumberType, detail::enable_if_t <
                   std::is_integral<NumberType>::value ||
                   std::is_same<NumberType, number_unsigned_t>::value ||
                   std::is_same<NumberType, number_integer_t>::value ||
                   std::is_same<NumberType, binary_char_t>::value,
                   int > = 0 >
    void dump_integer(NumberType x)
    {
        static constexpr std::array<std::array<char, 2>, 100> digits_to_99
        {
            {
                {{'0', '0'}}, {{'0', '1'}}, {{'0', '2'}}, {{'0', '3'}}, {{'0', '4'}}, {{'0', '5'}}, {{'0', '6'}}, {{'0', '7'}}, {{'0', '8'}}, {{'0', '9'}},
                {{'1', '0'}}, {{'1', '1'}}, {{'1', '2'}}, {{'1', '3'}}, {{'1', '4'}}, {{'1', '5'}}, {{'1', '6'}}, {{'1', '7'}}, {{'1', '8'}}, {{'1', '9'}},
                {{'2', '0'}}, {{'2', '1'}}, {{'2', '2'}}, {{'2', '3'}}, {{'2', '4'}}, {{'2', '5'}}, {{'2', '6'}}, {{'2', '7'}}, {{'2', '8'}}, {{'2', '9'}},
                {{'3', '0'}}, {{'3', '1'}}, {{'3', '2'}}, {{'3', '3'}}, {{'3', '4'}}, {{'3', '5'}}, {{'3', '6'}}, {{'3', '7'}}, {{'3', '8'}}, {{'3', '9'}},
                {{'4', '0'}}, {{'4', '1'}}, {{'4', '2'}}, {{'4', '3'}}, {{'4', '4'}}, {{'4', '5'}}, {{'4', '6'}}, {{'4', '7'}}, {{'4', '8'}}, {{'4', '9'}},
                {{'5', '0'}}, {{'5', '1'}}, {{'5', '2'}}, {{'5', '3'}}, {{'5', '4'}}, {{'5', '5'}}, {{'5', '6'}}, {{'5', '7'}}, {{'5', '8'}}, {{'5', '9'}},
                {{'6', '0'}}, {{'6', '1'}}, {{'6', '2'}}, {{'6', '3'}}, {{'6', '4'}}, {{'6', '5'}}, {{'6', '6'}}, {{'6', '7'}}, {{'6', '8'}}, {{'6', '9'}},
                {{'7', '0'}}, {{'7', '1'}}, {{'7', '2'}}, {{'7', '3'}}, {{'7', '4'}}, {{'7', '5'}}, {{'7', '6'}}, {{'7', '7'}}, {{'7', '8'}}, {{'7', '9'}},
                {{'8', '0'}}, {{'8', '1'}}, {{'8', '2'}}, {{'8', '3'}}, {{'8', '4'}}, {{'8', '5'}}, {{'8', '6'}}, {{'8', '7'}}, {{'8', '8'}}, {{'8', '9'}},
                {{'9', '0'}}, {{'9', '1'}}, {{'9', '2'}}, {{'9', '3'}}, {{'9', '4'}}, {{'9', '5'}}, {{'9', '6'}}, {{'9', '7'}}, {{'9', '8'}}, {{'9', '9'}},
            }
        };

        // special case for "0"
        if (x == 0)
        {
            o->write_character('0');
            return;
        }

        // use a pointer to fill the buffer
        auto buffer_ptr = number_buffer.begin(); // NOLINT(llvm-qualified-auto,readability-qualified-auto,cppcoreguidelines-pro-type-vararg,hicpp-vararg)

        number_unsigned_t abs_value;

        unsigned int n_chars{};

        if (is_negative_number(x))
        {
            *buffer_ptr = '-';
            abs_value = remove_sign(static_cast<number_integer_t>(x));

            // account one more byte for the minus sign
            n_chars = 1 + count_digits(abs_value);
        }
        else
        {
            abs_value = static_cast<number_unsigned_t>(x);
            n_chars = count_digits(abs_value);
        }

        // spare 1 byte for '\0'
        JSON_ASSERT(n_chars < number_buffer.size() - 1);

        // jump to the end to generate the string from backward,
        // so we later avoid reversing the result
        buffer_ptr += n_chars;

        // Fast int2ascii implementation inspired by "Fastware" talk by Andrei Alexandrescu
        // See: https://www.youtube.com/watch?v=o4-CwDo2zpg
        while (abs_value >= 100)
        {
            const auto digits_index = static_cast<unsigned>((abs_value % 100));
            abs_value /= 100;
            *(--buffer_ptr) = digits_to_99[digits_index][1];
            *(--buffer_ptr) = digits_to_99[digits_index][0];
        }

        if (abs_value >= 10)
        {
            const auto digits_index = static_cast<unsigned>(abs_value);
            *(--buffer_ptr) = digits_to_99[digits_index][1];
            *(--buffer_ptr) = digits_to_99[digits_index][0];
        }
        else
        {
            *(--buffer_ptr) = static_cast<char>('0' + abs_value);
        }

        o->write_characters(number_buffer.data(), n_chars);
    }

    /*!
    @brief dump a floating-point number

    Dump a given floating-point number to output stream @a o. Works internally
    with @a number_buffer.

    @param[in] x  floating-point number to dump
    */
    void dump_float(number_float_t x)
    {
        // NaN / inf
        if (!std::isfinite(x))
        {
            o->write_characters("null", 4);
            return;
        }

        // If number_float_t is an IEEE-754 single or double precision number,
        // use the Grisu2 algorithm to produce short numbers which are
        // guaranteed to round-trip, using strtof and strtod, resp.
        //
        // NB: The test below works if <long double> == <double>.
        static constexpr bool is_ieee_single_or_double
            = (std::numeric_limits<number_float_t>::is_iec559 && std::numeric_limits<number_float_t>::digits == 24 && std::numeric_limits<number_float_t>::max_exponent == 128) ||
              (std::numeric_limits<number_float_t>::is_iec559 && std::numeric_limits<number_float_t>::digits == 53 && std::numeric_limits<number_float_t>::max_exponent == 1024);

        dump_float(x, std::integral_constant<bool, is_ieee_single_or_double>());
    }

    void dump_float(number_float_t x, std::true_type /*is_ieee_single_or_double*/)
    {
        auto* begin = number_buffer.data();
        auto* end = ::nlohmann::detail::to_chars(begin, begin + number_buffer.size(), x);

        o->write_characters(begin, static_cast<size_t>(end - begin));
    }

    void dump_float(number_float_t x, std::false_type /*is_ieee_single_or_double*/)
    {
        // get number of digits for a float -> text -> float round-trip
        static constexpr auto d = std::numeric_limits<number_float_t>::max_digits10;

        // the actual conversion
        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
        std::ptrdiff_t len = (std::snprintf)(number_buffer.data(), number_buffer.size(), "%.*g", d, x);

        // negative value indicates an error
        JSON_ASSERT(len > 0);
        // check if buffer was large enough
        JSON_ASSERT(static_cast<std::size_t>(len) < number_buffer.size());

        // erase thousands separator
        if (thousands_sep != '\0')
        {
            // NOLINTNEXTLINE(readability-qualified-auto,llvm-qualified-auto): std::remove returns an iterator, see https://github.com/nlohmann/json/issues/3081
            const auto end = std::remove(number_buffer.begin(), number_buffer.begin() + len, thousands_sep);
            std::fill(end, number_buffer.end(), '\0');
            JSON_ASSERT((end - number_buffer.begin()) <= len);
            len = (end - number_buffer.begin());
        }

        // convert decimal point to '.'
        if (decimal_point != '\0' && decimal_point != '.')
        {
            // NOLINTNEXTLINE(readability-qualified-auto,llvm-qualified-auto): std::find returns an iterator, see https://github.com/nlohmann/json/issues/3081
            const auto dec_pos = std::find(number_buffer.begin(), number_buffer.end(), decimal_point);
            if (dec_pos != number_buffer.end())
            {
                *dec_pos = '.';
            }
        }

        o->write_characters(number_buffer.data(), static_cast<std::size_t>(len));

        // determine if we need to append ".0"
        const bool value_is_int_like =
            std::none_of(number_buffer.begin(), number_buffer.begin() + len + 1,
                         [](char c)
        {
            return c == '.' || c == 'e';
        });

        if (value_is_int_like)
        {
            o->write_characters(".0", 2);
        }
    }

    /*!
    @brief check whether a string is UTF-8 encoded

    The function checks each byte of a string whether it is UTF-8 encoded. The
    result of the check is stored in the @a state parameter. The function must
    be called initially with state 0 (accept). State 1 means the string must
    be rejected, because the current byte is not allowed. If the string is
    completely processed, but the state is non-zero, the string ended
    prematurely; that is, the last byte indicated more bytes should have
    followed.

    @param[in,out] state  the state of the decoding
    @param[in,out] codep  codepoint (valid only if resulting state is UTF8_ACCEPT)
    @param[in] byte       next byte to decode
    @return               new state

    @note The function has been edited: a std::array is used.

    @copyright Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
    @sa http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
    */
    static std::uint8_t decode(std::uint8_t& state, std::uint32_t& codep, const std::uint8_t byte) noexcept
    {
        static const std::array<std::uint8_t, 400> utf8d =
        {
            {
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1F
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3F
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5F
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7F
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9F
                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // A0..BF
                8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..DF
                0xA, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // E0..EF
                0xB, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // F0..FF
                0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
                1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
                1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
                1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // s7..s8
            }
        };

        JSON_ASSERT(byte < utf8d.size());
        const std::uint8_t type = utf8d[byte];

        codep = (state != UTF8_ACCEPT)
                ? (byte & 0x3fu) | (codep << 6u)
                : (0xFFu >> type) & (byte);

        const std::size_t index = 256u + static_cast<size_t>(state) * 16u + static_cast<size_t>(type);
        JSON_ASSERT(index < utf8d.size());
        state = utf8d[index];
        return state;
    }

    /*
     * Overload to make the compiler happy while it is instantiating
     * dump_integer for number_unsigned_t.
     * Must never be called.
     */
    number_unsigned_t remove_sign(number_unsigned_t x)
    {
        JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
        return x; // LCOV_EXCL_LINE
    }

    /*
     * Helper function for dump_integer
     *
     * This function takes a negative signed integer and returns its absolute
     * value as unsigned integer. The plus/minus shuffling is necessary as we can
     * not directly remove the sign of an arbitrary signed integer as the
     * absolute values of INT_MIN and INT_MAX are usually not the same. See
     * #1708 for details.
     */
    inline number_unsigned_t remove_sign(number_integer_t x) noexcept
    {
        JSON_ASSERT(x < 0 && x < (std::numeric_limits<number_integer_t>::max)()); // NOLINT(misc-redundant-expression)
        return static_cast<number_unsigned_t>(-(x + 1)) + 1;
    }

  private:
    /// the output of the serializer
    output_adapter_t<char> o = nullptr;

    /// a (hopefully) large enough character buffer
    std::array<char, 64> number_buffer{{}};

    /// the locale
    const std::lconv* loc = nullptr;
    /// the locale's thousand separator character
    const char thousands_sep = '\0';
    /// the locale's decimal point character
    const char decimal_point = '\0';

    /// string buffer
    std::array<char, 512> string_buffer{{}};

    /// the indentation character
    const char indent_char;
    /// the indentation string
    string_t indent_string;

    /// error_handler how to react on decoding errors
    const error_handler_t error_handler;
};

}  // namespace detail
NLOHMANN_JSON_NAMESPACE_END

// #include <nlohmann/detail/value_t.hpp>

// #include <nlohmann/json_fwd.hpp>

// #include <nlohmann/ordered_map.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#include <functional> // equal_to, less
#include <initializer_list> // initializer_list
#include <iterator> // input_iterator_tag, iterator_traits
#include <memory> // allocator
#include <stdexcept> // for out_of_range
#include <type_traits> // enable_if, is_convertible
#include <utility> // pair
#include <vector> // vector

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/type_traits.hpp>


NLOHMANN_JSON_NAMESPACE_BEGIN

/// ordered_map: a minimal map-like container that preserves insertion order
/// for use within nlohmann::basic_json<ordered_map>
template <class Key, class T, class IgnoredLess = std::less<Key>,
          class Allocator = std::allocator<std::pair<const Key, T>>>
                  struct ordered_map : std::vector<std::pair<const Key, T>, Allocator>
{
    using key_type = Key;
    using mapped_type = T;
    using Container = std::vector<std::pair<const Key, T>, Allocator>;
    using iterator = typename Container::iterator;
    using const_iterator = typename Container::const_iterator;
    using size_type = typename Container::size_type;
    using value_type = typename Container::value_type;
#ifdef JSON_HAS_CPP_14
    using key_compare = std::equal_to<>;
#else
    using key_compare = std::equal_to<Key>;
#endif

    // Explicit constructors instead of `using Container::Container`
    // otherwise older compilers choke on it (GCC <= 5.5, xcode <= 9.4)
    ordered_map() noexcept(noexcept(Container())) : Container{} {}
    explicit ordered_map(const Allocator& alloc) noexcept(noexcept(Container(alloc))) : Container{alloc} {}
    template <class It>
    ordered_map(It first, It last, const Allocator& alloc = Allocator())
        : Container{first, last, alloc} {}
    ordered_map(std::initializer_list<value_type> init, const Allocator& alloc = Allocator() )
        : Container{init, alloc} {}

    std::pair<iterator, bool> emplace(const key_type& key, T&& t)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, key))
            {
                return {it, false};
            }
        }
        Container::emplace_back(key, std::forward<T>(t));
        return {std::prev(this->end()), true};
    }

    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_key_type<key_compare, key_type, KeyType>::value, int> = 0>
    std::pair<iterator, bool> emplace(KeyType && key, T && t)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, key))
            {
                return {it, false};
            }
        }
        Container::emplace_back(std::forward<KeyType>(key), std::forward<T>(t));
        return {std::prev(this->end()), true};
    }

    T& operator[](const key_type& key)
    {
        return emplace(key, T{}).first->second;
    }

    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_key_type<key_compare, key_type, KeyType>::value, int> = 0>
    T & operator[](KeyType && key)
    {
        return emplace(std::forward<KeyType>(key), T{}).first->second;
    }

    const T& operator[](const key_type& key) const
    {
        return at(key);
    }

    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_key_type<key_compare, key_type, KeyType>::value, int> = 0>
    const T & operator[](KeyType && key) const
    {
        return at(std::forward<KeyType>(key));
    }

    T& at(const key_type& key)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, key))
            {
                return it->second;
            }
        }

        JSON_THROW(std::out_of_range("key not found"));
    }

    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_key_type<key_compare, key_type, KeyType>::value, int> = 0>
    T & at(KeyType && key) // NOLINT(cppcoreguidelines-missing-std-forward)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, key))
            {
                return it->second;
            }
        }

        JSON_THROW(std::out_of_range("key not found"));
    }

    const T& at(const key_type& key) const
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, key))
            {
                return it->second;
            }
        }

        JSON_THROW(std::out_of_range("key not found"));
    }

    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_key_type<key_compare, key_type, KeyType>::value, int> = 0>
    const T & at(KeyType && key) const // NOLINT(cppcoreguidelines-missing-std-forward)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, key))
            {
                return it->second;
            }
        }

        JSON_THROW(std::out_of_range("key not found"));
    }

    size_type erase(const key_type& key)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, key))
            {
                // Since we cannot move const Keys, re-construct them in place
                for (auto next = it; ++next != this->end(); ++it)
                {
                    it->~value_type(); // Destroy but keep allocation
                    new (&*it) value_type{std::move(*next)};
                }
                Container::pop_back();
                return 1;
            }
        }
        return 0;
    }

    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_key_type<key_compare, key_type, KeyType>::value, int> = 0>
    size_type erase(KeyType && key) // NOLINT(cppcoreguidelines-missing-std-forward)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, key))
            {
                // Since we cannot move const Keys, re-construct them in place
                for (auto next = it; ++next != this->end(); ++it)
                {
                    it->~value_type(); // Destroy but keep allocation
                    new (&*it) value_type{std::move(*next)};
                }
                Container::pop_back();
                return 1;
            }
        }
        return 0;
    }

    iterator erase(iterator pos)
    {
        return erase(pos, std::next(pos));
    }

    iterator erase(iterator first, iterator last)
    {
        if (first == last)
        {
            return first;
        }

        const auto elements_affected = std::distance(first, last);
        const auto offset = std::distance(Container::begin(), first);

        // This is the start situation. We need to delete elements_affected
        // elements (3 in this example: e, f, g), and need to return an
        // iterator past the last deleted element (h in this example).
        // Note that offset is the distance from the start of the vector
        // to first. We will need this later.

        // [ a, b, c, d, e, f, g, h, i, j ]
        //               ^        ^
        //             first    last

        // Since we cannot move const Keys, we re-construct them in place.
        // We start at first and re-construct (viz. copy) the elements from
        // the back of the vector. Example for first iteration:

        //               ,--------.
        //               v        |   destroy e and re-construct with h
        // [ a, b, c, d, e, f, g, h, i, j ]
        //               ^        ^
        //               it       it + elements_affected

        for (auto it = first; std::next(it, elements_affected) != Container::end(); ++it)
        {
            it->~value_type(); // destroy but keep allocation
            new (&*it) value_type{std::move(*std::next(it, elements_affected))}; // "move" next element to it
        }

        // [ a, b, c, d, h, i, j, h, i, j ]
        //               ^        ^
        //             first    last

        // remove the unneeded elements at the end of the vector
        Container::resize(this->size() - static_cast<size_type>(elements_affected));

        // [ a, b, c, d, h, i, j ]
        //               ^        ^
        //             first    last

        // first is now pointing past the last deleted element, but we cannot
        // use this iterator, because it may have been invalidated by the
        // resize call. Instead, we can return begin() + offset.
        return Container::begin() + offset;
    }

    size_type count(const key_type& key) const
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, key))
            {
                return 1;
            }
        }
        return 0;
    }

    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_key_type<key_compare, key_type, KeyType>::value, int> = 0>
    size_type count(KeyType && key) const // NOLINT(cppcoreguidelines-missing-std-forward)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, key))
            {
                return 1;
            }
        }
        return 0;
    }

    iterator find(const key_type& key)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, key))
            {
                return it;
            }
        }
        return Container::end();
    }

    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_key_type<key_compare, key_type, KeyType>::value, int> = 0>
    iterator find(KeyType && key) // NOLINT(cppcoreguidelines-missing-std-forward)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, key))
            {
                return it;
            }
        }
        return Container::end();
    }

    const_iterator find(const key_type& key) const
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, key))
            {
                return it;
            }
        }
        return Container::end();
    }

    std::pair<iterator, bool> insert( value_type&& value )
    {
        return emplace(value.first, std::move(value.second));
    }

    std::pair<iterator, bool> insert( const value_type& value )
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (m_compare(it->first, value.first))
            {
                return {it, false};
            }
        }
        Container::push_back(value);
        return {--this->end(), true};
    }

    template<typename InputIt>
    using require_input_iter = typename std::enable_if<std::is_convertible<typename std::iterator_traits<InputIt>::iterator_category,
            std::input_iterator_tag>::value>::type;

    template<typename InputIt, typename = require_input_iter<InputIt>>
    void insert(InputIt first, InputIt last)
    {
        for (auto it = first; it != last; ++it)
        {
            insert(*it);
        }
    }

private:
    JSON_NO_UNIQUE_ADDRESS key_compare m_compare = key_compare();
};

NLOHMANN_JSON_NAMESPACE_END


#if defined(JSON_HAS_CPP_17)
    #if JSON_HAS_STATIC_RTTI
        #include <any>
    #endif
    #include <string_view>
#endif

/*!
@brief namespace for Niels Lohmann
@see https://github.com/nlohmann
@since version 1.0.0
*/
NLOHMANN_JSON_NAMESPACE_BEGIN

/*!
@brief a class to store JSON values

@internal
@invariant The member variables @a m_value and @a m_type have the following
relationship:
- If `m_type == value_t::object`, then `m_value.object != nullptr`.
- If `m_type == value_t::array`, then `m_value.array != nullptr`.
- If `m_type == value_t::string`, then `m_value.string != nullptr`.
The invariants are checked by member function assert_invariant().

@note ObjectType trick from https://stackoverflow.com/a/9860911
@endinternal

@since version 1.0.0

@nosubgrouping
*/
NLOHMANN_BASIC_JSON_TPL_DECLARATION
class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-special-member-functions)
    : public ::nlohmann::detail::json_base_class<CustomBaseClass>
{
  private:
    template<detail::value_t> friend struct detail::external_constructor;

    template<typename>
    friend class ::nlohmann::json_pointer;
    // can be restored when json_pointer backwards compatibility is removed
    // friend ::nlohmann::json_pointer<StringType>;

    template<typename BasicJsonType, typename InputType>
    friend class ::nlohmann::detail::parser;
    friend ::nlohmann::detail::serializer<basic_json>;
    template<typename BasicJsonType>
    friend class ::nlohmann::detail::iter_impl;
    template<typename BasicJsonType, typename CharType>
    friend class ::nlohmann::detail::binary_writer;
    template<typename BasicJsonType, typename InputType, typename SAX>
    friend class ::nlohmann::detail::binary_reader;
    template<typename BasicJsonType>
    friend class ::nlohmann::detail::json_sax_dom_parser;
    template<typename BasicJsonType>
    friend class ::nlohmann::detail::json_sax_dom_callback_parser;
    friend class ::nlohmann::detail::exception;

    /// workaround type for MSVC
    using basic_json_t = NLOHMANN_BASIC_JSON_TPL;
    using json_base_class_t = ::nlohmann::detail::json_base_class<CustomBaseClass>;

  JSON_PRIVATE_UNLESS_TESTED:
    // convenience aliases for types residing in namespace detail;
    using lexer = ::nlohmann::detail::lexer_base<basic_json>;

    template<typename InputAdapterType>
    static ::nlohmann::detail::parser<basic_json, InputAdapterType> parser(
        InputAdapterType adapter,
        detail::parser_callback_t<basic_json>cb = nullptr,
        const bool allow_exceptions = true,
        const bool ignore_comments = false
                                 )
    {
        return ::nlohmann::detail::parser<basic_json, InputAdapterType>(std::move(adapter),
                std::move(cb), allow_exceptions, ignore_comments);
    }

  private:
    using primitive_iterator_t = ::nlohmann::detail::primitive_iterator_t;
    template<typename BasicJsonType>
    using internal_iterator = ::nlohmann::detail::internal_iterator<BasicJsonType>;
    template<typename BasicJsonType>
    using iter_impl = ::nlohmann::detail::iter_impl<BasicJsonType>;
    template<typename Iterator>
    using iteration_proxy = ::nlohmann::detail::iteration_proxy<Iterator>;
    template<typename Base> using json_reverse_iterator = ::nlohmann::detail::json_reverse_iterator<Base>;

    template<typename CharType>
    using output_adapter_t = ::nlohmann::detail::output_adapter_t<CharType>;

    template<typename InputType>
    using binary_reader = ::nlohmann::detail::binary_reader<basic_json, InputType>;
    template<typename CharType> using binary_writer = ::nlohmann::detail::binary_writer<basic_json, CharType>;

  JSON_PRIVATE_UNLESS_TESTED:
    using serializer = ::nlohmann::detail::serializer<basic_json>;

  public:
    using value_t = detail::value_t;
    /// JSON Pointer, see @ref nlohmann::json_pointer
    using json_pointer = ::nlohmann::json_pointer<StringType>;
    template<typename T, typename SFINAE>
    using json_serializer = JSONSerializer<T, SFINAE>;
    /// how to treat decoding errors
    using error_handler_t = detail::error_handler_t;
    /// how to treat CBOR tags
    using cbor_tag_handler_t = detail::cbor_tag_handler_t;
    /// helper type for initializer lists of basic_json values
    using initializer_list_t = std::initializer_list<detail::json_ref<basic_json>>;

    using input_format_t = detail::input_format_t;
    /// SAX interface type, see @ref nlohmann::json_sax
    using json_sax_t = json_sax<basic_json>;

    ////////////////
    // exceptions //
    ////////////////

    /// @name exceptions
    /// Classes to implement user-defined exceptions.
    /// @{

    using exception = detail::exception;
    using parse_error = detail::parse_error;
    using invalid_iterator = detail::invalid_iterator;
    using type_error = detail::type_error;
    using out_of_range = detail::out_of_range;
    using other_error = detail::other_error;

    /// @}

    /////////////////////
    // container types //
    /////////////////////

    /// @name container types
    /// The canonic container types to use @ref basic_json like any other STL
    /// container.
    /// @{

    /// the type of elements in a basic_json container
    using value_type = basic_json;

    /// the type of an element reference
    using reference = value_type&;
    /// the type of an element const reference
    using const_reference = const value_type&;

    /// a type to represent differences between iterators
    using difference_type = std::ptrdiff_t;
    /// a type to represent container sizes
    using size_type = std::size_t;

    /// the allocator type
    using allocator_type = AllocatorType<basic_json>;

    /// the type of an element pointer
    using pointer = typename std::allocator_traits<allocator_type>::pointer;
    /// the type of an element const pointer
    using const_pointer = typename std::allocator_traits<allocator_type>::const_pointer;

    /// an iterator for a basic_json container
    using iterator = iter_impl<basic_json>;
    /// a const iterator for a basic_json container
    using const_iterator = iter_impl<const basic_json>;
    /// a reverse iterator for a basic_json container
    using reverse_iterator = json_reverse_iterator<typename basic_json::iterator>;
    /// a const reverse iterator for a basic_json container
    using const_reverse_iterator = json_reverse_iterator<typename basic_json::const_iterator>;

    /// @}

    /// @brief returns the allocator associated with the container
    /// @sa https://json.nlohmann.me/api/basic_json/get_allocator/
    static allocator_type get_allocator()
    {
        return allocator_type();
    }

    /// @brief returns version information on the library
    /// @sa https://json.nlohmann.me/api/basic_json/meta/
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json meta()
    {
        basic_json result;

        result["copyright"] = "(C) 2013-2023 Niels Lohmann";
        result["name"] = "JSON for Modern C++";
        result["url"] = "https://github.com/nlohmann/json";
        result["version"]["string"] =
            detail::concat(std::to_string(NLOHMANN_JSON_VERSION_MAJOR), '.',
                           std::to_string(NLOHMANN_JSON_VERSION_MINOR), '.',
                           std::to_string(NLOHMANN_JSON_VERSION_PATCH));
        result["version"]["major"] = NLOHMANN_JSON_VERSION_MAJOR;
        result["version"]["minor"] = NLOHMANN_JSON_VERSION_MINOR;
        result["version"]["patch"] = NLOHMANN_JSON_VERSION_PATCH;

#ifdef _WIN32
        result["platform"] = "win32";
#elif defined __linux__
        result["platform"] = "linux";
#elif defined __APPLE__
        result["platform"] = "apple";
#elif defined __unix__
        result["platform"] = "unix";
#else
        result["platform"] = "unknown";
#endif

#if defined(__ICC) || defined(__INTEL_COMPILER)
        result["compiler"] = {{"family", "icc"}, {"version", __INTEL_COMPILER}};
#elif defined(__clang__)
        result["compiler"] = {{"family", "clang"}, {"version", __clang_version__}};
#elif defined(__GNUC__) || defined(__GNUG__)
        result["compiler"] = {{"family", "gcc"}, {"version", detail::concat(
                    std::to_string(__GNUC__), '.',
                    std::to_string(__GNUC_MINOR__), '.',
                    std::to_string(__GNUC_PATCHLEVEL__))
            }
        };
#elif defined(__HP_cc) || defined(__HP_aCC)
        result["compiler"] = "hp"
#elif defined(__IBMCPP__)
        result["compiler"] = {{"family", "ilecpp"}, {"version", __IBMCPP__}};
#elif defined(_MSC_VER)
        result["compiler"] = {{"family", "msvc"}, {"version", _MSC_VER}};
#elif defined(__PGI)
        result["compiler"] = {{"family", "pgcpp"}, {"version", __PGI}};
#elif defined(__SUNPRO_CC)
        result["compiler"] = {{"family", "sunpro"}, {"version", __SUNPRO_CC}};
#else
        result["compiler"] = {{"family", "unknown"}, {"version", "unknown"}};
#endif

#if defined(_MSVC_LANG)
        result["compiler"]["c++"] = std::to_string(_MSVC_LANG);
#elif defined(__cplusplus)
        result["compiler"]["c++"] = std::to_string(__cplusplus);
#else
        result["compiler"]["c++"] = "unknown";
#endif
        return result;
    }

    ///////////////////////////
    // JSON value data types //
    ///////////////////////////

    /// @name JSON value data types
    /// The data types to store a JSON value. These types are derived from
    /// the template arguments passed to class @ref basic_json.
    /// @{

    /// @brief default object key comparator type
    /// The actual object key comparator type (@ref object_comparator_t) may be
    /// different.
    /// @sa https://json.nlohmann.me/api/basic_json/default_object_comparator_t/
#if defined(JSON_HAS_CPP_14)
    // use of transparent comparator avoids unnecessary repeated construction of temporaries
    // in functions involving lookup by key with types other than object_t::key_type (aka. StringType)
    using default_object_comparator_t = std::less<>;
#else
    using default_object_comparator_t = std::less<StringType>;
#endif

    /// @brief a type for an object
    /// @sa https://json.nlohmann.me/api/basic_json/object_t/
    using object_t = ObjectType<StringType,
          basic_json,
          default_object_comparator_t,
          AllocatorType<std::pair<const StringType,
          basic_json>>>;

    /// @brief a type for an array
    /// @sa https://json.nlohmann.me/api/basic_json/array_t/
    using array_t = ArrayType<basic_json, AllocatorType<basic_json>>;

    /// @brief a type for a string
    /// @sa https://json.nlohmann.me/api/basic_json/string_t/
    using string_t = StringType;

    /// @brief a type for a boolean
    /// @sa https://json.nlohmann.me/api/basic_json/boolean_t/
    using boolean_t = BooleanType;

    /// @brief a type for a number (integer)
    /// @sa https://json.nlohmann.me/api/basic_json/number_integer_t/
    using number_integer_t = NumberIntegerType;

    /// @brief a type for a number (unsigned)
    /// @sa https://json.nlohmann.me/api/basic_json/number_unsigned_t/
    using number_unsigned_t = NumberUnsignedType;

    /// @brief a type for a number (floating-point)
    /// @sa https://json.nlohmann.me/api/basic_json/number_float_t/
    using number_float_t = NumberFloatType;

    /// @brief a type for a packed binary type
    /// @sa https://json.nlohmann.me/api/basic_json/binary_t/
    using binary_t = nlohmann::byte_container_with_subtype<BinaryType>;

    /// @brief object key comparator type
    /// @sa https://json.nlohmann.me/api/basic_json/object_comparator_t/
    using object_comparator_t = detail::actual_object_comparator_t<basic_json>;

    /// @}

  private:

    /// helper for exception-safe object creation
    template<typename T, typename... Args>
    JSON_HEDLEY_RETURNS_NON_NULL
    static T* create(Args&& ... args)
    {
        AllocatorType<T> alloc;
        using AllocatorTraits = std::allocator_traits<AllocatorType<T>>;

        auto deleter = [&](T * obj)
        {
            AllocatorTraits::deallocate(alloc, obj, 1);
        };
        std::unique_ptr<T, decltype(deleter)> obj(AllocatorTraits::allocate(alloc, 1), deleter);
        AllocatorTraits::construct(alloc, obj.get(), std::forward<Args>(args)...);
        JSON_ASSERT(obj != nullptr);
        return obj.release();
    }

    ////////////////////////
    // JSON value storage //
    ////////////////////////

  JSON_PRIVATE_UNLESS_TESTED:
    /*!
    @brief a JSON value

    The actual storage for a JSON value of the @ref basic_json class. This
    union combines the different storage types for the JSON value types
    defined in @ref value_t.

    JSON type | value_t type    | used type
    --------- | --------------- | ------------------------
    object    | object          | pointer to @ref object_t
    array     | array           | pointer to @ref array_t
    string    | string          | pointer to @ref string_t
    boolean   | boolean         | @ref boolean_t
    number    | number_integer  | @ref number_integer_t
    number    | number_unsigned | @ref number_unsigned_t
    number    | number_float    | @ref number_float_t
    binary    | binary          | pointer to @ref binary_t
    null      | null            | *no value is stored*

    @note Variable-length types (objects, arrays, and strings) are stored as
    pointers. The size of the union should not exceed 64 bits if the default
    value types are used.

    @since version 1.0.0
    */
    union json_value
    {
        /// object (stored with pointer to save storage)
        object_t* object;
        /// array (stored with pointer to save storage)
        array_t* array;
        /// string (stored with pointer to save storage)
        string_t* string;
        /// binary (stored with pointer to save storage)
        binary_t* binary;
        /// boolean
        boolean_t boolean;
        /// number (integer)
        number_integer_t number_integer;
        /// number (unsigned integer)
        number_unsigned_t number_unsigned;
        /// number (floating-point)
        number_float_t number_float;

        /// default constructor (for null values)
        json_value() = default;
        /// constructor for booleans
        json_value(boolean_t v) noexcept : boolean(v) {}
        /// constructor for numbers (integer)
        json_value(number_integer_t v) noexcept : number_integer(v) {}
        /// constructor for numbers (unsigned)
        json_value(number_unsigned_t v) noexcept : number_unsigned(v) {}
        /// constructor for numbers (floating-point)
        json_value(number_float_t v) noexcept : number_float(v) {}
        /// constructor for empty values of a given type
        json_value(value_t t)
        {
            switch (t)
            {
                case value_t::object:
                {
                    object = create<object_t>();
                    break;
                }

                case value_t::array:
                {
                    array = create<array_t>();
                    break;
                }

                case value_t::string:
                {
                    string = create<string_t>("");
                    break;
                }

                case value_t::binary:
                {
                    binary = create<binary_t>();
                    break;
                }

                case value_t::boolean:
                {
                    boolean = static_cast<boolean_t>(false);
                    break;
                }

                case value_t::number_integer:
                {
                    number_integer = static_cast<number_integer_t>(0);
                    break;
                }

                case value_t::number_unsigned:
                {
                    number_unsigned = static_cast<number_unsigned_t>(0);
                    break;
                }

                case value_t::number_float:
                {
                    number_float = static_cast<number_float_t>(0.0);
                    break;
                }

                case value_t::null:
                {
                    object = nullptr;  // silence warning, see #821
                    break;
                }

                case value_t::discarded:
                default:
                {
                    object = nullptr;  // silence warning, see #821
                    if (JSON_HEDLEY_UNLIKELY(t == value_t::null))
                    {
                        JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.11.3", nullptr)); // LCOV_EXCL_LINE
                    }
                    break;
                }
            }
        }

        /// constructor for strings
        json_value(const string_t& value) : string(create<string_t>(value)) {}

        /// constructor for rvalue strings
        json_value(string_t&& value) : string(create<string_t>(std::move(value))) {}

        /// constructor for objects
        json_value(const object_t& value) : object(create<object_t>(value)) {}

        /// constructor for rvalue objects
        json_value(object_t&& value) : object(create<object_t>(std::move(value))) {}

        /// constructor for arrays
        json_value(const array_t& value) : array(create<array_t>(value)) {}

        /// constructor for rvalue arrays
        json_value(array_t&& value) : array(create<array_t>(std::move(value))) {}

        /// constructor for binary arrays
        json_value(const typename binary_t::container_type& value) : binary(create<binary_t>(value)) {}

        /// constructor for rvalue binary arrays
        json_value(typename binary_t::container_type&& value) : binary(create<binary_t>(std::move(value))) {}

        /// constructor for binary arrays (internal type)
        json_value(const binary_t& value) : binary(create<binary_t>(value)) {}

        /// constructor for rvalue binary arrays (internal type)
        json_value(binary_t&& value) : binary(create<binary_t>(std::move(value))) {}

        void destroy(value_t t)
        {
            if (
                (t == value_t::object && object == nullptr) ||
                (t == value_t::array && array == nullptr) ||
                (t == value_t::string && string == nullptr) ||
                (t == value_t::binary && binary == nullptr)
            )
            {
                //not initialized (e.g. due to exception in the ctor)
                return;
            }
            if (t == value_t::array || t == value_t::object)
            {
                // flatten the current json_value to a heap-allocated stack
                std::vector<basic_json> stack;

                // move the top-level items to stack
                if (t == value_t::array)
                {
                    stack.reserve(array->size());
                    std::move(array->begin(), array->end(), std::back_inserter(stack));
                }
                else
                {
                    stack.reserve(object->size());
                    for (auto&& it : *object)
                    {
                        stack.push_back(std::move(it.second));
                    }
                }

                while (!stack.empty())
                {
                    // move the last item to local variable to be processed
                    basic_json current_item(std::move(stack.back()));
                    stack.pop_back();

                    // if current_item is array/object, move
                    // its children to the stack to be processed later
                    if (current_item.is_array())
                    {
                        std::move(current_item.m_data.m_value.array->begin(), current_item.m_data.m_value.array->end(), std::back_inserter(stack));

                        current_item.m_data.m_value.array->clear();
                    }
                    else if (current_item.is_object())
                    {
                        for (auto&& it : *current_item.m_data.m_value.object)
                        {
                            stack.push_back(std::move(it.second));
                        }

                        current_item.m_data.m_value.object->clear();
                    }

                    // it's now safe that current_item get destructed
                    // since it doesn't have any children
                }
            }

            switch (t)
            {
                case value_t::object:
                {
                    AllocatorType<object_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, object);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, object, 1);
                    break;
                }

                case value_t::array:
                {
                    AllocatorType<array_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, array);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, array, 1);
                    break;
                }

                case value_t::string:
                {
                    AllocatorType<string_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, string);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, string, 1);
                    break;
                }

                case value_t::binary:
                {
                    AllocatorType<binary_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, binary);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, binary, 1);
                    break;
                }

                case value_t::null:
                case value_t::boolean:
                case value_t::number_integer:
                case value_t::number_unsigned:
                case value_t::number_float:
                case value_t::discarded:
                default:
                {
                    break;
                }
            }
        }
    };

  private:
    /*!
    @brief checks the class invariants

    This function asserts the class invariants. It needs to be called at the
    end of every constructor to make sure that created objects respect the
    invariant. Furthermore, it has to be called each time the type of a JSON
    value is changed, because the invariant expresses a relationship between
    @a m_type and @a m_value.

    Furthermore, the parent relation is checked for arrays and objects: If
    @a check_parents true and the value is an array or object, then the
    container's elements must have the current value as parent.

    @param[in] check_parents  whether the parent relation should be checked.
               The value is true by default and should only be set to false
               during destruction of objects when the invariant does not
               need to hold.
    */
    void assert_invariant(bool check_parents = true) const noexcept
    {
        JSON_ASSERT(m_data.m_type != value_t::object || m_data.m_value.object != nullptr);
        JSON_ASSERT(m_data.m_type != value_t::array || m_data.m_value.array != nullptr);
        JSON_ASSERT(m_data.m_type != value_t::string || m_data.m_value.string != nullptr);
        JSON_ASSERT(m_data.m_type != value_t::binary || m_data.m_value.binary != nullptr);

#if JSON_DIAGNOSTICS
        JSON_TRY
        {
            // cppcheck-suppress assertWithSideEffect
            JSON_ASSERT(!check_parents || !is_structured() || std::all_of(begin(), end(), [this](const basic_json & j)
            {
                return j.m_parent == this;
            }));
        }
        JSON_CATCH(...) {} // LCOV_EXCL_LINE
#endif
        static_cast<void>(check_parents);
    }

    void set_parents()
    {
#if JSON_DIAGNOSTICS
        switch (m_data.m_type)
        {
            case value_t::array:
            {
                for (auto& element : *m_data.m_value.array)
                {
                    element.m_parent = this;
                }
                break;
            }

            case value_t::object:
            {
                for (auto& element : *m_data.m_value.object)
                {
                    element.second.m_parent = this;
                }
                break;
            }

            case value_t::null:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
                break;
        }
#endif
    }

    iterator set_parents(iterator it, typename iterator::difference_type count_set_parents)
    {
#if JSON_DIAGNOSTICS
        for (typename iterator::difference_type i = 0; i < count_set_parents; ++i)
        {
            (it + i)->m_parent = this;
        }
#else
        static_cast<void>(count_set_parents);
#endif
        return it;
    }

    reference set_parent(reference j, std::size_t old_capacity = static_cast<std::size_t>(-1))
    {
#if JSON_DIAGNOSTICS
        if (old_capacity != static_cast<std::size_t>(-1))
        {
            // see https://github.com/nlohmann/json/issues/2838
            JSON_ASSERT(type() == value_t::array);
            if (JSON_HEDLEY_UNLIKELY(m_data.m_value.array->capacity() != old_capacity))
            {
                // capacity has changed: update all parents
                set_parents();
                return j;
            }
        }

        // ordered_json uses a vector internally, so pointers could have
        // been invalidated; see https://github.com/nlohmann/json/issues/2962
#ifdef JSON_HEDLEY_MSVC_VERSION
#pragma warning(push )
#pragma warning(disable : 4127) // ignore warning to replace if with if constexpr
#endif
        if (detail::is_ordered_map<object_t>::value)
        {
            set_parents();
            return j;
        }
#ifdef JSON_HEDLEY_MSVC_VERSION
#pragma warning( pop )
#endif

        j.m_parent = this;
#else
        static_cast<void>(j);
        static_cast<void>(old_capacity);
#endif
        return j;
    }

  public:
    //////////////////////////
    // JSON parser callback //
    //////////////////////////

    /// @brief parser event types
    /// @sa https://json.nlohmann.me/api/basic_json/parse_event_t/
    using parse_event_t = detail::parse_event_t;

    /// @brief per-element parser callback type
    /// @sa https://json.nlohmann.me/api/basic_json/parser_callback_t/
    using parser_callback_t = detail::parser_callback_t<basic_json>;

    //////////////////
    // constructors //
    //////////////////

    /// @name constructors and destructors
    /// Constructors of class @ref basic_json, copy/move constructor, copy
    /// assignment, static functions creating objects, and the destructor.
    /// @{

    /// @brief create an empty value with a given type
    /// @sa https://json.nlohmann.me/api/basic_json/basic_json/
    basic_json(const value_t v)
        : m_data(v)
    {
        assert_invariant();
    }

    /// @brief create a null object
    /// @sa https://json.nlohmann.me/api/basic_json/basic_json/
    basic_json(std::nullptr_t = nullptr) noexcept // NOLINT(bugprone-exception-escape)
        : basic_json(value_t::null)
    {
        assert_invariant();
    }

    /// @brief create a JSON value from compatible types
    /// @sa https://json.nlohmann.me/api/basic_json/basic_json/
    template < typename CompatibleType,
               typename U = detail::uncvref_t<CompatibleType>,
               detail::enable_if_t <
                   !detail::is_basic_json<U>::value && detail::is_compatible_type<basic_json_t, U>::value, int > = 0 >
    basic_json(CompatibleType && val) noexcept(noexcept( // NOLINT(bugprone-forwarding-reference-overload,bugprone-exception-escape)
                JSONSerializer<U>::to_json(std::declval<basic_json_t&>(),
                                           std::forward<CompatibleType>(val))))
    {
        JSONSerializer<U>::to_json(*this, std::forward<CompatibleType>(val));
        set_parents();
        assert_invariant();
    }

    /// @brief create a JSON value from an existing one
    /// @sa https://json.nlohmann.me/api/basic_json/basic_json/
    template < typename BasicJsonType,
               detail::enable_if_t <
                   detail::is_basic_json<BasicJsonType>::value&& !std::is_same<basic_json, BasicJsonType>::value, int > = 0 >
    basic_json(const BasicJsonType& val)
    {
        using other_boolean_t = typename BasicJsonType::boolean_t;
        using other_number_float_t = typename BasicJsonType::number_float_t;
        using other_number_integer_t = typename BasicJsonType::number_integer_t;
        using other_number_unsigned_t = typename BasicJsonType::number_unsigned_t;
        using other_string_t = typename BasicJsonType::string_t;
        using other_object_t = typename BasicJsonType::object_t;
        using other_array_t = typename BasicJsonType::array_t;
        using other_binary_t = typename BasicJsonType::binary_t;

        switch (val.type())
        {
            case value_t::boolean:
                JSONSerializer<other_boolean_t>::to_json(*this, val.template get<other_boolean_t>());
                break;
            case value_t::number_float:
                JSONSerializer<other_number_float_t>::to_json(*this, val.template get<other_number_float_t>());
                break;
            case value_t::number_integer:
                JSONSerializer<other_number_integer_t>::to_json(*this, val.template get<other_number_integer_t>());
                break;
            case value_t::number_unsigned:
                JSONSerializer<other_number_unsigned_t>::to_json(*this, val.template get<other_number_unsigned_t>());
                break;
            case value_t::string:
                JSONSerializer<other_string_t>::to_json(*this, val.template get_ref<const other_string_t&>());
                break;
            case value_t::object:
                JSONSerializer<other_object_t>::to_json(*this, val.template get_ref<const other_object_t&>());
                break;
            case value_t::array:
                JSONSerializer<other_array_t>::to_json(*this, val.template get_ref<const other_array_t&>());
                break;
            case value_t::binary:
                JSONSerializer<other_binary_t>::to_json(*this, val.template get_ref<const other_binary_t&>());
                break;
            case value_t::null:
                *this = nullptr;
                break;
            case value_t::discarded:
                m_data.m_type = value_t::discarded;
                break;
            default:            // LCOV_EXCL_LINE
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
        }
        JSON_ASSERT(m_data.m_type == val.type());
        set_parents();
        assert_invariant();
    }

    /// @brief create a container (array or object) from an initializer list
    /// @sa https://json.nlohmann.me/api/basic_json/basic_json/
    basic_json(initializer_list_t init,
               bool type_deduction = true,
               value_t manual_type = value_t::array)
    {
        // check if each element is an array with two elements whose first
        // element is a string
        bool is_an_object = std::all_of(init.begin(), init.end(),
                                        [](const detail::json_ref<basic_json>& element_ref)
        {
            // The cast is to ensure op[size_type] is called, bearing in mind size_type may not be int;
            // (many string types can be constructed from 0 via its null-pointer guise, so we get a
            // broken call to op[key_type], the wrong semantics and a 4804 warning on Windows)
            return element_ref->is_array() && element_ref->size() == 2 && (*element_ref)[static_cast<size_type>(0)].is_string();
        });

        // adjust type if type deduction is not wanted
        if (!type_deduction)
        {
            // if array is wanted, do not create an object though possible
            if (manual_type == value_t::array)
            {
                is_an_object = false;
            }

            // if object is wanted but impossible, throw an exception
            if (JSON_HEDLEY_UNLIKELY(manual_type == value_t::object && !is_an_object))
            {
                JSON_THROW(type_error::create(301, "cannot create object from initializer list", nullptr));
            }
        }

        if (is_an_object)
        {
            // the initializer list is a list of pairs -> create object
            m_data.m_type = value_t::object;
            m_data.m_value = value_t::object;

            for (auto& element_ref : init)
            {
                auto element = element_ref.moved_or_copied();
                m_data.m_value.object->emplace(
                    std::move(*((*element.m_data.m_value.array)[0].m_data.m_value.string)),
                    std::move((*element.m_data.m_value.array)[1]));
            }
        }
        else
        {
            // the initializer list describes an array -> create array
            m_data.m_type = value_t::array;
            m_data.m_value.array = create<array_t>(init.begin(), init.end());
        }

        set_parents();
        assert_invariant();
    }

    /// @brief explicitly create a binary array (without subtype)
    /// @sa https://json.nlohmann.me/api/basic_json/binary/
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json binary(const typename binary_t::container_type& init)
    {
        auto res = basic_json();
        res.m_data.m_type = value_t::binary;
        res.m_data.m_value = init;
        return res;
    }

    /// @brief explicitly create a binary array (with subtype)
    /// @sa https://json.nlohmann.me/api/basic_json/binary/
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json binary(const typename binary_t::container_type& init, typename binary_t::subtype_type subtype)
    {
        auto res = basic_json();
        res.m_data.m_type = value_t::binary;
        res.m_data.m_value = binary_t(init, subtype);
        return res;
    }

    /// @brief explicitly create a binary array
    /// @sa https://json.nlohmann.me/api/basic_json/binary/
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json binary(typename binary_t::container_type&& init)
    {
        auto res = basic_json();
        res.m_data.m_type = value_t::binary;
        res.m_data.m_value = std::move(init);
        return res;
    }

    /// @brief explicitly create a binary array (with subtype)
    /// @sa https://json.nlohmann.me/api/basic_json/binary/
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json binary(typename binary_t::container_type&& init, typename binary_t::subtype_type subtype)
    {
        auto res = basic_json();
        res.m_data.m_type = value_t::binary;
        res.m_data.m_value = binary_t(std::move(init), subtype);
        return res;
    }

    /// @brief explicitly create an array from an initializer list
    /// @sa https://json.nlohmann.me/api/basic_json/array/
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json array(initializer_list_t init = {})
    {
        return basic_json(init, false, value_t::array);
    }

    /// @brief explicitly create an object from an initializer list
    /// @sa https://json.nlohmann.me/api/basic_json/object/
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json object(initializer_list_t init = {})
    {
        return basic_json(init, false, value_t::object);
    }

    /// @brief construct an array with count copies of given value
    /// @sa https://json.nlohmann.me/api/basic_json/basic_json/
    basic_json(size_type cnt, const basic_json& val):
        m_data{cnt, val}
    {
        set_parents();
        assert_invariant();
    }

    /// @brief construct a JSON container given an iterator range
    /// @sa https://json.nlohmann.me/api/basic_json/basic_json/
    template < class InputIT, typename std::enable_if <
                   std::is_same<InputIT, typename basic_json_t::iterator>::value ||
                   std::is_same<InputIT, typename basic_json_t::const_iterator>::value, int >::type = 0 >
    basic_json(InputIT first, InputIT last)
    {
        JSON_ASSERT(first.m_object != nullptr);
        JSON_ASSERT(last.m_object != nullptr);

        // make sure iterator fits the current value
        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
        {
            JSON_THROW(invalid_iterator::create(201, "iterators are not compatible", nullptr));
        }

        // copy type from first iterator
        m_data.m_type = first.m_object->m_data.m_type;

        // check if iterator range is complete for primitive values
        switch (m_data.m_type)
        {
            case value_t::boolean:
            case value_t::number_float:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::string:
            {
                if (JSON_HEDLEY_UNLIKELY(!first.m_it.primitive_iterator.is_begin()
                                         || !last.m_it.primitive_iterator.is_end()))
                {
                    JSON_THROW(invalid_iterator::create(204, "iterators out of range", first.m_object));
                }
                break;
            }

            case value_t::null:
            case value_t::object:
            case value_t::array:
            case value_t::binary:
            case value_t::discarded:
            default:
                break;
        }

        switch (m_data.m_type)
        {
            case value_t::number_integer:
            {
                m_data.m_value.number_integer = first.m_object->m_data.m_value.number_integer;
                break;
            }

            case value_t::number_unsigned:
            {
                m_data.m_value.number_unsigned = first.m_object->m_data.m_value.number_unsigned;
                break;
            }

            case value_t::number_float:
            {
                m_data.m_value.number_float = first.m_object->m_data.m_value.number_float;
                break;
            }

            case value_t::boolean:
            {
                m_data.m_value.boolean = first.m_object->m_data.m_value.boolean;
                break;
            }

            case value_t::string:
            {
                m_data.m_value = *first.m_object->m_data.m_value.string;
                break;
            }

            case value_t::object:
            {
                m_data.m_value.object = create<object_t>(first.m_it.object_iterator,
                                        last.m_it.object_iterator);
                break;
            }

            case value_t::array:
            {
                m_data.m_value.array = create<array_t>(first.m_it.array_iterator,
                                                       last.m_it.array_iterator);
                break;
            }

            case value_t::binary:
            {
                m_data.m_value = *first.m_object->m_data.m_value.binary;
                break;
            }

            case value_t::null:
            case value_t::discarded:
            default:
                JSON_THROW(invalid_iterator::create(206, detail::concat("cannot construct with iterators from ", first.m_object->type_name()), first.m_object));
        }

        set_parents();
        assert_invariant();
    }

    ///////////////////////////////////////
    // other constructors and destructor //
    ///////////////////////////////////////

    template<typename JsonRef,
             detail::enable_if_t<detail::conjunction<detail::is_json_ref<JsonRef>,
                                 std::is_same<typename JsonRef::value_type, basic_json>>::value, int> = 0 >
    basic_json(const JsonRef& ref) : basic_json(ref.moved_or_copied()) {}

    /// @brief copy constructor
    /// @sa https://json.nlohmann.me/api/basic_json/basic_json/
    basic_json(const basic_json& other)
        : json_base_class_t(other)
    {
        m_data.m_type = other.m_data.m_type;
        // check of passed value is valid
        other.assert_invariant();

        switch (m_data.m_type)
        {
            case value_t::object:
            {
                m_data.m_value = *other.m_data.m_value.object;
                break;
            }

            case value_t::array:
            {
                m_data.m_value = *other.m_data.m_value.array;
                break;
            }

            case value_t::string:
            {
                m_data.m_value = *other.m_data.m_value.string;
                break;
            }

            case value_t::boolean:
            {
                m_data.m_value = other.m_data.m_value.boolean;
                break;
            }

            case value_t::number_integer:
            {
                m_data.m_value = other.m_data.m_value.number_integer;
                break;
            }

            case value_t::number_unsigned:
            {
                m_data.m_value = other.m_data.m_value.number_unsigned;
                break;
            }

            case value_t::number_float:
            {
                m_data.m_value = other.m_data.m_value.number_float;
                break;
            }

            case value_t::binary:
            {
                m_data.m_value = *other.m_data.m_value.binary;
                break;
            }

            case value_t::null:
            case value_t::discarded:
            default:
                break;
        }

        set_parents();
        assert_invariant();
    }

    /// @brief move constructor
    /// @sa https://json.nlohmann.me/api/basic_json/basic_json/
    basic_json(basic_json&& other) noexcept
        : json_base_class_t(std::forward<json_base_class_t>(other)),
          m_data(std::move(other.m_data))
    {
        // check that passed value is valid
        other.assert_invariant(false);

        // invalidate payload
        other.m_data.m_type = value_t::null;
        other.m_data.m_value = {};

        set_parents();
        assert_invariant();
    }

    /// @brief copy assignment
    /// @sa https://json.nlohmann.me/api/basic_json/operator=/
    basic_json& operator=(basic_json other) noexcept (
        std::is_nothrow_move_constructible<value_t>::value&&
        std::is_nothrow_move_assignable<value_t>::value&&
        std::is_nothrow_move_constructible<json_value>::value&&
        std::is_nothrow_move_assignable<json_value>::value&&
        std::is_nothrow_move_assignable<json_base_class_t>::value
    )
    {
        // check that passed value is valid
        other.assert_invariant();

        using std::swap;
        swap(m_data.m_type, other.m_data.m_type);
        swap(m_data.m_value, other.m_data.m_value);
        json_base_class_t::operator=(std::move(other));

        set_parents();
        assert_invariant();
        return *this;
    }

    /// @brief destructor
    /// @sa https://json.nlohmann.me/api/basic_json/~basic_json/
    ~basic_json() noexcept
    {
        assert_invariant(false);
    }

    /// @}

  public:
    ///////////////////////
    // object inspection //
    ///////////////////////

    /// @name object inspection
    /// Functions to inspect the type of a JSON value.
    /// @{

    /// @brief serialization
    /// @sa https://json.nlohmann.me/api/basic_json/dump/
    string_t dump(const int indent = -1,
                  const char indent_char = ' ',
                  const bool ensure_ascii = false,
                  const error_handler_t error_handler = error_handler_t::strict) const
    {
        string_t result;
        serializer s(detail::output_adapter<char, string_t>(result), indent_char, error_handler);

        if (indent >= 0)
        {
            s.dump(*this, true, ensure_ascii, static_cast<unsigned int>(indent));
        }
        else
        {
            s.dump(*this, false, ensure_ascii, 0);
        }

        return result;
    }

    /// @brief return the type of the JSON value (explicit)
    /// @sa https://json.nlohmann.me/api/basic_json/type/
    constexpr value_t type() const noexcept
    {
        return m_data.m_type;
    }

    /// @brief return whether type is primitive
    /// @sa https://json.nlohmann.me/api/basic_json/is_primitive/
    constexpr bool is_primitive() const noexcept
    {
        return is_null() || is_string() || is_boolean() || is_number() || is_binary();
    }

    /// @brief return whether type is structured
    /// @sa https://json.nlohmann.me/api/basic_json/is_structured/
    constexpr bool is_structured() const noexcept
    {
        return is_array() || is_object();
    }

    /// @brief return whether value is null
    /// @sa https://json.nlohmann.me/api/basic_json/is_null/
    constexpr bool is_null() const noexcept
    {
        return m_data.m_type == value_t::null;
    }

    /// @brief return whether value is a boolean
    /// @sa https://json.nlohmann.me/api/basic_json/is_boolean/
    constexpr bool is_boolean() const noexcept
    {
        return m_data.m_type == value_t::boolean;
    }

    /// @brief return whether value is a number
    /// @sa https://json.nlohmann.me/api/basic_json/is_number/
    constexpr bool is_number() const noexcept
    {
        return is_number_integer() || is_number_float();
    }

    /// @brief return whether value is an integer number
    /// @sa https://json.nlohmann.me/api/basic_json/is_number_integer/
    constexpr bool is_number_integer() const noexcept
    {
        return m_data.m_type == value_t::number_integer || m_data.m_type == value_t::number_unsigned;
    }

    /// @brief return whether value is an unsigned integer number
    /// @sa https://json.nlohmann.me/api/basic_json/is_number_unsigned/
    constexpr bool is_number_unsigned() const noexcept
    {
        return m_data.m_type == value_t::number_unsigned;
    }

    /// @brief return whether value is a floating-point number
    /// @sa https://json.nlohmann.me/api/basic_json/is_number_float/
    constexpr bool is_number_float() const noexcept
    {
        return m_data.m_type == value_t::number_float;
    }

    /// @brief return whether value is an object
    /// @sa https://json.nlohmann.me/api/basic_json/is_object/
    constexpr bool is_object() const noexcept
    {
        return m_data.m_type == value_t::object;
    }

    /// @brief return whether value is an array
    /// @sa https://json.nlohmann.me/api/basic_json/is_array/
    constexpr bool is_array() const noexcept
    {
        return m_data.m_type == value_t::array;
    }

    /// @brief return whether value is a string
    /// @sa https://json.nlohmann.me/api/basic_json/is_string/
    constexpr bool is_string() const noexcept
    {
        return m_data.m_type == value_t::string;
    }

    /// @brief return whether value is a binary array
    /// @sa https://json.nlohmann.me/api/basic_json/is_binary/
    constexpr bool is_binary() const noexcept
    {
        return m_data.m_type == value_t::binary;
    }

    /// @brief return whether value is discarded
    /// @sa https://json.nlohmann.me/api/basic_json/is_discarded/
    constexpr bool is_discarded() const noexcept
    {
        return m_data.m_type == value_t::discarded;
    }

    /// @brief return the type of the JSON value (implicit)
    /// @sa https://json.nlohmann.me/api/basic_json/operator_value_t/
    constexpr operator value_t() const noexcept
    {
        return m_data.m_type;
    }

    /// @}

  private:
    //////////////////
    // value access //
    //////////////////

    /// get a boolean (explicit)
    boolean_t get_impl(boolean_t* /*unused*/) const
    {
        if (JSON_HEDLEY_LIKELY(is_boolean()))
        {
            return m_data.m_value.boolean;
        }

        JSON_THROW(type_error::create(302, detail::concat("type must be boolean, but is ", type_name()), this));
    }

    /// get a pointer to the value (object)
    object_t* get_impl_ptr(object_t* /*unused*/) noexcept
    {
        return is_object() ? m_data.m_value.object : nullptr;
    }

    /// get a pointer to the value (object)
    constexpr const object_t* get_impl_ptr(const object_t* /*unused*/) const noexcept
    {
        return is_object() ? m_data.m_value.object : nullptr;
    }

    /// get a pointer to the value (array)
    array_t* get_impl_ptr(array_t* /*unused*/) noexcept
    {
        return is_array() ? m_data.m_value.array : nullptr;
    }

    /// get a pointer to the value (array)
    constexpr const array_t* get_impl_ptr(const array_t* /*unused*/) const noexcept
    {
        return is_array() ? m_data.m_value.array : nullptr;
    }

    /// get a pointer to the value (string)
    string_t* get_impl_ptr(string_t* /*unused*/) noexcept
    {
        return is_string() ? m_data.m_value.string : nullptr;
    }

    /// get a pointer to the value (string)
    constexpr const string_t* get_impl_ptr(const string_t* /*unused*/) const noexcept
    {
        return is_string() ? m_data.m_value.string : nullptr;
    }

    /// get a pointer to the value (boolean)
    boolean_t* get_impl_ptr(boolean_t* /*unused*/) noexcept
    {
        return is_boolean() ? &m_data.m_value.boolean : nullptr;
    }

    /// get a pointer to the value (boolean)
    constexpr const boolean_t* get_impl_ptr(const boolean_t* /*unused*/) const noexcept
    {
        return is_boolean() ? &m_data.m_value.boolean : nullptr;
    }

    /// get a pointer to the value (integer number)
    number_integer_t* get_impl_ptr(number_integer_t* /*unused*/) noexcept
    {
        return is_number_integer() ? &m_data.m_value.number_integer : nullptr;
    }

    /// get a pointer to the value (integer number)
    constexpr const number_integer_t* get_impl_ptr(const number_integer_t* /*unused*/) const noexcept
    {
        return is_number_integer() ? &m_data.m_value.number_integer : nullptr;
    }

    /// get a pointer to the value (unsigned number)
    number_unsigned_t* get_impl_ptr(number_unsigned_t* /*unused*/) noexcept
    {
        return is_number_unsigned() ? &m_data.m_value.number_unsigned : nullptr;
    }

    /// get a pointer to the value (unsigned number)
    constexpr const number_unsigned_t* get_impl_ptr(const number_unsigned_t* /*unused*/) const noexcept
    {
        return is_number_unsigned() ? &m_data.m_value.number_unsigned : nullptr;
    }

    /// get a pointer to the value (floating-point number)
    number_float_t* get_impl_ptr(number_float_t* /*unused*/) noexcept
    {
        return is_number_float() ? &m_data.m_value.number_float : nullptr;
    }

    /// get a pointer to the value (floating-point number)
    constexpr const number_float_t* get_impl_ptr(const number_float_t* /*unused*/) const noexcept
    {
        return is_number_float() ? &m_data.m_value.number_float : nullptr;
    }

    /// get a pointer to the value (binary)
    binary_t* get_impl_ptr(binary_t* /*unused*/) noexcept
    {
        return is_binary() ? m_data.m_value.binary : nullptr;
    }

    /// get a pointer to the value (binary)
    constexpr const binary_t* get_impl_ptr(const binary_t* /*unused*/) const noexcept
    {
        return is_binary() ? m_data.m_value.binary : nullptr;
    }

    /*!
    @brief helper function to implement get_ref()

    This function helps to implement get_ref() without code duplication for
    const and non-const overloads

    @tparam ThisType will be deduced as `basic_json` or `const basic_json`

    @throw type_error.303 if ReferenceType does not match underlying value
    type of the current JSON
    */
    template<typename ReferenceType, typename ThisType>
    static ReferenceType get_ref_impl(ThisType& obj)
    {
        // delegate the call to get_ptr<>()
        auto* ptr = obj.template get_ptr<typename std::add_pointer<ReferenceType>::type>();

        if (JSON_HEDLEY_LIKELY(ptr != nullptr))
        {
            return *ptr;
        }

        JSON_THROW(type_error::create(303, detail::concat("incompatible ReferenceType for get_ref, actual type is ", obj.type_name()), &obj));
    }

  public:
    /// @name value access
    /// Direct access to the stored value of a JSON value.
    /// @{

    /// @brief get a pointer value (implicit)
    /// @sa https://json.nlohmann.me/api/basic_json/get_ptr/
    template<typename PointerType, typename std::enable_if<
                 std::is_pointer<PointerType>::value, int>::type = 0>
    auto get_ptr() noexcept -> decltype(std::declval<basic_json_t&>().get_impl_ptr(std::declval<PointerType>()))
    {
        // delegate the call to get_impl_ptr<>()
        return get_impl_ptr(static_cast<PointerType>(nullptr));
    }

    /// @brief get a pointer value (implicit)
    /// @sa https://json.nlohmann.me/api/basic_json/get_ptr/
    template < typename PointerType, typename std::enable_if <
                   std::is_pointer<PointerType>::value&&
                   std::is_const<typename std::remove_pointer<PointerType>::type>::value, int >::type = 0 >
    constexpr auto get_ptr() const noexcept -> decltype(std::declval<const basic_json_t&>().get_impl_ptr(std::declval<PointerType>()))
    {
        // delegate the call to get_impl_ptr<>() const
        return get_impl_ptr(static_cast<PointerType>(nullptr));
    }

  private:
    /*!
    @brief get a value (explicit)

    Explicit type conversion between the JSON value and a compatible value
    which is [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible)
    and [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible).
    The value is converted by calling the @ref json_serializer<ValueType>
    `from_json()` method.

    The function is equivalent to executing
    @code {.cpp}
    ValueType ret;
    JSONSerializer<ValueType>::from_json(*this, ret);
    return ret;
    @endcode

    This overloads is chosen if:
    - @a ValueType is not @ref basic_json,
    - @ref json_serializer<ValueType> has a `from_json()` method of the form
      `void from_json(const basic_json&, ValueType&)`, and
    - @ref json_serializer<ValueType> does not have a `from_json()` method of
      the form `ValueType from_json(const basic_json&)`

    @tparam ValueType the returned value type

    @return copy of the JSON value, converted to @a ValueType

    @throw what @ref json_serializer<ValueType> `from_json()` method throws

    @liveexample{The example below shows several conversions from JSON values
    to other types. There a few things to note: (1) Floating-point numbers can
    be converted to integers\, (2) A JSON array can be converted to a standard
    `std::vector<short>`\, (3) A JSON object can be converted to C++
    associative containers such as `std::unordered_map<std::string\,
    json>`.,get__ValueType_const}

    @since version 2.1.0
    */
    template < typename ValueType,
               detail::enable_if_t <
                   detail::is_default_constructible<ValueType>::value&&
                   detail::has_from_json<basic_json_t, ValueType>::value,
                   int > = 0 >
    ValueType get_impl(detail::priority_tag<0> /*unused*/) const noexcept(noexcept(
                JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>(), std::declval<ValueType&>())))
    {
        auto ret = ValueType();
        JSONSerializer<ValueType>::from_json(*this, ret);
        return ret;
    }

    /*!
    @brief get a value (explicit); special case

    Explicit type conversion between the JSON value and a compatible value
    which is **not** [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible)
    and **not** [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible).
    The value is converted by calling the @ref json_serializer<ValueType>
    `from_json()` method.

    The function is equivalent to executing
    @code {.cpp}
    return JSONSerializer<ValueType>::from_json(*this);
    @endcode

    This overloads is chosen if:
    - @a ValueType is not @ref basic_json and
    - @ref json_serializer<ValueType> has a `from_json()` method of the form
      `ValueType from_json(const basic_json&)`

    @note If @ref json_serializer<ValueType> has both overloads of
    `from_json()`, this one is chosen.

    @tparam ValueType the returned value type

    @return copy of the JSON value, converted to @a ValueType

    @throw what @ref json_serializer<ValueType> `from_json()` method throws

    @since version 2.1.0
    */
    template < typename ValueType,
               detail::enable_if_t <
                   detail::has_non_default_from_json<basic_json_t, ValueType>::value,
                   int > = 0 >
    ValueType get_impl(detail::priority_tag<1> /*unused*/) const noexcept(noexcept(
                JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>())))
    {
        return JSONSerializer<ValueType>::from_json(*this);
    }

    /*!
    @brief get special-case overload

    This overloads converts the current @ref basic_json in a different
    @ref basic_json type

    @tparam BasicJsonType == @ref basic_json

    @return a copy of *this, converted into @a BasicJsonType

    @complexity Depending on the implementation of the called `from_json()`
                method.

    @since version 3.2.0
    */
    template < typename BasicJsonType,
               detail::enable_if_t <
                   detail::is_basic_json<BasicJsonType>::value,
                   int > = 0 >
    BasicJsonType get_impl(detail::priority_tag<2> /*unused*/) const
    {
        return *this;
    }

    /*!
    @brief get special-case overload

    This overloads avoids a lot of template boilerplate, it can be seen as the
    identity method

    @tparam BasicJsonType == @ref basic_json

    @return a copy of *this

    @complexity Constant.

    @since version 2.1.0
    */
    template<typename BasicJsonType,
             detail::enable_if_t<
                 std::is_same<BasicJsonType, basic_json_t>::value,
                 int> = 0>
    basic_json get_impl(detail::priority_tag<3> /*unused*/) const
    {
        return *this;
    }

    /*!
    @brief get a pointer value (explicit)
    @copydoc get()
    */
    template<typename PointerType,
             detail::enable_if_t<
                 std::is_pointer<PointerType>::value,
                 int> = 0>
    constexpr auto get_impl(detail::priority_tag<4> /*unused*/) const noexcept
    -> decltype(std::declval<const basic_json_t&>().template get_ptr<PointerType>())
    {
        // delegate the call to get_ptr
        return get_ptr<PointerType>();
    }

  public:
    /*!
    @brief get a (pointer) value (explicit)

    Performs explicit type conversion between the JSON value and a compatible value if required.

    - If the requested type is a pointer to the internally stored JSON value that pointer is returned.
    No copies are made.

    - If the requested type is the current @ref basic_json, or a different @ref basic_json convertible
    from the current @ref basic_json.

    - Otherwise the value is converted by calling the @ref json_serializer<ValueType> `from_json()`
    method.

    @tparam ValueTypeCV the provided value type
    @tparam ValueType the returned value type

    @return copy of the JSON value, converted to @tparam ValueType if necessary

    @throw what @ref json_serializer<ValueType> `from_json()` method throws if conversion is required

    @since version 2.1.0
    */
    template < typename ValueTypeCV, typename ValueType = detail::uncvref_t<ValueTypeCV>>
#if defined(JSON_HAS_CPP_14)
    constexpr
#endif
    auto get() const noexcept(
    noexcept(std::declval<const basic_json_t&>().template get_impl<ValueType>(detail::priority_tag<4> {})))
    -> decltype(std::declval<const basic_json_t&>().template get_impl<ValueType>(detail::priority_tag<4> {}))
    {
        // we cannot static_assert on ValueTypeCV being non-const, because
        // there is support for get<const basic_json_t>(), which is why we
        // still need the uncvref
        static_assert(!std::is_reference<ValueTypeCV>::value,
                      "get() cannot be used with reference types, you might want to use get_ref()");
        return get_impl<ValueType>(detail::priority_tag<4> {});
    }

    /*!
    @brief get a pointer value (explicit)

    Explicit pointer access to the internally stored JSON value. No copies are
    made.

    @warning The pointer becomes invalid if the underlying JSON object
    changes.

    @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
    object_t, @ref string_t, @ref boolean_t, @ref number_integer_t,
    @ref number_unsigned_t, or @ref number_float_t.

    @return pointer to the internally stored JSON value if the requested
    pointer type @a PointerType fits to the JSON value; `nullptr` otherwise

    @complexity Constant.

    @liveexample{The example below shows how pointers to internal values of a
    JSON value can be requested. Note that no type conversions are made and a
    `nullptr` is returned if the value and the requested pointer type does not
    match.,get__PointerType}

    @sa see @ref get_ptr() for explicit pointer-member access

    @since version 1.0.0
    */
    template<typename PointerType, typename std::enable_if<
                 std::is_pointer<PointerType>::value, int>::type = 0>
    auto get() noexcept -> decltype(std::declval<basic_json_t&>().template get_ptr<PointerType>())
    {
        // delegate the call to get_ptr
        return get_ptr<PointerType>();
    }

    /// @brief get a value (explicit)
    /// @sa https://json.nlohmann.me/api/basic_json/get_to/
    template < typename ValueType,
               detail::enable_if_t <
                   !detail::is_basic_json<ValueType>::value&&
                   detail::has_from_json<basic_json_t, ValueType>::value,
                   int > = 0 >
    ValueType & get_to(ValueType& v) const noexcept(noexcept(
                JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>(), v)))
    {
        JSONSerializer<ValueType>::from_json(*this, v);
        return v;
    }

    // specialization to allow calling get_to with a basic_json value
    // see https://github.com/nlohmann/json/issues/2175
    template<typename ValueType,
             detail::enable_if_t <
                 detail::is_basic_json<ValueType>::value,
                 int> = 0>
    ValueType & get_to(ValueType& v) const
    {
        v = *this;
        return v;
    }

    template <
        typename T, std::size_t N,
        typename Array = T (&)[N], // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
        detail::enable_if_t <
            detail::has_from_json<basic_json_t, Array>::value, int > = 0 >
    Array get_to(T (&v)[N]) const // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
    noexcept(noexcept(JSONSerializer<Array>::from_json(
                          std::declval<const basic_json_t&>(), v)))
    {
        JSONSerializer<Array>::from_json(*this, v);
        return v;
    }

    /// @brief get a reference value (implicit)
    /// @sa https://json.nlohmann.me/api/basic_json/get_ref/
    template<typename ReferenceType, typename std::enable_if<
                 std::is_reference<ReferenceType>::value, int>::type = 0>
    ReferenceType get_ref()
    {
        // delegate call to get_ref_impl
        return get_ref_impl<ReferenceType>(*this);
    }

    /// @brief get a reference value (implicit)
    /// @sa https://json.nlohmann.me/api/basic_json/get_ref/
    template < typename ReferenceType, typename std::enable_if <
                   std::is_reference<ReferenceType>::value&&
                   std::is_const<typename std::remove_reference<ReferenceType>::type>::value, int >::type = 0 >
    ReferenceType get_ref() const
    {
        // delegate call to get_ref_impl
        return get_ref_impl<ReferenceType>(*this);
    }

    /*!
    @brief get a value (implicit)

    Implicit type conversion between the JSON value and a compatible value.
    The call is realized by calling @ref get() const.

    @tparam ValueType non-pointer type compatible to the JSON value, for
    instance `int` for JSON integer numbers, `bool` for JSON booleans, or
    `std::vector` types for JSON arrays. The character type of @ref string_t
    as well as an initializer list of this type is excluded to avoid
    ambiguities as these types implicitly convert to `std::string`.

    @return copy of the JSON value, converted to type @a ValueType

    @throw type_error.302 in case passed type @a ValueType is incompatible
    to the JSON value type (e.g., the JSON value is of type boolean, but a
    string is requested); see example below

    @complexity Linear in the size of the JSON value.

    @liveexample{The example below shows several conversions from JSON values
    to other types. There a few things to note: (1) Floating-point numbers can
    be converted to integers\, (2) A JSON array can be converted to a standard
    `std::vector<short>`\, (3) A JSON object can be converted to C++
    associative containers such as `std::unordered_map<std::string\,
    json>`.,operator__ValueType}

    @since version 1.0.0
    */
    template < typename ValueType, typename std::enable_if <
                   detail::conjunction <
                       detail::negation<std::is_pointer<ValueType>>,
                       detail::negation<std::is_same<ValueType, std::nullptr_t>>,
                       detail::negation<std::is_same<ValueType, detail::json_ref<basic_json>>>,
                                        detail::negation<std::is_same<ValueType, typename string_t::value_type>>,
                                        detail::negation<detail::is_basic_json<ValueType>>,
                                        detail::negation<std::is_same<ValueType, std::initializer_list<typename string_t::value_type>>>,
#if defined(JSON_HAS_CPP_17) && (defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1910 && _MSC_VER <= 1914))
                                                detail::negation<std::is_same<ValueType, std::string_view>>,
#endif
#if defined(JSON_HAS_CPP_17) && JSON_HAS_STATIC_RTTI
                                                detail::negation<std::is_same<ValueType, std::any>>,
#endif
                                                detail::is_detected_lazy<detail::get_template_function, const basic_json_t&, ValueType>
                                                >::value, int >::type = 0 >
                                        JSON_EXPLICIT operator ValueType() const
    {
        // delegate the call to get<>() const
        return get<ValueType>();
    }

    /// @brief get a binary value
    /// @sa https://json.nlohmann.me/api/basic_json/get_binary/
    binary_t& get_binary()
    {
        if (!is_binary())
        {
            JSON_THROW(type_error::create(302, detail::concat("type must be binary, but is ", type_name()), this));
        }

        return *get_ptr<binary_t*>();
    }

    /// @brief get a binary value
    /// @sa https://json.nlohmann.me/api/basic_json/get_binary/
    const binary_t& get_binary() const
    {
        if (!is_binary())
        {
            JSON_THROW(type_error::create(302, detail::concat("type must be binary, but is ", type_name()), this));
        }

        return *get_ptr<const binary_t*>();
    }

    /// @}

    ////////////////////
    // element access //
    ////////////////////

    /// @name element access
    /// Access to the JSON value.
    /// @{

    /// @brief access specified array element with bounds checking
    /// @sa https://json.nlohmann.me/api/basic_json/at/
    reference at(size_type idx)
    {
        // at only works for arrays
        if (JSON_HEDLEY_LIKELY(is_array()))
        {
            JSON_TRY
            {
                return set_parent(m_data.m_value.array->at(idx));
            }
            JSON_CATCH (std::out_of_range&)
            {
                // create better exception explanation
                JSON_THROW(out_of_range::create(401, detail::concat("array index ", std::to_string(idx), " is out of range"), this));
            }
        }
        else
        {
            JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this));
        }
    }

    /// @brief access specified array element with bounds checking
    /// @sa https://json.nlohmann.me/api/basic_json/at/
    const_reference at(size_type idx) const
    {
        // at only works for arrays
        if (JSON_HEDLEY_LIKELY(is_array()))
        {
            JSON_TRY
            {
                return m_data.m_value.array->at(idx);
            }
            JSON_CATCH (std::out_of_range&)
            {
                // create better exception explanation
                JSON_THROW(out_of_range::create(401, detail::concat("array index ", std::to_string(idx), " is out of range"), this));
            }
        }
        else
        {
            JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this));
        }
    }

    /// @brief access specified object element with bounds checking
    /// @sa https://json.nlohmann.me/api/basic_json/at/
    reference at(const typename object_t::key_type& key)
    {
        // at only works for objects
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this));
        }

        auto it = m_data.m_value.object->find(key);
        if (it == m_data.m_value.object->end())
        {
            JSON_THROW(out_of_range::create(403, detail::concat("key '", key, "' not found"), this));
        }
        return set_parent(it->second);
    }

    /// @brief access specified object element with bounds checking
    /// @sa https://json.nlohmann.me/api/basic_json/at/
    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_basic_json_key_type<basic_json_t, KeyType>::value, int> = 0>
    reference at(KeyType && key)
    {
        // at only works for objects
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this));
        }

        auto it = m_data.m_value.object->find(std::forward<KeyType>(key));
        if (it == m_data.m_value.object->end())
        {
            JSON_THROW(out_of_range::create(403, detail::concat("key '", string_t(std::forward<KeyType>(key)), "' not found"), this));
        }
        return set_parent(it->second);
    }

    /// @brief access specified object element with bounds checking
    /// @sa https://json.nlohmann.me/api/basic_json/at/
    const_reference at(const typename object_t::key_type& key) const
    {
        // at only works for objects
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this));
        }

        auto it = m_data.m_value.object->find(key);
        if (it == m_data.m_value.object->end())
        {
            JSON_THROW(out_of_range::create(403, detail::concat("key '", key, "' not found"), this));
        }
        return it->second;
    }

    /// @brief access specified object element with bounds checking
    /// @sa https://json.nlohmann.me/api/basic_json/at/
    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_basic_json_key_type<basic_json_t, KeyType>::value, int> = 0>
    const_reference at(KeyType && key) const
    {
        // at only works for objects
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this));
        }

        auto it = m_data.m_value.object->find(std::forward<KeyType>(key));
        if (it == m_data.m_value.object->end())
        {
            JSON_THROW(out_of_range::create(403, detail::concat("key '", string_t(std::forward<KeyType>(key)), "' not found"), this));
        }
        return it->second;
    }

    /// @brief access specified array element
    /// @sa https://json.nlohmann.me/api/basic_json/operator%5B%5D/
    reference operator[](size_type idx)
    {
        // implicitly convert null value to an empty array
        if (is_null())
        {
            m_data.m_type = value_t::array;
            m_data.m_value.array = create<array_t>();
            assert_invariant();
        }

        // operator[] only works for arrays
        if (JSON_HEDLEY_LIKELY(is_array()))
        {
            // fill up array with null values if given idx is outside range
            if (idx >= m_data.m_value.array->size())
            {
#if JSON_DIAGNOSTICS
                // remember array size & capacity before resizing
                const auto old_size = m_data.m_value.array->size();
                const auto old_capacity = m_data.m_value.array->capacity();
#endif
                m_data.m_value.array->resize(idx + 1);

#if JSON_DIAGNOSTICS
                if (JSON_HEDLEY_UNLIKELY(m_data.m_value.array->capacity() != old_capacity))
                {
                    // capacity has changed: update all parents
                    set_parents();
                }
                else
                {
                    // set parent for values added above
                    set_parents(begin() + static_cast<typename iterator::difference_type>(old_size), static_cast<typename iterator::difference_type>(idx + 1 - old_size));
                }
#endif
                assert_invariant();
            }

            return m_data.m_value.array->operator[](idx);
        }

        JSON_THROW(type_error::create(305, detail::concat("cannot use operator[] with a numeric argument with ", type_name()), this));
    }

    /// @brief access specified array element
    /// @sa https://json.nlohmann.me/api/basic_json/operator%5B%5D/
    const_reference operator[](size_type idx) const
    {
        // const operator[] only works for arrays
        if (JSON_HEDLEY_LIKELY(is_array()))
        {
            return m_data.m_value.array->operator[](idx);
        }

        JSON_THROW(type_error::create(305, detail::concat("cannot use operator[] with a numeric argument with ", type_name()), this));
    }

    /// @brief access specified object element
    /// @sa https://json.nlohmann.me/api/basic_json/operator%5B%5D/
    reference operator[](typename object_t::key_type key)
    {
        // implicitly convert null value to an empty object
        if (is_null())
        {
            m_data.m_type = value_t::object;
            m_data.m_value.object = create<object_t>();
            assert_invariant();
        }

        // operator[] only works for objects
        if (JSON_HEDLEY_LIKELY(is_object()))
        {
            auto result = m_data.m_value.object->emplace(std::move(key), nullptr);
            return set_parent(result.first->second);
        }

        JSON_THROW(type_error::create(305, detail::concat("cannot use operator[] with a string argument with ", type_name()), this));
    }

    /// @brief access specified object element
    /// @sa https://json.nlohmann.me/api/basic_json/operator%5B%5D/
    const_reference operator[](const typename object_t::key_type& key) const
    {
        // const operator[] only works for objects
        if (JSON_HEDLEY_LIKELY(is_object()))
        {
            auto it = m_data.m_value.object->find(key);
            JSON_ASSERT(it != m_data.m_value.object->end());
            return it->second;
        }

        JSON_THROW(type_error::create(305, detail::concat("cannot use operator[] with a string argument with ", type_name()), this));
    }

    // these two functions resolve a (const) char * ambiguity affecting Clang and MSVC
    // (they seemingly cannot be constrained to resolve the ambiguity)
    template<typename T>
    reference operator[](T* key)
    {
        return operator[](typename object_t::key_type(key));
    }

    template<typename T>
    const_reference operator[](T* key) const
    {
        return operator[](typename object_t::key_type(key));
    }

    /// @brief access specified object element
    /// @sa https://json.nlohmann.me/api/basic_json/operator%5B%5D/
    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_basic_json_key_type<basic_json_t, KeyType>::value, int > = 0 >
    reference operator[](KeyType && key)
    {
        // implicitly convert null value to an empty object
        if (is_null())
        {
            m_data.m_type = value_t::object;
            m_data.m_value.object = create<object_t>();
            assert_invariant();
        }

        // operator[] only works for objects
        if (JSON_HEDLEY_LIKELY(is_object()))
        {
            auto result = m_data.m_value.object->emplace(std::forward<KeyType>(key), nullptr);
            return set_parent(result.first->second);
        }

        JSON_THROW(type_error::create(305, detail::concat("cannot use operator[] with a string argument with ", type_name()), this));
    }

    /// @brief access specified object element
    /// @sa https://json.nlohmann.me/api/basic_json/operator%5B%5D/
    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_basic_json_key_type<basic_json_t, KeyType>::value, int > = 0 >
    const_reference operator[](KeyType && key) const
    {
        // const operator[] only works for objects
        if (JSON_HEDLEY_LIKELY(is_object()))
        {
            auto it = m_data.m_value.object->find(std::forward<KeyType>(key));
            JSON_ASSERT(it != m_data.m_value.object->end());
            return it->second;
        }

        JSON_THROW(type_error::create(305, detail::concat("cannot use operator[] with a string argument with ", type_name()), this));
    }

  private:
    template<typename KeyType>
    using is_comparable_with_object_key = detail::is_comparable <
        object_comparator_t, const typename object_t::key_type&, KeyType >;

    template<typename ValueType>
    using value_return_type = std::conditional <
        detail::is_c_string_uncvref<ValueType>::value,
        string_t, typename std::decay<ValueType>::type >;

  public:
    /// @brief access specified object element with default value
    /// @sa https://json.nlohmann.me/api/basic_json/value/
    template < class ValueType, detail::enable_if_t <
                   !detail::is_transparent<object_comparator_t>::value
                   && detail::is_getable<basic_json_t, ValueType>::value
                   && !std::is_same<value_t, detail::uncvref_t<ValueType>>::value, int > = 0 >
    ValueType value(const typename object_t::key_type& key, const ValueType& default_value) const
    {
        // value only works for objects
        if (JSON_HEDLEY_LIKELY(is_object()))
        {
            // if key is found, return value and given default value otherwise
            const auto it = find(key);
            if (it != end())
            {
                return it->template get<ValueType>();
            }

            return default_value;
        }

        JSON_THROW(type_error::create(306, detail::concat("cannot use value() with ", type_name()), this));
    }

    /// @brief access specified object element with default value
    /// @sa https://json.nlohmann.me/api/basic_json/value/
    template < class ValueType, class ReturnType = typename value_return_type<ValueType>::type,
               detail::enable_if_t <
                   !detail::is_transparent<object_comparator_t>::value
                   && detail::is_getable<basic_json_t, ReturnType>::value
                   && !std::is_same<value_t, detail::uncvref_t<ValueType>>::value, int > = 0 >
    ReturnType value(const typename object_t::key_type& key, ValueType && default_value) const
    {
        // value only works for objects
        if (JSON_HEDLEY_LIKELY(is_object()))
        {
            // if key is found, return value and given default value otherwise
            const auto it = find(key);
            if (it != end())
            {
                return it->template get<ReturnType>();
            }

            return std::forward<ValueType>(default_value);
        }

        JSON_THROW(type_error::create(306, detail::concat("cannot use value() with ", type_name()), this));
    }

    /// @brief access specified object element with default value
    /// @sa https://json.nlohmann.me/api/basic_json/value/
    template < class ValueType, class KeyType, detail::enable_if_t <
                   detail::is_transparent<object_comparator_t>::value
                   && !detail::is_json_pointer<KeyType>::value
                   && is_comparable_with_object_key<KeyType>::value
                   && detail::is_getable<basic_json_t, ValueType>::value
                   && !std::is_same<value_t, detail::uncvref_t<ValueType>>::value, int > = 0 >
    ValueType value(KeyType && key, const ValueType& default_value) const
    {
        // value only works for objects
        if (JSON_HEDLEY_LIKELY(is_object()))
        {
            // if key is found, return value and given default value otherwise
            const auto it = find(std::forward<KeyType>(key));
            if (it != end())
            {
                return it->template get<ValueType>();
            }

            return default_value;
        }

        JSON_THROW(type_error::create(306, detail::concat("cannot use value() with ", type_name()), this));
    }

    /// @brief access specified object element via JSON Pointer with default value
    /// @sa https://json.nlohmann.me/api/basic_json/value/
    template < class ValueType, class KeyType, class ReturnType = typename value_return_type<ValueType>::type,
               detail::enable_if_t <
                   detail::is_transparent<object_comparator_t>::value
                   && !detail::is_json_pointer<KeyType>::value
                   && is_comparable_with_object_key<KeyType>::value
                   && detail::is_getable<basic_json_t, ReturnType>::value
                   && !std::is_same<value_t, detail::uncvref_t<ValueType>>::value, int > = 0 >
    ReturnType value(KeyType && key, ValueType && default_value) const
    {
        // value only works for objects
        if (JSON_HEDLEY_LIKELY(is_object()))
        {
            // if key is found, return value and given default value otherwise
            const auto it = find(std::forward<KeyType>(key));
            if (it != end())
            {
                return it->template get<ReturnType>();
            }

            return std::forward<ValueType>(default_value);
        }

        JSON_THROW(type_error::create(306, detail::concat("cannot use value() with ", type_name()), this));
    }

    /// @brief access specified object element via JSON Pointer with default value
    /// @sa https://json.nlohmann.me/api/basic_json/value/
    template < class ValueType, detail::enable_if_t <
                   detail::is_getable<basic_json_t, ValueType>::value
                   && !std::is_same<value_t, detail::uncvref_t<ValueType>>::value, int > = 0 >
    ValueType value(const json_pointer& ptr, const ValueType& default_value) const
    {
        // value only works for objects
        if (JSON_HEDLEY_LIKELY(is_object()))
        {
            // if pointer resolves a value, return it or use default value
            JSON_TRY
            {
                return ptr.get_checked(this).template get<ValueType>();
            }
            JSON_INTERNAL_CATCH (out_of_range&)
            {
                return default_value;
            }
        }

        JSON_THROW(type_error::create(306, detail::concat("cannot use value() with ", type_name()), this));
    }

    /// @brief access specified object element via JSON Pointer with default value
    /// @sa https://json.nlohmann.me/api/basic_json/value/
    template < class ValueType, class ReturnType = typename value_return_type<ValueType>::type,
               detail::enable_if_t <
                   detail::is_getable<basic_json_t, ReturnType>::value
                   && !std::is_same<value_t, detail::uncvref_t<ValueType>>::value, int > = 0 >
    ReturnType value(const json_pointer& ptr, ValueType && default_value) const
    {
        // value only works for objects
        if (JSON_HEDLEY_LIKELY(is_object()))
        {
            // if pointer resolves a value, return it or use default value
            JSON_TRY
            {
                return ptr.get_checked(this).template get<ReturnType>();
            }
            JSON_INTERNAL_CATCH (out_of_range&)
            {
                return std::forward<ValueType>(default_value);
            }
        }

        JSON_THROW(type_error::create(306, detail::concat("cannot use value() with ", type_name()), this));
    }

    template < class ValueType, class BasicJsonType, detail::enable_if_t <
                   detail::is_basic_json<BasicJsonType>::value
                   && detail::is_getable<basic_json_t, ValueType>::value
                   && !std::is_same<value_t, detail::uncvref_t<ValueType>>::value, int > = 0 >
    JSON_HEDLEY_DEPRECATED_FOR(3.11.0, basic_json::json_pointer or nlohmann::json_pointer<basic_json::string_t>) // NOLINT(readability/alt_tokens)
    ValueType value(const ::nlohmann::json_pointer<BasicJsonType>& ptr, const ValueType& default_value) const
    {
        return value(ptr.convert(), default_value);
    }

    template < class ValueType, class BasicJsonType, class ReturnType = typename value_return_type<ValueType>::type,
               detail::enable_if_t <
                   detail::is_basic_json<BasicJsonType>::value
                   && detail::is_getable<basic_json_t, ReturnType>::value
                   && !std::is_same<value_t, detail::uncvref_t<ValueType>>::value, int > = 0 >
    JSON_HEDLEY_DEPRECATED_FOR(3.11.0, basic_json::json_pointer or nlohmann::json_pointer<basic_json::string_t>) // NOLINT(readability/alt_tokens)
    ReturnType value(const ::nlohmann::json_pointer<BasicJsonType>& ptr, ValueType && default_value) const
    {
        return value(ptr.convert(), std::forward<ValueType>(default_value));
    }

    /// @brief access the first element
    /// @sa https://json.nlohmann.me/api/basic_json/front/
    reference front()
    {
        return *begin();
    }

    /// @brief access the first element
    /// @sa https://json.nlohmann.me/api/basic_json/front/
    const_reference front() const
    {
        return *cbegin();
    }

    /// @brief access the last element
    /// @sa https://json.nlohmann.me/api/basic_json/back/
    reference back()
    {
        auto tmp = end();
        --tmp;
        return *tmp;
    }

    /// @brief access the last element
    /// @sa https://json.nlohmann.me/api/basic_json/back/
    const_reference back() const
    {
        auto tmp = cend();
        --tmp;
        return *tmp;
    }

    /// @brief remove element given an iterator
    /// @sa https://json.nlohmann.me/api/basic_json/erase/
    template < class IteratorType, detail::enable_if_t <
                   std::is_same<IteratorType, typename basic_json_t::iterator>::value ||
                   std::is_same<IteratorType, typename basic_json_t::const_iterator>::value, int > = 0 >
    IteratorType erase(IteratorType pos)
    {
        // make sure iterator fits the current value
        if (JSON_HEDLEY_UNLIKELY(this != pos.m_object))
        {
            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", this));
        }

        IteratorType result = end();

        switch (m_data.m_type)
        {
            case value_t::boolean:
            case value_t::number_float:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::string:
            case value_t::binary:
            {
                if (JSON_HEDLEY_UNLIKELY(!pos.m_it.primitive_iterator.is_begin()))
                {
                    JSON_THROW(invalid_iterator::create(205, "iterator out of range", this));
                }

                if (is_string())
                {
                    AllocatorType<string_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_data.m_value.string);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_data.m_value.string, 1);
                    m_data.m_value.string = nullptr;
                }
                else if (is_binary())
                {
                    AllocatorType<binary_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_data.m_value.binary);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_data.m_value.binary, 1);
                    m_data.m_value.binary = nullptr;
                }

                m_data.m_type = value_t::null;
                assert_invariant();
                break;
            }

            case value_t::object:
            {
                result.m_it.object_iterator = m_data.m_value.object->erase(pos.m_it.object_iterator);
                break;
            }

            case value_t::array:
            {
                result.m_it.array_iterator = m_data.m_value.array->erase(pos.m_it.array_iterator);
                break;
            }

            case value_t::null:
            case value_t::discarded:
            default:
                JSON_THROW(type_error::create(307, detail::concat("cannot use erase() with ", type_name()), this));
        }

        return result;
    }

    /// @brief remove elements given an iterator range
    /// @sa https://json.nlohmann.me/api/basic_json/erase/
    template < class IteratorType, detail::enable_if_t <
                   std::is_same<IteratorType, typename basic_json_t::iterator>::value ||
                   std::is_same<IteratorType, typename basic_json_t::const_iterator>::value, int > = 0 >
    IteratorType erase(IteratorType first, IteratorType last)
    {
        // make sure iterator fits the current value
        if (JSON_HEDLEY_UNLIKELY(this != first.m_object || this != last.m_object))
        {
            JSON_THROW(invalid_iterator::create(203, "iterators do not fit current value", this));
        }

        IteratorType result = end();

        switch (m_data.m_type)
        {
            case value_t::boolean:
            case value_t::number_float:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::string:
            case value_t::binary:
            {
                if (JSON_HEDLEY_LIKELY(!first.m_it.primitive_iterator.is_begin()
                                       || !last.m_it.primitive_iterator.is_end()))
                {
                    JSON_THROW(invalid_iterator::create(204, "iterators out of range", this));
                }

                if (is_string())
                {
                    AllocatorType<string_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_data.m_value.string);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_data.m_value.string, 1);
                    m_data.m_value.string = nullptr;
                }
                else if (is_binary())
                {
                    AllocatorType<binary_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_data.m_value.binary);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_data.m_value.binary, 1);
                    m_data.m_value.binary = nullptr;
                }

                m_data.m_type = value_t::null;
                assert_invariant();
                break;
            }

            case value_t::object:
            {
                result.m_it.object_iterator = m_data.m_value.object->erase(first.m_it.object_iterator,
                                              last.m_it.object_iterator);
                break;
            }

            case value_t::array:
            {
                result.m_it.array_iterator = m_data.m_value.array->erase(first.m_it.array_iterator,
                                             last.m_it.array_iterator);
                break;
            }

            case value_t::null:
            case value_t::discarded:
            default:
                JSON_THROW(type_error::create(307, detail::concat("cannot use erase() with ", type_name()), this));
        }

        return result;
    }

  private:
    template < typename KeyType, detail::enable_if_t <
                   detail::has_erase_with_key_type<basic_json_t, KeyType>::value, int > = 0 >
    size_type erase_internal(KeyType && key)
    {
        // this erase only works for objects
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(307, detail::concat("cannot use erase() with ", type_name()), this));
        }

        return m_data.m_value.object->erase(std::forward<KeyType>(key));
    }

    template < typename KeyType, detail::enable_if_t <
                   !detail::has_erase_with_key_type<basic_json_t, KeyType>::value, int > = 0 >
    size_type erase_internal(KeyType && key)
    {
        // this erase only works for objects
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(307, detail::concat("cannot use erase() with ", type_name()), this));
        }

        const auto it = m_data.m_value.object->find(std::forward<KeyType>(key));
        if (it != m_data.m_value.object->end())
        {
            m_data.m_value.object->erase(it);
            return 1;
        }
        return 0;
    }

  public:

    /// @brief remove element from a JSON object given a key
    /// @sa https://json.nlohmann.me/api/basic_json/erase/
    size_type erase(const typename object_t::key_type& key)
    {
        // the indirection via erase_internal() is added to avoid making this
        // function a template and thus de-rank it during overload resolution
        return erase_internal(key);
    }

    /// @brief remove element from a JSON object given a key
    /// @sa https://json.nlohmann.me/api/basic_json/erase/
    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_basic_json_key_type<basic_json_t, KeyType>::value, int> = 0>
    size_type erase(KeyType && key)
    {
        return erase_internal(std::forward<KeyType>(key));
    }

    /// @brief remove element from a JSON array given an index
    /// @sa https://json.nlohmann.me/api/basic_json/erase/
    void erase(const size_type idx)
    {
        // this erase only works for arrays
        if (JSON_HEDLEY_LIKELY(is_array()))
        {
            if (JSON_HEDLEY_UNLIKELY(idx >= size()))
            {
                JSON_THROW(out_of_range::create(401, detail::concat("array index ", std::to_string(idx), " is out of range"), this));
            }

            m_data.m_value.array->erase(m_data.m_value.array->begin() + static_cast<difference_type>(idx));
        }
        else
        {
            JSON_THROW(type_error::create(307, detail::concat("cannot use erase() with ", type_name()), this));
        }
    }

    /// @}

    ////////////
    // lookup //
    ////////////

    /// @name lookup
    /// @{

    /// @brief find an element in a JSON object
    /// @sa https://json.nlohmann.me/api/basic_json/find/
    iterator find(const typename object_t::key_type& key)
    {
        auto result = end();

        if (is_object())
        {
            result.m_it.object_iterator = m_data.m_value.object->find(key);
        }

        return result;
    }

    /// @brief find an element in a JSON object
    /// @sa https://json.nlohmann.me/api/basic_json/find/
    const_iterator find(const typename object_t::key_type& key) const
    {
        auto result = cend();

        if (is_object())
        {
            result.m_it.object_iterator = m_data.m_value.object->find(key);
        }

        return result;
    }

    /// @brief find an element in a JSON object
    /// @sa https://json.nlohmann.me/api/basic_json/find/
    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_basic_json_key_type<basic_json_t, KeyType>::value, int> = 0>
    iterator find(KeyType && key)
    {
        auto result = end();

        if (is_object())
        {
            result.m_it.object_iterator = m_data.m_value.object->find(std::forward<KeyType>(key));
        }

        return result;
    }

    /// @brief find an element in a JSON object
    /// @sa https://json.nlohmann.me/api/basic_json/find/
    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_basic_json_key_type<basic_json_t, KeyType>::value, int> = 0>
    const_iterator find(KeyType && key) const
    {
        auto result = cend();

        if (is_object())
        {
            result.m_it.object_iterator = m_data.m_value.object->find(std::forward<KeyType>(key));
        }

        return result;
    }

    /// @brief returns the number of occurrences of a key in a JSON object
    /// @sa https://json.nlohmann.me/api/basic_json/count/
    size_type count(const typename object_t::key_type& key) const
    {
        // return 0 for all nonobject types
        return is_object() ? m_data.m_value.object->count(key) : 0;
    }

    /// @brief returns the number of occurrences of a key in a JSON object
    /// @sa https://json.nlohmann.me/api/basic_json/count/
    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_basic_json_key_type<basic_json_t, KeyType>::value, int> = 0>
    size_type count(KeyType && key) const
    {
        // return 0 for all nonobject types
        return is_object() ? m_data.m_value.object->count(std::forward<KeyType>(key)) : 0;
    }

    /// @brief check the existence of an element in a JSON object
    /// @sa https://json.nlohmann.me/api/basic_json/contains/
    bool contains(const typename object_t::key_type& key) const
    {
        return is_object() && m_data.m_value.object->find(key) != m_data.m_value.object->end();
    }

    /// @brief check the existence of an element in a JSON object
    /// @sa https://json.nlohmann.me/api/basic_json/contains/
    template<class KeyType, detail::enable_if_t<
                 detail::is_usable_as_basic_json_key_type<basic_json_t, KeyType>::value, int> = 0>
    bool contains(KeyType && key) const
    {
        return is_object() && m_data.m_value.object->find(std::forward<KeyType>(key)) != m_data.m_value.object->end();
    }

    /// @brief check the existence of an element in a JSON object given a JSON pointer
    /// @sa https://json.nlohmann.me/api/basic_json/contains/
    bool contains(const json_pointer& ptr) const
    {
        return ptr.contains(this);
    }

    template<typename BasicJsonType, detail::enable_if_t<detail::is_basic_json<BasicJsonType>::value, int> = 0>
    JSON_HEDLEY_DEPRECATED_FOR(3.11.0, basic_json::json_pointer or nlohmann::json_pointer<basic_json::string_t>) // NOLINT(readability/alt_tokens)
    bool contains(const typename ::nlohmann::json_pointer<BasicJsonType>& ptr) const
    {
        return ptr.contains(this);
    }

    /// @}

    ///////////////
    // iterators //
    ///////////////

    /// @name iterators
    /// @{

    /// @brief returns an iterator to the first element
    /// @sa https://json.nlohmann.me/api/basic_json/begin/
    iterator begin() noexcept
    {
        iterator result(this);
        result.set_begin();
        return result;
    }

    /// @brief returns an iterator to the first element
    /// @sa https://json.nlohmann.me/api/basic_json/begin/
    const_iterator begin() const noexcept
    {
        return cbegin();
    }

    /// @brief returns a const iterator to the first element
    /// @sa https://json.nlohmann.me/api/basic_json/cbegin/
    const_iterator cbegin() const noexcept
    {
        const_iterator result(this);
        result.set_begin();
        return result;
    }

    /// @brief returns an iterator to one past the last element
    /// @sa https://json.nlohmann.me/api/basic_json/end/
    iterator end() noexcept
    {
        iterator result(this);
        result.set_end();
        return result;
    }

    /// @brief returns an iterator to one past the last element
    /// @sa https://json.nlohmann.me/api/basic_json/end/
    const_iterator end() const noexcept
    {
        return cend();
    }

    /// @brief returns an iterator to one past the last element
    /// @sa https://json.nlohmann.me/api/basic_json/cend/
    const_iterator cend() const noexcept
    {
        const_iterator result(this);
        result.set_end();
        return result;
    }

    /// @brief returns an iterator to the reverse-beginning
    /// @sa https://json.nlohmann.me/api/basic_json/rbegin/
    reverse_iterator rbegin() noexcept
    {
        return reverse_iterator(end());
    }

    /// @brief returns an iterator to the reverse-beginning
    /// @sa https://json.nlohmann.me/api/basic_json/rbegin/
    const_reverse_iterator rbegin() const noexcept
    {
        return crbegin();
    }

    /// @brief returns an iterator to the reverse-end
    /// @sa https://json.nlohmann.me/api/basic_json/rend/
    reverse_iterator rend() noexcept
    {
        return reverse_iterator(begin());
    }

    /// @brief returns an iterator to the reverse-end
    /// @sa https://json.nlohmann.me/api/basic_json/rend/
    const_reverse_iterator rend() const noexcept
    {
        return crend();
    }

    /// @brief returns a const reverse iterator to the last element
    /// @sa https://json.nlohmann.me/api/basic_json/crbegin/
    const_reverse_iterator crbegin() const noexcept
    {
        return const_reverse_iterator(cend());
    }

    /// @brief returns a const reverse iterator to one before the first
    /// @sa https://json.nlohmann.me/api/basic_json/crend/
    const_reverse_iterator crend() const noexcept
    {
        return const_reverse_iterator(cbegin());
    }

  public:
    /// @brief wrapper to access iterator member functions in range-based for
    /// @sa https://json.nlohmann.me/api/basic_json/items/
    /// @deprecated This function is deprecated since 3.1.0 and will be removed in
    ///             version 4.0.0 of the library. Please use @ref items() instead;
    ///             that is, replace `json::iterator_wrapper(j)` with `j.items()`.
    JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items())
    static iteration_proxy<iterator> iterator_wrapper(reference ref) noexcept
    {
        return ref.items();
    }

    /// @brief wrapper to access iterator member functions in range-based for
    /// @sa https://json.nlohmann.me/api/basic_json/items/
    /// @deprecated This function is deprecated since 3.1.0 and will be removed in
    ///         version 4.0.0 of the library. Please use @ref items() instead;
    ///         that is, replace `json::iterator_wrapper(j)` with `j.items()`.
    JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items())
    static iteration_proxy<const_iterator> iterator_wrapper(const_reference ref) noexcept
    {
        return ref.items();
    }

    /// @brief helper to access iterator member functions in range-based for
    /// @sa https://json.nlohmann.me/api/basic_json/items/
    iteration_proxy<iterator> items() noexcept
    {
        return iteration_proxy<iterator>(*this);
    }

    /// @brief helper to access iterator member functions in range-based for
    /// @sa https://json.nlohmann.me/api/basic_json/items/
    iteration_proxy<const_iterator> items() const noexcept
    {
        return iteration_proxy<const_iterator>(*this);
    }

    /// @}

    //////////////
    // capacity //
    //////////////

    /// @name capacity
    /// @{

    /// @brief checks whether the container is empty.
    /// @sa https://json.nlohmann.me/api/basic_json/empty/
    bool empty() const noexcept
    {
        switch (m_data.m_type)
        {
            case value_t::null:
            {
                // null values are empty
                return true;
            }

            case value_t::array:
            {
                // delegate call to array_t::empty()
                return m_data.m_value.array->empty();
            }

            case value_t::object:
            {
                // delegate call to object_t::empty()
                return m_data.m_value.object->empty();
            }

            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                // all other types are nonempty
                return false;
            }
        }
    }

    /// @brief returns the number of elements
    /// @sa https://json.nlohmann.me/api/basic_json/size/
    size_type size() const noexcept
    {
        switch (m_data.m_type)
        {
            case value_t::null:
            {
                // null values are empty
                return 0;
            }

            case value_t::array:
            {
                // delegate call to array_t::size()
                return m_data.m_value.array->size();
            }

            case value_t::object:
            {
                // delegate call to object_t::size()
                return m_data.m_value.object->size();
            }

            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                // all other types have size 1
                return 1;
            }
        }
    }

    /// @brief returns the maximum possible number of elements
    /// @sa https://json.nlohmann.me/api/basic_json/max_size/
    size_type max_size() const noexcept
    {
        switch (m_data.m_type)
        {
            case value_t::array:
            {
                // delegate call to array_t::max_size()
                return m_data.m_value.array->max_size();
            }

            case value_t::object:
            {
                // delegate call to object_t::max_size()
                return m_data.m_value.object->max_size();
            }

            case value_t::null:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                // all other types have max_size() == size()
                return size();
            }
        }
    }

    /// @}

    ///////////////
    // modifiers //
    ///////////////

    /// @name modifiers
    /// @{

    /// @brief clears the contents
    /// @sa https://json.nlohmann.me/api/basic_json/clear/
    void clear() noexcept
    {
        switch (m_data.m_type)
        {
            case value_t::number_integer:
            {
                m_data.m_value.number_integer = 0;
                break;
            }

            case value_t::number_unsigned:
            {
                m_data.m_value.number_unsigned = 0;
                break;
            }

            case value_t::number_float:
            {
                m_data.m_value.number_float = 0.0;
                break;
            }

            case value_t::boolean:
            {
                m_data.m_value.boolean = false;
                break;
            }

            case value_t::string:
            {
                m_data.m_value.string->clear();
                break;
            }

            case value_t::binary:
            {
                m_data.m_value.binary->clear();
                break;
            }

            case value_t::array:
            {
                m_data.m_value.array->clear();
                break;
            }

            case value_t::object:
            {
                m_data.m_value.object->clear();
                break;
            }

            case value_t::null:
            case value_t::discarded:
            default:
                break;
        }
    }

    /// @brief add an object to an array
    /// @sa https://json.nlohmann.me/api/basic_json/push_back/
    void push_back(basic_json&& val)
    {
        // push_back only works for null objects or arrays
        if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array())))
        {
            JSON_THROW(type_error::create(308, detail::concat("cannot use push_back() with ", type_name()), this));
        }

        // transform null object into an array
        if (is_null())
        {
            m_data.m_type = value_t::array;
            m_data.m_value = value_t::array;
            assert_invariant();
        }

        // add element to array (move semantics)
        const auto old_capacity = m_data.m_value.array->capacity();
        m_data.m_value.array->push_back(std::move(val));
        set_parent(m_data.m_value.array->back(), old_capacity);
        // if val is moved from, basic_json move constructor marks it null, so we do not call the destructor
    }

    /// @brief add an object to an array
    /// @sa https://json.nlohmann.me/api/basic_json/operator+=/
    reference operator+=(basic_json&& val)
    {
        push_back(std::move(val));
        return *this;
    }

    /// @brief add an object to an array
    /// @sa https://json.nlohmann.me/api/basic_json/push_back/
    void push_back(const basic_json& val)
    {
        // push_back only works for null objects or arrays
        if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array())))
        {
            JSON_THROW(type_error::create(308, detail::concat("cannot use push_back() with ", type_name()), this));
        }

        // transform null object into an array
        if (is_null())
        {
            m_data.m_type = value_t::array;
            m_data.m_value = value_t::array;
            assert_invariant();
        }

        // add element to array
        const auto old_capacity = m_data.m_value.array->capacity();
        m_data.m_value.array->push_back(val);
        set_parent(m_data.m_value.array->back(), old_capacity);
    }

    /// @brief add an object to an array
    /// @sa https://json.nlohmann.me/api/basic_json/operator+=/
    reference operator+=(const basic_json& val)
    {
        push_back(val);
        return *this;
    }

    /// @brief add an object to an object
    /// @sa https://json.nlohmann.me/api/basic_json/push_back/
    void push_back(const typename object_t::value_type& val)
    {
        // push_back only works for null objects or objects
        if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_object())))
        {
            JSON_THROW(type_error::create(308, detail::concat("cannot use push_back() with ", type_name()), this));
        }

        // transform null object into an object
        if (is_null())
        {
            m_data.m_type = value_t::object;
            m_data.m_value = value_t::object;
            assert_invariant();
        }

        // add element to object
        auto res = m_data.m_value.object->insert(val);
        set_parent(res.first->second);
    }

    /// @brief add an object to an object
    /// @sa https://json.nlohmann.me/api/basic_json/operator+=/
    reference operator+=(const typename object_t::value_type& val)
    {
        push_back(val);
        return *this;
    }

    /// @brief add an object to an object
    /// @sa https://json.nlohmann.me/api/basic_json/push_back/
    void push_back(initializer_list_t init)
    {
        if (is_object() && init.size() == 2 && (*init.begin())->is_string())
        {
            basic_json&& key = init.begin()->moved_or_copied();
            push_back(typename object_t::value_type(
                          std::move(key.get_ref<string_t&>()), (init.begin() + 1)->moved_or_copied()));
        }
        else
        {
            push_back(basic_json(init));
        }
    }

    /// @brief add an object to an object
    /// @sa https://json.nlohmann.me/api/basic_json/operator+=/
    reference operator+=(initializer_list_t init)
    {
        push_back(init);
        return *this;
    }

    /// @brief add an object to an array
    /// @sa https://json.nlohmann.me/api/basic_json/emplace_back/
    template<class... Args>
    reference emplace_back(Args&& ... args)
    {
        // emplace_back only works for null objects or arrays
        if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array())))
        {
            JSON_THROW(type_error::create(311, detail::concat("cannot use emplace_back() with ", type_name()), this));
        }

        // transform null object into an array
        if (is_null())
        {
            m_data.m_type = value_t::array;
            m_data.m_value = value_t::array;
            assert_invariant();
        }

        // add element to array (perfect forwarding)
        const auto old_capacity = m_data.m_value.array->capacity();
        m_data.m_value.array->emplace_back(std::forward<Args>(args)...);
        return set_parent(m_data.m_value.array->back(), old_capacity);
    }

    /// @brief add an object to an object if key does not exist
    /// @sa https://json.nlohmann.me/api/basic_json/emplace/
    template<class... Args>
    std::pair<iterator, bool> emplace(Args&& ... args)
    {
        // emplace only works for null objects or arrays
        if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_object())))
        {
            JSON_THROW(type_error::create(311, detail::concat("cannot use emplace() with ", type_name()), this));
        }

        // transform null object into an object
        if (is_null())
        {
            m_data.m_type = value_t::object;
            m_data.m_value = value_t::object;
            assert_invariant();
        }

        // add element to array (perfect forwarding)
        auto res = m_data.m_value.object->emplace(std::forward<Args>(args)...);
        set_parent(res.first->second);

        // create result iterator and set iterator to the result of emplace
        auto it = begin();
        it.m_it.object_iterator = res.first;

        // return pair of iterator and boolean
        return {it, res.second};
    }

    /// Helper for insertion of an iterator
    /// @note: This uses std::distance to support GCC 4.8,
    ///        see https://github.com/nlohmann/json/pull/1257
    template<typename... Args>
    iterator insert_iterator(const_iterator pos, Args&& ... args)
    {
        iterator result(this);
        JSON_ASSERT(m_data.m_value.array != nullptr);

        auto insert_pos = std::distance(m_data.m_value.array->begin(), pos.m_it.array_iterator);
        m_data.m_value.array->insert(pos.m_it.array_iterator, std::forward<Args>(args)...);
        result.m_it.array_iterator = m_data.m_value.array->begin() + insert_pos;

        // This could have been written as:
        // result.m_it.array_iterator = m_data.m_value.array->insert(pos.m_it.array_iterator, cnt, val);
        // but the return value of insert is missing in GCC 4.8, so it is written this way instead.

        set_parents();
        return result;
    }

    /// @brief inserts element into array
    /// @sa https://json.nlohmann.me/api/basic_json/insert/
    iterator insert(const_iterator pos, const basic_json& val)
    {
        // insert only works for arrays
        if (JSON_HEDLEY_LIKELY(is_array()))
        {
            // check if iterator pos fits to this JSON value
            if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
            {
                JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", this));
            }

            // insert to array and return iterator
            return insert_iterator(pos, val);
        }

        JSON_THROW(type_error::create(309, detail::concat("cannot use insert() with ", type_name()), this));
    }

    /// @brief inserts element into array
    /// @sa https://json.nlohmann.me/api/basic_json/insert/
    iterator insert(const_iterator pos, basic_json&& val)
    {
        return insert(pos, val);
    }

    /// @brief inserts copies of element into array
    /// @sa https://json.nlohmann.me/api/basic_json/insert/
    iterator insert(const_iterator pos, size_type cnt, const basic_json& val)
    {
        // insert only works for arrays
        if (JSON_HEDLEY_LIKELY(is_array()))
        {
            // check if iterator pos fits to this JSON value
            if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
            {
                JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", this));
            }

            // insert to array and return iterator
            return insert_iterator(pos, cnt, val);
        }

        JSON_THROW(type_error::create(309, detail::concat("cannot use insert() with ", type_name()), this));
    }

    /// @brief inserts range of elements into array
    /// @sa https://json.nlohmann.me/api/basic_json/insert/
    iterator insert(const_iterator pos, const_iterator first, const_iterator last)
    {
        // insert only works for arrays
        if (JSON_HEDLEY_UNLIKELY(!is_array()))
        {
            JSON_THROW(type_error::create(309, detail::concat("cannot use insert() with ", type_name()), this));
        }

        // check if iterator pos fits to this JSON value
        if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
        {
            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", this));
        }

        // check if range iterators belong to the same JSON object
        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
        {
            JSON_THROW(invalid_iterator::create(210, "iterators do not fit", this));
        }

        if (JSON_HEDLEY_UNLIKELY(first.m_object == this))
        {
            JSON_THROW(invalid_iterator::create(211, "passed iterators may not belong to container", this));
        }

        // insert to array and return iterator
        return insert_iterator(pos, first.m_it.array_iterator, last.m_it.array_iterator);
    }

    /// @brief inserts elements from initializer list into array
    /// @sa https://json.nlohmann.me/api/basic_json/insert/
    iterator insert(const_iterator pos, initializer_list_t ilist)
    {
        // insert only works for arrays
        if (JSON_HEDLEY_UNLIKELY(!is_array()))
        {
            JSON_THROW(type_error::create(309, detail::concat("cannot use insert() with ", type_name()), this));
        }

        // check if iterator pos fits to this JSON value
        if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
        {
            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", this));
        }

        // insert to array and return iterator
        return insert_iterator(pos, ilist.begin(), ilist.end());
    }

    /// @brief inserts range of elements into object
    /// @sa https://json.nlohmann.me/api/basic_json/insert/
    void insert(const_iterator first, const_iterator last)
    {
        // insert only works for objects
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(309, detail::concat("cannot use insert() with ", type_name()), this));
        }

        // check if range iterators belong to the same JSON object
        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
        {
            JSON_THROW(invalid_iterator::create(210, "iterators do not fit", this));
        }

        // passed iterators must belong to objects
        if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object()))
        {
            JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects", this));
        }

        m_data.m_value.object->insert(first.m_it.object_iterator, last.m_it.object_iterator);
    }

    /// @brief updates a JSON object from another object, overwriting existing keys
    /// @sa https://json.nlohmann.me/api/basic_json/update/
    void update(const_reference j, bool merge_objects = false)
    {
        update(j.begin(), j.end(), merge_objects);
    }

    /// @brief updates a JSON object from another object, overwriting existing keys
    /// @sa https://json.nlohmann.me/api/basic_json/update/
    void update(const_iterator first, const_iterator last, bool merge_objects = false)
    {
        // implicitly convert null value to an empty object
        if (is_null())
        {
            m_data.m_type = value_t::object;
            m_data.m_value.object = create<object_t>();
            assert_invariant();
        }

        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(312, detail::concat("cannot use update() with ", type_name()), this));
        }

        // check if range iterators belong to the same JSON object
        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
        {
            JSON_THROW(invalid_iterator::create(210, "iterators do not fit", this));
        }

        // passed iterators must belong to objects
        if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object()))
        {
            JSON_THROW(type_error::create(312, detail::concat("cannot use update() with ", first.m_object->type_name()), first.m_object));
        }

        for (auto it = first; it != last; ++it)
        {
            if (merge_objects && it.value().is_object())
            {
                auto it2 = m_data.m_value.object->find(it.key());
                if (it2 != m_data.m_value.object->end())
                {
                    it2->second.update(it.value(), true);
                    continue;
                }
            }
            m_data.m_value.object->operator[](it.key()) = it.value();
#if JSON_DIAGNOSTICS
            m_data.m_value.object->operator[](it.key()).m_parent = this;
#endif
        }
    }

    /// @brief exchanges the values
    /// @sa https://json.nlohmann.me/api/basic_json/swap/
    void swap(reference other) noexcept (
        std::is_nothrow_move_constructible<value_t>::value&&
        std::is_nothrow_move_assignable<value_t>::value&&
        std::is_nothrow_move_constructible<json_value>::value&& // NOLINT(cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
        std::is_nothrow_move_assignable<json_value>::value
    )
    {
        std::swap(m_data.m_type, other.m_data.m_type);
        std::swap(m_data.m_value, other.m_data.m_value);

        set_parents();
        other.set_parents();
        assert_invariant();
    }

    /// @brief exchanges the values
    /// @sa https://json.nlohmann.me/api/basic_json/swap/
    friend void swap(reference left, reference right) noexcept (
        std::is_nothrow_move_constructible<value_t>::value&&
        std::is_nothrow_move_assignable<value_t>::value&&
        std::is_nothrow_move_constructible<json_value>::value&& // NOLINT(cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
        std::is_nothrow_move_assignable<json_value>::value
    )
    {
        left.swap(right);
    }

    /// @brief exchanges the values
    /// @sa https://json.nlohmann.me/api/basic_json/swap/
    void swap(array_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
    {
        // swap only works for arrays
        if (JSON_HEDLEY_LIKELY(is_array()))
        {
            using std::swap;
            swap(*(m_data.m_value.array), other);
        }
        else
        {
            JSON_THROW(type_error::create(310, detail::concat("cannot use swap(array_t&) with ", type_name()), this));
        }
    }

    /// @brief exchanges the values
    /// @sa https://json.nlohmann.me/api/basic_json/swap/
    void swap(object_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
    {
        // swap only works for objects
        if (JSON_HEDLEY_LIKELY(is_object()))
        {
            using std::swap;
            swap(*(m_data.m_value.object), other);
        }
        else
        {
            JSON_THROW(type_error::create(310, detail::concat("cannot use swap(object_t&) with ", type_name()), this));
        }
    }

    /// @brief exchanges the values
    /// @sa https://json.nlohmann.me/api/basic_json/swap/
    void swap(string_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
    {
        // swap only works for strings
        if (JSON_HEDLEY_LIKELY(is_string()))
        {
            using std::swap;
            swap(*(m_data.m_value.string), other);
        }
        else
        {
            JSON_THROW(type_error::create(310, detail::concat("cannot use swap(string_t&) with ", type_name()), this));
        }
    }

    /// @brief exchanges the values
    /// @sa https://json.nlohmann.me/api/basic_json/swap/
    void swap(binary_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
    {
        // swap only works for strings
        if (JSON_HEDLEY_LIKELY(is_binary()))
        {
            using std::swap;
            swap(*(m_data.m_value.binary), other);
        }
        else
        {
            JSON_THROW(type_error::create(310, detail::concat("cannot use swap(binary_t&) with ", type_name()), this));
        }
    }

    /// @brief exchanges the values
    /// @sa https://json.nlohmann.me/api/basic_json/swap/
    void swap(typename binary_t::container_type& other) // NOLINT(bugprone-exception-escape)
    {
        // swap only works for strings
        if (JSON_HEDLEY_LIKELY(is_binary()))
        {
            using std::swap;
            swap(*(m_data.m_value.binary), other);
        }
        else
        {
            JSON_THROW(type_error::create(310, detail::concat("cannot use swap(binary_t::container_type&) with ", type_name()), this));
        }
    }

    /// @}

    //////////////////////////////////////////
    // lexicographical comparison operators //
    //////////////////////////////////////////

    /// @name lexicographical comparison operators
    /// @{

    // note parentheses around operands are necessary; see
    // https://github.com/nlohmann/json/issues/1530
#define JSON_IMPLEMENT_OPERATOR(op, null_result, unordered_result, default_result)                       \
    const auto lhs_type = lhs.type();                                                                    \
    const auto rhs_type = rhs.type();                                                                    \
    \
    if (lhs_type == rhs_type) /* NOLINT(readability/braces) */                                           \
    {                                                                                                    \
        switch (lhs_type)                                                                                \
        {                                                                                                \
            case value_t::array:                                                                         \
                return (*lhs.m_data.m_value.array) op (*rhs.m_data.m_value.array);                                     \
                \
            case value_t::object:                                                                        \
                return (*lhs.m_data.m_value.object) op (*rhs.m_data.m_value.object);                                   \
                \
            case value_t::null:                                                                          \
                return (null_result);                                                                    \
                \
            case value_t::string:                                                                        \
                return (*lhs.m_data.m_value.string) op (*rhs.m_data.m_value.string);                                   \
                \
            case value_t::boolean:                                                                       \
                return (lhs.m_data.m_value.boolean) op (rhs.m_data.m_value.boolean);                                   \
                \
            case value_t::number_integer:                                                                \
                return (lhs.m_data.m_value.number_integer) op (rhs.m_data.m_value.number_integer);                     \
                \
            case value_t::number_unsigned:                                                               \
                return (lhs.m_data.m_value.number_unsigned) op (rhs.m_data.m_value.number_unsigned);                   \
                \
            case value_t::number_float:                                                                  \
                return (lhs.m_data.m_value.number_float) op (rhs.m_data.m_value.number_float);                         \
                \
            case value_t::binary:                                                                        \
                return (*lhs.m_data.m_value.binary) op (*rhs.m_data.m_value.binary);                                   \
                \
            case value_t::discarded:                                                                     \
            default:                                                                                     \
                return (unordered_result);                                                               \
        }                                                                                                \
    }                                                                                                    \
    else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float)                   \
    {                                                                                                    \
        return static_cast<number_float_t>(lhs.m_data.m_value.number_integer) op rhs.m_data.m_value.number_float;      \
    }                                                                                                    \
    else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer)                   \
    {                                                                                                    \
        return lhs.m_data.m_value.number_float op static_cast<number_float_t>(rhs.m_data.m_value.number_integer);      \
    }                                                                                                    \
    else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float)                  \
    {                                                                                                    \
        return static_cast<number_float_t>(lhs.m_data.m_value.number_unsigned) op rhs.m_data.m_value.number_float;     \
    }                                                                                                    \
    else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned)                  \
    {                                                                                                    \
        return lhs.m_data.m_value.number_float op static_cast<number_float_t>(rhs.m_data.m_value.number_unsigned);     \
    }                                                                                                    \
    else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer)                \
    {                                                                                                    \
        return static_cast<number_integer_t>(lhs.m_data.m_value.number_unsigned) op rhs.m_data.m_value.number_integer; \
    }                                                                                                    \
    else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned)                \
    {                                                                                                    \
        return lhs.m_data.m_value.number_integer op static_cast<number_integer_t>(rhs.m_data.m_value.number_unsigned); \
    }                                                                                                    \
    else if(compares_unordered(lhs, rhs))\
    {\
        return (unordered_result);\
    }\
    \
    return (default_result);

  JSON_PRIVATE_UNLESS_TESTED:
    // returns true if:
    // - any operand is NaN and the other operand is of number type
    // - any operand is discarded
    // in legacy mode, discarded values are considered ordered if
    // an operation is computed as an odd number of inverses of others
    static bool compares_unordered(const_reference lhs, const_reference rhs, bool inverse = false) noexcept
    {
        if ((lhs.is_number_float() && std::isnan(lhs.m_data.m_value.number_float) && rhs.is_number())
                || (rhs.is_number_float() && std::isnan(rhs.m_data.m_value.number_float) && lhs.is_number()))
        {
            return true;
        }
#if JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
        return (lhs.is_discarded() || rhs.is_discarded()) && !inverse;
#else
        static_cast<void>(inverse);
        return lhs.is_discarded() || rhs.is_discarded();
#endif
    }

  private:
    bool compares_unordered(const_reference rhs, bool inverse = false) const noexcept
    {
        return compares_unordered(*this, rhs, inverse);
    }

  public:
#if JSON_HAS_THREE_WAY_COMPARISON
    /// @brief comparison: equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_eq/
    bool operator==(const_reference rhs) const noexcept
    {
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
        const_reference lhs = *this;
        JSON_IMPLEMENT_OPERATOR( ==, true, false, false)
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
    }

    /// @brief comparison: equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_eq/
    template<typename ScalarType>
    requires std::is_scalar_v<ScalarType>
    bool operator==(ScalarType rhs) const noexcept
    {
        return *this == basic_json(rhs);
    }

    /// @brief comparison: not equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_ne/
    bool operator!=(const_reference rhs) const noexcept
    {
        if (compares_unordered(rhs, true))
        {
            return false;
        }
        return !operator==(rhs);
    }

    /// @brief comparison: 3-way
    /// @sa https://json.nlohmann.me/api/basic_json/operator_spaceship/
    std::partial_ordering operator<=>(const_reference rhs) const noexcept // *NOPAD*
    {
        const_reference lhs = *this;
        // default_result is used if we cannot compare values. In that case,
        // we compare types.
        JSON_IMPLEMENT_OPERATOR(<=>, // *NOPAD*
                                std::partial_ordering::equivalent,
                                std::partial_ordering::unordered,
                                lhs_type <=> rhs_type) // *NOPAD*
    }

    /// @brief comparison: 3-way
    /// @sa https://json.nlohmann.me/api/basic_json/operator_spaceship/
    template<typename ScalarType>
    requires std::is_scalar_v<ScalarType>
    std::partial_ordering operator<=>(ScalarType rhs) const noexcept // *NOPAD*
    {
        return *this <=> basic_json(rhs); // *NOPAD*
    }

#if JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
    // all operators that are computed as an odd number of inverses of others
    // need to be overloaded to emulate the legacy comparison behavior

    /// @brief comparison: less than or equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_le/
    JSON_HEDLEY_DEPRECATED_FOR(3.11.0, undef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON)
    bool operator<=(const_reference rhs) const noexcept
    {
        if (compares_unordered(rhs, true))
        {
            return false;
        }
        return !(rhs < *this);
    }

    /// @brief comparison: less than or equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_le/
    template<typename ScalarType>
    requires std::is_scalar_v<ScalarType>
    bool operator<=(ScalarType rhs) const noexcept
    {
        return *this <= basic_json(rhs);
    }

    /// @brief comparison: greater than or equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_ge/
    JSON_HEDLEY_DEPRECATED_FOR(3.11.0, undef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON)
    bool operator>=(const_reference rhs) const noexcept
    {
        if (compares_unordered(rhs, true))
        {
            return false;
        }
        return !(*this < rhs);
    }

    /// @brief comparison: greater than or equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_ge/
    template<typename ScalarType>
    requires std::is_scalar_v<ScalarType>
    bool operator>=(ScalarType rhs) const noexcept
    {
        return *this >= basic_json(rhs);
    }
#endif
#else
    /// @brief comparison: equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_eq/
    friend bool operator==(const_reference lhs, const_reference rhs) noexcept
    {
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
        JSON_IMPLEMENT_OPERATOR( ==, true, false, false)
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
    }

    /// @brief comparison: equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_eq/
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator==(const_reference lhs, ScalarType rhs) noexcept
    {
        return lhs == basic_json(rhs);
    }

    /// @brief comparison: equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_eq/
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator==(ScalarType lhs, const_reference rhs) noexcept
    {
        return basic_json(lhs) == rhs;
    }

    /// @brief comparison: not equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_ne/
    friend bool operator!=(const_reference lhs, const_reference rhs) noexcept
    {
        if (compares_unordered(lhs, rhs, true))
        {
            return false;
        }
        return !(lhs == rhs);
    }

    /// @brief comparison: not equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_ne/
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator!=(const_reference lhs, ScalarType rhs) noexcept
    {
        return lhs != basic_json(rhs);
    }

    /// @brief comparison: not equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_ne/
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator!=(ScalarType lhs, const_reference rhs) noexcept
    {
        return basic_json(lhs) != rhs;
    }

    /// @brief comparison: less than
    /// @sa https://json.nlohmann.me/api/basic_json/operator_lt/
    friend bool operator<(const_reference lhs, const_reference rhs) noexcept
    {
        // default_result is used if we cannot compare values. In that case,
        // we compare types. Note we have to call the operator explicitly,
        // because MSVC has problems otherwise.
        JSON_IMPLEMENT_OPERATOR( <, false, false, operator<(lhs_type, rhs_type))
    }

    /// @brief comparison: less than
    /// @sa https://json.nlohmann.me/api/basic_json/operator_lt/
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator<(const_reference lhs, ScalarType rhs) noexcept
    {
        return lhs < basic_json(rhs);
    }

    /// @brief comparison: less than
    /// @sa https://json.nlohmann.me/api/basic_json/operator_lt/
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator<(ScalarType lhs, const_reference rhs) noexcept
    {
        return basic_json(lhs) < rhs;
    }

    /// @brief comparison: less than or equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_le/
    friend bool operator<=(const_reference lhs, const_reference rhs) noexcept
    {
        if (compares_unordered(lhs, rhs, true))
        {
            return false;
        }
        return !(rhs < lhs);
    }

    /// @brief comparison: less than or equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_le/
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator<=(const_reference lhs, ScalarType rhs) noexcept
    {
        return lhs <= basic_json(rhs);
    }

    /// @brief comparison: less than or equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_le/
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator<=(ScalarType lhs, const_reference rhs) noexcept
    {
        return basic_json(lhs) <= rhs;
    }

    /// @brief comparison: greater than
    /// @sa https://json.nlohmann.me/api/basic_json/operator_gt/
    friend bool operator>(const_reference lhs, const_reference rhs) noexcept
    {
        // double inverse
        if (compares_unordered(lhs, rhs))
        {
            return false;
        }
        return !(lhs <= rhs);
    }

    /// @brief comparison: greater than
    /// @sa https://json.nlohmann.me/api/basic_json/operator_gt/
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator>(const_reference lhs, ScalarType rhs) noexcept
    {
        return lhs > basic_json(rhs);
    }

    /// @brief comparison: greater than
    /// @sa https://json.nlohmann.me/api/basic_json/operator_gt/
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator>(ScalarType lhs, const_reference rhs) noexcept
    {
        return basic_json(lhs) > rhs;
    }

    /// @brief comparison: greater than or equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_ge/
    friend bool operator>=(const_reference lhs, const_reference rhs) noexcept
    {
        if (compares_unordered(lhs, rhs, true))
        {
            return false;
        }
        return !(lhs < rhs);
    }

    /// @brief comparison: greater than or equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_ge/
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator>=(const_reference lhs, ScalarType rhs) noexcept
    {
        return lhs >= basic_json(rhs);
    }

    /// @brief comparison: greater than or equal
    /// @sa https://json.nlohmann.me/api/basic_json/operator_ge/
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator>=(ScalarType lhs, const_reference rhs) noexcept
    {
        return basic_json(lhs) >= rhs;
    }
#endif

#undef JSON_IMPLEMENT_OPERATOR

    /// @}

    ///////////////////
    // serialization //
    ///////////////////

    /// @name serialization
    /// @{
#ifndef JSON_NO_IO
    /// @brief serialize to stream
    /// @sa https://json.nlohmann.me/api/basic_json/operator_ltlt/
    friend std::ostream& operator<<(std::ostream& o, const basic_json& j)
    {
        // read width member and use it as indentation parameter if nonzero
        const bool pretty_print = o.width() > 0;
        const auto indentation = pretty_print ? o.width() : 0;

        // reset width to 0 for subsequent calls to this stream
        o.width(0);

        // do the actual serialization
        serializer s(detail::output_adapter<char>(o), o.fill());
        s.dump(j, pretty_print, false, static_cast<unsigned int>(indentation));
        return o;
    }

    /// @brief serialize to stream
    /// @sa https://json.nlohmann.me/api/basic_json/operator_ltlt/
    /// @deprecated This function is deprecated since 3.0.0 and will be removed in
    ///             version 4.0.0 of the library. Please use
    ///             operator<<(std::ostream&, const basic_json&) instead; that is,
    ///             replace calls like `j >> o;` with `o << j;`.
    JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator<<(std::ostream&, const basic_json&))
    friend std::ostream& operator>>(const basic_json& j, std::ostream& o)
    {
        return o << j;
    }
#endif  // JSON_NO_IO
    /// @}

    /////////////////////
    // deserialization //
    /////////////////////

    /// @name deserialization
    /// @{

    /// @brief deserialize from a compatible input
    /// @sa https://json.nlohmann.me/api/basic_json/parse/
    template<typename InputType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json parse(InputType&& i,
                            const parser_callback_t cb = nullptr,
                            const bool allow_exceptions = true,
                            const bool ignore_comments = false)
    {
        basic_json result;
        parser(detail::input_adapter(std::forward<InputType>(i)), cb, allow_exceptions, ignore_comments).parse(true, result);
        return result;
    }

    /// @brief deserialize from a pair of character iterators
    /// @sa https://json.nlohmann.me/api/basic_json/parse/
    template<typename IteratorType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json parse(IteratorType first,
                            IteratorType last,
                            const parser_callback_t cb = nullptr,
                            const bool allow_exceptions = true,
                            const bool ignore_comments = false)
    {
        basic_json result;
        parser(detail::input_adapter(std::move(first), std::move(last)), cb, allow_exceptions, ignore_comments).parse(true, result);
        return result;
    }

    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, parse(ptr, ptr + len))
    static basic_json parse(detail::span_input_adapter&& i,
                            const parser_callback_t cb = nullptr,
                            const bool allow_exceptions = true,
                            const bool ignore_comments = false)
    {
        basic_json result;
        parser(i.get(), cb, allow_exceptions, ignore_comments).parse(true, result);
        return result;
    }

    /// @brief check if the input is valid JSON
    /// @sa https://json.nlohmann.me/api/basic_json/accept/
    template<typename InputType>
    static bool accept(InputType&& i,
                       const bool ignore_comments = false)
    {
        return parser(detail::input_adapter(std::forward<InputType>(i)), nullptr, false, ignore_comments).accept(true);
    }

    /// @brief check if the input is valid JSON
    /// @sa https://json.nlohmann.me/api/basic_json/accept/
    template<typename IteratorType>
    static bool accept(IteratorType first, IteratorType last,
                       const bool ignore_comments = false)
    {
        return parser(detail::input_adapter(std::move(first), std::move(last)), nullptr, false, ignore_comments).accept(true);
    }

    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, accept(ptr, ptr + len))
    static bool accept(detail::span_input_adapter&& i,
                       const bool ignore_comments = false)
    {
        return parser(i.get(), nullptr, false, ignore_comments).accept(true);
    }

    /// @brief generate SAX events
    /// @sa https://json.nlohmann.me/api/basic_json/sax_parse/
    template <typename InputType, typename SAX>
    JSON_HEDLEY_NON_NULL(2)
    static bool sax_parse(InputType&& i, SAX* sax,
                          input_format_t format = input_format_t::json,
                          const bool strict = true,
                          const bool ignore_comments = false)
    {
        auto ia = detail::input_adapter(std::forward<InputType>(i));
        return format == input_format_t::json
               ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict)
               : detail::binary_reader<basic_json, decltype(ia), SAX>(std::move(ia), format).sax_parse(format, sax, strict);
    }

    /// @brief generate SAX events
    /// @sa https://json.nlohmann.me/api/basic_json/sax_parse/
    template<class IteratorType, class SAX>
    JSON_HEDLEY_NON_NULL(3)
    static bool sax_parse(IteratorType first, IteratorType last, SAX* sax,
                          input_format_t format = input_format_t::json,
                          const bool strict = true,
                          const bool ignore_comments = false)
    {
        auto ia = detail::input_adapter(std::move(first), std::move(last));
        return format == input_format_t::json
               ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict)
               : detail::binary_reader<basic_json, decltype(ia), SAX>(std::move(ia), format).sax_parse(format, sax, strict);
    }

    /// @brief generate SAX events
    /// @sa https://json.nlohmann.me/api/basic_json/sax_parse/
    /// @deprecated This function is deprecated since 3.8.0 and will be removed in
    ///             version 4.0.0 of the library. Please use
    ///             sax_parse(ptr, ptr + len) instead.
    template <typename SAX>
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, sax_parse(ptr, ptr + len, ...))
    JSON_HEDLEY_NON_NULL(2)
    static bool sax_parse(detail::span_input_adapter&& i, SAX* sax,
                          input_format_t format = input_format_t::json,
                          const bool strict = true,
                          const bool ignore_comments = false)
    {
        auto ia = i.get();
        return format == input_format_t::json
               // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
               ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict)
               // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
               : detail::binary_reader<basic_json, decltype(ia), SAX>(std::move(ia), format).sax_parse(format, sax, strict);
    }
#ifndef JSON_NO_IO
    /// @brief deserialize from stream
    /// @sa https://json.nlohmann.me/api/basic_json/operator_gtgt/
    /// @deprecated This stream operator is deprecated since 3.0.0 and will be removed in
    ///             version 4.0.0 of the library. Please use
    ///             operator>>(std::istream&, basic_json&) instead; that is,
    ///             replace calls like `j << i;` with `i >> j;`.
    JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator>>(std::istream&, basic_json&))
    friend std::istream& operator<<(basic_json& j, std::istream& i)
    {
        return operator>>(i, j);
    }

    /// @brief deserialize from stream
    /// @sa https://json.nlohmann.me/api/basic_json/operator_gtgt/
    friend std::istream& operator>>(std::istream& i, basic_json& j)
    {
        parser(detail::input_adapter(i)).parse(false, j);
        return i;
    }
#endif  // JSON_NO_IO
    /// @}

    ///////////////////////////
    // convenience functions //
    ///////////////////////////

    /// @brief return the type as string
    /// @sa https://json.nlohmann.me/api/basic_json/type_name/
    JSON_HEDLEY_RETURNS_NON_NULL
    const char* type_name() const noexcept
    {
        switch (m_data.m_type)
        {
            case value_t::null:
                return "null";
            case value_t::object:
                return "object";
            case value_t::array:
                return "array";
            case value_t::string:
                return "string";
            case value_t::boolean:
                return "boolean";
            case value_t::binary:
                return "binary";
            case value_t::discarded:
                return "discarded";
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            default:
                return "number";
        }
    }

  JSON_PRIVATE_UNLESS_TESTED:
    //////////////////////
    // member variables //
    //////////////////////

    struct data
    {
        /// the type of the current element
        value_t m_type = value_t::null;

        /// the value of the current element
        json_value m_value = {};

        data(const value_t v)
            : m_type(v), m_value(v)
        {
        }

        data(size_type cnt, const basic_json& val)
            : m_type(value_t::array)
        {
            m_value.array = create<array_t>(cnt, val);
        }

        data() noexcept = default;
        data(data&&) noexcept = default;
        data(const data&) noexcept = delete;
        data& operator=(data&&) noexcept = delete;
        data& operator=(const data&) noexcept = delete;

        ~data() noexcept
        {
            m_value.destroy(m_type);
        }
    };

    data m_data = {};

#if JSON_DIAGNOSTICS
    /// a pointer to a parent value (for debugging purposes)
    basic_json* m_parent = nullptr;
#endif

    //////////////////////////////////////////
    // binary serialization/deserialization //
    //////////////////////////////////////////

    /// @name binary serialization/deserialization support
    /// @{

  public:
    /// @brief create a CBOR serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_cbor/
    static std::vector<std::uint8_t> to_cbor(const basic_json& j)
    {
        std::vector<std::uint8_t> result;
        to_cbor(j, result);
        return result;
    }

    /// @brief create a CBOR serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_cbor/
    static void to_cbor(const basic_json& j, detail::output_adapter<std::uint8_t> o)
    {
        binary_writer<std::uint8_t>(o).write_cbor(j);
    }

    /// @brief create a CBOR serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_cbor/
    static void to_cbor(const basic_json& j, detail::output_adapter<char> o)
    {
        binary_writer<char>(o).write_cbor(j);
    }

    /// @brief create a MessagePack serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_msgpack/
    static std::vector<std::uint8_t> to_msgpack(const basic_json& j)
    {
        std::vector<std::uint8_t> result;
        to_msgpack(j, result);
        return result;
    }

    /// @brief create a MessagePack serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_msgpack/
    static void to_msgpack(const basic_json& j, detail::output_adapter<std::uint8_t> o)
    {
        binary_writer<std::uint8_t>(o).write_msgpack(j);
    }

    /// @brief create a MessagePack serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_msgpack/
    static void to_msgpack(const basic_json& j, detail::output_adapter<char> o)
    {
        binary_writer<char>(o).write_msgpack(j);
    }

    /// @brief create a UBJSON serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_ubjson/
    static std::vector<std::uint8_t> to_ubjson(const basic_json& j,
            const bool use_size = false,
            const bool use_type = false)
    {
        std::vector<std::uint8_t> result;
        to_ubjson(j, result, use_size, use_type);
        return result;
    }

    /// @brief create a UBJSON serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_ubjson/
    static void to_ubjson(const basic_json& j, detail::output_adapter<std::uint8_t> o,
                          const bool use_size = false, const bool use_type = false)
    {
        binary_writer<std::uint8_t>(o).write_ubjson(j, use_size, use_type);
    }

    /// @brief create a UBJSON serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_ubjson/
    static void to_ubjson(const basic_json& j, detail::output_adapter<char> o,
                          const bool use_size = false, const bool use_type = false)
    {
        binary_writer<char>(o).write_ubjson(j, use_size, use_type);
    }

    /// @brief create a BJData serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_bjdata/
    static std::vector<std::uint8_t> to_bjdata(const basic_json& j,
            const bool use_size = false,
            const bool use_type = false)
    {
        std::vector<std::uint8_t> result;
        to_bjdata(j, result, use_size, use_type);
        return result;
    }

    /// @brief create a BJData serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_bjdata/
    static void to_bjdata(const basic_json& j, detail::output_adapter<std::uint8_t> o,
                          const bool use_size = false, const bool use_type = false)
    {
        binary_writer<std::uint8_t>(o).write_ubjson(j, use_size, use_type, true, true);
    }

    /// @brief create a BJData serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_bjdata/
    static void to_bjdata(const basic_json& j, detail::output_adapter<char> o,
                          const bool use_size = false, const bool use_type = false)
    {
        binary_writer<char>(o).write_ubjson(j, use_size, use_type, true, true);
    }

    /// @brief create a BSON serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_bson/
    static std::vector<std::uint8_t> to_bson(const basic_json& j)
    {
        std::vector<std::uint8_t> result;
        to_bson(j, result);
        return result;
    }

    /// @brief create a BSON serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_bson/
    static void to_bson(const basic_json& j, detail::output_adapter<std::uint8_t> o)
    {
        binary_writer<std::uint8_t>(o).write_bson(j);
    }

    /// @brief create a BSON serialization of a given JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/to_bson/
    static void to_bson(const basic_json& j, detail::output_adapter<char> o)
    {
        binary_writer<char>(o).write_bson(j);
    }

    /// @brief create a JSON value from an input in CBOR format
    /// @sa https://json.nlohmann.me/api/basic_json/from_cbor/
    template<typename InputType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_cbor(InputType&& i,
                                const bool strict = true,
                                const bool allow_exceptions = true,
                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::forward<InputType>(i));
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::cbor).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
        return res ? result : basic_json(value_t::discarded);
    }

    /// @brief create a JSON value from an input in CBOR format
    /// @sa https://json.nlohmann.me/api/basic_json/from_cbor/
    template<typename IteratorType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_cbor(IteratorType first, IteratorType last,
                                const bool strict = true,
                                const bool allow_exceptions = true,
                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::move(first), std::move(last));
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::cbor).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
        return res ? result : basic_json(value_t::discarded);
    }

    template<typename T>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len))
    static basic_json from_cbor(const T* ptr, std::size_t len,
                                const bool strict = true,
                                const bool allow_exceptions = true,
                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
    {
        return from_cbor(ptr, ptr + len, strict, allow_exceptions, tag_handler);
    }

    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len))
    static basic_json from_cbor(detail::span_input_adapter&& i,
                                const bool strict = true,
                                const bool allow_exceptions = true,
                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = i.get();
        // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::cbor).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
        return res ? result : basic_json(value_t::discarded);
    }

    /// @brief create a JSON value from an input in MessagePack format
    /// @sa https://json.nlohmann.me/api/basic_json/from_msgpack/
    template<typename InputType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_msgpack(InputType&& i,
                                   const bool strict = true,
                                   const bool allow_exceptions = true)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::forward<InputType>(i));
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::msgpack).sax_parse(input_format_t::msgpack, &sdp, strict);
        return res ? result : basic_json(value_t::discarded);
    }

    /// @brief create a JSON value from an input in MessagePack format
    /// @sa https://json.nlohmann.me/api/basic_json/from_msgpack/
    template<typename IteratorType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_msgpack(IteratorType first, IteratorType last,
                                   const bool strict = true,
                                   const bool allow_exceptions = true)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::move(first), std::move(last));
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::msgpack).sax_parse(input_format_t::msgpack, &sdp, strict);
        return res ? result : basic_json(value_t::discarded);
    }

    template<typename T>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len))
    static basic_json from_msgpack(const T* ptr, std::size_t len,
                                   const bool strict = true,
                                   const bool allow_exceptions = true)
    {
        return from_msgpack(ptr, ptr + len, strict, allow_exceptions);
    }

    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len))
    static basic_json from_msgpack(detail::span_input_adapter&& i,
                                   const bool strict = true,
                                   const bool allow_exceptions = true)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = i.get();
        // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::msgpack).sax_parse(input_format_t::msgpack, &sdp, strict);
        return res ? result : basic_json(value_t::discarded);
    }

    /// @brief create a JSON value from an input in UBJSON format
    /// @sa https://json.nlohmann.me/api/basic_json/from_ubjson/
    template<typename InputType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_ubjson(InputType&& i,
                                  const bool strict = true,
                                  const bool allow_exceptions = true)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::forward<InputType>(i));
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::ubjson).sax_parse(input_format_t::ubjson, &sdp, strict);
        return res ? result : basic_json(value_t::discarded);
    }

    /// @brief create a JSON value from an input in UBJSON format
    /// @sa https://json.nlohmann.me/api/basic_json/from_ubjson/
    template<typename IteratorType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_ubjson(IteratorType first, IteratorType last,
                                  const bool strict = true,
                                  const bool allow_exceptions = true)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::move(first), std::move(last));
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::ubjson).sax_parse(input_format_t::ubjson, &sdp, strict);
        return res ? result : basic_json(value_t::discarded);
    }

    template<typename T>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len))
    static basic_json from_ubjson(const T* ptr, std::size_t len,
                                  const bool strict = true,
                                  const bool allow_exceptions = true)
    {
        return from_ubjson(ptr, ptr + len, strict, allow_exceptions);
    }

    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len))
    static basic_json from_ubjson(detail::span_input_adapter&& i,
                                  const bool strict = true,
                                  const bool allow_exceptions = true)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = i.get();
        // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::ubjson).sax_parse(input_format_t::ubjson, &sdp, strict);
        return res ? result : basic_json(value_t::discarded);
    }

    /// @brief create a JSON value from an input in BJData format
    /// @sa https://json.nlohmann.me/api/basic_json/from_bjdata/
    template<typename InputType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_bjdata(InputType&& i,
                                  const bool strict = true,
                                  const bool allow_exceptions = true)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::forward<InputType>(i));
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::bjdata).sax_parse(input_format_t::bjdata, &sdp, strict);
        return res ? result : basic_json(value_t::discarded);
    }

    /// @brief create a JSON value from an input in BJData format
    /// @sa https://json.nlohmann.me/api/basic_json/from_bjdata/
    template<typename IteratorType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_bjdata(IteratorType first, IteratorType last,
                                  const bool strict = true,
                                  const bool allow_exceptions = true)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::move(first), std::move(last));
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::bjdata).sax_parse(input_format_t::bjdata, &sdp, strict);
        return res ? result : basic_json(value_t::discarded);
    }

    /// @brief create a JSON value from an input in BSON format
    /// @sa https://json.nlohmann.me/api/basic_json/from_bson/
    template<typename InputType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_bson(InputType&& i,
                                const bool strict = true,
                                const bool allow_exceptions = true)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::forward<InputType>(i));
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::bson).sax_parse(input_format_t::bson, &sdp, strict);
        return res ? result : basic_json(value_t::discarded);
    }

    /// @brief create a JSON value from an input in BSON format
    /// @sa https://json.nlohmann.me/api/basic_json/from_bson/
    template<typename IteratorType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_bson(IteratorType first, IteratorType last,
                                const bool strict = true,
                                const bool allow_exceptions = true)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::move(first), std::move(last));
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::bson).sax_parse(input_format_t::bson, &sdp, strict);
        return res ? result : basic_json(value_t::discarded);
    }

    template<typename T>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len))
    static basic_json from_bson(const T* ptr, std::size_t len,
                                const bool strict = true,
                                const bool allow_exceptions = true)
    {
        return from_bson(ptr, ptr + len, strict, allow_exceptions);
    }

    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len))
    static basic_json from_bson(detail::span_input_adapter&& i,
                                const bool strict = true,
                                const bool allow_exceptions = true)
    {
        basic_json result;
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = i.get();
        // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
        const bool res = binary_reader<decltype(ia)>(std::move(ia), input_format_t::bson).sax_parse(input_format_t::bson, &sdp, strict);
        return res ? result : basic_json(value_t::discarded);
    }
    /// @}

    //////////////////////////
    // JSON Pointer support //
    //////////////////////////

    /// @name JSON Pointer functions
    /// @{

    /// @brief access specified element via JSON Pointer
    /// @sa https://json.nlohmann.me/api/basic_json/operator%5B%5D/
    reference operator[](const json_pointer& ptr)
    {
        return ptr.get_unchecked(this);
    }

    template<typename BasicJsonType, detail::enable_if_t<detail::is_basic_json<BasicJsonType>::value, int> = 0>
    JSON_HEDLEY_DEPRECATED_FOR(3.11.0, basic_json::json_pointer or nlohmann::json_pointer<basic_json::string_t>) // NOLINT(readability/alt_tokens)
    reference operator[](const ::nlohmann::json_pointer<BasicJsonType>& ptr)
    {
        return ptr.get_unchecked(this);
    }

    /// @brief access specified element via JSON Pointer
    /// @sa https://json.nlohmann.me/api/basic_json/operator%5B%5D/
    const_reference operator[](const json_pointer& ptr) const
    {
        return ptr.get_unchecked(this);
    }

    template<typename BasicJsonType, detail::enable_if_t<detail::is_basic_json<BasicJsonType>::value, int> = 0>
    JSON_HEDLEY_DEPRECATED_FOR(3.11.0, basic_json::json_pointer or nlohmann::json_pointer<basic_json::string_t>) // NOLINT(readability/alt_tokens)
    const_reference operator[](const ::nlohmann::json_pointer<BasicJsonType>& ptr) const
    {
        return ptr.get_unchecked(this);
    }

    /// @brief access specified element via JSON Pointer
    /// @sa https://json.nlohmann.me/api/basic_json/at/
    reference at(const json_pointer& ptr)
    {
        return ptr.get_checked(this);
    }

    template<typename BasicJsonType, detail::enable_if_t<detail::is_basic_json<BasicJsonType>::value, int> = 0>
    JSON_HEDLEY_DEPRECATED_FOR(3.11.0, basic_json::json_pointer or nlohmann::json_pointer<basic_json::string_t>) // NOLINT(readability/alt_tokens)
    reference at(const ::nlohmann::json_pointer<BasicJsonType>& ptr)
    {
        return ptr.get_checked(this);
    }

    /// @brief access specified element via JSON Pointer
    /// @sa https://json.nlohmann.me/api/basic_json/at/
    const_reference at(const json_pointer& ptr) const
    {
        return ptr.get_checked(this);
    }

    template<typename BasicJsonType, detail::enable_if_t<detail::is_basic_json<BasicJsonType>::value, int> = 0>
    JSON_HEDLEY_DEPRECATED_FOR(3.11.0, basic_json::json_pointer or nlohmann::json_pointer<basic_json::string_t>) // NOLINT(readability/alt_tokens)
    const_reference at(const ::nlohmann::json_pointer<BasicJsonType>& ptr) const
    {
        return ptr.get_checked(this);
    }

    /// @brief return flattened JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/flatten/
    basic_json flatten() const
    {
        basic_json result(value_t::object);
        json_pointer::flatten("", *this, result);
        return result;
    }

    /// @brief unflatten a previously flattened JSON value
    /// @sa https://json.nlohmann.me/api/basic_json/unflatten/
    basic_json unflatten() const
    {
        return json_pointer::unflatten(*this);
    }

    /// @}

    //////////////////////////
    // JSON Patch functions //
    //////////////////////////

    /// @name JSON Patch functions
    /// @{

    /// @brief applies a JSON patch in-place without copying the object
    /// @sa https://json.nlohmann.me/api/basic_json/patch/
    void patch_inplace(const basic_json& json_patch)
    {
        basic_json& result = *this;
        // the valid JSON Patch operations
        enum class patch_operations {add, remove, replace, move, copy, test, invalid};

        const auto get_op = [](const std::string & op)
        {
            if (op == "add")
            {
                return patch_operations::add;
            }
            if (op == "remove")
            {
                return patch_operations::remove;
            }
            if (op == "replace")
            {
                return patch_operations::replace;
            }
            if (op == "move")
            {
                return patch_operations::move;
            }
            if (op == "copy")
            {
                return patch_operations::copy;
            }
            if (op == "test")
            {
                return patch_operations::test;
            }

            return patch_operations::invalid;
        };

        // wrapper for "add" operation; add value at ptr
        const auto operation_add = [&result](json_pointer & ptr, basic_json val)
        {
            // adding to the root of the target document means replacing it
            if (ptr.empty())
            {
                result = val;
                return;
            }

            // make sure the top element of the pointer exists
            json_pointer const top_pointer = ptr.top();
            if (top_pointer != ptr)
            {
                result.at(top_pointer);
            }

            // get reference to parent of JSON pointer ptr
            const auto last_path = ptr.back();
            ptr.pop_back();
            // parent must exist when performing patch add per RFC6902 specs
            basic_json& parent = result.at(ptr);

            switch (parent.m_data.m_type)
            {
                case value_t::null:
                case value_t::object:
                {
                    // use operator[] to add value
                    parent[last_path] = val;
                    break;
                }

                case value_t::array:
                {
                    if (last_path == "-")
                    {
                        // special case: append to back
                        parent.push_back(val);
                    }
                    else
                    {
                        const auto idx = json_pointer::template array_index<basic_json_t>(last_path);
                        if (JSON_HEDLEY_UNLIKELY(idx > parent.size()))
                        {
                            // avoid undefined behavior
                            JSON_THROW(out_of_range::create(401, detail::concat("array index ", std::to_string(idx), " is out of range"), &parent));
                        }

                        // default case: insert add offset
                        parent.insert(parent.begin() + static_cast<difference_type>(idx), val);
                    }
                    break;
                }

                // if there exists a parent it cannot be primitive
                case value_t::string: // LCOV_EXCL_LINE
                case value_t::boolean: // LCOV_EXCL_LINE
                case value_t::number_integer: // LCOV_EXCL_LINE
                case value_t::number_unsigned: // LCOV_EXCL_LINE
                case value_t::number_float: // LCOV_EXCL_LINE
                case value_t::binary: // LCOV_EXCL_LINE
                case value_t::discarded: // LCOV_EXCL_LINE
                default:            // LCOV_EXCL_LINE
                    JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
            }
        };

        // wrapper for "remove" operation; remove value at ptr
        const auto operation_remove = [this, & result](json_pointer & ptr)
        {
            // get reference to parent of JSON pointer ptr
            const auto last_path = ptr.back();
            ptr.pop_back();
            basic_json& parent = result.at(ptr);

            // remove child
            if (parent.is_object())
            {
                // perform range check
                auto it = parent.find(last_path);
                if (JSON_HEDLEY_LIKELY(it != parent.end()))
                {
                    parent.erase(it);
                }
                else
                {
                    JSON_THROW(out_of_range::create(403, detail::concat("key '", last_path, "' not found"), this));
                }
            }
            else if (parent.is_array())
            {
                // note erase performs range check
                parent.erase(json_pointer::template array_index<basic_json_t>(last_path));
            }
        };

        // type check: top level value must be an array
        if (JSON_HEDLEY_UNLIKELY(!json_patch.is_array()))
        {
            JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects", &json_patch));
        }

        // iterate and apply the operations
        for (const auto& val : json_patch)
        {
            // wrapper to get a value for an operation
            const auto get_value = [&val](const std::string & op,
                                          const std::string & member,
                                          bool string_type) -> basic_json &
            {
                // find value
                auto it = val.m_data.m_value.object->find(member);

                // context-sensitive error message
                const auto error_msg = (op == "op") ? "operation" : detail::concat("operation '", op, '\''); // NOLINT(bugprone-unused-local-non-trivial-variable)

                // check if desired value is present
                if (JSON_HEDLEY_UNLIKELY(it == val.m_data.m_value.object->end()))
                {
                    // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
                    JSON_THROW(parse_error::create(105, 0, detail::concat(error_msg, " must have member '", member, "'"), &val));
                }

                // check if result is of type string
                if (JSON_HEDLEY_UNLIKELY(string_type && !it->second.is_string()))
                {
                    // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
                    JSON_THROW(parse_error::create(105, 0, detail::concat(error_msg, " must have string member '", member, "'"), &val));
                }

                // no error: return value
                return it->second;
            };

            // type check: every element of the array must be an object
            if (JSON_HEDLEY_UNLIKELY(!val.is_object()))
            {
                JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects", &val));
            }

            // collect mandatory members
            const auto op = get_value("op", "op", true).template get<std::string>();
            const auto path = get_value(op, "path", true).template get<std::string>();
            json_pointer ptr(path);

            switch (get_op(op))
            {
                case patch_operations::add:
                {
                    operation_add(ptr, get_value("add", "value", false));
                    break;
                }

                case patch_operations::remove:
                {
                    operation_remove(ptr);
                    break;
                }

                case patch_operations::replace:
                {
                    // the "path" location must exist - use at()
                    result.at(ptr) = get_value("replace", "value", false);
                    break;
                }

                case patch_operations::move:
                {
                    const auto from_path = get_value("move", "from", true).template get<std::string>();
                    json_pointer from_ptr(from_path);

                    // the "from" location must exist - use at()
                    basic_json const v = result.at(from_ptr);

                    // The move operation is functionally identical to a
                    // "remove" operation on the "from" location, followed
                    // immediately by an "add" operation at the target
                    // location with the value that was just removed.
                    operation_remove(from_ptr);
                    operation_add(ptr, v);
                    break;
                }

                case patch_operations::copy:
                {
                    const auto from_path = get_value("copy", "from", true).template get<std::string>();
                    const json_pointer from_ptr(from_path);

                    // the "from" location must exist - use at()
                    basic_json const v = result.at(from_ptr);

                    // The copy is functionally identical to an "add"
                    // operation at the target location using the value
                    // specified in the "from" member.
                    operation_add(ptr, v);
                    break;
                }

                case patch_operations::test:
                {
                    bool success = false;
                    JSON_TRY
                    {
                        // check if "value" matches the one at "path"
                        // the "path" location must exist - use at()
                        success = (result.at(ptr) == get_value("test", "value", false));
                    }
                    JSON_INTERNAL_CATCH (out_of_range&)
                    {
                        // ignore out of range errors: success remains false
                    }

                    // throw an exception if test fails
                    if (JSON_HEDLEY_UNLIKELY(!success))
                    {
                        JSON_THROW(other_error::create(501, detail::concat("unsuccessful: ", val.dump()), &val));
                    }

                    break;
                }

                case patch_operations::invalid:
                default:
                {
                    // op must be "add", "remove", "replace", "move", "copy", or
                    // "test"
                    JSON_THROW(parse_error::create(105, 0, detail::concat("operation value '", op, "' is invalid"), &val));
                }
            }
        }
    }

    /// @brief applies a JSON patch to a copy of the current object
    /// @sa https://json.nlohmann.me/api/basic_json/patch/
    basic_json patch(const basic_json& json_patch) const
    {
        basic_json result = *this;
        result.patch_inplace(json_patch);
        return result;
    }

    /// @brief creates a diff as a JSON patch
    /// @sa https://json.nlohmann.me/api/basic_json/diff/
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json diff(const basic_json& source, const basic_json& target,
                           const std::string& path = "")
    {
        // the patch
        basic_json result(value_t::array);

        // if the values are the same, return empty patch
        if (source == target)
        {
            return result;
        }

        if (source.type() != target.type())
        {
            // different types: replace value
            result.push_back(
            {
                {"op", "replace"}, {"path", path}, {"value", target}
            });
            return result;
        }

        switch (source.type())
        {
            case value_t::array:
            {
                // first pass: traverse common elements
                std::size_t i = 0;
                while (i < source.size() && i < target.size())
                {
                    // recursive call to compare array values at index i
                    auto temp_diff = diff(source[i], target[i], detail::concat(path, '/', std::to_string(i)));
                    result.insert(result.end(), temp_diff.begin(), temp_diff.end());
                    ++i;
                }

                // We now reached the end of at least one array
                // in a second pass, traverse the remaining elements

                // remove my remaining elements
                const auto end_index = static_cast<difference_type>(result.size());
                while (i < source.size())
                {
                    // add operations in reverse order to avoid invalid
                    // indices
                    result.insert(result.begin() + end_index, object(
                    {
                        {"op", "remove"},
                        {"path", detail::concat(path, '/', std::to_string(i))}
                    }));
                    ++i;
                }

                // add other remaining elements
                while (i < target.size())
                {
                    result.push_back(
                    {
                        {"op", "add"},
                        {"path", detail::concat(path, "/-")},
                        {"value", target[i]}
                    });
                    ++i;
                }

                break;
            }

            case value_t::object:
            {
                // first pass: traverse this object's elements
                for (auto it = source.cbegin(); it != source.cend(); ++it)
                {
                    // escape the key name to be used in a JSON patch
                    const auto path_key = detail::concat(path, '/', detail::escape(it.key()));

                    if (target.find(it.key()) != target.end())
                    {
                        // recursive call to compare object values at key it
                        auto temp_diff = diff(it.value(), target[it.key()], path_key);
                        result.insert(result.end(), temp_diff.begin(), temp_diff.end());
                    }
                    else
                    {
                        // found a key that is not in o -> remove it
                        result.push_back(object(
                        {
                            {"op", "remove"}, {"path", path_key}
                        }));
                    }
                }

                // second pass: traverse other object's elements
                for (auto it = target.cbegin(); it != target.cend(); ++it)
                {
                    if (source.find(it.key()) == source.end())
                    {
                        // found a key that is not in this -> add it
                        const auto path_key = detail::concat(path, '/', detail::escape(it.key()));
                        result.push_back(
                        {
                            {"op", "add"}, {"path", path_key},
                            {"value", it.value()}
                        });
                    }
                }

                break;
            }

            case value_t::null:
            case value_t::string:
            case value_t::boolean:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::number_float:
            case value_t::binary:
            case value_t::discarded:
            default:
            {
                // both primitive type: replace value
                result.push_back(
                {
                    {"op", "replace"}, {"path", path}, {"value", target}
                });
                break;
            }
        }

        return result;
    }
    /// @}

    ////////////////////////////////
    // JSON Merge Patch functions //
    ////////////////////////////////

    /// @name JSON Merge Patch functions
    /// @{

    /// @brief applies a JSON Merge Patch
    /// @sa https://json.nlohmann.me/api/basic_json/merge_patch/
    void merge_patch(const basic_json& apply_patch)
    {
        if (apply_patch.is_object())
        {
            if (!is_object())
            {
                *this = object();
            }
            for (auto it = apply_patch.begin(); it != apply_patch.end(); ++it)
            {
                if (it.value().is_null())
                {
                    erase(it.key());
                }
                else
                {
                    operator[](it.key()).merge_patch(it.value());
                }
            }
        }
        else
        {
            *this = apply_patch;
        }
    }

    /// @}
};

/// @brief user-defined to_string function for JSON values
/// @sa https://json.nlohmann.me/api/basic_json/to_string/
NLOHMANN_BASIC_JSON_TPL_DECLARATION
std::string to_string(const NLOHMANN_BASIC_JSON_TPL& j)
{
    return j.dump();
}

inline namespace literals
{
inline namespace json_literals
{

/// @brief user-defined string literal for JSON values
/// @sa https://json.nlohmann.me/api/basic_json/operator_literal_json/
JSON_HEDLEY_NON_NULL(1)
#if !defined(JSON_HEDLEY_GCC_VERSION) || JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0)
    inline nlohmann::json operator ""_json(const char* s, std::size_t n)
#else
    inline nlohmann::json operator "" _json(const char* s, std::size_t n)
#endif
{
    return nlohmann::json::parse(s, s + n);
}

/// @brief user-defined string literal for JSON pointer
/// @sa https://json.nlohmann.me/api/basic_json/operator_literal_json_pointer/
JSON_HEDLEY_NON_NULL(1)
#if !defined(JSON_HEDLEY_GCC_VERSION) || JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0)
    inline nlohmann::json::json_pointer operator ""_json_pointer(const char* s, std::size_t n)
#else
    inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n)
#endif
{
    return nlohmann::json::json_pointer(std::string(s, n));
}

}  // namespace json_literals
}  // namespace literals
NLOHMANN_JSON_NAMESPACE_END

///////////////////////
// nonmember support //
///////////////////////

namespace std // NOLINT(cert-dcl58-cpp)
{

/// @brief hash value for JSON objects
/// @sa https://json.nlohmann.me/api/basic_json/std_hash/
NLOHMANN_BASIC_JSON_TPL_DECLARATION
struct hash<nlohmann::NLOHMANN_BASIC_JSON_TPL> // NOLINT(cert-dcl58-cpp)
{
    std::size_t operator()(const nlohmann::NLOHMANN_BASIC_JSON_TPL& j) const
    {
        return nlohmann::detail::hash(j);
    }
};

// specialization for std::less<value_t>
template<>
struct less< ::nlohmann::detail::value_t> // do not remove the space after '<', see https://github.com/nlohmann/json/pull/679
{
    /*!
    @brief compare two value_t enum values
    @since version 3.0.0
    */
    bool operator()(::nlohmann::detail::value_t lhs,
                    ::nlohmann::detail::value_t rhs) const noexcept
    {
#if JSON_HAS_THREE_WAY_COMPARISON
        return std::is_lt(lhs <=> rhs); // *NOPAD*
#else
        return ::nlohmann::detail::operator<(lhs, rhs);
#endif
    }
};

// C++20 prohibit function specialization in the std namespace.
#ifndef JSON_HAS_CPP_20

/// @brief exchanges the values of two JSON objects
/// @sa https://json.nlohmann.me/api/basic_json/std_swap/
NLOHMANN_BASIC_JSON_TPL_DECLARATION
inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC_JSON_TPL& j2) noexcept(  // NOLINT(readability-inconsistent-declaration-parameter-name, cert-dcl58-cpp)
    is_nothrow_move_constructible<nlohmann::NLOHMANN_BASIC_JSON_TPL>::value&&                          // NOLINT(misc-redundant-expression,cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
    is_nothrow_move_assignable<nlohmann::NLOHMANN_BASIC_JSON_TPL>::value)
{
    j1.swap(j2);
}

#endif

}  // namespace std

#if JSON_USE_GLOBAL_UDLS
    #if !defined(JSON_HEDLEY_GCC_VERSION) || JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0)
        using nlohmann::literals::json_literals::operator ""_json; // NOLINT(misc-unused-using-decls,google-global-names-in-headers)
        using nlohmann::literals::json_literals::operator ""_json_pointer; //NOLINT(misc-unused-using-decls,google-global-names-in-headers)
    #else
        using nlohmann::literals::json_literals::operator "" _json; // NOLINT(misc-unused-using-decls,google-global-names-in-headers)
        using nlohmann::literals::json_literals::operator "" _json_pointer; //NOLINT(misc-unused-using-decls,google-global-names-in-headers)
    #endif
#endif

// #include <nlohmann/detail/macro_unscope.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


// restore clang diagnostic settings
#if defined(__clang__)
    #pragma clang diagnostic pop
#endif

// clean up
#undef JSON_ASSERT
#undef JSON_INTERNAL_CATCH
#undef JSON_THROW
#undef JSON_PRIVATE_UNLESS_TESTED
#undef NLOHMANN_BASIC_JSON_TPL_DECLARATION
#undef NLOHMANN_BASIC_JSON_TPL
#undef JSON_EXPLICIT
#undef NLOHMANN_CAN_CALL_STD_FUNC_IMPL
#undef JSON_INLINE_VARIABLE
#undef JSON_NO_UNIQUE_ADDRESS
#undef JSON_DISABLE_ENUM_SERIALIZATION
#undef JSON_USE_GLOBAL_UDLS

#ifndef JSON_TEST_KEEP_MACROS
    #undef JSON_CATCH
    #undef JSON_TRY
    #undef JSON_HAS_CPP_11
    #undef JSON_HAS_CPP_14
    #undef JSON_HAS_CPP_17
    #undef JSON_HAS_CPP_20
    #undef JSON_HAS_FILESYSTEM
    #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM
    #undef JSON_HAS_THREE_WAY_COMPARISON
    #undef JSON_HAS_RANGES
    #undef JSON_HAS_STATIC_RTTI
    #undef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
#endif

// #include <nlohmann/thirdparty/hedley/hedley_undef.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


#undef JSON_HEDLEY_ALWAYS_INLINE
#undef JSON_HEDLEY_ARM_VERSION
#undef JSON_HEDLEY_ARM_VERSION_CHECK
#undef JSON_HEDLEY_ARRAY_PARAM
#undef JSON_HEDLEY_ASSUME
#undef JSON_HEDLEY_BEGIN_C_DECLS
#undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE
#undef JSON_HEDLEY_CLANG_HAS_BUILTIN
#undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE
#undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE
#undef JSON_HEDLEY_CLANG_HAS_EXTENSION
#undef JSON_HEDLEY_CLANG_HAS_FEATURE
#undef JSON_HEDLEY_CLANG_HAS_WARNING
#undef JSON_HEDLEY_COMPCERT_VERSION
#undef JSON_HEDLEY_COMPCERT_VERSION_CHECK
#undef JSON_HEDLEY_CONCAT
#undef JSON_HEDLEY_CONCAT3
#undef JSON_HEDLEY_CONCAT3_EX
#undef JSON_HEDLEY_CONCAT_EX
#undef JSON_HEDLEY_CONST
#undef JSON_HEDLEY_CONSTEXPR
#undef JSON_HEDLEY_CONST_CAST
#undef JSON_HEDLEY_CPP_CAST
#undef JSON_HEDLEY_CRAY_VERSION
#undef JSON_HEDLEY_CRAY_VERSION_CHECK
#undef JSON_HEDLEY_C_DECL
#undef JSON_HEDLEY_DEPRECATED
#undef JSON_HEDLEY_DEPRECATED_FOR
#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_
#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
#undef JSON_HEDLEY_DIAGNOSTIC_POP
#undef JSON_HEDLEY_DIAGNOSTIC_PUSH
#undef JSON_HEDLEY_DMC_VERSION
#undef JSON_HEDLEY_DMC_VERSION_CHECK
#undef JSON_HEDLEY_EMPTY_BASES
#undef JSON_HEDLEY_EMSCRIPTEN_VERSION
#undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK
#undef JSON_HEDLEY_END_C_DECLS
#undef JSON_HEDLEY_FLAGS
#undef JSON_HEDLEY_FLAGS_CAST
#undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE
#undef JSON_HEDLEY_GCC_HAS_BUILTIN
#undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE
#undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE
#undef JSON_HEDLEY_GCC_HAS_EXTENSION
#undef JSON_HEDLEY_GCC_HAS_FEATURE
#undef JSON_HEDLEY_GCC_HAS_WARNING
#undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK
#undef JSON_HEDLEY_GCC_VERSION
#undef JSON_HEDLEY_GCC_VERSION_CHECK
#undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE
#undef JSON_HEDLEY_GNUC_HAS_BUILTIN
#undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE
#undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE
#undef JSON_HEDLEY_GNUC_HAS_EXTENSION
#undef JSON_HEDLEY_GNUC_HAS_FEATURE
#undef JSON_HEDLEY_GNUC_HAS_WARNING
#undef JSON_HEDLEY_GNUC_VERSION
#undef JSON_HEDLEY_GNUC_VERSION_CHECK
#undef JSON_HEDLEY_HAS_ATTRIBUTE
#undef JSON_HEDLEY_HAS_BUILTIN
#undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE
#undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS
#undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE
#undef JSON_HEDLEY_HAS_EXTENSION
#undef JSON_HEDLEY_HAS_FEATURE
#undef JSON_HEDLEY_HAS_WARNING
#undef JSON_HEDLEY_IAR_VERSION
#undef JSON_HEDLEY_IAR_VERSION_CHECK
#undef JSON_HEDLEY_IBM_VERSION
#undef JSON_HEDLEY_IBM_VERSION_CHECK
#undef JSON_HEDLEY_IMPORT
#undef JSON_HEDLEY_INLINE
#undef JSON_HEDLEY_INTEL_CL_VERSION
#undef JSON_HEDLEY_INTEL_CL_VERSION_CHECK
#undef JSON_HEDLEY_INTEL_VERSION
#undef JSON_HEDLEY_INTEL_VERSION_CHECK
#undef JSON_HEDLEY_IS_CONSTANT
#undef JSON_HEDLEY_IS_CONSTEXPR_
#undef JSON_HEDLEY_LIKELY
#undef JSON_HEDLEY_MALLOC
#undef JSON_HEDLEY_MCST_LCC_VERSION
#undef JSON_HEDLEY_MCST_LCC_VERSION_CHECK
#undef JSON_HEDLEY_MESSAGE
#undef JSON_HEDLEY_MSVC_VERSION
#undef JSON_HEDLEY_MSVC_VERSION_CHECK
#undef JSON_HEDLEY_NEVER_INLINE
#undef JSON_HEDLEY_NON_NULL
#undef JSON_HEDLEY_NO_ESCAPE
#undef JSON_HEDLEY_NO_RETURN
#undef JSON_HEDLEY_NO_THROW
#undef JSON_HEDLEY_NULL
#undef JSON_HEDLEY_PELLES_VERSION
#undef JSON_HEDLEY_PELLES_VERSION_CHECK
#undef JSON_HEDLEY_PGI_VERSION
#undef JSON_HEDLEY_PGI_VERSION_CHECK
#undef JSON_HEDLEY_PREDICT
#undef JSON_HEDLEY_PRINTF_FORMAT
#undef JSON_HEDLEY_PRIVATE
#undef JSON_HEDLEY_PUBLIC
#undef JSON_HEDLEY_PURE
#undef JSON_HEDLEY_REINTERPRET_CAST
#undef JSON_HEDLEY_REQUIRE
#undef JSON_HEDLEY_REQUIRE_CONSTEXPR
#undef JSON_HEDLEY_REQUIRE_MSG
#undef JSON_HEDLEY_RESTRICT
#undef JSON_HEDLEY_RETURNS_NON_NULL
#undef JSON_HEDLEY_SENTINEL
#undef JSON_HEDLEY_STATIC_ASSERT
#undef JSON_HEDLEY_STATIC_CAST
#undef JSON_HEDLEY_STRINGIFY
#undef JSON_HEDLEY_STRINGIFY_EX
#undef JSON_HEDLEY_SUNPRO_VERSION
#undef JSON_HEDLEY_SUNPRO_VERSION_CHECK
#undef JSON_HEDLEY_TINYC_VERSION
#undef JSON_HEDLEY_TINYC_VERSION_CHECK
#undef JSON_HEDLEY_TI_ARMCL_VERSION
#undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK
#undef JSON_HEDLEY_TI_CL2000_VERSION
#undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK
#undef JSON_HEDLEY_TI_CL430_VERSION
#undef JSON_HEDLEY_TI_CL430_VERSION_CHECK
#undef JSON_HEDLEY_TI_CL6X_VERSION
#undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK
#undef JSON_HEDLEY_TI_CL7X_VERSION
#undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK
#undef JSON_HEDLEY_TI_CLPRU_VERSION
#undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK
#undef JSON_HEDLEY_TI_VERSION
#undef JSON_HEDLEY_TI_VERSION_CHECK
#undef JSON_HEDLEY_UNAVAILABLE
#undef JSON_HEDLEY_UNLIKELY
#undef JSON_HEDLEY_UNPREDICTABLE
#undef JSON_HEDLEY_UNREACHABLE
#undef JSON_HEDLEY_UNREACHABLE_RETURN
#undef JSON_HEDLEY_VERSION
#undef JSON_HEDLEY_VERSION_DECODE_MAJOR
#undef JSON_HEDLEY_VERSION_DECODE_MINOR
#undef JSON_HEDLEY_VERSION_DECODE_REVISION
#undef JSON_HEDLEY_VERSION_ENCODE
#undef JSON_HEDLEY_WARNING
#undef JSON_HEDLEY_WARN_UNUSED_RESULT
#undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG
#undef JSON_HEDLEY_FALL_THROUGH


#endif  // INCLUDE_NLOHMANN_JSON_HPP_


================================================
FILE: archive/third_party/nlohmann/json_fwd.hpp
================================================
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT

#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_
#define INCLUDE_NLOHMANN_JSON_FWD_HPP_

#include <cstdint> // int64_t, uint64_t
#include <map> // map
#include <memory> // allocator
#include <string> // string
#include <vector> // vector

// #include <nlohmann/detail/abi_macros.hpp>
//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT


// This file contains all macro definitions affecting or depending on the ABI

#ifndef JSON_SKIP_LIBRARY_VERSION_CHECK
    #if defined(NLOHMANN_JSON_VERSION_MAJOR) && defined(NLOHMANN_JSON_VERSION_MINOR) && defined(NLOHMANN_JSON_VERSION_PATCH)
        #if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 3
            #warning "Already included a different version of the library!"
        #endif
    #endif
#endif

#define NLOHMANN_JSON_VERSION_MAJOR 3   // NOLINT(modernize-macro-to-enum)
#define NLOHMANN_JSON_VERSION_MINOR 11  // NOLINT(modernize-macro-to-enum)
#define NLOHMANN_JSON_VERSION_PATCH 3   // NOLINT(modernize-macro-to-enum)

#ifndef JSON_DIAGNOSTICS
    #define JSON_DIAGNOSTICS 0
#endif

#ifndef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
    #define JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON 0
#endif

#if JSON_DIAGNOSTICS
    #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS _diag
#else
    #define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS
#endif

#if JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
    #define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON _ldvcmp
#else
    #define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON
#endif

#ifndef NLOHMANN_JSON_NAMESPACE_NO_VERSION
    #define NLOHMANN_JSON_NAMESPACE_NO_VERSION 0
#endif

// Construct the namespace ABI tags component
#define NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b) json_abi ## a ## b
#define NLOHMANN_JSON_ABI_TAGS_CONCAT(a, b) \
    NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b)

#define NLOHMANN_JSON_ABI_TAGS                                       \
    NLOHMANN_JSON_ABI_TAGS_CONCAT(                                   \
            NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS,                       \
            NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON)

// Construct the namespace version component
#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch) \
    _v ## major ## _ ## minor ## _ ## patch
#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(major, minor, patch) \
    NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch)

#if NLOHMANN_JSON_NAMESPACE_NO_VERSION
#define NLOHMANN_JSON_NAMESPACE_VERSION
#else
#define NLOHMANN_JSON_NAMESPACE_VERSION                                 \
    NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(NLOHMANN_JSON_VERSION_MAJOR, \
                                           NLOHMANN_JSON_VERSION_MINOR, \
                                           NLOHMANN_JSON_VERSION_PATCH)
#endif

// Combine namespace components
#define NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b) a ## b
#define NLOHMANN_JSON_NAMESPACE_CONCAT(a, b) \
    NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b)

#ifndef NLOHMANN_JSON_NAMESPACE
#define NLOHMANN_JSON_NAMESPACE               \
    nlohmann::NLOHMANN_JSON_NAMESPACE_CONCAT( \
            NLOHMANN_JSON_ABI_TAGS,           \
            NLOHMANN_JSON_NAMESPACE_VERSION)
#endif

#ifndef NLOHMANN_JSON_NAMESPACE_BEGIN
#define NLOHMANN_JSON_NAMESPACE_BEGIN                \
    namespace nlohmann                               \
    {                                                \
    inline namespace NLOHMANN_JSON_NAMESPACE_CONCAT( \
                NLOHMANN_JSON_ABI_TAGS,              \
                NLOHMANN_JSON_NAMESPACE_VERSION)     \
    {
#endif

#ifndef NLOHMANN_JSON_NAMESPACE_END
#define NLOHMANN_JSON_NAMESPACE_END                                     \
    }  /* namespace (inline namespace) NOLINT(readability/namespace) */ \
    }  // namespace nlohmann
#endif


/*!
@brief namespace for Niels Lohmann
@see https://github.com/nlohmann
@since version 1.0.0
*/
NLOHMANN_JSON_NAMESPACE_BEGIN

/*!
@brief default JSONSerializer template argument

This serializer ignores the template arguments and uses ADL
([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl))
for serialization.
*/
template<typename T = void, typename SFINAE = void>
struct adl_serializer;

/// a class to store JSON values
/// @sa https://json.nlohmann.me/api/basic_json/
template<template<typename U, typename V, typename... Args> class ObjectType =
         std::map,
         template<typename U, typename... Args> class ArrayType = std::vector,
         class StringType = std::string, class BooleanType = bool,
         class NumberIntegerType = std::int64_t,
         class NumberUnsignedType = std::uint64_t,
         class NumberFloatType = double,
         template<typename U> class AllocatorType = std::allocator,
         template<typename T, typename SFINAE = void> class JSONSerializer =
         adl_serializer,
         class BinaryType = std::vector<std::uint8_t>, // cppcheck-suppress syntaxError
         class CustomBaseClass = void>
class basic_json;

/// @brief JSON Pointer defines a string syntax for identifying a specific value within a JSON document
/// @sa https://json.nlohmann.me/api/json_pointer/
template<typename RefStringType>
class json_pointer;

/*!
@brief default specialization
@sa https://json.nlohmann.me/api/json/
*/
using json = basic_json<>;

/// @brief a minimal map-like container that preserves insertion order
/// @sa https://json.nlohmann.me/api/ordered_map/
template<class Key, class T, class IgnoredLess, class Allocator>
struct ordered_map;

/// @brief specialization that maintains the insertion order of object keys
/// @sa https://json.nlohmann.me/api/ordered_json/
using ordered_json = basic_json<nlohmann::ordered_map>;

NLOHMANN_JSON_NAMESPACE_END

#endif  // INCLUDE_NLOHMANN_JSON_FWD_HPP_


================================================
FILE: book.toml
================================================
[book]
authors = ["kvcache-ai"]
language = "zh-CN"
title = "Ktransformers"
src = "doc"

[output.html]
git-repository-url = "https://github.com/kvcache-ai/ktransformers"
edit-url-template = "https://github.com/kvcache-ai/ktransformers/edit/main/{path}"

[output.html.playground]
editable = true
copy-js = true
# line-numbers = true

[output.html.fold]
enable = true
level = 0

================================================
FILE: doc/SUMMARY.md
================================================
# Ktransformers

[Introduction](./README.md)
# Install & Usage
- [For kt-kernel](en/kt-kernel/kt-kernel_intro.md)
- [For kt-sft](en/SFT/KTransformers-Fine-Tuning_User-Guide.md)

# Tutorial 
- [kt-sft part](en/SFT/README.md)
  - [Injection Tutorial](en/SFT/injection_tutorial.md)
  - [kt-sft developer tech notes](en/SFT/KTransformers-Fine-Tuning_Developer-Technical-Notes.md)
  - [DPO tutorial](en/SFT/DPO_tutorial.md)
  <!-- - [Multi-GPU Tutorial](en/multi-gpu-tutorial.md) -->
  <!-- - [Use FP8 GPU Kernel](en/fp8_kernel.md) -->
  <!-- - [Use AMD GPU](en/ROCm.md) -->
<!-- - [Deepseek-R1/V3 Show Case/Tutorial](en/DeepseekR1_V3_tutorial.md) -->
<!-- - [Why KTransformers So Fast](en/deepseek-v2-injection.md) -->
<!-- # For Developer
- [Makefile Usage](en/makefile_usage.md) -->
- [kt-kernel part](en/kt-kernel/README.md)
  - [kt-cli](en/kt-kernel/kt-cli.md)
# FAQ
- [FAQ](en/FAQ.md)
<!-- # V3 Reproduction
- [Success List](en/V3-success.md)
# Benchmark
- [Benchmark](en/benchmark.md) -->


================================================
FILE: doc/basic/note1.md
================================================
# basic-first20


================================================
FILE: doc/basic/note2.md
================================================
# basic-data_structure


================================================
FILE: doc/en/AMX.md
================================================
# Qwen 3 + KTransformers 0.3 (+AMX) = AI Workstation/PC
Following DeepSeek-V3/R1, LLaMa-4, and Kimi-VL, Qwen has also released an impressive MoE model—undoubtedly, this year belongs to MoE. As a low-barrier inference system for running MoE models in local heterogeneous environments, KTransformers naturally joins the party. Thanks to the support of the Qwen team, we completed Day 0 support for the entire Qwen 3 series of MoE models. At the same time, we took this opportunity to open-source the long-awaited preliminary version of our AMX high-performance operators (BF16, Int8; an int4 variant is coming soon), officially advancing to version 0.3.

What excites me most about Qwen3MoE is that, unlike the 671 B “giant” model, its two configurations: 235B-A22 and 30B-A3B, **hit the performance sweet spots for both local workstations and consumer-grade PCs**. Accordingly, we ran benchmarks in two typical setups:

Server CPU (Xeon 4) + RTX 4090

Consumer-grade CPU (Core i9-14900KF + dual-channel DDR5-4000 MT/s) + RTX 4090

Note: Because the PC's memory has a low frequency, large capacity, and multiple sticks, it downclocks severely and only operates at 4000MT. Using higher - frequency memory can boost performance.

The results are as follows:

https://github.com/user-attachments/assets/fafe8aec-4e22-49a8-8553-59fb5c6b00a2


![Image](https://github.com/user-attachments/assets/62567aad-353b-4c6f-ab87-2ea283ff2ba2)

You can see that, thanks to the AMX instruction optimizations, we achieve up to 347 tokens/s prefill performance in the workstation scenario. On consumer-grade CPUs, we’re able to run the large model (235B-A22) and deliver smooth performance on the smaller 30B-A3B. Even in terms of resource overhead, it appears that a high-end gaming laptop can handle 30B-A3B smoothly. After talking about the concept of AIPC for so long, we can finally see its feasibility.

Here is the Qwen3MoE startup command:

``` python
# llamafile backend
python ktransformers/server/main.py --architectures Qwen3MoeForCausalLM --model_path <model_dir> --gguf_path <gguf_dir> --optimize_config_path ktransformers/optimize/optimize_rules/Qwen3Moe-serve.yaml --backend_type balance_serve
# AMX backend
python ktransformers/server/main.py --architectures Qwen3MoeForCausalLM --model_path <model_dir> --gguf_path <gguf_dir> --optimize_config_path ktransformers/optimize/optimize_rules/Qwen3Moe-serve-amx.yaml --backend_type balance_serve
```

**Note: At present, Qwen3MoE running with AMX can only read BF16 GGUF; support for loading from safetensor will be added later.**

To make it easier for everyone to understand the AMX optimizations we’ve open-sourced, we’ve prepared a brief document. We also extend our gratitude to Intel for their assistance.

# Introduction to AMX Instruction Set

Intel Advanced Matrix Extensions (AMX) are a set of specialized instruction extensions introduced for the x86 architecture starting with Sapphire Rapids (4th generation Xeon Scalable processors) and onward. AMX accelerates large-scale matrix computations at the hardware level, particularly for the compute-intensive parts of deep learning inference and machine learning workloads. By introducing the concept of Tile registers, it loads 2D sub-matrices into dedicated Tile registers and performs matrix multiply-accumulate operations at the register level, significantly improving throughput and energy efficiency.

Each CPU core contains 8 dedicated registers (tmm0–tmm7), with each register capable of holding up to 16 rows × 64 bytes of data to store 2D sub-matrices. Additionally, there is a 64-byte configuration register (TILECFG) used to describe each tmm register's number of rows, columns, and row stride.

The main AMX instructions are summarized as follows:

| Instruction Category | Instruction Names | Description |
|:---|:---|:---|
| Configuration Instructions | LDTILECFG, STTILECFG, TILERELEASE, TILEZERO | Configure/reset Tile registers and metadata |
| Load/Store Instructions | TILELOADD, TILELOADDT1, TILESTORED | Transfer data between memory and Tile registers |
| INT8 Computation Instructions | TDPBSSD, TDPBUSD, TDPBUUD, TDPBSUD | Perform multiply and accumulate operations on int8 sub-matrices within Tiles |
| BF16 Computation Instructions | TDPBF16PS | Perform multiply and accumulate operations on bfloat16 sub-matrices within Tiles |

To simplify development, Intel provides corresponding intrinsics, allowing C/C++ developers to leverage AMX's performance benefits without writing lengthy assembly code. For example:

```C++
#include <immintrin.h>

_tile_loadconfig(cfg_ptr);
_tile_loadd(tmm0, A_ptr, lda);
_tile_loadd(tmm1, B_ptr, ldb);
_tile_zero(tmm2)
_tile_dpbf16ps(tmm2, tmm0, tmm1);
_tile_stored(tmm2, C_ptr, ldc);
_tile_release();
```

The above code copies sub-matrices from memory (A_ptr, B_ptr) to Tile registers, calls the AMX BF16 compute instruction to multiply two sub-matrices, and then copies the result to memory (C_ptr).

Taking INT8 as an example, AMX can perform the multiplication of two 16×64 sub-matrices (32,768 multiply/add operations) with a single instruction in 16 CPU cycles, enabling each core to complete 2048 multiply/add operations per cycle — 8 times the performance of AVX-512. On an Intel Xeon 4 CPU, a single core can theoretically provide 4 TOPS of compute power, making it highly suitable for compute-intensive tasks on the CPU.

<p align="center">
  <picture>
    <img alt="amx_intro" src="../assets/amx_intro.png" width=60%>
  </picture>
</p>


# AMX Kernel in KTransformers

Before version v0.3, KTransformers performed CPU matrix multiplications based on operators provided by llamafile. Unfortunately, llamafile's implementation had not yet been optimized for the AMX instruction set. This resulted in performance bottlenecks, even in strong hardware environments (such as Xeon 4th Gen + 4090), where inference speeds for large models like DeepSeek-V3 reached only 91 tokens/s during the prefill phase. The CPU thus remained a significant bottleneck. In long prompt scenarios, such performance is clearly unsatisfactory. To fully unleash CPU potential, we introduced a brand-new AMX optimization path along with multiple technical improvements in v0.3.

## 1. AMX Tiling-aware Memory Layout

AMX provides a high-throughput Tile register computation model, reducing instruction count and boosting theoretical throughput through coarse-grained matrix operations. However, to truly exploit AMX's potential, memory access efficiency is critical: because AMX transfers entire Tiles at once, misaligned Tiles and chaotic access patterns can cause severe cache misses, nullifying throughput gains.

Thus, in v0.3, we stopped directly memory-mapping GGUF-format files and introduced AMX Tiling-aware memory preprocessing during model loading. Specifically, expert weight matrices in MoE models are pre-rearranged into Tile-friendly sub-matrices whose shapes precisely match AMX Tile register dimensions, eliminating dynamic transposition overhead during inference. During rearrangement, we strictly align each sub-matrix's start address to 64 bytes to avoid cache line splits, and arrange sub-matrices sequentially according to computation access patterns, maximizing L1/L2 cache hit rates using compiler and hardware sequential prefetch capabilities.

For Int8 quantized formats, we adopted Symmetric Group-wise Quantization, with each column forming a group sharing a scale factor stored separately to maintain memory alignment for Tile data.

This AMX Tiling-aware memory layout design reduces memory latency while providing optimal input conditions for downstream computation kernels.

## 2. Cache-friendly AMX Kernel

During inference, we designed around the CPU’s multi-level cache hierarchy to perform computations in-place in high-speed caches, minimizing DRAM access frequency and overhead.

<p align="center">
  <picture>
    <img alt="amx" src="../assets/amx.png" width=60%>
  </picture>
</p>

As shown in the figure, 
- ① Expert weight matrices are first column-wise partitioned into multiple tasks dynamically scheduled across threads. Input activations are shared among tasks and typically reside in the shared L3 cache due to locality.
- ② Within each task, expert weights are row-wise partitioned into blocks, with block sizes finely tuned to ensure input activations, weights, and intermediate results stay within L2 cache, avoiding DRAM access.
- ③ ④ ⑤ Each block is treated as a set of sub-matrices matching AMX Tile registers, and during Tile-level computation, input Tiles (tmm0–tmm1) and expert Tiles (tmm2–tmm3) are loaded, and four AMX multiplication instructions directly generate and accumulate products into Tile registers (tmm4–tmm7), with output activations accumulated in Tile registers or L1 cache, avoiding additional data movement.

In short, we leveraged the cache hierarchy: every data element of expert weights and output activations accesses DRAM only once, with the other accesses hitting L2 or higher caches; input activations are accessed from DRAM only once and later hit in L3 or higher caches. This significantly reduces main memory traffic and improves overall execution efficiency.

## 3. AVX-512 Kernel Adaptation for Low Arithmetic Intensity Scenarios

Although AMX is highly efficient for large-scale matrix multiplication, it performs poorly under low arithmetic intensity, such as vector-matrix operations in the decode phase. This is because dispatching AMX Tiles involves fixed instruction overhead, which becomes wasteful when the data volume is insufficient to fill a Tile, causing reduced throughput.

<p align="center">
  <picture>
    <img alt="amx_avx" src="../assets/amx_avx.png" width=60%>
  </picture>
</p>

To address this, we introduced a lightweight AVX-512 kernel as a complement. This kernel follows the same memory layout as the AMX kernel but replaces heavy AMX matrix-matrix multiplications with fine-grained AVX-512 vector-matrix multiplications, lowering latency for small matrices.

KTransformers dynamically selects between AMX and AVX-512 kernels at runtime based on arithmetic intensity: AMX kernels are automatically selected during long prompt prefill phases (where each expert handles more than 4 tokens on average), while short prompt prefill and decode phases dynamically switch to AVX-512 kernels. This ensures optimal efficiency under different arithmetic intensity conditions.

## 4. MoE Operator Fusion and Dynamic Scheduling

MoE models have many experts per layer, each requiring three matrix multiplications (Gate, Up, Down projections), leading to many small matrix multiplication tasks. Independently scheduling each small task would cause massive synchronization overhead between threads, dragging down overall inference speed.

Thus, we fused the same type of matrix computations for all experts in a layer into large unified tasks. Furthermore, as there are no data dependencies between Gate and Up projections, their computations can also be fused, ultimately consolidating a layer’s matrix multiplications into two major tasks, greatly reducing scheduling overhead.

To address load imbalance — especially during the prefill phase where expert activations can be highly skewed — we introduced a dynamic task scheduling strategy. Each matrix multiplication task is further split into multiple fine-grained sub-tasks, evenly distributed among CPU threads initially. Once a thread completes its assigned tasks, it atomically "steals" tasks from others, greatly mitigating load imbalance and achieving near-optimal CPU resource utilization.

Thanks to these optimizations, our kernel can achieve 21 TFLOPS of BF16 throughput and 35 TOPS of Int8 throughput on Xeon4 CPUs — about 4× faster than PyTorch’s general AMX kernel. For DeepSeek-V3, pairing a Xeon4 CPU with a single RTX 4090 GPU achieves 418 tokens/s end-to-end throughput, close to the performance of multi-machine, multi-GPU setups. KTransformers’ AMX kernel is the first AMX kernel specifically designed for MoE inference scenarios, significantly lowering the hardware barrier for large model deployment and enabling more developers to enjoy GPU cluster level inference experiences at lower cost.

<p align="center">
  <picture>
    <img alt="onednn_1" src="../assets/onednn_1.png" width=60%>
  </picture>
</p>

# Usage

## Checking AMX Support

Before enabling the AMX-optimized kernels, it is important to verify whether your CPU supports the AMX instruction set. You can check AMX availability with the following command:

```bash
lscpu | grep -i amx
```

If your system supports AMX, you should see output similar to:

```bash
Flags: ... amx-bf16 amx-int8 amx-tile ...
```

If no amx-related flags are found, your CPU may not support AMX, or AMX may be disabled in BIOS settings. In that case, please ensure that:
- You are using a Sapphire Rapids (Xeon 4th Gen) or newer CPU.
- AMX support is enabled in your system BIOS under CPU feature settings.

## Enabling AMX in KTransformers

KTransformers allows users to easily switch between different backends through simple YAML configuration modifications. To enable AMX, modify the injection configuration of your experts by specifying backend as AMXInt8 or AMXBF16:

```YAML
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts    # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
```

**Note:** Currently, using AMXInt8 requires reading weights from a BF16 GGUF file and performing online quantization during model loading. This may cause slightly slower load times. Future versions will provide pre-quantized weights to eliminate this overhead.

![Image](https://github.com/user-attachments/assets/7c33c410-3af9-456f-aa67-5b24e19ba680)


================================================
FILE: doc/en/DeepseekR1_V3_tutorial.md
================================================
<!-- omit in toc -->

# GPT-4/o1-level Local VSCode Copilot on a Desktop with only 24GB VRAM

- [SUMMARY](#summary)
  - [Show Case Environment](#show-case-environment)
  - [Bench Result](#bench-result)
    - [V0.2.1](#v021)
      - [Memory consumption:](#memory-consumption)
      - [Change Log](#change-log)
      - [Benchmark Results](#benchmark-results)
    - [V0.2](#v02)
      - [Settings](#settings)
      - [Memory consumption:](#memory-consumption-1)
      - [Benchmark Results](#benchmark-results-1)
    - [V0.3-Preview](#v03-preview)
      - [Settings](#settings-1)
      - [Memory consumptions:](#memory-consumptions)
      - [Benchmark results](#benchmark-results-2)
  - [How to Run](#how-to-run)
    - [v0.2.2 \& v0.2.3 longer context \& FP8 kernel](#v022--v023-longer-context--fp8-kernel)
      - [longer context](#longer-context)
      - [FP8 kernel](#fp8-kernel)
    - [V0.2 \& V0.2.1 Showcase](#v02--v021-showcase)
      - [Single socket version (32 cores)](#single-socket-version-32-cores)
      - [Dual socket version (64 cores)](#dual-socket-version-64-cores)
    - [V0.3 Showcase](#v03-showcase)
      - [Dual socket version (64 cores)](#dual-socket-version-64-cores-1)
  - [Some Explanations](#some-explanations)
  - [Next](#next)
    - [Faster](#faster)
    - [Easier](#easier)
  - [FAQ](#faq)
    - [R1 No Thinking](#r1-no-thinking)
    - [More FAQ](#more-faq)

# SUMMARY

> **Feb 10, 2025**: Support DeepseekR1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup.<br>

Hi, we're the KTransformers team (formerly known for our local CPU/GPU hybrid inference open source project with DeepSeek-V2).

We've heard your requests for DeepSeek-R1/V3 support—and we're excited to finally deliver!
Apologies for the wait, but we've been cooking up something truly amazing!

Today, we're proud to announce that we not only support DeepSeek-R1/V3, as showcased in the video below:

https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285

</p>

- **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM.
  - Prefill Speed (tokens/s):
    - KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
    - Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**.
  - Decode Speed (tokens/s):
    - KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
    - Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**.

We also give our upcoming optimizations previews, including an Intel AMX-accelerated kernel and a selective expert activation method, which will significantly enhance performance. With V0.3-preview, we achieve up to 286 tokens/s for prefill, making it up to **28× faster than llama.cpp** for local inference.
The binary distribution is available now and the source code will come ASAP! Check out the wheel package [here](https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl)

> **Feb 15, 2025**: KTransformers V0.2.1: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed （+15%) (Up to 16 Tokens/s), update docs [here](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).

We speed up the decode and prefill speed a littlt bit. The reason for the limited performance improvement mainly lies in the fact that the inference process is still constrained by the CPU's computational speed and memory bandwidth. The MLA part handled by the GPU accounts for a relatively small proportion.

Besides the improvements in speed, we've also significantly updated the documentation to enhance usability, including:<br>

- Added Multi-GPU configuration tutorial.
- Consolidated installation guide.
- Add a detailed tutorial on registering extra GPU memory with ExpertMarlin;

## Show Case Environment

We run our best performance tests (V0.2) on <br>
CPU: Intel (R) Xeon (R) Gold 6454S 1T DRAM (2 NUMA nodes) <br>
GPU: 4090D 24G VRAM <br>
Memory: standard DDR5-4800 server DRAM (1 TB), each socket with 8×DDR5-4800

## Bench Result

### V0.2.1

- Model: DeepseekV3-q4km (int4)<br>
- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S, 32 cores per socket, 2 sockets, 2 numa nodes
- GPU: 4090 24G VRAM
- We test after enough warm up

#### Memory consumption:

- Single socket: 382G DRAM, at least 14GB VRAM
- Dual socket: 1T DRAM, at least 14GB VRAM

#### Change Log

- Longer Context (from 4K to 8K for 24GB VRAM) and Slightly Faster Speed （+15%):<br>
  Integrated the highly efficient Triton MLA Kernel from the fantastic sglang project, enable much longer context length and slightly faster prefill/decode speed
- We suspect that some of the improvements come from the change of hardware platform (4090D->4090)

#### Benchmark Results

"6 experts" case is part of V0.3's preview


| Prompt               | hi (2)   | 1K (969)  | 2K (1930) | 4K (3846)               | 8K (7678) |
| -------------------- | -------- | --------- | --------- | ----------------------- | --------- |
| Output length        | 10tokens | 300tokens | 300tokens | 300tokens               | 300tokens |
| **6 experts V0.2.0** |          |           |           |                         |           |
| Prefill token/s      | 13       | 105       | 102       | 88                      | CUDA OOM  |
| decode token/s       | 16.8     | 15.4      | 14.2      | 13.0                    | CUDA OOM  |
| **6 experts V0.2.1** |          |           |           |                         |           |
| Prefill token/s      | 13       | 111       | 112.5     | 102**(1.16x speedup)**  | 101       |
| decode token/s       | 16.8     | 15.9      | 15.4      | 14.9**(1.15x speedup)** | 13.9      |
| **8 experts V0.2.1** |          |           |           |                         |           |
| Prefill token/s      | 12.2     | 88.2      | 88.5      | 81.9                    | 80        |
| Decode token/s       | 13.4     | 13.5      | 13.4      | 13.2                    | 12.4      |

### V0.2

#### Settings

- Model: DeepseekV3-q4km (int4)<br>
- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S, 32 cores per socket, 2 sockets, 2 numa nodes
- GPU: 4090D 24G VRAM
- We test after enough warm up

#### Memory consumption:

- Single socket: 382G DRAM, at least 14GB VRAM
- Dual socket: 1T DRAM, at least 14GB VRAM

#### Benchmark Results

"6 experts" case is part of V0.3's preview


| Prompt<br>(500 tokens) | Dual socket Ktrans (6 experts) | Dual socket Ktrans (8 experts) | Single socket Ktrans (6 experts) | Single socket Ktrans (8 experts) | llama.cpp (8 experts) |
| ---------------------- | ------------------------------ | ------------------------------ | -------------------------------- | -------------------------------- | --------------------- |
| Prefill token/s        | 97.32                          | 82.94                          | 65.14                            | 54.21                            | 10.31                 |
| Decode token/s         | 13.69                          | 12.208                         | 10.303                           | 8.73                             | 4.51                  |

**The highest speedup reaches up to <u>3.03x</u> in decoding and <u>9.44x</u> in prefill.**

### V0.3-Preview

#### Settings

- Model: DeepseekV3-BF16 (online quant into int8 for CPU and int4 for GPU)
- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S, 32 cores per socket, 2 socket, 2 numa nodes
- GPU: (1~4)x 4090D 24GVRAM (requires more VRAM for longer prompt)

#### Memory consumptions:

- 644GB DRAM, at least 14GB VRAM

#### Benchmark results


| Prompt length                      | 1K     | 2K     | 4K     | 8K     |
| ---------------------------------- | ------ | ------ | ------ | ------ |
| KTrans (8 experts) Prefill token/s | 185.96 | 255.26 | 252.58 | 195.62 |
| KTrans (6 experts) Prefill token/s | 203.70 | 286.55 | 271.08 | 207.20 |

**The prefill of KTrans V0.3 is up to <u>3.45x</u> times faster than KTrans V0.2, and is up to <u>27.79x</u> times faster than llama.cpp.**
**The decoding speed is the same as KTrans V0.2 (6 experts version) so it is omitted**

The main acceleration comes from

- Intel AMX instruction set and our specially designed cache friendly memory layout
- Expert selection strategy that selects fewer experts based on offline profile results of out of domain data

*From our research on DeepSeekV2, DeepSeekV3 and DeepSeekR1,
when we slightly decrease the activation experts num in inference,
the output quality doesn't change. But the speed of decoding and prefill
is speed up which is inspiring. So our showcase makes use of this finding*

## How to Run

### v0.2.4 
We provide a server script, which supports multi-concurrency functionality in version v0.2.4.

```
python ktransformers/server/main.py --model_path /mnt/data/models/DeepSeek-V3 --gguf_path /mnt/data/models/DeepSeek-V3-GGUF/DeepSeek-V3-Q4_K_M/ --cpu_infer 62 --optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-serve.yaml --port 10002 --chunk_size 256 --max_new_tokens 1024 --max_batch_size 4 --port 10002 --cache_lens 32768 --backend_type balance_serve
```
It features the following arguments:

- `--chunk_size`: Maximum number of tokens processed in a single run by the engine.
- `--cache_lens`: Total length of kvcache allocated by the scheduler. All requests share a kvcache space corresponding to 32768 tokens, and the space occupied will be released after the requests are completed.
- `--backend_type`: `balance_serve` is a multi-concurrency backend engine introduced in version v0.2.4. The original single-concurrency engine is `ktransformers`.
- `--max_batch_size`: Maximum number of requests (prefill + decode) processed in a single run by the engine. (Supported only by `balance_serve`)

### v0.2.2 & v0.2.3 longer context & FP8 kernel

#### longer context

To use this feature, [install flashinfer](https://github.com/flashinfer-ai/flashinfer) first.

Note: The latest MLA kernel in FlashInfer still has a few minor issues. They are continuously fixing them on the main branch. If you are using FlashInfer, please install it from the main source code.

If you want to use long context(longer than 20K) for prefill, enable the matrix absorption MLA during the prefill phase, which will significantly reduce the size of the kv cache. Modify yaml file like this:

```
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: True # change this to True to enable long context(prefill may slower).
```

If the VRAM is still insufficient, try reducing the `chunk_size` parameter (default is 8192) to further decrease the intermediate results during chunk prefill.

#### FP8 kernel

The DeepSeek-AI team provides FP8 safetensors for DeepSeek-R1/V3 models. We achieve performance optimization through the following works:

- **FP8 GPU Kernel Integration**: FP8 linear layer acceleration kernels integrated in KTransformers
- **Hybrid Quantization Architecture**:
  - Attention and Shared-Expert modules use FP8 precision (enhances computational accuracy)
  - Experts modules retain GGML quantization (GGUF format, reside in CPU to save GPU memory)

So those who are persuing the best performance can use the FP8 linear kernel for DeepSeek-V3/R1.

The detailed guide is [here](./fp8_kernel.md).

### V0.2 & V0.2.1 Showcase

#### Single socket version (32 cores)

Our local_chat test command is:

```shell
numactl -N 1 -m 1 python ./ktransformers/local_chat.py --model_path <your model path> --gguf_path <your gguf path>  --prompt_file <your prompt txt file>  --cpu_infer 33 --max_new_tokens 1000
<when you see chat, then press enter to load the text prompt_file>
```

`<your model path>` can be local or set from online huggingface like deepseek-ai/DeepSeek-V3. If online encounters connection problem, try use mirror (hf-mirror.com) <br>
`<your gguf path>` can also be online, but as its large we recommend you download it and quantize the model to what you want (notice it's the dir path) <br>
`--max_new_tokens 1000` is the max output token length. If you find the answer is truncated, you
can increase the number for longer answer (But be aware of OOM, and increase it will slow down the generation rate.).

The command `numactl -N 1 -m 1` aims to avoid data transfer between numa nodes<br>
Attention! If you are testing R1 and it may skip thinking. So you can add arg: `--force_think true`. This is explained in [FAQ](#faq) part

#### Dual socket version (64 cores)

Make sure before you install (use install.sh or `make dev_install`), setting the env var `USE_NUMA=1` by `export USE_NUMA=1` (if already installed, reinstall it with this env var set). You may check the doc [here](./install.md) for install details. <br>

Test Command:

```shell
# ---For those who have not installed ktransformers---
# git clone https://github.com/kvcache-ai/ktransformers.git
# cd ktransformers
# git submodule init
# git submodule update
# export USE_NUMA=1
# make dev_install # or sh ./install.sh
# ----------------------------------------------------
python ./ktransformers/local_chat.py --model_path <your model path> --gguf_path <your gguf path>  --prompt_file <your prompt txt file>  --cpu_infer 65 --max_new_tokens 1000
<when you see chat, then press enter to load the text prompt_file>
```

The parameters' meaning is the same. But As we use dual socket, we set cpu_infer to 65

### V0.3 Showcase

#### Dual socket version (64 cores)

Our local_chat test command is:

```shell
wget https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl
pip install ./ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl
python -m ktransformers.local_chat --model_path <your model path> --gguf_path <your gguf path>  --prompt_file <your prompt txt file>  --cpu_infer 65 --max_new_tokens 1000
<when you see chat, then press enter to load the text prompt_file>
```

The parameters' meaning is the same with V0.2. But As we  use dual socket, we set cpu_infer to 65

## Some Explanations

1. Also we want to make further use of our two NUMA nodes on Xeon Gold cpu.
   To avoid the cost of data transfer between nodes, we "copy" the critical matrix on
   both nodes which takes more memory consumption but accelerates the prefill and decoding process.
   But this method takes huge memory and slow when loading weights, So be patient when loading
   and monitor the memory usage. We are going to optimize this huge memory overhead. Stay tuned~ <br>
2. The command args `--cpu_infer 65` specifies how many cores to use (it's ok that it exceeds the physical number,
   but it's not the more the better. Adjust it slightly lower to your actual number of cores)<br>
3. Why CPU/GPU Hybrid Inference?
   DeepSeek's MLA operators are highly computationally intensive. While running everything on CPU is possible, offloading the heavy computations to the GPU results in a massive performance boost.
4. Where Does the Speedup Come From?

   - Expert Offload: Unlike traditional layer-based or KVCache offloading (as seen in llama.cpp), we offload the expert computation to the CPU and MLA/KVCache to GPU, aligning perfectly with DeepSeek’s architecture for optimal efficiency.
   - Intel AMX Optimization – Our AMX-accelerated kernel is meticulously tuned, running several times faster than existing llama.cpp implementations. We plan to open-source this kernel after cleansing and are considering upstream contributions to llama.cpp.
5. Why Intel CPUs?
   Intel is currently the only CPU vendor that supports AMX-like instructions, which delivers significantly better performance compared to AVX-only alternatives.

## Next

### Faster

* The FlashInfer (https://github.com/flashinfer-ai/flashinfer) project is releasing an even more efficient fused MLA operator, promising further speedups
* vLLM has explored multi-token prediction in DeepSeek-V3, and support is on our roadmap for even better performance
* We are collaborating with Intel to enhance the AMX kernel (v0.3) and optimize for Xeon6/MRDIMM

### Easier

* Official Docker images to simplify installation
* Fix the server integration for web API access
* Fix the local chat only accepting a single line prompt (currently \n begins generating prompt)
* Support for more quantization types, including the highly requested dynamic quantization from unsloth

Stay tuned for more updates!

## FAQ

### R1 No Thinking

Attention! If you are testing R1 and it may skip thinking. So you can add arg: `--force_think true`. The detail is in [FAQ](./FAQ.md) part <br>

### More FAQ

[See detail](./FAQ.md)


================================================
FILE: doc/en/Docker.md
================================================
# Docker

## Prerequisites
* Docker must be installed and running on your system.
* Create a folder to store big models & intermediate files (ex. /mnt/models)

## Images
There is a Docker image available for our project, you can pull the docker image by：
```
docker pull approachingai/ktransformers:0.2.1
```
**Notice**: In this image, we compile the ktransformers in AVX512 instuction CPUs, if your cpu not support AVX512, it is suggested to recompile and install ktransformers in the /workspace/ktransformers directory within the container.

## Building docker image locally
 - Download Dockerfile in [there](../../Dockerfile)

 - finish, execute
   ```bash
   docker build  -t approachingai/ktransformers:0.2.1 .
   ```

## Usage

Assuming you have the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) that you can use the GPU in a Docker container.
```
docker run --gpus all -v /path/to/models:/models --name ktransformers -itd approachingai/ktransformers:0.2.1
docker exec -it ktransformers /bin/bash
python -m ktransformers.local_chat  --gguf_path /models/path/to/gguf_path --model_path /models/path/to/model_path --cpu_infer 33
```

More operators you can see in the [readme](../../README.md)

================================================
FILE: doc/en/Docker_xpu.md
================================================
# Intel GPU Docker Guide (Beta)

## Prerequisites

* Docker must be installed and running on your system.
* Create a folder to store big models & intermediate files (e.g., /mnt/models)
* **Before proceeding, ensure the Intel GPU driver is installed correctly on your host:** [Installation Guide](./xpu.md#1-install-intel-gpu-driver)

---

## Building the Docker Image Locally

1. Clone the repository and navigate to the project directory:

   ```bash
   git clone https://github.com/kvcache-ai/ktransformers.git
   cd ktransformers
   ```

2. Build the Docker image using the XPU-specific [Dockerfile.xpu](../../Dockerfile.xpu):

   ```bash
   sudo http_proxy=$HTTP_PROXY \
        https_proxy=$HTTPS_PROXY \
        docker build \
          --build-arg http_proxy=$HTTP_PROXY \
          --build-arg https_proxy=$HTTPS_PROXY \
          -t kt_xpu:0.3.1 \
          -f Dockerfile.xpu \
          .
   ```

---

## Running the Container

### 1. Start the container

```bash
sudo docker run -td --privileged \
    --net=host \
    --device=/dev/dri \
    --shm-size="16g" \
    -v /path/to/models:/models \
    -e http_proxy=$HTTP_PROXY \
    -e https_proxy=$HTTPS_PROXY \
    --name ktransformers_xpu \
    kt_xpu:0.3.1
```

**Note**: Replace `/path/to/models` with your actual model directory path (e.g., `/mnt/models`).

---

### 2. Access the container

```bash
sudo docker exec -it ktransformers_xpu /bin/bash
```

---

### 3. Set required XPU environment variables (inside the container)

```bash
export SYCL_CACHE_PERSISTENT=1
export ONEAPI_DEVICE_SELECTOR=level_zero:0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
```

---

### 4. Run the sample script

```bash
python ktransformers/local_chat.py \
  --model_path deepseek-ai/DeepSeek-R1 \
  --gguf_path <path_to_gguf_files> \
  --optimize_config_path ktransformers/optimize/optimize_rules/xpu/DeepSeek-V3-Chat.yaml \
  --cpu_infer <cpu_cores + 1> \
  --device xpu \
  --max_new_tokens 200
```

**Note**:

* Replace `<path_to_gguf_files>` with the path to your GGUF model files.
* Replace `<cpu_cores + 1>` with the number of CPU cores you want to use plus one.

---

## Additional Information

For more configuration options and usage details, refer to the [project README](../../README.md). To run KTransformers natively on XPU (outside of Docker), please refer to [xpu.md](./xpu.md).


================================================
FILE: doc/en/FAQ.md
================================================
<!-- omit in toc -->
# see the issue [FAQ page](https://github.com/kvcache-ai/ktransformers/issues/1608)

================================================
FILE: doc/en/Kimi-K2-Thinking.md
================================================
# KTransformers+SGLang Inference Deployment
Please Note This is Quantization Deployment. For Native Kimi K2 Thinking deployment please refer to [here](./Kimi-K2-Thinking-Native.md).

## Installation

Step 1: Install SGLang

Install the kvcache-ai fork of SGLang (one of):
```bash
# Option A: One-click install (from ktransformers root)
./install.sh

# Option B: pip install
pip install sglang-kt
```

> **Important:** Use `sglang-kt` (kvcache-ai fork), not the official `sglang` package. Run `pip uninstall sglang` first if you have the official version installed.

Step 2: Install KTransformers CPU Kernels

The KTransformers CPU kernels (kt-kernel) provide AMX-optimized computation for hybrid inference, for detailed installation instructions and troubleshooting, refer to the official [kt-kernel installation guide](https://github.com/kvcache-ai/ktransformers/blob/main/kt-kernel/README.md).

## Download Model

Download the official KIMI weights as GPU weights.

* huggingface: https://huggingface.co/moonshotai/Kimi-K2-Thinking
* modelscope: https://modelscope.cn/models/moonshotai/Kimi-K2-Thinking

Download the AMX INT4 quantized weights from https://huggingface.co/KVCache-ai/Kimi-K2-Thinking-CPU-weight as CPU weights.

## How to start
```
python -m sglang.launch_server   --host 0.0.0.0   --port 60000   --model path/to/Kimi-K2-Thinking/   --kt-weight-path path/to/Kimi-K2-Instruct-CPU-weight/   --kt-cpuinfer 56   --kt-threadpool-count 2   --kt-num-gpu-experts 200   --kt-method AMXINT4   --attention-backend flashinfer   --trust-remote-code   --mem-fraction-static 0.98   --chunked-prefill-size 4096   --max-running-requests 37   --max-total-tokens 37000   --enable-mixed-chunk   --tensor-parallel-size 8   --enable-p2p-check   --disable-shared-experts-fusion
```
tips:

`--kt-cpuinfer`: is recommended to be set to (number of physical CPU cores - 8 (number of GPUs)).

`--kt-num-gpu-experts`: refers to the number of experts retained on GPUs, which should be adjusted according to your available GPU memory and expected KV cache space.

## Test

When testing, you need to add `--disable-radix-cache` and `--disable-chunked-prefix-cache` when starting the server.

### bench prefill
```
python -m sglang.bench_serving   --backend sglang   --host 127.0.0.1   --port 60000   --num-prompts 37 --random-input-len 1024 --random-output-len 1 --random-range-ratio 1.0 --dataset-name random
```

### bench decode
```
python -m sglang.bench_serving   --backend sglang   --host 127.0.0.1   --port 60000   --num-prompts 37 --random-input-len 10 --random-output-len 512 --random-range-ratio 1.0 --dataset-name random
```

## Performance

### System Configuration:

- GPUs: 8× NVIDIA L20
- CPU: Intel(R) Xeon(R) Gold 6454S

### Bench prefill
```
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 not set
Successful requests:                     37
Benchmark duration (s):                  65.58
Total input tokens:                      37888
Total input text tokens:                 37888
Total input vision tokens:               0
Total generated tokens:                  37
Total generated tokens (retokenized):    37
Request throughput (req/s):              0.56
Input token throughput (tok/s):          577.74
Output token throughput (tok/s):         0.56
Total token throughput (tok/s):          578.30
Concurrency:                             23.31
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   41316.50
Median E2E Latency (ms):                 41500.35
---------------Time to First Token----------------
Mean TTFT (ms):                          41316.48
Median TTFT (ms):                        41500.35
P99 TTFT (ms):                           65336.31
---------------Inter-Token Latency----------------
Mean ITL (ms):                           0.00
Median ITL (ms):                         0.00
P95 ITL (ms):                            0.00
P99 ITL (ms):                            0.00
Max ITL (ms):                            0.00
==================================================
```

### Bench decode

```
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 not set
Successful requests:                     37
Benchmark duration (s):                  412.66
Total input tokens:                      370
Total input text tokens:                 370
Total input vision tokens:               0
Total generated tokens:                  18944
Total generated tokens (retokenized):    18618
Request throughput (req/s):              0.09
Input token throughput (tok/s):          0.90
Output token throughput (tok/s):         45.91
Total token throughput (tok/s):          46.80
Concurrency:                             37.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   412620.35
Median E2E Latency (ms):                 412640.56
---------------Time to First Token----------------
Mean TTFT (ms):                          3551.87
Median TTFT (ms):                        3633.59
P99 TTFT (ms):                           3637.37
---------------Inter-Token Latency----------------
Mean ITL (ms):                           800.53
Median ITL (ms):                         797.89
P95 ITL (ms):                            840.06
P99 ITL (ms):                            864.96
Max ITL (ms):                            3044.56
==================================================
```


================================================
FILE: doc/en/Kimi-K2.5.md
================================================
# Running Kimi-K2.5 with SGLang and KT-Kernel

This tutorial demonstrates how to run Kimi-K2.5 model inference using SGLang integrated with KT-Kernel for CPU-GPU heterogeneous inference. This setup enables efficient deployment of large MoE models by offloading experts to CPU.

## Table of Contents

- [Hardware Requirements](#hardware-requirements)
- [Prerequisites](#prerequisites)
- [Step 1: Download Model Weights](#step-1-download-model-weights)
- [Step 2: Launch SGLang Server](#step-2-launch-sglang-server)
- [Step 3: Send Inference Requests](#step-3-send-inference-requests)

## Hardware Requirements

**Minimum Configuration:**
- **GPU**: NVIDIA RTX 2x4090 48GB (or equivalent with at least total 48GB VRAM available)
- **CPU**: x86 CPU with AVX512F support (e.g., Intel Sapphire Rapids)
- **RAM**: At least 600GB system memory
- **Storage**: ~600GB for model weights (native INT4 weight, same weight folder for CPU and GPU)

## Prerequisites

Before starting, ensure you have:

1. **KT-Kernel installed**:

   Note: Latest KTransformers' EPLB feature for Kimi-K2.5 will be supported soon.

```
git clone https://github.com/kvcache-ai/ktransformers.git
git submodule update --init --recursive
cd kt-kernel && ./install.sh
```

2. **SGLang installed** - Install the kvcache-ai fork of SGLang (one of):

```bash
# Option A: One-click install (from ktransformers root)
./install.sh

# Option B: pip install
pip install sglang-kt
```

> Note: You may need to reinstall cudnn: `pip install nvidia-cudnn-cu12==9.16.0.29`

3. **CUDA toolkit** - Compatible with your GPU (CUDA 12.8+ recommended)
4. **Hugging Face CLI** - For downloading models:
   
   ```bash
   pip install huggingface-hub
   ```

## Step 1: Download Model Weights

```bash
# Create a directory for models
mkdir -p /path/to/models
cd /path/to/models

# Download Kimi-K2.5 (RAW-INT4 for both CPU and GPU)
huggingface-cli download moonshotai/Kimi-K2.5 \
  --local-dir /path/to/kimi-k2.5
```

**Note:** Replace `/path/to/models` with your actual storage path throughout this tutorial.

## Step 2: Launch SGLang Server

Start the SGLang server with KT-Kernel integration for CPU-GPU heterogeneous inference.


### Launch Command (4x RTX 4090 Example)

```bash
python -m sglang.launch_server \
  --host 0.0.0.0 \
  --port 31245 \
  --model /path/to/kimi-k2.5 \
  --kt-weight-path /path/to/kimi-k2.5 \
  --kt-cpuinfer 96 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 30 \
  --kt-method RAWINT4 \
  --kt-gpu-prefill-token-threshold 400 \
  --trust-remote-code \
  --mem-fraction-static 0.94 \
  --served-model-name Kimi-K2.5 \
  --enable-mixed-chunk \
  --tensor-parallel-size 4 \
  --enable-p2p-check \
  --disable-shared-experts-fusion \
  --chunked-prefill-size 32658 \
  --max-total-tokens 50000 \
  --attention-backend flashinfer
```

It takes about 2~3 minutes to start the server.

See [KT-Kernel Parameters](https://github.com/kvcache-ai/ktransformers/tree/main/kt-kernel#kt-kernel-parameters) for detailed parameter tuning guidelines.

## Step 3: Send Inference Requests

Once the server is running, you can send inference requests using the OpenAI-compatible API.

### Basic Chat Completion Request

```bash
curl -s http://localhost:31245/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Kimi-K2.5",
    "stream": false,
    "messages": [
      {"role": "user", "content": "hi, who are you?"}
    ]
  }'
```

### Example Response

```json
{
    "id": "2a4e83f8a79b4b57b103b0f298fbaa7d",
    "object": "chat.completion",
    "created": 1769333912,
    "model": "Kimi-K2.5",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "content": " The user is asking \"hi, who are you?\" which is a simple greeting and identity question. I need to respond appropriately by introducing myself clearly and concisely.\n\nI am Kimi, a large language model trained by Moonshot AI. I should state my name, my nature (AI assistant), and my developer (Moonshot AI). I should keep it friendly and helpful.\n\nKey points to include:\n- Greet them back (\"hi\" or \"hello\")\n- State my name: Kimi\n- State what I am: an AI assistant/language model\n- Mention my developer: Moonshot AI\n- Briefly describe my purpose: to help answer questions, provide information, and assist with various tasks\n- Keep it concise but informative\n- Use a friendly, professional tone\n\nI should avoid overly technical jargon while being accurate. The response should be welcoming and set the stage for further interaction.\n\nPossible response:\n\"Hi! I'm Kimi, an AI assistant created by Moonshot AI. I'm designed to help answer questions, provide information, and assist with a wide range of tasks. How can I help you today?\"\n\nThis covers all the necessary points and invites the user to continue the conversation. </think> Hi! I'm Kimi, an AI assistant created by Moonshot AI. I'm designed to help answer questions, provide information, and assist with a wide range of tasks. How can I help you today?",
                "reasoning_content": null,
                "tool_calls": null
            },
            "logprobs": null,
            "finish_reason": "stop",
            "matched_stop": 163586
        }
    ],
    "usage": {
        "prompt_tokens": 32,
        "total_tokens": 317,
        "completion_tokens": 285,
        "prompt_tokens_details": null,
        "reasoning_tokens": 0
    },
    "metadata": {
        "weight_version": "default"
    }
}
```


================================================
FILE: doc/en/Kimi-K2.md
================================================
# Kimi-K2 Support for KTransformers

## Introduction

### Overview
We are very pleased to announce that Ktransformers now supports Kimi-K2 and Kimi-K2-0905.

On a single-socket CPU with one consumer-grade GPU, running the Q4_K_M model yields roughly 10 TPS and requires about 600 GB of DRAM.  
With a dual-socket CPU and sufficient system memory, enabling NUMA optimizations increases performance to about 14 TPS.

### Model & Resource Links

- Official Kimi-K2 Release: 
  - https://huggingface.co/collections/moonshotai/kimi-k2-6871243b990f2af5ba60617d
- GGUF Format(quantized models):
  - https://huggingface.co/KVCache-ai/Kimi-K2-Instruct-GGUF
- Official Kimi-K2-0905 Release:
  - https://huggingface.co/moonshotai/Kimi-K2-Instruct-0905
- GGUF Format(quantized models):
  - https://huggingface.co/KVCache-ai/Kimi-K2-Instruct-0905-GGUF

## Installation Guide

### 1. Resource Requirements

The model running with 384 Experts requires approximately 600 GB of memory and 14 GB of GPU memory.

### 2. Prepare Models

```bash
# download gguf
huggingface-cli download --resume-download KVCache-ai/Kimi-K2-Instruct-GGUF

```

### 3. Install ktransformers

To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).

### 4. Run Kimi-K2 Inference Server

```bash
python ktransformers/server/main.py \
  --port 10002 \
  --model_path <path_to_safetensor_config> \
  --gguf_path <path_to_gguf_files> \
  --optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-serve.yaml \
  --max_new_tokens 1024 \
  --cache_lens 32768 \
  --chunk_size 256 \
  --max_batch_size 4 \
  --backend_type balance_serve \
```

### 5. Access server

```
curl -X POST http://localhost:10002/v1/chat/completions \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "hello"}
    ],
    "model": "Kimi-K2",
    "temperature": 0.3,
    "top_p": 1.0,
    "stream": true
  }'
```


================================================
FILE: doc/en/Kllama_tutorial_DeepSeekV2Lite.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6201cdec-70f7-4c22-b988-b23ece31979d",
   "metadata": {},
   "source": [
    "<div align=\"center\">\n",
    "  <!-- <h1>KTransformers</h1> -->\n",
    "  <p align=\"center\">\n",
    "\n",
    "<picture>\n",
    "    <img alt=\"KTransformers\" src=\"https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b\" width=50%>\n",
    "\n",
    "</picture>\n",
    "\n",
    "</p>\n",
    "\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5dcfddc6-d51b-4aa8-b887-f7c817492316",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "# **Introduction**\n",
    "[KTransformers](https://github.com/kvcache-ai/ktransformers), is designed to enhance the 🤗 Transformers experience through advanced kernel optimizations and placement/parallelism strategies. \n",
    "<br/> <br/>\n",
    "This tutorial serves as a guide for KTransformers-ft, aiming to to give resource-constrained researchers a **local path to explore fine-tuning ultra-large models (e.g., 671B/1000B)**, and also a fast way to customize smaller models (e.g., 14B/30B) for specific scenarios. We validate the setup using representative tasks such as stylized dialogue, Westernized translation tone, and medical Q&A, demonstrating that personalized adaptation can be achieved within hours.\n",
    "<br/> <br/>\n",
    "This tutorial takes DeepSeek-V2-Lite as a code example; for more details, refer to [KTransformers-Fine-Tuning_User-Guide](https://github.com/kvcache-ai/ktransformers/blob/main/doc/en/KTransformers-Fine-Tuning_User-Guide.md) and [KTransformers-Fine-Tuning_Developer-Technical-Notes](https://github.com/kvcache-ai/ktransformers/blob/main/doc/en/KTransformers-Fine-Tuning_Developer-Technical-Notes.md)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b4167684-81f4-4e2b-a486-c33ec3bc92f0",
   "metadata": {},
   "source": [
    "# **Installation**"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5548a7f8-20d6-4ae4-a575-a3ef7a0ea5f8",
   "metadata": {},
   "source": [
    "### **1. Install torch and clone the repo**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f39051d-eb14-44fa-af82-9ded23144985",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git\n",
    "!cd LLaMA-Factory"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e7dd351f-9102-4d7d-951c-4306df9f4cd7",
   "metadata": {},
   "source": [
    "**(Optional)** If you want to choose your version of torch and cuda, please install separately."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a5afa0c-1ed0-4190-ab50-967e553d6fd2",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "!pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu118"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "711dcc79-056f-4483-a2e1-7e780af1def1",
   "metadata": {},
   "source": [
    "### **2. Install LLaMA-Factory**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "42f09df9-7db8-46e3-b11d-2946a57d2933",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.chdir(\"LLaMA-Factory\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a6a5532-e5cc-463b-bdf8-030e547287fc",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "!pip install -e \".[torch,metrics]\" --no-build-isolation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "48c19762-70a7-402c-94f9-a71b277eb932",
   "metadata": {},
   "source": [
    "### **3. Install dependency libraries for GCC and CUDA**\n",
    "You need to install system-level dependency libraries. `libstdcxx-ng` and `gcc_impl_linux-64` ensure compilation compatibility, while cuda-runtime provides a GPU-accelerated runtime environment. **Please do NOT IGNORE this two commands! `nvidia/label/cuda-11.8.0 cuda-runtime` should be installed for every version of cuda for KT whl.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "202e672a-b30a-4bde-92d5-27500f435b30",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!conda install -y -c conda-forge libstdcxx-ng gcc_impl_linux-64\n",
    "!conda install -y -c nvidia/label/cuda-11.8.0 cuda-runtime"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94e6448f-1e27-4f16-885c-27738c2089dc",
   "metadata": {},
   "source": [
    "### **4. Install ktransformers and flash-attention**\n",
    "You need to download the corresponding version of python, cuda and torch from [downloading ktransformers whl](https://github.com/kvcache-ai/ktransformers/releases/tag/v0.4.1) and [downloading flash-attention whl](https://github.com/Dao-AILab/flash-attention/releases)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "7c4a5e82-ae9f-490f-9f90-441cdd98041e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "print(torch._C._GLIBCXX_USE_CXX11_ABI)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "837a2240-818d-499f-a1b5-641fa5c45339",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!pip install ../ktransformers-0.4.1+cu128torch27fancy-cp312-cp312-linux_x86_64.whl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3c78d9e-26e0-4f85-94ff-d6b028b194ac",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "!pip install ../flash_attn-2.8.3+cu12torch2.7cxx11abiTRUE-cp312-cp312-linux_x86_64.whl"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2593e2cb-5fbd-4d66-94fc-d2d74c4d8f65",
   "metadata": {},
   "source": [
    "# **How to Start**\n",
    "## Fine-tuning the Model with LoRA"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f7db3349-8cdb-48cd-8b63-0ea70fe4af6f",
   "metadata": {},
   "source": [
    "LoRA (Low-Rank Adaptation) fine-tuning only trains small \"adapter\" weights for large models. However, under traditional frameworks, it still needs more than 1400GB GPU VRAM, which hardly handles on the 4090s machine. **KTransformers**, as high-performance backend engine, provides a solution for GPU/CPU Hybrid devices to further cut GPU memory usage and speed up training. As shown below, we compare KTransformers(ours) with other common LoRA fine-tuning backends (HuggingFace and Unsloth). KTransformers is the **only workable 4090-class solution** for ultra-large MoE models (e.g., 671B) and also delivers higher fine-tuning throughput. <br/>\n",
    "<div style=\"text-align: center;\">\n",
    "<img src=\"https://typora-tuchuang-jimmy.oss-cn-beijing.aliyuncs.com/img/按照模型划分的对比图_02.png\" alt=\"kt_unsloth_huggingface_compare\" width=\"70%\" height=\"auto\">\n",
    "</div>\n",
    "\n",
    "To make KTransformers-ft more easy-to-use, we cooperator with [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory/), a easy and efficiency model fine-tuning framework. As shown below, LLaMA-Factory is the unified configuration layer for the whole fine-tuning workflow. **KTransformers** acts as a high-performance backend that takes over core operators like Attention/MoE under the same training configs, enabling efficient **GPU+CPU heterogeneous cooperation**. <br/>\n",
    "<div style=\"text-align: center;\">\n",
    "<img src=\"https://typora-tuchuang-jimmy.oss-cn-beijing.aliyuncs.com/img/image-20251011010558909.png\" alt=\"image-20251011010558909\" width=\"70%\" height=\"auto\">\n",
    "</div>\n",
    "\n",
    "This combination lets you fine-tune big models (like 671B/1000B) on consumer level GPUs (2-4 RTX 4090s) — no need for expensive hardware. Here’s the training command:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "baf5b8fc-e910-4531-9f00-a2076c698eff",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!USE_KT=1 llamafactory-cli train examples/train_lora/deepseek2_lora_sft_kt.yaml"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dc80b189-17ac-47a7-9889-b77e7a9d5304",
   "metadata": {},
   "source": [
    "Let’s break down the training command (`USE_KT=1 llamafactory-cli train examples/train_lora/deepseek2_lora_sft_kt.yaml`):\n",
    "- `USE_KT=1`: The \"switch\" to enable KTransformers optimization.  \n",
    "- `llamafactory-cli train`: The core command to start LLaMA-Factory’s fine-tuning tool.\n",
    "- `examples/train_lora/deepseek2_lora_sft_kt.yaml`: The configuration file that controls model, data, training rules and KTransformers settings — we’ll detail this next.\n",
    "\n",
    "**The LLaMA-Factory yaml (e.g. `deepseek2_lora_sft_kt.yaml`) is where you define how the fine-tuning works.** Below is a simplified version, you can use this directly for basic tasks like style transfer or domain Q&A. And We’ll explain each section’s purpose and why the values are set this way in the following part--Custom your KTransformers-FineTuning + LLaMA-Factory.\n",
    "```yaml\n",
    "### model\n",
    "model_name_or_path: deepseek-ai/DeepSeek-V2-Lite\n",
    "\n",
    "### method\n",
    "finetuning_type: lora\n",
    "lora_rank: 8\n",
    "lora_target: all\n",
    "\n",
    "### dataset\n",
    "dataset: identity\n",
    "template: deepseek\n",
    "cutoff_len: 2048\n",
    "max_samples: 100000\n",
    "\n",
    "### output\n",
    "output_dir: saves/Kllama_deepseekV2\n",
    "logging_steps: 10\n",
    "save_steps: 500\n",
    "\n",
    "### train\n",
    "per_device_train_batch_size: 1\n",
    "gradient_accumulation_steps: 8\n",
    "learning_rate: 1.0e-4\n",
    "num_train_epochs: 3.0\n",
    "\n",
    "### ktransformers\n",
    "use_kt: true # use KTransformers as LoRA sft backend\n",
    "kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml\n",
    "cpu_infer: 32\n",
    "chunk_size: 8192\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dac7722d-89dd-40b1-ac27-7ca64e80fe47",
   "metadata": {},
   "source": [
    "## Chat with the Fine-tuned Model: Test Your Customized AI"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9af428c6-4fce-4320-b3d3-af59726ab9ce",
   "metadata": {},
   "source": [
    "After finishing fine-tuning with KTransformers, **the next step is to chat with your model and verify the results!** This step loads the original base model plus the fine-tuned \"custom plugin\" (LoRA adapter) you saved earlier, letting you interact with the model in real time.  \n",
    "\n",
    "We’ll use LLaMA-Factory’s `chat` command to launch the interactive interface. The core is the LLaMA-Factory YAML configuration file — it tells the tool which model to load, how to optimize inference, and what style of dialogue to use. We take one of the example as follows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37191db1-a97c-407c-9626-af9fde6dd94f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!llamafactory-cli chat examples/inference/deepseek2_lora_sft_kt.yaml"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "06c18255-66d0-4189-a714-6050160a0637",
   "metadata": {},
   "source": [
    "To know exactly what you’re running, we break down the full command (`llamafactory-cli chat examples/inference/deepseek2_lora_sft_kt.yaml`):\n",
    "- `llamafactory-cli chat`: The core command to launch LLaMA-Factory’s interactive chat tool.\n",
    "- `examples/inference/deepseek2_lora_sft_kt.yaml`: The configuration file for inference (controls model loading, optimization, and dialogue settings).\n",
    "- No need for `USE_KT=1` here — we’ll enable KTransformers directly in the YAML (but it still needs to match the training settings!).\n",
    "\n",
    "**The LLaMA-Factory configuration file for inference (`examples/inference/deepseek2_lora_sft_kt.yaml`) controls the generate config for specific tasks.** Below is a simplified version, you can use this directly to chat with your fine-tuned model. Most setting is linked to your training config — we’ll still explain the details in next part.\n",
    "```yaml\n",
    "model_name_or_path: deepseek-ai/DeepSeek-V2-Lite\n",
    "adapter_name_or_path: saves/Kllama_deepseekV2\n",
    "template: deepseek\n",
    "infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]\n",
    "trust_remote_code: true\n",
    "\n",
    "use_kt: true # use KTransformers as LoRA sft backend to inference\n",
    "kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml\n",
    "cpu_infer: 32\n",
    "chunk_size: 8192\n",
    "```\n",
    "`kt_optimize_rule` needs as same as the kt_optimize_rule in LoRA Fine-tuning."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18814c5c-3b73-44cc-a608-505c1e870437",
   "metadata": {},
   "source": [
    "# **Custom your KTransformers-FineTuning + LLaMA-Factory**"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8072427f-46d4-41fb-8850-e33a2446e031",
   "metadata": {},
   "source": [
    "Once you’ve got the basic fine-tuning workflow down, you’ll likely want to **adapt the process to your specific needs**—whether that’s training on your own data, squeezing more performance out of limited GPU memory, or speeding up training for large datasets. Below’s a hands-on guide to customizing every part of the process, with clear explanations of why each setting matters and how to tweak it.\n",
    "\n",
    "## 1. Fine-tuning Customization: Tailor Training to Your Needs  \n",
    "To start customizing, you’ll still use the core training command: `USE_KT=1 llamafactory-cli train examples/train_lora/deepseek2_lora_sft_kt.yaml`. Notably, it performs even better than the default setup when adapted to your specific needs. <br/>\n",
    "### Full example **LLaMA-Factory YAML** for DeepSeek-V2-Lite\n",
    "```yaml\n",
    "### model\n",
    "model_name_or_path: deepseek-ai/DeepSeek-V2-Lite\n",
    "trust_remote_code: true\n",
    "\n",
    "### method\n",
    "stage: sft\n",
    "do_train: true\n",
    "finetuning_type: lora\n",
    "lora_rank: 8\n",
    "lora_target: all\n",
    "\n",
    "### dataset\n",
    "dataset: identity\n",
    "template: deepseek\n",
    "cutoff_len: 2048\n",
    "max_samples: 100000\n",
    "overwrite_cache: true\n",
    "preprocessing_num_workers: 16\n",
    "dataloader_num_workers: 4\n",
    "\n",
    "### output\n",
    "output_dir: saves/Kllama_deepseekV2Lite\n",
    "logging_steps: 10\n",
    "save_steps: 500\n",
    "plot_loss: true\n",
    "overwrite_output_dir: true\n",
    "save_only_model: false\n",
    "report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]\n",
    "\n",
    "### train\n",
    "per_device_train_batch_size: 1\n",
    "gradient_accumulation_steps: 8\n",
    "learning_rate: 1.0e-4\n",
    "num_train_epochs: 3.0\n",
    "lr_scheduler_type: cosine\n",
    "warmup_ratio: 0.1\n",
    "bf16: true\n",
    "ddp_timeout: 180000000\n",
    "resume_from_checkpoint: null\n",
    "\n",
    "### ktransformers\n",
    "use_kt: true # use KTransformers as LoRA sft backend\n",
    "kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml\n",
    "cpu_infer: 32\n",
    "chunk_size: 8192\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6abc1968-6208-4344-9c82-335d7fe1d27c",
   "metadata": {},
   "source": [
    "---\n",
    "### A. Pick & Prepare Your Model\n",
    "The first step in customization is choosing the right base model, and ensuring it works with KTransformers. The `model_name_or_path` setting (shown in LLaMA-Factory YAML before) controls this, and getting it right avoids common errors.\n",
    "- **Use a public model**: Directly set to Hugging Face Hub names (e.g., `deepseek-ai/DeepSeek-V2-Lite`, `Qwen/Qwen2-MoE-72B`).  \n",
    "- **Use a local model**: Replace with your local folder path (e.g., `/mnt/data/models/DeepSeek-V2-Lite`).\n",
    "\n",
    "**Critical Requirement**: The model must be in **BF16 format**.  \n",
    "  - FP8 models (like DeepSeek-V3’s default release) aren’t compatible with KTransformers’ optimization.  \n",
    "  - Fix: Convert FP8 to BF16 with **[this official script](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py)**.\n",
    "\n",
    "---\n",
    "\n",
    "### B. Tune LoRA: Balance Fitting Capability & Memory  \n",
    "LoRA trains tiny \"adapter\" weights instead of the entire model. Tweaking these two settings in LLaMA-Factory YAML (`lora_rank`, `lora_target`) lets you balance how well the model learns your data and how much GPU memory it uses:\n",
    "\n",
    "| Setting         | What it does                                                                 | Scenario & Recommendation                                                                 |\n",
    "|-----------------|-----------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|\n",
    "| `lora_rank`     | Controls the \"power\" of LoRA adapters (higher = more fitting, more memory). | - Small dataset (≤5k samples) or limited GPU: 4-8 (balances speed/memory).<br>- Large dataset (≥20k samples): 16-32 (better fits custom data). |\n",
    "| `lora_target`   | Which layers get LoRA (applies only to linear layers).                      | - Quick fine-tuning (e.g., style transfer): `q_proj,v_proj` (only attention layers—faster).<br>- Deep customization (e.g., medical Q&A): `all` (all linear layers—more accurate). |\n",
    "\n",
    "**Tip**: Pair `lora_rank=8` with `lora_alpha=32` (alpha = 4× rank) for stable training This ratio is tested to work well for most tasks, from chatbots to domain Q&A.  \n",
    "\n",
    "---\n",
    "\n",
    "### C. Use Your Own Dataset\n",
    "Fine-tuning’s value lies in training on your own data, such as company documents, customer support logs, or domain-specific Q&A. Below is how to replace the default (identity) dataset with yours:  \n",
    "\n",
    "1. **Add a custom dataset**:  \n",
    "   - Step 1: Organize your data into LLaMA-Factory’s format (e.g., JSON with `instruction`, `input`, `output` fields—see [dataset examples](https://github.com/hiyouga/LLaMA-Factory/tree/main/data)).  \n",
    "   - Step 2: Register your dataset in [LLaMA-Factory/data/dataset_info.json](https://github.com/hiyouga/LLaMA-Factory/blob/main/data/dataset_info.json) (copy the format of built-in datasets—just add your dataset name and file path).\n",
    "     For example,\n",
    "     ```json\n",
    "     \"niko\": {\n",
    "        \"file_name\": \"../niko_train.json\"\n",
    "      },\n",
    "      ```\n",
    "   - Step 3: You may replace `dataset: identity` in LLaMA-Factory YAML to your dataset name (e.g. `dataset: niko`).\n",
    "2. **Tweak dataset settings for better results**:  \n",
    "   - `cutoff_len`: Truncates long texts (e.g., set to 4096 for long documents, 2048 for short dialogues—never exceed `model_max_length`).  \n",
    "   - `max_samples`: Limit samples to avoid overfitting (use 100 for debugging, `None` for full training—great if your dataset is huge).  \n",
    "   - `template`: Must match your model (e.g., `deepseek` for DeepSeek, `llama3` for LLaMA3, more refer to [supported-models](https://github.com/hiyouga/LLaMA-Factory/tree/main?tab=readme-ov-file#supported-models))—mismatched templates break response formatting!  \n",
    "\n",
    "---\n",
    "\n",
    "### D. Save GPU Memory & Speed Up Training  \n",
    "If you’re hitting GPU memory limits or waiting too long for training, adjust these settings in LLaMA-Factory YAML:  \n",
    "\n",
    "| Challenge               | Setting to Tweak                          | How to Adjust                                                                 |\n",
    "|-------------------------|-------------------------------------------|--------------------------------------------------------------------------------|\n",
    "| GPU memory is tight     | `per_device_train_batch_size` + `gradient_accumulation_steps` | Set `per_device_train_batch_size=1` (smallest batch) + `gradient_accumulation_steps=16` (simulates a batch of 16—no memory penalty!). |\n",
    "| Model overfits (bad generalization) | `lora_dropout` + `num_train_epochs` | Add `lora_dropout: 0.1` (prevents overfitting) + reduce `num_train_epochs` to 2 (3 is default—overtraining hurts!). |\n",
    "\n",
    "**Key Train Configs Recap**:  \n",
    "- `learning_rate`: 1e-4~2e-4 for LoRA (stick to this range—too high = unstable, too low = slow learning).  \n",
    "- `save_steps`: Save checkpoints every 100-500 steps (frequent saves = safe, but don’t overdo it—each checkpoint takes storage!).  \n",
    "- `output_dir`: Customize the save path (e.g., `saves/medical_qa_deepseek` instead of the default—keeps your projects organized!).  \n",
    "\n",
    "---\n",
    "\n",
    "### E. KTransformers Optimization: Unlock Maximum Performance  \n",
    "KTransformers is what makes fine-tuning large models (like 671B-parameter DeepSeek-V3) possible on modest hardware. These settings control how it optimizes layer placement (GPU vs. CPU) and computation speed:\n",
    "\n",
    "| Setting               | What it does                                                                 | How to Customize                                                                 |\n",
    "|-----------------------|-----------------------------------------------------------------------------|----------------------------------------------------------------------------------|\n",
    "| `use_kt`              | Enables KTransformers backend (must be `true`—otherwise, no optimization!). | Leave as `true`—this is what makes 671B models trainable on 2×4090s!             |\n",
    "| `cpu_infer`           | Number of CPU threads for MoE/linear computations.                          | Set to half your CPU cores (e.g., 32 for a 64-core CPU—too many threads = bottlenecks!). |\n",
    "| `chunk_size`          | Block size for long text processing (affects memory and speed).             | Default 8192 works for most tasks; increase to 16384 for extra-long texts (e.g., book summaries). |\n",
    "| `kt_optimize_rule`    | Defines where layers run (GPU/CPU) and which kernels to use (core of KT!).  | - Use the pre-built rule for your model (e.g., `DeepSeek-V2-Lite-Chat-sft-amx.yaml`).<br>- For faster speed: Use `AMXInt8`/`AMXBF16` as backend (if your CPU supports AMX—check with `lscpu | grep amx`).<br>- For compatibility: Fall back to `llamafile` if AMX isn’t supported. |\n",
    "\n",
    "#### Example Custom `kt_optimize_rule` (shown in the table above)  \n",
    "This rule tells KTransformers to offload heavy MoE layers to the CPU (saving GPU memory) and use AMX for fast CPU computation. Use it as a template for your own model: (Details tutorial could be seen in **[here](https://github.com/kvcache-ai/ktransformers/blob/main/doc/en/injection_tutorial.md)**)\n",
    "```yaml\n",
    "- match:\n",
    "    name: \"^model\\\\.layers\\\\..*\\\\.mlp\\\\.experts$\"  # Target all MoE expert layers\n",
    "  replace:\n",
    "    class: ktransformers.operators.experts.KTransformersExperts  # KT's optimized MoE kernel\n",
    "    kwargs:\n",
    "      prefill_device: \"cuda\"  # Fast pre-processing on GPU\n",
    "      prefill_op: \"KExpertsTorch\"\n",
    "      generate_device: \"cpu\"  # Heavy MoE compute on CPU (saves GPU memory)\n",
    "      generate_op: \"KSFTExpertsCPU\"  # KT's SFT-optimized MoE operator\n",
    "      out_device: \"cuda\"  # Send results back to GPU for next steps\n",
    "      backend: \"AMXInt8\"  # Options: AMXInt8 (fastest) > AMXBF16 > llamafile (default)\n",
    "```\n",
    "**Alert:** Never mix KLinearMarlin with LoRA fine-tuning—replace it with KLinearTorch (as in the example) to avoid compatibility issues!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93840117-084b-44fa-8b2e-6389e4a52bf0",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!USE_KT=1 llamafactory-cli train examples/train_lora/deepseek2_lora_sft_kt.yaml"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c6d0b4db-65f7-4683-88d0-3269c962224c",
   "metadata": {},
   "source": [
    "## 2. Chat with the Fine-tuned Model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fdbc5e95-9567-4b8a-94d7-eec410d94a6b",
   "metadata": {},
   "source": [
    "After completing fine-tuning, the next critical step is to test your customized model through real-time interaction. Running `llamafactory-cli chat examples/inference/deepseek2_lora_sft_kt.yaml` loads the base model and your fine-tuned LoRA adapter. Below’s a detailed guide to customizing the chat process, with clear explanations of each setting’s role and how to fit it to your specific tasks.\n",
    "\n",
    "### Full example LLaMA-Factory YAML for inference\n",
    "```yaml\n",
    "model_name_or_path: deepseek-ai/DeepSeek-V2-Lite\n",
    "adapter_name_or_path: saves/Kllama_deepseekV2Lite\n",
    "template: deepseek\n",
    "infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]\n",
    "trust_remote_code: true\n",
    "\n",
    "use_kt: true # use KTransformers as LoRA sft backend to inference\n",
    "kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml\n",
    "cpu_infer: 32\n",
    "chunk_size: 8192\n",
    "```\n",
    "\n",
    "---\n",
    "\n",
    "### A. Load Your Fine-Tuned Adapter (Two Supported Formats)  \n",
    "The `adapter_name_or_path` setting in LLaMA-Factory YAML points to your trained LoRA weights. Two formats are supported:  \n",
    "- **Folder Format (Default)**: If training saved a folder (e.g., `saves/Kllama_deepseekV2`) with `.safetensors` files, set it directly (e.g., `adapter_name_or_path: saves/Kllama_deepseekV2`).  \n",
    "- **GGUF Format (Single File)**: If you exported the adapter to a `.gguf` file (for portability), set the full path (e.g., `adapter_name_or_path: saves/my_adapter.gguf`).  \n",
    "\n",
    "---\n",
    "\n",
    "### B. Tweak Response Quality (Generation Configs)  \n",
    "Optional generation parameters let you adjust the model’s responses to fit specific use cases, whether you need factual accuracy, creative expression, or concise answers. Add these to your YAML and modify based on your needs:\n",
    "```yaml\n",
    "# Optional generation configs (add to your inference YAML)\n",
    "max_new_tokens: 1024  # Max length of responses (512 = short, 2048 = long)\n",
    "temperature: 0.7      # Randomness (0.1 = factual/consistent, 1.0 = creative/diverse)\n",
    "top_p: 0.9            # Focus (0.8-0.95 = avoids irrelevant content)\n",
    "repetition_penalty: 1.1  # Reduces repetition (1.0 = no penalty, 1.2 = strict)\n",
    "```\n",
    "\n",
    "---\n",
    "\n",
    "### C. KTransformers Inference Backend  \n",
    "The KTransformers-related settings directly impact inference performance—they must align with your training configuration to maintain optimization effects (e.g., low memory usage, fast speed):\n",
    "- `infer_backend` determines how the model generates responses—pick based on your needs. You need to choose `ktransformers`, if you LoRA fine-tuning it with ktransformers.\n",
    "- `use_kt: true`: Must match training—disables KT optimization if set to `false` (slower inference!).  \n",
    "- `kt_optimize_rule`: Use the **exact same file** as training (e.g., `DeepSeek-V2-Lite-Chat-sft-amx.yaml`)—ensures layers map correctly.  \n",
    "\n",
    "---\n",
    "\n",
    "### How to Verify Inference Works\n",
    "After launching the chat command, check the logs for these key messages to confirm the model is running correctly:\n",
    "1. `Loaded adapter weight: XXX -> XXX`: LoRA adapter is loaded correctly.  \n",
    "2. `KTransformers inference enabled`: KT optimization is active.  \n",
    "3. `Backend: AMXInt8`: AMX acceleration is working (if supported).  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c08b31f7-32a4-4d51-b6c0-d063d7785371",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!llamafactory-cli chat examples/inference/deepseek2_lora_sft_kt.yaml"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "KNllama",
   "language": "python",
   "name": "knllama"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: doc/en/MiniMax-M2.5.md
================================================
# Running MiniMax-M2.5 with SGLang and KT-Kernel

This tutorial demonstrates how to run MiniMax-M2.5 model inference using SGLang integrated with KT-Kernel for CPU-GPU heterogeneous inference. This setup enables efficient deployment of large MoE models by offloading experts to CPU.

## Table of Contents

- [Hardware Requirements](#hardware-requirements)
- [Prerequisites](#prerequisites)
- [Step 1: Download Model Weights](#step-1-download-model-weights)
- [Step 2: Launch SGLang Server](#step-2-launch-sglang-server)
- [Step 3: Send Inference Requests](#step-3-send-inference-requests)

## Hardware Requirements

**Minimum Configuration:**
- **GPU**: NVIDIA RTX 2x4090 48GB (or equivalent with at least total 48GB VRAM available)
- **CPU**: x86 CPU with AVX512BF16 support (e.g., Intel Sapphire Rapids)
- **RAM**: At least 200GB system memory
- **Storage**: ~200GB for model weights (FP8 weight, same weight folder for CPU and GPU)

## Prerequisites

Before starting, ensure you have:

1. **KT-Kernel installed**:

```
git clone https://github.com/kvcache-ai/ktransformers.git
git submodule update --init --recursive
cd kt-kernel && ./install.sh
```

2. **SGLang installed** - Install the kvcache-ai fork of SGLang (one of):

```bash
# Option A: One-click install (from ktransformers root)
./install.sh

# Option B: pip install
pip install sglang-kt
```

> Note: You may need to reinstall cudnn: `pip install nvidia-cudnn-cu12==9.16.0.29`

3. **CUDA toolkit** - Compatible with your GPU (CUDA 12.8+ recommended)
4. **Hugging Face CLI** - For downloading models:

   ```bash
   pip install huggingface-hub
   ```

## Step 1: Download Model Weights

```bash
# Create a directory for models
mkdir -p /path/to/models
cd /path/to/models

# Download MiniMax-M2.5 (FP8 for both CPU and GPU)
huggingface-cli download MiniMaxAI/MiniMax-M2.5 \
  --local-dir /path/to/minimax-m2.5
```

**Note:** Replace `/path/to/models` with your actual storage path throughout this tutorial.

## Step 2: Launch SGLang Server

Start the SGLang server with KT-Kernel integration for CPU-GPU heterogeneous inference.


### Launch Command (4x RTX 4090 Example)

```bash
python -m sglang.launch_server \
  --host 0.0.0.0 \
  --port 30005 \
  --model /path/to/minimax-m2.5 \
  --kt-weight-path /path/to/minimax-m2.5 \
  --kt-cpuinfer 96 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 30 \
  --kt-method FP8 \
  --kt-gpu-prefill-token-threshold 400 \
  --trust-remote-code \
  --mem-fraction-static 0.94 \
  --served-model-name MiniMax-M2.5 \
  --enable-mixed-chunk \
  --tensor-parallel-size 4 \
  --enable-p2p-check \
  --disable-shared-experts-fusion \
  --chunked-prefill-size 32658 \
  --max-total-tokens 50000 \
  --attention-backend flashinfer
```

It takes about 2~3 minutes to start the server.

See [KT-Kernel Parameters](https://github.com/kvcache-ai/ktransformers/tree/main/kt-kernel#kt-kernel-parameters) for detailed parameter tuning guidelines.

## Step 3: Send Inference Requests

Once the server is running, you can send inference requests using the OpenAI-compatible API.

### Basic Chat Completion Request

```bash
curl -s http://localhost:30005/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "MiniMax-M2.5",
    "stream": false,
    "messages": [
      {"role": "user", "content": "hi, who are you?"}
    ]
  }'
```

### Example Response

```json
{
    "id": "e82360a51dd4465281a2b954d5237a06",
    "object": "chat.completion",
    "created": 1770980318,
    "model": "MiniMax-M2.5",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "The user is asking who I am. I should give a brief, friendly introduction about myself.\n</think>\n\nHi there! I'm MiniMax-M2.5, an AI assistant created by MiniMax. I'm here to help you with a wide range of tasks, including:\n\n- Answering questions\n- Writing and editing code\n- Explaining concepts\n- Brainstorming ideas\n- And much more!\n\nHow can I help you today?",
                "reasoning_content": null,
                "tool_calls": null
            },
            "logprobs": null,
            "finish_reason": "stop",
            "matched_stop": 200020
        }
    ],
    "usage": {
        "prompt_tokens": 44,
        "total_tokens": 138,
        "completion_tokens": 94,
        "prompt_tokens_details": null,
        "reasoning_tokens": 0
    },
    "metadata": {
        "weight_version": "default"
    }
}
```


================================================
FILE: doc/en/Qwen3-Next.md
================================================
# Qwen3-Next Support for KTransformers

## Introduction

### Overview
We are very pleased to announce that Ktransformers now supports Qwen3-Next-80B-A3B-Thinking and Qwen3-Next-80B-A3B-Instruct.

### Model & Resource Links

- Official Qwen3-Next-80B-A3B-Thinking Release: 
  - https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking

- Official Qwen3-Next-80B-A3B-Instruct Release
  - https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct


## Installation Guide

### 1. Resource Requirements

The model running with 512 Experts requires approximately 320 GB of memory and 6 GB of GPU memory.

### 2. Prepare Models

```bash
# download gguf
huggingface-cli download --resume-download Qwen/Qwen3-Next-80B-A3B-Instruct

```

### 3. Install ktransformers

To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).

### 4. Run Qwen3-Next Inference Server

```bash
python ktransformers/server/main.py \
  --port 10021 \
  --model_path path-to-Qwen3-Next-80B-A3B-Thinking \
  --gguf_path path-to-Qwen3-Next-80B-A3B-Thinking \
  --model_name Qwen3NextForCausalLM \
  --optimize_config_path <local_path>/ktransformers/optimize/optimize_rules/Qwen3Next-serve.yaml \
  --max_new_tokens 1024 \
  --cache_lens 32768 \
  --chunk_size 256 \
  --max_batch_size 4 \
  --no-use_cuda_graph \
  --backend_type balance_serve
```

### 5. Access server

```
curl -X POST http://localhost:10021/v1/chat/completions \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "hello"}
    ],
    "model": "Qwen3-Next-80B-A3B-Instruct",
    "temperature": 0.3,
    "top_p": 1.0,
    "stream": true
  }'
```

### 6. Notes

Due to Qwen3-Next’s use of linear attention, CUDA Graph optimization is not yet support — but it’s coming soon! 🚀

================================================
FILE: doc/en/Qwen3.5.md
================================================
# Running Qwen3.5 with SGLang and KT-Kernel

This tutorial demonstrates how to run Qwen3.5 (MoE-400B) model inference using SGLang integrated with KT-Kernel for CPU-GPU heterogeneous inference. This setup enables efficient deployment of large MoE models by offloading experts to CPU.

## Table of Contents

- [Running Qwen3.5 with SGLang and KT-Kernel](#running-qwen35-with-sglang-and-kt-kernel)
  - [Table of Contents](#table-of-contents)
  - [Hardware Requirements](#hardware-requirements)
  - [Prerequisites](#prerequisites)
  - [Step 1: Download Model Weights](#step-1-download-model-weights)
  - [Step 2: Launch SGLang Server](#step-2-launch-sglang-server)
    - [Launch Command (4x RTX 4090 Example)](#launch-command-4x-rtx-4090-example)
  - [Step 3: Send Inference Requests](#step-3-send-inference-requests)
    - [Basic Chat Completion Request](#basic-chat-completion-request)
    - [Example Response](#example-response)

## Hardware Requirements

**Minimum Configuration:**
- **GPU**: NVIDIA 4x RTX 4090 (or equivalent with at least 96GB total VRAM available)
- **CPU**: x86 CPU with AVX512F support (e.g., Intel Sapphire Rapids)
- **RAM**: At least 800GB system memory
- **Storage**: ~800GB for model weights (BF16)

## Prerequisites

Before starting, ensure you have:

1. **KT-Kernel installed**:

```bash
git clone https://github.com/kvcache-ai/ktransformers.git
git checkout qwen3.5
git submodule update --init --recursive
cd kt-kernel && ./install.sh
```

2. **SGLang installed** - Install the kvcache-ai fork of SGLang (one of):

```bash
# Option A: One-click install (from ktransformers root)
./install.sh

# Option B: pip install
pip install sglang-kt
```

> Note: You may need to reinstall cudnn: `pip install nvidia-cudnn-cu12==9.16.0.29`

3. **CUDA toolkit** - Compatible with your GPU (CUDA 12.8+ recommended)
4. **Hugging Face CLI** - For downloading models:

   ```bash
   pip install huggingface-hub
   ```

## Step 1: Download Model Weights

```bash
# Create a directory for models
mkdir -p /path/to/models
cd /path/to/models

# Download Qwen3.5 (BF16)
huggingface-cli download Qwen/Qwen3.5 \
  --local-dir /path/to/qwen3.5
```

**Note:** Replace `/path/to/models` with your actual storage path throughout this tutorial.

## Step 2: Launch SGLang Server

Start the SGLang server with KT-Kernel integration for CPU-GPU heterogeneous inference.

### Launch Command (4x RTX 4090 Example)

```bash
python -m sglang.launch_server \
  --host 0.0.0.0 \
  --port 30005 \
  --model /path/to/qwen3.5 \
  --kt-weight-path /path/to/qwen3.5 \
  --kt-cpuinfer 60 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 1 \
  --kt-method BF16 \
  --attention-backend triton \
  --trust-remote-code \
  --mem-fraction-static 0.98 \
  --chunked-prefill-size 4096 \
  --max-running-requests 32 \
  --max-total-tokens 32000 \
  --served-model-name qwen3.5 \
  --enable-mixed-chunk \
  --tensor-parallel-size 4 \
  --enable-p2p-check \
  --disable-shared-experts-fusion \
  --disable-custom-all-reduce
```

See [KT-Kernel Parameters](https://github.com/kvcache-ai/ktransformers/tree/main/kt-kernel#kt-kernel-parameters) for detailed parameter tuning guidelines.

## Step 3: Send Inference Requests

Once the server is running, you can send inference requests using the OpenAI-compatible API.

### Basic Chat Completion Request

```bash
curl -s http://localhost:30005/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen3.5",
    "stream": false,
    "messages": [
      {"role": "user", "content": "hi, who are you?"}
    ]
  }'
```

### Example Response

```json
{
    "id": "c79f6d63e04f4874acb8853d218e1bf1",
    "object": "chat.completion",
    "created": 1770880035,
    "model": "qwen3.5",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "Hello! I'm **Qwen**, a large language model developed by **Alibaba Cloud**. I'm designed to provide helpful, accurate, and safe information across a wide range of topics—whether you have questions, need help with writing, coding, analysis, or just want to explore ideas together.\n\nHow can I assist *you* today?",
                "reasoning_content": null,
                "tool_calls": null
            },
            "logprobs": null,
            "finish_reason": "stop",
            "matched_stop": 248046
        }
    ],
    "usage": {
        "prompt_tokens": 16,
        "total_tokens": 527,
        "completion_tokens": 511,
        "prompt_tokens_details": null,
        "reasoning_tokens": 0
    },
    "metadata": {
        "weight_version": "default"
    }
}
```


================================================
FILE: doc/en/ROCm.md
================================================
# ROCm Support for ktransformers (Beta)

## Introduction

### Overview
In our effort to expand GPU architecture support beyond NVIDIA, we are excited to introduce **AMD GPU support through ROCm** in ktransformers (Beta release). This implementation has been tested and developed using EPYC 9274F processors and AMD Radeon 7900xtx GPUs.

## Installation Guide

### 1. Install ROCm Driver
Begin by installing the ROCm drivers for your AMD GPU:
- [Official ROCm Installation Guide for Radeon GPUs](https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/install-radeon.html)

### 2. Set Up Conda Environment
We recommend using Miniconda3/Anaconda3 for environment management:

```bash
# Download Miniconda
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh

# Create environment
conda create --name ktransformers python=3.11
conda activate ktransformers

# Install required libraries
conda install -c conda-forge libstdcxx-ng

# Verify GLIBCXX version (should include 3.4.32)
strings ~/anaconda3/envs/ktransformers/lib/libstdc++.so.6 | grep GLIBCXX
```

> **Note:** Adjust the Anaconda path if your installation directory differs from `~/anaconda3`

### 3. Install PyTorch for ROCm
Install PyTorch with ROCm 6.2.4 support:

```bash
pip3 install torch torchvision torchaudio \
  --index-url https://download.pytorch.org/whl/rocm6.2.4
pip3 install packaging ninja cpufeature numpy
```

> **Tip:** For other ROCm versions, visit [PyTorch Previous Versions](https://pytorch.org/get-started/previous-versions/)

### 4. Build ktransformers

```bash
# Clone repository
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule update --init

# Optional: Compile web interface
# See: api/server/website.md

# Install dependencies
bash install.sh
```

## Running DeepSeek-R1 Models

### Configuration for 24GB VRAM GPUs
Use our optimized configuration for constrained VRAM:

```bash
python ktransformers/local_chat.py \
  --model_path deepseek-ai/DeepSeek-R1 \
  --gguf_path <path_to_gguf_files> \
  --optimize_config_path ktransformers/optimize/optimize_rules/rocm/DeepSeek-V3-Chat.yaml \
  --cpu_infer <cpu_cores + 1>
```

> **Beta Note:** Current Q8 linear implementation (Marlin alternative) shows suboptimal performance. Expect optimizations in future releases.

### Configuration for 40GB+ VRAM GPUs
For better performance on high-VRAM GPUs:

1. Modify `DeepSeek-V3-Chat.yaml`:
   ```yaml
   # Replace all instances of:
   KLinearMarlin → KLinearTorch
   ```

2. Execute with:
   ```bash
   python ktransformers/local_chat.py \
     --model_path deepseek-ai/DeepSeek-R1 \
     --gguf_path <path_to_gguf_files> \
     --optimize_config_path <modified_yaml_path> \
     --cpu_infer <cpu_cores + 1>
   ```
> **Tip:** If you got 2 * 24GB AMD GPUS, you may also do the same modify and run `ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml` instead.

## Known Limitations
- Marlin operations not supported on ROCm platform
- Current Q8 linear implementation shows reduced performance (Beta limitation)


================================================
FILE: doc/en/SFT/DPO_tutorial.md
================================================
# DPO Training with LLaMA-Factory

This tutorial demonstrates how to use Direct Preference Optimization (DPO) to fine-tune a language model using the LLaMA-Factory framework. DPO is a method for training models based on human preferences, allowing for more aligned and user-centric outputs.

## Installation

### Step 1: Create a conda environment and suit it for KTransformers

```Bash
conda create -n Kllama python=3.12 # choose from : [3.11, 3.12, 3.13]
conda install -y -c conda-forge libstdcxx-ng gcc_impl_linux-64
conda install -y -c nvidia/label/cuda-12.8.0 cuda-runtime
```

### Step 2: Install the LLaMA-Factory environment

```Bash
git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
cd LLaMA-Factory
pip install -e ".[torch,metrics]" --no-build-isolation
```


### Step 3: Install KTransformers
#### Option 1: Install the KTransformers wheel that matches your Torch and Python versions, from https://github.com/kvcache-ai/ktransformers/releases/tag/v0.4.4

(Note: The CUDA version can differ from that in the wheel filename.)

```Bash
pip install ktransformers-0.4.4+cu128torch28fancy-cp312-cp312-linux_x86_64.whl
```

#### Option 2: Install KTransformers from source

```Bash
git clone --depth 1 https://github.com/kvcache-ai/ktransformers.git
cd ktransformers/kt-sft
export TORCH_CUDA_ARCH_LIST="8.0;8.9;9.0" # set according to your GPU

pip install -r "requirements-sft.txt"
KTRANSFORMERS_FORCE_BUILD=TRUE pip install -v . --no-build-isolation

```

### Step 4: Install the Flash-attention wheel that matches your Torch and Python versions, from: https://github.com/Dao-AILab/flash-attention/releases

```Bash
# abi=True/False can find from below
# import torch
# print(torch._C._GLIBCXX_USE_CXX11_ABI)

pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
```

### Step 5: (Optional) If you want to use flash_infer (otherwise it defaults to triton)

```Bash
git clone https://github.com/kvcache-ai/custom_flashinfer.git
pip install custom_flashinfer/
```

## Prepare Models

We use `deepseek-ai/DeepSeek-V2-Lite` as an example here. You can replace it with other models such as Kimi K2.

## How to start

```Python
# For LoRA SFT
USE_KT=1 llamafactory-cli train examples/train_lora/deepseek2_lora_dpo_kt.yaml
# For Chat with model after LoRA SFT
llamafactory-cli chat examples/inference/deepseek2_lora_dpo_kt.yaml
# For API with model after LoRA SFT
llamafactory-cli api examples/inference/deepseek2_lora_dpo_kt.yaml
```

For example, we provide the YAML file as follows: 

（1）examples/train_lora/deepseek2_lora_dpo_kt.yaml

```YAML
### model
model_name_or_path: deepseek-ai/DeepSeek-V2-Lite
trust_remote_code: true

### method
stage: dpo
do_train: true
finetuning_type: lora
lora_rank: 8
lora_target: all
pref_beta: 0.1
pref_loss: sigmoid  # choices: [sigmoid (dpo), orpo, simpo]

### dataset
dataset: dpo_en_demo
template: llama3
cutoff_len: 2048
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: saves/Kllama_deepseekV2_DPO
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 5.0e-6
num_train_epochs: 3
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null

### ktransformers
use_kt: true # use KTransformers as LoRA sft backend
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
cpu_infer: 64
chunk_size: 8192
```

For more details about --kt_optimize_rule, please refer to https://github.com/kvcache-ai/ktransformers/blob/main/doc/en/KTransformers-Fine-Tuning_User-Guide.md 

Then, you can use the lora adapter saved in `saves/Kllama_deepseekV2_DPO` for inference the same as the sft training. For example,

```YAML
model_name_or_path: DeepSeek-V2-Lite-Chat 
adapter_name_or_path: saves/Kllama_deepseekV2_DPO
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true

use_kt: true # use KTransformers as LoRA sft backend to inference
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
cpu_infer: 32
chunk_size: 8192

```


================================================
FILE: doc/en/SFT/KTransformers-Fine-Tuning_Developer-Technical-Notes.md
================================================
- [Introduction](#introduction)
- [Overall View of the KT Fine-Tuning Framework](#overall-view-of-the-kt-fine-tuning-framework)
  - [Attention (LoRA + KT coexist)](#attention-lora--kt-coexist)
  - [MoE (operator encapsulation + backward)](#moe-operator-encapsulation--backward)
  - [Multi-GPU Loading/Training: Placement strategy instead of DataParallel](#multi-gpu-loadingtraining-placement-strategy-instead-of-dataparallel)
- [KT-LoRA Fine-Tuning Evaluation](#kt-lora-fine-tuning-evaluation)
  - [Setup](#setup)
  - [Results](#results)
  - [Speed Tests](#speed-tests)
  - [Memory Footprint](#memory-footprint)
- [Conclusion](#conclusion)


# KTransformers Fine-Tuning × LLaMA-Factory Integration – Developer Technical Notes

**MadSys Lab, KVCache-AI Team, Approaching AI, LLaMA-Factory Team**

## Introduction

Recent open-source LLMs—from DeepSeek-V3/R1 to Qwen-MoE and Kimi-K2—have surged in performance and scale. Yet due to **compute and memory constraints**, it is difficult for typical researchers to fine-tune trillion-parameter-class models. We therefore integrate **KTransformers** with **LLaMA-Factory** so that, with **2–4 RTX 4090 GPUs** and sufficient CPU memory, one can fine-tune ultra-large Mixture-of-Experts (MoE) models such as DeepSeek-671B.

This architecture bridges resource gaps, enabling **local fine-tuning of ultra-large models**, while also supporting **efficient scenario customization** at 14B/30B scales. We validate on stylized dialogue, Westernized translation tone, and medical Q&A, achieving rapid adaptation within hours.

Architecturally, LLaMA-Factory orchestrates data/config/training, LoRA insertion, and inference; KTransformers is a pluggable, high-performance operator backend that takes over Attention and MoE under the same training code, enabling **GPU+CPU heterogeneity** to accelerate training and reduce GPU memory.

![image-20251011010558909](../../assets/image-20251011010558909.png)

We evaluated LoRA fine-tuning with HuggingFace default, Unsloth, and KTransformers backends (same settings and data). **KTransformers** is currently the only solution feasible on **2–4×24GB 4090s** for **671B-scale MoE**, and also shows higher throughput and lower GPU memory for 14B MoEs.

| Under LoRA (BF16) + [NekoQA-10K stylized dialogue](https://github.com/mindsRiverPonder/LLM-practice) | HuggingFace Backend                      | Unsloth Backend                      | KTransformers Backend |
| ------------------------------------------------------------ | ---------------------------------------- | ------------------------------------ | --------------------- |
| [14B-DeepSeekV2-Lite] LoRA fine-tuning throughput            | 303.58 token/s                           | 455.37 token/s                       | 530.38 token/s        |
| [14B-DeepSeekV2-Lite] GPU memory                             | 32.12 GB                                 | 9.64 GB                              | 6.08 GB               |
| [671B-DeepSeekV3] LoRA fine-tuning throughput                | <font color='red'>Too Huge to run</font> | <font color='red'>NOT SUPPORT</font> | 40.35 token/s         |
| [671B-DeepSeekV3] GPU memory (sum across GPUs)               | theoretical 1400 GB †                    | <font color='red'>NOT SUPPORT</font> | 70 GB †               |

† The **1400 GB** is the **theoretical** FP16 full-resident footprint (not runnable). **70 GB** is the **measured peak** with KT (Attention on GPU + layered MoE offload).

From the table above, it can be seen that for the 14B model, the KTransformers backend achieves approximately 75% higher throughput than the default HuggingFace solution, while using only about one-fifth of the GPU memory. For the 671B model, both HuggingFace and Unsloth fail to run on a single 4090 GPU, whereas KTransformers is able to perform LoRA fine-tuning at 40 tokens/s, keeping the GPU memory usage within 70 GB.

![按照模型划分的对比图_02](../../assets/image-compare_model.png)


## Overall View of the KT Fine-Tuning Framework

We detail how KTransformers takes over core operators in LLaMA-Factory’s fine-tuning framework to optimize Attention and MoE.

DeepSeek-V3/V2 MoE models comprise a small-parameter dense Attention part and a large-parameter sparse MoE part. For illustration, consider layer 2 of DeepSeek-V2-Lite-Chat (from which each layer includes both Attention and MoE). Attention compute and KV cache mainly reside on the GPU; the heavyweight MoE part is primarily executed on the CPU. We first cover **Attention replacement and inheritance**, then **MoE encapsulation and backend interfacing**, and finally **multi-GPU placement**.

### Attention (LoRA + KT coexist)

KTransformers provides operator injection (`BaseInjectedModule`), and PEFT provides LoRA layer insertion. For fine-tuning, we design `KTransformersLinearLora`, inheriting from both `KTransformersLinear` and `LoraLayer`:

- **Inheritance:** `KTransformersLinearLora` retains KT’s high-performance paths (`prefill_linear`/`generate_linear`) while accepting LoRA parameters (`lora_A/lora_B`).
- **Replacement:** During preparation, we replace original `KTransformersLinear` layers (Q/K/V/O) with `KTransformersLinearLora`, preserving KT optimizations while enabling LoRA trainability.

![image-20251016182810716](../../assets/image-20251016182810716.png)

After replacement, LoRA is inserted at Q/K/V/O linear transforms (left), and `KTransformersLinearLora` contains both KT fast paths and LoRA matrices (right).

![image-20251016182920722](../../assets/image-20251016182920722.png)

### MoE (operator encapsulation + backward)

#### Encapsulation

Given large parameters and sparse compute, we encapsulate the expert computation as a **differentiable black-box operator**—transparent upstream, replaceable downstream.

- **Upstream (PyTorch graph):** we register a custom Autograd Function so the MoE layer appears as **a single node**. In the left figure (red box), only `KSFTExpertsCPU` is visible; on the right, the unencapsulated graph expands routing, dispatch, and FFN experts. Encapsulation makes the MoE layer behave like a standard `nn.Module` with gradients.
- **Downstream (backend):** inside the Autograd Function, pybind11 calls C++ extensions for forward/backward. Multiple **pluggable backends** exist (AMX BF16/INT8; **llamafile**). The backend can be switched via YAML (e.g., `"backend": "AMXBF16"` vs. `"llamafile"`).

![image-20250801174623919](../../assets/image-20250801174623919.png)

#### Backward (CPU)

MoE backward frequently needs the transposed weights $W^\top$. To avoid repeated runtime transposes, we **precompute/cache** $W^\top$ at load time (blue box). We also **cache necessary intermediate activations** (e.g., expert projections, red box) to reuse in backward and reduce recomputation. We provide backward implementations for **llamafile** and **AMX (INT8/BF16)**, with NUMA-aware optimizations.

<img src="../../assets/image-20251016182942726.png" alt="image-20251016182942726" style="zoom:33%;" />

### Multi-GPU Loading/Training: Placement strategy instead of DataParallel

To lower **per-GPU memory peaks** on 2–4 GPUs, we use **model parallelism + explicit placement**, not DataParallel (which duplicates the whole model on each GPU).

Key changes:

1. **KTrainer:** takes over `.to(device)` to prevent “move whole model to a single GPU”. Using KT’s optimize-rule YAML, each layer declares `device: cuda:0/cuda:1/...` and is **constructed directly on the target GPU** (no extra copies).
2. **Disable automatic DataParallel:** when `USE_KT=1`, we disable automatic DP wrappers from LLaMA-Factory/HF Trainer to avoid duplication and keep full control over sharding.
3. **Gradient aggregation:** gradients are reduced to `cuda:0`. Intermediate activations stay local; only necessary tensors are transferred, cutting communication/activation overhead.

Thus, we keep KT placement strategies under multi-GPU fine-tuning. Users choose a `kt_optimize_rule` with `multi-gpu`. For DeepSeek-671B, `DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml` is a typical 2-GPU plan: KV/attention parts on each GPU; MoE experts sharded on CPU; both GPUs share the workload.


## KT-LoRA Fine-Tuning Evaluation

### Setup

LLaMA-Factory orchestration, KTransformers backend, LoRA (rank=8, α=32, dropout=0.1, BF16), `GAS=16`, `qlen=512`, with the same KT optimize rule as training. We evaluate (a) stylized dialogue transfer and (b) two **small-scale representative** benchmarks: Translational-Style (generative) and AfriMed-QA (medical vertical; **SAQ** and **MCQ**). AMX is enabled; GPUs: 2×48GB RTX 4090; CPU: Intel Xeon Platinum 8488C.

### Results

#### Stylized Dialogue (CatGirl tone)

Dataset: [NekoQA-10K](https://zhuanlan.zhihu.com/p/1934983798233231689). The fine-tuned model consistently exhibits the target style (red boxes) versus neutral/rational base (blue). This shows **KT-LoRA injects style features** into the generation distribution with low GPU cost.

![image-20251016175848143](../../assets/image-20251016175848143.png)

#### Translational-Style benchmark (generative)

Dataset: [Translational-Style-ChatLLM](https://github.com/Benson114/Translational-Style-ChatLLM). Metrics: BLEU-1/2/3/4, ROUGE-1/2/L.

| Translational-Style dataset    | BLEU-1    | BLEU-2    | BLEU-3    | BLEU-4    | ROUGE-1   | ROUGE-2   | ROUGE-L   |
| ------------------------------ | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| V2-Lite (no LoRA)              | 20.66     | 8.33      | 4.54      | 2.89      | 22.71     | 4.52      | 19.19     |
| **KT-LoRA fine-tuned V2-Lite** | **35.41** | **22.44** | **15.42** | **11.18** | **42.03** | **18.38** | **33.10** |
| V3 base (no LoRA)              | 8.49      | 3.34      | 1.62      | 0.96      | 15.91     | 2.55      | 10.07     |
| **KT-LoRA fine-tuned V3**      | **37.02** | **23.70** | **16.21** | **11.49** | **43.43** | **18.96** | **34.54** |

As shown by the test results in the tables above, under a unified workflow and placement strategy, **both model scales exhibit consistent gains after fine-tuning**, supporting the usability and effectiveness of the “KT backend + LoRA fine-tuning” combination for generative style control. At the same time, this indicates that KT’s heterogeneous placement and operator optimizations can stably support small-sample adaptation in the style domain.

#### Medical Vertical Benchmark (AfriMed-SAQ/MCQ)

The dataset adopts [AfriMed-QA](https://aclanthology.org/2025.acl-long.96/) (ACL 2025), a domain-specific dataset for the medical field in Africa with strong scenario customization characteristics, comprising two formats—multiple-choice questions (MCQ) and short-answer questions (SAQ)—which in this case serve as the evaluation for vertical-domain fine-tuning. In terms of evaluation criteria, BLEU/ROUGE are used for SAQ, and Accuracy is used for MCQ.

| AfriMed-QA (SAQ)               | BLEU-1    | BLEU-2    | BLEU-3    | BLEU-4    | ROUGE-1   | ROUGE-2   | ROUGE-L   |
| ------------------------------ | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| V2-Lite (no LoRA)              | 13.58     | 11.12     | 9.10      | 7.23      | 22.48     | 7.81      | 11.73     |
| **KT-LoRA fine-tuned V2-Lite** | **35.90** | **27.63** | **22.99** | **19.15** | **35.25** | **17.50** | **28.44** |
| V3 base (no LoRA)              | 12.75     | 10.27     | 8.05      | 5.99      | 20.33     | 5.65      | 10.11     |
| **KT-LoRA fine-tuned V3**      | **42.42** | **34.12** | **28.95** | **24.54** | **41.97** | **22.37** | **33.28** |

| AfriMed-QA (MCQ)               | Accuracy   |
| ------------------------------ | ---------- |
| V2-Lite (no LoRA)              | 0.0645     |
| **KT-LoRA fine-tuned V2-Lite** | **0.4812** |
| V3 base (no LoRA)              | 0.5833     |
| **KT-LoRA fine-tuned V3**      | **0.7930** |

As shown in the tables above, (1) DeepSeek-V3 (671B) after KT-LoRA fine-tuning achieves clearly higher performance than the fine-tuned DeepSeek-V2-Lite (14B) on both MCQ and SAQ, and it also surpasses the V3 base model. Within our small-scale setting, this preliminarily indicates that KT-LoRA fine-tuning of ultra-large-parameter models has practical significance in vertical domains.

(2) Across both SAQ/MCQ sub-tasks, KT-LoRA delivers consistent gains, indicating that—with KT’s heterogeneous placement and backend operator support—LoRA fine-tuning can effectively inject the key knowledge points of vertical domains such as medicine into the model.

#### Limitations

At present, most of our testing is conducted on **single datasets** and at **small scale** (≤ 20k examples), with the goal of providing **existence evidence of system effectiveness for KT-LoRA fine-tuning**, rather than drawing generalized conclusions about algorithmic generalization or scaling laws. Our report primarily presents representative figures; to support stronger algorithmic claims, larger sample sizes, multi-lingual/multi-domain datasets, and multi-seed repeated experiments would be required—these are beyond the scope of this work.

**We also warmly welcome everyone to join the open-source LLaMA-Factory KT fine-tuning project. If you have additional test results, we especially welcome you to record them in the shared spreadsheet below, and to include the corresponding `kt_optimize_rule` files, dataset examples, training/evaluation YAMLs, and detailed GPU-memory and CPU configurations for community reference and reproducibility~!** 


### Speed Tests

#### End-to-End Performance

**Definitions**

`step_time`：time per optimization step (tensor movement + Attention + MoE + others).

`tokens_per_step = GAS × qlen`；`token/s = tokens_per_step / step_time`。 We use `GAS=16`, `qlen=512` → `tokens_per_step=8192`.

**Measured**

| Model                | step_time (s) | tokens/step | token/s   |
| -------------------- | ------------- | ----------- | --------- |
| DeepSeek-V3-671B     | 203           | 8192        | **40.35** |
| DeepSeek-V2-Lite-14B | 36            | 8192        | **227.6** |

#### MoE Compute (DeepSeek-V3-671B)

**Theory**

- MoE per-layer, per-token FLOPs (forward+backward) approx.:
  $$
  \text{FLOPs}_{\text{per-layer, per-token}} \approx c \cdot k \cdot H \cdot I
  $$

​		with $k = 8$（Top-k），$H = 7168$（hidden size），$I = 2048$（intermediate size），$c\approx16$（≈6 forward + ≈10 backward matmuls）。

- Per-step across all MoE layers:
  $$
  \text{FLOPs}_{\text{per-step}} \approx c \cdot qlen \cdot k \cdot H \cdot I \cdot L_{\text{MoE}}
  $$

​		Plugging $c=16, qlen=512, k=8, H=7168, I=2048, L_{MoE}=58$，$\text{FLOPs}_{\text{per-step}} \approx 55.8\ \text{TFLOPs}$.

**Measured (MoE TFLOPS on CPU)**

If the **MoE-only** time per step is `t_moe` (seconds), $\text{TFLOPS} = \text{FLOPs}_{\text{per-step}} / \text{step\_per\_second}.$

Use MoE-phase time, not full `step_time`, to get MoE throughput.

| TFLOPS  | Forward | Backward |
| ------- | ------- | -------- |
| Average | 17.55   | 18.41    |

### Memory Footprint

- DeepSeek-V3 (671B; 58 MoE layers out of 61): ~**70 GB** total GPU, ~**1.2–1.3 TB** host memory.
- DeepSeek-V2-Lite (14B; 26 MoE layers out of 27): ~**5 GB** GPU, ~**30 GB** host memory.


## Conclusion

Integrating **KTransformers LoRA** with **LLaMA-Factory** provides a practical path to efficiently train and deploy MoE LLMs. KT contributes placement strategies and operator optimizations (DeepSeek/Qwen/Kimi support with AMX-accelerated kernels), and LoRA enables customization with very low GPU memory; LLaMA-Factory supplies a coherent user-level interface.

This means even tens-to-hundreds-of-billion-parameter MoE models can be fine-tuned and served with low latency on ordinary hardware. The approach balances **memory savings**, **speed**, and **usability**, turning ultra-large models into tools that developers can actually wield.

================================================
FILE: doc/en/SFT/KTransformers-Fine-Tuning_User-Guide.md
================================================
- [Introduction](#introduction)
  - [Fine-Tuning Results (Examples)](#fine-tuning-results-examples)
- [Quick to Start](#quick-to-start)
  - [Environment Setup](#environment-setup)
  - [Core Feature 1: Use KTransformers backend to fine-tune ultra-large MoE models](#core-feature-1-use-ktransformers-backend-to-fine-tune-ultra-large-moe-models)
  - [Core Feature 2: Chat with the fine-tuned model (base + LoRA adapter)](#core-feature-2-chat-with-the-fine-tuned-model-base--lora-adapter)
  - [Core Feature 3: Batch inference + metrics (base + LoRA adapter)](#core-feature-3-batch-inference--metrics-base--lora-adapter)
- [KT Fine-Tuning Speed (User-Side View)](#kt-fine-tuning-speed-user-side-view)
  - [End-to-End Performance](#end-to-end-performance)
  - [GPU/CPU Memory Footprint](#gpucpu-memory-footprint)
- [Conclusion](#conclusion)


# KTransformers Fine-Tuning × LLaMA-Factory Integration – User Guide

**MadSys Lab, KVCache-AI Team, Approaching AI, LLaMA-Factory Team**

## Introduction

From **DeepSeek-V3/R1** to **Qwen3-MoE** and **Kimi-K2**, each wave of open-sourced large models brings leaps in performance and scale. However, many researchers and developers are constrained by expensive GPUs and models with tens or even hundreds of billions of parameters, making it **hard to fine-tune very large models under limited resources**. To bridge this gap, we propose a practical approach: combining **KTransformers** with **LLaMA-Factory**. With just **2–4 RTX 4090s** and a high-memory CPU, you can fine-tune ultra-large MoE models like DeepSeek-671B.

Our goal is to give resource-constrained researchers a **local path to explore fine-tuning ultra-large models**, and also a fast way to customize smaller models (e.g., 14B/30B) for specific scenarios. We validate the setup using **stylized dialogue**, **Westernized translation tone**, and **medical Q&A** as representative tasks, showing that **personalized adaptation can be achieved within hours**.

As shown below, LLaMA-Factory is the unified orchestration/configuration layer for the whole fine-tuning workflow—handling data, training scheduling, LoRA injection, and inference interfaces. **KTransformers** acts as a pluggable high-performance backend that takes over core operators like Attention/MoE under the same training configs, enabling efficient **GPU+CPU heterogeneous cooperation**.

![image-20251011010558909](../../assets/image-20251011010558909.png)

Within LLaMA-Factory, we compared LoRA fine-tuning with **HuggingFace**, **Unsloth**, and **KTransformers** backends. KTransformers is the **only workable 4090-class solution** for ultra-large MoE models (e.g., 671B) and also delivers higher throughput and lower GPU memory on smaller MoE models (e.g., DeepSeek-14B).

| Under LoRA (BF16) + [NekoQA-10K stylized dialogue](https://github.com/mindsRiverPonder/LLM-practice) | HuggingFace Backend                      | Unsloth Backend                      | KTransformers Backend |
| ------------------------------------------------------------ | ---------------------------------------- | ------------------------------------ | --------------------- |
| [14B-DeepSeekV2-Lite] LoRA fine-tuning throughput            | 303.58 token/s                           | 455.37 token/s                       | 530.38 token/s        |
| [14B-DeepSeekV2-Lite] GPU memory                             | 32.12 GB                                 | 9.64 GB                              | 6.08 GB               |
| [671B-DeepSeekV3] LoRA fine-tuning throughput                | <font color='red'>Too Huge to run</font> | <font color='red'>NOT SUPPORT</font> | 40.35 token/s         |
| [671B-DeepSeekV3] GPU memory (sum across GPUs)               | theoretical 1400 GB †                    | <font color='red'>NOT SUPPORT</font> | 70 GB †               |

† **1400 GB** is a **theoretical** FP16 full-parameter resident footprint (not runnable). **70 GB** is the **measured peak** with KT strategy (Attention on GPU + layered MoE offload).

![按照模型划分的对比图_02](../../assets/image-compare_model.png)

### Fine-Tuning Results (Examples)

#### Stylized Dialogue (CatGirl tone)

Dataset: [NekoQA-10K](https://zhuanlan.zhihu.com/p/1934983798233231689). Goal: improve style consistency and recognizability.

The figure compares responses from the base vs. fine-tuned models. The fine-tuned model maintains the target tone and address terms more consistently (red boxes), validating the effectiveness of **style-transfer fine-tuning**.

![image-20251016175046882](../../assets/image-20251016175046882.png)

#### Benchmarks

We use:

(1) [Translational-Style-ChatLLM](https://github.com/Benson114/Translational-Style-ChatLLM), which asks for an exaggerated, Westernized translation tone—clear, stylized customization.

(2) [AfriMed-QA](https://aclanthology.org/2025.acl-long.96/) (ACL 2025), a medical dataset for African contexts with strong domain specificity, including multiple-choice and short-answer sub-tasks—well-suited for vertical fine-tuning evaluation.

The tables show metrics before vs. after LoRA fine-tuning. We observe **large improvements** across metrics, verifying fine-tuning effectiveness:

| Translational-Style dataset    | BLEU-1    | BLEU-2    | BLEU-3    | BLEU-4    | ROUGE-1   | ROUGE-2   | ROUGE-L   |
| ------------------------------ | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| V2-Lite (no LoRA)              | 20.66     | 8.33      | 4.54      | 2.89      | 22.71     | 4.52      | 19.19     |
| **KT-LoRA fine-tuned V2-Lite** | **35.41** | **22.44** | **15.42** | **11.18** | **42.03** | **18.38** | **33.10** |
| V3 base (no LoRA)              | 8.49      | 3.34      | 1.62      | 0.96      | 15.91     | 2.55      | 10.07     |
| **KT-LoRA fine-tuned V3**      | **37.02** | **23.70** | **16.21** | **11.49** | **43.43** | **18.96** | **34.54** |

| AfriMed-QA (short answer)      | BLEU-1    | BLEU-2    | BLEU-3    | BLEU-4    | ROUGE-1   | ROUGE-2   | ROUGE-L   |
| ------------------------------ | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| V2-Lite (no LoRA)              | 13.58     | 11.12     | 9.10      | 7.23      | 22.48     | 7.81      | 11.73     |
| **KT-LoRA fine-tuned V2-Lite** | **35.90** | **27.63** | **22.99** | **19.15** | **35.25** | **17.50** | **28.44** |
| V3 base (no LoRA)              | 12.75     | 10.27     | 8.05      | 5.99      | 20.33     | 5.65      | 10.11     |
| **KT-LoRA fine-tuned V3**      | **42.42** | **34.12** | **28.95** | **24.54** | **41.97** | **22.37** | **33.28** |

| AfriMed-QA (multiple choice)   | Accuracy   |
| ------------------------------ | ---------- |
| V2-Lite (no LoRA)              | 0.0645     |
| **KT-LoRA fine-tuned V2-Lite** | **0.4812** |
| V3 base (no LoRA)              | 0.5833     |
| **KT-LoRA fine-tuned V3**      | **0.7930** |

Even for ultra-large MoE models, **KTransformers-backed fine-tuning** achieves strong task performance quickly.


## Quick to Start

This section shows how to install and use **LLaMA-Factory + KTransformers** for fine-tuning and inference:

- Environment setup
- Fine-tune ultra-large MoE models with KTransformers backend
- Load the fine-tuned model (base + LoRA adapter) for chat/inference
- Batch inference and metric evaluation

### Environment Setup

According to the following example, install both the **KTransformers** and **LLaMA-Factory** environments simultaneously.
 This time, to simplify the installation process of KTransformers, we have specially packaged a wheel file to avoid local compilation.
 The detailed installation steps are as follows:
 (Note: Make sure your local **Python version**, **Torch version**, **CUDA version**, and the **KTransformers wheel filename** correspond correctly.)

```shell
# 1. Create a conda environment
conda create -n Kllama python=3.12 # choose from : [3.10, 3.11, 3.12, 3.13]
conda install -y -c conda-forge libstdcxx-ng gcc_impl_linux-64
conda install -y -c nvidia/label/cuda-11.8.0 cuda-runtime

# 2. Install the LLaMA-Factory environment
git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
cd LLaMA-Factory
pip install -e ".[torch,metrics]" --no-build-isolation

# 3. Install the KTransformers wheel that matches your Torch and Python versions, from https://github.com/kvcache-ai/ktransformers/releases/tag/v0.4.1 (Note: The CUDA version can differ from that in the wheel filename.)
pip install ktransformers-0.4.1+cu128torch27fancy-cp312-cp312-linux_x86_64.whl

# 4. Install flash-attention, download the corresponding file based on your Python and Torch versions from: https://github.com/Dao-AILab/flash-attention/releases
pip install flash_attn-2.8.3+cu12torch2.7cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
# abi=True/False can find from below
# import torch
# print(torch._C._GLIBCXX_USE_CXX11_ABI)

# 5. (Optional) If you want to use flash_infer (otherwise it defaults to triton)
git clone https://github.com/kvcache-ai/custom_flashinfer.git
pip install custom_flashinfer/
```

**Usage tip:** In LLaMA-Factory YAML, set `use_kt: true` and pick a `kt_optimize_rule` file to have KTransformers handle the core compute. The features below show typical configs.

### Core Feature 1: Use KTransformers backend to fine-tune ultra-large MoE models

Run the command: `USE_KT=1 llamafactory-cli train examples/train_lora/deepseek3_lora_sft_kt.yaml`.

Note: You **must** provide a **BF16** model. DeepSeek-V3-671B is released in FP8 by default; convert with [DeepSeek-V3/inference/fp8_cast_bf16.py](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py).

```yaml
### model
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 8
lora_target: all

### dataset
dataset: identity
template: deepseek
cutoff_len: 2048
max_samples: 100000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: saves/Kllama_deepseekV3
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null

### ktransformers
use_kt: true # use KTransformers as LoRA sft backend
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192
```

`kt_optimize_rule` controls **placement strategy**. See also [ktransformers/optimize_rules](https://github.com/kvcache-ai/ktransformers/tree/main/ktransformers/optimize/optimize_rules). Naming hints (`*` = wildcard):

| Pattern                                      | Meaning                                               |
| -------------------------------------------- | ----------------------------------------------------- |
| DeepSeek-V2-Lite-Chat-* / DeepSeek-V3-Chat-* | Target model variants                                 |
| *-sft-*                                      | Strategy for fine-tuning; others are for inference    |
| *-amx-*                                      | Use AMX on CPU; otherwise use **llamafile**           |
| *-multi-gpu-X*                               | Model parallel on X GPUs (X omitted → default 2 GPUs) |

Example: `DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml` = V3-Chat fine-tuning with AMX and 2-GPU model parallel.

We recommend **AMX acceleration** where available (`lscpu | grep amx`). AMX supports BF16/INT8. Example:

```yaml
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
```

Outputs go to `output_dir` in safetensors format plus adapter metadata for later loading.

![image-20251016171537997](../../assets/image-20251016171537997.png)

### Core Feature 2: Chat with the fine-tuned model (base + LoRA adapter)

Run the command: `llamafactory-cli chat examples/inference/deepseek3_lora_sft_kt.yaml`.

Use the safetensors adapter trained with KT for inference.

```yaml
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
adapter_name_or_path: saves/Kllama_deepseekV3
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true

use_kt: true # use KTransformers as LoRA sft backend to inference
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192
```

We also support **GGUF** adapters: for safetensors, set the **directory**; for GGUF, set the **file path** in `adapter_name_or_path`.

During loading, LLaMA-Factory maps layer names to KT’s naming. You’ll see logs like `Loaded adapter weight: XXX -> XXX`:

![image-20251016171526210](../../assets/image-20251016171526210.png)

### Core Feature 3: Batch inference + metrics (base + LoRA adapter)

Run the command: `API_PORT=8000 llamafactory-cli api examples/inference/deepseek3_lora_sft_kt.yaml`.
 Invoke the KT fine-tuned adapter to provide the API; the usage logic of other APIs is consistent with the native LLaMA-Factory approach.

```yaml
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
adapter_name_or_path: saves/Kllama_deepseekV3
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true

use_kt: true # use KTransformers as LoRA sft backend to inference
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192
```


## KT Fine-Tuning Speed (User-Side View)

### End-to-End Performance

**Definitions**

- `step_time`: wall-clock time for a full optimization step (tensor movement + Attention + MoE + other compute).
- `tokens_per_step = GAS × qlen`; `token/s = tokens_per_step / step_time`.

**Settings:** `GAS=16`, `qlen=512` (→ `tokens_per_step = 8192`); LoRA (`r=8, alpha=32, dropout=0.1`); **AMX** enabled; GPU: RTX 4090, CPU: Intel Xeon Platinum 8488C.

**Measured**

- **DeepSeek-V3-671B:** `step_time = 203 s` → `token/s ≈ 8192 / 203 ≈ 40.35`
- **DeepSeek-V2-Lite-14B:** `step_time = 36 s` → `token/s ≈ 8192 / 36 ≈ 227.6`

### GPU/CPU Memory Footprint

- DeepSeek-V3 (671B; 61 layers with 58 MoE): ~**70 GB** total GPU VRAM (multi-GPU), ~**1.2–1.3 TB** CPU RAM.
- DeepSeek-V2-Lite (14B; 27 layers with 26 MoE): ~**5.5 GB** GPU VRAM, ~**30 GB** CPU RAM.

## Conclusion

By integrating **KTransformers LoRA fine-tuning** into **LLaMA-Factory**, we provide a practical guide for efficient training and deployment of MoE LLMs. KT brings cutting-edge optimizations (DeepSeek/Qwen/Kimi support with AMX-accelerated kernels), and LoRA enables customization under very low GPU memory. LLaMA-Factory offers a friendly, unified interface.

This integration (akin to Unsloth-style speedups) means even models with tens to hundreds of billions of parameters can be fine-tuned and deployed with low latency on commodity hardware. You get **memory savings, speed-ups, and usability** together. We encourage you to try LLaMA-Factory + KT for your next MoE project and follow this guide. Feedback is welcome!


================================================
FILE: doc/en/SFT/README.md
================================================
# kt-sft Docs

================================================
FILE: doc/en/SFT/injection_tutorial.md
================================================
# Tutorial: Inject Operator Step by Step

> Author: Azure-Tang

## TL;DR
This tutorial will guide you through the process of injecting custom operators into a model using the KTransformers framework. We will use the DeepSeekV2-Chat model as an example to demonstrate how to inject custom operators into the model step by step. The tutorial will cover the following topics:
- [TL;DR](#tldr)
- [How to Write Injection Rules](#how-to-write-injection-rules)
- [Understanding Model Structure](#understanding-model-structure)
- [Matrix Absorption-based MLA Injection](#matrix-absorption-based-mla-injection)
- [Injection of Routed Experts](#injection-of-routed-experts)
- [Injection of Linear Layers](#injection-of-linear-layers)
- [Injection of Modules with Pre-calculated Buffers](#injection-of-modules-with-pre-calculated-buffers)
- [Specifying Running Devices for Modules](#specifying-running-devices-for-modules)
- [Muti-GPU](#muti-gpu)
- [How to Write a New Operator and Inject into the Model](#how-to-write-a-new-operator-and-inject-into-the-model)

## How to Write Injection Rules
The basic form of the injection rules for the Inject framework is as follows:
```yaml
- match:
    name: "^model\\.layers\\..*\\.*$"  # Target module name
    class: torch.nn.Linear  # Target module
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      # your_op_param_1: 1234
      # your_op_param_2: 5678
  recursive: True
```
* match: This field marks the matching rules, which can appear in two forms, name and class. These two matching rules can appear together or separately; they only match when both criteria are met.
* replace:
	* class: Python class that can be imported to replace the target module. If no replacement is desired, set to default.
	* kwargs: List of parameters needed for module initialization.
	    * generate_device: The device for this module, can be set to “cpu”, “cuda”, “cuda:1”, etc.
* recursive: Whether to recursively inject this module’s submodules, default is True.

For the recursive field: Some modules contain multiple submodules, such as the Self-attention module typically includes q/k/v/o four linear modules. If we replace the self-attention module but do not want the internal linear modules to be covered by other rules, set this rule to False.

## Understanding Model Structure
Using [deepseek-ai/DeepSeek-V2-Lite-Chat](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat) as an example, we can follow the above rules step by step to inject our custom module and run it. KTransformers offers a high degree of flexibility, allowing you to replace/experiment with basic operators. However, it also requires users to clearly understand the structure of the model they are running.

Fortunately, knowing the structure of a model is very simple. Open the file list on the [deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/tree/main) homepage, and you can see the following files:
<p align="center">
  <picture>
    <img alt="Inject-Struction" src="../../assets/model_structure_guild.png" width=60%>
  </picture>
</p>

From the `.saftensors` file, we can see the name of each layer’s weights, corresponding to the match.name attribute in the injection rules.
From the `modeling_deepseek.py` file, we can see the specific implementation of each module class, with the class name corresponding to the match.class attribute in the injection rules.

The structure of the DeepSeekV2 model from the `.saftensors` and `modeling_deepseek.py` files is as follows:
<p align="center">
  <picture>
    <img alt="Inject-Struction" src="../../assets/deepseekv2_structure.png" width=60%>
  </picture>
</p>

Supported operators and their corresponding classes are as follows:

| match     | replace                | backends                | descriptions         |
| --------- | ---------------------- | ----------------------- | -------------------- |
| Linear    | KTransformersLinear    | KLinearMarlin           | Marlin as backend    |
|           |                        | KLinearTorch            | pytorch as backend   |
|           |                        | KLinearCPUInfer         | llamafile as backend |
|           |                        | KLinearFP8         | Triton fp8_gemm kernel. Requires GPU be able to caluculate fp8 data |
| experts   | KTransformersExperts   | KExpertsTorch           | pytorch as backend   |
|           |                        | KExpertsMarlin          | Marlin as backend    |
|           |                        | KExpertsCPU             | llamafile as backend |
| Attention | KDeepseekV2Attention   | KDeepseekV2Attention    | MLA implementation   |
| MoE       | KMistralSparseMoEBlock | KQwen2MoeSparseMoeBlock | MoE for Qwen2        |
|           | KDeepseekV2MoE         | KDeepseekV2MoE          | MoE for DeepseekV2   |
| Model     | KQwen2MoeModel         | KQwen2MoeModel          | Model for Qwen2      |
|           | KDeepseekV2Model       | KDeepseekV2Model        | Model for DeepseekV2 |
| RoPE      | RotaryEmbedding        | RotaryEmbedding         | RoPE module          |
|           | YarnRotaryEmbedding    | YarnRotaryEmbedding     | RoPE module          |

Then we start step-by-step injection of custom modules, our targets are:

* Replace the linear module with custom Marlin linear module.
* Replace the self-attention module with a custom Absorption-based MLA module.
* Replace the experts module with a custom Experts module.
* Replace the MoE module with a custom MoE module.
* Replace the RoPE module with a custom RoPE module.
* Set the running device for each module.

The full implementation of the injection rules can be found in the [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml).

## Matrix Absorption-based MLA Injection

For the injection of the Attention module, we only need to use a regular expression to match the module names used in transformers and replace them with our own MLA module implementation. The YAML injection rule is as follows:
```yaml
- match:
    name: "^model\\.layers\\..*\\.self_attn$"  # Regular expression
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # Optimized MLA implementation
```
As you can see, each rule in the YAML file has two parts: match and replace. The match part specifies the module to be replaced, and the replace part specifies the module to be injected into the model along with the initialization keywords.

## Injection of Routed Experts
For Routed Experts (corresponding to the exps in the diagram), the module we inject is CPUInfer, which is wrapped in the wrapper module KTransformersExperts. KTransformersExperts has multiple implementations, and we need to specify keywords to tell the wrapper module which implementation we want to use and how we plan to use it.

In the source code of the transformer, MoE is implemented using nn.ModuleList. We do not want KTransformers to traverse all submodules in the list and inject them one by one, so in this rule, we set recursive: False to prevent recursive injection into the submodules of this module. The YAML rule is as follows:

```yaml
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # Custom MoE kernel with expert parallelism
    kwargs:
      generate_device: "cpu"
      generate_op: "MLPCPUExperts"
      out_device: "cuda"
  recursive: False # Don't recursively inject submodules of this module
```

If we inject Routed Experts as a custom module, we cannot use the interfaces in the original `nn.ModuleList`. Therefore, it is necessary to modify the forward function in the FFN module. The simplest method is to implement a new module with a custom forward function and inject it.
```yaml
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # MLP module with custom forward function
```

## Injection of Linear Layers

For the remaining linear layer modules, we aim to use quantized operators to save storage space while improving performance. Since there is no current research on using MLA and quantization together, we do not want to inject linear into the MLA operator. Therefore, we can modify the regular expression and add a type check in the match part of the rule. Only modules that match both the name and class simultaneously will be injected. We also need to pass some keywords similar to the injection of Routed Experts. The YAML rule is as follows:

```yaml
- match:
    name: "^model\\.layers\\.(?!.*self_attn).*$"  # Regular expression
    class: torch.nn.Linear  # Only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # Optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      generate_op: "QuantizedLinearMarlin"
```
## Injection of Modules with Pre-calculated Buffers

To avoid occupying resources when initializing the injected original model, we use torch’s meta device to initialize the original model. The RoPE module pre-calculates some buffers during initialization, but no calculations are performed when using the meta device. Therefore, we need to compensate for the calculation of the buffer when loading the model. Simply, we inject a custom module into the rotary embedding module, which performs pre-calculation during loading. The YAML rule is as follows:
```yaml
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
```

## Specifying Running Devices for Modules

Finally, we set a fallback basic attribute generate_device for all modules:
```yaml
- match:
    name: "^model\\.layers\\..*\\.|^lm_head"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda"
  
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
```
Through these two rules, we place all previously unmatched layers (and their submodules) and lm_head on cuda, and the embedding on cpu. Note that the properties of a module will be determined by the first rule it matches. For example, if you later set a new replace.kwargs.generate_device in an injected module, the device set earlier will take precedence. If your computer has multiple cards, you can also configure the model to multiple cards.


## Muti-GPU

If you have multiple GPUs, you can set the device for each module to different GPUs. 
DeepseekV2-Chat got 60 layers, if we got 2 GPUs, we can allocate 30 layers to each GPU. Complete multi GPU rule examples [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml).


<p align="center">
  <picture>
    <img alt="Inject-Struction" src="../../assets/multi_gpu.png" width=60%>
  </picture>
</p>

First of all, for multi-GPU, we have to inject an new operator `KDeepseekV2Model`. And set division of the layers to different GPUs. For our case, we have to set the `transfer_map` in the `KDeepseekV2Model` operatoras as follows:

```yaml
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      transfer_map: 
        30: "cuda:1"
```

And we have to set the device for each module in the model. 

For example, for `routed experts`, the yaml for one GPU is:
```yaml
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # Custom MoE kernel with expert parallelism
    kwargs:
      generate_device: "cuda:0"
      generate_op: "MLPCUDAExperts"
      out_device: "cuda:0"
  recursive: False # Don't recursively inject submodules of this module
```
But for two GPUs, we need to set the device for each module in the model. 

```yaml
# allcate 0-29 layers‘s out_device to cuda:0
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

# allocate 30-59 layers‘s out_device to cuda:1
- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module
```
For other modules, we can set the device in the same way.

## How to Write a New Operator and Inject into the Model

In this section, we will explain how to write an operator that can be injected, using the implementation of a new linear as an example.

First, all injectable operators need to inherit from the BaseInjectedModule class, which inherits some attributes required by our injection framework. Its initialization function needs to meet the following basic format:

```python
class LinearTorchInject(BaseInjectedModule):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        generate_device: str = "cuda",
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, generate_device, **kwargs)
```
If users have other parameters that need to be passed to this class, they can also be included in the init function and re-passed in the kwargs parameter in the yaml file. For example, if our operator wants to pass a parameter `my_param`, the init function can be written as:
```python
class LinearTorchInject(BaseInjectedModule):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        generate_device: str = "cuda",
        my_param: bool = True,
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, generate_device, **kwargs)
        self.my_param = my_param
```
Then our injection rule can be written as:
```yaml
- match: 
    name: "^model\\.layers\\..*$"  # Regular expression matches the module name.
    class: torch.nn.Linear  # Type restrictions can be added.
  replace:
    class: ktransformers.operators.linear.LinearTorchInject  # Inject module path
    kwargs: # Extra parameters
      generate_device: "cuda"
      my_param: True
```
For the linear module, it is also necessary to read weights from a gguf file. We provide the `KLinearBase` class to help users read weights from gguf files. Users only need to inherit and implement the load, unload, and forward functions. Therefore, a fully injectable linear class would look like this:
```python
class LinearTorchInject(BaseInjectedModule, KLinearBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        generate_device: str = "cuda",
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, generate_device, **kwargs)
        KLinearBase.__init__(self)
        self.has_bias = False
        self.dtype = torch.get_default_dtype()
        self.w = None
        self.has_bias = False
    
    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if device is None: device = self.device
        if w is None: w = self.load_weight(device=device)

        if isinstance(w, nn.Parameter):
            self.w = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T
            self.has_bias = False
        elif isinstance(w, tuple):
            self.w = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T
            self.bias = w[1].to(dtype=self.dtype)
            self.has_bias = True
        else:
            raise ValueError("Invalid weight type")
        self.w = self.w.to(device)
        if self.has_bias:
            self.bias = self.bias.to(device)

    def unload(self):
        if self.w is not None:
            self.w = None
        if self.has_bias:
            self.bias = None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        dtype = x.dtype
        out_device = x.device
        x = x.to(device=self.device, dtype=self.dtype)
        x = x @ self.w
        if self.has_bias:
            x = x + self.bias
        x = x.to(dtype=dtype, device=out_device)
        return x
```
Note that the `self.load_weight` function is provided by the KLinearBase class to help users load weights from a gguf file into the module. The implementation details of KLinearBase can be found on [GITHUB](https://github.com/kvcache-ai/ktransformers/blob/44f57270c9514d79fab224186d90ccf61059331a/ktransformers/operators/linear.py#L31).


================================================
FILE: doc/en/SFT_Installation_Guide_KimiK2.5.md
================================================
# Kimi-K2.5 LoRA SFT Tutorial

This tutorial demonstrates how to perform **LoRA Supervised Fine-Tuning (SFT)** on **Kimi-K2.5** using **LlamaFactory** with **KTransformers** as the backend, and then serve the fine-tuned model using **SGLang**.

The workflow is:

```txt
KTransformers + LlamaFactory LoRA SFT → (Optional) LlamaFactory Verification → SGLang Serving
```

## Table of Contents

- [Hardware Requirements](https://chatgpt.com/c/6975bb7f-52e0-839c-a727-ec4b5d6723b5#hardware-requirements)
- [Prerequisites](https://chatgpt.com/c/6975bb7f-52e0-839c-a727-ec4b5d6723b5#prerequisites)
- [Step 0: Environment Setup (Method 1: Source Install)](https://chatgpt.com/c/6975bb7f-52e0-839c-a727-ec4b5d6723b5#step-0-environment-setup-method-1-source-install)
- [Step 1: Prepare Model Weights (BF16 for SFT)](https://chatgpt.com/c/6975bb7f-52e0-839c-a727-ec4b5d6723b5#step-1-prepare-model-weights-bf16-for-sft)
- [Step 2: Prepare YAML for LoRA SFT (KTransformers Backend)](https://chatgpt.com/c/6975bb7f-52e0-839c-a727-ec4b5d6723b5#step-2-prepare-yaml-for-lora-sft-ktransformers-backend)
- [Step 3: Run LoRA SFT](https://chatgpt.com/c/6975bb7f-52e0-839c-a727-ec4b5d6723b5#step-3-run-lora-sft)
- [Step 4: Post-SFT Quick Verification with LlamaFactory (Optional)](https://chatgpt.com/c/6975bb7f-52e0-839c-a727-ec4b5d6723b5#step-4-post-sft-quick-verification-with-LlamaFactory-optional)
- [Step 5: SGLang Serving with LoRA (Recommended Delivery Path)](https://chatgpt.com/c/6975bb7f-52e0-839c-a727-ec4b5d6723b5#step-5-sglang-serving-with-lora-recommended-delivery-path)

## Hardware Requirements

### Training (LoRA SFT)

- **LlamaFactory + KTransformers**
- **GPU**: 4 * NVIDIA RTX 4090 24GB (or equivalent with at least total 48GB VRAM available)
- **CPU**: x86 CPU with AMX support
- **RAM**: At least 2TGB system memory
- Swap can be used if CPU memory is insufficient

### Inference (LoRA Adapter + Original Model)

- **SGLang + KTransformers**
- **GPU**: 2 * NVIDIA RTX 4090 24GB (or equivalent with at least total 48GB VRAM available)
- **CPU**: x86 CPU with AVX512F support (e.g., Intel Sapphire Rapids)
- **RAM**: At least 600GB system memory
- **Storage**: ~600GB for model weights (native INT4 weight, same weight dir for CPU and GPU)


## Step 0: Environment Setup

We recommend to separate **two conda environments**:

| Environment | Purpose                                             |
| ----------- | --------------------------------------------------- |
| `kt-kernel` | Inference & serving (KTransformers + SGLang)        |
| `kt-sft`    | Training (LlamaFactory + KTransformers SFT backend) |

### 0.1 Inference Environment: `kt-kernel`

```bash
conda create -n kt-kernel python=3.11
conda activate kt-kernel

git clone https://github.com/kvcache-ai/ktransformers.git
git checkout kimi_k2.5
git submodule update --init --recursive
cd kt-kernel && ./install.sh
```

### 0.2 Install SGLang (Inference / Serving)

**Recommended for Kimi-K2.5:**

```bash
# Option A: One-click install (from ktransformers root, installs sglang + kt-kernel)
./install.sh

# Option B: pip install
pip install sglang-kt
```

### 0.3 Training Environment: `kt-sft`

```bash
conda create -n kt-sft python=3.11
conda activate kt-sft

git clone https://github.com/hiyouga/LlamaFactory.git
cd LlamaFactory
pip install -e .
```

### 0.4 Install KTransformers SFT Dependencies

```bash
conda install -y -c conda-forge libstdcxx-ng gcc_impl_linux-64
conda install -y -c nvidia/label/cuda-11.8.0 cuda-runtime

# Install matching wheels (recommended), from https://github.com/kvcache-ai/ktransformers/releases
pip install ktransformers-<matching-version>.whl
pip install flash_attn-<matching-version>.whl
```

## Step 1: Prepare Model Weights (BF16 for SFT)

### 1.1 Download INT4 Weights

KTransformers **requires BF16 weights for SFT**.

```bash
# Download Kimi-K2.5 (RAW-INT4 for both CPU and GPU)
huggingface-cli download moonshotai/Kimi-K2.5 \
  --local-dir /path/to/kimi-k2.5
```

### 1.2 Convert INT4 → BF16

Kimi-K2.5 base model is in **INT4** format, convert it to **BF16** before SFT.

## Step 2: Prepare YAML for LoRA SFT (KTransformers Backend)

### 2.1 Training YAML (LoRA SFT)

Example file:
`examples/train_lora/kimik2_lora_sft_kt.yaml`

Required fields:

```yaml
stage: sft
finetuning_type: lora
bf16: true

use_kt: true
kt_optimize_rule: <rule.yaml>
cpu_infer: 32
chunk_size: 8192
```

Other fields (dataset, output_dir, learning rate, epochs) can be adjusted as usual.

### 2.2 Inference YAML (LlamaFactory Verification)

Key requirements:

- `adapter_name_or_path`: LoRA output directory
- `infer_backend: ktransformers`
- **Same `use_kt` and `kt_optimize_rule` as training**

This YAML is used only for **quick verification**, not production serving.

## Step 3: Run LoRA SFT

```bash
conda activate kt-sft
cd LlamaFactory

USE_KT=1 llamafactory-cli train examples/train_lora/kimik2_lora_sft_kt.yaml
```

After training, the LoRA adapter is saved to `output_dir`.

## Step 4: Post-SFT Quick Verification with LlamaFactory (Optional)

Before production deployment, the new PDF recommends a **lightweight sanity check**.

```bash
conda activate kt-sft
cd LlamaFactory

llamafactory-cli chat examples/inference/kimik2_lora_sft_kt.yaml
```

Purpose:

- Validate LoRA correctness
- Ensure reproducibility
- Not for throughput benchmarking

## Step 5: SGLang Serving with LoRA (Recommended Delivery Path)

This is the **major runtime update** introduced by the new PDF.

### 5.1 Convert LoRA for SGLang

```bash
python ktransformers/kt-kernel/scripts/convert_lora.py \
  --base_path /path/to/kimi-base-model \
  --lora_path /path/to/llamafactory/output_dir \
  --output_path /path/to/lora_converted
```

### 5.2 (Optional) Convert CPU Weights to INT8

To reduce CPU memory usage:

```bash
python ktransformers/kt-kernel/scripts/convert_cpu_weights.py \
  --base_path /path/to/kimi-base-model \
  --output_dir /path/to/kimi-base-model-int8
```

This produces:

```text
/path/to/kimi-base-model-int8/int8
```

### 5.3 Launch SGLang Server with LoRA

```bash
conda activate kt-kernel

python -m sglang.launch_server \
  --enable-lora \
  --lora-paths lora1=/path/to/lora_converted \
  --lora-backend triton \
  --model-path /path/to/kimi-base-model \
  --tp 1 \
  --trust-remote-code \
  --context-length 4096 \
  --kt-weight-path /path/to/kimi-base-model-int8/int8 \
  --mem-fraction-static 0.9
```

Notes:

- `--kt-weight-path` points to CPU INT8 weights
- Adjust `tp`, `context-length`, and memory parameters per machine
- RAWINT4 inference paths can follow **Kimi-K2.5-Native** directly

================================================
FILE: doc/en/SFT_Installation_Guide_KimiK2.md
================================================
## Installation

### Step 1: Create a conda environment and suit it for KTransformers

```Bash
conda create -n Kllama python=3.10 # choose from : [3.10, 3.11, 3.12, 3.13]
conda install -y -c conda-forge libstdcxx-ng gcc_impl_linux-64
conda install -y -c nvidia/label/cuda-11.8.0 cuda-runtime
```

### Step 2: Install the LLaMA-Factory environment

```Bash
git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
cd LLaMA-Factory
pip install -e ".[torch,metrics]" --no-build-isolation
```

### Step 3: Install the KTransformers wheel that matches your Torch and Python versions, from https://github.com/kvcache-ai/ktransformers/releases/tag/v0.4.1

(Note: The CUDA version can differ from that in the wheel filename.)

```Bash
pip install ktransformers-0.4.1+cu128torch28fancy-cp310-cp310-linux_x86_64.whl
```

### Step 4: Install the Flash-attention wheel that matches your Torch and Python versions, from: https://github.com/Dao-AILab/flash-attention/releases

```Bash
pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
# abi=True/False can find from below
# import torch
# print(torch._C._GLIBCXX_USE_CXX11_ABI)
```

### Step 5: (Optional) If you want to use flash_infer (otherwise it defaults to triton)

```Bash
git clone https://github.com/kvcache-ai/custom_flashinfer.git
pip install custom_flashinfer/
```

## Download Model

Download the official KIMI weights. If the weights are in FP8 format, please refer to [convert_kimi_k2_fp8_to_bf16_cpu.py](https://github.com/kvcache-ai/ktransformers/blob/main/kt-kernel/scripts/convert_kimi_k2_fp8_to_bf16_cpu.py) to convert them to BF16 weights.

## How to start

```Python
# For LoRA SFT
USE_KT=1 llamafactory-cli train examples/train_lora/kimik2_lora_sft_kt.yaml
# For Chat with model after LoRA SFT
llamafactory-cli chat examples/inference/kimik2_lora_sft_kt.yaml
# For API with model after LoRA SFT
llamafactory-cli api examples/inference/kimik2_lora_sft_kt.yaml
```

**If your** **CPU** **memory is insufficient to exceed 2T to support the Kimi K2, you can use the swap method additionally:**

```Plain
sudo fallocate -l 200G /data/swapfile
sudo chmod 600 /data/swapfile
sudo mkswap /data/swapfile
sudo swapon /data/swapfile
```

For example, we provide the YAML file as follows: (Since the structures of Kimi and DeepSeek are relatively similar, we use deepseek as template in llamafactory)

（1）examples/train_lora/kimik2_lora_sft_kt.yaml

```YAML
### model
model_name_or_path: KimiK2-model
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 8
lora_target: all

### dataset
dataset: identity
template: deepseek
cutoff_len: 2048
max_samples: 100000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: saves/Kllama_kimik2
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null

### ktransformers
use_kt: true # use KTransformers as LoRA sft backend
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192
```

For more details about --kt_optimize_rule, please refer to https://github.com/kvcache-ai/ktransformers/blob/main/doc/en/KTransformers-Fine-Tuning_User-Guide.md

（2）examples/inference/kimik2_lora_sft_kt.yaml

```YAML
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
adapter_name_or_path: saves/Kllama_deepseekV3
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true

use_kt: true # use KTransformers as LoRA sft backend to inference
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192

```


================================================
FILE: doc/en/SmallThinker_and_Glm4moe.md
================================================
# SmallThinker & GLM-4-MoE Support for KTransformers

## Introduction

### Overview
We are excited to announce that **KTransformers now supports both SmallThinker and GLM-4-MoE**.

- **SmallThinker-21BA3B-Instruct (bf16)**: ~26 TPS **on a dual-socket CPU with one consumer-grade GPU**, requiring ~84 GB DRAM.  
- **GLM-4.5-Air (bf16)**: ~11 TPS **on a dual-socket CPU with one consumer-grade GPU**, requiring ~440 GB DRAM.
- **GLM-4.5-Air (AMX INT8)**: prefill ~309 TPS / decode ~16 TPS **on a dual-socket CPU with one consumer-grade GPU**, requiring ~220 GB DRAM.

### Model & Resource Links
- **SmallThinker-21BA3B-Instruct**
  - *[SmallThinker-21BA3B-Instruct](https://huggingface.co/PowerInfer/SmallThinker-21BA3B-Instruct)*
- **GLM-4.5-Air 110B**
  - [*GLM-4.5-Air*](https://huggingface.co/zai-org/GLM-4.5-Air)

---

## Installation Guide

### 1. Resource Requirements

| Model                     | Precision  | Experts | DRAM Needed | GPU Memory Needed\* | TPS (approx.)                   |
| ------------------------- | ---------- | ------- | ----------- | ------------------- | --------------------------------------- |
| SmallThinker-21B-Instruct          | bf16       | 32      | \~42 GB     | 14 GB               | \~26 TPS                    |
| GLM-4.5-Air            | bf16       | 128     | \~220 GB    | 14 GB               | \~11 TPS                    |
| GLM-4.5-Air (AMX INT8) | int8       | 128     | \~220 GB    | 14 GB               |  \~16 TPS


\* Exact GPU memory depends on sequence length, batch size, and kernels used.  

### 2. Prepare Models

```bash
# Example: download original safetensors (adjust to your paths/repos)
# (Fill in actual repos/filenames yourself)

# SmallThinker-21B
huggingface-cli download --resume-download https://huggingface.co/PowerInfer/SmallThinker-21BA3B-Instruct \
  --local-dir ./SmallThinker-21BA3B-Instruct

# GLM-4-MoE 110B
huggingface-cli download --resume-download https://huggingface.co/zai-org/GLM-4.5-Air \
  --local-dir ./GLM-4.5-Air
```


### 3. Install KTransformers

Follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).

```bash
pip install ktransformers  # or from source if you need bleeding-edge features
```

### 4. Run SmallThinker-21B Inference Server

```bash
python ktransformers/server/main.py \
  --port 10021 \
  --model_path /abs/path/to/SmallThinker-21B-bf16 \
  --model_name SmallThinkerForCausalLM \
  --optimize_config_path ktransformers/optimize/optimize_rules/SmallThinker-serve.yaml \
  --max_new_tokens 1024 \
  --cache_lens 32768 \
  --chunk_size 256 \
  --max_batch_size 4 \
  --backend_type balance_serve
```

### 5. Run GLM-4-MoE 110B Inference Server

```bash
python ktransformers/server/main.py \
  --port 10110 \
  --model_name Glm4MoeForCausalLM \
  --model_path /abs/path/to/GLM-4-MoE-110B-bf16 \
  --optimize_config_path ktransformers/optimize/optimize_rules/Glm4Moe-serve.yaml \
  --max_new_tokens 1024 \
  --cache_lens 32768 \
  --chunk_size 256 \
  --max_batch_size 4 \
  --backend_type balance_serve
```

### 6. Access Server

```bash
curl -X POST http://localhost:10021/v1/chat/completions \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "hello"}
    ],
    "model": "SmallThinker-21BA3B-Instruct",
    "temperature": 0.3,
    "top_p": 1.0,
    "stream": true
  }'
```

```bash
curl -X POST http://localhost:10110/v1/chat/completions \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "hello"}
    ],
    "model": "GLM-4.5-Air",
    "temperature": 0.3,
    "top_p": 1.0,
    "stream": true
  }'
```


================================================
FILE: doc/en/V3-success.md
================================================
## Hello everyone, here is the successfully reproduced environment configuration for your reference:
### Case 1
- Configuration: l40s 48G + 9654 x2 (192 cores) + 768G DDR5 12-channel
- Performance: prefill 108 tokens/s, decode 10.8 tokens/s
- Used version: main source code compiled 
### Case 2
- Configuration: Dual Xeon 6430 32C processors, totaling 64 cores and 128 threads, 480GB DDR5 memory, single 4090 24G graphics card
- Performance: Running speed approximately 6-8 tokens per second 
## NOTE
If there are any other configurations that have been successfully run, please feel free to let us know. We will keep updating for everyone to refer to when reproducing. (It has been found that it also works on 2080, AMD, etc. (doge : )
[click here](https://docs.qq.com/smartsheet/form/AVxgQOYhhNfl%2FBB08J2%2Fv3rnnq?tab=BB08J2)

================================================
FILE: doc/en/api/server/api.md
================================================
# API

- [OpenAI ChatCompletion](#openai-chatcompletion)
- [Ollama ChatCompletion](#ollama-chatcompletion)
- [OpenAI Assistant](#openai-assistant)

## OpenAI ChatCompletion
```bash
POST /v1/chat/completions

```
Generate responses based on the selected model.

### Parameters
- `messages`: An array of `message` representing all historical messages. A `message` can be from a user or model (assistant) and includes:

  - `role`: Either `user` or `assistant`, indicating the creator of this message.
  - `content`: The message from the user or model.
- `model`: The name of the selected model
- `stream`: Either true or false. Indicates whether to use streaming response. If true, model inference results are returned via HTTP event stream.

### Response
- Streaming response: An event stream, each event contains a `chat.completion.chunk`. `chunk.choices[0].delta.content` is the incremental output returned by the model each time.
- Non-streaming response: Not supported yet.


### Example

```bash
curl -X 'POST' \
  'http://localhost:9112/v1/chat/completions' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "messages": [
    {
      "content": "tell a joke",
      "role": "user"
    }
  ],
  "model": "Meta-Llama-3-8B-Instruct",
  "stream": true
}'
```

```bash
data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"Why ","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}

data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}

data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"couldn't ","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}

...

data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"two-tired!","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}

event: done
data: [DONE]
```


## Ollama ChatCompletion

```bash
POST /api/generate
```

Generate responses using the selected model.

### Parameters
- `prompt`: A string representing the input prompt.
- `model`: The name of the selected model
- `stream`: Either true or false. Indicates whether to use streaming responses. If true, returns the model inference results in the form of an HTTP event stream.

### Response
- Streaming response: A stream of JSON responses, each line is a JSON.
  - `response`: The incremental result of the model completion.
  - `done`: Whether the inference has finished.
- Non-streaming response: Not yet supported.

### 例子

```bash
curl -X 'POST' \
  'http://localhost:9112/api/generate' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "model": "Meta-Llama-3-8B-Instruct",
  "prompt": "tell me a joke",
  "stream": true
}'
```

```bash
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:11.686513","response":"I'll ","done":false}
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:11.729214","response":"give ","done":false}

...

{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:33.955475","response":"for","done":false}
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:33.956795","response":"","done":true}
```


================================================
FILE: doc/en/api/server/server.md
================================================
# Backend Services (Server)
The Server offers fast heterogeneous inference capabilities of ktransformers through an API for external usage.

<img src="server-arch.png" height="600" alt="Server architecture">

## API

The Server provides model inference services externally through a RESTful API, with two methods of interaction: ChatCompletion and Assistant.

- The ChatCompletion interface requires users to provide all historical dialogues at once, after which the model responds. AI service providers (such as [OpenAI](https://platform.openai.com/docs/api-reference/chat/create)) and local inference frameworks (such as [Ollama](https://github.com/ollama/ollama/blob/main/docs/api.md)) both offer the ChatCompletion interface. To ensure compatibility with OpenAI and Ollama, the Server offers APIs that are consistent with theirs. Therefore, applications currently using OpenAI and Ollama can seamlessly switch to our Server. For example: [How to use Tabby and ktransformers locally with a 236B model for code completion?](tabby.md).
- The Assistant is suitable for applications that need to reuse a series of resources and call the model. For instance, in educational applications, developers can create an Assistant named "Second Grade Math Teacher" and set an initial prompt ("You are an experienced second-grade math teacher..."), and upload relevant materials (second grade math textbooks). After creating the Assistant, the application needs to create a Thread to store the dialogues between the user and the model (Message). When calling the model, the application creates a Run to obtain the Assistant's response. Compared to ChatCompletion, the Assistant-enabled Server handles the reuse of conversational contexts and multi-turn dialogues, making model calls in complex scenarios more convenient. The [OpenAI Assistant API](https://platform.openai.com/docs/api-reference/assistants/createAssistant) introduces such an Assistant interface, and the Server provides a consistent API.

These API definitions are located in `server/api`, and their specific usage can be seen [here](api.md).

## Integrating Model Inference Frameworks

The Server uses ktransformers for model calling and inference. It also supports other inference frameworks, such as the already supported [transformers](https://huggingface.co/docs/transformers/index), and plans to support [exllamav2](https://github.com/turboderp/exllamav2). These functionalities are implemented in `server/backend`.

The model inference functionalities of the frameworks are abstracted into a base class `BackendInterfaceBase`. This class includes a function: inference. It takes historical dialogue information messages as input and returns the text result from the model. The inference function adopts an async generator design, allowing the Server to return model responses in a streaming manner.

```python
class BackendInterfaceBase:
  async def inference(self, messages, **kwargs)->AsyncIterator[str]:
    ...
```

This inference function naturally implements the functionality of ChatCompletion because its inputs and outputs are historical dialogues and model responses, respectively. Thus, the ChatCompletion API can directly call the inference function to complete model inference.

Assistant is more complex than ChatCompletion, requiring the Server to store the related state of the Assistant and call the inference function appropriately. The Server maintains a set of Assistant logic in the database, storing the Assistants, Threads, and Messages created by applications. In memory, the Server maintains a `ThreadContext` for each Thread, gathering information related to each Thread's Assistant, etc. When a user sends a new Message, the Server calls the get_local_messages function of ThreadContext to obtain messages and then calls the inference function to get the inference results.

```python
class MyThreadContext(ThreadContext):
    def get_local_messages(self):
      ...
```

Since different model inference frameworks have different historical dialogue input formats, `ThreadContext` and `BackendInterface` need to be used in pairs. Besides its own ktransformers, the Server also supports transformers. For integrating other model inference frameworks, refer to the implementations of `TransformersInterface` and `TransformersThreadContext` in [transformers.py](https://github.com/kvcache-ai/ktransformers-dev/blob/main/ktransformers/server/backend/interfaces/transformers.py). 

================================================
FILE: doc/en/api/server/tabby.md
================================================
# How to Use Tabby and ktransformers Locally with 236B Large Models for Code Completion?

[Tabby](https://tabby.tabbyml.com/docs/welcome/) is an open-source code assistant that allows users to manually configure the backend framework and model, and use it across multiple IDEs/editors, such as VSCode and IntelliJ. Since Tabby can interface with Ollama on the framework side, and the ktransformers server provides a consistent API with Ollama, we can connect Tabby to the ktransformers server. This setup allows us to experience fast, heterogeneous inference in code completion scenarios.

1. Start ktransformers.
```bash
./ktransformers --port 9112
```
2. Install Tabby: Follow the official tutorial to install Tabby on a Linux server or Windows PC with an NVIDIA GPU [here](https://tabby.tabbyml.com/docs/quick-start/installation/linux/).
3. Configure Tabby: Create `~/.tabby/config.toml` and add the following configuration.
```toml
[model.completion.http]
kind = "ollama/completion"
api_endpoint = "http://127.0.0.1:9112/"
model_name = "DeepSeek-Coder-V2-Instruct"
prompt_template = "<｜fim▁begin｜>{prefix}<｜fim▁hole｜>{suffix}<｜fim▁end｜>" # Prompt Template
```

In this configuration, `kind` specifies that ktransformers uses the standard Ollama API to serve Tabby; `api_endpoint` matches the interface bound when launching ktransformers; `model_name` is set to the model used by ktransformers, here `DeepSeek-Coder-V2-Instruct` is the backend inference model; `prompt_template` is the model's prompt template, which requires a corresponding template for different models to use the Fill In the Middle feature properly.
Here we demonstrate the relevant configuration for Tabby using the Ollama API to provide the Completion feature. For configuration information about other functions available in Tabby, refer to [here](https://tabby.tabbyml.com/docs/administration/model/).


4. Start the Tabby service: `./tabby serve`.
<img src="run-tabby.png" alt="image-20240709112329577" style="zoom:50%;" />

   After launching, you should see access to the `/api/tags` interface in the ktransformers command line (in version v0.13.0 of Tabby, this changes to access to the `/api/show/` interface).
<img src="visit-api-tags.png" alt="image-20240709111648215" style="zoom:67%;" />

6. Register a Tabby account, obtain a Token: After starting the Tabby service, open the corresponding link in a browser (as shown above at 0.0.0.0:8080), and follow the [tutorial](https://tabby.tabbyml.com/docs/quick-start/register-account/) to create a user and get a Token.

7. Start VSCode, install the Tabby extension plugin, and use the Token obtained in the previous step to connect to the Tabby Server, following [here](https://tabby.tabbyml.com/docs/extensions/installation/vscode/).

8. Open any code file and experience the fast heterogeneous inference of ktransformers.

================================================
FILE: doc/en/api/server/website.md
================================================
# Start with website

This document provides the necessary steps to set up and run the web service for this project.

## 1. Starting the Web Service

### 1.1. Compiling the Web Code

Before you can compile the web code, make sure you have installed [Node.js](https://nodejs.org) version 18.3 or higher

Note: The version of Node.js in the Ubuntu or Debian GNU/Linux software repository is too low, causing compilation errors. Users can also install Node.js through the Nodesource repository, provided they uninstall the outdated version first.

```bash

  # sudo apt-get remove nodejs npm -y && sudo apt-get autoremove -y
  sudo apt-get update -y && sudo apt-get install -y apt-transport-https ca-certificates curl gnupg
  curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /usr/share/keyrings/nodesource.gpg
  sudo chmod 644 /usr/share/keyrings/nodesource.gpg
  echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/nodesource.gpg] https://deb.nodesource.com/node_23.x nodistro main" | sudo tee /etc/apt/sources.list.d/nodesource.list
  sudo apt-get update -y
  sudo apt-get install nodejs -y

```

Once npm is installed, navigate to the `ktransformers/website` directory:

```bash
cd ktransformers/website
```

Next, install the Vue CLI with the following command:

```bash
npm install @vue/cli
```

Now you can build the project:

```bash
npm run build
```
Finally you can build ktransformers with website:
```
cd ../../
pip install .
```


================================================
FILE: doc/en/balance-serve.md
================================================
# Balance Serve backend (multi-concurrency) for ktransformers

## KTransformers v0.2.4 Release Notes

We are excited to announce the official release of the long-awaited **KTransformers v0.2.4**!
In this version, we’ve added highly desired **multi-concurrency** support to the community through a major refactor of the whole architecture, updating more than 10,000 lines of code.
By drawing inspiration from the excellent architecture of sglang, we have implemented high-performance asynchronous concurrent scheduling in C++, including features like continuous batching, chunked prefill, and more. Thanks to GPU sharing in concurrent scenarios, overall throughput is also improved to a certain extent. The following is a demonstration:

https://github.com/user-attachments/assets/faa3bda2-928b-45a7-b44f-21e12ec84b8a

</p>

### 🚀 Key Updates

1. Multi-Concurrency Support
   - Added capability to handle multiple concurrent inference requests. Supports receiving and executing multiple tasks simultaneously.
   - We implemented [custom_flashinfer](https://github.com/kvcache-ai/custom_flashinfer/tree/fix-precision-mla-merge-main) based on the high-performance and highly flexible operator library [flashinfer](https://github.com/flashinfer-ai/flashinfer/), and achieved a variable batch size CUDA Graph, which further enhances flexibility while reducing memory and padding overhead.
   - In our benchmarks, overall throughput improved by approximately 130% under 4-way concurrency.
   - With support from Intel, we tested KTransformers v0.2.4 on the latest Xeon6 + MRDIMM-8800 platform. By increasing concurrency, the total output throughput increased from 17 tokens/s to 40 tokens/s. We observed that the bottleneck has now shifted to the GPU. Using a higher-end GPU than the 4090D could further improve performance.
2. Engine Architecture Optimization
   ![image](https://github.com/user-attachments/assets/f5f001fa-dca7-4377-a01a-32192902aa47)
   Inspired by the scheduling framework of sglang, we refactored KTransformers with a clearer three-layer architecture through an update of 11,000 lines of code, now supporting full multi-concurrency:
   - Server：Handles user requests and serves the OpenAI-compatible API.
   - Inference Engine：Executes model inference and supports chunked prefill.
   - Scheduler：Manages task scheduling and requests orchestration. Supports continuous batching by organizing queued requests into batches in a FCFS manner and sending them to the inference engine.
3. Project Structure Reorganization
   All C/C++ code is now centralized under the /csrc directory.
4. Parameter Adjustments
   Removed some legacy and deprecated launch parameters for a cleaner configuration experience.
   We plan to provide a complete parameter list and detailed documentation in future releases to facilitate flexible configuration and debugging.

### 📚 Upgrade Notes

- Due to parameter changes, users who have installed previous versions are advised to delete the ~/.ktransformers directory and reinitialize.
- To enable multi-concurrency, please refer to the latest documentation for configuration examples.

### What's Changed

Implemented **custom_flashinfer** @Atream @ovowei @qiyuxinlin
Implemented **balance_serve** engine based on **FlashInfer** @qiyuxinlin @ovowei
Implemented a **continuous batching** scheduler in C++ @ErvinXie
release: bump version v0.2.4 by @Atream @Azure-Tang @ErvinXie  @qiyuxinlin @ovowei @KMSorSMS @SkqLiao

## Download the Docker image for testing v0.2.4
Visit the [link](https://hub.docker.com/r/approachingai/ktransformers/tags) to pull the image, using `v0.2.4-AVX512` as an example.

```bash
docker pull approachingai/ktransformers:v0.2.4-AVX512
docker run -it --gpus all --privileged --shm-size 64g --name ktrans --network=host -v /mnt:/mnt approachingai/ktransformers:v0.2.4-AVX512 /bin/bash
# Open a new terminal
docker exec -it ktrans bash
```

## Installation Guide

⚠️ Please note that installing this project will replace flashinfer in your environment. It is strongly recommended to create a new conda environment!!!

⚠️ Please note that installing this project will replace flashinfer in your environment. It is strongly recommended to create a new conda environment!!!

⚠️ Please note that installing this project will replace flashinfer in your environment. It is strongly recommended to create a new conda environment!!!

### 2. Set Up Conda Environment

We recommend using Miniconda3/Anaconda3 for environment management:

```bash
# Download Miniconda
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh

# Create environment
conda create --name ktransformers python=3.11
conda activate ktransformers

# Install required libraries
conda install -c conda-forge libstdcxx-ng

# Verify GLIBCXX version (should include 3.4.32)
strings ~/anaconda3/envs/ktransformers/lib/libstdc++.so.6 | grep GLIBCXX
```

> **Note:** Adjust the Anaconda path if your installation directory differs from `~/anaconda3`

### 2. Install dependencies

```bash
sudo apt install libtbb-dev libssl-dev libcurl4-openssl-dev libaio1 libaio-dev libfmt-dev libgflags-dev zlib1g-dev patchelf
pip3 install packaging ninja cpufeature numpy openai
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

```

### 3. Build ktransformers

```bash
# Clone repository
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule update --init --recursive


# Install single NUMA dependencies
USE_BALANCE_SERVE=1  bash ./install.sh
# For those who have two cpu and 1T RAM（Dual NUMA）:
USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh
```

## Running DeepSeek-R1-Q4KM Models

### 1. Run for 24GB VRAM GPUs

Use our optimized configuration for constrained VRAM:

```bash
python ktransformers/server/main.py \
  --port 10002 \
  --model_path <path_to_safetensor_config> \
  --gguf_path <path_to_gguf_files> \
  --optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-serve.yaml \
  --max_new_tokens 1024 \
  --cache_lens 32768 \
  --chunk_size 256 \
  --max_batch_size 4 \
  --backend_type balance_serve \
  --force_think # useful for R1
```

It features the following arguments:

- `--max_new_tokens`: Maximum number of tokens generated per request.
- `--cache_lens`: Total length of kvcache allocated by the scheduler. All requests share a kvcache space.
- `--max_batch_size`: Maximum number of requests (prefill + decode) processed in a single run by the engine. (Supported only by `balance_serve`)
- `--chunk_size`: Maximum number of tokens processed in a single run by the engine.
  corresponding to 32768 tokens, and the space occupied will be released after the requests are completed.
- `--backend_type`: `balance_serve` is a multi-concurrency backend engine introduced in version v0.2.4. The original single-concurrency engine is `ktransformers`.
- `--model_path`: Path to safetensor config path (only config required, not model safetensors).  
  Please note that, since `ver 0.2.4`, the last segment of `${model_path}` directory name **MUST** be a local directory that contains the model's configuration files. Hugging Face links (e.g., deepseek-ai/DeepSeek-R1) are not supported at the moment.
- `--force_think`: Force responding the reasoning tag of `DeepSeek R1`.

The relationship between `max_batch_size`, `cache_lens`, and `max_new_tokens` should satisfy:
`cache_lens > max_batch_size * max_new_tokens`, otherwise the concurrency will decrease.

### 2. access server

```
curl -X POST http://localhost:10002/v1/chat/completions \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "hello"}
    ],
    "model": "DeepSeek-R1",
    "temperature": 0.3,
    "top_p": 1.0,
    "stream": true
  }'
```


================================================
FILE: doc/en/benchmark.md
================================================
## Benchmark

To conduct a quick and convenient check, we have employed a simple Python script available [here](https://github.com/kvcache-ai/ktransformers/tree/main/ktransformers/tests) to assess the precision of our **[ktransformers](https://github.com/kvcache-ai/ktransformers)** project. For this evaluation, we utilized the same dataset, which was shuffled in a consistent manner and limited to the first 1,000 data points, to test our implementation across a variety of CPU kernels, MLA kernels, and quantization formats.

We selected the DeepSeek-V3 model in its bf16, int8, and q4km versions for this test. The MMLU dataset, which can be found [here](https://huggingface.co/datasets/cais/mmlu), was used (we selected all datasets and shuffled them with a fixed random seed).

**!!! However, we skipped the few-shot part and only chose the first 1,000 data points for a quick check.** Please note that this approach may result in results that are not consistent with the technical report of DeepSeek-V3. And the test of R1 and further more tests are on going.

To verify our results, we chose [cloud service platform](https://cloud.siliconflow.cn/models) as baseline. All tests were conducted using the same script and datasets, allowing us to make a preliminary assessment of our project's precision.

We set the argument `temperature=0.6`, and to simplify the test process, we skipped the few-shot part and used the following prompt: `There is a single choice question. Answer the question by replying A, B, C, D. No other answers are accepted. Just the letter. \nQuestion: {question}\nA. {option_a}\nB. {option_b}\nC. {option_c}\nD. {option_d}\nAnswer: '`. For more details, please refer to the [script](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/tests/mmlu_test.py).

Given that we have only tested 1,000 cases, which provides only a preliminary judgment, some fluctuations in the results are reasonable. We selected all datasets and shuffled them with a fixed random seed to ensure consistency.

## Some Details

- The bf16 model of DeepSeek-V3 is available [here](https://huggingface.co/opensourcerelease/DeepSeek-V3-bf16/tree/main) (you may convert it to gguf by llama.cpp). The q4km model can be found [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M).
    
- The optimization YAML file is located [here](https://github.com/kvcache-ai/ktransformers/tree/main/ktransformers/optimize/optimize_rules). For the GEMM Kernel, you can change `KLinearMarlin` to `KLinearTorch`.
    
- To switch the MLA Kernel from Triton to Torch, you can check and modify [this file](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/attention.py), specifically by using the `forward_windows` method.
    
- When attempting to conduct the bf16 test (both CPU Weight and GPU Weight), you may encounter issues stemming from older versions of g++ and as, particularly when using Ubuntu 20 or earlier versions. To facilitate a smoother experience and enable you to reproduce our results, we have provided a development container. This container offers a pre-configured environment tailored for this purpose. However, please note that the container does not have the ktrans package installed. Therefore, you may still need to manually install certain packages to ensure everything runs smoothly.
    
    - You may config the model mount dir in `devcontainer/devcontainer.json`, check the `"mouts":` config.


## The Result Table
Uses DeepSeek-V3 model (Some specific cases are R1)
|                          |                   |            |                   |         |            |                                                        |              |
| ------------------------ | ----------------- | ---------- | ----------------- | ------- | ---------- | ------------------------------------------------------ | ------------ |
| DataSet                  | CPU Weight Format | CPU Kernel | GPU Weight Format | GEMM Kernel   | MLA Kernel | [Siliconflow](https://cloud.siliconflow.cn/models)<br> | Ktrans Point |
| MMLU<br><br>(shuffle 1k) |               |    |               |    |       |                                                    |          |
|          1                | bf16              | cpuinfer   | bf16              | torch   | torch      | 81.6                                                   | 81.9         |
|           2               | q8_0              | cpuinfer   | bf16              | torch   | torch      | 81.6                                                   | 83.1         |
|             3             | q4km              | cpuinfer   | bf16              | torch   | triton     | 81.6                                                   | 81.4         |
|              4            | q4km              | cpuinfer   | q4km->marlin 8    | marlin  | triton     | 81.6                                                   | 81.1         |
|               5           | q4km              | cpuinfer   | q4km->marlin 4    | marlin  | triton     | 81.6                                                   | 81           |
|                6          | q4km              | cpuinfer   | fp8               | fp8gemm  | triton     | 81.6                                                   | 81.5         |
|                7 (DeepSeek-R1)          |  iq1             | cpuinfer   |     fp8           |  fp8gemm | triton     | 78.6                                                   | 83.6         |
| MMLU-pro<br>(shuffle 1k)                 |               |    |                |  |      |                                                    |          |
| 1                 | q4km              | cpuinfer   | fp8               | fp8gemm | triton     | 57.7                                                   | 57.6         |
|  2             | q4km              | cpuinfer   | q4km->marlin 4    | marlin  | triton     | 57.7                                                   | 57.5         |
|  3 (DeepSeek-R1)             | iq1              | cpuinfer   | fp8    | fp8gem  | triton     | 71.9                                                   | tbd         |
| HumanEval                | tbd               | tbd        | tbd               | tbd     | tbd        | tbd                                                    | tbd          |
| GSM8K                    | tbd               | tbd        | tbd               | tbd     | tbd        | tbd                                                    | tbd          |

**The details for each case are listed below**:

By default, The MLA kernel uses triton in linux and torch in windows. But we need to test torch in linux, so we manually modify the [file](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/attention.py#L592). Just get rid of all the if branch and force it to use `self.forward_windows`

- MMLU test
  1. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml) change all the `KLinearMarlin` to `KLinearTorch` (just find all the usage in this file). The source weight comes from [there](https://huggingface.co/opensourcerelease/DeepSeek-V3-bf16) (you need to use llama.cpp to convert it to gguf)
  2. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). You need to modify the code to separately load cpu's expert weight. We leave this as comment in these places: [1](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L122), [2](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L136), [3](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L137) (note in 3, change the path to your local weight file path). The weight file for q8_0 is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q8_0)
  3. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). You need to modify the code to separately load cpu's expert weight. We leave this as comment in these places: [1](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L122), [2](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L136), [3](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L137) (note in 3, change the path to your local weight file path). The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
  4. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). You don't need to change the source code as they both use q4km. But note the yaml file [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml#L29) and [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml#L18), below these lines you need to add `num_bits: 8` (in other words: add this kwargs to all that use `KLinearMarlin`). The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
  5. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). No need to change yaml, just use the default. The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
  6. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case.
  7. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case.
- MMLU-pro test
  1. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case. 
  2. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). No need to change yaml, just use the default. The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
  3. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case.

================================================
FILE: doc/en/deepseek-v2-injection.md
================================================
# Tutorial: Heterogeneous and Local MoE Inference

DeepSeek-(Code)-V2 is a series of strong mixture-of-experts (MoE) models, featuring a total of 236 billion parameters, with 21 billion parameters activated per token. This model has demonstrated remarkable reasoning capabilities across various benchmarks, positioning it as one of the SOTA open models and nearly comparable in performance to GPT-4. DeepSeek-R1 uses a similar architecture to DeepSeek-V2, but with a bigger number of parameters.

<p align="center">
  <picture>
    <img alt="DeepSeek-Coder-V2 Score" src="../assets/BigCodeBench.png" width=80%>
  </picture>
</p>

Moreover, unlike previous models that employed traditional attention mechanisms like Grouped-Query Attention (GQA), DeepSeek-V2 incorporates a novel Multi-head Latent Attention (MLA). This innovation significantly reduces the size of the KV cache required during inference, enhancing efficiency.


However, despite its efficiency, the practicality of running such a large model on personal computing setups seems impractical. Official documentation for DeepSeek-V2 indicates that eight 80GB GPUs are necessary for standard inference operations, and even the scaled-down Q4_k_m version requires at least two 80GB GPUs. These requirements are beyond the reach of most individual researchers and small teams.


Nonetheless, by employing several cutting-edge optimization techniques, we have successfully operated this colossal model on a desktop computer with only 21GB of VRAM and 136GB of DRAM. In this document, we outline the specific optimizations utilized and provide a detailed tutorial on how to implement these strategies using KTransformers.

## Applied Optimizations

### Optimized MLA Operator

The following figure provides a brief overview of DeepSeek-V2 architecture. At the heart of its attention layer, DeepSeek-V2 introduces a novel MLA operator that represents the heads of key-value pairs using a common, joint compressed representation, which holds significant potential for efficiency improvements. However, the official open-source implementation of the MLA operator explicitly decompresses this compressed representation and caches the decompressed key-value pairs. This process not only enlarges the KV cache size but also diminishes inference performance.

<p align="center">
  <picture>
    <img alt="DeepSeek on KTransformers" src="../assets/DeepSeek-on-KTransformers.png" width=80%>
  </picture>
</p>

To truly capitalize on the benefits of MLA, we have implemented an optimized version for inference. According to its original paper, we absorb the decompression matrices directly into the q_proj and out_proj weights. Consequently, the compressed representation does not need to be decompressed to compute the attention. This adjustment significantly reduces the KV cache size and increases the arithmetic intensity of this operator, which greatly optimizes the utilization of GPU computational power.

### Advanced Quantization Kernels

The original DeepSeek-V2 model stores its parameters in BF16 format, consuming approximately 470GB of raw storage. This exceeds the RAM capacity available on mainstream desktop computers. To address this, we leverage the well-established GGUF community's quantized weights to simplify the process for users.
However, quantized data types are not typically supported by highly-optimized BLAS packages. As a result, the original HuggingFace Transformers' Torch implementation must dequantize these tensors to supported data types before processing, which introduces unnecessary computational overhead and increases memory traffic. To overcome this, we have incorporated advanced kernels that operate directly on quantized data types, thereby optimizing inference performance.


In the current version of KTransformers, we utilize Marlin for GPU kernels and llamafile for CPU kernels. These kerenls are specially designed to benefit from modern GPU architecture and modern CPU instruction extensions such as AVX512-BF16 (AMD Zen4 or newer) and AVX-VNNI (Intel Alder Lake or newer), that are tailored for quantized data types and machine learning workloads. We also use expert parallelism and other optimization for MOE inferencem on CPU based on llamafile, and call them as CPUInfer.  As demonstrated in Figure 2(cite from Marlin), Marlin can achieve near ideal 3.87x speedup compare to corresponding Torch counterparts. As demonstrated in the following figure, our micro benchmarks show that inference using CPUInfer performs several times faster than Torch in low bits representation. Note that in practical inference such as using transformers, the Torch baseline use BF16 or FP16 as linear weights, and will occupy more memory resources, or it will be more slower due to dequantization when using quanted weights.

<p align="center">
  <picture>
    <img alt="CPUInfer Performance" src="../assets/cpuinfer.png" width=80%>
  </picture>
</p>
<p align="center">
  <picture>
    <img alt="marlin performance" src="https://github.com/IST-DASLab/marlin/blob/master/assets/sustained.png?raw=true" width=80%>
  </picture>
</p>

### Arithmetic Intensity Guided Offloading

Storing all 236 billion parameters of a model in GPU VRAM is clearly impractical for local users. Therefore, we strategically store only the most computationally intensive parameters on the GPU. For instance, after our optimizations, the MLA operator, which contains 128 heads with a shared compressed key-value representation, shows an arithmetic intensity of 512. This makes it the most intensive operator, particularly during smaller inference batch sizes. Hence, it is allocated to the GPU to leverage the power of tensor cores.


On the other hand, as shown in Figure 1, each transformer block in DeepSeek-V2 includes 160 mixture-of-experts (MoE) experts, comprising 96% of the total parameters. However, the MoE router activates only 6 out of these 160 experts for each token, which means that only 3.75% of the MoE parameters are utilized during the decoding phase. With a batch size of one, the arithmetic intensity of the MoE operation is roughly 0.075. This operation, primarily involving a batched General Matrix-Vector Multiplication (GEMV), can thus be efficiently handled by the CPU.


Following this principle of arranging all operators by their arithmetic intensity and placing the most intensive ones in the GPU as much as possible, we prioritize positioning the MoE parameters and word embeddings computations on the CPU side to utilize its larger memory capacity. Meanwhile, the remaining parameters, including shared experts, projections in the attention module, and MLA, are stored in the GPU VRAM. As these parameters are accessed by every token, their placement on the GPU maximizes the benefits of high memory bandwidth. This configuration leads to approximately 20.7 GB of VRAM usage and 136GB DRAM memory requests if the Q4_K_M version is used, which is feasible even on a local desktop. Additionally, the placement can be adjusted according to the actual configuration, adhering to the same principle.


Moreover, as an extensible framework, KTransformers is set to support more advanced operators in future releases, continually enhancing its capability to handle diverse workloads efficiently.

## YAML Template

To implement the above optimizations in KTransformers, users need to write a YAML file containing the optimized rules. 
KTransformers will iterate through all sub-modules of the model, match rules specified in the YAML rule file, and replace them with advanced modules as specified.

<p align="center">
  <picture>
    <img alt="Inject-Struction" src="../assets/InjectStruction.png" width=80%>
  </picture>
</p>

Specifically, the following rules are used:

- Replace the Attention module with our [optimized MLA Operator](#mla).
- Replace routed experts with [CPUInfer kernels](#experts) that use Llamafile.
- Replace all Linear modules not belonging to attention with [Marlin](#linear) kernels.


<h3 id="mla">MLA</h3>

For attention module injection, we only need to match the module name used in Transformers using a regular expression and replace it with our pre-implemented module. 
The YAML rule is listed below.

```yaml
- match:
    name: "^model\\.layers\\..*\\.self_attn$" # regular expression
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
```

As we can see, each rule in the YAML file has two parts: `match` and `replace`. 
The match part specifies which module should be replaced, and the replace part specifies the module to be injected into the model along with the initialization keywords. 

<h3 id="experts">Routed Experts </h3>

For routed experts, the module we inject is a wrapper of CPUInfer, KTransformersExperts. There are several implementations within a wrapper, and we need to specify keywords to tell the wrapper which implementation we want to use and how we intend to use it.

In KTransformers, some models exhibit different behaviors during prefilling and generation for better performance. KTransformersExperts is one of them. All these special modules have a `device` keyword describing which device the module should be initialized on. Other keywords specify the behaviors during prefilling and generation and may be differ when using different injection modules. Here, we specify which implementation on which device we want to use during prefilling and generation, and which device the output should be on.
Note that we only use these parameters when layer-wise prefilling is enabled; otherwise, prefilling is conducted with the same configuration as generation.

In the original implementation of Transformers, MoE is implemented using `nn.ModuleList`. We don't want KTransformers to iterate through all the sub-modules in the list, so we set `recursive: False` in this rule to prevent recursive injection into submodules of the current module. Here is the YAML rule:

```yaml
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    device: "cpu"   # device to load this module on initialization
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
```

If we inject the expert list as a custom module, we can't use the interface in `nn.ModuleList` as default. We need to change the forward function in the FFN module. The simplest way is implementing a new module using custom forward function and inject it. We have implemented the new module, and the injection can be done by simply adding an injection rule. We can use the `class` instead of `name` to match a module that will be replaced. Here is the YAML rule:

```yaml
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # MLP module with custom forward function
```

<h3 id="linear">Other Linear Modules</h3>

For the remained linear modules, we want to use our quantization kernels. However, we don't want to inject linear in the MLA operator because we currently don't know the effect of using quantization in MLA. 
So, we can change our regular expression and add a class check in the match part of the rule. Only modules matching both name and class simultaneously will be injected. 
We also need to transfer some keywords similar to the injection of experts. Here is the YAML rule:

```yaml
- match:
    name: "^model\\.layers\\.(?!.*self_attn).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
```

<h3 id="Pre-compute Buffers">Pre-compute Buffers </h3>

The original model is initialized on the meta device. The rotary embedding module pre-computes some buffers when initializing, which has no effect and doesn't compute anything when using the meta device. Therefore, we need to compute the buffers when loading the model. For convenience, we inject the rotary embedding module with our custom module, which performs pre-computations when loading. Here is the YAML rule:

```yaml
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
```

## Wrap Your Custom Module

We have implemented some modules, but you may need to inject your custom module using KTransformers. 
The only thing you need to do is wrap your custom module and write YAML files. We provide a base operator specifying interfaces an injection module should have. You only need to inherit from that module and change the `__init__`, `forward`, or `load` function as needed.

- The `__init__` function of the base operator maintains the necessary information for injection and execution of the KTransformers framework. To override this function, subclass modules need to call the base operator's `__init__` function in their own initializer.
- The `forward` function is a function in torch that will be called during inference, where the module author has the freedom to achieve higher performance.
- The `load` function is used to load all parameters of this module. The default implementation is to call the `load` function of all submodules. You can modify this function to customize its loading method and explicitly control the loading of its submodules.


================================================
FILE: doc/en/fp8_kernel.md
================================================
# FP8 Linear Kernel for DeepSeek-V3/R1

## Overview
The DeepSeek-AI team provides FP8 safetensors for DeepSeek-R1/V3 models. We achieve performance optimization through the following works:
- **FP8 GPU Kernel Integration**: FP8 linear layer acceleration kernels integrated in KTransformers
- **Hybrid Quantization Architecture**:
  - Attention and Shared-Expert modules use FP8 precision (enhances computational accuracy)
  - Experts modules retain GGML quantization (GGUF format, reside in CPU to save GPU memory)

So those who are persuing the best performance can use the FP8 linear kernel for DeepSeek-V3/R1.

## Key Features

✅ Hybrid Precision Architecture (FP8 + GGML)<br>
✅ Memory Optimization (~19GB VRAM usage)

## Quick Start
### Using Pre-Merged Weights

Pre-merged weights are available on Hugging Face:<br>
[KVCache-ai/DeepSeek-V3-GGML-FP8-Hybrid](https://huggingface.co/KVCache-ai/DeepSeek-V3)<br>
[KVCache-ai/DeepSeek-R1-GGML-FP8-Hybrid](https://huggingface.co/KVCache-ai/DeepSeek-R1)

> Please confirm the weights are fully uploaded before downloading. The large file size may extend Hugging Face upload time.


Download Pre-Merged Weights
```shell
pip install -U huggingface_hub

# Optional: Use HF Mirror for faster downloads in special area.
# export HF_ENDPOINT=https://hf-mirror.com 

huggingface-cli download --resume-download KVCache-ai/DeepSeek-V3-GGML-FP8-Hybrid --local-dir <local_dir>
```
### Using merge scripts
If you got local DeepSeek-R1/V3 fp8 safetensors and gguf weights(eg.q4km), you can merge them using the following scripts.

```shell
python merge_tensors/merge_safetensor_gguf.py \
  --safetensor_path <fp8_safetensor_path> \
  --gguf_path <gguf_folder_path> \
  --output_path <merged_output_path>
```

* `--safetensor_path`:	input path of safetensor file([Download](https://huggingface.co/deepseek-ai/DeepSeek-V3/tree/main)).
* `--gguf_path`: input path of gguf folder ([Download](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)).
* `--output_path`: output path of merged file.


### Execution Notes

Launch local_chat.py with custom quantized experts
```shell
python ktransformers/local_chat.py \
  --model_path deepseek-ai/DeepSeek-V3 \
  --gguf_path <merged_weights_folder> \
  --optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml \
  --cpu_infer <cpu_cores + 1>
```


## Notes

⚠️ Hardware Requirements<br>
* Recommended minimum 19GB available VRAM for FP8 kernel.
* Requires GPU with FP8 support (e.g., 4090)

⏳ First-Run Optimization
JIT compilation causes longer initial execution (subsequent runs retain optimized speed).

🔄 Temporary Interface<br>
Current weight loading implementation is provisional - will be refined in future versions

📁 Path Specification<br>
Despite hybrid quantization, merged weights are stored as .safetensors - pass the containing folder path to `--gguf_path`

================================================
FILE: doc/en/install.md
================================================
<!-- omit in toc -->

# How to Run DeepSeek-R1

- [How to Run DeepSeek-R1](#how-to-run-deepseek-r1)
  - [Preparation](#preparation)
  - [Installation](#installation)
    - [Attention](#attention)
    - [Supported models include](#supported-models-include)
    - [Support quantize format](#support-quantize-format)

In this document, we will show you how to install and run KTransformers on your local machine. There are two versions:

* V0.2 is the current main branch.
* V0.3 is a preview version only provides binary distribution for now.
* To reproduce our DeepSeek-R1/V3 results, please refer to [Deepseek-R1/V3 Tutorial](./DeepseekR1_V3_tutorial.md) for more detail settings after installation.

## Preparation

Some preparation:

- CUDA 12.1 and above, if you didn't have it yet, you may install from [here](https://developer.nvidia.com/cuda-downloads).

  ```sh
  # Adding CUDA to PATH
  if [ -d "/usr/local/cuda/bin" ]; then
      export PATH=$PATH:/usr/local/cuda/bin
  fi

  if [ -d "/usr/local/cuda/lib64" ]; then
      export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
      # Or you can add it to /etc/ld.so.conf and run ldconfig as root:
      # echo "/usr/local/cuda-12.x/lib64" | sudo tee -a /etc/ld.so.conf
      # sudo ldconfig
  fi

  if [ -d "/usr/local/cuda" ]; then
      export CUDA_PATH=$CUDA_PATH:/usr/local/cuda
  fi
  ```
- Linux-x86_64 with gcc, g++>=11 and cmake>=3.25 (using Ubuntu as an example)
- **Note**: The default CMake version in Ubuntu 22.04 LTS or higher may not support newer CUDA language dialects (e.g., CUDA 20). This can cause errors such as Target "cmTC_xxxxxx" requires the language dialect "CUDA20", but CMake does not know the compile flags to use to enable it. To resolve this, install a newer CMake version, for instance, by adding the Kitware APT repository.

  ```sh
  sudo apt-get update 
  sudo apt-get install build-essential cmake ninja-build patchelf
  ```
- We recommend using [Miniconda3](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) or [Anaconda3](https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh) to create a virtual environment with Python=3.11 to run our program. Assuming your Anaconda installation directory is `~/anaconda3`, you should ensure that the version identifier of the GNU C++standard library used by Anaconda includes `GLIBCXX_3.4.32`

  ```sh
  conda create --name ktransformers python=3.11
  conda activate ktransformers # you may need to run ‘conda init’ and reopen shell first

  conda install -c conda-forge libstdcxx-ng # Anaconda provides a package called `libstdcxx-ng` that includes a newer version of `libstdc++`, which can be installed via `conda-forge`.

  strings ~/anaconda3/envs/ktransformers/lib/libstdc++.so.6 | grep GLIBCXX
  ```
- Make sure that PyTorch, packaging, ninja is installed You can also [install previous versions of PyTorch](https://pytorch.org/get-started/previous-versions/)

  ```
  pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
  pip3 install packaging ninja cpufeature numpy
  ```
- At the same time, you should download and install the corresponding version of flash-attention from https://github.com/Dao-AILab/flash-attention/releases.

## Installation

### Attention

If you want to use numa support, not only do you need to set USE_NUMA=1, but you also need to make sure you have installed the libnuma-dev (`sudo apt-get install libnuma-dev` may help you).

[Optional] If you want to use the multi-concurrent version, please install the following dependencies.

```
sudo apt install libtbb-dev libssl-dev libcurl4-openssl-dev libaio1 libaio-dev libgflags-dev zlib1g-dev libfmt-dev
```

<!-- 1. ~~Use a Docker image, see [documentation for Docker](./doc/en/Docker.md)~~
   
   >We are working on the latest docker image, please wait for a while.

2. ~~You can install using Pypi (for linux):~~
    > We are working on the latest pypi package, please wait for a while.
   
   ```
   pip install ktransformers --no-build-isolation
   ```
   
   for windows we prepare a pre compiled whl package on [ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl](https://github.com/kvcache-ai/ktransformers/releases/download/v0.2.0/ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl), which require cuda-12.5, torch-2.4, python-3.11, more pre compiled package are being produced.  -->

Download source code and compile:

- init source code

  ```sh
  git clone https://github.com/kvcache-ai/ktransformers.git
  cd ktransformers
  git submodule update --init --recursive
  ```
- [Optional] If you want to run with website, please [compile the website](./api/server/website.md) before execute ``bash install.sh``
- For Linux

  - For simple install:

    ```shell
    bash install.sh
    ```
  - For those who have two cpu and 1T RAM:

    ```shell
    # Make sure your system has dual sockets and double size RAM than the model's size (e.g. 1T RAM for 512G model)
     apt install libnuma-dev
     export USE_NUMA=1
     bash install.sh # or #make dev_install
    ```
  - For Multi-concurrency with 500G RAM:

    ```shell
    USE_BALANCE_SERVE=1 bash ./install.sh
    ```
  - For Multi-concurrency with two cpu and 1T RAM:

    ```shell
    USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh
    ```
- For Windows (Windows native temporarily deprecated, please try WSL)

  ```shell
  install.bat
  ```

* If you are developer, you can make use of the makefile to compile and format the code. <br> the detailed usage of makefile is [here](./makefile_usage.md)

<h3>Local Chat</h3>
We provide a simple command-line local chat Python script that you can run for testing.

> Note: this is a very simple test tool only support one round chat without any memory about last input, if you want to try full ability of the model, you may go to [RESTful API and Web UI](#id_666).

<h4>Run Example</h4>

```shell
# Begin from root of your cloned repo!
# Begin from root of your cloned repo!!
# Begin from root of your cloned repo!!! 

# Download mzwing/DeepSeek-V2-Lite-Chat-GGUF from huggingface
mkdir DeepSeek-V2-Lite-Chat-GGUF
cd DeepSeek-V2-Lite-Chat-GGUF

wget https://huggingface.co/mradermacher/DeepSeek-V2-Lite-GGUF/resolve/main/DeepSeek-V2-Lite.Q4_K_M.gguf -O DeepSeek-V2-Lite-Chat.Q4_K_M.gguf

cd .. # Move to repo's root dir

# Start local chat
python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF

# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try：
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
# python  ktransformers.local_chat --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
```

It features the following arguments:

- `--model_path` (required): Name of the model (such as "deepseek-ai/DeepSeek-V2-Lite-Chat" which will automatically download configs from [Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)). Or if you already got local files  you may directly use that path to initialize the model.

  > Note: <strong>.safetensors</strong> files are not required in the directory. We only need config files to build model and tokenizer.
  >
- `--gguf_path` (required): Path of a directory containing GGUF files which could that can be downloaded from [Hugging Face](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main). Note that the directory should only contains GGUF of current model, which means you need one separate directory for each model.
- `--optimize_config_path` (required except for Qwen2Moe and DeepSeek-V2): Path of YAML file containing optimize rules. There are two rule files pre-written in the [ktransformers/optimize/optimize_rules](https://github.com/kvcache-ai/ktransformers/tree/main/ktransformers/optimize/optimize_rules) directory for optimizing DeepSeek-V2 and Qwen2-57B-A14, two SOTA MoE models.
- `--max_new_tokens`: Int (default=1000). Maximum number of new tokens to generate.
- `--cpu_infer`: Int (default=10). The number of CPUs used for inference. Should ideally be set to the (total number of cores - 2).

<h3>Start Server</h3>
We provide a server script, which supports multi-concurrency functionality in version v0.2.4.

```
python ktransformers/server/main.py --model_path /mnt/data/models/DeepSeek-V3 --gguf_path /mnt/data/models/DeepSeek-V3-GGUF/DeepSeek-V3-Q4_K_M/ --cpu_infer 62 --optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-serve.yaml --port 10002 --chunk_size 256 --max_new_tokens 1024 --max_batch_size 4 --port 10002 --cache_lens 32768 --backend_type balance_serve
```

It features the following arguments:

- `--chunk_size`: Maximum number of tokens processed in a single run by the engine.
- `--cache_lens`: Total length of kvcache allocated by the scheduler. All requests share a kvcache space corresponding to 32768 tokens, and the space occupied will be released after the requests are completed.
- `--backend_type`: `balance_serve` is a multi-concurrency backend engine introduced in version v0.2.4. The original single-concurrency engine is `ktransformers`.
- `--max_batch_size`: Maximum number of requests (prefill + decode) processed in a single run by the engine. (Supported only by `balance_serve`)

<details>
<summary>Supported Models/quantization</summary>

### Supported models include


| ✅**Supported Models** | ❌**Deprecated Models**    |
| ---------------------- | -------------------------- |
| DeepSeek-R1            | ~~InternLM2.5-7B-Chat-1M~~ |
| DeepSeek-V3            |                            |
| DeepSeek-V2            |                            |
| DeepSeek-V2.5          |                            |
| Qwen2-57B              |                            |
| DeepSeek-V2-Lite       |                            |
| Mixtral-8x7B           |                            |
| Mixtral-8x22B          |                            |

### Support quantize format


| ✅**Supported Formats** | ❌**Deprecated Formats** |
| ----------------------- | ------------------------ |
| IQ1_S                   | ~~IQ2_XXS~~              |
| IQ2_XXS                 |                          |
| Q2_K_L                  |                          |
| Q2_K_XS                 |                          |
| Q3_K_M                  |                          |
| Q4_K_M                  |                          |
| Q5_K_M                  |                          |
| Q6_K                    |                          |
| Q8_0                    |                          |

</details>

<details>
<summary>Suggested Model</summary>


| Model Name                     | Model Size | VRAM  | Minimum DRAM    | Recommended DRAM  |
| ------------------------------ | ---------- | ----- | --------------- | ----------------- |
| DeepSeek-R1-q4_k_m             | 377G       | 14G   | 382G            | 512G              |
| DeepSeek-V3-q4_k_m             | 377G       | 14G   | 382G            | 512G              |
| DeepSeek-V2-q4_k_m             | 133G       | 11G   | 136G            | 192G              |
| DeepSeek-V2.5-q4_k_m           | 133G       | 11G   | 136G            | 192G              |
| DeepSeek-V2.5-IQ4_XS           | 117G       | 10G   | 107G            | 128G              |
| Qwen2-57B-A14B-Instruct-q4_k_m | 33G        | 8G    | 34G             | 64G               |
| DeepSeek-V2-Lite-q4_k_m        | 9.7G       | 3G    | 13G             | 16G               |
| Mixtral-8x7B-q4_k_m            | 25G        | 1.6G  | 51G             | 64G               |
| Mixtral-8x22B-q4_k_m           | 80G        | 4G    | 86.1G           | 96G               |
| InternLM2.5-7B-Chat-1M         | 15.5G      | 15.5G | 8G(32K context) | 150G (1M context) |

More will come soon. Please let us know which models you are most interested in.

Be aware that you need to be subject to their corresponding model licenses when using [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2/blob/main/LICENSE) and [QWen](https://huggingface.co/Qwen/Qwen2-72B-Instruct/blob/main/LICENSE).

</details>

<details>
  <summary>Click To Show how to run other examples</summary>

* Qwen2-57B

  ```sh
  pip install flash_attn # For Qwen2

  mkdir Qwen2-57B-GGUF && cd Qwen2-57B-GGUF

  wget https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/resolve/main/qwen2-57b-a14b-instruct-q4_k_m.gguf?download=true -O qwen2-57b-a14b-instruct-q4_k_m.gguf

  cd ..

  python -m ktransformers.local_chat --model_name Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF

  # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try：
  # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct
  # python  ktransformers/local_chat.py --model_path ./Qwen2-57B-A14B-Instruct --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
  ```
* Deepseek-V2

  ```sh
  mkdir DeepSeek-V2-Chat-0628-GGUF && cd DeepSeek-V2-Chat-0628-GGUF
  # Download weights
  wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf
  wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf
  wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf
  wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf

  cd ..

  python -m ktransformers.local_chat --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF

  # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try：

  # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628

  # python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
  ```


| model name       | weights download link                                                                                                 |
| ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| Qwen2-57B        | [Qwen2-57B-A14B-gguf-Q4K-M](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/tree/main)                       |
| DeepseekV2-coder | [DeepSeek-Coder-V2-Instruct-gguf-Q4K-M](https://huggingface.co/LoneStriker/DeepSeek-Coder-V2-Instruct-GGUF/tree/main) |
| DeepseekV2-chat  | [DeepSeek-V2-Chat-gguf-Q4K-M](https://huggingface.co/bullerwins/DeepSeek-V2-Chat-0628-GGUF/tree/main)                 |
| DeepseekV2-lite  | [DeepSeek-V2-Lite-Chat-GGUF-Q4K-M](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main)                |
| DeepSeek-R1      | [DeepSeek-R1-gguf-Q4K-M](https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-Q4_K_M)                |

</details>

<!-- pin block for jump -->

<span id='id_666'>

<h3>RESTful API and Web UI  </h3>

Start without website:

```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002
```

Start with website:

```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF  --port 10002 --web True
```

Or you want to start server with transformers, the model_path should include safetensors

```bash
ktransformers --type transformers --model_path /mnt/data/model/Qwen2-0.5B-Instruct --port 10002 --web True
```

Access website with url [http://localhost:10002/web/index.html#/chat](http://localhost:10002/web/index.html#/chat) :

<p align="center">
  <picture>
    <img alt="Web UI" src="https://github.com/user-attachments/assets/615dca9b-a08c-4183-bbd3-ad1362680faf" width=90%>
  </picture>
</p>

More information about the RESTful API server can be found [here](doc/en/api/server/server.md). You can also find an example of integrating with Tabby [here](doc/en/api/server/tabby.md).


================================================
FILE: doc/en/kt-kernel/GLM-5-Tutorial.md
================================================
# Running GLM-5 with SGLang and KT-Kernel

This tutorial demonstrates how to run GLM-5 model inference using SGLang integrated with KT-Kernel for CPU-GPU heterogeneous inference. This setup enables efficient deployment of large MoE models by offloading experts to CPU. KT-Kernel supports both BF16 and FP8 precision backends, allowing you to choose between maximum quality and reduced memory footprint.

## Table of Contents

- [Table of Contents](#table-of-contents)
- [Prerequisites](#prerequisites)
- [Step 1: Download Model Weights](#step-1-download-model-weights)
- [Step 2: Launch SGLang Server](#step-2-launch-sglang-server)
- [Step 3: Send Inference Requests](#step-3-send-inference-requests)
  - [Option A: Interactive Chat with KT CLI](#option-a-interactive-chat-with-kt-cli)
  - [Option B: OpenAI-Compatible API](#option-b-openai-compatible-api)
- [Additional Resources](#additional-resources)

## Prerequisites

Before starting, ensure you have:

1. **SGLang installed**

    Install the kvcache-ai fork of SGLang (one of):

    ```bash
    # Option A: One-click install (from ktransformers root)
    ./install.sh

    # Option B: pip install
    pip install sglang-kt
    ```

2. **KT-Kernel installed**

    ```bash
    git clone https://github.com/kvcache-ai/ktransformers.git
    git submodule update --init --recursive
    cd kt-kernel && ./install.sh
    ```

3. **transformers reinstalled**

    ```bash
    pip install git+https://github.com/huggingface/transformers.git
    ```

4. **CUDA toolkit** - CUDA 12.0+ recommended (12.8+ for best FP8 support)
5. **Hugging Face CLI** - For downloading models:
   ```bash
   pip install -U huggingface-hub
   ```

## Step 1: Download Model Weights

Download the GLM-5 weights from Hugging Face.

```bash
# FP8
hf download zai-org/GLM-5-FP8 \
  --local-dir /path/to/GLM-5-FP8

# BF16
hf download zai-org/GLM-5 \
  --local-dir /path/to/GLM-5
```

**Note:** Replace `/path/to/` with your actual storage path throughout this tutorial.

## Step 2: Launch SGLang Server

Start the SGLang server with KT-Kernel integration for CPU-GPU heterogeneous inference.

```bash
# FP8 Precision
export PYTORCH_ALLOC_CONF=expandable_segments:True
export SGLANG_ENABLE_JIT_DEEPGEMM=0

python -m sglang.launch_server \
  --host 0.0.0.0 \
  --port 30000 \
  --model /path/to/GLM-5-FP8 \
  --kt-weight-path /path/to/GLM-5-FP8 \
  --kt-cpuinfer 96 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 30 \
  --kt-method FP8 \
  --kt-gpu-prefill-token-threshold 1024 \
  --kt-enable-dynamic-expert-update \
  --kt-expert-placement-strategy uniform \
  --trust-remote-code \
  --mem-fraction-static 0.75 \
  --served-model-name GLM5 \
  --enable-mixed-chunk \
  --tensor-parallel-size 8 \
  --enable-p2p-check \
  --disable-shared-experts-fusion \
  --chunked-prefill-size 16384 \
  --max-running-requests 4 \
  --max-total-tokens 128000 \
  --attention-backend flashinfer \
  --fp8-gemm-backend cutlass \
  --kv-cache-dtype bf16 \
  --tool-call-parser glm47 \
  --reasoning-parser glm45 \
  --watchdog-timeout 3000

# BF16 Precision
export PYTORCH_ALLOC_CONF=expandable_segments:True
export SGLANG_ENABLE_JIT_DEEPGEMM=0

python -m sglang.launch_server \
  --host 0.0.0.0 \
  --port 30000 \
  --model /path/to/GLM-5 \
  --kt-weight-path /path/to/GLM-5 \
  --kt-cpuinfer 96 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 10 \
  --kt-method BF16 \
  --kt-gpu-prefill-token-threshold 1024 \
  --kt-enable-dynamic-expert-update \
  --kt-expert-placement-strategy uniform \
  --trust-remote-code \
  --mem-fraction-static 0.75 \
  --served-model-name GLM5 \
  --enable-mixed-chunk \
  --tensor-parallel-size 8 \
  --enable-p2p-check \
  --disable-shared-experts-fusion \
  --chunked-prefill-size 16384 \
  --max-running-requests 4 \
  --max-total-tokens 128000 \
  --attention-backend flashinfer \
  --tool-call-parser glm47 \
  --reasoning-parser glm45 \
  --watchdog-timeout 3000
```

Layerwise prefill requires one extra MoE layer's worth of VRAM.

If you encounter OOM, adjust `--kt-num-gpu-experts`, `--chunked-prefill-size`, `--mem-fraction-static` and `--max-total-tokens` when launching the server.

If you encounter other issues, try `kt doctor` to diagnose your setup.

See [KT-Kernel Parameters](https://github.com/kvcache-ai/ktransformers/tree/main/kt-kernel#kt-kernel-parameters) for detailed parameter tuning guidelines.

## Step 3: Send Inference Requests

Once the server is running (default: `http://localhost:30000`), you can interact with the model in several ways:

### Option A: Interactive Chat with KT CLI

The easiest way to chat with the model:

```bash
kt chat
```

This opens an interactive terminal chat session. Type your messages and press Enter to send. Use `Ctrl+C` to exit.

### Option B: OpenAI-Compatible API

The server exposes an OpenAI-compatible API at `http://localhost:30000/v1`.

**curl example (streaming):**

```bash
curl http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "GLM5",
    "messages": [{"role": "user", "content": "hi, who are you?"}],
    "stream": true
  }'
```

## Additional Resources

- [GLM-5 Model Card](https://huggingface.co/zai-org/GLM-5)
- [KT-Kernel Documentation](../../../kt-kernel/README.md)
- [SGLang GitHub](https://github.com/sgl-project/sglang)
- [KT-Kernel Parameters Reference](../../../kt-kernel/README.md#kt-kernel-parameters)


================================================
FILE: doc/en/kt-kernel/Kimi-K2-Thinking-Native.md
================================================
# Running Kimi-K2-Thinking with SGLang and KT-Kernel

This tutorial demonstrates how to run Kimi-K2 model inference using SGLang integrated with KT-Kernel for CPU-GPU heterogeneous inference. This setup enables efficient deployment of large MoE models by offloading experts to CPU.

## Table of Contents

- [Hardware Requirements](#hardware-requirements)
- [Prerequisites](#prerequisites)
- [Step 1: Download Model Weights](#step-1-download-model-weights)
- [Step 2: Launch SGLang Server](#step-2-launch-sglang-server)
- [Step 3: Send Inference Requests](#step-3-send-inference-requests)

## Hardware Requirements

**Minimum Configuration:**
- **GPU**: NVIDIA RTX 4090 48GB (or equivalent with at least 48GB VRAM available)
- **CPU**: x86 CPU with AVX512 support (e.g., Sapphire Rapids)
- **RAM**: At least 650GB system memory
- **Storage**: ~600GB for model weights (native INT4 weight, same weight dir for CPU and GPU)

**Tested Configuration:**

- **GPU**: 1/2/4/8x NVIDIA RTX 4090/L20 48GB
- **CPU**: 2x Intel(R) Xeon(R) Platinum 8488C
- **RAM**: 2TB DDR5 4800MHz
- **OS**: Linux (Ubuntu 20.04+ recommended)

## Prerequisites

Before starting, ensure you have:

1. **KT-Kernel installed** - Follow the [installation guide](./kt-kernel_intro.md#installation)
2. **SGLang installed** - Install the kvcache-ai fork of SGLang (one of):

```bash
# Option A: One-click install (from ktransformers root)
./install.sh

# Option B: pip install
pip install sglang-kt
```

3. **CUDA toolkit** - Compatible with your GPU (CUDA 11.8+ recommended)
4. **Hugging Face CLI** - For downloading models:
   ```bash
   pip install huggingface-hub
   ```

## Step 1: Download Model Weights

```bash
# Create a directory for models
mkdir -p /path/to/models
cd /path/to/models

# Download Kimi-K2-Thinking (INT4 for both CPU and GPU)
huggingface-cli download moonshotai/Kimi-K2-Thinking \
  --local-dir /path/to/kimi-k2-thinking
```

**Note:** Replace `/path/to/models` with your actual storage path throughout this tutorial.

## Step 2: Launch SGLang Server

Start the SGLang server with KT-Kernel integration for CPU-GPU heterogeneous inference.


### Launch Command (2x RTX 4090 Example)

```bash
python -m sglang.launch_server \
  --host 0.0.0.0 \
  --port 30001 \
  --model /path/to/kimi-k2-thinking \
  --kt-weight-path /path/to/kimi-k2-thinking \
  --kt-cpuinfer 96 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 8 \
  --kt-method RAWINT4 \
  --kt-gpu-prefill-token-threshold 400 \
  --kt-max-deferred-experts-per-token 1 \
  --trust-remote-code \
  --mem-fraction-static 0.94 \
  --served-model-name Kimi-K2-Thinking \
  --enable-mixed-chunk \
  --tensor-parallel-size 2 \
  --enable-p2p-check \
  --disable-shared-experts-fusion \
  --chunked-prefill-size 65536 \
  --max-total-tokens 65536 \
  --attention-backend flashinfer
```

It takes about 2~3 minutes to start the server.

See [KT-Kernel Parameters](https://github.com/kvcache-ai/ktransformers/tree/main/kt-kernel#kt-kernel-parameters) for detailed parameter tuning guidelines.

### Key Parameters

| Parameter | Description |
|-----------|-------------|
| `--kt-method RAWINT4` | CPU and GPU use the same INT4 weight. Set `--model` and `--kt-weight-path` to the same directory. |
| `--kt-num-gpu-experts` | Number of experts kept on GPU for decoding. |
| `--kt-gpu-prefill-token-threshold` | Token count threshold for prefill strategy. Below: hybrid CPU+GPU. Above: layerwise GPU prefill. |
| `--chunked-prefill-size` | Maximum tokens per prefill batch. |
| `--max-total-tokens` | Maximum total tokens in KV cache. |

### About `--kt-gpu-prefill-token-threshold`

This parameter controls the prefill strategy:

- **$\leq$ threshold**: Uses hybrid CPU+GPU prefill. No extra VRAM needed, but performance degrades slowly as token count increases.
- **> threshold**: Uses layerwise GPU prefill. Performance scales near-exponentially until reaching the bottleneck, but requires 9GB+ extra VRAM.

### Troubleshooting OOM

Layerwise prefill requires extra VRAM (~9GB + incremental cost with prefill length). If you encounter OOM, adjust these parameters based on your use case and hardware (refer to the recommended parameters table below):

| Parameter | VRAM Impact |
|-----------|-------------|
| `--kt-num-gpu-experts` | Reduces expert weight VRAM usage |
| `--chunked-prefill-size` | Reduces prefill extra VRAM allocation |
| `--max-total-tokens` | Reduces KV cache VRAM usage |

**Tip:** Test with an input of length `chunked-prefill-size` to verify your configuration won't OOM during prefill.


### Recommended Parameters

| GPU Config | `kt-num-gpu-experts` | `max-total-tokens` | `chunked-prefill-size` |
|------------|----------------------|---------------------|------------------------|
| 1x RTX 4090 (48GB) | 0 | 30000 | 30000 |
| 2x RTX 4090 (48GB) | 8 | 65536 | 65536 |
| 4x RTX 4090 (48GB) | 30 | 80000 | 65536 |
| 8x RTX 4090 (48GB) | 80 | 100000 | 65536 |

**Tip:** If your prefill and total length requirements are low (e.g., processing short texts), you can reduce `max-total-tokens` and `chunked-prefill-size` to free up VRAM for a larger `kt-num-gpu-experts`, which improves decode performance.

### Performance

The following prefill throughput (tokens/s) benchmarks were measured with single concurrency:

| GPU Config | 2048 tokens | 8192 tokens | 32768 tokens |
|------------|-------------|-------------|--------------|
| 1x RTX 4090 (48GB) | 53 | 184 | 290* |
| 2x RTX 4090 (48GB) | 85 | 294 | 529 |
| 4x RTX 4090 (48GB) | 118 | 415 | 818 |
| 8x RTX 4090 (48GB) | 130 | 435 | 1055 |

* Note: 1x RTX 4090 with layerwise prefill OOMs at 32768 tokens, so the 290 tokens/s is measured with qlen=30000.

## Step 3: Send Inference Requests

Once the server is running, you can send inference requests using the OpenAI-compatible API.

### Basic Chat Completion Request

```bash
curl -s http://localhost:30001/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Kimi-K2-Thinking",
    "stream": false,
    "messages": [
      {"role": "user", "content": "hi"}
    ]
  }'
```

### Example Response

```json
{
    "id": "cd0905562bf44513947284f80cc5634b",
    "object": "chat.completion",
    "created": 1764921457,
    "model": "Kimi-K2-Thinking",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "content": " <think> The user says \"hi\". This is a very simple greeting. I should respond in a friendly and helpful manner. Since I'm an AI assistant, I should be professional but approachable.\n\nPossible responses:\n1. \"Hello! How can I help you today?\"\n2. \"Hi there! What can I do for you?\"\n3. \"Hello! It's nice to hear from you. What would you like to talk about?\"\n4. \"Hi! I'm here to assist you with any questions you might have.\"\n\nI think option 1 is the most standard and professional. It's direct, friendly, and opens the door for the user to ask their question. I should keep it concise.\n\nLet me go with: \"Hello! How can I help you today?\" </think> Hello! How can I help you today?",
                "reasoning_content": null,
                "tool_calls": null
            },
            "logprobs": null,
            "finish_reason": "stop",
            "matched_stop": 163586
        }
    ],
    "usage": {
        "prompt_tokens": 26,
        "total_tokens": 189,
        "completion_tokens": 163,
        "prompt_tokens_details": null,
        "reasoning_tokens": 0
    },
    "metadata": {
        "weight_version": "default"
    }
}
```

## Advance Use Case: Running Claude Code with Native Kimi-K2-Thinking Local Backend

Add the following parameters to the SGLang launch command above to enable tool calling support:

```bash
--tool-call-parser kimi_k2 --reasoning-parser kimi_k2
```

With these parameters enabled, you can use [claude-code-router](https://github.com/musistudio/claude-code-router) to connect Kimi-K2-Thinking as a local backend for [Claude Code](https://github.com/anthropics/claude-code).

## Additional Resources

- [KT-Kernel Documentation](../../../kt-kernel/README.md)
- [SGLang GitHub](https://github.com/sgl-project/sglang)
- [Claude Code Router](https://github.com/musistudio/claude-code-router) - Route Claude Code to custom backends


================================================
FILE: doc/en/kt-kernel/MiniMax-M2.1-Tutorial.md
================================================
# Running MiniMax-M2.1 with Native Precision using SGLang and KT-Kernel

This tutorial demonstrates how to run MiniMax-M2.1 model inference using SGLang integrated with KT-Kernel. MiniMax-M2.1 provides native FP8 weights, enabling efficient GPU inference with reduced memory footprint while maintaining high accuracy.

## Table of Contents

- [Table of Contents](#table-of-contents)
- [Hardware Requirements](#hardware-requirements)
- [Prerequisites](#prerequisites)
- [Step 1: Download Model Weights](#step-1-download-model-weights)
- [Step 2: Launch Server with KT CLI](#step-2-launch-server-with-kt-cli)
  - [Advanced Options](#advanced-options)
  - [Dry Run](#dry-run)
  - [Key Parameters](#key-parameters)
- [Step 3: Send Inference Requests](#step-3-send-inference-requests)
  - [Option A: Interactive Chat with KT CLI](#option-a-interactive-chat-with-kt-cli)
  - [Option B: OpenAI-Compatible API](#option-b-openai-compatible-api)
- [Performance](#performance)
  - [Throughput (tokens/s)](#throughput-tokenss)
  - [Comparison with llama.cpp](#comparison-with-llamacpp)
- [Troubleshooting](#troubleshooting)
  - [OOM (Out of Memory) Issues](#oom-out-of-memory-issues)
- [Advanced Use Case: Running Claude Code with MiniMax-M2.1 Local Backend](#advanced-use-case-running-claude-code-with-minimax-m21-local-backend)
- [Additional Resources](#additional-resources)

## Hardware Requirements

**Minimum Configuration:**
- **GPU**: NVIDIA RTX 5090 32 GB (or equivalent with at least 32GB VRAM available)
- **CPU**: x86 CPU with AVX512 support (e.g., Intel Sapphire Rapids, AMD EPYC)
- **RAM**: At least 256GB system memory
- **Storage**: >220 GB for model weights (same weight dir for GPU and CPU)

**Tested Configuration:**

- **GPU**: 1/2 x NVIDIA GeForce RTX 5090 (32 GB)
- **CPU**: 2 x AMD EPYC 9355 32-Core Processor (128 threads)
- **RAM**: 1TB DDR5 5600MT/s ECC
- **OS**: Linux (Ubuntu 20.04+ recommended)

## Prerequisites

Before starting, ensure you have:

1. **SGLang installed**

    Install the kvcache-ai fork of SGLang (one of):

    ```bash
    # Option A: One-click install (from ktransformers root)
    ./install.sh

    # Option B: pip install
    pip install sglang-kt
    ```

2. **KT-Kernel installed**

    Please follow [kt-kernel](https://github.com/kvcache-ai/ktransformers/blob/main/kt-kernel/README.md)

    After installation, verify the CLI is working:

    ```bash
    kt version
    ```

3. **CUDA toolkit** - CUDA 12.0+ recommended for FP8 support
4. **Hugging Face CLI** - For downloading models:
   ```bash
   pip install -U huggingface-hub
   ```

## Step 1: Download Model Weights

Download the official MiniMax-M2.1 weights.

* huggingface: https://huggingface.co/MiniMaxAI/MiniMax-M2.1

    ```bash
    hf download MiniMaxAI/MiniMax-M2.1 --local-dir /path/to/minimax-m2.1
    ```

## Step 2: Launch Server with KT CLI

The simplest way to start the MiniMax-M2.1 server is using the `kt` CLI:

```bash
kt run m2.1
```

The CLI will automatically detect your hardware configuration and apply optimal parameters for your system.

### Advanced Options

For custom configurations, you can specify additional parameters:

```bash
# Use specific number of GPUs (tensor parallel)
kt run m2.1 --tensor-parallel-size 2

# Custom CPU threads and NUMA configuration
kt run m2.1 --cpu-threads 64 --numa-nodes 2
```

### Dry Run

To preview the command without executing:

```bash
kt run m2.1 --dry-run
```

See [KT-Kernel Parameters](https://github.com/kvcache-ai/ktransformers/tree/main/kt-kernel#kt-kernel-parameters) for detailed parameter tuning guidelines.

### Key Parameters

| Parameter | Description |
|-----------|-------------|
| `--kt-method FP8` | Enable FP8 inference mode for MiniMax-M2.1 native FP8 weights. |
| `--kt-cpuinfer` | Number of CPU inference threads. Set to physical CPU cores (not hyperthreads). |
| `--kt-threadpool-count` | Number of thread pools. Set to NUMA node count. |
| `--kt-num-gpu-experts` | Number of experts kept on GPU for decoding. |
| `--chunked-prefill-size` | Maximum tokens per prefill batch. |
| `--max-total-tokens` | Maximum total tokens in KV cache. |
| `--kt-gpu-prefill-token-threshold` | Token threshold for layerwise prefill strategy. |

## Step 3: Send Inference Requests

Once the server is running (default: `http://localhost:30000`), you can interact with the model in several ways:

### Option A: Interactive Chat with KT CLI

The easiest way to chat with the model:

```bash
kt chat
```

This opens an interactive terminal chat session. Type your messages and press Enter to send. Use `Ctrl+C` to exit.

### Option B: OpenAI-Compatible API

The server exposes an OpenAI-compatible API at `http://localhost:30000/v1`.

**curl example (streaming):**

```bash
curl http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "MiniMax-M2.1",
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": true
  }'
```


## Performance

### Throughput (tokens/s)

The following benchmarks were measured with single concurrency (Prefill tps / Decode tps):

| GPU  | CPU  | PCIe |  2048 tokens | 8192 tokens | 32768 tokens |
|------------|-------------|-------------|-------------|-------------|--------------|
| 1 x RTX 4090 (48 GB) | 2 x Intel Xeon Platinum 8488C| PCIe 4.0 | 129 / 21.8 | 669 / 20.9 | 1385 / 18.5 |
| 2 x RTX 4090 (48 GB) | 2 x Intel Xeon Platinum 8488C| PCIe 4.0 | 139 / 23.6 | 1013 / 23.3 | 2269 / 21.6 |
| 1 x RTX 5090 (32 GB) | 2 x AMD EPYC 9355 | PCIe 5.0 | 408 / 32.1 | 1196 / 31.4 | 2540 / 27.6 |
| 2 x RTX 5090 (32 GB) | 2 x AMD EPYC 9355 | PCIe 5.0 | 414 / 35.9 | 1847 / 35.5 | 4007 / 33.1 |

![Throughput in 2 x RTX 5090](../../assets/MiniMax-M2_speed.png)

### Comparison with llama.cpp

We benchmarked KT-Kernel + Sglang against llama.cpp to demonstrate the performance advantages of our CPU-GPU heterogeneous inference approach.

- **Weight formats**: KT-Kernel uses native unquantized FP8 weights from MiniMax-M2, while llama.cpp only supports quantized weights, so we used Q8_0 quantization for the llama.cpp benchmarks.

- **Test environment**: 2 x RTX 5090 (32 GB) with AMD EPYC 9355 CPUs, input tokens=32768, output tokens=512. We made our best effort to optimize llama.cpp performance, but we could not achieve optimal prefill and decode with a single command, so we used separate configurations for prefill and decode measurements.

![Performance Comparison with llama.cpp](../../assets/MiniMax-M2_comparison.png)

As shown in the chart, KT-Kernel achieves up to **>4.5x prefill** and **30% faster decode** compared to llama.cpp on the same hardware.

## Troubleshooting

### OOM (Out of Memory) Issues

Layerwise prefill requires extra VRAM (~3.6GB + incremental cost with prefill length). If you encounter OOM, adjust these parameters when launching the server:

| Parameter | VRAM Impact |
|-----------|-------------|
| `--kt-num-gpu-experts` | Reduces expert weight VRAM usage |
| `--chunked-prefill-size` | Reduces prefill extra VRAM allocation |
| `--max-total-tokens` | Reduces KV cache VRAM usage |

**Tip:** Test with an input of length `chunked-prefill-size` to verify your configuration won't OOM during prefill.

## Advanced Use Case: Running Claude Code with MiniMax-M2.1 Local Backend

```bash
kt run m2.1 --tool-call-parser minimax-m2 --reasoning-parser minimax-append-think
```

With the above command, you can use [claude-code-router](https://github.com/musistudio/claude-code-router) to connect MiniMax-M2.1 as a local backend for [Claude Code](https://github.com/anthropics/claude-code).

## Additional Resources

- [KT-Kernel Documentation](../../../kt-kernel/README.md)
- [SGLang GitHub](https://github.com/sgl-project/sglang)
- [KT-Kernel Parameters Reference](../../../kt-kernel/README.md#kt-kernel-parameters)

================================================
FILE: doc/en/kt-kernel/Native-Precision-Tutorial.md
================================================
# Running Native Precision Models with SGLang and KT-Kernel

This tutorial demonstrates how to run native precision MoE model inference using SGLang integrated with KT-Kernel. KTransformers v0.5.1+ supports multiple native precision formats, enabling efficient inference across various model architectures.

## Table of Contents

- [Supported Precision Formats](#supported-precision-formats)
- [Supported Models](#supported-models)
- [Hardware Requirements](#hardware-requirements)
- [Prerequisites](#prerequisites)
- [Launch Server](#launch-server)
  - [Example Configurations](#example-configurations)
  - [Key Parameters Reference](#key-parameters-reference)
- [Send Inference Requests](#send-inference-requests)
- [Technical Highlights](#technical-highlights)
  - [Experts Scheduling](#experts-scheduling)
  - [Dual Prefill Mechanism](#dual-prefill-mechanism)
- [Troubleshooting](#troubleshooting)
- [Additional Resources](#additional-resources)

## Supported Precision Formats

KTransformers supports multiple native precision formats via the `--kt-method` parameter:

| kt-method | Precision Format | Description | Instruction Set |
|-----------|-----------------|-------------|-----------------|
| `BF16` | BF16 Native | Zero precision loss, original weights | AMX + AVX512 |
| `FP8` | FP8 Blockwise | Block-wise scale quantization | AVX512 |
| `FP8_PERCHANNEL` | FP8 Per-Channel | Per-channel scale quantization | AVX512 |
| `RAWINT4` | INT4 Native | Same INT4 weights for CPU and GPU | AVX512 |

## Supported Models

| Model(sorted by lexicographical order) | kt-method | Precision | 
|-------|-----------|------------|
| **DeepSeek-V3/R1/V3.2** | `FP8` | FP8 |
| **GLM-4.7** | `FP8_PERCHANNEL`, `BF16` | FP8, BF16 |
| **Kimi-K2-Thinking** | `RAWINT4` | INT4 Native |
| **MiniMax-M2/M2.1** | `FP8` | FP8 |
| **Qwen3-235B-A22B** | `FP8`, `BF16` | FP8, BF16 |
| **Qwen3-30-A3B** | `FP8`, `BF16` | FP8, BF16 |
| **Qwen3-Next-80B-A3B** | `FP8`, `BF16` | FP8, BF16 |

## Hardware Requirements

**Minimum Configuration:**
- **GPU**: 1-2 x NVIDIA GPU with at least 24GB VRAM (RTX 4090/5090 or equivalent, depending on model)
- **CPU**: x86 CPU with AVX512 support (Intel Sapphire Rapids+, AMD EPYC)
  - BF16 additionally benefits from AMX support
- **RAM**: At least as much RAM as model size (e.g., 256GB+ for MiniMax-M2.1)
- **Storage**: Sufficient space for model weights (varies by model)

**Recommended Configuration:**
- **GPU**: 1-8 x NVIDIA RTX 5090 (32 GB) or equivalent
- **CPU**: 2 x AMD EPYC 9355 32-Core / Intel Xeon Platinum 8488C
- **RAM**: 1TB DDR5 5600MT/s ECC
- **PCIe**: PCIe 5.0 for optimal CPU-GPU data transfer
- **OS**: Linux (Ubuntu 20.04+ recommended)

## Prerequisites

Before starting, ensure you have:

1. **SGLang installed**

    Install the kvcache-ai fork of SGLang (one of):

    ```bash
    # Option A: One-click install (from ktransformers root)
    ./install.sh

    # Option B: pip install
    pip install sglang-kt
    ```

2. **KT-Kernel installed**

    Follow the [kt-kernel installation guide](https://github.com/kvcache-ai/ktransformers/blob/main/kt-kernel/README.md):

    ```bash
    git clone https://github.com/kvcache-ai/ktransformers.git
    cd ktransformers/kt-kernel
    ./install.sh
    ```

    Verify the installation:

    ```bash
    kt version
    ```

3. **CUDA toolkit** - CUDA 12.0+ recommended
4. **Hugging Face CLI** - For downloading models:
   ```bash
   pip install -U huggingface-hub
   ```
   
## Launch Server

### Example Configurations
For now, only `MiniMax-M2/M2.1`, `DeepSeek-V3/R1-0528/V3.2`, `Kimi-K2-Thinking` can run with kt-cli.

**DeepSeek-V3.2**

```bash
kt run V3.2 --kt-enable-dynamic-expert-update
```

**GLM-4.7**

```bash
python -m sglang.launch_server \
    --host 0.0.0.0 \
    --port 30000 \
    --model /path/to/GLM-4.7/ \
    --kt-weight-path /path/to/GLM-4.7/ \
    --kt-cpuinfer 100 \
    --kt-threadpool-count 2 \
    --kt-num-gpu-experts 15 \
    --kt-method BF16 \
    --kt-enable-dynamic-expert-update \
    --attention-backend flashinfer \
    --mem-fraction-static 0.80 \
    --chunked-prefill-size 16384 \
    --max-running-requests 2 \
    --max-total-tokens 32768 \
    --trust-remote-code \
    --served-model-name GLM-4.7 \
    --enable-mixed-chunk \
    --tensor-parallel-size 8 \
    --enable-p2p-check \
    --disable-shared-experts-fusion \
    --tool-call-parser glm47 \
    --reasoning-parser glm45 \
    --watchdog-timeout 3000 \
    --kt-gpu-prefill-token-threshold 1024
```

**GLM-4.7-FP8**

```bash
python -m sglang.launch_server \
    --host 0.0.0.0 \
    --port 30000 \
    --model /path/to/GLM-4.7-FP8/ \
    --kt-weight-path /path/to/GLM-4.7-FP8/ \
    --kt-cpuinfer 100 \
    --kt-threadpool-count 2 \
    --kt-num-gpu-experts 80 \
    --kt-method FP8_PERCHANNEL \
    --kt-enable-dynamic-expert-update \
    --attention-backend flashinfer \
    --mem-fraction-static 0.75 \
    --chunked-prefill-size 16384 \
    --max-running-requests 4 \
    --max-total-tokens 100000 \
    --trust-remote-code \
    --served-model-name GLM-4.7 \
    --enable-mixed-chunk \
    --tensor-parallel-size 8 \
    --enable-p2p-check \
    --disable-shared-experts-fusion \
    --watchdog-timeout 3000 \
    --fp8-gemm-backend triton \
    --kt-gpu-prefill-token-threshold 2048
```

**Qwen3-235B-A22B**

```bash
python -m sglang.launch_server \
    --host 0.0.0.0 \
    --port 30000 \
    --model /path/to/Qwen3-235B-A22B \
    --kt-weight-path /path/to/Qwen3-235B-A22B \
    --kt-cpuinfer 100 \
    --kt-threadpool-count 2 \
    --kt-num-gpu-experts 20 \
    --kt-method FP8 \
    --kt-enable-dynamic-expert-update \
    --kt-expert-placement-strategy uniform \
    --attention-backend flashinfer \
    --mem-fraction-static 0.80 \
    --chunked-prefill-size 16384 \
    --max-running-requests 4 \
    --max-total-tokens 100000 \
    --trust-remote-code \
    --served-model-name Qwen3-235B-A22B \
    --enable-mixed-chunk \
    --tensor-parallel-size 8 \
    --enable-p2p-check \
    --kt-gpu-prefill-token-threshold 2048
```

### Key Parameters Reference

| Parameter | Description |
|-----------|-------------|
| `--kt-method` | Precision format: `BF16`, `FP8_PERCHANNEL`, `FP8`, `RAWINT4`, `AMXINT4` |
| `--kt-cpuinfer` | Number of CPU inference threads (set to ~90% of physical cores) |
| `--kt-threadpool-count` | Number of thread pools (set to NUMA node count) |
| `--kt-num-gpu-experts` | Number of experts kept on GPU per layer |
| `--kt-enable-dynamic-expert-update` | Enable dynamic expert placement updates during Layerwise Prefill |
| `--kt-expert-placement-strategy` | Expert placement strategy |
| `--kt-gpu-prefill-token-threshold` | Token threshold for triggering Layerwise Prefill |
| `--chunked-prefill-size` | Maximum tokens per prefill batch |
| `--max-total-tokens` | Maximum total tokens in KV cache |

## Send Inference Requests

Once the server is running (default: `http://localhost:30000`), you can interact with the model:

### Option A: Interactive Chat with KT CLI

```bash
kt chat
```

This opens an interactive terminal chat session. Type your messages and press Enter to send. Use `Ctrl+C` to exit.

### Option B: OpenAI-Compatible API

The server exposes an OpenAI-compatible API at `http://localhost:30000/v1`.

**curl example (streaming):**

```bash
curl http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "MODEL_NAME",
    "messages": [{"role": "user", "content": "Hello! What can you help me with?"}],
    "stream": true
  }'
```

**Python example:**

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="none")

response = client.chat.completions.create(
    model="MODEL_NAME",
    messages=[{"role": "user", "content": "Explain quantum computing in simple terms."}],
    stream=True
)

for chunk in response:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
```

## Technical Highlights

### Experts Scheduling

See [CPU-GPU Expert Scheduling Tutorial](./experts-sched-Tutorial.md) for details.

### Dual Prefill Mechanism

KTransformers implements an adaptive dual prefill mechanism based on input token count:

| Mode | Trigger Condition | Computation |
|------|-------------------|-------------|
| **CPU-GPU Hybrid** | num_tokens < threshold | GPU + CPU |
| **Layerwise Prefill** | num_tokens >= threshold | GPU (CPU weights transferred to GPU) |

Set the `kt-gpu-prefill-token-threshold` parameter for best performance based on your workload.

## Troubleshooting

### OOM (Out of Memory) Issues

Layerwise prefill requires extra VRAM. If you encounter OOM, adjust these parameters:

| Parameter | VRAM Impact |
|-----------|-------------|
| `--kt-num-gpu-experts` | Reduces expert weight VRAM usage |
| `--chunked-prefill-size` | Reduces prefill extra VRAM allocation |
| `--max-total-tokens` | Reduces KV cache VRAM usage |
| `--mem-fraction-static` | Adjusts static memory fraction |

**Tips:**
- Test with an input of length `chunked-prefill-size` to verify configuration
- Reduce `--kt-num-gpu-experts` if GPU memory is limited
- For multi-GPU setups, ensure `--enable-p2p-check` is enabled
- For FP8 models, `--fp8-gemm-backend triton` may be required

## Additional Resources

- [KT-Kernel Documentation](../../../kt-kernel/README.md)
- [SGLang GitHub](https://github.com/sgl-project/sglang)
- [MiniMax-M2.1 Tutorial](./MiniMax-M2.1-Tutorial.md) - Detailed guide for MiniMax-M2.1 and other FP8 models
- [Kimi-K2-Thinking Tutorial](./Kimi-K2-Thinking-Native.md) - Detailed guide for Kimi-K2-Thinking


================================================
FILE: doc/en/kt-kernel/Qwen3-Coder-Next-Tutorial.md
================================================
# Running Qwen3-Coder-Next with SGLang and KT-Kernel

This tutorial demonstrates how to run Qwen3-Coder-Next (80B-A3B) model inference using SGLang integrated with KT-Kernel for CPU-GPU heterogeneous inference. Qwen3-Coder-Next is a Mixture-of-Experts code generation model. KT-Kernel supports both BF16 and FP8 precision backends, allowing you to choose between maximum quality and reduced memory footprint.

## Table of Contents

- [Table of Contents](#table-of-contents)
- [Hardware Requirements](#hardware-requirements)
- [Prerequisites](#prerequisites)
- [Step 1: Download Model Weights](#step-1-download-model-weights)
- [Step 2: Launch SGLang Server](#step-2-launch-sglang-server)
  - [Key Parameters](#key-parameters)
- [Step 3: Send Inference Requests](#step-3-send-inference-requests)
  - [Option A: Interactive Chat with KT CLI](#option-a-interactive-chat-with-kt-cli)
  - [Option B: OpenAI-Compatible API](#option-b-openai-compatible-api)
- [Performance](#performance)
- [Troubleshooting](#troubleshooting)
  - [OOM (Out of Memory) Issues](#oom-out-of-memory-issues)
- [Additional Resources](#additional-resources)

## Hardware Requirements

**Recommended Configuration:**
- **GPU**: 1 x NVIDIA RTX 4090 24 GB
- **CPU**: x86 CPU with AVX512 support (e.g., Intel Sapphire Rapids, AMD EPYC)
- **RAM**: At least 100GB system memory for FP8 model weights
- **Storage**: >85 GB for FP8 model weights (80.4 GB)

## Prerequisites

Before starting, ensure you have:

1. **SGLang installed**

    Install the kvcache-ai fork of SGLang (one of):

    ```bash
    # Option A: One-click install (from ktransformers root)
    ./install.sh

    # Option B: pip install
    pip install sglang-kt
    ```

2. **KT-Kernel installed**

    Please follow [kt-kernel](https://github.com/kvcache-ai/ktransformers/blob/main/kt-kernel/README.md)

    After installation, verify the CLI is working:

    ```bash
    kt version
    ```

3. **CUDA toolkit** - CUDA 12.0+ recommended (12.8+ for best FP8 support)
4. **Hugging Face CLI** - For downloading models:
   ```bash
   pip install -U huggingface-hub
   ```

## Step 1: Download Model Weights

Download the Qwen3-Coder-Next weights from Hugging Face.

```bash
# FP8
hf download Qwen/Qwen3-Coder-Next-FP8 \
  --local-dir /path/to/Qwen3-Coder-Next-FP8

# BF16
hf download Qwen/Qwen3-Coder-Next \
  --local-dir /path/to/Qwen3-Coder-Next
```

**Note:** Replace `/path/to/` with your actual storage path throughout this tutorial.

## Step 2: Launch SGLang Server

Start the SGLang server with KT-Kernel integration for CPU-GPU heterogeneous inference.

```bash
# FP8 Precision
python -m sglang.launch_server \
  --host 0.0.0.0 \
  --port 30000 \
  --model /path/to/Qwen3-Coder-Next-FP8 \
  --kt-weight-path /path/to/Qwen3-Coder-Next-FP8 \
  --kt-cpuinfer 96 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 100 \
  --kt-method FP8 \
  --kt-gpu-prefill-token-threshold 2048 \
  --attention-backend triton \
  --trust-remote-code \
  --mem-fraction-static 0.80 \
  --chunked-prefill-size 16384 \
  --max-running-requests 4 \
  --max-total-tokens 256000 \
  --served-model-name Qwen3-Coder-Next \
  --enable-mixed-chunk \
  --tensor-parallel-size 1 \
  --enable-p2p-check \
  --disable-shared-experts-fusion \
  --fp8-gemm-backend cutlass \
  --tool-call-parser qwen3_coder \
  --kt-enable-dynamic-expert-update

# BF16 Precision
python -m sglang.launch_server \
  --host 0.0.0.0 \
  --port 30000 \
  --model /path/to/Qwen3-Coder-Next \
  --kt-weight-path /path/to/Qwen3-Coder-Next \
  --kt-cpuinfer 96 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 60 \
  --kt-method BF16 \
  --kt-gpu-prefill-token-threshold 2048 \
  --attention-backend triton \
  --trust-remote-code \
  --mem-fraction-static 0.80 \
  --chunked-prefill-size 16384 \
  --max-running-requests 4 \
  --max-total-tokens 256000 \
  --served-model-name Qwen3-Coder-Next \
  --enable-mixed-chunk \
  --tensor-parallel-size 1 \
  --enable-p2p-check \
  --disable-shared-experts-fusion \
  --tool-call-parser qwen3_coder \
  --kt-enable-dynamic-expert-update
```

See [KT-Kernel Parameters](https://github.com/kvcache-ai/ktransformers/tree/main/kt-kernel#kt-kernel-parameters) for detailed parameter tuning guidelines.

### Key Parameters

| Parameter | Description |
|-----------|-------------|
| `--kt-method FP8 / BF16` | Inference precision mode. FP8 halves weight memory; BF16 uses full precision. |
| `--kt-cpuinfer` | Number of CPU inference threads. |
| `--kt-threadpool-count` | Number of thread pools. Set to NUMA node count. |
| `--kt-num-gpu-experts` | Number of experts kept on GPU for decoding. |
| `--kt-gpu-prefill-token-threshold` | Token threshold for layerwise prefill strategy. |
| `--kt-enable-dynamic-expert-update` | Enable dynamic expert placement on GPU based on routing statistics. |
| `--kt-expert-placement-strategy` | Expert placement strategy. Default: `uniform`. See [Expert Scheduling Tutorial](experts-sched-Tutorial.md) for other options. |
| `--chunked-prefill-size` | Maximum tokens per prefill batch. |
| `--max-total-tokens` | Maximum total tokens in KV cache. |
| `--tool-call-parser` | Tool call parser for function calling support (use `qwen3_coder`). |
| `--fp8-gemm-backend` | GEMM backend for FP8 computation. |

## Step 3: Send Inference Requests

Once the server is running (default: `http://localhost:30000`), you can interact with the model in several ways:

### Option A: Interactive Chat with KT CLI

The easiest way to chat with the model:

```bash
kt chat
```

This opens an interactive terminal chat session. Type your messages and press Enter to send. Use `Ctrl+C` to exit.

### Option B: OpenAI-Compatible API

The server exposes an OpenAI-compatible API at `http://localhost:30000/v1`.

**curl example (streaming):**

```bash
curl http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen3-Coder-Next",
    "messages": [{"role": "user", "content": "Write a Python function to compute the Fibonacci sequence."}],
    "stream": true
  }'
```

**curl example (non-streaming):**

```bash
curl -s http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen3-Coder-Next",
    "messages": [{"role": "user", "content": "Hello! What can you help me with?"}],
    "stream": false
  }'
```

## Performance

The following benchmarks were measured with single concurrency (Prefill tps / Decode tps):

| GPU | CPU | PCIe | Precision | 64 tokens | 2048 tokens | 8192 tokens | 32768 tokens |
|-----|-----|------|-----------|-------------|-------------|-------------|--------------|
| 1 x RTX 5090 (32 GB) | 2 x AMD EPYC 9355 | PCIe 5.0 | FP8  | 362 / 75.9 | 1746 / 75.6 | 2407 / 69.1 | 6233 / 51.7 | 

## Troubleshooting

### OOM (Out of Memory) Issues

Layerwise prefill requires extra VRAM. If you encounter OOM, adjust these parameters when launching the server:

| Parameter | VRAM Impact |
|-----------|-------------|
| `--kt-num-gpu-experts` | Reduces expert weight VRAM usage |
| `--chunked-prefill-size` | Reduces prefill extra VRAM allocation |
| `--max-total-tokens` | Reduces KV cache VRAM usage |
| `--mem-fraction-static` | Lower values reserve more VRAM headroom (default: 0.80) |

**Tip:** Test with an input of length `chunked-prefill-size` to verify your configuration won't OOM during prefill.

## Additional Resources

- [Qwen3-Coder-Next Model Card](https://huggingface.co/Qwen/Qwen3-Coder-Next)
- [KT-Kernel Documentation](../../../kt-kernel/README.md)
- [SGLang GitHub](https://github.com/sgl-project/sglang)
- [KT-Kernel Parameters Reference](../../../kt-kernel/README.md#kt-kernel-parameters)


================================================
FILE: doc/en/kt-kernel/README.md
================================================
# kt-kernel Docs

================================================
FILE: doc/en/kt-kernel/amd_blis.md
================================================

### USAGE
1. To use this feature, you should use MOE_INT8 method (i.e. `--kt-method MOE_INT8`)
2. !!! you should see the method in the below motivation section to  build and install the correct amd blis lib.
3. Before your install you should set `export CPUINFER_ENABLE_BLIS=ON` to enable
### Motivation

To accelerate the prefill speed of AMD. Reference the https://github.com/amd/blis repo. And the usage should add the LPGEMM support. See the docs here: https://www.cs.utexas.edu/~flame/BLISRetreat2024/slides/Bhaskar_BLIS_Retreat_2024_AMD_LPGEMM_0.pdf
I reference this api guide for the code: https://docs.amd.com/r/en-US/57404-AOCL-user-guide/AOCL-BLAS?section=lpgemm-in-aocl-blas
To use lpgemm, see the doc here: 
https://www.amd.com/content/dam/amd/en/documents/developer/version-4-1-documents/aocl/aocl-4-1-user-guide.pdf
<img width="2134" height="1240" alt="Image" src="https://github.com/user-attachments/assets/d4008736-c1c7-422e-a747-155fc2eb4141" />
So, you just need to enable aocl_gemm add-on, examples are here:https://github.com/amd/blis/blob/master/docs/CMakeBuildSystem.md

<img width="2222" height="702" alt="Image" src="https://github.com/user-attachments/assets/bf924b69-e01d-460d-b4cd-122e77ec982d" />
You can see how to install it.


================================================
FILE: doc/en/kt-kernel/deepseek-v3.2-sglang-tutorial.md
================================================
# Running DeepSeek V3.2 with SGLang and KT-Kernel

This tutorial demonstrates how to run DeepSeek V3.2 model inference using SGLang integrated with KT-Kernel for CPU-GPU heterogeneous inference. This setup enables efficient deployment of large MoE models by offloading experts to CPU.

## Table of Contents

- [Hardware Requirements](#hardware-requirements)
- [Prerequisites](#prerequisites)
- [Step 1: Download Model Weights](#step-1-download-model-weights)
- [Step 2: Quantize CPU Weights](#step-2-quantize-cpu-weights)
- [Step 3: Launch SGLang Server](#step-3-launch-sglang-server)
- [Step 4: Send Inference Requests](#step-4-send-inference-requests)

## Hardware Requirements

**Minimum Configuration:**
- **GPU**: NVIDIA L20 48GB (or equivalent with at least 27GB VRAM available)
- **CPU**: Intel Xeon with AMX support (e.g., Sapphire Rapids)
- **RAM**: At least 350GB system memory for INT4 quantization
- **Storage**: ~1TB for model weights (FP8 + INT4 quantized)

**Tested Configuration:**
- **GPU**: NVIDIA L20 48GB
- **CPU**: Intel(R) Xeon(R) Platinum 8488C
- **RAM**: 2TB DDR5
- **OS**: Linux (Ubuntu 20.04+ recommended)

## Prerequisites

Before starting, ensure you have:

1. **KT-Kernel installed** - Follow the [installation guide](./kt-kernel_intro.md#installation)
2. **SGLang installed** - Install the kvcache-ai fork: `pip install sglang-kt` or run `./install.sh` from the ktransformers root
3. **CUDA toolkit** - Compatible with your GPU (CUDA 11.8+ recommended)
4. **Hugging Face CLI** - For downloading models:
   ```bash
   pip install huggingface-hub
   ```

## Step 1: Download Model Weights

DeepSeek V3.2 requires downloading model repositories:

1. **DeepSeek-V3.2**
2. **DeepSeek-V3.2-Speciale**

```bash
# Create a directory for models
mkdir -p /path/to/models
cd /path/to/models

# Download DeepSeek-V3.2 (FP8 weights for GPU)
huggingface-cli download deepseek-ai/DeepSeek-V3.2 \
  --local-dir /path/to/deepseek-v3.2

# Download DeepSeek-V3.2-Speciale (if needed)
huggingface-cli download deepseek-ai/DeepSeek-V3.2-Speciale \
  --local-dir /path/to/deepseek-v3.2-speciale
```

**Note:** Replace `/path/to/models` with your actual storage path throughout this tutorial.

## Step 2: Quantize CPU Weights

Convert the FP8 GPU weights to INT4 quantized CPU weights using the provided conversion script.

### Conversion Command

For a 2-NUMA system with 60 physical cores:

```bash
cd /path/to/ktransformers/kt-kernel

python scripts/convert_cpu_weights.py \
  --input-path /path/to/deepseek-v3.2 \
  --input-type fp8 \
  --output /path/to/deepseek-v3.2-INT4 \
  --quant-method int4 \
  --cpuinfer-threads 60 \
  --threadpool-count 2 \
  --no-merge-safetensor
```

## Step 3: Launch SGLang Server

Start the SGLang server with KT-Kernel integration for CPU-GPU heterogeneous inference.

### Launch Command

For single NVIDIA L20 48GB + 2-NUMA CPU system:

```bash
python -m sglang.launch_server \
  --host 0.0.0.0 \
  --port 30000 \
  --model /path/to/deepseek-v3.2 \
  --kt-weight-path /path/to/deepseek-v3.2-INT4 \
  --kt-cpuinfer 60 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 1 \
  --attention-backend triton \
  --trust-remote-code \
  --mem-fraction-static 0.98 \
  --chunked-prefill-size 4096 \
  --max-running-requests 32 \
  --max-total-tokens 40000 \
  --served-model-name DeepSeek-V3.2 \
  --enable-mixed-chunk \
  --tensor-parallel-size 1 \
  --enable-p2p-check \
  --disable-shared-experts-fusion \
  --kt-method AMXINT4
```

### Resource Usage

- **GPU VRAM:** ~27GB (for 1 GPU expert per layer + attention)
- **System RAM:** ~350GB (for INT4 quantized CPU experts)

## Step 4: Send Inference Requests

Once the server is running, you can send inference requests using the OpenAI-compatible API.

### Basic Chat Completion Request

```bash
curl -s http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "DeepSeek-V3.2",
    "stream": false,
    "messages": [
      {"role": "user", "content": "hi"}
    ]
  }'
```

### Example Response

```json
{
  "id": "adbb44f6aafb4b58b167e42fbbb1eed3",
  "object": "chat.completion",
  "created": 1764675126,
  "model": "DeepSeek-V3.2",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Hi there! 👋 \n\nThanks for stopping by! How can I help you today? Feel free to ask me anything - I'm here to assist with questions, explanations, conversations, or whatever you need! 😊\n\nIs there something specific on your mind, or would you like to know more about what I can do?",
        "reasoning_content": null,
        "tool_calls": null
      },
      "logprobs": null,
      "finish_reason": "stop",
      "matched_stop": 1
    }
  ],
  "usage": {
    "prompt_tokens": 5,
    "total_tokens": 72,
    "completion_tokens": 67,
    "prompt_tokens_details": null,
    "reasoning_tokens": 0
  },
  "metadata": {
    "weight_version": "default"
  }
}
```

## Additional Resources

- [KT-Kernel Documentation](../../../kt-kernel/README.md)
- [DeepSeek V3.2 Model Card](https://huggingface.co/deepseek-ai/DeepSeek-V3.2)
- [SGLang GitHub](https://github.com/sgl-project/sglang)

================================================
FILE: doc/en/kt-kernel/experts-sched-Tutorial.md
================================================
# CPU-GPU Expert Scheduling Tutorial

This tutorial demonstrates how to use the CPU-GPU expert scheduling feature in KTransformers with SGLang. This feature introduces a flexible GPU expert mask system that allows intelligent placement of MoE experts across CPU and GPU, optimizing inference performance based on workload patterns.

## Table of Contents

- [Table of Contents](#table-of-contents)
- [Hardware Requirements](#hardware-requirements)
- [Prerequisites](#prerequisites)
- [Step 1: Download Model Weights](#step-1-download-model-weights)
- [Step 2: Launch Server with Expert Scheduling](#step-2-launch-server-with-expert-scheduling)
  - [Basic Usage](#basic-usage)
  - [Expert Placement Strategies](#expert-placement-strategies)
  - [Key Parameters](#key-parameters)
- [Step 3: Send Inference Requests](#step-3-send-inference-requests)
  - [Option A: Interactive Chat with KT CLI](#option-a-interactive-chat-with-kt-cli)
  - [Option B: OpenAI-Compatible API](#option-b-openai-compatible-api)
- [Performance](#performance)
- [Troubleshooting](#troubleshooting)
- [Additional Resources](#additional-resources)

## Hardware Requirements

**Minimum Configuration:**
- **GPU**: NVIDIA RTX 4090 24 GB (or equivalent with at least 24GB VRAM available)
- **CPU**: x86 CPU with AVX512 support (e.g., Intel Sapphire Rapids, AMD EPYC)
- **RAM**: At least 256GB system memory
- **Storage**: Sufficient space for model weights

**Tested Configuration:**

- **GPU**: 4 x NVIDIA GeForce RTX 4090 (24 GB)
- **CPU**: Intel Xeon Gold 6454S
- **RAM**: 512GB DDR5
- **OS**: Linux (Ubuntu 20.04+ recommended)

## Prerequisites

Before starting, ensure you have:

1. **SGLang installed**

    Install the kvcache-ai fork of SGLang (one of):

    ```bash
    # Option A: One-click install (from ktransformers root)
    ./install.sh

    # Option B: pip install
    pip install sglang-kt
    ```

2. **KTransformers installed**

    ```bash
    git clone https://github.com/kvcache-ai/ktransformers.git
    cd ktransformers/kt-kernel
    bash ./install.sh
    ```

    After installation, verify the CLI is working:

    ```bash
    kt version
    ```

3. **CUDA toolkit** - CUDA 12.0+ recommended
4. **Hugging Face CLI** - For downloading models:
   ```bash
   pip install -U huggingface-hub
   ```

## Step 1: Download Model Weights

Download your preferred MoE model weights. This feature supports various MoE models including:

* **Qwen3-Next-80B-A3B-Instruct-FP8**

    ```bash
    huggingface-cli download Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 --local-dir /path/to/qwen3-next-80b
    ```

## Step 2: Launch Server with Expert Scheduling

### Basic Usage

The simplest way to start the server with expert scheduling:

```bash
python -m sglang.launch_server \
    --model /path/to/model \
    --kt-num-gpu-experts 8 \
    --kt-expert-placement-strategy uniform
```

### Expert Placement Strategies

The system provides four expert placement strategies:

| Strategy | Description | Use Case |
|----------|-------------|----------|
| `uniform` | Distributes GPU experts evenly across all MoE layers | Default, no prior statistics needed |
| `frequency` | Places most frequently activated experts on GPU | Best performance when activation statistics are available |
| `front-loading` | Fills GPU experts from the first layer onwards | Testing or specific workload patterns |
| `random` | Randomly selects experts with fixed seed (42) | Baseline comparison |

**Using Frequency Strategy (Recommended for best performance):**

```bash
python -m sglang.launch_server \
    --model /path/to/model \
    --kt-num-gpu-experts 8 \
    --kt-expert-placement-strategy frequency \
    --init-expert-location /path/to/activation_stats.pt
```

**Using Dynamic Expert Update:**

```bash
python -m sglang.launch_server \
    --model /path/to/model \
    --kt-num-gpu-experts 8 \
    --kt-expert-placement-strategy frequency \
    --init-expert-location /path/to/activation_stats.pt \
    --kt-enable-dynamic-expert-update \
    --kt-gpu-prefill-token-threshold 512
```

### Key Parameters

| Parameter | Description |
|-----------|-------------|
| `--kt-num-gpu-experts` | Number of GPU experts per MoE layer. Internally multiplied by the number of MoE layers to get the total GPU experts. Ignored if `--kt-gpu-experts-ratio` is set. |
| `--kt-gpu-experts-ratio` | Ratio of total experts to place on GPU (0.0-1.0). If set, overrides `--kt-num-gpu-experts`. Example: 0.1 means 10% of all experts across all layers will be on GPU. |
| `--kt-expert-placement-strategy` | Expert placement strategy: `frequency`, `uniform`, `front-loading`, or `random`. Default: `uniform`. |
| `--init-expert-location` | Path to activation statistics file (`.pt`) for `frequency` strategy. |
| `--kt-enable-dynamic-expert-update` | Enable dynamic expert update during inference. |
| `--kt-gpu-prefill-token-threshold` | Token threshold for triggering dynamic expert redistribution during prefill. |
| `--record-kt-gpu-expert-distribution` | Enable recording of GPU expert distribution for analysis. |
| `--expert-distribution-recorder-mode` | Recording mode: `stat` (default), `stat_approx`, `per_pass`, or `per_token`. |

## Step 3: Send Inference Requests

Once the server is running (default: `http://localhost:30000`), you can interact with the model in several ways:

### Option A: Interactive Chat with KT CLI

The easiest way to chat with the model:

```bash
kt chat
```

This opens an interactive terminal chat session. Type your messages and press Enter to send. Use `Ctrl+C` to exit.

### Option B: OpenAI-Compatible API

The server exposes an OpenAI-compatible API at `http://localhost:30000/v1`.

**curl example (streaming):**

```bash
curl http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "model-name",
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": true
  }'
```

## Performance

### Throughput (tokens/s)

The following benchmarks were measured on Qwen3-Next-80B-A3B-Instruct-FP8 with 4 x RTX 4090, Intel Xeon Gold 6454S, tensor parallel size 4, using ShareGPT dataset:

| GPU Expert Ratio | random | uniform | front-loading | frequency | dynamic-expert-update |
|------------------|--------|---------|---------------|-----------|----------------------|
| 0% | 53.01 | 52.96 | 54.18 | 52.72 | 53.37 |
| 10% | 56.63 | 56.57 | 57.18 | 58.60 | 70.22 |
| 20% | 58.75 | 60.28 | 58.82 | 61.92 | 74.73 |
| 30% | 62.86 | 62.08 | 63.87 | 66.50 | 75.55 |
| 40% | 66.81 | 66.82 | 67.45 | 72.78 | 80.98 |
| 50% | 70.38 | 65.25 | 73.65 | 76.19 | 81.17 |
| 60% | 71.33 | 72.80 | 77.95 | 82.33 | 82.30 |
| 70% | 74.40 | 76.17 | 81.59 | 89.37 | 88.70 |
| 80% | 79.71 | 79.20 | 89.20 | 100.67 | 92.31 |
| 90% | 88.82 | 81.06 | 98.14 | 107.15 | 95.04 |
| 100% | 112.61 | 112.32 | 111.82 | 114.26 | 112.99 |

The `frequency` and `dynamic-expert-update` strategies show significant performance improvements over baseline strategies, especially at lower GPU expert ratios.

## Troubleshooting

### OOM (Out of Memory) Issues

If you encounter OOM, adjust these parameters when launching the server:

| Parameter | VRAM Impact |
|-----------|-------------|
| `--kt-num-gpu-experts` / `--kt-gpu-experts-ratio` | Reduces expert weight VRAM usage |
| `--chunked-prefill-size` | Reduces prefill extra VRAM allocation |
| `--max-total-tokens` | Reduces KV cache VRAM usage |

### Dynamic Expert Update Not Triggering

Ensure all conditions are met:
1. `--kt-enable-dynamic-expert-update` is enabled
2. `--kt-gpu-prefill-token-threshold` is set
3. Prefill length >= threshold value

### Statistics Recording

To save expert distribution statistics to a custom path, set the environment variable:

```bash
export SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR=/path/to/output
```

## Additional Resources

- [KT-Kernel Documentation](../../../kt-kernel/README.md)
- [SGLang GitHub](https://github.com/sgl-project/sglang)
- [KTransformers GitHub](https://github.com/kvcache-ai/ktransformers)


================================================
FILE: doc/en/kt-kernel/kt-cli.md
================================================
# KT-CLI

> ⚠️ **Note:** This feature is currently under active development. Many functionalities are not yet complete and are being improved. Please stay tuned for updates.

## Design Philosophy

KT-CLI is designed to **minimize the burden of reading documentation**. Instead of requiring users to read lengthy docs, the CLI provides:

- **Interactive Mode**: Run commands without arguments to get step-by-step guided prompts
- **Direct Mode**: Pass arguments directly for automation and scripting
    > 💡 **Tip:** The arguments are fully compatible with the previous SGLang + KTransformers approach, so you can migrate seamlessly.

Simply run a command, and the CLI will interactively guide you through the process!

## Usage

You can check the usage by `kt --help`

```
kt [OPTIONS] COMMAND [ARGS]...
```

KTransformers CLI - A unified command-line interface for KTransformers.

## Options

| Option | Description |
|--------|-------------|
| `--help` | Show this message and exit. |

## Commands

| Command | Description |
|---------|-------------|
| `version` | Show version information |
| `chat` | Interactive chat with running model |
| `quant` | Quantize model weights |
| `bench` | Run full benchmark |
| `microbench` | Run micro-benchmark |
| `doctor` | Diagnose environment issues |
| `model` | Manage models and storage paths |
| `config` | Manage configuration |
| `sft` | Fine-tuning with LlamaFactory |


================================================
FILE: doc/en/llama4.md
================================================
# 🦙 Tutorial: LLaMA 4 Multi-Concurrency Support with KTransformers (Balance Serve Backend)

## 📌 Overview

We are pleased to announce that **KTransformers** now provides **experimental support for LLaMA 4 models** through the powerful `balance_serve` backend introduced in **v0.2.4**. This update is available under the dedicated development branch: [`support-llama4`](https://github.com/kvcache-ai/ktransformers/tree/support-llama4), specifically targeting the newly released **Meta LLaMA 4** model architecture.

⚠️ This support is currently **not available on the main branch** due to dependencies on newer versions of `transformers`, and **compatibility limitations with inference of currently supported models**. Work is underway to integrate this into the mainline once broader stability and compatibility are validated.

💡 **If you already have an environment based on the main branch**, it is **strongly recommended to create a new environment** to avoid potential dependency conflicts.

------

## 🔗 Model & Resource Links

- 🔥 Official LLaMA 4 Release: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
   (Note: LLaMA 4 models are served through the Meta repository. Make sure to **agree to terms** before downloading.)
- 🧠 GGUF Format (quantized models):
  - https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF

------

## 🧪 Demo

https://github.com/user-attachments/assets/449706f1-784b-4931-b2ba-07687c1aca54

------

## Resource Requirements

The Scout model running with 16 Experts requires approximately 65 GB of memory and 10 GB of GPU memory, while the Maverick model with 128 Experts requires approximately 270 GB of memory and 12 GB of GPU memory.

------

## ⚙️ Usage Instructions

### 1. Clone `support-llama4` Branch

```bash
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git checkout support-llama4
git submodule update --init --recursive
```

### 2. Set Up Environment

```bash
# Download Miniconda
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh

# Create environment
conda create --name ktransformers python=3.11
conda activate ktransformers

# Install required libraries
conda install -c conda-forge libstdcxx-ng

# Verify GLIBCXX version (should include 3.4.32)
strings ~/anaconda3/envs/ktransformers/lib/libstdc++.so.6 | grep GLIBCXX

sudo apt install libtbb-dev libssl-dev libcurl4-openssl-dev libaio1 libaio-dev libfmt-dev libgflags-dev zlib1g-dev patchelf
pip3 install packaging ninja cpufeature numpy openai
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
```

### 3. Build with Balance Serve Support

```bash
# Install single NUMA dependencies
USE_BALANCE_SERVE=1  bash ./install.sh
# For those who have two cpu and 1T RAM（Dual NUMA）:
USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh
```

### 4. Use our custom config.json

Currently, you need to copy the content of our custom config file into the `config.json` under your `--model_path`.  
- Use [scout_config.json](https://github.com/kvcache-ai/ktransformers/blob/support-llama4/doc/en/scout_config.json) for the Llama-4-Scout-17B-16E model  
- Use [maverick_config.json](https://github.com/kvcache-ai/ktransformers/blob/support-llama4/doc/en/maverick_config.json) for the Llama-4-Maverick-17B-128E model  

Please make sure to replace the content of `config.json` with the appropriate one accordingly.

### 5. Run LLaMA 4 Inference Server

Make sure you have:

- `--model_path` pointing to a local config directory (not a Hugging Face name).
- `--gguf_path` pointing to the folder containing quantized `.gguf` weights.

```bash
python ktransformers/server/main.py \
  --port 10002 \
  --model_path <path_to_safetensor_config> \
  --gguf_path <path_to_gguf_files> \
  --optimize_config_path ktransformers/optimize/optimize_rules/Llama4-serve.yaml \
  --max_new_tokens 1024 \
  --cache_lens 32768 \
  --chunk_size 256 \
  --max_batch_size 4 \
  --backend_type balance_serve \
```

### 5. Access server

```
curl -X POST http://localhost:10002/v1/chat/completions \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "hello"}
    ],
    "model": "Llama4",
    "temperature": 0.3,
    "top_p": 1.0,
    "stream": true
  }'
```

------

## 📌 Limitations

- ✅ **Only `balance_serve` backend is supported** for LLaMA 4 models in this version.
- ⚠️ Requires **`transformers==4.51.0`** or newer. Due to potential compatibility issues with older toolchains, we have **not merged this branch to main yet**.
- ❌ Multimodal models are not supported yet in this version. Support will be added in future releases.


================================================
FILE: doc/en/long_context_introduction.md
================================================
# KVCache Long Context

## TL;DR

Training larger models and supporting longer text sequences are currently the two most widely agreed-upon directions toward achieving AGI. After lowering the barrier for local inference with trillion-parameter MoE models, the second showcase scenario for KTransformers is reducing the inference barrier for ultra-long context sequences. Recently, both ChatGLM and InternLM have released open-source models supporting 1M tokens of context. This article will use InternLM2.5-7B-Chat-1M as an example to introduce a method that leverages the sparsity of attention to accelerate long-text inference on heterogeneous CPU/GPU systems.

After optimization, KTransformers has achieved native-precision inference for 128K and even 1M tokens of context on a single 24GB GPU with CPU/DRAM support. In the 128K context scenario, the generation speed is 7.1 times faster than llama.cpp, while also achieving 100% accuracy on relatively simple test sets like "needle in haystack" and "passkey". On the more challenging dataset kvretrieval, through flexible framework configurations, we can achieve a **6.22x speedup** during inference while obtaining even higher scores than running the original model directly (**21.2 -> 24.4**). In the 1M context scenario on a single 24GB GPU, KTransformers can similarly achieve a 16 tokens/s inference speed, nearly 10 times faster than llama.cpp under the same conditions, with the "needle in haystack" evaluation score even surpassing the original model (**89.31 -> 92.88**).

Project url: https://github.com/kvcache-ai/ktransformers

## Mathematical Principle: The computational overhead of long-text inference and the sparsity in Attention caused by Softmax.

As the demand for longer context windows increases, not only have commercial large models like Kimi and Claude/Gemini started supporting increasingly longer context windows, but open-source models have also begun to catch up. Notably, both ChatGLM 4 and InternLM 2.5 have released versions that are under 10 billion parameters but support up to 1 million tokens of context. However, despite the relatively small size of these models, the enormous KVCache required for such ultra-long contexts still prevents local users from practically running these models. As shown in the figure below, while the InternLM2.5-7B-Chat-1M model weights only require 15.49GB of GPU memory, an additional 145.49GB is needed to store the entire 1M-token KVCache, which is clearly beyond the memory capacity of local users. Even when using the KVCache Offload feature of llama.cpp to offload the KVCache to CPU/DRAM, barely making the model runnable, performance remains unacceptable due to the need to fully scan the entire KVCache each time a single token is generated.

| <img title="" src="../assets/internlm_memory.png" alt="internlm_memory" width="882"> | <img src="../assets/SparQ_attention.png" title="" alt="sparQ" width="691"> |
| ------------------------------------------------------------------------------------ | -------------------------------------------------------------------------- |

Fortunately, many studies have noticed that attention distribution during the inference phase tends to be **sparse**. For example, the right figure shows SparQ's experimental statistics based on LLaMa 7B, where less than 1% of tokens in a 3k context have relatively high attention scores. Similar conclusions are not only reflected in many other papers, such as H2O, Quest, InfLLM, and SnapKV, but we have also further validated this through long-text experiments with InternLM 2.5-7B-1M. Although the proportion isn't as extreme as 1%, due to the inherent head-focused effect of the softmax operation in attention mechanisms, it is theoretically possible that if we can identify in advance which tokens have high attention scores, scanning less than 5% of the tokens would suffice to essentially replicate the original result.

Thus, the problem narrows down to how to quickly identify these tokens with high attention scores without scanning them all. In the following sections, we will first briefly survey several key related papers, then summarize and propose a general framework we designed and implemented within KTransformers—a highly efficient sparse attention operator for CPUs.

## Related Papers and Conclusions

### Prune or Retrieval？

Based on the aforementioned points, we studied papers from recent years related to sparse selection in KVCache. The earliest of these is the paper H2O, which suggested that the attention distribution during inference is sparse and that only 5% of the KVCache is needed during inference. Following this, a series of works built on H2O's approach by designing more complex methods for selecting tokens that perform better in different scenarios. These methods are quite reasonable for single-word inference. However, as we previously explored in the Mooncake project, **we believe that the future trend is to precompute reusable KVCache as much as possible, and then use it to answer different questions.** This "compute once, use many" approach aims to reduce computational costs. Therefore, with this goal in mind, we prefer not to delete any tokens from the KVCache, or at least not remove a significant portion of them, to ensure that different questions can focus on different parts of the context in the future.

![InfLLM Framework](../assets/InfLLM_framework.png)

We further investigated related research, among which InfLLM proposed a very promising framework. Not only does it recognize that attention is sparse, but it also suggests that overly long contexts can cause attention to be dispersed into irrelevant noise, thereby reducing the model's ability to focus on key information. To address this issue, InfLLM introduces an external memory module (Memory Units) to store the context's KVCache. In each computation step, the most relevant semantic information is retrieved from this external memory module to participate in the calculation, thus enhancing the model's ability to handle long-context inference.

Specifically, InfLLM organizes the external memory module using semantic blocks composed of neighboring tokens and employs a sliding window mechanism during computation. In each step, it selects only the semantic blocks at the head of the context (Initial Tokens), the blocks near the current token (Local Tokens), and a few blocks with the highest semantic similarity to the current token to participate in the attention calculation. As shown in equation 1, to efficiently retrieve the blocks with the highest similarity, InfLLM selects a few representative tokens whose scores $$r_m$$ are the highest within each block. Use Equation 2 to calculate the semantic similarity between the current token and each semantic block.

![InfLLM Equation](../assets/InfLLM_equation.jpg)

Compared to the previously mentioned H2O, the differences in InfLLM are as follows:

1. The KVCache is not discarded but stored in memory and dynamically loaded onto the GPU during inference.

2. KVCache is managed at the granularity of blocks rather than tokens, with each block selecting a few tokens as its representative index tokens.

InfLLM's proposed method aligns with our "compute once, use many" approach of reusing KVCache. The external memory units in this method can be offloaded to CPU/DRAM or even SSD storage, allowing different parts to be selected for computation based on the specific question. This significantly improves the efficiency of attention computation.

### Other Improvements

Similarly, after InfLLM, Quest also manages tokens at the granularity of blocks. Quest analyzed the recall rate of key tokens in H2O and full attention, finding that the Top-10 attention score token recall rate for the H2O algorithm is around 50%, which indicates that too much key information was lost. To improve the recall rate of key tokens, Quest chooses two "representative tokens" from each block for retrieval. In the prefill stage, each KVCache block records the maximum and minimum values for each channel, as shown in the figure below under "Reduced Keys," which contains the element-wise min key and element-wise max key.

During the attention computation stage, the dot product is computed between the current query vector and the max key and min key of each KVCache block, respectively. Then, for each channel, the maximum value between the two resulting product vectors is selected and summed to serve as the upper bound of the relevance score for that KVCache block, as shown in stage 1 of the diagram. Based on the relevance scores, the top-k KVCache blocks are selected to participate in the attention computation, as illustrated in stage 2 of the diagram.

![Quest Framework](../assets/Quest_framework.png)

Compared to InfLLM, Quest does not take heterogeneous architectures into account. Instead, it assumes that all KVCache can still fit into memory, simply leveraging sparse attention to accelerate the inference process. Ultimately, Quest achieves a 7.03x speedup in attention computation and a 2.23x improvement in end-to-end inference latency.

Going further, SnapKV proposes retaining two parts of the tokens during the prefill stage, as shown in the diagram below with the orange and green segments. The difference from InfLLM lies only in the method of selecting the middle tokens. SnapKV selects tokens at the token level rather than the block level, with the score calculation being similar to H2O, i.e., $$softmax(\frac{qk^T}{\sqrt{d_k}})$$. However, when summing across columns, only the rows within the final green window are selected for computation, corresponding to the Local Tokens section in InfLLM. Additionally, SnapKV introduces a pooling operation on top of attention, which the paper explains as ensuring that the recalled tokens retain more complete semantic information.

This approach in SnapKV involves a one-time selection during the inference phase, after which only the selected tokens are used for attention computation, while the rest of the KVCache is discarded.

![SnapKV Framework](../assets/SnapKV_framework.png)


Other related papers include PyramidKV, which observed that attention scores exhibit a pyramid-shaped distribution across attention layers. In lower attention layers, attention is widely distributed, while in higher layers, the attention scores for a few key tokens become increasingly prominent. Therefore, PyramidKV allocates more KVCache storage space to lower layers and less space to higher layers.

MagicPiG, based on Locality-Sensitive Hashing (LSH), proposes a dynamic KVCache management strategy. First, it uses SnapKV to select a portion of important tokens to be stored in the GPU, while the KVCache of other tokens is placed in memory. By leveraging the high efficiency of LSH in high-dimensional space searches and the multithreading capabilities of CPUs, MagicPiG retrieves KVCache from memory that is similar to the current query and loads it into memory for inference. Compared to the earlier methods like InfLLM, Quest, and SnapKV, MagicPiG does not need to scan all representative tokens and select the top-k KVCache. Instead, it utilizes the mathematical properties of LSH, which not only simulates attention scores but also allows for identifying important KVCache with low overhead and high speed.

The above are just descriptions of some key points. For more detailed explanations, you can refer to the existing articles on Zhihu in Chinese:

- https://zhuanlan.zhihu.com/p/701580870

- https://zhuanlan.zhihu.com/p/714288577

## KTransformers CPU Sparse Attn Framework

### Framework Prototype

Based on the introduction of the above papers, we have distilled the following key points:

- The distribution of attention weights is sparse, and useless KVCache may introduce noise, which could actually reduce performance during the inference stage.

- For the KVCache eviction strategy during the inference stage, the common approach is to retain the tokens from the beginning and the end of the prompt, while designing algorithms to select the tokens from the middle portion. One of the main factors affecting the model's performance is the ability to accurately identify the key tokens.

- Managing the middle portion of tokens in blocks can improve memory swapping and attention computation efficiency, and smaller blocks do not seem to perform worse than token-level granularity.

- The tokens that each attention layer focuses on during inference differ, and even the allocated KVCache capacity for different layers should vary.

Based on these insights and inspirations, we developed a general framework for implementing sparse CPU attention operators during the inference phase. In the prefill stage, we use chunked prefill, loading only one layer of KVCache into GPU memory at a time for computation. Once completed, the KVCache is stored on CPU/DRAM. In the subsequent decode stage, instead of swapping KVCache in and out, the sparse attention operator runs directly on the CPU. **This significantly reduces the minimum** **GPU** **memory requirements, making local 128K or even 1M token contexts possible.**

Specifically during the generation phase, we implemented the entire framework as shown in the diagram below.

![KTransformers long congtext v1](../assets/KTransformers_long_context_v1.png)

We organized the KVCache in units of blocks. Specifically:

- **KVCache Partitioning:** A complete input prompt is divided into three configurable parts: Initial, Context, and Local. During the computation process, the Initial/Local parts will be fully attended to, while the Context part will be sparsely retrieved. This approach is based on findings from many papers (such as streamingLLM and Minference) which mention the existence of "attention sinks," where higher attention weights are often found at the beginning and the end of the sequence.

- **Context Block Partitioning:** For the middle Context, we follow the InfLLM approach by dividing it into blocks based on a configurable fixed number of tokens. Each block can select 1 to k tokens as its representative tokens. During the actual inference phase, the Context blocks that require attention are selected based on these representative tokens.
  
  - Specifically, we have implemented the following methods for selecting representative tokens, based on the approaches outlined in various papers.
    
    - Max: The maximum values of multiple tokens within a block, across each channel, are concatenated to form the representative token for the current block.
    
    - Mean: The average values of multiple tokens within a block, across each channel, are concatenated to form the representative token for the current block.
    
    - Quest: A combination of the previous two methods: the maximum and minimum values of multiple tokens within a block, across each channel, are taken as the representative tokens for the block. Under this method, the number of representative tokens is fixed at 2
    
    - Dynamic: By calculating the cumulative attention score for each token using a specific method, each block selects the top-k tokens with the highest scores as the representative tokens for the block. This is similar to InfLLM but with some simplifications.
    
    - Fix: Select tokens at fixed intervals within the block.
  
  - Once the representative tokens for each block are determined, use Equation 2 from InfLLM to calculate the similarity between the input X and the k representative tokens of each block B, and only select the top $$r_k$$ blocks for attention computation, where $$l_P $$ represents the length of the historical tokens:

Since InfLLM requires calculating a representative score for each token during the prefill stage and then selecting a representative token for each block based on these scores, this operation involves invasive modifications to the prefill implementation, making it difficult to integrate with other methods. Furthermore, in actual testing, we found that in most scenarios, similar or even better results can be achieved through a combination of other methods. Therefore, we ultimately decided not to integrate this method into the framework.

## Further Optimizations

After implementing the above framework, we conducted a series of evaluations based on LongBench and InfiniteBench.

At the beginning of the experiment, we designed the architecture so that for each inference token, the most relevant KVCache blocks would be reselected. On the one hand, this strategy incurred significant overhead during the retrieval process. On the other hand, we found that in some scenarios, f**requently changing the selection of retrieved blocks did not lead to better results**. For example, in the kvretrieval dataset, we observed that the model's responses were often correct in the first half but incorrect in the second half. Since the answers to kvretrieval questions consist of long and meaningless strings, this indicates that the correct KVCache blocks were selected during the inference of the earlier tokens but incorrect blocks were chosen during the later stages of inference.

To address this issue, we further integrated the method proposed in SnapKV. Before starting the inference, we preselect relevant KVCache blocks by analyzing the attention scores of the context tokens, based on the question. During the subsequent inference stages, the selection of KVCache blocks is restricted to this preselected range. This approach allowed us to select the block containing the correct answer 100% of the time in the kvretrieval dataset.

However, it should be noted that this method strictly relies on the structure of the Benchmark Prompt and **does not necessarily guarantee optimal performance in other scenarios, such as complex document understanding and generation tasks.** Therefore, we have integrated it into our framework as an optional module. The final framework and configurable parameters are as follows:

![KTransformers long congtext v2](../assets/KTransformers_long_context_v2.png)


Configuration：

- **threads_num:** Number of CPU Threads

- **block_size:** KVCache Block Size

- **local_windows_len:** Prompt End Window Size

- **preselect_block_count:** Number of Preselected Blocks

- **second_block_count:** Number of Blocks Selected After Preselection

- **preselect_block:** Whether to Enable Preselection

- **token_step:** Interval Between Token Selections for KVCache

- **layer_step:** Interval Between Layer Selections for KVCache

- **dense_layer_num:** Number of Initial Layers Without KVCache Selection, Importing All KVCache

- **head_select_mode:SEPARATE**(In the GQA scenario, each kv_head is selected separately) / **SHARED:** (All kv_heads are selected together)

- **representative_type:** Method of Selecting Representative Tokens

- **representative_num:** Number of Representative Tokens

By modifying configuration options, various KVCache eviction or compression methods can be easily reproduced within our framework. For example:

- Setting `block_size` to 1 and `preselect_block` to True results in a version of SnapKV without the pooling operation.

- Setting `representative_type` to Quest, `preselect_block` to False, and `head_select_mode` to SEPARATE replicates the Quest method.

Below is the pseudocode for the framework:

```python
def preselect_block(local_q, kvcache):
    key_states = kvcache.keycache
    attn_scores = torch.matmul(
                local_q, key_states.transpose(2, 3)
            ) / math.sqrt(head_dim)
    attn_scores += attn_mask
    attn_scores = nn.functional.softmax(
                attn_scores, dim=-1, dtype=torch.float32
            ).to(query_states.dtype)
    vote = attn_scores[..., initial_size:-local_size:, :].sum(dim=-2)
    pool_vote = pool1d(vote, kernel_size=kernel_size, padding=kernel_size//2, stride=1)
    indices = pool_vote.topk(max_capacity_prompt - local_size, dim=-1).indices
    kv_cache_block_indices = find_representative_tokens_block(indices)
    kvcache_after_preselected = kvcache[kv_cache_block_indices]
    ...
    return kvcache_after_preselected
def get_representative_tokens():
    Calculate the representative token for each block based on the representative_type.
    return ...
def decode_attention(query, key, value):
  # Select once every token_steps tokens.
  token_steps = 4
  # Select once every layer_steps layers.
  layer_steps = 4
  for token_idx in range(max_new_tokens):
      for layer_idx in range(config.num_hidden_layers):
          if token_idx % token_steps != 0 or layer_idx % layer_steps != 0:
            # If the attention of the current layer in this round does not require reselection, the historical selection results from the kvcache will be retained.
            kvcache_after_retrieval = history_kvcache_after_retrieval[layer_idx//layer_steps]
          else:
            # Otherwise, use the query from the current round's current layer to reselect the kvcache.
            kvcache_after_retrieval = retrieval_kvcache(query, kvcache)
            # Save it to the kvcache historical selection results.
            history_kvcache_after_retrieval[layer_idx//layer_steps] = kvcache_after_retrieval
          # calculate attention
          output = attn(query, kvcache_after_retrieval)
          yield output

# Model prefill, if preselection is required, local_q still needs to be saved.
local_q, KVCache = model.prefill(input_ids)
if preselect_block:
    # Preselection round
    KVCache = preselect_block(local_q, kvcache)
# Find the representative token for each block.
block_representative_tokens = get_representative_tokens(
   kvcache,                      
   config.representative_type
)

# model generate
'''
'''
decode_attention(query, key, value)
'''
'''
```

## Experiment

At the beginning of testing, we will use the following basic configuration, which will be further optimized through the extended framework.

```python
max_seq_len: 256000 # KVCache length
block_size: 128 # KVCache block size
local_windows_len: 4096 # The KVCache of length local_windows_len is stored on the GPU.
second_block_count: 96 # After preselection, each time select the number of KVCache blocks. If >= preselect_block_count, use the preselected blocks.
threads_num: 64 # CPU thread num
representative_type: DYNAMIC # KVCache block representative token selection method.
kv_type: FP16 
dense_layer_num: 0 # The first few layers do not need to fill or select KVCache
representative_num: 1 # The number of representative tokens within a KVCache block.
preselect_block: False # Whether to preselect.
head_select_mode: SHARED # All kv_heads jointly select.
preselect_block_count: 0 # Number of preselected blocks.
layer_step: 1 # Select every few layers.
token_step: 1 # Select every few tokens.
```

Under our framework, the comparison between the original model and KTransformers after acceleration on datasets such as 128K Big Needle-in-a-Haystack, passkey, kvretrieval, etc., is as follows. The passkey dataset involves inserting a small segment of numbers at varying depths within a redundant text. kvretrieval is about finding a matching item in randomly generated key-value pairs. All tests were conducted under the opencompass framework:

![needle_128K.png](../assets/needle_128K.png)

|                                                             |                                 |         |             |
| ----------------------------------------------------------- | ------------------------------- | ------- | ----------- |
|                                                             | Single needle retrieval zh 128k | passkey | kvretrieval |
| Original model                                              | 99.89                           | 100     | 21.0        |
| KTransformers (reselect KVCache blocks for each generation) | 100                             | 100     | 15.40       |

We can see that both the original model and the accelerated KTransformers achieve perfect scores on the relatively simpler datasets, such as Single Needle Retrieval and passkey. At the same time, the generation speed has significantly improved, increasing from 4.86 tokens/s with llama.cpp to 27.49 tokens/s with KTransformers, achieving up to a 5.65x speedup. Although the current configuration shows a noticeable drop in performance on the more challenging kvretrieval dataset, in the next section, we will address this by implementing a more optimized selection strategy to compensate for or even surpass the original model's accuracy.

Additionally, we tested the performance of the KTransformers-based configuration framework in reproducing the results of Quest. However, since InternLM2.5-7B-Chat-1M uses GQA (Grouped Query Attention) while the Quest paper primarily focuses on optimizing MHA (Multi-Head Attention) models, the actual testing results were not particularly favorable. The official team also mentioned that further support for GQA models is needed, so we will not discuss this in detail for now.

### Further improve performance

By modifying certain configurations within our flexible framework on the basis of reproduction, **we can actually achieve better results than those reported in the previous paper,** as shown in the figure below:

![](../assets/Framework_effect.png)

As mentioned earlier, the goal of the kvretrieval dataset is to find a matching key-value pair within a long sequence of semantically meaningless pairs. If tokens are generated by reselecting based on the current query each time, the likelihood of deviation increases as the text grows, leading to the selection of different KVCache blocks compared to previous selections. To address this, we introduced a preselection mechanism using SnapKV to calculate the method for selecting representative tokens, which preselects a portion of the KVCache blocks. During the subsequent inference process, the selection is limited to these blocks. After one round of preselection, the score increased from 15.4 to 24.2, **surpassing the original model + full attention's performance of 21 points.** Further research indicates that the sparsity effect of the KVCache in the first few layers of LLMs is not as significant. Therefore, we set the first two layers to fully reuse the KVCache, ultimately achieving a score of **24.4**.

Similarly, when testing the needle-in-a-haystack task on the 1M dataset, we not only reproduced the original model's reported score but also further improved accuracy (**from 89.31 to 92.88**) by using the KTransformers CPU Sparse Attn Framework to selectively compute only certain KVCache blocks. Additionally, the inference speed **reached nearly 10 times that of llama.cpp**.

![needle 1M.png](../assets/needle_1M.png)

### More comparisons

As shown in the two figures below, using the Single Needle Retrieval dataset as an example, we set llama.cpp to store the KVCache on CPU/DRAM while performing all computations on the GPU. On a 4090D server, we compared the KTransformers CPU Sparse Attn Framework with llama.cpp. While maintaining **100% answer accuracy**, we achieved a 20.6 to 94.1 times prefill speed increase and a **1.2 to 7.1 times inference speed boost**.

| ![long context prefill.png](../assets/long_context_prefill.png) | ![long context generate.png](../assets/long_context_generate.png) |
| --------------------------------------------------------------- | ----------------------------------------------------------------- |

The main reason for the significant gap in prefill speed is that after enabling KVCache offload, llama.cpp performs the attention (attn) computation on the CPU. In long-text scenarios, attention not only requires heavy computation but also takes up the majority of the computation time. In contrast, KTransformers leverages a flexible template injection framework to implement GPU Chunk Prefill layer by layer. Moving forward, we plan to further integrate high-performance sparse prefill methods such as MInference to boost speed even further.

Additionally, as a key focus of this article, the right-hand graph shows that as the prompt length increases, the inference speed of KTransformers remains stable, hovering near a horizontal line. In contrast, llama.cpp slows down as the prompt length increases. By selecting only the most important 16K KVCache blocks to participate in the inference computation, KTransformers maintains a consistent inference speed comparable to llama.cpp when processing a 16K prompt, without any performance degradation (at least on these test datasets).

## How to Use

Currently, long context is only supported by our **local_chat.py** interface, and the integration with the server interface is under development.

To facilitate user management, we have uploaded the model config, gguf, and tokenizer to a repo. URL: https://huggingface.co/nilv234/internlm2_5_to_llama_1m/tree/main

By setting the model_path and gguf_path in the local_chat function to **/path/to/repo** and setting the mode to **"long_context"**, you can use the InternLM2.5-7B-Chat-1M model with 1m functionality on a 24G VRAM.

After running local_chat.py for the first time, a config.yaml file will be automatically created under ** ~/.ktransformers**. The relevant configurations for long context are as follows:

```python
chunk_size: 4096 # prefill chunk size
max_seq_len: 100000 # KVCache length
block_size: 128 # KVCache block size
local_windows_len: 4096 # The KVCache of length local_windows_len is stored on the GPU.
second_select_num: 96 # After preselection, each time select the number of KVCache blocks. If >= preselect_block_count, use the preselected blocks.
threads_num: 64 # CPU thread num
anchor_type: DYNAMIC # KVCache block representative token selection method.
kv_type: FP16
dense_layer_num: 0 # The first few layers do not need to fill or select KVCache
anchor_num: 1 # The number of representative tokens within a KVCache block.
preselect_block: False # Whether to preselect.
head_select_mode: SHARED # All kv_heads jointly select.
preselect_block_count: 96 # Number of preselected blocks.
layer_step: 1 # Select every few layers.
token_step: 1 # Select every few tokens.
```

The memory required for different context lengths is shown in the table below:

|                | 4K  | 32K  | 64K  | 128K | 512K | 1M     |
| -------------- | --- | ---- | ---- | ---- | ---- | ------ |
| DRAM Size (GB) | 0.5 | 4.29 | 8.58 | 17.1 | 68.7 | 145.49 |

Please choose an appropriate max_seq_len based on your DRAM size.
For example:
```python
python local_chat.py --model_path="/data/model/internlm2_5_to_llama_1m"  --gguf_path="/data/model/internlm2_5_to_llama_1m" --max_new_tokens=500 --cpu_infer=10  --use_cuda_graph=True  --mode="long_context" --prompt_file="/path/to/file"
```

If you've already specified the input text via the prompt_file, just press Enter when the terminal displays chat: to begin.


================================================
FILE: doc/en/long_context_tutorial.md
================================================
## How to use ktransformers long context framework

Currently, long context is only supported by our **local_chat.py** interface, and the integration with the server interface is under development.

To facilitate user management, we have uploaded the model config, gguf, and tokenizer to a repo. URL: https://huggingface.co/nilv234/internlm2_5_to_llama_1m/tree/main

By setting the model_path and gguf_path in the local_chat function to **/path/to/repo** and setting the mode to **"long_context"**, you can use the InternLM2.5-7B-Chat-1M model with 1m functionality on a 24G VRAM.

After running local_chat.py for the first time, a config.yaml file will be automatically created under ** ~/.ktransformers**. The relevant configurations for long context are as follows:

```python
chunk_size: 4096 # prefill chunk size
max_seq_len: 100000 # KVCache length
block_size: 128 # KVCache block size
local_windows_len: 4096 # The KVCache of length local_windows_len is stored on the GPU.
second_select_num: 96 # After preselection, each time select the number of KVCache blocks. If >= preselect_block_count, use the preselected blocks.
threads_num: 64 # CPU thread num
anchor_type: DYNAMIC # KVCache block representative token selection method.
kv_type: FP16
dense_layer_num: 0 # The first few layers do not need to fill or select KVCache
anchor_num: 1 # The number of representative tokens within a KVCache block.
preselect_block: False # Whether to preselect.
head_select_mode: SHARED # All kv_heads jointly select.
preselect_block_count: 96 # Number of preselected blocks.
layer_step: 1 # Select every few layers.
token_step: 1 # Select every few tokens.
```

The memory required for different context lengths is shown in the table below:

|                | 4K  | 32K  | 64K  | 128K | 512K | 1M     |
| -------------- | --- | ---- | ---- | ---- | ---- | ------ |
| DRAM Size (GB) | 0.5 | 4.29 | 8.58 | 17.1 | 68.7 | 145.49 |

Please choose an appropriate max_seq_len based on your DRAM size.
For example:
```python
python local_chat.py --model_path="/data/model/internlm2_5_to_llama_1m"  --gguf_path="/data/model/internlm2_5_to_llama_1m" --max_new_tokens=500 --cpu_infer=10  --use_cuda_graph=True  --mode="long_context" --prompt_file="/path/to/file"
```

If you've already specified the input text via the prompt_file, just press Enter when the terminal displays chat: to begin.

================================================
FILE: doc/en/makefile_usage.md
================================================
# Makefile
## Target
### flake_find:
```bash
make flake_find
```
find all the python files under ./ktransformers dir and find the Error, Warning, Fatal... (their codes) into a list that are not consistent with the pep8 standard. For now we have get all this list in the .flake8 file's extend-ignore section in order to let flakes8 ignore them temporarily.(we may improve them in the future)
### format:
```bash
make format
```
we use black to format all the python files under ./ktransformers dir. It obeys the pep8 standard 
but we modify the line length to 120 by add 
```toml
[tool.black]
line-length = 120
preview = true
unstable = true
```
in the pyproject.toml file.

### dev_install:
```bash
make dev_install
```
install the package in the development mode. It means that the package is installed in the editable mode. So if you modify the code, you don't need to reinstall the package. We recommend the developer to use this method to install the package.

================================================
FILE: doc/en/multi-gpu-tutorial.md
================================================

# Muti-GPU

Assume you have read the [Injection Tutorial](./injection_tutorial.md) and have a basic understanding of how to inject a model. In this tutorial, we will show you how to use KTransformers to run a model on multiple GPUs.

If you have multiple GPUs, you can set the device for each module to different GPUs. 
DeepseekV2-Chat got 60 layers, if we got 2 GPUs, we can allocate 30 layers to each GPU. Complete multi GPU rule examples [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml).


<p align="center">
  <picture>
    <img alt="Inject-Struction" src="../assets/multi_gpu.png" width=60%>
  </picture>
</p>

First of all, for multi-GPU, we have to inject an new operator `KDeepseekV2Model`. And set division of the layers to different GPUs. For our case, we have to set the `transfer_map` in the `KDeepseekV2Model` operatoras as follows:

```yaml
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      transfer_map: 
        30: "cuda:1"
```

And we have to set the device for each module in the model. 

For example, for `routed experts`, the yaml for one GPU is:
```yaml
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # Custom MoE kernel with expert parallelism
    kwargs:
      generate_device: "cuda:0"
      generate_op: "MLPCUDAExperts"
      out_device: "cuda:0"
  recursive: False # Don't recursively inject submodules of this module
```
But for two GPUs, we need to set the device for each module in the model. 

```yaml
# allcate 0-29 layers‘s out_device to cuda:0
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

# allocate 30-59 layers‘s out_device to cuda:1
- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module
```
For other modules, we can set the device in the same way.

# How to fully utilize multi-GPU's VRAM

When you have multiple GPUs, you can fully utilize the VRAM of each GPU by moving more weights to the GPU.

For example, for DeepSeekV2-Chat, we can move the weights of the experts to the GPU. 

For example, the yaml for two GPUs is:
```yaml
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False
```

But we got extra 60GB VRAM on cuda:0, we can move experts in layer 4~8 to cuda:0. 

```yaml
# Add new rule before old rule.
- match:
    name: "^model\\.layers\\.([4-8])\\.mlp\\.experts$" # inject experts in layer 4~8 as marlin expert
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  
    kwargs:
      generate_device: "cuda:0"
      generate_op:  "KExpertsMarlin"
  recursive: False

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     
    kwargs:
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False 
```

Adjust the layer range as you want. Note that:
* The loading speed will be significantly slower for each expert moved to the GPU.
* You have to close the cuda graph if you want to move the experts to the GPU.
* For DeepSeek-R1/V3, each expert moved to the GPU will consume approximately 6GB of VRAM.
* The first matched rule in yaml will be applied. For example, if you have two rules that match the same layer, only the first rule's replacement will be valid.


================================================
FILE: doc/en/operators/llamafile.md
================================================
# Llamafile Operators Documentation

## Llamafile Sgemm

The Llamafile Sgemm module is an efficient implementation of general matrix multiplication (GEMM) extracted from the great [Llamafile project](https://github.com/Mozilla-Ocho/llamafile/blob/main/llamafile/sgemm.cpp). 
This module optimizes performance by utilizing various processor-specific instruction sets. For instance, it checks for different x86 instruction sets such as AVX, FMA, and AVX512, leveraging these advanced instructions to accelerate computation. 
Additionally, the Llamafile Sgemm module supports multiple quantization types, including q8_0, q6_k, and q5_k, among others. This adaptability to different hardware capabilities ensures the most advanced instructions are used in any given computing environment, achieving high computational efficiency. For more information, you can view the [Llamafile Sgemm module](https://github.com/Mozilla-Ocho/llamafile/blob/main/llamafile/sgemm.cpp) on GitHub.


## CPUInfer
To power Llamafile and many future CPU kernels without the original GGML framework, we developed a simple CPUInfer multi-threaded execution framework. It currently leverages the Llamafile Sgemm module to implement key operators such as linear layers, MLP, and MoE, and will be extended to support many other operators. These operators are fundamental components for building large models. CPUInfer features a backend work-stealing thread pool and asynchronous task queue execution logic to efficiently offload parts of model parameters to the CPU, thereby maintaining high inference performance. It supports adjustments based on hardware capabilities or user configurations, providing enhanced inference performance and making it an ideal tool for running deep learning models on CPUs.

## Expert-Parallel MoE

The MoE module's performance can be enhanced by using custom kernels that utilize **expert parallelism**. Since the routed experts are independently computable, we can utilize this inherent parallelism to speed up MoE computations. Specifically, we can allocate each expert MLP to a separate thread group, allowing for the simultaneous computation of all routed experts. This approach of expert parallelism significantly boosts MoE performance by minimizing the frequency of global synchronizations and reducing kernel launch overhead compared to sequential expert computation.

## Microbenchmark

Our evaluations were conducted on an Intel(R) Xeon(R) Gold 6454S processor, utilizing real parameters from the DeepSeek-Coder-V2-Instruct model.

### Linear Projection

The performance of the linear layer was assessed using an Attention Output Projection with dimensions of [5120, 16384]. Here, the input was a vector of 16384 dimensions, and the output was a vector of 5120 dimensions.

![Linear_projection_time](Linear_projection_time.png)

As we can see, in half-precision floating-point formats (fp16 and bf16), CPUInfer's performance exceeded that of Torch by 1.7 and 1.5 times, respectively. For 8-bit quantization, CPUInfer (supporting q8_0) and Torch (supporting qint8) demonstrated nearly equivalent performance. However, CPUInfer employs a more refined scaling approach, using different factors for each group (in q8_0 quantization, every 32 numbers form one group), whereas Torch uses a basic per-tensor quantization, potentially leading to significant precision loss. Furthermore, CPUInfer’s capability to use lower-bit quantization enhances inference speed in specific scenarios.

### MoE

In the MoE module, each token selected 6 experts out of 160 for computation, with input and output dimensions of 5120, and an intermediate dimension of 1536.

![Combined_MoE_time_per_layer](Combined_MoE_time_per_layer.png)

For half-precision floating points and 8-bit quantization formats, CPUInfer's generation performance was 2.5 and 3.2 times better than Torch, respectively. Moreover, using the 8-bit quantization format, CPUInfer achieved faster prefill speeds compared to Torch, with shorter prompts highlighting a more pronounced performance difference.


================================================
FILE: doc/en/prefix_cache.md
================================================
## Enabling Prefix Cache Mode in KTransformers

Balance serve now supports prefix cache reuse! To enable **Prefix Cache Mode** in KTransformers, you need to modify the configuration file and recompile the project. 

### Step 1: Modify the Configuration File

Edit the `./ktransformers/configs/config.yaml` file with the following content (you can adjust the values according to your needs):

```yaml
attn:
  page_size: 16 # Size of a page in KV Cache.
  chunk_size: 256
kvc2:
  gpu_only: false # Set to false to enable prefix cache mode (Disk + CPU + GPU KV storage)
  utilization_percentage: 1.0
  cpu_memory_size_GB: 500 # Amount of CPU memory allocated for KV Cache
  disk_path: /mnt/data/kvc # Path to store KV Cache on disk
```

### Step 2: Update Submodules and Recompile

If this is your first time using prefix cache mode, please update the submodules first:

```bash
git submodule update --init --recursive # Update PhotonLibOS submodule
```

Then recompile the project:

```bash
# Install single NUMA dependencies
USE_BALANCE_SERVE=1  bash ./install.sh
# For those who have two cpu and 1T RAM（Dual NUMA）:
USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh
```

## Note
Balance serve utilizes a 3-layer (GPU-CPU-Disk) scheme to store and reuse KVCache. Deleting KVCache is not supported now. If you have too much KVCache, you can simply delete them by remove kvcache files. 


================================================
FILE: doc/en/xpu.md
================================================
# Intel GPU Support for KTransformers (Beta)

## Introduction

### Overview
We are excited to introduce **Intel GPU support** in KTransformers (Beta release). This implementation has been tested and developed using Intel Xeon Scalable processors and Intel Arc GPUs (such as A770 and B580).

## Installation Guide

### 1. Install Intel GPU Driver
Begin by installing the GPU drivers for your Intel GPU:
- [Official GPU Installation Guide for Intel GPUs](https://dgpu-docs.intel.com/driver/overview.html)

To verify that the kernel and compute drivers are installed and functional:

```bash
clinfo --list | grep Device
 `-- Device #0: 13th Gen Intel(R) Core(TM) i9-13900K
 `-- Device #0: Intel(R) Arc(TM) A770 Graphics
 `-- Device #0: Intel(R) UHD Graphics 770
```

> [!Important]
> Ensure that **Resizable BAR** is enabled in your system's BIOS before proceeding. This is essential for optimal GPU performance and to avoid potential issues such as `Bus error (core dumped)`. For detailed steps, please refer to the official guidance [here](https://www.intel.com/content/www/us/en/support/articles/000090831/graphics.html).

### 2. Set Up Conda Environment
We recommend using Miniconda3/Anaconda3 for environment management:

```bash
# Download Miniconda
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh

# Create environment
conda create --name ktransformers python=3.11
conda activate ktransformers

# Install required libraries
conda install -c conda-forge libstdcxx-ng

# Verify GLIBCXX version (should include 3.4.32)
strings ~/anaconda3/envs/ktransformers/lib/libstdc++.so.6 | grep GLIBCXX
```

> **Note:** Adjust the Anaconda path if your installation directory differs from `~/anaconda3`

### 3. Install PyTorch and IPEX-LLM
Install PyTorch with XPU backend support and [IPEX-LLM](https://github.com/intel/ipex-llm):

```bash
pip install ipex-llm[xpu_2.6]==2.3.0b20250518 --extra-index-url https://download.pytorch.org/whl/xpu
pip uninstall torch torchvision torchaudio
pip install torch==2.7+xpu torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu # install torch2.7
pip uninstall intel-opencl-rt dpcpp-cpp-rt
```

### 4. Build ktransformers

```bash
# Clone repository
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule update --init

# Install dependencies
bash install.sh --dev xpu
```

## Running DeepSeek-R1 Models

### Configuration for 16B VRAM GPUs
Use our optimized configuration for constrained VRAM:

```bash
export SYCL_CACHE_PERSISTENT=1
export ONEAPI_DEVICE_SELECTOR=level_zero:0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1

python ktransformers/local_chat.py \
  --model_path deepseek-ai/DeepSeek-R1 \
  --gguf_path <path_to_gguf_files> \
  --optimize_config_path ktransformers/optimize/optimize_rules/xpu/DeepSeek-V3-Chat.yaml \
  --cpu_infer <cpu_cores + 1> \
  --device xpu \
  --max_new_tokens 200
```

## Known Limitations
- Serving function is not supported on Intel GPU platform for now

## Troubleshooting
1. Best Known Config (BKC) to obtain best performance

To obtain best performance on Intel GPU platform, we recommend to lock GPU frequency and set CPU to performance mode by below settings.
```bash
echo "performance" | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
echo 0 | sudo tee /sys/devices/system/cpu/cpu*/power/energy_perf_bias
# 2400 is max frequency for Arc A770
sudo xpu-smi config -d 0 -t 0 --frequencyrange 2400,2400
# 2850 is max frequency for Arc B580
# sudo xpu-smi config -d 0 -t 0 --frequencyrange 2850,2850
```

2. Runtime error like `xpu/sycl/TensorCompareKernels.cpp:163: xxx. Aborted (core dumped)`

This error is mostly related to GPU driver. If you meet such error, you could update your `intel-level-zero-gpu` to `1.3.29735.27-914~22.04` (which is a verified version by us) by below command.
```bash
wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \
sudo gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy client" | \
sudo tee /etc/apt/sources.list.d/intel-gpu-jammy.list
sudo apt update
# or sudo apt update --allow-insecure-repositories
sudo apt install intel-level-zero-gpu=1.3.29735.27-914~22.04
```

3. `ImportError: cannot import name 'intel' from 'triton._C.libtriton'`

Installing Triton causes pytorch-triton-xpu to stop working. You can resolve the issue with following command:
```bash
pip uninstall triton pytorch-triton-xpu
# Reinstall correct version of pytorch-triton-xpu
pip install pytorch-triton-xpu==3.3.0 --index-url  https://download.pytorch.org/whl/xpu
```

4. `ValueError: Unsupported backend: CUDA_HOME ROCM_HOME MUSA_HOME are not set and XPU is not available.`

Ensure you have permissions to access /dev/dri/renderD*. This typically requires your user to be in the render group:
```bash
sudo gpasswd -a ${USER} render
newgrp render
```

## Additional Information
To run KTransformers on XPU with Docker, please refer to [Docker_xpu.md](./Docker_xpu.md).


================================================
FILE: doc/zh/DeepseekR1_V3_tutorial_zh.md
================================================
<!-- omit in toc -->

# GPT-4/o1 级别本地 VSCode Copilot 在仅 24GB 显存的台式机上的表现

- [摘要](#摘要)
  - [先决条件](#先决条件)
  - [基准测试结果](#基准测试结果)
    - [V0.2](#v02)
      - [设置](#设置)
      - [内存占用](#内存占用)
      - [基准测试结果](#基准测试结果)
    - [V0.3-Preview](#V0.3-Preview)
      - [设置](#设置-1)
      - [内存占用](#内存占用-1)
      - [基准测试结果](#基准测试结果-1)
  - [如何运行](#如何运行)
    - [V0.2 展示](#v02-展示)
      - [单插槽版本 (32 核心)](#单插槽版本（32 核心）)
      - [双插槽版本 (64 核心)](#双插槽版本（64 核心）)
    - [V0.3 展示](#v03-展示)
      - [双插槽版本 (64 核心)](#双插槽版本（64 核心）-1)
  - [一些解释](#一些解释)
  - [常见问题解答](#常见问题解答)
    - [R1 不思考](#R1 不返回思考过程)
    - [更多常见问题解答](#更多常见问题解答)

# 摘要

> **2025年2月10日**: 支持在单个（24GB 显存）/多个 GPU 和 382GB 内存上运行 DeepseekR1 和 V3，速度提升高达 3~28 倍。<br>

嗨，我们是 KTransformers 团队（以前因本地 CPU/GPU 混合推理开源项目 DeepSeek-V2 而闻名）。

我们听到了您对 DeepSeek-R1/V3 支持的请求——我们很高兴终于可以交付了！很抱歉让您久等了，但我们一直在酝酿一些真正令人惊叹的东西！

今天，我们自豪地宣布，我们不仅支持 DeepSeek-R1/V3，如下视频所示：

https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285

</p>

- **[NEW!!!] 本地 671B DeepSeek-Coder-V3/R1:** 仅使用 14GB 显存和 382GB 内存运行其 Q4_K_M 版本。
  - 预填充(Prefill)速度 (tokens/s):
    - KTransformers: 54.21 (32 核心) → 74.362 (双插槽，2×32 核心) → 255.26 (优化的 AMX 基 MoE 内核，仅 V0.3) → 286.55 (选择性使用 6 个专家，仅 V0.3)
    - 与 llama.cpp 在 2×32 核心下 10.31 tokens/s 相比，速度提升高达 **27.79 倍**
  - 解码(Decode)速度 (tokens/s):
    - KTransformers: 8.73 (32 核心) → 11.26 (双插槽， 2×32 核心) → 13.69 (选择性使用 6 个专家，仅 V0.3)
    - 与 llama.cpp 在 2×32 核心下 4.51 tokens/s 相比，速度提升高达 **3.03 倍**

我们还提供了即将推出的优化预览，包括英特尔 AMX 加速内核和选择性专家激活方法，这将显著提升性能。通过 V0.3 预览版，我们在预填充方面实现了高达 286 tokens/s 的速度，比本地推理的 llama.cpp **快 28 倍**。二进制发行版现已可用，源代码即将推出！请查看 wheel 包 [此处](https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl) 。

## 先决条件

我们在以下配置下进行了最佳性能测试（V0.2）： <br>
CPU: Intel (R) Xeon (R) Gold 6454S 1T 内存 (2 NUMA 节点) <br>
GPU: 4090D 24G 显存 <br>
内存: 标准 DDR5-4800 服务器内存 (1 TB)

## 基准测试结果

### V0.2

#### 设置

- Model: DeepseekV3-q4km (int4)<br>
- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S，每个插槽 32 核心，2 个插槽，2 个 NUMA 节点
- GPU: 4090D 24G 显存
- 我们在充分预热后进行测试

#### 内存占用:

- 单插槽: 382G 内存，至少 14GB 显存
- 双插槽: 1T 内存，至少 14GB 显存

#### 基准测试结果

“6 个专家” 情况是 V0.3 预览版中内容


| Prompt<br>(500 tokens)  | 双插槽 Ktrans (6 个专家) | 双插槽 Ktrans (8 个专家) | Single socket Ktrans (6 个专家) | Single socket Ktrans (8 个专家) | llama.cpp (8 个专家) |
| ----------------------- | ------------------------ | ------------------------ | ------------------------------- | ------------------------------- | -------------------- |
| 预填充(Prefill) token/s | 97.32                    | 82.94                    | 65.14                           | 54.21                           | 10.31                |
| 解码(Decode) token/s    | 13.69                    | 12.208                   | 10.303                          | 8.73                            | 4.51                 |

**最高加速比在解码方面达到 <u>3.03x</u> 倍，在预填充方面达到 <u>9.44x</u> 倍。**

### V0.3-Preview

#### 设置

- Model: DeepseekV3-BF16 (在线量化为 CPU 的 int8 和 GPU 的 int4)
- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S，每个插槽 32 核心，2 个插槽，2 个 NUMA 节点
- GPU: (1~4)x 4090D 24G 显存 (更长的 prompt 需要更多显存)

#### 内存占用:

- 644GB 内存，至少 14GB 显存

#### 基准测试结果


| Prompt length                     | 1K     | 2K     | 4K     | 8K     |
| --------------------------------- | ------ | ------ | ------ | ------ |
| KTrans (8 个专家) Prefill token/s | 185.96 | 255.26 | 252.58 | 195.62 |
| KTrans (6 个专家) Prefill token/s | 203.70 | 286.55 | 271.08 | 207.20 |

**KTrans V0.3 的预填充速度比 KTrans V0.2 快 <u>3.45x</u> 倍，比 llama.cpp 快 <u>27.79x</u> 倍。**
**解码速度与 KTrans V0.2（6 个专家版本）相同，因此省略。**

主要加速来自于

- 英特尔 AMX 指令集和我们专门设计的缓存友好内存布局
- 专家选择策略，根据离线配置文件结果选择更少的专家

*从我们对 DeepSeekV2、DeepSeekV3 和 DeepSeekR1 的研究中，当我们略微减少推理中的激活专家数量时，输出质量没有变化。但解码和预填充的速度加快了，这令人鼓舞。因此，我们的展示利用了这一发现。*

## 如何运行

### 多并发展示

多并发需要额外编译调度器 c++ 代码

```shell
sudo apt install libtbb-dev libssl-dev libcurl4-openssl-dev libaio1 libaio-dev libfmt-dev
sudo apt-get install libgflags-dev zlib1g-dev patchelf
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule update --init --recursive
# 如果使用双 numa 版本
USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh
# 如果使用单 numa 版本
USE_BALANCE_SERVE=1 bash ./install.sh
# 启动命令
python ktransformers/server/main.py --model_path <your model path> --gguf_path <your gguf path> --cpu_infer 62 --optimize_config_path <inject rule path> --port 10002 --chunk_size 256 --max_new_tokens 1024 --max_batch_size 4 --port 10002 --cache_lens 32768 --backend_type balance_serve
```

`<your model path>` 可以是本地路径，也可以是在线路径，例如 deepseek-ai/DeepSeek-V3。如果在线连接出现问题，可以尝试使用镜像（hf-mirror.com） <br>
`<your gguf path>` 也可以是在线路径，但由于其体积较大，我们建议您下载并量化模型（注意这是目录路径）

`<inject rule path>` 注入规则 yaml 文件地址，我们在 `ktransformers/optimize/optimize_rules/ ` 目录下提供了 `DeepSeek-V3-Chat-serve.yaml` 和 `DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml` 分别对应 [`DeepSeek-V3/R1-q4km`](https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-Q4_K_M) 和 [`DeepSeek-V3/R1-hybrid`](https://huggingface.co/KVCache-ai/DeepSeek-R1-GGML-FP8-Hybrid/tree/main)

`--max_new_tokens 1000` 是最大输出 token 长度。如果发现答案被截断，可以增加此数字以获得更长的答案（但要注意内存不足问题，增加此数字会降低生成速度）.

`--chunk_size 256` 引擎单次运行最大 token 个数

`--cache_lens 32768`  调度器申请 kvcache 的总长度。所有请求共享 32768 个 tokens 对应 kvcache 空间，请求完成后会释放其所占用的 kvcache 空间。

`--backend_type balance_serve` `balance_serve`是 v0.2.4新增的后端引擎，原本的单并发引擎为`ktransformers`

`--max_batch_size 4` 引擎单次运行最多处理 4 个请求(prefill + decode),(仅用于`balance_serve`)

<br>命令 numactl -N 1 -m 1 的目的是避免 NUMA 节点之间的数据传输<br>
注意！如果测试 R1 可能会跳过思考。因此，可以添加参数：`--force_think`，这在 [常见问题解答](#常见问题解答) 部分中解释。

### V0.2 展示

#### 单插槽版本（32 核心）

我们的 local_chat 测试命令是:

```shell
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
git submodule update
numactl -N 1 -m 1 python ./ktransformers/local_chat.py --model_path <your model path> --gguf_path <your gguf path>  --prompt_file <your prompt txt file>  --cpu_infer 33 --max_new_tokens 1000
<当您看到聊天时，按回车键加载文本提示文件>
```

#### 双插槽版本（64 核心）

在安装之前（使用 install.sh 或 `make dev_install`），请确保设置环境变量 `USE_NUMA=1`，方法是 `export USE_NUMA=1`（如果已经安装，请重新安装并设置此环境变量） <br>
我们的 local_chat 测试命令是：

```shell
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
git submodule update
export USE_NUMA=1
make dev_install # or sh ./install.sh
python ./ktransformers/local_chat.py --model_path <your model path> --gguf_path <your gguf path>  --prompt_file <your prompt txt file>  --cpu_infer 65 --max_new_tokens 1000
<当您看到聊天时，按回车键加载文本提示文件>
```

参数的含义相同。但因为我们使用双插槽，所以将 cpu_infer 设置为 65。

### V0.3 展示

#### 双插槽版本（64 核心）

我们的 local_chat 测试命令是：

```shell
wget https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl
pip install ./ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl
python -m ktransformers.local_chat --model_path <your model path> --gguf_path <your gguf path>  --prompt_file <your prompt txt file>  --cpu_infer 65 --max_new_tokens 1000
<当您看到聊天时，按回车键加载文本提示文件>
```

参数的含义与 V0.2 相同。但因为我们使用双插槽，所以将 cpu_infer 设置为 65。

## 一些解释

1. 我们还想进一步利用 Xeon Gold CPU 上的两个 NUMA 节点。为了避免节点之间的数据传输成本，我们在两个节点上 "copy" 了关键矩阵，这会增加内存占用，但会加速预填充和解码过程。但这种方法占用大量内存，加载权重时速度较慢，因此加载时请耐心等待并监控内存使用情况。我们计划优化这一巨大的内存开销。敬请期待。
2. 命令参数 `--cpu_infer 65` 指定使用多少核心（超过物理核心数量是可以的，但并不是越多越好。根据实际核心数量适当降低此值）。<br>
3. 为什么使用 CPU/GPU 混合推理？
   DeepSeek 的 MLA 操作符计算密集。虽然全部在 CPU 上运行是可行的，但将繁重的计算任务卸载到 GPU 上能带来巨大的性能提升。
4. 加速来自哪里？

   - 专家卸载：与传统的基于层或 KVCache 卸载（如 llama.cpp 中的）不同，我们将专家计算卸载到 CPU，将 MLA/KVCache 卸载到 GPU，与 DeepSeek 的架构完美对齐，实现最佳效率。
   - 英特尔 AMX 优化 – 我们的 AMX 加速内核经过精心调优，运行速度是现有 llama.cpp 实现的数倍。我们计划在清理后开源此内核，并考虑向 llama.cpp 上游贡献代码。
5. 为什么选择英特尔 CPU？
   英特尔目前是唯一支持 AMX 类似指令的 CPU 供应商，与仅支持 AVX 的替代方案相比，性能显著更好。

## 常见问题解答

### R1 不返回思考过程

注意！如果测试 R1 可能会跳过思考。因此，可以添加参数：`--force_think true`。详细信息在 [常见问题解答](./FAQ.md) 部分中。 <br>

## 问题

* 修复服务器集成功能以实现网络API访问支持
* 修复本地聊天功能仅支持单行提示输入的问题（目前输入换行符(\n)即开始生成提示）

### 更多常见问题解答

[详见](./FAQ.md)


================================================
FILE: doc/zh/DeepseekR1_V3_tutorial_zh_for_Ascend_NPU.md
================================================
# 基准测试结果

在 Batchsize=4、输出长度为 1024 的条件下，性能测试结果如下：

| Prompt length                     | 1K     | 2K     | 4K     |
| --------------------------------- | ------ | ------ | ------ |
| KTrans Prefill token/s | 174.68 | 169.52 | 167.15 |
| KTrans Decode token/s | 16.07 | 16.12 | 16.48 |

## 先决条件
我们在以下配置下进行了Deepseek-R1最佳性能测试：
- 服务器型号：Atlas 2UP
- NPU：Atlas 300I A2
- CPU: HUAWEI Kunpeng 920 7270Z
- 内存: DDR5服务器内存（1TB）

# 部署

## 物理机安装

部署满血版Deepseek-R1/V3，需要机器物理内存能够存放下全部路由专家的权重，约400GB。

目前支持的NPU型号：**300I A2**。

在技术人员的支持下完成硬件安装。

## 系统安装

根据网页[昇腾兼容性查询助手](https://www.hiascend.com/hardware/compatibility)查询，选用系统Ubuntu 22.04 for aarch64，内核5.15.0-25-generic，并禁止系统自动更新。系统镜像获取链接：[ubuntu-old-releases](https://mirrors.aliyun.com/oldubuntu-releases/releases/22.04)。

## HDK安装

选择[Ascend HDK 25.3.RC1](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=32&cann=8.3.RC1&driver=Ascend+HDK+25.3.RC1)进行安装，安装方式参考[昇腾社区HDK安装指导](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/83RC1alpha003/softwareinst/instg/instg_0005.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit)。


## 镜像部署

建议使用昇腾MindIE镜像[昇腾社区镜像下载](https://www.hiascend.com/developer/ascendhub/detail/af85b724a7e5469ebd7ea13c3439d48f)部署开发环境，选择2.2.RC1-800I-A2-py311-openeuler24.03-lts下载。

下载完成镜像后，执行以下命令启动容器：

```bash
docker run -it -d --net=host --shm-size=500g \
       --name <container-name> \
       -w /workspace \
       --device=/dev/davinci_manager \
       --device=/dev/hisi_hdc \
       --device=/dev/devmm_svm \
       -v /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro \
       -v /usr/local/dcmi:/usr/local/dcmi:ro \
       -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi:ro \
       -v /usr/local/sbin/:/usr/local/sbin:ro \
       -v <path_to_your_project>:/workspace \
       mindie:2.2.RC1-800I-A2-py311-openeuler24.03-lts bash
```

进入容器

```bash
docker exec -it <container-name> /bin/bash
```

部署Python环境：

```bash
yum install zlib1g-dev libtbb-dev libssl-dev libaio-dev libcurl4-openssl-dev
pip3 install numpy==1.26.4  # 适配torch/torch_npu
pip3 install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu
pip3 install packaging ninja fire protobuf attrs decorator cloudpickle ml-dtypes scipy tornado absl-py psutil
pip3 install sqlalchemy
pip3 install transformers==4.57.1 #此处注意运行时transformers版本要求4.57.1(其他版本未验证)
#pip3 install cpufeature  # only for x86
```

## CANN安装

选择[CANN 8.3.RC1.alpha003](https://www.hiascend.com/developer/download/community/result?cann=8.3.RC1.alpha003&product=4&model=32)进行安装，安装方式参考[昇腾社区CANN安装指导](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/83RC1alpha003/softwareinst/instg/instg_quick.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit)。

需要安装ToolKit，Kernel和NNAL。

## torch_npu安装

获取最新的仓库代码：[torch_npu Gitcode](https://gitcode.com/Ascend/pytorch)

由于涉及新增算子，公网pypi内提供的torch_npu暂时无法直接使用，可以下载代码仓库编译，当前适配分支为v2.5.1，编译命令可以参考仓库内文档。
编译过程需要保证访问github，gitcode等平台网络畅通并设置如下环境变量：

```bash
source /usr/local/Ascend/ascend-toolkit/set_env.sh  # 以实际CANN安装路径为准
source /usr/local/Ascend/nnal/atb/set_env.sh  # 以实际NNAL安装路径为准
```
由于环境对于torch_npu版本号有特定要求，使用编译后的torch_npu包需要手动移除版本信息中的哈希后缀，操作如下：
使用文本编辑器打开`/usr/local/lib/python3.11/site-packages/torch_npu/version.py`(不同环境python路径可能不同，可以使用`pip show torch_npu`查看安装的python路径)
将`__version__ = '2.5.1.post4+git69550dfc'`改为`__version__ = '2.5.1.post4'`


## 权重准备

目前，为了满足性能和精度的要求，我们需要准备两份权重，并使用提供的权重合并脚本对权重进行合并，最终只会使用合并后的权重。

Q4权重：[DeepSeek-R1-Q4_K_M](https://modelscope.cn/models/unsloth/DeepSeek-R1-GGUF/files)

W8A8权重：[DeepSeek-R1-W8A8](https://modelers.cn/models/State_Cloud/DeepSeek-R1-W8A8)

使用[merge_safetensor_gguf.py](../../merge_tensors/merge_safetensor_gguf.py)来合并Q4和W8A8权重：

```bash
python merge_safetensor_gguf.py --safetensor_path /mnt/weights/DeepSeek-R1-Q4_K_M --gguf_path /mnt/weights/DeepSeek-R1-W8A8 --output_path /mnt/weights/DeepSeek-R1-q4km-w8a8
```

## 图下沉部署

开启图下沉功能，需要添加如下环境变量：

```bash
export TASK_QUEUE_ENABLE=0  # 保证算子下发顺序有序
```


## kTransformers部署

将项目文件部署到机器上：

- 初始化third_party。由于此过程耗时较多，且容易受网络影响导致仓库克隆失败，建议初始化一次后，将相关文件进行打包，以便后续直接解压使用。
  ```bash
  git clone https://github.com/kvcache-ai/ktransformers.git
  cd ktransformers
  git submodule update --init --recursive
  ```
- 对于arm平台，注释掉`./third_party/llamafile/iqk_mul_mat_arm82.cpp`中的
  ```cpp
  #define iqk_mul_mat iqk_mul_mat_arm82
  #define iqk_mul_mat_moe iqk_mul_mat_moe_arm82
  ```
- 执行`source /usr/local/Ascend/ascend-toolkit/set_env.sh`（以实际CANN-TOOLKIT安装路径为准）。
- 执行`apt install cmake libhwloc-dev pkg-config`安装依赖。
- 修改项目目录下 /ktransformers/config/config.yaml 中attn部分的page_size: 128  chunk_size: 16384
- 执行`USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh`，等待安装完成。

此处给出示例balance_serve的启动脚本（由于使用了相对路径，需将该脚本放至项目的根路径下）：

```bash
#!/bin/bash
export USE_MERGE=0
export INF_NAN_MODE_FORCE_DISABLE=1
export TASK_QUEUE_ENABLE=0
export RANK=0
export LOCAL_WORLD_SIZE=1
#export PROF_DECODE=1
#export PROF_PREFILL=1

source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh

python ktransformers/server/main.py \
--port 10002 \
--model_path <your model path> \
--gguf_path <your model path> \
--model_name DeepSeekV3ForCausalLM \
--cpu_infer 100 \
--optimize_config_path  ./ktransformers/optimize/optimize_rules/npu/DeepSeek-V3-Chat-300IA2-npu-serve.yaml \
--max_new_tokens 1024 \
--cache_lens 20480 \
--max_batch_size 4 \
--use_cuda_graph \
--tp 1 \
--backend_type balance_serve
```

相关参数说明：

- `--model_path`：kTransformers原生参数，str，此处用来指定合并后的模型文件路径
- `--gguf_path`：kTransformers原生参数，str，此处用来指定合并后的模型文件路径
- `--cpu_infer`：kTransformers原生参数，int，用来控制CPU侧实际worker线程数，非必选
- `--optimize_config_path`：kTransformers原生参数，str，用来指定所用的模型优化配置文件，需要注意相对路径的使用，此处为**必选**
- `--cache_lens`：调度器申请 kvcache 的总长度。所有请求共享指定数量（例如 `20480`）的 tokens 对应的 kvcache 空间，请求完成后会释放其所占用的 kvcache 空间，非必选
- `--use_cuda_graph`：kTransformers原生参数，bool，为True表示开启图下沉，为False表示关闭图下沉，非必选
- `--max_new_tokens`：kTransformers原生参数，int，当统计到输出的tokens数量达到该值时，会直接中止输出，非必选
- `--tp`：新增参数，int，用于开启tensor model parallel功能，目前local_chat只支持tp大小与ws大小相同（不支持local_chat使用多dp），非必选


# 其他问题

## 可能存在的其他依赖问题

ImportError: libhccl.so: cannot open shared object file: No such file or directory

```bash
source /usr/local/Ascend/ascend-toolkit/set_env.sh  # 以实际CANN安装路径为准
```

ImportError: libascend_hal.so: cannot open shared object file: No such file or directory

```bash
export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH  # 以实际Driver安装路径为准
```


================================================
FILE: doc/zh/KTransformers-Fine-Tuning_Developer-Technical-Notes_zh.md
================================================
- [KTransformers 微调 × LLaMA-Factory 集成 – 开发技术篇](#ktransformers-微调-x-llama-factory-集成-–-开发技术篇)
- [Introduction](#introduction)

- [KT微调框架整体性描述](#kt微调框架整体性描述)
  - [Attention 部分（LoRA + KT 特性并存）](#attention-部分lora--kt-特性并存)
    - [继承关系](#继承关系)
    - [替换策略](#替换策略)
  - [MoE 部分（算子封装+backward实现）](#moe-部分算子封装backward实现)
    - [MoE算子封装](#moe算子封装)
    - [MoE 反向优化 (CPU 实现)](#moe-反向优化-cpu-实现)
  - [多卡加载与训练：用“放置策略”而不是 DataParallel](#多卡加载与训练用放置策略而不是-dataparallel)

- [KT-LoRA微调测试](#kt-lora微调测试)
  - [实验设置](#实验设置)
  - [效果测试](#效果测试)
    - [风格化对话测试（CatGirl语气）](#风格化对话测试catgirl语气)
    - [生成式翻译风格基准测试](#生成式翻译风格基准测试)
    - [医疗垂直领域基准（AfriMed-SAQ/MCQ）](#医疗垂直领域基准afrimed-saqmcq)
    - [局限性说明](#局限性说明)

- [速度测试](#速度测试)
  - [端到端性能](#端到端性能)
  - [MoE部分的计算性能（DeepSeek-V3-671B）](#moe部分的计算性能deepseek-v3-671b)

- [显存/内存性能](#显存内存性能)

- [结论](#结论)

# KTransformers 微调 × LLaMA-Factory 集成 – 开发技术篇

**MadSys实验室, KVCache-AI团队, 趋境科技, LLaMA-Factory团队**

## Introduction

当今的开源大模型（从 DeepSeek-V3/R1 到 Qwen-MoE 系列以及 Kimi-K2 等）在性能和规模上突飞猛进。然而，受限于**计算资源和显存**，普通研究者难以对这些上千亿乃至更大规模的模型进行微调。为此，我们设计了 **KTransformers** 与 **LLaMA-Factory** 集成的方案，使得仅需 **2～4 张 RTX 4090 GPU** 加上足够的 CPU 内存，就能微调 DeepSeek-671B 这样的超大规模 Mixture-of-Experts (MoE) 模型。

这一架构旨在桥接资源鸿沟，让更多人能够**在本地探索超大模型微调**的可能；同时在相对小一些的模型（如 14B/30B 参数量级）上，也能提供**更高效的场景化定制**途径。我们通过风格化对话、西式翻译语气、医学问答等任务验证了该方案，仅用数小时即可实现模型风格和专业领域的**快速适配**。

从系统架构上看，如下图所示，**LLaMA-Factory** 扮演微调流程的调度中枢，负责统一配置数据和训练流程、插入 LoRA 模块以及管理推理接口；**KTransformers** 则作为可插拔的高性能算子后端，在相同的训练代码下接管底层 **Attention** 和 **MoE** 运算，实现 **GPU+CPU 异构协同**，加速训练并降低显存占用。

![image-20251011010558909](../assets/image-20251011010558909.png)

为评估该集成的性能优势，我们使用 LLaMA-Factory 分别调用了 HuggingFace 默认后端、Unsloth 后端以及 KTransformers 后端进行 LoRA 微调的对比测试（在相同设置和数据集下）。结果表明，**KTransformers** 是目前唯一能在 2～4 张 24GB 4090卡上微调 **671B 规模 MoE 模型** 的方案；同时在 14B 规模的 MoE 模型上，相比另两种方案也具有**更高的吞吐速率**和**更低的 GPU 显存占用**。

| Under LoRA (BF16)+[NekoQA-10K-风格化对话数据集](https://github.com/mindsRiverPonder/LLM-practice) | HuggingFace Backend                      | Unsloth Backend                      | KTransformers Backend |
| ------------------------------------------------------------ | ---------------------------------------- | ------------------------------------ | --------------------- |
| [14B-DeepSeekV2-Lite] LoRA Fine-tuning throughput            | 303.58 token/s                           | 455.37 token/s                       | 530.38 token/s        |
| [14B-DeepSeekV2-Lite] GPU Memory                             | 32.12 GB                                 | 9.64 GB                              | 6.08 GB               |
| [671B-DeepSeekV3] LoRA Fine-tuning throughput                | <font color='red'>Too Huge to run</font> | <font color='red'>NOT SUPPORT</font> | 40.35 token/s         |
| [671B-DeepSeekV3] GPU Memory（多卡总和）                     | 理论值1400 GB †                          | <font color='red'>NOT SUPPORT</font> | 70 GB †               |

† **1400 GB** 为**理论显存**（FP16 全参数常驻，非可运行配置）；**70 GB** 为 KT 策略（Attention 驻 GPU + MoE分层 offload）下的**实测峰值**。

上表中可以看出，对于 14B 模型，KTransformers 后端的吞吐量相比 HuggingFace 默认方案提升了约 75%，而显存占用仅为其约 1/5。对于 671B 模型，HuggingFace 和 Unsloth 在单台4090环境下无法运行，而 KTransformers 能以 **40 tokens/s** 的速度LoRA微调，并将 GPU 显存需求控制在 70 GB。

![按照模型划分的对比图_02](../assets/image-compare_model.png)


## KT微调框架整体性描述

下面详细展示的是在 LLaMA-Factory 的微调框架中，KTransformers 后端如何接管底层算子并实现 Attention / MoE 的优化结构。

DeepSeek-V3/V2等MoE模型主要包括小参数、密集矩阵的Attention部分和大参数、稀疏矩阵的MoE部分。为了直观说明，我们以 DeepSeek-V2-Lite-Chat 的第 2 层为例（从该层起，每层包含 Attention 与 MoE 两个子模块），其中Attention由GPU承担主要计算与缓存（KV），剩下的大参数量MoE主要由CPU承担 。下文将先介绍 **Attention 部分的替换与继承关系**，再介绍 **MoE 部分的封装与后端对接**，最后说明**多卡放置等特性支持**。

### Attention 部分（LoRA + KT 特性并存）

KTransformers 提供了算子模块的注入机制（`BaseInjectedModule`），而 PEFT 库提供了 LoRA 微调的层插入机制。为了在**微调阶段**同时兼容两者，我们设计了 `KTransformersLinearLora` 类，使其同时继承自 KTransformers 的线性层 (`KTransformersLinear`) 和 LoRA 的层基类 (`LoraLayer`)。如下图所示：

- **继承关系**：如下图所示，`KTransformersLinearLora` 同时继承 `KTransformersLinear` 与 `LoraLayer`，既保留 **KT 的高性能算子**（如 `prefill_linear` / `generate_linear`），又能**加载 LoRA参数**（如 `lora_A`、`lora_B` 等矩阵）；

- **替换策略**：在微调准备阶段，用 `KTransformersLinearLora` **逐一替换** 原 `KTransformersLinear`层（如下图右侧所示，主要包含Q/K/V/O 等线性层），从而在不破坏 KT 优化的前提下，将 LoRA 注入到了模型中，使其参数可训练。

![image-20250911184023795](../assets/image-20250911184023795.png)

替换完成后，如下图（左）所示，在计算图中相当于在原模型的 Q/K/V/O 四个矩阵乘法位置都插入了 LoRA。下图（右）展示了 `KTransformersLinearLora` 的内部，它同时包含了 KT 模块的高性能计算接口（prefill 和 generate 阶段的方法）以及 LoRA 的 A、B 矩阵等参数。

![image-20250801174517784](../assets/image-20250801174517784.png)

### MoE 部分（算子封装+backward实现）

#### MoE算子封装

考虑到 MoE 参数量大且计算稀疏，我们采用“封装成黑盒算子”的策略处理：将 MoE 专家计算封装为一个**对上游而言透明（单节点）、对下游可替换（多实现）**的可微算子。

- **上游（PyTorch 计算图）**：我们注册自定义 Autograd Function，整个 MoE 专家层在计算图中呈现为**一个节点**。如下左图红框所示，封装后计算图中只有 `KSFTExpertsCPU` 这样一个算子节点；而右图红框为未封装时的细粒度计算图——路由、专家选择以及 FFN 计算都完整展开在计算图中。封装后，对微调过程来说，MoE层就等同于一个普通 `nn.Module`，前向计算可求梯度，反向梯度也由我们来自定义算子返回。
- **下游（后端实现）**：在这个 Autograd Function 内部，我们通过 pybind11 调用了 C++ 扩展实现具体的前向和反向计算。这里我们提供了多个**可插拔后端实现**，如 AMX 指令集版本（支持 BF16/INT8 算子优化）和 llamafile 版本。只要遵循同样的接口，即可灵活切换后端。例如在 YAML 优化规则里指定使用 `"backend": "AMXBF16"`，就会调用 AMX 后端；改成 `"llamafile"` 则使用默认后端。

![image-20250801174623919](../assets/image-20250801174623919.png)

#### MoE 反向优化 (CPU 实现)

在实现 MoE 自定义算子的反向传播时，我们特别优化了大矩阵的梯度计算开销。MoE反向计算需要频繁访问权重转置`Wᵀ`，为避免运行时反复转置带来的开销，我们在加载参数时**预备一份权重转置`Wᵀ` 便于复用**（如下图蓝框）。同时，**缓存必要的中间激活**（例如专家层中间投影结果，见下图红框），以便在反向阶段复用，减少重复计算。基于这些缓存，当前已提供 llamafile 与 AMX（INT8/BF16） 的MoE反向计算实现，并针对 NUMA 架构优化内存访问。

<img src="../assets/image-20250911184455749.png" alt="image-20250911184455749" style="zoom: 33%;" />

### 多卡加载与训练：用“放置策略”而不是 DataParallel

为了在使用 2～4 张 GPU 时进一步降低**单卡显存压力**，KTransformers 结合模型并行技术实现了**多卡协同微调**。与常规的 DataParallel 不同，我们没有简单地将整层模型复制到每张卡（那样显存需求会翻倍），而是采用**模型并行 + 显式算子放置**的策略，让不同 GPU 各自承载模型的一部分层。

具体而言，我们对 Transformers Trainer 做了以下改动：

1. **自定义训练器 (KTrainer)**：接管模型加载到设备的逻辑，采用显示层放置。默认情况下 `transformers` 会在初始化时将模型 `.to(device)` 全部搬移到单块 GPU，我们通过自定义 KTrainer 阻止这一行为，利用 KTransformers 的优化规则 YAML，我们可以在每一层声明 `device: cuda:0/cuda:1/...` 来指定该层所在的设备。这样初始化模型时，各层就直接构建在目标 GPU 上，不需要额外拷贝。。

2. **禁用自动 DataParallel**：当启动全局变量`USE_KT=1`时，我们暂时禁用了 LLaMA-Factory 和 HuggingFace Train 原本自动启动的多卡 DataParallel 封装。避免了框架层面对模型的重复拷贝，使我们能够完全掌控模型的分片方案。

3. **梯度回传与汇总**：由于模型各部分分散在不同 GPU 上，我们采取梯度汇总到 `cuda:0` 的方式。具体做法是：在反向传播时，仅将所需的梯度张量在设备间传输，而不传输整个模型的中间激活；各 GPU 计算各自部分的梯度，最终在0号卡汇总计算 loss。这种方式减少了不必要的通讯开销和激活冗余。

通过上述手段，我们实现了**多 GPU 下依然遵循 KTransformers 放置策略**的训练方案。用户只需选择合适的 `kt_optimize_rule` 配置文件（例如带有 `multi-gpu` 的 YAML），即可启用默认的模型分片方案。在 DeepSeek-671B 微调中，我们提供的 `DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml` 就是一个两卡模型并行的典型策略：Attention 模块的 KV缓存和部分计算放在每张卡上，MoE 专家层在 CPU 上分片处理，两张卡共同承担全模型的计算。


## KT-LoRA微调测试

### 实验设置

实验均采用 LLaMA-Factory 调度、KTransformers 后端、LoRA 轻量微调范式（超参数：rank = 8、α = 32、dropout = 0.1，BF16，`gradient_accumulation_steps=16`、`qlen=512`）以及与微调阶段一致的 KT 优化规则。我们分别评测了（a）风格化对话的迁移效果，以及（b）两类具有代表性的**定量基准**：西式翻译腔（生成式）与 AfriMed-QA（医疗垂直领域，含**简答生成**与**单项选择**两种子任务）。固定使用AMX指令集优化；GPU选取2张 48G VRAM 的 RTX 4090，CPU选取 Intel Xeon Platinum 8488C。

### 效果测试

#### 风格化对话测试（CatGirl语气）

数据集采用[NekoQA-10K](https://zhuanlan.zhihu.com/p/1934983798233231689)进行风格迁移微调，目标是提升语气一致性与可辨识度。

下图展示了原模型与微调后模型的对比。微调后回答在称谓、语气标记与修饰语上更稳定地保持了目标风格（红框），相较原模型的中性与理性表达（蓝框）具有更强的风格可辨识性，说明KT-LoRA 能以较低 GPU 成本，将特定风格特征有效注入到大模型生成分布。

![风格化数据集模型输出对比_01](../assets/风格化数据集模型输出对比_01.png)

#### 生成式翻译风格基准测试

数据集采用了[西式翻译腔数据集](https://github.com/Benson114/Translational-Style-ChatLLM)，要求模型采用夸张的“西式翻译腔”，属生成式风格控制任务，评价指标采用生成任务常见的 BLEU-1/2/3/4 与 ROUGE-1/2/L。

| 西式翻译腔数据集                | BLEU-1    | BLEU-2    | BLEU-3    | BLEU-4    | ROUGE-1   | ROUGE-2   | ROUGE-L   |
| ------------------------------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| V2-Lite原模型（不LoRA微调）     | 20.66     | 8.33      | 4.54      | 2.89      | 22.71     | 4.52      | 19.19     |
| **KT-LoRA微调DeepSeek-V2-Lite** | **35.41** | **22.44** | **15.42** | **11.18** | **42.03** | **18.38** | **33.10** |
| V3原模型（不LoRA微调）          | 8.49      | 3.34      | 1.62      | 0.96      | 15.91     | 2.55      | 10.07     |
| **KT-LoRA微调DeepSeek-V3**      | **37.02** | **23.70** | **16.21** | **11.49** | **43.43** | **18.96** | **34.54** |

如上表测试结果所示，在统一流程与放置策略下，**两种规模的模型在微调后均出现一致性增益**，支持“KT 后端 + LoRA 微调”组合在生成式风格控制上的可用性与有效性。同时，说明 KT 的异构放置与算子优化能够稳定支撑风格域的小样本适配。

#### 医疗垂直领域基准（AfriMed-SAQ/MCQ）

数据集采用了[AfriMed-QA](https://aclanthology.org/2025.acl-long.96/)数据集（ACL-2025），作为非洲地区医疗领域的专用数据集，具有很强的场景定制特征，包含单选题（MCQ）和简答题（SAQ）两种形式，在本案例中作为垂直领域微调的评估。评估标准上，SAQ 用 BLEU/ROUGE；MCQ 用 Accuracy。

| AfriMed-QA数据集（简答任务SAQ） | BLEU-1    | BLEU-2    | BLEU-3    | BLEU-4    | ROUGE-1   | ROUGE-2   | ROUGE-L   |
| ------------------------------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| V2-Lite原模型（不LoRA微调）     | 13.58     | 11.12     | 9.10      | 7.23      | 22.48     | 7.81      | 11.73     |
| **KT-LoRA微调DeepSeek-V2-Lite** | **35.90** | **27.63** | **22.99** | **19.15** | **35.25** | **17.50** | **28.44** |
| V3原模型（不LoRA微调）          | 12.75     | 10.27     | 8.05      | 5.99      | 20.33     | 5.65      | 10.11     |
| **KT-LoRA微调DeepSeek-V3**      | **42.42** | **34.12** | **28.95** | **24.54** | **41.97** | **22.37** | **33.28** |

| AfriMed-QA数据集（单选任务MCQ） | Accuracy   |
| ------------------------------- | ---------- |
| V2-Lite原模型（不LoRA微调）     | 0.0645     |
| **KT-LoRA微调DeepSeek-V2-Lite** | **0.4812** |
| V3原模型（不LoRA微调）          | 0.5833     |
| **KT-LoRA微调DeepSeek-V3**      | **0.7930** |

如上表所示，（1）DeepSeek-V3（671B）经 KT-LoRA 微调后在MCQ和SAQ任务上均明显高于微调后的 DeepSeek-V2-Lite（14B），并且超过 V3 原模型。在我们的小规模设置中，初步说明了KT-LoRA微调巨大参数模型，在垂直领域中具有实际意义。

（2）在 SAQ/MCQ 两类子任务上，KT-LoRA 均带来一致增益，说明在 KT 的异构放置与后端算子支持下，LoRA 微调能够把“医疗等垂直领域的知识要点”有效注入模型。

#### 局限性说明

目前我们基于的多为单数据集、小规模（2w条及以下）进行测试，旨在提供**KT-LoRA微调系统有效性的“存在性证据”**，而非对算法泛化或规模规律的概括性结论。我们报告中主要给出的是代表性数值；若要支持更强的算法结论，需要更大样本、跨语种/跨域多数据集与多随机种子重复实验，本文不作展开。

**我们也特别欢迎大家加入LLaMA-Factory KT微调的开源项目中，如果大家有更多的测试结果，也特别特别欢迎写在下面的共享表格中，并补充好`kt_optimize_rule` 文件、数据集example、训练/评测 YAML、具体显存与 CPU 配置等，以便大家参考、复现~！**


### 速度测试

#### 端到端性能

**测试定义：**

`step_time`：一次优化步的总耗时（含张量搬运、Attention、MoE 等全部计算）。

`tokens_per_step = GAS × qlen`；`token/s = tokens_per_step / step_time`。 本节统一采用 `GAS=16`、`qlen=512`，因此 `tokens_per_step = 8192`。

**实测结果：**

| 模型                 | step_time (s) | tokens/step | token/s   |
| -------------------- | ------------- | ----------- | --------- |
| DeepSeek-V3-671B     | 203           | 8192        | **40.35** |
| DeepSeek-V2-Lite-14B | 36            | 8192        | **227.6** |

#### MoE部分的计算性能（DeepSeek-V3-671B）

**理论估算**

- MoE 每层、每token的前/反向浮点计算总量 (FLOPs) 可近似：
  $$
  \text{FLOPs}_{\text{per-layer, per-token}} \approx c \cdot k \cdot H \cdot I
  $$

​		其中：$k = 8$（Top-k 专家数），$H = 7168$（hidden size），$I = 2048$（intermediate size），常数 $c\approx16$（折合前向=6、反向=10 的矩阵乘总系数）。

- 每步（全 MoE 层）FLOPs 近似：
  $$
  \text{FLOPs}_{\text{per-step}} \approx c \cdot qlen \cdot k \cdot H \cdot I \cdot L_{\text{MoE}}
  $$

​		代 $c=16, qlen=512, k=8, H=7168, I=2048, L_{MoE}=58$，得 $\text{FLOPs}_{\text{per-step}} \approx 55.8\ \text{TFLOPs}$.

**实测情况**

MOE部分在CPU上面的性能情况：每秒浮点计算量 $\text{TFLOPS} = \text{FLOPs}_{\text{per-step}} / \text{step\_per\_second}.$

| TFLOPS                 | Forward | Backward |
| ---------------------- | ------- | -------- |
| 平均值（单位：TFLOPS） | 17.55   | 18.41    |

### 显存/内存性能

DeepSeek-V3（671B，61层，其中58层有MoE）占用显存大约70GB（多卡总量）、内存占用约1.2-1.3TB。

DeepSeek-V2-lite（14B，27层，其中26层有MoE）占用显存大约5GB、内存占用约30GB。


## 结论

通过将 KTransformers LoRA 微调集成到 LLaMA‑Factory，我们为希望高效训练和部署 MoE 大模型的用户提供了一条可行路径。KT 提供新的放置策略和算子优化（支持 DeepSeek、Qwen、Kimi 等模型，并结合 AMX 指令加速关键内核），配合 LoRA 微调实现了在极低 GPU 显存占用下的模型定制化训练；而 LLaMA‑Factory 则提供了友好的上层接口与配置管理，让这一切变得易于使用。

这种集成意味着即便是拥有数百亿乃至上万亿参数的 MoE 模型，也能够在相对普通的硬件上完成微调，并进行低延迟的推理部署。**显存节省**、**速度提升**和**易用性**在这套方案中达到了一定的平衡。我们期待社区在未来的 MoE 项目中尝试使用 LLaMA‑Factory 与 KTransformers 的组合，并欢迎参考本文档提供的指南进行操作。通过这一方案，超大模型不再是“无法企及”的存在，而成为每个开发者都可能驾驭的工具。

================================================
FILE: doc/zh/KTransformers-Fine-Tuning_User-Guide_zh.md
================================================
- [KTransformers 微调 × LLaMA-Factory 集成 – 用户指南](#ktransformers-微调-x-llama-factory-集成-–-用户指南)
- [Introduction](#introduction)

- [Quick to Start](#quick-to-start)
  - [快速上手](#快速上手)
  - [环境安装](#环境安装)
  - [核心功能1：使用KTransformers作为backend，微调超大规模MoE模型](#核心功能1使用ktransformers作为backend微调超大规模moe模型)
  - [核心功能2：与微调后模型（即原模型+LoRA Adapter）聊天，用于交互](#核心功能2与微调后模型即原模型lora-adapter聊天用于交互)
  - [核心功能3：生成微调后模型（即原模型+LoRA Adapter）的API，用于批量生成并评测指标](#核心功能3生成微调后模型即原模型lora-adapter的api用于批量生成并评测指标)

- [KT微调速度性能测试：用户侧](#kt微调速度性能测试用户侧)
  - [端到端性能](#端到端性能)
  - [显存/内存性能](#显存内存性能)

- [结论](#结论)

# KTransformers 微调 × LLaMA-Factory 集成 – 用户指南

**MadSys实验室, KVCache-AI团队, 趋境科技, LLaMA-Factory团队**

## Introduction

从 **DeepSeek-V3/R1** 到 **Qwen3-MoE、Kimi-K2**，每一次超大模型的开源都带来性能与规模上的巨大跃升。然而，多数研究者与开发者受限于昂贵的显卡与动辄数千亿参数的模型，**难以在资源受限条件下微调超大模型**。面对这种差距，我们提出了一种更具可行性的方案：通过 **KTransformers 与 LLaMA-Factory 的结合**，仅需2~4张RTX 4090与较高内存CPU，便可微调DeepSeek-671B等超大规模的MoE模型。

该架构的核心目标是为资源受限下的研究者提供 **在本地探索超大规模模型微调的可能性**。同时，也在较小规模（如 14B/30B）提供快速定制特定场景的路径。我们以**风格化对话、西式腔调翻译、医学问答**作为代表任务，验证架构的可行性，并展示在**数小时内达成个性化适配**的可操作性。


如下图所示，LLaMA-Factory 是整个微调流程的统一调度与配置框架，负责数据处理、训练调度、LoRA 插入与推理接口管理； KTransformers 则作为其可插拔的高性能后端，在相同的训练配置下接管 Attention / MoE 等核心算子，实现异构设备（GPU+CPU）的高效协同。

![image-20251011010558909](../assets/image-20251011010558909.png)

我们在 LLaMA-Factory 框架下，对比评测了 **HuggingFace**、**Unsloth**、**KTransformers** 三种后端的 LoRA 微调方案。结果显示，KTransformers为超大规模的MoE模型（671B等）提供了**4090 级别**的唯一可行方案，并在较小规模的MoE模型（DeepSeek-14B）上面也展现了更高的吞吐和更低的显存占用。

| Under LoRA (BF16)+[NekoQA-10K-风格化对话数据集](https://github.com/mindsRiverPonder/LLM-practice) | HuggingFace Backend                      | Unsloth Backend                      | KTransformers Backend |
| ------------------------------------------------------------ | ---------------------------------------- | ------------------------------------ | --------------------- |
| [14B-DeepSeekV2-Lite] LoRA Fine-tuning throughput            | 303.58 token/s                           | 455.37 token/s                       | 530.38 token/s        |
| [14B-DeepSeekV2-Lite] GPU Memory                             | 32.12 GB                                 | 9.64 GB                              | 6.08 GB               |
| [671B-DeepSeekV3] LoRA Fine-tuning throughput                | <font color='red'>Too Huge to run</font> | <font color='red'>NOT SUPPORT</font> | 40.35 token/s         |
| [671B-DeepSeekV3] GPU Memory（多卡总和）                     | 理论值1400 GB †                          | <font color='red'>NOT SUPPORT</font> | 70 GB †               |

† **1400 GB** 为**理论显存**（FP16 全参数常驻，非可运行配置）；**70 GB** 为 KT 策略（Attention 驻 GPU + MoE分层 offload）下的**实测峰值**。

![按照模型划分的对比图_02](../assets/image-compare_model.png)

### 微调效果示例

#### 风格化对话测试（CatGirl风格语气）

数据集：[NekoQA-10K: 面向猫娘语言建模的对话数据集](https://zhuanlan.zhihu.com/p/1934983798233231689)，目标是提升风格一致性与可辨识度。

下图对比了原始模型和微调模型的回答，可以看到微调后模型在语气和称谓上更加稳定地保持了猫娘风格（红框部分），验证了**风格迁移微调**的有效性。

![风格化数据集模型输出对比_01](../assets/风格化数据集模型输出对比_01.png)

#### Benchmark测试

数据集选取：

（1）采用了[西式翻译腔数据集](https://github.com/Benson114/Translational-Style-ChatLLM)，该数据集要求模型按西式表达习惯进行夸张的翻译，有明确的定制化风格需求。

（2）采用了[AfriMed-QA](https://aclanthology.org/2025.acl-long.96/)数据集（ACL-2025），作为非洲地区医疗领域的专用数据集，具有很强的场景定制特征，包含选择题和简答题两种形式，非常适合作为垂直领域微调的评估。针对单选和简答形式，我们分别进行测试，结果如下。

下表显示了微调前后模型在这些数据集上的指标变化。可以看到经过 LoRA 微调后，各项指标**大幅提升**，验证了微调的有效性：

| 西式翻译腔数据集                | BLEU-1    | BLEU-2    | BLEU-3    | BLEU-4    | ROUGE-1   | ROUGE-2   | ROUGE-L   |
| ------------------------------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| V2-Lite原模型（不LoRA微调）     | 20.66     | 8.33      | 4.54      | 2.89      | 22.71     | 4.52      | 19.19     |
| **KT-LoRA微调DeepSeek-V2-Lite** | **35.41** | **22.44** | **15.42** | **11.18** | **42.03** | **18.38** | **33.10** |
| V3原模型（不LoRA微调）          | 8.49      | 3.34      | 1.62      | 0.96      | 15.91     | 2.55      | 10.07     |
| **KT-LoRA微调DeepSeek-V3**      | **37.02** | **23.70** | **16.21** | **11.49** | **43.43** | **18.96** | **34.54** |

| AfriMed-QA数据集（简答任务）    | BLEU-1    | BLEU-2    | BLEU-3    | BLEU-4    | ROUGE-1   | ROUGE-2   | ROUGE-L   |
| ------------------------------- | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| V2-Lite原模型（不LoRA微调）     | 13.58     | 11.12     | 9.10      | 7.23      | 22.48     | 7.81      | 11.73     |
| **KT-LoRA微调DeepSeek-V2-Lite** | **35.90** | **27.63** | **22.99** | **19.15** | **35.25** | **17.50** | **28.44** |
| V3原模型（不LoRA微调）          | 12.75     | 10.27     | 8.05      | 5.99      | 20.33     | 5.65      | 10.11     |
| **KT-LoRA微调DeepSeek-V3**      | **42.42** | **34.12** | **28.95** | **24.54** | **41.97** | **22.37** | **33.28** |

| AfriMed-QA数据集（单选任务）    | Accuracy   |
| ------------------------------- | ---------- |
| V2-Lite原模型（不LoRA微调）     | 0.0645     |
| **KT-LoRA微调DeepSeek-V2-Lite** | **0.4812** |
| V3原模型（不LoRA微调）          | 0.5833     |
| **KT-LoRA微调DeepSeek-V3**      | **0.7930** |

从以上测试可以看出，即使是参数量巨大的 MoE 模型，通过 KTransformers 后端的高效微调，**也能在特定任务上快速达到理想效果**。


## Quick to Start

### 快速上手

本节将指导您如何安装环境并使用 **LLaMA-Factory + KTransformers** 完成微调和推理。我们将涵盖以下内容：

- 环境依赖的安装配置
- 使用 KTransformers 作为后端微调超大规模 MoE 模型
- 加载微调后的模型（原模型 + LoRA 适配器）进行对话/推理
- 批量推理微调模型并评测指标

### 环境安装

根据下面示例，同时安装KTransformers和LLaMA-Factory环境，这次为了简化KTransformers的安装流程，我们特意封装了wheel包避免本地编译，具体安装步骤如下：（注意对应好本地的python版本、torch版本、cuda版本和不同文件名的KTransformers包）

```shell
# 1. 安装conda环境
conda create -n Kllama python=3.12 # choose from : [3.11, 3.12, 3.13]
conda install -y -c conda-forge libstdcxx-ng gcc_impl_linux-64
conda install -y -c nvidia/label/cuda-11.8.0 cuda-runtime

# 2. 安装llamafactory环境
git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
cd LLaMA-Factory
pip install -e ".[torch,metrics]" --no-build-isolation

# 3. 安装对应torch和python版本的KTransformers（CUDA版本可以跟whl命名的不一致），从https://github.com/kvcache-ai/ktransformers/releases/tag/v0.4.1
pip install ktransformers-0.4.1+cu128torch27fancy-cp312-cp312-linux_x86_64.whl

# 4. 安装flash-attention，参照python版本和torch版本，从https://github.com/Dao-AILab/flash-attention/releases下载
pip install flash_attn-2.8.3+cu12torch2.7cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
# abi=True/False可以用下面代码查看
# import torch
# print(torch._C._GLIBCXX_USE_CXX11_ABI)

# 5. （可选）如果你想使用flash_infer的话（不然默认triton）
git clone https://github.com/kvcache-ai/custom_flashinfer.git
pip install custom_flashinfer/
```


**使用要点**：在 LLaMA-Factory 的配置 YAML 文件中启用 KTransformers 后端，只需设置 `use_kt: true`，并指定相应的 `kt_optimize_rule` YAML 文件，即可切换到底层由 KTransformers 接管计算。下面我们将通过具体功能来说明如何设置这些配置。

### 核心功能1：使用KTransformers作为backend，微调超大规模MoE模型

运行命令：`USE_KT=1 llamafactory-cli train examples/train_lora/deepseek3_lora_sft_kt.yaml`。

需要注意的是，必须提供BF16格式模型文件，DeepSeek-V3-671B默认下载是FP8格式，需要通过 [DeepSeek-V3/inference/fp8_cast_bf16.py](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) 转换。

```yaml
### model
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 8
lora_target: all

### dataset
dataset: identity
template: deepseek
cutoff_len: 2048
max_samples: 100000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: saves/Kllama_deepseekV3
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null

### ktransformers
use_kt: true # use KTransformers as LoRA sft backend
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192
```

其中，`kt_optimize_rule`提供了大量默认的YAML文件来控制**KTransformers的放置策略**，下面针对YAML文件名和功能对照特别说明，也可以参考[ktransformers/optimize_rules](https://github.com/kvcache-ai/ktransformers/tree/main/ktransformers/optimize/optimize_rules)：（\*指通配符）

| 文件名字段                                    | 功能特征                                           |
| --------------------------------------------- | -------------------------------------------------- |
| DeepSeek-V2-Lite-Chat-\*或DeepSeek-V3-Chat-\* | 对应的不同模型                                     |
| \*-sft-\*                                     | 微调所用的放置策略，其他为推理所用                 |
| \*-amx-\*                                     | 使用AMX指令集进行CPU运算，其他为llamafile          |
| \*-multi-gpu-X\*                              | 使用X张GPU进行模型并行（显存共担），X为空默认是2张 |

例如：`examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml`为DeepSeek-V3-Chat模型用AMX指令集进行微调，并调用两卡模型并行。

对于微调任务，我们推荐使用**AMX指令集加速**，可以使用`lscpu | grep amx`查看CPU是否支持AMX指令集，AMX精度支持BF16/Int8，修改方式如下：

```yaml
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
```

输出会保存在`output_dir`里面，默认为safetensor格式，并且保留adapter.json等配套内容以便后续加载。

![演示文稿1_01](../assets/演示文稿1_01.png)


### 核心功能2：与微调后模型（即原模型+LoRA Adapter）聊天，用于交互

运行命令：`llamafactory-cli chat examples/inference/deepseek3_lora_sft_kt.yaml`。

调用KT微调的adapter (safetensor格式) 推理对话。

```yaml
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
adapter_name_or_path: saves/Kllama_deepseekV3
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true

use_kt: true # 调用KTransformers backend
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml # 请选择和LoRA微调的时候保持一致的YAML文件
cpu_infer: 32
chunk_size: 8192
```

同时，我们也支持GGUF格式的adapter进行推理（如果您已经使用了上述LLaMA-Factory+KTransformers的微调方案，就不用管啦~）。

safetensors 场景填**文件所在目录**，GGUF 场景填**文件路径**，也就是说您需要把`adapter_name_or_path`选为具体的GGUF格式文件。

加载过程中适配了KT每层的命名，和torch.save保存下来的常规命名的不同，正常映射日志`Loaded adapter weight: XXX -> XXX`，展示如下。

![image-20250801165752484](../assets/image-20250801165752484.png)


### 核心功能3：生成微调后模型（即原模型+LoRA Adapter）的API，用于批量生成并评测指标

运行命令：`API_PORT=8000 llamafactory-cli api examples/inference/deepseek3_lora_sft_kt.yaml`。

调用KT微调的adapter给出API，其他API使用逻辑和llamafactory原生方式一致。

```yaml
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
adapter_name_or_path: saves/Kllama_deepseekV3
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true

use_kt: true # use KTransformers as LoRA sft backend to inference
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192
```


## KT微调速度性能测试：用户侧

### 端到端性能

**测试定义：**

`step_time`：一次优化步（包含 `gradient_accumulation_steps (GAS)` 次累积）的总时间，涵盖 **PyTorch 张量搬运 + Attention + MoE + 其他计算等**。

`tokens_per_step = GAS × qlen`；`token/s = tokens_per_step / step_time`。

**测试设置：**`GAS=16`，`qlen=512`（即每步 8192 tokens）；LoRA（`r=8, alpha=32, dropout=0.1`）；使用AMX指令集优化；GPU选取RTX 4090，CPU选取Intel Xeon Platinum 8488C。

**实测结果：**

**DeepSeek-V3-671B：**step_time = 203 s` → `token/s ≈ 8192 / 203 **≈ 40.35 token/s**

**DeepSeek-V2-Lite-14B：**step_time = 36 s` → `token/s ≈ 8192 / 36 **≈ 227.6 token/s**

### 显存/内存性能

DeepSeek-V3（671B，61层，其中58层有MoE）占用显存（多卡总量）大约**70GB**、内存占用约1.2-1.3TB。

DeepSeek-V2-lite（14B，27层，其中26层有MoE）占用显存大约**5.5GB**、内存占用约150GB。


## 结论

通过开发 KTransformers LoRA微调并将其集成到 LLaMA‑Factory，我们为希望高效训练与部署 MoE 大模型的用户提供了可行指南。KT 带来最尖端的优化（支持 DeepSeek、Qwen、Kimi 等，配合 AMX 加速 kernel），同时通过 LoRA 微调在极低 GPU 显存下实现定制化。LLaMA‑Factory 则提供友好的统一界面，更广的用户支持。

该集成（类似 Unsloth 补丁所带来的提速）意味着即便是数百亿乃至万亿总参数量的 MoE 模型，也可在普通硬件上完成微调并低延迟部署。**显存节省、速度提升、易用性** 三者兼得。我们鼓励用户在下一次 MoE 项目中尝试 LLaMA‑Factory 的 KT 集成，并参考本文档进行操作。也欢迎提出任何问题和建议！


================================================
FILE: doc/zh/Qwen3-MoE_tutorial_zh_for_Ascend_NPU.md
================================================
# 基准测试结果(输出token长度均设置1k, 单并发)

| Prompt length                     | 1K     | 2K     | 4K     |
| --------------------------------- | ------ | ------ | ------ |
| KTrans Prefill token/s | 134.11 | 141.60 |  143.42 |
| KTrans Decode token/s | 11.05 | 10.74 | 10.68 |

## 先决条件
我们在以下配置下进行了Qwen3-235B-A22B MoE最佳性能测试：
- 服务器型号：Atlas 2UP
- NPU：Atlas 300I A2
- CPU: HUAWEI Kunpeng 920 7270Z
- 内存: DDR5服务器内存（1TB）

# 部署

***关于部署过程，此README中只额外描述与同级目录下 `DeepseekR1_V3_tutorial_zh_for_Ascend_NPU.md` 不同的部分***

## 物理机安装

部署满血版Qwen3-MoE，需要机器物理内存能够存放下全部路由专家的权重，约200GB。

目前支持的NPU型号：**300I A2**。

在技术人员的支持下完成硬件安装。


## 权重准备

目前，为了满足性能和精度的要求，我们需要准备两份权重，并使用提供的权重合并脚本对权重进行合并，最终只会使用合并后的权重。

Q4权重：[Qwen3-235B-A22B-Instruct-2507-GGUF](https://modelscope.cn/models/unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF/files)

W8A8权重：[Qwen3-235B-A22B-w8a8](https://modelers.cn/models/Modelers_Park/Qwen3-235B-A22B-w8a8)

使用[merge_safetensor_gguf_for_qwen3.py](../../merge_tensors/merge_safetensor_gguf_for_qwen3.py)来合并Q4和W8A8权重：

```bash
python merge_safetensor_gguf_for_qwen3.py --safetensor_path /mnt/weights/Qwen3-235B-A22B-Q4_K_M --gguf_path /mnt/weights/Qwen3-235B-A22B-W8A8 --output_path /mnt/weights/Qwen3-235B-A22B-q4km-w8a8
```

## kTransformers部署

将项目文件部署到机器上：

- 初始化third_party。由于此过程耗时较多，且容易受网络影响导致仓库克隆失败，建议初始化一次后，将相关文件进行打包，以便后续直接解压使用。
  ```bash
  git clone https://github.com/kvcache-ai/ktransformers.git
  cd ktransformers
  git submodule update --init --recursive
  ```
- 对于arm平台，注释掉`./third_party/llamafile/iqk_mul_mat_arm82.cpp`中的
  ```cpp
  #define iqk_mul_mat iqk_mul_mat_arm82
  #define iqk_mul_mat_moe iqk_mul_mat_moe_arm82
  ```
- 执行`source /usr/local/Ascend/ascend-toolkit/set_env.sh`（以实际CANN-TOOLKIT安装路径为准）。
- 执行`apt install cmake libhwloc-dev pkg-config`安装依赖。
- 修改项目目录下 /ktransformers/config/config.yaml 中attn部分的page_size: 128  chunk_size: 16384
- 执行`USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh`，等待安装完成。
    ***执行安装命令之前，需要将`./ktransformers/configs/config.yaml`中对于page size的设置改为page size=128(因为attn计算算子`torch_npu.npu_fused_infer_attention_score`支持page_size=16/128)***

此处给出示例balance_serve的启动脚本（由于使用了相对路径，需将该脚本放至项目的根路径下）：

```bash
#!/bin/bash
export USE_MERGE=0
export INF_NAN_MODE_FORCE_DISABLE=1
export TASK_QUEUE_ENABLE=0
export RANK=0
export LOCAL_WORLD_SIZE=1
#export PROF_DECODE=1
#export PROF_PREFILL=1

source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh

python ktransformers/server/main.py \
--port 10002 \
--model_path <your model path> \
--gguf_path <your model path> \
--cpu_infer 48 \
--optimize_config_path  ./ktransformers/optimize/optimize_rules/npu/Qwen3-Chat-300IA2-npu-serve.yaml \
--max_new_tokens 1024 \
--cache_lens 16384 \
--max_batch_size 4 \
--use_cuda_graph \
--tp 1 \
--backend_type balance_serve
```

相关参数说明：

- `--model_path`：kTransformers原生参数，str，此处用来指定合并后的模型文件路径
- `--gguf_path`：kTransformers原生参数，str，此处用来指定合并后的模型文件路径
- `--cpu_infer`：kTransformers原生参数，int，用来控制CPU侧实际worker线程数，非必选
- `--optimize_config_path`：kTransformers原生参数，str，用来指定所用的模型优化配置文件，需要注意相对路径的使用，此处为**必选**
- `--cache_lens`：调度器申请 kvcache 的总长度。所有请求共享指定数量（例如 `20480`）的 tokens 对应的 kvcache 空间，请求完成后会释放其所占用的 kvcache 空间，非必选
- `--use_cuda_graph`：kTransformers原生参数，bool，为True表示开启图下沉，为False表示关闭图下沉，非必选
- `--max_new_tokens`：kTransformers原生参数，int，当统计到输出的tokens数量达到该值时，会直接中止输出，非必选
- `--tp`：新增参数，int，用于开启tensor model parallel功能，目前local_chat只支持tp大小与ws大小相同（不支持local_chat使用多dp），非必选


# 其他问题

## 可能存在的其他依赖问题

ImportError: libhccl.so: cannot open shared object file: No such file or directory

```bash
source /usr/local/Ascend/ascend-toolkit/set_env.sh  # 以实际CANN安装路径为准
```

ImportError: libascend_hal.so: cannot open shared object file: No such file or directory

```bash
export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH  # 以实际Driver安装路径为准
```


================================================
FILE: doc/zh/api/server/api.md
================================================
# API


- [OpenAI ChatCompletion](#openai-chatcompletion)
- [Ollama ChatCompletion](#ollama-chatcompletion)
- [OpenAI Assistant](#openai-assistant)


## OpenAI ChatCompletion
```bash
POST /v1/chat/completions
```
根据选定的模型生成回复。

### 参数


- `messages`：一个 `message` 的数组所有的历史消息。`message`：表示用户（user）或者模型（assistant）的消息。`message`包含：

  - `role`: 取值`user`或`assistant`，代表这个 message 的创建者。
  - `content`: 用户或者模型的消息。

- `model`：选定的模型名
- `stream`：取值 true 或者 false。表示是否使用流式返回。如果为 true，则以 http 的 event stream 的方式返回模型推理结果。

### 响应

- 流式返回：一个 event stream，每个 event 含有一个`chat.completion.chunk`。`chunk.choices[0].delta.content`是每次模型返回的增量输出。
- 非流式返回：还未支持。

### 例子

```bash
curl -X 'POST' \
  'http://localhost:9112/v1/chat/completions' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "messages": [
    {
      "content": "tell a joke",
      "role": "user"
    }
  ],
  "model": "Meta-Llama-3-8B-Instruct",
  "stream": true
}'
```

```bash
data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"Why ","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}

data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}

data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"couldn't ","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}

...

data:{"id":"c30445e8-1061-4149-a101-39b8222e79e1","object":"chat.completion.chunk","created":1720511671,"model":"not implmented","system_fingerprint":"not implmented","usage":null,"choices":[{"index":0,"delta":{"content":"two-tired!","role":"assistant","name":null},"logprobs":null,"finish_reason":null}]}

event: done
data: [DONE]
```


## Ollama ChatCompletion

```bash
POST /api/generate
```

根据选定的模型生成回复。

### 参数


- `prompt`：一个字符串，代表输入的 prompt。
- `model`：选定的模型名
- `stream`：取值 true 或者 false。表示是否使用流式返回。如果为 true，则以 http 的 event stream 的方式返回模型推理结果。

### 响应

- 流式返回：一个流式的 json 返回，每行是一个 json。
  - `response`：模型补全的增量结果。
  - `done`：是否推理结束。

- 非流式返回：还未支持。

### 例子

```bash
curl -X 'POST' \
  'http://localhost:9112/api/generate' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "model": "Meta-Llama-3-8B-Instruct",
  "prompt": "tell me a joke",
  "stream": true
}'
```

```bash
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:11.686513","response":"I'll ","done":false}
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:11.729214","response":"give ","done":false}

...

{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:33.955475","response":"for","done":false}
{"model":"Meta-Llama-3-8B-Instruct","created_at":"2024-07-09 08:13:33.956795","response":"","done":true}
```


================================================
FILE: doc/zh/api/server/server.md
================================================
# 后端服务（Server）
Server 将 ktransformers 的快速异构推理能力通过 API 提供给外界调用。

<img src="server-arch.png" height="600" alt="Server架构">

## API

Server 通过 RESTful API 对外提供模型推理服务，提供  ChatCompletion 和 Assistant 两种调用方式。

- ChatCompletion 接口要求用户一次提供所有的历史对话，然后返回模型的回复。AI 服务提供商（例如[OpenAI](https://platform.openai.com/docs/api-reference/chat/create) ）和本地推理框架（例如[Ollama](https://github.com/ollama/ollama/blob/main/docs/api.md) ）都提供 ChatCompletion 接口。为了兼容 OpenAI 和 Ollama，Server 分别提供和它们一致的 API 接口。因此，当前使用 OpenAI 和 Ollama 的应用可以无缝切换到我们的 Server。例如： [如何使用 Tabby 和 ktransformers 在本地利用 236B 的大模型做代码补全？](tabby.md)。
- Assistant 适用于应用需要复用一系列资源并调用模型的场景。例如，在教育应用场景中，应用开发者可以创建一个名为二年级数学老师的 Assistant，并设置初始prompt（“你是一个有经验的的二年级数学老师...”），上传相关的资料（二年级数学教材）。创建 Assistant 后，应用需要创建一个 Thread 来存储用户和模型的对话消息（Message）。调用模型时，应用需要创建一个 Run 来获得 Assistant 的回复。相对于 ChatCompletion，实现了 Assistant 的 Server 代替应用实现了对话背景复用和多轮对话，使得复杂场景下的模型的调用更加方便。 [OpenAI Assistant API](https://platform.openai.com/docs/api-reference/assistants/createAssistant) 提出了这样的 Assistant 接口，而 Server 也提供和它一致的 API 。

这些 API 定义在`server/api`中，它们的具体使用请见[这里](api.md)。


## 对接模型推理框架

Server 通过 ktransformers 调用模型并进行推理。Server 也支持其他的推理框架，例如已经支持的 [transformers](https://huggingface.co/docs/transformers/index) ，并计划支持 [exllamav2](https://github.com/turboderp/exllamav2)。这些功能在`server/backend` 中实现。

Server 将模型推理框架的推理功能抽象成一个基类`BackendInterfaceBase`。这个基类包含一个函数：inference。它的输入是是历史的对话信息 messages，输出是模型返回的文字结果。inference 函数采用 async generator 的设计，这使得 Server 可以流式地返回模型的回复。

```python
class BackendInterfaceBase:
  async def inference(self, messages, **kwargs)->AsyncIterator[str]:
  	...
```

这个 inference 函数，因为它的输入和输出分别是历史对话和模型回复，所以它自然地实现了 ChatCompletion 的功能。因此 ChatCompletion API 可以直接调用inference 函数完成模型推理。

而 Assistant 则比 ChatCompletion 复杂许多，需要 Server 存储 Assistant 的相关状态，并以合适的方式调用 inference 函数。Server 在数据库中维护了一套 Assistant 逻辑，存储应用创建的 Assistant，Thread 和 Message。在内存中，Server 为每个 Thread 维护一个 `ThreadContext`，集合每个Thread 相关的 Assistant 等信息。当用户发出新的 Message 时，Server 调用 ThreadContext 的get_local_messages函数，获得 messages，并调用 inference 函数获得推理结果。

```python
class MyThreadContext(ThreadContext):
    def get_local_messages(self):
      ...
```

由于不同的模型推理框架有着不同的历史对话输入格式，所以 `ThreadContext` 和 `BackendInterface` 需要成对地使用。Server 除了自己的 ktransformers 之外，还支持 transformers。如果要对接其他的模型推理框架，可以参考在 [transformers.py](https://github.com/kvcache-ai/ktransformers-dev/blob/main/ktransformers/server/backend/interfaces/transformers.py) 中`TransformersInterface`和`TransformersThreadContext`的实现。 


================================================
FILE: doc/zh/api/server/tabby.md
================================================
# 如何使用 Tabby 和 ktransformers 在本地利用 236B 的大模型做代码补全？

[Tabby](https://tabby.tabbyml.com/docs/welcome/) 是一个开源的代码助手，用户可以手动配置后端使用的框架及模型，并在多个 IDE/编辑器 上使用，例如 VSCode 和 InteliJ。因为 Tabby 在框架侧可以对接到 Ollama，并且 ktransformers server 提供和 Ollama 一致的 API 接口，所以我们可以将 Tabby 对接到 ktransformers server。并在代码补全的场景中体验到 ktransformers 快速的异构推理。

1. 启动 ktransformers。
```bash
./ktransformers --port 9112
```
2. 安装 Tabby：按照 Tabby 的官方教程在带有英伟达 GPU 的 Linux 服务器或者 Windows PC 上[安装 Tabby](https://tabby.tabbyml.com/docs/quick-start/installation/linux/)。
3. 配置 Tabby：创建`~/.tabby/config.toml`，并加入以下配置。
```toml
[model.completion.http]
kind = "ollama/completion"
api_endpoint = "http://127.0.0.1:9112/"
model_name = "DeepSeek-Coder-V2-Instruct"
prompt_template = "<｜fim▁begin｜>{prefix}<｜fim▁hole｜>{suffix}<｜fim▁end｜>" # Prompt Template
```

在这个配置中，`kind` 指明 ktransformers 使用 Ollama 的标准 API 为 Tabby 提供服务；`api_endpoint` 与 ktransforer 启动时绑定的接口保持一致；`model_name` 设置为 ktransformers 使用的模型，这里使用 `DeepSeek-Coder-V2-Instruct` 作为后台推理的模型；`prompt_template` 是模型的提示词模板，针对不同的模型，使用相对应的模版才能正常使用模型 Fill In the Middle 的功能。
在这里演示的是 Tabby 使用 Ollama API 提供 Completion 功能的相关配置，有关 Tabby 其他可选功能的配置信息请参照[这里](https://tabby.tabbyml.com/docs/administration/model/)。


4. 启动 Tabby 服务：`./tabby serve`。
<img src="run-tabby.png" alt="image-20240709112329577" style="zoom:50%;" />

​	启动之后，期望会在 ktransformers 的命令行界面看到对 `/api/tags` 接口的访问(在 Tabby 新版本 v0.13.0 中变为对 `/api/show/` 接口的访问)。
<img src="visit-api-tags.png" alt="image-20240709111648215" style="zoom:67%;" />

6. 注册 Tabby 账户，获取 Token：在启动 Tabby 服务后，在浏览器中打开相应的链接(如上图的 0.0.0.0:8080)，并参照[教程](https://tabby.tabbyml.com/docs/quick-start/register-account/) 创建用户并获取 Token。

7. 启动 VScode 安装 Tabby 拓展插件，并在相关提示下，使用上一步获得的 Token 连接 Tabby Server，参照[这里](https://tabby.tabbyml.com/docs/extensions/installation/vscode/)。

8. 打开任意代码文件，体验 ktransformers 的快速异构推理。


================================================
FILE: doc/zh/api/server/website.md
================================================
# Start with website

This document provides the necessary steps to set up and run the web service for this project.

## 1. Starting the Web Service

### 1.1. Compiling the Web Code

Before you can compile the web code, make sure you have installed [Node.js](https://nodejs.org) version 18.3 or higher

Once npm is installed, navigate to the `ktransformers/website` directory:

```bash
cd ktransformers/website
```

Next, install the Vue CLI with the following command:

```bash
npm install @vue/cli
```

Now you can build the project:

```bash
npm run build
```
Finally you can build ktransformers with website:
```
cd ../../
pip install .
```


================================================
FILE: doc/zh/clawdbot_integration_guide.md
================================================
# KTransformers + Clawdbot：本地部署 AI 助手方案

> **利用 KTransformers 的 CPU-GPU 混合推理能力，结合 Kimi-K2.5 的高质量推理能力，为 Clawdbot 提供高性能本地推理后端**

---

## 什么是 Clawdbot？

[Clawdbot](https://github.com/openclaw/openclaw) 是一款开源的个人 AI 智能体，支持通过 Telegram、Discord、Signal、WhatsApp 等聊天平台交互，可实现日程管理、邮件发送、数据查询等自动化任务，数据完全本地存储，隐私可控。

> **注意**：Clawdbot 默认不内置飞书（Feishu）Channel，需要额外安装社区插件，详见下方飞书接入章节。

---

## 为什么选择 KTransformers 作为推理后端？

**KTransformers** 使用 CPU-GPU 混合推理架构：

- **CPU-GPU 协同**：GPU 处理高价值推理路径，CPU（AMX 量化）处理专家模块，资源利用率最大化
- **原生 MoE 支持**：支持多种原生精度的 MoE 模型
- **SGLang 高性能引擎**：兼容 OpenAI API，支持多 GPU Tensor Parallel 并行
- **全栈 CLI 工具**：`kt run` 一键启动、`kt model` 模型管理、`kt quant` 智能量化、`kt bench` 性能测试、`kt doctor` 环境诊断

---

## 支持的模型

自 Kimi K2 Thinking 等[原精度模型支持](../en/kt-kernel/Native-Precision-Tutorial.md)以来，我们 Day0 适配了 [Kimi K2.5](../en/Kimi-K2.5.md)。目前，我们已经原精度支持 Kimi K2.5、MiniMax、DeepSeek、Qwen3、GLM 等 MoE 模型，仅使用 24-48G 显存即可完美部署。

---

## 部署架构

```
[用户] → [Telegram / Discord / Signal / 飞书] → [Clawdbot Gateway]
                                                        ↓
                                                  [KTransformers]
                                                   (SGLang API)
                                                        ↓
                                                  [多 GPU 推理]
```

Clawdbot 通过 OpenAI 兼容 API 接入 KTransformers，无需额外 API 密钥，本地推理零费用。

---

## 部署步骤

### 第一步：安装并启动 KTransformers

[Kimi K2.5 使用指南](../en/Kimi-K2.5.md)

[kt kernel 部署指南](https://github.com/kvcache-ai/ktransformers/tree/main/kt-kernel)。

启动后，KTransformers 会在 `http://<host>:30000/v1` 提供 OpenAI 兼容 API。

### 第二步：安装 Clawdbot

```bash
npm install -g openclaw@latest

openclaw onboard --install-daemon
```

> 关于 Clawdbot 的详细安装与配置，请参考 [Clawdbot 官方文档](https://openclaw.ai) 和 [GitHub 仓库](https://github.com/openclaw/openclaw)。

### 第三步：配置 KTransformers 作为推理后端

编辑 Clawdbot 配置文件（通常位于 `~/.openclaw/openclaw.json`，或通过网页版 `http://127.0.0.1:18789/config`），将模型 provider 指向本地 KTransformers 服务：

```json
{
  "models": {
    "providers": {
      "synthetic": {
        "baseUrl": "http://127.0.0.1:30000/v1",
        "apiKey": "EMPTY",
        "api": "openai-completions",
        "models": [
          {
            "id": "kimi-k2.5",
            "name": "kimi-k2.5",
            "contextWindow": 200000,
            "maxTokens": 16384
          }
        ]
      }
    },
    "routing": {
      "default": {
        "provider": "synthetic",
        "modelId": "kimi-k2.5"
      }
    }
  }
}
```

关键配置说明：
- `baseUrl`：KTransformers SGLang 服务地址
- `apiKey`：填写 `"EMPTY"` 即可，本地服务不需要密钥
- `models`：根据实际运行的模型调整 `id` 和 `contextWindow`

### 第四步：启动 Clawdbot Gateway

```bash
openclaw gateway --port 18789
```

### 第五步：配置消息通道

Clawdbot 原生支持 Telegram、Discord、Signal 等通道：

```bash
# Telegram
openclaw channels login --channel telegram

# Signal
openclaw channels login --channel signal
```

---

## 飞书接入

Clawdbot 默认不包含飞书通道，需要通过社区开发的飞书桥接插件接入。

主要步骤：
1. 在[飞书开放平台](https://open.feishu.cn/)创建企业自建应用，添加"机器人"能力
2. 安装飞书桥接插件（社区项目：[clawdbot-feishu](https://github.com/m1heng/clawdbot-feishu)）
3. 配置 `appId`、`appSecret` 等飞书应用凭据
4. 添加"接收消息"事件，发布应用版本

详细教程可参考：
- [Clawdbot 接入飞书保姆级教程](https://mp.weixin.qq.com/s/_i1fgNbeDrBR5wurEmJf0A)
- [腾讯云：Moltbot 接入飞书保姆级教程](https://cloud.tencent.com/developer/article/2625073)

---

## 硬件参考配置

以下是一个 8 卡 GPU 部署的参考配置：

| 组件 | 配置 |
|------|------|
| GPU | 8 × NVIDIA RTX 5090（32GB 显存） |
| CPU | 双路高核心数处理器（至少需支持 AVX 512 指令集） |
| 内存 | 512GB+ |
| 模型 | Kimi K2.5 / DeepSeek-V3 / GLM-4.7 等 |

```bash
# 启动示例
kt run kimi-k2.5
```

---

## KTransformers 与传统部署对比

| 特性 | KTransformers | 传统部署 |
|------|---------------|----------|
| 显存需求 | 小 | 原始大小 |
| MoE 支持 | CPU-GPU 动态调度 | 无 |
| CPU-GPU 混合 | NUMA 优化 | 无 |
| 管理工具 | kt CLI 全栈工具 | 手动 |
| 故障诊断 | `kt doctor` 自动检测 | 手动调试 |

---

## 适用场景

- **企业部署**：客户服务自动化、文档智能问答、工作流自动化
- **研发团队**：模型快速验证、性能基准测试、实验环境搭建
- **个人用户**：低成本本地 AI 助手、隐私数据可控

---

## 相关链接

- [KTransformers GitHub](https://github.com/KTransformers/ktransformers)
- [Clawdbot 官网](https://openclaw.ai/)
- [Clawdbot GitHub](https://github.com/clawdbot/clawdbot)
- [飞书桥接插件](https://github.com/m1heng/clawdbot-feishu)


================================================
FILE: docker/Dockerfile
================================================
ARG CUDA_VERSION=12.8.1
FROM docker.1ms.run/nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu24.04 AS base

ARG TARGETARCH
ARG GRACE_BLACKWELL=0
ARG HOPPER_SBO=0
ARG CPU_VARIANT=x86-intel-multi
ARG BUILD_ALL_CPU_VARIANTS=1

# Proxy settings for build-time network access
ARG HTTP_PROXY
ARG HTTPS_PROXY
ARG http_proxy
ARG https_proxy
ENV HTTP_PROXY=${HTTP_PROXY} \
    HTTPS_PROXY=${HTTPS_PROXY} \
    http_proxy=${http_proxy} \
    https_proxy=${https_proxy}

ARG GRACE_BLACKWELL_DEEPEP_BRANCH=gb200_blog_part_2
ARG HOPPER_SBO_DEEPEP_COMMIT=9f2fc4b3182a51044ae7ecb6610f7c9c3258c4d6
ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
ARG BUILD_AND_DOWNLOAD_PARALLEL=8
ARG SGL_KERNEL_VERSION=0.3.19
ARG SGL_VERSION=0.5.6.post1
ARG USE_LATEST_SGLANG=0
ARG GDRCOPY_VERSION=2.5.1
ARG UBUNTU_MIRROR
ARG GITHUB_ARTIFACTORY=github.com
ARG FLASHINFER_VERSION=0.5.3

# ktransformers wheel version (cu128torch28 for CUDA 12.8 + PyTorch 2.8)
ARG KTRANSFORMERS_VERSION=0.4.2
ARG KTRANSFORMERS_WHEEL=ktransformers-0.4.2+cu128torch28fancy-cp312-cp312-linux_x86_64.whl

# flash_attn wheel for fine-tune env
ARG FLASH_ATTN_WHEEL=flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl

ENV DEBIAN_FRONTEND=noninteractive \
    CUDA_HOME=/usr/local/cuda \
    GDRCOPY_HOME=/usr/src/gdrdrv-${GDRCOPY_VERSION}/ \
    FLASHINFER_VERSION=${FLASHINFER_VERSION}

# Add GKE default lib and bin locations
ENV PATH="${PATH}:/usr/local/nvidia/bin" \
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"

# Replace Ubuntu sources with Tsinghua mirror for Ubuntu 24.04 (noble)
RUN if [ -n "$UBUNTU_MIRROR" ]; then \
    echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ noble main restricted universe multiverse" > /etc/apt/sources.list && \
    echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list && \
    echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ noble-backports main restricted universe multiverse" >> /etc/apt/sources.list && \
    echo "deb http://security.ubuntu.com/ubuntu/ noble-security main restricted universe multiverse" >> /etc/apt/sources.list && \
    rm -f /etc/apt/sources.list.d/ubuntu.sources; \
fi

# Install system dependencies (organized by category for better caching)
RUN --mount=type=cache,target=/var/cache/apt,id=base-apt \
    echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update && apt-get install -y --no-install-recommends --allow-change-held-packages \
    # Core system utilities
    tzdata \
    ca-certificates \
    software-properties-common \
    netcat-openbsd \
    kmod \
    unzip \
    openssh-server \
    curl \
    wget \
    lsof \
    locales \
    # Build essentials
    build-essential \
    cmake \
    perl \
    patchelf \
    ccache \
    git \
    git-lfs \
    # MPI and NUMA
    libopenmpi-dev \
    libnuma1 \
    libnuma-dev \
    numactl \
    # transformers multimodal VLM
    ffmpeg \
    # InfiniBand/RDMA
    libibverbs-dev \
    libibverbs1 \
    libibumad3 \
    librdmacm1 \
    libnl-3-200 \
    libnl-route-3-200 \
    libnl-route-3-dev \
    libnl-3-dev \
    ibverbs-providers \
    infiniband-diags \
    perftest \
    # Development libraries
    libgoogle-glog-dev \
    libgtest-dev \
    libjsoncpp-dev \
    libunwind-dev \
    libboost-all-dev \
    libssl-dev \
    libgrpc-dev \
    libgrpc++-dev \
    libprotobuf-dev \
    protobuf-compiler \
    protobuf-compiler-grpc \
    pybind11-dev \
    libhiredis-dev \
    libcurl4-openssl-dev \
    libczmq4 \
    libczmq-dev \
    libfabric-dev \
    # Package building tools
    devscripts \
    debhelper \
    fakeroot \
    dkms \
    check \
    libsubunit0 \
    libsubunit-dev \
    # Development tools
    gdb \
    ninja-build \
    vim \
    tmux \
    htop \
    zsh \
    tree \
    less \
    rdma-core \
    # NCCL
    libnccl2 \
    libnccl-dev \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# GDRCopy installation
RUN mkdir -p /tmp/gdrcopy && cd /tmp \
    && curl --retry 3 --retry-delay 2 -fsSL -o v${GDRCOPY_VERSION}.tar.gz \
        https://${GITHUB_ARTIFACTORY}/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz \
    && tar -xzf v${GDRCOPY_VERSION}.tar.gz && rm v${GDRCOPY_VERSION}.tar.gz \
    && cd gdrcopy-${GDRCOPY_VERSION}/packages \
    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
    && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
    && cd / && rm -rf /tmp/gdrcopy

# Fix DeepEP IBGDA symlink
RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so

# Set up locale
RUN locale-gen en_US.UTF-8
ENV LANG=en_US.UTF-8 \
    LANGUAGE=en_US:en \
    LC_ALL=en_US.UTF-8

########################################################
########## Install Miniconda ###########################
########################################################

RUN mkdir -p /opt/miniconda3 \
    && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /opt/miniconda3/miniconda.sh \
    && bash /opt/miniconda3/miniconda.sh -b -u -p /opt/miniconda3 \
    && rm /opt/miniconda3/miniconda.sh

# Add conda to PATH
ENV PATH="/opt/miniconda3/bin:${PATH}"

# Accept conda TOS
RUN conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r

# Configure conda to use Tsinghua mirror
RUN conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main \
    && conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free \
    && conda config --set show_channel_urls yes

########################################################
########## Dual Conda Environment Setup ################
########################################################

FROM base AS framework

ARG CUDA_VERSION
ARG BUILD_AND_DOWNLOAD_PARALLEL
ARG SGL_KERNEL_VERSION
ARG SGL_VERSION
ARG USE_LATEST_SGLANG
ARG FLASHINFER_VERSION
ARG GRACE_BLACKWELL
ARG GRACE_BLACKWELL_DEEPEP_BRANCH
ARG HOPPER_SBO
ARG HOPPER_SBO_DEEPEP_COMMIT
ARG DEEPEP_COMMIT
ARG GITHUB_ARTIFACTORY
ARG KTRANSFORMERS_VERSION
ARG KTRANSFORMERS_WHEEL
ARG FLASH_ATTN_WHEEL
ARG FUNCTIONALITY=sft

WORKDIR /workspace

# Create conda environments (fine-tune only needed for sft mode)
RUN conda create -n serve python=3.12 -y \
    && if [ "$FUNCTIONALITY" = "sft" ]; then conda create -n fine-tune python=3.12 -y; fi

# Set pip mirror for conda envs
RUN /opt/miniconda3/envs/serve/bin/pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple \
    && if [ "$FUNCTIONALITY" = "sft" ]; then \
        /opt/miniconda3/envs/fine-tune/bin/pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple; \
    fi

# Clone repositories (sglang is included as a submodule in ktransformers)
RUN git clone --depth 1 https://${GITHUB_ARTIFACTORY}/kvcache-ai/ktransformers.git /workspace/ktransformers \
    && cd /workspace/ktransformers && git submodule update --init --recursive \
    && ln -s /workspace/ktransformers/third_party/sglang /workspace/sglang \
    && if [ "$FUNCTIONALITY" = "sft" ]; then \
        git clone --depth 1 https://${GITHUB_ARTIFACTORY}/hiyouga/LLaMA-Factory.git /workspace/LLaMA-Factory; \
    fi

# Download ktransformers wheel and flash_attn wheel for fine-tune env (sft mode only)
RUN if [ "$FUNCTIONALITY" = "sft" ]; then \
        curl --retry 3 --retry-delay 2 -fsSL -o /workspace/${KTRANSFORMERS_WHEEL} \
            https://${GITHUB_ARTIFACTORY}/kvcache-ai/ktransformers/releases/download/v${KTRANSFORMERS_VERSION}/${KTRANSFORMERS_WHEEL} \
        && curl --retry 3 --retry-delay 2 -fsSL -o /workspace/${FLASH_ATTN_WHEEL} \
            https://${GITHUB_ARTIFACTORY}/Dao-AILab/flash-attention/releases/download/v2.8.3/${FLASH_ATTN_WHEEL}; \
    fi

########################################################
# Environment 1: serve (sglang + kt-kernel)
########################################################

# Upgrade pip and install basic tools in serve env
RUN --mount=type=cache,target=/root/.cache/pip \
    /opt/miniconda3/envs/serve/bin/pip install --upgrade pip setuptools wheel html5lib six

# Install sgl-kernel
RUN --mount=type=cache,target=/root/.cache/pip \
    case "$CUDA_VERSION" in \
        12.6.1) CUINDEX=126 ;; \
        12.8.1) CUINDEX=128 ;; \
        12.9.1) CUINDEX=129 ;; \
        13.0.1) CUINDEX=130 ;; \
        *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
    esac \
    && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
        /opt/miniconda3/envs/serve/bin/pip install https://${GITHUB_ARTIFACTORY}/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
    ; \
    elif [ "$CUDA_VERSION" = "12.8.1" ] || [ "$CUDA_VERSION" = "12.9.1" ]; then \
        /opt/miniconda3/envs/serve/bin/pip install sgl-kernel==${SGL_KERNEL_VERSION} \
    ; \
    elif [ "$CUDA_VERSION" = "13.0.1" ]; then \
        /opt/miniconda3/envs/serve/bin/pip install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu130-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
    ; \
    fi

# Install SGLang in serve env (version aligned with ktransformers)
RUN --mount=type=cache,target=/root/.cache/pip \
    case "$CUDA_VERSION" in \
        12.6.1) CUINDEX=126 ;; \
        12.8.1) CUINDEX=128 ;; \
        12.9.1) CUINDEX=129 ;; \
        13.0.1) CUINDEX=130 ;; \
    esac \
    && export SGLANG_KT_VERSION=$(python3 -c "exec(open('/workspace/ktransformers/version.py').read()); print(__version__)") \
    && echo "Installing sglang-kt v${SGLANG_KT_VERSION}" \
    && cd /workspace/sglang \
    && /opt/miniconda3/envs/serve/bin/pip install -e "python[all]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX}

# Download FlashInfer cubin for serve env
RUN --mount=type=cache,target=/root/.cache/pip \
    FLASHINFER_CUBIN_DOWNLOAD_THREADS=${BUILD_AND_DOWNLOAD_PARALLEL} FLASHINFER_LOGGING_LEVEL=warning \
    /opt/miniconda3/envs/serve/bin/python -m flashinfer --download-cubin

# Install DeepEP in serve env
RUN set -eux; \
    if [ "$GRACE_BLACKWELL" = "1" ]; then \
      git clone https://github.com/fzyzcjy/DeepEP.git /workspace/DeepEP && \
      cd /workspace/DeepEP && \
      git checkout ${GRACE_BLACKWELL_DEEPEP_BRANCH} && \
      sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh; \
    elif [ "$HOPPER_SBO" = "1" ]; then \
      git clone https://github.com/deepseek-ai/DeepEP.git -b antgroup-opt /workspace/DeepEP && \
      cd /workspace/DeepEP && \
      git checkout ${HOPPER_SBO_DEEPEP_COMMIT} && \
      sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh; \
    else \
      curl --retry 3 --retry-delay 2 -fsSL -o /tmp/${DEEPEP_COMMIT}.zip \
          https://${GITHUB_ARTIFACTORY}/deepseek-ai/DeepEP/archive/${DEEPEP_COMMIT}.zip && \
      unzip -q /tmp/${DEEPEP_COMMIT}.zip -d /tmp && rm /tmp/${DEEPEP_COMMIT}.zip && \
      mv /tmp/DeepEP-${DEEPEP_COMMIT} /workspace/DeepEP && \
      cd /workspace/DeepEP && \
      sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh; \
    fi

RUN --mount=type=cache,target=/root/.cache/pip \
    cd /workspace/DeepEP && \
    case "$CUDA_VERSION" in \
        12.6.1) CHOSEN_TORCH_CUDA_ARCH_LIST='9.0' ;; \
        12.8.1) CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0' ;; \
        12.9.1|13.0.1) CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0;10.3' ;; \
        *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
    esac && \
    . /opt/miniconda3/etc/profile.d/conda.sh && conda activate serve && \
    TORCH_CUDA_ARCH_LIST="${CHOSEN_TORCH_CUDA_ARCH_LIST}" MAX_JOBS=${BUILD_AND_DOWNLOAD_PARALLEL} \
    pip install --no-build-isolation .

# Install NCCL for serve env
RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "${CUDA_VERSION%%.*}" = "12" ]; then \
        /opt/miniconda3/envs/serve/bin/pip install nvidia-nccl-cu12==2.28.3 --force-reinstall --no-deps ; \
    elif [ "${CUDA_VERSION%%.*}" = "13" ]; then \
        /opt/miniconda3/envs/serve/bin/pip install nvidia-nccl-cu13==2.28.3 --force-reinstall --no-deps ; \
    fi

# Install kt-kernel in serve env with all CPU variants
RUN . /opt/miniconda3/etc/profile.d/conda.sh && conda activate serve \
    && cd /workspace/ktransformers/kt-kernel \
    && CPUINFER_BUILD_ALL_VARIANTS=1 ./install.sh build

########################################################
# Environment 2: fine-tune (LLaMA-Factory + ktransformers) - sft mode only
########################################################

# Install dependency libraries for ktransformers (CUDA 11.8 runtime required)
RUN if [ "$FUNCTIONALITY" = "sft" ]; then \
        conda install -n fine-tune -y -c conda-forge libstdcxx-ng gcc_impl_linux-64 \
        && conda install -n fine-tune -y -c nvidia/label/cuda-11.8.0 cuda-runtime; \
    fi

# Install PyTorch 2.8 in fine-tune env
RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$FUNCTIONALITY" = "sft" ]; then \
        case "$CUDA_VERSION" in \
            12.6.1) CUINDEX=126 ;; \
            12.8.1) CUINDEX=128 ;; \
            12.9.1) CUINDEX=129 ;; \
            13.0.1) CUINDEX=130 ;; \
        esac \
        && /opt/miniconda3/envs/fine-tune/bin/pip install --upgrade pip setuptools wheel hatchling \
        && /opt/miniconda3/envs/fine-tune/bin/pip install \
            torch==2.8.0 \
            torchvision \
            torchaudio \
            --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX}; \
    fi

# Install LLaMA-Factory in fine-tune env
RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$FUNCTIONALITY" = "sft" ]; then \
        cd /workspace/LLaMA-Factory \
        && /opt/miniconda3/envs/fine-tune/bin/pip install -e ".[torch,metrics]" --no-build-isolation; \
    fi

# Install ktransformers wheel in fine-tune env
RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$FUNCTIONALITY" = "sft" ]; then \
        /opt/miniconda3/envs/fine-tune/bin/pip install /workspace/${KTRANSFORMERS_WHEEL}; \
    fi

# Install flash_attn wheel in fine-tune env
RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$FUNCTIONALITY" = "sft" ]; then \
        /opt/miniconda3/envs/fine-tune/bin/pip install /workspace/${FLASH_ATTN_WHEEL}; \
    fi

# Install NCCL for fine-tune env
RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$FUNCTIONALITY" = "sft" ]; then \
        if [ "${CUDA_VERSION%%.*}" = "12" ]; then \
            /opt/miniconda3/envs/fine-tune/bin/pip install nvidia-nccl-cu12==2.28.3 --force-reinstall --no-deps ; \
        elif [ "${CUDA_VERSION%%.*}" = "13" ]; then \
            /opt/miniconda3/envs/fine-tune/bin/pip install nvidia-nccl-cu13==2.28.3 --force-reinstall --no-deps ; \
        fi; \
    fi

########################################################
# Cleanup and final setup
########################################################

# Clean up downloaded wheels
RUN if [ "$FUNCTIONALITY" = "sft" ]; then \
        rm -f /workspace/${KTRANSFORMERS_WHEEL} /workspace/${FLASH_ATTN_WHEEL}; \
    fi

# Initialize conda for bash
RUN /opt/miniconda3/bin/conda init bash

# Create shell aliases for convenience
RUN echo '\n# Conda environment aliases\nalias serve="conda activate serve"' >> /root/.bashrc \
    && if [ "$FUNCTIONALITY" = "sft" ]; then \
        echo 'alias finetune="conda activate fine-tune"' >> /root/.bashrc; \
    fi

########################################################
# Extract version information for image naming
########################################################

# Extract versions from each component and save to versions.env
RUN set -x && \
    # KTransformers version (single source of truth for both kt-kernel and sglang-kt)
    cd /workspace/ktransformers && \
    KTRANSFORMERS_VERSION=$(python3 -c "exec(open('version.py').read()); print(__version__)" 2>/dev/null || echo "unknown") && \
    echo "KTRANSFORMERS_VERSION=$KTRANSFORMERS_VERSION" > /workspace/versions.env && \
    echo "Extracted KTransformers version: $KTRANSFORMERS_VERSION" && \
    \
    # sglang-kt version = ktransformers version (aligned)
    echo "SGLANG_KT_VERSION=$KTRANSFORMERS_VERSION" >> /workspace/versions.env && \
    echo "sglang-kt version (aligned): $KTRANSFORMERS_VERSION" && \
    \
    # LLaMA-Factory version (from fine-tune environment, sft mode only)
    if [ "$FUNCTIONALITY" = "sft" ]; then \
        . /opt/miniconda3/etc/profile.d/conda.sh && conda activate fine-tune && \
        cd /workspace/LLaMA-Factory && \
        LLAMAFACTORY_VERSION=$(python -c "import sys; sys.path.insert(0, 'src'); from llamafactory import __version__; print(__version__)" 2>/dev/null || echo "unknown") && \
        echo "LLAMAFACTORY_VERSION=$LLAMAFACTORY_VERSION" >> /workspace/versions.env && \
        echo "Extracted LLaMA-Factory version: $LLAMAFACTORY_VERSION"; \
    else \
        echo "LLAMAFACTORY_VERSION=none" >> /workspace/versions.env && \
        echo "LLaMA-Factory not installed (infer mode)"; \
    fi && \
    \
    # Display all versions
    echo "=== Version Summary ===" && \
    cat /workspace/versions.env

WORKDIR /workspace

CMD ["/bin/bash"]


================================================
FILE: docker/README-packaging.md
================================================
# KTransformers Docker Packaging Guide

This directory contains scripts for building and distributing KTransformers Docker images with standardized naming conventions.

## Overview

The packaging system provides:

- **Automated version detection** from sglang, ktransformers, and LLaMA-Factory
- **Multi-CPU variant support** (AMX, AVX512, AVX2) with runtime auto-detection
- **Standardized naming convention** for easy identification and management
- **Two distribution methods**:
  - Local tar file export for offline distribution
  - DockerHub publishing for online distribution

## Naming Convention

Docker images follow this naming pattern:

```
sglang-v{sglang版本}_ktransformers-v{ktransformers版本}_{cpu信息}_{gpu信息}_{功能模式}_{时间戳}
```

### Example Names

**Tar file:**
```
sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022.tar
```

**DockerHub tags:**
```
Full tag:
kvcache/ktransformers:sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022

Simplified tag:
kvcache/ktransformers:v0.4.3-cu128
```

### Name Components

| Component | Description | Example |
|-----------|-------------|---------|
| sglang version | SGLang package version | `v0.5.6` |
| ktransformers version | KTransformers version | `v0.4.3` |
| cpu info | CPU instruction set support | `x86-intel-multi` (includes AMX/AVX512/AVX2) |
| gpu info | CUDA version | `cu128` (CUDA 12.8) |
| functionality | Feature mode | `sft_llamafactory-v0.9.3` or `infer` |
| timestamp | Build time (Beijing/UTC+8) | `20241212143022` |

## Files

| File | Purpose |
|------|---------|
| `Dockerfile` | Main Dockerfile with multi-CPU build and version extraction |
| `docker-utils.sh` | Shared utility functions for both scripts |
| `build-docker-tar.sh` | Build and export Docker image to tar file |
| `push-to-dockerhub.sh` | Build and push Docker image to DockerHub |

## Prerequisites

- Docker installed and running
- For DockerHub push: Docker Hub account and login (`docker login`)
- Sufficient disk space (at least 20GB recommended)
- Internet access (or local mirrors configured)

## Quick Start

### Build Local Tar File

```bash
cd docker

# Basic build
./build-docker-tar.sh

# With specific CUDA version and mirror
./build-docker-tar.sh \
  --cuda-version 12.8.1 \
  --ubuntu-mirror 1

# With proxy
./build-docker-tar.sh \
  --cuda-version 12.8.1 \
  --ubuntu-mirror 1 \
  --http-proxy "http://127.0.0.1:16981" \
  --https-proxy "http://127.0.0.1:16981" \
  --output-dir /path/to/output
```

### Push to DockerHub

```bash
cd docker

# Basic push (requires --repository)
./push-to-dockerhub.sh \
  --repository kvcache/ktransformers

# With simplified tag
./push-to-dockerhub.sh \
  --cuda-version 12.8.1 \
  --repository kvcache/ktransformers \
  --also-push-simplified

# Skip build if image exists
./push-to-dockerhub.sh \
  --repository kvcache/ktransformers \
  --skip-build
```

## Script Options

### build-docker-tar.sh

```
Build Configuration:
  --cuda-version VERSION       CUDA version (default: 12.8.1)
  --ubuntu-mirror 0|1         Use Tsinghua mirror (default: 0)
  --http-proxy URL            HTTP proxy URL
  --https-proxy URL           HTTPS proxy URL
  --cpu-variant VARIANT       CPU variant (default: x86-intel-multi)
  --functionality TYPE        Mode: sft or infer (default: sft)

Paths:
  --dockerfile PATH           Path to Dockerfile (default: ./Dockerfile)
  --context-dir PATH          Build context directory (default: .)
  --output-dir PATH           Output directory for tar (default: .)

Options:
  --dry-run                   Preview without building
  --keep-image                Keep Docker image after export
  --build-arg KEY=VALUE       Additional build arguments
  -h, --help                  Show help message
```

### push-to-dockerhub.sh

```
All options from build-docker-tar.sh, plus:

Registry Settings:
  --registry REGISTRY         Docker registry (default: docker.io)
  --repository REPO           Repository name (REQUIRED)

Options:
  --skip-build                Skip build if image exists
  --also-push-simplified      Also push simplified tag
  --max-retries N             Max push retries (default: 3)
  --retry-delay SECONDS       Delay between retries (default: 5)
```

## Usage Examples

### Example 1: Local Development Build

For testing on your local machine:

```bash
./build-docker-tar.sh \
  --cuda-version 12.8.1 \
  --output-dir ./builds \
  --keep-image
```

This will:
1. Build the Docker image
2. Export to tar in `./builds/` directory
3. Keep the Docker image for local testing

### Example 2: Production Build for Distribution

For creating a production build with mirrors and proxy:

```bash
./build-docker-tar.sh \
  --cuda-version 12.8.1 \
  --ubuntu-mirror 1 \
  --http-proxy "http://127.0.0.1:16981" \
  --https-proxy "http://127.0.0.1:16981" \
  --output-dir /mnt/data/releases
```

### Example 3: Publish to DockerHub

For publishing to DockerHub:

```bash
# First, login to Docker Hub
docker login

# Then push
./push-to-dockerhub.sh \
  --cuda-version 12.8.1 \
  --repository kvcache/ktransformers \
  --also-push-simplified
```

This creates two tags:
- Full: `kvcache/ktransformers:sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022`
- Simplified: `kvcache/ktransformers:v0.4.3-cu128`

### Example 4: Dry Run

Preview the build without actually building:

```bash
./build-docker-tar.sh --cuda-version 12.8.1 --dry-run
```

### Example 5: Custom Build Arguments

Pass additional Docker build arguments:

```bash
./build-docker-tar.sh \
  --cuda-version 12.8.1 \
  --build-arg SGL_VERSION=0.5.7 \
  --build-arg FLASHINFER_VERSION=0.5.4
```

## Using the Built Images

### Load from Tar File

```bash
# Load the image
docker load -i sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022.tar

# Run the container
docker run -it --rm \
  --gpus all \
  sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022 \
  /bin/bash
```

### Pull from DockerHub

```bash
# Pull with full tag
docker pull kvcache/ktransformers:sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022

# Or pull with simplified tag
docker pull kvcache/ktransformers:v0.4.3-cu128

# Run the container
docker run -it --rm \
  --gpus all \
  kvcache/ktransformers:v0.4.3-cu128 \
  /bin/bash
```

### Inside the Container

The image contains two conda environments:

```bash
# Activate serve environment (for inference with sglang)
conda activate serve
# or use the alias:
serve

# Activate fine-tune environment (for training with LLaMA-Factory)
conda activate fine-tune
# or use the alias:
finetune
```

## Multi-CPU Variant Support

The Docker image includes all three CPU variants:
- **AMX** - For Intel Sapphire Rapids and newer (4th Gen Xeon+)
- **AVX512** - For Intel Skylake-X, Ice Lake, Cascade Lake
- **AVX2** - Maximum compatibility for older CPUs

The runtime automatically detects your CPU and loads the appropriate variant. To override:

```bash
# Force use of AVX2 variant
export KT_KERNEL_CPU_VARIANT=avx2
python your_script.py

# Enable debug output to see which variant is loaded
export KT_KERNEL_DEBUG=1
python your_script.py
```

## Version Extraction

Versions are automatically extracted during Docker build from:

- **SGLang**: From `sglang.__version__` in serve environment
- **KTransformers**: From `version.py` in ktransformers repository
- **LLaMA-Factory**: From `llamafactory.__version__` in fine-tune environment

The versions are saved to `/workspace/versions.env` in the image:

```bash
# View versions in running container
cat /workspace/versions.env

# Output:
SGLANG_VERSION=0.5.6
KTRANSFORMERS_VERSION=0.4.3
LLAMAFACTORY_VERSION=0.9.3
```

## Troubleshooting

### Build Fails with Out of Disk Space

Check available disk space:
```bash
df -h
```

The build requires approximately 15-20GB of disk space. Clean up Docker:
```bash
docker system prune -a
```

### Version Extraction Fails

If version extraction fails (shows "unknown"), check:

1. The cloned repositories have the correct branches
2. Python packages are properly installed in conda environments
3. Version files exist in expected locations

You can manually verify by running:
```bash
docker run --rm <image> /bin/bash -c "
  source /opt/miniconda3/etc/profile.d/conda.sh &&
  conda activate serve &&
  python -c 'import sglang; print(sglang.__version__)'
"
```

### Push to DockerHub Fails

1. **Check login**: `docker login`
2. **Check repository name**: Must include namespace (e.g., `kvcache/ktransformers`, not just `ktransformers`)
3. **Network issues**: Use `--max-retries` and `--retry-delay` options
4. **Rate limiting**: DockerHub has pull/push rate limits for free accounts

## Advanced Topics

### Custom Dockerfile Location

```bash
./build-docker-tar.sh \
  --dockerfile /path/to/custom/Dockerfile \
  --context-dir /path/to/build/context
```

### Building Only Inference Image (Future)

Currently, the image always includes both serve and fine-tune environments. To create an inference-only image, modify the Dockerfile to skip the fine-tune environment section.

### Customizing CPU Variants

To build only specific CPU variants, modify `kt-kernel/install.sh` or set environment variables in the Dockerfile.

### CI/CD Integration

The scripts are designed for manual execution but can be integrated into CI/CD pipelines:

```yaml
# Example GitHub Actions workflow
- name: Build and push Docker image
  run: |
    cd docker
    ./push-to-dockerhub.sh \
      --cuda-version ${{ matrix.cuda_version }} \
      --repository ${{ secrets.DOCKER_REPOSITORY }} \
      --also-push-simplified
```

## Support

For issues and questions:
- File an issue at: https://github.com/kvcache-ai/ktransformers/issues
- Check documentation: https://github.com/kvcache-ai/ktransformers

## License

This packaging system is part of KTransformers and follows the same license.


================================================
FILE: docker/docker-utils.sh
================================================
#!/usr/bin/env bash
#
# docker-utils.sh - Shared utility functions for Docker image build and publish scripts
#
# This script provides common functions for:
# - Timestamp generation (Beijing timezone)
# - Version extraction from Docker images
# - Image name generation following naming conventions
# - Colored logging
# - Validation and error handling
#
# Usage: source docker-utils.sh

set -euo pipefail

# Color codes for logging
COLOR_RED='\033[0;31m'
COLOR_GREEN='\033[0;32m'
COLOR_YELLOW='\033[1;33m'
COLOR_BLUE='\033[0;34m'
COLOR_CYAN='\033[0;36m'
COLOR_RESET='\033[0m'

################################################################################
# Logging Functions
################################################################################

log_info() {
    echo -e "${COLOR_BLUE}[INFO]${COLOR_RESET} $*"
}

log_success() {
    echo -e "${COLOR_GREEN}[SUCCESS]${COLOR_RESET} $*"
}

log_warning() {
    echo -e "${COLOR_YELLOW}[WARNING]${COLOR_RESET} $*"
}

log_error() {
    echo -e "${COLOR_RED}[ERROR]${COLOR_RESET} $*" >&2
}

log_step() {
    echo -e "\n${COLOR_CYAN}==>${COLOR_RESET} $*"
}

################################################################################
# Timestamp Functions
################################################################################

# Generate timestamp in Beijing timezone (UTC+8)
# Format: YYYYMMDDHHMMSS
# Example: 20241212143022
get_beijing_timestamp() {
    # Try to use TZ environment variable approach
    if date --version &>/dev/null 2>&1; then
        # GNU date (Linux)
        TZ='Asia/Shanghai' date '+%Y%m%d%H%M%S'
    else
        # BSD date (macOS)
        TZ='Asia/Shanghai' date '+%Y%m%d%H%M%S'
    fi
}

################################################################################
# CUDA Version Parsing
################################################################################

# Parse CUDA version to short format
# Input: 12.8.1 or 12.8 or 13.0.1
# Output: cu128 or cu130
parse_cuda_short_version() {
    local cuda_version="$1"

    # Extract major and minor version
    local major minor
    major=$(echo "$cuda_version" | cut -d. -f1)
    minor=$(echo "$cuda_version" | cut -d. -f2)

    # Validate
    if [[ ! "$major" =~ ^[0-9]+$ ]] || [[ ! "$minor" =~ ^[0-9]+$ ]]; then
        log_error "Invalid CUDA version format: $cuda_version"
        log_error "Expected format: X.Y.Z (e.g., 12.8.1)"
        return 1
    fi

    echo "cu${major}${minor}"
}

################################################################################
# Version Extraction
################################################################################

# Extract versions from built Docker image
# Input: image tag (e.g., ktransformers:temp-build-20241212)
# Output: Sets environment variables or prints to stdout
#   SGLANG_VERSION=x.y.z
#   KTRANSFORMERS_VERSION=x.y.z
#   LLAMAFACTORY_VERSION=x.y.z
extract_versions_from_image() {
    local image_tag="$1"

    log_step "Extracting versions from image: $image_tag"

    # Check if image exists
    if ! docker image inspect "$image_tag" &>/dev/null; then
        log_error "Image not found: $image_tag"
        return 1
    fi

    # Extract versions.env file from the image
    local versions_content
    versions_content=$(docker run --rm "$image_tag" cat /workspace/versions.env 2>/dev/null)

    if [ -z "$versions_content" ]; then
        log_error "Failed to extract versions from image"
        log_error "The /workspace/versions.env file may not exist in the image"
        return 1
    fi

    # Parse and display versions
    log_info "Extracted versions:"
    echo "$versions_content" | while IFS= read -r line; do
        log_info "  $line"
    done

    # Output the content (caller can parse this or eval it)
    echo "$versions_content"
}

# Validate that all required versions were extracted
# Input: versions string (output from extract_versions_from_image)
validate_versions() {
    local versions="$1"
    local all_valid=true

    # Check each required version
    for var in SGLANG_VERSION KTRANSFORMERS_VERSION LLAMAFACTORY_VERSION; do
        local value
        value=$(echo "$versions" | grep "^${var}=" | cut -d= -f2)

        if [ -z "$value" ]; then
            log_error "Missing version: $var"
            all_valid=false
        elif [ "$value" = "unknown" ]; then
            log_warning "Version is 'unknown': $var"
            # Don't fail, but warn user
        fi
    done

    if [ "$all_valid" = false ]; then
        return 1
    fi

    return 0
}

################################################################################
# Image Naming
################################################################################

# Generate standardized image name
# Input:
#   $1: versions string (from extract_versions_from_image)
#   $2: cuda_version (e.g., 12.8.1)
#   $3: cpu_variant (e.g., x86-intel-multi)
#   $4: functionality (e.g., sft_llamafactory or infer)
#   $5: timestamp (optional, will generate if not provided)
# Output: Standardized image name
# Format: sglang-v{ver}_ktransformers-v{ver}_{cpu}_{gpu}_{func}_{timestamp}
generate_image_name() {
    local versions="$1"
    local cuda_version="$2"
    local cpu_variant="$3"
    local functionality="$4"
    local timestamp="${5:-$(get_beijing_timestamp)}"

    # Parse versions from the versions string
    local sglang_ver ktrans_ver llama_ver
    sglang_ver=$(echo "$versions" | grep "^SGLANG_VERSION=" | cut -d= -f2)
    ktrans_ver=$(echo "$versions" | grep "^KTRANSFORMERS_VERSION=" | cut -d= -f2)
    llama_ver=$(echo "$versions" | grep "^LLAMAFACTORY_VERSION=" | cut -d= -f2)

    # Validate versions were extracted
    if [ -z "$sglang_ver" ] || [ -z "$ktrans_ver" ]; then
        log_error "Failed to parse versions from input"
        return 1
    fi

    # Parse CUDA short version
    local cuda_short
    cuda_short=$(parse_cuda_short_version "$cuda_version")

    # Build functionality string
    local func_str
    if [ "$functionality" = "sft" ]; then
        func_str="sft_llamafactory-v${llama_ver}"
    else
        func_str="infer"
    fi

    # Generate full image name
    # Format: sglang-v{ver}_ktransformers-v{ver}_{cpu}_{gpu}_{func}_{timestamp}
    local image_name
    image_name="sglang-v${sglang_ver}_ktransformers-v${ktrans_ver}_${cpu_variant}_${cuda_short}_${func_str}_${timestamp}"

    echo "$image_name"
}

# Generate simplified tag for DockerHub
# Input:
#   $1: ktransformers_version (e.g., 0.4.3)
#   $2: cuda_version (e.g., 12.8.1)
# Output: Simplified tag (e.g., v0.4.3-cu128)
generate_simplified_tag() {
    local ktrans_ver="$1"
    local cuda_version="$2"

    local cuda_short
    cuda_short=$(parse_cuda_short_version "$cuda_version")

    echo "v${ktrans_ver}-${cuda_short}"
}

################################################################################
# Validation Functions
################################################################################

# Check if Docker daemon is running
check_docker_running() {
    if ! docker info &>/dev/null; then
        log_error "Docker daemon is not running"
        log_error "Please start Docker and try again"
        return 1
    fi
    return 0
}

# Check if user is logged into Docker registry
# Input: registry (optional, default: docker.io)
check_docker_login() {
    local registry="${1:-docker.io}"

    # Try to check auth by attempting a trivial operation
    if ! docker login --help &>/dev/null; then
        log_error "Docker CLI is not available"
        return 1
    fi

    # Note: This is a best-effort check
    # docker login status is not always easy to check programmatically
    log_info "Assuming Docker login is configured"
    log_info "If push fails, please run: docker login $registry"

    return 0
}

# Validate CUDA version format
validate_cuda_version() {
    local cuda_version="$1"

    if [[ ! "$cuda_version" =~ ^[0-9]+\.[0-9]+(\.[0-9]+)?$ ]]; then
        log_error "Invalid CUDA version format: $cuda_version"
        log_error "Expected format: X.Y or X.Y.Z (e.g., 12.8 or 12.8.1)"
        return 1
    fi

    return 0
}

# Check available disk space
# Input: required space in GB
check_disk_space() {
    local required_gb="$1"
    local output_dir="${2:-.}"

    # Get available space in GB (works on Linux and macOS)
    local available_kb
    if df -k "$output_dir" &>/dev/null; then
        available_kb=$(df -k "$output_dir" | tail -1 | awk '{print $4}')
        local available_gb=$((available_kb / 1024 / 1024))

        log_info "Available disk space: ${available_gb}GB"

        if [ "$available_gb" -lt "$required_gb" ]; then
            log_warning "Low disk space: ${available_gb}GB available, ${required_gb}GB recommended"
            return 1
        fi
    else
        log_warning "Unable to check disk space"
    fi

    return 0
}

# Check if file/directory exists and is writable
check_writable() {
    local path="$1"

    if [ -e "$path" ]; then
        if [ ! -w "$path" ]; then
            log_error "Path exists but is not writable: $path"
            return 1
        fi
    else
        # Try to create parent directory to test writability
        local parent_dir
        parent_dir=$(dirname "$path")
        if [ ! -w "$parent_dir" ]; then
            log_error "Parent directory is not writable: $parent_dir"
            return 1
        fi
    fi

    return 0
}

################################################################################
# Cleanup Functions
################################################################################

# Remove intermediate Docker images
cleanup_temp_images() {
    local image_tag="$1"

    log_step "Cleaning up temporary image: $image_tag"

    if docker image inspect "$image_tag" &>/dev/null; then
        docker rmi "$image_tag" &>/dev/null || true
        log_success "Cleaned up temporary image"
    fi
}

################################################################################
# Display Functions
################################################################################

# Display a summary box
display_summary() {
    local title="$1"
    shift
    local lines=("$@")

    local width=80
    local border=$(printf '=%.0s' $(seq 1 $width))

    echo ""
    echo "$border"
    echo "  $title"
    echo "$border"
    for line in "${lines[@]}"; do
        echo "  $line"
    done
    echo "$border"
    echo ""
}

################################################################################
# Export functions
################################################################################

# Export all functions so they can be used by scripts that source this file
export -f log_info log_success log_warning log_error log_step
export -f get_beijing_timestamp
export -f parse_cuda_short_version
export -f extract_versions_from_image validate_versions
export -f generate_image_name generate_simplified_tag
export -f check_docker_running check_docker_login validate_cuda_version
export -f check_disk_space check_writable
export -f cleanup_temp_images
export -f display_summary


================================================
FILE: docker/push-to-dockerhub.sh
================================================
#!/usr/bin/env bash
#
# push-to-dockerhub.sh - Build and push Docker image to DockerHub
#
# This script builds a Docker image for ktransformers with standardized naming
# and pushes it to DockerHub with both full and simplified tags.
#
# Features:
# - Automatic version detection
# - Standardized naming convention
# - Multi-CPU variant support (AMX/AVX512/AVX2)
# - Full and simplified tag support
# - Retry logic for network failures
# - Comprehensive error handling
#
# Usage:
#   ./push-to-dockerhub.sh [OPTIONS]
#
# Example:
#   ./push-to-dockerhub.sh \
#     --cuda-version 12.8.1 \
#     --repository kvcache/ktransformers \
#     --also-push-simplified

set -euo pipefail

# Get script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Source utility functions
# shellcheck source=docker-utils.sh
source "$SCRIPT_DIR/docker-utils.sh"

################################################################################
# Default Configuration
################################################################################

# Build parameters
CUDA_VERSION="12.8.1"
UBUNTU_MIRROR="0"
HTTP_PROXY=""
HTTPS_PROXY=""
CPU_VARIANT="x86-intel-multi"
FUNCTIONALITY="sft"

# Paths
DOCKERFILE="$SCRIPT_DIR/Dockerfile"
CONTEXT_DIR="$SCRIPT_DIR"

# Registry settings
REGISTRY="docker.io"
REPOSITORY=""  # Must be provided by user

# Options
DRY_RUN=false
SKIP_BUILD=false
ALSO_PUSH_SIMPLIFIED=false
MAX_RETRIES=3
RETRY_DELAY=5
EXTRA_BUILD_ARGS=()

################################################################################
# Help Message
################################################################################

usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Build and push Docker image to DockerHub with standardized naming.

OPTIONS:
    Build Configuration:
        --cuda-version VERSION      CUDA version (default: 12.8.1)
                                   Examples: 12.8.1, 12.6.1, 13.0.1

        --ubuntu-mirror 0|1         Use Tsinghua mirror for Ubuntu packages
                                   (default: 0)

        --http-proxy URL           HTTP proxy URL
                                   Example: http://127.0.0.1:16981

        --https-proxy URL          HTTPS proxy URL
                                   Example: http://127.0.0.1:16981

        --cpu-variant VARIANT      CPU variant identifier
                                   (default: x86-intel-multi)

        --functionality TYPE       Functionality mode: sft or infer
                                   (default: sft, includes LLaMA-Factory)

    Paths:
        --dockerfile PATH          Path to Dockerfile
                                   (default: ./Dockerfile)

        --context-dir PATH         Docker build context directory
                                   (default: .)

    Registry Settings:
        --registry REGISTRY        Docker registry (default: docker.io)
                                   Examples: docker.io, ghcr.io

        --repository REPO          Repository name (REQUIRED)
                                   Example: kvcache/ktransformers

    Options:
        --skip-build               Skip build if image exists locally
        --also-push-simplified     Also push simplified tag (v{ver}-{cuda})
        --max-retries N            Maximum push retries (default: 3)
        --retry-delay SECONDS      Delay between retries (default: 5)
        --dry-run                  Preview commands without executing
        --build-arg KEY=VALUE      Additional build arguments (can be repeated)
        -h, --help                 Show this help message

EXAMPLES:
    # Basic push
    $0 --repository kvcache/ktransformers

    # Push with simplified tag
    $0 \\
        --repository kvcache/ktransformers \\
        --cuda-version 12.8.1 \\
        --also-push-simplified

    # Skip build if image exists
    $0 \\
        --repository kvcache/ktransformers \\
        --skip-build

    # Dry run to preview
    $0 --repository kvcache/ktransformers --dry-run

OUTPUT:
    The image will be pushed with tags:

    Full tag:
      {registry}/{repository}:sglang-v{ver}_ktransformers-v{ver}_{cpu}_{gpu}_{func}_{timestamp}

    Example:
      docker.io/kvcache/ktransformers:sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022

    Simplified tag (if --also-push-simplified):
      {registry}/{repository}:v{ktransformers-ver}-{cuda}

    Example:
      docker.io/kvcache/ktransformers:v0.4.3-cu128

EOF
    exit 0
}

################################################################################
# Argument Parsing
################################################################################

parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --cuda-version)
                CUDA_VERSION="$2"
                shift 2
                ;;
            --ubuntu-mirror)
                UBUNTU_MIRROR="$2"
                shift 2
                ;;
            --http-proxy)
                HTTP_PROXY="$2"
                shift 2
                ;;
            --https-proxy)
                HTTPS_PROXY="$2"
                shift 2
                ;;
            --cpu-variant)
                CPU_VARIANT="$2"
                shift 2
                ;;
            --functionality)
                FUNCTIONALITY="$2"
                shift 2
                ;;
            --dockerfile)
                DOCKERFILE="$2"
                shift 2
                ;;
            --context-dir)
                CONTEXT_DIR="$2"
                shift 2
                ;;
            --registry)
                REGISTRY="$2"
                shift 2
                ;;
            --repository)
                REPOSITORY="$2"
                shift 2
                ;;
            --skip-build)
                SKIP_BUILD=true
                shift
                ;;
            --also-push-simplified)
                ALSO_PUSH_SIMPLIFIED=true
                shift
                ;;
            --max-retries)
                MAX_RETRIES="$2"
                shift 2
                ;;
            --retry-delay)
                RETRY_DELAY="$2"
                shift 2
                ;;
            --dry-run)
                DRY_RUN=true
                shift
                ;;
            --build-arg)
                EXTRA_BUILD_ARGS+=("--build-arg" "$2")
                shift 2
                ;;
            -h|--help)
                usage
                ;;
            *)
                log_error "Unknown option: $1"
                echo "Use -h or --help for usage information"
                exit 1
                ;;
        esac
    done
}

################################################################################
# Validation
################################################################################

validate_config() {
    log_step "Validating configuration"

    # Check Docker is running
    check_docker_running || exit 1

    # Check Docker login
    check_docker_login "$REGISTRY" || exit 1

    # Validate CUDA version
    validate_cuda_version "$CUDA_VERSION" || exit 1

    # Check repository is provided
    if [ -z "$REPOSITORY" ]; then
        log_error "Repository name is required"
        log_error "Use --repository to specify (e.g., kvcache/ktransformers)"
        exit 1
    fi
    log_info "Target repository: $REGISTRY/$REPOSITORY"

    # Check Dockerfile exists
    if [ ! -f "$DOCKERFILE" ]; then
        log_error "Dockerfile not found: $DOCKERFILE"
        exit 1
    fi
    log_info "Using Dockerfile: $DOCKERFILE"

    # Check context directory exists
    if [ ! -d "$CONTEXT_DIR" ]; then
        log_error "Context directory not found: $CONTEXT_DIR"
        exit 1
    fi
    log_info "Using context directory: $CONTEXT_DIR"

    # Validate functionality mode
    if [[ "$FUNCTIONALITY" != "sft" && "$FUNCTIONALITY" != "infer" ]]; then
        log_error "Invalid functionality mode: $FUNCTIONALITY"
        log_error "Must be 'sft' or 'infer'"
        exit 1
    fi

    log_success "Configuration validated"
}

################################################################################
# Build Docker Image
################################################################################

build_image() {
    local temp_tag="ktransformers:temp-push-$(get_beijing_timestamp)"

    # Check if we should skip build
    if [ "$SKIP_BUILD" = true ]; then
        log_info "Checking for existing local image..."
        # Try to find an existing image
        # This is a best-effort search for recent builds
        local existing_image
        existing_image=$(docker images --format "{{.Repository}}:{{.Tag}}" | grep "ktransformers:temp-" | head -1 || echo "")

        if [ -n "$existing_image" ]; then
            log_info "Found existing image: $existing_image"
            echo "$existing_image"
            return 0
        else
            log_warning "No existing image found, will build"
        fi
    fi

    log_step "Building Docker image" >&2
    log_info "Temporary tag: $temp_tag" >&2

    # Prepare build arguments
    local build_args=()
    build_args+=("--build-arg" "CUDA_VERSION=$CUDA_VERSION")
    build_args+=("--build-arg" "UBUNTU_MIRROR=$UBUNTU_MIRROR")
    build_args+=("--build-arg" "CPU_VARIANT=$CPU_VARIANT")
    build_args+=("--build-arg" "BUILD_ALL_CPU_VARIANTS=1")
    build_args+=("--build-arg" "FUNCTIONALITY=$FUNCTIONALITY")

    # Add proxy settings if provided
    if [ -n "$HTTP_PROXY" ]; then
        build_args+=("--build-arg" "HTTP_PROXY=$HTTP_PROXY")
    fi
    if [ -n "$HTTPS_PROXY" ]; then
        build_args+=("--build-arg" "HTTPS_PROXY=$HTTPS_PROXY")
    fi

    # Add extra build args
    build_args+=("${EXTRA_BUILD_ARGS[@]}")

    # Add network host
    build_args+=("--network" "host")

    # Build command
    local build_cmd=(
        docker build
        -f "$DOCKERFILE"
        "${build_args[@]}"
        -t "$temp_tag"
        "$CONTEXT_DIR"
    )

    # Display build command
    {
        log_info "Build command:"
        echo "  ${build_cmd[*]}"
    } >&2

    if [ "$DRY_RUN" = true ]; then
        log_warning "DRY RUN: Skipping actual build" >&2
        return 0
    fi

    # Execute build
    log_info "Starting Docker build (this may take 30-60 minutes)..." >&2
    if "${build_cmd[@]}" >&2; then
        log_success "Docker image built successfully" >&2
        echo "$temp_tag"
    else
        log_error "Docker build failed" >&2
        exit 1
    fi
}

################################################################################
# Generate Tags
################################################################################

generate_tags() {
    local image_tag="$1"
    local timestamp="$2"

    if [ "$DRY_RUN" = true ]; then
        log_warning "DRY RUN: Using placeholder versions"
        # Use placeholder versions for dry run
        local versions="SGLANG_VERSION=0.5.6
KTRANSFORMERS_VERSION=0.4.3
LLAMAFACTORY_VERSION=0.9.3"
    else
        # Extract versions from image
        local versions
        versions=$(extract_versions_from_image "$image_tag")

        if [ $? -ne 0 ]; then
            log_error "Failed to extract versions from image"
            exit 1
        fi

        # Validate versions
        if ! validate_versions "$versions"; then
            log_error "Version validation failed"
            exit 1
        fi
    fi

    # Generate full tag
    local full_tag
    full_tag=$(generate_image_name "$versions" "$CUDA_VERSION" "$CPU_VARIANT" "$FUNCTIONALITY" "$timestamp")

    if [ -z "$full_tag" ]; then
        log_error "Failed to generate image name"
        exit 1
    fi

    echo "FULL_TAG=$full_tag"

    # Generate simplified tag if requested
    if [ "$ALSO_PUSH_SIMPLIFIED" = true ]; then
        local ktrans_ver
        ktrans_ver=$(echo "$versions" | grep "^KTRANSFORMERS_VERSION=" | cut -d= -f2)

        local simplified_tag
        simplified_tag=$(generate_simplified_tag "$ktrans_ver" "$CUDA_VERSION")

        echo "SIMPLIFIED_TAG=$simplified_tag"
    fi
}

################################################################################
# Push to Registry
################################################################################

push_image_with_retry() {
    local source_tag="$1"
    local target_tag="$2"
    local attempt=1

    log_step "Pushing image: $target_tag"

    if [ "$DRY_RUN" = true ]; then
        log_warning "DRY RUN: Skipping actual push"
        log_info "Would execute:"
        echo "  docker tag $source_tag $target_tag"
        echo "  docker push $target_tag"
        return 0
    fi

    # Tag the image
    log_info "Tagging image..."
    if ! docker tag "$source_tag" "$target_tag"; then
        log_error "Failed to tag image"
        return 1
    fi

    # Push with retry logic
    while [ $attempt -le "$MAX_RETRIES" ]; do
        log_info "Push attempt $attempt/$MAX_RETRIES..."

        if docker push "$target_tag"; then
            log_success "Successfully pushed: $target_tag"
            return 0
        else
            log_warning "Push failed (attempt $attempt/$MAX_RETRIES)"

            if [ $attempt -lt "$MAX_RETRIES" ]; then
                log_info "Retrying in ${RETRY_DELAY} seconds..."
                sleep "$RETRY_DELAY"
            fi

            ((attempt++))
        fi
    done

    log_error "Failed to push after $MAX_RETRIES attempts"
    return 1
}

################################################################################
# Main
################################################################################

main() {
    log_step "KTransformers Docker Image Build and Push"

    # Parse arguments
    parse_args "$@"

    # Validate configuration
    validate_config

    # Generate timestamp
    TIMESTAMP=$(get_beijing_timestamp)
    log_info "Build timestamp: $TIMESTAMP"

    # Display configuration
    display_summary "Push Configuration" \
        "CUDA Version: $CUDA_VERSION" \
        "Ubuntu Mirror: $UBUNTU_MIRROR" \
        "CPU Variant: $CPU_VARIANT" \
        "Functionality: $FUNCTIONALITY" \
        "Registry: $REGISTRY" \
        "Repository: $REPOSITORY" \
        "Push Simplified: $ALSO_PUSH_SIMPLIFIED" \
        "Skip Build: $SKIP_BUILD" \
        "HTTP Proxy: ${HTTP_PROXY:-<not set>}" \
        "HTTPS Proxy: ${HTTPS_PROXY:-<not set>}" \
        "Dockerfile: $DOCKERFILE" \
        "Context Dir: $CONTEXT_DIR" \
        "Timestamp: $TIMESTAMP" \
        "Dry Run: $DRY_RUN"

    # Build image
    TEMP_TAG=$(build_image)

    if [ "$DRY_RUN" = true ]; then
        TEMP_TAG="ktransformers:temp-dryrun"
    fi

    # Generate tags
    log_step "Generating tags"
    TAG_INFO=$(generate_tags "$TEMP_TAG" "$TIMESTAMP")

    # Parse tag info
    FULL_TAG=$(echo "$TAG_INFO" | grep "^FULL_TAG=" | cut -d= -f2)
    SIMPLIFIED_TAG=$(echo "$TAG_INFO" | grep "^SIMPLIFIED_TAG=" | cut -d= -f2 || echo "")

    log_info "Full tag: $FULL_TAG"
    if [ -n "$SIMPLIFIED_TAG" ]; then
        log_info "Simplified tag: $SIMPLIFIED_TAG"
    fi

    # Push full tag
    FULL_IMAGE="$REGISTRY/$REPOSITORY:$FULL_TAG"
    if ! push_image_with_retry "$TEMP_TAG" "$FULL_IMAGE"; then
        log_error "Failed to push full tag"
        exit 1
    fi

    # Push simplified tag if requested
    if [ -n "$SIMPLIFIED_TAG" ]; then
        SIMPLIFIED_IMAGE="$REGISTRY/$REPOSITORY:$SIMPLIFIED_TAG"
        if ! push_image_with_retry "$TEMP_TAG" "$SIMPLIFIED_IMAGE"; then
            log_warning "Failed to push simplified tag, but continuing..."
        fi
    fi

    # Cleanup temporary image
    if [ "$DRY_RUN" = false ]; then
        log_step "Cleaning up temporary image"
        cleanup_temp_images "$TEMP_TAG"
    fi

    # Display summary
    local summary_lines=(
        "Successfully pushed images:"
        ""
        "Full tag:"
        "  $FULL_IMAGE"
        ""
    )

    if [ -n "$SIMPLIFIED_TAG" ]; then
        summary_lines+=(
            "Simplified tag:"
            "  $SIMPLIFIED_IMAGE"
            ""
        )
    fi

    summary_lines+=(
        "To pull the image:"
        "  docker pull $FULL_IMAGE"
        ""
        "To run the container:"
        "  docker run -it --rm $FULL_IMAGE /bin/bash"
    )

    display_summary "Push Complete" "${summary_lines[@]}"

    log_success "All done!"
}

# Run main function
main "$@"
#!/usr/bin/env bash
#
# push-to-dockerhub.sh - Build and push Docker image to DockerHub
#
# This script builds a Docker image for ktransformers with standardized naming
# and pushes it to DockerHub with both full and simplified tags.
#
# Features:
# - Automatic version detection
# - Standardized naming convention
# - Multi-CPU variant support (AMX/AVX512/AVX2)
# - Full and simplified tag support
# - Retry logic for network failures
# - Comprehensive error handling
#
# Usage:
#   ./push-to-dockerhub.sh [OPTIONS]
#
# Example:
#   ./push-to-dockerhub.sh \
#     --cuda-version 12.8.1 \
#     --repository kvcache/ktransformers \
#     --also-push-simplified

set -euo pipefail

# Get script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Source utility functions
# shellcheck source=docker-utils.sh
source "$SCRIPT_DIR/docker-utils.sh"

################################################################################
# Default Configuration
################################################################################

# Build parameters
CUDA_VERSION="12.8.1"
UBUNTU_MIRROR="0"
HTTP_PROXY=""
HTTPS_PROXY=""
CPU_VARIANT="x86-intel-multi"
FUNCTIONALITY="sft"

# Paths
DOCKERFILE="$SCRIPT_DIR/Dockerfile"
CONTEXT_DIR="$SCRIPT_DIR"

# Registry settings
REGISTRY="docker.io"
REPOSITORY=""  # Must be provided by user

# Options
DRY_RUN=false
SKIP_BUILD=false
ALSO_PUSH_SIMPLIFIED=false
MAX_RETRIES=3
RETRY_DELAY=5
EXTRA_BUILD_ARGS=()

################################################################################
# Help Message
################################################################################

usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Build and push Docker image to DockerHub with standardized naming.

OPTIONS:
    Build Configuration:
        --cuda-version VERSION      CUDA version (default: 12.8.1)
                                   Examples: 12.8.1, 12.6.1, 13.0.1

        --ubuntu-mirror 0|1         Use Tsinghua mirror for Ubuntu packages
                                   (default: 0)

        --http-proxy URL           HTTP proxy URL
                                   Example: http://127.0.0.1:16981

        --https-proxy URL          HTTPS proxy URL
                                   Example: http://127.0.0.1:16981

        --cpu-variant VARIANT      CPU variant identifier
                                   (default: x86-intel-multi)

        --functionality TYPE       Functionality mode: sft or infer
                                   (default: sft, includes LLaMA-Factory)

    Paths:
        --dockerfile PATH          Path to Dockerfile
                                   (default: ./Dockerfile)

        --context-dir PATH         Docker build context directory
                                   (default: .)

    Registry Settings:
        --registry REGISTRY        Docker registry (default: docker.io)
                                   Examples: docker.io, ghcr.io

        --repository REPO          Repository name (REQUIRED)
                                   Example: kvcache/ktransformers

    Options:
        --skip-build               Skip build if image exists locally
        --also-push-simplified     Also push simplified tag (v{ver}-{cuda})
        --max-retries N            Maximum push retries (default: 3)
        --retry-delay SECONDS      Delay between retries (default: 5)
        --dry-run                  Preview commands without executing
        --build-arg KEY=VALUE      Additional build arguments (can be repeated)
        -h, --help                 Show this help message

EXAMPLES:
    # Basic push
    $0 --repository kvcache/ktransformers

    # Push with simplified tag
    $0 \\
        --repository kvcache/ktransformers \\
        --cuda-version 12.8.1 \\
        --also-push-simplified

    # Skip build if image exists
    $0 \\
        --repository kvcache/ktransformers \\
        --skip-build

    # Dry run to preview
    $0 --repository kvcache/ktransformers --dry-run

OUTPUT:
    The image will be pushed with tags:

    Full tag:
      {registry}/{repository}:sglang-v{ver}_ktransformers-v{ver}_{cpu}_{gpu}_{func}_{timestamp}

    Example:
      docker.io/kvcache/ktransformers:sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022

    Simplified tag (if --also-push-simplified):
      {registry}/{repository}:v{ktransformers-ver}-{cuda}

    Example:
      docker.io/kvcache/ktransformers:v0.4.3-cu128

EOF
    exit 0
}

################################################################################
# Argument Parsing
################################################################################

parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --cuda-version)
                CUDA_VERSION="$2"
                shift 2
                ;;
            --ubuntu-mirror)
                UBUNTU_MIRROR="$2"
                shift 2
                ;;
            --http-proxy)
                HTTP_PROXY="$2"
                shift 2
                ;;
            --https-proxy)
                HTTPS_PROXY="$2"
                shift 2
                ;;
            --cpu-variant)
                CPU_VARIANT="$2"
                shift 2
                ;;
            --functionality)
                FUNCTIONALITY="$2"
                shift 2
                ;;
            --dockerfile)
                DOCKERFILE="$2"
                shift 2
                ;;
            --context-dir)
                CONTEXT_DIR="$2"
                shift 2
                ;;
            --registry)
                REGISTRY="$2"
                shift 2
                ;;
            --repository)
                REPOSITORY="$2"
                shift 2
                ;;
            --skip-build)
                SKIP_BUILD=true
                shift
                ;;
            --also-push-simplified)
                ALSO_PUSH_SIMPLIFIED=true
                shift
                ;;
            --max-retries)
                MAX_RETRIES="$2"
                shift 2
                ;;
            --retry-delay)
                RETRY_DELAY="$2"
                shift 2
                ;;
            --dry-run)
                DRY_RUN=true
                shift
                ;;
            --build-arg)
                EXTRA_BUILD_ARGS+=("--build-arg" "$2")
                shift 2
                ;;
            -h|--help)
                usage
                ;;
            *)
                log_error "Unknown option: $1"
                echo "Use -h or --help for usage information"
                exit 1
                ;;
        esac
    done
}

################################################################################
# Validation
################################################################################

validate_config() {
    log_step "Validating configuration"

    # Check Docker is running
    check_docker_running || exit 1

    # Check Docker login
    check_docker_login "$REGISTRY" || exit 1

    # Validate CUDA version
    validate_cuda_version "$CUDA_VERSION" || exit 1

    # Check repository is provided
    if [ -z "$REPOSITORY" ]; then
        log_error "Repository name is required"
        log_error "Use --repository to specify (e.g., kvcache/ktransformers)"
        exit 1
    fi
    log_info "Target repository: $REGISTRY/$REPOSITORY"

    # Check Dockerfile exists
    if [ ! -f "$DOCKERFILE" ]; then
        log_error "Dockerfile not found: $DOCKERFILE"
        exit 1
    fi
    log_info "Using Dockerfile: $DOCKERFILE"

    # Check context directory exists
    if [ ! -d "$CONTEXT_DIR" ]; then
        log_error "Context directory not found: $CONTEXT_DIR"
        exit 1
    fi
    log_info "Using context directory: $CONTEXT_DIR"

    # Validate functionality mode
    if [[ "$FUNCTIONALITY" != "sft" && "$FUNCTIONALITY" != "infer" ]]; then
        log_error "Invalid functionality mode: $FUNCTIONALITY"
        log_error "Must be 'sft' or 'infer'"
        exit 1
    fi

    log_success "Configuration validated"
}

################################################################################
# Build Docker Image
################################################################################

build_image() {
    local temp_tag="ktransformers:temp-push-$(get_beijing_timestamp)"

    # Check if we should skip build
    if [ "$SKIP_BUILD" = true ]; then
        log_info "Checking for existing local image..."
        # Try to find an existing image
        # This is a best-effort search for recent builds
        local existing_image
        existing_image=$(docker images --format "{{.Repository}}:{{.Tag}}" | grep "ktransformers:temp-" | head -1 || echo "")

        if [ -n "$existing_image" ]; then
            log_info "Found existing image: $existing_image"
            echo "$existing_image"
            return 0
        else
            log_warning "No existing image found, will build"
        fi
    fi

    log_step "Building Docker image" >&2
    log_info "Temporary tag: $temp_tag" >&2

    # Prepare build arguments
    local build_args=()
    build_args+=("--build-arg" "CUDA_VERSION=$CUDA_VERSION")
    build_args+=("--build-arg" "UBUNTU_MIRROR=$UBUNTU_MIRROR")
    build_args+=("--build-arg" "CPU_VARIANT=$CPU_VARIANT")
    build_args+=("--build-arg" "BUILD_ALL_CPU_VARIANTS=1")
    build_args+=("--build-arg" "FUNCTIONALITY=$FUNCTIONALITY")

    # Add proxy settings if provided
    if [ -n "$HTTP_PROXY" ]; then
        build_args+=("--build-arg" "HTTP_PROXY=$HTTP_PROXY")
    fi
    if [ -n "$HTTPS_PROXY" ]; then
        build_args+=("--build-arg" "HTTPS_PROXY=$HTTPS_PROXY")
    fi

    # Add extra build args
    build_args+=("${EXTRA_BUILD_ARGS[@]}")

    # Add network host
    build_args+=("--network" "host")

    # Build command
    local build_cmd=(
        docker build
        -f "$DOCKERFILE"
        "${build_args[@]}"
        -t "$temp_tag"
        "$CONTEXT_DIR"
    )

    # Display build command
    {
        log_info "Build command:"
        echo "  ${build_cmd[*]}"
    } >&2

    if [ "$DRY_RUN" = true ]; then
        log_warning "DRY RUN: Skipping actual build" >&2
        return 0
    fi

    # Execute build
    log_info "Starting Docker build (this may take 30-60 minutes)..." >&2
    if "${build_cmd[@]}" >&2; then
        log_success "Docker image built successfully" >&2
        echo "$temp_tag"
    else
        log_error "Docker build failed" >&2
        exit 1
    fi
}

################################################################################
# Generate Tags
################################################################################

generate_tags() {
    local image_tag="$1"
    local timestamp="$2"

    if [ "$DRY_RUN" = true ]; then
        log_warning "DRY RUN: Using placeholder versions"
        # Use placeholder versions for dry run
        local versions="SGLANG_VERSION=0.5.6
KTRANSFORMERS_VERSION=0.4.3
LLAMAFACTORY_VERSION=0.9.3"
    else
        # Extract versions from image
        local versions
        versions=$(extract_versions_from_image "$image_tag")

        if [ $? -ne 0 ]; then
            log_error "Failed to extract versions from image"
            exit 1
        fi

        # Validate versions
        if ! validate_versions "$versions"; then
            log_error "Version validation failed"
            exit 1
        fi
    fi

    # Generate full tag
    local full_tag
    full_tag=$(generate_image_name "$versions" "$CUDA_VERSION" "$CPU_VARIANT" "$FUNCTIONALITY" "$timestamp")

    if [ -z "$full_tag" ]; then
        log_error "Failed to generate image name"
        exit 1
    fi

    echo "FULL_TAG=$full_tag"

    # Generate simplified tag if requested
    if [ "$ALSO_PUSH_SIMPLIFIED" = true ]; then
        local ktrans_ver
        ktrans_ver=$(echo "$versions" | grep "^KTRANSFORMERS_VERSION=" | cut -d= -f2)

        local simplified_tag
        simplified_tag=$(generate_simplified_tag "$ktrans_ver" "$CUDA_VERSION")

        echo "SIMPLIFIED_TAG=$simplified_tag"
    fi
}

################################################################################
# Push to Registry
################################################################################

push_image_with_retry() {
    local source_tag="$1"
    local target_tag="$2"
    local attempt=1

    log_step "Pushing image: $target_tag"

    if [ "$DRY_RUN" = true ]; then
        log_warning "DRY RUN: Skipping actual push"
        log_info "Would execute:"
        echo "  docker tag $source_tag $target_tag"
        echo "  docker push $target_tag"
        return 0
    fi

    # Tag the image
    log_info "Tagging image..."
    if ! docker tag "$source_tag" "$target_tag"; then
        log_error "Failed to tag image"
        return 1
    fi

    # Push with retry logic
    while [ $attempt -le "$MAX_RETRIES" ]; do
        log_info "Push attempt $attempt/$MAX_RETRIES..."

        if docker push "$target_tag"; then
            log_success "Successfully pushed: $target_tag"
            return 0
        else
            log_warning "Push failed (attempt $attempt/$MAX_RETRIES)"

            if [ $attempt -lt "$MAX_RETRIES" ]; then
                log_info "Retrying in ${RETRY_DELAY} seconds..."
                sleep "$RETRY_DELAY"
            fi

            ((attempt++))
        fi
    done

    log_error "Failed to push after $MAX_RETRIES attempts"
    return 1
}

################################################################################
# Main
################################################################################

main() {
    log_step "KTransformers Docker Image Build and Push"

    # Parse arguments
    parse_args "$@"

    # Validate configuration
    validate_config

    # Generate timestamp
    TIMESTAMP=$(get_beijing_timestamp)
    log_info "Build timestamp: $TIMESTAMP"

    # Display configuration
    display_summary "Push Configuration" \
        "CUDA Version: $CUDA_VERSION" \
        "Ubuntu Mirror: $UBUNTU_MIRROR" \
        "CPU Variant: $CPU_VARIANT" \
        "Functionality: $FUNCTIONALITY" \
        "Registry: $REGISTRY" \
        "Repository: $REPOSITORY" \
        "Push Simplified: $ALSO_PUSH_SIMPLIFIED" \
        "Skip Build: $SKIP_BUILD" \
        "HTTP Proxy: ${HTTP_PROXY:-<not set>}" \
        "HTTPS Proxy: ${HTTPS_PROXY:-<not set>}" \
        "Dockerfile: $DOCKERFILE" \
        "Context Dir: $CONTEXT_DIR" \
        "Timestamp: $TIMESTAMP" \
        "Dry Run: $DRY_RUN"

    # Build image
    TEMP_TAG=$(build_image)

    if [ "$DRY_RUN" = true ]; then
        TEMP_TAG="ktransformers:temp-dryrun"
    fi

    # Generate tags
    log_step "Generating tags"
    TAG_INFO=$(generate_tags "$TEMP_TAG" "$TIMESTAMP")

    # Parse tag info
    FULL_TAG=$(echo "$TAG_INFO" | grep "^FULL_TAG=" | cut -d= -f2)
    SIMPLIFIED_TAG=$(echo "$TAG_INFO" | grep "^SIMPLIFIED_TAG=" | cut -d= -f2 || echo "")

    log_info "Full tag: $FULL_TAG"
    if [ -n "$SIMPLIFIED_TAG" ]; then
        log_info "Simplified tag: $SIMPLIFIED_TAG"
    fi

    # Push full tag
    FULL_IMAGE="$REGISTRY/$REPOSITORY:$FULL_TAG"
    if ! push_image_with_retry "$TEMP_TAG" "$FULL_IMAGE"; then
        log_error "Failed to push full tag"
        exit 1
    fi

    # Push simplified tag if requested
    if [ -n "$SIMPLIFIED_TAG" ]; then
        SIMPLIFIED_IMAGE="$REGISTRY/$REPOSITORY:$SIMPLIFIED_TAG"
        if ! push_image_with_retry "$TEMP_TAG" "$SIMPLIFIED_IMAGE"; then
            log_warning "Failed to push simplified tag, but continuing..."
        fi
    fi

    # Cleanup temporary image
    if [ "$DRY_RUN" = false ]; then
        log_step "Cleaning up temporary image"
        cleanup_temp_images "$TEMP_TAG"
    fi

    # Display summary
    local summary_lines=(
        "Successfully pushed images:"
        ""
        "Full tag:"
        "  $FULL_IMAGE"
        ""
    )

    if [ -n "$SIMPLIFIED_TAG" ]; then
        summary_lines+=(
            "Simplified tag:"
            "  $SIMPLIFIED_IMAGE"
            ""
        )
    fi

    summary_lines+=(
        "To pull the image:"
        "  docker pull $FULL_IMAGE"
        ""
        "To run the container:"
        "  docker run -it --rm $FULL_IMAGE /bin/bash"
    )

    display_summary "Push Complete" "${summary_lines[@]}"

    log_success "All done!"
}

# Run main function
main "$@"


================================================
FILE: install.sh
================================================
#!/usr/bin/env bash
set -euo pipefail

# Resolve the repository root (directory containing this script)
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

usage() {
  cat <<EOF
Usage: $0 [SUBCOMMAND] [OPTIONS]

One-click installer for ktransformers (sglang + kt-kernel).

SUBCOMMANDS:
  all             Full install: submodules → sglang → kt-kernel (default)
  sglang          Install sglang only
  kt-kernel       Install kt-kernel only
  deps            Install system dependencies only
  -h, --help      Show this help message

OPTIONS:
  --skip-sglang       Skip sglang installation (for "all" subcommand)
  --skip-kt-kernel    Skip kt-kernel installation (for "all" subcommand)
  --editable          Install sglang in editable/dev mode (-e)
  --manual            Pass through to kt-kernel (manual CPU config)
  --no-clean          Pass through to kt-kernel (skip build clean)

EXAMPLES:
  # Full install (recommended)
  $0

  # Install everything in editable mode for development
  $0 all --editable

  # Install sglang only
  $0 sglang

  # Install kt-kernel only (manual CPU config)
  $0 kt-kernel --manual

  # Full install, skip sglang (already installed)
  $0 all --skip-sglang

EOF
  exit 1
}

# ─── Helpers ───────────────────────────────────────────────────────────────────

log_step() {
  echo ""
  echo "=========================================="
  echo "  $1"
  echo "=========================================="
  echo ""
}

log_info() {
  echo "[INFO] $1"
}

log_warn() {
  echo "[WARN] $1"
}

log_error() {
  echo "[ERROR] $1" >&2
}

# Read ktransformers version from version.py and export for sglang-kt
read_kt_version() {
  local version_file="$REPO_ROOT/version.py"
  if [ -f "$version_file" ]; then
    KT_VERSION=$(python3 -c "exec(open('$version_file').read()); print(__version__)")
    export SGLANG_KT_VERSION="$KT_VERSION"
    log_info "ktransformers version: $KT_VERSION (will be used for sglang-kt)"
  else
    log_warn "version.py not found; sglang-kt will use its default version"
  fi
}

# ─── Submodule init ────────────────────────────────────────────────────────────

init_submodules() {
  log_step "Initializing git submodules"

  if [ ! -d "$REPO_ROOT/.git" ]; then
    log_warn "Not a git repository. Skipping submodule init."
    log_warn "If you need sglang, clone with: git clone --recursive https://github.com/kvcache-ai/ktransformers.git"
    return 0
  fi

  cd "$REPO_ROOT"
  git submodule update --init --recursive
  log_info "Submodules initialized successfully."
}

# ─── sglang install ───────────────────────────────────────────────────────────

install_sglang() {
  local editable="${1:-0}"

  log_step "Installing sglang (kvcache-ai fork)"

  local sglang_dir="$REPO_ROOT/third_party/sglang"
  local pyproject="$sglang_dir/python/pyproject.toml"

  if [ ! -f "$pyproject" ]; then
    log_error "sglang source not found at $sglang_dir"
    log_error "Run 'git submodule update --init --recursive' first, or clone with --recursive."
    exit 1
  fi

  cd "$sglang_dir"

  if [ "$editable" = "1" ]; then
    log_info "Installing sglang in editable mode..."
    pip install -e "./python[all]"
  else
    log_info "Installing sglang..."
    pip install "./python[all]"
  fi

  log_info "sglang installed successfully."
}

# ─── kt-kernel install ────────────────────────────────────────────────────────

install_kt_kernel() {
  # Forward all remaining args to kt-kernel/install.sh
  local kt_args=("$@")

  log_step "Installing kt-kernel"

  local kt_install="$REPO_ROOT/kt-kernel/install.sh"

  if [ ! -f "$kt_install" ]; then
    log_error "kt-kernel/install.sh not found at $kt_install"
    exit 1
  fi

  cd "$REPO_ROOT/kt-kernel"
  bash ./install.sh build "${kt_args[@]}"
}

# ─── deps install ─────────────────────────────────────────────────────────────

install_deps() {
  log_step "Installing system dependencies"

  local kt_install="$REPO_ROOT/kt-kernel/install.sh"

  if [ ! -f "$kt_install" ]; then
    log_error "kt-kernel/install.sh not found at $kt_install"
    exit 1
  fi

  cd "$REPO_ROOT/kt-kernel"
  bash ./install.sh deps
}

# ─── "all" subcommand ─────────────────────────────────────────────────────────

install_all() {
  local skip_sglang=0
  local skip_kt_kernel=0
  local editable=0
  local kt_args=()

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --skip-sglang)    skip_sglang=1; shift ;;
      --skip-kt-kernel) skip_kt_kernel=1; shift ;;
      --editable)       editable=1; shift ;;
      --manual)         kt_args+=("--manual"); shift ;;
      --no-clean)       kt_args+=("--no-clean"); shift ;;
      -h|--help)        usage ;;
      *)
        log_error "Unknown option: $1"
        usage
        ;;
    esac
  done

  # 1. Init submodules
  init_submodules

  # 2. System dependencies
  install_deps

  # 3. Read version for sglang-kt
  read_kt_version

  # 4. Install sglang
  if [ "$skip_sglang" = "0" ]; then
    install_sglang "$editable"
  else
    log_info "Skipping sglang installation (--skip-sglang)."
  fi

  # 4. Build & install kt-kernel
  if [ "$skip_kt_kernel" = "0" ]; then
    install_kt_kernel "${kt_args[@]}"
  else
    log_info "Skipping kt-kernel installation (--skip-kt-kernel)."
  fi

  log_step "Installation complete!"
  echo "  Verify with: kt doctor"
  echo ""
}

# ─── Subcommand dispatcher ────────────────────────────────────────────────────

SUBCMD="all"
if [[ $# -gt 0 ]]; then
  case "$1" in
    all|sglang|kt-kernel|deps)
      SUBCMD="$1"
      shift
      ;;
    -h|--help)
      usage
      ;;
    -*)
      # Flags without subcommand → default to "all"
      SUBCMD="all"
      ;;
    *)
      log_error "Unknown subcommand: $1"
      usage
      ;;
  esac
fi

case "$SUBCMD" in
  all)
    install_all "$@"
    ;;
  sglang)
    # Parse sglang-specific options
    editable=0
    while [[ $# -gt 0 ]]; do
      case "$1" in
        --editable) editable=1; shift ;;
        -h|--help) usage ;;
        *) log_error "Unknown option for sglang: $1"; usage ;;
      esac
    done
    init_submodules
    read_kt_version
    install_sglang "$editable"
    ;;
  kt-kernel)
    install_kt_kernel "$@"
    ;;
  deps)
    install_deps
    ;;
esac


================================================
FILE: kt-kernel/.clang-format
================================================
---
Language:        Cpp
# BasedOnStyle:  Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignArrayOfStructures: None
AlignConsecutiveAssignments:
  Enabled:         false
  AcrossEmptyLines: false
  AcrossComments:  false
  AlignCompound:   false
  AlignFunctionPointers: false
  PadOperators:    true
AlignConsecutiveBitFields:
  Enabled:         false
  AcrossEmptyLines: false
  AcrossComments:  false
  AlignCompound:   false
  AlignFunctionPointers: false
  PadOperators:    false
AlignConsecutiveDeclarations:
  Enabled:         false
  AcrossEmptyLines: false
  AcrossComments:  false
  AlignCompound:   false
  AlignFunctionPointers: false
  PadOperators:    false
AlignConsecutiveMacros:
  Enabled:         false
  AcrossEmptyLines: false
  AcrossComments:  false
  AlignCompound:   false
  AlignFunctionPointers: false
  PadOperators:    false
AlignConsecutiveShortCaseStatements:
  Enabled:         false
  AcrossEmptyLines: false
  AcrossComments:  false
  AlignCaseColons: false
AlignEscapedNewlines: Left
AlignOperands:   Align
AlignTrailingComments:
  Kind:            Always
  OverEmptyLines:  0
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowBreakBeforeNoexceptSpecifier: Never
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortCompoundRequirementOnASingleLine: true
AllowShortEnumsOnASingleLine: true
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: WithoutElse
AllowShortLambdasOnASingleLine: All
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
AttributeMacros:
  - __capability
BinPackArguments: true
BinPackParameters: true
BitFieldColonSpacing: Both
BraceWrapping:
  AfterCaseLabel:  false
  AfterClass:      false
  AfterControlStatement: Never
  AfterEnum:       false
  AfterExternBlock: false
  AfterFunction:   false
  AfterNamespace:  false
  AfterObjCDeclaration: false
  AfterStruct:     false
  AfterUnion:      false
  BeforeCatch:     false
  BeforeElse:      false
  BeforeLambdaBody: false
  BeforeWhile:     false
  IndentBraces:    false
  SplitEmptyFunction: true
  SplitEmptyRecord: true
  SplitEmptyNamespace: true
BreakAdjacentStringLiterals: true
BreakAfterAttributes: Leave
BreakAfterJavaFieldAnnotations: false
BreakArrays:     true
BreakBeforeBinaryOperators: None
BreakBeforeConceptDeclarations: Always
BreakBeforeBraces: Attach
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeColon
BreakInheritanceList: BeforeColon
BreakStringLiterals: true
ColumnLimit:     120
CommentPragmas:  '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: true
DisableFormat:   false
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: LogicalBlock
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
  - foreach
  - Q_FOREACH
  - BOOST_FOREACH
IfMacros:
  - KJ_IF_MAYBE
IncludeBlocks:   Regroup
IncludeCategories:
  - Regex:           '^<ext/.*\.h>'
    Priority:        2
    SortPriority:    0
    CaseSensitive:   false
  - Regex:           '^<.*\.h>'
    Priority:        1
    SortPriority:    0
    CaseSensitive:   false
  - Regex:           '^<.*'
    Priority:        2
    SortPriority:    0
    CaseSensitive:   false
  - Regex:           '.*'
    Priority:        3
    SortPriority:    0
    CaseSensitive:   false
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseBlocks: false
IndentCaseLabels: true
IndentExternBlock: AfterExternBlock
IndentGotoLabels: true
IndentPPDirectives: None
IndentRequiresClause: true
IndentWidth:     2
IndentWrappedFunctionNames: false
InsertBraces:    false
InsertNewlineAtEOF: false
InsertTrailingCommas: None
IntegerLiteralSeparator:
  Binary:          0
  BinaryMinDigits: 0
  Decimal:         0
  DecimalMinDigits: 0
  Hex:             0
  HexMinDigits:    0
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
KeepEmptyLinesAtEOF: false
LambdaBodyIndentation: Signature
LineEnding:      DeriveLF
MacroBlockBegin: ''
MacroBlockEnd:   ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 2
ObjCBreakBeforeNestedBlockParam: true
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PackConstructorInitializers: NextLine
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakOpenParenthesis: 0
PenaltyBreakScopeResolution: 500
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyIndentedWhitespace: 0
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
PPIndentWidth:   -1
QualifierAlignment: Leave
RawStringFormats:
  - Language:        Cpp
    Delimiters:
      - cc
      - CC
      - cpp
      - Cpp
      - CPP
      - 'c++'
      - 'C++'
    CanonicalDelimiter: ''
    BasedOnStyle:    google
  - Language:        TextProto
    Delimiters:
      - pb
      - PB
      - proto
      - PROTO
    EnclosingFunctions:
      - EqualsProto
      - EquivToProto
      - PARSE_PARTIAL_TEXT_PROTO
      - PARSE_TEST_PROTO
      - PARSE_TEXT_PROTO
      - ParseTextOrDie
      - ParseTextProtoOrDie
      - ParseTestProto
      - ParsePartialTestProto
    CanonicalDelimiter: pb
    BasedOnStyle:    google
ReferenceAlignment: Pointer
ReflowComments:  true
RemoveBracesLLVM: false
RemoveParentheses: Leave
RemoveSemicolon: false
RequiresClausePosition: OwnLine
RequiresExpressionIndentation: OuterScope
SeparateDefinitionBlocks: Leave
ShortNamespaceLines: 1
SkipMacroDefinitionBody: false
SortIncludes:    CaseSensitive
SortJavaStaticImport: Before
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceAroundPointerQualifiers: Default
SpaceBeforeAssignmentOperators: true
SpaceBeforeCaseColon: false
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeJsonColon: false
SpaceBeforeParens: ControlStatements
SpaceBeforeParensOptions:
  AfterControlStatements: true
  AfterForeachMacros: true
  AfterFunctionDefinitionName: false
  AfterFunctionDeclarationName: false
  AfterIfMacros:   true
  AfterOverloadedOperator: false
  AfterPlacementOperator: true
  AfterRequiresInClause: false
  AfterRequiresInExpression: false
  BeforeNonEmptyParentheses: false
SpaceBeforeRangeBasedForLoopColon: true
SpaceBeforeSquareBrackets: false
SpaceInEmptyBlock: false
SpacesBeforeTrailingComments: 2
SpacesInAngles:  Never
SpacesInContainerLiterals: true
SpacesInLineCommentPrefix:
  Minimum:         1
  Maximum:         -1
SpacesInParens:  Never
SpacesInParensOptions:
  InCStyleCasts:   false
  InConditionalStatements: false
  InEmptyParentheses: false
  Other:           false
SpacesInSquareBrackets: false
Standard:        Auto
StatementAttributeLikeMacros:
  - Q_EMIT
StatementMacros:
  - Q_UNUSED
  - QT_REQUIRE_VERSION
TabWidth:        2
UseTab:          Never
VerilogBreakBetweenInstancePorts: true
WhitespaceSensitiveMacros:
  - BOOST_PP_STRINGIZE
  - CF_SWIFT_NAME
  - NS_SWIFT_NAME
  - PP_STRINGIZE
  - STRINGIZE
...


================================================
FILE: kt-kernel/.githooks/commit-msg
================================================
#!/bin/sh
# commit-msg hook to enforce Conventional Commits (https://www.conventionalcommits.org/)
# This script checks the commit message subject (first line) for a conventional commit format.
# If the message does not conform, the hook exits non-zero to block the commit.

# Read the commit message (first line)
if [ -z "$1" ]; then
  echo "commit-msg hook: no message file provided" >&2
  exit 0
fi

MSG_FILE="$1"
read -r FIRST_LINE < "$MSG_FILE" || FIRST_LINE=""

# Trim leading/trailing whitespace
FIRST_LINE="$(echo "$FIRST_LINE" | sed -e 's/^[ \t]*//' -e 's/[ \t]*$//')"

# Allow empty message (let git handle it), or allow merges/reverts
case "$FIRST_LINE" in
  Merge:*|merge:*|Revert:*|revert:*)
    exit 0
    ;;
esac

# Conventional Commit regex (POSIX ERE)
# [type](scope)!?: subject
# types: feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip
# scope: any chars except )

regex='^\[(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip)\](\([^\)]+\))?(!)?: .+'

printf "%s" "$FIRST_LINE" | grep -E "$regex" >/dev/null 2>&1
if [ $? -eq 0 ]; then
  exit 0
fi

cat <<'EOF' >&2
ERROR: Commit message does not follow Conventional Commits.

Expected format:
  [type](scope)?: subject

Examples:
  [feat]: add new feature
  [fix(parser)]: handle edge case
  [docs]!: update API docs (breaking change)

Allowed types: feat, fix, docs, style, refactor, perf, test, build, ci, chore, revert, wip

You can bypass this hook locally by running:
  git commit --no-verify
EOF

exit 1


================================================
FILE: kt-kernel/.githooks/pre-commit
================================================
#!/usr/bin/bash
# Pre-commit hook: run clang-format via kt-kernel's CMake 'format' target and Black for Python
# before allowing commit. If formatting makes changes, stage them and abort so user can review.
set -euo pipefail

REPO_ROOT="$(git rev-parse --show-toplevel)"
# kt-kernel project directory within the monorepo
KERNEL_DIR="$REPO_ROOT/kt-kernel"
# Relative path for matching staged files under repo root
REL_KERNEL_DIR="kt-kernel"
BUILD_DIR="$KERNEL_DIR/build"
FORMAT_TARGET="format"
CLANG_FORMAT_BIN="${CLANG_FORMAT_BIN:-clang-format}"
BLACK_BIN="${BLACK_BIN:-black}"

# Simple check clang-format present (optional)
# clang-format optional: if missing, skip C/C++ formatting
if ! command -v "$CLANG_FORMAT_BIN" >/dev/null 2>&1; then
  echo "[pre-commit] clang-format not found (looked for $CLANG_FORMAT_BIN). Skipping C/C++ format." >&2
fi

# black optional: if missing, skip Python formatting
if ! command -v "$BLACK_BIN" >/dev/null 2>&1; then
  echo "[pre-commit] black not found (looked for $BLACK_BIN). Skipping Python format." >&2
fi

## Format only staged changes within kt-kernel
# Collect staged files (Added/Modified/Copied/Renamed)
mapfile -d '' STAGED < <(git diff --cached --name-only -z --diff-filter=AMCR)

PY_CHANGED=()
CPP_CHANGED=()

for f in "${STAGED[@]}"; do
  case "$f" in
    "$REL_KERNEL_DIR"/*)
      ext="${f##*.}"
      case "$ext" in
        py)
          PY_CHANGED+=("$f")
          ;;
        c|cc|cpp|cxx|h|hh|hpp|hxx|cu|cuh)
          CPP_CHANGED+=("$f")
          ;;
      esac
      ;;
  esac
done

# Run clang-format only on staged C/C++ files
if command -v "$CLANG_FORMAT_BIN" >/dev/null 2>&1 && [ ${#CPP_CHANGED[@]} -gt 0 ]; then
  echo "[pre-commit] clang-format on ${#CPP_CHANGED[@]} files" >&2
  for f in "${CPP_CHANGED[@]}"; do
    "$CLANG_FORMAT_BIN" -i "$f"
  done
fi

## Run black only on staged Python files
if command -v "$BLACK_BIN" >/dev/null 2>&1 && [ ${#PY_CHANGED[@]} -gt 0 ]; then
  echo "[pre-commit] black on ${#PY_CHANGED[@]} files" >&2
  "$BLACK_BIN" "${PY_CHANGED[@]}"
fi

# Stage any formatting changes for tracked, formatted files only
FMT_FILES=("${PY_CHANGED[@]}" "${CPP_CHANGED[@]}")
if [ ${#FMT_FILES[@]} -gt 0 ] && ! git diff --quiet --exit-code -- "${FMT_FILES[@]}"; then
  echo "[pre-commit] Formatting applied; updating index." >&2
  git add "${FMT_FILES[@]}"
  echo "[pre-commit] Re-run git commit to proceed after reviewing changes." >&2
  exit 1
fi

echo "[pre-commit] format OK." >&2
exit 0


================================================
FILE: kt-kernel/.gitignore
================================================
debug/
debug_prefill/
debug_decode/
debug1/
debug2/
.gdbinit
bp.gdb
.gdb_history
build/
# local git hooks installer and hooks
.clangd
.cache
tmp*
.vscode/
*.egg-info/
*.pyc
*.so
sparse_logs/
build-cm/
*.so
sparse_logs/

================================================
FILE: kt-kernel/.gitmodules
================================================
[submodule "pybind11"]
	path = third_party/pybind11
	url = https://github.com/pybind/pybind11.git
[submodule "llama.cpp"]
	path = third_party/llama.cpp
	url = https://github.com/ggerganov/llama.cpp.git


================================================
FILE: kt-kernel/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.16)

# Toggle: default to system compilers; optionally use conda toolchain
option(USE_CONDA_TOOLCHAIN "Use C/C++ compilers and libraries from active conda env" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
option(LLAMA_AVX "llama: enable AVX" OFF)
option(LLAMA_AVX2 "llama: enable AVX2" OFF)
# AVX512 options will be auto-detected by cmake/DetectCPU.cmake
# Users can override with -DLLAMA_AVX512=OFF etc.
option(LLAMA_FMA "llama: enable FMA" OFF)
# in MSVC F16C is implied with AVX2/AVX512
if(NOT MSVC)
    option(LLAMA_F16C "llama: enable F16C" OFF)
endif()
option(LLAMA_AVX512_FANCY_SIMD "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI" OFF)
option(KTRANSFORMERS_USE_CUDA "ktransformers: use CUDA" OFF)
option(KTRANSFORMERS_USE_MUSA "ktransformers: use MUSA" OFF)
option(KTRANSFORMERS_USE_ROCM "ktransformers: use ROCM" OFF)
option(KTRANSFORMERS_CUDA_STATIC_RUNTIME "ktransformers: statically link CUDA runtime" ON)
option(KTRANSFORMERS_CPU_USE_KML "ktransformers: CPU use KML" OFF)
option(KTRANSFORMERS_CPU_USE_AMX_AVX512 "ktransformers: CPU use AMX or AVX512" OFF)
option(KTRANSFORMERS_CPU_USE_AMX "ktransformers: CPU use AMX" OFF)
option(KTRANSFORMERS_CPU_DEBUG "ktransformers: DEBUG CPU use AMX" OFF)
option(KTRANSFORMERS_CPU_MLA "ktransformers: CPU use MLA" OFF)
option(KTRANSFORMERS_CPU_MOE_KERNEL "ktransformers: CPU use moe kernel" OFF)
option(KTRANSFORMERS_CPU_MOE_AMD "ktransformers: CPU use moe kernel for amd" OFF)
# LTO control
option(CPUINFER_ENABLE_LTO "Enable link time optimization (IPO)" OFF)

project(kt_kernel_ext VERSION 0.5.0)

# Auto-detect CPU features early (unless building with LLAMA_NATIVE)
if(NOT LLAMA_NATIVE AND NOT MSVC)
    include(cmake/DetectCPU.cmake)
endif()

# Choose compilers BEFORE project() so CMake honors them
if(USE_CONDA_TOOLCHAIN)
    if(NOT DEFINED ENV{CONDA_PREFIX} OR NOT EXISTS "$ENV{CONDA_PREFIX}")
        message(FATAL_ERROR "USE_CONDA_TOOLCHAIN=ON but CONDA_PREFIX is not set. Activate your conda env or pass -DCONDA_PREFIX=/path")
    endif()
    # Locate conda GCC wrappers
    find_program(CONDA_CC NAMES x86_64-conda-linux-gnu-cc HINTS "$ENV{CONDA_PREFIX}/bin")
    find_program(CONDA_CXX NAMES x86_64-conda-linux-gnu-c++ HINTS "$ENV{CONDA_PREFIX}/bin")
    if(NOT CONDA_CC OR NOT CONDA_CXX)
        message(FATAL_ERROR "Conda compilers not found in $ENV{CONDA_PREFIX}/bin (expected x86_64-conda-linux-gnu-cc/c++).")
    endif()
    set(CMAKE_C_COMPILER   ${CONDA_CC}  CACHE FILEPATH "C compiler" FORCE)
    set(CMAKE_CXX_COMPILER ${CONDA_CXX} CACHE FILEPATH "C++ compiler" FORCE)
else()
    # Prefer system compilers explicitly to avoid accidentally picking conda wrappers from PATH
    if(EXISTS "/usr/bin/gcc" AND EXISTS "/usr/bin/g++")
        set(CMAKE_C_COMPILER   "/usr/bin/gcc" CACHE FILEPATH "C compiler" FORCE)
        set(CMAKE_CXX_COMPILER "/usr/bin/g++" CACHE FILEPATH "C++ compiler" FORCE)
    endif()
endif()


# If explicitly using conda toolchain, prefer its libraries/headers and RPATH
if(USE_CONDA_TOOLCHAIN)
    message(STATUS "Conda prefix detected: $ENV{CONDA_PREFIX}; prioritizing it for search paths and RPATH")
    # Make conda come first for CMake package discovery
    list(PREPEND CMAKE_PREFIX_PATH
        "$ENV{CONDA_PREFIX}"
        "$ENV{CONDA_PREFIX}/lib/cmake"
        "$ENV{CONDA_PREFIX}/share/cmake"
    )
    # Also hint direct include/lib searches
    list(PREPEND CMAKE_LIBRARY_PATH "$ENV{CONDA_PREFIX}/lib")
    list(PREPEND CMAKE_INCLUDE_PATH "$ENV{CONDA_PREFIX}/include")

    # Ensure pkg-config prefers conda .pc files
    set(ENV{PKG_CONFIG_PATH} "$ENV{CONDA_PREFIX}/lib/pkgconfig:$ENV{CONDA_PREFIX}/share/pkgconfig:$ENV{PKG_CONFIG_PATH}")
    # Make FindPkgConfig also search CMAKE_PREFIX_PATH
    set(PKG_CONFIG_USE_CMAKE_PREFIX_PATH ON)

    # Configure RPATH so the built extension prefers conda's shared libs at runtime
    # Use install RPATH during build to avoid mixing with implicit system paths
    set(CMAKE_SKIP_BUILD_RPATH FALSE)
    set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
    set(CMAKE_BUILD_RPATH "$ENV{CONDA_PREFIX}/lib")
    set(CMAKE_INSTALL_RPATH "$ENV{CONDA_PREFIX}/lib")
    # Do not auto-append link directories to RPATH; we want only conda path here
    set(CMAKE_INSTALL_RPATH_USE_LINK_PATH OFF)
endif()

## Ensure git hooks are installed when configuring the project (monorepo-aware)
# If we are inside a git worktree (repo root is outside kt-kernel now), invoke the installer
# which will link kt-kernel/.githooks into the top-level .git/hooks. Otherwise, skip.
find_program(GIT_BIN git)
if(GIT_BIN)
    execute_process(
        COMMAND "${GIT_BIN}" rev-parse --show-toplevel
        WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
        OUTPUT_VARIABLE _GIT_TOP
        RESULT_VARIABLE _GIT_RV
        OUTPUT_STRIP_TRAILING_WHITESPACE
        ERROR_QUIET
    )
    if(_GIT_RV EQUAL 0 AND EXISTS "${_GIT_TOP}/.git" AND IS_DIRECTORY "${_GIT_TOP}/.git")
        if(EXISTS "${CMAKE_SOURCE_DIR}/scripts/install-git-hooks.sh")
            message(STATUS "Detected git worktree at ${_GIT_TOP}; installing hooks from kt-kernel/.githooks")
            execute_process(
                COMMAND sh "${CMAKE_SOURCE_DIR}/scripts/install-git-hooks.sh"
                WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
                RESULT_VARIABLE _INSTALL_GIT_HOOKS_RESULT
                OUTPUT_VARIABLE _INSTALL_GIT_HOOKS_OUT
                ERROR_VARIABLE _INSTALL_GIT_HOOKS_ERR
            )
            if(NOT _INSTALL_GIT_HOOKS_RESULT EQUAL 0)
                message(FATAL_ERROR "Installing git hooks failed (exit ${_INSTALL_GIT_HOOKS_RESULT}).\nOutput:\n${_INSTALL_GIT_HOOKS_OUT}\nError:\n${_INSTALL_GIT_HOOKS_ERR}")
            endif()
        else()
            message(FATAL_ERROR "Required script 'scripts/install-git-hooks.sh' not found in kt-kernel; cannot install hooks.")
        endif()
    else()
        message(STATUS "No git worktree detected; skipping git hooks installation")
    endif()
else()
    message(STATUS "git not found; skipping git hooks installation")
endif()

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Use header-only fmt to avoid needing to link libfmt (fix undefined symbol vprint)
add_compile_definitions(FMT_HEADER_ONLY)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -ffast-math")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -fsanitize=address -fno-omit-frame-pointer")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
find_package(OpenMP REQUIRED)
message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")


include(CheckCXXCompilerFlag)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)


# instruction set specific
if(LLAMA_NATIVE)
    set(INS_ENB OFF)
else()
    set(INS_ENB ON)
endif()
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
#       feel free to update the Makefile for your architecture and send a pull request or issue
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")

set(ARCH_FLAGS "")

if(CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
    (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
         CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
    message(STATUS "ARM detected")
    if(MSVC)
        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
        add_compile_definitions(__ARM_NEON)
        add_compile_definitions(__ARM_FEATURE_FMA)

        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
        if(GGML_COMPILER_SUPPORT_DOTPROD)
            add_compile_definitions(__ARM_FEATURE_DOTPROD)
        endif()
        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
        if(GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
        endif()
        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
    else()
        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
        if(NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
        endif()
        if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
            # Raspberry Pi 1, Zero
            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
        endif()
        if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
            if("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
                # Android armeabi-v7a
                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
            else()
                # Raspberry Pi 2
                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
            endif()
        endif()
        if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
            # Android arm64-v8a
            # Raspberry Pi 3, 4, Zero 2 (32-bit)
            list(APPEND ARCH_FLAGS -mno-unaligned-access)
        endif()
        # add_compile_definitions(__ARM_NEON)
        # list(APPEND ARCH_FLAGS -march=armv8.2-a+fp16+dotprod)
        # add_compile_definitions(__ARM_FEATURE_DOTPROD)
        # add_compile_definitions(__aarch64__)

        # add_compile_definitions(__ARM_NEON)
        list(APPEND ARCH_FLAGS -march=armv8.2-a+fp16+dotprod+sve+bf16)
        # list(APPEND ARCH_FLAGS -march=armv8-a+dotprod+sha3+sm4+fp16fml+sve+rng+sb+ssbs+i8mm+bf16+flagm+pauth)
        # add_compile_definitions(__ARM_FEATURE_DOTPROD)
        # add_compile_definitions(__ARM_FEATURE_SVE)
        # add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
        # add_compile_definitions(__aarch64__)
    endif()
elseif(CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
    (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
    message(STATUS "x86 detected")
    set(HOST_IS_X86 TRUE)
    add_compile_definitions(__x86_64__)
    if(MSVC)
        # instruction set detection for MSVC only
        if(LLAMA_NATIVE)
            include(cmake/FindSIMD.cmake)
        endif()
        if(LLAMA_AVX512)
            list(APPEND ARCH_FLAGS /arch:AVX512)
            # MSVC has no compile-time flags enabling specific
            # AVX512 extensions, neither it defines the
            # macros corresponding to the extensions.
            # Do it manually.
            if(LLAMA_AVX512_VBMI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
            endif()
            if(LLAMA_AVX512_VNNI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
            if(LLAMA_AVX512_FANCY_SIMD)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VL__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VL__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BW__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BW__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512DQ__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512DQ__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
            if(LLAMA_AVX512_BF16)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
            endif()
        elseif(LLAMA_AVX2)
            list(APPEND ARCH_FLAGS /arch:AVX2)
        elseif(LLAMA_AVX)
            list(APPEND ARCH_FLAGS /arch:AVX)
        endif()
    else()
        if(LLAMA_NATIVE)
            list(APPEND ARCH_FLAGS -mfma -mavx -mavx2)
            list(APPEND ARCH_FLAGS -march=native)
        endif()
        if(LLAMA_F16C)
            list(APPEND ARCH_FLAGS -mf16c)
        endif()
        if(LLAMA_FMA)
            list(APPEND ARCH_FLAGS -mfma)
        endif()
        if(LLAMA_AVX)
            list(APPEND ARCH_FLAGS -mavx -mfma -msse3 -mf16c)
            message(WARNING "pure AVX is not supported at least avx2")
        endif()
        if(LLAMA_AVX2)
            list(APPEND ARCH_FLAGS -mavx2 -mfma -msse3 -mf16c)
        endif()
        if(LLAMA_AVX512)
            list(APPEND ARCH_FLAGS -mavx512f -mavx512bw -mavx512dq -mfma -mf16c -msse3)
        endif()
        if(LLAMA_AVX512_VBMI)
            list(APPEND ARCH_FLAGS -mavx512vbmi)
        endif()
        if(LLAMA_AVX512_VNNI)
            list(APPEND ARCH_FLAGS -mavx512vnni)
        endif()
        if(LLAMA_AVX512_FANCY_SIMD)
            message(STATUS "AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI enabled")
            list(APPEND ARCH_FLAGS -mavx512vl)
            list(APPEND ARCH_FLAGS -mavx512bw)
            list(APPEND ARCH_FLAGS -mavx512dq)
            list(APPEND ARCH_FLAGS -mavx512vnni)
            list(APPEND ARCH_FLAGS -mavx512vpopcntdq)
        endif()
        if(LLAMA_AVX512_BF16)
            list(APPEND ARCH_FLAGS -mavx512bf16)
        endif()
    endif()
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
    message(STATUS "PowerPC detected")
    if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
    else()
        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
    endif()
else()
    message(STATUS "Unknown architecture")
endif()

if(NOT EXISTS $ENV{ROCM_PATH})
    if(NOT EXISTS /opt/rocm)
        set(ROCM_PATH /usr)
    else()
        set(ROCM_PATH /opt/rocm)
    endif()
else()
    set(ROCM_PATH $ENV{ROCM_PATH})
endif()

list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")

if(NOT EXISTS $ENV{MUSA_PATH})
    if(NOT EXISTS /opt/musa)
        set(MUSA_PATH /usr/local/musa)
    else()
        set(MUSA_PATH /opt/musa)
    endif()
else()
    set(MUSA_PATH $ENV{MUSA_PATH})
endif()

list(APPEND CMAKE_MODULE_PATH "${MUSA_PATH}/cmake")

if(KTRANSFORMERS_CPU_MOE_AMD)
    set(BLIS_ROOT "" CACHE PATH "Root directory of BLIS installation")
    set(_BLIS_SEARCH_DIRS)
    if(BLIS_ROOT)
        list(APPEND _BLIS_SEARCH_DIRS "${BLIS_ROOT}")
    endif()
    list(APPEND _BLIS_SEARCH_DIRS "/usr/local" "/usr")

    find_path(BLIS_INCLUDE_DIR
        NAMES blis.h
        HINTS ${_BLIS_SEARCH_DIRS}
        PATH_SUFFIXES include include/blis
    )
    find_library(BLIS_LIBRARY
        NAMES blis
        HINTS ${_BLIS_SEARCH_DIRS}
        PATH_SUFFIXES lib lib64
    )

    if(NOT BLIS_INCLUDE_DIR OR NOT BLIS_LIBRARY)
        message(WARNING "BLIS not found; set BLIS_ROOT or specify BLIS_INCLUDE_DIR/BLIS_LIBRARY")
    else()
        message(STATUS "Found BLIS include at ${BLIS_INCLUDE_DIR}")
        message(STATUS "Found BLIS library ${BLIS_LIBRARY}")
        set(_KT_BLIS_INCLUDE_DIR ${BLIS_INCLUDE_DIR})
        set(_KT_BLIS_LIBRARY ${BLIS_LIBRARY})
    endif()
    # The Python extension target (${PROJECT_NAME}) is created later by
    # pybind11_add_module(). Calling target_include_directories/target_link_libraries
    # here would fail because the target doesn't exist yet. Save the discovered
    # BLIS paths and apply them after the module target is created.
endif()


if(HOST_IS_X86)
    if(KTRANSFORMERS_CPU_USE_AMX_AVX512)
        add_compile_definitions(USE_AMX_AVX_KERNEL=1)
        if(KTRANSFORMERS_CPU_USE_AMX)
            add_compile_definitions(HAVE_AMX=1)
            list(APPEND ARCH_FLAGS -mamx-tile -mamx-bf16 -mamx-int8)
            message(STATUS "AMX enabled")
        endif()
        # add_executable(amx-test ${CMAKE_CURRENT_SOURCE_DIR}/operators/amx/amx-test.cpp)
        # target_link_libraries(amx-test llama)
        if(KTRANSFORMERS_CPU_DEBUG)
            file(GLOB AMX_TEST_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/operators/amx/test/*.cpp")
            foreach(test_src ${AMX_TEST_SOURCES})
                # 获取不带扩展名的文件名作为 target 名
                get_filename_component(test_name ${test_src} NAME_WE)
                add_executable(${test_name} ${test_src} ${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend/shared_mem_buffer.cpp)
                target_link_libraries(${test_name} llama OpenMP::OpenMP_CXX numa)
            endforeach()
        endif()

        # AVX512 extensions are auto-detected by cmake/DetectCPU.cmake
        # Users can override with -DLLAMA_AVX512_BF16=OFF etc.
        # Only add -mf16c if LLAMA_F16C is not already enabled.
        if(NOT LLAMA_F16C)
            list(APPEND ARCH_FLAGS -mf16c)
        endif()
        message(STATUS "AVX512 extensions: F=${LLAMA_AVX512}, BF16=${LLAMA_AVX512_BF16}, VNNI=${LLAMA_AVX512_VNNI}, VBMI=${LLAMA_AVX512_VBMI}")
    endif()
endif()

message(STATUS "ARCH_FLAGS: ${ARCH_FLAGS}")

add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")

add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../third_party/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/third_party/pybind11)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../third_party/llama.cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/llama.cpp)

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../third_party)
if(KTRANSFORMERS_USE_CUDA)
    include(CheckLanguage)
    check_language(CUDA)
    if(CMAKE_CUDA_COMPILER)
        message(STATUS "CUDA detected")
        find_package(CUDAToolkit REQUIRED)
        include_directories(${CUDAToolkit_INCLUDE_DIRS})
    else()
        message(FATAL_ERROR "KTRANSFORMERS_USE_CUDA=ON but CUDA compiler not found")
    endif()
    message(STATUS "enabling CUDA")
    enable_language(CUDA)
    add_compile_definitions(KTRANSFORMERS_USE_CUDA=1)

    # Set default CUDA architectures if not specified
    # Target: SM 80/86 (Ampere), 89 (Ada), 90 (Hopper)
    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90" CACHE STRING "CUDA architectures" FORCE)
        message(STATUS "CUDA architectures (default): ${CMAKE_CUDA_ARCHITECTURES}")
    else()
        message(STATUS "CUDA architectures (user): ${CMAKE_CUDA_ARCHITECTURES}")
    endif()

    # Optimization flags
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 --use_fast_math")
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
    set(CMAKE_CUDA_STANDARD 17)
    set(CMAKE_CUDA_STANDARD_REQUIRED ON)

    message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}")
    message(STATUS "CUDA toolkit: ${CUDAToolkit_VERSION}")
    message(STATUS "CUDA flags: ${CMAKE_CUDA_FLAGS}")
elseif(KTRANSFORMERS_USE_ROCM)
    find_package(HIP REQUIRED)
    if(HIP_FOUND)
        include_directories("${HIP_INCLUDE_DIRS}")
        add_compile_definitions(KTRANSFORMERS_USE_ROCM=1)
    endif()
elseif(KTRANSFORMERS_USE_MUSA)
    if(NOT EXISTS $ENV{MUSA_PATH})
        if(NOT EXISTS /opt/musa)
            set(MUSA_PATH /usr/local/musa)
        else()
            set(MUSA_PATH /opt/musa)
        endif()
    else()
        set(MUSA_PATH $ENV{MUSA_PATH})
    endif()

    list(APPEND CMAKE_MODULE_PATH "${MUSA_PATH}/cmake")

    find_package(MUSAToolkit)
    if(MUSAToolkit_FOUND)
        message(STATUS "MUSA Toolkit found")
        add_compile_definitions(KTRANSFORMERS_USE_MUSA=1)
    endif()
elseif(KTRANSFORMERS_CPU_USE_KML)
    message(STATUS "KML CPU detected")
else()
    message(STATUS "No GPU support enabled, building for CPU only")
    add_compile_definitions(KTRANSFORMERS_CPU_ONLY=1)
endif()

aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} SOURCE_DIR1)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/llamafile SOURCE_DIR3)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/../third_party/llamafile SOURCE_DIR4)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/kvcache SOURCE_DIR5)
# message(STATUS "SOURCE_DIR3: ${SOURCE_DIR3}")

# arm64
if(NOT HOST_IS_X86 AND KTRANSFORMERS_CPU_USE_KML)
    aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/kml SOURCE_DIR6)
    if(NOT KTRANSFORMERS_CPU_MLA)
        list(REMOVE_ITEM SOURCE_DIR6 ${CMAKE_CURRENT_SOURCE_DIR}/operators/kml/mla/)
    endif()
endif()
# message(STATUS "SOURCE_DIR6: ${SOURCE_DIR6}")

if(KTRANSFORMERS_CPU_MOE_KERNEL)
    aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/moe_kernel/la SOURCE_DIR7)
    if(KTRANSFORMERS_CPU_MOE_AMD)
        aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/moe_kernel/mat_kernel/aocl_kernel SOURCE_DIR7_KERNEL)
        add_compile_definitions(USE_MOE_KERNEL_AMD=1)
    elseif(NOT HOST_IS_X86 AND KTRANSFORMERS_CPU_USE_KML)
        aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/moe_kernel/mat_kernel/kml_kernel SOURCE_DIR7_KERNEL)
    endif()
    list(APPEND SOURCE_DIR7 ${SOURCE_DIR7_KERNEL})
    if(NOT KTRANSFORMERS_CPU_MLA)
        list(REMOVE_ITEM SOURCE_DIR7 ${CMAKE_CURRENT_SOURCE_DIR}/operators/moe_kernel/mla/)
    endif()
    add_compile_definitions(USE_MOE_KERNEL=1)
endif()
message(STATUS "SOURCE_DIR7: ${SOURCE_DIR7}")


set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4} ${SOURCE_DIR5} ${SOURCE_DIR6} ${SOURCE_DIR7})

file(GLOB_RECURSE FMT_SOURCES
    "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/*.h"
)
# Exclude third_party directory
list(FILTER FMT_SOURCES EXCLUDE REGEX "/third_party/")

## Locate a specific clang-format executable to avoid version drift
## Prefer newer versions first to support modern .clang-format keys
## You can override by passing -DCLANG_FORMAT_BIN=/full/path/to/clang-format
if(NOT DEFINED CLANG_FORMAT_BIN)
    set(_CF_HINTS
        $ENV{CONDA_PREFIX}/bin
        $ENV{MAMBA_ROOT_PREFIX}/envs/$ENV{CONDA_DEFAULT_ENV}/bin
        $ENV{VIRTUAL_ENV}/bin
        $ENV{HOME}/.local/bin
    )
    find_program(CLANG_FORMAT_BIN
        NAMES clang-format-20 clang-format-19 clang-format-18 clang-format-17 clang-format-16 clang-format-15 clang-format
        HINTS ${_CF_HINTS}
    )
endif()
if(NOT CLANG_FORMAT_BIN)
    message(WARNING "ONLY for developer: clang-format not found. Please install clang-format (>=18) or pass -DCLANG_FORMAT_BIN=/full/path and reconfigure.")
else()
    execute_process(
        COMMAND ${CLANG_FORMAT_BIN} --version
        OUTPUT_VARIABLE _CLANG_FORMAT_VER
        OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    # message(STATUS "CMake PATH: $ENV{PATH}")
    # Parse version string, e.g. "Ubuntu clang-format version 19.1.0" or "clang-format version 18.1.8"
    string(REGEX MATCH "version[ ]+([0-9]+(\\.[0-9]+)*)" _CF_VER_MATCH "${_CLANG_FORMAT_VER}")
    if(NOT _CF_VER_MATCH)
        message(WARNING "Failed to parse clang-format version from: ${_CLANG_FORMAT_VER}")
    endif()
    set(CLANG_FORMAT_VERSION "${CMAKE_MATCH_1}")
    message(STATUS "Using clang-format ${CLANG_FORMAT_VERSION} at ${CLANG_FORMAT_BIN}")
    if(CLANG_FORMAT_VERSION VERSION_LESS "18.0.0")
        message(WARNING "clang-format >=18.0.0 required (found ${CLANG_FORMAT_VERSION} at ${CLANG_FORMAT_BIN}).\n"
                            "Tip: Ensure your desired clang-format (e.g., conda's ${CONDA_PREFIX}/bin/clang-format) is earlier in PATH when running CMake,\n"
                            "or pass -DCLANG_FORMAT_BIN=/full/path/to/clang-format.")
    endif()
    add_custom_target(
        format
        COMMAND ${CLANG_FORMAT_BIN}
                -i
                -style=file
                -fallback-style=none
                ${FMT_SOURCES}
        COMMENT "Running clang-format on all source files"
    )

    # Optional: target to check formatting without modifying files (CI-friendly)
    add_custom_target(
        format-check
        COMMAND ${CLANG_FORMAT_BIN}
                -n --Werror
                -style=file
                -fallback-style=none
                ${FMT_SOURCES}
        COMMENT "Checking clang-format on all source files"
    )
endif()

include(FindPkgConfig)
if(PKG_CONFIG_FOUND)
    pkg_search_module(HWLOC REQUIRED IMPORTED_TARGET hwloc)
else(PKG_CONFIG_FOUND)
    message(FATAL_ERROR "FindHWLOC needs pkg-config program and PKG_CONFIG_PATH must contain the path to hwloc.pc file.")
endif(PKG_CONFIG_FOUND)


add_library(llamafile STATIC ${SOURCE_DIR4})


if(CPUINFER_ENABLE_LTO)
    set(CMAKE_INTERPROCEDURAL_OPTIMIZATION ON)
    # Use THIN_LTO keyword only if supported compiler (Clang). GCC ignores it.
    pybind11_add_module(${PROJECT_NAME} MODULE THIN_LTO ${ALL_SOURCES})
    message(STATUS "LTO: enabled")
else()
    set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF)
    pybind11_add_module(${PROJECT_NAME} MODULE ${ALL_SOURCES})
    message(STATUS "LTO: disabled")
endif()

# If BLIS was detected earlier, apply its include directory and library to the
# created Python extension target. We only do this after the module target
# (${PROJECT_NAME}) has been created by pybind11_add_module().
if(DEFINED _KT_BLIS_INCLUDE_DIR AND DEFINED _KT_BLIS_LIBRARY)
    if(TARGET ${PROJECT_NAME})
        target_include_directories(${PROJECT_NAME} PRIVATE ${_KT_BLIS_INCLUDE_DIR})
        target_link_libraries(${PROJECT_NAME} PRIVATE ${_KT_BLIS_LIBRARY})
    else()
        message(WARNING "BLIS was detected earlier but ${PROJECT_NAME} target was not found when attempting to apply BLIS link/include settings.")
    endif()
endif()

# Ensure the module target also has correct RPATH when conda is active
if(TARGET ${PROJECT_NAME} AND DEFINED ENV{CONDA_PREFIX} AND EXISTS "$ENV{CONDA_PREFIX}")
    set_target_properties(${PROJECT_NAME} PROPERTIES
        BUILD_RPATH "$ENV{CONDA_PREFIX}/lib"
        INSTALL_RPATH "$ENV{CONDA_PREFIX}/lib"
        SKIP_BUILD_RPATH OFF
    )
endif()
if(NOT HOST_IS_X86 AND KTRANSFORMERS_CPU_USE_KML)
    message(STATUS "KML CPU detected")

    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/operators/moe_kernel/mat_kernel/kml_kernel/prefillgemm)
    target_link_libraries(${PROJECT_NAME} PRIVATE prefillint8gemm)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/operators/moe_kernel/mat_kernel/kml_kernel/prefillgemm_int4)
    target_link_libraries(${PROJECT_NAME} PRIVATE prefillint4gemm)

    set(DECODE_GEMM_SOURCES
        ${CMAKE_CURRENT_SOURCE_DIR}/operators/moe_kernel/mat_kernel/kml_kernel/batch_gemm.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/operators/moe_kernel/mat_kernel/kml_kernel/batch_gemm_kernels.cpp
    )
    add_library(decode_gemm SHARED ${DECODE_GEMM_SOURCES})
    target_link_libraries(${PROJECT_NAME} PRIVATE decode_gemm)  
    if(KTRANSFORMERS_CPU_MLA)
        target_link_libraries(${PROJECT_NAME} PRIVATE kml_rt)
    endif()
    target_compile_definitions(${PROJECT_NAME} PRIVATE CPU_USE_KML)
endif()
target_link_libraries(${PROJECT_NAME} PRIVATE llama PkgConfig::HWLOC OpenMP::OpenMP_CXX)
if(NOT HOST_IS_X86 AND KTRANSFORMERS_CPU_USE_KML)
    if(KTRANSFORMERS_CPU_DEBUG)
        # add_executable(convert-test ${CMAKE_CURRENT_SOURCE_DIR}/operators/kml/convert-test.cpp)
        # target_link_libraries(convert-test llama)
        file(GLOB KML_TEST_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/operators/kml/test/*.cpp")
        foreach(test_src ${KML_TEST_SOURCES})
            # 获取不带扩展名的文件名作为 target 名
            get_filename_component(test_name ${test_src} NAME_WE)
            add_executable(${test_name} ${test_src} ${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend/shared_mem_buffer.cpp)
            if(KTRANSFORMERS_CPU_MLA)
                target_link_libraries(${test_name} llama OpenMP::OpenMP_CXX numa kml_rt)
            endif()
        endforeach()
    endif()
endif()


if(KTRANSFORMERS_USE_CUDA)
    # Link CUDA runtime (static or dynamic)
    if(KTRANSFORMERS_CUDA_STATIC_RUNTIME)
        # Platform-aware static library path
        if(WIN32)
            set(CUDART_STATIC_LIB "${CUDAToolkit_LIBRARY_DIR}/cudart_static.lib")
        else()
            set(CUDART_STATIC_LIB "${CUDAToolkit_LIBRARY_DIR}/libcudart_static.a")
        endif()

        if(EXISTS "${CUDART_STATIC_LIB}")
            target_link_libraries(${PROJECT_NAME} PRIVATE "${CUDART_STATIC_LIB}")
            message(STATUS "CUDA runtime: static (${CUDART_STATIC_LIB})")

            # Linux needs additional libs for static cudart
            if(UNIX AND NOT APPLE)
                target_link_libraries(${PROJECT_NAME} PRIVATE rt pthread dl)
            endif()
        else()
            message(WARNING "Static CUDA runtime not found, using dynamic")
            target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart)
        endif()
    else()
        # Dynamic linking
        target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart)
        message(STATUS "CUDA runtime: dynamic")
    endif()
endif()
if(KTRANSFORMERS_USE_ROCM)
    add_compile_definitions(USE_HIP=1)
    target_link_libraries(${PROJECT_NAME} PRIVATE "${ROCM_PATH}/lib/libamdhip64.so")
    message(STATUS "Building for HIP")
endif()
if(KTRANSFORMERS_USE_MUSA)
    target_link_libraries(${PROJECT_NAME} PRIVATE MUSA::musart)
endif()


find_library(NUMA_LIBRARY NAMES numa)
if(NUMA_LIBRARY)
    message(STATUS "NUMA library found: ${NUMA_LIBRARY} - enabling NUMA support")
    target_link_libraries(${PROJECT_NAME} PRIVATE ${NUMA_LIBRARY})
else()
    message(FATAL_ERROR "NUMA library not found, please install NUMA, sudo apt install libnuma-dev")
endif()


================================================
FILE: kt-kernel/CMakePresets.json
================================================
{
  "version": 3,
  "cmakeMinimumRequired": {
    "major": 3,
    "minor": 19,
    "patch": 0
  },
  "configurePresets": [
    {
      "name": "avx512",
      "displayName": "avx512_platform",
      "description": "for avx512 platform",
      "cacheVariables": {
        "KTRANSFORMERS_CPU_USE_AMX": "OFF",
        "LLAMA_AVX512": "OFF",
        "LLAMA_AVX2": "OFF",
        "KTRANSFORMERS_CPU_USE_AMX_AVX512": "ON",
        "KTRANSFORMERS_USE_CUDA": "ON"
      }
    },
    {
      "name": "avx",
      "displayName": "avx_platform",
      "description": "for avx platform",
      "cacheVariables": {
        "KTRANSFORMERS_CPU_USE_AMX": "OFF",
        "LLAMA_AVX2": "ON",
        "KTRANSFORMERS_USE_CUDA": "ON"
      }
    },
    {
      "name": "amx",
      "displayName": "amx_platform",
      "description": "for amx platform",
      "cacheVariables": {
        "KTRANSFORMERS_CPU_USE_AMX": "ON",
        "LLAMA_AVX512": "OFF",
        "LLAMA_AVX2": "OFF",
        "KTRANSFORMERS_CPU_USE_AMX_AVX512": "ON",
        "KTRANSFORMERS_USE_CUDA": "ON"
      }
    },
    {
      "name": "amd",
      "displayName": "amd_platform",
      "description": "for amd platform",
      "cacheVariables": {
        "KTRANSFORMERS_CPU_USE_AMX": "OFF",
        "LLAMA_AVX512": "OFF",
        "LLAMA_AVX2": "ON",
        "KTRANSFORMERS_CPU_USE_AMX_AVX512": "OFF",
        "KTRANSFORMERS_USE_CUDA": "ON",
        "KTRANSFORMERS_CPU_MOE_AMD": "ON",
        "KTRANSFORMERS_CPU_MOE_KERNEL": "ON"
      }
    }

  ]
}


================================================
FILE: kt-kernel/MANIFEST.in
================================================
# MANIFEST.in for kt-kernel
# Ensures source distribution includes all necessary files for building from source

# Core build files
include CMakeLists.txt
include CMakePresets.json
include setup.py
include pyproject.toml
include requirements.txt
include README.md
include LICENSE

# CMake modules and configuration
recursive-include cmake *.cmake *.in

# C++ source files
recursive-include cpu_backend *.h *.hpp *.cpp *.c *.cc
recursive-include operators *.h *.hpp *.cpp *.c *.cc
include ext_bindings.cpp

# Python package
recursive-include python *.py

# Third-party dependencies (vendored)
recursive-include third_party *

# Exclude compiled and cache files
global-exclude *.pyc
global-exclude *.pyo
global-exclude __pycache__
global-exclude .git*
global-exclude *.so
global-exclude *.o
global-exclude *.a
global-exclude build
global-exclude dist
global-exclude *.egg-info


================================================
FILE: kt-kernel/README.md
================================================
# KT-Kernel

High-performance kernel operations for KTransformers, featuring CPU-optimized MoE inference with AMX, AVX, KML and blis (amd library) support.

- [Note](#note)
- [Features](#features)
- [Installation](#installation)
  - [Option 1: Install from PyPI (Recommended for Most Users)](#option-1-install-from-pypi-recommended-for-most-users)
  - [Option 2: Install from Source (For Local Use or Custom Builds)](#option-2-install-from-source-for-local-use-or-custom-builds)
- [Verification](#verification)
- [KT CLI Overview](#kt-cli-overview)
- [Integration with SGLang](#integration-with-sglang)
  - [Installation Steps](#installation-steps)
  - [Complete Example: Qwen3-30B-A3B](#complete-example-qwen3-30b-a3b)
  - [KT-Kernel Parameters](#kt-kernel-parameters)
- [Direct Python API Usage](#direct-python-api-usage)
  - [Advanced Options](#advanced-options)
  - [Manual Configuration (Advanced)](#manual-configuration-advanced)
- [Build Configuration](#build-configuration)
  - [Manual Installation (Without install.sh)](#manual-installation-without-installsh)
- [Error Troubleshooting](#error-troubleshooting)
  - [CUDA Not Found](#cuda-not-found)
  - [hwloc Not Found](#hwloc-not-found)
- [Weight Quantization](#weight-quantization)
- [Before Commit!](#before-commit)

## Note

**Current Support Status:**
- ✅ **Native Precision with AVX512/AMX**: Supported with AVX512 CPUs in `FP8`, `BF16` and `RAWINT4` format - [Guide](https://github.com/kvcache-ai/ktransformers/blob/main/doc/en/kt-kernel/Native-Precision-Tutorial.md)
- ✅ **Intel CPUs with AMX**: Fully supported (using weights converted to INT4/INT8 format)
- ✅ **Universal CPU (llamafile backend)**: Supported (using GGUF-format weights)
- ✅ **AMD CPUs with BLIS**: Supported (for int8 prefill & decode) - [Guide](https://github.com/kvcache-ai/ktransformers/blob/main/doc/en/kt-kernel/amd_blis.md)

**KT-CLI**

We are developing a simpler way to use KTransformers. Check out the [KT-CLI Guide](https://github.com/kvcache-ai/ktransformers/blob/main/doc/en/kt-kernel/kt-cli.md) for more details.

## Features

- **CPU-Optimized MoE Kernels**: High-throughput MoE expert kernels optimized for instruction sets.
- **AVX512 Native Precision Backend**: FP8 / BF16 / INT4 native MoE backend for AVX512-capable servers.
- **AMX INT4/INT8 Backend**: INT4 / INT8 quantized expert inference backend for AMX-capable servers.
- **Llamafile CPU Backend**: AVX2/AVX512-based MoE backend built on Llamafile for universal CPU deployment.
- **NUMA-Aware Execution**: Thread pool and memory layout designed for multi-socket / multi-NUMA machines.

## Installation

### Option 1: Install from PyPI (Recommended for Most Users)

Install the latest version with a single command:

```bash
pip install kt-kernel
```

> **Note**: Check the [latest version on PyPI](https://pypi.org/project/kt-kernel/#history)

**Features:**
- ✅ **Automatic CPU detection**: Detects your CPU and loads the optimal kernel variant
- ✅ **CPU multi-variant support**: Includes AMX, AVX512 (Base/VNNI/VBMI/BF16), and AVX2 variants
- ✅ **CUDA support included**: GPU acceleration for NVIDIA GPUs (SM 80, 86, 89, 90)
- ✅ **No compilation needed**: Pre-built wheels for Python 3.10, 3.11, 3.12
- ✅ **Static CUDA runtime**: No CUDA toolkit installation required
- ✅ **Works on CPU-only systems**: CUDA features automatically disabled when GPU not available

**Requirements:**
- Python 3.10, 3.11, or 3.12
- Linux x86-64 (manylinux_2_17 compatible)
- CPU with AVX2 support (Intel Haswell 2013+, AMD Zen+)
- Optional: NVIDIA GPU with compute capability 8.0+ for CUDA features

#### CUDA Installation (GPU Acceleration)

For NVIDIA GPU-accelerated inference:

```bash
pip install kt-kernel-cuda
```

**Features:**
- ✅ **Multi-architecture support**: Single wheel supports SM 80/86/89/90 (Ampere, Ada, Hopper)
- ✅ **Static CUDA runtime**: No CUDA toolkit installation required
- ✅ **Broad compatibility**: Works with CUDA 11.8+ and 12.x drivers
- ✅ **PyTorch compatible**: Works with any PyTorch CUDA variant (cu118, cu121, cu124)

**Requirements:**
- Python 3.10, 3.11, or 3.12
- Linux x86-64 (manylinux_2_17 compatible)
- NVIDIA GPU with compute capability 8.0+ (Ampere or newer)
  - ✅ Supported: A100, RTX 3000/4000 series, H100
  - ❌ Not supported: V100, P100, GTX 1000/2000 series (too old)
- NVIDIA driver with CUDA 11.8+ or 12.x support (no CUDA toolkit needed)

**GPU Compatibility Matrix:**

| GPU Architecture | Compute Capability | Supported | Example GPUs |
|-----------------|-------------------|-----------|-------------|
| Hopper | 9.0 | ✅ | H100, H200 |
| Ada Lovelace | 8.9 | ✅ | RTX 4090, 4080, 4070 |
| Ampere | 8.6 | ✅ | RTX 3090, 3080, 3070, 3060 |
| Ampere | 8.0 | ✅ | A100, A30 |
| Turing | 7.5 | ❌ | RTX 2080, T4 |
| Volta | 7.0 | ❌ | V100 |

**CUDA Driver Compatibility (for GPU features):**
- CUDA 11.8, 11.9, 12.0-12.6+: Full support
- CUDA 11.0-11.7: Not supported (upgrade driver or use CPU-only)

**CPU Variants Included:**

The wheel includes 6 optimized variants that are **automatically selected at runtime** based on your CPU:

| Variant | CPU Support | Performance | Auto-Selected When |
|---------|-------------|-------------|-------------------|
| **AMX** | Intel Sapphire Rapids+ (2023+) | ⚡⚡⚡ Best | AMX instructions detected |
| **AVX512+BF16** | Ice Lake server, Zen 4+ (2021+) | ⚡⚡⚡ Excellent | AVX512 + BF16 detected |
| **AVX512+VBMI** | Ice Lake client (2019+) | ⚡⚡ Great | AVX512 + VBMI detected |
| **AVX512+VNNI** | Cascade Lake+ (2019+) | ⚡⚡ Great | AVX512 + VNNI detected |
| **AVX512 Base** | Skylake-X+ (2017+) | ⚡⚡ Good | AVX512 base detected |
| **AVX2** | Haswell+ (2013+), AMD Zen+ | ⚡ Good | Fallback for maximum compatibility |

**Verify installation:**
```python
import kt_kernel

# Check which CPU variant was loaded
print(f"CPU variant: {kt_kernel.__cpu_variant__}")
print(f"Version: {kt_kernel.__version__}")

# Check CUDA support
from kt_kernel import kt_kernel_ext
cpu_infer = kt_kernel_ext.CPUInfer(4)
has_cuda = hasattr(cpu_infer, 'submit_with_cuda_stream')
print(f"CUDA support: {has_cuda}")

print("✓ kt-kernel installed successfully!")
```

**Environment Variables:**
```bash
# Override automatic CPU detection (for testing or debugging)
export KT_KERNEL_CPU_VARIANT=avx2  # Force specific variant

# Enable debug output to see detection process
export KT_KERNEL_DEBUG=1
python -c "import kt_kernel"
```

---

### Option 2: Install from Source (For Local Use or Custom Builds)

Build from source for local installation or when you need AMD (BLIS), ARM (KML), or custom CUDA versions.

#### Prerequisites

First, initialize git submodules and create a conda environment:
```bash
git submodule update --init --recursive
conda create -n kt-kernel python=3.11 -y
conda activate kt-kernel
```

#### Quick Installation (Recommended)

Simply run the install script - it will auto-detect your CPU and optimize for best performance:

```bash
./install.sh
```

**What happens automatically:**
- Auto-detects CPU capabilities (AMX, AVX512_VNNI, AVX512_BF16)
- Installs system dependencies (`cmake`, `libhwloc-dev`, `pkg-config`)
- Builds optimized binary for **your CPU only** (using `-march=native`)
- **Software fallbacks**: Automatically enabled for CPUs without VNNI/BF16

**Optional: Two-step installation**
```bash
./install.sh deps   # Install dependencies only
./install.sh build  # Build and install kt-kernel
```

**CPU Requirements by Backend:**

| Backend | Minimum CPU Requirement | Example CPUs | Notes |
|---------|-------------------------|--------------|-------|
| **LLAMAFILE** | AVX2 | Intel Haswell (2013+), AMD Zen+ | Universal compatibility |
| **RAWINT4** | AVX512F + AVX512BW | Intel Skylake-X (2017+), Ice Lake, Cascade Lake | Software fallbacks for VNNI/BF16 |
| **AMXINT4/INT8** | AMX | Intel Sapphire Rapids (2023+) | Best performance, requires AMX hardware |
| **FP8** | AVX512F + AVX512BW + AVX512_BF16 + AVX512_VBMI | Intel Cooper Lake (2020+), Sapphire Rapids (2023+); AMD Zen 4+ (e.g., EPYC 9355) | Native Precision (e.g., DeepSeek V3.2, MiniMax M2.1) |
| **BF16** | AVX512F + AVX512BW + AVX512_BF16 | Intel Cooper Lake (2020+), Sapphire Rapids (2023+); AMD Zen 4+ (e.g., EPYC 9355) | Native Precision (e.g., Qwen3-235B-A22B, GLM-4.7) |

**Software Fallback Support (AVX512 backends):**
- ✅ VNNI fallback: Uses AVX512BW instructions
- ✅ BF16 fallback: Uses AVX512F instructions
- ✅ Older AVX512 CPUs (Skylake-X, Cascade Lake) can run RAWINT4 with fallbacks

⚠️ **Portability Note:** The default build is optimized for your specific CPU and may not work on different/older CPUs. For portable builds or binary distribution, see [Manual Configuration](#manual-configuration-advanced) below.

⚠️ **AMD BLIS backend users:** See [installation guide](https://github.com/kvcache-ai/ktransformers/issues/1601) for AMD-specific setup.

## Verification

After installation, verify that the CLI is working:

```bash
kt version
```

Expected output:
```
KTransformers CLI v0.x.x

  Python:        3.11.x
  Platform:      Linux 5.15.0-xxx-generic
  CUDA:          12.x
  kt-kernel:     0.x.x (amx)
  sglang:        0.x.x
```

You can also verify the Python module directly:

```bash
python -c "from kt_kernel import KTMoEWrapper; print('✓ kt-kernel installed successfully')"
```

## KT CLI Overview

The `kt` command-line tool provides a unified interface for running and managing KTransformers models:

| Command | Description |
|---------|-------------|
| `kt run <model>` | Start model inference server with auto-optimized parameters |
| `kt chat` | Interactive chat with a running model server |
| `kt model` | Manage models and storage paths |
| `kt doctor` | Diagnose environment issues and check system compatibility |
| `kt config` | Manage CLI configuration |
| `kt version` | Show version information |

**Quick Start Example:**

```bash
# Start a model server (auto-detects hardware and applies optimal settings)
kt run m2

# In another terminal, chat with the model
kt chat

# Check system compatibility
kt doctor
```

Run `kt --help` for more options, or `kt <command> --help` for command-specific help.

## Integration with SGLang

KT-Kernel can be used standalone via [Direct Python API](#direct-python-api-usage) or integrated with SGLang for production deployment. This section describes SGLang integration to enable CPU-GPU heterogeneous inference, where "hot" experts run on GPU and "cold" experts run on CPU for optimal resource utilization.

### Installation Steps

#### 1. Install SGLang

Install the kvcache-ai fork of SGLang (required for kt-kernel support):

```bash
# Option A: One-click install (from ktransformers root, installs sglang + kt-kernel)
./install.sh

# Option B: pip install
pip install sglang-kt

# Option C: From source (editable mode)
git clone --recursive https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
pip install -e "third_party/sglang/python[all]"
```

> **Important:** Use `sglang-kt` (kvcache-ai fork), not the official `sglang` package. If you have the official version installed, uninstall it first: `pip uninstall sglang -y`

#### 2. Prepare Weights

You need both GPU weights and CPU-side expert weights for heterogeneous inference. The exact format depends on the backend:

**GPU Weights (for all backends):**  
Use the model weights required by SGLang for GPU inference (for example, the original or already-quantized model directory from Hugging Face).

**CPU Weights (AMX backend: `AMXINT4` / `AMXINT8`):**
Quantize weights to AMX-optimized INT4/INT8 format using the provided script:

```bash
python scripts/convert_cpu_weights.py \
  --input-path /path/to/model \
  --input-type bf16 \
  --output /path/to/cpu-weights \
  --quant-method int8  # or int4 or moe_int8 (for amd now) 
```

- `--input-path`: Path to GPU-side original weights
- `--input-type`: Depends on your GPU weights type (`fp8`, `fp16`, or `bf16`)

In SGLang integration, `--kt-weight-path` should point to this converted CPU weights directory.

**Supported input formats:** FP8, FP16, BF16 → INT4/INT8.

**CPU Weights (LLAMAFILE backend: `LLAMAFILE`):**
LLAMAFILE uses pre-quantized **GGUF** weights on the CPU side directly, without running `convert_cpu_weights.py`. You need to:

- Download a GGUF model directly from the web (e.g., GGUF repos on Hugging Face / Modelscope);
- In SGLang integration, use that GGUF directory as `--kt-weight-path`.
  KT-Kernel supports multiple GGUF quantization formats such as `Q4_KM`, `Q4_K`, `Q5_K`, etc. Choose based on your latency and accuracy requirements.

#### 3. Launch SGLang Server

Start the SGLang server with your normal SGLang parameters, and add the following KT-Kernel specific parameters to enable CPU-GPU heterogeneous inference:

**KT-Kernel Parameters to Add:**
- `--kt-method`: Backend method (AMXINT4, AMXINT8, or LLAMAFILE)
- `--kt-weight-path`: Path to the converted CPU weights
- `--kt-cpuinfer`: Number of CPU inference threads (set to physical cores)
- `--kt-threadpool-count`: Number of thread pools (set to NUMA node count)
- `--kt-num-gpu-experts`: Number of experts to keep on GPU
- `--kt-max-deferred-experts-per-token`: Deferred experts for pipelined execution

Example:
```bash
python -m sglang.launch_server \
  [your normal SGLang parameters...] \
  --kt-method AMXINT8 \
  --kt-weight-path /path/to/cpu-weights \
  --kt-cpuinfer 64 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 32 \
  --kt-max-deferred-experts-per-token 2
```

See [KT-Kernel Parameters](#kt-kernel-parameters) section below for detailed parameter tuning guidelines.

### Complete Example: Qwen3-30B-A3B

This example demonstrates the full workflow from downloading weights to launching the server, showing **Native backend**, **AMX backend** and **LLAMAFILE backend** options.

**Hardware Configuration:**
- **GPU**: NVIDIA RTX 4090 24GB
- **CPU**: 2x Intel Xeon Gold 6454S (64 physical cores total, 128 threads, 2 NUMA nodes)
- **Model**: [Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B)

**How to verify your system configuration:**
```bash
# Check CPU configuration
lscpu | grep -E "^CPU\(s\)|Thread\(s\) per core|Socket\(s\)|NUMA node\(s\)"
# Expected output example:
CPU(s):                                  128
Thread(s) per core:                      2
Socket(s):                               2
NUMA node(s):                            2
# → Physical cores = CPU(s) / Thread(s) per core = 128 / 2 = 64
```

**Parameter Rationale:**
- `--kt-cpuinfer 64`: Set to physical cores (64), not hyperthreads (128)
- `--kt-threadpool-count 2`: 2 NUMA nodes detected (dual-socket system)
- `--kt-num-gpu-experts 32`: With 24GB GPU memory, we can fit ~32 experts on GPU for this model (varies by model architecture and actual memory usage)
- `--kt-max-deferred-experts-per-token 2`: Enable pipelined execution; allows CPU to process next batch while GPU completes current batch
- `--kt-gpu-prefill-token-threshold 2048`: Use layerwise prefill strategy when token count exceeds 2048 (for native backends only)

---

#### Option A: Native Backend (BF16)

For AVX512 CPUs with BF16 support.

**Step 1: Download model weights**

```bash
# Install huggingface-cli if not already installed
pip install huggingface-hub
# Download model from Hugging Face  
huggingface-cli download Qwen/Qwen3-30B-A3B --local-dir /mnt/data/models/Qwen3-30B-A3B
```

**Step 2: Launch SGLang server**

```bash
python -m sglang.launch_server \
    --host 0.0.0.0 \
    --port 30000 \
    --model /mnt/data/models/Qwen3-30B-A3B \
    --kt-weight-path /mnt/data/models/Qwen3-30B-A3B \
    --kt-cpuinfer 64 \
    --kt-threadpool-count 2 \
    --kt-num-gpu-experts 32 \
    --kt-method BF16 \
    --attention-backend flashinfer \
    --trust-remote-code \
    --mem-fraction-static 0.80 \
    --chunked-prefill-size 16384 \
    --max-running-requests 4 \
    --served-model-name Qwen3 \
    --enable-mixed-chunk \
    --tensor-parallel-size 1 \
    --enable-p2p-check \
    --disable-shared-experts-fusion \
    --kt-gpu-prefill-token-threshold 4096 \
    --kt-enable-dynamic-expert-update
```

---

#### Option B: AMX Backend (AMXINT8)

For Intel CPUs with AMX instruction set support.

**Step 1: Download model weights**

```bash
# Install huggingface-cli if not already installed
pip install huggingface-hub

# Download model from Hugging Face
huggingface-cli download Qwen/Qwen3-30B-A3B --local-dir /mnt/data/models/Qwen3-30B-A3B
```

**Step 2: Convert to CPU weights (AMXINT8)**

```bash
python scripts/convert_cpu_weights.py \
  --input-path /mnt/data/models/Qwen3-30B-A3B \
  --input-type bf16 \
  --output /mnt/data/models/Qwen3-30B-A3B-INT8 \
  --quant-method int8
```

**Step 3: Launch SGLang server**

```bash
python -m sglang.launch_server \
  --host 0.0.0.0 \
  --port 8000 \
  --model /mnt/data/models/Qwen3-30B-A3B \
  --trust-remote-code \
  --mem-fraction-static 0.92 \
  --chunked-prefill-size 4096 \
  --served-model-name Qwen3-30B-A3B \
  --enable-mixed-chunk \
  --kt-method AMXINT8 \
  --kt-weight-path /mnt/data/models/Qwen3-30B-A3B-INT8 \
  --kt-cpuinfer 64 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 32 \
  --kt-max-deferred-experts-per-token 2
```

---

#### Option C: LLAMAFILE Backend (GGUF)

For universal CPUs (no AMX required), using pre-quantized GGUF weights directly.

**Step 1: Download GPU weights (original model)**

```bash
pip install huggingface-hub

huggingface-cli download Qwen/Qwen3-30B-A3B --local-dir /mnt/data/models/Qwen3-30B-A3B
```

**Step 2: Download CPU weights (GGUF format)**

```bash
huggingface-cli download Qwen/Qwen3-30B-A3B-GGUF Qwen3-30B-A3B-Q4_K_M.gguf \
  --local-dir /mnt/data/models/Qwen3-30B-A3B-Q4_K_M
```

**Step 3: Launch SGLang server**

```bash
python -m sglang.launch_server \
  --host 0.0.0.0 \
  --port 8000 \
  --model /mnt/data/models/Qwen3-30B-A3B \
  --trust-remote-code \
  --mem-fraction-static 0.92 \
  --chunked-prefill-size 4096 \
  --served-model-name Qwen3-30B-A3B \
  --enable-mixed-chunk \
  --kt-method LLAMAFILE \
  --kt-weight-path /mnt/data/models/Qwen3-30B-A3B-Q4_K_M \
  --kt-cpuinfer 64 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 32 \
  --kt-max-deferred-experts-per-token 2
```

### KT-Kernel Parameters

| Parameter | Description | Example Value |
|-----------|-------------|---------------|
| `--kt-method` | CPU inference backend method | `AMXINT4`, `AMXINT8`, `RAWINT4`, `FP8`, `FP8_PERCHANNEL`, `BF16` or `LLAMAFILE` |
| `--kt-weight-path` | Path to quantized CPU weights | `/path/to/cpu-weights` |
| `--kt-cpuinfer` | Number of CPU inference threads | `64` (adjust based on CPU cores) |
| `--kt-threadpool-count` | Number of thread pools for parallel execution | `2` (typically 1-4) |
| `--kt-num-gpu-experts` | Number of experts to keep on GPU | `32` (remaining experts go to CPU) |
| `--kt-max-deferred-experts-per-token` | Number of experts per token to defer for pipelined execution | `2` (0 to disable, 1-4 recommended) |
| `--kt-gpu-prefill-token-threshold` | Token count threshold for prefill strategy (native backend only) | ~`1024-4096` |
| `--kt-enable-dynamic-expert-update` | Enable dynamic expert placement updates during prefill based on actual routing statistics | (flag, no value needed) |
| `--kt-expert-placement-strategy` | Strategy for initial GPU expert placement | `uniform`, `frequency`, `front-loading`, or `random` |

**Parameter Guidelines:**

- **`kt-method`**: Choose based on your CPU and weight format:
  - `AMXINT4`: Best performance on AMX CPUs with INT4 quantized weights (May cause huge accuracy drop for some models, e.g., Qwen3-30B-A3B)
  - `AMXINT8`: Higher accuracy with INT8 quantized weights on AMX CPUs
  - `RAWINT4`: Native INT4 weights shared by CPU and GPU (currently supports Kimi-K2-Thinking model). See [Kimi-K2-Thinking Native Tutorial](../doc/en/Kimi-K2-Thinking-Native.md) for details.
  - `FP8`, `FP8_PERCHANNEL`: FP8 weights shared by CPU and GPU
  - `BF16`: BF16 weights shared by CPU and GPU
  - `LLAMAFILE`: GGUF-based backend

- **`kt-cpuinfer`**: Set to the number of **physical CPU cores** (not hyperthreads).
  - Check physical cores: `lscpu | grep -E "^CPU\(s\)|Thread\(s\) per core"`
  - Physical cores = CPU(s) / Thread(s) per core
  - Example: If CPU(s)=128 and Thread(s) per core=2, then physical cores = 64
  - **Important**: Do NOT set to hyperthread count - this will degrade performance

- **`kt-threadpool-count`**: Set to the number of **NUMA nodes**.
  - Check NUMA count: `lscpu | grep "NUMA node(s)"`
  - Or use: `numactl --hardware | grep "available"`
  - **Note**: NUMA node count is NOT necessarily the number of physical CPUs
    - It represents memory domains, which may be divided within a single CPU or across multiple CPUs
    - Use the NUMA node count from `lscpu`, regardless of physical CPU count
  - Typical values: 1-2 for single-socket, 2-4 for dual-socket systems
  - This enables better memory bandwidth utilization across NUMA domains

- **`kt-num-gpu-experts`**: Determine based on GPU memory and profiling:
  - More GPU experts = lower latency but higher GPU memory usage (May cause OOM)

- **`kt-max-deferred-experts-per-token`**: Enables pipelined execution:
  - `0`: Synchronous execution (simpler, higher latency)
  - `1-4`: Deferred execution (recommended range; good latency/quality balance, requires tuning)
  - `5-7`: Highest latency reduction but may introduce noticeable accuracy loss; use with care

- **`kt-gpu-prefill-token-threshold`** (FP8 and RAWINT4 only): Controls prefill strategy for native FP8 and INT4 inference:
  - **≤ threshold**: Uses hybrid CPU+GPU prefill. No extra VRAM needed, but performance degrades slowly as token count increases.
  - **> threshold**: Uses layerwise GPU prefill. Performance scales better with longer sequences, but requires one MoE layer extra VRAM (e.g., ~9GB+ for Kimi-K2-Thinking and ~3.6GB for MiniMax-M2.1).
  - Only applicable when `--kt-method RAWINT4` or `--kt-method FP8` is used.

- **`kt-enable-dynamic-expert-update`**: Enables dynamic expert placement updates during inference.
  - During layerwise prefill, the system collects actual routing statistics and redistributes GPU experts accordingly.
  - Requires `--kt-gpu-prefill-token-threshold` to be set, and prefill length must be ≥ the threshold value.
  - Particularly effective at lower GPU expert ratios (10%-70%), where it can significantly outperform static strategies.
  - See [Expert Scheduling Tutorial](../doc/en/kt-kernel/experts-sched-Tutorial.md) for benchmarks and details.

- **`kt-expert-placement-strategy`**: Determines which experts are placed on GPU at server startup.
  - `uniform`: Distributes GPU experts evenly across all MoE layers. Default option, no prior statistics needed.
  - `frequency`: Places the most frequently activated experts on GPU. Best performance when activation statistics are available; requires `--init-expert-location` pointing to a `.pt` statistics file.
  - `front-loading`: Fills GPU experts from the first MoE layer onwards.
  - `random`: Randomly selects experts with a fixed seed (42).
  - See [Expert Scheduling Tutorial](../doc/en/kt-kernel/experts-sched-Tutorial.md) for strategy comparison.

## Direct Python API Usage

For standalone usage without SGLang, you can use KT-Kernel directly via Python API:

```python
from kt_kernel import KTMoEWrapper

# Initialize the MoE wrapper
wrapper = KTMoEWrapper(
    layer_idx=0,
    num_experts=8,
    num_experts_per_tok=2,
    hidden_size=4096,
    moe_intermediate_size=14336,
    num_gpu_experts=2,
    cpuinfer_threads=32,
    threadpool_count=2,
    weight_path="/path/to/weights",
    chunked_prefill_size=512,
    method="AMXINT4"  # Options: "AMXINT4", "AMXINT8", "LLAMAFILE"
)

# Load weights (from disk - pre-quantized)
wrapper.load_weights(physical_to_logical_map)

# Or load weights from tensors (online quantization)
wrapper.load_weights_from_tensors(gate_proj, up_proj, down_proj, physical_to_logical_map)

# Run inference
output = wrapper.forward(hidden_states, topk_ids, topk_weights, cuda_stream)

# Or use async API for better performance
wrapper.submit_forward(hidden_states, topk_ids, topk_weights, cuda_stream)
# ... do other work ...
output = wrapper.sync_forward(hidden_states, cuda_stream)
```

### Advanced Options

```python
# Initialize with additional options
wrapper = KTMoEWrapper(
    layer_idx=0,
    num_experts=8,
    num_experts_per_tok=2,
    hidden_size=4096,
    moe_intermediate_size=14336,
    num_gpu_experts=2,
    cpuinfer_threads=32,
    threadpool_count=2,
    weight_path="/path/to/weights",
    chunked_prefill_size=512,
    method="AMXINT4",
    cpu_save=False,  # Keep weights in CPU memory after loading
    max_deferred_experts_per_token=0  # Number of experts to defer (for pipelined execution)
)

# Pre-allocate buffers for specific batch sizes (improves performance)
KTMoEWrapper.set_capture_batch_sizes([1, 2, 4, 8, 16])

# Query captured batch sizes
batch_sizes = KTMoEWrapper.get_capture_batch_sizes()

# Clear buffer cache to free memory
KTMoEWrapper.clear_buffer_cache()
```

### Manual Configuration (Advanced)

For portable builds, binary distribution, or cross-machine deployment, you need to manually specify target instruction sets:

```bash
# General distribution (works on any AVX512 CPU from 2017+)
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual

# Maximum compatibility (works on any CPU from 2013+)
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual

# Modern CPUs only (Ice Lake+, Zen 4+)
export CPUINFER_CPU_INSTRUCT=FANCY
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
```

**Optional: Override VNNI/BF16 detection**
```bash
# Force enable/disable VNNI and BF16 (for testing fallbacks)
export CPUINFER_ENABLE_AVX512_VNNI=OFF
export CPUINFER_ENABLE_AVX512_BF16=OFF
./install.sh
```

See `./install.sh --help` for all available options.

---

## Build Configuration

### Manual Installation (Without install.sh)

If you prefer manual installation without the `install.sh` script:

#### 1. Install System Dependencies

**Prerequisites:**
- `cmake` (recommended: `conda install -y cmake`)
- `libhwloc-dev` and `pkg-config`

#### 2. Set Build Configuration

**Core Options:**

| Variable | Options | Description |
|----------|---------|-------------|
| `CPUINFER_CPU_INSTRUCT` | `NATIVE`, `AVX512`, `AVX2`, `FANCY` | CPU instruction set to use |
| `CPUINFER_ENABLE_AMX` | `ON`, `OFF` | Enable Intel AMX support |
| `CPUINFER_BUILD_TYPE` | `Release`, `Debug`, `RelWithDebInfo` | Build type (default: `Release`) |
| `CPUINFER_PARALLEL` | Number | Parallel build jobs (default: auto-detect) |
| `CPUINFER_VERBOSE` | `0`, `1` | Verbose build output (default: `0`) |

**Instruction Set Details:**

| Option | Target CPUs | Use Case |
|--------|-------------|----------|
| **`NATIVE`** | Your specific CPU only | Local builds (best performance, **default**) |
| **`AVX512`** | Skylake-X, Ice Lake, Cascade Lake, Zen 4+ | General distribution |
| **`AVX2`** | Haswell (2013) and newer | Maximum compatibility |
| **`FANCY`** | Ice Lake+, Zen 4+ | Modern CPUs with full AVX512 extensions |

**Example Configurations:**

```bash
# Local use - maximum performance (default behavior)
export CPUINFER_CPU_INSTRUCT=NATIVE
export CPUINFER_ENABLE_AMX=ON  # or OFF

# Distribution build - works on any AVX512 CPU
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF

# Maximum compatibility - works on CPUs since 2013
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF

# Debug build
export CPUINFER_BUILD_TYPE=Debug
export CPUINFER_VERBOSE=1
```

#### 3. Build and Install

```bash
# Editable installation (for development)
pip install -e .

# Standard installation
pip install .
```

## Error Troubleshooting

### CUDA Not Found

```
 -- Looking for a CUDA compiler - NOTFOUND
  CMake Error at CMakeLists.txt:389 (message):
    KTRANSFORMERS_USE_CUDA=ON but CUDA compiler not found
```

Make sure you have the CUDA toolkit installed and `nvcc` is in your system PATH.

Try `export CMAKE_ARGS="-D CMAKE_CUDA_COMPILER=$(which nvcc)"` and reinstall again.

### hwloc Not Found

Run `sudo apt install libhwloc-dev` if on a Debian-based system or build from source: https://www.open-mpi.org/projects/hwloc/.

```
wget https://download.open-mpi.org/release/hwloc/v2.12/hwloc-2.12.2.tar.gz
tar -xzf hwloc-2.12.2.tar.gz
cd hwloc-2.12.2
./configure
make
sudo make install
```

## Weight Quantization

For AMX backends (`AMXINT4` / `AMXINT8`), CPU-side experts must be converted to AMX-friendly INT4/INT8 format using the provided script:

```bash
python scripts/convert_cpu_weights.py \
  --input-path /path/to/model \
  --input-type bf16 \
  --output /path/to/output \
  --quant-method int4
```

**Supported formats:** FP8, FP16, BF16 → INT4/INT8

For LLAMAFILE backend (`LLAMAFILE`), CPU-side experts are loaded directly from **GGUF** weights. You do **not** need to run the AMX conversion script; instead, download a GGUF model from the web (e.g., a GGUF repo on Hugging Face) and point `weight_path` / SGLang `--kt-weight-path` (or `--model` when appropriate) to that GGUF directory. KT-Kernel supports multiple GGUF quantization types such as `Q4_KM`, `Q4_K`, `Q5_K`, etc.

---

For detailed documentation, advanced options, and low-memory mode, see [scripts/README.md](scripts/README.md).

## Before Commit!

Commit messages should follow the Conventional Commits specification: https://www.conventionalcommits.org/

Please format your code before committing:

```shell
cmake -B build
cd build
make format
```

You may need a newer clang-format (at least version 18). In a conda environment:

```shell
conda install -c conda-forge clang-format=18
rm -rf build
```

It's also recommended to install black for Python code formatting:

```shell
conda install black
```


================================================
FILE: kt-kernel/README_zh.md
================================================
# KT-Kernel

高性能 KTransformers 内核库，提供面向 CPU 的高效 MoE 推理内核，支持 AMX 和 AVX 等后端。

- [KT-Kernel](#kt-kernel)
  - [说明](#说明)
  - [特性](#特性)
  - [安装](#安装)
    - [先决条件](#先决条件)
    - [快速安装（推荐）](#快速安装推荐)
    - [手动配置（进阶）](#手动配置进阶)
  - [验证安装](#验证安装)
  - [与 SGLang 集成](#与-sglang-集成)
    - [安装步骤](#安装步骤)
      - [1. 安装 SGLang](#1-安装-sglang)
      - [2. 准备权重](#2-准备权重)
      - [3. 启动 SGLang Server](#3-启动-sglang-server)
    - [完整示例：Qwen3-30B-A3B](#完整示例qwen3-30b-a3b)
      - [方案 A：AMX 后端（AMXINT8）](#方案-aamx-后端amxint8)
      - [方案 B：LLAMAFILE 后端（GGUF）](#方案-bllamafile-后端gguf)
    - [KT-Kernel 参数](#kt-kernel-参数)
  - [直接使用 Python API](#直接使用-python-api)
    - [高级选项](#高级选项)
  - [构建配置](#构建配置)
    - [手动安装](#手动安装)
      - [1. 安装系统依赖](#1-安装系统依赖)
      - [2. 配置构建参数](#2-配置构建参数)
      - [3. 构建并安装](#3-构建并安装)
  - [错误排查](#错误排查)
    - [找不到 CUDA](#找不到-cuda)
    - [找不到 hwloc](#找不到-hwloc)
  - [权重量化](#权重量化)
  - [提交前必读](#提交前必读)

## 说明

**当前支持状态：**
- ✅ **带 AMX 的 Intel CPU**：已支持（基于转换为 INT4/INT8 格式的权重）
- ✅ **通用 CPU（llamafile 后端）**：已支持（基于 GGUF 格式的权重）
- ✅ **带 BLIS 的 AMD CPU**：已支持（int8 的 prefill 和 decode）
- ✅ **Kimi-K2 原生 INT4（RAWINT4）**：支持 AVX512 CPU（CPU-GPU 共享 INT4 权重）- [使用指南](../doc/en/Kimi-K2-Thinking-Native.md)

## 特性

- **CPU 友好的 MoE 内核**：针对指令集优化的高吞吐 MoE 专家内核。
- **AMX INT4/INT8 后端**：面向支持 AMX 的服务器提供 INT4 / INT8 量化专家推理后端。
- **Llamafile CPU 后端**：基于 Llamafile 的 AVX2/AVX512 MoE 后端，适用于通用 CPU 部署。
- **NUMA 感知执行**：为多路 / 多 NUMA 机器设计的线程池和内存布局。


## 安装

### 从源码安装（本机使用或自定义构建）

适用于本地安装，或需要 AMD (BLIS)、ARM (KML) 或自定义 CUDA 版本的场景。

#### 先决条件

首先初始化子模块并创建 conda 环境：
```bash
git submodule update --init --recursive
conda create -n kt-kernel python=3.11 -y
conda activate kt-kernel
```

#### 快速安装（推荐）

只需运行安装脚本，它会自动检测 CPU 并优化性能：

```bash
./install.sh
```

**自动完成的操作：**
- 自动检测 CPU 能力（AMX、AVX512_VNNI、AVX512_BF16）
- 安装系统依赖（`cmake`、`libhwloc-dev`、`pkg-config`）
- 为**你的 CPU** 构建优化二进制（使用 `-march=native`）
- **软件回退机制**：为不支持 VNNI/BF16 的 CPU 自动启用

**可选：分步安装**
```bash
./install.sh deps   # 仅安装依赖
./install.sh build  # 构建并安装 kt-kernel
```

**不同后端的 CPU 要求：**

| 后端 | 最低 CPU 要求 | 示例 CPU | 说明 |
|------|---------------|----------|------|
| **LLAMAFILE** | AVX2 | Intel Haswell (2013+)、AMD Zen+ | 通用兼容性 |
| **RAWINT4** | AVX512F + AVX512BW | Intel Skylake-X (2017+)、Ice Lake、Cascade Lake | 支持 VNNI/BF16 软件回退 |
| **AMXINT4/INT8** | AMX | Intel Sapphire Rapids (2023+) | 最佳性能，需要 AMX 硬件 |

**软件回退支持（AVX512 后端）：**
- ✅ VNNI 回退：使用 AVX512BW 指令
- ✅ BF16 回退：使用 AVX512F 指令
- ✅ 老的 AVX512 CPU（Skylake-X、Cascade Lake）可以运行 RAWINT4（使用回退）

⚠️ **可移植性说明：** 默认构建针对你的特定 CPU 优化，可能无法在不同/更老的 CPU 上运行。如需打包分发或跨机器部署，请参见下方的 [手动配置](#手动配置进阶)。

⚠️ **AMD BLIS 后端用户：** 请参见 [安装指南](https://github.com/kvcache-ai/ktransformers/issues/1601) 了解 AMD 专用配置。

## 验证安装

```bash
python -c "from kt_kernel import KTMoEWrapper; print('✓ kt-kernel installed successfully')"
```

## 与 SGLang 集成

KT-Kernel 可以单独通过 [Python API](#直接使用-python-api) 使用，也可以集成到 SGLang 中用于生产部署。  
本节描述如何与 SGLang 集成，实现 CPU-GPU 混合（异构）推理：将“热” experts 放在 GPU 上，“冷” experts 放在 CPU 上，以达到资源利用和性价比的平衡。

### 安装步骤

#### 1. 安装 SGLang

安装 kvcache-ai 分支的 SGLang（kt-kernel 需要此分支）：

```bash
# 方式 A: 一键安装（从 ktransformers 根目录，同时安装 sglang + kt-kernel）
./install.sh

# 方式 B: pip 安装
pip install sglang-kt

# 方式 C: 从源码安装（可编辑模式）
git clone --recursive https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
pip install -e "third_party/sglang/python[all]"
```

> **重要:** 请使用 `sglang-kt`（kvcache-ai 分支），而非官方 `sglang` 包。如已安装官方版本，请先卸载：`pip uninstall sglang -y`

#### 2. 准备权重

要进行异构推理，需要同时准备 GPU 权重和 CPU 侧 experts 对应的权重，具体格式取决于后端类型：

**GPU 权重：**  
使用 SGLang 所需的模型权重（例如 Hugging Face 上的原始模型目录或已量化好的 GPU 权重）。

**CPU 权重（AMX 后端：`AMXINT4` / `AMXINT8`）：**  
通过提供的脚本将权重量化为适配 AMX 的 INT4/INT8 格式：

```bash
python scripts/convert_cpu_weights.py \
  --input-path /path/to/model \
  --input-type bf16 \
  --output /path/to/cpu-weights \
  --quant-method int8  # 或 int4 或 moe_int8（用于 amd 的）
```

- `--input-path`：GPU 侧原始权重路径
- `--input-type`：取决于 GPU 侧权重类型（`fp8`、`fp16` 或 `bf16`）

在 SGLang 集成中，`--kt-weight-path` 应指向该转换后的 CPU 权重目录。

**支持的输入格式：** FP8、FP16、BF16 → INT4/INT8。

**CPU 权重（LLAMAFILE 后端：`LLAMAFILE`）：**  
LLAMAFILE 在 CPU 侧直接使用预量化的 **GGUF** 权重，无需运行 `convert_cpu_weights.py`。你需要：

- 直接从互联网上下载 GGUF 模型（例如 Hugging Face / Modelscope 上的 GGUF 仓库）；
- 在 SGLang 集成中，将该 GGUF 目录作为 `--kt-weight-path`。
  KT-Kernel 支持多种 GGUF 量化格式，例如 `Q4_KM`、`Q4_K`、`Q5_K` 等，可根据延迟和效果需求选择。

#### 3. 启动 SGLang Server

在通常的 SGLang 启动参数基础上，增加如下 KT-Kernel 相关参数，以启用 CPU-GPU 异构推理：

**需要增加的 KT-Kernel 参数：**
- `--kt-method`：后端类型（AMXINT4、AMXINT8、或 LLAMAFILE）
- `--kt-weight-path`：转换后的 CPU 权重路径
- `--kt-cpuinfer`：CPU 推理线程数（建议设为物理核数）
- `--kt-threadpool-count`：线程池数量（建议设为 NUMA 节点个数）
- `--kt-num-gpu-experts`：留在 GPU 上的 experts 数量
- `--kt-max-deferred-experts-per-token`：每个 token 延迟到 CPU 的 experts 数量，用于流水线执行

示例：
```bash
python -m sglang.launch_server \
  [your normal SGLang parameters...] \
  --kt-method AMXINT8 \
  --kt-weight-path /path/to/cpu-weights \
  --kt-cpuinfer 64 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 32 \
  --kt-max-deferred-experts-per-token 2
```

更多调优建议见 [KT-Kernel 参数](#kt-kernel-参数) 一节。

### 完整示例：Qwen3-30B-A3B

该示例展示从下载权重到启动服务的完整流程，分别演示 **AMX 后端** 和 **LLAMAFILE 后端** 两种方案。

**硬件配置：**
- **GPU**：NVIDIA RTX 4090 24GB
- **CPU**：2x Intel Xeon Gold 6454S（共 64 个物理核，128 线程，2 个 NUMA 节点）
- **模型**：[Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B)

**如何检查系统配置：**
```bash
# 查看 CPU 配置
lscpu | grep -E "^CPU\(s\)|Thread\(s\) per core|Socket\(s\)|NUMA node\(s\)"
# 期望输出示例:
CPU(s):                                  128
Thread(s) per core:                      2
Socket(s):                               2
NUMA node(s):                            2
# → 物理核数 = CPU(s) / Thread(s) per core = 128 / 2 = 64
```

**参数选型说明：**
- `--kt-cpuinfer 64`：设为物理核数（64），而不是 128 线程
- `--kt-threadpool-count 2`：检测到 2 个 NUMA 节点（双路系统）
- `--kt-num-gpu-experts 32`：在 24GB 显存下，对该模型可以大约放 32 个 experts 在 GPU 上（具体取决于模型结构和实际内存占用）
- `--kt-max-deferred-experts-per-token 2`：启用流水线执行；允许 CPU 处理下一批 token 的同时，GPU 完成当前批次

---

#### 方案 A：AMX 后端（AMXINT8）

适用于支持 AMX 指令集的 Intel CPU。

**步骤 1：下载模型权重**

```bash
# 如未安装 huggingface-cli，请先安装
pip install huggingface-hub

# 从 Hugging Face 下载模型
huggingface-cli download Qwen/Qwen3-30B-A3B --local-dir /mnt/data/models/Qwen3-30B-A3B
```

**步骤 2：转换为 CPU 权重（AMXINT8）**

```bash
python scripts/convert_cpu_weights.py \
  --input-path /mnt/data/models/Qwen3-30B-A3B \
  --input-type bf16 \
  --output /mnt/data/models/Qwen3-30B-A3B-INT8 \
  --quant-method int8
```

**步骤 3：启动 SGLang 服务**

```bash
python -m sglang.launch_server \
  --host 0.0.0.0 \
  --port 8000 \
  --model /mnt/data/models/Qwen3-30B-A3B \
  --trust-remote-code \
  --mem-fraction-static 0.92 \
  --chunked-prefill-size 4096 \
  --served-model-name Qwen3-30B-A3B \
  --enable-mixed-chunk \
  --kt-method AMXINT8 \
  --kt-weight-path /mnt/data/models/Qwen3-30B-A3B-INT8 \
  --kt-cpuinfer 64 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 32 \
  --kt-max-deferred-experts-per-token 2
```

---

#### 方案 B：LLAMAFILE 后端（GGUF）

适用于通用 CPU（无需 AMX 支持），直接使用预量化的 GGUF 权重。

**步骤 1：下载 GPU 权重（原始模型）**

```bash
pip install huggingface-hub

huggingface-cli download Qwen/Qwen3-30B-A3B --local-dir /mnt/data/models/Qwen3-30B-A3B
```

**步骤 2：下载 CPU 权重（GGUF 格式）**

```bash
huggingface-cli download Qwen/Qwen3-30B-A3B-GGUF Qwen3-30B-A3B-Q4_K_M.gguf \
  --local-dir /mnt/data/models/Qwen3-30B-A3B-Q4_K_M
```

**步骤 3：启动 SGLang 服务**

```bash
python -m sglang.launch_server \
  --host 0.0.0.0 \
  --port 8000 \
  --model /mnt/data/models/Qwen3-30B-A3B \
  --trust-remote-code \
  --mem-fraction-static 0.92 \
  --chunked-prefill-size 4096 \
  --served-model-name Qwen3-30B-A3B \
  --enable-mixed-chunk \
  --kt-method LLAMAFILE \
  --kt-weight-path /mnt/data/models/Qwen3-30B-A3B-Q4_K_M \
  --kt-cpuinfer 64 \
  --kt-threadpool-count 2 \
  --kt-num-gpu-experts 32 \
  --kt-max-deferred-experts-per-token 2
```

### KT-Kernel 参数

| 参数 | 描述 | 示例值 |
|------|------|--------|
| `--kt-method` | CPU 推理后端类型 | `AMXINT4`、`AMXINT8`、`RAWINT4` 或 `LLAMAFILE` |
| `--kt-weight-path` | 量化后的 CPU 权重路径 | `/path/to/cpu-weights` |
| `--kt-cpuinfer` | CPU 推理线程数 | `64`（根据 CPU 核心数调整） |
| `--kt-threadpool-count` | 并行执行的线程池数量 | `2`（通常为 1–4） |
| `--kt-num-gpu-experts` | 保留在 GPU 上的 experts 数量 | `32`（其余 experts 由 CPU 承担） |
| `--kt-max-deferred-experts-per-token` | 每个 token 延迟到 CPU 的 experts 数量（用于流水线执行） | `2`（0 关闭，1–4 推荐） |
| `--kt-gpu-prefill-token-threshold` | Prefill 策略的 token 数量阈值（仅 RAWINT4） | ~`400` |

**参数建议：**

- **`kt-method`**：根据 CPU 能力和权重格式选择：
  - `AMXINT4`：在 AMX CPU 上 INT4 量化时具有最佳性能（但可能对某些模型有较大精度影响，例如 Qwen3-30B-A3B）
  - `AMXINT8`：在 AMX CPU 上提供更高精度的 INT8 量化方案
  - `RAWINT4`：CPU 和 GPU 共享原生 INT4 权重（仅限 AMX 后端，目前仅支持 Kimi-K2-Thinking 模型）。详见 [Kimi-K2-Thinking 原生推理教程](../doc/en/Kimi-K2-Thinking-Native.md)。
  - `LLAMAFILE`：基于 AVX2/AVX512 的通用 CPU 后端，性能较 AMX 略低，但适用范围更广

- **`kt-cpuinfer`**：设置为 **物理核数**（不是线程数）。
  - 查看物理核数：`lscpu | grep -E "^CPU\(s\)|Thread\(s\) per core"`
  - 计算方式：物理核数 = CPU(s) / Thread(s) per core
  - 例：若 CPU(s)=128 且 Thread(s) per core=2，则物理核数=64
  - **重要**：不要设置为超线程总数，否则会降低性能

- **`kt-threadpool-count`**：设置为 **NUMA 节点数**。
  - 查看 NUMA 数：`lscpu | grep "NUMA node(s)"`
  - 或：`numactl --hardware | grep "available"`
  - **注意**：NUMA 节点数不等同于物理 CPU 数量：
    - 它表示内存域，可能在单颗 CPU 内被拆分，也可能跨多颗 CPU。
    - 请以 `lscpu` 输出的 NUMA 节点数为准。
  - 常见配置：单路 1–2，双路 2–4
  - 正确设置有助于充分利用跨 NUMA 域的内存带宽。

- **`kt-num-gpu-experts`**：根据 GPU 显存和实际性能测试决定：
  - GPU 上的 experts 越多 → 延迟越低，但显存占用越高（可能 OOM）

- **`kt-max-deferred-experts-per-token`**：用于开启 CPU-GPU 流水线：
  - `0`：完全同步执行（简单但延迟较高）
  - `1–4`：推荐范围，一部分 experts 延迟到 CPU，在延迟和质量之间取得较好平衡（需要按模型调参）
  - `5–7`：可以获得更低延迟，但存在明显精度下降风险，请谨慎使用

- **`kt-gpu-prefill-token-threshold`**（仅 RAWINT4）：控制原生 INT4 推理的 prefill 策略：
  - **≤ 阈值**：使用 CPU+GPU 混合 prefill。无需额外显存，但随着 token 数量增加性能会缓慢下降。
  - **> 阈值**：使用分层 GPU prefill。长序列性能更好，但需要约 9GB+ 额外显存。
  - 仅在使用 `--kt-method RAWINT4` 时生效。目前仅支持 Kimi-K2-Thinking 模型。

## 直接使用 Python API

如果不集成 SGLang，也可以直接通过 Python API 单独使用 KT-Kernel：

```python
from kt_kernel import KTMoEWrapper

# 初始化 MoE 包装器
wrapper = KTMoEWrapper(
    layer_idx=0,
    num_experts=8,
    num_experts_per_tok=2,
    hidden_size=4096,
    moe_intermediate_size=14336,
    num_gpu_experts=2,
    cpuinfer_threads=32,
    threadpool_count=2,
    weight_path="/path/to/weights",
    chunked_prefill_size=512,
    method="AMXINT4"  # 选项: "AMXINT4", "AMXINT8", "LLAMAFILE"
)

# 从磁盘加载权重（预先量化好）
wrapper.load_weights(physical_to_logical_map)

# 或者从张量加载权重（在线量化）
wrapper.load_weights_from_tensors(gate_proj, up_proj, down_proj, physical_to_logical_map)

# 执行推理
output = wrapper.forward(hidden_states, topk_ids, topk_weights, cuda_stream)

# 或使用异步 API 获取更好的流水线效果
wrapper.submit_forward(hidden_states, topk_ids, topk_weights, cuda_stream)
# ... 做一些其他工作 ...
output = wrapper.sync_forward(hidden_states, cuda_stream)
```

### 高级选项

```python
# 使用更多高级选项初始化
wrapper = KTMoEWrapper(
    layer_idx=0,
    num_experts=8,
    num_experts_per_tok=2,
    hidden_size=4096,
    moe_intermediate_size=14336,
    num_gpu_experts=2,
    cpuinfer_threads=32,
    threadpool_count=2,
    weight_path="/path/to/weights",
    chunked_prefill_size=512,
    method="AMXINT4",
    cpu_save=False,  # 加载后是否将权重常驻 CPU 内存
    max_deferred_experts_per_token=0  # 每个 token 延迟的 experts 数量（用于流水线）
)

# 为特定 batch size 预分配缓冲区（提升性能）
KTMoEWrapper.set_capture_batch_sizes([1, 2, 4, 8, 16])

# 查看当前捕获的 batch size
batch_sizes = KTMoEWrapper.get_capture_batch_sizes()

# 清理缓冲区缓存以释放内存
KTMoEWrapper.clear_buffer_cache()
```

### 手动配置（进阶）

如需打包分发、跨机器部署或构建可移植二进制，需要手动指定目标指令集：

```bash
# 通用分发版（适用于 2017+ 的任何 AVX512 CPU）
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual

# 最大兼容性（适用于 2013+ 的任何 CPU）
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual

# 仅限现代 CPU（Ice Lake+、Zen 4+）
export CPUINFER_CPU_INSTRUCT=FANCY
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
```

**可选：覆盖 VNNI/BF16 检测**
```bash
# 强制启用/禁用 VNNI 和 BF16（用于测试回退）
export CPUINFER_ENABLE_AVX512_VNNI=OFF
export CPUINFER_ENABLE_AVX512_BF16=OFF
./install.sh
```

运行 `./install.sh --help` 查看所有可用选项。

---

## 构建配置

### 手动安装（不使用 install.sh）

如果你不想使用 `install.sh` 脚本：

#### 1. 安装系统依赖

**前置依赖：**
- `cmake`（推荐：`conda install -y cmake`）
- `libhwloc-dev` 和 `pkg-config`

#### 2. 配置构建参数

**核心选项：**

| 变量 | 取值 | 描述 |
|------|------|------|
| `CPUINFER_CPU_INSTRUCT` | `NATIVE`, `AVX512`, `AVX2`, `FANCY` | 使用的 CPU 指令集 |
| `CPUINFER_ENABLE_AMX` | `ON`, `OFF` | 是否启用 Intel AMX 支持 |
| `CPUINFER_BUILD_TYPE` | `Release`, `Debug`, `RelWithDebInfo` | 构建类型（默认：`Release`） |
| `CPUINFER_PARALLEL` | 数值 | 并行构建的 Job 数（默认：自动检测） |
| `CPUINFER_VERBOSE` | `0`, `1` | 是否启用详细构建日志（默认：`0`） |

**指令集说明：**

| 选项 | 目标 CPU | 使用场景 |
|------|----------|----------|
| **`NATIVE`** | 仅限你的特定 CPU | 本地构建（最佳性能，**默认**） |
| **`AVX512`** | Skylake-X、Ice Lake、Cascade Lake、Zen 4+ | 通用分发 |
| **`AVX2`** | Haswell (2013) 及更新 | 最大兼容性 |
| **`FANCY`** | Ice Lake+、Zen 4+ | 具有完整 AVX512 扩展的现代 CPU |

**配置示例：**

```bash
# 本地使用 - 最高性能（默认行为）
export CPUINFER_CPU_INSTRUCT=NATIVE
export CPUINFER_ENABLE_AMX=ON  # 或 OFF

# 分发构建 - 适用于任何 AVX512 CPU
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF

# 最大兼容性 - 适用于 2013 年以来的 CPU
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF

# 调试构建
export CPUINFER_BUILD_TYPE=Debug
export CPUINFER_VERBOSE=1
```

#### 3. 构建并安装

```bash
# 开发模式（可编辑安装）
pip install -e .

# 普通安装
pip install .
```

## 错误排查

### 找不到 CUDA

```
 -- Looking for a CUDA compiler - NOTFOUND
  CMake Error at CMakeLists.txt:389 (message):
    KTRANSFORMERS_USE_CUDA=ON but CUDA compiler not found
```

请确认已安装 CUDA Toolkit 且 `nvcc` 在系统 PATH 中。

可以尝试：

```bash
export CMAKE_ARGS="-D CMAKE_CUDA_COMPILER=$(which nvcc)"
pip install .
```

然后重新安装。

### 找不到 hwloc

在 Debian 系发行版上可以直接：

```bash
sudo apt install libhwloc-dev
```

或从源码构建：https://www.open-mpi.org/projects/hwloc/

```bash
wget https://download.open-mpi.org/release/hwloc/v2.12/hwloc-2.12.2.tar.gz
tar -xzf hwloc-2.12.2.tar.gz
cd hwloc-2.12.2
./configure
make
sudo make install
```

## 权重量化

对于 AMX 后端（`AMXINT4` / `AMXINT8`），CPU 侧 experts 需要通过提供的脚本转换为适配 AMX 的 INT4/INT8 格式：

```bash
python scripts/convert_cpu_weights.py \
  --input-path /path/to/model \
  --input-type bf16 \
  --output /path/to/output \
  --quant-method int4
```

**支持的格式：** FP8、FP16、BF16 → INT4/INT8

对于 LLAMAFILE 后端（`LLAMAFILE`），CPU 侧 experts 直接从 **GGUF** 权重中加载。  
你**不需要**运行 AMX 转换脚本；只需从互联网上下载 GGUF 模型（例如 Hugging Face 上的 GGUF 仓库），并在 `weight_path` 或 SGLang 的 `--kt-weight-path` / `--model` 中指向该 GGUF 目录即可。KT-Kernel 支持多种 GGUF 量化格式，如 `Q4_KM`、`Q4_K`、`Q5_K` 等。

---

更多详细文档、高级参数和低显存模式，请参见 [scripts/README.md](scripts/README.md)。

## 提交前必读

提交信息应符合 Conventional Commits 规范：https://www.conventionalcommits.org/  
在提交前请先格式化代码：

```shell
cmake -B build
cd build
make format
```

你可能需要一个较新的 clang-format（至少 18），在 conda 环境中可以：

```shell
conda install -c conda-forge clang-format=18
rm -rf build
```

并且建议安装 black 用于 Python 代码格式化：

```shell
conda install black
```


================================================
FILE: kt-kernel/bench/.gitignore
================================================
*.jsonl
*.json

================================================
FILE: kt-kernel/bench/Makefile
================================================
# test bench_moe_kernel_tiling.py
kernel_tiling:
	python3 bench_moe_kernel_tiling.py \
	--hidden_size 7168 \
	--intermediate_size 2048 \
	--num_experts_per_tok 8 \
	--expert_num 256 \
	--max_len 51200 \
	--layer_num 1 \
	--qlen 1024 \
	--quant int8 \
	--warm_up_iter 500 \
	--test_iter 1000 \
	--threads 160 \
	--m_block 320 \
	
# 	--n_block_up_gate 256 \
# 	--n_block_down 128 \
# 	--n_block_up_gate_prefi 256 \
# 	--n_block_down_prefi 128 \

# 	--n_block_up_gate 256 \
# 	--n_block_down 512 \

================================================
FILE: kt-kernel/bench/bench_attention.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : Jianwei Dong
Date         : 2024-08-28 10:32:05
Version      : 1.0.0
LastEditors  : Jianwei Dong
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch

layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1

anchor_type = kt_kernel_ext.kvcache.AnchorType.DYNAMIC
kv_type = kt_kernel_ext.kvcache.ggml_type.FP16
retrieval_type = kt_kernel_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 64
max_batch_size: int = 1
max_block_num: int = 1024
CPUInfer = kt_kernel_ext.CPUInfer(max_thread_num)

warm_up_iter = 1000
test_iter = 10000


def bench_linear(cache_seqlen: int):
    with torch.inference_mode(mode=True):
        cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
        seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")

        config = kt_kernel_ext.kvcache.KVCacheConfig(
            layer_num,
            kv_head_num,
            q_head_num,
            head_dim,
            block_len,
            anchor_num,
            anchor_type,
            kv_type,
            retrieval_type,
            layer_step,
            token_step,
            layer_offset,
            max_block_num,
            max_batch_size,
            max_thread_num,
        )
        local_kvcache = kt_kernel_ext.kvcache.KVCache(config)
        block_table = torch.arange(max_block_num, dtype=torch.int32, device="cpu").contiguous().view(1, -1)

        for layer_idx in range(layer_num):
            k_cache = torch.randn(
                (1, cache_seqlen, kv_head_num, head_dim),
                dtype=torch.float16,
                device="cpu",
            ).contiguous()
            v_cache = torch.randn(
                (1, cache_seqlen, kv_head_num, head_dim),
                dtype=torch.float16,
                device="cpu",
            ).contiguous()

            CPUInfer.submit(
                local_kvcache.update_kvcache_fp16(
                    k_cache.data_ptr(),
                    v_cache.data_ptr(),
                    layer_idx,
                    block_table.data_ptr(),
                    1,
                    max_block_num,
                    seqlens_zero.data_ptr(),
                    cache_seqlen,
                )
            )
            CPUInfer.sync()

        input = torch.randn((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
        output = torch.empty((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()

        # attn_lse: (bsz, q_len, q_head_num)
        attn_lse = torch.empty((1, 1, q_head_num), dtype=torch.float32, device="cpu").contiguous()
        input = input / 100

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                local_kvcache.attn(
                    input.data_ptr(),
                    output.data_ptr(),
                    attn_lse.data_ptr(),
                    i % layer_num,
                    0,
                    1,
                    1,
                    max_block_num,
                    block_table.data_ptr(),
                    cache_seqlens.data_ptr(),
                    -1,
                    -1,
                    -1,
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                local_kvcache.attn(
                    input.data_ptr(),
                    output.data_ptr(),
                    attn_lse.data_ptr(),
                    i % layer_num,
                    0,
                    1,
                    1,
                    max_block_num,
                    block_table.data_ptr(),
                    cache_seqlens.data_ptr(),
                    -1,
                    -1,
                    -1,
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print("cache sequence length: ", cache_seqlen)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", total_time / test_iter * 1000000)
        print(
            "Bandwidth: ",
            cache_seqlen * kv_head_num * head_dim * 2 * 2 * test_iter / total_time / 1000 / 1000 / 1000,
            "GB/s",
        )
        print("")


bench_linear(1024)
bench_linear(4096)
bench_linear(16384)
bench_linear(32768)
bench_linear(65536)


================================================
FILE: kt-kernel/bench/bench_attention_torch.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : Jianwei Dong
Date         : 2024-08-28 10:32:05
Version      : 1.0.0
LastEditors  : Jianwei Dong
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch

layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
warm_up_iter = 1000
test_iter = 10000


def bench_linear(cache_seqlen: int, device):
    with torch.inference_mode(mode=True):

        kvcaches = []

        for layer_idx in range(layer_num):
            k_cache = torch.randn(
                (1, 32, cache_seqlen, head_dim),
                dtype=torch.float16,
                device=device,
            ).contiguous()
            v_cache = torch.randn(
                (1, 32, cache_seqlen, head_dim),
                dtype=torch.float16,
                device=device,
            ).contiguous()

            kvcaches.append((k_cache, v_cache))

        input = torch.randn((1, q_head_num, 1, head_dim), dtype=torch.float16, device=device).contiguous()
        input = input / 100

        # warm up
        for i in range(warm_up_iter):
            k_cache = kvcaches[i % layer_num][0]
            v_cache = kvcaches[i % layer_num][1]
            torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            k_cache = kvcaches[i % layer_num][0]
            v_cache = kvcaches[i % layer_num][1]
            torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)
        end = time.perf_counter()
        total_time = end - start
        print("cache sequence length: ", cache_seqlen)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", total_time / test_iter * 1000000)
        print(
            "Bandwidth: ",
            cache_seqlen * q_head_num * head_dim * 2 * 2 * test_iter / total_time / 1000 / 1000 / 1000,
            "GB/s",
        )
        print("")


bench_linear(1024, "cpu")
bench_linear(4096, "cpu")
bench_linear(1024, "cuda")
bench_linear(4096, "cuda")
bench_linear(16384, "cuda")
bench_linear(32768, "cuda")
bench_linear(65536, "cuda")


================================================
FILE: kt-kernel/bench/bench_bf16_moe.py
================================================
"""
Performance benchmark for native BF16 MoE kernel (AMX implementation).

This benchmark measures the performance of the BF16 MoE operator with:
- Native BF16 weights (no quantization)
- BF16 activations
- AMX BF16 DPBF16PS compute path
"""

import os
import sys
import time
import json
import subprocess
import platform

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))

import torch
from kt_kernel import kt_kernel_ext
from tqdm import tqdm

# Test parameters
expert_num = 256
hidden_size = 7168
intermediate_size = 2048
num_experts_per_tok = 8
max_len = 25600

layer_num = 5
qlen = 1
warm_up_iter = 100
test_iter = 3000
CPUINFER_PARAM = 80

CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)

# Result file path
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
json_path = os.path.join(script_dir, "bench_bf16_moe.jsonl")


def get_git_commit():
    """Get current git commit info"""
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg
        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        result["dirty"] = bool(dirty_output)
        if dirty_output:
            result["dirty_files"] = dirty_output.splitlines()
    except Exception as e:
        result["commit"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    """Get system information"""
    info = {}
    uname = platform.uname()
    info["system_name"] = uname.system
    info["node_name"] = uname.node

    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception:
            pass
    info["cpu_model"] = cpu_model
    info["cpu_core_count"] = os.cpu_count()
    return info


def record_results(result, filename=json_path):
    """Append result to JSON file"""
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


def generate_bf16_weights(shape: tuple):
    """
    Generate random BF16 weights.

    Args:
        shape: (expert_num, n, k) - weight tensor shape

    Returns:
        bf16_weights: bfloat16 tensor with random values
    """
    # Generate random BF16 weights with small values to avoid overflow
    weights = (torch.randn(shape, dtype=torch.float32, device="cuda") / 100.0).to(torch.bfloat16).to("cpu").contiguous()
    return weights


def bench_bf16_moe():
    """Benchmark native BF16 MoE performance"""
    with torch.inference_mode():
        print("=" * 70)
        print("Native BF16 MoE Kernel Performance Benchmark")
        print("=" * 70)

        # Generate BF16 weights
        print("\nGenerating BF16 weights...")
        torch.manual_seed(42)
        gate_proj = generate_bf16_weights((expert_num, intermediate_size, hidden_size))
        up_proj = generate_bf16_weights((expert_num, intermediate_size, hidden_size))
        down_proj = generate_bf16_weights((expert_num, hidden_size, intermediate_size))

        physical_to_logical_map = torch.tensor(range(expert_num), device="cpu", dtype=torch.int64).contiguous()

        # Build MoE layers
        print("Building BF16 MoE layers...")
        moes = []
        for _ in tqdm(range(layer_num), desc="Initializing MOEs"):
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len

            # Set BF16 weight pointers (no scales needed)
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()

            # No scales for BF16
            config.gate_scale = 0
            config.up_scale = 0
            config.down_scale = 0
            config.pool = CPUInfer.backend_

            moe = kt_kernel_ext.moe.AMXBF16_MOE(config)
            CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
            CPUInfer.sync()
            moes.append(moe)

        # Generate input data
        print("Generating input data...")
        gen_iter = 1000
        expert_ids = (
            torch.rand(gen_iter * qlen, expert_num, device="cpu")
            .argsort(dim=-1)[:, :num_experts_per_tok]
            .reshape(gen_iter, qlen * num_experts_per_tok)
            .contiguous()
        )
        weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
        input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
        output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
        qlen_tensor = torch.tensor([qlen], dtype=torch.int32)

        # Warmup
        print(f"Warming up ({warm_up_iter} iterations)...")
        for i in tqdm(range(warm_up_iter), desc="Warm-up"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    qlen_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

        # Benchmark
        print(f"Running benchmark ({test_iter} iterations)...")
        start = time.perf_counter()
        for i in tqdm(range(test_iter), desc="Testing"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    qlen_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start

        # Calculate metrics
        time_per_iter_us = total_time / test_iter * 1e6

        # FLOPS calculation:
        # Each expert performs: gate(intermediate x hidden) + up(intermediate x hidden) + down(hidden x intermediate)
        # GEMM/GEMV: 2 * m * n * k flops (multiply + accumulate = 2 ops per element)
        flops_per_expert = (
            2 * intermediate_size * hidden_size  # gate
            + 2 * intermediate_size * hidden_size  # up
            + 2 * hidden_size * intermediate_size  # down
        )
        total_flops = qlen * num_experts_per_tok * flops_per_expert * test_iter
        tflops = total_flops / total_time / 1e12

        # Bandwidth calculation (BF16 = 2 bytes per element)
        bytes_per_elem = 2.0
        # Weight memory: gate + up + down per expert
        bandwidth = (
            hidden_size
            * intermediate_size
            * 3
            * num_experts_per_tok
            * (1 / num_experts_per_tok * expert_num * (1 - (1 - num_experts_per_tok / expert_num) ** qlen))
            * bytes_per_elem
            * test_iter
            / total_time
            / 1e9
        )  # GB/s

        # Print results
        print("\n" + "=" * 70)
        print("Benchmark Results")
        print("=" * 70)
        print(f"Quant mode: Native BF16 (no quantization)")
        print(f"Total time: {total_time:.4f} s")
        print(f"Iterations: {test_iter}")
        print(f"Time per iteration: {time_per_iter_us:.2f} us")
        print(f"Bandwidth: {bandwidth:.2f} GB/s")
        print(f"TFLOPS: {tflops:.4f}")
        print("")

        # Record results
        result = {
            "test_name": os.path.basename(__file__),
            "quant_mode": "bf16_native",
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": tflops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "expert_num": expert_num,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "num_experts_per_tok": num_experts_per_tok,
                "layer_num": layer_num,
                "qlen": qlen,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "CPUInfer_parameter": CPUINFER_PARAM,
            },
        }
        result.update(get_git_commit())
        result.update(get_system_info())
        record_results(result)

        return tflops, bandwidth


if __name__ == "__main__":
    bench_bf16_moe()


================================================
FILE: kt-kernel/bench/bench_fp8_moe.py
================================================
"""
Performance benchmark for FP8 MoE kernel (AVX implementation).

This benchmark measures the performance of the FP8 MoE operator with:
- FP8 (E4M3) weights with 128x128 block-wise scaling
- BF16 activations
- AVX-512 DPBF16 compute path
"""

import os
import sys
import time
import json
import subprocess
import platform

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))

import torch
from kt_kernel import kt_kernel_ext
from tqdm import tqdm

# Test parameters
expert_num = 256
hidden_size = 7168
intermediate_size = 2048
num_experts_per_tok = 8
fp8_group_size = 128
max_len = 25600

layer_num = 2
qlen = 1
warm_up_iter = 1000
test_iter = 3000
CPUINFER_PARAM = 80

CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)

# Result file path
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
json_path = os.path.join(script_dir, "bench_results.jsonl")


def get_git_commit():
    """Get current git commit info"""
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg
        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        result["dirty"] = bool(dirty_output)
        if dirty_output:
            result["dirty_files"] = dirty_output.splitlines()
    except Exception as e:
        result["commit"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    """Get system information"""
    info = {}
    uname = platform.uname()
    info["system_name"] = uname.system
    info["node_name"] = uname.node

    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception:
            pass
    info["cpu_model"] = cpu_model
    info["cpu_core_count"] = os.cpu_count()
    return info


def record_results(result, filename=json_path):
    """Append result to JSON file"""
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


def generate_fp8_weights_direct(shape: tuple, group_size: int = 128):
    """
    Directly generate random FP8 weights and e8m0 format scale_inv.

    Args:
        shape: (expert_num, n, k) - weight tensor shape
        group_size: block size for scaling (128x128 blocks)

    Returns:
        fp8_weights: uint8 tensor with random FP8 E4M3 values
        scale_inv: fp32 tensor with e8m0 format (powers of 2)
    """
    e, n, k = shape
    n_blocks = n // group_size
    k_blocks = k // group_size

    # Directly generate random FP8 weights as uint8
    # FP8 E4M3 format: 1 sign + 4 exp + 3 mantissa
    # Valid range for normal numbers: exp 1-14 (0 is subnormal, 15 is special)
    fp8_weights = torch.randint(0, 256, (e, n, k), dtype=torch.uint8, device="cuda").to("cpu").contiguous()

    # Generate e8m0 format scale_inv (powers of 2)
    # e8m0: 8-bit exponent only, no mantissa, bias = 127
    # Generate random exponents in a reasonable range (e.g., -8 to 8)
    exponents = torch.randint(-8, 9, (e, n_blocks, k_blocks), dtype=torch.int32, device="cuda").to("cpu").contiguous()
    scale_inv = (2.0 ** exponents.float()).to(torch.float32).contiguous()

    return fp8_weights, scale_inv


def bench_fp8_moe():
    """Benchmark FP8 MoE performance"""
    with torch.inference_mode():
        print("=" * 70)
        print("FP8 MoE Kernel Performance Benchmark")
        print("=" * 70)

        # Generate FP8 weights directly (no quantization from fp32)
        print("\nGenerating FP8 weights directly...")
        torch.manual_seed(42)
        gate_fp8, gate_scales = generate_fp8_weights_direct(
            (expert_num, intermediate_size, hidden_size), fp8_group_size
        )
        up_fp8, up_scales = generate_fp8_weights_direct((expert_num, intermediate_size, hidden_size), fp8_group_size)
        down_fp8, down_scales = generate_fp8_weights_direct(
            (expert_num, hidden_size, intermediate_size), fp8_group_size
        )

        physical_to_logical_map = torch.tensor(range(expert_num), device="cpu", dtype=torch.int64).contiguous()

        # Build MoE layers
        print("Building FP8 MoE layers...")
        moes = []
        for _ in tqdm(range(layer_num), desc="Initializing MOEs"):
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.quant_config.bits = 8
            config.quant_config.group_size = fp8_group_size
            config.quant_config.zero_point = False

            config.gate_proj = gate_fp8.data_ptr()
            config.up_proj = up_fp8.data_ptr()
            config.down_proj = down_fp8.data_ptr()
            config.gate_scale = gate_scales.data_ptr()
            config.up_scale = up_scales.data_ptr()
            config.down_scale = down_scales.data_ptr()
            config.pool = CPUInfer.backend_

            moe = kt_kernel_ext.moe.AMXFP8_MOE(config)
            CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
            CPUInfer.sync()
            moes.append(moe)

        # Generate input data
        print("Generating input data...")
        gen_iter = 1000
        expert_ids = (
            torch.rand(gen_iter * qlen, expert_num, device="cpu")
            .argsort(dim=-1)[:, :num_experts_per_tok]
            .reshape(gen_iter, qlen * num_experts_per_tok)
            .contiguous()
        )
        weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
        input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
        output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
        qlen_tensor = torch.tensor([qlen], dtype=torch.int32)

        # Warmup
        print(f"Warming up ({warm_up_iter} iterations)...")
        for i in tqdm(range(warm_up_iter), desc="Warm-up"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    qlen_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

        # Benchmark
        print(f"Running benchmark ({test_iter} iterations)...")
        start = time.perf_counter()
        for i in tqdm(range(test_iter), desc="Testing"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    qlen_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start

        # Calculate metrics
        time_per_iter_us = total_time / test_iter * 1e6

        # FLOPS calculation:
        # Each expert performs: gate(intermediate x hidden) + up(intermediate x hidden) + down(hidden x intermediate)
        # GEMM/GEMV: 2 * m * n * k flops (multiply + accumulate = 2 ops per element)
        # For vector-matrix multiply (qlen=1): 2 * n * k per matrix
        flops_per_expert = (
            2 * intermediate_size * hidden_size  # gate
            + 2 * intermediate_size * hidden_size  # up
            + 2 * hidden_size * intermediate_size  # down
        )
        total_flops = qlen * num_experts_per_tok * flops_per_expert * test_iter
        tflops = total_flops / total_time / 1e12

        # Bandwidth calculation (FP8 = 1 byte per element)
        bytes_per_elem = 1.0
        # Weight memory: gate + up + down per expert
        bandwidth = (
            hidden_size
            * intermediate_size
            * 3
            * num_experts_per_tok
            * (1 / num_experts_per_tok * expert_num * (1 - (1 - num_experts_per_tok / expert_num) ** qlen))
            * bytes_per_elem
            * test_iter
            / total_time
            / 1e9
        )  # 单位：GB/s

        # Print results
        print("\n" + "=" * 70)
        print("Benchmark Results")
        print("=" * 70)
        print(f"Quant mode: FP8 (E4M3) with {fp8_group_size}x{fp8_group_size} block scaling")
        print(f"Total time: {total_time:.4f} s")
        print(f"Iterations: {test_iter}")
        print(f"Time per iteration: {time_per_iter_us:.2f} us")
        print(f"Bandwidth: {bandwidth:.2f} GB/s")
        print(f"TFLOPS: {tflops:.4f}")
        print("")

        # Record results
        result = {
            "test_name": os.path.basename(__file__),
            "quant_mode": "fp8_e4m3",
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": tflops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "expert_num": expert_num,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "num_experts_per_tok": num_experts_per_tok,
                "fp8_group_size": fp8_group_size,
                "layer_num": layer_num,
                "qlen": qlen,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "CPUInfer_parameter": CPUINFER_PARAM,
            },
        }
        result.update(get_git_commit())
        result.update(get_system_info())
        record_results(result)

        return tflops, bandwidth


if __name__ == "__main__":
    bench_fp8_moe()


================================================
FILE: kt-kernel/bench/bench_fp8_perchannel_moe.py
================================================
"""
Performance benchmark for FP8 Per-Channel MoE kernel (GLM-4.7-FP8 style).

This benchmark measures the performance of the FP8 Per-Channel MoE operator with:
- FP8 (E4M3) weights with per-channel scaling (one scale per output row)
- BF16 activations
- AVX-512 DPBF16 compute path
"""

import os
import sys
import time
import json
import subprocess
import platform

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))

import torch
from kt_kernel import kt_kernel_ext
from tqdm import tqdm

# Test parameters
expert_num = 256
hidden_size = 7168
intermediate_size = 2048
num_experts_per_tok = 8
max_len = 25600

layer_num = 2
qlen = 1
warm_up_iter = 1000
test_iter = 3000
CPUINFER_PARAM = 80

CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)

# Result file path
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
json_path = os.path.join(script_dir, "bench_results.jsonl")


def get_git_commit():
    """Get current git commit info"""
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg
        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        result["dirty"] = bool(dirty_output)
        if dirty_output:
            result["dirty_files"] = dirty_output.splitlines()
    except Exception as e:
        result["commit"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    """Get system information"""
    info = {}
    uname = platform.uname()
    info["system_name"] = uname.system
    info["node_name"] = uname.node

    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception:
            pass
    info["cpu_model"] = cpu_model
    info["cpu_core_count"] = os.cpu_count()
    return info


def record_results(result, filename=json_path):
    """Append result to JSON file"""
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


def generate_fp8_perchannel_weights_direct(shape: tuple):
    """
    Directly generate random FP8 weights and per-channel scales.

    Args:
        shape: (expert_num, n, k) - weight tensor shape

    Returns:
        fp8_weights: uint8 tensor with random FP8 E4M3 values
        scales: fp32 tensor with per-channel scales, shape [expert_num, n]
    """
    e, n, k = shape

    # Directly generate random FP8 weights as uint8
    # FP8 E4M3 format: 1 sign + 4 exp + 3 mantissa
    fp8_weights = torch.randint(0, 256, (e, n, k), dtype=torch.uint8, device="cuda").to("cpu").contiguous()

    # Generate random per-channel scales (one per output row)
    # Use reasonable scale range (e.g., 2^-8 to 2^8)
    exponents = torch.randint(-8, 9, (e, n), dtype=torch.int32, device="cuda").to("cpu").contiguous()
    scales = (2.0 ** exponents.float()).to(torch.float32).contiguous()

    return fp8_weights, scales


def bench_fp8_perchannel_moe():
    """Benchmark FP8 Per-Channel MoE performance"""
    with torch.inference_mode():
        print("=" * 70)
        print("FP8 Per-Channel MoE Kernel Performance Benchmark")
        print("=" * 70)

        # Generate FP8 weights with per-channel scales
        print("\nGenerating FP8 weights with per-channel scales...")
        torch.manual_seed(42)
        gate_fp8, gate_scales = generate_fp8_perchannel_weights_direct((expert_num, intermediate_size, hidden_size))
        up_fp8, up_scales = generate_fp8_perchannel_weights_direct((expert_num, intermediate_size, hidden_size))
        down_fp8, down_scales = generate_fp8_perchannel_weights_direct((expert_num, hidden_size, intermediate_size))

        physical_to_logical_map = torch.tensor(range(expert_num), device="cpu", dtype=torch.int64).contiguous()

        # Build MoE layers
        print("Building FP8 Per-Channel MoE layers...")
        moes = []
        for _ in tqdm(range(layer_num), desc="Initializing MOEs"):
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.quant_config.bits = 8
            config.quant_config.group_size = 0  # Not used for per-channel
            config.quant_config.zero_point = False
            config.quant_config.per_channel = True  # Enable per-channel mode

            config.gate_proj = gate_fp8.data_ptr()
            config.up_proj = up_fp8.data_ptr()
            config.down_proj = down_fp8.data_ptr()
            config.gate_scale = gate_scales.data_ptr()
            config.up_scale = up_scales.data_ptr()
            config.down_scale = down_scales.data_ptr()
            config.pool = CPUInfer.backend_

            moe = kt_kernel_ext.moe.AMXFP8PerChannel_MOE(config)
            CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
            CPUInfer.sync()
            moes.append(moe)

        # Generate input data
        print("Generating input data...")
        gen_iter = 1000
        expert_ids = (
            torch.rand(gen_iter * qlen, expert_num, device="cpu")
            .argsort(dim=-1)[:, :num_experts_per_tok]
            .reshape(gen_iter, qlen * num_experts_per_tok)
            .contiguous()
        )
        weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
        input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
        output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
        qlen_tensor = torch.tensor([qlen], dtype=torch.int32)

        # Warmup
        print(f"Warming up ({warm_up_iter} iterations)...")
        for i in tqdm(range(warm_up_iter), desc="Warm-up"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    qlen_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

        # Benchmark
        print(f"Running benchmark ({test_iter} iterations)...")
        start = time.perf_counter()
        for i in tqdm(range(test_iter), desc="Testing"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    qlen_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start

        # Calculate metrics
        time_per_iter_us = total_time / test_iter * 1e6

        # FLOPS calculation:
        # Each expert performs: gate(intermediate x hidden) + up(intermediate x hidden) + down(hidden x intermediate)
        # GEMM/GEMV: 2 * m * n * k flops (multiply + accumulate = 2 ops per element)
        # For vector-matrix multiply (qlen=1): 2 * n * k per matrix
        flops_per_expert = (
            2 * intermediate_size * hidden_size  # gate
            + 2 * intermediate_size * hidden_size  # up
            + 2 * hidden_size * intermediate_size  # down
        )
        total_flops = qlen * num_experts_per_tok * flops_per_expert * test_iter
        tflops = total_flops / total_time / 1e12

        # Bandwidth calculation (FP8 = 1 byte per element)
        bytes_per_elem = 1.0
        # Weight memory: gate + up + down per expert
        bandwidth = (
            hidden_size
            * intermediate_size
            * 3
            * num_experts_per_tok
            * (1 / num_experts_per_tok * expert_num * (1 - (1 - num_experts_per_tok / expert_num) ** qlen))
            * bytes_per_elem
            * test_iter
            / total_time
            / 1e9
        )

        # Print results
        print("\n" + "=" * 70)
        print("Benchmark Results")
        print("=" * 70)
        print(f"Quant mode: FP8 (E4M3) with per-channel scaling")
        print(f"Total time: {total_time:.4f} s")
        print(f"Iterations: {test_iter}")
        print(f"Time per iteration: {time_per_iter_us:.2f} us")
        print(f"Bandwidth: {bandwidth:.2f} GB/s")
        print(f"TFLOPS: {tflops:.4f}")
        print("")

        # Record results
        result = {
            "test_name": os.path.basename(__file__),
            "quant_mode": "fp8_e4m3_perchannel",
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": tflops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "expert_num": expert_num,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "num_experts_per_tok": num_experts_per_tok,
                "quant_type": "per_channel",
                "layer_num": layer_num,
                "qlen": qlen,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "CPUInfer_parameter": CPUINFER_PARAM,
            },
        }
        result.update(get_git_commit())
        result.update(get_system_info())
        record_results(result)

        return tflops, bandwidth


if __name__ == "__main__":
    bench_fp8_perchannel_moe()


================================================
FILE: kt-kernel/bench/bench_k2_moe_amx.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Benchmark AMX_K2_MOE_TP int4 path with packed weights and BF16 scales.
"""
import json
import math
import os
import platform
import subprocess
import sys
import time

from tqdm import tqdm

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))

from kt_kernel import kt_kernel_ext
import torch

# Benchmark parameters (single MoE, no layer loop)
expert_num = 384
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
qlen = 1
warm_up_iter = 1000
test_iter = 5000
k_group_size = 32

physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()

worker_config = kt_kernel_ext.WorkerPoolConfig()
worker_config.subpool_count = 2
worker_config.subpool_numa_map = [0, 1]
worker_config.subpool_thread_count = [40, 40]
CPUInfer = kt_kernel_ext.CPUInfer(worker_config)


def get_git_commit():
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg

        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        if dirty_output:
            result["dirty"] = True
            result["dirty_files"] = dirty_output.splitlines()
        else:
            result["dirty"] = False
    except Exception as e:
        result["commit"] = None
        result["commit_message"] = None
        result["dirty"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    info = {}
    uname = platform.uname()
    info["system_name"] = uname.system
    info["node_name"] = uname.node

    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception as e:
            cpu_model = f"Error: {e}"
    info["cpu_model"] = cpu_model

    mem_total_gb = None
    if os.path.exists("/proc/meminfo"):
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal" in line:
                        mem_kb = float(line.split(":", 1)[1].split()[0])
                        mem_total_gb = round(mem_kb / (1024 * 1024), 2)
                        break
        except Exception as e:
            mem_total_gb = f"Error: {e}"
    info["memory_size_GB"] = mem_total_gb

    info["cpu_core_count"] = os.cpu_count()

    sockets = set()
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "physical id" in line:
                        sockets.add(line.split(":", 1)[1].strip())
        except Exception:
            sockets = set()
    info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1

    return info


script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, script_name + ".jsonl")


def record_results(result, filename=json_path):
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


def pack_to_int32(value: torch.Tensor, num_bits: int, packed_dim: int = 1) -> torch.Tensor:
    if value.dtype is not torch.int8:
        raise ValueError("Tensor must be torch.int8 before packing")
    if not (1 <= num_bits <= 8):
        raise ValueError(f"num_bits must be in [1, 8], got {num_bits}")

    offset = 1 << (num_bits - 1)
    value = (value + offset).to(torch.uint8)
    device = value.device

    pack_factor = 32 // num_bits

    if packed_dim == 0:
        value = value.transpose(0, 1)

    rows, cols = value.shape
    padded_cols = math.ceil(cols / pack_factor) * pack_factor
    pad_len = padded_cols - cols

    if pad_len > 0:
        value = torch.nn.functional.pad(value, (0, pad_len))

    num_groups = padded_cols // pack_factor
    reshaped = value.view(rows, num_groups, pack_factor).to(torch.int32)
    bit_shifts = torch.arange(pack_factor, device=device, dtype=torch.int32) * num_bits
    packed = (reshaped << bit_shifts).sum(dim=2, dtype=torch.int32)

    if packed_dim == 0:
        packed = packed.transpose(0, 1)

    return packed


def pack_tensor_per_row(q: torch.Tensor, num_bits: int) -> torch.Tensor:
    e, rows, cols = q.shape
    flat = q.view(e * rows, cols)
    packed = pack_to_int32(flat, num_bits)
    return packed.view(e, rows, -1).contiguous()


def quantize_k2_tensor(weights: torch.Tensor, group_size: int):
    """
    K2 int4 quantization producing int32-packed weights (8 int4s each) and BF16 scales.
    """
    weights_f32 = weights.to(torch.float32)
    e, rows, cols = weights_f32.shape
    if cols % group_size != 0 or cols % 2 != 0:
        raise ValueError(f"cols ({cols}) must be divisible by group_size ({group_size}) and 2")

    reshaped = weights_f32.view(e, rows, cols // group_size, group_size)
    max_abs = reshaped.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8)
    scales = (max_abs / 7.0).squeeze(-1)
    q = torch.round(reshaped / scales.unsqueeze(-1)).clamp(-8, 7).to(torch.int8)
    q = q.view(e, rows, cols)
    packed = pack_tensor_per_row(q, num_bits=4).view(e, rows, cols // 8).contiguous()
    scales = scales.to(torch.bfloat16).contiguous().view(e, rows, cols // group_size).contiguous()
    return packed, scales


def build_quantized_layer_weights():
    gate_proj = torch.randn(
        (expert_num, intermediate_size, hidden_size),
        dtype=torch.float32,
        device="cpu",
    ).contiguous()
    up_proj = torch.randn(
        (expert_num, intermediate_size, hidden_size),
        dtype=torch.float32,
        device="cpu",
    ).contiguous()
    down_proj = torch.randn(
        (expert_num, hidden_size, intermediate_size),
        dtype=torch.float32,
        device="cpu",
    ).contiguous()

    gate_q, gate_scales = quantize_k2_tensor(gate_proj, k_group_size)
    up_q, up_scales = quantize_k2_tensor(up_proj, k_group_size)
    down_q, down_scales = quantize_k2_tensor(down_proj, k_group_size)

    return {
        "gate_qweight": gate_q,
        "up_qweight": up_q,
        "down_qweight": down_q,
        "gate_scales": gate_scales,
        "up_scales": up_scales,
        "down_scales": down_scales,
    }


def bench_k2_moe():
    with torch.inference_mode():
        bytes_per_elem = 0.5 + 2.0 / k_group_size

        quant_data = build_quantized_layer_weights()
        config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
        config.max_len = max_len
        config.quant_config.bits = 4
        config.quant_config.group_size = k_group_size
        config.quant_config.zero_point = False

        config.gate_proj = quant_data["gate_qweight"].data_ptr()
        config.up_proj = quant_data["up_qweight"].data_ptr()
        config.down_proj = quant_data["down_qweight"].data_ptr()

        config.gate_scale = quant_data["gate_scales"].data_ptr()
        config.up_scale = quant_data["up_scales"].data_ptr()
        config.down_scale = quant_data["down_scales"].data_ptr()
        config.pool = CPUInfer.backend_

        moe = kt_kernel_ext.moe.AMXInt4_KGroup_MOE(config)
        CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
        CPUInfer.sync()

        gen_iter = 3000
        expert_ids = (
            torch.rand(gen_iter * qlen, expert_num, device="cpu")
            .argsort(dim=-1)[:, :num_experts_per_tok]
            .reshape(gen_iter, qlen * num_experts_per_tok)
            .contiguous()
        )
        weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
        input_tensor = torch.randn((qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
        output_tensor = torch.empty_like(input_tensor)
        bsz_tensor = torch.tensor([qlen], device="cpu")

        for i in tqdm(range(warm_up_iter), desc="Warm-up"):
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor.data_ptr(),
                    output_tensor.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

        start = time.perf_counter()
        for i in tqdm(range(test_iter), desc="Testing"):
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor.data_ptr(),
                    output_tensor.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start

        time_per_iter_us = total_time / test_iter * 1e6
        bandwidth = (
            hidden_size
            * intermediate_size
            * 3
            * num_experts_per_tok
            * (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
            * bytes_per_elem
            * test_iter
            / total_time
            / 1e9
        )
        flops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12

        print("Quant mode: int4_k2")
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", time_per_iter_us)
        print("Bandwidth: ", bandwidth, "GB/s")
        print("Flops: ", flops, "TFLOPS")
        print("")

        result = {
            "quant_mode": "int4_k2",
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": flops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "expert_num": expert_num,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "max_len": max_len,
                "num_experts_per_tok": num_experts_per_tok,
                "qlen": qlen,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "k_group_size": k_group_size,
                "bytes_per_elem": bytes_per_elem,
            },
        }
        result.update(get_git_commit())
        result.update(get_system_info())
        record_results(result)


if __name__ == "__main__":
    bench_k2_moe()


================================================
FILE: kt-kernel/bench/bench_k2_write_buffer.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Benchmark write_weight_scale_to_buffer for AMX_K2_MOE_TP (int4 packed weights + bf16 scales).

Uses two MOE instances that alternate writing to simulate realistic multi-layer scenarios.
"""
import json
import os
import platform
import subprocess
import sys
import time

from tqdm import tqdm

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))

from kt_kernel import kt_kernel_ext
import torch

# Benchmark parameters
expert_num = 384
num_experts_per_tok = expert_num
gpu_tp_count = 4

warm_up_iter = 3
test_iter = 7

gpu_experts_num = expert_num

hidden_size = 7168
intermediate_size = 2048
group_size = 32
max_len = 1

physical_to_logical_map = torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
CPUInfer = kt_kernel_ext.CPUInfer(80)


def get_git_commit():
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg

        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        if dirty_output:
            result["dirty"] = True
            result["dirty_files"] = dirty_output.splitlines()
        else:
            result["dirty"] = False
    except Exception as e:
        result["commit"] = None
        result["commit_message"] = None
        result["dirty"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    info = {}
    uname = platform.uname()
    info["system_name"] = uname.system
    info["node_name"] = uname.node

    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception as e:
            cpu_model = f"Error: {e}"
    info["cpu_model"] = cpu_model

    mem_total_gb = None
    if os.path.exists("/proc/meminfo"):
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal" in line:
                        mem_kb = float(line.split(":", 1)[1].split()[0])
                        mem_total_gb = round(mem_kb / (1024 * 1024), 2)
                        break
        except Exception as e:
            mem_total_gb = f"Error: {e}"
    info["memory_size_GB"] = mem_total_gb

    info["cpu_core_count"] = os.cpu_count()

    sockets = set()
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "physical id" in line:
                        sockets.add(line.split(":", 1)[1].strip())
        except Exception:
            sockets = set()
    info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1

    return info


script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, script_name + ".jsonl")


def record_results(result, filename=json_path):
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


def allocate_weights():
    per_mat_weight_bytes = (hidden_size * intermediate_size) // 2
    per_mat_scale_elems = (hidden_size * intermediate_size) // group_size

    gate_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
    up_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
    down_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)

    gate_scale = torch.randn(expert_num * per_mat_scale_elems, dtype=torch.bfloat16)
    up_scale = torch.randn(expert_num * per_mat_scale_elems, dtype=torch.bfloat16)
    down_scale = torch.randn(expert_num * per_mat_scale_elems, dtype=torch.bfloat16)

    return (
        gate_q.contiguous(),
        up_q.contiguous(),
        down_q.contiguous(),
        gate_scale.contiguous(),
        up_scale.contiguous(),
        down_scale.contiguous(),
        per_mat_weight_bytes,
        per_mat_scale_elems,
    )


def build_moe(layer_idx=0):
    """Build a single MOE instance with the given layer_idx."""
    (
        gate_q,
        up_q,
        down_q,
        gate_scale,
        up_scale,
        down_scale,
        per_mat_weight_bytes,
        per_mat_scale_elems,
    ) = allocate_weights()

    config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
    config.max_len = max_len
    config.layer_idx = layer_idx
    config.quant_config.bits = 4
    config.quant_config.group_size = group_size
    config.quant_config.zero_point = False
    config.pool = CPUInfer.backend_

    config.gate_proj = gate_q.data_ptr()
    config.up_proj = up_q.data_ptr()
    config.down_proj = down_q.data_ptr()
    config.gate_scale = gate_scale.data_ptr()
    config.up_scale = up_scale.data_ptr()
    config.down_scale = down_scale.data_ptr()

    moe = kt_kernel_ext.moe.AMXInt4_KGroup_MOE(config)
    CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
    CPUInfer.sync()

    keep_tensors = {
        "gate_q": gate_q,
        "up_q": up_q,
        "down_q": down_q,
        "gate_scale": gate_scale,
        "up_scale": up_scale,
        "down_scale": down_scale,
    }

    buffer_shapes = {
        "per_mat_weight_bytes": per_mat_weight_bytes,
        "per_mat_scale_elems": per_mat_scale_elems,
    }

    return moe, buffer_shapes, keep_tensors


def allocate_buffers(buffer_shapes):
    """Allocate shared output buffers for single expert."""
    per_mat_weight_bytes = buffer_shapes["per_mat_weight_bytes"]
    per_mat_scale_elems = buffer_shapes["per_mat_scale_elems"]

    weight_bytes_per_expert_per_tp = per_mat_weight_bytes // gpu_tp_count
    scale_elems_per_expert_per_tp = per_mat_scale_elems // gpu_tp_count

    # Each buffer stores data for a single expert
    w13_weight_bufs = [torch.empty(2 * weight_bytes_per_expert_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    w13_scale_bufs = [torch.empty(2 * scale_elems_per_expert_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)]
    w2_weight_bufs = [torch.empty(weight_bytes_per_expert_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    w2_scale_bufs = [torch.empty(scale_elems_per_expert_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)]

    buffer_ptrs = {
        "w13_weight_ptrs": [buf.data_ptr() for buf in w13_weight_bufs],
        "w13_scale_ptrs": [buf.data_ptr() for buf in w13_scale_bufs],
        "w2_weight_ptrs": [buf.data_ptr() for buf in w2_weight_bufs],
        "w2_scale_ptrs": [buf.data_ptr() for buf in w2_scale_bufs],
    }

    keep_tensors = {
        "w13_weight_bufs": w13_weight_bufs,
        "w13_scale_bufs": w13_scale_bufs,
        "w2_weight_bufs": w2_weight_bufs,
        "w2_scale_bufs": w2_scale_bufs,
    }

    return buffer_ptrs, keep_tensors


def bench_write_buffer():
    # Build two MOE instances with different layer_idx
    moe_0, buffer_shapes, keep_tensors_0 = build_moe(layer_idx=0)
    moe_1, _, keep_tensors_1 = build_moe(layer_idx=1)
    moes = [moe_0, moe_1]

    # Allocate shared buffers
    buffer_ptrs, buffer_keep_tensors = allocate_buffers(buffer_shapes)

    total_weights = hidden_size * intermediate_size * expert_num * 3
    # Throughput accounting: scale bytes (bf16) + weight bytes (int4 packed)
    bytes_per_call = total_weights // group_size * 2 + total_weights // 2

    # Warm-up: alternate between two MOEs
    for _ in tqdm(range(warm_up_iter), desc="Warm-up"):
        for moe_idx, moe in enumerate(moes):
            for expert_id in range(gpu_experts_num):
                CPUInfer.submit(
                    moe.write_weight_scale_to_buffer_task(
                        gpu_tp_count=gpu_tp_count,
                        expert_id=expert_id,
                        **buffer_ptrs,
                    )
                )
                CPUInfer.sync()

    total_time = 0
    for iter_idx in tqdm(range(test_iter), desc="Testing"):
        start = time.perf_counter()
        # Alternate between two MOEs
        for moe_idx, moe in enumerate(moes):
            for expert_id in range(gpu_experts_num):
                CPUInfer.submit(
                    moe.write_weight_scale_to_buffer_task(
                        gpu_tp_count=gpu_tp_count,
                        expert_id=expert_id,
                        **buffer_ptrs,
                    )
                )
                CPUInfer.sync()
        end = time.perf_counter()
        iter_time = end - start
        total_time += iter_time
        print(f"Iter {iter_idx}: {iter_time*1000:.2f} ms")
        time.sleep(0.3)

    # bytes_per_call is for one MOE, we have 2 MOEs
    bytes_per_iter = bytes_per_call * 2
    time_per_iter_ms = total_time / test_iter * 1000
    bandwidth_gbs = bytes_per_iter * test_iter / total_time / 1e9

    print(f"\n{'='*60}")
    print("K2 write_weight_scale_to_buffer benchmark (2 MOEs alternating)")
    print(f"{'='*60}")
    print(f"Time per iteration: {time_per_iter_ms:.2f} ms")
    print(f"Bandwidth: {bandwidth_gbs:.2f} GB/s")
    print(f"Experts per MOE: {gpu_experts_num}, MOEs: 2")
    print(f"Time per expert: {time_per_iter_ms/(gpu_experts_num*2)*1000:.2f} us")

    result = {
        "op": "write_weight_scale_to_buffer_k2",
        "time_per_iteration_ms": time_per_iter_ms,
        "bandwidth_GBs": bandwidth_gbs,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "test_parameters": {
            "expert_num": expert_num,
            "hidden_size": hidden_size,
            "intermediate_size": intermediate_size,
            "group_size": group_size,
            "gpu_tp_count": gpu_tp_count,
            "bytes_per_iter": bytes_per_iter,
            "num_moes": 2,
        },
    }
    result.update(get_git_commit())
    result.update(get_system_info())
    record_results(result)


if __name__ == "__main__":
    bench_write_buffer()


================================================
FILE: kt-kernel/bench/bench_linear.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : chenht2022
Date         : 2024-07-25 10:31:59
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2024-08-06 10:35:35
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch

input_size = 16384
output_size = 5120
stride = 16
group_max_len = 1024
layer_num = 10
qlen = 1
CPUInfer = kt_kernel_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000


def bench_linear(quant_mode: str):
    with torch.inference_mode(mode=True):

        hidden_type = 30  # ggml_type::GGML_TYPE_BF16
        if quant_mode == "fp32":
            proj_type = 0  # ggml_type::GGML_TYPE_F32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = 1  # ggml_type::GGML_TYPE_F16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = 30  # ggml_type::GGML_TYPE_BF16
            bytes_per_elem = 2.000000
        elif quant_mode == "q8_0":
            proj_type = 8  # ggml_type::GGML_TYPE_Q8_0
            bytes_per_elem = 1.062500
        elif quant_mode == "q6_k":
            proj_type = 14  # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.820312
        elif quant_mode == "q5_k_m":
            proj_type = 13  # ggml_type::GGML_TYPE_Q5_K
            bytes_per_elem = 0.687500
        elif quant_mode == "q4_k_m":
            proj_type = 12  # ggml_type::GGML_TYPE_Q4_K
            bytes_per_elem = 0.562500
        elif quant_mode == "q3_k_m":
            proj_type = 11  # ggml_type::GGML_TYPE_Q3_K
            bytes_per_elem = 0.429688
        elif quant_mode == "q2_k":
            proj_type = 10  # ggml_type::GGML_TYPE_Q2_K
            bytes_per_elem = 0.328125
        elif quant_mode == "iq3_xs":
            proj_type = 21  # ggml_type::GGML_TYPE_IQ3_S
            bytes_per_elem = 0.429688
        elif quant_mode == "iq2_xxs":
            proj_type = 16  # ggml_type::GGML_TYPE_IQ2_XXS
            bytes_per_elem = 0.257812
        else:
            assert False

        linears = []
        projs = []
        for _ in range(layer_num):
            proj = torch.randn((output_size, input_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            config = kt_kernel_ext.linear.LinearConfig(
                input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type
            )
            linear = kt_kernel_ext.linear.Linear(config)
            projs.append(proj)
            linears.append(linear)
        input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                linears[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                linears[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print("Quant mode: ", quant_mode)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", total_time / test_iter * 1000000)
        print(
            "Bandwidth: ",
            input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000,
            "GB/s",
        )
        print("")


bench_linear("fp32")
bench_linear("fp16")
bench_linear("bf16")
bench_linear("q8_0")
bench_linear("q6_k")
bench_linear("q5_k_m")
bench_linear("q4_k_m")
bench_linear("q3_k_m")
bench_linear("q2_k")
# Not supported on __x86_64__
# bench_linear("iq3_xs")
# bench_linear("iq2_xxs")


================================================
FILE: kt-kernel/bench/bench_linear_torch.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:31:59
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-07-25 10:32:48
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq

scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset

input_size = 16384
output_size = 5120
layer_num = 10
qlen = 1
warm_up_iter = 1000
test_iter = 10000

def bench_linear(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "fp32":
            proj_type = torch.float32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = torch.float16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = torch.bfloat16
            bytes_per_elem = 2.000000
        elif quant_mode == "qint8":
            proj_type = torch.qint8
            bytes_per_elem = 1.000000
        else:
            assert(False)

        projs = []
        for _ in range(layer_num):
            proj = torch.randn((output_size, input_size), dtype = torch.float32, device = "cuda").to("cpu").contiguous()
            if quant_mode == "qint8":
                proj_q = torch.quantize_per_tensor(proj, scale, zero_point, torch.qint8)
                quantized_layer = nnq.Linear(input_size, output_size)
                quantized_layer.set_weight_bias(proj_q, None)
                projs.append(quantized_layer)
            else:
                projs.append(proj.to(proj_type))
        input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            if isinstance(projs[i % layer_num], nnq.Linear):
                input_q = torch.quantize_per_tensor(input[i % layer_num].to(torch.float32), scale, zero_point, torch.quint8)
                t_output = projs[i % layer_num](input_q)
            else:
                t_output = torch.mm(input[i % layer_num].to(proj_type), projs[i % layer_num].t())

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            if isinstance(projs[i % layer_num], nnq.Linear):
                input_q = torch.quantize_per_tensor(input[i % layer_num].to(torch.float32), scale, zero_point, torch.quint8)
                t_output = projs[i % layer_num](input_q)
            else:
                t_output = torch.mm(input[i % layer_num].to(proj_type), projs[i % layer_num].t())
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_linear("fp32")
bench_linear("fp16")
bench_linear("bf16")
bench_linear("qint8")


================================================
FILE: kt-kernel/bench/bench_mla.py
================================================
import os, sys
import time
import subprocess
import platform
import json

os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
from torch.nn import init

from tqdm import tqdm

qlen = 4096
kvlen = 0
page_table = list(range(20))
page_size = 256
pages_count = 200


hidden_size = 7168
num_heads = 128
kv_lora_rank = 512
q_lora_rank = 512
nope_size = 128
rope_size = 64
page_size = 512
layer_num = 10


rope_theta = 10000
max_qlen = qlen + kvlen
max_kvlen = 4096
max_position_embeddings = 163840

rope_scaling = {
    "beta_fast": 32,
    "beta_slow": 1,
    "factor": 40,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
    "original_max_position_embeddings": 4096,
    "type": "yarn",
}

CPUINFER_PARAM = 304
# 初始化 CPUInfer（此处使用原始构造函数，可根据需要调整配置参数）
CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)


warm_up_iter = 20
test_iter = 100


# 获取脚本相关信息，用于生成结果保存文件名
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, "bench_results " + ".jsonl")


def get_git_commit():
    """
    获取当前 git 提交记录（commit hash 和提交信息），
    并检查是否存在未提交的更改（dirty）
    """
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg

        # 检查是否存在未提交的更改
        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        if dirty_output:
            result["dirty"] = True
            result["dirty_files"] = dirty_output.splitlines()
        else:
            result["dirty"] = False
    except Exception as e:
        result["commit"] = None
        result["commit_message"] = None
        result["dirty"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    """
    获取系统信息，包括系统名称、CPU 型号、内存大小（GB）、CPU 核数及 socket 数量
    """
    info = {}
    uname = platform.uname()
    info["system_name"] = uname.system
    info["node_name"] = uname.node

    # 获取 CPU 型号（仅 Linux 支持）
    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception as e:
            cpu_model = f"Error: {e}"
    info["cpu_model"] = cpu_model

    # 获取内存大小（单位：GB），仅 Linux 支持
    mem_total_gb = None
    if os.path.exists("/proc/meminfo"):
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal" in line:
                        mem_kb = float(line.split(":", 1)[1].split()[0])
                        mem_total_gb = round(mem_kb / (1024 * 1024), 2)
                        break
        except Exception as e:
            mem_total_gb = f"Error: {e}"
    info["memory_size_GB"] = mem_total_gb

    info["cpu_core_count"] = os.cpu_count()

    # 解析 /proc/cpuinfo 获取 socket 数量
    sockets = set()
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "physical id" in line:
                        sockets.add(line.split(":", 1)[1].strip())
        except Exception as e:
            sockets = set()
    info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1

    return info


def record_results(result, filename=json_path):
    """
    将结果以 JSON 格式追加到文件中
    """
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


def bench_mla(quant_mode: str):
    """
    测试 MLA 模型的性能
    """
    with torch.inference_mode():
        # 这里可以添加 MLA 模型的具体实现和测试代码
        hidden_type = 1  # ggml_type::GGML_TYPE_FP16（固定）
        if quant_mode == "fp32":
            q_a_proj_type = 0  # ggml_type::GGML_TYPE_F32
            q_b_proj_type = 0
            kv_a_proj_with_mqa_type = 0
            kv_b_proj_type = 0
            w_o_type = 0
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            q_a_proj_type = 1  # ggml_type::GGML_TYPE_F32
            q_b_proj_type = 1
            kv_a_proj_with_mqa_type = 1
            kv_b_proj_type = 1
            w_o_type = 1
            bytes_per_elem = 2.000000
        elif quant_mode == "q4_k_m":
            q_a_proj_type = 12  # ggml_type::GGML_TYPE_Q4_K
            q_b_proj_type = 12
            kv_a_proj_with_mqa_type = 12  # ggml_type::GGML_TYPE_Q6_K
            kv_b_proj_type = 12
            w_o_type = 12
            bytes_per_elem = 0.5625
        else:
            raise ValueError("不支持的量化模式")

        # 构建各层 MLA 模型的输入数据
        mlas = []
        for i in tqdm(range(layer_num)):
            q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=torch.float16)
            q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=torch.float16)
            kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=torch.float16)
            kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=torch.float16)
            o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=torch.float16)

            init.normal_(q_a_proj.weight, mean=0.0, std=0.02)
            init.normal_(q_b_proj.weight, mean=0.0, std=0.02)
            init.normal_(kv_a_proj_with_mqa.weight, mean=0.0, std=0.02)
            init.normal_(kv_b_proj.weight, mean=0.0, std=0.02)
            init.normal_(o_proj.weight, mean=0.0, std=0.02)
            q_a_proj_weight = q_a_proj.weight.to(torch.float16).to("cpu").contiguous()
            q_b_proj_weight = q_b_proj.weight.to(torch.float16).to("cpu").contiguous()
            kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(torch.float16).contiguous()
            kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to("cpu").contiguous()
            o_proj_weight = o_proj.weight.to(torch.float16).to("cpu").contiguous()

            config = kt_kernel_ext.mla.MLAConfig(
                hidden_size,
                q_lora_rank,
                kv_lora_rank,
                num_heads,
                nope_size,
                rope_size,
            )
            config.max_qlen = max_qlen
            config.max_kvlen = max_kvlen
            config.max_position_embeddings = max_position_embeddings
            config.rope_scaling_factor = rope_scaling["factor"]
            config.rope_theta = rope_theta
            config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
            config.rope_scaling_beta_slow = rope_scaling["beta_slow"]
            config.rope_scaling_mscale = rope_scaling["mscale"]
            config.rope_scaling_mscale_all_dim = rope_scaling["mscale_all_dim"]
            config.rope_scaling_original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]

            config.q_a_proj = q_a_proj_weight.data_ptr()
            config.q_b_proj = q_b_proj_weight.data_ptr()
            config.kv_a_proj_with_mqa = kv_a_proj_with_mqa_weight.data_ptr()
            config.kv_b_proj = kv_b_proj_weight.data_ptr()
            config.o_proj = o_proj_weight.data_ptr()

            config.q_a_proj_type = ggml_type.FP16
            config.q_b_proj_type = ggml_type.FP16
            config.kv_a_proj_with_mqa_type = ggml_type.FP16
            config.kv_b_proj_type = ggml_type.FP16
            config.w_o_type = ggml_type.FP16

            config.pool = CPUInfer.backend_

            mla = kt_kernel_ext.mla.MLA(config)
            mla.load_weights()
            mla.set_local_pages(pages_count)
            mlas.append(mla)

        print("Generating data...")
        input_tensor = (
            torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
        )
        output_tensor = (
            torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
        )

        print("Warming up...")

        for i in tqdm(range(warm_up_iter)):
            mlas[i % layer_num].forward(
                [qlen],
                [page_table],
                [kvlen],
                input_tensor[i % layer_num].data_ptr(),
                output_tensor[i % layer_num].data_ptr(),
            )

        print("Start testing...")

        start = time.perf_counter()
        for i in tqdm(range(test_iter)):
            mlas[i % layer_num].forward(
                [qlen],
                [page_table],
                [kvlen],
                input_tensor[i % layer_num].data_ptr(),
                output_tensor[i % layer_num].data_ptr(),
            )

        end = time.perf_counter()
        total_time = end - start

        time_per_iter_us = (total_time * 1e6) / test_iter
        bandwidth = (
            bytes_per_elem
            * (
                q_lora_rank * hidden_size
                + (kv_lora_rank + rope_size) * hidden_size
                + (nope_size + rope_size) * q_lora_rank * num_heads
                + (nope_size + nope_size) * kv_lora_rank * num_heads
                + hidden_size * nope_size * num_heads
                + hidden_size * qlen
            )
            * test_iter
            / (total_time * 1e9)
        )
        flops = (
            2
            * (
                q_lora_rank * hidden_size * qlen
                + kv_lora_rank * hidden_size * qlen
                + num_heads * (nope_size + rope_size) * q_lora_rank * qlen
                + num_heads * qlen * nope_size * kv_lora_rank
                + num_heads * (kvlen + qlen) * kv_lora_rank * qlen
                + num_heads * rope_size * qlen * (qlen + kvlen)
                + num_heads * kv_lora_rank * (qlen + kvlen) * qlen
                + num_heads * nope_size * kv_lora_rank * qlen
                + hidden_size * num_heads * nope_size * qlen
            )
            * test_iter
            / (total_time * 1e12)
        )

        print("Quant mode:", quant_mode)
        print("Time(s):", total_time)
        print("Iteration:", test_iter)
        print("Time(us) per iteration:", time_per_iter_us)
        print("Bandwidth:", bandwidth, "GB/s")
        print("TFLOPS:", flops)
        print("")

        # 整理测试结果
        result = {
            "test_name": os.path.basename(__file__),
            "quant_mode": quant_mode,
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": flops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "qlen": qlen,
                "kvlen": kvlen,
                "page_table": page_table,
                "page_size": page_size,
                "pages_count": pages_count,
                "hidden_size": hidden_size,
                "num_heads": num_heads,
                "kv_lora_rank": kv_lora_rank,
                "q_lora_rank": q_lora_rank,
                "nope_size": nope_size,
                "rope_size": rope_size,
                "layer_num": layer_num,
                "rope_theta": rope_theta,
                "max_qlen": max_qlen,
                "max_kvlen": max_kvlen,
                "max_position_embeddings": max_position_embeddings,
                "rope_scaling": rope_scaling,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "CPUInfer_parameter": CPUINFER_PARAM,
            },
        }
        # 添加 git 与系统信息
        result.update(get_git_commit())
        result.update(get_system_info())
        # 将结果记录到 JSON 文件中
        print(result)
        record_results(result)


bench_mla("fp16")


================================================
FILE: kt-kernel/bench/bench_mlp.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : chenht2022
Date         : 2024-07-16 10:43:18
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2024-08-06 10:36:04
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch

hidden_size = 5120
intermediate_size = 3072
stride = 16
group_max_len = 1024
layer_num = 10
qlen = 1
CPUInfer = kt_kernel_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000


def bench_mlp(quant_mode: str):
    with torch.inference_mode(mode=True):

        hidden_type = 30  # ggml_type::GGML_TYPE_BF16
        if quant_mode == "fp32":
            gate_type = 0  # ggml_type::GGML_TYPE_F32
            up_type = 0  # ggml_type::GGML_TYPE_F32
            down_type = 0  # ggml_type::GGML_TYPE_F32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            gate_type = 1  # ggml_type::GGML_TYPE_F16
            up_type = 1  # ggml_type::GGML_TYPE_F16
            down_type = 1  # ggml_type::GGML_TYPE_F16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            gate_type = 30  # ggml_type::GGML_TYPE_BF16
            up_type = 30  # ggml_type::GGML_TYPE_BF16
            down_type = 30  # ggml_type::GGML_TYPE_BF16
            bytes_per_elem = 2.000000
        elif quant_mode == "q8_0":
            gate_type = 8  # ggml_type::GGML_TYPE_Q8_0
            up_type = 8  # ggml_type::GGML_TYPE_Q8_0
            down_type = 8  # ggml_type::GGML_TYPE_Q8_0
            bytes_per_elem = 1.062500
        elif quant_mode == "q6_k":
            gate_type = 14  # ggml_type::GGML_TYPE_Q6_K
            up_type = 14  # ggml_type::GGML_TYPE_Q6_K
            down_type = 14  # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.820312
        elif quant_mode == "q5_k_m":
            gate_type = 13  # ggml_type::GGML_TYPE_Q5_K
            up_type = 13  # ggml_type::GGML_TYPE_Q5_K
            down_type = 14  # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.731771
        elif quant_mode == "q4_k_m":
            gate_type = 12  # ggml_type::GGML_TYPE_Q4_K
            up_type = 12  # ggml_type::GGML_TYPE_Q4_K
            down_type = 14  # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.648437
        elif quant_mode == "q3_k_m":
            gate_type = 11  # ggml_type::GGML_TYPE_Q3_K
            up_type = 11  # ggml_type::GGML_TYPE_Q3_K
            down_type = 13  # ggml_type::GGML_TYPE_Q5_K
            bytes_per_elem = 0.515625
        elif quant_mode == "q2_k":
            gate_type = 10  # ggml_type::GGML_TYPE_Q2_K
            up_type = 10  # ggml_type::GGML_TYPE_Q2_K
            down_type = 11  # ggml_type::GGML_TYPE_Q3_K
            bytes_per_elem = 0.328125
        elif quant_mode == "iq3_xs":
            gate_type = 21  # ggml_type::GGML_TYPE_IQ3_S
            up_type = 21  # ggml_type::GGML_TYPE_IQ3_S
            down_type = 21  # ggml_type::GGML_TYPE_IQ3_S
            bytes_per_elem = 0.429688
        elif quant_mode == "iq2_xxs":
            gate_type = 16  # ggml_type::GGML_TYPE_IQ2_XXS
            up_type = 16  # ggml_type::GGML_TYPE_IQ2_XXS
            down_type = 16  # ggml_type::GGML_TYPE_IQ2_XXS
            bytes_per_elem = 0.257812
        else:
            assert False

        mlps = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = (
                torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            )
            up_proj = (
                torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            )
            down_proj = (
                torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            )
            config = kt_kernel_ext.mlp.MLPConfig(
                hidden_size,
                intermediate_size,
                stride,
                group_max_len,
                gate_proj.data_ptr(),
                up_proj.data_ptr(),
                down_proj.data_ptr(),
                gate_type,
                up_type,
                down_type,
                hidden_type,
            )
            mlp = kt_kernel_ext.mlp.MLP(config)
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            mlps.append(mlp)
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                mlps[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                mlps[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print("Quant mode: ", quant_mode)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", total_time / test_iter * 1000000)
        print(
            "Bandwidth: ",
            hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000,
            "GB/s",
        )
        print("")


bench_mlp("fp32")
bench_mlp("fp16")
bench_mlp("bf16")
bench_mlp("q8_0")
bench_mlp("q6_k")
bench_mlp("q5_k_m")
bench_mlp("q4_k_m")
bench_mlp("q3_k_m")
bench_mlp("q2_k")
# Not supported on __x86_64__
# bench_linear("iq3_xs")
# bench_linear("iq2_xxs")


================================================
FILE: kt-kernel/bench/bench_mlp_torch.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-16 10:43:18
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-07-25 10:32:53
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq

scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset

hidden_size = 5120
intermediate_size = 3072
layer_num = 10
qlen = 1
warm_up_iter = 1000
test_iter = 10000

def act_fn(x):
    return x / (1.0 + torch.exp(-x))

def mlp_torch(input, gate_proj, up_proj, down_proj):
    if isinstance(gate_proj, nnq.Linear):
        input_q = torch.quantize_per_tensor(input.to(torch.float32), scale, zero_point, torch.quint8)
        gate_buf = gate_proj(input_q)
        up_buf = up_proj(input_q)
        gate_buf = gate_buf.dequantize()
        up_buf = up_buf.dequantize()
        intermediate = act_fn(gate_buf) * up_buf
        intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
        expert_output = down_proj(intermediate_q)
        ret = expert_output.dequantize()
    else:
        gate_buf = torch.mm(input.to(gate_proj.dtype), gate_proj.t())
        up_buf = torch.mm(input.to(up_proj.dtype), up_proj.t())
        intermediate = act_fn(gate_buf) * up_buf
        ret = torch.mm(intermediate.to(down_proj.dtype), down_proj.t())
    return ret

def bench_mlp(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "fp32":
            proj_type = torch.float32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = torch.float16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = torch.bfloat16
            bytes_per_elem = 2.000000
        elif quant_mode == "qint8":
            proj_type = torch.qint8
            bytes_per_elem = 1.000000
        else:
            assert(False)

        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            if quant_mode == "qint8":
                gate_proj_q = torch.quantize_per_tensor(gate_proj, scale, zero_point, torch.qint8)
                quantized_gate = nnq.Linear(hidden_size, intermediate_size)
                quantized_gate.set_weight_bias(gate_proj_q, None)
                up_proj_q = torch.quantize_per_tensor(up_proj, scale, zero_point, torch.qint8)
                quantized_up = nnq.Linear(hidden_size, intermediate_size)
                quantized_up.set_weight_bias(up_proj_q, None)
                down_proj_q = torch.quantize_per_tensor(down_proj, scale, zero_point, torch.qint8)
                quantized_down = nnq.Linear(intermediate_size, hidden_size)
                quantized_down.set_weight_bias(down_proj_q, None)
                gate_projs.append(quantized_gate)
                up_projs.append(quantized_up)
                down_projs.append(quantized_down)
            else:
                gate_projs.append(gate_proj.to(proj_type))
                up_projs.append(up_proj.to(proj_type))
                down_projs.append(down_proj.to(proj_type))
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            mlp_torch(input[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            mlp_torch(input[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_mlp("fp32")
bench_mlp("fp16")
bench_mlp("bf16")
bench_mlp("qint8")


================================================
FILE: kt-kernel/bench/bench_moe.py
================================================
import os
import sys
import time
import json
import subprocess
import platform

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
from kt_kernel import kt_kernel_ext
import torch
from tqdm import tqdm

# 测试参数设置
expert_num = 256
hidden_size = 7168
intermediate_size = 2048
m_block = 1
group_min_len = 10
group_max_len = 1024
num_experts_per_tok = 8
# layer_num = 5  # 测试时不同的层数
# qlen = 1
# warm_up_iter = 100
# test_iter = 10000

layer_num = 1  # 测试时不同的层数
qlen = 1024
warm_up_iter = 100
test_iter = 10000
CPUINFER_PARAM = 304
# 初始化 CPUInfer（此处使用原始构造函数，可根据需要调整配置参数）
CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)

# 获取脚本相关信息，用于生成结果保存文件名
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, "bench_results " + ".jsonl")


def get_git_commit():
    """
    获取当前 git 提交记录（commit hash 和提交信息），
    并检查是否存在未提交的更改（dirty）
    """
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg

        # 检查是否存在未提交的更改
        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        if dirty_output:
            result["dirty"] = True
            result["dirty_files"] = dirty_output.splitlines()
        else:
            result["dirty"] = False
    except Exception as e:
        result["commit"] = None
        result["commit_message"] = None
        result["dirty"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    """
    获取系统信息，包括系统名称、CPU 型号、内存大小（GB）、CPU 核数及 socket 数量
    """
    info = {}
    uname = platform.uname()
    info["system_name"] = uname.system
    info["node_name"] = uname.node

    # 获取 CPU 型号（仅 Linux 支持）
    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception as e:
            cpu_model = f"Error: {e}"
    info["cpu_model"] = cpu_model

    # 获取内存大小（单位：GB），仅 Linux 支持
    mem_total_gb = None
    if os.path.exists("/proc/meminfo"):
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal" in line:
                        mem_kb = float(line.split(":", 1)[1].split()[0])
                        mem_total_gb = round(mem_kb / (1024 * 1024), 2)
                        break
        except Exception as e:
            mem_total_gb = f"Error: {e}"
    info["memory_size_GB"] = mem_total_gb

    info["cpu_core_count"] = os.cpu_count()

    # 解析 /proc/cpuinfo 获取 socket 数量
    sockets = set()
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "physical id" in line:
                        sockets.add(line.split(":", 1)[1].strip())
        except Exception as e:
            sockets = set()
    info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1

    return info


def record_results(result, filename=json_path):
    """
    将结果以 JSON 格式追加到文件中
    """
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


def bench_moe(quant_mode: str):
    """
    依据不同量化模式进行 MoE 性能测试，包含预热与测试阶段
    """
    with torch.inference_mode():
        # 根据量化模式设置数据类型与 bytes_per_elem
        hidden_type = 30  # ggml_type::GGML_TYPE_BF16（固定）
        if quant_mode == "fp32":
            gate_type = 0  # ggml_type::GGML_TYPE_F32
            up_type = 0
            down_type = 0
            bytes_per_elem = 4.0
        elif quant_mode == "fp16":
            gate_type = 1  # ggml_type::GGML_TYPE_F16
            up_type = 1
            down_type = 1
            bytes_per_elem = 2.0
        elif quant_mode == "bf16":
            gate_type = 30  # ggml_type::GGML_TYPE_BF16
            up_type = 30
            down_type = 30
            bytes_per_elem = 2.0
        elif quant_mode == "q8_0":
            gate_type = 8  # ggml_type::GGML_TYPE_Q8_0
            up_type = 8
            down_type = 8
            bytes_per_elem = 1.062500
        elif quant_mode == "q6_k":
            gate_type = 14  # ggml_type::GGML_TYPE_Q6_K
            up_type = 14
            down_type = 14
            bytes_per_elem = 0.820312
        elif quant_mode == "q5_k_m":
            gate_type = 13  # ggml_type::GGML_TYPE_Q5_K
            up_type = 13
            down_type = 14  # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.731771
        elif quant_mode == "q4_k_m":
            gate_type = 12  # ggml_type::GGML_TYPE_Q4_K
            up_type = 12
            down_type = 14  # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.648437
        elif quant_mode == "q3_k_m":
            gate_type = 11  # ggml_type::GGML_TYPE_Q3_K
            up_type = 11
            down_type = 13  # ggml_type::GGML_TYPE_Q5_K
            bytes_per_elem = 0.515625
        elif quant_mode == "q2_k":
            gate_type = 10  # ggml_type::GGML_TYPE_Q2_K
            up_type = 10
            down_type = 11  # ggml_type::GGML_TYPE_Q3_K
            bytes_per_elem = 0.328125
        elif quant_mode == "iq3_xs":
            gate_type = 21  # ggml_type::GGML_TYPE_IQ3_S
            up_type = 21
            down_type = 21
            bytes_per_elem = 0.429688
        elif quant_mode == "iq2_xxs":
            gate_type = 16  # ggml_type::GGML_TYPE_IQ2_XXS
            up_type = 16
            down_type = 16
            bytes_per_elem = 0.257812
        else:
            raise ValueError("不支持的量化模式")

        # 构建各层 MoE 模型
        moes = []
        for _ in tqdm(range(layer_num), desc="Initializing MOEs"):
            gate_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu")
                .to("cpu")
                .contiguous()
            )
            up_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu")
                .to("cpu")
                .contiguous()
            )
            down_proj = (
                torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float16, device="cpu")
                .to("cpu")
                .contiguous()
            )

            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
            config.pool = CPUInfer.backend_
            config.m_block = m_block
            config.group_min_len = group_min_len
            config.group_max_len = group_max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.gate_type = gate_type
            config.up_type = up_type
            config.down_type = down_type
            config.hidden_type = hidden_type

            moe = kt_kernel_ext.moe.MOE(config)
            CPUInfer.submit(moe.load_weights_task())
            CPUInfer.sync()
            moes.append(moe)

        # 生成输入数据
        print("Generating data...")
        # 专家路由索引与权重，每层一个
        gen_iter = 1000
        expert_ids = (
            torch.rand(gen_iter * qlen, expert_num, device="cpu")
            .argsort(dim=-1)[:, :num_experts_per_tok]
            .reshape(gen_iter, qlen * num_experts_per_tok)
            .contiguous()
        )
        weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
        input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
        output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
        # 将 qlen 封装成 tensor，用于 forward 调用
        qlen_tensor = torch.tensor([qlen], dtype=torch.int32)

        # 预热阶段
        print("Warming up...")
        for i in tqdm(range(warm_up_iter), desc="Warm-up"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    qlen_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

        # 测试阶段
        print("Start testing...")
        start = time.perf_counter()
        for i in tqdm(range(test_iter), desc="Testing"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    qlen_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start

        # 计算性能指标
        time_per_iter_us = total_time / test_iter * 1e6
        bandwidth = (
            hidden_size
            * intermediate_size
            * 3
            * num_experts_per_tok
            * (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
            * bytes_per_elem
            * test_iter
            / total_time
            / 1e9
        )  # 单位：GB/s
        flops = (
            hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
        )  # 单位：TFLOPS

        # 打印结果
        print("Quant mode:", quant_mode)
        print("Time(s):", total_time)
        print("Iteration:", test_iter)
        print("Time(us) per iteration:", time_per_iter_us)
        print("Bandwidth:", bandwidth, "GB/s")
        print("TFLOPS:", flops)
        print("")

        # 整理测试结果
        result = {
            "test_name": os.path.basename(__file__),
            "quant_mode": quant_mode,
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": flops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "expert_num": expert_num,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "m_block": m_block,
                "group_min_len": group_min_len,
                "group_max_len": group_max_len,
                "num_experts_per_tok": num_experts_per_tok,
                "layer_num": layer_num,
                "qlen": qlen,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "CPUInfer_parameter": CPUINFER_PARAM,
            },
        }
        # 添加 git 与系统信息
        result.update(get_git_commit())
        result.update(get_system_info())
        # 将结果记录到 JSON 文件中
        record_results(result)


if __name__ == "__main__":
    # 根据需要选择量化模式，目前调用 q4_k_m 模式，对 layer_nums 列表中各层数进行测试
    bench_moe("q4_k_m")
    # 其他量化模式调用可以按需取消注释
    # bench_moe("fp32", layer_num)
    # bench_moe("fp16", layer_num)
    # bench_moe("bf16", layer_num)
    # bench_moe("q8_0")
    # bench_moe("q6_k", layer_num)
    # bench_moe("q5_k_m", layer_num)
    # bench_moe("q3_k_m", layer_num)
    # bench_moe("q2_k", layer_num)
    # bench_moe("iq3_xs", layer_num)
    # bench_moe("iq2_xxs", layer_num)


================================================
FILE: kt-kernel/bench/bench_moe_amx.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2024-08-06 10:41:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys, time, json, subprocess, platform

from tqdm import tqdm

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import torch
from kt_kernel import kt_kernel_ext
import numpy as np

# 测试参数设置
expert_num = 16
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
layer_num = 2

qlen = 2048
warm_up_iter = 1000
test_iter = 2000
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()

# 将 CPUInfer 参数设为变量
# CPUINFER_PARAM = 257
# CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)

worker_config = kt_kernel_ext.WorkerPoolConfig()
worker_config.subpool_count = 2
worker_config.subpool_numa_map = [0, 1]
worker_config.subpool_thread_count = [80, 80]
CPUINFER_PARAM = 160
CPUInfer = kt_kernel_ext.CPUInfer(worker_config)


def get_git_commit():
    """
    获取当前 git 提交记录（commit hash 和提交信息），
    并检查是否存在未提交的更改（dirty）
    """
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg

        # 检查是否存在未提交的更改
        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        if dirty_output:
            result["dirty"] = True
            result["dirty_files"] = dirty_output.splitlines()
        else:
            result["dirty"] = False
    except Exception as e:
        result["commit"] = None
        result["commit_message"] = None
        result["dirty"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    """
    获取系统信息，包括系统名称、CPU 型号、内存大小（GB）、CPU 核数及 socket 数量
    """
    info = {}
    # 系统名称及主机名
    uname = platform.uname()
    info["system_name"] = uname.system  # 如 Linux, Windows 等
    info["node_name"] = uname.node  # 主机名称

    # 获取 CPU 型号（仅 Linux 支持）
    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception as e:
            cpu_model = f"Error: {e}"
    info["cpu_model"] = cpu_model

    # 获取内存大小（单位：GB），仅 Linux 支持
    mem_total_gb = None
    if os.path.exists("/proc/meminfo"):
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal" in line:
                        mem_kb = float(line.split(":", 1)[1].split()[0])
                        mem_total_gb = round(mem_kb / (1024 * 1024), 2)
                        break
        except Exception as e:
            mem_total_gb = f"Error: {e}"
    info["memory_size_GB"] = mem_total_gb

    # 获取 CPU 核数（逻辑核数）
    info["cpu_core_count"] = os.cpu_count()

    # 解析 /proc/cpuinfo 获取 socket 数量
    sockets = set()
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "physical id" in line:
                        sockets.add(line.split(":", 1)[1].strip())
        except Exception as e:
            sockets = set()
    # 如果没有解析到 socket 信息，则默认至少有 1 个 socket
    info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1

    return info


script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, script_name + ".jsonl")


def record_results(result, filename=json_path):
    """
    将结果以 JSON 格式追加到文件中
    """
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


def bench_moe(quant_mode: str):
    with torch.inference_mode():
        if quant_mode == "bf16":
            bytes_per_elem = 2.0
        elif quant_mode == "int8":
            bytes_per_elem = 1.0
        elif quant_mode == "int4":
            bytes_per_elem = 0.5
        else:
            raise ValueError("不支持的量化模式")

        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for layer_index in range(layer_num):
            gate_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            up_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            down_proj = (
                torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.pool = CPUInfer.backend_
            if quant_mode == "bf16":
                moe = kt_kernel_ext.moe.AMXBF16_MOE(config)
            elif quant_mode == "int8":
                moe = kt_kernel_ext.moe.AMXInt8_MOE(config)
            elif quant_mode == "int4":
                moe = kt_kernel_ext.moe.AMXInt4_MOE(config)
            CPUInfer.submit(moe.load_weights_task())
            CPUInfer.sync()
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)
        gen_iter = 3000
        expert_ids = (
            torch.rand(gen_iter * qlen, expert_num, device="cpu")
            .argsort(dim=-1)[:, :num_experts_per_tok]
            .reshape(gen_iter, qlen * num_experts_per_tok)
            .to("cpu")
            .contiguous()
        )
        weights = (
            torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous()
        )
        input_tensor = (
            torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        output_tensor = (
            torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        bsz_tensor = torch.tensor([qlen], device="cpu")

        # 预热迭代
        for i in tqdm(range(warm_up_iter), desc="Warm-up"):
            # start_it = time.time_ns()
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
            # end_it = time.time_ns()
            # print('python Time(ns): ', end_it - start_it)

        # 测试迭代
        start = time.perf_counter()
        for i in tqdm(range(test_iter), desc="Testing"):
            # print(f'test iteration {i}')
            # start_it = time.time_ns()
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
            # end_it = time.time_ns()
            # print('python Time(ns): ', end_it - start_it)
        end = time.perf_counter()
        total_time = end - start

        # 计算性能指标
        time_per_iter_us = total_time / test_iter * 1e6
        bandwidth = (
            hidden_size
            * intermediate_size
            * 3
            * num_experts_per_tok
            * (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
            * bytes_per_elem
            * test_iter
            / total_time
            / 1e9
        )  # 单位：GB/s
        flops = (
            hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
        )  # 单位：TFLOPS

        print("Quant mode: ", quant_mode)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", time_per_iter_us)
        print("Bandwidth: ", bandwidth, "GB/s")
        print("Flops: ", flops, "TFLOPS")
        print("")

        # 整理结果记录，包括测试参数
        result = {
            "quant_mode": quant_mode,
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": flops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "expert_num": expert_num,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "max_len": max_len,
                "num_experts_per_tok": num_experts_per_tok,
                "layer_num": layer_num,
                "qlen": qlen,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "CPUInfer_parameter": CPUINFER_PARAM,
            },
        }
        # 添加 git 提交记录信息
        result.update(get_git_commit())
        # 添加系统信息（包括 CPU 核数和 socket 数量）
        result.update(get_system_info())
        # 将结果以 JSON 形式追加到文件中
        record_results(result)


if __name__ == "__main__":
    # 选择需要测试的量化模式
    # bench_moe("bf16")
    bench_moe("int8")
    # bench_moe("int4")


================================================
FILE: kt-kernel/bench/bench_moe_amx_k.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2024-08-06 10:41:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys, time, json, subprocess, platform

from tqdm import tqdm

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
from kt_kernel import kt_kernel_ext
import torch
import numpy as np

# 测试参数设置
expert_num = 256
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
layer_num = 4
qlen = 1024
# qlen = 1
warm_up_iter = 1000
test_iter = 5000
k_group_size = 128

physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
# 将 CPUInfer 参数设为变量
# CPUINFER_PARAM = 257
# CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)

worker_config = kt_kernel_ext.WorkerPoolConfig()
worker_config.subpool_count = 2
worker_config.subpool_numa_map = [0, 1]
worker_config.subpool_thread_count = [40, 40]
CPUINFER_PARAM = 80
CPUInfer = kt_kernel_ext.CPUInfer(worker_config)


def get_git_commit():
    """
    获取当前 git 提交记录（commit hash 和提交信息），
    并检查是否存在未提交的更改（dirty）
    """
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg

        # 检查是否存在未提交的更改
        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        if dirty_output:
            result["dirty"] = True
            result["dirty_files"] = dirty_output.splitlines()
        else:
            result["dirty"] = False
    except Exception as e:
        result["commit"] = None
        result["commit_message"] = None
        result["dirty"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    """
    获取系统信息，包括系统名称、CPU 型号、内存大小（GB）、CPU 核数及 socket 数量
    """
    info = {}
    # 系统名称及主机名
    uname = platform.uname()
    info["system_name"] = uname.system  # 如 Linux, Windows 等
    info["node_name"] = uname.node  # 主机名称

    # 获取 CPU 型号（仅 Linux 支持）
    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception as e:
            cpu_model = f"Error: {e}"
    info["cpu_model"] = cpu_model

    # 获取内存大小（单位：GB），仅 Linux 支持
    mem_total_gb = None
    if os.path.exists("/proc/meminfo"):
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal" in line:
                        mem_kb = float(line.split(":", 1)[1].split()[0])
                        mem_total_gb = round(mem_kb / (1024 * 1024), 2)
                        break
        except Exception as e:
            mem_total_gb = f"Error: {e}"
    info["memory_size_GB"] = mem_total_gb

    # 获取 CPU 核数（逻辑核数）
    info["cpu_core_count"] = os.cpu_count()

    # 解析 /proc/cpuinfo 获取 socket 数量
    sockets = set()
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "physical id" in line:
                        sockets.add(line.split(":", 1)[1].strip())
        except Exception as e:
            sockets = set()
    # 如果没有解析到 socket 信息，则默认至少有 1 个 socket
    info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1

    return info


script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, script_name + ".jsonl")


def record_results(result, filename=json_path):
    """
    将结果以 JSON 格式追加到文件中
    """
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


def bench_moe(quant_mode: str):
    with torch.inference_mode():
        if quant_mode == "bf16":
            bytes_per_elem = 2.0
        elif quant_mode == "int8":
            bytes_per_elem = 1.0
        elif quant_mode == "int4":
            bytes_per_elem = 0.5
        elif quant_mode == "int4_1k":
            bytes_per_elem = 0.5
        else:
            raise ValueError("不支持的量化模式")

        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for layer_index in range(layer_num):
            gate_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            up_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            down_proj = (
                torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.pool = CPUInfer.backend_
            if quant_mode == "bf16":
                moe = kt_kernel_ext.moe.AMXBF16_MOE(config)
            elif quant_mode == "int8":
                moe = kt_kernel_ext.moe.AMXInt8_MOE(config)
            elif quant_mode == "int4":
                moe = kt_kernel_ext.moe.AMXInt4_MOE(config)
            elif quant_mode == "int4_1k":
                config.quant_config.bits = 4
                config.quant_config.group_size = k_group_size
                config.quant_config.zero_point = True
                config.gate_scale = 0
                moe = kt_kernel_ext.moe.AMXInt4_1KGroup_MOE(config)
            CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
            CPUInfer.sync()
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)
        gen_iter = 3000
        expert_ids = (
            torch.rand(gen_iter * qlen, expert_num, device="cpu")
            .argsort(dim=-1)[:, :num_experts_per_tok]
            .reshape(gen_iter, qlen * num_experts_per_tok)
            .to("cpu")
            .contiguous()
        )
        weights = (
            torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous()
        )
        input_tensor = (
            torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        output_tensor = (
            torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        bsz_tensor = torch.tensor([qlen], device="cpu")

        # 预热迭代
        # for i in range(warm_up_iter):
        for i in tqdm(range(warm_up_iter), desc="Warm-up"):
            # start_it = time.time_ns()
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
            # end_it = time.time_ns()
            # print('python Time(ns): ', end_it - start_it)

        # 测试迭代
        start = time.perf_counter()
        # for i in range(test_iter):
        for i in tqdm(range(test_iter), desc="Testing"):
            # print(f'test iteration {i}')
            # start_it = time.time_ns()
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
            # end_it = time.time_ns()
            # print('python Time(ns): ', end_it - start_it)
        end = time.perf_counter()
        total_time = end - start

        # 计算性能指标
        time_per_iter_us = total_time / test_iter * 1e6
        bandwidth = (
            hidden_size
            * intermediate_size
            * 3
            * num_experts_per_tok
            * (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
            * bytes_per_elem
            * test_iter
            / total_time
            / 1e9
        )  # 单位：GB/s
        flops = (
            hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
        )  # 单位：TFLOPS

        print("Quant mode: ", quant_mode)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", time_per_iter_us)
        print("Bandwidth: ", bandwidth, "GB/s")
        print("Flops: ", flops, "TFLOPS")
        print("")

        # 整理结果记录，包括测试参数
        result = {
            "quant_mode": quant_mode,
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": flops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "expert_num": expert_num,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "max_len": max_len,
                "num_experts_per_tok": num_experts_per_tok,
                "layer_num": layer_num,
                "qlen": qlen,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "CPUInfer_parameter": CPUINFER_PARAM,
                "k_group_size": k_group_size,
            },
        }
        # 添加 git 提交记录信息
        result.update(get_git_commit())
        # 添加系统信息（包括 CPU 核数和 socket 数量）
        result.update(get_system_info())
        # 将结果以 JSON 形式追加到文件中
        record_results(result)


if __name__ == "__main__":
    # 选择需要测试的量化模式
    # bench_moe("bf16")
    # bench_moe("int8")
    # bench_moe("int4")
    bench_moe("int4_1k")


================================================
FILE: kt-kernel/bench/bench_moe_kernel.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2024-08-06 10:41:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys, time, json, subprocess, platform

os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import torch
from kt_kernel import kt_kernel_ext
import numpy as np
from tqdm import tqdm


# 测试参数设置
expert_num = 256
hidden_size = 7168
intermediate_size = 2048
max_len = 51200
num_experts_per_tok = 8
layer_num = 1
m_block = 320
n_block_up_gate = 32
n_block_down = 64
n_block_up_gate_prefi = 32
n_block_down_prefi = 64
qlen = 2048
warm_up_iter = 1000
test_iter = 1000

# 将 CPUInfer 参数设为变量
CPUINFER_PARAM = 160
CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()


# worker_config = kt_kernel_ext.WorkerPoolConfig()
# worker_config.subpool_count = 4
# worker_config.subpool_numa_map= [0,1,2,3]
# worker_config.subpool_thread_count = [36,36,36,36]
# worker_config.subpool_thread_count = [39,39,39,39]
# CPUINFER_PARAM = 156
# CPUInfer = kt_kernel_ext.CPUInfer(worker_config)


def get_git_commit():
    """
    获取当前 git 提交记录（commit hash 和提交信息），
    并检查是否存在未提交的更改（dirty）
    """
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg

        # 检查是否存在未提交的更改
        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        if dirty_output:
            result["dirty"] = True
            result["dirty_files"] = dirty_output.splitlines()
        else:
            result["dirty"] = False
    except Exception as e:
        result["commit"] = None
        result["commit_message"] = None
        result["dirty"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    """
    获取系统信息，包括系统名称、CPU 型号、内存大小（GB）、CPU 核数及 socket 数量
    """
    info = {}
    # 系统名称及主机名
    uname = platform.uname()
    info["system_name"] = uname.system  # 如 Linux, Windows 等
    info["node_name"] = uname.node  # 主机名称

    # 获取 CPU 型号（仅 Linux 支持）
    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception as e:
            cpu_model = f"Error: {e}"
    info["cpu_model"] = cpu_model

    # 获取内存大小（单位：GB），仅 Linux 支持
    mem_total_gb = None
    if os.path.exists("/proc/meminfo"):
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal" in line:
                        mem_kb = float(line.split(":", 1)[1].split()[0])
                        mem_total_gb = round(mem_kb / (1024 * 1024), 2)
                        break
        except Exception as e:
            mem_total_gb = f"Error: {e}"
    info["memory_size_GB"] = mem_total_gb

    # 获取 CPU 核数（逻辑核数）
    info["cpu_core_count"] = os.cpu_count()

    # 解析 /proc/cpuinfo 获取 socket 数量
    sockets = set()
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "physical id" in line:
                        sockets.add(line.split(":", 1)[1].strip())
        except Exception as e:
            sockets = set()
    # 如果没有解析到 socket 信息，则默认至少有 1 个 socket
    info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1

    return info


script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, "bench_results " + ".jsonl")


def record_results(result, filename=json_path):
    """
    将结果以 JSON 格式追加到文件中
    """
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


def bench_moe(quant_mode: str):
    with torch.inference_mode():
        if quant_mode == "int8":
            bytes_per_elem = 1.0
        elif quant_mode == "int4":
            bytes_per_elem = 0.5
        else:
            raise ValueError("不支持的量化模式")

        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for layer_index in range(layer_num):
            gate_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            up_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            down_proj = (
                torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.pool = CPUInfer.backend_
            if quant_mode == "int8":
                d = kt_kernel_ext.moe.tiling.get_int8()
                nbug_prefi = n_block_up_gate_prefi
                nbd_prefi = n_block_down_prefi
                kb = d["k_block"]
                nb = d["n_block"]
                mb = m_block
                nbug = n_block_up_gate
                nbd = n_block_down
                print(
                    f"Int8 Tiling: nbug {nbug}, nbd {nbd}, nb {nb}, mb {mb}, kb {kb}, nbug_prefi {nbug_prefi}, nbd_prefi {nbd_prefi}"
                )
                kt_kernel_ext.moe.tiling.set_int8(nbug, nbd, nb, mb, kb, nbug_prefi, nbd_prefi)
                moe = kt_kernel_ext.moe.Int8_KERNEL_MOE(config)
            elif quant_mode == "int4":
                moe = kt_kernel_ext.moe.Int4_KERNEL_MOE(config)
            else:
                raise ValueError(f"Unsupported quantization mode: {quant_mode}")
            CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
            CPUInfer.sync()
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)

        expert_ids = (
            torch.rand(test_iter * qlen, expert_num, device="cuda")
            .argsort(dim=-1)[:, :num_experts_per_tok]
            .reshape(test_iter, qlen * num_experts_per_tok)
            .to("cpu")
            .contiguous()
        )
        weights = (
            torch.rand((test_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cuda")
            .to("cpu")
            .contiguous()
        )
        input_tensor = (
            torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        output_tensor = (
            torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        bsz_tensor = torch.tensor([qlen], device="cuda").to("cpu").contiguous()

        # 预热迭代
        for i in tqdm(range(warm_up_iter), desc="Warm-up"):
            # print(f'warmup iteration {i}')
            # start_it = time.time_ns()
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i].data_ptr(),
                    weights[i].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    # False,
                )
            )
            CPUInfer.sync()
            # end_it = time.time_ns()
            # print('python Time(ns): ', end_it - start_it)

        # 测试迭代
        start = time.perf_counter()
        for i in tqdm(range(test_iter), desc="Testing"):
            # print(f'test iteration {i}')
            # start_it = time.time_ns()
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i].data_ptr(),
                    weights[i].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
            # end_it = time.time_ns()
            # print('python Time(ns): ', end_it - start_it)
        end = time.perf_counter()
        total_time = end - start

        # 计算性能指标
        time_per_iter_us = total_time / test_iter * 1e6
        bandwidth = (
            hidden_size
            * intermediate_size
            * 3
            * num_experts_per_tok
            # * (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
            * qlen
            * bytes_per_elem
            * test_iter
            / total_time
            / 1e9
        )  # 单位：GB/s
        flops = (
            hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
        )  # 单位：TFLOPS

        print("Quant mode: ", quant_mode)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", time_per_iter_us)
        print("Bandwidth: ", bandwidth, "GB/s")
        print("Flops: ", flops, "TFLOPS")
        print("")

        # 整理结果记录，包括测试参数
        result = {
            "test_name": os.path.basename(__file__),
            "quant_mode": quant_mode,
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": flops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "expert_num": expert_num,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "max_len": max_len,
                "num_experts_per_tok": num_experts_per_tok,
                "layer_num": layer_num,
                "qlen": qlen,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "CPUInfer_parameter": CPUINFER_PARAM,
            },
        }
        # 添加 git 提交记录信息
        result.update(get_git_commit())
        # 添加系统信息（包括 CPU 核数和 socket 数量）
        result.update(get_system_info())
        # 将结果以 JSON 形式追加到文件中
        record_results(result)


if __name__ == "__main__":
    # 选择需要测试的量化模式
    bench_moe("int8")
    # bench_moe("int4")


================================================
FILE: kt-kernel/bench/bench_moe_kernel_tiling.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Bench MOE kernel with runtime tiling params (N_BLOCK_UP_GATE, N_BLOCK_DOWN, N_BLOCK, M_BLOCK, K_BLOCK)
- Demonstrates how to get/set tiling params from Python via kt_kernel_ext.moe.tiling
- Runs a small benchmark similar to bench_moe_kernel.py

Usage examples:
  # 1) Just run with defaults (int8)
  python bench_moe_kernel_tiling.py --quant int8

  # 2) Override tiling params for INT8
  python bench_moe_kernel_tiling.py --quant int8 \
    --n_block_up_gate 32 --n_block_down 64 --n_block 64 --m_block 320 --k_block 7168

  # 3) Set both INT8 and INT4 tiling params (if INT4 kernel is available on your platform)
  python bench_moe_kernel_tiling.py --quant int4 --set_all \
    --n_block_up_gate 256 --n_block_down 1024 --n_block 64 --m_block 320 --k_block 7168
"""
import os
import sys
import time
import argparse

os.environ.setdefault("BLAS_NUM_THREADS", "1")
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))

import torch  # noqa: E402
from kt_kernel import kt_kernel_ext as ce  # noqa: E402
from tqdm import tqdm  # noqa: E402


def maybe_get_class(module, name):
    return getattr(module, name) if hasattr(module, name) else None


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--quant", choices=["int8", "int4"], default="int8")
    parser.add_argument("--expert_num", type=int, default=256)
    parser.add_argument("--hidden_size", type=int, default=7168)
    parser.add_argument("--intermediate_size", type=int, default=2048)
    parser.add_argument("--num_experts_per_tok", type=int, default=8)
    parser.add_argument("--max_len", type=int, default=25600)
    parser.add_argument("--layer_num", type=int, default=1)
    parser.add_argument("--qlen", type=int, default=1024)
    parser.add_argument("--warm_up_iter", type=int, default=200)
    parser.add_argument("--test_iter", type=int, default=500)
    parser.add_argument("--threads", type=int, default=160, help="CPUInfer initialization param")

    # Tiling params
    parser.add_argument("--set_all", action="store_true", help="Apply tiling to both INT8 and INT4 kernels")
    parser.add_argument("--n_block_up_gate", type=int, default=None)
    parser.add_argument("--n_block_down", type=int, default=None)
    parser.add_argument("--n_block", type=int, default=None)
    parser.add_argument("--m_block", type=int, default=None)
    parser.add_argument("--k_block", type=int, default=None)
    parser.add_argument("--n_block_up_gate_prefi", type=int, default=None)
    parser.add_argument("--n_block_down_prefi", type=int, default=None)

    args = parser.parse_args()

    # Show current tiling defaults
    if args.quant == "int8":
        print("[tiling] default int8:", ce.moe.tiling.get_int8())
    if hasattr(ce.moe.tiling, "get_int4") and args.quant == "int4":
        print("[tiling] default int4:", ce.moe.tiling.get_int4())

    # Apply overrides if provided
    if any(v is not None for v in [args.n_block_up_gate, args.n_block_down, args.n_block, args.m_block, args.k_block]):
        # Fill missing values with current defaults to avoid overwriting unrelated params
        def fill_defaults(getter):
            cur = getter()
            return (
                args.n_block_up_gate if args.n_block_up_gate is not None else int(cur["n_block_up_gate"]),
                args.n_block_down if args.n_block_down is not None else int(cur["n_block_down"]),
                args.n_block if args.n_block is not None else int(cur["n_block"]),
                args.m_block if args.m_block is not None else int(cur["m_block"]),
                args.k_block if args.k_block is not None else int(cur["k_block"]),
                (
                    args.n_block_up_gate_prefi
                    if args.n_block_up_gate_prefi is not None
                    else int(cur["n_block_up_gate_prefi"])
                ),
                args.n_block_down_prefi if args.n_block_down_prefi is not None else int(cur["n_block_down_prefi"]),
            )

        if args.set_all and hasattr(ce.moe.tiling, "set_all"):
            nbug, nbd, nb, mb, kb, nbug_prefi, nbd_prefi = fill_defaults(ce.moe.tiling.get_int8)
            ce.moe.tiling.set_all(nbug, nbd, nb, mb, kb, nbug_prefi, nbd_prefi)
            print("[tiling] set_all ->", ce.moe.tiling.get_int8())
            if hasattr(ce.moe.tiling, "get_int4"):
                print("[tiling] set_all -> int4:", ce.moe.tiling.get_int4())
        else:
            if args.quant == "int8":
                nbug, nbd, nb, mb, kb, nbug_prefi, nbd_prefi = fill_defaults(ce.moe.tiling.get_int8)
                ce.moe.tiling.set_int8(nbug, nbd, nb, mb, kb, nbug_prefi, nbd_prefi)
                print("[tiling] set_int8 ->", ce.moe.tiling.get_int8())
            elif args.quant == "int4" and hasattr(ce.moe.tiling, "set_int4"):
                nbug, nbd, nb, mb, kb, nbug_prefi, nbd_prefi = fill_defaults(ce.moe.tiling.get_int4)
                ce.moe.tiling.set_int4(nbug, nbd, nb, mb, kb, nbug_prefi, nbd_prefi)
                print("[tiling] set_int4 ->", ce.moe.tiling.get_int4())

    # Warn about divisibility expectations; kernels assume specific blocking
    # - Some helpers assert n % N_BLOCK == 0, etc. Ensure your dims/tiles align.
    print("[note] Ensure your selected tiling parameters are compatible with hidden/intermediate sizes and blocking.")

    # Initialize CPUInfer
    CPUInfer = ce.CPUInfer(args.threads)

    # Select MOE kernel
    moe_cls = None
    if args.quant == "int8":
        moe_cls = maybe_get_class(ce.moe, "Int8_KERNEL_MOE")
        if moe_cls is None:
            raise RuntimeError("Int8 kernel binding 'Int8_KERNEL_MOE' not found.")
        bytes_per_elem = 1.0
    else:
        moe_cls = maybe_get_class(ce.moe, "Int4_KERNEL_MOE")
        if moe_cls is None:
            raise RuntimeError("Int4 kernel binding 'Int4_KERNEL_MOE' not available on this platform.")
        bytes_per_elem = 0.5

    # Prepare config/weights
    expert_num = args.expert_num
    hidden_size = args.hidden_size
    intermediate_size = args.intermediate_size
    num_experts_per_tok = args.num_experts_per_tok
    layer_num = args.layer_num
    max_len = args.max_len

    physical_to_logical_map = torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()

    moes = []
    gate_projs, up_projs, down_projs = [], [], []

    for layer_idx in range(layer_num):
        gate_proj = torch.randn(
            (expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cpu"
        ).contiguous()
        up_proj = torch.randn(
            (expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cpu"
        ).contiguous()
        down_proj = torch.randn(
            (expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cpu"
        ).contiguous()

        cfg = ce.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
        cfg.max_len = max_len
        cfg.gate_proj = gate_proj.data_ptr()
        cfg.up_proj = up_proj.data_ptr()
        cfg.down_proj = down_proj.data_ptr()
        cfg.pool = CPUInfer.backend_

        moe = moe_cls(cfg)
        CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
        CPUInfer.sync()

        gate_projs.append(gate_proj)
        up_projs.append(up_proj)
        down_projs.append(down_proj)
        moes.append(moe)

    qlen = args.qlen
    warm_up_iter = args.warm_up_iter
    test_iter = args.test_iter

    expert_ids = (
        torch.rand(test_iter * qlen, expert_num)
        .argsort(dim=-1)[:, :num_experts_per_tok]
        .reshape(test_iter, qlen * num_experts_per_tok)
        .to("cpu")
        .contiguous()
    )
    weights = torch.rand((test_iter, qlen, num_experts_per_tok), dtype=torch.float32).to("cpu").contiguous()
    input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16).to("cpu").contiguous()
    output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16).to("cpu").contiguous()
    bsz_tensor = torch.tensor([qlen], dtype=torch.int32).to("cpu").contiguous()

    # Warmup
    for i in tqdm(range(warm_up_iter), desc="Warm-up"):
        CPUInfer.submit(
            moes[i % layer_num].forward_task(
                bsz_tensor.data_ptr(),
                num_experts_per_tok,
                expert_ids[i].data_ptr(),
                weights[i].data_ptr(),
                input_tensor[i % layer_num].data_ptr(),
                output_tensor[i % layer_num].data_ptr(),
            )
        )
        CPUInfer.sync()

    # Measure
    start = time.perf_counter()
    for i in tqdm(range(test_iter), desc="Testing"):
        CPUInfer.submit(
            moes[i % layer_num].forward_task(
                bsz_tensor.data_ptr(),
                num_experts_per_tok,
                expert_ids[i].data_ptr(),
                weights[i].data_ptr(),
                input_tensor[i % layer_num].data_ptr(),
                output_tensor[i % layer_num].data_ptr(),
                False,
            )
        )
        CPUInfer.sync()
    end = time.perf_counter()

    total_time = end - start
    time_per_iter_us = total_time / test_iter * 1e6
    bandwidth_gbs = (
        hidden_size * intermediate_size * 3 * num_experts_per_tok * qlen * bytes_per_elem * test_iter / total_time / 1e9
    )
    flops_tflops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12

    print("\n=== Results ===")
    print("quant:", args.quant)
    if hasattr(ce.moe.tiling, "get_int8") and args.quant == "int8":
        print("tiling int8:", ce.moe.tiling.get_int8())
    if hasattr(ce.moe.tiling, "get_int4") and args.quant == "int4":
        print("tiling int4:", ce.moe.tiling.get_int4())
    print("time (s):", total_time)
    print("iter:", test_iter)
    print("time per iter (us):", time_per_iter_us)
    print("bandwidth (GB/s):", bandwidth_gbs)
    print("TFLOPS:", flops_tflops)


if __name__ == "__main__":
    main()


================================================
FILE: kt-kernel/bench/bench_moe_kml.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2024-08-06 10:41:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys, time, json, subprocess, platform

os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
from kt_kernel import kt_kernel_ext
import torch
import numpy as np
from tqdm import tqdm


# 测试参数设置
expert_num = 16
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
layer_num = 1

qlen = 1
warm_up_iter = 1000
test_iter = 10000

# 将 CPUInfer 参数设为变量
CPUINFER_PARAM = 112
CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)

# worker_config = kt_kernel_ext.WorkerPoolConfig()
# worker_config.subpool_count = 4
# worker_config.subpool_numa_map= [0,1,2,3]
# worker_config.subpool_thread_count = [36,36,36,36]
# worker_config.subpool_thread_count = [39,39,39,39]
# CPUINFER_PARAM = 156
# CPUInfer = kt_kernel_ext.CPUInfer(worker_config)


def get_git_commit():
    """
    获取当前 git 提交记录（commit hash 和提交信息），
    并检查是否存在未提交的更改（dirty）
    """
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg

        # 检查是否存在未提交的更改
        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        if dirty_output:
            result["dirty"] = True
            result["dirty_files"] = dirty_output.splitlines()
        else:
            result["dirty"] = False
    except Exception as e:
        result["commit"] = None
        result["commit_message"] = None
        result["dirty"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    """
    获取系统信息，包括系统名称、CPU 型号、内存大小（GB）、CPU 核数及 socket 数量
    """
    info = {}
    # 系统名称及主机名
    uname = platform.uname()
    info["system_name"] = uname.system  # 如 Linux, Windows 等
    info["node_name"] = uname.node  # 主机名称

    # 获取 CPU 型号（仅 Linux 支持）
    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception as e:
            cpu_model = f"Error: {e}"
    info["cpu_model"] = cpu_model

    # 获取内存大小（单位：GB），仅 Linux 支持
    mem_total_gb = None
    if os.path.exists("/proc/meminfo"):
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal" in line:
                        mem_kb = float(line.split(":", 1)[1].split()[0])
                        mem_total_gb = round(mem_kb / (1024 * 1024), 2)
                        break
        except Exception as e:
            mem_total_gb = f"Error: {e}"
    info["memory_size_GB"] = mem_total_gb

    # 获取 CPU 核数（逻辑核数）
    info["cpu_core_count"] = os.cpu_count()

    # 解析 /proc/cpuinfo 获取 socket 数量
    sockets = set()
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "physical id" in line:
                        sockets.add(line.split(":", 1)[1].strip())
        except Exception as e:
            sockets = set()
    # 如果没有解析到 socket 信息，则默认至少有 1 个 socket
    info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1

    return info


script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, "bench_results " + ".jsonl")


def record_results(result, filename=json_path):
    """
    将结果以 JSON 格式追加到文件中
    """
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


def bench_moe(quant_mode: str):
    with torch.inference_mode():
        if quant_mode == "bf16":
            bytes_per_elem = 2.0
        elif quant_mode == "int8":
            bytes_per_elem = 1.0
        elif quant_mode == "int4":
            bytes_per_elem = 0.5
        else:
            raise ValueError("不支持的量化模式")

        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for layer_index in range(layer_num):
            gate_proj = torch.randn(
                (expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cpu"
            ).contiguous()
            up_proj = torch.randn(
                (expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cpu"
            ).contiguous()
            down_proj = torch.randn(
                (expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cpu"
            ).contiguous()
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
            config.max_len = max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.pool = CPUInfer.backend_
            if quant_mode == "int8":
                moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
            elif quant_mode == "int4":
                moe = kt_kernel_ext.moe.KMLInt4_MOE(config)
            else:
                raise ValueError(f"Unsupported quantization mode: {quant_mode}")
            CPUInfer.submit(moe.load_weights_task())
            CPUInfer.sync()
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)

        expert_ids = (
            torch.rand(test_iter * qlen, expert_num, device="cpu")
            .argsort(dim=-1)[:, :num_experts_per_tok]
            .reshape(test_iter, qlen * num_experts_per_tok)
            .contiguous()
        )
        weights = torch.rand((test_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
        input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
        output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
        bsz_tensor = torch.tensor([qlen], device="cpu")

        # 预热迭代
        for i in tqdm(range(warm_up_iter), desc="Warm-up"):
            # print(f'warmup iteration {i}')
            # start_it = time.time_ns()
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i].data_ptr(),
                    weights[i].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
            # end_it = time.time_ns()
            # print('python Time(ns): ', end_it - start_it)

        # 测试迭代
        start = time.perf_counter()
        for i in tqdm(range(test_iter), desc="Testing"):
            # print(f'test iteration {i}')
            # start_it = time.time_ns()
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i].data_ptr(),
                    weights[i].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
            # end_it = time.time_ns()
            # print('python Time(ns): ', end_it - start_it)
        end = time.perf_counter()
        total_time = end - start

        # 计算性能指标
        time_per_iter_us = total_time / test_iter * 1e6
        bandwidth = (
            hidden_size
            * intermediate_size
            * 3
            * num_experts_per_tok
            * (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
            * bytes_per_elem
            * test_iter
            / total_time
            / 1e9
        )  # 单位：GB/s
        flops = (
            hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
        )  # 单位：TFLOPS

        print("Quant mode: ", quant_mode)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", time_per_iter_us)
        print("Bandwidth: ", bandwidth, "GB/s")
        print("Flops: ", flops, "TFLOPS")
        print("")

        # 整理结果记录，包括测试参数
        result = {
            "test_name": os.path.basename(__file__),
            "quant_mode": quant_mode,
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": flops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "expert_num": expert_num,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "max_len": max_len,
                "num_experts_per_tok": num_experts_per_tok,
                "layer_num": layer_num,
                "qlen": qlen,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "CPUInfer_parameter": CPUINFER_PARAM,
            },
        }
        # 添加 git 提交记录信息
        result.update(get_git_commit())
        # 添加系统信息（包括 CPU 核数和 socket 数量）
        result.update(get_system_info())
        # 将结果以 JSON 形式追加到文件中
        record_results(result)


if __name__ == "__main__":
    # 选择需要测试的量化模式
    # bench_moe("bf16")
    # bench_moe("int8")
    bench_moe("int4")


================================================
FILE: kt-kernel/bench/bench_moe_torch.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-07-25 10:32:57
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq

scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset

expert_num = 160
hidden_size = 5120
intermediate_size = 1536
num_experts_per_tok = 6
layer_num = 10
qlen = 1
warm_up_iter = 1000
test_iter = 10000

def act_fn(x):
    return x / (1.0 + torch.exp(-x))

def mlp_torch(input, gate_proj, up_proj, down_proj):
    if isinstance(gate_proj, nnq.Linear):
        input_q = torch.quantize_per_tensor(input.to(torch.float32), scale, zero_point, torch.quint8)
        gate_buf = gate_proj(input_q)
        up_buf = up_proj(input_q)
        gate_buf = gate_buf.dequantize()
        up_buf = up_buf.dequantize()
        intermediate = act_fn(gate_buf) * up_buf
        intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
        expert_output = down_proj(intermediate_q)
        ret = expert_output.dequantize()
    else:
        gate_buf = torch.mm(input.to(gate_proj.dtype), gate_proj.t())
        up_buf = torch.mm(input.to(up_proj.dtype), up_proj.t())
        intermediate = act_fn(gate_buf) * up_buf
        ret = torch.mm(intermediate.to(down_proj.dtype), down_proj.t())
    return ret

def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0
    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )
    return t_output

def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "fp32":
            proj_type = torch.float32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = torch.float16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = torch.bfloat16
            bytes_per_elem = 2.000000
        elif quant_mode == "qint8":
            proj_type = torch.qint8
            bytes_per_elem = 1.000000
        else:
            assert(False)

        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            if quant_mode == "qint8":
                quantized_gate_proj = []
                quantized_up_proj = []
                quantized_down_proj = []
                for i in range(expert_num):
                    gate_proj_q = torch.quantize_per_tensor(gate_proj[i], scale, zero_point, torch.qint8)
                    quantized_gate = nnq.Linear(hidden_size, intermediate_size)
                    quantized_gate.set_weight_bias(gate_proj_q, None)
                    quantized_gate_proj.append(quantized_gate)
                    up_proj_q = torch.quantize_per_tensor(up_proj[i], scale, zero_point, torch.qint8)
                    quantized_up = nnq.Linear(hidden_size, intermediate_size)
                    quantized_up.set_weight_bias(up_proj_q, None)
                    quantized_up_proj.append(quantized_up)
                    down_proj_q = torch.quantize_per_tensor(down_proj[i], scale, zero_point, torch.qint8)
                    quantized_down = nnq.Linear(intermediate_size, hidden_size)
                    quantized_down.set_weight_bias(down_proj_q, None)
                    quantized_down_proj.append(quantized_down)
                gate_projs.append(quantized_gate_proj)
                up_projs.append(quantized_up_proj)
                down_projs.append(quantized_down_proj)
            else:
                gate_projs.append(gate_proj.to(proj_type))
                up_projs.append(up_proj.to(proj_type))
                down_projs.append(down_proj.to(proj_type))
        expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device = "cuda")[:num_experts_per_tok] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
        weights = torch.rand((layer_num, qlen, num_experts_per_tok), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            moe_torch(input[i % layer_num], expert_ids[i % layer_num], weights[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            moe_torch(input[i % layer_num], expert_ids[i % layer_num], weights[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * num_experts_per_tok * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_moe("fp32")
bench_moe("fp16")
bench_moe("bf16")
bench_moe("qint8")


================================================
FILE: kt-kernel/bench/bench_write_buffer.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Benchmark write_weight_scale_to_buffer for AMX MOE operators.

Supports:
- FP8: FP8 weights (1 byte) + float32 scales (block-wise)
- FP8_PERCHANNEL: FP8 weights (1 byte) + float32 per-channel scales
- BF16: Native BF16 weights (2 bytes), no scales

Usage:
    python bench_write_buffer.py          # Run all modes
    python bench_write_buffer.py fp8      # Run FP8 only
    python bench_write_buffer.py fp8_perchannel  # Run FP8 per-channel only
    python bench_write_buffer.py bf16     # Run BF16 only
"""
import json
import os
import platform
import subprocess
import sys
import time

from tqdm import tqdm

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))

from kt_kernel import kt_kernel_ext
import torch

# Benchmark parameters
expert_num = 256
num_experts_per_tok = 8
gpu_tp_count = 2

warm_up_iter = 30
test_iter = 70

gpu_experts_num = expert_num

hidden_size = 7168
intermediate_size = 2048
group_size = 128  # FP8 uses 128x128 block-wise scales
max_len = 1

physical_to_logical_map = torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
CPUInfer = kt_kernel_ext.CPUInfer(80)


def get_git_commit():
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg
        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        result["dirty"] = bool(dirty_output)
        if dirty_output:
            result["dirty_files"] = dirty_output.splitlines()
    except Exception as e:
        result["error"] = str(e)
    return result


def get_system_info():
    info = {}
    info["system_name"] = platform.uname().system
    info["node_name"] = platform.uname().node
    info["cpu_core_count"] = os.cpu_count()
    if os.path.exists("/proc/cpuinfo"):
        with open("/proc/cpuinfo", "r") as f:
            for line in f:
                if "model name" in line:
                    info["cpu_model"] = line.split(":", 1)[1].strip()
                    break
    if os.path.exists("/proc/meminfo"):
        with open("/proc/meminfo", "r") as f:
            for line in f:
                if "MemTotal" in line:
                    mem_kb = float(line.split(":", 1)[1].split()[0])
                    info["memory_size_GB"] = round(mem_kb / (1024 * 1024), 2)
                    break
    return info


script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, script_name + ".jsonl")


def record_results(result, filename=json_path):
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


def div_up(a, b):
    return (a + b - 1) // b


# ==============================================================================
# FP8 Functions
# ==============================================================================


def allocate_weights_fp8():
    per_mat_weight_bytes = hidden_size * intermediate_size
    n_blocks_n_gate_up = div_up(intermediate_size, group_size)
    n_blocks_k = div_up(hidden_size, group_size)
    per_mat_scale_elems_gate_up = n_blocks_n_gate_up * n_blocks_k
    per_mat_scale_elems_down = n_blocks_k * n_blocks_n_gate_up

    gate_q = (
        torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8, device="cuda")
        .to("cpu")
        .contiguous()
    )
    up_q = (
        torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8, device="cuda")
        .to("cpu")
        .contiguous()
    )
    down_q = (
        torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8, device="cuda")
        .to("cpu")
        .contiguous()
    )
    gate_scale = (
        torch.randn(expert_num * per_mat_scale_elems_gate_up, dtype=torch.float32, device="cuda").to("cpu").contiguous()
    )
    up_scale = (
        torch.randn(expert_num * per_mat_scale_elems_gate_up, dtype=torch.float32, device="cuda").to("cpu").contiguous()
    )
    down_scale = (
        torch.randn(expert_num * per_mat_scale_elems_down, dtype=torch.float32, device="cuda").to("cpu").contiguous()
    )

    return {
        "gate_q": gate_q,
        "up_q": up_q,
        "down_q": down_q,
        "gate_scale": gate_scale,
        "up_scale": up_scale,
        "down_scale": down_scale,
        "per_mat_weight_bytes": per_mat_weight_bytes,
        "per_mat_scale_elems_gate_up": per_mat_scale_elems_gate_up,
        "per_mat_scale_elems_down": per_mat_scale_elems_down,
    }


def allocate_weights_fp8_perchannel():
    per_mat_weight_bytes = hidden_size * intermediate_size
    per_mat_scale_elems_gate_up = intermediate_size
    per_mat_scale_elems_down = hidden_size

    gate_q = (
        torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8, device="cuda")
        .to("cpu")
        .contiguous()
    )
    up_q = (
        torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8, device="cuda")
        .to("cpu")
        .contiguous()
    )
    down_q = (
        torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8, device="cuda")
        .to("cpu")
        .contiguous()
    )
    gate_scale = (
        torch.randn(expert_num * per_mat_scale_elems_gate_up, dtype=torch.float32, device="cuda").to("cpu").contiguous()
    )
    up_scale = (
        torch.randn(expert_num * per_mat_scale_elems_gate_up, dtype=torch.float32, device="cuda").to("cpu").contiguous()
    )
    down_scale = (
        torch.randn(expert_num * per_mat_scale_elems_down, dtype=torch.float32, device="cuda").to("cpu").contiguous()
    )

    return {
        "gate_q": gate_q,
        "up_q": up_q,
        "down_q": down_q,
        "gate_scale": gate_scale,
        "up_scale": up_scale,
        "down_scale": down_scale,
        "per_mat_weight_bytes": per_mat_weight_bytes,
        "per_mat_scale_elems_gate_up": per_mat_scale_elems_gate_up,
        "per_mat_scale_elems_down": per_mat_scale_elems_down,
    }


def build_moe_fp8(layer_idx=0):
    """Build a single FP8 MOE instance."""
    weights = allocate_weights_fp8()

    config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
    config.max_len = max_len
    config.layer_idx = layer_idx
    config.quant_config.bits = 8
    config.quant_config.group_size = group_size
    config.quant_config.zero_point = False
    config.pool = CPUInfer.backend_
    config.gate_proj = weights["gate_q"].data_ptr()
    config.up_proj = weights["up_q"].data_ptr()
    config.down_proj = weights["down_q"].data_ptr()
    config.gate_scale = weights["gate_scale"].data_ptr()
    config.up_scale = weights["up_scale"].data_ptr()
    config.down_scale = weights["down_scale"].data_ptr()

    moe = kt_kernel_ext.moe.AMXFP8_MOE(config)
    CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
    CPUInfer.sync()

    buffer_shapes = {
        "per_mat_weight_bytes": weights["per_mat_weight_bytes"],
        "per_mat_scale_elems_gate_up": weights["per_mat_scale_elems_gate_up"],
        "per_mat_scale_elems_down": weights["per_mat_scale_elems_down"],
    }

    return moe, buffer_shapes, weights


def build_moe_fp8_perchannel(layer_idx=0):
    """Build a single FP8 per-channel MOE instance."""
    weights = allocate_weights_fp8_perchannel()

    config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
    config.max_len = max_len
    config.layer_idx = layer_idx
    config.quant_config.bits = 8
    config.quant_config.group_size = 0
    config.quant_config.zero_point = False
    config.quant_config.per_channel = True
    config.pool = CPUInfer.backend_
    config.gate_proj = weights["gate_q"].data_ptr()
    config.up_proj = weights["up_q"].data_ptr()
    config.down_proj = weights["down_q"].data_ptr()
    config.gate_scale = weights["gate_scale"].data_ptr()
    config.up_scale = weights["up_scale"].data_ptr()
    config.down_scale = weights["down_scale"].data_ptr()

    moe = kt_kernel_ext.moe.AMXFP8PerChannel_MOE(config)
    CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
    CPUInfer.sync()

    buffer_shapes = {
        "per_mat_weight_bytes": weights["per_mat_weight_bytes"],
        "per_mat_scale_elems_gate_up": weights["per_mat_scale_elems_gate_up"],
        "per_mat_scale_elems_down": weights["per_mat_scale_elems_down"],
    }

    return moe, buffer_shapes, weights


def allocate_buffers_fp8(buffer_shapes):
    """Allocate output buffers for FP8 single expert."""
    per_mat_weight_bytes = buffer_shapes["per_mat_weight_bytes"]
    per_mat_scale_elems_gate_up = buffer_shapes["per_mat_scale_elems_gate_up"]
    per_mat_scale_elems_down = buffer_shapes["per_mat_scale_elems_down"]

    weight_bytes_per_expert_per_tp = per_mat_weight_bytes // gpu_tp_count
    scale_elems_per_expert_per_tp_gate_up = per_mat_scale_elems_gate_up // gpu_tp_count
    scale_elems_per_expert_per_tp_down = per_mat_scale_elems_down // gpu_tp_count

    w13_weight_bufs = [torch.empty(2 * weight_bytes_per_expert_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    w13_scale_bufs = [
        torch.empty(2 * scale_elems_per_expert_per_tp_gate_up, dtype=torch.float32) for _ in range(gpu_tp_count)
    ]
    w2_weight_bufs = [torch.empty(weight_bytes_per_expert_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    w2_scale_bufs = [torch.empty(scale_elems_per_expert_per_tp_down, dtype=torch.float32) for _ in range(gpu_tp_count)]

    buffer_ptrs = {
        "w13_weight_ptrs": [buf.data_ptr() for buf in w13_weight_bufs],
        "w13_scale_ptrs": [buf.data_ptr() for buf in w13_scale_bufs],
        "w2_weight_ptrs": [buf.data_ptr() for buf in w2_weight_bufs],
        "w2_scale_ptrs": [buf.data_ptr() for buf in w2_scale_bufs],
    }

    keep_tensors = {
        "w13_weight_bufs": w13_weight_bufs,
        "w13_scale_bufs": w13_scale_bufs,
        "w2_weight_bufs": w2_weight_bufs,
        "w2_scale_bufs": w2_scale_bufs,
    }

    return buffer_ptrs, keep_tensors


def allocate_buffers_fp8_perchannel(buffer_shapes):
    """Allocate output buffers for FP8 per-channel single expert."""
    per_mat_weight_bytes = buffer_shapes["per_mat_weight_bytes"]
    per_mat_scale_elems_gate_up = buffer_shapes["per_mat_scale_elems_gate_up"]
    per_mat_scale_elems_down = buffer_shapes["per_mat_scale_elems_down"]

    weight_bytes_per_expert_per_tp = per_mat_weight_bytes // gpu_tp_count
    scale_elems_per_expert_per_tp_gate_up = per_mat_scale_elems_gate_up // gpu_tp_count
    scale_elems_per_expert_per_tp_down = per_mat_scale_elems_down

    w13_weight_bufs = [torch.empty(2 * weight_bytes_per_expert_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    w13_scale_bufs = [
        torch.empty(2 * scale_elems_per_expert_per_tp_gate_up, dtype=torch.float32) for _ in range(gpu_tp_count)
    ]
    w2_weight_bufs = [torch.empty(weight_bytes_per_expert_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    w2_scale_bufs = [torch.empty(scale_elems_per_expert_per_tp_down, dtype=torch.float32) for _ in range(gpu_tp_count)]

    buffer_ptrs = {
        "w13_weight_ptrs": [buf.data_ptr() for buf in w13_weight_bufs],
        "w13_scale_ptrs": [buf.data_ptr() for buf in w13_scale_bufs],
        "w2_weight_ptrs": [buf.data_ptr() for buf in w2_weight_bufs],
        "w2_scale_ptrs": [buf.data_ptr() for buf in w2_scale_bufs],
    }

    keep_tensors = {
        "w13_weight_bufs": w13_weight_bufs,
        "w13_scale_bufs": w13_scale_bufs,
        "w2_weight_bufs": w2_weight_bufs,
        "w2_scale_bufs": w2_scale_bufs,
    }

    return buffer_ptrs, keep_tensors


# ==============================================================================
# BF16 Functions
# ==============================================================================


def allocate_weights_bf16():
    per_mat_weight_elems = hidden_size * intermediate_size
    per_mat_weight_bytes = per_mat_weight_elems * 2  # BF16 = 2 bytes

    gate_proj = (
        torch.randn(expert_num * per_mat_weight_elems, dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
    )
    up_proj = torch.randn(expert_num * per_mat_weight_elems, dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
    down_proj = (
        torch.randn(expert_num * per_mat_weight_elems, dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
    )

    return {
        "gate_proj": gate_proj,
        "up_proj": up_proj,
        "down_proj": down_proj,
        "per_mat_weight_bytes": per_mat_weight_bytes,
        "per_mat_weight_elems": per_mat_weight_elems,
    }


def build_moe_bf16(layer_idx=0):
    """Build a single BF16 MOE instance."""
    weights = allocate_weights_bf16()

    config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
    config.max_len = max_len
    config.layer_idx = layer_idx
    config.pool = CPUInfer.backend_
    config.gate_proj = weights["gate_proj"].data_ptr()
    config.up_proj = weights["up_proj"].data_ptr()
    config.down_proj = weights["down_proj"].data_ptr()
    config.gate_scale = 0
    config.up_scale = 0
    config.down_scale = 0

    moe = kt_kernel_ext.moe.AMXBF16_MOE(config)
    CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
    CPUInfer.sync()

    buffer_shapes = {
        "per_mat_weight_bytes": weights["per_mat_weight_bytes"],
        "per_mat_weight_elems": weights["per_mat_weight_elems"],
    }

    return moe, buffer_shapes, weights


def allocate_buffers_bf16(buffer_shapes):
    """Allocate output buffers for BF16 single expert (no scales)."""
    per_mat_weight_bytes = buffer_shapes["per_mat_weight_bytes"]

    weight_bytes_per_expert_per_tp = per_mat_weight_bytes // gpu_tp_count

    w13_weight_bufs = [torch.empty(2 * weight_bytes_per_expert_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    w2_weight_bufs = [torch.empty(weight_bytes_per_expert_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    # Dummy scale buffers (not used for BF16 but needed for interface)
    w13_scale_bufs = [torch.empty(1, dtype=torch.float32) for _ in range(gpu_tp_count)]
    w2_scale_bufs = [torch.empty(1, dtype=torch.float32) for _ in range(gpu_tp_count)]

    buffer_ptrs = {
        "w13_weight_ptrs": [buf.data_ptr() for buf in w13_weight_bufs],
        "w13_scale_ptrs": [buf.data_ptr() for buf in w13_scale_bufs],
        "w2_weight_ptrs": [buf.data_ptr() for buf in w2_weight_bufs],
        "w2_scale_ptrs": [buf.data_ptr() for buf in w2_scale_bufs],
    }

    keep_tensors = {
        "w13_weight_bufs": w13_weight_bufs,
        "w13_scale_bufs": w13_scale_bufs,
        "w2_weight_bufs": w2_weight_bufs,
        "w2_scale_bufs": w2_scale_bufs,
    }

    return buffer_ptrs, keep_tensors


# ==============================================================================
# Benchmark Functions
# ==============================================================================


def bench_write_buffer(quant_mode: str):
    """Benchmark write_weight_scale_to_buffer for specified quant mode."""
    print(f"\n{'='*60}")
    print(f"{quant_mode.upper()} write_weight_scale_to_buffer benchmark")
    print(f"{'='*60}")

    if quant_mode == "fp8":
        bytes_per_elem = 1.0
        moe_0, buffer_shapes, keep_tensors_0 = build_moe_fp8(layer_idx=0)
        moe_1, _, keep_tensors_1 = build_moe_fp8(layer_idx=1)
        buffer_ptrs, buffer_keep = allocate_buffers_fp8(buffer_shapes)

        # Calculate total bytes including scales
        total_weights = hidden_size * intermediate_size * expert_num * 3
        total_scale_bytes = (
            (buffer_shapes["per_mat_scale_elems_gate_up"] * 2 + buffer_shapes["per_mat_scale_elems_down"])
            * expert_num
            * 4
        )
        bytes_per_call = total_weights + total_scale_bytes

    elif quant_mode == "fp8_perchannel":
        bytes_per_elem = 1.0
        moe_0, buffer_shapes, keep_tensors_0 = build_moe_fp8_perchannel(layer_idx=0)
        moe_1, _, keep_tensors_1 = build_moe_fp8_perchannel(layer_idx=1)
        buffer_ptrs, buffer_keep = allocate_buffers_fp8_perchannel(buffer_shapes)

        total_weights = hidden_size * intermediate_size * expert_num * 3
        total_scale_bytes = (
            (buffer_shapes["per_mat_scale_elems_gate_up"] * 2 + buffer_shapes["per_mat_scale_elems_down"])
            * expert_num
            * 4
        )
        bytes_per_call = total_weights + total_scale_bytes

    elif quant_mode == "bf16":
        bytes_per_elem = 2.0
        moe_0, buffer_shapes, keep_tensors_0 = build_moe_bf16(layer_idx=0)
        moe_1, _, keep_tensors_1 = build_moe_bf16(layer_idx=1)
        buffer_ptrs, buffer_keep = allocate_buffers_bf16(buffer_shapes)

        # BF16: only weights, no scales
        bytes_per_call = hidden_size * intermediate_size * expert_num * 3 * 2  # BF16 = 2 bytes

    else:
        raise ValueError(f"Unsupported quant_mode: {quant_mode}")

    moes = [moe_0, moe_1]

    # Warm-up
    for _ in tqdm(range(warm_up_iter), desc=f"[{quant_mode.upper()}] Warm-up"):
        for moe_idx, moe in enumerate(moes):
            for expert_id in range(gpu_experts_num):
                CPUInfer.submit(
                    moe.write_weight_scale_to_buffer_task(gpu_tp_count=gpu_tp_count, expert_id=expert_id, **buffer_ptrs)
                )
                CPUInfer.sync()

    # Benchmark
    total_time = 0
    for iter_idx in tqdm(range(test_iter), desc=f"[{quant_mode.upper()}] Testing"):
        start = time.perf_counter()
        for moe_idx, moe in enumerate(moes):
            for expert_id in range(gpu_experts_num):
                CPUInfer.submit(
                    moe.write_weight_scale_to_buffer_task(gpu_tp_count=gpu_tp_count, expert_id=expert_id, **buffer_ptrs)
                )
                CPUInfer.sync()
        end = time.perf_counter()
        iter_time = end - start
        total_time += iter_time
        # print(f"  Iter {iter_idx}: {iter_time*1000:.2f} ms")
        time.sleep(0.3)

    # bytes_per_call is for one MOE, we have 2 MOEs
    bytes_per_iter = bytes_per_call * 2
    time_per_iter_ms = total_time / test_iter * 1000
    bandwidth_gbs = bytes_per_iter * test_iter / total_time / 1e9

    print(f"\n{'='*60}")
    print(f"{quant_mode.upper()} write_weight_scale_to_buffer Results (2 MOEs alternating)")
    print(f"{'='*60}")
    print(f"Time per iteration: {time_per_iter_ms:.2f} ms")
    print(f"Bandwidth: {bandwidth_gbs:.2f} GB/s")
    print(f"Experts per MOE: {gpu_experts_num}, MOEs: 2")
    print(f"Time per expert: {time_per_iter_ms/(gpu_experts_num*2)*1000:.2f} us")

    result = {
        "op": f"write_weight_scale_to_buffer_{quant_mode}",
        "quant_mode": quant_mode,
        "time_per_iteration_ms": time_per_iter_ms,
        "bandwidth_GBs": bandwidth_gbs,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "test_parameters": {
            "expert_num": expert_num,
            "hidden_size": hidden_size,
            "intermediate_size": intermediate_size,
            "gpu_tp_count": gpu_tp_count,
            "bytes_per_iter": bytes_per_iter,
            "num_moes": 2,
        },
    }
    if quant_mode == "fp8":
        result["test_parameters"]["group_size"] = group_size

    result.update(get_git_commit())
    result.update(get_system_info())
    record_results(result)

    return bandwidth_gbs


def main(quant_modes=None):
    """Run benchmarks for specified quant modes."""
    if quant_modes is None:
        quant_modes = ["fp8", "fp8_perchannel", "bf16"]

    results = {}
    for mode in quant_modes:
        try:
            bandwidth = bench_write_buffer(mode)
            results[mode] = f"PASSED ({bandwidth:.2f} GB/s)"
        except Exception as e:
            results[mode] = f"FAILED: {e}"
            import traceback

            traceback.print_exc()

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    for mode, result in results.items():
        print(f"  {mode.upper()}: {result}")


if __name__ == "__main__":
    if len(sys.argv) > 1:
        mode = sys.argv[1].lower()
        if mode in ["fp8", "fp8_perchannel", "bf16"]:
            main([mode])
        else:
            print(f"Unknown mode: {mode}. Use 'fp8', 'fp8_perchannel' or 'bf16'")
            sys.exit(1)
    else:
        main()


================================================
FILE: kt-kernel/bench/compare_moe_performance.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
MoE Performance Comparison Script
Compares performance between KTransformers AMX MoE and SGL CPU MoE implementations
"""
import os
import sys
import time
import json
import platform
import subprocess
import argparse
import logging
import signal
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict
from pathlib import Path

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Environment configuration
@dataclass
class EnvironmentConfig:
    malloc_conf: str = "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
    jemalloc_path: str = "/home/xwy/Projects/jemalloc/lib/libjemalloc.so"
    
    def apply(self):
        os.environ['MALLOC_CONF'] = self.malloc_conf
        if os.path.exists(self.jemalloc_path):
            os.environ['LD_PRELOAD'] = self.jemalloc_path
        else:
            logger.warning(f"jemalloc not found at {self.jemalloc_path}")

# Apply environment configuration
env_config = EnvironmentConfig()
env_config.apply()

# Add paths for both implementations
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'build'))
sys.path.insert(0, '/home/xwy/Projects/sgl-cpu-tests')

import torch

# Try importing both implementations
try:
    import kt_kernel_ext
    KTRANSFORMERS_AVAILABLE = True
    logger.info("KTransformers kt_kernel_ext loaded successfully")
except ImportError as e:
    KTRANSFORMERS_AVAILABLE = False
    logger.warning(f"KTransformers kt_kernel_ext not available: {e}")

try:
    from sgl_kernel.common_ops import fused_experts_cpu
    from sgl_kernel.common_ops import convert_weight_packed
    SGL_AVAILABLE = True
    logger.info("SGL kernel loaded successfully")
except ImportError as e:
    SGL_AVAILABLE = False
    logger.warning(f"SGL kernel not available: {e}")

# Try importing int4 support
try:
    # For SGL INT4, we'll check if the sglang-jianan directory exists
    import os
    sglang_path = "/home/xwy/Projects/sglang-jianan"
    if os.path.exists(sglang_path) and os.path.exists(os.path.join(sglang_path, "benchmark/kernels/int4_moe/benchmark_int4_moe.py")):
        SGL_INT4_AVAILABLE = True
        logger.info("SGL INT4 support available (via sglang-jianan)")
    else:
        SGL_INT4_AVAILABLE = False
        logger.warning("SGL INT4 support not available: sglang-jianan directory not found")
except Exception as e:
    SGL_INT4_AVAILABLE = False
    logger.warning(f"SGL INT4 support not available: {e}")

def get_cpu_count() -> int:
    """Get logical CPU core count (including hyperthreading)"""
    cpu_count = None
    
    # Method 1: os.cpu_count()
    try:
        cpu_count = os.cpu_count()
        if cpu_count and cpu_count > 0:
            logger.info(f"Detected {cpu_count} logical CPU cores via os.cpu_count()")
            return cpu_count
    except Exception as e:
        logger.debug(f"os.cpu_count() failed: {e}")
    
    # Method 2: Check /proc/cpuinfo
    try:
        with open('/proc/cpuinfo', 'r') as f:
            cpu_count = sum(1 for line in f if line.strip().startswith('processor'))
        if cpu_count > 0:
            logger.info(f"Detected {cpu_count} logical CPU cores via /proc/cpuinfo")
            return cpu_count
    except Exception as e:
        logger.debug(f"Failed to read /proc/cpuinfo: {e}")
    
    # Default fallback
    logger.warning("Could not detect CPU count, defaulting to 32")
    return 32

def get_physical_cpu_count() -> int:
    """Get physical CPU core count (excluding hyperthreading)"""
    
    # Method 1: Try lscpu command
    try:
        result = subprocess.run(['lscpu'], capture_output=True, text=True, timeout=5)
        if result.returncode == 0:
            cores_per_socket = None
            sockets = None
            for line in result.stdout.split('\n'):
                if 'Core(s) per socket:' in line:
                    cores_per_socket = int(line.split(':')[1].strip())
                elif 'Socket(s):' in line:
                    sockets = int(line.split(':')[1].strip())
            
            if cores_per_socket and sockets:
                physical_cores = cores_per_socket * sockets
                logger.info(f"Detected {physical_cores} physical CPU cores via lscpu")
                return physical_cores
    except Exception as e:
        logger.debug(f"lscpu failed: {e}")
    
    # Method 2: Check /sys/devices/system/cpu/
    try:
        cpu_path = '/sys/devices/system/cpu/'
        if os.path.exists(cpu_path):
            # Count unique physical core IDs
            physical_cores = set()
            for cpu_dir in os.listdir(cpu_path):
                if cpu_dir.startswith('cpu') and cpu_dir[3:].isdigit():
                    core_id_path = os.path.join(cpu_path, cpu_dir, 'topology/core_id')
                    if os.path.exists(core_id_path):
                        with open(core_id_path, 'r') as f:
                            core_id = f.read().strip()
                            physical_cores.add(core_id)
            
            if physical_cores:
                count = len(physical_cores)
                logger.info(f"Detected {count} physical CPU cores via sysfs")
                return count
    except Exception as e:
        logger.debug(f"Failed to check sysfs: {e}")
    
    # Method 3: Parse /proc/cpuinfo for unique core ids
    try:
        with open('/proc/cpuinfo', 'r') as f:
            content = f.read()
            cores = set()
            current_physical_id = None
            
            for line in content.split('\n'):
                if line.startswith('physical id'):
                    current_physical_id = line.split(':')[1].strip()
                elif line.startswith('core id') and current_physical_id is not None:
                    core_id = line.split(':')[1].strip()
                    cores.add(f"{current_physical_id}:{core_id}")
            
            if cores:
                count = len(cores)
                logger.info(f"Detected {count} physical CPU cores via /proc/cpuinfo")
                return count
    except Exception as e:
        logger.debug(f"Failed to parse /proc/cpuinfo: {e}")
    
    # Fallback: assume hyperthreading is enabled and divide logical cores by 2
    try:
        logical_count = get_cpu_count()
        if logical_count > 0:
            # Assume hyperthreading, so physical cores = logical cores / 2
            physical_count = logical_count // 2
            logger.warning(f"Could not detect physical cores directly. Assuming hyperthreading enabled: {logical_count} logical cores -> {physical_count} physical cores")
            return physical_count
    except:
        pass
    
    # Default fallback
    logger.warning("Could not detect physical CPU count, defaulting to 32")
    return 32

# Test configuration dataclass
@dataclass
class TestConfig:
    expert_num: int = 256
    hidden_size: int = 7168
    intermediate_size: int = 2048
    max_len: int = 25600
    num_experts_per_tok: int = 8
    layer_num: int = 5
    warm_up_iter: int = 100
    test_iter: int = 10000
    qlen_values: List[int] = None
    thread_count_values: List[int] = None
    
    def __post_init__(self):
        if self.qlen_values is None:
            self.qlen_values = [1, 4, 16, 64, 256, 1024, 2048]
        if self.thread_count_values is None:
            # Default to physical CPU core count
            physical_cores = get_physical_cpu_count()
            self.thread_count_values = [physical_cores]
    
    @property
    def total_configurations(self) -> int:
        return len(self.qlen_values) * len(self.thread_count_values)

def get_numa_count() -> int:
    """Get NUMA node count from system with multiple fallback methods"""
    # Method 1: Try numactl
    try:
        result = subprocess.run(['numactl', '--hardware'], 
                              capture_output=True, text=True, timeout=5)
        if result.returncode == 0:
            for line in result.stdout.split('\n'):
                if 'available:' in line and 'nodes' in line:
                    parts = line.split()
                    if len(parts) >= 2 and parts[1].isdigit():
                        numa_count = int(parts[1])
                        logger.info(f"Detected {numa_count} NUMA nodes via numactl")
                        return numa_count
    except (subprocess.TimeoutExpired, FileNotFoundError) as e:
        logger.debug(f"numactl not available: {e}")
    
    # Method 2: Check /sys/devices/system/node/
    try:
        node_path = '/sys/devices/system/node/'
        if os.path.exists(node_path):
            numa_dirs = [d for d in os.listdir(node_path) if d.startswith('node')]
            if numa_dirs:
                numa_count = len(numa_dirs)
                logger.info(f"Detected {numa_count} NUMA nodes via sysfs")
                return numa_count
    except Exception as e:
        logger.debug(f"Failed to check sysfs: {e}")
    
    # Default fallback
    logger.warning("Could not detect NUMA configuration, defaulting to 2 nodes")
    return 2

# System configuration
@dataclass
class SystemConfig:
    numa_count: int = 0
    cpu_cores: int = 0
    
    def __post_init__(self):
        if self.numa_count == 0:
            self.numa_count = get_numa_count()
        if self.cpu_cores == 0:
            self.cpu_cores = get_cpu_count()

sys_config = SystemConfig()

@dataclass
class ThreadConfig:
    thread_count: int
    threads_per_numa: int
    sgl_thread_count: int
    numa_prefix: str
    
    @classmethod
    def from_thread_count(cls, thread_count: int, numa_count: int, cpu_cores: int) -> 'ThreadConfig':
        """Create thread configuration for a specific thread count"""
        # Validate thread count
        if thread_count > cpu_cores:
            logger.warning(f"thread_count ({thread_count}) > cpu_cores ({cpu_cores}), using all cores")
            thread_count = cpu_cores
        
        threads_per_numa = thread_count // numa_count
        sgl_thread_count = threads_per_numa
        last_core = sgl_thread_count - 1
        numa_prefix = f"numactl --physcpubind=0-{last_core} --membind=0"
        
        return cls(
            thread_count=thread_count,
            threads_per_numa=threads_per_numa,
            sgl_thread_count=sgl_thread_count,
            numa_prefix=numa_prefix
        )

def get_system_info() -> Dict[str, any]:
    """Get comprehensive system information"""
    info = {}
    
    # Basic system info
    uname = platform.uname()
    info["system_name"] = uname.system
    info["node_name"] = uname.node
    info["release"] = uname.release
    info["machine"] = uname.machine
    info["cpu_count"] = sys_config.cpu_cores
    info["numa_nodes"] = sys_config.numa_count
    
    # CPU model information
    if os.path.exists('/proc/cpuinfo'):
        try:
            with open('/proc/cpuinfo', 'r') as f:
                cpu_info = f.read()
                for line in cpu_info.split('\n'):
                    if "model name" in line:
                        info["cpu_model"] = line.split(":", 1)[1].strip()
                        break
                # Check for CPU features
                if "flags" in cpu_info:
                    flags_line = next(line for line in cpu_info.split('\n') if "flags" in line)
                    flags = flags_line.split(":", 1)[1].strip().split()
                    info["cpu_features"] = {
                        "avx2": "avx2" in flags,
                        "avx512": any(f.startswith("avx512") for f in flags),
                        "amx": any("amx" in f for f in flags)
                    }
        except Exception as e:
            logger.debug(f"Failed to read CPU info: {e}")
    
    # Memory information
    try:
        import psutil
        mem = psutil.virtual_memory()
        info["total_memory_gb"] = round(mem.total / (1024**3), 2)
        info["available_memory_gb"] = round(mem.available / (1024**3), 2)
    except ImportError:
        pass
    
    # Python and PyTorch versions
    info["python_version"] = sys.version.split()[0]
    info["torch_version"] = torch.__version__
    info["cuda_available"] = torch.cuda.is_available()
    if torch.cuda.is_available():
        info["cuda_version"] = torch.version.cuda
    
    return info

@dataclass
class BenchmarkResult:
    implementation: str
    quant_mode: str
    qlen: int
    thread_count: int
    total_time: float
    time_per_iter_us: float
    bandwidth_gbs: float
    tflops: float
    iterations: int
    
    def to_dict(self) -> Dict:
        return asdict(self)

@dataclass
class CheckpointState:
    """State information for checkpoint/resume functionality"""
    test_config: TestConfig
    completed_configs: List[Tuple[int, int, str, str]]  # (thread_count, qlen, implementation, quant_mode)
    results: List[BenchmarkResult]
    start_time: str
    last_update: str
    
    def to_dict(self) -> Dict:
        return {
            'test_config': asdict(self.test_config),
            'completed_configs': self.completed_configs,
            'results': [r.to_dict() for r in self.results],
            'start_time': self.start_time,
            'last_update': self.last_update
        }
    
    @classmethod
    def from_dict(cls, data: Dict) -> 'CheckpointState':
        test_config = TestConfig(**data['test_config'])
        results = [BenchmarkResult(**r) for r in data['results']]
        return cls(
            test_config=test_config,
            completed_configs=data['completed_configs'],
            results=results,
            start_time=data['start_time'],
            last_update=data['last_update']
        )

class CheckpointManager:
    """Manages checkpoint saving and loading"""
    def __init__(self, checkpoint_dir: str = None):
        self.checkpoint_dir = Path(checkpoint_dir) if checkpoint_dir else Path.cwd() / "checkpoints"
        self.checkpoint_dir.mkdir(exist_ok=True)
        self.checkpoint_file = self.checkpoint_dir / "moe_benchmark_checkpoint.json"
        self.interrupted = False
        
        # Set up signal handler for graceful shutdown
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)
    
    def _signal_handler(self, signum, frame):
        logger.warning(f"Received signal {signum}, will save checkpoint after current test...")
        self.interrupted = True
    
    def save_checkpoint(self, state: CheckpointState):
        """Save checkpoint to file"""
        state.last_update = datetime.now().isoformat()
        
        # Save to temporary file first for atomicity
        temp_file = self.checkpoint_file.with_suffix('.tmp')
        try:
            with open(temp_file, 'w') as f:
                json.dump(state.to_dict(), f, indent=2)
            
            # Atomically rename
            temp_file.replace(self.checkpoint_file)
            logger.info(f"Checkpoint saved: {len(state.results)} results, {len(state.completed_configs)} configs completed")
        except Exception as e:
            logger.error(f"Failed to save checkpoint: {e}")
            if temp_file.exists():
                temp_file.unlink()
    
    def load_checkpoint(self) -> Optional[CheckpointState]:
        """Load checkpoint from file if exists"""
        if not self.checkpoint_file.exists():
            return None
        
        try:
            with open(self.checkpoint_file, 'r') as f:
                data = json.load(f)
            state = CheckpointState.from_dict(data)
            logger.info(f"Loaded checkpoint: {len(state.results)} results, {len(state.completed_configs)} configs completed")
            logger.info(f"Checkpoint started at {state.start_time}, last updated {state.last_update}")
            return state
        except Exception as e:
            logger.error(f"Failed to load checkpoint: {e}")
            return None
    
    def clear_checkpoint(self):
        """Remove checkpoint file"""
        if self.checkpoint_file.exists():
            self.checkpoint_file.unlink()
            logger.info("Checkpoint cleared")

def bench_ktransformers_moe(test_config: TestConfig, quant_mode: str, qlen: int, 
                           thread_config: ThreadConfig) -> Optional[BenchmarkResult]:
    """Benchmark KTransformers AMX MoE implementation"""
    if not KTRANSFORMERS_AVAILABLE:
        logger.error("KTransformers not available, skipping benchmark")
        return None
    
    # Adjust iterations based on qlen to maintain reasonable runtime
    adjusted_iterations = test_config.test_iter
    adjusted_warmup = test_config.warm_up_iter
    if qlen >= 1024:
        adjusted_iterations = max(10, test_config.test_iter // 100)
        adjusted_warmup = max(5, test_config.warm_up_iter // 20)
    elif qlen >= 256:
        adjusted_iterations = max(50, test_config.test_iter // 20)
        adjusted_warmup = max(10, test_config.warm_up_iter // 10)
    elif qlen >= 64:
        adjusted_iterations = max(100, test_config.test_iter // 10)
        adjusted_warmup = max(20, test_config.warm_up_iter // 5)
    elif qlen >= 16:
        adjusted_iterations = max(200, test_config.test_iter // 5)
        adjusted_warmup = max(40, test_config.warm_up_iter // 2)
    
    logger.info(f"Testing KTransformers MoE: quant={quant_mode}, qlen={qlen}, threads={thread_config.thread_count}, "
                f"iterations={adjusted_iterations} (warmup={adjusted_warmup})")
    
    # Set thread count for this test
    os.environ['OMP_NUM_THREADS'] = str(thread_config.thread_count)
    
    try:
        with torch.inference_mode():
            # Setup worker config with consistent threads per NUMA
            worker_config = kt_kernel_ext.WorkerPoolConfig()
            worker_config.subpool_count = sys_config.numa_count
            worker_config.subpool_numa_map = list(range(sys_config.numa_count))
            worker_config.subpool_thread_count = [thread_config.threads_per_numa] * sys_config.numa_count
            CPUInfer = kt_kernel_ext.CPUInfer(worker_config)
        
            # Create MoE layers
            moes = []
            gate_projs = []
            up_projs = []
            down_projs = []
            
            logger.debug(f"Creating {test_config.layer_num} MoE layers...")
            for i in range(test_config.layer_num):
                gate_proj = torch.randn((test_config.expert_num, test_config.intermediate_size, test_config.hidden_size), 
                                      dtype=torch.float32).contiguous()
                up_proj = torch.randn((test_config.expert_num, test_config.intermediate_size, test_config.hidden_size), 
                                    dtype=torch.float32).contiguous()
                down_proj = torch.randn((test_config.expert_num, test_config.hidden_size, test_config.intermediate_size), 
                                      dtype=torch.float32).contiguous()
            
                config = kt_kernel_ext.moe.MOEConfig(
                    test_config.expert_num, test_config.num_experts_per_tok, 
                    test_config.hidden_size, test_config.intermediate_size)
                config.max_len = test_config.max_len
                config.gate_proj = gate_proj.data_ptr()
                config.up_proj = up_proj.data_ptr()
                config.down_proj = down_proj.data_ptr()
                config.pool = CPUInfer.backend_
            
                if quant_mode == "bf16":
                    moe = kt_kernel_ext.moe.AMXBF16_MOE(config)
                elif quant_mode == "int8":
                    moe = kt_kernel_ext.moe.AMXInt8_MOE(config)
                elif quant_mode == "int4":
                    moe = kt_kernel_ext.moe.AMXInt4_MOE(config)
                else:
                    raise ValueError(f"Unsupported quantization mode: {quant_mode}")
                
                CPUInfer.submit(moe.load_weights_task())
                CPUInfer.sync()
                gate_projs.append(gate_proj)
                up_projs.append(up_proj)
                down_projs.append(down_proj)
                moes.append(moe)
        
            # Prepare test data
            logger.debug("Preparing test data...")
            gen_iter = 1000
            expert_ids = torch.rand(gen_iter * qlen, test_config.expert_num).argsort(dim=-1)[
                :, :test_config.num_experts_per_tok
            ].reshape(gen_iter, qlen * test_config.num_experts_per_tok).contiguous()
            
            weights = torch.rand((gen_iter, qlen, test_config.num_experts_per_tok), 
                               dtype=torch.float32).contiguous()
            input_tensor = torch.randn((test_config.layer_num, qlen, test_config.hidden_size), 
                                     dtype=torch.bfloat16).contiguous()
            output_tensor = torch.empty((test_config.layer_num, qlen, test_config.hidden_size), 
                                      dtype=torch.bfloat16).contiguous()
            bsz_tensor = torch.tensor([qlen], dtype=torch.int32)
        
            # Warmup
            logger.debug(f"Running {adjusted_warmup} warmup iterations...")
            for i in range(adjusted_warmup):
                layer_idx = i % test_config.layer_num
                gen_idx = i % gen_iter
                CPUInfer.submit(
                    moes[layer_idx].forward_task(
                        bsz_tensor.data_ptr(),
                        test_config.num_experts_per_tok,
                        expert_ids[gen_idx].data_ptr(),
                        weights[gen_idx].data_ptr(),
                        input_tensor[layer_idx].data_ptr(),
                        output_tensor[layer_idx].data_ptr(),
                        False,
                    )
                )
                CPUInfer.sync()
        
            # Benchmark
            logger.debug(f"Running {adjusted_iterations} benchmark iterations...")
            start = time.perf_counter()
            for i in range(adjusted_iterations):
                layer_idx = i % test_config.layer_num
                gen_idx = i % gen_iter
                CPUInfer.submit(
                    moes[layer_idx].forward_task(
                        bsz_tensor.data_ptr(),
                        test_config.num_experts_per_tok,
                        expert_ids[gen_idx].data_ptr(),
                        weights[gen_idx].data_ptr(),
                        input_tensor[layer_idx].data_ptr(),
                        output_tensor[layer_idx].data_ptr(),
                        False,
                    )
                )
                CPUInfer.sync()
            end = time.perf_counter()
        
            # Calculate metrics
            total_time = end - start
            time_per_iter_us = total_time / adjusted_iterations * 1e6
            
            # Bytes per element based on quantization
            bytes_per_elem = {
                "bf16": 2.0,
                "int8": 1.0,
                "int4": 0.5
            }.get(quant_mode, 2.0)
            
            # Memory bandwidth calculation (GB/s)
            memory_per_iter = (
                test_config.hidden_size * test_config.intermediate_size * 3 * 
                test_config.num_experts_per_tok * 
                (1/8 * test_config.expert_num * (1-(31/32)**qlen)) * bytes_per_elem
            )
            bandwidth_gbs = memory_per_iter * adjusted_iterations / total_time / 1e9
            
            # FLOPS calculation (TFLOPS)
            flops_per_iter = (
                test_config.hidden_size * test_config.intermediate_size * qlen * 3 * 
                test_config.num_experts_per_tok * 2
            )
            tflops = flops_per_iter * adjusted_iterations / total_time / 1e12
            
            logger.info(f"Results - Time: {total_time:.4f}s, Per-iter: {time_per_iter_us:.2f}μs, "
                       f"BW: {bandwidth_gbs:.2f} GB/s, TFLOPS: {tflops:.2f}")
            
            return BenchmarkResult(
                implementation="KTransformers",
                quant_mode=quant_mode,
                qlen=qlen,
                thread_count=thread_config.thread_count,
                total_time=total_time,
                time_per_iter_us=time_per_iter_us,
                bandwidth_gbs=bandwidth_gbs,
                tflops=tflops,
                iterations=adjusted_iterations
            )
            
    except Exception as e:
        logger.error(f"KTransformers benchmark failed: {e}", exc_info=True)
        return None

def run_sgl_int4_with_numactl(test_config: TestConfig, qlen: int, 
                             thread_config: ThreadConfig) -> Optional[BenchmarkResult]:
    """Run SGL INT4 benchmark with numactl in subprocess"""
    if not SGL_INT4_AVAILABLE:
        logger.error("SGL INT4 not available, skipping benchmark")
        return None
    
    # Calculate SGL intermediate size (divided by NUMA nodes)
    sgl_intermediate_size = test_config.intermediate_size // sys_config.numa_count
    
    # Adjust iterations based on qlen to maintain reasonable runtime
    adjusted_iterations = test_config.test_iter
    adjusted_warmup = test_config.warm_up_iter
    if qlen >= 1024:
        adjusted_iterations = max(10, test_config.test_iter // 100)
        adjusted_warmup = max(5, test_config.warm_up_iter // 20)
    elif qlen >= 256:
        adjusted_iterations = max(50, test_config.test_iter // 20)
        adjusted_warmup = max(10, test_config.warm_up_iter // 10)
    elif qlen >= 64:
        adjusted_iterations = max(100, test_config.test_iter // 10)
        adjusted_warmup = max(20, test_config.warm_up_iter // 5)
    elif qlen >= 16:
        adjusted_iterations = max(200, test_config.test_iter // 5)
        adjusted_warmup = max(40, test_config.warm_up_iter // 2)
    
    logger.info(f"Testing SGL INT4: qlen={qlen}, iterations={adjusted_iterations} (warmup={adjusted_warmup}), "
                f"threads per NUMA: {thread_config.sgl_thread_count}")
    
    script_content = f'''
import sys
sys.path.insert(0, '/home/xwy/Projects/sglang-jianan')
sys.path.insert(0, '/home/xwy/Projects/sglang-jianan/test')

import os
import torch
import numpy as np
import sgl_kernel
from srt.cpu.utils import autoawq_to_int4pack
import time

torch.manual_seed(1111)
M, N, K, E, topk = {qlen}, {sgl_intermediate_size}, {test_config.hidden_size}, {test_config.expert_num}, {test_config.num_experts_per_tok}
layer_num = {test_config.layer_num}
group_size = 128
kernel = torch.ops.sgl_kernel

# Prepare int4 data
dtype = torch.bfloat16
device = "cpu"

# Generate input activations for all layers
input_tensors = [torch.rand(M, K, dtype=dtype, device=device) / np.sqrt(K) for _ in range(layer_num)]

# Generate weights and pack for each layer
all_awq_w13_weight_pack = []
all_awq_w13_zero_pack = []
all_awq_w13_scales_pack = []
all_awq_w2_weight_pack = []
all_awq_w2_zero_pack = []
all_awq_w2_scales_pack = []

# Generate expert routing scores (different for each iteration)
gen_iter = 1000
all_topk_weights = []
all_topk_ids = []

for gen_idx in range(gen_iter):
    score = torch.rand(M, E, dtype=dtype, device=device)
    score = torch.softmax(score, dim=-1, dtype=torch.float32)
    topk_weight, topk_ids = torch.topk(score, topk)
    all_topk_weights.append(topk_weight)
    all_topk_ids.append(topk_ids.to(torch.int32))

print("Creating " + str(layer_num) + " MoE layers...")
for layer_idx in range(layer_num):
    # Generate INT4 quantized weights for each expert
    # w1: gate and up projection (K -> 2*N)
    awq_w13_weight = torch.randint(-127, 128, (E, K, 2 * N // 8), device=device).to(torch.int)
    awq_w13_zero = torch.randint(0, 10, (E, K // group_size, 2 * N // 8), device=device).to(torch.int)
    awq_w13_scales = torch.rand(E, K // group_size, 2 * N, dtype=dtype, device=device)
    
    # w2: down projection (N -> K)  
    awq_w2_weight = torch.randint(-127, 128, (E, N, K // 8), device=device).to(torch.int)
    awq_w2_zero = torch.randint(0, 10, (E, N // group_size, K // 8), device=device).to(torch.int)
    awq_w2_scales = torch.rand(E, N // group_size, K, dtype=dtype, device=device)
    
    # Pack weights for optimized kernel
    awq_w13_weight_pack = []
    awq_w13_zero_pack = []
    awq_w13_scales_pack = []
    awq_w2_weight_pack = []
    awq_w2_zero_pack = []
    awq_w2_scales_pack = []
    
    for i in range(E):
        packed_weight_13, packed_zero_13, packed_scales_13 = autoawq_to_int4pack(
            awq_w13_weight[i], awq_w13_zero[i], awq_w13_scales[i], False
        )
        awq_w13_weight_pack.append(packed_weight_13)
        awq_w13_zero_pack.append(packed_zero_13)
        awq_w13_scales_pack.append(packed_scales_13)
        
        packed_weight_2, packed_zero_2, packed_scales_2 = autoawq_to_int4pack(
            awq_w2_weight[i], awq_w2_zero[i], awq_w2_scales[i], False
        )
        awq_w2_weight_pack.append(packed_weight_2)
        awq_w2_zero_pack.append(packed_zero_2)
        awq_w2_scales_pack.append(packed_scales_2)
    
    all_awq_w13_weight_pack.append(torch.stack(awq_w13_weight_pack).detach())
    all_awq_w13_zero_pack.append(torch.stack(awq_w13_zero_pack).detach())
    all_awq_w13_scales_pack.append(torch.stack(awq_w13_scales_pack).detach())
    all_awq_w2_weight_pack.append(torch.stack(awq_w2_weight_pack).detach())
    all_awq_w2_zero_pack.append(torch.stack(awq_w2_zero_pack).detach())
    all_awq_w2_scales_pack.append(torch.stack(awq_w2_scales_pack).detach())

# Warmup
print("Running " + str({adjusted_warmup}) + " warmup iterations...")
for i in range({adjusted_warmup}):
    layer_idx = i % layer_num
    gen_idx = i % gen_iter
    out = kernel.fused_experts_cpu(
        input_tensors[layer_idx],
        all_awq_w13_weight_pack[layer_idx],
        all_awq_w2_weight_pack[layer_idx],
        all_topk_weights[gen_idx],
        all_topk_ids[gen_idx],
        False,  # inplace
        False,  # use_int8_w8a8
        False,  # use_fp8_w8a16
        True,   # use_int4_w4a16
        all_awq_w13_scales_pack[layer_idx],
        all_awq_w2_scales_pack[layer_idx],
        all_awq_w13_zero_pack[layer_idx],
        all_awq_w2_zero_pack[layer_idx],
        None,   # block_size
        None,   # a1_scale
        None,   # a2_scale
        True,   # is_vnni
    )

# Benchmark
print("Running " + str({adjusted_iterations}) + " benchmark iterations...")
start = time.perf_counter()
for i in range({adjusted_iterations}):
    layer_idx = i % layer_num
    gen_idx = i % gen_iter
    out = kernel.fused_experts_cpu(
        input_tensors[layer_idx],
        all_awq_w13_weight_pack[layer_idx],
        all_awq_w2_weight_pack[layer_idx],
        all_topk_weights[gen_idx],
        all_topk_ids[gen_idx],
        False,
        False,
        False,
        True,
        all_awq_w13_scales_pack[layer_idx],
        all_awq_w2_scales_pack[layer_idx],
        all_awq_w13_zero_pack[layer_idx],
        all_awq_w2_zero_pack[layer_idx],
        None,
        None,
        None,
        True,
    )
end = time.perf_counter()

total_time = end - start
time_per_iter_us = total_time / {adjusted_iterations} * 1e6

# Calculate performance metrics for int4
bytes_per_elem = 0.5  # int4
memory_per_iter = (
    {test_config.hidden_size} * {sgl_intermediate_size} * 3 * {test_config.num_experts_per_tok} * 
    (1/8 * {test_config.expert_num} * (1-(31/32)**{qlen})) * bytes_per_elem
)
bandwidth_gbs = memory_per_iter * {adjusted_iterations} / total_time / 1e9

# FLOPS calculation 
flops_per_iter = {test_config.hidden_size} * {sgl_intermediate_size} * {qlen} * 3 * {test_config.num_experts_per_tok} * 2
tflops = flops_per_iter * {adjusted_iterations} / total_time / 1e12

print(f"SGL_RESULT:{{total_time}},{{time_per_iter_us}},{{bandwidth_gbs}},{{tflops}}")
'''
    
    # Create temporary script in sglang-jianan directory
    sglang_path = "/home/xwy/Projects/sglang-jianan"
    temp_script = f"{sglang_path}/temp_sgl_int4_bench_{os.getpid()}_{qlen}.py"
    
    try:
        with open(temp_script, 'w') as f:
            f.write(script_content)
        
        # Setup environment
        env = os.environ.copy()
        env['MALLOC_CONF'] = env_config.malloc_conf
        if os.path.exists(env_config.jemalloc_path):
            env['LD_PRELOAD'] = env_config.jemalloc_path
        env['OMP_NUM_THREADS'] = str(thread_config.sgl_thread_count)
        
        # Run with numactl from the sglang-jianan directory
        cmd = f"cd {sglang_path} && {thread_config.numa_prefix} python3 {temp_script}"
        logger.debug(f"Running SGL INT4 command: {cmd}")
        
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, env=env, timeout=300)
        
        if result.returncode == 0:
            # Parse result
            for line in result.stdout.split('\n'):
                if line.startswith('SGL_RESULT:'):
                    parts = line.replace('SGL_RESULT:', '').split(',')
                    if len(parts) >= 4:
                        try:
                            total_time = float(parts[0])
                            time_per_iter_us = float(parts[1])
                            bandwidth_gbs = float(parts[2])
                            tflops = float(parts[3])
                            
                            logger.info(f"SGL INT4 Results - Time: {total_time:.4f}s, Per-iter: {time_per_iter_us:.2f}μs, "
                                       f"BW: {bandwidth_gbs:.2f} GB/s, TFLOPS: {tflops:.2f}")
                            
                            return BenchmarkResult(
                                implementation="SGL",
                                quant_mode="int4",
                                qlen=qlen,
                                thread_count=thread_config.thread_count,
                                total_time=total_time,
                                time_per_iter_us=time_per_iter_us,
                                bandwidth_gbs=bandwidth_gbs,
                                tflops=tflops,
                                iterations=adjusted_iterations
                            )
                        except ValueError as e:
                            logger.error(f"Failed to parse SGL INT4 results: {e}")
        else:
            logger.error(f"SGL INT4 subprocess failed with code {result.returncode}")
            logger.error(f"STDOUT: {result.stdout}")
            logger.error(f"STDERR: {result.stderr}")
            
    except subprocess.TimeoutExpired:
        logger.error("SGL INT4 benchmark timed out")
    except Exception as e:
        logger.error(f"SGL INT4 benchmark error: {e}", exc_info=True)
    finally:
        # Clean up
        if os.path.exists(temp_script):
            try:
                os.remove(temp_script)
            except:
                pass
    
    return None

def run_sgl_with_numactl(test_config: TestConfig, qlen: int, 
                        thread_config: ThreadConfig) -> Optional[BenchmarkResult]:
    """Run SGL benchmark with numactl in subprocess"""
    if not SGL_AVAILABLE:
        logger.error("SGL not available, skipping benchmark")
        return None
    
    # Calculate SGL intermediate size (divided by NUMA nodes)
    sgl_intermediate_size = test_config.intermediate_size // sys_config.numa_count
    
    # Adjust iterations based on qlen to maintain reasonable runtime
    adjusted_iterations = test_config.test_iter
    adjusted_warmup = test_config.warm_up_iter
    if qlen >= 1024:
        adjusted_iterations = max(10, test_config.test_iter // 100)
        adjusted_warmup = max(5, test_config.warm_up_iter // 20)
    elif qlen >= 256:
        adjusted_iterations = max(50, test_config.test_iter // 20)
        adjusted_warmup = max(10, test_config.warm_up_iter // 10)
    elif qlen >= 64:
        adjusted_iterations = max(100, test_config.test_iter // 10)
        adjusted_warmup = max(20, test_config.warm_up_iter // 5)
    elif qlen >= 16:
        adjusted_iterations = max(200, test_config.test_iter // 5)
        adjusted_warmup = max(40, test_config.warm_up_iter // 2)
    
    logger.info(f"Testing SGL INT8: qlen={qlen}, iterations={adjusted_iterations} (warmup={adjusted_warmup}), "
                f"threads per NUMA: {thread_config.sgl_thread_count}")
    
    script_content = f'''
import sys
sys.path.insert(0, "/home/xwy/Projects/sgl-cpu-tests")

import os
import torch
from sgl_kernel.common_ops import fused_experts_cpu as fused_experts
from sgl_kernel.common_ops import convert_weight_packed
import time

torch.manual_seed(1111)
M, N, K, E, topk = {qlen}, {sgl_intermediate_size}, {test_config.hidden_size}, {test_config.expert_num}, {test_config.num_experts_per_tok}
layer_num = {test_config.layer_num}

# Generate expert routing scores (different for each iteration)
gen_iter = 1000
all_topk_weights = []
all_topk_ids = []

for gen_idx in range(gen_iter):
    score = torch.randn(M, E).to(dtype=torch.bfloat16)
    score = torch.softmax(score, dim=-1, dtype=torch.float32)
    topk_weight, topk_ids = torch.topk(score, topk)
    all_topk_weights.append(topk_weight)
    all_topk_ids.append(topk_ids.to(torch.int32))

prepack = True
inplace = True
use_int4_w4a16 = False

# Create multiple layers
print("Creating " + str(layer_num) + " MoE layers...")
inputs = []
packed_w1s_int8 = []
packed_w2s_int8 = []
w1_s_list = []
w2_s_list = []

for layer_idx in range(layer_num):
    input_tensor = torch.randn(M, K).to(dtype=torch.bfloat16)
    
    # int8 weights
    w1_int8 = torch.randn(E, 2 * N, K).to(dtype=torch.int8)
    w2_int8 = torch.randn(E, K, N).to(dtype=torch.int8)
    packed_w1_int8 = convert_weight_packed(w1_int8)
    packed_w2_int8 = convert_weight_packed(w2_int8)
    w1_s = torch.rand(E, 2 * N)
    w2_s = torch.rand(E, K)
    
    inputs.append(input_tensor)
    packed_w1s_int8.append(packed_w1_int8)
    packed_w2s_int8.append(packed_w2_int8)
    w1_s_list.append(w1_s)
    w2_s_list.append(w2_s)

# Warmup
print("Running " + str({adjusted_warmup}) + " warmup iterations...")
for i in range({adjusted_warmup}):
    layer_idx = i % layer_num
    gen_idx = i % gen_iter
    fused_experts(inputs[layer_idx], packed_w1s_int8[layer_idx], packed_w2s_int8[layer_idx], 
                 all_topk_weights[gen_idx], all_topk_ids[gen_idx],
                 inplace, True, False, use_int4_w4a16, w1_s_list[layer_idx], w2_s_list[layer_idx], 
                 None, None, None, None, None, prepack)

# Benchmark
print("Running " + str({adjusted_iterations}) + " benchmark iterations...")
start = time.perf_counter()
for i in range({adjusted_iterations}):
    layer_idx = i % layer_num
    gen_idx = i % gen_iter
    fused_experts(inputs[layer_idx], packed_w1s_int8[layer_idx], packed_w2s_int8[layer_idx], 
                 all_topk_weights[gen_idx], all_topk_ids[gen_idx],
                 inplace, True, False, use_int4_w4a16, w1_s_list[layer_idx], w2_s_list[layer_idx], 
                 None, None, None, None, None, prepack)
end = time.perf_counter()

total_time = end - start
time_per_iter_us = total_time / {adjusted_iterations} * 1e6

# Calculate performance metrics for int8
bytes_per_elem = 1.0  # int8
memory_per_iter = (
    {test_config.hidden_size} * {sgl_intermediate_size} * 3 * {test_config.num_experts_per_tok} * 
    (1/8 * {test_config.expert_num} * (1-(31/32)**{qlen})) * bytes_per_elem
)
bandwidth_gbs = memory_per_iter * {adjusted_iterations} / total_time / 1e9

# FLOPS calculation 
flops_per_iter = {test_config.hidden_size} * {sgl_intermediate_size} * {qlen} * 3 * {test_config.num_experts_per_tok} * 2
tflops = flops_per_iter * {adjusted_iterations} / total_time / 1e12

print(f"SGL_RESULT:{{total_time}},{{time_per_iter_us}},{{bandwidth_gbs}},{{tflops}}")
'''
    
    # Create temporary script
    temp_script = f"/tmp/sgl_bench_{os.getpid()}_{qlen}.py"
    
    try:
        with open(temp_script, 'w') as f:
            f.write(script_content)
        
        # Setup environment
        env = os.environ.copy()
        env['MALLOC_CONF'] = env_config.malloc_conf
        if os.path.exists(env_config.jemalloc_path):
            env['LD_PRELOAD'] = env_config.jemalloc_path
        env['OMP_NUM_THREADS'] = str(thread_config.sgl_thread_count)
        
        # Run with numactl
        cmd = f"{thread_config.numa_prefix} python3 {temp_script}"
        logger.debug(f"Running SGL command: {cmd}")
        
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, env=env, timeout=300)
        
        if result.returncode == 0:
            # Parse result
            for line in result.stdout.split('\n'):
                if line.startswith('SGL_RESULT:'):
                    parts = line.replace('SGL_RESULT:', '').split(',')
                    if len(parts) >= 4:
                        try:
                            total_time = float(parts[0])
                            time_per_iter_us = float(parts[1])
                            bandwidth_gbs = float(parts[2])
                            tflops = float(parts[3])
                            
                            logger.info(f"SGL Results - Time: {total_time:.4f}s, Per-iter: {time_per_iter_us:.2f}μs, "
                                       f"BW: {bandwidth_gbs:.2f} GB/s, TFLOPS: {tflops:.2f}")
                            
                            return BenchmarkResult(
                                implementation="SGL",
                                quant_mode="int8",
                                qlen=qlen,
                                thread_count=thread_config.thread_count,
                                total_time=total_time,
                                time_per_iter_us=time_per_iter_us,
                                bandwidth_gbs=bandwidth_gbs,
                                tflops=tflops,
                                iterations=adjusted_iterations
                            )
                        except ValueError as e:
                            logger.error(f"Failed to parse SGL results: {e}")
        else:
            logger.error(f"SGL subprocess failed with code {result.returncode}: {result.stderr}")
            
    except subprocess.TimeoutExpired:
        logger.error("SGL benchmark timed out")
    except Exception as e:
        logger.error(f"SGL benchmark error: {e}", exc_info=True)
    finally:
        # Clean up
        if os.path.exists(temp_script):
            try:
                os.remove(temp_script)
            except:
                pass
    
    return None

def save_results(results: List[BenchmarkResult], test_config: TestConfig, filename: str = None) -> str:
    """Save benchmark results to JSON file"""
    if not filename:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"moe_comparison_{timestamp}.json"
    
    output_data = {
        "timestamp": datetime.now().isoformat(),
        "test_configuration": asdict(test_config),
        "system_info": get_system_info(),
        "results": [r.to_dict() for r in results],
        "summary": {
            "total_benchmarks": len(results),
            "implementations_tested": list(set(r.implementation for r in results)),
            "quantization_modes": list(set(r.quant_mode for r in results)),
            "qlen_values_tested": sorted(set(r.qlen for r in results)),
            "thread_counts_tested": sorted(set(r.thread_count for r in results))
        }
    }
    
    with open(filename, 'w') as f:
        json.dump(output_data, f, indent=2)
    
    logger.info(f"Results saved to: {filename}")
    return filename

def print_summary_table(results: List[BenchmarkResult]):
    """Print formatted summary table of results"""
    if not results:
        return
    
    print("\n" + "=" * 100)
    print("PERFORMANCE SUMMARY")
    print("=" * 100)
    print(f"{'Implementation':<15} {'Quant':<6} {'Threads':<8} {'QLen':<8} {'Time(μs)':<12} {'BW(GB/s)':<12} {'TFLOPS':<10} {'Speedup':<10}")
    print("-" * 100)
    
    # Group by configuration for better comparison
    baseline_times = {}
    
    for result in sorted(results, key=lambda r: (r.thread_count, r.qlen, r.implementation, r.quant_mode)):
        key = (result.thread_count, result.qlen)
        
        if key not in baseline_times:
            baseline_times[key] = result.time_per_iter_us
            speedup = "1.00x"
        else:
            speedup = f"{baseline_times[key]/result.time_per_iter_us:.2f}x"
        
        print(f"{result.implementation:<15} {result.quant_mode:<6} {result.thread_count:<8} "
              f"{result.qlen:<8} {result.time_per_iter_us:<12.2f} {result.bandwidth_gbs:<12.2f} "
              f"{result.tflops:<10.2f} {speedup:<10}")

def main():
    parser = argparse.ArgumentParser(description="Compare MoE performance between KTransformers and SGL")
    parser.add_argument("--qlen", type=int, nargs="+", help="Sequence lengths to test")
    parser.add_argument("--threads", type=int, nargs="+", help="Thread counts to test")
    parser.add_argument("--iterations", type=int, help="Number of test iterations")
    parser.add_argument("--warmup", type=int, help="Number of warmup iterations")
    parser.add_argument("--output", type=str, help="Output filename for results")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")
    parser.add_argument("--resume", action="store_true", help="Resume from checkpoint if available")
    parser.add_argument("--checkpoint-dir", type=str, help="Directory for checkpoint files")
    parser.add_argument("--no-checkpoint", action="store_true", help="Disable checkpoint saving")
    parser.add_argument("--framework", choices=["all", "ktransformers", "sgl"], default="all",
                        help="Framework to test (default: all)")
    parser.add_argument("--precision", choices=["all", "int8", "int4"], default="all",
                        help="Precision to test (default: all)")
    
    args = parser.parse_args()
    
    # Configure logging level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    
    # Create test configuration
    test_config = TestConfig()
    if args.qlen:
        test_config.qlen_values = args.qlen
    if args.threads:
        test_config.thread_count_values = args.threads
    if args.iterations:
        test_config.test_iter = args.iterations
    if args.warmup:
        test_config.warm_up_iter = args.warmup
    
    # Determine which frameworks and precisions to test
    test_ktransformers = args.framework in ["all", "ktransformers"] and KTRANSFORMERS_AVAILABLE
    test_sgl = args.framework in ["all", "sgl"] and (SGL_AVAILABLE or SGL_INT4_AVAILABLE)
    
    # Determine which precisions to test
    test_precisions = []
    if args.precision == "all":
        test_precisions = ["int8", "int4"]
    else:
        test_precisions = [args.precision]
    
    # Print configuration
    logger.info("MoE Performance Comparison")
    logger.info("=" * 60)
    logger.info(f"System configuration:")
    logger.info(f"  CPU cores: {sys_config.cpu_cores}")
    logger.info(f"  NUMA nodes: {sys_config.numa_count}")
    logger.info(f"Test parameters:")
    logger.info(f"  Expert count: {test_config.expert_num}")
    logger.info(f"  Hidden size: {test_config.hidden_size}")
    logger.info(f"  Intermediate size: {test_config.intermediate_size}")
    logger.info(f"  Experts per token: {test_config.num_experts_per_tok}")
    logger.info(f"  Test iterations: {test_config.test_iter}")
    logger.info(f"  Warmup iterations: {test_config.warm_up_iter}")
    logger.info(f"Testing configurations:")
    logger.info(f"  QLEN values: {test_config.qlen_values}")
    logger.info(f"  Thread counts: {test_config.thread_count_values}")
    logger.info(f"  Frameworks: {args.framework}")
    logger.info(f"  Precisions: {args.precision}")
    logger.info(f"  Total configs: {test_config.total_configurations}")
    print()
    
    # Check availability
    if not KTRANSFORMERS_AVAILABLE and not SGL_AVAILABLE:
        logger.error("Neither KTransformers nor SGL is available. Cannot run benchmarks.")
        return 1
    
    # Initialize checkpoint manager
    checkpoint_mgr = CheckpointManager(args.checkpoint_dir) if not args.no_checkpoint else None
    
    # Load checkpoint if resuming
    checkpoint_state = None
    completed_configs = set()
    all_results = []
    start_time = datetime.now().isoformat()
    
    if args.resume and checkpoint_mgr:
        checkpoint_state = checkpoint_mgr.load_checkpoint()
        if checkpoint_state:
            # Verify configuration matches
            if (checkpoint_state.test_config.qlen_values != test_config.qlen_values or
                checkpoint_state.test_config.thread_count_values != test_config.thread_count_values):
                logger.warning("Checkpoint configuration doesn't match current configuration")
                response = input("Continue with checkpoint anyway? (y/n): ")
                if response.lower() != 'y':
                    logger.info("Starting fresh run")
                    checkpoint_state = None
            
            if checkpoint_state:
                all_results = checkpoint_state.results
                completed_configs = set(checkpoint_state.completed_configs)
                start_time = checkpoint_state.start_time
                logger.info(f"Resuming from checkpoint with {len(all_results)} results")
    
    # Create checkpoint state if not loaded
    if not checkpoint_state and checkpoint_mgr:
        checkpoint_state = CheckpointState(
            test_config=test_config,
            completed_configs=[],
            results=[],
            start_time=start_time,
            last_update=start_time
        )
    
    config_count = 0
    total_configs_to_run = 0
    
    # Calculate total configs to run
    for thread_count in test_config.thread_count_values:
        for qlen in test_config.qlen_values:
            if test_ktransformers:
                for quant_mode in test_precisions:
                    if (thread_count, qlen, "KTransformers", quant_mode) not in completed_configs:
                        total_configs_to_run += 1
            if test_sgl:
                if "int8" in test_precisions and SGL_AVAILABLE:
                    if (thread_count, qlen, "SGL", "int8") not in completed_configs:
                        total_configs_to_run += 1
                if "int4" in test_precisions and SGL_INT4_AVAILABLE:
                    if (thread_count, qlen, "SGL", "int4") not in completed_configs:
                        total_configs_to_run += 1
    
    logger.info(f"Total configurations to run: {total_configs_to_run}")
    
    # Test all combinations
    for thread_count in test_config.thread_count_values:
        thread_config = ThreadConfig.from_thread_count(thread_count, sys_config.numa_count, sys_config.cpu_cores)
        logger.info(f"\nThread Configuration: {thread_count} total ({thread_config.threads_per_numa} per NUMA)")
        
        for qlen in test_config.qlen_values:
            # Check for interrupt
            if checkpoint_mgr and checkpoint_mgr.interrupted:
                logger.warning("Interrupt detected, saving checkpoint and exiting...")
                if checkpoint_state:
                    checkpoint_state.results = all_results
                    checkpoint_state.completed_configs = list(completed_configs)
                    checkpoint_mgr.save_checkpoint(checkpoint_state)
                return 2
            
            logger.info(f"\n--- Configuration: threads={thread_count}, qlen={qlen} ---")
            
            # Test KTransformers
            if test_ktransformers:
                for quant_mode in test_precisions:
                    config_key = (thread_count, qlen, "KTransformers", quant_mode)
                    if config_key in completed_configs:
                        logger.info(f"Skipping already completed: KTransformers-{quant_mode}")
                        continue
                    
                    config_count += 1
                    logger.info(f"Progress: {config_count}/{total_configs_to_run}")
                    
                    result = bench_ktransformers_moe(test_config, quant_mode, qlen, thread_config)
                    if result:
                        all_results.append(result)
                        completed_configs.add(config_key)
                        
                        # Save checkpoint after each successful test
                        if checkpoint_mgr and checkpoint_state:
                            checkpoint_state.results = all_results
                            checkpoint_state.completed_configs = list(completed_configs)
                            checkpoint_mgr.save_checkpoint(checkpoint_state)
            
            # Test SGL int8
            if test_sgl and "int8" in test_precisions and SGL_AVAILABLE:
                config_key = (thread_count, qlen, "SGL", "int8")
                if config_key in completed_configs:
                    logger.info("Skipping already completed: SGL-int8")
                    continue
                
                config_count += 1
                logger.info(f"Progress: {config_count}/{total_configs_to_run}")
                
                logger.info(f"Testing SGL MoE (int8): qlen={qlen}, threads={thread_count}")
                sgl_intermediate = test_config.intermediate_size // sys_config.numa_count
                sgl_threads_per_numa = thread_config.sgl_thread_count
                logger.info(f"Using NUMA TP: intermediate_size {test_config.intermediate_size} -> "
                           f"{sgl_intermediate} (/{sys_config.numa_count}), threads per NUMA: {sgl_threads_per_numa}")
                
                result = run_sgl_with_numactl(test_config, qlen, thread_config)
                if result:
                    all_results.append(result)
                    completed_configs.add(config_key)
                    
                    # Save checkpoint after each successful test
                    if checkpoint_mgr and checkpoint_state:
                        checkpoint_state.results = all_results
                        checkpoint_state.completed_configs = list(completed_configs)
                        checkpoint_mgr.save_checkpoint(checkpoint_state)
            
            # Test SGL int4
            if test_sgl and "int4" in test_precisions and SGL_INT4_AVAILABLE:
                config_key = (thread_count, qlen, "SGL", "int4")
                if config_key in completed_configs:
                    logger.info("Skipping already completed: SGL-int4")
                    continue
                
                config_count += 1
                logger.info(f"Progress: {config_count}/{total_configs_to_run}")
                
                logger.info(f"Testing SGL MoE (int4): qlen={qlen}, threads={thread_count}")
                sgl_intermediate = test_config.intermediate_size // sys_config.numa_count
                sgl_threads_per_numa = thread_config.sgl_thread_count
                logger.info(f"Using NUMA TP: intermediate_size {test_config.intermediate_size} -> "
                           f"{sgl_intermediate} (/{sys_config.numa_count}), threads per NUMA: {sgl_threads_per_numa}")
                
                result = run_sgl_int4_with_numactl(test_config, qlen, thread_config)
                if result:
                    all_results.append(result)
                    completed_configs.add(config_key)
                    
                    # Save checkpoint after each successful test
                    if checkpoint_mgr and checkpoint_state:
                        checkpoint_state.results = all_results
                        checkpoint_state.completed_configs = list(completed_configs)
                        checkpoint_mgr.save_checkpoint(checkpoint_state)
    
    # Final summary
    if all_results:
        print_summary_table(all_results)
        
        # Save results
        output_file = save_results(all_results, test_config, args.output)
        
        print(f"\nTotal benchmarks completed: {len(all_results)}")
        print(f"Results saved to: {output_file}")
        
        # Clear checkpoint on successful completion
        if checkpoint_mgr and config_count == total_configs_to_run:
            checkpoint_mgr.clear_checkpoint()
            logger.info("All tests completed successfully, checkpoint cleared")
        elif checkpoint_mgr and config_count < total_configs_to_run:
            logger.warning(f"Only {config_count}/{total_configs_to_run} configurations completed")
            logger.info("Checkpoint preserved for resuming")
        
        # Print best performers per configuration
        print("\nBest performers by configuration:")
        from itertools import groupby
        
        sorted_results = sorted(all_results, key=lambda r: (r.qlen, r.thread_count, r.time_per_iter_us))
        for key, group in groupby(sorted_results, key=lambda r: (r.qlen, r.thread_count)):
            qlen, threads = key
            best = next(group)
            print(f"  QLen={qlen}, Threads={threads}: {best.implementation}-{best.quant_mode} "
                  f"({best.time_per_iter_us:.2f}μs, {best.tflops:.2f} TFLOPS)")
    else:
        logger.error("No successful benchmarks completed.")
        return 1
    
    return 0

if __name__ == "__main__":
    sys.exit(main())

================================================
FILE: kt-kernel/bench/multi_bench_moe.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
自动展开 list 参数的 benchmark 脚本。
只要将所有测试参数放在 all_params 字典中，凡是值为 list 的键都会被自动展开，
生成参数组合后依次调用 bench_moe/bench_moe_amx 运行测试。
"""

import os
import sys
import itertools
from collections.abc import Sequence

# 将当前目录加入搜索路径
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))

#####################################################################
# 1. 在此处一次性写好所有测试参数
#####################################################################
all_params = {
    # 固定参数
    "test_operator_type": "llamafile",   # "llamafile" 或 "amx" "kml"
    "expert_num": 256,
    "num_experts_per_tok": 8,
    "hidden_size": 7168,
    "intermediate_size": 2048,
    "max_len": 25600,         # amx 专用，llamafile 可保留不使用
    "group_max_len": 1024,    # llamafile 专用
    "group_min_len": 10,      # llamafile 专用
    "m_block": [256],             # llamafile 专用
     "qlen": range(1,11,1),
    "layer_num": 3,
    "warm_up_iter": 100,
    "test_iter": 10000,

    # ↓↓↓ 下面这些值是 list，会被自动展开 ↓↓↓
    "CPUINFER_PARAM": [304],
    # "CPUINFER_PARAM": [144], # Kunpeng 920 7280Z
    "quant_mode": "q4_k_m", # llamafile
    # "quant_mode": ["int4", "int8"], # amx
    # "quant_mode": "int8", # amx
}
#####################################################################


def expand_param_dict(param_dict):
    """对值为 list 的键做笛卡儿积展开"""
    vary_keys, vary_values, fixed_items = [], [], {}
    for k, v in param_dict.items():
        if isinstance(v, Sequence) and not isinstance(v, (str, bytes)):
            vary_keys.append(k)
            vary_values.append(v)
        else:
            fixed_items[k] = v

    if not vary_keys:
        yield param_dict
        return

    for combo in itertools.product(*vary_values):
        params = fixed_items.copy()
        params.update(dict(zip(vary_keys, combo)))
        yield params


# 根据 operator 类型动态导入 bench 模块
if all_params["test_operator_type"] == "llamafile":
    import bench_moe as bench
elif all_params["test_operator_type"] == "amx":
    import bench_moe_amx as bench
elif all_params["test_operator_type"] == "kml":
    import bench_moe_kml as bench
else:
    raise ValueError(f"Unknown test_operator_type: {all_params['test_operator_type']}")


def update_bench_parameters(params):
    """同步参数到 bench 模块并重新初始化 CPUInfer"""
    bench.expert_num = params["expert_num"]
    bench.hidden_size = params["hidden_size"]
    bench.intermediate_size = params["intermediate_size"]
    bench.max_len = params["max_len"]
    bench.group_max_len = params["group_max_len"]
    bench.group_min_len = params["group_min_len"]
    bench.m_block = params["m_block"]
    bench.num_experts_per_tok = params["num_experts_per_tok"]
    bench.layer_num = params["layer_num"]
    bench.qlen = params["qlen"]
    bench.warm_up_iter = params["warm_up_iter"]
    bench.test_iter = params["test_iter"]
    bench.CPUINFER_PARAM = params["CPUINFER_PARAM"]
    # 重新初始化 CPUInfer 对象
    bench.CPUInfer = bench.kt_kernel_ext.CPUInfer(bench.CPUINFER_PARAM)


def main():
    for params in expand_param_dict(all_params):
        print("=" * 60)
        print("开始测试参数集:", params)
        update_bench_parameters(params)
        bench.bench_moe(params["quant_mode"])
        print("完成测试，量化模式:", params["quant_mode"])
        print("=" * 60, "\n")


if __name__ == "__main__":
    main()


================================================
FILE: kt-kernel/bench/upload-bench-json.py
================================================
from pymongo import MongoClient, errors
import json
import os

script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)

# === 加载 secrets.json 文件 ===
with open(os.path.join(script_dir,"mongo.json")) as f:
    secrets = json.load(f)

MONGO_URI = secrets["mongo_uri"]
DB_NAME = secrets["db_name"]
COLLECTION_NAME = secrets["collection_name"]

# === 连接 MongoDB ===
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

# 创建唯一索引（只需执行一次）
collection.create_index(
    [("timestamp", 1), ("test_parameters.CPUInfer_parameter", 1)],
    unique=True
)

# === 插入函数 ===
def insert_jsonl_file(file_path):
    total_inserted = 0
    total_skipped = 0

    with open(file_path, "r") as f:
        docs = [json.loads(line) for line in f if line.strip()]
        try:
            result = collection.insert_many(docs, ordered=False)
            inserted = len(result.inserted_ids)
            total_inserted += inserted
            print(f"[✓] {file_path} 插入 {inserted} 条记录")
        except errors.BulkWriteError as e:
            inserted = len(e.details.get("writeErrors", []))
            skipped = len(docs) - inserted
            total_inserted += inserted
            total_skipped += skipped
            print(f"[!] {file_path} 插入 {inserted} 条，跳过重复后 {skipped} 条")
    
    return total_inserted, total_skipped


insert_jsonl_file( os.path.join(script_dir, "bench_results.jsonl"))


================================================
FILE: kt-kernel/cmake/DetectCPU.cmake
================================================
# CPU Feature Detection for kt-kernel
# Detects CPU capabilities and sets appropriate compiler flags

function(detect_cpu_features)
    set(HAS_AVX2 OFF PARENT_SCOPE)
    set(HAS_AVX512F OFF PARENT_SCOPE)
    set(HAS_AVX512_VNNI OFF PARENT_SCOPE)
    set(HAS_AVX512_BF16 OFF PARENT_SCOPE)
    set(HAS_AVX512_VBMI OFF PARENT_SCOPE)
    set(HAS_AMX OFF PARENT_SCOPE)

    if(NOT EXISTS "/proc/cpuinfo")
        message(STATUS "CPU detection: /proc/cpuinfo not found, skipping auto-detection")
        return()
    endif()

    # Read CPU flags from /proc/cpuinfo
    file(READ "/proc/cpuinfo" CPUINFO_CONTENT)
    string(REGEX MATCH "flags[ \t]*:[ \t]*([^\n]*)" FLAGS_LINE "${CPUINFO_CONTENT}")
    if(NOT CMAKE_MATCH_1)
        message(STATUS "CPU detection: Could not parse CPU flags")
        return()
    endif()

    set(CPU_FLAGS "${CMAKE_MATCH_1}")
    string(REPLACE " " ";" CPU_FLAGS_LIST "${CPU_FLAGS}")

    # Check for each feature
    if("avx2" IN_LIST CPU_FLAGS_LIST)
        set(HAS_AVX2 ON PARENT_SCOPE)
    endif()

    if("avx512f" IN_LIST CPU_FLAGS_LIST)
        set(HAS_AVX512F ON PARENT_SCOPE)
    endif()

    if("avx512_vnni" IN_LIST CPU_FLAGS_LIST OR "avx512vnni" IN_LIST CPU_FLAGS_LIST)
        set(HAS_AVX512_VNNI ON PARENT_SCOPE)
    endif()

    if("avx512_bf16" IN_LIST CPU_FLAGS_LIST OR "avx512bf16" IN_LIST CPU_FLAGS_LIST)
        set(HAS_AVX512_BF16 ON PARENT_SCOPE)
    endif()

    if("avx512_vbmi" IN_LIST CPU_FLAGS_LIST OR "avx512vbmi" IN_LIST CPU_FLAGS_LIST)
        set(HAS_AVX512_VBMI ON PARENT_SCOPE)
    endif()

    # Check for AMX (need all three)
    set(AMX_COUNT 0)
    foreach(flag "amx_tile" "amx_int8" "amx_bf16")
        if("${flag}" IN_LIST CPU_FLAGS_LIST)
            math(EXPR AMX_COUNT "${AMX_COUNT} + 1")
        endif()
    endforeach()
    if(AMX_COUNT EQUAL 3)
        set(HAS_AMX ON PARENT_SCOPE)
    endif()

    # Get CPU model name for display
    string(REGEX MATCH "model name[ \t]*:[ \t]*([^\n]*)" MODEL_LINE "${CPUINFO_CONTENT}")
    if(CMAKE_MATCH_1)
        set(CPU_MODEL "${CMAKE_MATCH_1}" PARENT_SCOPE)
    endif()
endfunction()

# Main detection and configuration
message(STATUS "")
message(STATUS "========================================")
message(STATUS "CPU Feature Detection (CMake)")
message(STATUS "========================================")

# Check if variables were already set by install.sh/setup.py
set(FROM_INSTALL_SH OFF)
if(DEFINED LLAMA_AVX512_VNNI OR DEFINED LLAMA_AVX512_BF16 OR DEFINED LLAMA_AVX512_VBMI)
    set(FROM_INSTALL_SH ON)
    message(STATUS "Detected configuration from install.sh/setup.py")
    message(STATUS "  LLAMA_AVX512:      ${LLAMA_AVX512}")
    message(STATUS "  LLAMA_AVX512_VNNI: ${LLAMA_AVX512_VNNI}")
    message(STATUS "  LLAMA_AVX512_BF16: ${LLAMA_AVX512_BF16}")
    message(STATUS "  LLAMA_AVX512_VBMI: ${LLAMA_AVX512_VBMI}")
    message(STATUS "")
    message(STATUS "Skipping auto-detection (using install.sh settings)")
    message(STATUS "========================================")
    message(STATUS "")
    return()
endif()

# Detect CPU features (only if not set by install.sh)
detect_cpu_features()

if(CPU_MODEL)
    message(STATUS "CPU Model: ${CPU_MODEL}")
endif()

message(STATUS "")
message(STATUS "Detected features:")
message(STATUS "  AVX2:         ${HAS_AVX2}")
message(STATUS "  AVX512F:      ${HAS_AVX512F}")
message(STATUS "  AVX512_VNNI:  ${HAS_AVX512_VNNI}")
message(STATUS "  AVX512_BF16:  ${HAS_AVX512_BF16}")
message(STATUS "  AVX512_VBMI:  ${HAS_AVX512_VBMI}")
message(STATUS "  AMX:          ${HAS_AMX}")
message(STATUS "")

# Auto-enable features based on detection
# Only set if not already defined by user via -D flags
if(NOT DEFINED LLAMA_AVX2 AND HAS_AVX2)
    set(LLAMA_AVX2 ON CACHE BOOL "Enable AVX2" FORCE)
    message(STATUS "Auto-enabled: AVX2")
endif()

if(NOT DEFINED LLAMA_AVX512 AND HAS_AVX512F)
    set(LLAMA_AVX512 ON CACHE BOOL "Enable AVX512F" FORCE)
    message(STATUS "Auto-enabled: AVX512F")
endif()

if(NOT DEFINED LLAMA_AVX512_VNNI AND HAS_AVX512_VNNI)
    set(LLAMA_AVX512_VNNI ON CACHE BOOL "Enable AVX512_VNNI" FORCE)
    message(STATUS "Auto-enabled: AVX512_VNNI")
endif()

if(NOT DEFINED LLAMA_AVX512_BF16 AND HAS_AVX512_BF16)
    set(LLAMA_AVX512_BF16 ON CACHE BOOL "Enable AVX512_BF16" FORCE)
    message(STATUS "Auto-enabled: AVX512_BF16")
endif()

if(NOT DEFINED LLAMA_AVX512_VBMI AND HAS_AVX512_VBMI)
    set(LLAMA_AVX512_VBMI ON CACHE BOOL "Enable AVX512_VBMI" FORCE)
    message(STATUS "Auto-enabled: AVX512_VBMI")
endif()

if(NOT DEFINED KTRANSFORMERS_CPU_USE_AMX AND HAS_AMX)
    set(KTRANSFORMERS_CPU_USE_AMX ON CACHE BOOL "Enable AMX" FORCE)
    message(STATUS "Auto-enabled: AMX")
endif()

message(STATUS "")
message(STATUS "Note: You can override by passing -DLLAMA_AVX512_BF16=OFF etc.")
message(STATUS "Note: Or use install.sh with environment variables")
message(STATUS "========================================")
message(STATUS "")


================================================
FILE: kt-kernel/cmake/FindSIMD.cmake
================================================
include(CheckCSourceRuns)

set(AVX_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256 a;
        a = _mm256_set1_ps(0);
        return 0;
    }
")

set(AVX512_CODE "
    #include <immintrin.h>
    int main()
    {
        __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0);
        __m512i b = a;
        __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
        return 0;
    }
")

set(AVX2_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256i a = {0};
        a = _mm256_abs_epi16(a);
        __m256i x;
        _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
        return 0;
    }
")

set(FMA_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256 acc = _mm256_setzero_ps();
        const __m256 d = _mm256_setzero_ps();
        const __m256 p = _mm256_setzero_ps();
        acc = _mm256_fmadd_ps( d, p, acc );
        return 0;
    }
")

macro(check_sse type flags)
    set(__FLAG_I 1)
    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
    foreach (__FLAG ${flags})
        if (NOT ${type}_FOUND)
            set(CMAKE_REQUIRED_FLAGS ${__FLAG})
            check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
            if (HAS_${type}_${__FLAG_I})
                set(${type}_FOUND TRUE CACHE BOOL "${type} support")
                set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
            endif()
            math(EXPR __FLAG_I "${__FLAG_I}+1")
        endif()
    endforeach()
    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})

    if (NOT ${type}_FOUND)
        set(${type}_FOUND FALSE CACHE BOOL "${type} support")
        set(${type}_FLAGS "" CACHE STRING "${type} flags")
    endif()

    mark_as_advanced(${type}_FOUND ${type}_FLAGS)
endmacro()

# flags are for MSVC only!
check_sse("AVX" " ;/arch:AVX")
if (NOT ${AVX_FOUND})
    set(LLAMA_AVX OFF)
else()
    set(LLAMA_AVX ON)
endif()

check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2")
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
    set(LLAMA_AVX2 OFF)
else()
    set(LLAMA_AVX2 ON)
endif()

check_sse("AVX512" " ;/arch:AVX512")
if (NOT ${AVX512_FOUND})
    set(LLAMA_AVX512 OFF)
else()
    set(LLAMA_AVX512 ON)
endif()


================================================
FILE: kt-kernel/cpu_backend/cpuinfer.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-16 10:43:18
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-08-07 09:47:43
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_CPUINFER_H
#define CPUINFER_CPUINFER_H

#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>
#ifdef KTRANSFORMERS_USE_CUDA
#include "vendors/cuda.h"
#elif KTRANSFORMERS_USE_MUSA
#include "vendors/musa.h"
#elif KTRANSFORMERS_USE_ROCM
#define __HIP_PLATFORM_AMD__
#include "vendors/hip.h"
#endif

#include "./vendors/vendor.h"
#include "llama.cpp/ggml-impl.h"
#include "task_queue.h"
#include "worker_pool.h"

class CPUInfer {
 public:
  CPUInfer(int thread_num) {
    printf("CPUInfer[0x%lx]: Hello\n", (intptr_t)this);
    backend_ = new WorkerPool(thread_num);
    task_queue_ = new TaskQueue();
    for (int i = 0; i < (1 << 16); ++i) {
      ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i);
    }
  }
  CPUInfer(int thread_num, int numa_id) {
    printf("CPUInfer[0x%lx]: Hello\n", (intptr_t)this);
    backend_ = new WorkerPool(thread_num, numa_id);
    task_queue_ = new TaskQueue();
    for (int i = 0; i < (1 << 16); ++i) {
      ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i);
    }
  }

  CPUInfer(WorkerPoolConfig config) {
    printf("CPUInfer[0x%lx]: Hello\n", (intptr_t)this);
    backend_ = new WorkerPool(config);
    task_queue_ = new TaskQueue();
    for (int i = 0; i < (1 << 16); ++i) {
      ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i);
    }
  }

  ~CPUInfer() {
    printf("CPUInfer[0x%lx]: Goodbye\n", (intptr_t)this);
    delete backend_;
    delete task_queue_;
  }

  CPUInfer(const CPUInfer&) = delete;
  CPUInfer& operator=(const CPUInfer&) = delete;
  CPUInfer(CPUInfer&&) = delete;
  CPUInfer& operator=(CPUInfer&&) = delete;

  template <typename Func, typename Obj, typename... Args>
  void enqueue(Func f, Obj* obj, Args... args) {
    task_queue_->enqueue([=]() { std::invoke(f, *obj, args...); });
  }

  void submit(std::pair<intptr_t, intptr_t> params) {
    void (*func)(void*) = (void (*)(void*))params.first;
    void* args = (void*)params.second;
    *((CPUInfer**)args) = this;
    func(args);
  }
#ifndef KTRANSFORMERS_CPU_ONLY
  void submit_with_cuda_stream(intptr_t user_cuda_stream, std::pair<intptr_t, intptr_t> params) {
#if defined(KTRANSFORMERS_USE_CUDA)
    void (*func)(void*) = (void (*)(void*))params.first;
    void* args = (void*)params.second;
    *((CPUInfer**)args) = this;
    cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)func, args);
#endif
  }
#endif

  struct SyncArgs {
    CPUInfer* cpuinfer;
    size_t allow_n_pending;
  };

  static void sync_(void* sync_args) {
    SyncArgs* args = (SyncArgs*)sync_args;
    args->cpuinfer->task_queue_->sync(args->allow_n_pending);
  }

  void sync(size_t allow_n_pending = 0) {
    SyncArgs* args = new SyncArgs{this, allow_n_pending};
    sync_(args);
  }
#ifndef KTRANSFORMERS_CPU_ONLY
  void sync_with_cuda_stream(intptr_t user_cuda_stream, size_t allow_n_pending = 0) {
#if defined(KTRANSFORMERS_USE_CUDA)
    SyncArgs* args = new SyncArgs{this, allow_n_pending};
    cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)&sync_, (void*)args);
#endif
  }
#endif
 public:
  WorkerPool* backend_;
  TaskQueue* task_queue_;
};

#endif

================================================
FILE: kt-kernel/cpu_backend/shared_mem_buffer.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-08-05 04:49:08
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-08-05 09:21:29
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "shared_mem_buffer.h"

#include <errno.h>
#include <numa.h>

#include <cstdio>

size_t MemoryRequest::total_size() {
  size_t total = 0;
  for (size_t i = 0; i < sizes.size(); ++i) {
    total += sizes[i];
  }
  return total;
}

void MemoryRequest::update_base_ptr(void* base) {
  size_t total_offset = 0;
  for (size_t i = 0; i < funcs.size(); ++i) {
    funcs[i]((uint8_t*)base + total_offset);
    total_offset += sizes[i];
  }
}

void MemoryRequest::append_function(std::function<void(void*)> func, size_t size) {
  funcs.push_back(func);
  sizes.push_back(size);
}

SharedMemBuffer::SharedMemBuffer() {
  buffer = nullptr;
  size = 0;
}

SharedMemBuffer::~SharedMemBuffer() {
  if (buffer) {
    free(buffer);
  }
}

void SharedMemBuffer::alloc(void* object, MemoryRequest requests) {
  size_t total_size = requests.total_size();
  object_requests.push_back(requests);

  if (total_size > size) {
    if (buffer) {
      free(buffer);
    }
    void* newbuf = nullptr;
    int rc = posix_memalign(&newbuf, 64, total_size);
    if (rc != 0 || !newbuf) {
      errno = rc;  // posix_memalign returns error code instead of setting errno
      printf("cannot aligned alloc %zu bytes (align=%d)\n", (size_t)total_size, 64);
      perror("posix_memalign");  // ENOMEM/EINVAL
      exit(1);
    }
    buffer = newbuf;
    size = total_size;
    for (auto& req : object_requests) {
      req.update_base_ptr(buffer);
    }
  } else {
    requests.update_base_ptr(buffer);
  }
}

void SharedMemBufferNuma::alloc(int numa, void* object, MemoryRequest requests) {
  std::lock_guard<std::mutex> guard(lock);
  if (numa != numa_node_of_cpu(sched_getcpu())) {
    printf("alloc %d from other numa for %lx\n", numa, reinterpret_cast<intptr_t>(object));
  }
  if (numa_mem.count(numa) == 0) {
    numa_mem[numa] = std::unique_ptr<SharedMemBuffer>(new SharedMemBuffer());
  }
  // printf("numa %d alloc for %lx\n", numa,reinterpret_cast<intptr_t> (object));
  numa_mem.at(numa)->alloc(object, requests);
}


================================================
FILE: kt-kernel/cpu_backend/shared_mem_buffer.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-08-05 04:49:08
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-08-05 06:36:41
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#ifndef CPUINFER_SHAREDMEMBUFFER_H
#define CPUINFER_SHAREDMEMBUFFER_H

#include <cstdint>
#include <cstdlib>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <variant>
#include <vector>

struct MemoryRequest {
  std::vector<std::function<void(void*)>> funcs;
  std::vector<size_t> sizes;

  size_t total_size();
  void update_base_ptr(void* base);

  template <typename T>
  void append_pointer(T** ptr, size_t size) {
    append_function([ptr](void* base) { *ptr = reinterpret_cast<T*>(base); }, size);
  }
  void append_function(std::function<void(void*)> func, size_t size);
};

class SharedMemBuffer {
 public:
  SharedMemBuffer();
  ~SharedMemBuffer();

  void alloc(void* object, MemoryRequest requests);

 private:
  void* buffer;
  uint64_t size;
  std::vector<MemoryRequest> object_requests;
};

static SharedMemBuffer shared_mem_buffer;
static SharedMemBuffer shared_mem_buffer_for_decoder_layer;

class SharedMemBufferNuma {
  std::mutex lock;
  std::map<size_t, std::unique_ptr<SharedMemBuffer>> numa_mem;

 public:
  void alloc(int numa, void* object, MemoryRequest requests);
};

static SharedMemBufferNuma shared_mem_buffer_numa;

#endif

================================================
FILE: kt-kernel/cpu_backend/task_queue.cpp
================================================
/**
 * @Description :
 * @Author    : chenht2022
 * @Date     : 2024-07-17 12:25:51
 * @Version   : 1.0.0
 * @LastEditors : chenht2022
 * @LastEditTime : 2024-10-09 11:08:10
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "task_queue.h"

#include <pthread.h>
#include <sched.h>

#include <chrono>
#include <iostream>
#include <thread>

TaskQueue::TaskQueue() : done(false), pending(0) {
  Node* dummy = new Node();
  head.store(dummy, std::memory_order_relaxed);
  tail.store(dummy, std::memory_order_relaxed);
  workerThread = std::thread(&TaskQueue::worker, this);
}

TaskQueue::~TaskQueue() {
  done.store(true, std::memory_order_release);
  if (workerThread.joinable()) workerThread.join();

  Node* node = head.load(std::memory_order_relaxed);
  while (node) {
    Node* next = node->next.load(std::memory_order_relaxed);
    delete node;
    node = next;
  }
}

void TaskQueue::enqueue(std::function<void()> task) {
  pending.fetch_add(1, std::memory_order_acq_rel);
  Node* node = new Node(task);
  Node* prev = tail.exchange(node, std::memory_order_acq_rel);
  prev->next.store(node, std::memory_order_release);
}

void TaskQueue::sync(size_t allow_n_pending) {
  // Spin until the pending task count drops to the allowed threshold.
  while (pending.load(std::memory_order_acquire) > allow_n_pending);
}

void TaskQueue::worker() {
  Node* curr = head.load(std::memory_order_relaxed);
  while (!done.load(std::memory_order_acquire)) {
    Node* next = curr->next.load(std::memory_order_acquire);
    if (next) {
      if (next->task) {
        next->task();
      }
      delete curr;
      curr = next;
      head.store(curr, std::memory_order_release);
      pending.fetch_sub(1, std::memory_order_acq_rel);
    }
  }
}

================================================
FILE: kt-kernel/cpu_backend/task_queue.h
================================================
/**
 * @Description :
 * @Author    : chenht2022
 * @Date     : 2024-07-16 10:43:18
 * @Version   : 1.0.0
 * @LastEditors : chenht
 * @LastEditTime : 2024-10-09 11:08:07
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_TASKQUEUE_H
#define CPUINFER_TASKQUEUE_H

#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class TaskQueue {
 public:
  TaskQueue();
  ~TaskQueue();

  void enqueue(std::function<void()>);

  void sync(size_t allow_n_pending);

 private:
  struct Node {
    std::function<void()> task;
    std::atomic<Node*> next;
    Node() : task(nullptr), next(nullptr) {}
    Node(const std::function<void()>& t) : task(t), next(nullptr) {}
  };

  std::atomic<Node*> head;
  std::atomic<Node*> tail;
  std::atomic<bool> done;
  std::atomic<size_t> pending;
  std::thread workerThread;

  void worker();
};

#endif

================================================
FILE: kt-kernel/cpu_backend/vendors/README.md
================================================
## TODO

This directory can be removed after updating the version of `llama.cpp`.

================================================
FILE: kt-kernel/cpu_backend/vendors/cuda.h
================================================
#pragma once

#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif  // CUDART_VERSION < 11020


================================================
FILE: kt-kernel/cpu_backend/vendors/hip.h
================================================
#pragma once

#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
#include <hip/hip_bfloat16.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif  // __HIP_PLATFORM_AMD__

#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn)                                              \
  {                                                               \
    hipError_t err = fn;                                          \
    if (err != hipSuccess) {                                      \
      GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); \
    }                                                             \
  }
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t  // deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cublasOperation_t hipblasOperation_t
#define cudaDataType_t hipblasDatatype_t  // deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cuDeviceGet hipDeviceGet
#define CUdevice hipDevice_t
#define CUdeviceptr hipDeviceptr_t
#define cuMemUnmap hipMemUnmap
#define CUmemAccessDesc hipMemAccessDesc
#define cuMemAddressFree hipMemAddressFree
#define cuMemRelease hipMemRelease
#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
#define cuMemCreate hipMemCreate
#define cuMemAddressReserve hipMemAddressReserve
#define cuMemMap hipMemMap
#define cuMemSetAccess hipMemSetAccess
#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
#define CUmemAllocationProp hipMemAllocationProp
#define cuDeviceGetAttribute hipDeviceGetAttribute
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaGraphExec_t hipGraphExec_t
#define cudaGraphNode_t hipGraphNode_t
#define cudaKernelNodeParams hipKernelNodeParams
#define cudaKernelNodeParams hipKernelNodeParams
#define cudaGraphExecDestroy hipGraphExecDestroy
#define cudaGraphLaunch hipGraphLaunch
#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
#define cudaGraphNodeType hipGraphNodeType
#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
#define cudaGraphInstantiate hipGraphInstantiate
#define cudaStreamEndCapture hipStreamEndCapture
#define cudaGraphDestroy hipGraphDestroy
#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
#define cudaGraphNodeGetType hipGraphNodeGetType
#define cudaGraphGetNodes hipGraphGetNodes
#define cudaGraphExecUpdate hipGraphExecUpdate
#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
#define cudaStreamBeginCapture hipStreamBeginCapture
#define cudaGraph_t hipGraph_t
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define cudaHostFn_t hipHostFn_t
#define __trap()             \
  do {                       \
    abort();                 \
    __builtin_unreachable(); \
  } while (0)
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED

#define __CUDA_ARCH__ 1300

#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
#define GCN
#endif

#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
#define CDNA
#endif

#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif

#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif

#if defined(__gfx1010__) || defined(__gfx1012__)
#define RDNA1
#endif

#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

typedef hip_bfloat16 nv_bfloat16;


================================================
FILE: kt-kernel/cpu_backend/vendors/musa.h
================================================
#pragma once

#include <mublas.h>
#include <musa.h>
#include <musa_bf16.h>
#include <musa_fp16.h>
#include <musa_runtime.h>
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
#define CUDA_R_16F MUSA_R_16F
#define CUDA_R_32F MUSA_R_32F
#define cublasComputeType_t cudaDataType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy
#define cublasGemmEx mublasGemmEx
#define cublasGemmBatchedEx mublasGemmBatchedEx
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
#define cublasHandle_t mublasHandle_t
#define cublasSetMathMode mublasSetMathMode
#define cublasSetStream mublasSetStream
#define cublasSgemm mublasSgemm
#define cublasStatus_t mublasStatus_t
#define cublasOperation_t mublasOperation_t
#define cublasGetStatusString mublasStatus_to_string
#define cudaDataType_t musaDataType_t
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags
#define cudaEventDisableTiming musaEventDisableTiming
#define cudaEventRecord musaEventRecord
#define cudaEventSynchronize musaEventSynchronize
#define cudaEvent_t musaEvent_t
#define cudaEventDestroy musaEventDestroy
#define cudaFree musaFree
#define cudaFreeHost musaFreeHost
#define cudaGetDevice musaGetDevice
#define cudaGetDeviceCount musaGetDeviceCount
#define cudaGetDeviceProperties musaGetDeviceProperties
#define cudaGetErrorString musaGetErrorString
#define cudaGetLastError musaGetLastError
#define cudaHostRegister musaHostRegister
#define cudaHostRegisterPortable musaHostRegisterPortable
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
#define cudaHostUnregister musaHostUnregister
#define cudaLaunchHostFunc musaLaunchHostFunc
#define cudaMalloc musaMalloc
#define cudaMallocHost musaMallocHost
#define cudaMallocManaged musaMallocManaged
#define cudaMemcpy musaMemcpy
#define cudaMemcpyAsync musaMemcpyAsync
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
#define cudaMemcpy2DAsync musaMemcpy2DAsync
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
#define cudaMemcpyKind musaMemcpyKind
#define cudaMemset musaMemset
#define cudaMemsetAsync musaMemsetAsync
#define cudaMemGetInfo musaMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
#define cudaSetDevice musaSetDevice
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
#define cudaStreamDestroy musaStreamDestroy
#define cudaStreamFireAndForget musaStreamFireAndForget
#define cudaStreamNonBlocking musaStreamNonBlocking
#define cudaStreamPerThread musaStreamPerThread
#define cudaStreamSynchronize musaStreamSynchronize
#define cudaStreamWaitEvent musaStreamWaitEvent
#define cudaStream_t musaStream_t
#define cudaHostFn_t musaHostFn_t
#define nv_bfloat16 mt_bfloat16
#define cudaSuccess musaSuccess

// Additional mappings for MUSA virtual memory pool
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
#define CUdevice MUdevice
#define CUdeviceptr MUdeviceptr
#define CUmemAccessDesc MUmemAccessDesc
#define CUmemAllocationProp MUmemAllocationProp
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
#define cuDeviceGet muDeviceGet
#define cuDeviceGetAttribute muDeviceGetAttribute
#define cuMemAddressFree muMemAddressFree
#define cuMemAddressReserve muMemAddressReserve
#define cuMemCreate muMemCreate
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
#define cuMemMap muMemMap
#define cuMemRelease muMemRelease
#define cuMemSetAccess muMemSetAccess
#define cuMemUnmap muMemUnmap
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
#define cudaFuncSetAttribute musaFuncSetAttribute
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
#define make_cudaExtent make_musaExtent
#define make_cudaPitchedPtr make_musaPitchedPtr

// Additional mappings for MUSA graphs
#define CUDA_SUCCESS MUSA_SUCCESS
#define CUresult MUresult
#define cuGetErrorString muGetErrorString
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
#define cudaGraphDestroy musaGraphDestroy
#define cudaGraphExecDestroy musaGraphExecDestroy
#define cudaGraphExec_t musaGraphExec_t
#define cudaGraphExecUpdate musaGraphExecUpdate
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
#define cudaGraphGetNodes musaGraphGetNodes
#define cudaGraphInstantiate musaGraphInstantiate
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
#define cudaGraphLaunch musaGraphLaunch
#define cudaGraphNodeGetType musaGraphNodeGetType
#define cudaGraphNode_t musaGraphNode_t
#define cudaGraphNodeType musaGraphNodeType
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
#define cudaGraph_t musaGraph_t
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture

typedef mt_bfloat16 nv_bfloat16;


================================================
FILE: kt-kernel/cpu_backend/vendors/vendor.h
================================================
#ifndef CPUINFER_VENDOR_VENDOR_H
#define CPUINFER_VENDOR_VENDOR_H

#ifdef USE_CUDA
#include "cuda.h"
#elif USE_HIP
#define __HIP_PLATFORM_AMD__
#include "hip.h"
#elif USE_MUSA
#include "musa.h"
#endif

#endif  // CPUINFER_VENDOR_VENDOR_H

================================================
FILE: kt-kernel/cpu_backend/worker_pool.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:05
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:33:34
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "worker_pool.h"

#include <hwloc/bitmap.h>
#include <numa.h>
#include <numaif.h>

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdio>
#include <stdexcept>

#include "hwloc.h"

thread_local int WorkerPool::thread_local_id = -1;

InNumaPool::InNumaPool(int max_thread_num) {
  printf("In Numa Worker Pool at NUMA %d, %d threads\n", numa_node_of_cpu(sched_getcpu()), max_thread_num);
  total_worker_count = max_thread_num;
  set_restricted_worker_count(total_worker_count);
  thread_state_ = std::unique_ptr<ThreadState[]>(new ThreadState[max_thread_num]);
  for (int i = 0; i < total_worker_count; i++) {
    thread_state_[i].status.store(ThreadStatus::WAITING, std::memory_order_release);
  }
  workers_.resize(total_worker_count);
  for (int i = 1; i < total_worker_count; i++) {
    workers_[i] = std::thread(&InNumaPool::worker_thread, this, i, -1);
  }
}

InNumaPool::InNumaPool(int max_thread_num, int numa_id, int threads_id_start) {
  printf("===========In NumaPool============\n");
  hwloc_topology_t topology;
  hwloc_obj_t numa_obj, core_obj;
  hwloc_bitmap_t cpuset;
  hwloc_topology_init(&topology);
  hwloc_topology_load(topology);
  printf("In Numa Worker Pool at NUMA %d, %d threads\n", numa_node_of_cpu(sched_getcpu()), max_thread_num);
  total_worker_count = max_thread_num;
  set_restricted_worker_count(total_worker_count);
  thread_state_ = std::unique_ptr<ThreadState[]>(new ThreadState[max_thread_num]);
  for (int i = 0; i < total_worker_count; i++) {
    thread_state_[i].status.store(ThreadStatus::WAITING, std::memory_order_release);
  }
  workers_.resize(total_worker_count);
  for (int i = 1; i < total_worker_count; i++) {
    workers_[i] = std::thread(&InNumaPool::worker_thread, this, i, numa_id);
    // set the thread name as: "numa_(numa_id)_t_(i+threads_id_start)"
    std::string thread_name = "numa_" + std::to_string(numa_id) + "_t_" + std::to_string(i + threads_id_start);
    pthread_t native_handle = workers_[i].native_handle();
    auto res_set_name = pthread_setname_np(native_handle, thread_name.c_str());
    if (res_set_name != 0) {
      fprintf(stderr, "Failed to set thread name: %s\n", strerror(res_set_name));
    }
    // 检查线程是否成功命名
    char name[16];
    pthread_getname_np(native_handle, name, sizeof(name));
    if (strcmp(name, thread_name.c_str()) == 0) {
      // printf("Thread name set successfully: %s\n", name);
    } else {
      // printf("Failed to set thread name: %s\n", name);
    }
    // Set the thread affinity to the specified NUMA node's CPU
    numa_obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, numa_id);
    if (!numa_obj) {
      fprintf(stderr, "NUMA node %d not found\n", numa_id);
      // throw std::runtime_error("NUMA node not found");
      continue;
    }
    core_obj = hwloc_get_obj_inside_cpuset_by_type(topology, numa_obj->cpuset, HWLOC_OBJ_CORE, i + threads_id_start);
    if (!core_obj) {
      fprintf(stderr, "Core %d inside NUMA node %d not found\n", i, numa_id);
      // throw std::runtime_error("Core not found inside NUMA node");
      continue;
    }
    cpuset = hwloc_bitmap_alloc();
    hwloc_bitmap_copy(cpuset, core_obj->cpuset);
    hwloc_bitmap_singlify(cpuset);
    auto res = hwloc_set_thread_cpubind(topology, native_handle, cpuset, HWLOC_CPUBIND_STRICT);
    if (res != 0) {
      fprintf(stderr, "Failed to set thread CPU binding: %s\n", strerror(errno));
    }
  }
}

InNumaPool::~InNumaPool() {
  for (int i = 0; i < total_worker_count; i++) {
    thread_state_[i].status.store(ThreadStatus::EXIT, std::memory_order_release);
  }
  for (int i = 0; i < total_worker_count; i++) {
    if (workers_[i].joinable()) {
      workers_[i].join();
    }
  }
}

int InNumaPool::get_thread_num() {
  throw std::runtime_error("Deprecated");
  return total_worker_count;
}

void InNumaPool::set_restricted_worker_count(int count) { restricted_worker_count = count; }

void InNumaPool::wait() {
  for (int i = 0; i < worker_count; i++) {
    while (thread_state_[i].status.load(std::memory_order_acquire) == ThreadStatus::WORKING) {
    }
  }

#ifdef PROFILE_BALANCE
  size_t max_time = 0;
  size_t min_time = thread_state_[0].finish_ns;
  size_t sum = 0;
  for (int i = 0; i < worker_count; i++) {
    sum += thread_state_[i].finish_ns;
    max_time = std::max(max_time, thread_state_[i].finish_ns);
    min_time = std::min(min_time, thread_state_[i].finish_ns);
  }
  double balance = 1.0 * sum / (max_time * worker_count);
  printf("max_time: %ld, min_time: %ld, sum_time: %ld, balance: %f\n", max_time, min_time, sum, balance);

#endif
}

void InNumaPool::do_work_stealing_job(int task_num, std::function<void(int)> compute_func) {
  do_work_stealing_job(task_num, nullptr, compute_func, nullptr);
}

void InNumaPool::do_work_stealing_job(int task_num, std::function<void(int)> init_func,
                                      std::function<void(int)> compute_func, std::function<void(int)> finalize_func) {
  do_work_stealing_job_async(task_num, init_func, compute_func, finalize_func);
  wait();
}

void InNumaPool::do_work_stealing_job_async(int task_num, std::function<void(int)> init_func,
                                            std::function<void(int)> compute_func,
                                            std::function<void(int)> finalize_func) {
  init_func_ = init_func;
  compute_func_ = compute_func;
  finalize_func_ = finalize_func;
  worker_count = std::min(restricted_worker_count, task_num);
  curr_.store(0, std::memory_order_release);
  end_ = task_num;
  for (int i = 0; i < worker_count; i++) {
    thread_state_[i].status.store(ThreadStatus::WORKING, std::memory_order_release);
  }
  WorkerPool::thread_local_id = 0;
  process_tasks(0);
}

void InNumaPool::process_tasks(int thread_id) {
#ifdef PROFILE_BALANCE
  auto start = std::chrono::high_resolution_clock::now();
#endif
  auto& s = thread_state_[thread_id];
  if (init_func_ != nullptr) {
    init_func_(thread_id);
  }

  // omp-guided-style work scheduling
  while (true) {
    int old = curr_.load(std::memory_order_relaxed);
    int rem = end_ - old;
    if (rem <= 0) {
      break;
    }

    int block = (rem + worker_count - 1) / worker_count;
    block = 1;
    int task_id = curr_.fetch_add(block, std::memory_order_acq_rel);
    if (task_id >= end_) {
      break;
    }

    for (int i = 0; i < block; i++) {
      if (task_id + i >= end_) {
        break;
      }
      compute_func_(task_id + i);
    }
  }

  if (finalize_func_ != nullptr) {
    finalize_func_(thread_id);
  }

  s.status.store(ThreadStatus::WAITING, std::memory_order_release);
#ifdef PROFILE_BALANCE
  s.finish_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now() - start).count();
#endif
}

void InNumaPool::worker_thread(int thread_id, int numa_id) {
  if (numa_id >= 0) {
    set_memory_to_numa(numa_id);
  }
  auto start = std::chrono::high_resolution_clock::now();
  WorkerPool::thread_local_id = thread_id;  // 设置线程本地变量
  while (true) {
    ThreadStatus status = thread_state_[thread_id].status.load(std::memory_order_acquire);
    if (status == ThreadStatus::WORKING) {
      process_tasks(thread_id);
      start = std::chrono::high_resolution_clock::now();
    } else if (status == ThreadStatus::WAITING) {
      auto now = std::chrono::high_resolution_clock::now();
      auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(now - start).count();
      if (duration > 50) {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
      }
    } else if (status == ThreadStatus::EXIT) {
      return;
    }
  }
}

NumaJobDistributor::NumaJobDistributor(int numa_count) {
  std::vector<int> numa_ids;
  for (int i = 0; i < numa_count; i++) {
    numa_ids.push_back(i);
  }
  init(numa_ids);
}

NumaJobDistributor::NumaJobDistributor(std::vector<int> numa_ids) { init(numa_ids); }
NumaJobDistributor::NumaJobDistributor(std::vector<int> numa_ids, std::vector<int> thread_count) {
  init(numa_ids, thread_count);
}

void NumaJobDistributor::init(std::vector<int> numa_ids) {
  this->numa_count = numa_ids.size();
  this->ready_bar = std::unique_ptr<std::barrier<>>(new std::barrier<>(numa_count + 1));
  this->numa_ids = numa_ids;
  for (size_t i = 0; i < numa_count; i++) {
    status.push_back(nullptr);
  }

  workers.resize(numa_count);
  for (int i = 0; i < numa_count; i++) {
    std::thread([this, i]() { workers[i] = std::thread(&NumaJobDistributor::worker_thread, this, i); }).join();
  }
  ready_bar->arrive_and_wait();
}

void NumaJobDistributor::init(std::vector<int> numa_ids, std::vector<int> thread_count) {
  hwloc_topology_t topology;
  hwloc_obj_t numa_obj, core_obj;
  hwloc_bitmap_t cpuset;
  hwloc_topology_init(&topology);
  hwloc_topology_load(topology);

  this->numa_count = numa_ids.size();
  this->ready_bar = std::unique_ptr<std::barrier<>>(new std::barrier<>(numa_count + 1));
  this->numa_ids = numa_ids;
  for (size_t i = 0; i < numa_count; i++) {
    status.push_back(nullptr);
  }

  workers.resize(numa_count);
  std::vector<int> numa_threads_count(numa_count, 0);
  for (int i = 0; i < numa_count; i++) {
    workers[i] = std::thread(&NumaJobDistributor::worker_thread, this, i);
    auto this_numa = numa_ids[i];
    auto start_id = numa_threads_count[this_numa];
    // set the thread name as: "worker_numa_(numa_id)_main_start_id(0)"
    // printf("nuam_id %d, start_id %d\n", this_numa, start_id);
    std::string thread_name = "numa_" + std::to_string(numa_ids[i]) + "_m_" + std::to_string(start_id);
    pthread_t native_handle = workers[i].native_handle();
    pthread_setname_np(native_handle, thread_name.c_str());
    // Set the thread affinity to the specified NUMA node's CPU (0)
    numa_obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, this_numa);
    if (!numa_obj) {
      fprintf(stderr, "NUMA node %d not found\n", this_numa);
      // throw std::runtime_error("NUMA node not found");
      continue;
    }
    core_obj = hwloc_get_obj_inside_cpuset_by_type(topology, numa_obj->cpuset, HWLOC_OBJ_CORE, start_id);
    if (!core_obj) {
      fprintf(stderr, "Core %d inside NUMA node %d not found\n", 0, this_numa);
      // throw std::runtime_error("Core not found inside NUMA node");
      continue;
    }
    // 精简 cpuset
    auto cpuset_simple = hwloc_bitmap_alloc();
    hwloc_bitmap_copy(cpuset_simple, core_obj->cpuset);
    hwloc_bitmap_singlify(cpuset_simple);
    // 打印绑定的具体的 CPU 物理索引
    unsigned long i_in;
    // hwloc_bitmap_foreach_begin(i_in, cpuset_simple) { printf("Thread %d bound to CPU %ld\n", start_id, i_in); }
    // hwloc_bitmap_foreach_end();
    auto res = hwloc_set_thread_cpubind(topology, native_handle, cpuset_simple, HWLOC_CPUBIND_STRICT);
    if (res != 0) {
      fprintf(stderr, "Failed to set thread CPU binding: %s\n", strerror(errno));
    }
    // 检查线程是否绑定到指定的 核上了
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
    hwloc_get_thread_cpubind(topology, native_handle, cpuset, HWLOC_CPUBIND_THREAD);
    // hwloc_bitmap_foreach_begin(i_in, cpuset) { printf("Thread %d is bound to CPU %ld\n", start_id, i_in); }
    // hwloc_bitmap_foreach_end();

    numa_threads_count[this_numa] += thread_count[i];
  }
  ready_bar->arrive_and_wait();
}

NumaJobDistributor::~NumaJobDistributor() {
  for (int i = 0; i < numa_count; i++) {
    status[i]->store(ThreadStatus::EXIT, std::memory_order_release);
  }
  for (int i = 0; i < numa_count; i++) {
    if (workers[i].joinable()) {
      workers[i].join();
    }
  }
}

#ifdef USE_NUMA_JOB_DIRECT_WORK

void NumaJobDistributor::do_numa_job(std::function<void(int)> compute_func) {
  this->compute_func = compute_func;
  auto me_numa = numa_node_of_cpu(sched_getcpu());
  for (int i = 0; i < numa_count; i++) {
    if (i == me_numa) continue;

    status[i]->store(ThreadStatus::WORKING, std::memory_order_release);
  }
  compute_func(me_numa);
  for (int i = 0; i < numa_count; i++) {
    if (i == me_numa) continue;

    while (status[i]->load(std::memory_order_acquire) == ThreadStatus::WORKING) {
    }
  }
}
#else
void NumaJobDistributor::do_numa_job(std::function<void(int)> compute_func) {
  this->compute_func = compute_func;
  for (int i = 0; i < numa_count; i++) {
    status[i]->store(ThreadStatus::WORKING, std::memory_order_release);
  }
  for (int i = 0; i < numa_count; i++) {
    while (status[i]->load(std::memory_order_acquire) == ThreadStatus::WORKING) {
    }
  }
}
#endif

void NumaJobDistributor::worker_thread(int numa_id) {
  auto start = std::chrono::high_resolution_clock::now();
  set_memory_to_numa(numa_id);
  status[numa_id] =
      std::move(std::unique_ptr<std::atomic<ThreadStatus>>(new std::atomic<ThreadStatus>(ThreadStatus::WAITING)));
  ready_bar->arrive_and_wait();
  while (true) {
    auto stat = status[numa_id]->load(std::memory_order_acquire);
    if (stat == ThreadStatus::WORKING) {
      auto me_numa = numa_node_of_cpu(sched_getcpu());
      // printf("numa work on %d, me %d\n", numa_id, me_numa);
      compute_func(numa_id);
      status[numa_id]->store(ThreadStatus::WAITING, std::memory_order_release);
      start = std::chrono::high_resolution_clock::now();
    } else if (stat == ThreadStatus::WAITING) {
      auto now = std::chrono::high_resolution_clock::now();
      auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(now - start).count();
      if (duration > 50) {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
      }
    } else if (stat == ThreadStatus::EXIT) {
      return;
    }
  }
}

void WorkerPool::init(WorkerPoolConfig config) {
  printf("WorkerPool[0x%lx] %d subpools, [numa:threads]", (intptr_t)this, config.subpool_count);
  for (int i = 0; i < config.subpool_count; i++) {
    printf("[%d:%d] ", config.subpool_numa_map[i], config.subpool_thread_count[i]);
  }
  printf("\n");

  for (int i = 0; i < config.subpool_count; i++) {
    numa_worker_pools.push_back(nullptr);
  }
  std::vector<int> numa_threads_count(config.subpool_count, 0);
  for (int i = 0; i < config.subpool_count; i++) {
    auto this_numa = config.subpool_numa_map[i];
    auto this_thread_count = config.subpool_thread_count[i];
    auto this_thread_id_start = numa_threads_count[this_numa];
    std::thread([this, i, this_numa, this_thread_count, this_thread_id_start]() {
      set_to_numa(this_numa);
      numa_worker_pools[i] =
          std::move(std::unique_ptr<InNumaPool>(new InNumaPool(this_thread_count, this_numa, this_thread_id_start)));
      // numa_worker_pools[i] = std::move(std::unique_ptr<InNumaPool>(new InNumaPool(this_thread_count)));
    }).join();
    numa_threads_count[this_numa] += this_thread_count;
  }

  distributor = std::move(std::unique_ptr<NumaJobDistributor>(
      new NumaJobDistributor(config.subpool_numa_map, config.subpool_thread_count)));
  // distributor = std::move(std::unique_ptr<NumaJobDistributor>(new NumaJobDistributor(config.subpool_numa_map)));
}

WorkerPool::WorkerPool(WorkerPoolConfig config) : config(config) { init(config); }

WorkerPool::WorkerPool(int total_threads) {
  config.subpool_count = numa_num_configured_nodes();
  config.subpool_numa_map.resize(config.subpool_count);
  config.subpool_thread_count.resize(config.subpool_count);
  for (int i = 0; i < config.subpool_count; i++) {
    config.subpool_numa_map[i] = i;
    config.subpool_thread_count[i] = total_threads / config.subpool_count;
  }
  init(config);
}

WorkerPool::WorkerPool(int total_threads, int single_numa_id) {
  set_to_numa(single_numa_id);
  config.subpool_count = numa_num_configured_nodes();
  config.subpool_numa_map.resize(config.subpool_count);
  config.subpool_thread_count.resize(config.subpool_count);
  for (int i = 0; i < config.subpool_count; i++) {
    config.subpool_numa_map[i] = single_numa_id;
    config.subpool_thread_count[i] = total_threads / config.subpool_count;
  }
  init(config);
}

WorkerPool::~WorkerPool() {}

int WorkerPool::get_thread_num() { return total_thread_count; }

void WorkerPool::set_restricted_worker_count(int count) {
  for (int i = 0; i < numa_count; i++) {
    numa_worker_pools[i]->set_restricted_worker_count(threads_per_numa);
  }
}

InNumaPool* WorkerPool::get_subpool(int numa_id) { return numa_worker_pools[numa_id].get(); }

NumaJobDistributor* WorkerPool::dispense_backend() { return distributor.get(); }

void WorkerPool::do_work_stealing_job(int task_num, std::function<void(int)> init_func,
                                      std::function<void(int)> compute_func, std::function<void(int)> finalize_func) {
  numa_worker_pools[0]->do_work_stealing_job(task_num, init_func, compute_func, finalize_func);
}

void WorkerPool::do_work_stealing_job(int task_num, std::function<void(int)> compute_func) {
  do_work_stealing_job(task_num, nullptr, compute_func, nullptr);
}


================================================
FILE: kt-kernel/cpu_backend/worker_pool.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:05
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:33:38
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_BACKEND_H
#define CPUINFER_BACKEND_H

#include <hwloc.h>
#include <numa.h>

#include <atomic>
#include <barrier>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>

// #define PROFILE_BALANCE

inline void set_to_numa(int this_numa) {
  struct bitmask* mask = numa_bitmask_alloc(numa_num_configured_nodes());
  numa_bitmask_setbit(mask, this_numa);
  numa_bind(mask);
  numa_bitmask_free(mask);
}

inline void set_memory_to_numa(int this_numa) {
  // printf("Set memory to NUMA %d\n", this_numa);
  hwloc_topology_t topology;
  hwloc_topology_init(&topology);
  hwloc_topology_load(topology);

  hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, this_numa);
  if (!obj) {
    fprintf(stderr, "NUMA node %d not found.\n", this_numa);
    hwloc_topology_destroy(topology);
    return;
  }

  auto ret = hwloc_set_membind(topology, obj->nodeset, HWLOC_MEMBIND_BIND,
                               HWLOC_MEMBIND_THREAD | HWLOC_MEMBIND_STRICT | HWLOC_MEMBIND_BYNODESET);
  if (ret != 0) {
    perror("hwloc_set_membind_nodeset");
  }

  hwloc_topology_destroy(topology);
}

enum ThreadStatus {
  WORKING,
  WAITING,
  EXIT,
};

struct alignas(64) ThreadState {
  std::atomic<ThreadStatus> status;
#ifdef PROFILE_BALANCE
  size_t finish_ns;
#endif
};

class InNumaPool {
 public:
  InNumaPool(int thread_count);
  InNumaPool(int max_thread_num, int numa_id, int threads_id_start);
  ~InNumaPool();
  int get_thread_num();
  void set_restricted_worker_count(int count);

  void do_work_stealing_job_async(int, std::function<void(int)>, std::function<void(int)>, std::function<void(int)>);
  void wait();

  void do_work_stealing_job(int, std::function<void(int)>, std::function<void(int)>, std::function<void(int)>);
  void do_work_stealing_job(int, std::function<void(int)>);

 private:
  int worker_count;
  int total_worker_count;

  std::unique_ptr<ThreadState[]> thread_state_;  // [thread_num]
  std::vector<std::thread> workers_;

  // changed ever time called do_work_stealing_job_async
  int restricted_worker_count;
  std::function<void(int)> init_func_;
  std::function<void(int)> compute_func_;
  std::function<void(int)> finalize_func_;
  std::atomic<int> curr_;
  int end_;

  void process_tasks(int);
  void worker_thread(int, int);
};

class NumaJobDistributor {
 public:
  NumaJobDistributor(int numa_count);
  NumaJobDistributor(std::vector<int> numa_ids);
  NumaJobDistributor(std::vector<int> numa_ids, std::vector<int> thread_count);

  ~NumaJobDistributor();

  void do_numa_job(std::function<void(int)>);

 private:
  void init(std::vector<int> numa_ids);
  void init(std::vector<int> numa_ids, std::vector<int> thread_count);

  std::unique_ptr<std::barrier<>> ready_bar;

  int numa_count;
  std::vector<int> numa_ids;
  std::vector<std::unique_ptr<std::atomic<ThreadStatus>>> status;
  std::function<void(int)> compute_func;
  std::vector<std::thread> workers;

  void worker_thread(int);
};

struct WorkerPoolConfig {
  int subpool_count;
  std::vector<int> subpool_numa_map;
  std::vector<int> subpool_thread_count;
};

class WorkerPool {
 public:
  WorkerPool(int total_thread_count);
  WorkerPool(int total_thread_count, int single_numa_id);
  WorkerPool(WorkerPoolConfig config);
  ~WorkerPool();
  int get_thread_num();
  void set_restricted_worker_count(int count);

  static thread_local int thread_local_id;

  NumaJobDistributor* dispense_backend();

  InNumaPool* get_subpool(int numa_id);

  void do_work_stealing_job(int, std::function<void(int)>, std::function<void(int)>, std::function<void(int)>);
  void do_work_stealing_job(int, std::function<void(int)>);

  WorkerPoolConfig config;

 private:
  void init(WorkerPoolConfig config);

  int total_thread_count;
  int numa_count;
  int threads_per_numa;
  std::unique_ptr<NumaJobDistributor> distributor;

  std::vector<std::unique_ptr<InNumaPool>> numa_worker_pools;
};

#endif


================================================
FILE: kt-kernel/cuda/binding.cpp
================================================
/**
 * @Description  :
 * @Author       : Azure-Tang, Boxin Zhang
 * @Date         : 2024-07-25 13:38:30
 * @Version      : 0.2.2
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "custom_gguf/ops.h"
#ifdef KTRANSFORMERS_USE_CUDA
#include "gptq_marlin/ops.h"
#include "moe/ops.h"
#endif
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/extension.h>
#include <torch/library.h>
#include <torch/torch.h>
// namespace py = pybind11;

PYBIND11_MODULE(KTransformersOps, m) {
  m.def(
      "dequantize_q8_0",
      [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device,
         py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q8_0((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
      },
      "Function to dequantize q8_0 data.", py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"),
      py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

  m.def(
      "dequantize_q6_k",
      [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device,
         py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q6_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
      },
      "Function to dequantize q6_k data.", py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"),
      py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

  m.def(
      "dequantize_q5_k",
      [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device,
         py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q5_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
      },
      "Function to dequantize q5_k data.", py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"),
      py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

  m.def(
      "dequantize_q4_k",
      [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device,
         py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q4_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
      },
      "Function to dequantize q4_k data.", py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"),
      py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

  m.def(
      "dequantize_q3_k",
      [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device,
         py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q3_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
      },
      "Function to dequantize q3_k data.", py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"),
      py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

  m.def(
      "dequantize_q2_k",
      [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device,
         py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q2_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
      },
      "Function to dequantize q2_k data.", py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"),
      py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

  m.def(
      "dequantize_iq4_xs",
      [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device,
         py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_iq4_xs((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
      },
      "Function to dequantize iq4_xs data.", py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"),
      py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

#ifdef KTRANSFORMERS_USE_CUDA
  m.def("gptq_marlin_gemm", &gptq_marlin_gemm, "Function to perform GEMM using Marlin quantization.", py::arg("a"),
        py::arg("b_q_weight"), py::arg("b_scales"), py::arg("g_idx"), py::arg("perm"), py::arg("workspace"),
        py::arg("num_bits"), py::arg("size_m"), py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"));
  m.def("topk_softmax", &topk_softmax, "Function to perform topk_softmax.", py::arg("topk_weights"),
        py::arg("topk_indices"), py::arg("token_expert_indices"), py::arg("gating_output"));
#endif
}


================================================
FILE: kt-kernel/cuda/custom_gguf/dequant.cu
================================================
/*
 * @Description  :  
 * @Author       : Azure-Tang, Boxin Zhang
 * @Date         : 2024-07-25 13:38:30
 * @Version      : 0.2.2
 * Adapted from https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c
 * Copyright (c) 2023-2024 The ggml authors
 * Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
 */
#include <cuda_runtime.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
#include <cstdint>
#include <c10/cuda/CUDAGuard.h>

#ifdef __HIP_PLATFORM_AMD__
typedef hip_bfloat16 nv_bfloat16;
#endif

__global__ void dequantize_q8_0_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
        const int8_t* cur_block = data + block_id * blk_size;
        float scale = __half2float(*((half*)cur_block));
        cur_block += 2;
        for (int i = 0; i < ele_per_blk; i++){
            output_blk[i] = scale * cur_block[i];
        }
    }
}

__global__ void dequantize_q8_0_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
        const int8_t* cur_block = data + block_id * blk_size;
        float scale = __half2float(*((half*)cur_block));
        cur_block += 2;
        for (int i = 0; i < ele_per_blk; i++) {
            output_blk[i] = __float2half(scale * cur_block[i]);
        }
    }
}

__global__ void dequantize_q8_0_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
        const int8_t* cur_block = data + block_id * blk_size;
        float scale = __half2float(*((half*)cur_block));
        cur_block += 2;
        for (int i = 0; i < ele_per_blk; i++) {
            output_blk[i] = __float2bfloat16(scale * cur_block[i]);
        }
    }
}

// __device__ void get_scale_min_k4(int j, const uint8_t * __restrict__ q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
__device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
    if (j < 4) {
        *d = q[j] & 63; *m = q[j + 4] & 63;
    } else {
        *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
        *m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
    }
}

__global__ void dequantize_q2_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 80)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 82)));

        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);

        int is = 0;
        float dl, ml;

        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
                uint8_t sc = *scales;
                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;

                scales = (uint8_t*)(data + block_id * blk_size + (is++));
                sc = *scales;

                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;

                shift += 2;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q2_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 80)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 82)));

        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);

        int is = 0;
        float dl, ml;

        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
                uint8_t sc = *scales;
                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = __float2half(dl * ((int8_t)((q[l] >> shift) & 3)) - ml);

                scales = (uint8_t*)(data + block_id * blk_size + (is++));
                sc = *scales;

                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = __float2half(dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml);

                shift += 2;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q2_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 80)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 82)));

        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);

        int is = 0;
        float dl, ml;

        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
                uint8_t sc = *scales;
                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l] >> shift) & 3)) - ml);

                scales = (uint8_t*)(data + block_id * blk_size + (is++));
                sc = *scales;

                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml);

                shift += 2;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q3_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;    
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);

        uint32_t aux[4];
        const int8_t * scales = (const int8_t*)aux;
        const float d_all = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 108)));

        const uint8_t * __restrict__ q  = (uint8_t*)(data + block_id * blk_size + 32);
        const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
        uint8_t m = 1;


        uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);

        for (int i = 0; i < 3; i++) {  
            aux[i] = 0;  
            for (int j = 0; j < 4; j++) {  
                aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
            }
        }

        uint32_t tmp = aux[2];
        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);

        int is = 0;
        float dl;
        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4));
                }

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4));
                }

                shift += 2;
                m <<= 1;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q3_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;    
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);

        uint32_t aux[4];
        const int8_t * scales = (const int8_t*)aux;
        const float d_all = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 108)));

        const uint8_t * __restrict__ q  = (uint8_t*)(data + block_id * blk_size + 32);
        const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
        uint8_t m = 1;


        uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);

        for (int i = 0; i < 3; i++) {  
            aux[i] = 0;  
            for (int j = 0; j < 4; j++) {  
                aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
            }
        }

        uint32_t tmp = aux[2];
        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);

        int is = 0;
        float dl;
        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = __float2half(dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4)));
                }

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = __float2half(dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4)));
                }

                shift += 2;
                m <<= 1;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q3_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;    
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);

        uint32_t aux[4];
        const int8_t * scales = (const int8_t*)aux;
        const float d_all = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 108)));

        const uint8_t * __restrict__ q  = (uint8_t*)(data + block_id * blk_size + 32);
        const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
        uint8_t m = 1;


        uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);

        for (int i = 0; i < 3; i++) {  
            aux[i] = 0;  
            for (int j = 0; j < 4; j++) {  
                aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
            }
        }

        uint32_t tmp = aux[2];
        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);

        int is = 0;
        float dl;
        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4)));
                }

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4)));
                }

                shift += 2;
                m <<= 1;
            }
            q += 32;
        }
    }
}


__global__ void dequantize_q4_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
        // const uint8_t * q = data[i].qs;
        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 2)));
        int is = 0;
        uint8_t sc, m;
        for (int j = 0; j < ele_per_blk; j += 64) {
            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = d1 * (q[l] & 0xF) - m1;
            for (int l = 0; l < 32; ++l) *output_blk++ = d2 * (q[l]  >> 4) - m2;
            q += 32; is += 2;
        }
    }
}

__global__ void dequantize_q4_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
        // const uint8_t * q = data[i].qs;
        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 2)));
        int is = 0;
        uint8_t sc, m;
        for (int j = 0; j < ele_per_blk; j += 64) {
            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d1 * (q[l] & 0xF) - m1);
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d2 * (q[l]  >> 4) - m2);
            q += 32; is += 2;
        }
    }
}

__global__ void dequantize_q4_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
        // const uint8_t * q = data[i].qs;
        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 2)));
        int is = 0;
        uint8_t sc, m;
        for (int j = 0; j < ele_per_blk; j += 64) {
            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d1 * (q[l] & 0xF) - m1);
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d2 * (q[l]  >> 4) - m2);
            q += 32; is += 2;
        }
    }
}

__global__ void dequantize_q5_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 2)));

        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);

        int is = 0;
        uint8_t sc, m;
        uint8_t u1 = 1, u2 = 2;
        uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);

        for (int j = 0; j < 256; j += 64) {
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1;
            for (int l = 0; l < 32; ++l) *output_blk++ = d2 * ((ql[l]  >> 4) + (qh[l] & u2 ? 16 : 0)) - m2;
            ql += 32; is += 2;
            u1 <<= 2; u2 <<= 2;
        }
    }
}

__global__ void dequantize_q5_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 2)));

        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);

        int is = 0;
        uint8_t sc, m;
        uint8_t u1 = 1, u2 = 2;
        uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);

        for (int j = 0; j < 256; j += 64) {
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1);
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d2 * ((ql[l]  >> 4) + (qh[l] & u2 ? 16 : 0)) - m2);
            ql += 32; is += 2;
            u1 <<= 2; u2 <<= 2;
        }
    }
}

__global__ void dequantize_q5_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 2)));

        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);

        int is = 0;
        uint8_t sc, m;
        uint8_t u1 = 1, u2 = 2;
        uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);

        for (int j = 0; j < 256; j += 64) {
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1);
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d2 * ((ql[l]  >> 4) + (qh[l] & u2 ? 16 : 0)) - m2);
            ql += 32; is += 2;
            u1 <<= 2; u2 <<= 2;
        }
    }
}

__global__ void dequantize_q6_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long  block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 208)));

        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
        const int8_t  * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);


        for (int n = 0; n < ele_per_blk; n += 128) {
            for (int l = 0; l < 32; ++l) {
                int is = l/16;
                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int8_t q3 = (int8_t)((ql[l +  0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int8_t q4 = (int8_t)((ql[l + 32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                output_blk[l +  0] = d * sc[is + 0] * q1;
                output_blk[l + 32] = d * sc[is + 2] * q2;
                output_blk[l + 64] = d * sc[is + 4] * q3;
                output_blk[l + 96] = d * sc[is + 6] * q4;
            }
            output_blk += 128;
            ql += 64;
            qh += 32;
            sc += 8;
        }
    }
}

__global__ void dequantize_q6_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long  block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 208)));

        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
        const int8_t  * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);


        for (int n = 0; n < ele_per_blk; n += 128) {
            for (int l = 0; l < 32; ++l) {
                int is = l/16;
                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int8_t q3 = (int8_t)((ql[l +  0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int8_t q4 = (int8_t)((ql[l + 32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                output_blk[l +  0] = __float2half(d * sc[is + 0] * q1);
                output_blk[l + 32] = __float2half(d * sc[is + 2] * q2);
                output_blk[l + 64] = __float2half(d * sc[is + 4] * q3);
                output_blk[l + 96] = __float2half(d * sc[is + 6] * q4);
            }
            output_blk += 128;
            ql += 64;
            qh += 32;
            sc += 8;
        }
    }
}

__global__ void dequantize_q6_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long  block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 208)));

        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
        const int8_t  * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);


        for (int n = 0; n < ele_per_blk; n += 128) {
            for (int l = 0; l < 32; ++l) {
                int is = l/16;
                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int8_t q3 = (int8_t)((ql[l +  0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int8_t q4 = (int8_t)((ql[l + 32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                output_blk[l +  0] = __float2bfloat16(d * sc[is + 0] * q1);
                output_blk[l + 32] = __float2bfloat16(d * sc[is + 2] * q2);
                output_blk[l + 64] = __float2bfloat16(d * sc[is + 4] * q3);
                output_blk[l + 96] = __float2bfloat16(d * sc[is + 6] * q4);
            }
            output_blk += 128;
            ql += 64;
            qh += 32;
            sc += 8;
        }
    }
}

static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

__global__ void dequantize_iq4_xs_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x) {
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size)));
        const uint16_t scales_h = *(reinterpret_cast<const uint16_t*>(data + block_id * blk_size + 2));
        const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
        const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);

        for (int ib = 0; ib < 8; ++ib) {
            const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
            const float dl = d * (ls - 32);
            for (int j = 0; j < 16; ++j) {
                output_blk[j + 0] = dl * kvalues_iq4nl[qs[j] & 0xf];
                output_blk[j + 16] = dl * kvalues_iq4nl[qs[j] >> 4];
            }
            output_blk += 32;
            qs += 16;
        }
    }
}

__global__ void dequantize_iq4_xs_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x) {
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size)));
        const uint16_t scales_h = *(reinterpret_cast<const uint16_t*>(data + block_id * blk_size + 2));
        const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
        const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);

        for (int ib = 0; ib < 8; ++ib) {
            const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
            const float dl = d * (ls - 32);
            for (int j = 0; j < 16; ++j) {
                output_blk[j + 0] = __float2half(dl * kvalues_iq4nl[qs[j] & 0xf]);
                output_blk[j + 16] = __float2half(dl * kvalues_iq4nl[qs[j] >> 4]);
            }
            output_blk += 32;
            qs += 16;
        }
    }
}

__global__ void dequantize_iq4_xs_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x) {
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size)));
        const uint16_t scales_h = *(reinterpret_cast<const uint16_t*>(data + block_id * blk_size + 2));
        const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
        const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);

        for (int ib = 0; ib < 8; ++ib) {
            const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
            const float dl = d * (ls - 32);
            for (int j = 0; j < 16; ++j) {
                output_blk[j + 0] = __float2bfloat16(dl * kvalues_iq4nl[qs[j] & 0xf]);
                output_blk[j + 16] = __float2bfloat16(dl * kvalues_iq4nl[qs[j] >> 4]);
            }
            output_blk += 32;
            qs += 16;
        }
    }
}

torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({ num_bytes }, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({ num_blocks, ele_per_blk }, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q8_0_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q8_0_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q8_0_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }

    cudaDeviceSynchronize();
    return output;
}


torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    // data.numel%blk_size should be 0, else raise err
    int num_blocks = num_bytes / blk_size;

    const at::cuda::OptionalCUDAGuard device_guard(device);
    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, ele_per_blk}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q6_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q6_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q6_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, ele_per_blk}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q5_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q5_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q5_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    // data.numel%blk_size should be 0, else raise err
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, ele_per_blk}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q4_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q4_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q4_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, ele_per_blk}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q3_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q3_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q3_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, ele_per_blk}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q2_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q2_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q2_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, ele_per_blk}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_iq4_xs_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_iq4_xs_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_iq4_xs_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

================================================
FILE: kt-kernel/cuda/custom_gguf/ops.h
================================================
/**
 * @Description  :
 * @Author       : Azure-Tang
 * @Date         : 2024-07-22 09:27:55
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-12 03:48:46
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once

#include <torch/extension.h>
#include <torch/library.h>
#include <torch/torch.h>

torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk,
                              const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk,
                              const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk,
                              const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk,
                              const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk,
                              const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk,
                              const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk,
                                const torch::Device device, const torch::Dtype target_dtype);


================================================
FILE: kt-kernel/cuda/gptq_marlin/gptq_marlin.cu
================================================
/*
 * Modified by Neural Magic
 * Copyright (C) Marlin.2024 Elias Frantar
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Adapted from https://github.com/IST-DASLab/marlin
 */
/*
 * Adapted from  https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
 */
#include "gptq_marlin.cuh"
#include "gptq_marlin_dtypes.cuh"
#include <c10/cuda/CUDAGuard.h>
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)               \
  static_assert(std::is_same<scalar_t, half>::value ||          \
                    std::is_same<scalar_t, nv_bfloat16>::value, \
                "only float16 and bfloat16 is supported");

template <typename T>
inline std::string str(T x) {
  return std::to_string(x);
}

namespace gptq_marlin {

#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800) || defined(__HIP_PLATFORM_AMD__)

__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
                                    int const* __restrict__ perm_int_ptr,
                                    int4* __restrict__ out_int4_ptr, int size_m,
                                    int size_k, int block_rows) {}

template <typename scalar_t,          // compute dtype, half or nv_float16
          const int num_bits,         // number of bits used for weights
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
                                      // threadblock
          const int thread_n_blocks,  // same for n dimension (output)
          const int thread_k_blocks,  // same for k dimension (reduction)
          const int stages,  // number of stages for the async global->shared
                             // fetch pipeline
          const bool has_act_order,    // whether act_order is enabled
          const int group_blocks = -1  // number of consecutive 16x16 blocks
                                       // with a separate quantization scale
          >
__global__ void Marlin(
    const int4* __restrict__ A,  // fp16 input matrix of shape mxk
    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
    int4* __restrict__ C,        // fp16 output buffer of shape mxn
    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
                                          // (k/groupsize)xn
    const int* __restrict__ g_idx,        // int32 group indices of shape k
    int num_groups,  // number of scale groups per output channel
    int prob_m,      // batch dimension m
    int prob_n,      // output dimension n
    int prob_k,      // reduction dimension k
    int* locks       // extra global storage for barrier synchronization
) {}

}  // namespace gptq_marlin

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               torch::Tensor& b_scales, torch::Tensor& g_idx,
                               torch::Tensor& perm, torch::Tensor& workspace,
                               int64_t num_bits, int64_t size_m, int64_t size_n,
                               int64_t size_k, bool is_k_full) {
  TORCH_CHECK_NOT_IMPLEMENTED(false,
                              "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
  return torch::empty({1, 1});
}

#else

// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
// output/accumulation.
template <typename scalar_t>
__device__ inline void mma(const typename ScalarType<scalar_t>::FragA& a_frag,
                           const typename ScalarType<scalar_t>::FragB& frag_b,
                           typename ScalarType<scalar_t>::FragC& frag_c) {
  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
  float* c = reinterpret_cast<float*>(&frag_c);
  if constexpr (std::is_same<scalar_t, half>::value) {
    asm volatile(
        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
          "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
  } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
    asm volatile(
        "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
          "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
  } else {
    STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
  }
}

// Instruction for loading a full 16x16 matrix fragment of operand A from shared
// memory, directly in tensor core layout.
template <typename scalar_t>
__device__ inline void ldsm4(typename ScalarType<scalar_t>::FragA& frag_a,
                             const void* smem_ptr) {
  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
               : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
               : "r"(smem));
}

// Lookup-table based 3-input logical operation; explicitly used for
// dequantization as the compiler does not seem to automatically recognize it in
// all cases.
template <int lut>
__device__ inline int lop3(int a, int b, int c) {
  int res;
  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
               : "=r"(res)
               : "r"(a), "r"(b), "r"(c), "n"(lut));
  return res;
}

// Constructs destination register by taking bytes from 2 sources (based on
// mask)
template <int start_byte, int mask>
__device__ inline uint32_t prmt(uint32_t a) {
  uint32_t res;
  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
               : "=r"(res)
               : "r"(a), "n"(start_byte), "n"(mask));
  return res;
}

// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
// values. We mostly follow the strategy in the link below, with some small
// changes:
// - FP16:
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
// - BF16:
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385
template <typename scalar_t>
__device__ inline typename ScalarType<scalar_t>::FragB dequant_4bit(int q) {
  STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
}

template <>
__device__ inline typename ScalarType<half>::FragB dequant_4bit<half>(int q) {
  const int LO = 0x000f000f;
  const int HI = 0x00f000f0;
  const int EX = 0x64006400;
  // Guarantee that the `(a & b) | c` operations are LOP3s.
  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
  // directly into `SUB` and `ADD`.
  const int SUB = 0x64086408;
  const int MUL = 0x2c002c00;
  const int ADD = 0xd480d480;
  typename ScalarType<half>::FragB frag_b;
  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
                      *reinterpret_cast<const half2*>(&SUB));
  frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
                      *reinterpret_cast<const half2*>(&MUL),
                      *reinterpret_cast<const half2*>(&ADD));
  return frag_b;
}

template <>
__device__ inline typename ScalarType<nv_bfloat16>::FragB
dequant_4bit<nv_bfloat16>(int q) {
  static constexpr uint32_t MASK = 0x000f000f;
  static constexpr uint32_t EX = 0x43004300;

  // Guarantee that the `(a & b) | c` operations are LOP3s.

  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
  q >>= 4;
  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);

  typename ScalarType<nv_bfloat16>::FragB frag_b;
  static constexpr uint32_t MUL = 0x3F803F80;
  static constexpr uint32_t ADD = 0xC308C308;

  frag_b[0] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&lo),
                      *reinterpret_cast<const nv_bfloat162*>(&MUL),
                      *reinterpret_cast<const nv_bfloat162*>(&ADD));
  frag_b[1] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&hi),
                      *reinterpret_cast<const nv_bfloat162*>(&MUL),
                      *reinterpret_cast<const nv_bfloat162*>(&ADD));
  return frag_b;
}

// Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or
// bf16 Reference:
// - FP16:
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
// - BF16:
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175
template <typename scalar_t>
__device__ inline typename ScalarType<scalar_t>::FragB dequant_8bit(int q) {
  STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
}

template <>
__device__ inline typename ScalarType<half>::FragB dequant_8bit<half>(int q) {
  static constexpr uint32_t mask_for_elt_01 = 0x5250;
  static constexpr uint32_t mask_for_elt_23 = 0x5351;
  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;

  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);

  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;

  typename ScalarType<half>::FragB frag_b;
  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
  frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
  return frag_b;
}

template <>
__device__ inline typename ScalarType<nv_bfloat16>::FragB
dequant_8bit<nv_bfloat16>(int q) {
  typename ScalarType<nv_bfloat16>::FragB frag_b;

  float fp32_intermediates[4];
  uint32_t* fp32_intermediates_casted =
      reinterpret_cast<uint32_t*>(fp32_intermediates);

  static constexpr uint32_t fp32_base = 0x4B000000;
  fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
  fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
  fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
  fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);

  fp32_intermediates[0] -= 8388736.f;
  fp32_intermediates[1] -= 8388736.f;
  fp32_intermediates[2] -= 8388736.f;
  fp32_intermediates[3] -= 8388736.f;

  uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(&frag_b);
  bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
                                   fp32_intermediates_casted[1], 0x7632);
  bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
                                   fp32_intermediates_casted[3], 0x7632);

  return frag_b;
}

// Multiply dequantized values by the corresponding quantization scale; used
// only for grouped quantization.
template <typename scalar_t>
__device__ inline void scale(typename ScalarType<scalar_t>::FragB& frag_b,
                             typename ScalarType<scalar_t>::FragS& frag_s,
                             int i) {
  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
  scalar_t2 s =
      ScalarType<scalar_t>::num2num2(reinterpret_cast<scalar_t*>(&frag_s)[i]);
  frag_b[0] = __hmul2(frag_b[0], s);
  frag_b[1] = __hmul2(frag_b[1], s);
}

// Same as above, but for act_order (each K is multiplied individually)
template <typename scalar_t>
__device__ inline void scale4(typename ScalarType<scalar_t>::FragB& frag_b,
                              typename ScalarType<scalar_t>::FragS& frag_s_1,
                              typename ScalarType<scalar_t>::FragS& frag_s_2,
                              typename ScalarType<scalar_t>::FragS& frag_s_3,
                              typename ScalarType<scalar_t>::FragS& frag_s_4,
                              int i) {
  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
  scalar_t2 s_val_1_2;
  s_val_1_2.x = reinterpret_cast<scalar_t*>(&frag_s_1)[i];
  s_val_1_2.y = reinterpret_cast<scalar_t*>(&frag_s_2)[i];

  scalar_t2 s_val_3_4;
  s_val_3_4.x = reinterpret_cast<scalar_t*>(&frag_s_3)[i];
  s_val_3_4.y = reinterpret_cast<scalar_t*>(&frag_s_4)[i];

  frag_b[0] = __hmul2(frag_b[0], s_val_1_2);
  frag_b[1] = __hmul2(frag_b[1], s_val_3_4);
}

// Given 2 floats multiply by 2 scales (halves)
template <typename scalar_t>
__device__ inline void scale_float(float* c,
                                   typename ScalarType<scalar_t>::FragS& s) {
  scalar_t* s_ptr = reinterpret_cast<scalar_t*>(&s);
  c[0] = __fmul_rn(c[0], ScalarType<scalar_t>::num2float(s_ptr[0]));
  c[1] = __fmul_rn(c[1], ScalarType<scalar_t>::num2float(s_ptr[1]));
}

// Wait until barrier reaches `count`, then lock for current threadblock.
__device__ inline void barrier_acquire(int* lock, int count) {
  if (threadIdx.x == 0) {
    int state = -1;
    do
      // Guarantee that subsequent writes by this threadblock will be visible
      // globally.
      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
                   : "=r"(state)
                   : "l"(lock));
    while (state != count);
  }
  __syncthreads();
}

// Release barrier and increment visitation count.
__device__ inline void barrier_release(int* lock, bool reset = false) {
  __syncthreads();
  if (threadIdx.x == 0) {
    if (reset) {
      lock[0] = 0;
      return;
    }
    int val = 1;
    // Make sure that all writes since acquiring this barrier are visible
    // globally, while releasing the barrier.
    asm volatile("fence.acq_rel.gpu;\n");
    asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
                 :
                 : "l"(lock), "r"(val));
  }
}

// For a given "a" of size [M,K] performs a permutation of the K columns based
// on the given "perm" indices.
__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
                                    int const* __restrict__ perm_int_ptr,
                                    int4* __restrict__ out_int4_ptr, int size_m,
                                    int size_k, int block_rows) {
  int start_row = block_rows * blockIdx.x;
  int finish_row = start_row + block_rows;
  if (finish_row > size_m) {
    finish_row = size_m;
  }
  int cur_block_rows = finish_row - start_row;

  int row_stride = size_k * sizeof(half) / 16;

  auto permute_row = [&](int row) {
    int iters = size_k / default_threads;
    int rest = size_k % default_threads;

    int offset = row * row_stride;

    half const* a_row_half = reinterpret_cast<half const*>(a_int4_ptr + offset);
    half* out_half = reinterpret_cast<half*>(out_int4_ptr + offset);

    int base_k = 0;

    for (int i = 0; i < iters; i++) {
      int cur_k = base_k + threadIdx.x;
      int src_pos = perm_int_ptr[cur_k];

      out_half[cur_k] = a_row_half[src_pos];

      base_k += default_threads;
    }

    if (rest) {
      if (threadIdx.x < rest) {
        int cur_k = base_k + threadIdx.x;
        int src_pos = perm_int_ptr[cur_k];

        out_half[cur_k] = a_row_half[src_pos];
      }
    }
  };

  for (int i = 0; i < cur_block_rows; i++) {
    int cur_row = start_row + i;
    if (cur_row < size_m) {
      permute_row(cur_row);
    }
  }
}

template <typename scalar_t,          // compute dtype, half or nv_float16
          const int num_bits,         // number of bits used for weights
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
                                      // threadblock
          const int thread_n_blocks,  // same for n dimension (output)
          const int thread_k_blocks,  // same for k dimension (reduction)
          const int stages,  // number of stages for the async global->shared
                             // fetch pipeline
          const bool has_act_order,    // whether act_order is enabled
          const int group_blocks = -1  // number of consecutive 16x16 blocks
                                       // with a separate quantization scale
          >
__global__ void Marlin(
    const int4* __restrict__ A,  // fp16 input matrix of shape mxk
    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
    int4* __restrict__ C,        // fp16 output buffer of shape mxn
    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
                                          // (k/groupsize)xn
    const int* __restrict__ g_idx,        // int32 group indices of shape k
    int num_groups,  // number of scale groups per output channel
    int prob_m,      // batch dimension m
    int prob_n,      // output dimension n
    int prob_k,      // reduction dimension k
    int* locks       // extra global storage for barrier synchronization
) {
  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
  // same size, which might involve multiple column "slices" (of width 16 *
  // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
  // example:
  //   0 1 3
  //   0 2 3
  //   1 2 4
  // While this kind of partitioning makes things somewhat more complicated, it
  // ensures good utilization of all SMs for many kinds of shape and GPU
  // configurations, while requiring as few slow global cross-threadblock
  // reductions as possible.
  using Dtype = ScalarType<scalar_t>;
  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
  using FragA = typename ScalarType<scalar_t>::FragA;
  using FragB = typename ScalarType<scalar_t>::FragB;
  using FragC = typename ScalarType<scalar_t>::FragC;
  using FragS = typename ScalarType<scalar_t>::FragS;

  constexpr int pack_factor = 32 / num_bits;

  // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
  // better partitioning with less reductions
  int parallel = 1;
  if (prob_m > 16 * thread_m_blocks) {
    parallel = prob_m / (16 * thread_m_blocks);
    prob_m = 16 * thread_m_blocks;
  }

  int k_tiles = prob_k / 16 / thread_k_blocks;
  int n_tiles = prob_n / 16 / thread_n_blocks;
  int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x);

  if constexpr (!has_act_order && group_blocks != -1) {
    if (group_blocks >= thread_k_blocks) {
      // Ensure that the number of tiles in each stripe is a multiple of the
      // groupsize; this avoids an annoying special case where a stripe starts
      // in the middle of group.
      iters = (group_blocks / thread_k_blocks) *
              div_ceil(iters, (group_blocks / thread_k_blocks));
    }
  }

  int slice_row = (iters * blockIdx.x) % k_tiles;
  int slice_col_par = (iters * blockIdx.x) / k_tiles;
  int slice_col = slice_col_par;
  int slice_iters;  // number of threadblock tiles in the current slice
  int slice_count =
      0;          // total number of active threadblocks in the current slice
  int slice_idx;  // index of threadblock in current slice; numbered bottom to
                  // top

  // We can easily implement parallel problem execution by just remapping
  // indices and advancing global pointers
  if (slice_col_par >= n_tiles) {
    A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
    C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
    locks += (slice_col_par / n_tiles) * n_tiles;
    slice_col = slice_col_par % n_tiles;
  }

  // Compute all information about the current slice which is required for
  // synchronization.
  auto init_slice = [&]() {
    slice_iters =
        iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
    if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0;
    if (slice_iters == 0) return;
    if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row;
    slice_count = 1;
    slice_idx = 0;
    int col_first = iters * div_ceil(k_tiles * slice_col_par, iters);
    if (col_first <= k_tiles * (slice_col_par + 1)) {
      int col_off = col_first - k_tiles * slice_col_par;
      slice_count = div_ceil(k_tiles - col_off, iters);
      if (col_off > 0) slice_count++;
      int delta_first = iters * blockIdx.x - col_first;
      if (delta_first < 0 || (col_off == 0 && delta_first == 0))
        slice_idx = slice_count - 1;
      else {
        slice_idx = slice_count - 1 - delta_first / iters;
        if (col_off > 0) slice_idx--;
      }
    }
    if (slice_col == n_tiles) {
      A += 16 * thread_m_blocks * prob_k / 8;
      C += 16 * thread_m_blocks * prob_n / 8;
      locks += n_tiles;
      slice_col = 0;
    }
  };
  init_slice();

  // A sizes/strides

  // stride of the A matrix in global memory
  int a_gl_stride = prob_k / 8;
  // stride of an A matrix tile in shared memory
  constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
  // delta between subsequent A tiles in global memory
  constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8;
  // between subsequent accesses within a tile
  int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
  // between shared memory writes
  constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
  // between shared memory tile reads
  constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4));
  // within a shared memory tile
  constexpr int a_sh_rd_delta_i = a_sh_stride * 16;
  // overall size of a tile
  constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks);
  // number of shared write iterations for a tile
  constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta);

  // B sizes/strides
  int b_gl_stride = 16 * prob_n / (pack_factor * 4);
  constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
  constexpr int b_thread_vecs = num_bits == 4 ? 1 : 2;
  constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;

  int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
  constexpr int b_sh_wr_delta = threads * b_thread_vecs;
  constexpr int b_sh_rd_delta = threads * b_thread_vecs;
  constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
  constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;

  // Scale sizes/strides without act_order
  int s_gl_stride = prob_n / 8;
  constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
  constexpr int s_tb_groups =
      !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks
          ? thread_k_blocks / group_blocks
          : 1;
  constexpr int s_sh_stage = s_tb_groups * s_sh_stride;
  int s_gl_rd_delta = s_gl_stride;

  // Scale size/strides with act_order
  constexpr int tb_k = 16 * thread_k_blocks;
  constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0;
  // constexpr int act_s_row_stride      = 1;
  // int           act_s_col_stride      = act_s_row_stride * num_groups;
  int act_s_col_stride = 1;
  int act_s_col_warp_stride = act_s_col_stride * 8;
  int tb_n_warps = thread_n_blocks / 4;
  int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;

  // Global A read index of current thread.
  int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
                (threadIdx.x % a_gl_rd_delta_o);
  a_gl_rd += a_gl_rd_delta_o * slice_row;
  // Shared write index of current thread.
  int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) +
                (threadIdx.x % a_gl_rd_delta_o);
  // Shared read index.
  int a_sh_rd =
      a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
  a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));

  int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
                (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
  b_gl_rd += b_sh_stride * slice_col;
  b_gl_rd += b_gl_rd_delta_o * slice_row;
  int b_sh_wr = threadIdx.x * b_thread_vecs;
  int b_sh_rd = threadIdx.x * b_thread_vecs;

  // For act_order
  constexpr int k_iter_size = tb_k / b_sh_wr_iters;
  int slice_k_start = tb_k * slice_row;
  int slice_k_finish = slice_k_start + tb_k * slice_iters;
  int slice_k_start_shared_fetch = slice_k_start;
  int slice_n_offset = act_s_col_tb_stride * slice_col;

  // No act_order
  int s_gl_rd;
  if constexpr (!has_act_order) {
    if constexpr (group_blocks == -1) {
      s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
    } else {
      s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
                s_sh_stride * slice_col + threadIdx.x;
    }
  }
  int s_sh_wr = threadIdx.x;
  bool s_sh_wr_pred = threadIdx.x < s_sh_stride;

  // We use a different scale layout for grouped and column-wise quantization as
  // we scale a `half2` tile in column-major layout in the former and in
  // row-major in the latter case.
  int s_sh_rd;
  if constexpr (group_blocks != -1)
    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
              (threadIdx.x % 32) / 4;
  else
    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
              (threadIdx.x % 32) % 4;

  // Precompute which thread should not read memory in which iterations; this is
  // needed if there are more threads than required for a certain tilesize or
  // when the batchsize is not a multiple of 16.
  bool a_sh_wr_pred[a_sh_wr_iters];
  #pragma unroll
  for (int i = 0; i < a_sh_wr_iters; i++)
    a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m;

  // To ensure that writing and reading A tiles to/from shared memory, the
  // latter in fragment format, is fully bank conflict free, we need to use a
  // rather fancy XOR-based layout. The key here is that neither reads nor
  // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
  // same shared memory banks. Further, it seems (based on NSight-Compute) that
  // each warp must also write a consecutive memory segment?
  auto transform_a = [&](int i) {
    int row = i / a_gl_rd_delta_o;
    return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row;
  };
  // Since the computation of this remapping is non-trivial and, due to our main
  // loop unrolls, all shared memory accesses are static, we simply precompute
  // both transformed reads and writes.
  int a_sh_wr_trans[a_sh_wr_iters];
  #pragma unroll
  for (int i = 0; i < a_sh_wr_iters; i++)
    a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr);
  int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks];
  #pragma unroll
  for (int i = 0; i < b_sh_wr_iters; i++) {
  #pragma unroll
    for (int j = 0; j < thread_m_blocks; j++)
      a_sh_rd_trans[i][j] =
          transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
  }

  // Since B-accesses have non-constant stride they have to be computed at
  // runtime; we break dependencies between subsequent accesses with a tile by
  // maintining multiple pointers (we have enough registers), a tiny
  // optimization.
  const int4* B_ptr[b_sh_wr_iters];
  #pragma unroll
  for (int i = 0; i < b_sh_wr_iters; i++)
    B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;

  extern __shared__ int4 sh[];
  // Shared memory storage for global fetch pipelines.
  int4* sh_a = sh;
  int4* sh_b = sh_a + (stages * a_sh_stage);
  int4* sh_g_idx = sh_b + (stages * b_sh_stage);
  int4* sh_s = sh_g_idx + (stages * g_idx_stage);

  // Register storage for double buffer of shared memory reads.
  FragA frag_a[2][thread_m_blocks];
  I4 frag_b_quant[2][b_thread_vecs];
  FragC frag_c[thread_m_blocks][4][2];
  FragS frag_s[2][4];         // No act-order
  FragS act_frag_s[2][4][4];  // For act-order

  // Zero accumulators.
  auto zero_accums = [&]() {
  #pragma unroll
    for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
      reinterpret_cast<float*>(frag_c)[i] = 0;
  };

  int sh_first_group_id = -1;
  int sh_num_groups = -1;
  constexpr int sh_max_num_groups = 32;

  auto fetch_scales_to_shared = [&](bool is_async, int first_group_id,
                                    int last_group_id) {
    sh_first_group_id = first_group_id;
    sh_num_groups = last_group_id - first_group_id + 1;

    if (sh_num_groups < sh_max_num_groups) {
      sh_num_groups = sh_max_num_groups;
    }

    if (sh_first_group_id + sh_num_groups > num_groups) {
      sh_num_groups = num_groups - sh_first_group_id;
    }

    int row_offset = first_group_id * s_gl_stride;

    if (is_async) {
      for (int i = 0; i < sh_num_groups; i++) {
        if (threadIdx.x < s_sh_stride) {
          cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x],
                         &scales_ptr[row_offset + (i * s_gl_stride) +
                                     slice_n_offset + threadIdx.x]);
        }
      }
    } else {
      for (int i = 0; i < sh_num_groups; i++) {
        if (threadIdx.x < s_sh_stride) {
          sh_s[(i * s_sh_stride) + threadIdx.x] =
              scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset +
                         threadIdx.x];
        }
      }
    }
  };
  // Asynchronously fetch the next A, B and s tile from global to the next
  // shared memory pipeline location.
  auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
    if (pred) {
      int4* sh_a_stage = sh_a + a_sh_stage * pipe;
  #pragma unroll
      for (int i = 0; i < a_sh_wr_iters; i++) {
        cp_async4_pred(
            &sh_a_stage[a_sh_wr_trans[i]],
            &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off],
            a_sh_wr_pred[i]);
      }
      int4* sh_b_stage = sh_b + b_sh_stage * pipe;
  #pragma unroll
      for (int i = 0; i < b_sh_wr_iters; i++) {
  #pragma unroll
        for (int j = 0; j < b_thread_vecs; j++) {
          cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
        }

        B_ptr[i] += b_gl_rd_delta_o;
      }

      if constexpr (has_act_order) {
        // Fetch g_idx thread-block portion
        int full_pipe = a_off;
        int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe;
        if (cur_k < prob_k && cur_k < slice_k_finish) {
          int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;

          int4 const* cur_g_idx_stage_ptr =
              reinterpret_cast<int4 const*>(&g_idx[cur_k]);

          if (threadIdx.x < g_idx_stage) {
            cp_async4_pred(&sh_g_idx_stage[threadIdx.x],
                           &cur_g_idx_stage_ptr[threadIdx.x]);
          }
        }
      } else {
        if constexpr (group_blocks != -1) {
          int4* sh_s_stage = sh_s + s_sh_stage * pipe;

          if constexpr (group_blocks >= thread_k_blocks) {
            // Only fetch scales if this tile starts a new group
            if (pipe % (group_blocks / thread_k_blocks) == 0) {
              if (s_sh_wr_pred) {
                cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
              }
              s_gl_rd += s_gl_rd_delta;
            }
          } else {
            for (int i = 0; i < s_tb_groups; i++) {
              if (s_sh_wr_pred) {
                cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr],
                          &scales_ptr[s_gl_rd]);
              }
              s_gl_rd += s_gl_rd_delta;
            }
          }
        }
      }
    }
    // Insert a fence even when we are winding down the pipeline to ensure that
    // waiting is also correct at this point.
    cp_async_fence();
  };

  // Wait until the next thread tile has been loaded to shared memory.
  auto wait_for_stage = [&]() {
    // We only have `stages - 2` active fetches since we are double buffering
    // and can only issue the next fetch when it is guaranteed that the previous
    // shared memory load is fully complete (as it may otherwise be
    // overwritten).
    cp_async_wait<stages - 2>();
    __syncthreads();
  };

  // Load the next sub-tile from the current location in the shared memory pipe
  // into the current register buffer.
  auto fetch_to_registers = [&](int k, int pipe) {
    int4* sh_a_stage = sh_a + a_sh_stage * pipe;
  #pragma unroll
    for (int i = 0; i < thread_m_blocks; i++)
      ldsm4<scalar_t>(frag_a[k % 2][i],
                      &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
    int4* sh_b_stage = sh_b + b_sh_stage * pipe;

  #pragma unroll
    for (int i = 0; i < b_thread_vecs; i++) {
      frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(
          &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
    }
  };

  bool is_same_group[stages];
  int same_group_id[stages];

  auto init_same_group = [&](int pipe) {
    if constexpr (!has_act_order) {
      is_same_group[pipe] = false;
      same_group_id[pipe] = 0;
      return;
    }

    int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
    int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);

    int group_id_1 = sh_g_idx_int_ptr[0];
    int group_id_2 = sh_g_idx_int_ptr[tb_k - 1];

    is_same_group[pipe] = group_id_1 == group_id_2;
    same_group_id[pipe] = group_id_1;
  };

  auto fetch_scales_to_registers = [&](int k, int full_pipe) {
    int pipe = full_pipe % stages;

    if constexpr (!has_act_order) {
      // No act-order case
      if constexpr (group_blocks != -1) {
        if constexpr (group_blocks >= thread_k_blocks) {
          int4* sh_s_stage =
              sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
                                   (pipe / (group_blocks / thread_k_blocks)));
          reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
        } else {
          int warp_id = threadIdx.x / 32;
          int n_warps = thread_n_blocks / 4;

          int warp_row = warp_id / n_warps;

          int cur_k = warp_row * 16;
          cur_k += k_iter_size * (k % b_sh_wr_iters);

          int k_blocks = cur_k / 16;
          int cur_group_id = k_blocks / group_blocks;

          int4* sh_s_stage = sh_s + s_sh_stage * pipe;

          reinterpret_cast<int4*>(&frag_s[k % 2])[0] =
              sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
        }
      }

      return;
    }

    // Act-order case

    // Determine K of the "current" thread-block
    int cur_k = slice_k_start + tb_k * full_pipe;
    if (cur_k >= prob_k || cur_k >= slice_k_finish) {
      return;
    }

    // Reset (to current thread-block) since we read g_idx portion from the
    // shared memory
    cur_k = 0;

    // Progress to current iteration
    cur_k += k_iter_size * (k % b_sh_wr_iters);

    // Determine "position" inside the thread-block (based on warp and
    // thread-id)
    int warp_id = threadIdx.x / 32;
    int n_warps =
        thread_n_blocks / 4;  // Each warp processes 4 16-size tiles over N

    int warp_row = warp_id / n_warps;
    int warp_col = warp_id % n_warps;

    cur_k += warp_row * 16;

    int th_id = threadIdx.x % 32;
    cur_k += (th_id % 4) * 2;  // Due to tensor-core layout for fp16 B matrix

    int s_col_shift =
        /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) +
        (th_id / 4) * act_s_col_stride;

    if (is_same_group[pipe]) {
      if (k % 2 == 0) {
        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
            sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride +
                 s_col_shift];
      } else {
        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
            *(reinterpret_cast<int4*>(&(act_frag_s[(k - 1) % 2][0][0])));
      }

      for (int i = 1; i < 4; i++) {
        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
            *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0])));
      }
      return;
    }

    int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
    int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);

    constexpr int k_frag_offsets[4] = {0, 1, 8,
                                       9};  // Tensor core offsets per thread

  #pragma unroll
    for (int i = 0; i < 4; i++) {
      int actual_k = cur_k + k_frag_offsets[i];

      int group_id = sh_g_idx_int_ptr[actual_k];
      int rel_group_id = group_id - sh_first_group_id;

      *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
          sh_s[rel_group_id * s_sh_stride + s_col_shift];
    }
  };

  // Execute the actual tensor core matmul of a sub-tile.
  auto matmul = [&](int k) {
  // We have the m dimension as the inner loop in order to encourage overlapping
  // dequantization and matmul operations.
  #pragma unroll
    for (int j = 0; j < 4; j++) {
      FragB frag_b0;
      FragB frag_b1;
      if constexpr (num_bits == 4) {
        int b_quant = frag_b_quant[k % 2][0][j];
        int b_quant_shift = b_quant >> 8;

        frag_b0 = dequant_4bit<scalar_t>(b_quant);
        frag_b1 = dequant_4bit<scalar_t>(b_quant_shift);

      } else {
        int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
        int b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
        int b_quant_1 = frag_b_quant_ptr[j * 2 + 1];

        frag_b0 = dequant_8bit<scalar_t>(b_quant_0);
        frag_b1 = dequant_8bit<scalar_t>(b_quant_1);
      }

      // Apply scale to frag_b0
      if constexpr (has_act_order) {
        scale4<scalar_t>(frag_b0, act_frag_s[k % 2][0][j],
                         act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j],
                         act_frag_s[k % 2][3][j], 0);
      } else {
        if constexpr (group_blocks != -1) {
          scale<scalar_t>(frag_b0, frag_s[k % 2][j], 0);
        }
      }

      // Apply scale to frag_b1
      if constexpr (has_act_order) {
        scale4<scalar_t>(frag_b1, act_frag_s[k % 2][0][j],
                         act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j],
                         act_frag_s[k % 2][3][j], 1);

      } else {
        if constexpr (group_blocks != -1) {
          scale<scalar_t>(frag_b1, frag_s[k % 2][j], 1);
        }
      }

  #pragma unroll
      for (int i = 0; i < thread_m_blocks; i++) {
        mma<scalar_t>(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]);
        mma<scalar_t>(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]);
      }
    }
  };

  // Since we slice across the k dimension of a tile in order to increase the
  // number of warps while keeping the n dimension of a tile reasonable, we have
  // multiple warps that accumulate their partial sums of the same output
  // location; which we have to reduce over in the end. We do in shared memory.
  auto thread_block_reduce = [&]() {
    constexpr int red_off = threads / b_sh_stride_threads / 2;
    if (red_off >= 1) {
      int red_idx = threadIdx.x / b_sh_stride_threads;
      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
      constexpr int red_sh_delta = b_sh_stride_threads;
      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
                      (threadIdx.x % b_sh_stride_threads);

      // Parallel logarithmic shared memory reduction. We make sure to avoid any
      // unnecessary read or write iterations, e.g., for two warps we write only
      // once by warp 1 and read only once by warp 0.

  #pragma unroll
      for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
  #pragma unroll
        for (int i = red_off; i > 0; i /= 2) {
          if (i <= red_idx && red_idx < 2 * i) {
  #pragma unroll
            for (int j = 0; j < 4 * 2; j++) {
              int red_sh_wr =
                  red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
              if (i < red_off) {
                float* c_rd =
                    reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
                float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
  #pragma unroll
                for (int k = 0; k < 4; k++)
                  reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
                      c_rd[k] + c_wr[k];
              }
              sh[red_sh_wr] =
                  reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
            }
          }
          __syncthreads();
        }
        if (red_idx == 0) {
  #pragma unroll
          for (int i = 0; i < 4 * 2; i++) {
            float* c_rd =
                reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
  #pragma unroll
            for (int j = 0; j < 4; j++)
              reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
                  c_rd[j];
          }
        }
        __syncthreads();
      }
    }
  };

  // Since multiple threadblocks may process parts of the same column slice, we
  // finally have to globally reduce over the results. As the striped
  // partitioning minimizes the number of such reductions and our outputs are
  // usually rather small, we perform this reduction serially in L2 cache.
  auto global_reduce = [&](bool first = false, bool last = false) {
    // We are very careful here to reduce directly in the output buffer to
    // maximize L2 cache utilization in this step. To do this, we write out
    // results in FP16 (but still reduce with FP32 compute).
    constexpr int active_threads = 32 * thread_n_blocks / 4;
    if (threadIdx.x < active_threads) {
      int c_gl_stride = prob_n / 8;
      int c_gl_wr_delta_o = 8 * c_gl_stride;
      int c_gl_wr_delta_i = 4 * (active_threads / 32);
      int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) +
                    4 * (threadIdx.x / 32) + threadIdx.x % 4;
      c_gl_wr += (2 * thread_n_blocks) * slice_col;
      constexpr int c_sh_wr_delta = active_threads;
      int c_sh_wr = threadIdx.x;

      int row = (threadIdx.x % 32) / 4;

      if (!first) {
  // Interestingly, doing direct global accesses here really seems to mess up
  // the compiler and lead to slowdowns, hence we also use async-copies even
  // though these fetches are not actually asynchronous.
  #pragma unroll
        for (int i = 0; i < thread_m_blocks * 4; i++) {
          cp_async4_pred(
              &sh[c_sh_wr + c_sh_wr_delta * i],
              &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
                 c_gl_wr_delta_i * (i % 2)],
              i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
        }
        cp_async_fence();
        cp_async_wait<0>();
      }

  #pragma unroll
      for (int i = 0; i < thread_m_blocks * 4; i++) {
        if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
          if (!first) {
            int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
  #pragma unroll
            for (int j = 0; j < 2 * 4; j++) {
              reinterpret_cast<float*>(
                  &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] +=
                  Dtype::num2float(reinterpret_cast<scalar_t*>(&c_red)[j]);
            }
          }
          if (!last) {
            int4 c;
  #pragma unroll
            for (int j = 0; j < 2 * 4; j++) {
              reinterpret_cast<scalar_t*>(&c)[j] =
                  Dtype::float2num(reinterpret_cast<float*>(
                      &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]);
            }
            C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] =
                c;
          }
        }
      }
    }
  };

  // Write out the reduce final result in the correct layout. We only actually
  // reshuffle matrix fragments in this step, the reduction above is performed
  // in fragment layout.
  auto write_result = [&]() {
    int c_gl_stride = prob_n / 8;
    constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
    constexpr int c_sh_rd_delta =
        c_sh_stride * (threads / (2 * thread_n_blocks));

    int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) +
                  (threadIdx.x % (2 * thread_n_blocks));
    c_gl_wr += (2 * thread_n_blocks) * slice_col;
    int c_sh_wr =
        (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4;
    c_sh_wr += 32 * (threadIdx.x / 32);
    int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) +
                  (threadIdx.x % (2 * thread_n_blocks));

    int c_gl_wr_end = c_gl_stride * prob_m;

    // We first reorder in shared memory to guarantee the most efficient final
    // global write patterns
    auto write = [&](int idx, float c0, float c1, FragS& s) {
      scalar_t2 res =
          Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1));

      // For per-column quantization we finally apply the scale here (only for
      // 4-bit)
      if constexpr (!has_act_order && group_blocks == -1 && num_bits == 4) {
        res = __hmul2(res, s[0]);
      }

      ((scalar_t2*)sh)[idx] = res;
    };

    if (threadIdx.x / 32 < thread_n_blocks / 4) {
  #pragma unroll
      for (int i = 0; i < thread_m_blocks; i++) {
  #pragma unroll
        for (int j = 0; j < 4; j++) {
          int wr = c_sh_wr + 8 * j;
          write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
                frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
          write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
                frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
          write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
                frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
          write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
                frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
        }
        c_sh_wr += 16 * (4 * c_sh_stride);
      }
    }
    __syncthreads();

  #pragma unroll
    for (int i = 0;
         i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
         i++) {
      if (c_gl_wr < c_gl_wr_end) {
        C[c_gl_wr] = sh[c_sh_rd];
        c_gl_wr += c_gl_wr_delta;
        c_sh_rd += c_sh_rd_delta;
      }
    }
  };

  // Start global fetch and register load pipelines.
  auto start_pipes = [&]() {

  #pragma unroll
    for (int i = 0; i < stages - 1; i++) {
      if (has_act_order && i == 0) {
        int last_g_idx = slice_k_start + stages * tb_k * 2;
        if (last_g_idx >= prob_k) {
          last_g_idx = prob_k - 1;
        }
        fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]);
      }
      fetch_to_shared(i, i, i < slice_iters);
    }

    zero_accums();
    wait_for_stage();
    init_same_group(0);
    fetch_to_registers(0, 0);
    fetch_scales_to_registers(0, 0);
    a_gl_rd += a_gl_rd_delta_o * (stages - 1);
    slice_k_start_shared_fetch += tb_k * (stages - 1);
  };
  if (slice_iters) {
    start_pipes();
  }

  // Main loop.
  while (slice_iters) {
    // We unroll over both the global fetch and the register load pipeline to
    // ensure all shared memory accesses are static. Note that both pipelines
    // have even length meaning that the next iteration will always start at
    // index 0.

  #pragma unroll
    for (int pipe = 0; pipe < stages;) {
  #pragma unroll
      for (int k = 0; k < b_sh_wr_iters; k++) {
        fetch_to_registers(k + 1, pipe % stages);
        fetch_scales_to_registers(k + 1, pipe);
        if (k == b_sh_wr_iters - 2) {
          fetch_to_shared((pipe + stages - 1) % stages, pipe,
                          slice_iters >= stages);
          pipe++;
          wait_for_stage();
          init_same_group(pipe % stages);
        }
        matmul(k);
      }
      slice_iters--;
      if (slice_iters == 0) {
        break;
      }
    }

    a_gl_rd += a_gl_rd_delta_o * stages;
    slice_k_start += tb_k * stages;
    slice_k_start_shared_fetch += tb_k * stages;

    if constexpr (has_act_order) {
      int first_group_id = g_idx[slice_k_start];
      int last_g_idx = slice_k_start + stages * tb_k * 2;
      if (last_g_idx >= prob_k) {
        last_g_idx = prob_k - 1;
      }
      int last_group_id = g_idx[last_g_idx];
      if (last_group_id >= sh_first_group_id + sh_num_groups) {
        fetch_scales_to_shared(false, first_group_id, last_group_id);
        __syncthreads();
      }
    }

    // Process results and, if necessary, proceed to the next column slice.
    // While this pattern may not be the most readable, other ways of writing
    // the loop seemed to noticeably worse performance after compilation.
    if (slice_iters == 0) {
      cp_async_wait<0>();
      bool last = slice_idx == slice_count - 1;
      // For per-column scales, we only fetch them here in the final step before
      // write-out
      if constexpr (!has_act_order && group_blocks == -1) {
        if constexpr (num_bits == 8) {
          if (s_sh_wr_pred) {
            cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
          }
          cp_async_fence();
        } else {
          if (last) {
            if (s_sh_wr_pred) {
              cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
            }
            cp_async_fence();
          }
        }
      }

      thread_block_reduce();
      if constexpr (!has_act_order && group_blocks == -1) {
        if constexpr (num_bits == 8) {
          cp_async_wait<0>();
          __syncthreads();
          if (threadIdx.x / 32 < thread_n_blocks / 4) {
            reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
            reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
          }

        } else {
          if (last) {
            cp_async_wait<0>();
            __syncthreads();
            if (threadIdx.x / 32 < thread_n_blocks / 4) {
              reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
              reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
            }
          }
        }
      }

      // For 8-bit channelwise, we apply the scale before the global reduction
      // that converts the fp32 results to fp16 (so that we avoid possible
      // overflow in fp16)
      if constexpr (!has_act_order && group_blocks == -1 && num_bits == 8) {
        if (threadIdx.x / 32 < thread_n_blocks / 4) {
  #pragma unroll
          for (int i = 0; i < thread_m_blocks; i++) {
  #pragma unroll
            for (int j = 0; j < 4; j++) {
              scale_float<scalar_t>(
                  reinterpret_cast<float*>(&frag_c[i][j][0][0]),
                  frag_s[j / 2][2 * (j % 2) + 0]);
              scale_float<scalar_t>(
                  reinterpret_cast<float*>(&frag_c[i][j][0][2]),
                  frag_s[j / 2][2 * (j % 2) + 0]);

              scale_float<scalar_t>(
                  reinterpret_cast<float*>(&frag_c[i][j][1][0]),
                  frag_s[j / 2][2 * (j % 2) + 1]);
              scale_float<scalar_t>(
                  reinterpret_cast<float*>(&frag_c[i][j][1][2]),
                  frag_s[j / 2][2 * (j % 2) + 1]);
            }
          }
        }
      }

      if (slice_count > 1) {  // only globally reduce if there is more than one
                              // block in a slice
        barrier_acquire(&locks[slice_col], slice_idx);
        global_reduce(slice_idx == 0, last);
        barrier_release(&locks[slice_col], last);
      }
      if (last)  // only the last block in a slice actually writes the result
        write_result();
      slice_row = 0;
      slice_col_par++;
      slice_col++;
      init_slice();
      if (slice_iters) {
        a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
                  (threadIdx.x % a_gl_rd_delta_o);
  #pragma unroll
        for (int i = 0; i < b_sh_wr_iters; i++)
          B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
        if (slice_col == 0) {
  #pragma unroll
          for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
        }

        // Update slice k/n for scales loading
        if constexpr (has_act_order) {
          slice_k_start = tb_k * slice_row;
          slice_k_finish = slice_k_start + tb_k * slice_iters;
          slice_k_start_shared_fetch = slice_k_start;
          slice_n_offset = act_s_col_tb_stride * slice_col;

        } else {
          s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
        }

        start_pipes();
      }
    }
  }
}

  #define __CALL_IF(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,                \
                    THREAD_K_BLOCKS, HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS) \
    else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS &&     \
             thread_n_blocks == THREAD_N_BLOCKS &&                             \
             thread_k_blocks == THREAD_K_BLOCKS &&                             \
             has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS && \
             num_threads == NUM_THREADS) {                                     \
      cudaFuncSetAttribute(                                                    \
          Marlin<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,             \
                 THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER, \
                 GROUP_BLOCKS>,                                                \
          cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);        \
      Marlin<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,                 \
             THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER,     \
             GROUP_BLOCKS><<<blocks, NUM_THREADS, max_shared_mem, stream>>>(   \
          A_ptr, B_ptr, C_ptr, s_ptr, g_idx_ptr, num_groups, prob_m, prob_n,   \
          prob_k, locks);                                                      \
    }

typedef struct {
  int thread_k;
  int thread_n;
  int num_threads;
} thread_config_t;

typedef struct {
  int max_m_blocks;
  thread_config_t tb_cfg;
} exec_config_t;

thread_config_t small_batch_thread_configs[] = {
    // Ordered by priority

    // thread_k, thread_n, num_threads
    {128, 128, 256},
    {64, 128, 128},
    {128, 64, 128},
};

thread_config_t large_batch_thread_configs[] = {
    // Ordered by priority

    // thread_k, thread_n, num_threads
    {64, 256, 256},
    {64, 128, 128},
    {128, 64, 128},

};

int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
                          int prob_n, int prob_k, int num_bits, int group_size,
                          bool has_act_order, bool is_k_full) {
  bool cache_scales_chunk = has_act_order && !is_k_full;

  int tb_n = th_config.thread_n;
  int tb_k = th_config.thread_k;

  // Get max scale groups per thread-block
  int tb_groups;
  if (group_size == -1) {
    tb_groups = 1;
  } else if (group_size == 0) {
    tb_groups = div_ceil(tb_k, 32);  // Worst case is 32 group size
  } else {
    tb_groups = div_ceil(tb_k, group_size);
  }

  if (cache_scales_chunk) {
    int load_groups =
        tb_groups * pipe_stages * 2;     // Chunk size is 2x pipeline over dim K
    load_groups = max(load_groups, 32);  // We load at least 32 scale groups
    return load_groups * tb_n * 2;

  } else {
    int tb_scales = tb_groups * tb_n * 2;

    return tb_scales * pipe_stages;
  }
}

bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
                         int prob_m, int prob_n, int prob_k, int num_bits,
                         int scales_cache_size, int max_shared_mem) {
  int pack_factor = 32 / num_bits;

  // Get B size
  int tb_k = th_config.thread_k;
  int tb_n = th_config.thread_n;

  int b_size = (tb_k * tb_n / pack_factor) * 4;

  // Get A size
  int m_blocks = div_ceil(prob_m, 16);
  int tb_max_m = 16;

  while (true) {
    if (m_blocks >= max_m_blocks) {
      tb_max_m *= max_m_blocks;
      break;
    }

    max_m_blocks--;
    if (max_m_blocks == 0) {
      TORCH_CHECK(false, "Unexpected m_blocks = ", m_blocks);
    }
  }

  int a_size = (tb_max_m * tb_k) * 2;

  float pipe_size = (a_size + b_size) * pipe_stages;

  TORCH_CHECK(max_shared_mem / 2 > scales_cache_size);  // Sanity

  return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
}

bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,
                     int prob_m, int prob_n, int prob_k, int num_bits,
                     int group_size, bool has_act_order, bool is_k_full,
                     int max_shared_mem) {
  // Sanity
  if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
      th_config.num_threads == -1) {
    return false;
  }

  // Verify K/N are divisible by thread K/N
  if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
    return false;
  }

  // Verify min for thread K/N
  if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
    return false;
  }

  // num_threads must be at least 128 (= 4 warps)
  if (th_config.num_threads < 128) {
    return false;
  }

  //  Determine cache for scales
  int scales_cache_size =
      get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
                            group_size, has_act_order, is_k_full);

  // Check that pipeline fits into cache
  if (!is_valid_cache_size(th_config, max_m_blocks, prob_m, prob_n, prob_k,
                           num_bits, scales_cache_size, max_shared_mem)) {
    return false;
  }

  return true;
}

exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
                                      int num_bits, int group_size,
                                      bool has_act_order, bool is_k_full,
                                      int max_shared_mem) {
  int max_m_blocks = 4;
  while (max_m_blocks > 0) {
    if (prob_m <= 16) {
      for (auto th_config : small_batch_thread_configs) {
        if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k,
                            num_bits, group_size, has_act_order, is_k_full,
                            max_shared_mem)) {
          return exec_config_t{max_m_blocks, th_config};
        }
      }
    } else {
      for (auto th_config : large_batch_thread_configs) {
        if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k,
                            num_bits, group_size, has_act_order, is_k_full,
                            max_shared_mem)) {
          return exec_config_t{max_m_blocks, th_config};
        }
      }
    }

    max_m_blocks--;  // Process less M blocks per invocation to reduce cache
                     // usage
  }

  return exec_config_t{0, {-1, -1, -1}};
}

  #define CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS)           \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
                                                                       \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
                                                                       \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
                                                                       \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
                                                                       \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)

template <typename scalar_t>
void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
                     void* g_idx, void* perm, void* a_tmp, int prob_m,
                     int prob_n, int prob_k, void* workspace, int num_bits,
                     bool has_act_order, bool is_k_full, int num_groups,
                     int group_size, int dev, cudaStream_t stream, int thread_k,
                     int thread_n, int sms, int max_par) {
  TORCH_CHECK(num_bits == 4 || num_bits == 8,
              "num_bits must be 4 or 8. Got = ", num_bits);
  TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
              ", ", prob_n, ", ", prob_k, "]");

  int tot_m = prob_m;
  int tot_m_blocks = div_ceil(tot_m, 16);
  int pad = 16 * tot_m_blocks - tot_m;

  if (sms == -1) {
    cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
  }

  int max_shared_mem = 0;
  cudaDeviceGetAttribute(&max_shared_mem,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
  TORCH_CHECK(max_shared_mem > 0);

  // Set thread config
  exec_config_t exec_cfg;
  if (thread_k != -1 && thread_n != -1) {
    // User-defined config
    exec_cfg =
        exec_config_t{4, thread_config_t{thread_k, thread_n, default_threads}};
  } else {
    // Auto config
    exec_cfg =
        determine_thread_config(prob_m, prob_n, prob_k, num_bits, group_size,
                                has_act_order, is_k_full, max_shared_mem);
  }

  TORCH_CHECK(exec_cfg.max_m_blocks > 0 &&
                  is_valid_config(exec_cfg.tb_cfg, exec_cfg.max_m_blocks,
                                  prob_m, prob_n, prob_k, num_bits, group_size,
                                  has_act_order, is_k_full, max_shared_mem),
              "Invalid thread config: max_m_blocks = ", exec_cfg.max_m_blocks,
              ", thread_k = ", exec_cfg.tb_cfg.thread_k,
              ", thread_n = ", exec_cfg.tb_cfg.thread_n,
              ", num_threads = ", exec_cfg.tb_cfg.num_threads, " for MKN = [",
              prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
              ", group_size = ", group_size,
              ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
              ", max_shared_mem = ", max_shared_mem);

  int num_threads = exec_cfg.tb_cfg.num_threads;
  thread_k = exec_cfg.tb_cfg.thread_k;
  thread_n = exec_cfg.tb_cfg.thread_n;

  int thread_k_blocks = thread_k / 16;
  int thread_n_blocks = thread_n / 16;

  int blocks = sms;

  TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
              " is not divisible by thread_n = ", thread_n);
  TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
              " is not divisible by thread_k = ", thread_k);

  int group_blocks = 0;
  if (has_act_order) {
    if (is_k_full) {
      TORCH_CHECK(group_size != -1);
      group_blocks = group_size / 16;
      TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
                  " is not divisible by group_blocks = ", group_blocks);
    } else {
      TORCH_CHECK(group_size == 0);
      group_blocks = 0;
    }

  } else {
    if (group_size == -1) {
      group_blocks = -1;
    } else {
      group_blocks = group_size / 16;
      TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
                  " is not divisible by group_blocks = ", group_blocks);
    }
  }

  const int4* A_ptr = (const int4*)A;
  const int4* B_ptr = (const int4*)B;
  int4* C_ptr = (int4*)C;
  const int4* s_ptr = (const int4*)s;
  const int* g_idx_ptr = (const int*)g_idx;
  const int* perm_ptr = (const int*)perm;
  int4* a_tmp_ptr = (int4*)a_tmp;

  int* locks = (int*)workspace;

  if (has_act_order) {
    // Permute A columns
    int block_rows = div_ceil(prob_m, blocks);
    permute_cols_kernel<<<blocks, default_threads, 0, stream>>>(
        A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, block_rows);
    A_ptr = a_tmp_ptr;
  }

  // If we have a full K, then we can run the non-act-order version of Marlin
  // (since the weight rows are reordered by increasing group ids, and by having
  // a full K, we have full original groups)
  if (is_k_full) {
    has_act_order = false;
  }

  // Main loop
  for (int i = 0; i < tot_m_blocks; i += exec_cfg.max_m_blocks) {
    int thread_m_blocks = tot_m_blocks - i;
    prob_m = tot_m - 16 * i;
    int par = 1;
    if (thread_m_blocks > exec_cfg.max_m_blocks) {
      // Note that parallel > 1 currently only works for inputs without any
      // padding
      par = (16 * thread_m_blocks - pad) / (16 * exec_cfg.max_m_blocks);
      if (par > max_par) par = max_par;
      prob_m = (16 * exec_cfg.max_m_blocks) * par;
      i += exec_cfg.max_m_blocks * (par - 1);
      thread_m_blocks = exec_cfg.max_m_blocks;
    }


    // Define kernel configurations
#define undefined_error TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + \
    str(prob_n) + ", " + str(prob_k) + "]" + \
        ", has_act_order = " + str(has_act_order) + \
        ", num_groups = " + str(num_groups) + \
        ", group_size = " + str(group_size) + \
        ", thread_m_blocks = " + str(thread_m_blocks) + \
        ", thread_n_blocks = " + str(thread_n_blocks) + \
        ", thread_k_blocks = " + str(thread_k_blocks));


    if (num_bits == 4 && num_threads == 256)
    {
        if (false) {
        }
        CALL_IF(4, 32, 2, 256)
        CALL_IF(4, 16, 4, 256)
        CALL_IF(4, 8, 8, 256)
        else {
            undefined_error
        }
    }
    else if (num_bits == 4 && num_threads == 128)
    {
        if (false) {
        }
        CALL_IF(4, 8, 4, 128)
        CALL_IF(4, 4, 8, 128)
        else {
            undefined_error
        }
    }
    else if (num_bits == 8 && num_threads == 256)
    {
        if (false) {
        }
        CALL_IF(8, 32, 2, 256)
        CALL_IF(8, 16, 4, 256)
        CALL_IF(8, 8, 8, 256)
        else {
            undefined_error
        }
    }
    else if (num_bits == 8 && num_threads == 128)
    {
        if (false) {
        }
        CALL_IF(8, 8, 4, 128)
        CALL_IF(8, 4, 8, 128)
        else {
            undefined_error
        }
    }
    else {
        undefined_error
    }

    A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
    C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
  }
}

}  // namespace gptq_marlin

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               torch::Tensor& b_scales, torch::Tensor& g_idx,
                               torch::Tensor& perm, torch::Tensor& workspace,
                               int64_t num_bits, int64_t size_m, int64_t size_n,
                               int64_t size_k, bool is_k_full) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
  // Verify num_bits
  TORCH_CHECK(num_bits == 4 || num_bits == 8,
              "num_bits must be 4 or 8. Got = ", num_bits);
  int pack_factor = 32 / num_bits;

  // Verify A
  TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
              ", size_m = ", size_m);
  TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1),
              ", size_k = ", size_k);

  // Verify B
  TORCH_CHECK(size_k % gptq_marlin::tile_size == 0, "size_k = ", size_k,
              " is not divisible by tile_size = ", gptq_marlin::tile_size);
  TORCH_CHECK((size_k / gptq_marlin::tile_size) == b_q_weight.size(0),
              "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
              ", size_k = ", size_k, ", tile_size = ", gptq_marlin::tile_size);
  TORCH_CHECK(b_q_weight.size(1) % gptq_marlin::tile_size == 0,
              "b_q_weight.size(1) = ", b_q_weight.size(1),
              " is not divisible by tile_size = ", gptq_marlin::tile_size);
  int actual_size_n =
      (b_q_weight.size(1) / gptq_marlin::tile_size) * pack_factor;
  TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
              ", actual_size_n = ", actual_size_n);

  // Verify device and strides
  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
  TORCH_CHECK(a.is_contiguous(), "A is not contiguous");

  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");

  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");

  TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
  TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");

  TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
  TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");

  // Alloc buffers
  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
  torch::Tensor c = torch::empty({size_m, size_n}, options);
  torch::Tensor a_tmp = torch::empty({size_m, size_k}, options);

  // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
  // auto -1)
  int thread_k = -1;
  // thread_n: `n` size of a thread_tile in `weights` (can usually be left as
  // auto -1)
  int thread_n = -1;
  // sms: number of SMs to use for the kernel (can usually be left as auto -1)
  int sms = -1;

  // Verify g_idx and perm
  TORCH_CHECK((g_idx.size(0) == 0 && perm.size(0) == 0) ||
                  (g_idx.size(0) == size_k && perm.size(0) == size_k),
              "Unexpected g_idx.size(0) = ", g_idx.size(0),
              " and perm.size(0) = ", perm.size(0),
              ", where size_k = ", size_k);

  // Detect groupsize and act_order
  int num_groups = -1;
  int group_size = -1;
  bool has_act_order = g_idx.size(0) != 0;

  int b_rank = b_scales.sizes().size();
  TORCH_CHECK(b_rank == 2, "b_scales rank = ", b_rank, " is not 2");
  TORCH_CHECK(b_scales.size(1) == size_n, "b_scales dim 1 = ", b_scales.size(1),
              " is not size_n = ", size_n);
  num_groups = b_scales.size(0);

  if (has_act_order) {
    if (is_k_full) {
      TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
      TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k,
                  ", is not divisible by num_groups = ", num_groups);
      group_size = size_k / num_groups;
    } else {
      group_size = 0;
    }

  } else {
    if (num_groups > 1) {
      TORCH_CHECK(
          size_k % num_groups == 0, "size_k = ", size_k,
          ", is not divisible by b_scales.size(0) = ", b_scales.size(0));
      group_size = size_k / num_groups;
    } else {
      group_size = -1;
    }
  }

  // Verify workspace size
  TORCH_CHECK(
      size_n % gptq_marlin::min_thread_n == 0, "size_n = ", size_n,
      ", is not divisible by min_thread_n = ", gptq_marlin::min_thread_n);
  int min_workspace_size =
      (size_n / gptq_marlin::min_thread_n) * gptq_marlin::max_par;
  TORCH_CHECK(workspace.numel() >= min_workspace_size,
              "workspace.numel = ", workspace.numel(),
              " is below min_workspace_size = ", min_workspace_size);

  int dev = a.get_device();
  if (a.scalar_type() == at::ScalarType::Half) {
    gptq_marlin::marlin_mm_f16i4<half>(
        a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
        b_scales.data_ptr<at::Half>(), g_idx.data_ptr(), perm.data_ptr(),
        a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k,
        workspace.data_ptr(), num_bits, has_act_order, is_k_full, num_groups,
        group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
        thread_n, sms, gptq_marlin::max_par);
  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
    gptq_marlin::marlin_mm_f16i4<nv_bfloat16>(
        a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
        c.data_ptr<at::BFloat16>(), b_scales.data_ptr<at::BFloat16>(),
        g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
        size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order,
        is_k_full, num_groups, group_size, dev,
        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
        gptq_marlin::max_par);
  } else {
    TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16");
  }

  return c;
}

#endif


================================================
FILE: kt-kernel/cuda/gptq_marlin/gptq_marlin.cuh
================================================
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyrigth 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#pragma once

#include <torch/all.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>

namespace gptq_marlin {

// 8 warps are a good choice since every SM has 4 schedulers and having more
// than 1 warp per schedule allows some more latency hiding. At the same time,
// we want relatively few warps to have many registers per warp and small tiles.
static constexpr int default_threads = 256;

static constexpr int pipe_stages =
    4;  // 4 pipeline stages fit into shared memory

static constexpr int min_thread_n = 64;
static constexpr int min_thread_k = 64;

static constexpr int tile_size = 16;
static constexpr int max_par = 16;

template <typename T, int n>
struct Vec {
  T elems[n];
  __device__ T& operator[](int i) { return elems[i]; }
};

using I4 = Vec<int, 4>;

constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }

#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800) || defined (__HIP_PLATFORM_AMD__)
// No support for async
#else

__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
                                      bool pred = true) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile(
      "{\n"
      "   .reg .pred p;\n"
      "   setp.ne.b32 p, %0, 0;\n"
      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
      "}\n" ::"r"((int)pred),
      "r"(smem), "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile(
      "{\n"
      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
      "}\n" ::"r"(smem),
      "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async_fence() {
  asm volatile("cp.async.commit_group;\n" ::);
}

template <int n>
__device__ inline void cp_async_wait() {
  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}

#endif

}  // namespace gptq_marlin


================================================
FILE: kt-kernel/cuda/gptq_marlin/gptq_marlin_dtypes.cuh
================================================
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyrigth 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#ifndef _data_types_cuh
#define _data_types_cuh
#include "gptq_marlin.cuh"
#include <cuda_fp16.h>
#include <cuda_bf16.h>

#ifdef __HIP_PLATFORM_AMD__
typedef __hip_bfloat16 nv_bfloat16;
typedef __hip_bfloat162 nv_bfloat162;
#endif

namespace gptq_marlin {

template <typename scalar_t>
class ScalarType {};

template <>
class ScalarType<half> {
 public:
  using scalar_t = half;
  using scalar_t2 = half2;

  // Matrix fragments for tensor core instructions; their precise layout is
  // documented here:
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
  using FragA = Vec<half2, 4>;
  using FragB = Vec<half2, 2>;
  using FragC = Vec<float, 4>;
  using FragS = Vec<half2, 1>;

  static __device__ float inline num2float(const half x) {
    return __half2float(x);
  }

  static __device__ half2 inline num2num2(const half x) {
    return __half2half2(x);
  }

  static __device__ half2 inline nums2num2(const half x1, const half x2) {
    return __halves2half2(x1, x2);
  }

  static __host__ __device__ half inline float2num(const float x) {
    return __float2half(x);
  }
};

template <>
class ScalarType<nv_bfloat16> {
 public:
  using scalar_t = nv_bfloat16;
  using scalar_t2 = nv_bfloat162;

  using FragA = Vec<nv_bfloat162, 4>;
  using FragB = Vec<nv_bfloat162, 2>;
  using FragC = Vec<float, 4>;
  using FragS = Vec<nv_bfloat162, 1>;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
  static __device__ float inline num2float(const nv_bfloat16 x) {
    return __bfloat162float(x);
  }

  static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
    return __bfloat162bfloat162(x);
  }

  static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
                                                  const nv_bfloat16 x2) {
    return __halves2bfloat162(x1, x2);
  }

  static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
    return __float2bfloat16(x);
  }
#endif
};

}  // namespace gptq_marlin

#endif


================================================
FILE: kt-kernel/cuda/gptq_marlin/ops.h
================================================
/**
 * @Description  :
 * @Author       : Azure
 * @Date         : 2024-07-22 09:27:55
 * @Version      : 1.0.0
 * @LastEditors  : Azure
 * @LastEditTime : 2024-07-26 08:35:00
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once

#include <torch/extension.h>
#include <torch/library.h>
#include <torch/torch.h>

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& b_scales,
                               torch::Tensor& g_idx, torch::Tensor& perm, torch::Tensor& workspace, int64_t num_bits,
                               int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full);

// torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
//                                  int64_t size_k, int64_t size_n,
//                                  int64_t num_bits);

================================================
FILE: kt-kernel/cuda/moe/moe_topk_softmax_kernels.cu
================================================
// Adapt from https://github.com/vllm-project/vllm/blob/v0.7.3/csrc/moe/topk_softmax_kernels.cu
// which is originally adapted from
// https://github.com/NVIDIA/TensorRT-LLM/blob/v0.7.1/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/all.h>

#ifndef USE_ROCM
#include <cub/cub.cuh>
#include <cub/util_type.cuh>
#else
#include <hipcub/hipcub.hpp>
#include <hipcub/util_type.hpp>
#endif

#include "utils.h"

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/// Aligned array type
template <
    typename T,
    /// Number of elements in the array
    int N,
    /// Alignment requirement in bytes
    int Alignment = sizeof(T) * N>
class alignas(Alignment) AlignedArray {
  float data[N];
};

// ====================== Softmax things ===============================
// We have our own implementation of softmax here so we can support transposing the output
// in the softmax kernel when we extend this module to support expert-choice routing.
template <int TPB>
__launch_bounds__(TPB) __global__
    void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols) {
  using BlockReduce = cub::BlockReduce<float, TPB>;
  __shared__ typename BlockReduce::TempStorage tmpStorage;

  __shared__ float normalizing_factor;
  __shared__ float float_max;

  const int thread_row_offset = blockIdx.x * num_cols;

  cub::Sum sum;
  float threadData(-FLT_MAX);

  // Don't touch finished rows.
  if ((finished != nullptr) && finished[blockIdx.x]) {
    return;
  }

  for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
    const int idx = thread_row_offset + ii;
    threadData = max(static_cast<float>(input[idx]), threadData);
  }

  const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());

  if (threadIdx.x == 0) {
    float_max = maxElem;
  }
  __syncthreads();

  threadData = 0;

  for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
    const int idx = thread_row_offset + ii;
    threadData += exp((static_cast<float>(input[idx]) - float_max));
  }

  const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);

  if (threadIdx.x == 0) {
    normalizing_factor = 1.f / Z;
  }
  __syncthreads();

  for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
    const int idx = thread_row_offset + ii;
    const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
    output[idx] = val;
  }
}

template <int TPB>
__launch_bounds__(TPB) __global__ void moeTopK(
    const float* inputs_after_softmax,
    const bool* finished,
    float* output,
    int* indices,
    int* source_rows,
    const int num_experts,
    const int k,
    const int start_expert,
    const int end_expert) {
  using cub_kvp = cub::KeyValuePair<int, float>;
  using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
  __shared__ typename BlockReduce::TempStorage tmpStorage;

  cub_kvp thread_kvp;
  cub::ArgMax arg_max;

  const int num_rows = gridDim.x;
  const int block_row = blockIdx.x;

  const bool row_is_active = finished ? !finished[block_row] : true;
  const int thread_read_offset = blockIdx.x * num_experts;
  for (int k_idx = 0; k_idx < k; ++k_idx) {
    thread_kvp.key = 0;
    thread_kvp.value = -1.f;  // This is OK because inputs are probabilities

    cub_kvp inp_kvp;
    for (int expert = threadIdx.x; expert < num_experts; expert += TPB) {
      const int idx = thread_read_offset + expert;
      inp_kvp.key = expert;
      inp_kvp.value = inputs_after_softmax[idx];

      for (int prior_k = 0; prior_k < k_idx; ++prior_k) {
        const int prior_winning_expert = indices[k * block_row + prior_k];

        if (prior_winning_expert == expert) {
          inp_kvp = thread_kvp;
        }
      }

      thread_kvp = arg_max(inp_kvp, thread_kvp);
    }

    const cub_kvp result_kvp = BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max);
    if (threadIdx.x == 0) {
      // Ignore experts the node isn't responsible for with expert parallelism
      const int expert = result_kvp.key;
      const bool node_uses_expert = expert >= start_expert && expert < end_expert;
      const bool should_process_row = row_is_active && node_uses_expert;

      const int idx = k * block_row + k_idx;
      output[idx] = result_kvp.value;
      indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
      assert(indices[idx] >= 0);
      source_rows[idx] = k_idx * num_rows + block_row;
    }
    __syncthreads();
  }
}

// ====================== TopK softmax things ===============================

/*
  A Top-K gating softmax written to exploit when the number of experts in the MoE layers
  are a small power of 2. This allows us to cleanly share the rows among the threads in
  a single warp and eliminate communication between warps (so no need to use shared mem).

  It fuses the softmax, max and argmax into a single kernel.

  Limitations:
  1) This implementation is intended for when the number of experts is a small power of 2.
  2) This implementation assumes k is small, but will work for any k.
*/

template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG>
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ void topkGatingSoftmax(
    const float* input,
    const bool* finished,
    float* output,
    const int num_rows,
    int* indices,
    int* source_rows,
    const int k,
    const int start_expert,
    const int end_expert) {
  // We begin by enforcing compile time assertions and setting up compile time constants.
  static_assert(VPT == (VPT & -VPT), "VPT must be power of 2");
  static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2");
  static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
  static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");

  // Number of bytes each thread pulls in per load
  static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
  static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
  static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
  static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;

  // Restrictions based on previous section.
  static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
  static_assert(WARP_SIZE % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
  static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW), "THREADS_PER_ROW must be power of 2");
  static_assert(THREADS_PER_ROW <= WARP_SIZE, "THREADS_PER_ROW can be at most warp size");

  // We have NUM_EXPERTS elements per row. We specialize for small #experts
  static constexpr int ELTS_PER_WARP = WARP_SIZE * VPT;
  static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW;
  static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP;

  // Restrictions for previous section.
  static_assert(ELTS_PER_WARP % ELTS_PER_ROW == 0, "The elts per row must cleanly divide the total elt per warp");

  // ===================== From this point, we finally start computing run-time variables. ========================

  // Compute CTA and warp rows. We pack multiple rows into a single warp, and a block contains WARPS_PER_CTA warps.
  // This, each block processes a chunk of rows. We start by computing the start row for each block.
  const int cta_base_row = blockIdx.x * ROWS_PER_CTA;

  // Now, using the base row per thread block, we compute the base row per warp.
  const int warp_base_row = cta_base_row + threadIdx.y * ROWS_PER_WARP;

  // The threads in a warp are split into sub-groups that will work on a row.
  // We compute row offset for each thread sub-group
  const int thread_row_in_warp = threadIdx.x / THREADS_PER_ROW;
  const int thread_row = warp_base_row + thread_row_in_warp;

  // Threads with indices out of bounds should early exit here.
  if (thread_row >= num_rows) {
    return;
  }
  const bool row_is_active = finished ? !finished[thread_row] : true;

  // We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
  // row it will read.
  const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;

  // Now, we compute the group each thread belong to in order to determine the first column to start loads.
  const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
  const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
  const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;

  // Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
  // this can support all powers of 2 up to 16.
  // NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
  // We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
  using AccessType = AlignedArray<float, ELTS_PER_LDG>;

  // Finally, we pull in the data from global mem
  float row_chunk[VPT];
  AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
  const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
#pragma unroll
  for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
    row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
  }

  // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
  // convert to float afterwards for the exp + sum reduction.
  float thread_max = row_chunk[0];
#pragma unroll
  for (int ii = 1; ii < VPT; ++ii) {
    thread_max = max(thread_max, row_chunk[ii]);
  }

// Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce.
#pragma unroll
  for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
    thread_max = max(thread_max, SGLANG_SHFL_XOR_SYNC_WIDTH(0xffffffff, thread_max, mask, THREADS_PER_ROW));
  }

  // From this point, thread max in all the threads have the max within the row.
  // Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum.
  float row_sum = 0;
#pragma unroll
  for (int ii = 0; ii < VPT; ++ii) {
    row_chunk[ii] = expf(row_chunk[ii] - thread_max);
    row_sum += row_chunk[ii];
  }

// Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a bufferfly pattern.
#pragma unroll
  for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
    row_sum += SGLANG_SHFL_XOR_SYNC_WIDTH(0xffffffff, row_sum, mask, THREADS_PER_ROW);
  }

  // From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
  // respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to
  // compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row.
  // However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the
  // argmax after computing the softmax.
  const float reciprocal_row_sum = 1.f / row_sum;

#pragma unroll
  for (int ii = 0; ii < VPT; ++ii) {
    row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum;
  }

  // Now, softmax_res contains the softmax of the row chunk. Now, I want to find the topk elements in each row, along
  // with the max index.
  int start_col = first_elt_read_by_thread;
  static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;

  for (int k_idx = 0; k_idx < k; ++k_idx) {
    // First, each thread does the local argmax
    float max_val = row_chunk[0];
    int expert = start_col;
#pragma unroll
    for (int ldg = 0, col = start_col; ldg < LDG_PER_THREAD; ++ldg, col += COLS_PER_GROUP_LDG) {
#pragma unroll
      for (int ii = 0; ii < ELTS_PER_LDG; ++ii) {
        float val = row_chunk[ldg * ELTS_PER_LDG + ii];

        // No check on the experts here since columns with the smallest index are processed first and only
        // updated if > (not >=)
        if (val > max_val) {
          max_val = val;
          expert = col + ii;
        }
      }
    }

// Now, we perform the argmax reduce. We use the butterfly pattern so threads reach consensus about the max.
// This will be useful for K > 1 so that the threads can agree on "who" had the max value. That thread can
// then blank out their max with -inf and the warp can run more iterations...
#pragma unroll
    for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
      float other_max = SGLANG_SHFL_XOR_SYNC_WIDTH(0xffffffff, max_val, mask, THREADS_PER_ROW);
      int other_expert = SGLANG_SHFL_XOR_SYNC_WIDTH(0xffffffff, expert, mask, THREADS_PER_ROW);

      // We want lower indices to "win" in every thread so we break ties this way
      if (other_max > max_val || (other_max == max_val && other_expert < expert)) {
        max_val = other_max;
        expert = other_expert;
      }
    }

    // Write the max for this k iteration to global memory.
    if (thread_group_idx == 0) {
      // Add a guard to ignore experts not included by this node
      const bool node_uses_expert = expert >= start_expert && expert < end_expert;
      const bool should_process_row = row_is_active && node_uses_expert;

      // The lead thread from each sub-group will write out the final results to global memory. (This will be a
      // single) thread per row of the input/output matrices.
      const int idx = k * thread_row + k_idx;
      output[idx] = max_val;
      indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
      source_rows[idx] = k_idx * num_rows + thread_row;
    }

    // Finally, we clear the value in the thread with the current max if there is another iteration to run.
    if (k_idx + 1 < k) {
      const int ldg_group_for_expert = expert / COLS_PER_GROUP_LDG;
      const int thread_to_clear_in_group = (expert / ELTS_PER_LDG) % THREADS_PER_ROW;

      // Only the thread in the group which produced the max will reset the "winning" value to -inf.
      if (thread_group_idx == thread_to_clear_in_group) {
        const int offset_for_expert = expert % ELTS_PER_LDG;
        // Safe to set to any negative value since row_chunk values must be between 0 and 1.
        row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f;
      }
    }
  }
}

namespace detail {
// Constructs some constants needed to partition the work across threads at compile time.
template <int EXPERTS, int BYTES_PER_LDG>
struct TopkConstants {
  static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
  static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, "");
  static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
  static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
  static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
  static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
};
}  // namespace detail

template <int EXPERTS, int WARPS_PER_TB>
void topkGatingSoftmaxLauncherHelper(
    const float* input,
    const bool* finished,
    float* output,
    int* indices,
    int* source_row,
    const int num_rows,
    const int k,
    const int start_expert,
    const int end_expert,
    cudaStream_t stream) {
  static constexpr std::size_t MAX_BYTES_PER_LDG = 16;

  static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
  using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG>;
  static constexpr int VPT = Constants::VPT;
  static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
  const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
  const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

  dim3 block_dim(WARP_SIZE, WARPS_PER_TB);
  topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG><<<num_blocks, block_dim, 0, stream>>>(
      input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
}

#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB)             \
  topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>( \
      gating_output,                                          \
      nullptr,                                                \
      topk_weights,                                           \
      topk_indices,                                           \
      token_expert_indices,                                   \
      num_tokens,                                             \
      topk,                                                   \
      0,                                                      \
      num_experts,                                            \
      stream);

void topkGatingSoftmaxKernelLauncher(
    const float* gating_output,
    float* topk_weights,
    int* topk_indices,
    int* token_expert_indices,
    float* softmax_workspace,
    const int num_tokens,
    const int num_experts,
    const int topk,
    cudaStream_t stream) {
  static constexpr int WARPS_PER_TB = 4;
  switch (num_experts) {
    case 1:
      LAUNCH_SOFTMAX(1, WARPS_PER_TB);
      break;
    case 2:
      LAUNCH_SOFTMAX(2, WARPS_PER_TB);
      break;
    case 4:
      LAUNCH_SOFTMAX(4, WARPS_PER_TB);
      break;
    case 8:
      LAUNCH_SOFTMAX(8, WARPS_PER_TB);
      break;
    case 16:
      LAUNCH_SOFTMAX(16, WARPS_PER_TB);
      break;
    case 32:
      LAUNCH_SOFTMAX(32, WARPS_PER_TB);
      break;
    case 64:
      LAUNCH_SOFTMAX(64, WARPS_PER_TB);
      break;
    case 128:
      LAUNCH_SOFTMAX(128, WARPS_PER_TB);
      break;
    case 256:
      LAUNCH_SOFTMAX(256, WARPS_PER_TB);
      break;
    default: {
      TORCH_CHECK(
          softmax_workspace != nullptr,
          "softmax_workspace must be provided for num_experts that are not a power of 2.");
      static constexpr int TPB = 256;
      moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(gating_output, nullptr, softmax_workspace, num_experts);
      moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
          softmax_workspace,
          nullptr,
          topk_weights,
          topk_indices,
          token_expert_indices,
          num_experts,
          topk,
          0,
          num_experts);
    }
  }
}

void topk_softmax(
    torch::Tensor& topk_weights,          // [num_tokens, topk]
    torch::Tensor& topk_indices,          // [num_tokens, topk]
    torch::Tensor& token_expert_indices,  // [num_tokens, topk]
    torch::Tensor& gating_output)         // [num_tokens, num_experts]
{
  const int num_experts = gating_output.size(-1);
  const int num_tokens = gating_output.numel() / num_experts;
  const int topk = topk_weights.size(-1);

  const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
  const bool needs_workspace = !is_pow_2 || num_experts > 256;
  const int64_t workspace_size = needs_workspace ? num_tokens * num_experts : 0;

  const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
  topkGatingSoftmaxKernelLauncher(
      gating_output.data_ptr<float>(),
      topk_weights.data_ptr<float>(),
      topk_indices.data_ptr<int>(),
      token_expert_indices.data_ptr<int>(),
      softmax_workspace.data_ptr<float>(),
      num_tokens,
      num_experts,
      topk,
      stream);
}


================================================
FILE: kt-kernel/cuda/moe/ops.h
================================================
#pragma once

#include <torch/extension.h>
#include <torch/library.h>
#include <torch/torch.h>

void topk_softmax(torch::Tensor& topk_weights,          // [num_tokens, topk]
                  torch::Tensor& topk_indices,          // [num_tokens, topk]
                  torch::Tensor& token_expert_indices,  // [num_tokens, topk]
                  torch::Tensor& gating_output);

================================================
FILE: kt-kernel/cuda/moe/utils.h
================================================
#pragma once

#include <ATen/Tensor.h>
#include <cuda_runtime.h>
#include <torch/all.h>

#include <sstream>

#ifndef USE_ROCM
// Adapt from FlashInfer
#ifdef FLASHINFER_ENABLE_F16
#define _DISPATCH_CASE_F16(c_type, ...) \
  case at::ScalarType::Half: {          \
    using c_type = nv_half;             \
    return __VA_ARGS__();               \
  }
#else
#define _DISPATCH_CASE_F16(c_type, ...)
#endif

#ifdef FLASHINFER_ENABLE_BF16
#define _DISPATCH_CASE_BF16(c_type, ...) \
  case at::ScalarType::BFloat16: {       \
    using c_type = nv_bfloat16;          \
    return __VA_ARGS__();                \
  }
#else
#define _DISPATCH_CASE_BF16(c_type, ...)
#endif

#ifdef FLASHINFER_ENABLE_FP8_E4M3
#define _DISPATCH_CASE_FP8_E4M3(c_type, ...) \
  case at::ScalarType::Float8_e4m3fn: {      \
    using c_type = __nv_fp8_e4m3;            \
    return __VA_ARGS__();                    \
  }
#else
#define _DISPATCH_CASE_FP8_E4M3(c_type, ...)
#endif

#ifdef FLASHINFER_ENABLE_FP8_E5M2
#define _DISPATCH_CASE_FP8_E5M2(c_type, ...) \
  case at::ScalarType::Float8_e5m2: {        \
    using c_type = __nv_fp8_e5m2;            \
    return __VA_ARGS__();                    \
  }
#else
#define _DISPATCH_CASE_FP8_E5M2(c_type, ...)
#endif

#define DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(pytorch_dtype, c_type, ...)                 \
  [&]() -> bool {                                                                        \
    switch (pytorch_dtype) {                                                             \
      _DISPATCH_CASE_F16(c_type, __VA_ARGS__)                                            \
      _DISPATCH_CASE_BF16(c_type, __VA_ARGS__)                                           \
      default:                                                                           \
        std::ostringstream oss;                                                          \
        oss << __PRETTY_FUNCTION__ << " failed to dispatch data type " << pytorch_dtype; \
        TORCH_CHECK(false, oss.str());                                                   \
        return false;                                                                    \
    }                                                                                    \
  }()

#define DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP8(pytorch_dtype, c_type, ...)                      \
  [&]() -> bool {                                                                            \
    switch (pytorch_dtype) {                                                                 \
      _DISPATCH_CASE_FP8_E4M3(c_type, __VA_ARGS__)                                           \
      _DISPATCH_CASE_FP8_E5M2(c_type, __VA_ARGS__)                                           \
      default:                                                                               \
        std::ostringstream oss;                                                              \
        oss << __PRETTY_FUNCTION__ << " failed to dispatch fp8 data type " << pytorch_dtype; \
        TORCH_CHECK(false, oss.str());                                                       \
        return false;                                                                        \
    }                                                                                        \
  }()

#define DISPATCH_PYTORCH_DTYPE_TO_CTYPE(pytorch_dtype, c_type, ...)                      \
  [&]() -> bool {                                                                        \
    switch (pytorch_dtype) {                                                             \
      _DISPATCH_CASE_F16(c_type, __VA_ARGS__)                                            \
      _DISPATCH_CASE_BF16(c_type, __VA_ARGS__)                                           \
      _DISPATCH_CASE_FP8_E4M3(c_type, __VA_ARGS__)                                       \
      _DISPATCH_CASE_FP8_E5M2(c_type, __VA_ARGS__)                                       \
      default:                                                                           \
        std::ostringstream oss;                                                          \
        oss << __PRETTY_FUNCTION__ << " failed to dispatch data type " << pytorch_dtype; \
        TORCH_CHECK(false, oss.str());                                                   \
        return false;                                                                    \
    }                                                                                    \
  }()

#define _DISPATCH_SWITCH(var_name, cond, ...)                                           \
  [&]() -> bool {                                                                       \
    switch (cond) {                                                                     \
      __VA_ARGS__                                                                       \
      default:                                                                          \
        std::ostringstream oss;                                                         \
        oss << __PRETTY_FUNCTION__ << " failed to dispatch " var_name " " << int(cond); \
        TORCH_CHECK(false, oss.str());                                                  \
        return false;                                                                   \
    }                                                                                   \
  }()

#define _DISPATCH_SWITCH_U16x2(var1_name, var2_name, cond1, cond2, ...)                                             \
  [&]() -> bool {                                                                                                   \
    switch (pack_u16(cond1, cond2)) {                                                                               \
      __VA_ARGS__                                                                                                   \
      default:                                                                                                      \
        std::ostringstream oss;                                                                                     \
        oss << __PRETTY_FUNCTION__ << " failed to dispatch (" var1_name ", " var2_name "): (" << int(cond1) << ", " \
            << int(cond2) << ")";                                                                                   \
        TORCH_CHECK(false, oss.str());                                                                              \
        return false;                                                                                               \
    }                                                                                                               \
  }()

#define _DISPATCH_CASE(case_expr, case_var, ...) \
  case case_expr: {                              \
    constexpr auto case_var = case_expr;         \
    return __VA_ARGS__();                        \
  }

#define _DISPATCH_CASE_U16x2(case_expr1, case_expr2, case_var1, case_var2, ...) \
  case pack_u16(case_expr1, case_expr2): {                                      \
    constexpr auto case_var1 = case_expr1;                                      \
    constexpr auto case_var2 = case_expr2;                                      \
    return __VA_ARGS__();                                                       \
  }

#define DISPATCH_BOOL(expr, const_expr, ...) \
  [&]() -> bool {                            \
    if (expr) {                              \
      constexpr bool const_expr = true;      \
      return __VA_ARGS__();                  \
    } else {                                 \
      constexpr bool const_expr = false;     \
      return __VA_ARGS__();                  \
    }                                        \
  }()

inline void check_shape(const at::Tensor& a, const at::Tensor& b, const char* a_name, const char* b_name) {
  TORCH_CHECK(a.dim() == b.dim(), a_name, ".dim() != ", b_name, ".dim(). ", a.dim(), " vs ", b.dim());
  for (int i = 0; i < a.dim(); ++i) {
    TORCH_CHECK(a.size(i) == b.size(i), a_name, ".size(", i, ") != ", b_name, ".size(", i, ")");
  }
}

inline constexpr uint32_t pack_u16(uint16_t a, uint16_t b) { return (uint32_t(a) << 16) | uint32_t(b); }

#define CHECK_GQA_HEAD_DIVISIBLE(num_qo_heads, num_kv_heads)                                                           \
  TORCH_CHECK(num_qo_heads % num_kv_heads == 0, "num_qo_heads(", num_qo_heads, ") must be divisible by num_kv_heads(", \
              num_kv_heads, ")")

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")

#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_LAST_DIM_CONTIGUOUS(x) \
  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")

#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)
#define CHECK_LAST_DIM_CONTIGUOUS_INPUT(x) \
  CHECK_CUDA(x);                           \
  CHECK_LAST_DIM_CONTIGUOUS(x)

#define CHECK_DIM(d, x) TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor")

#define CHECK_SHAPE(a, b) check_shape(a, b, #a, #b)

#define CHECK_EQ(a, b) TORCH_CHECK((a) == (b), "CHECK_EQ(" #a ", " #b ") failed. ", a, " vs ", b)

#define CHECK_GE(a, b) TORCH_CHECK((a) >= (b), "CHECK_GE(" #a ", " #b ") failed. ", a, " vs ", b)

inline bool is_float8_tensor(const at::Tensor& tensor) {
  return tensor.scalar_type() == at::ScalarType::Float8_e4m3fn || tensor.scalar_type() == at::ScalarType::Float8_e5m2;
}
#endif

struct cuda_error : public std::runtime_error {
  /**
   * @brief Constructs a `cuda_error` object with the given `message`.
   *
   * @param message The error char array used to construct `cuda_error`
   */
  cuda_error(const char* message) : std::runtime_error(message) {}
  /**
   * @brief Constructs a `cuda_error` object with the given `message` string.
   *
   * @param message The `std::string` used to construct `cuda_error`
   */
  cuda_error(std::string const& message) : cuda_error{message.c_str()} {}
};

#define CHECK_CUDA_SUCCESS(cmd)                                         \
  do {                                                                  \
    cudaError_t e = cmd;                                                \
    if (e != cudaSuccess) {                                             \
      std::stringstream _message;                                       \
      auto s = cudaGetErrorString(e);                                   \
      _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__; \
      throw cuda_error(_message.str());                                 \
    }                                                                   \
  } while (0)

#define CHECK_IS_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_IS_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_CUDA_INPUT(x) \
  CHECK_IS_CUDA(x);         \
  CHECK_IS_CONTIGUOUS(x)

inline int getSMVersion() {
  int device{-1};
  CHECK_CUDA_SUCCESS(cudaGetDevice(&device));
  int sm_major = 0;
  int sm_minor = 0;
  CHECK_CUDA_SUCCESS(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device));
  CHECK_CUDA_SUCCESS(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));
  return sm_major * 10 + sm_minor;
}

// SGLANG_SHFL_XOR_* adapted from https://github.com/vllm-project/vllm/blob/v0.7.3/csrc/cuda_compat.h#L19-L28
#ifndef USE_ROCM
#define SGLANG_SHFL_XOR_SYNC(mask, var, lane_mask) __shfl_xor_sync((mask), (var), (lane_mask))
#define SGLANG_SHFL_XOR_SYNC_WIDTH(mask, var, lane_mask, width) __shfl_xor_sync((mask), (var), (lane_mask), (width))
#else
#define SGLANG_SHFL_XOR_SYNC(mask, var, lane_mask) __shfl_xor((var), (lane_mask))
#define SGLANG_SHFL_XOR_SYNC_WIDTH(mask, var, lane_mask, width) __shfl_xor((var), (lane_mask), (width))
#endif

#ifndef USE_ROCM
#define DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(pytorch_dtype, c_type, ...)           \
  [&]() -> bool {                                                                        \
    switch (pytorch_dtype) {                                                             \
      case at::ScalarType::Float: {                                                      \
        using c_type = float;                                                            \
        return __VA_ARGS__();                                                            \
      }                                                                                  \
        _DISPATCH_CASE_F16(c_type, __VA_ARGS__)                                          \
        _DISPATCH_CASE_BF16(c_type, __VA_ARGS__)                                         \
      default:                                                                           \
        std::ostringstream oss;                                                          \
        oss << __PRETTY_FUNCTION__ << " failed to dispatch data type " << pytorch_dtype; \
        TORCH_CHECK(false, oss.str());                                                   \
        return false;                                                                    \
    }                                                                                    \
  }()
#endif

#define DISPATCH_CASE_INTEGRAL_TYPES(...)              \
  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)  \
  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)  \
  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \
  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)   \
  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)

#define DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))

#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
#define WARP_SIZE 32

#ifndef USE_ROCM
#include <c10/util/Float8_e4m3fn.h>
using FP8_TYPE = c10::Float8_e4m3fn;
C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = std::numeric_limits<FP8_TYPE>::max();
#else
#include <c10/util/Float8_e4m3fnuz.h>

using FP8_TYPE = c10::Float8_e4m3fnuz;
constexpr auto FP8_E4M3_MAX = 224.0f;
#endif

#ifndef USE_ROCM
__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
  float old;
  old = (value >= 0) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value)))
                     : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
  return old;
}

__device__ __forceinline__ float warpReduceMax(float max_value) {
  max_value = fmaxf(max_value, SGLANG_SHFL_XOR_SYNC(0xffffffff, max_value, 16));
  max_value = fmaxf(max_value, SGLANG_SHFL_XOR_SYNC(0xffffffff, max_value, 8));
  max_value = fmaxf(max_value, SGLANG_SHFL_XOR_SYNC(0xffffffff, max_value, 4));
  max_value = fmaxf(max_value, SGLANG_SHFL_XOR_SYNC(0xffffffff, max_value, 2));
  max_value = fmaxf(max_value, SGLANG_SHFL_XOR_SYNC(0xffffffff, max_value, 1));
  return max_value;
}

__device__ __forceinline__ float blockReduceMax(float max_value) {
  static __shared__ float warpLevelMaxs[WARP_SIZE];
  const int laneId = threadIdx.x % WARP_SIZE;
  const int warpId = threadIdx.x / WARP_SIZE;

  max_value = warpReduceMax(max_value);

  if (laneId == 0) warpLevelMaxs[warpId] = max_value;
  __syncthreads();

  max_value = (threadIdx.x < blockDim.x / WARP_SIZE) ? warpLevelMaxs[laneId] : 0;
  if (warpId == 0) max_value = warpReduceMax(max_value);

  return max_value;
}
#endif

// Pads to a multiple of `alignment` rows.
inline torch::Tensor pad_tensor(const torch::Tensor& tensor, int64_t alignment = 4, bool is_column_major = false) {
  int64_t rows = tensor.size(0);
  int64_t cols = tensor.size(1);
  int64_t pad_rows = (alignment - (rows % alignment)) % alignment;  // Compute padding size

  if (pad_rows == 0) {
    return tensor;  // Already aligned
  }

  torch::Tensor padding = torch::zeros({pad_rows, cols}, tensor.options());
  torch::Tensor tensor_padded = torch::cat({tensor, padding}, 0);  // Pad along rows

  // Ensure column-major layout
  if (is_column_major) {
    return tensor_padded.t().contiguous().t();
  }
  return tensor_padded;
}


================================================
FILE: kt-kernel/cuda/setup.py
================================================

from setuptools import setup, Extension
from torch.utils import cpp_extension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
    name='KTransformersOps',
    ext_modules=[
        CUDAExtension(
            'KTransformersOps', [
                'custom_gguf/dequant.cu',
                'binding.cpp',
                'gptq_marlin/gptq_marlin.cu',
                'moe/moe_topk_softmax_kernels.cu',
                # 'gptq_marlin_repack.cu',
            ],
            extra_compile_args={
                'cxx': ['-O3'],
                'nvcc': [
                    '-O3',
                    '--use_fast_math',
                    '-Xcompiler', '-fPIC',
                ]
            },
        )
    ],
    cmdclass={'build_ext': BuildExtension}
)

================================================
FILE: kt-kernel/cuda/test_dequant.py
================================================
import os
import sys
sys.path.insert(0,"/home/zbx/ktransformers")
from ktransformers.util.custom_loader import GGUFLoader
import torch

gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
gguf_loader_2 = GGUFLoader("/mnt/data/chenht/model/gguf_for_ktransformers/DeepSeek-V3-bf16/")

torch.set_default_dtype(torch.bfloat16)

tensor_1 = gguf_loader_1.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda")
tensor_2 = gguf_loader_2.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda")

print(tensor_1[0, -64:])
print(tensor_2[0, -64:])

================================================
FILE: kt-kernel/demo/.gitignore
================================================
test.out
fp16-test
test.out

================================================
FILE: kt-kernel/demo/Makefile
================================================
# CFLAGS += $(shell pkg-config --cflags hwloc)
# CFLAGS += -march=armv8.2-a+fp16+dotprod+sve+bf16 -I/home/test/kt-code/HPCKit_25.0.0_Linux-aarch64/package/KunpengHPCKit-kml.25.0.0/include
# CFLAGS += -march=armv8.2-a+fp16+dotprod+sve+bf16 -I/home/test/kt-code/HPCKit_25.0.0_Linux-aarch64/package/KunpengHPCKit-kml.25.0.0/include
CFLAGS += -O3
CFLAGS += -I/usr/local/include/blis/ -fopenmp
LDLIBS += -L/usr/local/lib -lblis
# LDLIBS += $(shell pkg-config --libs hwloc) -lkml_rt

CXX = /usr/bin/g++

# i8_cal: i8_cal.cpp
# $(CXX) i8_cal.cpp $(CFLAGS) -o i8_cal $(LDLIBS)
# run: i8_cal
# ./i8_cal

simple_test_build: simple_test.cpp
	rm -f simple_test
	BLAS_NUM_THREADS=1 $(CXX) simple_test.cpp $(CFLAGS) -o simple_test $(LDLIBS)

simple_aocl_build: build simple_test_aocl.cpp
	$(CXX) simple_test_aocl.cpp $(CFLAGS) -o build/simple_test_aocl $(LDLIBS)

fp16_test_build: fp16-test.cpp
	rm -f fp16-test
	$(CXX) fp16-test.cpp $(CFLAGS) -o fp16-test $(LDLIBS)
bf16_test_build: bf16-test.cpp
	rm -f bf16-test
	$(CXX) bf16-test.cpp $(CFLAGS) -o bf16-test $(LDLIBS)
build: build
	mkdir -p build
bandwidth_build: bench_reorder_bandwidth.cpp
	$(CXX) bench_reorder_bandwidth.cpp $(CFLAGS) -o build/bench_reorder_bandwidth $(LDLIBS)
run: simple_aocl_build
	LD_LIBRARY_PATH=/usr/local/lib:$$LD_LIBRARY_PATH  ./build/simple_test_aocl
run_bandwidth: bandwidth_build
	LD_LIBRARY_PATH=/usr/local/lib:$$LD_LIBRARY_PATH  ./build/bench_reorder_bandwidth

================================================
FILE: kt-kernel/demo/bench_reorder_bandwidth.cpp
================================================
#include <blis.h>

#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

namespace {
constexpr int kM = 1;
constexpr int kK = 7168;
constexpr int kN = 512;
constexpr int kIters = 10000;

void fill_random(int8_t* ptr, size_t count) {
  std::srand(47);
  for (size_t i = 0; i < count; ++i) {
    ptr[i] = static_cast<int8_t>(std::rand() % 30);
  }
}

void fill_zero(int32_t* ptr, size_t count) { std::memset(ptr, 0, count * sizeof(int32_t)); }

bool verify(const int8_t* a, const int8_t* b, const int32_t* c) {
  for (int m = 0; m < kM; ++m) {
    for (int n = 0; n < kN; ++n) {
      int32_t ref = 0;
      for (int k = 0; k < kK; ++k) {
        ref += static_cast<int32_t>(a[m * kK + k]) * static_cast<int32_t>(b[n * kK + k]);
      }
      if (ref != c[m * kN + n]) {
        std::printf("Mismatch at (%d, %d): got %d, expect %d\n", m, n, c[m * kN + n], ref);
        return false;
      }
    }
  }
  return true;
}
}  // namespace

int main() {
  int8_t* a = static_cast<int8_t*>(std::aligned_alloc(64, kM * kK));
  int8_t* b = static_cast<int8_t*>(std::aligned_alloc(64, kK * kN));
  int32_t* c = static_cast<int32_t*>(std::aligned_alloc(64, kM * kN * sizeof(int32_t)));
  int32_t* c_tmp = static_cast<int32_t*>(std::aligned_alloc(64, kM * kN * sizeof(int32_t)));

  if (!a || !b || !c || !c_tmp) {
    std::fprintf(stderr, "Allocation failed.\n");
    std::free(a);
    std::free(b);
    std::free(c);
    std::free(c_tmp);
    return EXIT_FAILURE;
  }

  fill_random(a, kM * kK);
  fill_random(b, kK * kN);
  fill_zero(c, kM * kN);
  fill_zero(c_tmp, kM * kN);

  const dim_t reorder_size = aocl_get_reorder_buf_size_s8s8s32os32('r', 't', 'B', kK, kN);
  int8_t* b_reordered = static_cast<int8_t*>(std::aligned_alloc(64, reorder_size));
  if (!b_reordered) {
    std::fprintf(stderr, "Reorder buffer allocation failed.\n");
    std::free(a);
    std::free(b);
    std::free(c);
    return EXIT_FAILURE;
  }

  aocl_reorder_s8s8s32os32('r', 't', 'B', b, b_reordered, kK, kN, kK);

  // Warm-up GEMM to load kernels.
  aocl_gemm_s8s8s32os32('r', 'n', 't', kM, kN, kK, 1, a, kK, 'n', b_reordered, kK, 'r', 0, c_tmp, kN, nullptr);
  fill_zero(c, kM * kN);

  const double bytes_per_mul = static_cast<double>(kM) * kK * sizeof(int8_t) +  // A matrix read
                               static_cast<double>(kK) * kN * sizeof(int8_t);   // original B read

  auto start = std::chrono::high_resolution_clock::now();
  for (int iter = 0; iter < kIters; ++iter) {
    aocl_gemm_s8s8s32os32('r', 'n', 't', kM, kN, kK, 1, a, kK, 'n', b_reordered, kK, 'r', 0, c, kN, nullptr);
  }
  auto end = std::chrono::high_resolution_clock::now();

  const double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
  const double total_bytes = bytes_per_mul * kIters;
  const double bandwidth_gbps = total_bytes / elapsed_seconds / 1e9;
  const double ops_per_mul = static_cast<double>(kM) * kN * kK * 2.0;
  const double tflops = (ops_per_mul * kIters) / elapsed_seconds / 1e12;

  std::printf("Reorder buffer size: %ld bytes\n", static_cast<long>(reorder_size));
  std::printf("Iterations: %d\n", kIters);
  std::printf("Elapsed time: %.4f s\n", elapsed_seconds);
  std::printf("Effective bandwidth: %.2f GB/s\n", bandwidth_gbps);
  std::printf("Int8 GEMM throughput: %.2f TOPS\n", tflops * 1e3);

  if (!verify(a, b, c)) {
    std::fprintf(stderr, "Verification failed.\n");
  } else {
    std::puts("Verification passed.");
  }

  std::free(a);
  std::free(b);
  std::free(b_reordered);
  std::free(c);
  return 0;
}


================================================
FILE: kt-kernel/demo/bf16-test.cpp
================================================
#define BGEMM

#include <arm_sve.h>
#include <dlfcn.h>
#include <kblas.h>
#include <unistd.h>

#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <random>

int main() {
  // 矩阵维度 M 是 1024，K 是 1024，N 是 1024（行主序）
  int M = 512;         // 行主序时，A 的行长度为 K
  const int K = 7168;  // B 的行长度为 N
  const int N = 512;   // C 的行长度为 N
  const int iter = 1;  // 迭代次数
  // int M = 10;        // 行主序时，A 的行长度为 K
  // const int K = 10; // B 的行长度为 N
  // const int N = 10;  // C 的行长度为 N

  // 分配矩阵内存
  bfloat16_t* A = new bfloat16_t[M * K];
  bfloat16_t* B = new bfloat16_t[K * N];
  bfloat16_t* C = new bfloat16_t[M * N];
  srand(123);

  // 初始化随机种子
  // std::mt19937 rng(124);
  // std::uniform_real_distribution <float> dist(0.0, 1.0);

  for (int j = 0; j < M * K; j++) {
    A[j] = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
    // A[j] = dist(rng);
    // A[j] = j;
  }
  for (int j = 0; j < K * N; j++) {
    B[j] = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
    // B[j] = dist(rng);
    // B[j] = j;
  }
  for (int j = 0; j < M * N; j++) {
    C[j] = 0.0;
  }

  // 设置 cblas_gemm_s8u8s32 的参数
  float alpha = 1.0f;
  float beta = 0.0f;

  // 打印矩阵 A、B
  // printf("A=\n");
  // for (int i = 0; i < M; i++) {
  //   for (int j = 0; j < K; j++) {
  //     printf("%f ", A[i * K + j]);
  //   }
  //   printf("\n");
  // }
  // printf("B=\n");
  // for (int i = 0; i < N; i++) {
  //   for (int j = 0; j < K; j++) {
  //     printf("%f ", B[i * K + j]);
  //   }
  //   printf("\n");
  // }
  // cblas_shgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha, A, K, B, N, beta, C, N);
  // // 打印结果
  // printf("C=\n");
  // for (int i = 0; i < M; i++) {
  //   for (int j = 0; j < N; j++) {
  //     printf("%f ", C[i * N + j]);
  //   }
  //   printf("\n");
  // }
  // return 0;

  auto fout = fopen("test.out", "w");
  int stride = 16;
  for (int n = stride; n <= N; n += stride)
    for (int m = stride; m <= M; m += stride) {
      // 记录开始时间
      auto start = std::chrono::high_resolution_clock::now();
      // #pragma GCC unroll 8
      for (int i = 0; i < iter; i++) {
        cblas_bgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, K, alpha, A, K, B, N, beta, C, N);
        // cblas_gemm_s8s8s32(CblasRowMajor, CblasNoTrans, CblasTrans, CblasFixOffset, m, N, K, alpha, A, K, oa, B, K,
        // ob,
        //  beta, C, N, &oc);
      }

      // 打印结果
      // printf("result:\n");
      // for (int i = 0; i < M; i++) {
      //   for (int j = 0; j < N; j++) {
      //     printf("%f ", C[i * N + j]);
      //   }
      //   printf("\n");
      // }
      // return 0;

      // 记录结束时间
      auto end = std::chrono::high_resolution_clock::now();

      // 计算总时长（秒）
      auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
      double time_sec = duration.count() / 1e6;  // 转换为秒

      // 计算理论浮点运算次数并转换为 TFLOPS
      double ops = iter * 2.0 * m * n * K;
      double tflops = ops / (duration.count() * 1e6);  // 转换为 TFLOPS

      // 输出结果
      printf("execute end time %f us, m n:%d %d\n", time_sec * 1e6, m, n);
      // printf("执行时间: %.4f 秒\n", time_sec);
      printf("计算性能: %.4f TFLOPS\n", tflops);
      printf("\n");

      fprintf(fout, "%d %d %f\n", m, n, tflops);
    }

  // 释放资源
  free(A);
  free(B);
  free(C);
  return 0;
}

================================================
FILE: kt-kernel/demo/fp16-test.cpp
================================================
#include <arm_sve.h>
#include <dlfcn.h>
#include <kblas.h>
#include <unistd.h>

#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <random>

int main() {
  // 矩阵维度 M 是 1024，K 是 1024，N 是 1024（行主序）
  int M = 5;           // 行主序时，A 的行长度为 K
  const int K = 10;    // B 的行长度为 N
  const int N = 7;     // C 的行长度为 N
  const int iter = 1;  // 迭代次数
  // int M = 10;        // 行主序时，A 的行长度为 K
  // const int K = 10; // B 的行长度为 N
  // const int N = 10;  // C 的行长度为 N

  // 分配矩阵内存
  float16_t* A = new float16_t[M * K];
  float16_t* B = new float16_t[K * N];
  float16_t* C = new float16_t[M * N];
  float16_t* Cc = new float16_t[M * N];
  srand(123);

  // 初始化随机种子
  // std::mt19937 rng(124);
  // std::uniform_real_distribution <float> dist(0.0, 1.0);

  for (int j = 0; j < M * K; j++) {
    A[j] = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX) / 1.0;
    // A[j] = dist(rng);
    // A[j] = j;
  }
  for (int j = 0; j < K * N; j++) {
    B[j] = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX) / 1.0;
    // B[j] = dist(rng);
    // B[j] = j;
  }
  for (int j = 0; j < M * N; j++) {
    C[j] = 10;
    Cc[j] = 10;
  }

  for (int i = 0; i < M; i++) {
    for (int j = 0; j < N; j++) {
      for (int k = 0; k < K; k++) {
        Cc[j * M + i] += A[i * K + k] * B[k * N + j];
      }
    }
  }

  // 设置 cblas_gemm_s8u8s32 的参数
  float alpha = 1.0f;
  float beta = 1.0f;

  // 打印矩阵 A、B
  printf("A=\n");
  for (int i = 0; i < M; i++) {
    for (int j = 0; j < K; j++) {
      printf("%f ", A[i * K + j]);
    }
    printf("\n");
  }
  printf("B=\n");
  for (int i = 0; i < K; i++) {
    for (int j = 0; j < N; j++) {
      printf("%f ", B[i * N + j]);
    }
    printf("\n");
  }
  cblas_hgemm(CblasColMajor, CblasTrans, CblasTrans, M, N, K, alpha, A, K, B, N, beta, C, M);
  // 打印结果
  printf("C=\n");
  for (int i = 0; i < M; i++) {
    for (int j = 0; j < N; j++) {
      printf("%f ", C[j * M + i]);
    }
    printf("\n");
  }

  printf("Cc=\n");
  for (int i = 0; i < M; i++) {
    for (int j = 0; j < N; j++) {
      printf("%f ", fabs(C[j * M + i] - Cc[j * M + i]));
    }
    printf("\n");
  }
  return 0;

  auto fout = fopen("test.out", "w");
  int stride = 16;
  for (int n = stride; n <= N; n += stride)
    for (int m = stride; m <= M; m += stride) {
      // 记录开始时间
      auto start = std::chrono::high_resolution_clock::now();
      // #pragma GCC unroll 8
      for (int i = 0; i < iter; i++) {
        cblas_hgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, K, alpha, A, K, B, N, beta, C, N);
        // cblas_gemm_s8s8s32(CblasRowMajor, CblasNoTrans, CblasTrans, CblasFixOffset, m, N, K, alpha, A, K, oa, B, K,
        // ob,
        //  beta, C, N, &oc);
      }

      // 打印结果
      // printf("result:\n");
      // for (int i = 0; i < M; i++) {
      //   for (int j = 0; j < N; j++) {
      //     printf("%f ", C[i * N + j]);
      //   }
      //   printf("\n");
      // }
      // return 0;

      // 记录结束时间
      auto end = std::chrono::high_resolution_clock::now();

      // 计算总时长（秒）
      auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
      double time_sec = duration.count() / 1e6;  // 转换为秒

      // 计算理论浮点运算次数并转换为 TFLOPS
      double ops = iter * 2.0 * m * n * K;
      double tflops = ops / (duration.count() * 1e6);  // 转换为 TFLOPS

      // 输出结果
      printf("execute end time %f us, m n:%d %d\n", time_sec * 1e6, m, n);
      // printf("执行时间: %.4f 秒\n", time_sec);
      printf("计算性能: %.4f TFLOPS\n", tflops);
      printf("\n");

      fprintf(fout, "%d %d %f\n", m, n, tflops);
    }

  // 释放资源
  free(A);
  free(B);
  free(C);
  return 0;
}

================================================
FILE: kt-kernel/demo/plot.py
================================================
import matplotlib.pyplot as plt
import re

# 原始数据字符串
data_str = """
execute end,m is:2
计算性能: 0.0068 TFLOPS

execute end,m is:4
计算性能: 0.0143 TFLOPS

execute end,m is:6
计算性能: 0.0206 TFLOPS

execute end,m is:8
计算性能: 0.0273 TFLOPS

execute end,m is:10
计算性能: 0.0330 TFLOPS

execute end,m is:12
计算性能: 0.0390 TFLOPS

execute end,m is:14
计算性能: 0.0442 TFLOPS

execute end,m is:16
计算性能: 0.0495 TFLOPS

execute end,m is:18
计算性能: 0.0543 TFLOPS

execute end,m is:20
计算性能: 0.0595 TFLOPS

execute end,m is:22
计算性能: 0.0637 TFLOPS

execute end,m is:24
计算性能: 0.0684 TFLOPS

execute end,m is:26
计算性能: 0.0720 TFLOPS

execute end,m is:28
计算性能: 0.0769 TFLOPS

execute end,m is:30
计算性能: 0.0802 TFLOPS

execute end,m is:32
计算性能: 0.0843 TFLOPS

execute end,m is:34
计算性能: 0.0874 TFLOPS

execute end,m is:36
计算性能: 0.0916 TFLOPS

execute end,m is:38
计算性能: 0.0942 TFLOPS

execute end,m is:40
计算性能: 0.0977 TFLOPS

execute end,m is:42
计算性能: 0.1003 TFLOPS

execute end,m is:44
计算性能: 0.1043 TFLOPS

execute end,m is:46
计算性能: 0.1059 TFLOPS

execute end,m is:48
计算性能: 0.1103 TFLOPS

execute end,m is:50
计算性能: 0.1119 TFLOPS

execute end,m is:52
计算性能: 0.1153 TFLOPS

execute end,m is:54
计算性能: 0.1172 TFLOPS

execute end,m is:56
计算性能: 0.1202 TFLOPS

execute end,m is:58
计算性能: 0.1219 TFLOPS

execute end,m is:60
计算性能: 0.1251 TFLOPS

execute end,m is:62
计算性能: 0.1268 TFLOPS

execute end,m is:64
计算性能: 0.1286 TFLOPS

execute end,m is:66
计算性能: 0.1307 TFLOPS

execute end,m is:68
计算性能: 0.1342 TFLOPS

execute end,m is:70
计算性能: 0.1347 TFLOPS

execute end,m is:72
计算性能: 0.1383 TFLOPS

execute end,m is:74
计算性能: 0.1389 TFLOPS

execute end,m is:76
计算性能: 0.1416 TFLOPS

execute end,m is:78
计算性能: 0.1429 TFLOPS

execute end,m is:80
计算性能: 0.1451 TFLOPS

execute end,m is:82
计算性能: 0.1471 TFLOPS

execute end,m is:84
计算性能: 0.1489 TFLOPS

execute end,m is:86
计算性能: 0.1499 TFLOPS

execute end,m is:88
计算性能: 0.1519 TFLOPS

execute end,m is:90
计算性能: 0.1525 TFLOPS

execute end,m is:92
计算性能: 0.1544 TFLOPS

execute end,m is:94
计算性能: 0.1560 TFLOPS

execute end,m is:96
计算性能: 0.1583 TFLOPS

execute end,m is:98
计算性能: 0.1579 TFLOPS

execute end,m is:100
计算性能: 0.1600 TFLOPS

execute end,m is:102
计算性能: 0.1611 TFLOPS

execute end,m is:104
计算性能: 0.1630 TFLOPS

execute end,m is:106
计算性能: 0.1644 TFLOPS

execute end,m is:108
计算性能: 0.1669 TFLOPS

execute end,m is:110
计算性能: 0.1667 TFLOPS

execute end,m is:112
计算性能: 0.1687 TFLOPS

execute end,m is:114
计算性能: 0.1685 TFLOPS

execute end,m is:116
计算性能: 0.1712 TFLOPS

execute end,m is:118
计算性能: 0.1712 TFLOPS

execute end,m is:120
计算性能: 0.1733 TFLOPS

execute end,m is:122
计算性能: 0.1730 TFLOPS

execute end,m is:124
计算性能: 0.1753 TFLOPS

execute end,m is:126
计算性能: 0.1757 TFLOPS

execute end,m is:128
计算性能: 0.1767 TFLOPS

execute end,m is:130
计算性能: 0.1783 TFLOPS

execute end,m is:132
计算性能: 0.1792 TFLOPS

execute end,m is:134
计算性能: 0.1794 TFLOPS

execute end,m is:136
计算性能: 0.1821 TFLOPS

execute end,m is:138
计算性能: 0.1810 TFLOPS

execute end,m is:140
计算性能: 0.1844 TFLOPS

execute end,m is:142
计算性能: 0.1840 TFLOPS

execute end,m is:144
计算性能: 0.1853 TFLOPS

execute end,m is:146
计算性能: 0.1860 TFLOPS

execute end,m is:148
计算性能: 0.1867 TFLOPS

execute end,m is:150
计算性能: 0.1868 TFLOPS

execute end,m is:152
计算性能: 0.1882 TFLOPS

execute end,m is:154
计算性能: 0.1880 TFLOPS

execute end,m is:156
计算性能: 0.1900 TFLOPS

execute end,m is:158
计算性能: 0.1895 TFLOPS

execute end,m is:160
计算性能: 0.1921 TFLOPS

execute end,m is:162
计算性能: 0.1922 TFLOPS

execute end,m is:164
计算性能: 0.1937 TFLOPS

execute end,m is:166
计算性能: 0.1935 TFLOPS

execute end,m is:168
计算性能: 0.1934 TFLOPS

execute end,m is:170
计算性能: 0.1945 TFLOPS

execute end,m is:172
计算性能: 0.1961 TFLOPS

execute end,m is:174
计算性能: 0.1952 TFLOPS

execute end,m is:176
计算性能: 0.1962 TFLOPS

execute end,m is:178
计算性能: 0.1977 TFLOPS

execute end,m is:180
计算性能: 0.1980 TFLOPS

execute end,m is:182
计算性能: 0.1985 TFLOPS

execute end,m is:184
计算性能: 0.1993 TFLOPS

execute end,m is:186
计算性能: 0.1995 TFLOPS

execute end,m is:188
计算性能: 0.2007 TFLOPS

execute end,m is:190
计算性能: 0.2012 TFLOPS

execute end,m is:192
计算性能: 0.2024 TFLOPS

execute end,m is:194
计算性能: 0.2011 TFLOPS

execute end,m is:196
计算性能: 0.2037 TFLOPS

execute end,m is:198
计算性能: 0.2026 TFLOPS

execute end,m is:200
计算性能: 0.2044 TFLOPS

execute end,m is:202
计算性能: 0.2044 TFLOPS

execute end,m is:204
计算性能: 0.2052 TFLOPS

execute end,m is:206
计算性能: 0.2057 TFLOPS

execute end,m is:208
计算性能: 0.2061 TFLOPS

execute end,m is:210
计算性能: 0.2064 TFLOPS

execute end,m is:212
计算性能: 0.2074 TFLOPS

execute end,m is:214
计算性能: 0.2075 TFLOPS

execute end,m is:216
计算性能: 0.2082 TFLOPS

execute end,m is:218
计算性能: 0.2083 TFLOPS

execute end,m is:220
计算性能: 0.2091 TFLOPS

execute end,m is:222
计算性能: 0.2096 TFLOPS

execute end,m is:224
计算性能: 0.2097 TFLOPS

execute end,m is:226
计算性能: 0.2098 TFLOPS

execute end,m is:228
计算性能: 0.2107 TFLOPS

execute end,m is:230
计算性能: 0.2104 TFLOPS

execute end,m is:232
计算性能: 0.2118 TFLOPS

execute end,m is:234
计算性能: 0.2121 TFLOPS

execute end,m is:236
计算性能: 0.2125 TFLOPS

execute end,m is:238
计算性能: 0.2128 TFLOPS

execute end,m is:240
计算性能: 0.2133 TFLOPS

execute end,m is:242
计算性能: 0.2136 TFLOPS

execute end,m is:244
计算性能: 0.2137 TFLOPS

execute end,m is:246
计算性能: 0.2139 TFLOPS

execute end,m is:248
计算性能: 0.2150 TFLOPS

execute end,m is:250
计算性能: 0.2153 TFLOPS

execute end,m is:252
计算性能: 0.2160 TFLOPS

execute end,m is:254
计算性能: 0.2156 TFLOPS

execute end,m is:256
计算性能: 0.2169 TFLOPS

execute end,m is:258
计算性能: 0.2161 TFLOPS

execute end,m is:260
计算性能: 0.2175 TFLOPS

execute end,m is:262
计算性能: 0.2172 TFLOPS

execute end,m is:264
计算性能: 0.2175 TFLOPS

execute end,m is:266
计算性能: 0.2181 TFLOPS

execute end,m is:268
计算性能: 0.2189 TFLOPS

execute end,m is:270
计算性能: 0.2193 TFLOPS

execute end,m is:272
计算性能: 0.2201 TFLOPS

execute end,m is:274
计算性能: 0.2198 TFLOPS

execute end,m is:276
计算性能: 0.2195 TFLOPS

execute end,m is:278
计算性能: 0.2205 TFLOPS

execute end,m is:280
计算性能: 0.2212 TFLOPS

execute end,m is:282
计算性能: 0.2210 TFLOPS

execute end,m is:284
计算性能: 0.2210 TFLOPS

execute end,m is:286
计算性能: 0.2215 TFLOPS

execute end,m is:288
计算性能: 0.2225 TFLOPS

execute end,m is:290
计算性能: 0.2227 TFLOPS

execute end,m is:292
计算性能: 0.2234 TFLOPS

execute end,m is:294
计算性能: 0.2227 TFLOPS

execute end,m is:296
计算性能: 0.2242 TFLOPS

execute end,m is:298
计算性能: 0.2230 TFLOPS

execute end,m is:300
计算性能: 0.2232 TFLOPS

execute end,m is:302
计算性能: 0.2227 TFLOPS

execute end,m is:304
计算性能: 0.2234 TFLOPS

execute end,m is:306
计算性能: 0.2226 TFLOPS

execute end,m is:308
计算性能: 0.2239 TFLOPS

execute end,m is:310
计算性能: 0.2239 TFLOPS

execute end,m is:312
计算性能: 0.2249 TFLOPS

execute end,m is:314
计算性能: 0.2245 TFLOPS

execute end,m is:316
计算性能: 0.2254 TFLOPS

execute end,m is:318
计算性能: 0.2251 TFLOPS

execute end,m is:320
计算性能: 0.2262 TFLOPS

execute end,m is:322
计算性能: 0.2256 TFLOPS

execute end,m is:324
计算性能: 0.2262 TFLOPS

execute end,m is:326
计算性能: 0.2259 TFLOPS

execute end,m is:328
计算性能: 0.2265 TFLOPS

execute end,m is:330
计算性能: 0.2266 TFLOPS

execute end,m is:332
计算性能: 0.2275 TFLOPS

execute end,m is:334
计算性能: 0.2275 TFLOPS

execute end,m is:336
计算性能: 0.2280 TFLOPS

execute end,m is:338
计算性能: 0.2275 TFLOPS

execute end,m is:340
计算性能: 0.2281 TFLOPS

execute end,m is:342
计算性能: 0.2284 TFLOPS

execute end,m is:344
计算性能: 0.2288 TFLOPS

execute end,m is:346
计算性能: 0.2288 TFLOPS

execute end,m is:348
计算性能: 0.2295 TFLOPS

execute end,m is:350
计算性能: 0.2292 TFLOPS

execute end,m is:352
计算性能: 0.2300 TFLOPS

execute end,m is:354
计算性能: 0.2299 TFLOPS

execute end,m is:356
计算性能: 0.2303 TFLOPS

execute end,m is:358
计算性能: 0.2301 TFLOPS

execute end,m is:360
计算性能: 0.2307 TFLOPS

execute end,m is:362
计算性能: 0.2303 TFLOPS

execute end,m is:364
计算性能: 0.2312 TFLOPS

execute end,m is:366
计算性能: 0.2307 TFLOPS

execute end,m is:368
计算性能: 0.2316 TFLOPS

execute end,m is:370
计算性能: 0.2310 TFLOPS

execute end,m is:372
计算性能: 0.2318 TFLOPS

execute end,m is:374
计算性能: 0.2319 TFLOPS

execute end,m is:376
计算性能: 0.2320 TFLOPS

execute end,m is:378
计算性能: 0.2323 TFLOPS

execute end,m is:380
计算性能: 0.2328 TFLOPS

execute end,m is:382
计算性能: 0.2326 TFLOPS

execute end,m is:384
计算性能: 0.2328 TFLOPS

execute end,m is:386
计算性能: 0.2330 TFLOPS

execute end,m is:388
计算性能: 0.2334 TFLOPS

execute end,m is:390
计算性能: 0.2337 TFLOPS

execute end,m is:392
计算性能: 0.2336 TFLOPS

execute end,m is:394
计算性能: 0.2332 TFLOPS

execute end,m is:396
计算性能: 0.2341 TFLOPS

execute end,m is:398
计算性能: 0.2334 TFLOPS

execute end,m is:400
计算性能: 0.2347 TFLOPS

execute end,m is:402
计算性能: 0.2349 TFLOPS

execute end,m is:404
计算性能: 0.2350 TFLOPS

execute end,m is:406
计算性能: 0.2347 TFLOPS

execute end,m is:408
计算性能: 0.2353 TFLOPS

execute end,m is:410
计算性能: 0.2350 TFLOPS

execute end,m is:412
计算性能: 0.2356 TFLOPS

execute end,m is:414
计算性能: 0.2354 TFLOPS

execute end,m is:416
计算性能: 0.2357 TFLOPS

execute end,m is:418
计算性能: 0.2357 TFLOPS

execute end,m is:420
计算性能: 0.2361 TFLOPS

execute end,m is:422
计算性能: 0.2361 TFLOPS

execute end,m is:424
计算性能: 0.2364 TFLOPS

execute end,m is:426
计算性能: 0.2360 TFLOPS

execute end,m is:428
计算性能: 0.2372 TFLOPS

execute end,m is:430
计算性能: 0.2364 TFLOPS

execute end,m is:432
计算性能: 0.2369 TFLOPS

execute end,m is:434
计算性能: 0.2369 TFLOPS

execute end,m is:436
计算性能: 0.2372 TFLOPS

execute end,m is:438
计算性能: 0.2370 TFLOPS

execute end,m is:440
计算性能: 0.2377 TFLOPS

execute end,m is:442
计算性能: 0.2374 TFLOPS

execute end,m is:444
计算性能: 0.2382 TFLOPS

execute end,m is:446
计算性能: 0.2379 TFLOPS

execute end,m is:448
计算性能: 0.2385 TFLOPS

execute end,m is:450
计算性能: 0.2377 TFLOPS

execute end,m is:452
计算性能: 0.2385 TFLOPS

execute end,m is:454
计算性能: 0.2384 TFLOPS

execute end,m is:456
计算性能: 0.2389 TFLOPS

execute end,m is:458
计算性能: 0.2319 TFLOPS

execute end,m is:460
计算性能: 0.2386 TFLOPS

execute end,m is:462
计算性能: 0.2386 TFLOPS

execute end,m is:464
计算性能: 0.2389 TFLOPS

execute end,m is:466
计算性能: 0.2393 TFLOPS

execute end,m is:468
计算性能: 0.2393 TFLOPS

execute end,m is:470
计算性能: 0.2389 TFLOPS

execute end,m is:472
计算性能: 0.2393 TFLOPS

execute end,m is:474
计算性能: 0.2395 TFLOPS

execute end,m is:476
计算性能: 0.2399 TFLOPS

execute end,m is:478
计算性能: 0.2400 TFLOPS

execute end,m is:480
计算性能: 0.2400 TFLOPS

execute end,m is:482
计算性能: 0.2397 TFLOPS

execute end,m is:484
计算性能: 0.2407 TFLOPS

execute end,m is:486
计算性能: 0.2400 TFLOPS

execute end,m is:488
计算性能: 0.2407 TFLOPS

execute end,m is:490
计算性能: 0.2404 TFLOPS

execute end,m is:492
计算性能: 0.2411 TFLOPS

execute end,m is:494
计算性能: 0.2409 TFLOPS

execute end,m is:496
计算性能: 0.2407 TFLOPS

execute end,m is:498
计算性能: 0.2412 TFLOPS

execute end,m is:500
计算性能: 0.2418 TFLOPS

execute end,m is:502
计算性能: 0.2416 TFLOPS

execute end,m is:504
计算性能: 0.2418 TFLOPS

execute end,m is:506
计算性能: 0.2416 TFLOPS

execute end,m is:508
计算性能: 0.2421 TFLOPS

execute end,m is:510
计算性能: 0.2419 TFLOPS

execute end,m is:512
计算性能: 0.2423 TFLOPS
"""

# 使用正则表达式提取 m 和 TFLOPS 值
m_values = list(map(int, re.findall(r'm is:(\d+)', data_str)))
tflops_values = list(map(float, re.findall(r'计算性能: ([\d.]+) TFLOPS', data_str)))

# 绘图
plt.figure(figsize=(10, 6))
plt.plot(m_values, tflops_values, marker='o', linestyle='-', color='blue')
plt.title('m * k with k * n (k=7168 n=512) ')
plt.xlabel('m')
plt.ylabel('Tflops')
plt.grid(True)
plt.tight_layout()

# 保存图表为文件
plt.savefig('performance_plot.png')  # 保存为 PNG 格式
# plt.savefig('performance_plot.pdf')  # 或保存为 PDF 格式


================================================
FILE: kt-kernel/demo/simple_test.cpp
================================================
#include <dlfcn.h>
#include <kblas.h>
#include <unistd.h>

#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

int main() {
  // 矩阵维度 M 是 1024，K 是 1024，N 是 1024（行主序）
  int M = 1024;        // 行主序时，A 的行长度为 K
  const int K = 1024;  // B 的行长度为 N
  const int N = 1024;  // C 的行长度为 N
  const int iter = 1;  // 迭代次数

  // 分配矩阵内存
  int8_t* A = (int8_t*)malloc(M * K * sizeof(int8_t));
  int8_t* B = (int8_t*)malloc(K * N * sizeof(int8_t));
  int32_t* C = (int32_t*)malloc(M * N * sizeof(int32_t));

  // 初始化随机种子
  srand((unsigned)time(NULL));

  // 随机初始化 A（范围 0 到 255）和 B（范围 -128 到 127）
  // 初始化矩阵 A 和 B
  for (int j = 0; j < M * K; j++) {
    // A[j] = rand() % 256;
    A[j] = j;
  }
  for (int j = 0; j < K * N; j++) {
    // B[j] = rand() % 256;
    B[j] = j;
  }
  // 初始化矩阵 C
  for (int j = 0; j < M * N; j++) {
    C[j] = 0;
  }

  // 设置 cblas_gemm_s8u8s32 的参数
  float alpha = 1.0f;
  float beta = 0.0f;
  int8_t oa = 0, ob = 0;
  int32_t oc = 0;

  // 打印矩阵 A、B
  // printf("A=\n");
  // for (int i = 0; i < M; i++) {
  //   for (int j = 0; j < K; j++) {
  //     printf("%d ", A[i * K + j]);
  //   }
  //   printf("\n");
  // }
  // printf("B=\n");
  // for (int i = 0; i < N; i++) {
  //   for (int j = 0; j < K; j++) {
  //     printf("%d ", B[i * K + j]);
  //   }
  //   printf("\n");
  // }

  // printf("format: 'generate end'\n");
  // 调用 cblas_gemm_s8u8s32 执行矩阵乘法：C = i1(A+ao)(B+bo) + 0*C + oc
  // 从m=10～256 都测一遍速度，步长是 stride
  int stride = 2;
  int start_m = M;
  for (int m = start_m; m <= M; m += stride) {
    // 记录开始时间
    auto start = std::chrono::high_resolution_clock::now();
#pragma GCC unroll 8
    for (int i = 0; i < iter; i++) {
      cblas_gemm_s8s8s32(CblasRowMajor, CblasNoTrans, CblasTrans, CblasFixOffset, m, N / 2, K, alpha, A, K, oa, B, K,
                         ob, beta, C, N, &oc);
      int8_t* B_high = B + K * N / 2;
      int32_t* C_high = C + N / 2;
      cblas_gemm_s8s8s32(CblasRowMajor, CblasNoTrans, CblasTrans, CblasFixOffset, m, N / 2, K, alpha, A, K, oa, B_high,
                         K, ob, beta, C_high, N, &oc);
    }

    // 打印结果
    // printf("result:\n");
    // for (int i = 0; i < M; i++) {
    //   for (int j = 0; j < N; j++) {
    //     printf("%d ", C[i * N + j]);
    //   }
    //   printf("\n");
    // }

    // 记录结束时间
    auto end = std::chrono::high_resolution_clock::now();

    // 计算总时长（秒）
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    double time_sec = duration.count() / 1e6;  // 转换为秒

    // 计算理论浮点运算次数并转换为 TFLOPS
    double ops = iter * 2.0 * m * N * K;
    double tflops = ops / (duration.count() * 1e6);  // 转换为 TFLOPS

    // 输出结果
    printf("execute end,m is:%d\n", m);
    // printf("执行时间: %.4f 秒\n", time_sec);
    printf("计算性能: %.4f TFLOPS\n", tflops);
    printf("\n");
  }

  // 释放资源
  free(A);
  free(B);
  free(C);
  return 0;
}

================================================
FILE: kt-kernel/demo/simple_test_aocl.cpp
================================================
#include <blis.h>

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
// #define CHECK
namespace {
// B matrix is in col-major order
constexpr int kM = 3;
constexpr int kK = 7168;
constexpr int kN = 2048;
void fill_inputs(int8_t* a, int8_t* b) {
  srand(static_cast<unsigned>(time(nullptr)));
  for (int i = 0; i < kM * kK; ++i) {
    a[i] = static_cast<int8_t>(rand() % 127);
  }
  for (int i = 0; i < kK * kN; ++i) {
    b[i] = static_cast<int8_t>(rand() % 127);
  }
}

void compute_reference(const int8_t* a, const int8_t* b, int32_t* ref) {
  for (int m = 0; m < kM; ++m) {
    for (int n = 0; n < kN; ++n) {
      int32_t acc = 0;
      for (int k = 0; k < kK; ++k) {
        acc += static_cast<int32_t>(a[m * kK + k]) * static_cast<int32_t>(b[k * kN + n]);
      }
      ref[m * kN + n] = acc;
    }
  }
}

bool check_result(const int32_t* got, const int32_t* ref) {
  for (int idx = 0; idx < kM * kN; ++idx) {
    if (got[idx] != ref[idx]) {
      std::printf("Mismatch at %d: got %d, expected %d\n", idx, got[idx], ref[idx]);
      return false;
    }
  }
  return true;
}
}  // namespace

int main() {
  err_t err = BLIS_SUCCESS;
  int8_t* a = static_cast<int8_t*>(bli_malloc_user(kM * kK, &err));
  int8_t* b = static_cast<int8_t*>(bli_malloc_user(kK * kN, &err));
  int8_t* b_rowmajor = static_cast<int8_t*>(bli_malloc_user(kK * kN, &err));
  int8_t* b_reordered = nullptr;
  int32_t* c = static_cast<int32_t*>(bli_malloc_user(kM * kN * sizeof(int32_t), &err));
  int32_t* c_unp = static_cast<int32_t*>(bli_malloc_user(kM * kN * sizeof(int32_t), &err));
  int32_t* ref = static_cast<int32_t*>(bli_malloc_user(kM * kN * sizeof(int32_t), &err));

  if (!a || !b || !c || !ref || !c_unp) {
    std::fprintf(stderr, "Allocation failed\n");
    bli_free_user(a);
    bli_free_user(b);
    bli_free_user(c);
    bli_free_user(ref);
    bli_free_user(c_unp);
    return EXIT_FAILURE;
  }

  fill_inputs(a, b);
  // transform B from col-major to row-major
  for (int k = 0; k < kK; ++k) {
    for (int n = 0; n < kN; ++n) {
      // original B is in col-major: b[n * ld + k], here ld = kK
      int8_t val = b[n * kK + k];
      // target row-major: row index = k, col index = n
      b_rowmajor[k * kN + n] = val;
    }
  }
#ifdef CHECK
  // CHECK: printf inputs
  std::puts("\nMatrix A:\n");
  for (int m = 0; m < kM; ++m) {
    for (int k = 0; k < kK; ++k) {
      std::printf("%4d ", a[m * kK + k]);
    }
    std::puts("");
  }
  std::puts("\nMatrix B:\n");
  for (int k = 0; k < kK; ++k) {
    for (int n = 0; n < kN; ++n) {
      std::printf("%4d ", b[n * kK + k]);
    }
    std::puts("");
  }
#endif
  std::memset(c, 0, kM * kN * sizeof(int32_t));
  std::memset(c_unp, 0, kM * kN * sizeof(int32_t));
  std::memset(ref, 0, kM * kN * sizeof(int32_t));
  compute_reference(a, b_rowmajor, ref);
#ifdef CHECK
  // CHECK: printf reference
  std::puts("\nReference result:\n");
  for (int m = 0; m < kM; ++m) {
    for (int n = 0; n < kN; ++n) {
      std::printf("%6d ", ref[m * kN + n]);
    }
    std::puts("");
  }
#endif
  const dim_t reorder_size = aocl_get_reorder_buf_size_s8s8s32os32('c', 'n', 'B', kK, kN);
  b_reordered = static_cast<int8_t*>(bli_malloc_user(reorder_size, &err));
  if (!b_reordered) {
    std::fprintf(stderr, "Reorder buffer allocation failed\n");
    bli_free_user(a);
    bli_free_user(b);
    bli_free_user(c);
    bli_free_user(ref);
    return EXIT_FAILURE;
  }
  aocl_reorder_s8s8s32os32('c', 'n', 'B', b, b_reordered, kK, kN, kK);
#ifdef CHECK
  // CHECK: printf reordered B
  std::puts("\nReordered Matrix B:\n");
  for (int k = 0; k < kK; ++k) {
    for (int n = 0; n < kN; ++n) {
      std::printf("%4d ", b_reordered[k * kN + n]);
    }
    std::puts("");
  }
  std::printf("\nReorder buffer size: %zu bytes\n", reorder_size);
#endif

  const int32_t alpha = 1;
  const int32_t beta = 0;
  aocl_gemm_s8s8s32os32('r', 'n', 't', kM, kN, kK, alpha, a, kK, 'n', b_reordered, kK, 'r', beta, c, kN, nullptr);
  aocl_gemm_s8s8s32os32('r', 'n', 't', kM, kN, kK, alpha, a, kK, 'n', b, kK, 'n', beta, c_unp, kN, nullptr);
#ifdef CHECK
  // CHECK: printf AOCL result
  std::puts("\nAOCL GEMM result (with reordered B):\n");
  for (int m = 0; m < kM; ++m) {
    for (int n = 0; n < kN; ++n) {
      std::printf("%6d ", c[m * kN + n]);
    }
    std::puts("");
  }
  std::puts("\nAOCL GEMM result (without reordered B):\n");
  for (int m = 0; m < kM; ++m) {
    for (int n = 0; n < kN; ++n) {
      std::printf("%6d ", c_unp[m * kN + n]);
    }
    std::puts("");
  }
#endif

  if (check_result(c, ref)) {
    std::puts("AOCL GEMM output matches reference.");
  } else {
    std::puts("AOCL GEMM output mismatch detected.");
  }

  if (check_result(c_unp, ref)) {
    std::puts("unpack AOCL GEMM output matches reference.");
  } else {
    std::puts("unpack AOCL GEMM output mismatch detected.");
  }

  bli_free_user(a);
  bli_free_user(b);
  bli_free_user(b_rowmajor);
  bli_free_user(b_reordered);
  bli_free_user(c);
  bli_free_user(c_unp);
  bli_free_user(ref);
  return 0;
}

================================================
FILE: kt-kernel/demo/tflops.py
================================================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 读取数据
file_path = 'data.txt'  # 替换为你的文件路径
df = pd.read_csv(file_path, sep=r'\s+', names=['m', 'n', 'tflops'])

# 创建数据透视表，行为 m，列为 n，值为 tflops
pivot_table = df.pivot_table(index='m', columns='n', values='tflops')

# 画热力图
plt.figure(figsize=(10, 8))
sns.heatmap(pivot_table, annot=True, fmt=".2f", cmap='viridis')
plt.title('TFLOPS Heatmap')
plt.xlabel('n')
plt.ylabel('m')
plt.tight_layout()
plt.show()


================================================
FILE: kt-kernel/examples/.gitignore
================================================
debug

================================================
FILE: kt-kernel/examples/bench_moe_amx_int8.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
AMX INT8 MoE Benchmark Script

Benchmarks performance of AMX-accelerated INT8 MOE operations with configurable parameters.
Supports uniform workload distribution across experts and optional CUDA stream mode.

Usage:
    python bench_moe_amx_int8.py [options]

Examples:
    # Default parameters
    python bench_moe_amx_int8.py

    # Custom parameters
    python bench_moe_amx_int8.py --layer_num 4 --expert_num 256 --workload 8 --use_cuda_stream

    # Full configuration
    python bench_moe_amx_int8.py --layer_num 2 --expert_num 128 --num_experts_per_tok 8 \
        --workload 4 --hidden_size 7168 --intermediate_size 2048 \
        --warmup_iter 100 --test_iter 1000 --use_cuda_stream
"""

import os
import sys
import time
import argparse

# Add build path for development
sys.path.insert(0, os.path.dirname(__file__) + "/../build")

import torch

try:
    from kt_kernel import kt_kernel_ext

    HAS_KT_KERNEL = True
except ImportError as e:
    HAS_KT_KERNEL = False
    import_error = str(e)


def parse_args():
    parser = argparse.ArgumentParser(
        description="AMX INT8 MoE Benchmark", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Model parameters
    parser.add_argument("--layer_num", type=int, default=2, help="Number of MoE layers")
    parser.add_argument("--expert_num", type=int, default=256, help="Number of experts per layer")
    parser.add_argument(
        "--num_experts_per_tok", type=int, default=8, help="Number of experts selected per token (top-k)"
    )
    parser.add_argument("--hidden_size", type=int, default=7168, help="Hidden dimension size")
    parser.add_argument("--intermediate_size", type=int, default=2048, help="Intermediate dimension size")

    # Workload parameters
    parser.add_argument("--workload", type=int, default=1, help="Workload (qlen, number of tokens)")
    parser.add_argument("--max_len", type=int, default=25600, help="Maximum sequence length for buffer allocation")

    # Benchmark parameters
    parser.add_argument("--warmup_iter", type=int, default=100, help="Number of warmup iterations")
    parser.add_argument("--test_iter", type=int, default=1000, help="Number of test iterations")

    # Execution mode
    parser.add_argument("--use_cuda_stream", action="store_true", help="Use CUDA stream mode (submit_with_cuda_stream)")
    parser.add_argument("--profile", action="store_true", help="Enable PyTorch profiler and export trace.json")
    parser.add_argument("--profile_path", type=str, default="./trace.json", help="Path to save profile trace")

    # Worker configuration
    parser.add_argument("--cpuinfer_threads", type=int, default=60, help="Total CPU inference threads")
    parser.add_argument("--numa_count", type=int, default=2, help="Number of NUMA nodes")
    parser.add_argument(
        "--num_gpu_experts", type=int, default=0, help="Number of experts to place on GPU (first N experts)"
    )

    return parser.parse_args()


def generate_uniform_workload(expert_num, num_experts_per_tok, workload):
    """
    Generate expert_ids and weights with uniform workload distribution.

    workload = qlen (number of tokens)
    Each token selects num_experts_per_tok experts.
    Total expert calls = workload * num_experts_per_tok
    """
    qlen = workload

    # Randomly select num_experts_per_tok experts (uniform, no duplicates)
    # All tokens will use the same expert combination
    selected_experts = torch.randperm(expert_num)[:num_experts_per_tok].tolist()

    # Create expert_ids: all tokens use the same expert combination
    expert_ids = [selected_experts for _ in range(qlen)]

    # Create on GPU then copy to CPU (faster)
    expert_ids = torch.tensor(expert_ids, dtype=torch.long, device="cuda").to("cpu").contiguous()
    print(f"Selected experts (all tokens use same): {selected_experts}")
    print(f"Expert IDs shape: {expert_ids.shape}")

    # Uniform weights (normalized) - create on GPU then copy
    weights = torch.ones((qlen, num_experts_per_tok), dtype=torch.float32, device="cuda") / num_experts_per_tok
    weights = weights.to("cpu").contiguous()

    return expert_ids, weights, qlen


def run_benchmark(args):
    """Run the AMX INT8 MoE benchmark."""

    print("=" * 60)
    print("AMX INT8 MoE Benchmark")
    print("=" * 60)
    print(f"\nConfiguration:")
    print(f"  Layers:              {args.layer_num}")
    print(f"  Experts per layer:   {args.expert_num}")
    print(f"  Experts per token:   {args.num_experts_per_tok}")
    print(f"  Hidden size:         {args.hidden_size}")
    print(f"  Intermediate size:   {args.intermediate_size}")
    print(f"  Workload (qlen):     {args.workload}")
    print(f"  Use CUDA stream:     {args.use_cuda_stream}")
    print(f"  Warmup iterations:   {args.warmup_iter}")
    print(f"  Test iterations:     {args.test_iter}")
    print(f"  CPU threads:         {args.cpuinfer_threads}")
    print(f"  NUMA nodes:          {args.numa_count}")

    # Generate uniform workload
    expert_ids, weights, qlen = generate_uniform_workload(args.expert_num, args.num_experts_per_tok, args.workload)
    print(f"\nActual qlen:           {qlen}")
    print(f"Total expert calls:    {qlen * args.num_experts_per_tok}")

    with torch.inference_mode():
        # Initialize CPUInfer
        if args.numa_count > 1:
            worker_config = kt_kernel_ext.WorkerPoolConfig()
            worker_config.subpool_count = args.numa_count
            worker_config.subpool_numa_map = list(range(args.numa_count))
            threads_per_numa = args.cpuinfer_threads // args.numa_count
            worker_config.subpool_thread_count = [threads_per_numa] * args.numa_count
            cpu_infer = kt_kernel_ext.CPUInfer(worker_config)
        else:
            cpu_infer = kt_kernel_ext.CPUInfer(args.cpuinfer_threads)

        # Physical to logical mapping (identity)
        physical_to_logical_map = torch.arange(args.expert_num, dtype=torch.int64, device="cpu").contiguous()

        # GPU experts mask - set first num_gpu_experts to True if specified
        gpu_experts_mask = torch.zeros(args.expert_num, dtype=torch.bool, device="cpu")
        if args.num_gpu_experts > 0:
            num_gpu = min(args.num_gpu_experts, args.expert_num)
            gpu_experts_mask[:num_gpu] = True
            print(f"  GPU experts: {num_gpu} (experts 0-{num_gpu-1})")

        # Initialize MoE layers
        print("\nInitializing MoE layers...")
        moes = []
        for layer_idx in range(args.layer_num):
            # Create random weights on GPU then copy to CPU (faster)
            gate_proj = (
                torch.randn(
                    (args.expert_num, args.intermediate_size, args.hidden_size), dtype=torch.bfloat16, device="cuda"
                )
                .to("cpu")
                .contiguous()
            )
            up_proj = (
                torch.randn(
                    (args.expert_num, args.intermediate_size, args.hidden_size), dtype=torch.bfloat16, device="cuda"
                )
                .to("cpu")
                .contiguous()
            )
            down_proj = (
                torch.randn(
                    (args.expert_num, args.hidden_size, args.intermediate_size), dtype=torch.bfloat16, device="cuda"
                )
                .to("cpu")
                .contiguous()
            )

            # Configure MoE
            config = kt_kernel_ext.moe.MOEConfig(
                args.expert_num,
                args.num_experts_per_tok,
                args.hidden_size,
                args.intermediate_size,
                gpu_experts_mask.data_ptr(),
            )
            config.max_len = args.max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.pool = cpu_infer.backend_

            moe = kt_kernel_ext.moe.AMXInt8_MOE(config)
            cpu_infer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
            cpu_infer.sync()

            moes.append(moe)
            print(f"  Layer {layer_idx} initialized")

        # Prepare input/output tensors
        input_tensor = torch.randn((qlen, args.hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
        output_tensor = torch.zeros((qlen, args.hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
        bsz_tensor = torch.tensor([qlen], dtype=torch.int32, device="cpu")

        # CUDA stream setup (if enabled)
        cuda_stream = None
        if args.use_cuda_stream:
            if not torch.cuda.is_available():
                print("\nWarning: CUDA not available, falling back to non-stream mode")
                args.use_cuda_stream = False
            else:
                cuda_stream = torch.cuda.current_stream().cuda_stream
                print(f"\nUsing CUDA stream: {cuda_stream}")

        # Warmup
        print(f"\nWarmup ({args.warmup_iter} iterations)...")
        for i in range(args.warmup_iter):
            moe = moes[i % args.layer_num]
            task = moe.forward_task(
                bsz_tensor.data_ptr(),
                args.num_experts_per_tok,
                expert_ids.data_ptr(),
                weights.data_ptr(),
                input_tensor.data_ptr(),
                output_tensor.data_ptr(),
                False,  # incremental
            )

            if args.use_cuda_stream:
                cpu_infer.submit_with_cuda_stream(cuda_stream, task)
                cpu_infer.sync_with_cuda_stream(cuda_stream)
            else:
                cpu_infer.submit(task)
                cpu_infer.sync()

        # Benchmark
        print(f"Benchmarking ({args.test_iter} iterations)...")

        if args.use_cuda_stream:
            torch.cuda.synchronize()

        # Setup profiler if enabled
        profiler = None
        if args.profile:
            profiler = torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ],
                record_shapes=False,
                with_stack=False,
            )
            profiler.__enter__()

        start_time = time.perf_counter()

        for i in range(args.test_iter):
            moe = moes[i % args.layer_num]

            if args.profile:
                torch.cuda.nvtx.range_push(f"iter_{i}")

            task = moe.forward_task(
                bsz_tensor.data_ptr(),
                args.num_experts_per_tok,
                expert_ids.data_ptr(),
                weights.data_ptr(),
                input_tensor.data_ptr(),
                output_tensor.data_ptr(),
                False,
            )

            if args.use_cuda_stream:
                if args.profile:
                    torch.cuda.nvtx.range_push("submit")
                cpu_infer.submit_with_cuda_stream(cuda_stream, task)
                if args.profile:
                    torch.cuda.nvtx.range_pop()
                    torch.cuda.nvtx.range_push("sync")
                cpu_infer.sync_with_cuda_stream(cuda_stream)
                if args.profile:
                    torch.cuda.nvtx.range_pop()
            else:
                cpu_infer.submit(task)
                cpu_infer.sync()

            if args.profile:
                torch.cuda.nvtx.range_pop()

        if args.use_cuda_stream:
            torch.cuda.synchronize()

        end_time = time.perf_counter()
        total_time = end_time - start_time

        # Export profiler trace
        if profiler:
            profiler.__exit__(None, None, None)
            profiler.export_chrome_trace(args.profile_path)
            print(f"\nProfile trace saved to: {args.profile_path}")

        # Calculate metrics
        # Note: each iteration processes ONE layer (round-robin: moe = moes[i % layer_num])
        time_per_iter_us = total_time / args.test_iter * 1e6

        # Bandwidth calculation
        # Weight size per expert: 3 * hidden_size * intermediate_size * bytes_per_elem
        bytes_per_elem = 1.0  # INT8
        weight_bytes_per_expert = 3 * args.hidden_size * args.intermediate_size * bytes_per_elem

        # Total weight bytes accessed per iteration (one layer per iteration)
        # Each token activates num_experts_per_tok experts
        total_experts_activated = qlen * args.num_experts_per_tok
        weight_bytes_per_iter = total_experts_activated * weight_bytes_per_expert

        bandwidth_gbs = weight_bytes_per_iter * args.test_iter / total_time / 1e9

        # FLOPS calculation
        # Per expert: 3 * hidden * intermediate * 2 (multiply-add)
        flops_per_expert = 3 * args.hidden_size * args.intermediate_size * 2
        total_flops = total_experts_activated * flops_per_expert * args.test_iter
        tflops = total_flops / total_time / 1e12

        # Results
        print("\n" + "=" * 60)
        print("Results")
        print("=" * 60)
        print(f"  Total time:           {total_time:.3f} s")
        print(f"  Time per iteration:   {time_per_iter_us:.2f} us  (= time per layer)")
        print(f"  Memory bandwidth:     {bandwidth_gbs:.2f} GB/s")
        print(f"  Compute throughput:   {tflops:.3f} TFLOPS")
        print("=" * 60)

        return {
            "total_time_s": total_time,
            "time_per_iter_us": time_per_iter_us,
            "bandwidth_gbs": bandwidth_gbs,
            "tflops": tflops,
        }


def main():
    args = parse_args()

    if not HAS_KT_KERNEL:
        print(f"Error: kt_kernel not available: {import_error}")
        sys.exit(1)

    run_benchmark(args)


if __name__ == "__main__":
    main()


================================================
FILE: kt-kernel/examples/configuration_deepseek_v3.py
================================================
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class DeepseekV3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the DeepSeek-V3.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 129280):
            Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`DeepseekV3Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1407):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Number of nextn predict layers in the DeepSeekV3 Model.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        n_shared_experts (`int`, *optional*, defaults to None):
            Number of shared experts, None means dense model.
        n_routed_experts (`int`, *optional*, defaults to None):
            Number of routed experts, None means dense model.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor or routed experts.
        topk_method (`str`, *optional*, defaults to `gready`):
            Topk method used in routed gate.
        n_group (`int`, *optional*, defaults to None):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to None):
            Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to None):
            Number of selected experts, None means dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 0):
            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                            \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to False):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to 'softmax'):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux = (`bool`, *optional*, defaults to True):
            Whether to compute the auxiliary loss for each individual sample.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    ```python
    >>> from transformers import DeepseekV3Model, DeepseekV3Config
    >>> # Initializing a Deepseek-V3 style configuration
    >>> configuration = DeepseekV3Config()
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size = 2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts = 1,
        n_routed_experts = 256,
        ep_size = 1,
        routed_scaling_factor = 2.5,
        kv_lora_rank = 512,
        q_lora_rank = 1536,
        qk_rope_head_dim = 64,
        v_head_dim = 128,
        qk_nope_head_dim = 128,
        topk_method = 'noaux_tc',
        n_group = 8,
        topk_group = 4,
        num_experts_per_tok = 8,
        moe_layer_freq = 1,
        first_k_dense_replace = 3,
        norm_topk_prob = True,
        scoring_func = 'sigmoid',
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

================================================
FILE: kt-kernel/examples/modeling_deepseek_v3.py
================================================
# coding=utf-8
# Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch DeepSeek model."""
import math
import warnings
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_attention_mask,
    _prepare_4d_causal_attention_mask,
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import (
    ALL_LAYERNORM_LAYERS,
    is_torch_greater_or_equal_than_1_13,
)
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from transformers.utils.import_utils import is_torch_fx_available
from configuration_deepseek_v3 import DeepseekV3Config
import torch.distributed as dist
import numpy as np

if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa


# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
# It means that the function will not be traced through and simply appear as a node in the graph.
if is_torch_fx_available():
    if not is_torch_greater_or_equal_than_1_13:
        import torch.fx

    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "DeepseekV3Config"


def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(
        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)
    )
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


class DeepseekV3RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        DeepseekV3RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        self.hidden_size = hidden_size

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


ALL_LAYERNORM_LAYERS.append(DeepseekV3RMSNorm)


class DeepseekV3RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (
            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )
        self.max_seq_len_cached = None

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(t, self.inv_freq.to(t.device))
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )


# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV3
class DeepseekV3LinearScalingRotaryEmbedding(DeepseekV3RotaryEmbedding):
    """DeepseekV3RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        t = t / self.scaling_factor

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->DeepseekV3
class DeepseekV3DynamicNTKScalingRotaryEmbedding(DeepseekV3RotaryEmbedding):
    """DeepseekV3RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings)
                - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (
                base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


# Inverse dim formula to find dim based on number of rotations
def yarn_find_correction_dim(
    num_rotations, dim, base=10000, max_position_embeddings=2048
):
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
        2 * math.log(base)
    )


# Find dim range bounds based on rotations
def yarn_find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    low = math.floor(
        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
    )
    high = math.ceil(
        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
    )
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case


def yarn_get_mscale(scale=1, mscale=1):
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


def yarn_linear_ramp_mask(min, max, dim):
    if min == max:
        max += 0.001  # Prevent singularity

    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
    ramp_func = torch.clamp(linear_func, 0, 1)
    return ramp_func


class DeepseekV3YarnRotaryEmbedding(DeepseekV3RotaryEmbedding):

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.mscale_all_dim = mscale_all_dim
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        dim = self.dim

        freq_extra = 1.0 / (
            self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )
        freq_inter = 1.0 / (
            self.scaling_factor
            * self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            dim,
            self.base,
            self.original_max_position_embeddings,
        )
        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(
            device=device, dtype=torch.float32
        )
        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(seq_len, device=device, dtype=torch.float32)

        freqs = torch.outer(t, inv_freq)

        _mscale = float(
            yarn_get_mscale(self.scaling_factor, self.mscale)
            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
        )

        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer(
            "cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False
        )
        self.register_buffer(
            "sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False
        )


# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)

    b, h, s, d = q.shape
    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)

    b, h, s, d = k.shape
    k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


class DeepseekV3MLP(nn.Module):
    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
        self.intermediate_size = (
            config.intermediate_size if intermediate_size is None else intermediate_size
        )

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class MoEGate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.scoring_func = config.scoring_func
        self.topk_method = config.topk_method
        self.n_group = config.n_group
        self.topk_group = config.topk_group

        # topk selection algorithm
        self.norm_topk_prob = config.norm_topk_prob
        self.gating_dim = config.hidden_size
        self.weight = nn.Parameter(
            torch.empty((self.n_routed_experts, self.gating_dim))
        )
        if self.topk_method == "noaux_tc":
            self.e_score_correction_bias = nn.Parameter(
                torch.empty((self.n_routed_experts))
            )
        self.reset_parameters()

    def reset_parameters(self) -> None:
        import torch.nn.init as init

        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, hidden_states):
        bsz, seq_len, h = hidden_states.shape
        ### compute gating score
        hidden_states = hidden_states.view(-1, h)
        logits = F.linear(
            hidden_states.type(torch.float32), self.weight.type(torch.float32), None
        )
        if self.scoring_func == "sigmoid":
            scores = logits.sigmoid()
        else:
            raise NotImplementedError(
                f"insupportable scoring function for MoE gating: {self.scoring_func}"
            )

        ### select top-k experts
        if self.topk_method == "noaux_tc":
            # assert not self.training
            scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
            group_scores = (
                scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim = -1)
            )  # [n, n_group]
            group_idx = torch.topk(
                group_scores, k=self.topk_group, dim=-1, sorted=False
            )[
                1
            ]  # [n, top_k_group]
            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
            score_mask = (
                group_mask.unsqueeze(-1)
                .expand(
                    bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
                )
                .reshape(bsz * seq_len, -1)
            )  # [n, e]
            tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf"))  # [n, e]
            _, topk_idx = torch.topk(
                tmp_scores, k=self.top_k, dim=-1, sorted=False
            )
            topk_weight = scores.gather(1, topk_idx)
        else:
            raise NotImplementedError(
                f"insupportable TopK function for MoE gating: {self.topk_method}"
            )

        ### norm gate to sum 1
        if self.top_k > 1 and self.norm_topk_prob:
            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
            topk_weight = topk_weight / denominator
        topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor

        return topk_idx, topk_weight

class DeepseekV3MoE(nn.Module):
    """
    A mixed expert module containing shared experts.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_experts_per_tok = config.num_experts_per_tok

        if hasattr(config, "ep_size") and config.ep_size > 1:
            assert config.ep_size == dist.get_world_size()
            self.ep_size = config.ep_size
            self.experts_per_rank = config.n_routed_experts // config.ep_size
            self.ep_rank = dist.get_rank()
            self.experts = nn.ModuleList(
                [
                    (
                        DeepseekV3MLP(
                            config, intermediate_size=config.moe_intermediate_size
                        )
                        if i >= self.ep_rank * self.experts_per_rank
                        and i < (self.ep_rank + 1) * self.experts_per_rank
                        else None
                    )
                    for i in range(config.n_routed_experts)
                ]
            )
        else:
            self.ep_size = 1
            self.experts_per_rank = config.n_routed_experts
            self.ep_rank = 0
            self.experts = nn.ModuleList(
                [
                    DeepseekV3MLP(
                        config, intermediate_size=config.moe_intermediate_size
                    )
                    for i in range(config.n_routed_experts)
                ]
            )
        self.gate = MoEGate(config)
        if config.n_shared_experts is not None:
            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
            self.shared_experts = DeepseekV3MLP(
                config=config, intermediate_size=intermediate_size
            )

    def forward(self, hidden_states):
        identity = hidden_states
        orig_shape = hidden_states.shape
        topk_idx, topk_weight = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        flat_topk_idx = topk_idx.view(-1)
        if not self.training:
            y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
        if self.config.n_shared_experts is not None:
            y = y + self.shared_experts(identity)
        return y

    @torch.no_grad()
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        sorted_tokens_shape = sorted_tokens.shape
        if self.ep_size > 1:
            tokens_per_ep_rank = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
            tokens_per_expert_group = tokens_per_expert.new_empty(
                tokens_per_expert.shape[0]
            )
            dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert)
            output_splits = (
                tokens_per_expert_group.view(self.ep_size, -1)
                .sum(1)
                .cpu()
                .numpy()
                .tolist()
            )
            gathered_tokens = sorted_tokens.new_empty(
                tokens_per_expert_group.sum(dim=0).cpu().item(), sorted_tokens.shape[1]
            )
            input_split_sizes = tokens_per_ep_rank.cpu().numpy().tolist()
            dist.all_to_all(
                list(gathered_tokens.split(output_splits)),
                list(sorted_tokens.split(input_split_sizes)),
            )
            tokens_per_expert_post_gather = tokens_per_expert_group.view(
                self.ep_size, self.experts_per_rank
            ).sum(dim=0)
            gatherd_idxs = np.zeros(shape=(gathered_tokens.shape[0],), dtype=np.int32)
            s = 0
            for i, k in enumerate(tokens_per_expert_group.cpu().numpy()):
                gatherd_idxs[s : s + k] = i % self.experts_per_rank
                s += k
            gatherd_idxs = gatherd_idxs.argsort()
            sorted_tokens = gathered_tokens[gatherd_idxs]
            tokens_per_expert = tokens_per_expert_post_gather
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
        if self.ep_size > 1:
            new_x = torch.empty_like(outs)
            new_x[gatherd_idxs] = outs
            gathered_tokens = new_x.new_empty(*sorted_tokens_shape)
            dist.all_to_all(
                list(gathered_tokens.split(input_split_sizes)),
                list(new_x.split(output_splits)),
            )
            outs = gathered_tokens

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out


# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV3
class DeepseekV3Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: DeepseekV3Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.q_lora_rank = config.q_lora_rank
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.kv_lora_rank = config.kv_lora_rank
        self.v_head_dim = config.v_head_dim
        self.qk_nope_head_dim = config.qk_nope_head_dim
        self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim

        self.is_causal = True

        if self.q_lora_rank is None:
            self.q_proj = nn.Linear(
                self.hidden_size, self.num_heads * self.q_head_dim, bias=False
            )
        else:
            self.q_a_proj = nn.Linear(
                self.hidden_size, config.q_lora_rank, bias=config.attention_bias
            )
            self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank)
            self.q_b_proj = nn.Linear(
                config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
            )

        self.kv_a_proj_with_mqa = nn.Linear(
            self.hidden_size,
            config.kv_lora_rank + config.qk_rope_head_dim,
            bias=config.attention_bias,
        )
        self.kv_a_layernorm = DeepseekV3RMSNorm(config.kv_lora_rank)
        self.kv_b_proj = nn.Linear(
            config.kv_lora_rank,
            self.num_heads
            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
            bias=False,
        )

        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            self.hidden_size,
            bias=config.attention_bias,
        )
        self._init_rope()

        self.softmax_scale = self.q_head_dim ** (-0.5)
        if self.config.rope_scaling is not None:
            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
            scaling_factor = self.config.rope_scaling["factor"]
            if mscale_all_dim:
                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
                self.softmax_scale = self.softmax_scale * mscale * mscale

    def _init_rope(self):
        if self.config.rope_scaling is None:
            self.rotary_emb = DeepseekV3RotaryEmbedding(
                self.qk_rope_head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = DeepseekV3LinearScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = DeepseekV3DynamicNTKScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "yarn":
                kwargs = {
                    key: self.config.rope_scaling[key]
                    for key in [
                        "original_max_position_embeddings",
                        "beta_fast",
                        "beta_slow",
                        "mscale",
                        "mscale_all_dim",
                    ]
                    if key in self.config.rope_scaling
                }
                self.rotary_emb = DeepseekV3YarnRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                    **kwargs,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)

        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        attn_weights = (
            torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale
        )

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )
        assert attention_mask is not None
        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.attention_dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV3
class DeepseekV3FlashAttention2(DeepseekV3Attention):
    """
    DeepseekV3 flash attention module. This module inherits from `DeepseekV3Attention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        # DeepseekV3FlashAttention2 attention does not support output_attentions
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

            # overwrite attention_mask with padding_mask
            attention_mask = kwargs.pop("padding_mask")

        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        # Flash attention requires the input to have the shape
        # batch_size x seq_length x head_dim x hidden_dim
        # therefore we just need to keep the original shape
        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]

        kv_seq_len = value_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)

        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe

        if self.q_head_dim != self.v_head_dim:
            value_states = F.pad(value_states, [0, self.q_head_dim - self.v_head_dim])

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (DeepseekV3RMSNorm handles it correctly)

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            # Handle the case where the model is quantized
            if hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            elif torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            else:
                target_dtype = (
                    self.q_proj.weight.dtype
                    if self.q_lora_rank is None
                    else self.q_a_proj.weight.dtype
                )

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = self._flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=dropout_rate,
            softmax_scale=self.softmax_scale,
        )
        if self.q_head_dim != self.v_head_dim:
            attn_output = attn_output[:, :, :, : self.v_head_dim]

        attn_output = attn_output.reshape(
            bsz, q_len, self.num_heads * self.v_head_dim
        ).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        dropout=0.0,
        softmax_scale=None,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`int`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV3FlashAttention2 __init__.
            causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            (
                query_states,
                key_states,
                value_states,
                indices_q,
                cu_seq_lens,
                max_seq_lens,
            ) = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            attn_output = pad_input(
                attn_output_unpad, indices_q, batch_size, query_length
            )
        else:
            attn_output = flash_attn_func(
                query_states,
                key_states,
                value_states,
                dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

        return attn_output

    def _upad_input(
        self, query_layer, key_layer, value_layer, attention_mask, query_length
    ):
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
                indices_k,
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
                query_layer, attention_mask
            )

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )


ATTENTION_CLASSES = {
    "eager": DeepseekV3Attention,
    "flash_attention_2": DeepseekV3FlashAttention2,
}


class DeepseekV3DecoderLayer(nn.Module):
    def __init__(self, config: DeepseekV3Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = ATTENTION_CLASSES[config._attn_implementation](
            config=config, layer_idx=layer_idx
        )

        self.mlp = (
            DeepseekV3MoE(config)
            if (
                config.n_routed_experts is not None
                and layer_idx >= config.first_k_dense_replace
                and layer_idx % config.moe_layer_freq == 0
            )
            else DeepseekV3MLP(config)
        )
        self.input_layernorm = DeepseekV3RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = DeepseekV3RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


DeepseekV3_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`DeepseekV3Config`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
    DeepseekV3_START_DOCSTRING,
)
class DeepseekV3PreTrainedModel(PreTrainedModel):
    config_class = DeepseekV3Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["DeepseekV3DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_cache_class = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


DeepseekV3_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
    DeepseekV3_START_DOCSTRING,
)
class DeepseekV3Model(DeepseekV3PreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV3DecoderLayer`]

    Args:
        config: DeepseekV3Config
    """

    def __init__(self, config: DeepseekV3Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                DeepseekV3DecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.norm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        past_key_values_length = 0
        if use_cache:
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length,
                seq_length + past_key_values_length,
                dtype=torch.long,
                device=device,
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self._use_flash_attention_2:
            # 2d mask is passed through the layers
            attention_mask = (
                attention_mask
                if (attention_mask is not None and 0 in attention_mask)
                else None
            )
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
            )

        # embed positions
        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = (
                next_decoder_cache.to_legacy_cache()
                if use_legacy_cache
                else next_decoder_cache
            )
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            if attention_mask.max() != 0:
                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
            causal_mask = attention_mask
        else:
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = DeepseekV3Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
    )
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, DeepseekV3ForCausalLM

        >>> model = DeepseekV3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states[:,-1:,:])
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
                max_cache_length = past_key_values.get_max_length()
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if (
                attention_mask is not None
                and attention_mask.shape[1] > input_ids.shape[1]
            ):
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(
                    past_state.index_select(0, beam_idx.to(past_state.device))
                    for past_state in layer_past
                ),
            )
        return reordered_past


@add_start_docstrings(
    """
    The DeepseekV3 Model transformer with a sequence classification head on top (linear layer).

    [`DeepseekV3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    DeepseekV3_START_DOCSTRING,
)
class DeepseekV3ForSequenceClassification(DeepseekV3PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = DeepseekV3Model(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, transformers.,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError(
                "Cannot handle batch sizes > 1 if no padding token is defined."
            )
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = (
                    torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                ).to(logits.device)
            else:
                sequence_lengths = -1

        pooled_logits = logits[
            torch.arange(batch_size, device=logits.device), sequence_lengths
        ]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(
                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
                )
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


================================================
FILE: kt-kernel/examples/repro_llamafile_re.py
================================================
#!/usr/bin/env python3
"""
Minimal LLAMAFILE repro harness to catch intermittent RuntimeError/RE.

Requirements:
- kt_kernel_ext built with LLAMAFILE (and CUDA stream integration)
- Valid GGUF weights directory (WEIGHT_PATH)

Usage:
  WEIGHT_PATH=/path/to/gguf python examples/repro_llamafile_re.py

Optional env:
  DEVICE=cuda|cpu           # default: auto (cuda if available)
  N_ITERS=1000              # iterations
  BATCH=4                   # batch size
  H=2048                    # hidden size
  EXPERTS=128               # total experts
  TOPK=8                    # experts per token
  INTER=768                 # intermediate size (must be divisible by 256)
  GPU_EXPERTS=100           # num experts on GPU side
  TP=2                      # threadpool_count
  CPU_THREADS=32            # cpuinfer_threads
  MAX_DEFER=2               # max_deferred_experts_per_token
  MODE=split|forward        # split=submit+sync, forward=wrapper.forward
  SEED=1                    # random seed

Debug tips:
  - Set CUDA_LAUNCH_BLOCKING=1 to catch async errors deterministically.
  - Try varying N_ITERS, BATCH, TOPK, MAX_DEFER.
  - Capture stdout/stderr for failure iteration index.
"""

from __future__ import annotations

import os
import sys
import faulthandler
import torch

from kt_kernel import KTMoEWrapper


def getenv_int(name: str, default: int) -> int:
    try:
        return int(os.environ.get(name, default))
    except Exception:
        return default


def get_stream_for(device: torch.device | str):
    device = torch.device(device)
    if device.type == "cuda" and torch.cuda.is_available():
        return torch.cuda.current_stream(device).cuda_stream
    return 0


def main() -> int:
    faulthandler.enable()

    weight_path = (os.environ.get("WEIGHT_PATH") or "").strip()
    if not weight_path:
        print("ERROR: WEIGHT_PATH env is required.")
        return 2
    if not os.path.exists(weight_path):
        print(f"ERROR: WEIGHT_PATH does not exist: {weight_path}")
        return 2

    device_str = os.environ.get("DEVICE") or ("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device(device_str)

    n_iters = getenv_int("N_ITERS", 1000)
    batch = getenv_int("BATCH", 4)
    hidden = getenv_int("H", 2048)
    experts = getenv_int("EXPERTS", 128)
    topk = getenv_int("TOPK", 8)
    inter = getenv_int("INTER", 768)
    gpu_experts = getenv_int("GPU_EXPERTS", 100)
    tp = getenv_int("TP", 2)
    cpu_threads = getenv_int("CPU_THREADS", 32)
    max_defer = getenv_int("MAX_DEFER", 2)
    seed = getenv_int("SEED", 1)
    mode = (os.environ.get("MODE") or "split").lower()

    if inter % 256 != 0:
        print(f"ERROR: INTER must be divisible by 256 for LLAMAFILE (got {inter}).")
        return 2

    print(
        f"LLAMAFILE Repro: device={device}, iters={n_iters}, batch={batch}, H={hidden}, topk={topk}, E={experts}, inter={inter}, TP={tp}, CPU_THREADS={cpu_threads}, mode={mode}"
    )
    print(f"Weights: {weight_path}")

    torch.manual_seed(seed)

    # Create wrapper and load weights once
    wrapper = KTMoEWrapper(
        layer_idx=0,
        num_experts=experts,
        num_experts_per_tok=topk,
        hidden_size=hidden,
        moe_intermediate_size=inter,
        num_gpu_experts=gpu_experts,
        cpuinfer_threads=cpu_threads,
        threadpool_count=tp,
        weight_path=weight_path,
        chunked_prefill_size=512,
        method="LLAMAFILE",
        max_deferred_experts_per_token=max_defer,
    )
    wrapper.load_weights()

    # Optional capture of small batch sizes
    KTMoEWrapper.set_capture_batch_sizes([1, 2, 4, 8, 16])

    stream = get_stream_for(device)

    # Allocate once and reuse to reduce allocator noise
    hidden_states = torch.empty(batch, hidden, dtype=torch.bfloat16, device=device)
    topk_ids = torch.empty(batch, topk, dtype=torch.long, device=device)
    topk_weights = torch.empty(batch, topk, dtype=torch.float32, device=device)

    def fill_random():
        hidden_states.normal_(mean=0.0, std=1.0)
        topk_ids.random_(0, experts)
        topk_weights.uniform_()
        topk_weights.div_(topk_weights.sum(dim=-1, keepdim=True) + 1e-6)

    # Warmup
    fill_random()
    _ = wrapper.forward(hidden_states, topk_ids, topk_weights, stream)
    if device.type == "cuda":
        torch.cuda.synchronize(device)

    # Main loop
    for i in range(n_iters):
        try:
            fill_random()
            if mode == "forward":
                _ = wrapper.forward(hidden_states, topk_ids, topk_weights, stream)
            else:
                wrapper.submit_forward(hidden_states, topk_ids, topk_weights, stream)
                # Optional small GPU op to put work on the same stream
                if device.type == "cuda":
                    hidden_states.add_(0)  # no-op but enqueued on current stream
                _ = wrapper.sync_forward(hidden_states, stream)

            if (i + 1) % 50 == 0:
                print(f"ok: iter {i + 1}/{n_iters}")
                if device.type == "cuda":
                    torch.cuda.synchronize(device)

        except Exception as e:
            print(f"FAIL at iter {i}: {repr(e)}")
            # Flush GPU work for better diagnostics
            if device.type == "cuda":
                try:
                    torch.cuda.synchronize(device)
                except Exception as _:
                    pass
            return 1

    print("All iterations completed without error.")
    return 0


if __name__ == "__main__":
    sys.exit(main())


================================================
FILE: kt-kernel/examples/test-debug.py
================================================
import os
import sys

sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import torch
import ctypes
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.moe import MOEConfig, MOE, AMXBF16_MOE, AMXInt8_MOE, AMXInt4_MOE, AMXInt4_1_MOE

intermediate_size_full = 2048
moe_intermediate_size = 3072
hidden_size = 7168
experts_num = 256
num_experts_per_tok = 8
cpu_infer = kt_kernel_ext.CPUInfer(97)

up = torch.empty(experts_num, intermediate_size_full, hidden_size, dtype=torch.bfloat16, device="cpu")

gate = torch.empty(experts_num, intermediate_size_full, hidden_size, dtype=torch.bfloat16, device="cpu")

down = torch.empty(experts_num, hidden_size, intermediate_size_full, dtype=torch.bfloat16, device="cpu")

gate_ptr = ctypes.addressof(ctypes.cast(gate.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents)
up_ptr = ctypes.addressof(ctypes.cast(up.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents)
down_ptr = ctypes.addressof(ctypes.cast(down.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents)
moe_config = MOEConfig(
    experts_num,
    num_experts_per_tok,
    hidden_size,
    moe_intermediate_size,
)
moe_config.layer_idx = 45
moe_config.pool = cpu_infer.backend_
moe_config.max_len = 1024  # TODO(zbx): multi cuda graph
moe_config.gate_proj = gate_ptr
moe_config.up_proj = up_ptr
moe_config.down_proj = down_ptr
moe_config.path = ""
moe = AMXInt4_MOE(moe_config)


================================================
FILE: kt-kernel/examples/test_apply_rope.py
================================================
import torch


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    b, h, s, d = q.shape
    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    return q_embed

def my_apply(q,cos,sin):
    
    qa = q[:,:,range(0,64,2)]
    qb = q[:,:,range(1,65,2)]
    q1 = (qa * cos - qb * sin)
    q2 = (qb*cos + qa*sin)
    return torch.cat((q1,q2),-1)


num_heads = 128
seq_len = 1024
rope_size = 64

# theta = torch.randn(, dtype=torch.float32)


================================================
FILE: kt-kernel/examples/test_attention.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : Jianwei Dong
Date         : 2024-08-28 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from flash_attn import flash_attn_with_kvcache
import torch

layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
cache_seqlen = 8192
cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")
anchor_type = kt_kernel_ext.kvcache.AnchorType.DYNAMIC
kv_type = kt_kernel_ext.kvcache.ggml_type.FP16
retrieval_type = kt_kernel_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 2
max_batch_size: int = 1
max_block_num: int = 512
CPUInfer = kt_kernel_ext.CPUInfer(max_thread_num)
validation_iter = 100

with torch.inference_mode(mode=True):
    config = kt_kernel_ext.kvcache.KVCacheConfig(
        layer_num,
        kv_head_num,
        q_head_num,
        head_dim,
        block_len,
        anchor_num,
        anchor_type,
        kv_type,
        retrieval_type,
        layer_step,
        token_step,
        layer_offset,
        max_block_num,
        max_batch_size,
        max_thread_num,
    )
    local_kvcache = kt_kernel_ext.kvcache.KVCache(config)

    kvcaches = []
    block_table = torch.arange(max_block_num, dtype=torch.int32, device="cpu").contiguous().view(1, -1)

    for layer_idx in range(layer_num):
        k_cache = torch.randn((1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
        v_cache = torch.randn((1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()

        CPUInfer.submit(
            local_kvcache.update_kvcache_fp16(
                k_cache.data_ptr(),
                v_cache.data_ptr(),
                layer_idx,
                block_table.data_ptr(),
                1,
                max_block_num,
                seqlens_zero.data_ptr(),
                cache_seqlen,
            )
        )
        CPUInfer.sync()

        kvcaches.append((k_cache.to("cuda"), v_cache.to("cuda")))

    # validation
    for i in range(validation_iter):

        k_cache = kvcaches[i % layer_num][0]
        v_cache = kvcaches[i % layer_num][1]
        input = torch.randn((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
        output = torch.empty((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()

        # attn_lse: (bsz, q_len, q_head_num)
        attn_lse = torch.empty((1, 1, q_head_num), dtype=torch.float32, device="cpu").contiguous()
        input = input / 100

        CPUInfer.submit(
            local_kvcache.attn(
                input.data_ptr(),
                output.data_ptr(),
                attn_lse.data_ptr(),
                i % layer_num,
                0,
                1,
                1,
                max_block_num,
                block_table.data_ptr(),
                cache_seqlens.data_ptr(),
                -1,
                -1,
                -1,
            )
        )
        CPUInfer.sync()
        # print("cpuinfer output", output)

        t_output = flash_attn_with_kvcache(
            q=input.to("cuda"),
            k_cache=k_cache,
            v_cache=v_cache,
            cache_seqlens=cache_seqlens.to("cuda"),
        )
        # print("torch output", t_output)

        diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(torch.abs(t_output))
        print("diff = ", diff)
        assert diff < 0.001


================================================
FILE: kt-kernel/examples/test_awq_moe_amx.py
================================================
import os, sys

sys.path.insert(0, os.path.dirname(__file__) + "/../build")

from kt_kernel import kt_kernel_ext
import torch

# Set fixed seed for reproducible results
torch.manual_seed(42)

# Constants for 4-bit packing
Q_BITS = 4
STORAGE_BITS = 32
PACK_NUM = STORAGE_BITS // Q_BITS  # 8


def pack(imatrix: torch.Tensor, direction: str = "row"):
    """
    Packs a 4-bit integer matrix into a packed 32-bit integer matrix.
    Packing order: 7 6 5 4 3 2 1 0 (MSB to LSB, original order)
    Args:
        imatrix (torch.Tensor): matrix of integers

    Returns:
        qmatrix (torch.Tensor): packed matrix of integers
    """
    shifts = torch.arange(0, STORAGE_BITS, Q_BITS, device=imatrix.device)

    imatrix = imatrix.to(torch.int8)
    imatrix = torch.bitwise_and(imatrix, 0x0F)  # eventually correct overflow

    if direction == "column":
        imatrix = imatrix.view(-1, imatrix.shape[1] // PACK_NUM, PACK_NUM)
        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, None, :]).sum(dim=-1)

    elif direction == "row":
        imatrix = imatrix.view(imatrix.shape[0] // PACK_NUM, PACK_NUM, -1)
        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, :, None]).sum(dim=1)

    qmatrix = qmatrix.to(torch.int32)

    return qmatrix


expert_num = 16
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
qlen = 1
layer_num = 1
CPUInfer = kt_kernel_ext.CPUInfer(40)
validation_iter = 10
k_group_size = 64
debug_print_count = 16

physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()


def act_fn(x):
    return x / (1.0 + torch.exp(-x))


def generate_original_weights():
    """Generate original FP16/BF16 weights for online quantization testing"""
    # Set seed to ensure consistency between online and offline quantization
    torch.manual_seed(42)

    # Generate weights in the same format as test_moe_amx.py (bfloat16)
    gate_proj_bf16 = (
        torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.bfloat16, device="cuda")
        .to("cpu")
        .contiguous()
    )
    up_proj_bf16 = (
        torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.bfloat16, device="cuda")
        .to("cpu")
        .contiguous()
    )
    down_proj_bf16 = (
        torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.bfloat16, device="cuda")
        .to("cpu")
        .contiguous()
    )

    # Print first row of gate_proj for expert 0 (first debug_print_count elements)
    print(
        f"[DEBUG] Online quantization gate_proj expert 0, row 0, first {debug_print_count} elements: {gate_proj_bf16[0, 0, :debug_print_count]}"
    )

    return gate_proj_bf16, up_proj_bf16, down_proj_bf16


def generate_awq_quantized_weights():
    """Generate AWQ quantized weights (qweight, scales, qzeros) for testing"""
    # Reset seed to ensure same weights as online quantization
    torch.manual_seed(42)

    # Generate original FP16 weights (convert from same random values as online version)
    gate_proj_fp16 = (
        torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.bfloat16, device="cuda")
        .to("cpu")
        .to(torch.float16)
        .contiguous()
    )
    up_proj_fp16 = (
        torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.bfloat16, device="cuda")
        .to("cpu")
        .to(torch.float16)
        .contiguous()
    )
    down_proj_fp16 = (
        torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.bfloat16, device="cuda")
        .to("cpu")
        .to(torch.float16)
        .contiguous()
    )

    # Print first row of gate_proj for expert 0 (first debug_print_count elements)
    print(
        f"[DEBUG] Offline AWQ gate_proj expert 0, row 0, first {debug_print_count} elements: {gate_proj_fp16[0, 0, :debug_print_count]}"
    )

    # Calculate quantization parameters per group
    def quantize_tensor_awq(weight, group_size=128):
        """Simple AWQ-style quantization simulation with interleaving"""

        w_orig_shape = weight.shape
        expert_num, col, row = weight.shape
        group_num = (row + group_size - 1) // group_size

        # 1. reshape into groups along row dimension
        weight_grouped = weight.view(expert_num, col, group_num, group_size)  # [E, G, group_size, C]

        # 2. calculate scales per group (max abs value / 7.0 for 4-bit signed)
        max_val = torch.max(weight_grouped, dim=3).values
        min_val = torch.min(weight_grouped, dim=3).values
        scales = (max_val - min_val).clamp(min=1e-5) / 15.0  # [E, G, C]
        zeros = (-torch.round(min_val / scales)).clamp_(0, 15).to(torch.int8)

        # 5. quantize weights
        qweight_int = torch.clamp(
            torch.round((weight_grouped - min_val.unsqueeze(-1)) / scales.unsqueeze(-1)), 0, 15
        ).to(torch.int8)

        qweight_int = qweight_int.view(w_orig_shape)

        # 6. pack qweight along row (group_size) using helper
        qweight_packed_list = []
        for e in range(expert_num):
            packed = pack(qweight_int[e], direction="column")  # [1, ? , col] or similar
            qweight_packed_list.append(packed)
        qweight_packed = torch.stack(qweight_packed_list, dim=0)  # [E, row, col / 8]

        # 7. pack zeros along group dimension (row) using helper
        zeros_packed_list = []
        for e in range(expert_num):
            zeros_packed_list.append(pack(zeros[e].transpose(0, 1), direction="column"))  # [blocks, col]
        qzeros_packed = torch.stack(zeros_packed_list, dim=0)

        scales = scales.transpose(1, 2).to(torch.float16)
        print(scales.shape)
        scales = scales.flatten().contiguous()

        min_val = min_val.transpose(1, 2).to(torch.float16).flatten().contiguous()

        zeros = zeros.transpose(1, 2).flatten().contiguous()

        qzeros_packed = qzeros_packed.flatten().contiguous()

        qweight_packed = qweight_packed.flatten().contiguous()

        return {
            "qweight": qweight_packed,  # Same for both torch and AWQ-MoE
            "scales": scales,  # Same for both torch and AWQ-MoE
            "qzeros": qzeros_packed,  # Same for both torch and AWQ-MoE
            "mins": min_val,  # scales * zeros for comparison
        }

    # Quantize each projection
    gate_data = quantize_tensor_awq(gate_proj_fp16, k_group_size)
    up_data = quantize_tensor_awq(up_proj_fp16, k_group_size)
    down_data = quantize_tensor_awq(down_proj_fp16, k_group_size)

    return {
        # Data for both torch and AWQ-MoE (no interleaving)
        "gate_qweight": gate_data["qweight"],
        "gate_scales": gate_data["scales"],
        "gate_qzeros": gate_data["qzeros"],
        "gate_mins": gate_data["mins"],
        "up_qweight": up_data["qweight"],
        "up_scales": up_data["scales"],
        "up_qzeros": up_data["qzeros"],
        "up_mins": up_data["mins"],
        "down_qweight": down_data["qweight"],
        "down_scales": down_data["scales"],
        "down_qzeros": down_data["qzeros"],
        "down_mins": down_data["mins"],
        "original_fp16": {"gate_proj": gate_proj_fp16, "up_proj": up_proj_fp16, "down_proj": down_proj_fp16},
    }


def mlp_torch(input, gate_proj, up_proj, down_proj, debug_expert_id=None, debug_print=False):
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())

    if debug_print and debug_expert_id is not None:
        print(f"[TORCH FP16 DEBUG] Expert {debug_expert_id}:")
        print(f"  gate_buf[:{debug_print_count}] = {gate_buf.flatten()[:debug_print_count]}")
        print(f"  up_buf[:{debug_print_count}] = {up_buf.flatten()[:debug_print_count]}")

    intermediate = act_fn(gate_buf) * up_buf

    if debug_print and debug_expert_id is not None:
        print(f"  intermediate[:{debug_print_count}] = {intermediate.flatten()[:debug_print_count]}")

    ret = torch.mm(intermediate, down_proj.t())

    if debug_print and debug_expert_id is not None:
        print(f"  down_output[:{debug_print_count}] = {ret.flatten()[:debug_print_count]}")

    return ret


def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj, debug_print=False):
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    # Get the first expert from expert_ids array to match AWQ-MoE behavior
    target_debug_expert = expert_ids[0, 0].item()  # First expert in expert_ids array

    outputs = []
    start_idx = 0
    activated_experts = []

    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        activated_experts.append(i)
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        # Only debug the target expert that matches AWQ-MoE's first expert
        should_debug = debug_print and i == target_debug_expert
        if gate_proj[i].dtype == torch.float16:
            expert_out = mlp_torch(
                tokens_for_this_expert.to(torch.float16),
                gate_proj[i],
                up_proj[i],
                down_proj[i],
                debug_expert_id=i,
                debug_print=should_debug,
            )
        else:
            expert_out = mlp_torch(
                tokens_for_this_expert,
                gate_proj[i],
                up_proj[i],
                down_proj[i],
                debug_expert_id=i,
                debug_print=should_debug,
            )
        outputs.append(expert_out)
        start_idx = end_idx

    if debug_print:
        print(f"[TORCH DEBUG] Processing activated experts: {activated_experts}")
        print(f"[TORCH DEBUG] Target debug expert (matches AWQ): {target_debug_expert}")

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )

    if debug_print:
        print(f"[TORCH DEBUG] Final MoE output[:{debug_print_count}] = {t_output.flatten()[:debug_print_count]}")

    return t_output


def test_online_int4_kgroup_moe():
    """Test online Int4LowKGroup quantization (reference implementation)"""
    print("Testing Online Int4LowKGroup quantization (reference)...")

    # Generate original weights for online quantization
    gate_proj, up_proj, down_proj = generate_original_weights()

    with torch.inference_mode(mode=True):
        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []

        for _ in range(layer_num):
            # Create Int4LowKGroup configuration (online quantization)
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.gate_scale = 0
            config.pool = CPUInfer.backend_

            # Set quantization config for Int4LowKGroup (matches test_moe_amx.py)
            config.quant_config.bits = 4
            config.quant_config.group_size = k_group_size
            config.quant_config.zero_point = True

            # Enable weight dumping for comparison
            config.save = True
            config.path = "./awq_dump_online"

            # Create Int4LowKGroup MoE (online quantization during load_weights)
            moe = kt_kernel_ext.moe.AMXInt4_1KGroup_MOE(config)

            # Load weights (performs online quantization)
            print(f"Physical Map: {physical_to_logical_map.data_ptr()}")
            CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
            CPUInfer.sync()

            # Warm up
            CPUInfer.submit(moe.warm_up_task())
            CPUInfer.sync()

            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)

        print("Online Int4LowKGroup MoE created and loaded successfully!")

        # Run validation tests
        results_online = []
        for i in range(validation_iter):
            # Reset seed for reproducible expert_ids and weights
            torch.manual_seed(100 + i)  # Different seed to avoid same random values

            bsz_tensor = torch.tensor([qlen], device="cpu")
            expert_ids = torch.stack(
                [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
            ).contiguous()
            weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
            output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            # input = torch.tensor(
            #     data=torch.cat([torch.ones(qlen, 1), torch.zeros(qlen, hidden_size - 1)], dim=1),
            #     dtype=torch.bfloat16
            # )
            input = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous() / 100

            moe = moes[i % layer_num]

            # Enable debug for first few iterations
            enable_debug = i < 2
            if enable_debug:
                print(f"\n=== Online Int4LowKGroup Test Iteration {i} ===")
                print(f"input[:{debug_print_count}] = {input.flatten()[:debug_print_count]}")
                print(f"expert_ids = {expert_ids}")
                print(f"weights = {weights}")

            # Run online quantized MoE forward
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids.data_ptr(),
                    weights.data_ptr(),
                    input.data_ptr(),
                    output.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

            if enable_debug:
                print(f"[ONLINE DEBUG] AMX output[:{debug_print_count}] = {output.flatten()[:debug_print_count]}")

            # Compare with FP16 reference
            gate_proj_ref = gate_projs[i % layer_num]
            up_proj_ref = up_projs[i % layer_num]
            down_proj_ref = down_projs[i % layer_num]

            t_output_online = moe_torch(
                input, expert_ids, weights, gate_proj_ref, up_proj_ref, down_proj_ref, debug_print=enable_debug
            )

            # Calculate differences
            diff_online = torch.mean(torch.abs(output - t_output_online)) / torch.mean(torch.abs(t_output_online))
            results_online.append(output.clone())

            print(f"Online Iteration {i}: Int4LowKGroup vs FP16 = {diff_online:.6f}")

            if enable_debug:
                abs_diff_online = torch.abs(output - t_output_online)
                print(f"[COMPARE] Online Int4LowKGroup vs FP16:")
                print(f"  Max abs diff = {torch.max(abs_diff_online):.6f}")
                print(f"  Mean abs diff = {torch.mean(abs_diff_online):.6f}")
                print(f"  Relative diff = {diff_online:.6f}")
                print("=" * 70)

        print("\n✅ Online Int4LowKGroup tests passed!")
        return results_online


def test_awq_moe():
    print("Testing AWQ MoE with Int4_1LowKGroup quantization...")

    # Generate AWQ quantized weights
    awq_data = generate_awq_quantized_weights()

    with torch.inference_mode(mode=True):
        moes = []

        for _ in range(layer_num):
            # Create AWQ MoE configuration
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len

            # Set quantization config for Int4_1LowKGroup
            config.quant_config.bits = 4
            config.quant_config.group_size = k_group_size
            config.quant_config.zero_point = True

            # Enable weight dumping for comparison
            config.save = True
            config.path = "./awq_dump_offline"

            # Set pointers to AWQ quantized data (no interleaving)
            config.gate_proj = awq_data["gate_qweight"].data_ptr()
            config.up_proj = awq_data["up_qweight"].data_ptr()
            config.down_proj = awq_data["down_qweight"].data_ptr()

            config.gate_scale = awq_data["gate_scales"].data_ptr()
            config.up_scale = awq_data["up_scales"].data_ptr()
            config.down_scale = awq_data["down_scales"].data_ptr()

            config.gate_zeros = awq_data["gate_qzeros"].data_ptr()
            config.up_zeros = awq_data["up_qzeros"].data_ptr()
            config.down_zeros = awq_data["down_qzeros"].data_ptr()

            config.pool = CPUInfer.backend_

            # Create Int4_1LowKGroup MoE
            moe = kt_kernel_ext.moe.AMXInt4_1KGroup_MOE(config)

            # Load weights
            CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
            CPUInfer.sync()

            # Warm up
            CPUInfer.submit(moe.warm_up_task())
            CPUInfer.sync()

            moes.append(moe)

        print("AWQ MoE Int4_1LowKGroup created and loaded successfully!")

        # Run validation tests
        results_awq = []
        for i in range(validation_iter):
            # Reset seed for reproducible expert_ids and weights (same as online test)
            torch.manual_seed(100 + i)

            bsz_tensor = torch.tensor([qlen], device="cpu")
            expert_ids = torch.stack(
                [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
            ).contiguous()
            weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
            output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            # input = torch.tensor(
            #     data=torch.cat([torch.ones(qlen, 1), torch.zeros(qlen, hidden_size - 1)], dim=1),
            #     dtype=torch.bfloat16
            # )
            input = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous() / 100

            moe = moes[i % layer_num]

            # Enable debug for first few iterations
            enable_debug = i < 2
            if enable_debug:
                print(f"\n=== AWQ MoE Int4_1LowKGroup Test Iteration {i} ===")
                print(f"input[:{debug_print_count}] = {input.flatten()[:debug_print_count]}")
                print(f"expert_ids = {expert_ids}")
                print(f"weights = {weights}")

                # Print which experts will be activated
                activated_experts = []
                for token in range(expert_ids.shape[0]):
                    for expert_idx in range(expert_ids.shape[1]):
                        expert_id = expert_ids[token][expert_idx].item()
                        if expert_id not in activated_experts:
                            activated_experts.append(expert_id)
                print(f"[TORCH DEBUG] Activated experts: {sorted(activated_experts)}")
                print(f"[TORCH DEBUG] First expert from expert_ids array: {expert_ids[0, 0].item()}")

            # Run AWQ MoE forward
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids.data_ptr(),
                    weights.data_ptr(),
                    input.data_ptr(),
                    output.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

            if enable_debug:
                print(f"[AWQ-MoE DEBUG] AMX output[:{debug_print_count}] = {output.flatten()[:debug_print_count]}")

            # Compare with FP16 reference
            original_weights = awq_data["original_fp16"]
            gate_proj = original_weights["gate_proj"].to(torch.float16)
            up_proj = original_weights["up_proj"].to(torch.float16)
            down_proj = original_weights["down_proj"].to(torch.float16)

            t_output_fp16 = moe_torch(
                input, expert_ids, weights, gate_proj, up_proj, down_proj, debug_print=enable_debug
            )

            # Calculate differences
            diff_fp16 = torch.mean(torch.abs(output - t_output_fp16)) / torch.mean(torch.abs(t_output_fp16))
            results_awq.append(output.clone())

            print(f"AWQ Iteration {i}: AWQ-MoE vs FP16 = {diff_fp16:.6f}")

            if enable_debug:
                abs_diff_fp16 = torch.abs(output - t_output_fp16)
                print(f"[COMPARE] AWQ-MoE vs FP16:")
                print(f"  Max abs diff = {torch.max(abs_diff_fp16):.6f}")
                print(f"  Mean abs diff = {torch.mean(abs_diff_fp16):.6f}")
                print(f"  Relative diff = {diff_fp16:.6f}")
                print("=" * 70)

            # AWQ quantization typically has higher error tolerance due to 4-bit quantization vs FP16
            # assert(diff_fp16 < 0.5), f"AWQ-MoE vs FP16 error too large: {diff_fp16:.6f}"

        print("\n✅ All AWQ MoE tests passed!")
        return results_awq


def compare_quantization_methods():
    """Compare online and offline quantization methods"""
    print("=" * 70)
    print("Comparing Online vs Offline Quantization Methods")
    print("=" * 70)

    # Run online quantization test (reference)
    print("\n" + "=" * 70)
    print("PHASE 1: Online Int4LowKGroup Quantization (Reference)")
    print("=" * 70)
    results_online = test_online_int4_kgroup_moe()

    # Run offline AWQ quantization test
    print("\n" + "=" * 70)
    print("PHASE 2: Offline AWQ Int4_1LowKGroup Quantization")
    print("=" * 70)
    results_awq = test_awq_moe()

    # Compare the results
    print("\n" + "=" * 70)
    print("PHASE 3: Comparison Results")
    print("=" * 70)

    if len(results_online) != len(results_awq):
        print(f"❌ Different number of results: Online={len(results_online)}, AWQ={len(results_awq)}")
        return

    print("Comparing Online Int4LowKGroup vs Offline AWQ results:")
    total_diff = 0.0
    max_diff = 0.0

    for i in range(len(results_online)):
        diff = torch.mean(torch.abs(results_online[i] - results_awq[i]))
        rel_diff = diff / torch.mean(torch.abs(results_online[i]))
        total_diff += rel_diff
        max_diff = max(max_diff, diff.item())

        if i < 3:  # Show detailed comparison for first 3 iterations
            print(f"  Iteration {i}:")
            print(f"    Absolute diff: {diff:.6f}")
            print(f"    Relative diff: {rel_diff:.6f}")
            print(f"    Online output[:{debug_print_count//2}]:  {results_online[i].flatten()[:debug_print_count//2]}")
            print(f"    AWQ output[:{debug_print_count//2}]:     {results_awq[i].flatten()[:debug_print_count//2]}")
        else:
            print(f"  Iteration {i}: Relative diff = {rel_diff:.6f}")

    avg_diff = total_diff / len(results_online)
    print(f"\nOverall comparison:")
    print(f"  Average relative difference: {avg_diff:.6f}")
    print(f"  Maximum absolute difference: {max_diff:.6f}")

    # Determine if results match within acceptable tolerance
    tolerance = 0.01  # 1% tolerance
    if avg_diff < tolerance:
        print(f"✅ Results match within {tolerance:.1%} tolerance!")
        print("   Your offline AWQ quantization implementation appears to be correct.")
    else:
        print(f"❌ Results differ by more than {tolerance:.1%} tolerance.")
        print("   There may be differences between online and offline quantization.")


if __name__ == "__main__":
    print("=" * 70)
    print("AWQ MoE AMX Test - Online vs Offline Quantization Comparison")
    print("=" * 70)

    compare_quantization_methods()

    print("\n" + "=" * 70)
    print("Test completed successfully!")
    print("=" * 70)


================================================
FILE: kt-kernel/examples/test_bf16_moe.py
================================================
"""
Test script for AMX_BF16_MOE_TP (native BF16 MoE) kernel validation.

This script:
1. Generates random BF16 weights
2. Runs the BF16 MoE kernel
3. Compares results with PyTorch reference

BF16 format notes:
- Weight: BF16 stored as ggml_bf16_t, shape [expert_num, n, k]
- No scales needed (native BF16 precision)
"""

import os
import sys

sys.path.insert(0, os.path.dirname(__file__) + "/../build")

import torch
from kt_kernel import kt_kernel_ext

torch.manual_seed(42)

# Model config
hidden_size = 2048
intermediate_size = 768
max_len = 25600

expert_num = 128
num_experts_per_tok = 8

qlen = 1
layer_num = 5
CPUInfer = kt_kernel_ext.CPUInfer(3)
validation_iter = 5
debug_print_count = 16

physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()


def act_fn(x):
    """SiLU activation function"""
    return x / (1.0 + torch.exp(-x))


def mlp_torch(input, gate_proj, up_proj, down_proj):
    """Reference MLP computation in PyTorch"""
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret


def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    """Reference MoE computation in PyTorch"""
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0

    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )
    return t_output


def build_bf16_weights():
    """
    Generate random BF16 weights.

    Returns:
        dict with BF16 weights for gate, up, down projections
    """
    torch.manual_seed(42)

    # Generate random BF16 weights with small values
    gate_proj = (
        (torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32) / 100.0)
        .to(torch.bfloat16)
        .contiguous()
    )
    up_proj = (
        (torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32) / 100.0)
        .to(torch.bfloat16)
        .contiguous()
    )
    down_proj = (
        (torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32) / 100.0)
        .to(torch.bfloat16)
        .contiguous()
    )

    print(f"BF16 weights shape: gate={gate_proj.shape}, up={up_proj.shape}, down={down_proj.shape}")

    # Debug: Print BF16 weight info for expert 0
    print("\n=== DEBUG: BF16 Weight Info (Expert 0) ===")
    print(f"gate_proj[0] first 8 values: {gate_proj[0, 0, :8]}")
    print(f"gate_proj[0] stats: min={gate_proj[0].min()}, max={gate_proj[0].max()}")
    print(f"up_proj[0] first 8 values: {up_proj[0, 0, :8]}")
    print(f"down_proj[0] first 8 values: {down_proj[0, 0, :8]}")

    return {
        "gate_proj": gate_proj,
        "up_proj": up_proj,
        "down_proj": down_proj,
    }


def build_moes_from_bf16_data(bf16_data: dict):
    """
    Build BF16 MoE modules from BF16 weight data.
    """
    moes = []
    with torch.inference_mode(mode=True):
        for _ in range(layer_num):
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len

            # Set BF16 weight pointers (no scales needed)
            config.gate_proj = bf16_data["gate_proj"].data_ptr()
            config.up_proj = bf16_data["up_proj"].data_ptr()
            config.down_proj = bf16_data["down_proj"].data_ptr()

            # No scales for BF16
            config.gate_scale = 0
            config.up_scale = 0
            config.down_scale = 0
            config.pool = CPUInfer.backend_

            moe = kt_kernel_ext.moe.AMXBF16_MOE(config)
            CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
            CPUInfer.sync()
            moes.append(moe)
    return moes


def run_bf16_moe_test():
    """
    Run BF16 MoE validation test.
    """
    print("\n" + "=" * 70)
    print("BF16 MoE Kernel Validation Test")
    print("=" * 70)

    # Build BF16 weights
    print("\nGenerating BF16 weights...")
    bf16_data = build_bf16_weights()

    # Build MoE modules
    print("\nBuilding BF16 MoE modules...")
    moes = build_moes_from_bf16_data(bf16_data)

    # Get weights for reference computation
    gate_proj = bf16_data["gate_proj"]
    up_proj = bf16_data["up_proj"]
    down_proj = bf16_data["down_proj"]

    diffs = []
    with torch.inference_mode(mode=True):
        for i in range(validation_iter):
            torch.manual_seed(114514 + i)
            bsz_tensor = torch.tensor([qlen], device="cpu")
            expert_ids = torch.stack(
                [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
            ).contiguous()
            weights = torch.randn((qlen, num_experts_per_tok), dtype=torch.float32).contiguous() / 10
            input_tensor = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous() * 3
            output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()

            moe = moes[i % layer_num]
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids.data_ptr(),
                    weights.data_ptr(),
                    input_tensor.data_ptr(),
                    output.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

            assert not torch.isnan(output).any(), "NaN values detected in CPU expert output."
            assert not torch.isinf(output).any(), "Inf values detected in CPU expert output."

            # Reference computation using BF16 weights
            t_output = moe_torch(input_tensor, expert_ids, weights, gate_proj, up_proj, down_proj)

            t_output_flat = t_output.flatten()
            output_flat = output.flatten()

            diff = torch.mean(torch.abs(output_flat - t_output_flat)) / (torch.mean(torch.abs(t_output_flat)) + 1e-12)
            diffs.append(diff.item())
            print(f"Iteration {i}: relative L1 diff = {diff:.6f}")

            if i < 3:  # Print detailed output for first few iterations
                print(f"  kernel output: {output_flat[:debug_print_count]}")
                print(f"  torch output:  {t_output_flat[:debug_print_count]}")

    mean_diff = float(sum(diffs) / len(diffs))
    max_diff = float(max(diffs))
    min_diff = float(min(diffs))

    print("\n" + "=" * 70)
    print("BF16 MoE Test Results")
    print("=" * 70)
    print(f"Mean relative L1 diff: {mean_diff*100:.4f}%")
    print(f"Max relative L1 diff:  {max_diff*100:.4f}%")
    print(f"Min relative L1 diff:  {min_diff*100:.4f}%")

    # Pass/Fail criteria (BF16 should be very accurate, <5% error)
    threshold = 5.0
    if mean_diff * 100 < threshold:
        print(f"\nPASS: Mean error {mean_diff*100:.4f}% < {threshold}% threshold")
    else:
        print(f"\nFAIL: Mean error {mean_diff*100:.4f}% >= {threshold}% threshold")

    return {"mean": mean_diff, "max": max_diff, "min": min_diff}


if __name__ == "__main__":
    run_bf16_moe_test()


================================================
FILE: kt-kernel/examples/test_deepseekv3.py
================================================
import os, sys
import time

os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
import logging
import sys
import json
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GenerationConfig,
    TextStreamer,
)

logger = logging.getLogger("reader")

from gguf.gguf_reader import GGUFReader

# load_layers = 6
load_layers = None
CPUInfer = kt_kernel_ext.CPUInfer(304)
max_qlen = 4096
max_kvlen = 4096
page_size = 256
pages_count = 200


def read_gguf_file(gguf_file_path):
    """
    Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.

    Parameters:
    - gguf_file_path: Path to the GGUF file.
    """

    reader = GGUFReader(gguf_file_path)

    # List all key-value pairs in a columnized format
    # print("Key-Value Pairs:") # noqa: NP100
    # max_key_length = max(len(key) for key in reader.fields.keys())
    for key, field in reader.fields.items():
        value = field.parts[field.data[0]]
        # print(f"{key:{max_key_length}} : {value}") # noqa: NP100
    # print("----") # noqa: NP100

    # List all tensors
    # print("Tensors:") # noqa: NP100
    # tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
    # print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100
    # print("-" * 80) # noqa: NP100
    re = []
    for tensor in reader.tensors:
        shape_str = "x".join(map(str, tensor.shape))
        size_str = str(tensor.n_elements)
        quantization_str = tensor.tensor_type.name
        # print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100
        re.append(tensor)
    return re


def read_gguf_directory(directory):
    """
    Reads all GGUF files in a directory and prints their contents.

    Parameters:
    - directory: Path to the directory containing GGUF files.
    """
    if not os.path.isdir(directory):
        logger.error(f"Directory {directory} does not exist.")
        return

    # List all GGUF files in the directory
    files = [f for f in os.listdir(directory) if f.endswith(".gguf")]
    if not files:
        logger.info(f"No GGUF files found in {directory}.")
        return

    re = []
    for file in files:
        file_path = os.path.join(directory, file)
        # print(f"Reading {file_path}:") # noqa: NP100
        # print("\n") # noqa: NP100
        re.extend(read_gguf_file(file_path))
    re = {r.name: r for r in re}
    return re


def find_weights(name, weights):
    """
    Finds and returns the weights for a given name from the list of weights.

    Parameters:
    - name: The name of the weights to find.
    - weights: List of weight tensors.

    Returns:
    - The weight tensor if found, otherwise None.
    """
    for weight in weights:
        if weight.name == name:
            return weight
    raise ValueError(f"Weight with name {name} not found in the provided weights list.")


def get_torch_tensor_from_gguf(gguf_weights, name):
    return torch.from_numpy(gguf_weights[name].data).contiguous()


def get_torch_tensor_and_type_from_gguf(gguf_weights, name):
    return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name


def type_to_ggml_type(type):
    if type == "F32":
        return ggml_type.FP32
    elif type == "F16":
        return ggml_type.FP16
    elif type == "BF16":
        return ggml_type.BF16
    else:
        raise ValueError(f"Unsupported data type: {type}")


def build_mla(layer_idx, json_config, gguf_weights):
    hidden_size = json_config["hidden_size"]
    num_heads = json_config["num_attention_heads"]
    q_lora_rank = json_config["q_lora_rank"]
    kv_lora_rank = json_config["kv_lora_rank"]
    nope_size = json_config["qk_nope_head_dim"]
    rope_size = json_config["qk_rope_head_dim"]
    max_position_embeddings = json_config["max_position_embeddings"]
    rope_theta = json_config["rope_theta"]
    rope_scaling = json_config["rope_scaling"]

    config = kt_kernel_ext.mla.MLAConfig(
        hidden_size,
        q_lora_rank,
        kv_lora_rank,
        num_heads,
        nope_size,
        rope_size,
    )
    config.max_qlen = max_qlen
    config.max_kvlen = max_kvlen
    config.max_position_embeddings = max_position_embeddings
    config.rope_scaling_factor = rope_scaling["factor"]
    config.rope_theta = rope_theta
    config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
    config.rope_scaling_beta_slow = rope_scaling["beta_slow"]
    config.rope_scaling_mscale = rope_scaling["mscale"]
    config.rope_scaling_mscale_all_dim = rope_scaling["mscale_all_dim"]
    config.rope_scaling_original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]

    q_a_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a.weight")
    config.q_a_proj = q_a_proj_weight.data_ptr()
    config.q_a_proj_type = type_to_ggml_type(type)
    q_a_type = type

    q_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a_norm.weight")
    config.q_a_norm = q_a_norm_weight.data_ptr()
    config.q_a_norm_type = type_to_ggml_type(type)

    q_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_b.weight")
    config.q_b_proj = q_b_proj_weight.data_ptr()
    config.q_b_proj_type = type_to_ggml_type(type)

    kv_a_proj_with_mqa_weight, type = get_torch_tensor_and_type_from_gguf(
        gguf_weights, f"blk.{layer_idx}.attn_kv_a_mqa.weight"
    )
    config.kv_a_proj_with_mqa = kv_a_proj_with_mqa_weight.data_ptr()
    config.kv_a_proj_with_mqa_type = type_to_ggml_type(type)

    kv_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_a_norm.weight")
    config.kv_a_norm = kv_a_norm_weight.data_ptr()
    config.kv_a_norm_type = type_to_ggml_type(type)

    kv_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_b.weight")
    config.kv_b_proj = kv_b_proj_weight.data_ptr()
    config.kv_b_proj_type = type_to_ggml_type(type)

    o_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_output.weight")
    config.o_proj = o_proj_weight.data_ptr()
    config.w_o_type = type_to_ggml_type(type)

    config.layer_idx = layer_idx
    config.pool = CPUInfer.backend_
    config.page_count = pages_count

    if q_a_type == "F32":
        mla = kt_kernel_ext.mla.MLA_F32(config)
    elif q_a_type == "F16":
        mla = kt_kernel_ext.mla.MLA_F16(config)
    elif q_a_type == "BF16":
        # mla = kt_kernel_ext.mla.MLA_F32(config)
        mla = kt_kernel_ext.mla.MLA_QUAN_F32(config)
    else:
        raise ValueError(f"Unsupported data type: {q_a_type}")

    mla.load_weights()
    mla.set_local_pages(pages_count)
    return mla


def build_ffn(layer_idx, json_config, gguf_weights):
    if f"blk.{layer_idx}.ffn_gate.weight" in gguf_weights:  # dense
        config = kt_kernel_ext.moe.MOEConfig(
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["hidden_size"],
            json_config["moe_intermediate_size"],
        )
        config.layer_idx = layer_idx
        config.max_len = max_qlen
        config.pool = CPUInfer.backend_
        gate, gate_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate.weight")
        up, up_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up.weight")
        down, down_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_down.weight")

        config.gate_proj = gate.data_ptr()
        config.gate_type = type_to_ggml_type(gate_type)
        config.up_proj = up.data_ptr()
        config.up_type = type_to_ggml_type(up_type)
        config.down_proj = down.data_ptr()
        config.down_type = type_to_ggml_type(down_type)

        moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
        moe.load_weights()
        return moe

    elif f"blk.{layer_idx}.ffn_gate_exps.weight" in gguf_weights:
        config = kt_kernel_ext.moe.MOEConfig(
            json_config["n_routed_experts"] + json_config["n_shared_experts"],
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["hidden_size"],
            json_config["moe_intermediate_size"],
        )
        config.layer_idx = layer_idx
        config.max_len = max_qlen
        config.pool = CPUInfer.backend_
        gate, gate_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_exps.weight")
        up, up_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up_exps.weight")
        down, down_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_down_exps.weight")

        gate_sh, gate_sh_type = get_torch_tensor_and_type_from_gguf(
            gguf_weights, f"blk.{layer_idx}.ffn_gate_shexp.weight"
        )
        up_sh, up_sh_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up_shexp.weight")
        down_sh, down_sh_type = get_torch_tensor_and_type_from_gguf(
            gguf_weights, f"blk.{layer_idx}.ffn_down_shexp.weight"
        )

        gate_sh_expanded = gate_sh.unsqueeze(0)
        gate = torch.cat([gate, gate_sh_expanded], dim=0).contiguous()
        up_sh_expanded = up_sh.unsqueeze(0)
        up = torch.cat([up, up_sh_expanded], dim=0).contiguous()
        down_sh_expanded = down_sh.unsqueeze(0)
        down = torch.cat([down, down_sh_expanded], dim=0).contiguous()

        config.gate_proj = gate.data_ptr()
        config.gate_type = type_to_ggml_type(gate_type)
        config.up_proj = up.data_ptr()
        config.up_type = type_to_ggml_type(up_type)
        config.down_proj = down.data_ptr()
        config.down_type = type_to_ggml_type(down_type)

        moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
        moe.load_weights()
        return moe

    else:
        raise ValueError(f"Unsupported FFN type for layer {layer_idx}")


def build_moegate(layer_idx, json_config, gguf_weights):
    config = kt_kernel_ext.gate.GateConfig(
        json_config["hidden_size"],
        json_config["num_experts_per_tok"],
        json_config["n_routed_experts"],
        json_config["n_group"],
        json_config["topk_group"],
    )

    config.routed_scaling_factor = json_config["routed_scaling_factor"]

    config.pool = CPUInfer.backend_

    weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
    config.weight = weight.data_ptr()
    config.weight_type = type_to_ggml_type(weight_type)

    bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
    config.e_score_correction_bias = bias.data_ptr()
    config.e_score_correction_bias_type = type_to_ggml_type(bias_type)

    gate = kt_kernel_ext.gate.MoEGate(config)

    return gate


def build_llm(json_config, gguf_weights):

    general_config = kt_kernel_ext.GeneralConfig()
    general_config.vocab_size = json_config["vocab_size"]
    general_config.hidden_size = json_config["hidden_size"]
    general_config.num_experts_per_tok = json_config["num_experts_per_tok"]
    general_config.n_routed_experts = json_config["n_routed_experts"]
    general_config.n_shared_experts = json_config["n_shared_experts"]
    general_config.max_qlen = max_qlen

    lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
    general_config.lm_heads_ptr = lm_heads.data_ptr()
    general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)

    output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight")
    general_config.norm_weights_ptr = output_norm.data_ptr()
    general_config.norm_weights_type = type_to_ggml_type(output_norm_type)

    token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
    general_config.token_embd_ptr = token_embd.data_ptr()
    general_config.token_embd_type = type_to_ggml_type(token_embd_type)

    general_config.pool = CPUInfer.backend_

    llm = kt_kernel_ext.DeepseekV3ForCausalLM(general_config)
    model = kt_kernel_ext.DeepseekV3Model(general_config)
    llm.model = model

    decoder_layers = []
    real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers

    for i in range(real_load_layers):
        layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
        attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
        ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")

        layer.load_norm(
            attn_norm.data_ptr(),
            type_to_ggml_type(attn_norm_type),
            ffn_norm.data_ptr(),
            type_to_ggml_type(ffn_norm_type),
        )
        layer.self_attn = build_mla(i, json_config, gguf_weights)
        if f"blk.{i}.ffn_gate_inp.weight" in gguf_weights:
            layer.gate = build_moegate(i, json_config, gguf_weights)
        layer.ffn = build_ffn(i, json_config, gguf_weights)
        decoder_layers.append(layer)

    model.layers = decoder_layers
    return llm


safetensor_path = "/home/bd/models/DeepSeek-R1"
json_path = os.path.join(safetensor_path, "config.json")
json_config = json.load(open(json_path, "r"))
print(json_config)

gguf_path = "/home/bd/models/DeepSeek-R1-BF16"
weights = read_gguf_directory(gguf_path)
weights = dict(sorted(weights.items()))


for name, t in weights.items():
    # if not name.startswith("blk"):
    # if name.startswith("blk.10."):
    # if "ffn_gate." in name:
    # print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
    print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")

print("Building LLM ...")
load_start_time = time.perf_counter()
llm = build_llm(json_config, weights)
load_end_time = time.perf_counter()
print(f"Load time: {load_end_time - load_start_time:.4f} seconds")

print("Release Weight Tensors ...")
weights = None
print("Loading Configs ...")


tokenizer = AutoTokenizer.from_pretrained(safetensor_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True)

force_think = False


output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)


def start_chat(content=None):
    if content is None:
        content = input("Chat: ")

    messages = [{"role": "user", "content": content}]
    input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
    if force_think:
        token_thinks = torch.tensor(
            [tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
        )
        input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
    input_tensor = input_tensor.squeeze(0)  # Add batch dimension

    print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")
    kvlen = 0
    step = 2
    while True or step > 0:
        step -= 1
        stream = TextStreamer(tokenizer)

        qlen = input_tensor.shape[0]
        qlens = [qlen - kvlen]
        kvlens = [kvlen]
        page_tables = [list(range(pages_count))]
        start_time = time.perf_counter()
        llm.forward(qlens, page_tables, kvlens, input_tensor[kvlen:].data_ptr(), output_logits.data_ptr())
        end_time = time.perf_counter()
        print(
            f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec"
        )

        logits = output_logits[0]
        # print(logits)
        # sample
        next_token = torch.argmax(logits).item()
        # print(f"Next token: {next_token}, {tokenizer.decode(next_token)}")
        kvlen = input_tensor.shape[0]
        input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)

        if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
            stream.end()
            break
        else:
            stream.put(torch.tensor([next_token]))


job_id = 0
while True:
    try:
        # ---------- 让用户决定是否继续 ----------
        choice = input("\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序： ").strip().lower()
        if choice in {"q", "quit", "exit"}:
            print("收到退出指令，程序结束。")
            break
        elif choice == "1":
            file_path = input("请输入要读取的文件路径：").strip()
            if not Path(file_path).is_file():
                print(f"文件 {file_path} 不存在，请检查路径。")
                continue
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
            print(f"读取到内容：\n{content}\n")
            start_chat(content)
        else:
            start_chat()

    except KeyboardInterrupt:
        # 随时 Ctrl-C：放弃当前任务并重启
        print(f"\n检测到 Ctrl-C，已终止对话 #{job_id}，马上重启…")
    except Exception as e:
        # 其他异常：打印错误信息并重启
        print(f"\n发生错误：{e}\n已终止对话 #{job_id}，马上重启…")
        logger.error(f"Error in job {job_id}: {e}", exc_info=True)
    finally:
        job_id += 1  # 不管中断与否，都给下一任务换编号


================================================
FILE: kt-kernel/examples/test_deepseekv3_prefill.py
================================================
import os, sys
import time

os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
import logging
import sys
import json
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GenerationConfig,
    TextStreamer,
)

logger = logging.getLogger("reader")

from gguf.gguf_reader import GGUFReader

CPUInfer = kt_kernel_ext.CPUInfer(304)
max_qlen = 4096
max_kvlen = 4096
page_size = 256
pages_count = 200


def read_gguf_file(gguf_file_path):
    """
    Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.

    Parameters:
    - gguf_file_path: Path to the GGUF file.
    """

    reader = GGUFReader(gguf_file_path)

    # List all key-value pairs in a columnized format
    # print("Key-Value Pairs:") # noqa: NP100
    # max_key_length = max(len(key) for key in reader.fields.keys())
    for key, field in reader.fields.items():
        value = field.parts[field.data[0]]
        # print(f"{key:{max_key_length}} : {value}") # noqa: NP100
    # print("----") # noqa: NP100

    # List all tensors
    # print("Tensors:") # noqa: NP100
    # tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
    # print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100
    # print("-" * 80) # noqa: NP100
    re = []
    for tensor in reader.tensors:
        shape_str = "x".join(map(str, tensor.shape))
        size_str = str(tensor.n_elements)
        quantization_str = tensor.tensor_type.name
        # print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100
        re.append(tensor)
    return re


def read_gguf_directory(directory):
    """
    Reads all GGUF files in a directory and prints their contents.

    Parameters:
    - directory: Path to the directory containing GGUF files.
    """
    if not os.path.isdir(directory):
        logger.error(f"Directory {directory} does not exist.")
        return

    # List all GGUF files in the directory
    files = [f for f in os.listdir(directory) if f.endswith(".gguf")]
    if not files:
        logger.info(f"No GGUF files found in {directory}.")
        return

    re = []
    for file in files:
        file_path = os.path.join(directory, file)
        # print(f"Reading {file_path}:") # noqa: NP100
        # print("\n") # noqa: NP100
        re.extend(read_gguf_file(file_path))
    re = {r.name: r for r in re}
    return re


def find_weights(name, weights):
    """
    Finds and returns the weights for a given name from the list of weights.

    Parameters:
    - name: The name of the weights to find.
    - weights: List of weight tensors.

    Returns:
    - The weight tensor if found, otherwise None.
    """
    for weight in weights:
        if weight.name == name:
            return weight
    raise ValueError(f"Weight with name {name} not found in the provided weights list.")


def get_torch_tensor_from_gguf(gguf_weights, name):
    return torch.from_numpy(gguf_weights[name].data).contiguous()


def get_torch_tensor_and_type_from_gguf(gguf_weights, name):
    return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name


def type_to_ggml_type(type):
    if type == "F32":
        return ggml_type.FP32
    elif type == "F16":
        return ggml_type.FP16
    elif type == "BF16":
        return ggml_type.BF16
    else:
        raise ValueError(f"Unsupported data type: {type}")


def build_mla(layer_idx, json_config, gguf_weights):
    hidden_size = json_config["hidden_size"]
    num_heads = json_config["num_attention_heads"]
    q_lora_rank = json_config["q_lora_rank"]
    kv_lora_rank = json_config["kv_lora_rank"]
    nope_size = json_config["qk_nope_head_dim"]
    rope_size = json_config["qk_rope_head_dim"]
    max_position_embeddings = json_config["max_position_embeddings"]
    rope_theta = json_config["rope_theta"]
    rope_scaling = json_config["rope_scaling"]

    config = kt_kernel_ext.mla.MLAConfig(
        hidden_size,
        q_lora_rank,
        kv_lora_rank,
        num_heads,
        nope_size,
        rope_size,
    )
    config.max_qlen = max_qlen
    config.max_kvlen = max_kvlen
    config.max_position_embeddings = max_position_embeddings
    config.rope_scaling_factor = rope_scaling["factor"]
    config.rope_theta = rope_theta
    config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
    config.rope_scaling_beta_slow = rope_scaling["beta_slow"]
    config.rope_scaling_mscale = rope_scaling["mscale"]
    config.rope_scaling_mscale_all_dim = rope_scaling["mscale_all_dim"]
    config.rope_scaling_original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]

    q_a_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a.weight")
    config.q_a_proj = q_a_proj_weight.data_ptr()
    config.q_a_proj_type = type_to_ggml_type(type)
    q_a_type = type

    q_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a_norm.weight")
    config.q_a_norm = q_a_norm_weight.data_ptr()
    config.q_a_norm_type = type_to_ggml_type(type)

    q_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_b.weight")
    config.q_b_proj = q_b_proj_weight.data_ptr()
    config.q_b_proj_type = type_to_ggml_type(type)

    kv_a_proj_with_mqa_weight, type = get_torch_tensor_and_type_from_gguf(
        gguf_weights, f"blk.{layer_idx}.attn_kv_a_mqa.weight"
    )
    config.kv_a_proj_with_mqa = kv_a_proj_with_mqa_weight.data_ptr()
    config.kv_a_proj_with_mqa_type = type_to_ggml_type(type)

    kv_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_a_norm.weight")
    config.kv_a_norm = kv_a_norm_weight.data_ptr()
    config.kv_a_norm_type = type_to_ggml_type(type)

    kv_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_b.weight")
    config.kv_b_proj = kv_b_proj_weight.data_ptr()
    config.kv_b_proj_type = type_to_ggml_type(type)

    o_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_output.weight")
    config.o_proj = o_proj_weight.data_ptr()
    config.w_o_type = type_to_ggml_type(type)

    config.layer_idx = layer_idx
    config.pool = CPUInfer.backend_
    config.page_count = pages_count

    if q_a_type == "F32":
        mla = kt_kernel_ext.mla.MLA_F32(config)
    elif q_a_type == "F16":
        mla = kt_kernel_ext.mla.MLA_F16(config)
    elif q_a_type == "BF16":
        mla = kt_kernel_ext.mla.MLA_QUAN_F32(config)
        # mla = kt_kernel_ext.mla.MLA_F32(config)
    else:
        raise ValueError(f"Unsupported data type: {q_a_type}")

    mla.load_weights()
    mla.set_local_pages(pages_count)
    return mla


def build_ffn(layer_idx, json_config, gguf_weights):
    if f"blk.{layer_idx}.ffn_gate.weight" in gguf_weights:  # dense
        config = kt_kernel_ext.moe.MOEConfig(
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["hidden_size"],
            json_config["moe_intermediate_size"],
        )
        config.layer_idx = layer_idx
        config.max_len = max_qlen
        config.pool = CPUInfer.backend_
        gate, gate_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate.weight")
        up, up_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up.weight")
        down, down_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_down.weight")

        config.gate_proj = gate.data_ptr()
        config.gate_type = type_to_ggml_type(gate_type)
        config.up_proj = up.data_ptr()
        config.up_type = type_to_ggml_type(up_type)
        config.down_proj = down.data_ptr()
        config.down_type = type_to_ggml_type(down_type)

        moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
        moe.load_weights()
        return moe

    elif f"blk.{layer_idx}.ffn_gate_exps.weight" in gguf_weights:
        config = kt_kernel_ext.moe.MOEConfig(
            json_config["n_routed_experts"] + json_config["n_shared_experts"],
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["hidden_size"],
            json_config["moe_intermediate_size"],
        )
        config.layer_idx = layer_idx
        config.max_len = max_qlen
        config.pool = CPUInfer.backend_
        gate, gate_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_exps.weight")
        up, up_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up_exps.weight")
        down, down_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_down_exps.weight")

        gate_sh, gate_sh_type = get_torch_tensor_and_type_from_gguf(
            gguf_weights, f"blk.{layer_idx}.ffn_gate_shexp.weight"
        )
        up_sh, up_sh_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up_shexp.weight")
        down_sh, down_sh_type = get_torch_tensor_and_type_from_gguf(
            gguf_weights, f"blk.{layer_idx}.ffn_down_shexp.weight"
        )

        gate_sh_expanded = gate_sh.unsqueeze(0)
        gate = torch.cat([gate, gate_sh_expanded], dim=0).contiguous()
        up_sh_expanded = up_sh.unsqueeze(0)
        up = torch.cat([up, up_sh_expanded], dim=0).contiguous()
        down_sh_expanded = down_sh.unsqueeze(0)
        down = torch.cat([down, down_sh_expanded], dim=0).contiguous()

        config.gate_proj = gate.data_ptr()
        config.gate_type = type_to_ggml_type(gate_type)
        config.up_proj = up.data_ptr()
        config.up_type = type_to_ggml_type(up_type)
        config.down_proj = down.data_ptr()
        config.down_type = type_to_ggml_type(down_type)

        moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
        moe.load_weights()
        return moe

    else:
        raise ValueError(f"Unsupported FFN type for layer {layer_idx}")


def build_moegate(layer_idx, json_config, gguf_weights):
    config = kt_kernel_ext.gate.GateConfig(
        json_config["hidden_size"],
        json_config["num_experts_per_tok"],
        json_config["n_routed_experts"],
        json_config["n_group"],
        json_config["topk_group"],
    )

    config.routed_scaling_factor = json_config["routed_scaling_factor"]

    config.pool = CPUInfer.backend_

    weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
    config.weight = weight.data_ptr()
    config.weight_type = type_to_ggml_type(weight_type)

    bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
    config.e_score_correction_bias = bias.data_ptr()
    config.e_score_correction_bias_type = type_to_ggml_type(bias_type)

    gate = kt_kernel_ext.gate.MoEGate(config)

    return gate


def build_llm(json_config, gguf_weights):

    general_config = kt_kernel_ext.GeneralConfig()
    general_config.vocab_size = json_config["vocab_size"]
    general_config.hidden_size = json_config["hidden_size"]
    general_config.num_experts_per_tok = json_config["num_experts_per_tok"]
    general_config.n_routed_experts = json_config["n_routed_experts"]
    general_config.n_shared_experts = json_config["n_shared_experts"]
    general_config.max_qlen = max_qlen

    lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
    general_config.lm_heads_ptr = lm_heads.data_ptr()
    general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)

    output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight")
    general_config.norm_weights_ptr = output_norm.data_ptr()
    general_config.norm_weights_type = type_to_ggml_type(output_norm_type)

    token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
    general_config.token_embd_ptr = token_embd.data_ptr()
    general_config.token_embd_type = type_to_ggml_type(token_embd_type)

    general_config.pool = CPUInfer.backend_

    llm = kt_kernel_ext.DeepseekV3ForCausalLM(general_config)
    model = kt_kernel_ext.DeepseekV3Model(general_config)
    llm.model = model

    decoder_layers = []
    for i in range(json_config["num_hidden_layers"]):
        # for i in range(6):
        # for i in [0,1,2,3,4,5,6,7,8,9,10]:
        layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
        attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
        ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")

        layer.load_norm(
            attn_norm.data_ptr(),
            type_to_ggml_type(attn_norm_type),
            ffn_norm.data_ptr(),
            type_to_ggml_type(ffn_norm_type),
        )
        layer.self_attn = build_mla(i, json_config, gguf_weights)
        if f"blk.{i}.ffn_gate_inp.weight" in gguf_weights:
            layer.gate = build_moegate(i, json_config, gguf_weights)
        layer.ffn = build_ffn(i, json_config, gguf_weights)
        decoder_layers.append(layer)

    model.layers = decoder_layers
    return llm


safetensor_path = "/home/bd/models/DeepSeek-R1"
json_path = os.path.join(safetensor_path, "config.json")
json_config = json.load(open(json_path, "r"))
print(json_config)

gguf_path = "/home/bd/models/DeepSeek-R1-BF16"
weights = read_gguf_directory(gguf_path)
weights = dict(sorted(weights.items()))


for name, t in weights.items():
    # if not name.startswith("blk"):
    # if name.startswith("blk.10."):
    # if "ffn_gate." in name:
    # print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
    print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
print("Building LLM ...")
llm = build_llm(json_config, weights)
print("Release Weight Tensors ...")
weights = None
print("Loading Configs ...")


tokenizer = AutoTokenizer.from_pretrained(safetensor_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True)
prompt_file = None
force_think = False


output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)


def start_chat():
    while True:
        content = input("Chat: ")
        if content.startswith('"""'):  # prefix """
            # multi lines input
            content = content[3:] + "\n"
            while True:
                line = input("")
                if line.endswith('"""'):
                    # end multi lines input
                    line = line[:-3]  # suffix """
                    if line:
                        content += line + "\n"
                    break
                else:
                    content += line + "\n"

        if content == "":
            if prompt_file != None:
                content = open(prompt_file, "r").read()
            else:
                content = "Please write a piece of quicksort code in C++."
        elif os.path.isfile(content):
            content = open(content, "r").read()

        messages = [{"role": "user", "content": content}]
        input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
        if force_think:
            token_thinks = torch.tensor(
                [tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
            )
            input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
        input_tensor = input_tensor.squeeze(0)  # Add batch dimension

        print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")
        while True:
            stream = TextStreamer(tokenizer)

            qlen = input_tensor.shape[0]
            qlens = [qlen]
            kvlens = [0]
            page_tables = [list(range(pages_count))]
            llm.forward(qlens, page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())

            logits = output_logits[0]
            # print(logits)
            # sample
            next_token = torch.argmax(logits).item()
            # print(f"Next token: {next_token}, {tokenizer.decode(next_token)}")
            input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)

            if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
                print(stream.end(), end="", flush=True)
                break
            else:
                print(stream.put(torch.tensor([next_token])), end="", flush=True)


job_id = 0
while True:
    try:
        # ---------- 让用户决定是否继续 ----------
        choice = input("\n【回车】开始对话 | 输入 q/quit/exit 退出程序： ").strip().lower()
        if choice in {"q", "quit", "exit"}:
            print("收到退出指令，程序结束。")
            break

        # ----------------------------------------

        start_chat()  # 启动聊天会话
    except KeyboardInterrupt:
        # 随时 Ctrl-C：放弃当前任务并重启
        print(f"\n检测到 Ctrl-C，已终止对话 #{job_id}，马上重启…")
    finally:
        job_id += 1  # 不管中断与否，都给下一任务换编号


================================================
FILE: kt-kernel/examples/test_deepseekv3_prefill_speed.py
================================================
import os, sys
import time

os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
import logging
import sys
import json
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GenerationConfig,
    TextStreamer,
)

logger = logging.getLogger("reader")

from gguf.gguf_reader import GGUFReader

# load_layers = 3
load_layers = None
worker_config = kt_kernel_ext.WorkerPoolConfig()
worker_config.subpool_count = 2
worker_config.subpool_numa_map = [0, 1]
worker_config.subpool_thread_count = [72, 72]
CPUInfer = kt_kernel_ext.CPUInfer(worker_config)

max_qlen = 4096
max_kvlen = 4096
page_size = 256
pages_count = 200


def read_gguf_file(gguf_file_path):
    """
    Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.

    Parameters:
    - gguf_file_path: Path to the GGUF file.
    """

    reader = GGUFReader(gguf_file_path)

    # List all key-value pairs in a columnized format
    # print("Key-Value Pairs:") # noqa: NP100
    # max_key_length = max(len(key) for key in reader.fields.keys())
    for key, field in reader.fields.items():
        value = field.parts[field.data[0]]
        # print(f"{key:{max_key_length}} : {value}") # noqa: NP100
    # print("----") # noqa: NP100

    # List all tensors
    # print("Tensors:") # noqa: NP100
    # tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
    # print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100
    # print("-" * 80) # noqa: NP100
    re = []
    for tensor in reader.tensors:
        shape_str = "x".join(map(str, tensor.shape))
        size_str = str(tensor.n_elements)
        quantization_str = tensor.tensor_type.name
        # print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100
        re.append(tensor)
    return re


def read_gguf_directory(directory):
    """
    Reads all GGUF files in a directory and prints their contents.

    Parameters:
    - directory: Path to the directory containing GGUF files.
    """
    if not os.path.isdir(directory):
        logger.error(f"Directory {directory} does not exist.")
        return

    # List all GGUF files in the directory
    files = [f for f in os.listdir(directory) if f.endswith(".gguf")]
    if not files:
        logger.info(f"No GGUF files found in {directory}.")
        return

    re = []
    for file in files:
        file_path = os.path.join(directory, file)
        # print(f"Reading {file_path}:") # noqa: NP100
        # print("\n") # noqa: NP100
        re.extend(read_gguf_file(file_path))
    re = {r.name: r for r in re}
    return re


def find_weights(name, weights):
    """
    Finds and returns the weights for a given name from the list of weights.

    Parameters:
    - name: The name of the weights to find.
    - weights: List of weight tensors.

    Returns:
    - The weight tensor if found, otherwise None.
    """
    for weight in weights:
        if weight.name == name:
            return weight
    raise ValueError(f"Weight with name {name} not found in the provided weights list.")


def get_torch_tensor_from_gguf(gguf_weights, name):
    return torch.from_numpy(gguf_weights[name].data).contiguous()


def get_torch_tensor_and_type_from_gguf(gguf_weights, name):
    return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name


def type_to_ggml_type(type):
    if type == "F32":
        return ggml_type.FP32
    elif type == "F16":
        return ggml_type.FP16
    elif type == "BF16":
        return ggml_type.BF16
    else:
        raise ValueError(f"Unsupported data type: {type}")


def build_mla(layer_idx, json_config, gguf_weights):
    hidden_size = json_config["hidden_size"]
    num_heads = json_config["num_attention_heads"]
    q_lora_rank = json_config["q_lora_rank"]
    kv_lora_rank = json_config["kv_lora_rank"]
    nope_size = json_config["qk_nope_head_dim"]
    rope_size = json_config["qk_rope_head_dim"]
    max_position_embeddings = json_config["max_position_embeddings"]
    rope_theta = json_config["rope_theta"]
    rope_scaling = json_config["rope_scaling"]

    config = kt_kernel_ext.mla.MLAConfig(
        hidden_size,
        q_lora_rank,
        kv_lora_rank,
        num_heads,
        nope_size,
        rope_size,
    )
    config.max_qlen = max_qlen
    config.max_kvlen = max_kvlen
    config.max_position_embeddings = max_position_embeddings
    config.rope_scaling_factor = rope_scaling["factor"]
    config.rope_theta = rope_theta
    config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
    config.rope_scaling_beta_slow = rope_scaling["beta_slow"]
    config.rope_scaling_mscale = rope_scaling["mscale"]
    config.rope_scaling_mscale_all_dim = rope_scaling["mscale_all_dim"]
    config.rope_scaling_original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]

    q_a_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a.weight")
    config.q_a_proj = q_a_proj_weight.data_ptr()
    config.q_a_proj_type = type_to_ggml_type(type)
    q_a_type = type

    q_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a_norm.weight")
    config.q_a_norm = q_a_norm_weight.data_ptr()
    config.q_a_norm_type = type_to_ggml_type(type)

    q_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_b.weight")
    config.q_b_proj = q_b_proj_weight.data_ptr()
    config.q_b_proj_type = type_to_ggml_type(type)

    kv_a_proj_with_mqa_weight, type = get_torch_tensor_and_type_from_gguf(
        gguf_weights, f"blk.{layer_idx}.attn_kv_a_mqa.weight"
    )
    config.kv_a_proj_with_mqa = kv_a_proj_with_mqa_weight.data_ptr()
    config.kv_a_proj_with_mqa_type = type_to_ggml_type(type)

    kv_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_a_norm.weight")
    config.kv_a_norm = kv_a_norm_weight.data_ptr()
    config.kv_a_norm_type = type_to_ggml_type(type)

    kv_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_b.weight")
    config.kv_b_proj = kv_b_proj_weight.data_ptr()
    config.kv_b_proj_type = type_to_ggml_type(type)

    o_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_output.weight")
    config.o_proj = o_proj_weight.data_ptr()
    config.w_o_type = type_to_ggml_type(type)

    config.layer_idx = layer_idx
    config.pool = CPUInfer.backend_
    config.page_count = pages_count

    if q_a_type == "F32":
        mla = kt_kernel_ext.mla.MLA_F32(config)
    elif q_a_type == "F16":
        mla = kt_kernel_ext.mla.MLA_F16(config)
    elif q_a_type == "BF16":
        # mla = kt_kernel_ext.mla.MLA_F32(config)
        mla = kt_kernel_ext.mla.MLA_QUAN_F32(config)
    else:
        raise ValueError(f"Unsupported data type: {q_a_type}")

    mla.load_weights()
    mla.set_local_pages(pages_count)
    return mla


def build_ffn(layer_idx, json_config, gguf_weights):
    if f"blk.{layer_idx}.ffn_gate.weight" in gguf_weights:  # dense
        config = kt_kernel_ext.moe.MOEConfig(
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["hidden_size"],
            json_config["moe_intermediate_size"],
        )
        config.layer_idx = layer_idx
        config.max_len = max_qlen
        config.pool = CPUInfer.backend_
        gate, gate_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate.weight")
        up, up_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up.weight")
        down, down_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_down.weight")

        config.gate_proj = gate.data_ptr()
        config.gate_type = type_to_ggml_type(gate_type)
        config.up_proj = up.data_ptr()
        config.up_type = type_to_ggml_type(up_type)
        config.down_proj = down.data_ptr()
        config.down_type = type_to_ggml_type(down_type)

        moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
        moe.load_weights()
        return moe

    elif f"blk.{layer_idx}.ffn_gate_exps.weight" in gguf_weights:
        config = kt_kernel_ext.moe.MOEConfig(
            json_config["n_routed_experts"] + json_config["n_shared_experts"],
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["hidden_size"],
            json_config["moe_intermediate_size"],
        )
        config.layer_idx = layer_idx
        config.max_len = max_qlen
        config.pool = CPUInfer.backend_
        gate, gate_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_exps.weight")
        up, up_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up_exps.weight")
        down, down_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_down_exps.weight")

        gate_sh, gate_sh_type = get_torch_tensor_and_type_from_gguf(
            gguf_weights, f"blk.{layer_idx}.ffn_gate_shexp.weight"
        )
        up_sh, up_sh_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up_shexp.weight")
        down_sh, down_sh_type = get_torch_tensor_and_type_from_gguf(
            gguf_weights, f"blk.{layer_idx}.ffn_down_shexp.weight"
        )

        gate_sh_expanded = gate_sh.unsqueeze(0)
        gate = torch.cat([gate, gate_sh_expanded], dim=0).contiguous()
        up_sh_expanded = up_sh.unsqueeze(0)
        up = torch.cat([up, up_sh_expanded], dim=0).contiguous()
        down_sh_expanded = down_sh.unsqueeze(0)
        down = torch.cat([down, down_sh_expanded], dim=0).contiguous()

        config.gate_proj = gate.data_ptr()
        config.gate_type = type_to_ggml_type(gate_type)
        config.up_proj = up.data_ptr()
        config.up_type = type_to_ggml_type(up_type)
        config.down_proj = down.data_ptr()
        config.down_type = type_to_ggml_type(down_type)

        moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
        moe.load_weights()
        return moe

    else:
        raise ValueError(f"Unsupported FFN type for layer {layer_idx}")


def build_moegate(layer_idx, json_config, gguf_weights):
    config = kt_kernel_ext.gate.GateConfig(
        json_config["hidden_size"],
        json_config["num_experts_per_tok"],
        json_config["n_routed_experts"],
        json_config["n_group"],
        json_config["topk_group"],
    )

    config.routed_scaling_factor = json_config["routed_scaling_factor"]

    config.pool = CPUInfer.backend_

    weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
    config.weight = weight.data_ptr()
    config.weight_type = type_to_ggml_type(weight_type)

    bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
    config.e_score_correction_bias = bias.data_ptr()
    config.e_score_correction_bias_type = type_to_ggml_type(bias_type)

    gate = kt_kernel_ext.gate.MoEGate(config)

    return gate


def build_llm(json_config, gguf_weights):

    general_config = kt_kernel_ext.GeneralConfig()
    general_config.vocab_size = json_config["vocab_size"]
    general_config.hidden_size = json_config["hidden_size"]
    general_config.num_experts_per_tok = json_config["num_experts_per_tok"]
    general_config.n_routed_experts = json_config["n_routed_experts"]
    general_config.n_shared_experts = json_config["n_shared_experts"]
    general_config.max_qlen = max_qlen

    lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
    general_config.lm_heads_ptr = lm_heads.data_ptr()
    general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)

    output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight")
    general_config.norm_weights_ptr = output_norm.data_ptr()
    general_config.norm_weights_type = type_to_ggml_type(output_norm_type)

    token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
    general_config.token_embd_ptr = token_embd.data_ptr()
    general_config.token_embd_type = type_to_ggml_type(token_embd_type)

    general_config.pool = CPUInfer.backend_

    llm = kt_kernel_ext.DeepseekV3ForCausalLM(general_config)
    model = kt_kernel_ext.DeepseekV3Model(general_config)
    llm.model = model

    decoder_layers = []
    real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers

    for i in range(real_load_layers):
        # for i in [2,3]:
        layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
        attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
        ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")

        layer.load_norm(
            attn_norm.data_ptr(),
            type_to_ggml_type(attn_norm_type),
            ffn_norm.data_ptr(),
            type_to_ggml_type(ffn_norm_type),
        )
        layer.self_attn = build_mla(i, json_config, gguf_weights)
        if f"blk.{i}.ffn_gate_inp.weight" in gguf_weights:
            layer.gate = build_moegate(i, json_config, gguf_weights)
        layer.ffn = build_ffn(i, json_config, gguf_weights)
        decoder_layers.append(layer)

    model.layers = decoder_layers
    return llm


safetensor_path = "/home/bd/models/DeepSeek-R1"
json_path = os.path.join(safetensor_path, "config.json")
json_config = json.load(open(json_path, "r"))
print(json_config)

gguf_path = "/home/bd/models/DeepSeek-R1-BF16"
weights = read_gguf_directory(gguf_path)
weights = dict(sorted(weights.items()))


# for name, t in weights.items():
# if not name.startswith("blk"):
# if name.startswith("blk.10."):
# if "ffn_gate." in name:
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")

print("Building LLM ...")
load_start_time = time.perf_counter()
llm = build_llm(json_config, weights)
load_end_time = time.perf_counter()
print(f"Load time: {load_end_time - load_start_time:.4f} seconds")

print("Release Weight Tensors ...")
weights = None
print("Loading Configs ...")


tokenizer = AutoTokenizer.from_pretrained(safetensor_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True)

force_think = False


output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)


def start_chat(content=None):
    if content is None:
        content = input("Chat: ")

    messages = [{"role": "user", "content": content}]
    input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
    if force_think:
        token_thinks = torch.tensor(
            [tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
        )
        input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
    input_tensor = input_tensor.squeeze(0)  # Add batch dimension

    print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")
    kvlen = 0
    step = 2
    while True or step > 0:
        step -= 1
        stream = TextStreamer(tokenizer)

        qlen = input_tensor.shape[0]
        qlens = [qlen]
        kvlens = [0]
        page_tables = [list(range(pages_count))]
        start_time = time.perf_counter()
        llm.forward(qlens, page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())
        end_time = time.perf_counter()
        print(
            f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec"
        )

        logits = output_logits[0]
        # print(logits)
        # sample
        next_token = torch.argmax(logits).item()
        # print(f"Next token: {next_token}, {tokenizer.decode(next_token)}")
        # kvlen = input_tensor.shape[0]
        input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)

        if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
            stream.end()
            break
        else:
            stream.put(torch.tensor([next_token]))


job_id = 0
while True:
    try:
        # ---------- 让用户决定是否继续 ----------
        choice = input("\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序： ").strip().lower()
        if choice in {"q", "quit", "exit"}:
            print("收到退出指令，程序结束。")
            break
        elif choice == "1":
            file_path = input("请输入要读取的文件路径：").strip()
            if not Path(file_path).is_file():
                print(f"文件 {file_path} 不存在，请检查路径。")
                continue
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
            print(f"读取到内容：\n{content}\n")
            start_chat(content)
        else:
            start_chat()

    except KeyboardInterrupt:
        # 随时 Ctrl-C：放弃当前任务并重启
        print(f"\n检测到 Ctrl-C，已终止对话 #{job_id}，马上重启…")
    except Exception as e:
        # 其他异常：打印错误信息并重启
        print(f"\n发生错误：{e}\n已终止对话 #{job_id}，马上重启…")
        logger.error(f"Error in job {job_id}: {e}", exc_info=True)
    finally:
        job_id += 1  # 不管中断与否，都给下一任务换编号


================================================
FILE: kt-kernel/examples/test_fp8_moe.py
================================================
"""
Test script for GemmKernel224FP8 (FP8 MoE) kernel validation.

This script:
1. Generates random BF16 weights
2. Quantizes them to FP8 format with 128x128 block-wise scales
3. Runs the FP8 MoE kernel
4. Compares results with PyTorch reference using dequantized BF16 weights

FP8 format notes:
- Weight: FP8 (E4M3) stored as uint8, shape [expert_num, n, k]
- Scale: FP32, shape [expert_num, n // group_size, k // group_size], group_size=128
"""

import os
import sys

sys.path.insert(0, os.path.dirname(__file__) + "/../build")

import torch
import kt_kernel
from kt_kernel import kt_kernel_ext

torch.manual_seed(42)

# Model config
hidden_size = 3072
intermediate_size = 1536
max_len = 25600

expert_num = 16
num_experts_per_tok = 8

qlen = 100
layer_num = 1
CPUInfer = kt_kernel_ext.CPUInfer(40)
validation_iter = 1
fp8_group_size = 128  # FP8 uses 128x128 block quantization
debug_print_count = 16

physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()


def act_fn(x):
    """SiLU activation function"""
    return x / (1.0 + torch.exp(-x))


def mlp_torch(input, gate_proj, up_proj, down_proj):
    """Reference MLP computation in PyTorch"""
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret


def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    """Reference MoE computation in PyTorch"""
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0

    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )
    return t_output


# FP8 E4M3 constants
FP8_E4M3_MAX = 448.0  # Maximum representable value in FP8 E4M3


def fp8_e4m3_to_float(fp8_val: int) -> float:
    """
    Convert FP8 E4M3 value to float.
    FP8 E4M3 format: 1 sign bit, 4 exponent bits, 3 mantissa bits
    """
    sign = (fp8_val >> 7) & 1
    exp = (fp8_val >> 3) & 0xF
    mant = fp8_val & 0x7

    if exp == 0:
        # Subnormal or zero
        if mant == 0:
            return -0.0 if sign else 0.0
        # Subnormal: value = (-1)^sign * 2^(-6) * (0.mant)
        return ((-1) ** sign) * (2**-6) * (mant / 8.0)
    elif exp == 15:
        # NaN (FP8 E4M3 doesn't have Inf, all exp=15 are NaN)
        return float("nan")
    else:
        # Normal: value = (-1)^sign * 2^(exp-7) * (1.mant)
        return ((-1) ** sign) * (2 ** (exp - 7)) * (1.0 + mant / 8.0)


def float_to_fp8_e4m3(val: float) -> int:
    """
    Convert float to FP8 E4M3 value.
    """
    if val != val:  # NaN
        return 0x7F  # NaN representation

    sign = 1 if val < 0 else 0
    val = abs(val)

    if val == 0:
        return sign << 7

    # Clamp to max representable value
    val = min(val, FP8_E4M3_MAX)

    # Find exponent
    import math

    if val < 2**-9:  # Subnormal threshold
        # Subnormal
        mant = int(round(val / (2**-9)))
        mant = min(mant, 7)
        return (sign << 7) | mant

    exp = int(math.floor(math.log2(val))) + 7
    exp = max(1, min(exp, 14))  # Clamp exponent to valid range

    # Calculate mantissa
    mant = int(round((val / (2 ** (exp - 7)) - 1.0) * 8))
    mant = max(0, min(mant, 7))

    # Handle overflow to next exponent
    if mant > 7:
        mant = 0
        exp += 1
        if exp > 14:
            exp = 14
            mant = 7

    return (sign << 7) | (exp << 3) | mant


def quantize_to_fp8_blockwise(weights: torch.Tensor, group_size: int = 128):
    """
    Quantize BF16/FP32 weights to FP8 with block-wise scaling.

    Args:
        weights: [expert_num, n, k] tensor in BF16/FP32
        group_size: Block size for quantization (default 128 for DeepSeek)

    Returns:
        fp8_weights: [expert_num, n, k] uint8 tensor
        scales: [expert_num, n // group_size, k // group_size] BF16 tensor (scale_inv)
    """
    weights_f32 = weights.to(torch.float32)
    e, n, k = weights_f32.shape

    assert n % group_size == 0, f"n ({n}) must be divisible by group_size ({group_size})"
    assert k % group_size == 0, f"k ({k}) must be divisible by group_size ({group_size})"

    n_blocks = n // group_size
    k_blocks = k // group_size

    # Reshape to [e, n_blocks, group_size, k_blocks, group_size]
    reshaped = weights_f32.view(e, n_blocks, group_size, k_blocks, group_size)
    # Move to [e, n_blocks, k_blocks, group_size, group_size] for block processing
    reshaped = reshaped.permute(0, 1, 3, 2, 4)

    # Calculate max abs per block
    max_abs = reshaped.abs().amax(dim=(-2, -1), keepdim=True)
    max_abs = torch.clamp(max_abs, min=1e-12)

    # Scale to FP8 range: scale = max_abs / FP8_MAX
    # We store scale_inv = scale (for dequantization: fp8 * scale)
    scales = (max_abs / FP8_E4M3_MAX).squeeze(-1).squeeze(-1)  # [e, n_blocks, k_blocks]

    # Quantize: q = round(val / scale)
    scaled = reshaped / (scales.unsqueeze(-1).unsqueeze(-1) + 1e-12)

    # Convert to FP8 E4M3 using vectorized approach
    # Clamp to FP8 representable range
    scaled = scaled.clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX)

    # Simple quantization: round to nearest representable FP8 value
    # For simplicity, we use a lookup table approach
    fp8_q = torch.zeros_like(scaled, dtype=torch.uint8)

    # Vectorized FP8 quantization
    sign_mask = (scaled < 0).to(torch.uint8) << 7
    abs_scaled = scaled.abs()

    # Handle different ranges
    # Subnormal: 0 < |x| < 2^-6
    subnormal_mask = (abs_scaled > 0) & (abs_scaled < 2**-6)
    subnormal_mant = (abs_scaled / (2**-9)).round().clamp(0, 7).to(torch.uint8)

    # Normal values
    normal_mask = abs_scaled >= 2**-6
    log2_val = torch.log2(abs_scaled.clamp(min=2**-9))
    exp = (log2_val.floor() + 7).clamp(1, 14).to(torch.int32)
    mant = ((abs_scaled / (2.0 ** (exp.float() - 7)) - 1.0) * 8).round().clamp(0, 7).to(torch.uint8)

    # Combine
    fp8_q = torch.where(subnormal_mask, sign_mask | subnormal_mant, fp8_q)
    fp8_q = torch.where(normal_mask, sign_mask | (exp.to(torch.uint8) << 3) | mant, fp8_q)

    # Reshape back to [e, n, k]
    fp8_q = fp8_q.permute(0, 1, 3, 2, 4).reshape(e, n, k)

    # Scales shape: [e, n_blocks, k_blocks] -> store as [e, n_blocks, k_blocks]
    scales_fp32 = scales.to(torch.float32).contiguous()

    return fp8_q.contiguous(), scales_fp32


def dequantize_fp8_blockwise(fp8_weights: torch.Tensor, scales: torch.Tensor, group_size: int = 128):
    """
    Dequantize FP8 weights back to BF16 for reference computation.

    Args:
        fp8_weights: [expert_num, n, k] uint8 tensor
        scales: [expert_num, n // group_size, k // group_size] BF16 tensor
        group_size: Block size

    Returns:
        dequantized: [expert_num, n, k] BF16 tensor
    """
    e, n, k = fp8_weights.shape
    n_blocks = n // group_size
    k_blocks = k // group_size

    # Convert FP8 to float
    # Build lookup table for FP8 E4M3 -> float
    fp8_lut = torch.tensor([fp8_e4m3_to_float(i) for i in range(256)], dtype=torch.float32)

    # Use lookup table
    fp8_float = fp8_lut[fp8_weights.to(torch.int64)]

    # Reshape for block-wise scaling
    fp8_reshaped = fp8_float.view(e, n_blocks, group_size, k_blocks, group_size)
    fp8_reshaped = fp8_reshaped.permute(0, 1, 3, 2, 4)  # [e, n_blocks, k_blocks, group_size, group_size]

    # Apply scales
    scales_f32 = scales.to(torch.float32).unsqueeze(-1).unsqueeze(-1)  # [e, n_blocks, k_blocks, 1, 1]
    dequantized = fp8_reshaped * scales_f32

    # Reshape back
    dequantized = dequantized.permute(0, 1, 3, 2, 4).reshape(e, n, k)

    return dequantized.to(torch.bfloat16).contiguous()


def build_random_fp8_weights():
    """
    Generate random BF16 weights and quantize to FP8.

    Returns:
        dict with fp8 weights, scales, and original bf16 for reference
    """
    torch.manual_seed(42)

    # Generate random BF16 weights with small values
    gate_proj = (torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32) / 100.0).to(
        torch.bfloat16
    )
    up_proj = (torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32) / 100.0).to(
        torch.bfloat16
    )
    down_proj = (torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32) / 100.0).to(
        torch.bfloat16
    )

    # Quantize to FP8
    gate_fp8, gate_scales = quantize_to_fp8_blockwise(gate_proj, fp8_group_size)
    up_fp8, up_scales = quantize_to_fp8_blockwise(up_proj, fp8_group_size)
    down_fp8, down_scales = quantize_to_fp8_blockwise(down_proj, fp8_group_size)

    # Dequantize for reference computation
    gate_deq = dequantize_fp8_blockwise(gate_fp8, gate_scales, fp8_group_size)
    up_deq = dequantize_fp8_blockwise(up_fp8, up_scales, fp8_group_size)
    down_deq = dequantize_fp8_blockwise(down_fp8, down_scales, fp8_group_size)

    print(f"FP8 weights shape: gate={gate_fp8.shape}, up={up_fp8.shape}, down={down_fp8.shape}")
    print(f"Scales shape: gate={gate_scales.shape}, up={up_scales.shape}, down={down_scales.shape}")

    # Debug: Print FP8 weight and scale info for expert 0
    print("\n=== DEBUG: FP8 Weight and Scale Info (Expert 0) ===")
    print(f"gate_fp8[0] first 8x8 block:")
    for i in range(8):
        print(f"  row {i}: {gate_fp8[0, i, :8].numpy().tobytes().hex(' ')}")
    print(f"gate_fp8[0] stats: min={gate_fp8[0].min()}, max={gate_fp8[0].max()}")
    print(f"gate_scales[0] first 4x4 block:\n{gate_scales[0, :4, :4]}")
    print(f"gate_scales[0] stats: min={gate_scales[0].min()}, max={gate_scales[0].max()}")

    print(f"\nup_fp8[0] first 8x8 block:")
    for i in range(8):
        print(f"  row {i}: {up_fp8[0, i, :8].numpy().tobytes().hex(' ')}")
    print(f"up_fp8[0] stats: min={up_fp8[0].min()}, max={up_fp8[0].max()}")
    print(f"up_scales[0] first 4x4 block:\n{up_scales[0, :4, :4]}")
    print(f"up_scales[0] stats: min={up_scales[0].min()}, max={up_scales[0].max()}")

    print(f"\ndown_fp8[0] first 8x8 block:")
    for i in range(8):
        print(f"  row {i}: {down_fp8[0, i, :8].numpy().tobytes().hex(' ')}")
    print(f"down_fp8[0] stats: min={down_fp8[0].min()}, max={down_fp8[0].max()}")
    print(f"down_scales[0] first 4x4 block:\n{down_scales[0, :4, :4]}")
    print(f"down_scales[0] stats: min={down_scales[0].min()}, max={down_scales[0].max()}")

    return {
        "gate_fp8": gate_fp8.contiguous(),
        "up_fp8": up_fp8.contiguous(),
        "down_fp8": down_fp8.contiguous(),
        "gate_scales": gate_scales.contiguous(),
        "up_scales": up_scales.contiguous(),
        "down_scales": down_scales.contiguous(),
        "gate_deq": gate_deq.contiguous(),
        "up_deq": up_deq.contiguous(),
        "down_deq": down_deq.contiguous(),
    }


def build_moes_from_fp8_data(fp8_data: dict):
    """
    Build FP8 MoE modules from quantized data.
    """
    moes = []
    with torch.inference_mode(mode=True):
        for _ in range(layer_num):
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.quant_config.bits = 8
            config.quant_config.group_size = fp8_group_size
            config.quant_config.zero_point = False

            # Set FP8 weight pointers
            config.gate_proj = fp8_data["gate_fp8"].data_ptr()
            config.up_proj = fp8_data["up_fp8"].data_ptr()
            config.down_proj = fp8_data["down_fp8"].data_ptr()

            # Set scale pointers
            config.gate_scale = fp8_data["gate_scales"].data_ptr()
            config.up_scale = fp8_data["up_scales"].data_ptr()
            config.down_scale = fp8_data["down_scales"].data_ptr()
            config.pool = CPUInfer.backend_

            moe = kt_kernel_ext.moe.AMXFP8_MOE(config)
            CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
            CPUInfer.sync()
            moes.append(moe)
    return moes


def run_fp8_moe_test():
    """
    Run FP8 MoE validation test.
    """
    print("\n" + "=" * 70)
    print("FP8 MoE Kernel Validation Test")
    print("=" * 70)

    # Build FP8 weights
    print("\nGenerating and quantizing weights...")
    fp8_data = build_random_fp8_weights()

    # Build MoE modules
    print("\nBuilding FP8 MoE modules...")
    moes = build_moes_from_fp8_data(fp8_data)

    # Get dequantized weights for reference
    gate_deq = fp8_data["gate_deq"]
    up_deq = fp8_data["up_deq"]
    down_deq = fp8_data["down_deq"]

    diffs = []
    with torch.inference_mode(mode=True):
        for i in range(validation_iter):
            torch.manual_seed(100 + i)
            bsz_tensor = torch.tensor([qlen], device="cpu")
            expert_ids = torch.stack(
                [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
            ).contiguous()
            weights = torch.randn((qlen, num_experts_per_tok), dtype=torch.float32).contiguous() / 100
            input_tensor = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous() * 1.5
            output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()

            moe = moes[i % layer_num]
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids.data_ptr(),
                    weights.data_ptr(),
                    input_tensor.data_ptr(),
                    output.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

            assert not torch.isnan(output).any(), "NaN values detected in CPU expert output."
            assert not torch.isinf(output).any(), "Inf values detected in CPU expert output."

            # Reference computation using dequantized weights
            t_output = moe_torch(input_tensor, expert_ids, weights, gate_deq, up_deq, down_deq)

            t_output_flat = t_output.flatten()
            output_flat = output.flatten()

            diff = torch.mean(torch.abs(output_flat - t_output_flat)) / (torch.mean(torch.abs(t_output_flat)) + 1e-12)
            diffs.append(diff.item())
            print(f"Iteration {i}: relative L1 diff = {diff:.6f}")

            if i < 3:  # Print detailed output for first few iterations
                print(f"  kernel output: {output_flat[:debug_print_count]}")
                print(f"  torch output:  {t_output_flat[:debug_print_count]}")

    mean_diff = float(sum(diffs) / len(diffs))
    max_diff = float(max(diffs))
    min_diff = float(min(diffs))

    print("\n" + "=" * 70)
    print("FP8 MoE Test Results")
    print("=" * 70)
    print(f"Mean relative L1 diff: {mean_diff*100:.4f}%")
    print(f"Max relative L1 diff:  {max_diff*100:.4f}%")
    print(f"Min relative L1 diff:  {min_diff*100:.4f}%")

    # Pass/Fail criteria
    threshold = 15.0  # 15% relative error threshold for FP8
    if mean_diff * 100 < threshold:
        print(f"\nPASS: Mean error {mean_diff*100:.4f}% < {threshold}% threshold")
    else:
        print(f"\nFAIL: Mean error {mean_diff*100:.4f}% >= {threshold}% threshold")

    return {"mean": mean_diff, "max": max_diff, "min": min_diff}


if __name__ == "__main__":
    run_fp8_moe_test()


================================================
FILE: kt-kernel/examples/test_fp8_perchannel_moe.py
================================================
"""
Test script for FP8 Per-Channel MoE kernel validation (GLM-4.7-FP8 style).

This script:
1. Generates random BF16 weights
2. Quantizes them to FP8 format with per-channel scales (one scale per output channel)
3. Runs the FP8 Per-Channel MoE kernel
4. Compares results with PyTorch reference using dequantized BF16 weights

FP8 Per-Channel format notes:
- Weight: FP8 (E4M3) stored as uint8, shape [expert_num, n, k]
- Scale: FP32, shape [expert_num, n] (one scale per output row)
"""

import os
import sys

sys.path.insert(0, os.path.dirname(__file__) + "/../build")

import torch
from kt_kernel import kt_kernel_ext

torch.manual_seed(42)

# Model config
hidden_size = 3072
intermediate_size = 1536
max_len = 25600

expert_num = 16
num_experts_per_tok = 8

qlen = 100
layer_num = 1
CPUInfer = kt_kernel_ext.CPUInfer(40)
validation_iter = 1
debug_print_count = 16

physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()


def act_fn(x):
    """SiLU activation function"""
    return x / (1.0 + torch.exp(-x))


def mlp_torch(input, gate_proj, up_proj, down_proj):
    """Reference MLP computation in PyTorch"""
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret


def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    """Reference MoE computation in PyTorch"""
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0

    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )
    return t_output


# FP8 E4M3 constants
FP8_E4M3_MAX = 448.0  # Maximum representable value in FP8 E4M3


def fp8_e4m3_to_float(fp8_val: int) -> float:
    """
    Convert FP8 E4M3 value to float.
    FP8 E4M3 format: 1 sign bit, 4 exponent bits, 3 mantissa bits
    """
    sign = (fp8_val >> 7) & 1
    exp = (fp8_val >> 3) & 0xF
    mant = fp8_val & 0x7

    if exp == 0:
        # Subnormal or zero
        if mant == 0:
            return -0.0 if sign else 0.0
        # Subnormal: value = (-1)^sign * 2^(-6) * (0.mant)
        return ((-1) ** sign) * (2**-6) * (mant / 8.0)
    elif exp == 15:
        # NaN (FP8 E4M3 doesn't have Inf, all exp=15 are NaN)
        return float("nan")
    else:
        # Normal: value = (-1)^sign * 2^(exp-7) * (1.mant)
        return ((-1) ** sign) * (2 ** (exp - 7)) * (1.0 + mant / 8.0)


def float_to_fp8_e4m3(val: float) -> int:
    """
    Convert float to FP8 E4M3 value.
    """
    if val != val:  # NaN
        return 0x7F  # NaN representation

    sign = 1 if val < 0 else 0
    val = abs(val)

    if val == 0:
        return sign << 7

    # Clamp to max representable value
    val = min(val, FP8_E4M3_MAX)

    # Find exponent
    import math

    if val < 2**-9:  # Subnormal threshold
        # Subnormal
        mant = int(round(val / (2**-9)))
        mant = min(mant, 7)
        return (sign << 7) | mant

    exp = int(math.floor(math.log2(val))) + 7
    exp = max(1, min(exp, 14))  # Clamp exponent to valid range

    # Calculate mantissa
    mant = int(round((val / (2 ** (exp - 7)) - 1.0) * 8))
    mant = max(0, min(mant, 7))

    # Handle overflow to next exponent
    if mant > 7:
        mant = 0
        exp += 1
        if exp > 14:
            exp = 14
            mant = 7

    return (sign << 7) | (exp << 3) | mant


def quantize_to_fp8_perchannel(weights: torch.Tensor):
    """
    Quantize BF16/FP32 weights to FP8 with per-channel scaling.

    Args:
        weights: [expert_num, n, k] tensor in BF16/FP32

    Returns:
        fp8_weights: [expert_num, n, k] uint8 tensor
        scales: [expert_num, n] FP32 tensor (one scale per output row)
    """
    weights_f32 = weights.to(torch.float32)
    e, n, k = weights_f32.shape

    # Calculate max abs per row (per output channel)
    max_abs = weights_f32.abs().amax(dim=-1, keepdim=True)  # [e, n, 1]
    max_abs = torch.clamp(max_abs, min=1e-12)

    # Scale to FP8 range: scale = max_abs / FP8_MAX
    scales = (max_abs / FP8_E4M3_MAX).squeeze(-1)  # [e, n]

    # Quantize: q = round(val / scale)
    scaled = weights_f32 / (scales.unsqueeze(-1) + 1e-12)

    # Clamp to FP8 representable range
    scaled = scaled.clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX)

    # Vectorized FP8 quantization
    fp8_q = torch.zeros_like(scaled, dtype=torch.uint8)

    sign_mask = (scaled < 0).to(torch.uint8) << 7
    abs_scaled = scaled.abs()

    # Handle different ranges
    # Subnormal: 0 < |x| < 2^-6
    subnormal_mask = (abs_scaled > 0) & (abs_scaled < 2**-6)
    subnormal_mant = (abs_scaled / (2**-9)).round().clamp(0, 7).to(torch.uint8)

    # Normal values
    normal_mask = abs_scaled >= 2**-6
    log2_val = torch.log2(abs_scaled.clamp(min=2**-9))
    exp = (log2_val.floor() + 7).clamp(1, 14).to(torch.int32)
    mant = ((abs_scaled / (2.0 ** (exp.float() - 7)) - 1.0) * 8).round().clamp(0, 7).to(torch.uint8)

    # Combine
    fp8_q = torch.where(subnormal_mask, sign_mask | subnormal_mant, fp8_q)
    fp8_q = torch.where(normal_mask, sign_mask | (exp.to(torch.uint8) << 3) | mant, fp8_q)

    return fp8_q.contiguous(), scales.to(torch.float32).contiguous()


def dequantize_fp8_perchannel(fp8_weights: torch.Tensor, scales: torch.Tensor):
    """
    Dequantize FP8 weights back to BF16 for reference computation.

    Args:
        fp8_weights: [expert_num, n, k] uint8 tensor
        scales: [expert_num, n] FP32 tensor

    Returns:
        dequantized: [expert_num, n, k] BF16 tensor
    """
    # Build lookup table for FP8 E4M3 -> float
    fp8_lut = torch.tensor([fp8_e4m3_to_float(i) for i in range(256)], dtype=torch.float32)

    # Use lookup table
    fp8_float = fp8_lut[fp8_weights.to(torch.int64)]

    # Apply per-channel scales
    scales_expanded = scales.unsqueeze(-1)  # [e, n, 1]
    dequantized = fp8_float * scales_expanded

    return dequantized.to(torch.bfloat16).contiguous()


def build_random_fp8_perchannel_weights():
    """
    Generate random BF16 weights and quantize to FP8 with per-channel scales.

    Returns:
        dict with fp8 weights, scales, and original bf16 for reference
    """
    torch.manual_seed(42)

    # Generate random BF16 weights with small values
    gate_proj = (torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32) / 100.0).to(
        torch.bfloat16
    )
    up_proj = (torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32) / 100.0).to(
        torch.bfloat16
    )
    down_proj = (torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32) / 100.0).to(
        torch.bfloat16
    )

    # Quantize to FP8 with per-channel scales
    gate_fp8, gate_scales = quantize_to_fp8_perchannel(gate_proj)
    up_fp8, up_scales = quantize_to_fp8_perchannel(up_proj)
    down_fp8, down_scales = quantize_to_fp8_perchannel(down_proj)

    # Dequantize for reference computation
    gate_deq = dequantize_fp8_perchannel(gate_fp8, gate_scales)
    up_deq = dequantize_fp8_perchannel(up_fp8, up_scales)
    down_deq = dequantize_fp8_perchannel(down_fp8, down_scales)

    print(f"FP8 Per-Channel weights shape: gate={gate_fp8.shape}, up={up_fp8.shape}, down={down_fp8.shape}")
    print(f"Per-Channel scales shape: gate={gate_scales.shape}, up={up_scales.shape}, down={down_scales.shape}")

    # Debug: Print FP8 weight and scale info for expert 0
    print("\n=== DEBUG: FP8 Per-Channel Weight and Scale Info (Expert 0) ===")
    print(f"gate_fp8[0] first 8x8 block:")
    for i in range(8):
        print(f"  row {i}: {gate_fp8[0, i, :8].numpy().tobytes().hex(' ')}")
    print(f"gate_fp8[0] stats: min={gate_fp8[0].min()}, max={gate_fp8[0].max()}")
    print(f"gate_scales[0] first 8 channels: {gate_scales[0, :8]}")
    print(f"gate_scales[0] stats: min={gate_scales[0].min():.6f}, max={gate_scales[0].max():.6f}")

    return {
        "gate_fp8": gate_fp8.contiguous(),
        "up_fp8": up_fp8.contiguous(),
        "down_fp8": down_fp8.contiguous(),
        "gate_scales": gate_scales.contiguous(),
        "up_scales": up_scales.contiguous(),
        "down_scales": down_scales.contiguous(),
        "gate_deq": gate_deq.contiguous(),
        "up_deq": up_deq.contiguous(),
        "down_deq": down_deq.contiguous(),
    }


def build_moes_from_fp8_perchannel_data(fp8_data: dict):
    """
    Build FP8 Per-Channel MoE modules from quantized data.
    """
    moes = []
    with torch.inference_mode(mode=True):
        for _ in range(layer_num):
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.quant_config.bits = 8
            config.quant_config.group_size = 0  # Not used for per-channel
            config.quant_config.zero_point = False
            config.quant_config.per_channel = True  # Enable per-channel mode

            # Set FP8 weight pointers
            config.gate_proj = fp8_data["gate_fp8"].data_ptr()
            config.up_proj = fp8_data["up_fp8"].data_ptr()
            config.down_proj = fp8_data["down_fp8"].data_ptr()

            # Set per-channel scale pointers
            config.gate_scale = fp8_data["gate_scales"].data_ptr()
            config.up_scale = fp8_data["up_scales"].data_ptr()
            config.down_scale = fp8_data["down_scales"].data_ptr()
            config.pool = CPUInfer.backend_

            moe = kt_kernel_ext.moe.AMXFP8PerChannel_MOE(config)
            CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
            CPUInfer.sync()
            moes.append(moe)
    return moes


def run_fp8_perchannel_moe_test():
    """
    Run FP8 Per-Channel MoE validation test.
    """
    print("\n" + "=" * 70)
    print("FP8 Per-Channel MoE Kernel Validation Test")
    print("=" * 70)

    # Build FP8 per-channel weights
    print("\nGenerating and quantizing weights with per-channel scales...")
    fp8_data = build_random_fp8_perchannel_weights()

    # Build MoE modules
    print("\nBuilding FP8 Per-Channel MoE modules...")
    moes = build_moes_from_fp8_perchannel_data(fp8_data)

    # Get dequantized weights for reference
    gate_deq = fp8_data["gate_deq"]
    up_deq = fp8_data["up_deq"]
    down_deq = fp8_data["down_deq"]

    diffs = []
    with torch.inference_mode(mode=True):
        for i in range(validation_iter):
            torch.manual_seed(100 + i)
            bsz_tensor = torch.tensor([qlen], device="cpu")
            expert_ids = torch.stack(
                [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
            ).contiguous()
            weights = torch.randn((qlen, num_experts_per_tok), dtype=torch.float32).contiguous() / 100
            input_tensor = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous() * 1.5
            output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()

            moe = moes[i % layer_num]
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids.data_ptr(),
                    weights.data_ptr(),
                    input_tensor.data_ptr(),
                    output.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

            assert not torch.isnan(output).any(), "NaN values detected in CPU expert output."
            assert not torch.isinf(output).any(), "Inf values detected in CPU expert output."

            # Reference computation using dequantized weights
            t_output = moe_torch(input_tensor, expert_ids, weights, gate_deq, up_deq, down_deq)

            t_output_flat = t_output.flatten()
            output_flat = output.flatten()

            diff = torch.mean(torch.abs(output_flat - t_output_flat)) / (torch.mean(torch.abs(t_output_flat)) + 1e-12)
            diffs.append(diff.item())
            print(f"Iteration {i}: relative L1 diff = {diff:.6f}")

            if i < 3:  # Print detailed output for first few iterations
                print(f"  kernel output: {output_flat[:debug_print_count]}")
                print(f"  torch output:  {t_output_flat[:debug_print_count]}")

    mean_diff = float(sum(diffs) / len(diffs))
    max_diff = float(max(diffs))
    min_diff = float(min(diffs))

    print("\n" + "=" * 70)
    print("FP8 Per-Channel MoE Test Results")
    print("=" * 70)
    print(f"Mean relative L1 diff: {mean_diff*100:.4f}%")
    print(f"Max relative L1 diff:  {max_diff*100:.4f}%")
    print(f"Min relative L1 diff:  {min_diff*100:.4f}%")

    # Pass/Fail criteria
    threshold = 15.0  # 15% relative error threshold for FP8
    if mean_diff * 100 < threshold:
        print(f"\nPASS: Mean error {mean_diff*100:.4f}% < {threshold}% threshold")
    else:
        print(f"\nFAIL: Mean error {mean_diff*100:.4f}% >= {threshold}% threshold")

    return {"mean": mean_diff, "max": max_diff, "min": min_diff}


if __name__ == "__main__":
    run_fp8_perchannel_moe_test()


================================================
FILE: kt-kernel/examples/test_gate.py
================================================
import math
import os, sys
import time
from typing import Optional

os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type

import torch
from torch import nn
import torch.nn.functional as F

# from modeling_deepseek_v3 import MoEGate
from configuration_deepseek_v3 import DeepseekV3Config

seed = 42  # 你可以选择任何整数作为种子
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

seqlen = 64

config = DeepseekV3Config()

hidden_size = config.hidden_size
num_experts_per_token = config.num_experts_per_tok
n_routed_experts = config.n_routed_experts
n_group = config.n_group
topk_group = config.topk_group
routed_scaling_factor = config.routed_scaling_factor

weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float32).to("cpu").contiguous()
bias = torch.randn((n_routed_experts,), dtype=torch.float32).to("cpu").contiguous()


# weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float16).to('cpu').contiguous  ()
def load_fp32_tensor(file_path, shape):
    return torch.zeros(shape, dtype=torch.float32).to("cpu").contiguous()
    with open(file_path, "rb") as f:
        raw_data = f.read()
    tensor = torch.frombuffer(raw_data, dtype=torch.float32)
    tensor = tensor.view(shape)  # 根据你的 shape reshape
    return tensor


class MoEGate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.scoring_func = config.scoring_func
        self.topk_method = config.topk_method
        self.n_group = config.n_group
        self.topk_group = config.topk_group

        # topk selection algorithm
        self.norm_topk_prob = config.norm_topk_prob
        self.gating_dim = config.hidden_size
        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim)))
        if self.topk_method == "noaux_tc":
            self.e_score_correction_bias = nn.Parameter(torch.empty((self.n_routed_experts)))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        import torch.nn.init as init

        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, hidden_states):
        bsz, seq_len, h = hidden_states.shape
        ### compute gating score
        hidden_states = hidden_states.view(-1, h)

        h_to_check = load_fp32_tensor(
            "/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_input", (seq_len, h)
        )
        diff = (h_to_check - hidden_states).abs().max()
        # print("hidden_states diff:", diff)
        # assert diff<0.02

        bias_to_check = load_fp32_tensor(
            "/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/bias", (n_routed_experts)
        )
        diff = (bias - bias_to_check).abs().max()
        # print('bias diff:',diff)
        # assert diff < 0.02

        logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32), None)

        logits_to_check = load_fp32_tensor(
            "/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits",
            (seq_len, n_routed_experts),
        )
        diff = (logits_to_check - logits).abs().max()
        # print("logits diff:", diff)
        # assert diff < 0.02

        if self.scoring_func == "sigmoid":
            scores = logits.sigmoid()
        else:
            raise NotImplementedError(f"insupportable scoring function for MoE gating: {self.scoring_func}")

        ### select top-k experts
        if self.topk_method == "noaux_tc":
            # assert not self.training
            scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)

            scores_to_check = load_fp32_tensor(
                "/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/scores_to_choice",
                (seq_len, n_routed_experts),
            )
            diff = (scores_for_choice - scores_to_check).abs().max()
            print(f"score for choice diff = {diff}")

            group_scores = (
                scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
            )  # [n, n_group]

            group_scores_to_check = load_fp32_tensor(
                "/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/group_scores",
                (seq_len, n_group),
            )
            diff = (group_scores - group_scores_to_check).abs().max()
            print(f"group scores diff = {diff}")

            group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]  # [n, top_k_group]
            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
            score_mask = (
                group_mask.unsqueeze(-1)
                .expand(bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group)
                .reshape(bsz * seq_len, -1)
            )  # [n, e]
            tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf"))  # [n, e]
            tmp_scores_to_check = load_fp32_tensor(
                "/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits_toped",
                (seq_len, n_routed_experts),
            )
            is_close = torch.isclose(tmp_scores, tmp_scores_to_check, rtol=1e-2, atol=1e-2, equal_nan=True)
            print(f"tmp_score ok {is_close.all()}")

            _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
            topk_weight = scores.gather(1, topk_idx)
        else:
            raise NotImplementedError(f"insupportable TopK function for MoE gating: {self.topk_method}")

        ### norm gate to sum 1
        if self.top_k > 1 and self.norm_topk_prob:
            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
            topk_weight = topk_weight / denominator
        topk_weight = topk_weight * self.routed_scaling_factor  # must multiply the scaling factor

        return topk_idx, topk_weight


def torch_gate(hidden_states):
    hidden_states.unsqueeze_(0)
    gate = MoEGate(config)
    gate.weight.data = weights
    gate.e_score_correction_bias.data = bias
    y = gate(hidden_states)
    # print(y)
    return y


def cpuinfer_gate(hidden_states):
    config = kt_kernel_ext.gate.GateConfig(
        hidden_size,
        num_experts_per_token,
        n_routed_experts,
        n_group,
        topk_group,
    )

    CPUInfer = kt_kernel_ext.CPUInfer(64)
    config.routed_scaling_factor = routed_scaling_factor

    config.pool = CPUInfer.backend_
    config.weight = weights.data_ptr()
    config.weight_type = ggml_type.FP32
    config.e_score_correction_bias = bias.data_ptr()
    config.e_score_correction_bias_type = ggml_type.FP32

    gate = kt_kernel_ext.gate.MoEGate(config)

    expert_ids = torch.zeros((seqlen, num_experts_per_token), dtype=torch.int64).to("cpu").contiguous()
    expert_weights = torch.zeros((seqlen, num_experts_per_token), dtype=torch.float32).to("cpu").contiguous()

    gate.forward(seqlen, hidden_states.data_ptr(), expert_ids.data_ptr(), expert_weights.data_ptr())

    # print(expert_ids,expert_weights)
    return expert_ids, expert_weights


input = torch.randn(seqlen, hidden_size, dtype=torch.float32).to("cpu").contiguous()
# print(input)
ids, we = cpuinfer_gate(input)
idx = torch.argsort(ids, dim=-1, descending=True)
ids = torch.gather(ids, dim=-1, index=idx)
we = torch.gather(we, dim=-1, index=idx)


std_ids, std_we = torch_gate(input)
idx = torch.argsort(std_ids, dim=-1, descending=True)
std_we = torch.gather(std_we, dim=-1, index=idx)
std_ids = torch.gather(std_ids, dim=-1, index=idx)


# print("ids diff:", torch.abs(std_ids - ids).max())
# print("weights diff:", torch.abs(std_we - we).max())
assert torch.abs(std_ids - ids).max() == 0, "Expert IDs do not match!"
assert torch.abs(std_we - we).max() < 1e-2, "Expert Weights do not match!"
print("Expert IDs and Weights match successfully!")


================================================
FILE: kt-kernel/examples/test_k2_moe_amx.py
================================================
import math
import os
import sys
from typing import Dict, Literal

sys.path.insert(0, os.path.dirname(__file__) + "/../build")

import torch
from kt_kernel import kt_kernel_ext

torch.manual_seed(42)

hidden_size = 7168
intermediate_size = 2048
max_len = 25600

expert_num = 16
num_experts_per_tok = 8

qlen = 1
layer_num = 1
CPUInfer = kt_kernel_ext.CPUInfer(40)
validation_iter = 10
k_group_size = 32
debug_print_count = 16

physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()


def _pattern_uniform(groups: int) -> torch.Tensor:
    return torch.full((groups,), 0.02, dtype=torch.float32)


def _pattern_alternating(groups: int) -> torch.Tensor:
    vals = torch.full((groups,), 0.015, dtype=torch.float32)
    vals[1::2] = 0.03
    return vals


def _pattern_ramp(groups: int) -> torch.Tensor:
    return torch.linspace(0.005, 0.04, steps=groups, dtype=torch.float32)


WEIGHT_PATTERNS = {
    "uniform_scale": ("All k-groups share the same abs max / scale", _pattern_uniform),
    "alternating_scale": ("Alternate small / large abs max per k-group", _pattern_alternating),
    "ramp_scale": ("Linearly increasing abs max per k-group", _pattern_ramp),
    "random": ("Random bf16 weights (baseline)", None),
}


def act_fn(x):
    return x / (1.0 + torch.exp(-x))


def mlp_torch(input, gate_proj, up_proj, down_proj):
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    print(f"gate_buf: {gate_buf}")
    print(f"up_buf: {up_buf}")
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    print(f"intermediate: {intermediate}")
    print(f"mlp output: {ret}")
    return ret


def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0

    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )
    return t_output


def pack_to_int32(value: torch.Tensor, num_bits: int, packed_dim: Literal[0, 1] = 1) -> torch.Tensor:
    if value.dtype is not torch.int8:
        raise ValueError("Tensor must be torch.int8 before packing")
    if not (1 <= num_bits <= 8):
        raise ValueError(f"num_bits must be in [1, 8], got {num_bits}")

    offset = 1 << (num_bits - 1)
    value = (value + offset).to(torch.uint8)
    device = value.device

    pack_factor = 32 // num_bits

    if packed_dim == 0:
        value = value.transpose(0, 1)

    rows, cols = value.shape
    padded_cols = math.ceil(cols / pack_factor) * pack_factor
    pad_len = padded_cols - cols

    if pad_len > 0:
        value = torch.nn.functional.pad(value, (0, pad_len))

    num_groups = padded_cols // pack_factor

    # Use int32 here
    reshaped = value.view(rows, num_groups, pack_factor).to(torch.int32)
    bit_shifts = torch.arange(pack_factor, device=device, dtype=torch.int32) * num_bits
    packed = (reshaped << bit_shifts).sum(dim=2, dtype=torch.int32)

    if packed_dim == 0:
        packed = packed.transpose(0, 1)

    return packed


def pack_tensor_per_row(q: torch.Tensor, num_bits: int) -> torch.Tensor:
    e, rows, cols = q.shape
    flat = q.view(e * rows, cols)
    packed = pack_to_int32(flat, num_bits)
    return packed.view(e, rows, -1).contiguous()


def quantize_k2_tensor(weights: torch.Tensor, group_size: int):
    """
    Symmetric max-abs/7 quantization per k-group following compressed_tensors packing.
    Args:
        weights: [expert_num, rows (N), cols (K)]
    Returns:
        packed_q: int32 tensor storing 8 int4s per element with shape [expert_num, rows * (cols // 8)]
        scales: bfloat16 tensor with shape [expert_num, rows * (cols // group_size)]
    """
    weights_f32 = weights.to(torch.float32)
    e, rows, cols = weights_f32.shape
    if cols % group_size != 0 or cols % 2 != 0:
        raise ValueError(f"cols ({cols}) must be divisible by group_size ({group_size}) and 2")

    reshaped = weights_f32.view(e, rows, cols // group_size, group_size)
    max_abs = reshaped.abs().amax(dim=-1, keepdim=True)
    max_abs = torch.clamp(max_abs, min=1e-8)
    scales = (max_abs / 7.0).squeeze(-1)
    q = torch.round(reshaped / scales.unsqueeze(-1)).clamp(-8, 7).to(torch.int8)
    q = q.view(e, rows, cols)
    packed = pack_tensor_per_row(q, num_bits=4).view(e, rows, cols // 8).contiguous()
    scales = scales.to(torch.bfloat16).contiguous().view(e, rows, cols // group_size).contiguous()

    print(f"Quantized weights: {packed.shape}, scales: {scales.shape}")
    print(f"Quantized tensors: \n{packed},\n {scales}")
    return packed, scales


def build_structured_tensor(shape: torch.Size, pattern: str) -> torch.Tensor:
    if pattern == "random":
        torch.manual_seed(42)
        return (torch.randn(shape, dtype=torch.bfloat16, device="cpu") / 100.0).contiguous()

    e, rows, cols = shape
    groups = cols // k_group_size
    group_builder = WEIGHT_PATTERNS[pattern][1]
    group_vals = group_builder(groups).to(torch.float32)
    block = group_vals.view(1, 1, groups, 1).expand(e, rows, groups, k_group_size).clone()
    row_signs = torch.where(
        (torch.arange(rows) % 2 == 0),
        torch.ones(rows, dtype=torch.float32),
        -torch.ones(rows, dtype=torch.float32),
    ).view(1, rows, 1, 1)
    col_offsets = torch.linspace(-0.0005, 0.0005, steps=k_group_size, dtype=torch.float32).view(1, 1, 1, k_group_size)
    block = block * row_signs + col_offsets
    return block.reshape(shape).to(torch.bfloat16).contiguous()


def prepare_k2_quantized_weights(pattern: str) -> Dict[str, torch.Tensor]:
    if pattern not in WEIGHT_PATTERNS:
        raise ValueError(f"Unknown weight pattern: {pattern}")

    gate_proj = build_structured_tensor((expert_num, intermediate_size, hidden_size), pattern)
    up_proj = build_structured_tensor((expert_num, intermediate_size, hidden_size), pattern)
    down_proj = build_structured_tensor((expert_num, hidden_size, intermediate_size), pattern)

    gate_q, gate_scales = quantize_k2_tensor(gate_proj, k_group_size)
    up_q, up_scales = quantize_k2_tensor(up_proj, k_group_size)
    down_q, down_scales = quantize_k2_tensor(down_proj, k_group_size)

    return {
        "gate_qweight": gate_q.contiguous(),
        "up_qweight": up_q.contiguous(),
        "down_qweight": down_q.contiguous(),
        "gate_scales": gate_scales.contiguous(),
        "up_scales": up_scales.contiguous(),
        "down_scales": down_scales.contiguous(),
        "original_fp16": {
            "gate_proj": gate_proj.to(torch.float16).contiguous(),
            "up_proj": up_proj.to(torch.float16).contiguous(),
            "down_proj": down_proj.to(torch.float16).contiguous(),
        },
    }


def build_moes_from_quantized_data(quant_data: Dict[str, torch.Tensor]):
    moes = []
    with torch.inference_mode(mode=True):
        for _ in range(layer_num):
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.quant_config.bits = 4
            config.quant_config.group_size = k_group_size
            config.quant_config.zero_point = False

            config.gate_proj = quant_data["gate_qweight"].data_ptr()
            config.up_proj = quant_data["up_qweight"].data_ptr()
            config.down_proj = quant_data["down_qweight"].data_ptr()

            config.gate_scale = quant_data["gate_scales"].data_ptr()
            config.up_scale = quant_data["up_scales"].data_ptr()
            config.down_scale = quant_data["down_scales"].data_ptr()
            config.pool = CPUInfer.backend_

            moe = kt_kernel_ext.moe.AMXInt4_KGroup_MOE(config)
            CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
            CPUInfer.sync()
            # CPUInfer.submit(moe.warm_up_task())
            # CPUInfer.sync()
            moes.append(moe)
    return moes


def run_case(pattern: str) -> Dict[str, float]:
    print("\n" + "=" * 70)
    desc = WEIGHT_PATTERNS[pattern][0]
    print(f"Running case: {pattern} -> {desc}")
    print("=" * 70)

    quant_data = prepare_k2_quantized_weights(pattern)
    moes = build_moes_from_quantized_data(quant_data)

    original_weights = quant_data["original_fp16"]
    gate_fp16 = original_weights["gate_proj"]
    up_fp16 = original_weights["up_proj"]
    down_fp16 = original_weights["down_proj"]

    diffs = []
    with torch.inference_mode(mode=True):
        for i in range(validation_iter):
            torch.manual_seed(100 + i)
            bsz_tensor = torch.tensor([qlen], device="cpu")
            expert_ids = torch.stack(
                [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
            ).contiguous()
            weights = torch.randn((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
            input_tensor = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous() / 100
            output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()

            moe = moes[i % layer_num]
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids.data_ptr(),
                    weights.data_ptr(),
                    input_tensor.data_ptr(),
                    output.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

            input_tensor_fp16 = input_tensor.to(torch.float16)
            t_output = moe_torch(input_tensor_fp16, expert_ids, weights, gate_fp16, up_fp16, down_fp16).to(
                torch.bfloat16
            )

            t_output = t_output.flatten()
            output = output.flatten()

            diff = torch.mean(torch.abs(output - t_output)) / (torch.mean(torch.abs(t_output)) + 1e-12)
            diffs.append(diff.item())
            print(f"[{pattern}] Iteration {i}: relative L1 diff = {diff:.4f}")
            print(f"           output   {output}")
            print(f"           t_output {t_output}")

    mean_diff = float(sum(diffs) / len(diffs))
    max_diff = float(max(diffs))
    min_diff = float(min(diffs))
    return {"case": pattern, "description": desc, "mean": mean_diff, "max": max_diff, "min": min_diff}


def run_k2_moe_test():
    summary_rows = []
    for case_name in WEIGHT_PATTERNS.keys():
        results = run_case(case_name)
        summary_rows.append(results)
        # break

    print("\n=== Case vs. Relative Error Summary ===")
    print(f"{'Case':<20} {'Mean':>10} {'Max':>10} {'Min':>10}")
    for row in summary_rows:
        print(f"{row['case']:<20} {row['mean']*100:9.2f}% {row['max']*100:9.2f}% {row['min']*100:9.2f}%")


if __name__ == "__main__":
    run_k2_moe_test()


================================================
FILE: kt-kernel/examples/test_k2_write_buffer.py
================================================
import os
import sys
import time

import torch
import numpy as np


from kt_kernel import kt_kernel_ext
from kt_kernel_ext import CPUInfer


def make_cpu_infer(thread_num=80):
    return CPUInfer(thread_num)


def build_config(cpuinfer, expert_num, num_experts_per_tok, hidden_size, intermediate_size, group_size):
    cfg = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
    cfg.max_len = 1
    cfg.quant_config.bits = 4
    cfg.quant_config.group_size = group_size
    cfg.quant_config.zero_point = False
    cfg.pool = cpuinfer.backend_
    return cfg


def allocate_weights(expert_num, hidden_size, intermediate_size, group_size):
    # packed int4 weights: 2 values per byte
    per_mat_weight_bytes = (hidden_size * intermediate_size) // 2
    per_mat_scale_elems = (hidden_size * intermediate_size) // group_size

    gate_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
    up_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
    down_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)

    gate_scale = torch.randn(expert_num * per_mat_scale_elems, dtype=torch.bfloat16)
    up_scale = torch.randn(expert_num * per_mat_scale_elems, dtype=torch.bfloat16)
    down_scale = torch.randn(expert_num * per_mat_scale_elems, dtype=torch.bfloat16)

    return (
        gate_q,
        up_q,
        down_q,
        gate_scale,
        up_scale,
        down_scale,
        per_mat_weight_bytes,
        per_mat_scale_elems,
    )


def test_with_tp(gpu_tp_count):
    """Test write_weight_scale_to_buffer with a specific gpu_tp_count"""
    torch.manual_seed(123)

    expert_num = 8  # Reduced for faster testing
    gpu_experts = expert_num  # Number of experts on GPU

    num_experts_per_tok = 8
    hidden_size = 7168
    intermediate_size = 2048
    group_size = 32

    cpuinfer = make_cpu_infer()
    cfg = build_config(cpuinfer, expert_num, num_experts_per_tok, hidden_size, intermediate_size, group_size)

    (
        gate_q,
        up_q,
        down_q,
        gate_scale,
        up_scale,
        down_scale,
        per_mat_weight_bytes,
        per_mat_scale_elems,
    ) = allocate_weights(expert_num, hidden_size, intermediate_size, group_size)

    cfg.gate_proj = gate_q.data_ptr()
    cfg.up_proj = up_q.data_ptr()
    cfg.down_proj = down_q.data_ptr()
    cfg.gate_scale = gate_scale.data_ptr()
    cfg.up_scale = up_scale.data_ptr()
    cfg.down_scale = down_scale.data_ptr()

    moe = kt_kernel_ext.moe.AMXInt4_KGroup_MOE(cfg)

    physical_to_logical_map = torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
    cpuinfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
    cpuinfer.sync()

    # TP configuration
    # Calculate sizes per TP part (per expert)
    weight_bytes_per_expert_per_tp = per_mat_weight_bytes // gpu_tp_count
    scale_elems_per_expert_per_tp = per_mat_scale_elems // gpu_tp_count

    # Total sizes for all gpu_experts
    total_weight_bytes_per_tp = gpu_experts * weight_bytes_per_expert_per_tp
    total_scale_elems_per_tp = gpu_experts * scale_elems_per_expert_per_tp

    # Create buffer lists for w13 (gate+up) and w2 (down)
    # These hold all experts' data for each GPU TP
    w13_weight_bufs = []
    w13_scale_bufs = []
    w2_weight_bufs = []
    w2_scale_bufs = []

    for tp_idx in range(gpu_tp_count):
        # w13 combines gate and up, so needs 2x the size per expert
        w13_weight_bufs.append(torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8))
        w13_scale_bufs.append(torch.empty(2 * total_scale_elems_per_tp, dtype=torch.bfloat16))
        w2_weight_bufs.append(torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8))
        w2_scale_bufs.append(torch.empty(total_scale_elems_per_tp, dtype=torch.bfloat16))

    print(f"Total experts: {expert_num}, GPU experts: {gpu_experts}")
    print(f"GPU TP count: {gpu_tp_count}")
    print(f"Original per matrix weight bytes: {per_mat_weight_bytes}")
    print(f"Original per matrix scale elements: {per_mat_scale_elems}")
    print(f"Weight bytes per expert per TP: {weight_bytes_per_expert_per_tp}")
    print(f"Scale elements per expert per TP: {scale_elems_per_expert_per_tp}")
    print(f"Total weight bytes per TP (w13): {2 * total_weight_bytes_per_tp}")
    print(f"Total weight bytes per TP (w2): {total_weight_bytes_per_tp}")

    # Helper function to get pointers with expert offset
    # K2 write_weights_to_buffer writes one expert at a time, so we need to pass
    # pointers that already point to the correct location for each expert
    def get_expert_ptrs(expert_id):
        w13_weight_ptrs = []
        w13_scale_ptrs = []
        w2_weight_ptrs = []
        w2_scale_ptrs = []

        for tp_idx in range(gpu_tp_count):
            # Calculate byte offsets for this expert
            # w13: gate_weight + up_weight interleaved by expert
            # Layout: [expert0_gate, expert0_up, expert1_gate, expert1_up, ...]
            w13_weight_expert_offset = expert_id * 2 * weight_bytes_per_expert_per_tp
            w13_scale_expert_offset = expert_id * 2 * scale_elems_per_expert_per_tp
            w2_weight_expert_offset = expert_id * weight_bytes_per_expert_per_tp
            w2_scale_expert_offset = expert_id * scale_elems_per_expert_per_tp

            w13_weight_ptrs.append(w13_weight_bufs[tp_idx].data_ptr() + w13_weight_expert_offset)
            w13_scale_ptrs.append(w13_scale_bufs[tp_idx].data_ptr() + w13_scale_expert_offset * 2)  # bf16 = 2 bytes
            w2_weight_ptrs.append(w2_weight_bufs[tp_idx].data_ptr() + w2_weight_expert_offset)
            w2_scale_ptrs.append(w2_scale_bufs[tp_idx].data_ptr() + w2_scale_expert_offset * 2)  # bf16 = 2 bytes

        return w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs

    # Warm up
    for i in range(2):
        for expert_id in range(gpu_experts):
            w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs = get_expert_ptrs(expert_id)
            cpuinfer.submit(
                moe.write_weight_scale_to_buffer_task(
                    gpu_tp_count=gpu_tp_count,
                    expert_id=expert_id,
                    w13_weight_ptrs=w13_weight_ptrs,
                    w13_scale_ptrs=w13_scale_ptrs,
                    w2_weight_ptrs=w2_weight_ptrs,
                    w2_scale_ptrs=w2_scale_ptrs,
                )
            )
            cpuinfer.sync()

    # Timing
    begin_time = time.perf_counter_ns()
    for expert_id in range(gpu_experts):
        w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs = get_expert_ptrs(expert_id)
        cpuinfer.submit(
            moe.write_weight_scale_to_buffer_task(
                gpu_tp_count=gpu_tp_count,
                expert_id=expert_id,
                w13_weight_ptrs=w13_weight_ptrs,
                w13_scale_ptrs=w13_scale_ptrs,
                w2_weight_ptrs=w2_weight_ptrs,
                w2_scale_ptrs=w2_scale_ptrs,
            )
        )
        cpuinfer.sync()
    end_time = time.perf_counter_ns()
    elapsed_ms = (end_time - begin_time) / 1000000
    total_weights = hidden_size * intermediate_size * gpu_experts * 3
    total_bytes = total_weights // group_size * 2 + total_weights // 2  # scale (bf16) + weight (int4)
    print(f"write_weight_scale_to_buffer time: {elapsed_ms:.2f} ms")
    print(f"Throughput: {total_bytes / (elapsed_ms * 1e6):.2f} GB/s")

    def split_expert_tensor(tensor, chunk):
        """Split tensor by experts"""
        return [tensor[i * chunk : (i + 1) * chunk] for i in range(expert_num)]

    # Split by experts first
    gate_q_experts = split_expert_tensor(gate_q, per_mat_weight_bytes)
    up_q_experts = split_expert_tensor(up_q, per_mat_weight_bytes)
    down_q_experts = split_expert_tensor(down_q, per_mat_weight_bytes)

    gate_scale_experts = split_expert_tensor(gate_scale, per_mat_scale_elems)
    up_scale_experts = split_expert_tensor(up_scale, per_mat_scale_elems)
    down_scale_experts = split_expert_tensor(down_scale, per_mat_scale_elems)

    # Verify buffers for each TP part
    for tp_idx in range(gpu_tp_count):
        expected_w13_weights = []
        expected_w13_scales = []
        expected_w2_weights = []
        expected_w2_scales = []

        weight13_per_tp = per_mat_weight_bytes // gpu_tp_count
        scale13_per_tp = per_mat_scale_elems // gpu_tp_count

        # Process each GPU expert
        for expert_id in range(gpu_experts):
            # For w13 (gate and up), the slicing is straightforward
            start_weight = tp_idx * weight13_per_tp
            end_weight = (tp_idx + 1) * weight13_per_tp
            start_scale = tp_idx * scale13_per_tp
            end_scale = (tp_idx + 1) * scale13_per_tp

            # Gate
            gate_weight_tp = gate_q_experts[expert_id][start_weight:end_weight]
            gate_scale_tp = gate_scale_experts[expert_id][start_scale:end_scale]

            # Up
            up_weight_tp = up_q_experts[expert_id][start_weight:end_weight]
            up_scale_tp = up_scale_experts[expert_id][start_scale:end_scale]

            # Down matrix needs special handling because it's sliced column-wise
            # We need to reconstruct it from column slices
            down_weight_tp_parts = []
            down_scale_tp_parts = []

            # Iterate through each column to extract the corresponding parts
            for col_idx in range(hidden_size):
                col_weight_start = col_idx * (intermediate_size // 2)
                col_scale_start = col_idx * (intermediate_size // group_size)

                # Direct mapping: each CPU TP corresponds to a GPU TP
                tp_slice_weight_size = (intermediate_size // gpu_tp_count) // 2
                tp_slice_scale_size = (intermediate_size // gpu_tp_count) // group_size

                tp_weight_offset = col_weight_start + tp_idx * tp_slice_weight_size
                tp_scale_offset = col_scale_start + tp_idx * tp_slice_scale_size

                down_weight_tp_parts.append(
                    down_q_experts[expert_id][tp_weight_offset : tp_weight_offset + tp_slice_weight_size]
                )
                down_scale_tp_parts.append(
                    down_scale_experts[expert_id][tp_scale_offset : tp_scale_offset + tp_slice_scale_size]
                )

            # Concatenate all column slices for this TP
            down_weight_tp = torch.cat(down_weight_tp_parts)
            down_scale_tp = torch.cat(down_scale_tp_parts)

            # Append to expected lists - interleaved by expert: [gate0, up0, gate1, up1, ...]
            expected_w13_weights.append(gate_weight_tp)
            expected_w13_weights.append(up_weight_tp)
            expected_w13_scales.append(gate_scale_tp)
            expected_w13_scales.append(up_scale_tp)
            expected_w2_weights.append(down_weight_tp)
            expected_w2_scales.append(down_scale_tp)

        # Concatenate all experts for this TP part
        expected_w13_weight = torch.cat(expected_w13_weights)
        expected_w13_scale = torch.cat(expected_w13_scales)
        expected_w2_weight = torch.cat(expected_w2_weights)
        expected_w2_scale = torch.cat(expected_w2_scales)

        print(f"=== Checking TP part {tp_idx} ===")
        print(f"  w13 weight shape: actual={w13_weight_bufs[tp_idx].shape}, expected={expected_w13_weight.shape}")
        print(f"  w13 scale shape: actual={w13_scale_bufs[tp_idx].shape}, expected={expected_w13_scale.shape}")
        print(f"  w2 weight shape: actual={w2_weight_bufs[tp_idx].shape}, expected={expected_w2_weight.shape}")
        print(f"  w2 scale shape: actual={w2_scale_bufs[tp_idx].shape}, expected={expected_w2_scale.shape}")

        # Assert all checks pass
        if not torch.equal(w13_weight_bufs[tp_idx], expected_w13_weight):
            diff_mask = w13_weight_bufs[tp_idx] != expected_w13_weight
            first_diff_idx = diff_mask.nonzero()[0].item() if diff_mask.any() else -1
            print(f"  w13 weight mismatch at index {first_diff_idx}")
            print(f"    actual: {w13_weight_bufs[tp_idx][first_diff_idx:first_diff_idx+10]}")
            print(f"    expected: {expected_w13_weight[first_diff_idx:first_diff_idx+10]}")
            raise AssertionError(f"w13 weight bytes mismatch for TP {tp_idx}")

        if not torch.allclose(w13_scale_bufs[tp_idx], expected_w13_scale):
            diff = torch.abs(w13_scale_bufs[tp_idx].float() - expected_w13_scale.float())
            max_diff_idx = diff.argmax().item()
            print(f"  w13 scale mismatch, max diff at index {max_diff_idx}")
            print(f"    actual: {w13_scale_bufs[tp_idx][max_diff_idx]}")
            print(f"    expected: {expected_w13_scale[max_diff_idx]}")
            raise AssertionError(f"w13 scale values mismatch for TP {tp_idx}")

        if not torch.equal(w2_weight_bufs[tp_idx], expected_w2_weight):
            diff_mask = w2_weight_bufs[tp_idx] != expected_w2_weight
            first_diff_idx = diff_mask.nonzero()[0].item() if diff_mask.any() else -1
            print(f"  w2 weight mismatch at index {first_diff_idx}")
            print(f"    actual: {w2_weight_bufs[tp_idx][first_diff_idx:first_diff_idx+10]}")
            print(f"    expected: {expected_w2_weight[first_diff_idx:first_diff_idx+10]}")
            raise AssertionError(f"w2 weight bytes mismatch for TP {tp_idx}")

        if not torch.allclose(w2_scale_bufs[tp_idx], expected_w2_scale):
            diff = torch.abs(w2_scale_bufs[tp_idx].float() - expected_w2_scale.float())
            max_diff_idx = diff.argmax().item()
            print(f"  w2 scale mismatch, max diff at index {max_diff_idx}")
            print(f"    actual: {w2_scale_bufs[tp_idx][max_diff_idx]}")
            print(f"    expected: {expected_w2_scale[max_diff_idx]}")
            raise AssertionError(f"w2 scale values mismatch for TP {tp_idx}")

    print(
        f"\n✓ write_weight_scale_to_buffer passed: extracted {gpu_experts} GPU experts across {gpu_tp_count} TP parts"
    )
    return True


def main():
    """Run tests for all gpu_tp_count values: 1, 2, 4, 8"""
    tp_values = [1, 2, 4, 8]
    all_passed = True
    results = {}

    print("=" * 60)
    print("Testing K2 write_weight_scale_to_buffer for TP = 1, 2, 4, 8")
    print("=" * 60)

    for tp in tp_values:
        print(f"\n{'='*60}")
        print(f"Testing with gpu_tp_count = {tp}")
        print(f"{'='*60}")
        try:
            test_with_tp(tp)
            results[tp] = "PASSED"
            print(f"✓ TP={tp} PASSED")
        except Exception as e:
            results[tp] = f"FAILED: {e}"
            all_passed = False
            print(f"✗ TP={tp} FAILED: {e}")

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    for tp, result in results.items():
        status = "✓" if "PASSED" in result else "✗"
        print(f"  {status} TP={tp}: {result}")

    if all_passed:
        print("\n✓ ALL TESTS PASSED")
    else:
        print("\n✗ SOME TESTS FAILED")
        sys.exit(1)


if __name__ == "__main__":
    main()


================================================
FILE: kt-kernel/examples/test_linear.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2024-08-06 10:36:59
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch

input_size = 16384
output_size = 5120
stride = 32
group_max_len = 1024
proj_type = 1  # ggml_type::GGML_TYPE_F16
hidden_type = 1  # ggml_type::GGML_TYPE_F16
qlen = 30
layer_num = 10
CPUInfer = kt_kernel_ext.CPUInfer(48)
validation_iter = 100

with torch.inference_mode(mode=True):
    linears = []
    projs = []
    for _ in range(layer_num):
        proj = torch.randn((output_size, input_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
        config = kt_kernel_ext.linear.LinearConfig(
            input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type
        )
        linear = kt_kernel_ext.linear.Linear(config)
        projs.append(proj)
        linears.append(linear)

    # validation
    for i in range(validation_iter):
        linear = linears[i % layer_num]
        input = torch.randn((qlen, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((qlen, output_size), dtype=torch.float16).contiguous()
        input = input / 100

        CPUInfer.submit(linear.forward(qlen, input.data_ptr(), output.data_ptr()))
        CPUInfer.sync()
        # print('cpuinfer output', output)

        proj = projs[i % layer_num]
        t_output = torch.mm(input, proj.t())
        # print('torch output', t_output)

        diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
        print("diff = ", diff)
        assert diff < 0.001


================================================
FILE: kt-kernel/examples/test_mla.py
================================================
import logging
import os, sys
import time
from typing import Optional

os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
from torch.nn import init
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding

logger = logging.getLogger("reader")

from gguf.gguf_reader import GGUFReader


def read_gguf_file(gguf_file_path):
    """
    Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.

    Parameters:
    - gguf_file_path: Path to the GGUF file.
    """

    reader = GGUFReader(gguf_file_path)

    # List all key-value pairs in a columnized format
    # print("Key-Value Pairs:") # noqa: NP100
    # max_key_length = max(len(key) for key in reader.fields.keys())
    for key, field in reader.fields.items():
        value = field.parts[field.data[0]]
        # print(f"{key:{max_key_length}} : {value}") # noqa: NP100
    # print("----") # noqa: NP100

    # List all tensors
    # print("Tensors:") # noqa: NP100
    # tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
    # print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100
    # print("-" * 80) # noqa: NP100
    re = []
    for tensor in reader.tensors:
        shape_str = "x".join(map(str, tensor.shape))
        size_str = str(tensor.n_elements)
        quantization_str = tensor.tensor_type.name
        # print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100
        re.append(tensor)
    return re


def get_torch_tensor_from_gguf(gguf_weights, name):
    return torch.from_numpy(gguf_weights[name].data).contiguous()


def get_torch_tensor_and_type_from_gguf(gguf_weights, name):
    return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name


def type_to_ggml_type(type):
    if type == "F32":
        return ggml_type.FP32
    elif type == "F16":
        return ggml_type.FP16
    elif type == "BF16":
        return ggml_type.BF16
    else:
        raise ValueError(f"Unsupported data type: {type}")


use_real_weights = True
gguf_path = "/home/bd/models/DeepSeek-R1-BF16"

seed = 42  # 你可以选择任何整数作为种子
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

qlen = 3212
kvlen = 0


page_table = range(20)
bsz_tensors = torch.tensor([1])


page_size = 256
pages_count = 200
tp_count = 4


hidden_size = 7168
q_lora_rank = 1536
kv_lora_rank = 512
num_heads = 128
nope_size = 128
rope_size = 64

rope_theta = 10000
max_qlen = 4096
max_kvlen = 4096

max_position_embeddings = 163840


rope_scaling = {
    "beta_fast": 32,
    "beta_slow": 1,
    "factor": 40,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
    "original_max_position_embeddings": 4096,
    "type": "yarn",
}


CPUInfer = kt_kernel_ext.CPUInfer(30)
validation_iter = 100


# data_type = torch.float32
weight_type = torch.bfloat16
# weight_type = torch.float16


input_type = {
    torch.float32: torch.float32,
    torch.float16: torch.float16,
    torch.bfloat16: torch.float32,
}[weight_type]

q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=weight_type)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=weight_type)
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=weight_type)
kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=weight_type)
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=weight_type)
q_a_norm = torch.ones(hidden_size, dtype=torch.float32)
kv_a_norm = torch.ones(hidden_size, dtype=torch.float32)


def read_gguf_directory(directory):
    """
    Reads all GGUF files in a directory and prints their contents.

    Parameters:
    - directory: Path to the directory containing GGUF files.
    """
    if not os.path.isdir(directory):
        logger.error(f"Directory {directory} does not exist.")
        return

    # List all GGUF files in the directory
    files = [f for f in os.listdir(directory) if f.endswith(".gguf")]
    if not files:
        logger.info(f"No GGUF files found in {directory}.")
        return

    re = []
    for file in files:
        file_path = os.path.join(directory, file)
        # print(f"Reading {file_path}:") # noqa: NP100
        # print("\n") # noqa: NP100
        re.extend(read_gguf_file(file_path))
    re = {r.name: r for r in re}
    return re


if use_real_weights := True:
    gguf_weights = read_gguf_directory(gguf_path)
    layer_idx = 0
    q_a_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a.weight")
    q_a_proj.weight = nn.Parameter(q_a_proj_weight.view(torch.bfloat16), requires_grad=False)
    q_a_type = type

    q_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a_norm.weight")
    q_a_norm = q_a_norm_weight.view(torch.float32)
    # config.q_a_norm = q_a_norm_weight.data_ptr()
    # config.q_a_norm_type = type_to_ggml_type(type)

    q_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_b.weight")
    q_b_proj.weight = nn.Parameter(q_b_proj_weight.view(torch.bfloat16), requires_grad=False)

    kv_a_proj_with_mqa_weight, type = get_torch_tensor_and_type_from_gguf(
        gguf_weights, f"blk.{layer_idx}.attn_kv_a_mqa.weight"
    )
    kv_a_proj_with_mqa.weight = nn.Parameter(kv_a_proj_with_mqa_weight.view(torch.bfloat16), requires_grad=False)

    kv_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_a_norm.weight")
    kv_a_norm = kv_a_norm_weight.view(torch.float32)
    # config.kv_a_norm = kv_a_norm_weight.data_ptr()
    # config.kv_a_norm_type = type_to_ggml_type(type)

    kv_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_b.weight")
    kv_b_proj.weight = nn.Parameter(kv_b_proj_weight.view(torch.bfloat16), requires_grad=False)

    o_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_output.weight")
    o_proj.weight = nn.Parameter(o_proj_weight.view(torch.bfloat16), requires_grad=False)

else:
    init.normal_(q_a_proj.weight, mean=0.0, std=0.02)
    init.normal_(q_b_proj.weight, mean=0.0, std=0.02)
    init.normal_(kv_a_proj_with_mqa.weight, mean=0.0, std=0.02)
    init.normal_(kv_b_proj.weight, mean=0.0, std=0.02)
    init.normal_(o_proj.weight, mean=0.0, std=0.02)

x_reshaped = kv_b_proj.weight.view(num_heads, 2, nope_size, kv_lora_rank)
q_absorb = x_reshaped[:, 0]
out_absorb = x_reshaped[:, 1]


hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()


def test_cpu_mla():
    os.environ["BLAS_NUM_THREADS"] = "1"
    q_a_proj_weight = q_a_proj.weight.to(weight_type).to("cpu").contiguous()
    q_b_proj_weight = q_b_proj.weight.to(weight_type).to("cpu").contiguous()
    kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(weight_type).contiguous()
    kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to("cpu").contiguous()
    o_proj_weight = o_proj.weight.to(weight_type).to("cpu").contiguous()

    config = kt_kernel_ext.mla.MLAConfig(
        hidden_size,
        q_lora_rank,
        kv_lora_rank,
        num_heads,
        nope_size,
        rope_size,
    )
    config.max_qlen = max_qlen
    config.max_kvlen = max_kvlen
    config.max_position_embeddings = max_position_embeddings
    config.rope_scaling_factor = rope_scaling["factor"]
    config.rope_theta = rope_theta
    config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
    config.rope_scaling_beta_slow = rope_scaling["beta_slow"]
    config.rope_scaling_mscale = rope_scaling["mscale"]
    config.rope_scaling_mscale_all_dim = rope_scaling["mscale_all_dim"]
    config.rope_scaling_original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]

    config.q_a_proj = q_a_proj_weight.data_ptr()
    config.q_b_proj = q_b_proj_weight.data_ptr()
    config.kv_a_proj_with_mqa = kv_a_proj_with_mqa_weight.data_ptr()
    config.kv_b_proj = kv_b_proj_weight.data_ptr()
    config.o_proj = o_proj_weight.data_ptr()

    config.q_a_norm = q_a_norm.data_ptr()
    config.q_a_norm_type = ggml_type.FP32
    config.kv_a_norm = kv_a_norm.data_ptr()
    config.kv_a_norm_type = ggml_type.FP32
    config.page_count = pages_count

    if weight_type == torch.float32:
        config.q_a_proj_type = ggml_type.FP32
        config.q_b_proj_type = ggml_type.FP32
        config.kv_a_proj_with_mqa_type = ggml_type.FP32
        config.kv_b_proj_type = ggml_type.FP32
        config.w_o_type = ggml_type.FP32
    elif weight_type == torch.float16:
        config.q_a_proj_type = ggml_type.FP16
        config.q_b_proj_type = ggml_type.FP16
        config.kv_a_proj_with_mqa_type = ggml_type.FP16
        config.kv_b_proj_type = ggml_type.FP16
        config.w_o_type = ggml_type.FP16
    elif weight_type == torch.bfloat16:
        config.q_a_proj_type = ggml_type.BF16
        config.q_b_proj_type = ggml_type.BF16
        config.kv_a_proj_with_mqa_type = ggml_type.BF16
        config.kv_b_proj_type = ggml_type.BF16
        config.w_o_type = ggml_type.BF16
    else:
        raise ValueError(f"Unsupported data type: {weight_type}")

    config.pool = CPUInfer.backend_

    if weight_type == torch.float32:
        mla = kt_kernel_ext.mla.MLA_F32(config)
    elif weight_type == torch.float16:
        mla = kt_kernel_ext.mla.MLA_F16(config)
    elif weight_type == torch.bfloat16:
        # mla = kt_kernel_ext.mla.MLA_F32(config)
        mla = kt_kernel_ext.mla.MLA_QUAN_F32(config)
    else:
        raise ValueError(f"Unsupported data type: {weight_type}")

    mla.load_weights()
    mla.set_local_pages(pages_count)

    output = torch.zeros((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
    mla.forward([qlen], [page_table], [kvlen], hidden_states.data_ptr(), output.data_ptr())
    print("CPU MLA Output: ", output)
    return output


def load_fp16_tensor(file_path, shape):
    # return load_fp32_tensor(file_path, shape)
    return torch.zeros(shape)
    with open(file_path, "rb") as f:
        raw_data = f.read()
    tensor = torch.frombuffer(raw_data, dtype=weight_type)
    tensor = tensor.view(shape)  # 根据你的 shape reshape
    return tensor


def load_fp32_tensor(file_path, shape):
    return torch.zeros(shape)
    with open(file_path, "rb") as f:
        raw_data = f.read()
    tensor = torch.frombuffer(raw_data, dtype=torch.float32)
    tensor = tensor.view(shape)  # 根据你的 shape reshape
    return tensor


def test_torch():
    torch.set_grad_enabled(False)

    softmax_scale = (nope_size + rope_size) ** -0.5
    # 1代表的是压缩的kv的头数
    k_caches = torch.randn(1, pages_count, page_size, 1, kv_lora_rank + rope_size).to(weight_type)
    kv_cache = KDeepSeekV3Cache(page_size=page_size, kv_lora_rank=kv_lora_rank, k_caches=k_caches)

    q_a_layernorm = DeepseekV2RMSNorm(q_lora_rank)
    q_a_layernorm.weight = nn.Parameter(q_a_norm, requires_grad=False)

    x = torch.randn(q_lora_rank, dtype=weight_type) * 100
    print(x)
    print(q_a_layernorm(x))

    kv_a_layernorm = DeepseekV2RMSNorm(kv_lora_rank)
    kv_a_layernorm.weight = nn.Parameter(kv_a_norm, requires_grad=False)

    # 第三步：拆分成两个 tensor
    # q_absorb, out_absorb = x_permuted[:, 0], x_permuted[:, 1]  # 都是 (num_heads, nope_size, kv_lora_rank
    # q_absorb = kv_b_proj[:, ] # torch.randn(num_heads, nope_size, kv_lora_rank, dtype=data_type)
    # out_absorb = kv_b_proj # torch.randn(num_heads, nope_size, kv_lora_rank, dtype=data_type)

    rotary_emb = DeepseekV3YarnRotaryEmbedding(
        rope_size,
        max_position_embeddings=max_position_embeddings,
        scaling_factor=rope_scaling["factor"],
        base=rope_theta,
        beta_fast=rope_scaling["beta_fast"],
        beta_slow=rope_scaling["beta_slow"],
        mscale=rope_scaling["mscale"],
        mscale_all_dim=rope_scaling["mscale_all_dim"],
        original_max_position_embeddings=rope_scaling["original_max_position_embeddings"],
    )
    # 构造一个qlen 长度的输入 hidden_states, 对应的历史 kv_indptr 是[0:bsz]
    # kv_indices 是[0:bsz]，page_idx=[0:bsz], page_offset=[kvlen:qlen+kvlen]
    # last_page_len = [qlen+kvlen,...] layer_idx = 1
    # position_ids = [kvlen:qlen+kvlen]
    q_indptr = torch.tensor([0, qlen]).to(torch.int32)

    kv_indptr = torch.tensor([0, (qlen + kvlen + page_size - 1) // page_size]).to(torch.int32)
    kv_indices = torch.tensor(range(pages_count)).to(torch.int32)

    page_idx = torch.tensor([i // page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
    page_offset = torch.tensor([i % page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)

    last_page_len = torch.tensor([256], device=hidden_states.device)
    position_ids = torch.tensor(range(kvlen, kvlen + qlen)).to(torch.int32)

    # 按照行创建 mask [qlen,kvlen+qlen]
    attention_masks = torch.zeros((max_qlen, max_kvlen), dtype=weight_type)
    for i in range(max_qlen):
        attention_masks[i, i + kvlen + 1 :] = -inf

    def torch_attn(
        hidden_states_i: torch.Tensor,
        kv_cache: KDeepSeekV3Cache,
        position_ids: torch.Tensor,
        page_idx: torch.Tensor,
        page_offset: torch.Tensor,
        attention_masks: Optional[list[torch.Tensor]] = None,
        q_indptr: Optional[torch.Tensor] = None,
        kv_indices: Optional[torch.Tensor] = None,
        kv_indptr: Optional[torch.Tensor] = None,
        bsz_tensors: Optional[torch.Tensor] = None,
        last_page_len: Optional[torch.Tensor] = None,
        layer_idx: Optional[int] = None,
    ):
        global out_absorb
        global q_absorb
        hidden_states = hidden_states_i.to(weight_type)
        # range bsz_tensors
        final_attention_output = torch.tensor([], device=hidden_states.device)
        for i in range(bsz_tensors[0]):
            batch_num_tokens_tensors = q_indptr[i + 1] - q_indptr[i]
            batch_last_page_len = last_page_len[i]
            # kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe
            batch_page_idx = page_idx[q_indptr[i] : q_indptr[i + 1]]
            batch_page_offset = page_offset[q_indptr[i] : q_indptr[i + 1]]
            # kv_page_nums is the number of pages for the current batch
            kv_page_nums = kv_indptr[i + 1] - kv_indptr[i]
            # kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm)
            kv_total_len = kv_page_nums * page_size
            if batch_last_page_len is not None:
                kv_total_len = kv_total_len - (page_size - batch_last_page_len)
            # print(f"kv_total_len's shape {kv_total_len.shape}")
            # kv_index is the index of the kv cache pages for the current batch
            kv_index = kv_indices[kv_indptr[i] : kv_indptr[i + 1]]
            # we can index [kv_index, page_offset_indices] to get the kv cache for the current batch
            # from q_indptr[i] to q_indptr[i+1] is the range of the current batch
            batch_hidden_states = hidden_states[q_indptr[i] : q_indptr[i + 1]]
            batch_position_ids = position_ids[q_indptr[i] : q_indptr[i + 1]]
            qlen, _ = batch_hidden_states.size()
            # print("qlen -> ", qlen)

            hidden_states_to_check = load_fp16_tensor("./debug/query_0_tp_0_input.bin", batch_hidden_states.shape)
            diff = torch.abs(batch_hidden_states - hidden_states_to_check).max()
            print("hidden_states diff -> ", diff)

            q_lora = q_a_proj(batch_hidden_states)
            # q_lora_to_check = load_fp16_tensor('./debug/query_0_tp_0_qlora.bin', q_lora.shape)
            # q_lora_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_qlora_test.bin', q_lora.shape)
            # diff = torch.abs(q_lora - q_lora_to_check).max()
            # diff_test = torch.abs(q_lora - q_lora_to_check_test).max()
            # print("q_lora max diff -> ", diff)
            # print("q_lora max diff test -> ", diff_test)
            # mae =  torch.mean(torch.abs(q_lora - q_lora_to_check))
            # mae_test =  torch.mean(torch.abs(q_lora - q_lora_to_check_test))
            # print("q_lora mae -> ", mae)
            # print("q_lora mae test -> ", mae_test)

            q_lora_norm = q_a_layernorm(q_lora)
            # q_lora_norm_to_check = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm.bin', q_lora_norm.shape)
            # q_lora_norm_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm_test.bin', q_lora_norm.shape)
            # diff = torch.abs(q_lora_norm - q_lora_norm_to_check).max()
            # mae =  torch.mean(torch.abs(q_lora_norm - q_lora_norm_to_check))
            # diff_test = torch.abs(q_lora_norm - q_lora_norm_to_check_test).max()
            # mae_test =  torch.mean(torch.abs(q_lora_norm - q_lora_norm_to_check_test))
            # print("q_lora_norm diff -> ", diff)
            # print("q_lora_norm mae -> ", mae)
            # print("q_lora_norm diff test -> ", diff_test)
            # print("q_lora_norm mae test -> ", mae_test)

            q = q_b_proj(q_lora_norm)
            # for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
            q = q.view(qlen, num_heads, nope_size + rope_size)
            # q_nope is [qlen, num_heads(128), qk_nope_head_dim(128)]
            # q_pe is [qlen, num_heads(128), qk_rope_head_dim(64)]
            q_nope, q_pe = torch.split(q, [nope_size, rope_size], dim=-1)

            # compressed_kv is [qlen, kv_lora_rank(512) + rope(64)]
            compressed_kv = kv_a_proj_with_mqa(batch_hidden_states)
            # compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)]
            compressed_kv, k_pe = torch.split(compressed_kv, [kv_lora_rank, rope_size], dim=-1)
            compressed_kv = compressed_kv.contiguous()

            # compressed_kv_page_0 = compressed_kv[0:page_size, :]
            # compressed_kv_to_check = load_fp16_tensor('./debug/query_0_tp_0_page_0_kv_lora_rank',
            #                                           compressed_kv_page_0.shape)
            # diff = torch.abs(compressed_kv_page_0 - compressed_kv_to_check).max()
            # mae =  torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check))
            # print("compressed_kv diff -> ", diff)
            # print("compressed_kv mae -> ", mae)

            compressed_kv = kv_a_layernorm(compressed_kv)
            # k_pe is [qlen, 1, qk_rope_head_dim(64)]

            # compressed_kv_page_0 = compressed_kv[0:page_size, :]
            # compressed_kv_to_check = load_fp16_tensor('./debug/query_0_tp_0_page_0_kv_lora_rank_norm',
            #                                           compressed_kv_page_0.shape)
            # diff = torch.abs(compressed_kv_page_0 - compressed_kv_to_check).max()
            # mae =  torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check))
            # print("compressed_kv diff norm -> ", diff)
            # print("compressed_kv mae norm -> ", mae)

            k_pe = k_pe.view(qlen, 1, rope_size)
            # compressed_kv is [qlen, 1, kv_lora_rank(512)]
            compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank)

            cos, sin = rotary_emb(q_pe, batch_position_ids)

            # q_nope_check = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below

            # q_nope_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_nope', q_nope_check[0].shape)
            # q_nope_0_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_q_nope_test', q_nope_check[0].shape)
            # diff = torch.abs(q_nope_check[0] - q_nope_0_to_check).max()
            # mae =  torch.mean(torch.abs(q_nope_check[0] - q_nope_0_to_check))
            # diff_test = torch.abs(q_nope_check[0] - q_nope_0_to_check_test).max()
            # mae_test =  torch.mean(torch.abs(q_nope_check[0] - q_nope_0_to_check_test))
            # print("q_nope[0] diff -> ", diff)
            # print("q_nope[0] mae -> ", mae)
            # print("q_nope[0] diff test -> ", diff_test)
            # print("q_nope[0] mae test -> ", mae_test)

            q_pe_nope = q_pe.transpose(0, 1)
            # q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe_nope[0].shape)
            # q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope', q_pe_nope[0].shape)
            # q_pe_0_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope_test', q_pe_nope[0].shape)
            # diff = torch.abs(q_pe_nope[0] - q_pe_0_to_check).max()
            # mae =  torch.mean(torch.abs(q_pe_nope[0] - q_pe_0_to_check))
            # diff_test = torch.abs(q_pe_nope[0] - q_pe_0_to_check_test).max()
            # mae_test =  torch.mean(torch.abs(q_pe_nope[0] - q_pe_0_to_check_test))
            # print("q_pe nope[0] diff -> ", diff)
            # print("q_pe nope[0] mae -> ", mae)
            # print("q_pe nope[0] diff test -> ", diff_test)
            # print("q_pe nope[0] mae test -> ", mae_test)

            # cos_to_check = load_fp32_tensor('./debug/query_0_tp_0_rope_cos', (qlen,32))
            # diff = torch.abs(cos[:,:32]-cos_to_check).max()
            # mae =  torch.mean(torch.abs(cos[:,:32]-cos_to_check))
            # print("cos diff -> ", diff)
            # print("cos mae -> ", mae)
            # sin_to_check = load_fp32_tensor('./debug/query_0_tp_0_rope_sin', (qlen,32))
            # diff = torch.abs(sin[:,:32]-sin_to_check).max()
            # mae =  torch.mean(torch.abs(sin[:,:32]-sin_to_check))
            # print("sin diff -> ", diff)
            # print("sin mae -> ", mae)

            # new_q_pe = q_pe.transpose(0, 1)
            # qa = new_q_pe[:,:,range(0,64,2)]
            # qb = new_q_pe[:,:,range(1,65,2)]
            # # q1 = (qa * cos[:,:32] - qb * sin[:,:32])
            # # q2 = (qb*cos[:,:32] + qa*sin[:,:32])
            # q1 = (qa * cos_to_check - qb * sin_to_check)
            # q2 = (qb*cos_to_check + qa*sin_to_check)
            # q_new = torch.cat((q1,q2), dim=-1)
            # print(f"q_pe shape{q_pe.shape}, k_pe shape {k_pe.shape}")
            # new_q_pe = torch.zeros_like(q_pe)
            # new_q_pe[:,:,range(0,64,2)] = 1
            # new_q_pe[:,:,range(1,65,2)] = 10
            q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1)
            q_pe = q_pe.squeeze(0)
            # q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)]
            q_pe.transpose_(0, 1)

            # diff = torch.abs(q_pe - q_new).max()
            # print("q_pe diff -> ", diff)

            # q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe[0].shape)
            # diff = torch.abs(q_pe[0] - q_pe_0_to_check).max()
            # mae =  torch.mean(torch.abs(q_pe[0] - q_pe_0_to_check))
            # print("q_pe[0] diff -> ", diff)
            # print("q_pe[0] mae -> ", mae)

            # diff = torch.abs(q_pe_0_to_check - q_new[0]).max()
            # mae =  torch.mean(torch.abs(q_pe_0_to_check - q_new[0]))
            # print("q_pe[0] 2  diff -> ", diff)
            # print("q_pe[0] 2 mae -> ", mae)

            if kv_cache is not None:
                cache_kwargs = {
                    "sin": sin,
                    "cos": cos,
                    "page_idx": batch_page_idx,
                    "page_offset": batch_page_offset,
                }  # Specific to RoPE models
                compressed_kv_with_k_pe = kv_cache.update(
                    compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs
                )
                compressed_kv = compressed_kv_with_k_pe[:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
                k_pe = compressed_kv_with_k_pe[:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
            # q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
            # out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
            # q_absorb, out_absorb = get_absorbed()
            # q_nope is [num_heads(128), qlen, qk_nope_head_dim(128)]
            q_nope = q_nope.transpose(0, 1)  # qlen is 1, no GPU overhead, same below

            # q_nope_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_nope', q_nope[0].shape)
            # diff = torch.abs(q_nope[0] - q_nope_0_to_check).max()
            # mae =  torch.mean(torch.abs(q_nope[0] - q_nope_0_to_check))
            # print("q_nope[0] diff -> ", diff)

            # q_nope is [num_heads(128), qlen, kv_lora_rank(512)]
            q_nope = torch.matmul(q_nope, q_absorb)  # batched MM

            # k_b_proj_check = load_fp16_tensor('./debug/query_0_tp_0_k_b_lora', (nope_size,kv_lora_rank))
            # diff = torch.abs(q_absorb[0] - k_b_proj_check).max()
            # print("kv b lora weight[0] diff -> ", diff)

            # q_absorb_check = load_fp16_tensor('./debug/query_0_tp_0_q_absorb', (kv_lora_rank,1024))
            # q_absorb_check = q_absorb_check[:,0:qlen].transpose(0,1)
            # diff = torch.abs(q_nope[0] - q_absorb_check).max()
            # mae =  torch.mean(torch.abs(q_nope[0] - q_absorb_check))
            # print("q_nope absorb diff -> ", diff)
            # print("q_nope absorb mae -> ", mae)

            # # q_nope is [qlen, num_heads(128), kv_lora_rank(512)]
            # q_nope = q_nope.transpose(0, 1)

            # we need to index out the compressed_kv and k_pe for the current batch
            batch_compressed_kv = None
            batch_k_pe = None
            for page_index in kv_index:
                if kv_total_len > page_size:
                    tmp_compressed_kv = compressed_kv[page_index, 0:page_size, :]
                    tmp_k_pe = k_pe[page_index, 0:page_size, :]
                    if batch_compressed_kv is None or batch_k_pe is None:
                        batch_compressed_kv = tmp_compressed_kv
                        batch_k_pe = tmp_k_pe
                    else:
                        batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
                        batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
                    kv_total_len -= page_size
                else:
                    tmp_compressed_kv = compressed_kv[page_index, 0:kv_total_len, :]
                    tmp_k_pe = k_pe[page_index, 0:kv_total_len, :]
                    if batch_compressed_kv is None or batch_k_pe is None:
                        batch_compressed_kv = tmp_compressed_kv
                        batch_k_pe = tmp_k_pe
                    else:
                        batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
                        batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
                    break
            # batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
            # batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]

            # k_pe_to_check = load_fp16_tensor('./debug/query_0_tp_0_page_0_k_rope', (256,64))
            # diff = torch.abs(batch_k_pe[:256] - k_pe_to_check).max()
            # mae =  torch.mean(torch.abs(batch_k_pe[:256] - k_pe_to_check))
            # print("k_pe diff -> ", diff)
            # print("k_pe mae -> ", mae)

            pe_weights = torch.matmul(q_pe, batch_k_pe.mT)
            kv_total_len = kv_page_nums * page_size
            # pe_weights_0 = load_fp16_tensor('./debug/query_0_tp_0_pe_attention_weights', (1024,4096))
            # pe_weights_0 = pe_weights_0[0:qlen, 0:kv_total_len]
            # diff = torch.abs(pe_weights[0] - pe_weights_0).max()
            # print("pe_weights[0] diff -> ", diff)

            attention_weights = pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT)

            # raw_weights = load_fp16_tensor('./debug/query_0_tp_0_raw_attention_weights', (1024, 4096))
            # raw_weights = raw_weights[0:qlen, 0:kv_total_len]
            # diff = torch.abs(attention_weights[0] - raw_weights).max()
            # print("raw attention_weights[0] diff -> ", diff)

            attention_weights = attention_weights * softmax_scale
            # attention_weights is [num_heads(128), qlen, k_len]

            # attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(qlen,-1,-1).transpose(0,1)

            # attention_masks[i] is [qlen, k_len]

            print(attention_weights.shape)
            print(attention_masks.shape)
            attention_weights = (
                attention_weights + attention_masks[: attention_weights.shape[1], : attention_weights.shape[2]]
            )
            # attention_weights shape is [num_heads(128), qlen, k_len]

            attention_weights = nn.functional.softmax(attention_weights, dim=-1, dtype=weight_type).to(q_pe.dtype)

            # attention_weights_0 = load_fp16_tensor('./debug/query_0_tp_0_attention_weights', (1024, 4096))
            # attention_weights_0 = attention_weights_0[0:qlen, 0:kv_total_len]
            # diff = torch.abs(attention_weights[0] - attention_weights_0).max()
            # print("attention_weights[0] diff -> ", diff)

            attn_output = torch.matmul(attention_weights, batch_compressed_kv)  # [num_heads(128),qlen, lora_rank(512)]
            # out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]

            # o_absorb_check = load_fp16_tensor('./debug/query_0_tp_0_o_absorb', (qlen,kv_lora_rank))
            # diff = torch.abs(attn_output[0] - o_absorb_check).max()
            # print("o absorb[0] diff -> ", diff)

            out_absorb = out_absorb.transpose(1, 2)  # [qlen, num_heads(128), v_head_dim(128)]
            # q for qlen, n for num_heads, h for v_head_dim, v for kv_lora_rank
            attn_output = torch.matmul(attn_output, out_absorb)  # [num_heads(128), qlen, v_head_dim(128)]

            # attn_output_check_0 = load_fp16_tensor('./debug/query_0_tp_0_attention_output', (qlen, nope_size))
            # diff = torch.abs(attn_output[0] - attn_output_check_0).max()
            # print("attn_output[0] diff -> ", diff)

            attn_output = attn_output.transpose(0, 1)  # [qlen, num_heads(128), v_head_dim(128)]
            attn_output = attn_output.reshape(qlen, num_heads * nope_size)

            w_o = o_proj.weight.view([hidden_size, num_heads * nope_size])
            output = torch.matmul(attn_output, w_o.transpose(0, 1))
            output = output.view(qlen, hidden_size)

            # output_0_check = load_fp16_tensor('./debug/query_0_tp_0_qlen_output', (qlen, hidden_size))
            # h1_o = w_o[:,:128]
            # local_o_check = load_fp16_tensor('./debug/query_0_tp_0_local_w_o', (hidden_size, 128))
            # diff = torch.abs(local_o_check - h1_o).max()
            # print("local w_o diff -> ", diff)

            # h1_output = torch.matmul(attn_output[:,:128],h1_o.transpose(0,1))
            # diff = torch.abs(h1_output - output_0_check).max()
            # print("h1_output diff -> ", diff)

            # output_check = load_fp16_tensor('./debug/output.bin', output.shape)
            # diff = torch.abs(output - output_check).max()
            # mae =   torch.mean(torch.abs(output - output_check))
            # print("output diff -> ", diff)

            final_attention_output = torch.cat((final_attention_output, output), dim=0)
        return final_attention_output

    torch_output = torch_attn(
        hidden_states,
        kv_cache,
        position_ids,
        page_idx,
        page_offset,
        attention_masks=attention_masks,
        q_indptr=q_indptr,
        kv_indices=kv_indices,
        kv_indptr=kv_indptr,
        bsz_tensors=bsz_tensors,
        last_page_len=last_page_len,
        layer_idx=0,
    )
    print("Torch Output: ", torch_output)
    return torch_output


torch.set_printoptions(sci_mode=False, precision=5)
output_cpu = test_cpu_mla()
output_torch = test_torch()
print("Output CPU: ", output_cpu)
print("Output Torch: ", output_torch)
diff = (output_cpu - output_torch).abs()
# 计算相对误差
diff_relative = diff / (output_cpu.abs())
# 把 diff_relative 中的 NaN 替换为 0
diff_relative = torch.where(torch.isnan(diff_relative), torch.zeros_like(diff_relative), diff_relative)
diff_relative_mean = torch.mean(torch.abs(output_cpu - output_torch)) / torch.mean(torch.abs(output_torch))

print(
    f"Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()},  relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}"
)
assert diff_relative_mean < 2e-1, "CPU and Torch outputs are not close enough!"


================================================
FILE: kt-kernel/examples/test_mla_qlen.py
================================================
import logging
import os, sys
import time
from typing import Optional

os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
from torch.nn import init
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding

logger = logging.getLogger("reader")

from gguf.gguf_reader import GGUFReader


def read_gguf_file(gguf_file_path):
    """
    Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.

    Parameters:
    - gguf_file_path: Path to the GGUF file.
    """

    reader = GGUFReader(gguf_file_path)

    # List all key-value pairs in a columnized format
    # print("Key-Value Pairs:") # noqa: NP100
    # max_key_length = max(len(key) for key in reader.fields.keys())
    for key, field in reader.fields.items():
        value = field.parts[field.data[0]]
        # print(f"{key:{max_key_length}} : {value}") # noqa: NP100
    # print("----") # noqa: NP100

    # List all tensors
    # print("Tensors:") # noqa: NP100
    # tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
    # print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100
    # print("-" * 80) # noqa: NP100
    re = []
    for tensor in reader.tensors:
        shape_str = "x".join(map(str, tensor.shape))
        size_str = str(tensor.n_elements)
        quantization_str = tensor.tensor_type.name
        # print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100
        re.append(tensor)
    return re


def get_torch_tensor_from_gguf(gguf_weights, name):
    return torch.from_numpy(gguf_weights[name].data).contiguous()


def get_torch_tensor_and_type_from_gguf(gguf_weights, name):
    return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name


def type_to_ggml_type(type):
    if type == "F32":
        return ggml_type.FP32
    elif type == "F16":
        return ggml_type.FP16
    elif type == "BF16":
        return ggml_type.BF16
    else:
        raise ValueError(f"Unsupported data type: {type}")


use_real_weights = True
gguf_path = "/home/bd/models/DeepSeek-R1-BF16"

seed = 42  # 你可以选择任何整数作为种子
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

qlen = 1024
kvlen = 0


page_table = range(20)
bsz_tensors = torch.tensor([1])


page_size = 256
pages_count = 200
tp_count = 4


hidden_size = 7168
q_lora_rank = 1536
kv_lora_rank = 512
num_heads = 128
nope_size = 128
rope_size = 64

rope_theta = 10000
max_qlen = 1024
max_kvlen = 4096

max_position_embeddings = 163840


rope_scaling = {
    "beta_fast": 32,
    "beta_slow": 1,
    "factor": 40,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
    "original_max_position_embeddings": 4096,
    "type": "yarn",
}


CPUInfer = kt_kernel_ext.CPUInfer(64)
validation_iter = 100


# data_type = torch.float32
weight_type = torch.bfloat16
# weight_type = torch.float16


input_type = {
    torch.float32: torch.float32,
    torch.float16: torch.float16,
    torch.bfloat16: torch.float32,
}[weight_type]

q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=weight_type)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=weight_type)
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=weight_type)
kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=weight_type)
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=weight_type)
q_a_norm = torch.ones(hidden_size, dtype=torch.float32)
kv_a_norm = torch.ones(hidden_size, dtype=torch.float32)


def read_gguf_directory(directory):
    """
    Reads all GGUF files in a directory and prints their contents.

    Parameters:
    - directory: Path to the directory containing GGUF files.
    """
    if not os.path.isdir(directory):
        logger.error(f"Directory {directory} does not exist.")
        return

    # List all GGUF files in the directory
    files = [f for f in os.listdir(directory) if f.endswith(".gguf")]
    if not files:
        logger.info(f"No GGUF files found in {directory}.")
        return

    re = []
    for file in files:
        file_path = os.path.join(directory, file)
        # print(f"Reading {file_path}:") # noqa: NP100
        # print("\n") # noqa: NP100
        re.extend(read_gguf_file(file_path))
    re = {r.name: r for r in re}
    return re


if use_real_weights := True:
    gguf_weights = read_gguf_directory(gguf_path)
    layer_idx = 0
    q_a_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a.weight")
    q_a_proj.weight = nn.Parameter(q_a_proj_weight.view(torch.bfloat16), requires_grad=False)
    q_a_type = type

    q_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a_norm.weight")
    q_a_norm = q_a_norm_weight.view(torch.float32)
    # config.q_a_norm = q_a_norm_weight.data_ptr()
    # config.q_a_norm_type = type_to_ggml_type(type)

    q_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_b.weight")
    q_b_proj.weight = nn.Parameter(q_b_proj_weight.view(torch.bfloat16), requires_grad=False)

    kv_a_proj_with_mqa_weight, type = get_torch_tensor_and_type_from_gguf(
        gguf_weights, f"blk.{layer_idx}.attn_kv_a_mqa.weight"
    )
    kv_a_proj_with_mqa.weight = nn.Parameter(kv_a_proj_with_mqa_weight.view(torch.bfloat16), requires_grad=False)

    kv_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_a_norm.weight")
    kv_a_norm = kv_a_norm_weight.view(torch.float32)
    # config.kv_a_norm = kv_a_norm_weight.data_ptr()
    # config.kv_a_norm_type = type_to_ggml_type(type)

    kv_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_b.weight")
    kv_b_proj.weight = nn.Parameter(kv_b_proj_weight.view(torch.bfloat16), requires_grad=False)

    o_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_output.weight")
    o_proj.weight = nn.Parameter(o_proj_weight.view(torch.bfloat16), requires_grad=False)

else:
    init.normal_(q_a_proj.weight, mean=0.0, std=0.02)
    init.normal_(q_b_proj.weight, mean=0.0, std=0.02)
    init.normal_(kv_a_proj_with_mqa.weight, mean=0.0, std=0.02)
    init.normal_(kv_b_proj.weight, mean=0.0, std=0.02)
    init.normal_(o_proj.weight, mean=0.0, std=0.02)

x_reshaped = kv_b_proj.weight.view(num_heads, 2, nope_size, kv_lora_rank)
q_absorb = x_reshaped[:, 0]
out_absorb = x_reshaped[:, 1]


hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()


def build_mla():
    os.environ["BLAS_NUM_THREADS"] = "1"
    q_a_proj_weight = q_a_proj.weight.to(weight_type).to("cpu").contiguous()
    q_b_proj_weight = q_b_proj.weight.to(weight_type).to("cpu").contiguous()
    kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(weight_type).contiguous()
    kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to("cpu").contiguous()
    o_proj_weight = o_proj.weight.to(weight_type).to("cpu").contiguous()

    config = kt_kernel_ext.mla.MLAConfig(
        hidden_size,
        q_lora_rank,
        kv_lora_rank,
        num_heads,
        nope_size,
        rope_size,
    )
    config.max_qlen = max_qlen
    config.max_kvlen = max_kvlen
    config.max_position_embeddings = max_position_embeddings
    config.rope_scaling_factor = rope_scaling["factor"]
    config.rope_theta = rope_theta
    config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
    config.rope_scaling_beta_slow = rope_scaling["beta_slow"]
    config.rope_scaling_mscale = rope_scaling["mscale"]
    config.rope_scaling_mscale_all_dim = rope_scaling["mscale_all_dim"]
    config.rope_scaling_original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]

    config.q_a_proj = q_a_proj_weight.data_ptr()
    config.q_b_proj = q_b_proj_weight.data_ptr()
    config.kv_a_proj_with_mqa = kv_a_proj_with_mqa_weight.data_ptr()
    config.kv_b_proj = kv_b_proj_weight.data_ptr()
    config.o_proj = o_proj_weight.data_ptr()

    config.q_a_norm = q_a_norm.data_ptr()
    config.q_a_norm_type = ggml_type.FP32
    config.kv_a_norm = kv_a_norm.data_ptr()
    config.kv_a_norm_type = ggml_type.FP32

    if weight_type == torch.float32:
        config.q_a_proj_type = ggml_type.FP32
        config.q_b_proj_type = ggml_type.FP32
        config.kv_a_proj_with_mqa_type = ggml_type.FP32
        config.kv_b_proj_type = ggml_type.FP32
        config.w_o_type = ggml_type.FP32
    elif weight_type == torch.float16:
        config.q_a_proj_type = ggml_type.FP16
        config.q_b_proj_type = ggml_type.FP16
        config.kv_a_proj_with_mqa_type = ggml_type.FP16
        config.kv_b_proj_type = ggml_type.FP16
        config.w_o_type = ggml_type.FP16
    elif weight_type == torch.bfloat16:
        config.q_a_proj_type = ggml_type.BF16
        config.q_b_proj_type = ggml_type.BF16
        config.kv_a_proj_with_mqa_type = ggml_type.BF16
        config.kv_b_proj_type = ggml_type.BF16
        config.w_o_type = ggml_type.BF16
    else:
        raise ValueError(f"Unsupported data type: {weight_type}")

    config.pool = CPUInfer.backend_

    if weight_type == torch.float32:
        mla = kt_kernel_ext.mla.MLA_F32(config)
    elif weight_type == torch.float16:
        mla = kt_kernel_ext.mla.MLA_F16(config)
    elif weight_type == torch.bfloat16:
        mla = kt_kernel_ext.mla.MLA_F32(config)
    else:
        raise ValueError(f"Unsupported data type: {weight_type}")

    mla.load_weights()
    mla.set_local_pages(pages_count)
    return mla


def load_fp32_tensor(file_path, shape):
    with open(file_path, "rb") as f:
        raw_data = f.read()
    tensor = torch.frombuffer(raw_data, dtype=torch.float32)
    tensor = tensor.view(shape)  # 根据你的 shape reshape
    return tensor


# page3 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/query_0_tp_0_page_3_kv_lora_rank_norm.f32',(page_size,kv_lora_rank))
# page3_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/query_0_tp_0_page_3_kv_lora_rank_norm.f32',(page_size,kv_lora_rank))

# diff = torch.abs(page3 - page3_2)
# print(f'Diff: ave:{diff.mean()}, max:{diff.max()}')

# q_pe_1 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/query_0_tp_0_q_rope.f32',(1, rope_size))
# q_pe_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/query_0_tp_0_q_rope.f32',(qlen, rope_size))
# diff = torch.abs(q_pe_1 - q_pe_2[-1])
# print(f'Q PE Diff: ave:{diff.mean()}, max:{diff.max()}')

# q_nope_1 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/query_0_tp_0_q_nope.f32',(1, nope_size))
# q_nope_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/query_0_tp_0_q_nope.f32',(qlen, nope_size))
# diff = torch.abs(q_nope_1 - q_nope_2[-1])
# print(f'Q Nope Diff: ave:{diff.mean()}, max:{diff.max()}')


# pe_attn_w_1 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/query_0_tp_0_pe_attention_weights.f32',(1,max_kvlen))
# pe_attn_w_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/query_0_tp_0_pe_attention_weights.f32',(qlen,max_kvlen))
# diff = torch.abs(pe_attn_w_1 - pe_attn_w_2[-1])
# print(f'PE Attention Weights Diff: ave:{diff.mean()}, max:{diff.max()}')


# raw_attn_w_1 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/query_0_tp_0_raw_attention_weights.f32',(1,max_kvlen))
# raw_attn_w_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/query_0_tp_0_raw_attention_weights.f32',(qlen,max_kvlen))
# diff = torch.abs(raw_attn_w_1 - raw_attn_w_2[-1])
# print(f'Raw Attention Weights Diff: ave:{diff.mean()}, max:{diff.max()}')


# output_1 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/output.bin.f32',shape=(1, hidden_size))
# output_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/output.bin.f32',shape=(qlen, hidden_size))

# diff = torch.abs(output_1 - output_2[-1])
# print(f'Output Diff: ave:{diff.mean()}, max:{diff.max()}')


mla = build_mla()
output = torch.zeros((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
mla.forward([qlen], [page_table], [kvlen], hidden_states.data_ptr(), output.data_ptr())
print("CPU MLA Output: ", output[-1])


output_2 = torch.zeros((1, hidden_size), dtype=input_type).to("cpu").contiguous()
mla.forward([1], [page_table], [qlen - 1], hidden_states[-1].data_ptr(), output_2.data_ptr())
print("CPU MLA Output 2: ", output_2[-1])

diff = torch.abs(output[-1] - output_2[-1])
print(f"Diff: ave:{diff.mean()}, max:{diff.max()}")
assert diff.max() < 1e-1, "CPU and Torch outputs are not close enough!"


================================================
FILE: kt-kernel/examples/test_mla_quant.py
================================================
import logging
import os, sys
import time
from typing import Optional

os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
from torch.nn import init
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding

logger = logging.getLogger("reader")

from gguf.gguf_reader import GGUFReader


def load_fp32_tensor_raw(file_path):
    # return torch.zeros(shape)
    with open(file_path, "rb") as f:
        raw_data = f.read()
    tensor = torch.frombuffer(raw_data, dtype=torch.float32)
    return tensor


def load_fp16_tensor(file_path, shape=None):
    # return load_fp32_tensor(file_path, shape)
    return load_fp32_tensor_raw(file_path)
    # return torch.zeros(shape)
    with open(file_path, "rb") as f:
        raw_data = f.read()
    tensor = torch.frombuffer(raw_data, dtype=weight_type)
    tensor = tensor.view(shape)  # 根据你的 shape reshape
    return tensor


def load_fp32_tensor(file_path, shape):
    # return torch.zeros(shape)
    with open(file_path, "rb") as f:
        raw_data = f.read()
    tensor = torch.frombuffer(raw_data, dtype=torch.float32)
    tensor = tensor.view(shape)  # 根据你的 shape reshape
    return tensor


def test_torch():
    torch.set_grad_enabled(False)

    hidden_states_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_input.bin")
    hidden_states_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_input.bin")
    # diff = torch.abs(hidden_states_to_check_prefill - hidden_states_to_check_decode).max()
    # print("hidden_states diff -> ", diff)

    q_lora_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_qlora.bin")
    q_lora_to_check_test_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_qlora_test.bin")
    q_lora_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_qlora.bin")
    q_lora_to_check_test_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_qlora_test.bin")
    # diff = torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode).max()
    # diff_test = torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode).max()
    # print("q_lora max diff -> ", diff)
    # print("q_lora max diff test -> ", diff_test)
    # mae =  torch.mean(torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode))
    # mae_test =  torch.mean(torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode))
    # print("q_lora mae -> ", mae)
    # print("q_lora mae test -> ", mae_test)

    # q_lora_norm = q_a_layernorm(q_lora)
    # q_lora_norm_to_check = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm.bin', q_lora_norm.shape)
    # q_lora_norm_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm_test.bin', q_lora_norm.shape)
    # diff = torch.abs(q_lora_norm - q_lora_norm_to_check).max()
    # mae =  torch.mean(torch.abs(q_lora_norm - q_lora_norm_to_check))
    # diff_test = torch.abs(q_lora_norm - q_lora_norm_to_check_test).max()
    # mae_test =  torch.mean(torch.abs(q_lora_norm - q_lora_norm_to_check_test))
    # print("q_lora_norm diff -> ", diff)
    # print("q_lora_norm mae -> ", mae)
    # print("q_lora_norm diff test -> ", diff_test)
    # print("q_lora_norm mae test -> ", mae_test)

    # q = q_b_proj(q_lora_norm)
    # for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
    # q = q.view(qlen, num_heads, nope_size+rope_size)
    # q_nope is [qlen, num_heads(128), qk_nope_head_dim(128)]
    # q_pe is [qlen, num_heads(128), qk_rope_head_dim(64)]
    # q_nope, q_pe = torch.split(
    #     q, [nope_size, rope_size], dim=-1
    # )

    # compressed_kv is [qlen, kv_lora_rank(512) + rope(64)]
    # compressed_kv = kv_a_proj_with_mqa(batch_hidden_states)
    # compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)]
    # compressed_kv, k_pe = torch.split(
    #     compressed_kv, [kv_lora_rank, rope_size], dim=-1
    # )
    # compressed_kv = compressed_kv.contiguous()

    # compressed_kv_page_0 = compressed_kv[0:page_size, :]
    compressed_kv_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_kv_lora_rank")
    compressed_kv_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_kv_lora_rank")
    # diff = torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode).max()
    # mae =  torch.mean(torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode))
    # print("compressed_kv diff -> ", diff)
    # print("compressed_kv mae -> ", mae)

    # compressed_kv = kv_a_layernorm(compressed_kv)
    # k_pe is [qlen, 1, qk_rope_head_dim(64)]

    # compressed_kv_page_0 = compressed_kv[0:page_size, :]
    compressed_kv_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_kv_lora_rank_norm")
    compressed_kv_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_kv_lora_rank_norm")
    # diff = torch.abs(compressed_kv_page_0 - compressed_kv_to_check).max()
    # mae =  torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check))
    # print("compressed_kv diff norm -> ", diff)
    # print("compressed_kv mae norm -> ", mae)

    # k_pe = k_pe.view(qlen, 1, rope_size)
    # compressed_kv is [qlen, 1, kv_lora_rank(512)]
    # compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank)

    # cos, sin = rotary_emb(q_pe, batch_position_ids)

    # q_nope_check = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below

    # q_nope_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_nope', q_nope_check[0].shape)
    # q_nope_0_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_q_nope_test', q_nope_check[0].shape)
    # diff = torch.abs(q_nope_check[0] - q_nope_0_to_check).max()
    # mae =  torch.mean(torch.abs(q_nope_check[0] - q_nope_0_to_check))
    # diff_test = torch.abs(q_nope_check[0] - q_nope_0_to_check_test).max()
    # mae_test =  torch.mean(torch.abs(q_nope_check[0] - q_nope_0_to_check_test))
    # print("q_nope[0] diff -> ", diff)
    # print("q_nope[0] mae -> ", mae)
    # print("q_nope[0] diff test -> ", diff_test)
    # print("q_nope[0] mae test -> ", mae_test)

    # q_pe_nope = q_pe.transpose(0,1)
    q_pe_0_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_q_rope")
    q_pe_0_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_q_rope")

    # q_pe_0_to_check_decode_test = load_fp16_tensor('./debug_decode/query_0_tp_0_q_rope_test')
    # q_pe_0_to_check_prefill_test = load_fp16_tensor('./debug_prefill/query_0_tp_0_q_rope_test')

    # q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope', q_pe_nope[0].shape)
    # q_pe_0_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope_test', q_pe_nope[0].shape)
    # diff = torch.abs(q_pe_nope[0] - q_pe_0_to_check).max()
    # mae =  torch.mean(torch.abs(q_pe_nope[0] - q_pe_0_to_check))
    # diff_test = torch.abs(q_pe_nope[0] - q_pe_0_to_check_test).max()
    # mae_test =  torch.mean(torch.abs(q_pe_nope[0] - q_pe_0_to_check_test))
    # print("q_pe nope[0] diff -> ", diff)
    # print("q_pe nope[0] mae -> ", mae)
    # print("q_pe nope[0] diff test -> ", diff_test)
    # print("q_pe nope[0] mae test -> ", mae_test)

    # cos_to_check = load_fp32_tensor('./debug/query_0_tp_0_rope_cos', (qlen,32))
    # diff = torch.abs(cos[:,:32]-cos_to_check).max()
    # mae =  torch.mean(torch.abs(cos[:,:32]-cos_to_check))
    # print("cos diff -> ", diff)
    # print("cos mae -> ", mae)
    # sin_to_check = load_fp32_tensor('./debug/query_0_tp_0_rope_sin', (qlen,32))
    # diff = torch.abs(sin[:,:32]-sin_to_check).max()
    # mae =  torch.mean(torch.abs(sin[:,:32]-sin_to_check))
    # print("sin diff -> ", diff)
    # print("sin mae -> ", mae)

    # new_q_pe = q_pe.transpose(0, 1)
    # qa = new_q_pe[:,:,range(0,64,2)]
    # qb = new_q_pe[:,:,range(1,65,2)]
    # q1 = (qa * cos[:,:32] - qb * sin[:,:32])
    # q2 = (qb*cos[:,:32] + qa*sin[:,:32])
    # q1 = (qa * cos_to_check - qb * sin_to_check)
    # q2 = (qb*cos_to_check + qa*sin_to_check)
    # q_new = torch.cat((q1,q2), dim=-1)
    # print(f"q_pe shape{q_pe.shape}, k_pe shape {k_pe.shape}")
    # new_q_pe = torch.zeros_like(q_pe)
    # new_q_pe[:,:,range(0,64,2)] = 1
    # new_q_pe[:,:,range(1,65,2)] = 10
    # q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1)
    # q_pe = q_pe.squeeze(0)
    # q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)]
    # q_pe.transpose_(0, 1)

    # diff = torch.abs(q_pe - q_new).max()
    # print("q_pe diff -> ", diff)

    # q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe[0].shape)
    # diff = torch.abs(q_pe[0] - q_pe_0_to_check).max()
    # mae =  torch.mean(torch.abs(q_pe[0] - q_pe_0_to_check))
    # print("q_pe[0] diff -> ", diff)
    # print("q_pe[0] mae -> ", mae)

    # diff = torch.abs(q_pe_0_to_check - q_new[0]).max()
    # mae =  torch.mean(torch.abs(q_pe_0_to_check - q_new[0]))
    # print("q_pe[0] 2  diff -> ", diff)
    # print("q_pe[0] 2 mae -> ", mae)

    # if kv_cache is not None:
    #     cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset}  # Specific to RoPE models
    #     compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs)
    #     compressed_kv = compressed_kv_with_k_pe [:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
    #     k_pe = compressed_kv_with_k_pe [:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
    # # q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
    # # out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
    # # q_absorb, out_absorb = get_absorbed()
    # # q_nope is [num_heads(128), qlen, qk_nope_head_dim(128)]
    # q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below

    # q_nope_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_nope', q_nope[0].shape)
    # diff = torch.abs(q_nope[0] - q_nope_0_to_check).max()
    # mae =  torch.mean(torch.abs(q_nope[0] - q_nope_0_to_check))
    # print("q_nope[0] diff -> ", diff)

    # # q_nope is [num_heads(128), qlen, kv_lora_rank(512)]
    # q_nope = torch.matmul(q_nope, q_absorb) # batched MM

    # k_b_proj_check = load_fp16_tensor('./debug/query_0_tp_0_k_b_lora', (nope_size,kv_lora_rank))
    # diff = torch.abs(q_absorb[0] - k_b_proj_check).max()
    # print("kv b lora weight[0] diff -> ", diff)

    # q_absorb_check = load_fp16_tensor('./debug/query_0_tp_0_q_absorb', (kv_lora_rank,1024))
    # q_absorb_check = q_absorb_check[:,0:qlen].transpose(0,1)
    # diff = torch.abs(q_nope[0] - q_absorb_check).max()
    # mae =  torch.mean(torch.abs(q_nope[0] - q_absorb_check))
    # print("q_nope absorb diff -> ", diff)
    # print("q_nope absorb mae -> ", mae)

    # # q_nope is [qlen, num_heads(128), kv_lora_rank(512)]
    # q_nope = q_nope.transpose(0, 1)

    # we need to index out the compressed_kv and k_pe for the current batch
    # batch_compressed_kv = None
    # batch_k_pe = None
    # for page_index in kv_index:
    #     if kv_total_len > page_size:
    #         tmp_compressed_kv = compressed_kv[page_index, 0:page_size, :]
    #         tmp_k_pe = k_pe[page_index, 0:page_size, :]
    #         if batch_compressed_kv is None or batch_k_pe is None:
    #             batch_compressed_kv = tmp_compressed_kv
    #             batch_k_pe = tmp_k_pe
    #         else:
    #             batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
    #             batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
    #         kv_total_len -= page_size
    #     else:
    #         tmp_compressed_kv = compressed_kv[page_index, 0:kv_total_len, :]
    #         tmp_k_pe = k_pe[page_index, 0:kv_total_len, :]
    #         if batch_compressed_kv is None or batch_k_pe is None:
    #             batch_compressed_kv = tmp_compressed_kv
    #             batch_k_pe = tmp_k_pe
    #         else:
    #             batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
    #             batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
    #         break
    # batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
    # batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]

    k_pe_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_k_rope", (256, 64))
    k_pe_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_k_rope", (256, 64))
    # diff = torch.abs(k_pe_to_check_prefill - k_pe_to_check_decode).max()
    # mae =  torch.mean(k_pe_to_check_prefill - k_pe_to_check_decode)
    # print("k_pe diff -> ", diff)
    # print("k_pe mae -> ", mae)

    # pe_weights = torch.matmul(q_pe,batch_k_pe.mT)
    # kv_total_len = kv_page_nums * page_size
    pe_weights_0_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_pe_attention_weights", (1024, 4096))
    pe_weights_0_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_pe_attention_weights", (1024, 4096))

    # diff = torch.abs(pe_weights[0] - pe_weights_0).max()
    # print("pe_weights[0] diff -> ", diff)

    # attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT))

    # raw_weights = load_fp16_tensor('./debug/query_0_tp_0_raw_attention_weights', (1024, 4096))
    # raw_weights = raw_weights[0:qlen, 0:kv_total_len]
    # diff = torch.abs(attention_weights[0] - raw_weights).max()
    # print("raw attention_weigh/ts[0] diff -> ", diff)

    # attention_weights = attention_weights * softmax_scale
    # attention_weights is [num_heads(128), qlen, k_len]

    # attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(qlen,-1,-1).transpose(0,1)

    # attention_masks[i] is [qlen, k_len]

    # attention_weights = (attention_weights + attention_masks)
    # attention_weights shape is [num_heads(128), qlen, k_len]

    # attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=weight_type).to(q_pe.dtype)

    attention_weights_0_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_attention_weights", (1024, 4096))
    attention_weights_0_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_attention_weights", (1024, 4096))

    # attention_weights_0 = attention_weights_0[0:qlen, 0:kv_total_len]
    # diff = torch.abs(attention_weights[0] - attention_weights_0).max()
    # print("attention_weights[0] diff -> ", diff)

    # attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
    # out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]

    # o_absorb_check = load_fp16_tensor('./debug/query_0_tp_0_o_absorb', (qlen,kv_lora_rank))
    # diff = torch.abs(attn_output[0] - o_absorb_check).max()
    # print("o absorb[0] diff -> ", diff)

    # out_absorb = out_absorb.transpose(1, 2) # [qlen, num_heads(128), v_head_dim(128)]
    # # q for qlen, n for num_heads, h for v_head_dim, v for kv_lora_rank
    # attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)]

    # attn_output_check_0 = load_fp16_tensor('./debug/query_0_tp_0_attention_output', (qlen, nope_size))
    # diff = torch.abs(attn_output[0] - attn_output_check_0).max()
    # print("attn_output[0] diff -> ", diff)

    # attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)]
    # attn_output = attn_output.reshape(qlen, num_heads * nope_size)

    # w_o = o_proj.weight.view([hidden_size,num_heads * nope_size])
    # output = torch.matmul(attn_output,w_o.transpose(0,1))
    # output = output.view(qlen, hidden_size)

    # output_0_check = load_fp16_tensor('./debug/query_0_tp_0_qlen_output', (qlen, hidden_size))
    # h1_o = w_o[:,:128]
    # local_o_check = load_fp16_tensor('./debug/query_0_tp_0_local_w_o', (hidden_size, 128))
    # diff = torch.abs(local_o_check - h1_o).max()
    # print("local w_o diff -> ", diff)

    # h1_output = torch.matmul(attn_output[:,:128],h1_o.transpose(0,1))
    # diff = torch.abs(h1_output - output_0_check).max()
    # print("h1_output diff -> ", diff)

    output_check_decode = load_fp16_tensor("./debug_decode/output.bin")
    output_check_prefill = load_fp16_tensor("./debug_prefill/output.bin")
    # diff = torch.abs(output - output_check).max()
    # mae =   torch.mean(torch.abs(output - output_check))
    # print("output diff -> ", diff)

    return None


torch.set_printoptions(sci_mode=False, precision=5)
# output_cpu = test_cpu_mla()
# output_cpu_quant = test_cpu_mla_quant()
output_torch = test_torch()
# print("Output CPU: ", output_cpu)
# print("Output CPU: ", output_cpu_quant)
# print("Output Torch: ", output_torch)
# diff = (output_cpu - output_torch).abs()
# # 计算相对误差
# diff_relative = diff / (output_cpu.abs())
# # 把 diff_relative 中的 NaN 替换为 0
# diff_relative = torch.where(torch.isnan(diff_relative), torch.zeros_like(diff_relative), diff_relative)
# diff_relative_mean = torch.mean(torch.abs(output_cpu-output_torch)) / torch.mean(torch.abs(output_torch))

# print(f'Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()},  relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}')
# assert diff_relative_mean < 2e-1, "CPU and Torch outputs are not close enough!"


================================================
FILE: kt-kernel/examples/test_mla_simple.py
================================================
import math
import random
import os, sys
import time
import subprocess
import platform
import json
from typing import Any, Dict, Optional, Tuple
import numpy as np
import torch.nn.init as init
from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding

import torch
from tqdm import tqdm
from torch import nn
"""
"rope_scaling": {
    "beta_fast": 32,
    "beta_slow": 1,
    "factor": 40,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
    "original_max_position_embeddings": 4096,
    "type": "yarn"
  },
""" 

rope_scaling = {
    "beta_fast": 32,
    "beta_slow": 1,
    "factor": 40,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
    "original_max_position_embeddings": 4096,
    "type": "yarn"
}
seed = 42  # 你可以选择任何整数作为种子
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

# "rope_theta": 10000
rope_theta = 10000


hidden_size = 7168
num_heads = 128
kv_lora_rank = 512
q_lora_rank = 512
nope_size = 128
rope_size = 64

# page 的个数
page_nums = 10
page_size = 512
layer_num = 10
max_position_embeddings =  163840


warm_up_iter = 1000
test_iter = 1000

q_len = 200
his_kv_len = 128

bsz_tensors=torch.tensor([1])

softmax_scale = (nope_size + rope_size) ** -0.5
# 1代表的是压缩的kv的头数
k_caches = torch.randn(layer_num,page_nums, page_size,1, kv_lora_rank + rope_size).to(torch.float16)
kv_cache = KDeepSeekV3Cache(page_size=page_size, kv_lora_rank=kv_lora_rank, k_caches=k_caches)

q_a_layernorm = DeepseekV2RMSNorm(q_lora_rank)

x = torch.randn(q_lora_rank, dtype=torch.float16)*100
print(x)
print(q_a_layernorm(x))

kv_a_layernorm = DeepseekV2RMSNorm(kv_lora_rank)

q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=torch.float16)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=torch.float16)
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=torch.float16)
kv_b_proj = nn.Linear(kv_lora_rank, num_heads * (nope_size + nope_size), bias=False, dtype=torch.float16)
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=torch.float16)

init.normal_(q_a_proj.weight, mean=0.0, std=0.02)
init.normal_(q_b_proj.weight, mean=0.0, std=0.02)
init.normal_(kv_a_proj_with_mqa.weight, mean=0.0, std=0.02)
init.normal_(kv_b_proj.weight, mean=0.0, std=0.02)
init.normal_(o_proj.weight, mean=0.0, std=0.02)
# # 这里的权重初始化是为了测试
# # 将权重设置为全 1
# with torch.no_grad():
#     q_a_proj.weight.fill_(1.0)
#     q_b_proj.weight.fill_(1.0)
#     kv_a_proj_with_mqa.weight.fill_(1.0)
#     kv_b_proj.weight.fill_(1.0)
#     o_proj.weight.fill_(1.0)

q_absorb = torch.randn(num_heads, nope_size, kv_lora_rank, dtype=torch.float16)
out_absorb = torch.randn(num_heads, nope_size, kv_lora_rank, dtype=torch.float16)

rotary_emb = DeepseekV3YarnRotaryEmbedding(
    rope_size,
    max_position_embeddings=max_position_embeddings,
    scaling_factor=rope_scaling["factor"],
    base=rope_theta,
    beta_fast=rope_scaling["beta_fast"],
    beta_slow=rope_scaling["beta_slow"],
    mscale=rope_scaling["mscale"],
    mscale_all_dim=rope_scaling["mscale_all_dim"],
    original_max_position_embeddings=rope_scaling["original_max_position_embeddings"],
)
# 构造一个q_len 长度的输入 hidden_states, 对应的历史 kv_indptr 是[0:bsz]
# kv_indices 是[0:bsz]，page_idx=[0:bsz], page_offset=[his_kv_len:q_len+his_kv_len]
# last_page_len = [q_len+his_kv_len,...] layer_idx = 1
# position_ids = [his_kv_len:q_len+his_kv_len]
hidden_states = torch.randn(q_len, hidden_size, dtype=torch.float16)
q_indptr = torch.tensor([0,q_len]).to(torch.int32)
kv_indptr = torch.tensor(range(0, bsz_tensors[0] + 1)).to(torch.int32)
kv_indices = torch.tensor(range(0, bsz_tensors[0])).to(torch.int32)
page_idx = torch.tensor(range(0, bsz_tensors[0])).to(torch.int32)
page_offset = torch.tensor(range(his_kv_len, his_kv_len + q_len)).to(torch.int32)
last_page_len = torch.tensor([q_len+his_kv_len]*bsz_tensors[0], device=hidden_states.device)
position_ids = torch.tensor(range(his_kv_len, his_kv_len + q_len)).to(torch.int32)


# 按照行创建 mask [q_len,his_kv_len+q_len]
attention_masks = torch.zeros((q_len, his_kv_len + q_len), dtype=torch.float16)
for i in range(q_len):
    attention_masks[i, i + his_kv_len + 1: i + his_kv_len + q_len] = -65504.0


def torch_attn(hidden_states: torch.Tensor,
                kv_cache: KDeepSeekV3Cache,
                position_ids: torch.Tensor,
                page_idx: torch.Tensor,
                page_offset: torch.Tensor,
                attention_masks: Optional[list[torch.Tensor]] = None,
                q_indptr: Optional[torch.Tensor] = None,
                kv_indices: Optional[torch.Tensor] = None,
                kv_indptr: Optional[torch.Tensor] = None,
                bsz_tensors: Optional[torch.Tensor] = None,
                last_page_len: Optional[torch.Tensor] = None,
                layer_idx: Optional[int] = None,
                ):
    global out_absorb
    global q_absorb
    # range bsz_tensors
    final_attention_output = torch.tensor([], device=hidden_states.device)
    for i in range(bsz_tensors[0]):
        print("page_idx", page_idx)
        print("page_offset", page_offset)
        print("q_indptr", q_indptr)
        print("kv_indices", kv_indices)
        print("kv_indptr", kv_indptr)

        batch_num_tokens_tensors = q_indptr[i+1] - q_indptr[i]
        batch_last_page_len = last_page_len[i]
        # kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe
        batch_page_idx = page_idx[q_indptr[i]:q_indptr[i+1]]
        print('batch_page_idx',batch_page_idx)
        batch_page_offset = page_offset[q_indptr[i]:q_indptr[i+1]]
        # kv_page_nums is the number of pages for the current batch
        kv_page_nums = kv_indptr[i+1] - kv_indptr[i]
        # kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm)
        kv_total_len = kv_page_nums * page_size
        if batch_last_page_len is not None:
            kv_total_len = kv_total_len - (page_size - batch_last_page_len)
        # print(f"kv_total_len's shape {kv_total_len.shape}")
        # kv_index is the index of the kv cache pages for the current batch
        kv_index = kv_indices[kv_indptr[i]:kv_indptr[i+1]]
        # we can index [kv_index, page_offset_indices] to get the kv cache for the current batch
        # from q_indptr[i] to q_indptr[i+1] is the range of the current batch
        batch_hidden_states = hidden_states[q_indptr[i]:q_indptr[i+1]]
        batch_position_ids = position_ids[q_indptr[i]:q_indptr[i+1]]
        q_len, _ = batch_hidden_states.size()
        # print("q_len -> ", q_len)
        q_lora = q_a_proj(batch_hidden_states)
        print('q_a_proj',q_a_proj.weight)
        print('q_lora',q_lora)
        
        q = q_b_proj(q_a_layernorm(q_lora))
        print('q_b_proj',q_b_proj.weight)
        # for v3, bsz, q_len, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
        q = q.view(q_len, num_heads, nope_size+rope_size)
        # q_nope is [q_len, num_heads(128), qk_nope_head_dim(128)]
        # q_pe is [q_len, num_heads(128), qk_rope_head_dim(64)]
        q_nope, q_pe = torch.split(
            q, [nope_size, rope_size], dim=-1
        )
        print('q_nope',q_nope)
        print('q_pe',q_pe)
        # compressed_kv is [q_len, kv_lora_rank(512) + rope(64)]
        compressed_kv = kv_a_proj_with_mqa(batch_hidden_states)
        # compressed_kv is [q_len, kv_lora_rank(512)], k_pe is [q_len, rope(64)]
        compressed_kv, k_pe = torch.split(
            compressed_kv, [kv_lora_rank, rope_size], dim=-1
        )
        compressed_kv = compressed_kv.contiguous()
        compressed_kv = kv_a_layernorm(compressed_kv)
        # k_pe is [q_len, 1, qk_rope_head_dim(64)]
        print('compressed_kv ',compressed_kv)
        print('k_pe ',k_pe)
        k_pe = k_pe.view(q_len, 1, rope_size)
        # compressed_kv is [q_len, 1, kv_lora_rank(512)]
        compressed_kv = compressed_kv.view(q_len, 1, kv_lora_rank)
        
        cos, sin = rotary_emb(q_pe, batch_position_ids)
        # print(f"q_pe shape{q_pe.shape}, k_pe shape {k_pe.shape}")
        q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1)
        q_pe = q_pe.squeeze(0)
        # q_pe is [num_heads(128), q_len, qk_rope_head_dim(64)]
        q_pe.transpose_(0, 1)            
        if kv_cache is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset}  # Specific to RoPE models
            compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs)
            compressed_kv = compressed_kv_with_k_pe [:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
            k_pe = compressed_kv_with_k_pe [:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
        # q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
        # out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
        # q_absorb, out_absorb = get_absorbed()
        # q_nope is [num_heads(128), q_len, qk_nope_head_dim(128)]
        q_nope = q_nope.transpose(0, 1) # q_len is 1, no GPU overhead, same below
        # q_nope is [num_heads(128), q_len, kv_lora_rank(512)]
        q_nope = torch.matmul(q_nope, q_absorb) # batched MM

        # # q_nope is [q_len, num_heads(128), kv_lora_rank(512)]
        # q_nope = q_nope.transpose(0, 1)

        # we need to index out the compressed_kv and k_pe for the current batch
        batch_compressed_kv = None
        batch_k_pe = None
        for page_index in kv_index:
            if kv_total_len > page_size:
                tmp_compressed_kv = compressed_kv[page_index, 0:page_size, :]
                tmp_k_pe = k_pe[page_index, 0:page_size, :]
                if batch_compressed_kv is None or batch_k_pe is None:
                    batch_compressed_kv = tmp_compressed_kv
                    batch_k_pe = tmp_k_pe
                else: 
                    batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
                    batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
                kv_total_len -= page_size
            else:
                tmp_compressed_kv = compressed_kv[page_index, 0:kv_total_len, :]
                tmp_k_pe = k_pe[page_index, 0:kv_total_len, :]
                if batch_compressed_kv is None or batch_k_pe is None:
                    batch_compressed_kv = tmp_compressed_kv
                    batch_k_pe = tmp_k_pe
                else: 
                    batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
                    batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
                break
        # batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
        # batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
        pe_weights = torch.matmul(q_pe,batch_k_pe.mT)
        print('pe_weights',pe_weights)
        attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT)) * softmax_scale
        # attention_weights is [num_heads(128), q_len, k_len]
        
        # attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(q_len,-1,-1).transpose(0,1)
        
        # attention_masks[i] is [q_len, k_len]
        
        attention_weights = (attention_weights + attention_masks[i])
        # attention_weights shape is [num_heads(128), q_len, k_len]
        attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=torch.float16).to(q_pe.dtype)
        attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),q_len, lora_rank(512)]
        # out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
        out_absorb = out_absorb.transpose(1,2)
        # q for q_len, n for num_heads, h for v_head_dim, v for kv_lora_rank
        attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), q_len, v_head_dim(128)]
        attn_output = attn_output.transpose(0, 1) # [q_len, num_heads(128), v_head_dim(128)]
        attn_output = attn_output.reshape(q_len, num_heads * nope_size)
        attn_output = o_proj(attn_output)
        final_attention_output = torch.cat((final_attention_output, attn_output), dim=0)
    return final_attention_output


def torch_attn_for_test(hidden_states,kv_cache,):
    pass

def test_mla_simple():
    result = torch_attn(
        hidden_states,
        kv_cache,
        position_ids,
        page_idx,
        page_offset,
        attention_masks=attention_masks,
        q_indptr=q_indptr,
        kv_indices=kv_indices,
        kv_indptr=kv_indptr,
        bsz_tensors=bsz_tensors,
        last_page_len=last_page_len,
        layer_idx=1
    )
    print(result.shape)
    print(result)
    
test_mla_simple()

================================================
FILE: kt-kernel/examples/test_mla_torch.py
================================================
import os, sys
import time
from typing import Optional

sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import nn
from torch.nn import init
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding


seed = 42  # 你可以选择任何整数作为种子
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

qlen = 1024
kvlen = 0


page_table = range(20)
bsz_tensors = torch.tensor([1])


page_size = 256
pages_count = 200
tp_count = 4


hidden_size = 7168
q_lora_rank = 1536
kv_lora_rank = 512
num_heads = 128
nope_size = 128
rope_size = 64

rope_theta = 10000
max_qlen = 1024
max_kvlen = 4096

max_position_embeddings = 163840


rope_scaling = {
    "beta_fast": 32,
    "beta_slow": 1,
    "factor": 40,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
    "original_max_position_embeddings": 4096,
    "type": "yarn",
}


CPUInfer = kt_kernel_ext.CPUInfer(64)
validation_iter = 100


q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=torch.float16)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=torch.float16)
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=torch.float16)
kv_b_proj = nn.Linear(kv_lora_rank, num_heads * (nope_size + nope_size), bias=False, dtype=torch.float16)
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=torch.float16)

init.normal_(q_a_proj.weight, mean=0.0, std=0.02)
init.normal_(q_b_proj.weight, mean=0.0, std=0.02)
init.normal_(kv_a_proj_with_mqa.weight, mean=0.0, std=0.02)
init.normal_(kv_b_proj.weight, mean=0.0, std=0.02)
init.normal_(o_proj.weight, mean=0.0, std=0.02)

q_a_proj_weight = q_a_proj.weight.to(torch.float16).to("cpu").contiguous()
q_b_proj_weight = q_b_proj.weight.to(torch.float16).to("cpu").contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(torch.float16).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to("cpu").contiguous()
o_proj_weight = o_proj.weight.to(torch.float16).to("cpu").contiguous()


config = kt_kernel_ext.mla.MLAConfig(
    hidden_size,
    q_lora_rank,
    kv_lora_rank,
    num_heads,
    nope_size,
    rope_size,
)
config.max_qlen = max_qlen
config.max_kvlen = max_kvlen
config.max_position_embeddings = max_position_embeddings
config.rope_scaling_factor = rope_scaling["factor"]
config.rope_theta = rope_theta
config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
config.rope_scaling_beta_slow = rope_scaling["beta_slow"]
config.rope_scaling_mscale = rope_scaling["mscale"]
config.rope_scaling_mscale_all_dim = rope_scaling["mscale_all_dim"]
config.rope_scaling_original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]

config.q_a_proj = q_a_proj_weight.data_ptr()
config.q_b_proj = q_b_proj_weight.data_ptr()
config.kv_a_proj_with_mqa = kv_a_proj_with_mqa_weight.data_ptr()
config.kv_b_proj = kv_b_proj_weight.data_ptr()
config.o_proj = o_proj_weight.data_ptr()

config.q_a_proj_type = ggml_type.FP16
config.q_b_proj_type = ggml_type.FP16
config.kv_a_proj_with_mqa_type = ggml_type.FP16
config.kv_b_proj_type = ggml_type.FP16
config.w_o_type = ggml_type.FP16


config.pool = CPUInfer.backend_


mla = kt_kernel_ext.mla.MLA(config)
mla.load_weights()
mla.set_local_pages(pages_count)


input = torch.randn((qlen, hidden_size), dtype=torch.float16).to("cpu").contiguous()


output = torch.zeros((qlen, hidden_size), dtype=torch.float16).to("cpu").contiguous()
mla.forward([qlen], [page_table], [kvlen], input.data_ptr(), output.data_ptr())
print("CPU MLA Output: ", output)


softmax_scale = (nope_size + rope_size) ** -0.5
# 1代表的是压缩的kv的头数
k_caches = torch.randn(1, pages_count, page_size, 1, kv_lora_rank + rope_size).to(torch.float16)
kv_cache = KDeepSeekV3Cache(page_size=page_size, kv_lora_rank=kv_lora_rank, k_caches=k_caches)

q_a_layernorm = DeepseekV2RMSNorm(q_lora_rank)

x = torch.randn(q_lora_rank, dtype=torch.float16) * 100
print(x)
print(q_a_layernorm(x))

kv_a_layernorm = DeepseekV2RMSNorm(kv_lora_rank)


q_absorb = torch.randn(num_heads, nope_size, kv_lora_rank, dtype=torch.float16)
out_absorb = torch.randn(num_heads, nope_size, kv_lora_rank, dtype=torch.float16)

rotary_emb = DeepseekV3YarnRotaryEmbedding(
    rope_size,
    max_position_embeddings=max_position_embeddings,
    scaling_factor=rope_scaling["factor"],
    base=rope_theta,
    beta_fast=rope_scaling["beta_fast"],
    beta_slow=rope_scaling["beta_slow"],
    mscale=rope_scaling["mscale"],
    mscale_all_dim=rope_scaling["mscale_all_dim"],
    original_max_position_embeddings=rope_scaling["original_max_position_embeddings"],
)
# 构造一个qlen 长度的输入 hidden_states, 对应的历史 kv_indptr 是[0:bsz]
# kv_indices 是[0:bsz]，page_idx=[0:bsz], page_offset=[kvlen:qlen+kvlen]
# last_page_len = [qlen+kvlen,...] layer_idx = 1
# position_ids = [kvlen:qlen+kvlen]
hidden_states = torch.randn(qlen, hidden_size, dtype=torch.float16)
q_indptr = torch.tensor([0, qlen]).to(torch.int32)

kv_indptr = torch.tensor([0, (qlen + kvlen + page_size - 1) // page_size]).to(torch.int32)
kv_indices = torch.tensor(range(pages_count)).to(torch.int32)

page_idx = torch.tensor([i // page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
page_offset = torch.tensor([i % page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)

last_page_len = torch.tensor([(qlen + kvlen) % page_size], device=hidden_states.device)
position_ids = torch.tensor(range(kvlen, kvlen + qlen)).to(torch.int32)


# 按照行创建 mask [qlen,kvlen+qlen]
attention_masks = torch.zeros((qlen, kvlen + qlen), dtype=torch.float16)
for i in range(qlen):
    attention_masks[i, i + kvlen + 1 : i + kvlen + qlen] = -65504.0


def torch_attn(
    hidden_states: torch.Tensor,
    kv_cache: KDeepSeekV3Cache,
    position_ids: torch.Tensor,
    page_idx: torch.Tensor,
    page_offset: torch.Tensor,
    attention_masks: Optional[list[torch.Tensor]] = None,
    q_indptr: Optional[torch.Tensor] = None,
    kv_indices: Optional[torch.Tensor] = None,
    kv_indptr: Optional[torch.Tensor] = None,
    bsz_tensors: Optional[torch.Tensor] = None,
    last_page_len: Optional[torch.Tensor] = None,
    layer_idx: Optional[int] = None,
):
    global out_absorb
    global q_absorb
    # range bsz_tensors
    final_attention_output = torch.tensor([], device=hidden_states.device)
    for i in range(bsz_tensors[0]):
        batch_num_tokens_tensors = q_indptr[i + 1] - q_indptr[i]
        batch_last_page_len = last_page_len[i]
        # kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe
        batch_page_idx = page_idx[q_indptr[i] : q_indptr[i + 1]]
        batch_page_offset = page_offset[q_indptr[i] : q_indptr[i + 1]]
        # kv_page_nums is the number of pages for the current batch
        kv_page_nums = kv_indptr[i + 1] - kv_indptr[i]
        # kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm)
        kv_total_len = kv_page_nums * page_size
        if batch_last_page_len is not None:
            kv_total_len = kv_total_len - (page_size - batch_last_page_len)
        # print(f"kv_total_len's shape {kv_total_len.shape}")
        # kv_index is the index of the kv cache pages for the current batch
        kv_index = kv_indices[kv_indptr[i] : kv_indptr[i + 1]]
        # we can index [kv_index, page_offset_indices] to get the kv cache for the current batch
        # from q_indptr[i] to q_indptr[i+1] is the range of the current batch
        batch_hidden_states = hidden_states[q_indptr[i] : q_indptr[i + 1]]
        batch_position_ids = position_ids[q_indptr[i] : q_indptr[i + 1]]
        qlen, _ = batch_hidden_states.size()
        # print("qlen -> ", qlen)
        q_lora = q_a_proj(batch_hidden_states)
        print("q_a_proj", q_a_proj.weight)
        print("q_lora", q_lora)

        q = q_b_proj(q_a_layernorm(q_lora))
        print("q_b_proj", q_b_proj.weight)
        # for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
        q = q.view(qlen, num_heads, nope_size + rope_size)
        # q_nope is [qlen, num_heads(128), qk_nope_head_dim(128)]
        # q_pe is [qlen, num_heads(128), qk_rope_head_dim(64)]
        q_nope, q_pe = torch.split(q, [nope_size, rope_size], dim=-1)
        print("q_nope", q_nope)
        print("q_pe", q_pe)
        # compressed_kv is [qlen, kv_lora_rank(512) + rope(64)]
        compressed_kv = kv_a_proj_with_mqa(batch_hidden_states)
        # compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)]
        compressed_kv, k_pe = torch.split(compressed_kv, [kv_lora_rank, rope_size], dim=-1)
        compressed_kv = compressed_kv.contiguous()
        compressed_kv = kv_a_layernorm(compressed_kv)
        # k_pe is [qlen, 1, qk_rope_head_dim(64)]
        print("compressed_kv ", compressed_kv)
        print("k_pe ", k_pe)
        k_pe = k_pe.view(qlen, 1, rope_size)
        # compressed_kv is [qlen, 1, kv_lora_rank(512)]
        compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank)

        cos, sin = rotary_emb(q_pe, batch_position_ids)
        # print(f"q_pe shape{q_pe.shape}, k_pe shape {k_pe.shape}")
        q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1)
        q_pe = q_pe.squeeze(0)
        # q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)]
        q_pe.transpose_(0, 1)
        if kv_cache is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "page_idx": batch_page_idx,
                "page_offset": batch_page_offset,
            }  # Specific to RoPE models
            compressed_kv_with_k_pe = kv_cache.update(
                compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs
            )
            compressed_kv = compressed_kv_with_k_pe[:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
            k_pe = compressed_kv_with_k_pe[:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
        # q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
        # out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
        # q_absorb, out_absorb = get_absorbed()
        # q_nope is [num_heads(128), qlen, qk_nope_head_dim(128)]
        q_nope = q_nope.transpose(0, 1)  # qlen is 1, no GPU overhead, same below
        # q_nope is [num_heads(128), qlen, kv_lora_rank(512)]
        q_nope = torch.matmul(q_nope, q_absorb)  # batched MM

        # # q_nope is [qlen, num_heads(128), kv_lora_rank(512)]
        # q_nope = q_nope.transpose(0, 1)

        # we need to index out the compressed_kv and k_pe for the current batch
        batch_compressed_kv = None
        batch_k_pe = None
        for page_index in kv_index:
            if kv_total_len > page_size:
                tmp_compressed_kv = compressed_kv[page_index, 0:page_size, :]
                tmp_k_pe = k_pe[page_index, 0:page_size, :]
                if batch_compressed_kv is None or batch_k_pe is None:
                    batch_compressed_kv = tmp_compressed_kv
                    batch_k_pe = tmp_k_pe
                else:
                    batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
                    batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
                kv_total_len -= page_size
            else:
                tmp_compressed_kv = compressed_kv[page_index, 0:kv_total_len, :]
                tmp_k_pe = k_pe[page_index, 0:kv_total_len, :]
                if batch_compressed_kv is None or batch_k_pe is None:
                    batch_compressed_kv = tmp_compressed_kv
                    batch_k_pe = tmp_k_pe
                else:
                    batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
                    batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
                break
        # batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
        # batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
        pe_weights = torch.matmul(q_pe, batch_k_pe.mT)
        print("pe_weights", pe_weights)
        attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT)) * softmax_scale
        # attention_weights is [num_heads(128), qlen, k_len]

        # attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(qlen,-1,-1).transpose(0,1)

        # attention_masks[i] is [qlen, k_len]

        attention_weights = attention_weights + attention_masks[i]
        # attention_weights shape is [num_heads(128), qlen, k_len]
        attention_weights = nn.functional.softmax(attention_weights, dim=-1, dtype=torch.float16).to(q_pe.dtype)
        attn_output = torch.matmul(attention_weights, batch_compressed_kv)  # [num_heads(128),qlen, lora_rank(512)]
        # out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
        out_absorb = out_absorb.transpose(1, 2)
        # q for qlen, n for num_heads, h for v_head_dim, v for kv_lora_rank
        attn_output = torch.matmul(attn_output, out_absorb)  # [num_heads(128), qlen, v_head_dim(128)]
        attn_output = attn_output.transpose(0, 1)  # [qlen, num_heads(128), v_head_dim(128)]
        attn_output = attn_output.reshape(qlen, num_heads * nope_size)
        attn_output = o_proj(attn_output)
        final_attention_output = torch.cat((final_attention_output, attn_output), dim=0)
    return final_attention_output


torch_output = torch_attn(
    input,
    kv_cache,
    position_ids,
    page_idx,
    page_offset,
    attention_masks=attention_masks,
    q_indptr=q_indptr,
    kv_indices=kv_indices,
    kv_indptr=kv_indptr,
    bsz_tensors=bsz_tensors,
    last_page_len=last_page_len,
    layer_idx=0,
)
print("Torch Output: ", torch_output)


================================================
FILE: kt-kernel/examples/test_mlp.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2024-08-06 10:37:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch

hidden_size = 5120
intermediate_size = 3072
stride = 32
group_max_len = 1024
gate_type = 1  # ggml_type::GGML_TYPE_F16
up_type = 1  # ggml_type::GGML_TYPE_F16
down_type = 1  # ggml_type::GGML_TYPE_F16
hidden_type = 1  # ggml_type::GGML_TYPE_F16
qlen = 30
layer_num = 10
CPUInfer = kt_kernel_ext.CPUInfer(48)
validation_iter = 100


def act_fn(x):
    return x / (1.0 + torch.exp(-x))


def mlp_torch(input, gate_proj, up_proj, down_proj):
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret


with torch.inference_mode(mode=True):
    mlps = []
    gate_projs = []
    up_projs = []
    down_projs = []
    for _ in range(layer_num):
        gate_proj = (
            torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
        )
        up_proj = (
            torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
        )
        down_proj = (
            torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
        )
        config = kt_kernel_ext.mlp.MLPConfig(
            hidden_size,
            intermediate_size,
            stride,
            group_max_len,
            gate_proj.data_ptr(),
            up_proj.data_ptr(),
            down_proj.data_ptr(),
            gate_type,
            up_type,
            down_type,
            hidden_type,
        )
        mlp = kt_kernel_ext.mlp.MLP(config)
        gate_projs.append(gate_proj)
        up_projs.append(up_proj)
        down_projs.append(down_proj)
        mlps.append(mlp)

    # validation
    for i in range(validation_iter):
        mlp = mlps[i % layer_num]
        input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
        output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
        input = input / 100

        CPUInfer.submit(mlp.forward(qlen, input.data_ptr(), output.data_ptr()))
        CPUInfer.sync()
        # print('cpuinfer output', output)

        gate_proj = gate_projs[i % layer_num]
        up_proj = up_projs[i % layer_num]
        down_proj = down_projs[i % layer_num]
        t_output = mlp_torch(input, gate_proj, up_proj, down_proj)
        # print('torch output', t_output)

        diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
        print("diff = ", diff)
        assert diff < 0.001


================================================
FILE: kt-kernel/examples/test_moe.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : SkqLiao
LastEditTime : 2025-03-13 11:38:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time

sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch
from tqdm import tqdm
from kt_kernel_ext.kvcache import ggml_type

torch.manual_seed(0)

expert_num = 8
hidden_size = 2048  # 7168
intermediate_size = 2048
stride = 32
group_min_len = 10
group_max_len = 2560
num_experts_per_tok = 8
layer_num = 1
# expert_num = 8
# hidden_size = 7168
# intermediate_size = 2048
# stride = 32
# group_min_len = 10
# group_max_len = 10240
# num_experts_per_tok = 8
# qlen = 1024
# layer_num = 1
CPUInfer = kt_kernel_ext.CPUInfer(64)
validation_iter = 10


def act_fn(x):
    return x / (1.0 + torch.exp(-x))


def mlp_torch(input, gate_proj, up_proj, down_proj):
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret


def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0
    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )
    return t_output


def to_cpuinfer_tensor(tensor, type):
    size = torch.prod(torch.tensor(tensor.shape, dtype=torch.int32)).item()
    return kt_kernel_ext.utils.from_float(tensor.data_ptr(), size, type)


def from_cpuinfer_tensor(tensor, size, type):
    return kt_kernel_ext.utils.to_float(tensor.data_ptr(), size, type)


qlens = [1, 64]  # [64, 512, 2048, 8192, 16384]
# gate_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q5_K, ggml_type.Q4_K, ggml_type.Q3_K]
# up_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q5_K, ggml_type.Q4_K, ggml_type.Q3_K]
# down_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q6_K, ggml_type.Q6_K, ggml_type.Q5_K]
gate_types = [ggml_type.Q4_K]
up_types = [ggml_type.Q4_K]
down_types = [ggml_type.Q6_K]
hidden_type = ggml_type.BF16
print(f"Parameters: expert_num: {expert_num} hidden_size: {hidden_size} intermediate_size: {intermediate_size}")
print(f"group_max_len: ", group_max_len)

for qlen in qlens:
    for gate_type, up_type, down_type in zip(gate_types, up_types, down_types):
        with torch.inference_mode(mode=True):
            moes = []
            gate_projs = []
            up_projs = []
            down_projs = []
            print("Preparing data...")
            converted_tensors = []
            for _ in range(layer_num):
                size = expert_num * intermediate_size * hidden_size
                gate_proj = (
                    torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                    .to("cpu")
                    .contiguous()
                )
                up_proj = (
                    torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                    .to("cpu")
                    .contiguous()
                )
                down_proj = (
                    torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
                    .to("cpu")
                    .contiguous()
                )

                gate_tensor = to_cpuinfer_tensor(gate_proj, gate_type)
                up_tensor = to_cpuinfer_tensor(up_proj, up_type)
                down_tensor = to_cpuinfer_tensor(down_proj, down_type)

                config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
                config.pool = CPUInfer.backend_
                config.stride = stride
                config.group_min_len = group_min_len
                config.group_max_len = group_max_len
                config.gate_proj = gate_tensor.data_ptr()
                config.up_proj = up_tensor.data_ptr()
                config.down_proj = down_tensor.data_ptr()
                config.gate_type = gate_type
                config.up_type = up_type
                config.down_type = down_type
                config.hidden_type = hidden_type

                moe = kt_kernel_ext.moe.MOE(config)
                gate_projs.append(gate_proj)
                up_projs.append(up_proj)
                down_projs.append(down_proj)
                CPUInfer.submit(moe.load_weights_task())
                CPUInfer.sync()
                moes.append(moe)
                converted_tensors.append((gate_tensor, up_tensor, down_tensor))
            print("Finished initialization!")

            CPUInfer.submit(moes[0].warm_up_task())
            CPUInfer.sync()
            print("Warm up finished!")

            # validation
            progress_bar = tqdm(range(validation_iter), desc="Starting")
            total_diff = 0

            for i in tqdm(progress_bar):
                progress_bar.set_description("Round: {}/{}".format(i + 1, validation_iter))
                expert_ids = torch.stack(
                    [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
                ).contiguous()
                weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
                input_proj = torch.randn((qlen, hidden_size), dtype=torch.float32).contiguous() / 100
                output_proj = torch.empty((qlen, hidden_size), dtype=torch.float32).contiguous()

                input_tensor = to_cpuinfer_tensor(input_proj, hidden_type)
                output_tensor = to_cpuinfer_tensor(output_proj, hidden_type)

                qlen_tensor = torch.tensor([qlen], dtype=torch.int32)
                moe = moes[i % layer_num]
                CPUInfer.submit(
                    moe.forward_task(
                        qlen_tensor.data_ptr(),
                        num_experts_per_tok,
                        expert_ids.data_ptr(),
                        weights.data_ptr(),
                        input_tensor.data_ptr(),
                        output_tensor.data_ptr(),
                    )
                )
                CPUInfer.sync()
                cpu_output = from_cpuinfer_tensor(output_tensor, qlen * hidden_size, hidden_type)

                gate_proj = gate_projs[i % layer_num]
                up_proj = up_projs[i % layer_num]
                down_proj = down_projs[i % layer_num]
                t_output = moe_torch(input_proj, expert_ids, weights, gate_proj, up_proj, down_proj)
                print("cpuinfer output", cpu_output)
                print("torch output", t_output)
                diff = torch.mean(torch.abs(cpu_output.flatten() - t_output.flatten())) / torch.mean(
                    torch.abs(t_output.flatten())
                )
                assert diff < 0.5
                total_diff += diff

            print(f"gate_type: {gate_type}, up_type: {up_type}, down_type: {down_type}")
            print(f"Average diff: {total_diff / validation_iter:.4f}")


================================================
FILE: kt-kernel/examples/test_moe_amx.py
================================================
import os, sys

sys.path.insert(0, os.path.dirname(__file__) + "/../build")
print("sys.path:", sys.path)

import torch
from kt_kernel import kt_kernel_ext

expert_num = 256
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
qlen = 1
# qlen = 640
layer_num = 1
CPUInfer = kt_kernel_ext.CPUInfer(90)
# validation_iter = 10000
validation_iter = 2
k_group_size = 64
debug_print_count = 16  # Number of values to print in debug output
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()


def act_fn(x):
    return x / (1.0 + torch.exp(-x))


def mlp_torch(input, gate_proj, up_proj, down_proj, debug_expert_id=None, debug_print=False):
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())

    if debug_print and debug_expert_id is not None:
        print(f"[TORCH DEBUG] Expert {debug_expert_id}:")
        print(f"  gate_buf[:{debug_print_count}] = {gate_buf.flatten()[:debug_print_count]}")
        print(f"  up_buf[:{debug_print_count}] = {up_buf.flatten()[:debug_print_count]}")

    intermediate = act_fn(gate_buf) * up_buf

    if debug_print and debug_expert_id is not None:
        print(f"  intermediate[:{debug_print_count}] = {intermediate.flatten()[:debug_print_count]}")

    ret = torch.mm(intermediate, down_proj.t())

    if debug_print and debug_expert_id is not None:
        print(f"  down_output[:{debug_print_count}] = {ret.flatten()[:debug_print_count]}")

    return ret


def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj, debug_print=False):
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    # Get the first expert from expert_ids array to match AWQ-MoE behavior
    target_debug_expert = expert_ids[0, 0].item()  # First expert in expert_ids array

    outputs = []
    start_idx = 0
    activated_experts = []

    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        activated_experts.append(i)
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        # Only debug the target expert that matches AWQ-MoE's first expert
        should_debug = debug_print and i == target_debug_expert
        expert_out = mlp_torch(
            tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i], debug_expert_id=i, debug_print=should_debug
        )
        outputs.append(expert_out)
        start_idx = end_idx

    if debug_print:
        print(f"[TORCH DEBUG] Processing activated experts: {activated_experts}")
        print(f"[TORCH DEBUG] Target debug expert (matches AWQ): {target_debug_expert}")

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )

    if debug_print:
        print(f"[TORCH DEBUG] Final MoE output[:{debug_print_count}] = {t_output.flatten()[:debug_print_count]}")

    return t_output


def test_moe(quant_mode: str):
    assert (
        quant_mode == "bf16"
        or quant_mode == "int8"
        or quant_mode == "int4"
        or quant_mode == "int4_1"
        or quant_mode == "int4_1k"
    )
    with torch.inference_mode(mode=True):
        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.bfloat16, device="cuda")
                .to("cpu")
                .contiguous()
            )
            up_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.bfloat16, device="cuda")
                .to("cpu")
                .contiguous()
            )
            down_proj = (
                torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.bfloat16, device="cuda")
                .to("cpu")
                .contiguous()
            )
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.gate_scale = 0
            config.pool = CPUInfer.backend_
            if quant_mode == "bf16":
                moe = kt_kernel_ext.moe.AMXBF16_MOE(config)
                CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
                CPUInfer.sync()
                CPUInfer.submit(moe.warm_up_task())
                CPUInfer.sync()
            elif quant_mode == "int8":
                moe = kt_kernel_ext.moe.AMXInt8_MOE(config)
                CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
                CPUInfer.sync()
                # CPUInfer.submit(moe.warm_up_task())
                # CPUInfer.sync()
            elif quant_mode == "int4":
                moe = kt_kernel_ext.moe.AMXInt4_MOE(config)
                CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
                CPUInfer.sync()
                CPUInfer.submit(moe.warm_up_task())
                CPUInfer.sync()
            elif quant_mode == "int4_1":
                moe = kt_kernel_ext.moe.AMXInt4_1_MOE(config)
                CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
                CPUInfer.sync()
                CPUInfer.submit(moe.warm_up_task())
                CPUInfer.sync()
            elif quant_mode == "int4_1k":
                config.quant_config.bits = 4
                config.quant_config.group_size = k_group_size
                config.quant_config.zero_point = True
                moe = kt_kernel_ext.moe.AMXInt4_1KGroup_MOE(config)
                # import debugpy
                # debugpy.listen(("127.0.0.1", 5678))
                # debugpy.wait_for_client()
                # debugpy.breakpoint()
                print(f"the physical_logical map:{physical_to_logical_map.data_ptr()}")
                CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
                CPUInfer.sync()
                # CPUInfer.submit(moe.warm_up_task())
                # CPUInfer.sync()
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)

        # validation
        for i in range(validation_iter):
            bsz_tensor = torch.tensor([qlen], device="cpu")
            expert_ids = torch.stack(
                [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
            ).contiguous()
            weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
            input = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            input = input / 100
            moe = moes[i % layer_num]

            # Enable debug for first few iterations
            enable_debug = i < 2
            enable_debug = False
            if enable_debug:
                print(f"\n=== Iteration {i} Debug Info ===")
                print(f"input[:{debug_print_count}] = {input.flatten()[:debug_print_count]}")
                print(f"expert_ids = {expert_ids}")
                print(f"weights = {weights}")
                # Print which experts will be activated for comparison
                activated_experts = []
                for token in range(expert_ids.shape[0]):
                    for expert_idx in range(expert_ids.shape[1]):
                        expert_id = expert_ids[token][expert_idx].item()
                        if expert_id not in activated_experts:
                            activated_experts.append(expert_id)
                print(f"[TORCH DEBUG] Activated experts: {sorted(activated_experts)}")
                print(f"[TORCH DEBUG] First expert from expert_ids array: {expert_ids[0, 0].item()}")
            print(f"expert_ids = {expert_ids}")
            # print('expert ids:',expert_ids)
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids.data_ptr(),
                    weights.data_ptr(),
                    input.data_ptr(),
                    output.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

            if enable_debug:
                print(f"[AWQ-MOE DEBUG] AMX output[:{debug_print_count}] = {output.flatten()[:debug_print_count]}")

            gate_proj = gate_projs[i % layer_num]
            up_proj = up_projs[i % layer_num]
            down_proj = down_projs[i % layer_num]
            t_output = moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj, debug_print=enable_debug)
            print("torch output", t_output)
            print("amx output", output)

            # print(output - t_output)
            # print(torch.abs(output - t_output))
            diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
            # print(f'output_shape:{output.shape}, t_output_shape:{t_output.shape}\n')
            print(f"Iteration {i}, diff = {diff:.6f}")

            if enable_debug:
                abs_diff = torch.abs(output - t_output)
                print(f"[COMPARE] Max abs diff = {torch.max(abs_diff):.6f}")
                print(f"[COMPARE] Mean abs diff = {torch.mean(abs_diff):.6f}")
                print(f"[COMPARE] Relative diff = {diff:.6f}")
                print("=" * 50)

            if quant_mode == "int4" or quant_mode == "int4_1" or quant_mode == "int4_1k":
                assert diff < 0.35
            else:
                assert diff < 0.05


# only turn on 1 at a time

# Debug mode is enabled for the first 2 iterations to compare intermediate results
# between torch implementation and AWQ-MoE implementation.
# The debug output shows:
# 1. Input values and expert assignments
# 2. Gate and up projection results
# 3. Intermediate values after activation function
# 4. Down projection results
# 5. Final output comparison

# test_moe("bf16")
test_moe("int8")
test_moe("int4")
test_moe("int4_1")
test_moe("int4_1k")


================================================
FILE: kt-kernel/examples/test_moe_kernel.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2024-08-06 10:38:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time

sys.path.insert(0, os.path.dirname(__file__) + "/../build")
os.environ["BLAS_NUM_THREADS"] = "1"
import torch
from kt_kernel import kt_kernel_ext


expert_num = 16
hidden_size = 7168
intermediate_size = 2048
max_len = 4096
num_experts_per_tok = 8
m_block = 320
n_block_up_gate = 32
n_block_down = 64
n_block_up_gate_prefi = 32
n_block_down_prefi = 64
# qlen = 1
qlen = 1024
layer_num = 1
CPUInfer = kt_kernel_ext.CPUInfer(160)
# validation_iter = 10000
validation_iter = 1


def act_fn(x):
    return x / (1.0 + torch.exp(-x))


def mlp_torch(input, gate_proj, up_proj, down_proj):
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret


def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0
    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )
    return t_output


def test_moe(quant_mode: str):
    assert quant_mode == "int8" or quant_mode == "int4" or quant_mode == "int4_1"
    with torch.inference_mode(mode=True):
        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.bfloat16, device="cpu")
                .to("cpu")
                .contiguous()
            )
            up_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.bfloat16, device="cpu")
                .to("cpu")
                .contiguous()
            )
            down_proj = (
                torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.bfloat16, device="cpu")
                .to("cpu")
                .contiguous()
            )
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
            config.max_len = max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.pool = CPUInfer.backend_
            if quant_mode == "int8":
                d = kt_kernel_ext.moe.tiling.get_int8()
                nbug_prefi = n_block_up_gate_prefi
                nbd_prefi = n_block_down_prefi
                kb = d["k_block"]
                nb = d["n_block"]
                mb = m_block
                nbug = n_block_up_gate
                nbd = n_block_down
                print(
                    f"Int8 Tiling: nbug {nbug}, nbd {nbd}, nb {nb}, mb {mb}, kb {kb}, nbug_prefi {nbug_prefi}, nbd_prefi {nbd_prefi}"
                )
                kt_kernel_ext.moe.tiling.set_int8(nbug, nbd, nb, mb, kb, nbug_prefi, nbd_prefi)
                moe = kt_kernel_ext.moe.Int8_KERNEL_MOE(config)
                CPUInfer.submit(moe.load_weights_task())
                CPUInfer.sync()
                # CPUInfer.submit(moe.warm_up_task())
                # CPUInfer.sync()
            elif quant_mode == "int4":
                moe = kt_kernel_ext.moe.Int4_KERNEL_MOE(config)
                CPUInfer.submit(moe.load_weights_task())
                CPUInfer.sync()
                CPUInfer.submit(moe.warm_up_task())
                CPUInfer.sync()
            else:
                raise ValueError(f"Unsupported quantization mode: {quant_mode}")
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)

        # validation
        for i in range(validation_iter):
            bsz_tensor = torch.tensor([qlen], device="cpu")
            expert_ids = torch.stack(
                [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
            ).contiguous()
            weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
            input = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            input = input / 100
            # 打印 input 的内容
            print("input:", input)
            moe = moes[i % layer_num]
            # print('expert ids:',expert_ids)
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids.data_ptr(),
                    weights.data_ptr(),
                    input.data_ptr(),
                    output.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
            print("cpuinfer output", output)

            gate_proj = gate_projs[i % layer_num]
            up_proj = up_projs[i % layer_num]
            down_proj = down_projs[i % layer_num]
            t_output = moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj)
            print("torch output", t_output)

            # print(output - t_output)
            diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
            print("diff = ", diff)
            if quant_mode == "int4":
                assert diff < 0.35
            else:
                assert diff < 0.05


test_moe("int8")
# test_moe("int4")


================================================
FILE: kt-kernel/examples/test_moe_kml.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2024-08-06 10:38:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time

sys.path.insert(0, os.path.dirname(__file__) + "/../build")
os.environ["BLAS_NUM_THREADS"] = "1"
from kt_kernel import kt_kernel_ext
import torch

expert_num = 16
hidden_size = 7168
intermediate_size = 2048
max_len = 4096
num_experts_per_tok = 8
qlen = 512
# qlen = 640
layer_num = 1
CPUInfer = kt_kernel_ext.CPUInfer(112)
# validation_iter = 10000
validation_iter = 1


def act_fn(x):
    return x / (1.0 + torch.exp(-x))


def mlp_torch(input, gate_proj, up_proj, down_proj):
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret


def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0
    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )
    return t_output


def test_moe(quant_mode: str):
    assert quant_mode == "bf16" or quant_mode == "int8" or quant_mode == "int4" or quant_mode == "int4_1"
    with torch.inference_mode(mode=True):
        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.bfloat16, device="cpu")
                .to("cpu")
                .contiguous()
            )
            up_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.bfloat16, device="cpu")
                .to("cpu")
                .contiguous()
            )
            down_proj = (
                torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.bfloat16, device="cpu")
                .to("cpu")
                .contiguous()
            )
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
            config.max_len = max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.pool = CPUInfer.backend_
            if quant_mode == "bf16":
                moe = kt_kernel_ext.moe.AMXBF16_MOE(config)
                CPUInfer.submit(moe.load_weights_task())
                CPUInfer.sync()
                CPUInfer.submit(moe.warm_up_task())
                CPUInfer.sync()
            elif quant_mode == "int8":
                moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
                CPUInfer.submit(moe.load_weights_task())
                CPUInfer.sync()
                # CPUInfer.submit(moe.warm_up_task())
                # CPUInfer.sync()
            elif quant_mode == "int4":
                moe = kt_kernel_ext.moe.KMLInt4_MOE(config)
                CPUInfer.submit(moe.load_weights_task())
                CPUInfer.sync()
                CPUInfer.submit(moe.warm_up_task())
                CPUInfer.sync()
            else:
                raise ValueError(f"Unsupported quantization mode: {quant_mode}")
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)

        # validation
        for i in range(validation_iter):
            bsz_tensor = torch.tensor([qlen], device="cpu")
            expert_ids = torch.stack(
                [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
            ).contiguous()
            weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
            input = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            input = input / 100
            # 打印 input 的内容
            print("input:", input)
            moe = moes[i % layer_num]
            # print('expert ids:',expert_ids)
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids.data_ptr(),
                    weights.data_ptr(),
                    input.data_ptr(),
                    output.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
            print("cpuinfer output", output)

            gate_proj = gate_projs[i % layer_num]
            up_proj = up_projs[i % layer_num]
            down_proj = down_projs[i % layer_num]
            t_output = moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj)
            print("torch output", t_output)

            # print(output - t_output)
            diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
            print("diff = ", diff)
            if quant_mode == "int4":
                assert diff < 0.35
            else:
                assert diff < 0.05


# test_moe("bf16")
# test_moe("int8")
test_moe("int4")


================================================
FILE: kt-kernel/examples/test_rope.cpp
================================================
#include <cassert>
#include <iostream>
#include <random>
#include <vector>

#include "../operators/rope.hpp"

std::vector<float> create_random_vector(size_t total_size, std::vector<size_t> shape, unsigned int seed = 0) {
  std::vector<float> vec(total_size);
  std::mt19937 gen(seed == 0 ? std::random_device{}() : seed);
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
  // for (size_t i = 0; i < total_size; ++i) {
  //   vec[i] = 1; // dist(gen);
  // }
  for (size_t i = 0; i < shape[0]; ++i) {
    size_t offset_i = i * shape[1] * shape[2] * shape[3];
    for (size_t j = 0; j < shape[1]; ++j) {
      size_t offset_j = j * shape[2] * shape[3];
      for (size_t k = 0; k < shape[2]; ++k) {
        size_t offset_k = k * shape[3];
        for (size_t a = 0; a < shape[3]; ++a) {
          vec[offset_i + offset_j + offset_k + a] = a;
        }
      }
    }
  }
  return vec;
}

void print_vector_to_file(const std::vector<float>& vec, const char* filename) {
  FILE* fp = fopen(filename, "w");
  for (auto x : vec) {
    fprintf(fp, "%.2f ", x);
  }
  fclose(fp);
}

std::pair<std::vector<float>, std::vector<float>> cpp_torch_rope_with_apply_single(
    const std::vector<float>& q_in_const, const std::vector<float>& k_in_const,
    DeepseekV3YarnRotaryEmbedding<float>& rotary_emb, size_t B, size_t H, size_t S, size_t D_rope) {
  rotary_emb.init(S);

  const float* full_cos_cache_ptr = rotary_emb.cos();
  const float* full_sin_cache_ptr = rotary_emb.sin();

  std::vector<float> q_out = q_in_const;
  std::vector<float> k_out = k_in_const;

  size_t stride_head = S * D_rope;
  size_t stride_batch = H * stride_head;

  for (size_t b = 0; b < B; ++b) {
    for (size_t h = 0; h < H; ++h) {
      float* current_k_head_ptr = k_out.data() + b * stride_batch + h * stride_head;
      Rope<DeepseekV3YarnRotaryEmbedding<float>, float>::apply_multiple(rotary_emb, current_k_head_ptr,
                                                                        static_cast<int>(D_rope), 0, S);
      for (size_t s = 0; s < S; ++s) {
        float* current_q_head_ptr = q_out.data() + b * stride_batch + h * stride_head + s * D_rope;

        Rope<DeepseekV3YarnRotaryEmbedding<float>, float>::apply_single(rotary_emb, current_q_head_ptr,
                                                                        static_cast<int>(D_rope), s);
      }
    }
  }

  return {q_out, k_out};
}

int main() {
  size_t batch_size = 2;
  size_t num_heads = 16;
  size_t seq_len = 32;
  size_t rope_size = 16;
  float theta = 10000.0f;

  float beta_fast_cfg = 32.0f;
  float beta_slow_cfg = 1.0f;
  float factor_cfg = 40.0f;
  float mscale_cfg = 1.0f;
  float mscale_all_dim_cfg = 1.0f;
  size_t original_max_pos_embeddings_cfg = 4096;

  std::cout << "--- Test Parameters ---" << std::endl;
  std::cout << "Batch Size: " << batch_size << std::endl;
  std::cout << "Num Heads: " << num_heads << std::endl;
  std::cout << "Seq Len: " << seq_len << std::endl;
  std::cout << "Rope Size (dim): " << rope_size << std::endl;
  std::cout << "Theta (base): " << theta << std::endl;
  std::cout << "Scaling Factor: " << factor_cfg << std::endl;
  std::cout << "Original Max Pos Embeddings: " << original_max_pos_embeddings_cfg << std::endl;
  std::cout << "-----------------------" << std::endl << std::endl;

  DeepseekV3YarnRotaryEmbedding<float> rotary_emb(rope_size, original_max_pos_embeddings_cfg, theta, factor_cfg,
                                                  original_max_pos_embeddings_cfg, beta_fast_cfg, beta_slow_cfg,
                                                  mscale_cfg, mscale_all_dim_cfg);
  std::cout << "DeepseekV3YarnRotaryEmbedding instantiated." << std::endl;

  size_t total_elements_per_tensor = batch_size * num_heads * seq_len * rope_size;

  unsigned int q_seed = 123;
  unsigned int k_seed = 456;
  std::vector<float> q_pe_vec =
      create_random_vector(total_elements_per_tensor, {batch_size, num_heads, seq_len, rope_size}, q_seed);
  std::vector<float> k_pe_vec =
      create_random_vector(total_elements_per_tensor, {batch_size, num_heads, seq_len, rope_size}, k_seed);

  std::cout << "Input Q_PE and K_PE vectors created. Total elements per tensor: " << total_elements_per_tensor
            << std::endl;

  std::cout << std::endl;

  std::cout << "Applying RoPE using cpp_torch_rope_with_apply_single..." << std::endl;
  auto [q2_vec, k2_vec] =
      cpp_torch_rope_with_apply_single(q_pe_vec, k_pe_vec, rotary_emb, batch_size, num_heads, seq_len, rope_size);
  std::cout << "RoPE application finished." << std::endl << std::endl;

  std::cout << std::endl << "test_rope.cpp finished successfully." << std::endl;

  print_vector_to_file(q2_vec, "q_cpp.out");
  print_vector_to_file(k2_vec, "k_cpp.out");

  return 0;
}

================================================
FILE: kt-kernel/examples/test_rope.py
================================================
import torch
from torch_attention import apply_rotary_pos_emb, DeepseekV3YarnRotaryEmbedding, DeepseekV3RotaryEmbedding

batch_size  = 1
num_heads   = 1
seq_len     = 1024
rope_size   = 64
theta       = 10000

max_position_embeddings =  163840

scaling_cfg = {
    "beta_fast": 32,
    "beta_slow": 1,
    "factor": 40,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
    "original_max_position_embeddings": 4096,
    "type": "yarn"
}

rotary_emb = DeepseekV3YarnRotaryEmbedding(
    rope_size,
    max_position_embeddings=max_position_embeddings,
    scaling_factor=scaling_cfg["factor"],
    base=theta,
    beta_fast=scaling_cfg["beta_fast"],
    beta_slow=scaling_cfg["beta_slow"],
    mscale=scaling_cfg["mscale"],
    mscale_all_dim=scaling_cfg["mscale_all_dim"],
    original_max_position_embeddings=scaling_cfg["original_max_position_embeddings"],
)


def load_fp16_tensor(file_path, shape):
    with open(file_path, 'rb') as f:
        raw_data = f.read()
    tensor = torch.frombuffer(raw_data, dtype=torch.float16)
    tensor = tensor.view(shape)  # 根据你的 shape reshape
    return tensor

def load_fp32_tensor(file_path, shape):
    with open(file_path, 'rb') as f:
        raw_data = f.read()
    tensor = torch.frombuffer(raw_data, dtype=torch.float32)
    tensor = tensor.view(shape)  # 根据你的 shape reshape
    return tensor

#q_pe = torch.randn(batch_size, num_heads, seq_len, rope_size, dtype=torch.float32)
#k_pe = torch.randn_like(q_pe)

q_pe = load_fp16_tensor("csrc/ktransformers_ext/build/before_rope",(batch_size, num_heads, seq_len, rope_size)) 
# k_pe = torch.ones_like(q_pe) 
k_pe = load_fp16_tensor("csrc/ktransformers_ext/build/before_rope",(batch_size, num_heads, seq_len, rope_size)) 
print(q_pe)

check = load_fp16_tensor("csrc/ktransformers_ext/build/after_rope",(batch_size, num_heads, seq_len, rope_size))


def torch_rope(q, k):
    cos, sin = rotary_emb(q, seq_len=seq_len)

    cos_to_check = load_fp32_tensor("csrc/ktransformers_ext/build/cos",(seq_len, rope_size//2))
    sin_to_check = load_fp32_tensor("csrc/ktransformers_ext/build/sin",(seq_len, rope_size//2))


    sin = sin.unsqueeze(0)
    cos = cos.unsqueeze(0)
    q2, k2 = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1)
    return q2, k2

q2, k2 = torch_rope(q_pe, k_pe)
print(q2,k2)
print(check)

diff = torch.abs(q2 - check).max()


print(diff)

# print(q2,k2)

# print_tensor(q2, 'q_py.out')
# print_tensor(k2, 'k_py.out')


================================================
FILE: kt-kernel/examples/test_softmax.py
================================================

import torch
from torch import nn


def load_fp16_tensor(file_path, shape):
    with open(file_path, 'rb') as f:
        raw_data = f.read()
    tensor = torch.frombuffer(raw_data, dtype=torch.float16)
    tensor = tensor.view(shape)  # 根据你的 shape reshape
    return tensor

a = load_fp16_tensor("csrc/ktransformers_ext/build/before_softmax", (64,1024))
check = load_fp16_tensor("csrc/ktransformers_ext/build/after_softmax", (64,1024))


a = nn.functional.softmax(a, dim=-1, dtype=torch.float16)
diff = torch.abs(a - check).max()

print(a)
print(check)
print(diff)


================================================
FILE: kt-kernel/examples/test_write_buffer.py
================================================
"""
Test write_weight_scale_to_buffer for AMX MOE operators.

Supports:
- FP8: FP8 weights (1 byte) + float32 scales (block-wise)
- FP8_PERCHANNEL: FP8 weights (1 byte) + float32 per-channel scales
- BF16: Native BF16 weights (2 bytes), no scales

Usage:
    python test_write_buffer.py          # Run all modes
    python test_write_buffer.py fp8      # Run FP8 only
    python test_write_buffer.py fp8_perchannel  # Run FP8 per-channel only
    python test_write_buffer.py bf16     # Run BF16 only
"""

import os
import sys
import time

import torch

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))

from kt_kernel import kt_kernel_ext
from kt_kernel_ext import CPUInfer


def make_cpu_infer(thread_num=80):
    return CPUInfer(thread_num)


def div_up(a, b):
    return (a + b - 1) // b


def build_config_fp8(cpuinfer, expert_num, num_experts_per_tok, hidden_size, intermediate_size, group_size):
    cfg = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
    cfg.max_len = 1
    cfg.quant_config.bits = 8  # FP8
    cfg.quant_config.group_size = group_size
    cfg.quant_config.zero_point = False
    cfg.pool = cpuinfer.backend_
    return cfg


def build_config_fp8_perchannel(cpuinfer, expert_num, num_experts_per_tok, hidden_size, intermediate_size):
    cfg = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
    cfg.max_len = 1
    cfg.quant_config.bits = 8  # FP8
    cfg.quant_config.group_size = 0  # Not used for per-channel
    cfg.quant_config.zero_point = False
    cfg.quant_config.per_channel = True
    cfg.pool = cpuinfer.backend_
    return cfg


def build_config_bf16(cpuinfer, expert_num, num_experts_per_tok, hidden_size, intermediate_size):
    cfg = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
    cfg.max_len = 1
    cfg.pool = cpuinfer.backend_
    return cfg


def allocate_weights_fp8(expert_num, hidden_size, intermediate_size, group_size):
    """Allocate FP8 weights and scales for testing"""
    # FP8 weights: 1 byte per element
    per_mat_weight_bytes = hidden_size * intermediate_size
    # FP8 scales: block-wise (group_size x group_size blocks), stored as float32
    n_blocks_n_gate_up = div_up(intermediate_size, group_size)
    n_blocks_k = div_up(hidden_size, group_size)
    per_mat_scale_elems_gate_up = n_blocks_n_gate_up * n_blocks_k

    # For down: n=hidden_size, k=intermediate_size
    n_blocks_n_down = n_blocks_k
    n_blocks_k_down = n_blocks_n_gate_up
    per_mat_scale_elems_down = n_blocks_n_down * n_blocks_k_down

    gate_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
    up_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
    down_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)

    gate_scale = torch.randn(expert_num * per_mat_scale_elems_gate_up, dtype=torch.float32)
    up_scale = torch.randn(expert_num * per_mat_scale_elems_gate_up, dtype=torch.float32)
    down_scale = torch.randn(expert_num * per_mat_scale_elems_down, dtype=torch.float32)

    return {
        "gate_q": gate_q,
        "up_q": up_q,
        "down_q": down_q,
        "gate_scale": gate_scale,
        "up_scale": up_scale,
        "down_scale": down_scale,
        "per_mat_weight_bytes": per_mat_weight_bytes,
        "per_mat_scale_elems_gate_up": per_mat_scale_elems_gate_up,
        "per_mat_scale_elems_down": per_mat_scale_elems_down,
    }


def allocate_weights_fp8_perchannel(expert_num, hidden_size, intermediate_size):
    """Allocate FP8 per-channel weights and scales for testing"""
    per_mat_weight_bytes = hidden_size * intermediate_size
    per_mat_scale_elems_gate_up = intermediate_size  # one scale per output channel
    per_mat_scale_elems_down = hidden_size

    gate_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
    up_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
    down_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)

    gate_scale = torch.randn(expert_num * per_mat_scale_elems_gate_up, dtype=torch.float32)
    up_scale = torch.randn(expert_num * per_mat_scale_elems_gate_up, dtype=torch.float32)
    down_scale = torch.randn(expert_num * per_mat_scale_elems_down, dtype=torch.float32)

    return {
        "gate_q": gate_q,
        "up_q": up_q,
        "down_q": down_q,
        "gate_scale": gate_scale,
        "up_scale": up_scale,
        "down_scale": down_scale,
        "per_mat_weight_bytes": per_mat_weight_bytes,
        "per_mat_scale_elems_gate_up": per_mat_scale_elems_gate_up,
        "per_mat_scale_elems_down": per_mat_scale_elems_down,
    }


def allocate_weights_bf16(expert_num, hidden_size, intermediate_size):
    """Allocate BF16 weights for testing (no scales)"""
    # BF16 weights: 2 bytes per element
    per_mat_weight_elems = hidden_size * intermediate_size
    per_mat_weight_bytes = per_mat_weight_elems * 2  # BF16 = 2 bytes

    gate_proj = torch.randn(expert_num * per_mat_weight_elems, dtype=torch.bfloat16)
    up_proj = torch.randn(expert_num * per_mat_weight_elems, dtype=torch.bfloat16)
    down_proj = torch.randn(expert_num * per_mat_weight_elems, dtype=torch.bfloat16)

    return {
        "gate_proj": gate_proj,
        "up_proj": up_proj,
        "down_proj": down_proj,
        "per_mat_weight_bytes": per_mat_weight_bytes,
        "per_mat_weight_elems": per_mat_weight_elems,
    }


def test_fp8_write_buffer(gpu_tp_count):
    """Test write_weight_scale_to_buffer with FP8 weights"""
    torch.manual_seed(123)

    expert_num = 256
    gpu_experts = expert_num
    num_experts_per_tok = 8
    hidden_size = 3072
    intermediate_size = 1536
    group_size = 128

    cpuinfer = make_cpu_infer()
    cfg = build_config_fp8(cpuinfer, expert_num, num_experts_per_tok, hidden_size, intermediate_size, group_size)
    weights = allocate_weights_fp8(expert_num, hidden_size, intermediate_size, group_size)

    cfg.gate_proj = weights["gate_q"].data_ptr()
    cfg.up_proj = weights["up_q"].data_ptr()
    cfg.down_proj = weights["down_q"].data_ptr()
    cfg.gate_scale = weights["gate_scale"].data_ptr()
    cfg.up_scale = weights["up_scale"].data_ptr()
    cfg.down_scale = weights["down_scale"].data_ptr()

    moe = kt_kernel_ext.moe.AMXFP8_MOE(cfg)

    physical_to_logical_map = torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
    cpuinfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
    cpuinfer.sync()

    per_mat_weight_bytes = weights["per_mat_weight_bytes"]
    per_mat_scale_elems_gate_up = weights["per_mat_scale_elems_gate_up"]
    per_mat_scale_elems_down = weights["per_mat_scale_elems_down"]

    # Calculate sizes per TP part
    weight_bytes_per_expert_per_tp = per_mat_weight_bytes // gpu_tp_count
    gpu_n_w13 = intermediate_size // gpu_tp_count
    gpu_k_w13 = hidden_size
    scale_elems_per_expert_per_tp_gate_up = div_up(gpu_n_w13, group_size) * div_up(gpu_k_w13, group_size)
    gpu_n_w2 = hidden_size
    gpu_k_w2 = intermediate_size // gpu_tp_count
    scale_elems_per_expert_per_tp_down = div_up(gpu_n_w2, group_size) * div_up(gpu_k_w2, group_size)

    total_weight_bytes_per_tp = gpu_experts * weight_bytes_per_expert_per_tp
    total_scale_elems_per_tp_gate_up = gpu_experts * scale_elems_per_expert_per_tp_gate_up
    total_scale_elems_per_tp_down = gpu_experts * scale_elems_per_expert_per_tp_down

    # Create buffer lists
    w13_weight_bufs = [torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    w13_scale_bufs = [
        torch.empty(2 * total_scale_elems_per_tp_gate_up, dtype=torch.float32) for _ in range(gpu_tp_count)
    ]
    w2_weight_bufs = [torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    w2_scale_bufs = [torch.empty(total_scale_elems_per_tp_down, dtype=torch.float32) for _ in range(gpu_tp_count)]

    print(f"[FP8] GPU TP count: {gpu_tp_count}, Experts: {expert_num}")
    print(f"[FP8] Weight bytes per expert per TP: {weight_bytes_per_expert_per_tp}")
    print(f"[FP8] Scale elements per expert per TP (gate/up): {scale_elems_per_expert_per_tp_gate_up}")

    def get_expert_ptrs(expert_id):
        w13_weight_ptrs = []
        w13_scale_ptrs = []
        w2_weight_ptrs = []
        w2_scale_ptrs = []
        for tp_idx in range(gpu_tp_count):
            w13_weight_expert_offset = expert_id * 2 * weight_bytes_per_expert_per_tp
            w13_scale_expert_offset = expert_id * 2 * scale_elems_per_expert_per_tp_gate_up
            w2_weight_expert_offset = expert_id * weight_bytes_per_expert_per_tp
            w2_scale_expert_offset = expert_id * scale_elems_per_expert_per_tp_down

            w13_weight_ptrs.append(w13_weight_bufs[tp_idx].data_ptr() + w13_weight_expert_offset)
            w13_scale_ptrs.append(w13_scale_bufs[tp_idx].data_ptr() + w13_scale_expert_offset * 4)
            w2_weight_ptrs.append(w2_weight_bufs[tp_idx].data_ptr() + w2_weight_expert_offset)
            w2_scale_ptrs.append(w2_scale_bufs[tp_idx].data_ptr() + w2_scale_expert_offset * 4)
        return w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs

    # Warm up
    for _ in range(2):
        for expert_id in range(gpu_experts):
            w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs = get_expert_ptrs(expert_id)
            cpuinfer.submit(
                moe.write_weight_scale_to_buffer_task(
                    gpu_tp_count=gpu_tp_count,
                    expert_id=expert_id,
                    w13_weight_ptrs=w13_weight_ptrs,
                    w13_scale_ptrs=w13_scale_ptrs,
                    w2_weight_ptrs=w2_weight_ptrs,
                    w2_scale_ptrs=w2_scale_ptrs,
                )
            )
            cpuinfer.sync()

    # Timing
    begin_time = time.perf_counter_ns()
    for expert_id in range(gpu_experts):
        w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs = get_expert_ptrs(expert_id)
        cpuinfer.submit(
            moe.write_weight_scale_to_buffer_task(
                gpu_tp_count=gpu_tp_count,
                expert_id=expert_id,
                w13_weight_ptrs=w13_weight_ptrs,
                w13_scale_ptrs=w13_scale_ptrs,
                w2_weight_ptrs=w2_weight_ptrs,
                w2_scale_ptrs=w2_scale_ptrs,
            )
        )
        cpuinfer.sync()
    end_time = time.perf_counter_ns()
    elapsed_ms = (end_time - begin_time) / 1e6

    total_bytes = (
        hidden_size * intermediate_size * gpu_experts * 3
        + (per_mat_scale_elems_gate_up * 2 + per_mat_scale_elems_down) * gpu_experts * 4
    )
    print(f"[FP8] write_weight_scale_to_buffer time: {elapsed_ms:.2f} ms")
    print(f"[FP8] Throughput: {total_bytes / (elapsed_ms * 1e6):.2f} GB/s")

    # Verify correctness
    def split_expert_tensor(tensor, chunk):
        return [tensor[i * chunk : (i + 1) * chunk] for i in range(expert_num)]

    gate_q = weights["gate_q"]
    up_q = weights["up_q"]
    down_q = weights["down_q"]
    gate_scale = weights["gate_scale"]
    up_scale = weights["up_scale"]
    down_scale = weights["down_scale"]

    gate_q_experts = split_expert_tensor(gate_q, per_mat_weight_bytes)
    up_q_experts = split_expert_tensor(up_q, per_mat_weight_bytes)
    down_q_experts = split_expert_tensor(down_q, per_mat_weight_bytes)
    gate_scale_experts = split_expert_tensor(gate_scale, per_mat_scale_elems_gate_up)
    up_scale_experts = split_expert_tensor(up_scale, per_mat_scale_elems_gate_up)
    down_scale_experts = split_expert_tensor(down_scale, per_mat_scale_elems_down)

    n_blocks_n = div_up(hidden_size, group_size)
    n_blocks_k = div_up(intermediate_size, group_size)
    n_blocks_k_per_tp = n_blocks_k // gpu_tp_count

    for tp_idx in range(gpu_tp_count):
        expected_w13_weights = []
        expected_w13_scales = []
        expected_w2_weights = []
        expected_w2_scales = []

        weight13_per_tp = per_mat_weight_bytes // gpu_tp_count
        scale13_per_tp = per_mat_scale_elems_gate_up // gpu_tp_count

        for expert_id in range(gpu_experts):
            start_weight = tp_idx * weight13_per_tp
            end_weight = (tp_idx + 1) * weight13_per_tp
            start_scale = tp_idx * scale13_per_tp
            end_scale = (tp_idx + 1) * scale13_per_tp

            gate_weight_tp = gate_q_experts[expert_id][start_weight:end_weight]
            gate_scale_tp = gate_scale_experts[expert_id][start_scale:end_scale]
            up_weight_tp = up_q_experts[expert_id][start_weight:end_weight]
            up_scale_tp = up_scale_experts[expert_id][start_scale:end_scale]

            down_weight_tp_parts = []
            down_scale_tp_parts = []
            tp_slice_weight_size = intermediate_size // gpu_tp_count

            for row_idx in range(hidden_size):
                row_weight_start = row_idx * intermediate_size
                tp_weight_offset = row_weight_start + tp_idx * tp_slice_weight_size
                down_weight_tp_parts.append(
                    down_q_experts[expert_id][tp_weight_offset : tp_weight_offset + tp_slice_weight_size]
                )

            for bn in range(n_blocks_n):
                row_scale_start = bn * n_blocks_k
                tp_scale_offset = row_scale_start + tp_idx * n_blocks_k_per_tp
                down_scale_tp_parts.append(
                    down_scale_experts[expert_id][tp_scale_offset : tp_scale_offset + n_blocks_k_per_tp]
                )

            down_weight_tp = torch.cat(down_weight_tp_parts)
            down_scale_tp = torch.cat(down_scale_tp_parts)

            expected_w13_weights.append(gate_weight_tp)
            expected_w13_weights.append(up_weight_tp)
            expected_w13_scales.append(gate_scale_tp)
            expected_w13_scales.append(up_scale_tp)
            expected_w2_weights.append(down_weight_tp)
            expected_w2_scales.append(down_scale_tp)

        expected_w13_weight = torch.cat(expected_w13_weights)
        expected_w13_scale = torch.cat(expected_w13_scales)
        expected_w2_weight = torch.cat(expected_w2_weights)
        expected_w2_scale = torch.cat(expected_w2_scales)

        if not torch.equal(w13_weight_bufs[tp_idx], expected_w13_weight):
            diff_mask = w13_weight_bufs[tp_idx] != expected_w13_weight
            first_diff_idx = diff_mask.nonzero()[0].item() if diff_mask.any() else -1
            raise AssertionError(f"[FP8] w13 weight mismatch for TP {tp_idx} at index {first_diff_idx}")

        if not torch.allclose(w13_scale_bufs[tp_idx], expected_w13_scale):
            raise AssertionError(f"[FP8] w13 scale mismatch for TP {tp_idx}")

        if not torch.equal(w2_weight_bufs[tp_idx], expected_w2_weight):
            diff_mask = w2_weight_bufs[tp_idx] != expected_w2_weight
            first_diff_idx = diff_mask.nonzero()[0].item() if diff_mask.any() else -1
            raise AssertionError(f"[FP8] w2 weight mismatch for TP {tp_idx} at index {first_diff_idx}")

        if not torch.allclose(w2_scale_bufs[tp_idx], expected_w2_scale):
            raise AssertionError(f"[FP8] w2 scale mismatch for TP {tp_idx}")

    print(f"[FP8] TP={gpu_tp_count} PASSED (verified {gpu_experts} experts across {gpu_tp_count} TP parts)")
    return True


def test_fp8_perchannel_write_buffer(gpu_tp_count):
    """Test write_weight_scale_to_buffer with FP8 per-channel weights"""
    torch.manual_seed(123)

    expert_num = 256
    gpu_experts = expert_num
    num_experts_per_tok = 8
    hidden_size = 3072
    intermediate_size = 1536

    cpuinfer = make_cpu_infer()
    cfg = build_config_fp8_perchannel(cpuinfer, expert_num, num_experts_per_tok, hidden_size, intermediate_size)
    weights = allocate_weights_fp8_perchannel(expert_num, hidden_size, intermediate_size)

    cfg.gate_proj = weights["gate_q"].data_ptr()
    cfg.up_proj = weights["up_q"].data_ptr()
    cfg.down_proj = weights["down_q"].data_ptr()
    cfg.gate_scale = weights["gate_scale"].data_ptr()
    cfg.up_scale = weights["up_scale"].data_ptr()
    cfg.down_scale = weights["down_scale"].data_ptr()

    moe = kt_kernel_ext.moe.AMXFP8PerChannel_MOE(cfg)

    physical_to_logical_map = torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
    cpuinfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
    cpuinfer.sync()

    per_mat_weight_bytes = weights["per_mat_weight_bytes"]
    per_mat_scale_elems_gate_up = weights["per_mat_scale_elems_gate_up"]
    per_mat_scale_elems_down = weights["per_mat_scale_elems_down"]

    weight_bytes_per_expert_per_tp = per_mat_weight_bytes // gpu_tp_count
    gpu_n_w13 = intermediate_size // gpu_tp_count
    scale_elems_per_expert_per_tp_gate_up = gpu_n_w13
    scale_elems_per_expert_per_tp_down = per_mat_scale_elems_down

    total_weight_bytes_per_tp = gpu_experts * weight_bytes_per_expert_per_tp
    total_scale_elems_per_tp_gate_up = gpu_experts * scale_elems_per_expert_per_tp_gate_up
    total_scale_elems_per_tp_down = gpu_experts * scale_elems_per_expert_per_tp_down

    w13_weight_bufs = [torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    w13_scale_bufs = [
        torch.empty(2 * total_scale_elems_per_tp_gate_up, dtype=torch.float32) for _ in range(gpu_tp_count)
    ]
    w2_weight_bufs = [torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    w2_scale_bufs = [torch.empty(total_scale_elems_per_tp_down, dtype=torch.float32) for _ in range(gpu_tp_count)]

    print(f"[FP8_PERCHANNEL] GPU TP count: {gpu_tp_count}, Experts: {expert_num}")
    print(f"[FP8_PERCHANNEL] Weight bytes per expert per TP: {weight_bytes_per_expert_per_tp}")
    print(f"[FP8_PERCHANNEL] Scale elements per expert per TP (gate/up): {scale_elems_per_expert_per_tp_gate_up}")

    def get_expert_ptrs(expert_id):
        w13_weight_ptrs = []
        w13_scale_ptrs = []
        w2_weight_ptrs = []
        w2_scale_ptrs = []
        for tp_idx in range(gpu_tp_count):
            w13_weight_expert_offset = expert_id * 2 * weight_bytes_per_expert_per_tp
            w13_scale_expert_offset = expert_id * 2 * scale_elems_per_expert_per_tp_gate_up
            w2_weight_expert_offset = expert_id * weight_bytes_per_expert_per_tp
            w2_scale_expert_offset = expert_id * scale_elems_per_expert_per_tp_down

            w13_weight_ptrs.append(w13_weight_bufs[tp_idx].data_ptr() + w13_weight_expert_offset)
            w13_scale_ptrs.append(w13_scale_bufs[tp_idx].data_ptr() + w13_scale_expert_offset * 4)
            w2_weight_ptrs.append(w2_weight_bufs[tp_idx].data_ptr() + w2_weight_expert_offset)
            w2_scale_ptrs.append(w2_scale_bufs[tp_idx].data_ptr() + w2_scale_expert_offset * 4)
        return w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs

    for _ in range(2):
        for expert_id in range(gpu_experts):
            w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs = get_expert_ptrs(expert_id)
            cpuinfer.submit(
                moe.write_weight_scale_to_buffer_task(
                    gpu_tp_count=gpu_tp_count,
                    expert_id=expert_id,
                    w13_weight_ptrs=w13_weight_ptrs,
                    w13_scale_ptrs=w13_scale_ptrs,
                    w2_weight_ptrs=w2_weight_ptrs,
                    w2_scale_ptrs=w2_scale_ptrs,
                )
            )
            cpuinfer.sync()

    begin_time = time.perf_counter_ns()
    for expert_id in range(gpu_experts):
        w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs = get_expert_ptrs(expert_id)
        cpuinfer.submit(
            moe.write_weight_scale_to_buffer_task(
                gpu_tp_count=gpu_tp_count,
                expert_id=expert_id,
                w13_weight_ptrs=w13_weight_ptrs,
                w13_scale_ptrs=w13_scale_ptrs,
                w2_weight_ptrs=w2_weight_ptrs,
                w2_scale_ptrs=w2_scale_ptrs,
            )
        )
        cpuinfer.sync()
    end_time = time.perf_counter_ns()
    elapsed_ms = (end_time - begin_time) / 1e6

    total_bytes = (
        hidden_size * intermediate_size * gpu_experts * 3
        + (per_mat_scale_elems_gate_up * 2 + per_mat_scale_elems_down) * gpu_experts * 4
    )
    print(f"[FP8_PERCHANNEL] write_weight_scale_to_buffer time: {elapsed_ms:.2f} ms")
    print(f"[FP8_PERCHANNEL] Throughput: {total_bytes / (elapsed_ms * 1e6):.2f} GB/s")

    def split_expert_tensor(tensor, chunk):
        return [tensor[i * chunk : (i + 1) * chunk] for i in range(expert_num)]

    gate_q = weights["gate_q"]
    up_q = weights["up_q"]
    down_q = weights["down_q"]
    gate_scale = weights["gate_scale"]
    up_scale = weights["up_scale"]
    down_scale = weights["down_scale"]

    gate_q_experts = split_expert_tensor(gate_q, per_mat_weight_bytes)
    up_q_experts = split_expert_tensor(up_q, per_mat_weight_bytes)
    down_q_experts = split_expert_tensor(down_q, per_mat_weight_bytes)
    gate_scale_experts = split_expert_tensor(gate_scale, per_mat_scale_elems_gate_up)
    up_scale_experts = split_expert_tensor(up_scale, per_mat_scale_elems_gate_up)
    down_scale_experts = split_expert_tensor(down_scale, per_mat_scale_elems_down)

    for tp_idx in range(gpu_tp_count):
        expected_w13_weights = []
        expected_w13_scales = []
        expected_w2_weights = []
        expected_w2_scales = []

        weight13_per_tp = per_mat_weight_bytes // gpu_tp_count
        scale13_per_tp = per_mat_scale_elems_gate_up // gpu_tp_count

        for expert_id in range(gpu_experts):
            start_weight = tp_idx * weight13_per_tp
            end_weight = (tp_idx + 1) * weight13_per_tp
            start_scale = tp_idx * scale13_per_tp
            end_scale = (tp_idx + 1) * scale13_per_tp

            gate_weight_tp = gate_q_experts[expert_id][start_weight:end_weight]
            gate_scale_tp = gate_scale_experts[expert_id][start_scale:end_scale]
            up_weight_tp = up_q_experts[expert_id][start_weight:end_weight]
            up_scale_tp = up_scale_experts[expert_id][start_scale:end_scale]

            down_weight_tp_parts = []
            tp_slice_weight_size = intermediate_size // gpu_tp_count

            for row_idx in range(hidden_size):
                row_weight_start = row_idx * intermediate_size
                tp_weight_offset = row_weight_start + tp_idx * tp_slice_weight_size
                down_weight_tp_parts.append(
                    down_q_experts[expert_id][tp_weight_offset : tp_weight_offset + tp_slice_weight_size]
                )

            down_weight_tp = torch.cat(down_weight_tp_parts)
            down_scale_tp = down_scale_experts[expert_id]

            expected_w13_weights.append(gate_weight_tp)
            expected_w13_weights.append(up_weight_tp)
            expected_w13_scales.append(gate_scale_tp)
            expected_w13_scales.append(up_scale_tp)
            expected_w2_weights.append(down_weight_tp)
            expected_w2_scales.append(down_scale_tp)

        expected_w13_weight = torch.cat(expected_w13_weights)
        expected_w13_scale = torch.cat(expected_w13_scales)
        expected_w2_weight = torch.cat(expected_w2_weights)
        expected_w2_scale = torch.cat(expected_w2_scales)

        if not torch.equal(w13_weight_bufs[tp_idx], expected_w13_weight):
            diff_mask = w13_weight_bufs[tp_idx] != expected_w13_weight
            first_diff_idx = diff_mask.nonzero()[0].item() if diff_mask.any() else -1
            raise AssertionError(f"[FP8_PERCHANNEL] w13 weight mismatch for TP {tp_idx} at index {first_diff_idx}")

        if not torch.allclose(w13_scale_bufs[tp_idx], expected_w13_scale):
            raise AssertionError(f"[FP8_PERCHANNEL] w13 scale mismatch for TP {tp_idx}")

        if not torch.equal(w2_weight_bufs[tp_idx], expected_w2_weight):
            diff_mask = w2_weight_bufs[tp_idx] != expected_w2_weight
            first_diff_idx = diff_mask.nonzero()[0].item() if diff_mask.any() else -1
            raise AssertionError(f"[FP8_PERCHANNEL] w2 weight mismatch for TP {tp_idx} at index {first_diff_idx}")

        if not torch.allclose(w2_scale_bufs[tp_idx], expected_w2_scale):
            raise AssertionError(f"[FP8_PERCHANNEL] w2 scale mismatch for TP {tp_idx}")

    print(f"[FP8_PERCHANNEL] TP={gpu_tp_count} PASSED (verified {gpu_experts} experts across {gpu_tp_count} TP parts)")
    return True


def test_bf16_write_buffer(gpu_tp_count):
    """Test write_weight_scale_to_buffer with BF16 weights (no scales)"""
    torch.manual_seed(123)

    expert_num = 16
    gpu_experts = expert_num
    num_experts_per_tok = 8
    hidden_size = 3072
    intermediate_size = 1536

    cpuinfer = make_cpu_infer()
    cfg = build_config_bf16(cpuinfer, expert_num, num_experts_per_tok, hidden_size, intermediate_size)
    weights = allocate_weights_bf16(expert_num, hidden_size, intermediate_size)

    cfg.gate_proj = weights["gate_proj"].data_ptr()
    cfg.up_proj = weights["up_proj"].data_ptr()
    cfg.down_proj = weights["down_proj"].data_ptr()
    cfg.gate_scale = 0
    cfg.up_scale = 0
    cfg.down_scale = 0

    moe = kt_kernel_ext.moe.AMXBF16_MOE(cfg)

    physical_to_logical_map = torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
    cpuinfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
    cpuinfer.sync()

    per_mat_weight_elems = weights["per_mat_weight_elems"]

    # Calculate sizes per TP part (BF16 = 2 bytes per element)
    weight_elems_per_expert_per_tp = per_mat_weight_elems // gpu_tp_count
    weight_bytes_per_expert_per_tp = weight_elems_per_expert_per_tp * 2

    total_weight_bytes_per_tp = gpu_experts * weight_bytes_per_expert_per_tp

    # Create buffer lists (BF16: weights only, no scales)
    w13_weight_bufs = [torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    w2_weight_bufs = [torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
    # Empty scale buffers (not used for BF16 but needed for interface)
    w13_scale_bufs = [torch.empty(1, dtype=torch.float32) for _ in range(gpu_tp_count)]
    w2_scale_bufs = [torch.empty(1, dtype=torch.float32) for _ in range(gpu_tp_count)]

    print(f"[BF16] GPU TP count: {gpu_tp_count}, Experts: {expert_num}")
    print(f"[BF16] Weight bytes per expert per TP: {weight_bytes_per_expert_per_tp}")

    def get_expert_ptrs(expert_id):
        w13_weight_ptrs = []
        w13_scale_ptrs = []
        w2_weight_ptrs = []
        w2_scale_ptrs = []
        for tp_idx in range(gpu_tp_count):
            w13_weight_expert_offset = expert_id * 2 * weight_bytes_per_expert_per_tp
            w2_weight_expert_offset = expert_id * weight_bytes_per_expert_per_tp

            w13_weight_ptrs.append(w13_weight_bufs[tp_idx].data_ptr() + w13_weight_expert_offset)
            w13_scale_ptrs.append(w13_scale_bufs[tp_idx].data_ptr())  # Not used
            w2_weight_ptrs.append(w2_weight_bufs[tp_idx].data_ptr() + w2_weight_expert_offset)
            w2_scale_ptrs.append(w2_scale_bufs[tp_idx].data_ptr())  # Not used
        return w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs

    # Warm up
    for _ in range(2):
        for expert_id in range(gpu_experts):
            w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs = get_expert_ptrs(expert_id)
            cpuinfer.submit(
                moe.write_weight_scale_to_buffer_task(
                    gpu_tp_count=gpu_tp_count,
                    expert_id=expert_id,
                    w13_weight_ptrs=w13_weight_ptrs,
                    w13_scale_ptrs=w13_scale_ptrs,
                    w2_weight_ptrs=w2_weight_ptrs,
                    w2_scale_ptrs=w2_scale_ptrs,
                )
            )
            cpuinfer.sync()

    # Timing
    begin_time = time.perf_counter_ns()
    for expert_id in range(gpu_experts):
        w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs = get_expert_ptrs(expert_id)
        cpuinfer.submit(
            moe.write_weight_scale_to_buffer_task(
                gpu_tp_count=gpu_tp_count,
                expert_id=expert_id,
                w13_weight_ptrs=w13_weight_ptrs,
                w13_scale_ptrs=w13_scale_ptrs,
                w2_weight_ptrs=w2_weight_ptrs,
                w2_scale_ptrs=w2_scale_ptrs,
            )
        )
        cpuinfer.sync()
    end_time = time.perf_counter_ns()
    elapsed_ms = (end_time - begin_time) / 1e6

    total_bytes = hidden_size * intermediate_size * gpu_experts * 3 * 2  # BF16 = 2 bytes
    print(f"[BF16] write_weight_scale_to_buffer time: {elapsed_ms:.2f} ms")
    print(f"[BF16] Throughput: {total_bytes / (elapsed_ms * 1e6):.2f} GB/s")

    # Verify correctness (BF16: weights only, no scales)
    def split_expert_tensor(tensor, chunk):
        return [tensor[i * chunk : (i + 1) * chunk] for i in range(expert_num)]

    gate_proj = weights["gate_proj"]
    up_proj = weights["up_proj"]
    down_proj = weights["down_proj"]

    # View BF16 as uint8 for byte-level comparison
    gate_bytes = gate_proj.view(torch.uint8)
    up_bytes = up_proj.view(torch.uint8)
    down_bytes = down_proj.view(torch.uint8)

    per_mat_bytes = per_mat_weight_elems * 2  # BF16 = 2 bytes
    gate_experts = split_expert_tensor(gate_bytes, per_mat_bytes)
    up_experts = split_expert_tensor(up_bytes, per_mat_bytes)
    down_experts = split_expert_tensor(down_bytes, per_mat_bytes)

    for tp_idx in range(gpu_tp_count):
        expected_w13_weights = []
        expected_w2_weights = []

        weight_bytes_per_tp = per_mat_bytes // gpu_tp_count

        for expert_id in range(gpu_experts):
            start_weight = tp_idx * weight_bytes_per_tp
            end_weight = (tp_idx + 1) * weight_bytes_per_tp

            gate_weight_tp = gate_experts[expert_id][start_weight:end_weight]
            up_weight_tp = up_experts[expert_id][start_weight:end_weight]

            # Down matrix: sliced column-wise (BF16 = 2 bytes per element)
            down_weight_tp_parts = []
            tp_slice_elems = intermediate_size // gpu_tp_count
            tp_slice_bytes = tp_slice_elems * 2

            for row_idx in range(hidden_size):
                row_byte_start = row_idx * intermediate_size * 2
                tp_byte_offset = row_byte_start + tp_idx * tp_slice_bytes
                down_weight_tp_parts.append(down_experts[expert_id][tp_byte_offset : tp_byte_offset + tp_slice_bytes])

            down_weight_tp = torch.cat(down_weight_tp_parts)

            expected_w13_weights.append(gate_weight_tp)
            expected_w13_weights.append(up_weight_tp)
            expected_w2_weights.append(down_weight_tp)

        expected_w13_weight = torch.cat(expected_w13_weights)
        expected_w2_weight = torch.cat(expected_w2_weights)

        if not torch.equal(w13_weight_bufs[tp_idx], expected_w13_weight):
            diff_mask = w13_weight_bufs[tp_idx] != expected_w13_weight
            first_diff_idx = diff_mask.nonzero()[0].item() if diff_mask.any() else -1
            raise AssertionError(f"[BF16] w13 weight mismatch for TP {tp_idx} at index {first_diff_idx}")

        if not torch.equal(w2_weight_bufs[tp_idx], expected_w2_weight):
            diff_mask = w2_weight_bufs[tp_idx] != expected_w2_weight
            first_diff_idx = diff_mask.nonzero()[0].item() if diff_mask.any() else -1
            raise AssertionError(f"[BF16] w2 weight mismatch for TP {tp_idx} at index {first_diff_idx}")

    print(f"[BF16] TP={gpu_tp_count} PASSED (verified {gpu_experts} experts across {gpu_tp_count} TP parts)")
    return True


def test_with_tp(quant_mode: str, gpu_tp_count: int):
    """Test write_weight_scale_to_buffer with specified mode and TP count"""
    if quant_mode == "fp8":
        return test_fp8_write_buffer(gpu_tp_count)
    elif quant_mode == "fp8_perchannel":
        return test_fp8_perchannel_write_buffer(gpu_tp_count)
    elif quant_mode == "bf16":
        return test_bf16_write_buffer(gpu_tp_count)
    else:
        raise ValueError(f"Unsupported quant_mode: {quant_mode}")


def main(quant_modes=None):
    """Run tests for specified quant modes"""
    if quant_modes is None:
        quant_modes = ["fp8", "fp8_perchannel", "bf16"]

    tp_values = [1, 2, 4]
    all_passed = True
    results = {}

    for quant_mode in quant_modes:
        print("\n" + "=" * 60)
        print(f"Testing {quant_mode.upper()} write_weight_scale_to_buffer")
        print("=" * 60)

        for tp in tp_values:
            print(f"\n--- Testing {quant_mode.upper()} with gpu_tp_count = {tp} ---")
            try:
                test_with_tp(quant_mode, tp)
                results[(quant_mode, tp)] = "PASSED"
            except Exception as e:
                results[(quant_mode, tp)] = f"FAILED: {e}"
                all_passed = False
                print(f"[{quant_mode.upper()}] TP={tp} FAILED: {e}")

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    for (mode, tp), result in results.items():
        status = "PASS" if "PASSED" in result else "FAIL"
        print(f"  [{status}] {mode.upper()} TP={tp}: {result}")

    if all_passed:
        print("\nALL TESTS PASSED")
    else:
        print("\nSOME TESTS FAILED")
        sys.exit(1)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        mode = sys.argv[1].lower()
        if mode in ["fp8", "fp8_perchannel", "bf16"]:
            main([mode])
        else:
            print(f"Unknown mode: {mode}. Use 'fp8', 'fp8_perchannel' or 'bf16'")
            sys.exit(1)
    else:
        main()


================================================
FILE: kt-kernel/examples/torch_attention.py
================================================

import math
import os, sys
import time
import subprocess
import platform
import json
from typing import Any, Dict, Optional, Tuple
import torch
import torch.nn.init as init
from torch import nn

class KDeepSeekV3Cache(nn.Module):
    def __init__(
        self,
        # config: PretrainedConfig,
        page_size: int = 256,
        kv_lora_rank: int = 128,
        k_caches: Optional[torch.Tensor] = None,
        dtype=torch.bfloat16,
        device=torch.device("cuda:0"),
        
    ):
        super().__init__()
        # self.config = config
        self.dtype = dtype
        self.device = device
        self.kv_lora_rank = kv_lora_rank
        self.page_size = page_size
        self.v_caches = []
        self.k_caches = k_caches

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,

        page_idx: torch.Tensor,
        page_offset: torch.Tensor,

        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
        It is VERY important to index using a tensor, otherwise you introduce a copy to the device.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input
                to know how where to write in the cache.

        Return:
            A tuple containing the updated key and value states.
        """
        k_out = self.k_caches[layer_idx]

        k_out[page_idx, page_offset, :, :self.kv_lora_rank] = key_states.reshape(-1, *key_states.shape[2:])
        k_out[page_idx, page_offset, :, self.kv_lora_rank:] = value_states.reshape(-1, *value_states.shape[2:])
        return k_out

        
    def get_page_table(self, cache_position: torch.Tensor, q_indptr: torch.Tensor, kv_indptr: torch.Tensor, kv_indices: torch.Tensor, bsz_tensors: torch.tensor):
        page_offset = cache_position % self.page_size  
        page_idx_local = cache_position // self.page_size  
        query_ids = torch.zeros_like(cache_position)
        for i in range(len(q_indptr) - 1):
            start_idx = q_indptr[i]
            end_idx = q_indptr[i + 1]
            query_ids[start_idx:end_idx] = i
        page_idx = torch.zeros_like(page_idx_local)
        for i in range(bsz_tensors[0]):
            query_id = query_ids[i]
            local_block = page_idx_local[i]
            start_block = kv_indptr[query_id]
            if local_block < kv_indptr[query_id + 1] - kv_indptr[query_id]:
                page_idx[i] = kv_indices[start_block + local_block]
        
        return page_idx, page_offset


# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    b, h, s, d = q.shape
    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
    b, h, s, d = k.shape
    k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


class DeepseekV2RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        DeepseekV2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight * hidden_states).to(input_dtype)


class DeepseekV2RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        super().__init__()
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # For BC we register cos and sin cached
        self.max_seq_len_cached = max_position_embeddings

    @torch.no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)   

class DeepseekV3RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (
            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )
        # self.max_seq_len_cached = None

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(t, self.inv_freq.to(t.device))
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        print("emb", emb.shape)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        if self.max_seq_len_cached is None: # or seq_len[-1] > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[seq_len].to(dtype=x.dtype),
            self.sin_cached[seq_len].to(dtype=x.dtype),
        )

# Inverse dim formula to find dim based on number of rotations
def yarn_find_correction_dim(
    num_rotations, dim, base=10000, max_position_embeddings=2048
):
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
        2 * math.log(base)
    )


# Find dim range bounds based on rotations
def yarn_find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    low = math.floor(
        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
    )
    high = math.ceil(
        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
    )
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case

def yarn_linear_ramp_mask(min, max, dim):
    if min == max:
        max += 0.001  # Prevent singularity

    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
    ramp_func = torch.clamp(linear_func, 0, 1)
    return ramp_func

def yarn_get_mscale(scale=1, mscale=1):
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

class DeepseekV3YarnRotaryEmbedding(DeepseekV3RotaryEmbedding):

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.mscale_all_dim = mscale_all_dim
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        dim = self.dim

        freq_extra = 1.0 / (
            self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )
        freq_inter = 1.0 / (
            self.scaling_factor
            * self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            dim,
            self.base,
            self.original_max_position_embeddings,
        )
        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(
            device=device, dtype=torch.float32
        )
        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # 判断 seq_len是否是 tensor
        if isinstance(seq_len,torch.Tensor):
            t = seq_len
        else:
            t = torch.arange(seq_len, device=device, dtype=torch.float32)

        freqs = torch.outer(t, inv_freq)

        _mscale = float(
            yarn_get_mscale(self.scaling_factor, self.mscale)
            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
        )

        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer(
            "cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False
        )
        self.register_buffer(
            "sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False
        )


================================================
FILE: kt-kernel/ext_bindings.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022, Jianwei Dong
 * @Date         : 2024-07-22 02:03:22
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
// Python bindings
#include <sys/types.h>

#include <cstddef>

#include "cpu_backend/cpuinfer.h"
#include "cpu_backend/worker_pool.h"
#include "operators/common.hpp"

#if defined(USE_MOE_KERNEL)
#include "operators/moe_kernel/la/kernel.hpp"
#include "operators/moe_kernel/moe.hpp"
#endif

#if defined(__aarch64__) && defined(CPU_USE_KML)
#if defined(KTRANSFORMERS_CPU_MLA)
#include "operators/kml/deepseekv3.hpp"
#include "operators/kml/gate.hpp"
#include "operators/kml/mla.hpp"
#include "operators/kml/mla_int8.hpp"
#endif
#include "operators/kml/moe.hpp"
static const bool _is_plain_ = true;
#else
static const bool _is_plain_ = false;
#endif

#if defined(__x86_64__) && defined(USE_AMX_AVX_KERNEL)
#include "operators/amx/awq-moe.hpp"
#if defined(__AVX512BF16__)
#include "operators/amx/bf16-moe.hpp"            // Native BF16 MoE using CRTP pattern
#include "operators/amx/fp8-moe.hpp"             // FP8 MoE requires AVX512 BF16 support
#include "operators/amx/fp8-perchannel-moe.hpp"  // FP8 Per-Channel MoE for GLM-4.7-FP8
#endif
#include "operators/amx/k2-moe.hpp"
#include "operators/amx/la/amx_kernels.hpp"
#include "operators/amx/moe.hpp"
#endif
#include <pybind11/stl.h>  // std::vector/std::pair/std::string conversions

#include <cstdint>
#include <memory>
#include <type_traits>

#include "operators/kvcache/kvcache.h"
#include "operators/llamafile/linear.h"
#include "operators/llamafile/mla.hpp"
#include "operators/llamafile/mlp.h"
#include "operators/llamafile/moe.hpp"
#include "pybind11/pybind11.h"

namespace py = pybind11;
using namespace pybind11::literals;

py::object to_float_ptr(uintptr_t input_ptr, int size, ggml_type type) {
  if (type < 0 || type >= GGML_TYPE_COUNT) {
    PyErr_SetString(PyExc_ValueError, "Invalid ggml_type");
    throw py::error_already_set();
  }

  py::module torch = py::module::import("torch");
  py::dict kwargs;
  kwargs["dtype"] = torch.attr("float32");
  py::object tensor = torch.attr("empty")(size, **kwargs);

  uintptr_t output_ptr = tensor.attr("data_ptr")().cast<uintptr_t>();
  float* output_float_ptr = reinterpret_cast<float*>(output_ptr);

  try {
    to_float(reinterpret_cast<void*>(input_ptr), output_float_ptr, size, type);
  } catch (const std::exception& e) {
    PyErr_SetString(PyExc_RuntimeError, e.what());
    throw py::error_already_set();
  }

  return tensor;
}

py::object from_float_ptr(uintptr_t input_ptr, int size, ggml_type type) {
  if (type < 0 || type >= GGML_TYPE_COUNT) {
    PyErr_SetString(PyExc_ValueError, "Invalid ggml_type");
    throw py::error_already_set();
  }

  py::module torch = py::module::import("torch");

  size_t output_elem_bytes = ggml_type_size(type);
  size_t output_elem_count = (size + ggml_blck_size(type) - 1) / ggml_blck_size(type);
  size_t total_bytes = output_elem_count * output_elem_bytes;

  py::dict kwargs;
  kwargs["dtype"] = torch.attr("uint8");
  py::object tensor = torch.attr("empty")(total_bytes, **kwargs);

  uintptr_t output_ptr = tensor.attr("data_ptr")().cast<uintptr_t>();
  void* output_void_ptr = reinterpret_cast<void*>(output_ptr);

  try {
    from_float(reinterpret_cast<float*>(input_ptr), output_void_ptr, size, type);
  } catch (const std::exception& e) {
    PyErr_SetString(PyExc_RuntimeError, e.what());
    throw py::error_already_set();
  }

  return tensor;
}

template <typename T>
std::vector<std::vector<uintptr_t>> void_ptr_nested_to_uint(const std::vector<std::vector<T*>>& input) {
  std::vector<std::vector<uintptr_t>> result;
  for (const auto& row : input) {
    std::vector<uintptr_t> new_row;
    for (auto ptr : row) {
      new_row.push_back(reinterpret_cast<uintptr_t>(ptr));
    }
    result.push_back(std::move(new_row));
  }
  return result;
}

template <typename T>
std::vector<std::vector<T*>> uint_to_void_ptr_nested(const std::vector<std::vector<uintptr_t>>& input) {
  std::vector<std::vector<T*>> result;
  for (const auto& row : input) {
    std::vector<T*> new_row;
    for (auto val : row) {
      new_row.push_back(reinterpret_cast<T*>(val));
    }
    result.push_back(std::move(new_row));
  }
  return result;
}

#define DEF_PTR_PROPERTY(cls, name)                                                  \
  def_property(                                                                      \
      #name, [](const cls& self) { return reinterpret_cast<uintptr_t>(self.name); }, \
      [](cls& self, uintptr_t val) { self.name = reinterpret_cast<void*>(val); })

#define DEF_PTR_2D_PROPERTY(cls, name)                                                 \
  def_property(                                                                        \
      #name, [](const cls& self) { return void_ptr_nested_to_uint<void>(self.name); }, \
      [](cls& self, const std::vector<std::vector<uintptr_t>>& val) {                  \
        self.name = uint_to_void_ptr_nested<void>(val);                                \
      })

template <class T>
class MOEBindings {
 public:
  class WarmUpBindings {
   public:
    struct Args {
      CPUInfer* cpuinfer;
      TP_MOE<T>* moe;
    };
    static void inner(void* args) {
      Args* args_ = (Args*)args;
      args_->cpuinfer->enqueue(&TP_MOE<T>::warm_up, args_->moe);
    }
    static std::pair<intptr_t, intptr_t> cpuinfer_interface(std::shared_ptr<TP_MOE<T>> moe) {
      Args* args = new Args{nullptr, moe.get()};
      return std::make_pair((intptr_t)&inner, (intptr_t)args);
    }
  };
  class LoadWeightsBindings {
   public:
    struct Args {
      CPUInfer* cpuinfer;
      TP_MOE<T>* moe;
    };
    static void inner(void* args) {
      Args* args_ = (Args*)args;
      args_->cpuinfer->enqueue(&TP_MOE<T>::load_weights, args_->moe);
    }
    static std::pair<intptr_t, intptr_t> cpuinfer_interface(std::shared_ptr<TP_MOE<T>> moe,
                                                            const uintptr_t physical_to_logical_map = 0) {
      Args* args = new Args{nullptr, moe.get()};
      if (physical_to_logical_map) {
        // printf("debug physical_to_logical_map in arg:%lu\n", physical_to_logical_map);
        moe->config.physical_to_logical_map = reinterpret_cast<void*>(physical_to_logical_map);
        // printf("moe ptr:%p,confirm: moe->config.physical_to_logical_map:%lu\n", reinterpret_cast<void*>(moe.get()),
        //  reinterpret_cast<uintptr_t>(moe->config.physical_to_logical_map));
      }
      return std::make_pair((intptr_t)&inner, (intptr_t)args);
    }
    static std::pair<intptr_t, intptr_t> cpuinfer_interface(std::shared_ptr<TP_MOE<T>> moe) {
      return cpuinfer_interface(moe, 0);
    }
  };
  class ForwardBindings {
   public:
    struct Args {
      CPUInfer* cpuinfer;
      TP_MOE<T>* moe;
      intptr_t qlen;
      int k;
      intptr_t expert_ids;
      intptr_t weights;
      intptr_t input;
      intptr_t output;
      bool incremental;
    };
    static void inner(void* args) {
      Args* args_ = (Args*)args;
      args_->cpuinfer->enqueue(&TP_MOE<T>::forward_binding, args_->moe, args_->qlen, args_->k, args_->expert_ids,
                               args_->weights, args_->input, args_->output, args_->incremental);
    }
    static std::pair<intptr_t, intptr_t> cpuinfer_interface(std::shared_ptr<TP_MOE<T>> moe, intptr_t qlen, int k,
                                                            intptr_t expert_ids, intptr_t weights, intptr_t input,
                                                            intptr_t output, bool incremental = false) {
      Args* args = new Args{nullptr, moe.get(), qlen, k, expert_ids, weights, input, output, incremental};
      return std::make_pair((intptr_t)&inner, (intptr_t)args);
    }
    static std::pair<intptr_t, intptr_t> cpuinfer_interface(std::shared_ptr<TP_MOE<T>> moe, intptr_t qlen, int k,
                                                            intptr_t expert_ids, intptr_t weights, intptr_t input,
                                                            intptr_t output) {
      return cpuinfer_interface(moe, qlen, k, expert_ids, weights, input, output, false);
    }
  };
};

template <typename MoeTP>
void bind_moe_module(py::module_& moe_module, const char* name) {
  using MoeClass = TP_MOE<MoeTP>;
  using MoeBindings = MOEBindings<MoeTP>;

  auto moe_cls = py::class_<MoeClass, MoE_Interface, std::shared_ptr<MoeClass>>(moe_module, name);

  moe_cls.def(py::init<GeneralMOEConfig>())
      .def("warm_up_task", &MoeBindings::WarmUpBindings::cpuinfer_interface)
      .def("load_weights_task",
           py::overload_cast<std::shared_ptr<MoeClass>>(&MoeBindings::LoadWeightsBindings::cpuinfer_interface))
      .def("load_weights_task",
           py::overload_cast<std::shared_ptr<MoeClass>, const uintptr_t>(
               &MoeBindings::LoadWeightsBindings::cpuinfer_interface),
           py::arg("physical_to_logical_map"))
      // .def("forward_task", &MoeBindings::ForwardBindings::cpuinfer_interface)
      .def("forward_task",
           py::overload_cast<std::shared_ptr<MoeClass>, intptr_t, int, intptr_t, intptr_t, intptr_t, intptr_t>(
               &MoeBindings::ForwardBindings::cpuinfer_interface))
      .def("forward_task",
           py::overload_cast<std::shared_ptr<MoeClass>, intptr_t, int, intptr_t, intptr_t, intptr_t, intptr_t, bool>(
               &MoeBindings::ForwardBindings::cpuinfer_interface))
      .def("warm_up", &MoeClass::warm_up)
      .def("load_weights", &MoeClass::load_weights)
      .def("forward", &MoeClass::forward_binding);

  // Bind write_weight_scale_to_buffer_task for MoE types that support it
  // Uses SFINAE to detect if MoeClass has write_weight_scale_to_buffer method
  if constexpr (requires { &MoeClass::write_weight_scale_to_buffer; }) {
    struct WriteWeightScaleToBufferBindings {
      struct Args {
        CPUInfer* cpuinfer;
        MoeClass* moe;
        int gpu_tp_count;
        int expert_id;
        std::vector<uintptr_t> w13_weight_ptrs;
        std::vector<uintptr_t> w13_scale_ptrs;
        std::vector<uintptr_t> w2_weight_ptrs;
        std::vector<uintptr_t> w2_scale_ptrs;
      };

      static void inner(void* args) {
        Args* args_ = (Args*)args;
        args_->cpuinfer->enqueue(&MoeClass::write_weight_scale_to_buffer, args_->moe, args_->gpu_tp_count,
                                 args_->expert_id, args_->w13_weight_ptrs, args_->w13_scale_ptrs, args_->w2_weight_ptrs,
                                 args_->w2_scale_ptrs);
      }

      static std::pair<intptr_t, intptr_t> cpuinfer_interface(std::shared_ptr<MoeClass> moe, int gpu_tp_count,
                                                              int expert_id, py::list w13_weight_ptrs,
                                                              py::list w13_scale_ptrs, py::list w2_weight_ptrs,
                                                              py::list w2_scale_ptrs) {
        // Convert Python lists to std::vector<uintptr_t>
        std::vector<uintptr_t> w13_weight_vec, w13_scale_vec, w2_weight_vec, w2_scale_vec;

        for (auto item : w13_weight_ptrs) w13_weight_vec.push_back(py::cast<uintptr_t>(item));
        for (auto item : w13_scale_ptrs) w13_scale_vec.push_back(py::cast<uintptr_t>(item));
        for (auto item : w2_weight_ptrs) w2_weight_vec.push_back(py::cast<uintptr_t>(item));
        for (auto item : w2_scale_ptrs) w2_scale_vec.push_back(py::cast<uintptr_t>(item));

        Args* args = new Args{nullptr,        moe.get(),     gpu_tp_count,  expert_id,
                              w13_weight_vec, w13_scale_vec, w2_weight_vec, w2_scale_vec};
        return std::make_pair((intptr_t)&inner, (intptr_t)args);
      }
    };

    moe_cls.def("write_weight_scale_to_buffer_task", &WriteWeightScaleToBufferBindings::cpuinfer_interface,
                py::arg("gpu_tp_count"), py::arg("expert_id"), py::arg("w13_weight_ptrs"), py::arg("w13_scale_ptrs"),
                py::arg("w2_weight_ptrs"), py::arg("w2_scale_ptrs"));
  }
}

PYBIND11_MODULE(kt_kernel_ext, m) {
  py::class_<WorkerPool>(m, "WorkerPool").def(py::init<int>());
  py::class_<WorkerPoolConfig>(m, "WorkerPoolConfig")
      .def(py::init<>())
      .def_readwrite("subpool_count", &WorkerPoolConfig::subpool_count)
      .def_readwrite("subpool_numa_map", &WorkerPoolConfig::subpool_numa_map)
      .def_readwrite("subpool_thread_count", &WorkerPoolConfig::subpool_thread_count);

  py::class_<CPUInfer>(m, "CPUInfer")
      .def(py::init<int>())
      .def(py::init<WorkerPoolConfig>())
      .def("submit", &CPUInfer::submit)
      .def("sync", &CPUInfer::sync, py::arg("allow_n_pending") = 0)
      .def_readwrite("backend_", &CPUInfer::backend_)
#ifndef KTRANSFORMERS_CPU_ONLY
      .def("sync_with_cuda_stream", &CPUInfer::sync_with_cuda_stream, py::arg("user_cuda_stream"),
           py::arg("allow_n_pending") = 0)
      .def("submit_with_cuda_stream", &CPUInfer::submit_with_cuda_stream)
#endif
      ;

  auto linear_module = m.def_submodule("linear");
  py::class_<LinearConfig>(linear_module, "LinearConfig")
      .def(py::init([](int hidden_size, int intermediate_size, int stride, int group_max_len, intptr_t proj,
                       int proj_type, int hidden_type) {
        return LinearConfig(hidden_size, intermediate_size, stride, group_max_len, (void*)proj, (ggml_type)proj_type,
                            (ggml_type)hidden_type);
      }));
  // py::class_<Linear>(linear_module, "Linear")
  //     .def(py::init<LinearConfig>())
  //     .def("warm_up", &LinearBindings::WarmUpBindings::cpuinfer_interface)
  //     .def("forward", &LinearBindings::ForwardBindings::cpuinfer_interface);

  auto mlp_module = m.def_submodule("mlp");
  py::class_<MLPConfig>(mlp_module, "MLPConfig")
      .def(py::init([](int hidden_size, int intermediate_size, int stride, int group_max_len, intptr_t gate_proj,
                       intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type,
                       int hidden_type) {
        return MLPConfig(hidden_size, intermediate_size, stride, group_max_len, (void*)gate_proj, (void*)up_proj,
                         (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type,
                         (ggml_type)hidden_type);
      }));
  // py::class_<MLP>(mlp_module, "MLP")
  //     .def(py::init<MLPConfig>())
  //     .def("warm_up", &MLPBindings::WarmUpBindings::cpuinfer_interface)
  //     .def("forward", &MLPBindings::ForwardBindings::cpuinfer_interface);

  py::class_<GeneralConfig>(m, "GeneralConfig")
      .def(py::init<>())
      .def_readwrite("vocab_size", &GeneralConfig::vocab_size)
      .def_readwrite("hidden_size", &GeneralConfig::hidden_size)
      .def_readwrite("num_experts_per_tok", &GeneralConfig::num_experts_per_tok)
      .def_readwrite("n_routed_experts", &GeneralConfig::n_routed_experts)
      .def_readwrite("n_shared_experts", &GeneralConfig::n_shared_experts)
      .def_readwrite("max_qlen", &GeneralConfig::max_qlen)
      .DEF_PTR_PROPERTY(GeneralConfig, lm_heads_ptr)
      .def_readwrite("lm_heads_type", &GeneralConfig::lm_heads_type)
      .DEF_PTR_PROPERTY(GeneralConfig, norm_weights_ptr)
      .def_readwrite("norm_weights_type", &GeneralConfig::norm_weights_type)
      .DEF_PTR_PROPERTY(GeneralConfig, token_embd_ptr)
      .def_readwrite("token_embd_type", &GeneralConfig::token_embd_type)
      .def_readwrite("pool", &GeneralConfig::pool);
#if defined(__aarch64__) && defined(CPU_USE_KML) && defined(KTRANSFORMERS_CPU_MLA)
  py::class_<DeepseekV3ForCausalLM, std::shared_ptr<DeepseekV3ForCausalLM>>(m, "DeepseekV3ForCausalLM")
      .def(py::init([](GeneralConfig config) { return std::make_shared<DeepseekV3ForCausalLM>(config); }))
      .def_readwrite("model", &DeepseekV3ForCausalLM::model)
      .def("forward", &DeepseekV3ForCausalLM::forward_binding);

  py::class_<DeepseekV3Model, std::shared_ptr<DeepseekV3Model>>(m, "DeepseekV3Model")
      .def(py::init([](GeneralConfig config) { return std::make_shared<DeepseekV3Model>(config); }))
      .def_readwrite("layers", &DeepseekV3Model::layers);

  py::class_<DeepseekV3DecoderLayer, std::shared_ptr<DeepseekV3DecoderLayer>>(m, "DeepseekV3DecoderLayer")
      .def(py::init([](GeneralConfig config, size_t layer_idx) {
        return std::make_shared<DeepseekV3DecoderLayer>(config, layer_idx);
      }))
      .def("load_norm", &DeepseekV3DecoderLayer::load_norm_binding)
      .def_readwrite("self_attn", &DeepseekV3DecoderLayer::self_attn)
      .def_readwrite("gate", &DeepseekV3DecoderLayer::gate)
      .def_readwrite("ffn", &DeepseekV3DecoderLayer::ffn);
#endif
  auto mla_module = m.def_submodule("mla");
  py::class_<GeneralMLAConfig>(mla_module, "MLAConfig")
      .def(py::init([](size_t hidden_size, size_t q_lora_rank, size_t num_heads, size_t nope_size, size_t rope_size,
                       size_t kv_lora_rank) {
        return GeneralMLAConfig(hidden_size, q_lora_rank, num_heads, nope_size, rope_size, kv_lora_rank);
      }))
      .def_readwrite("layer_idx", &GeneralMLAConfig::layer_idx)
      .def_readwrite("pool", &GeneralMLAConfig::pool)
      .def_readwrite("token_count_in_page", &GeneralMLAConfig::token_count_in_page)
      .def_readwrite("max_qlen", &GeneralMLAConfig::max_qlen)
      .def_readwrite("max_kvlen", &GeneralMLAConfig::max_kvlen)

      .def_readwrite("max_position_embeddings", &GeneralMLAConfig::max_position_embeddings)
      .def_readwrite("rope_scaling_factor", &GeneralMLAConfig::rope_scaling_factor)
      .def_readwrite("rope_theta", &GeneralMLAConfig::rope_theta)
      .def_readwrite("rope_scaling_beta_fast", &GeneralMLAConfig::rope_scaling_beta_fast)
      .def_readwrite("rope_scaling_beta_slow", &GeneralMLAConfig::rope_scaling_beta_slow)
      .def_readwrite("rope_scaling_mscale", &GeneralMLAConfig::rope_scaling_mscale)
      .def_readwrite("rope_scaling_mscale_all_dim", &GeneralMLAConfig::rope_scaling_mscale_all_dim)
      .def_readwrite("rope_scaling_original_max_position_embeddings",
                     &GeneralMLAConfig::rope_scaling_original_max_position_embeddings)

      .DEF_PTR_PROPERTY(GeneralMLAConfig, q_a_proj)
      .DEF_PTR_PROPERTY(GeneralMLAConfig, q_a_norm)
      .DEF_PTR_PROPERTY(GeneralMLAConfig, q_b_proj)
      .DEF_PTR_PROPERTY(GeneralMLAConfig, kv_a_proj_with_mqa)
      .DEF_PTR_PROPERTY(GeneralMLAConfig, kv_a_norm)
      .DEF_PTR_PROPERTY(GeneralMLAConfig, kv_b_proj)
      .DEF_PTR_PROPERTY(GeneralMLAConfig, o_proj)

      .def_readwrite("q_a_proj_type", &GeneralMLAConfig::q_a_proj_type)
      .def_readwrite("q_a_norm_type", &GeneralMLAConfig::q_a_norm_type)
      .def_readwrite("q_b_proj_type", &GeneralMLAConfig::q_b_proj_type)
      .def_readwrite("kv_a_proj_with_mqa_type", &GeneralMLAConfig::kv_a_proj_with_mqa_type)
      .def_readwrite("kv_a_norm_type", &GeneralMLAConfig::kv_a_norm_type)
      .def_readwrite("kv_b_proj_type", &GeneralMLAConfig::kv_b_proj_type)
      .def_readwrite("w_o_type", &GeneralMLAConfig::w_o_type)
      .def_readwrite("page_count", &GeneralMLAConfig::page_count)

      ;
  py::class_<MLA_Interface, std::shared_ptr<MLA_Interface>>(mla_module, "MLA_Interface");
#if defined(__aarch64__) && defined(CPU_USE_KML) && defined(KTRANSFORMERS_CPU_MLA)
  py::class_<TP_MLA<KML_MLA_TP<float16_t>>, MLA_Interface, std::shared_ptr<TP_MLA<KML_MLA_TP<float16_t>>>>(mla_module,
                                                                                                           "MLA_F16")
      .def(py::init<GeneralMLAConfig>())
      .def("load_weights", &TP_MLA<KML_MLA_TP<float16_t>>::load_weights)
      .def("forward",
           [](TP_MLA<KML_MLA_TP<float16_t>>& op, std::vector<int> qlens, std::vector<std::vector<int>> page_tables,
              std::vector<int> kvlens, intptr_t input,
              intptr_t output) { op.forward(qlens, page_tables, kvlens, (const void*)input, (void*)output); })
      .def("set_local_pages", &TP_MLA<KML_MLA_TP<float16_t>>::set_local_pages)
      .def("set_pages", [](TP_MLA<KML_MLA_TP<float16_t>>& op, std::vector<std::vector<intptr_t>> nope_pages,
                           std::vector<std::vector<intptr_t>> rope_pages) {
        std::vector<std::vector<void*>> nope_pages_ptr;
        std::vector<std::vector<void*>> rope_pages_ptr;
        op.set_pages(nope_pages_ptr, rope_pages_ptr);
      });

  py::class_<TP_MLA<KML_MLA_TP<float>>, MLA_Interface, std::shared_ptr<TP_MLA<KML_MLA_TP<float>>>>(mla_module,
                                                                                                   "MLA_F32")
      .def(py::init<GeneralMLAConfig>())
      .def("load_weights", &TP_MLA<KML_MLA_TP<float>>::load_weights)
      .def("forward",
           [](TP_MLA<KML_MLA_TP<float>>& op, std::vector<int> qlens, std::vector<std::vector<int>> page_tables,
              std::vector<int> kvlens, intptr_t input,
              intptr_t output) { op.forward(qlens, page_tables, kvlens, (const void*)input, (void*)output); })
      .def("set_local_pages", &TP_MLA<KML_MLA_TP<float>>::set_local_pages)
      .def("set_pages", [](TP_MLA<KML_MLA_TP<float>>& op, std::vector<std::vector<intptr_t>> nope_pages,
                           std::vector<std::vector<intptr_t>> rope_pages) {
        std::vector<std::vector<void*>> nope_pages_ptr;
        std::vector<std::vector<void*>> rope_pages_ptr;
        op.set_pages(nope_pages_ptr, rope_pages_ptr);
      });
  py::class_<TP_MLA<KML_MLA_TP_QUAN<float>>, MLA_Interface, std::shared_ptr<TP_MLA<KML_MLA_TP_QUAN<float>>>>(
      mla_module, "MLA_QUAN_F32")
      .def(py::init<GeneralMLAConfig>())
      .def("load_weights", &TP_MLA<KML_MLA_TP_QUAN<float>>::load_weights)
      .def("forward",
           [](TP_MLA<KML_MLA_TP_QUAN<float>>& op, std::vector<int> qlens, std::vector<std::vector<int>> page_tables,
              std::vector<int> kvlens, intptr_t input,
              intptr_t output) { op.forward(qlens, page_tables, kvlens, (const void*)input, (void*)output); })
      .def("set_local_pages", &TP_MLA<KML_MLA_TP_QUAN<float>>::set_local_pages)
      .def("set_pages", [](TP_MLA<KML_MLA_TP_QUAN<float>>& op, std::vector<std::vector<intptr_t>> nope_pages,
                           std::vector<std::vector<intptr_t>> rope_pages) {
        std::vector<std::vector<void*>> nope_pages_ptr;
        std::vector<std::vector<void*>> rope_pages_ptr;
        op.set_pages(nope_pages_ptr, rope_pages_ptr);
      });

  auto gate_module = m.def_submodule("gate");
  py::class_<GeneralGateConfig>(gate_module, "GateConfig")
      .def(py::init([](int hidden_size, int num_experts_per_tok, int n_routed_experts, int n_group, int topk_group) {
        return GeneralGateConfig(hidden_size, num_experts_per_tok, n_routed_experts, n_group, topk_group);
      }))
      .def_readwrite("routed_scaling_factor", &GeneralGateConfig::routed_scaling_factor)

      .def_readwrite("layer_idx", &GeneralGateConfig::layer_idx)
      .def_readwrite("pool", &GeneralGateConfig::pool)
      .DEF_PTR_PROPERTY(GeneralGateConfig, weight)
      .def_readwrite("weight_type", &GeneralGateConfig::weight_type)
      .DEF_PTR_PROPERTY(GeneralGateConfig, e_score_correction_bias)
      .def_readwrite("e_score_correction_bias_type", &GeneralGateConfig::e_score_correction_bias_type)

      ;
  py::class_<MoEGate, std::shared_ptr<MoEGate>>(gate_module, "MoEGate")
      .def(py::init<GeneralGateConfig>())
      .def("forward", &MoEGate::forward_binding);
#endif

  py::class_<QuantConfig>(m, "QuantConfig")
      .def(py::init<>())
      .def_readwrite("quant_method", &QuantConfig::quant_method)
      .def_readwrite("bits", &QuantConfig::bits)
      .def_readwrite("group_size", &QuantConfig::group_size)
      .def_readwrite("zero_point", &QuantConfig::zero_point)
      .def_readwrite("per_channel", &QuantConfig::per_channel);

  auto moe_module = m.def_submodule("moe");

  py::class_<GeneralMOEConfig>(moe_module, "MOEConfig")
      .def(py::init([](int expert_num, int routed_expert_num, int hidden_size, int intermediate_size) {
        return GeneralMOEConfig(expert_num, routed_expert_num, hidden_size, intermediate_size);
      }))
      .def(py::init([](int expert_num, int routed_expert_num, int hidden_size, int intermediate_size,
                       uintptr_t gpu_experts_mask_ptr) {
        GeneralMOEConfig cfg(expert_num, routed_expert_num, hidden_size, intermediate_size);
        cfg.gpu_experts_mask = reinterpret_cast<uint8_t*>(gpu_experts_mask_ptr);
        cfg.compute_num_gpu_experts();
        return cfg;
      }))
      .def_readwrite("layer_idx", &GeneralMOEConfig::layer_idx)
      .def_readwrite("pool", &GeneralMOEConfig::pool)

      .def_readonly("num_gpu_experts", &GeneralMOEConfig::num_gpu_experts)
      .def_property(
          "gpu_experts_mask",
          [](const GeneralMOEConfig& self) { return reinterpret_cast<uintptr_t>(self.gpu_experts_mask); },
          [](GeneralMOEConfig& self, uintptr_t val) { self.gpu_experts_mask = reinterpret_cast<uint8_t*>(val); })
      .DEF_PTR_PROPERTY(GeneralMOEConfig, physical_to_logical_map)

      .DEF_PTR_PROPERTY(GeneralMOEConfig, gate_proj)
      .DEF_PTR_PROPERTY(GeneralMOEConfig, up_proj)
      .DEF_PTR_PROPERTY(GeneralMOEConfig, down_proj)

      .DEF_PTR_PROPERTY(GeneralMOEConfig, gate_scale)
      .DEF_PTR_PROPERTY(GeneralMOEConfig, up_scale)
      .DEF_PTR_PROPERTY(GeneralMOEConfig, down_scale)

      .DEF_PTR_PROPERTY(GeneralMOEConfig, gate_zero)
      .DEF_PTR_PROPERTY(GeneralMOEConfig, up_zero)
      .DEF_PTR_PROPERTY(GeneralMOEConfig, down_zero)

      .def_readwrite("quant_config", &GeneralMOEConfig::quant_config)

      .def_readwrite("max_len", &GeneralMOEConfig::max_len)

      .DEF_PTR_2D_PROPERTY(GeneralMOEConfig, gate_projs)
      .DEF_PTR_2D_PROPERTY(GeneralMOEConfig, up_projs)
      .DEF_PTR_2D_PROPERTY(GeneralMOEConfig, down_projs)

      .DEF_PTR_2D_PROPERTY(GeneralMOEConfig, gate_scales)
      .DEF_PTR_2D_PROPERTY(GeneralMOEConfig, up_scales)
      .DEF_PTR_2D_PROPERTY(GeneralMOEConfig, down_scales)

      .DEF_PTR_2D_PROPERTY(GeneralMOEConfig, gate_zeros)
      .DEF_PTR_2D_PROPERTY(GeneralMOEConfig, up_zeros)
      .DEF_PTR_2D_PROPERTY(GeneralMOEConfig, down_zeros)

      .def_readwrite("path", &GeneralMOEConfig::path)
      .def_readwrite("save", &GeneralMOEConfig::save)
      .def_readwrite("load", &GeneralMOEConfig::load)
      .def_readwrite("m_block", &GeneralMOEConfig::m_block)
      .def_readwrite("group_min_len", &GeneralMOEConfig::group_min_len)
      .def_readwrite("group_max_len", &GeneralMOEConfig::group_max_len)

      .def_readwrite("gate_type", &GeneralMOEConfig::gate_type)
      .def_readwrite("up_type", &GeneralMOEConfig::up_type)
      .def_readwrite("down_type", &GeneralMOEConfig::down_type)
      .def_readwrite("hidden_type", &GeneralMOEConfig::hidden_type)

      ;

  py::class_<MoE_Interface, std::shared_ptr<MoE_Interface>>(moe_module, "MoE_Interface");

  bind_moe_module<LLAMA_MOE_TP>(moe_module, "MOE");

#if defined(__x86_64__) && defined(USE_AMX_AVX_KERNEL)
  bind_moe_module<AMX_MOE_TP<amx::GemmKernel224Int8>>(moe_module, "AMXInt8_MOE");
  bind_moe_module<AMX_MOE_TP<amx::GemmKernel224Int4>>(moe_module, "AMXInt4_MOE");
  bind_moe_module<AMX_MOE_TP<amx::GemmKernel224Int4_1>>(moe_module, "AMXInt4_1_MOE");
  bind_moe_module<AMX_AWQ_MOE_TP<amx::GemmKernel224Int4_1_LowKGroup>>(moe_module, "AMXInt4_1KGroup_MOE");
  bind_moe_module<AMX_K2_MOE_TP<amx::GemmKernel224Int4SmallKGroup>>(moe_module, "AMXInt4_KGroup_MOE");
#if defined(__AVX512BF16__)
  bind_moe_module<AMX_BF16_MOE_TP<amx::GemmKernel224BF16>>(moe_module, "AMXBF16_MOE");
  bind_moe_module<AMX_FP8_MOE_TP<amx::GemmKernel224FP8>>(moe_module, "AMXFP8_MOE");
  bind_moe_module<AMX_FP8_PERCHANNEL_MOE_TP<amx::GemmKernel224FP8PerChannel>>(moe_module, "AMXFP8PerChannel_MOE");
#endif
#endif
#if defined(USE_MOE_KERNEL)
  bind_moe_module<MOE_KERNEL_TP<moe_kernel::GemmKernelInt8, _is_plain_>>(moe_module, "Int8_KERNEL_MOE");
#if defined(__aarch64__) && defined(CPU_USE_KML)
  // amd have not implemented int4 kernel yet
  bind_moe_module<MOE_KERNEL_TP<moe_kernel::GemmKernelInt4, _is_plain_>>(moe_module, "Int4_KERNEL_MOE");
#endif
#endif

  // Expose kernel tiling/runtime parameters so Python can modify them at runtime
  {
    auto tiling_module = moe_module.def_submodule("tiling");
#if defined(USE_MOE_KERNEL)
    tiling_module.def(
        "get_int8",
        []() {
          auto t = moe_kernel::GemmKernelInt8::get_tiling();
          py::dict d;
          d["n_block_up_gate"] = std::get<0>(t);
          d["n_block_down"] = std::get<1>(t);
          d["n_block"] = std::get<2>(t);
          d["m_block"] = std::get<3>(t);
          d["k_block"] = std::get<4>(t);
          d["n_block_up_gate_prefi"] = std::get<5>(t);
          d["n_block_down_prefi"] = std::get<6>(t);
          return d;
        },
        "Get current tiling parameters for INT8 kernel");
    tiling_module.def(
        "set_int8",
        [](int n_block_up_gate, int n_block_down, int n_block, int m_block, int k_block, int n_block_up_gate_prefi,
           int n_block_down_prefi) {
          moe_kernel::GemmKernelInt8::set_tiling(n_block_up_gate, n_block_down, n_block, m_block, k_block,
                                                 n_block_up_gate_prefi, n_block_down_prefi);
        },
        py::arg("n_block_up_gate"), py::arg("n_block_down"), py::arg("n_block"), py::arg("m_block"), py::arg("k_block"),
        py::arg("n_block_up_gate_prefi"), py::arg("n_block_down_prefi"), "Set tiling parameters for INT8 kernel");

    tiling_module.def(
        "get_int4",
        []() {
          auto t = moe_kernel::GemmKernelInt4::get_tiling();
          py::dict d;
          d["n_block_up_gate"] = std::get<0>(t);
          d["n_block_down"] = std::get<1>(t);
          d["n_block"] = std::get<2>(t);
          d["m_block"] = std::get<3>(t);
          d["k_block"] = std::get<4>(t);
          d["n_block_up_gate_prefi"] = std::get<5>(t);
          d["n_block_down_prefi"] = std::get<6>(t);
          return d;
        },
        "Get current tiling parameters for INT4 kernel");
    tiling_module.def(
        "set_int4",
        [](int n_block_up_gate, int n_block_down, int n_block, int m_block, int k_block, int n_block_up_gate_prefi,
           int n_block_down_prefi) {
          moe_kernel::GemmKernelInt4::set_tiling(n_block_up_gate, n_block_down, n_block, m_block, k_block,
                                                 n_block_up_gate_prefi, n_block_down_prefi);
        },
        py::arg("n_block_up_gate"), py::arg("n_block_down"), py::arg("n_block"), py::arg("m_block"), py::arg("k_block"),
        py::arg("n_block_up_gate_prefi"), py::arg("n_block_down_prefi"), "Set tiling parameters for INT4 kernel");

    // Convenience: set both
    tiling_module.def(
        "set_all",
        [](int n_block_up_gate, int n_block_down, int n_block, int m_block, int k_block, int n_block_up_gate_prefi,
           int n_block_down_prefi) {
          moe_kernel::GemmKernelInt8::set_tiling(n_block_up_gate, n_block_down, n_block, m_block, k_block,
                                                 n_block_up_gate_prefi, n_block_down_prefi);
          moe_kernel::GemmKernelInt4::set_tiling(n_block_up_gate, n_block_down, n_block, m_block, k_block,
                                                 n_block_up_gate_prefi, n_block_down_prefi);
        },
        py::arg("n_block_up_gate"), py::arg("n_block_down"), py::arg("n_block"), py::arg("m_block"), py::arg("k_block"),
        py::arg("n_block_up_gate_prefi"), py::arg("n_block_down_prefi"),
        "Set tiling parameters for both INT8 and INT4 kernels");
#endif
  }

  auto kvcache_module = m.def_submodule("kvcache");

  py::enum_<AnchorType>(kvcache_module, "AnchorType")
      .value("FIXED", AnchorType::FIXED_ANCHOR)
      .value("DYNAMIC", AnchorType::DYNAMIC)
      .value("QUEST", AnchorType::QUEST)
      .value("BLOCK_MAX", AnchorType::BLOCK_MAX)
      .value("BLOCK_MEAN", AnchorType::BLOCK_MEAN);
  py::enum_<ggml_type>(kvcache_module, "ggml_type")
      // .value("FP16", ggml_type::GGML_TYPE_F16)
      // .value("FP32", ggml_type::GGML_TYPE_F32)
      // .value("Q4_0", ggml_type::GGML_TYPE_Q4_0)
      // .value("Q8_0", ggml_type::GGML_TYPE_Q8_0)
      .value("FP32", GGML_TYPE_F32)
      .value("FP16", GGML_TYPE_F16)
      .value("Q4_0", GGML_TYPE_Q4_0)
      .value("Q4_1", GGML_TYPE_Q4_1)
      .value("Q5_0", GGML_TYPE_Q5_0)
      .value("Q5_1", GGML_TYPE_Q5_1)
      .value("Q8_0", GGML_TYPE_Q8_0)
      .value("Q8_1", GGML_TYPE_Q8_1)
      .value("Q2_K", GGML_TYPE_Q2_K)
      .value("Q3_K", GGML_TYPE_Q3_K)
      .value("Q4_K", GGML_TYPE_Q4_K)
      .value("Q5_K", GGML_TYPE_Q5_K)
      .value("Q6_K", GGML_TYPE_Q6_K)
      .value("Q8_K", GGML_TYPE_Q8_K)
      .value("IQ2_XXS", GGML_TYPE_IQ2_XXS)
      .value("IQ2_XS", GGML_TYPE_IQ2_XS)
      .value("IQ3_XXS", GGML_TYPE_IQ3_XXS)
      .value("IQ1_S", GGML_TYPE_IQ1_S)
      .value("IQ4_NL", GGML_TYPE_IQ4_NL)
      .value("IQ3_S", GGML_TYPE_IQ3_S)
      .value("IQ2_S", GGML_TYPE_IQ2_S)
      .value("IQ4_XS", GGML_TYPE_IQ4_XS)
      .value("I8", GGML_TYPE_I8)
      .value("I16", GGML_TYPE_I16)
      .value("I32", GGML_TYPE_I32)
      .value("I64", GGML_TYPE_I64)
      .value("F64", GGML_TYPE_F64)
      .value("IQ1_M", GGML_TYPE_IQ1_M)
      .value("BF16", GGML_TYPE_BF16)
      .export_values();

  py::enum_<RetrievalType>(kvcache_module, "RetrievalType")
      .value("LAYER", RetrievalType::LAYER)
      .value("KVHEAD", RetrievalType::KVHEAD)
      .value("QHEAD", RetrievalType::QHEAD);

  py::class_<KVCacheConfig>(kvcache_module, "KVCacheConfig")
      .def(py::init<int, int, int, int, int, int, AnchorType, ggml_type, RetrievalType, int, int, int, int, int, int>())
      .def_readwrite("layer_num", &KVCacheConfig::layer_num)
      .def_readwrite("kv_head_num", &KVCacheConfig::kv_head_num)
      .def_readwrite("q_head_num", &KVCacheConfig::q_head_num)
      .def_readwrite("head_dim", &KVCacheConfig::head_dim)
      .def_readwrite("block_len", &KVCacheConfig::block_len)
      .def_readwrite("anchor_num", &KVCacheConfig::anchor_num)
      .def_readwrite("anchor_type", &KVCacheConfig::anchor_type)
      .def_readwrite("kv_type", &KVCacheConfig::kv_type)
      .def_readwrite("retrieval_type", &KVCacheConfig::retrieval_type)
      .def_readwrite("layer_step", &KVCacheConfig::layer_step)
      .def_readwrite("token_step", &KVCacheConfig::token_step)
      .def_readwrite("layer_offset", &KVCacheConfig::layer_offset)
      .def_readwrite("max_block_num", &KVCacheConfig::max_block_num)
      .def_readwrite("max_batch_size", &KVCacheConfig::max_batch_size)
      .def_readwrite("max_thread_num", &KVCacheConfig::max_thread_num);
  py::class_<KVCache>(kvcache_module, "KVCache")
      .def(py::init<KVCacheConfig>())
      .def("get_cache_total_len", &KVCache::get_cache_total_len)
      .def("update_cache_total_len",
           [](KVCache& kvcache, int cache_total_len) { kvcache.update_cache_total_len(cache_total_len); });

  auto utils = m.def_submodule("utils");

  // 注册转换函数
  utils.def("to_float", &to_float_ptr, "Convert tensor from any GGML type to float32", py::arg("input"),
            py::arg("size"), py::arg("type"));

  utils.def("from_float", &from_float_ptr, "Convert tensor from float32 to any GGML type", py::arg("input"),
            py::arg("size"), py::arg("type"));
}


================================================
FILE: kt-kernel/install.sh
================================================
#!/usr/bin/env bash
set -euo pipefail

usage() {
  cat <<EOF
Usage: $0 [SUBCOMMAND] [BUILD_OPTIONS]

Two-step installation in one file. Choose a subcommand:

SUBCOMMANDS:
  deps            Install system prerequisites only
  build           Build and install kt-kernel (no dependency install)
  all             Run deps then build (default when no subcommand)
  -h, --help      Show this help message

BUILD_OPTIONS (for "build" or "all"):
  (none)          Auto-detect CPU and configure automatically (recommended)
  --manual        Skip auto-detection, use manual configuration (see below)
  --no-clean      Do not delete local build/ before building (default cleans)

AUTO-DETECTION (Default):
  The script will automatically detect your CPU and use ALL available features:
  - CPUINFER_CPU_INSTRUCT = NATIVE (uses -march=native)
  - CPUINFER_ENABLE_AMX   = ON/OFF (based on detection)
  - CPUINFER_ENABLE_AVX512_VNNI = ON/OFF (with fallback if OFF)
  - CPUINFER_ENABLE_AVX512_BF16 = ON/OFF (with fallback if OFF)
  - CPUINFER_ENABLE_AVX512_VBMI = ON/OFF (required for FP8 MoE)

  ✓ Best performance on YOUR machine
  ✗ Binary may NOT work on different/older CPUs

  Use this when: Installing for local use only

MANUAL CONFIGURATION:
  Use --manual flag when building for DISTRIBUTION or different machines.
  Set these environment variables before running:

  CPUINFER_CPU_INSTRUCT   - Target CPU instruction set
                            Options: AVX512, AVX2, FANCY, NATIVE
  CPUINFER_ENABLE_AMX     - Enable Intel AMX support
                            Options: ON, OFF

Distribution examples (portable binaries):

┌──────────────────────────────────────────────────────────────────────────┐
│ Configuration          │ Target CPUs              │ Use Case             │
├────────────────────────┼──────────────────────────┼──────────────────────┤
│ AVX512 + AMX=OFF       │ Skylake-X, Ice Lake,     │ General distribution │
│                        │ Cascade Lake, Zen 4      │ (recommended)        │
├────────────────────────┼──────────────────────────┼──────────────────────┤
│ AVX2 + AMX=OFF         │ Haswell (2013) and newer │ Maximum compatibility│
├────────────────────────┼──────────────────────────┼──────────────────────┤
│ FANCY + AMX=OFF        │ Ice Lake+, Zen 4+        │ Modern CPUs only     │
│                        │ (with full AVX512 ext)   │                      │
└────────────────────────┴──────────────────────────┴──────────────────────┘

  Use this when: Building Docker images, PyPI packages, or deploying to clusters

  Example: Build for general distribution
    export CPUINFER_CPU_INSTRUCT=AVX512
    export CPUINFER_ENABLE_AMX=OFF
    $0 build --manual
    # Result: Works on any CPU with AVX512 (2017+)

  Example: Build for maximum compatibility
    export CPUINFER_CPU_INSTRUCT=AVX2
    export CPUINFER_ENABLE_AMX=OFF
    $0 build --manual
    # Result: Works on any CPU with AVX2 (2013+)

Optional variables (with defaults):
  CPUINFER_BUILD_TYPE=Release           Build type (Debug/RelWithDebInfo/Release)
  CPUINFER_PARALLEL=8                   Number of parallel build jobs
  CPUINFER_VERBOSE=1                    Verbose build output (0/1)
  CPUINFER_ENABLE_AVX512_VNNI=ON/OFF    Override VNNI detection (auto if unset)
  CPUINFER_ENABLE_AVX512_BF16=ON/OFF    Override BF16 detection (auto if unset)
  CPUINFER_ENABLE_AVX512_VBMI=ON/OFF    Override VBMI detection (auto if unset)

Software Fallback Support:
  ✓ If VNNI not available: Uses AVX512BW fallback (2-3x slower but works)
  ✓ If BF16 not available: Uses AVX512F fallback (5-10x slower but works)
  → Old CPUs with only AVX512F+BW can run all code (slower but functional)

EOF
  exit 1
}

install_dependencies() {
  echo "Checking and installing system dependencies..."

  # Determine if we need to use sudo
  SUDO=""
  if [ "${EUID:-0}" -ne 0 ]; then
    if command -v sudo &> /dev/null; then
      SUDO="sudo"
    else
      echo "Warning: Not running as root and sudo not found. Package installation may fail."
      echo "Please run as root or install sudo."
    fi
  fi

  if command -v conda &> /dev/null; then
    echo "Installing cmake via conda..."
    conda install -y cmake
  else
    echo "Warning: conda not found. Skipping cmake installation via conda."
    echo "Please install conda or manually install cmake."
  fi

  # Detect OS type
  if [ -f /etc/os-release ]; then
    . /etc/os-release
    OS=$ID
  elif [ -f /etc/debian_version ]; then
    OS="debian"
  elif [ -f /etc/redhat-release ]; then
    OS="rhel"
  else
    echo "Warning: Unable to detect OS type. Skipping dependency installation."
    return 0
  fi

  # Install dependencies based on OS
  case "$OS" in
    debian|ubuntu|linuxmint|pop)
      echo "Detected Debian-based system. Installing libhwloc-dev and pkg-config..."
      $SUDO apt update
      $SUDO apt install -y libhwloc-dev pkg-config
      ;;
    fedora|rhel|centos|rocky|almalinux)
      echo "Detected Red Hat-based system. Installing hwloc-devel and pkgconfig..."
      $SUDO dnf install -y hwloc-devel pkgconfig || $SUDO yum install -y hwloc-devel pkgconfig
      ;;
    arch|manjaro)
      echo "Detected Arch-based system. Installing hwloc and pkgconf..."
      $SUDO pacman -S --noconfirm hwloc pkgconf
      ;;
    opensuse*|sles)
      echo "Detected openSUSE-based system. Installing hwloc-devel and pkg-config..."
      $SUDO zypper install -y hwloc-devel pkg-config
      ;;
    *)
      echo "Warning: Unsupported OS '$OS'. Please manually install libhwloc-dev and pkg-config."
      ;;
  esac
}

# Function to detect CPU features
# Returns: "has_amx has_avx512f has_avx512_vnni has_avx512_bf16 has_avx512_vbmi" (space-separated 0/1 values)
detect_cpu_features() {
  local has_amx=0
  local has_avx512f=0
  local has_avx512_vnni=0
  local has_avx512_bf16=0
  local has_avx512_vbmi=0

  if [ -f /proc/cpuinfo ]; then
    local cpu_flags
    cpu_flags=$(grep -m1 "^flags" /proc/cpuinfo | tr ' ' '\n')

    # Check for AMX support on Linux
    if echo "$cpu_flags" | grep -qE "amx_tile|amx_int8|amx_bf16"; then
      has_amx=1
    fi

    # Check for AVX512F (foundation)
    if echo "$cpu_flags" | grep -qE "avx512f"; then
      has_avx512f=1
    fi

    # Check for AVX512_VNNI support
    if echo "$cpu_flags" | grep -qE "avx512_vnni|avx512vnni"; then
      has_avx512_vnni=1
    fi

    # Check for AVX512_BF16 support
    if echo "$cpu_flags" | grep -qE "avx512_bf16|avx512bf16"; then
      has_avx512_bf16=1
    fi

    # Check for AVX512_VBMI support
    if echo "$cpu_flags" | grep -qE "avx512_vbmi|avx512vbmi"; then
      has_avx512_vbmi=1
    fi
  elif [ "$(uname)" = "Darwin" ]; then
    # macOS doesn't have AMX (ARM or Intel without AMX)
    has_amx=0
    has_avx512f=0
    has_avx512_vnni=0
    has_avx512_bf16=0
    has_avx512_vbmi=0
  fi

  echo "$has_amx $has_avx512f $has_avx512_vnni $has_avx512_bf16 $has_avx512_vbmi"
}

build_step() {
  # Parse build-only flags from arguments to this function
  local MANUAL_MODE=0
  local CLEAN_BUILD=1
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --manual) MANUAL_MODE=1; shift ;;
      --no-clean) CLEAN_BUILD=0; shift ;;
      -h|--help) usage ;;
      *) break ;;
    esac
  done

  # Clean local build directory to ensure a fresh CMake/configure
  local REPO_ROOT
  REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  if [[ "$CLEAN_BUILD" -eq 1 ]]; then
    if [[ -d "$REPO_ROOT/build" ]]; then
      echo "Cleaning previous build directory: $REPO_ROOT/build"
      rm -rf "$REPO_ROOT/build"
    fi
  else
    echo "Skipping clean of $REPO_ROOT/build (requested by --no-clean)"
  fi

  if [ "$MANUAL_MODE" = "0" ]; then
  # Auto-detection mode
  echo "=========================================="
  echo "Auto-detecting CPU capabilities..."
  echo "=========================================="
  echo ""

  # detect_cpu_features returns "has_amx has_avx512f has_avx512_vnni has_avx512_bf16 has_avx512_vbmi"
  CPU_FEATURES=$(detect_cpu_features)
  HAS_AMX=$(echo "$CPU_FEATURES" | cut -d' ' -f1)
  HAS_AVX512F=$(echo "$CPU_FEATURES" | cut -d' ' -f2)
  HAS_AVX512_VNNI=$(echo "$CPU_FEATURES" | cut -d' ' -f3)
  HAS_AVX512_BF16=$(echo "$CPU_FEATURES" | cut -d' ' -f4)
  HAS_AVX512_VBMI=$(echo "$CPU_FEATURES" | cut -d' ' -f5)

  export CPUINFER_CPU_INSTRUCT=NATIVE

  if [ "$HAS_AMX" = "1" ]; then
    echo "✓ AMX instructions detected"
    export CPUINFER_ENABLE_AMX=ON
    echo ""
    echo "Configuration: NATIVE + AMX=ON"
    echo "  ✓ Best performance on this machine"
    echo "  ✗ Binary requires Sapphire Rapids or newer CPU"
  else
    echo "ℹ AMX instructions not detected"
    export CPUINFER_ENABLE_AMX=OFF
    echo ""
    echo "Configuration: NATIVE + AMX=OFF"
    echo "  ✓ Using AVX512/AVX2 instructions"
  fi

  echo ""
  echo "  ⚠️  IMPORTANT: This binary is optimized for THIS CPU only"
  echo "     To build portable binaries for distribution, use:"
  echo "       export CPUINFER_CPU_INSTRUCT=AVX512  # or AVX2"
  echo "       export CPUINFER_ENABLE_AMX=OFF"
  echo "       ./install.sh build --manual"

  # Fine-grained AVX512 subset detection (with fallback support)
  echo ""
  echo "AVX512 Feature Detection:"

  # AVX512F: Foundation (required for all AVX512 variants)
  if [ "$HAS_AVX512F" = "1" ]; then
    echo "  AVX512F: ✓ Detected (foundation)"
  else
    echo "  AVX512F: ✗ Not detected (AVX512 not available)"
  fi

  # VNNI: Check if user manually set it, otherwise auto-detect
  if [ -n "${CPUINFER_ENABLE_AVX512_VNNI:-}" ]; then
    echo "  VNNI: User override = $CPUINFER_ENABLE_AVX512_VNNI"
  else
    if [ "$HAS_AVX512_VNNI" = "1" ]; then
      echo "  VNNI: ✓ Detected (hardware acceleration enabled)"
      export CPUINFER_ENABLE_AVX512_VNNI=ON
    else
      echo "  VNNI: ✗ Not detected (will use software fallback, 2-3x slower)"
      export CPUINFER_ENABLE_AVX512_VNNI=OFF
    fi
  fi

  # BF16: Check if user manually set it, otherwise auto-detect
  if [ -n "${CPUINFER_ENABLE_AVX512_BF16:-}" ]; then
    echo "  BF16: User override = $CPUINFER_ENABLE_AVX512_BF16"
  else
    if [ "$HAS_AVX512_BF16" = "1" ]; then
      echo "  BF16: ✓ Detected (hardware acceleration enabled)"
      export CPUINFER_ENABLE_AVX512_BF16=ON
    else
      echo "  BF16: ✗ Not detected (will use software fallback, 5-10x slower)"
      export CPUINFER_ENABLE_AVX512_BF16=OFF
    fi
  fi

  # VBMI: Check if user manually set it, otherwise auto-detect
  if [ -n "${CPUINFER_ENABLE_AVX512_VBMI:-}" ]; then
    echo "  VBMI: User override = $CPUINFER_ENABLE_AVX512_VBMI"
  else
    if [ "$HAS_AVX512_VBMI" = "1" ]; then
      echo "  VBMI: ✓ Detected (byte permutation enabled)"
      export CPUINFER_ENABLE_AVX512_VBMI=ON
    else
      echo "  VBMI: ✗ Not detected (FP8 MoE may not work)"
      export CPUINFER_ENABLE_AVX512_VBMI=OFF
    fi
  fi

  echo ""
  echo "  Note: Software fallbacks ensure all code works on older CPUs"
  echo "  Note: FP8 MoE requires AVX512F + BF16 + VNNI + VBMI"
  echo "  Tip: Override with CPUINFER_ENABLE_AVX512_[VNNI|BF16|VBMI]=ON/OFF"

  echo ""
  echo "To use manual configuration instead, run: $0 build --manual"
  echo ""
  else
  # Manual mode - validate user configuration (no exports)
  if [ -z "$CPUINFER_CPU_INSTRUCT" ] || [ -z "$CPUINFER_ENABLE_AMX" ]; then
    echo "Error: Manual mode requires CPUINFER_CPU_INSTRUCT and CPUINFER_ENABLE_AMX to be set."
    echo ""
    usage
  fi

  # Validate CPUINFER_CPU_INSTRUCT
  case "$CPUINFER_CPU_INSTRUCT" in
    NATIVE|FANCY|AVX512|AVX2)
      ;;
    *)
      echo "Error: Invalid CPUINFER_CPU_INSTRUCT='$CPUINFER_CPU_INSTRUCT'"
      echo "Must be one of: NATIVE, FANCY, AVX512, AVX2"
      exit 1
      ;;
  esac

  # Validate CPUINFER_ENABLE_AMX
  case "$CPUINFER_ENABLE_AMX" in
    ON|OFF)
      ;;
    *)
      echo "Error: Invalid CPUINFER_ENABLE_AMX='$CPUINFER_ENABLE_AMX'"
      echo "Must be either: ON or OFF"
      exit 1
      ;;
  esac

  # Warn about problematic configuration
  if [ "$CPUINFER_CPU_INSTRUCT" = "NATIVE" ] && [ "$CPUINFER_ENABLE_AMX" = "OFF" ]; then
    CPU_FEATURES=$(detect_cpu_features)
    HAS_AMX=$(echo "$CPU_FEATURES" | cut -d' ' -f1)
    if [ "$HAS_AMX" = "1" ]; then
      echo "=========================================="
      echo "⚠️  WARNING: Risky Configuration"
      echo "=========================================="
      echo ""
      echo "Your configuration:"
      echo "  CPUINFER_CPU_INSTRUCT = NATIVE"
      echo "  CPUINFER_ENABLE_AMX   = OFF"
      echo ""
      echo "Your CPU HAS AMX support!"
      echo ""
      echo "Problem:"
      echo "  • NATIVE uses -march=native which auto-enables ALL CPU features"
      echo "  • This may IGNORE your AMX=OFF setting"
      echo "  • The binary may still contain AMX instructions"
      echo ""
      echo "Recommended fixes:"
      echo "  1) For portable build (recommended for distribution):"
      echo "       export CPUINFER_CPU_INSTRUCT=AVX512"
      echo ""
      echo "  2) If you want best performance on this CPU:"
      echo "       export CPUINFER_ENABLE_AMX=ON"
      echo ""
      read -p "Continue with risky configuration? (y/N) " -n 1 -r
      echo
      if [[ ! $REPLY =~ ^[Yy]$ ]]; then
        exit 1
      fi
    fi
  fi

# Close MANUAL_MODE conditional
  fi

echo "=========================================="
echo "Building kt-kernel with configuration:"
echo "=========================================="
echo "  CPUINFER_CPU_INSTRUCT        = $CPUINFER_CPU_INSTRUCT"
echo "  CPUINFER_ENABLE_AMX          = $CPUINFER_ENABLE_AMX"
echo "  CPUINFER_ENABLE_AVX512_VNNI  = ${CPUINFER_ENABLE_AVX512_VNNI:-AUTO}"
echo "  CPUINFER_ENABLE_AVX512_BF16  = ${CPUINFER_ENABLE_AVX512_BF16:-AUTO}"
echo "  CPUINFER_ENABLE_AVX512_VBMI  = ${CPUINFER_ENABLE_AVX512_VBMI:-AUTO}"
echo "  CPUINFER_BUILD_TYPE          = ${CPUINFER_BUILD_TYPE:-Release}"
echo "  CPUINFER_PARALLEL            = ${CPUINFER_PARALLEL:-AUTO}"
echo "  CPUINFER_VERBOSE             = ${CPUINFER_VERBOSE:-1}"
echo ""

if [ ${CPUINFER_VERBOSE:-1} = "0" ]; then
  python3 -m pip install .
else
  python3 -m pip install . -v
fi
}

# Subcommand dispatcher: default to "all"
SUBCMD="all"
if [[ $# -gt 0 ]]; then
  case "$1" in
    deps|build|all) SUBCMD="$1"; shift ;;
    -h|--help) usage ;;
    *) SUBCMD="build" ;; # backward compatibility: flags-only => build
  esac
fi

case "$SUBCMD" in
  deps)
    install_dependencies
    ;;
  build)
    build_step "$@"
    ;;
  all)
    install_dependencies
    build_step "$@"
    ;;
esac


================================================
FILE: kt-kernel/operators/amx/awq-moe.hpp
================================================
/**
 * @Description  : AWQ Int4 AMX MoE operator with KGroup quantization and zero-point support
 * @Author       : chenht2022, oql
 * @Date         : 2024-07-22 02:03:22
 * @Version      : 2.0.0
 * @LastEditors  : oql
 * @LastEditTime : 2025-12-10
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 *
 * This file implements AWQ Int4 MoE using CRTP pattern, inheriting from moe_base.hpp.
 * AWQ weights are stored with group-wise scales and zero-points (KGroup Int4 with zeros).
 **/
#ifndef CPUINFER_OPERATOR_AMX_AWQ_MOE_H
#define CPUINFER_OPERATOR_AMX_AWQ_MOE_H

// #define CHECK

#include "moe_base.hpp"

/**
 * @brief AWQ Int4 MoE operator using CRTP pattern
 * @tparam T Kernel type for AWQ quantization
 *
 * This class provides AWQ-specific implementations:
 * - do_gate_up_gemm: Int4 weight with KGroup scale + zeros + AMX GEMM
 * - do_down_gemm: Same Int4 KGroup GEMM
 * - load_weights: Load Int4 weights with group-wise scales and zero-points
 */
template <class T>
class AMX_AWQ_MOE_TP : public AMX_MOE_BASE<T, AMX_AWQ_MOE_TP<T>> {
 private:
  using Base = AMX_MOE_BASE<T, AMX_AWQ_MOE_TP<T>>;
  using Base::config_;
  using Base::down_ba_;
  using Base::down_bb_;
  using Base::down_bc_;
  using Base::gate_bb_;
  using Base::gate_bc_;
  using Base::gate_up_ba_;
  using Base::m_local_num_;
  using Base::tp_part_idx;
  using Base::up_bb_;
  using Base::up_bc_;

#ifdef CHECK
  char verify_bb[100000000];
  char check_bb[100000000];
  uint8_t compare_expers = 3;
#endif

  inline void write_weights(std::filesystem::path prefix, std::string mat_class, char* bb, int expert_idx, size_t size,
                            size_t scale_size) {
    std::ofstream of(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" +
                               std::to_string(size - scale_size) + "Byte" + "_quant_" + ".kt"));
    if (of.is_open() == false) {
      printf("Failed to open weights file for writing\n");
      return;
    }
    of.write((char*)bb, size - scale_size);
    of.close();

    of.open(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" + std::to_string(scale_size) + "Byte" +
                      "_scale_" + ".kt"));
    if (of.is_open() == false) {
      printf("Failed to open scales file for writing\n");
      return;
    }
    of.write(((char*)bb) + size - scale_size, scale_size);
    of.close();
  }

  // Enhanced version that writes all data including mins for complete comparison
  inline void write_weights(std::filesystem::path prefix, std::string mat_class, typename T::BufferB* buffer,
                            int expert_idx, const std::string& quantization_type = "") {
    auto& quant_config = config_.quant_config;
    int& group_size = quant_config.group_size;

    // Calculate dimensions based on matrix type
    int rows, cols, num_groups;
    size_t scale_elem_count;
    std::string matrix_type = mat_class.substr(1, mat_class.length() - 2);  // Remove leading/trailing underscore
    if (matrix_type == "gate" || matrix_type == "up") {
      rows = config_.intermediate_size;
      cols = config_.hidden_size;
      num_groups = cols / group_size;
      scale_elem_count = num_groups * rows;
    } else {  // down
      rows = config_.hidden_size;
      cols = config_.intermediate_size;
      num_groups = cols / group_size;
      scale_elem_count = num_groups * rows;
    }

    size_t weight_size = (rows * cols) / 2;  // INT4 packed
    size_t scale_size = scale_elem_count * sizeof(float);

    // Create filename prefix
    std::string filename_base = T::name() + mat_class + std::to_string(expert_idx);
    if (!quantization_type.empty()) {
      filename_base += "_" + quantization_type;
    }

    // Write quantized weights
    std::ofstream of(prefix / (filename_base + "_" + std::to_string(weight_size) + "Byte_quant.kt"));
    if (of.is_open()) {
      of.write((char*)buffer->b, weight_size);
      of.close();
    }

    // Write scales
    of.open(prefix / (filename_base + "_" + std::to_string(scale_size) + "Byte_scale.kt"));
    if (of.is_open()) {
      of.write((char*)buffer->d, scale_size);
      of.close();
    }

    // Write mins if available
    if (quant_config.zero_point && buffer->mins) {
      of.open(prefix / (filename_base + "_" + std::to_string(scale_size) + "Byte_mins.kt"));
      if (of.is_open()) {
        of.write((char*)buffer->mins, scale_size);
        of.close();
      }
    }
  }

  inline void read_weights(std::filesystem::path prefix, std::string mat_class, char* bb, int expert_idx, size_t size,
                           size_t scale_size, uint8_t mat_split, uint8_t mat_split_idex) {
    std::ifstream f(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" +
                              std::to_string(size - scale_size) + "Byte" + "_quant_" + ".kt"));
    if (f.is_open() == false) {
      printf("Failed to open quantized weights file for reading\n");
      return;
    }
    f.seekg(mat_split_idex * (size - scale_size) / mat_split);
    f.read(((char*)bb) + mat_split_idex * (size - scale_size) / mat_split, (size - scale_size) / mat_split);
    f.close();

    f.open(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" + std::to_string(scale_size) + "Byte" +
                     "_scale_" + ".kt"));
    if (f.is_open() == false) {
      printf("Failed to open scales file for reading\n");
      return;
    }
    f.seekg(mat_split_idex * scale_size / mat_split);
    f.read((((char*)bb) + size - scale_size) + mat_split_idex * scale_size / mat_split, scale_size / mat_split);
    f.close();
  }

  // Enhanced version that reads all data including mins
  inline bool read_weights(std::filesystem::path prefix, std::string mat_class, typename T::BufferB* buffer,
                           int expert_idx, const std::string& quantization_type = "") {
    auto& quant_config = config_.quant_config;
    int& group_size = quant_config.group_size;

    // Calculate dimensions based on matrix type
    int rows, cols, num_groups;
    size_t scale_elem_count;
    std::string matrix_type = mat_class.substr(1, mat_class.length() - 2);  // Remove leading/trailing underscore
    if (matrix_type == "gate" || matrix_type == "up") {
      rows = config_.intermediate_size;
      cols = config_.hidden_size;
      num_groups = cols / group_size;
      scale_elem_count = num_groups * rows;
    } else {  // down
      rows = config_.hidden_size;
      cols = config_.intermediate_size;
      num_groups = cols / group_size;
      scale_elem_count = num_groups * rows;
    }

    size_t weight_size = (rows * cols) / 2;  // INT4 packed
    size_t scale_size = scale_elem_count * sizeof(float);

    // Create filename prefix
    std::string filename_base = T::name() + mat_class + std::to_string(expert_idx);
    if (!quantization_type.empty()) {
      filename_base += "_" + quantization_type;
    }

    // Read quantized weights
    std::ifstream f(prefix / (filename_base + "_" + std::to_string(weight_size) + "Byte_quant.kt"));
    if (!f.is_open()) {
      return false;
    }
    f.read((char*)buffer->b, weight_size);
    f.close();

    // Read scales
    f.open(prefix / (filename_base + "_" + std::to_string(scale_size) + "Byte_scale.kt"));
    if (!f.is_open()) {
      return false;
    }
    f.read((char*)buffer->d, scale_size);
    f.close();

    // Read mins if available and buffer supports it
    if (quant_config.zero_point && buffer->mins) {
      f.open(prefix / (filename_base + "_" + std::to_string(scale_size) + "Byte_mins.kt"));
      if (f.is_open()) {
        f.read((char*)buffer->mins, scale_size);
        f.close();
      }
    }

    return true;
  }

  // AWQ-specific function to read quantized weights, scales and zeros from files
  inline void read_awq_weights(std::filesystem::path prefix, std::string proj_name, int expert_idx, char* weights_buf,
                               float* scales_buf, uint8_t* zeros_buf, size_t weights_size, size_t scales_size,
                               size_t zeros_size, uint8_t mat_split, uint8_t mat_split_idx) {
    // Read qweights (quantized weights)
    std::string weights_filename = proj_name + ".qweight." + std::to_string(expert_idx) + ".bin";
    std::ifstream weights_file(prefix / weights_filename, std::ios::binary);
    if (!weights_file.is_open()) {
      printf("Failed to open weights file: %s\n", (prefix / weights_filename).c_str());
      throw std::runtime_error("Failed to open weights file: " + weights_filename);
    }

    weights_file.seekg(mat_split_idx * weights_size / mat_split);
    weights_file.read(weights_buf + mat_split_idx * weights_size / mat_split, weights_size / mat_split);
    weights_file.close();

    // Read scales
    std::string scales_filename = proj_name + ".scales." + std::to_string(expert_idx) + ".bin";
    std::ifstream scales_file(prefix / scales_filename, std::ios::binary);
    if (!scales_file.is_open()) {
      printf("Failed to open scales file: %s\n", (prefix / scales_filename).c_str());
      throw std::runtime_error("Failed to open scales file: " + scales_filename);
    }

    scales_file.seekg(mat_split_idx * scales_size / mat_split);
    scales_file.read(reinterpret_cast<char*>(scales_buf) + mat_split_idx * scales_size / mat_split,
                     scales_size / mat_split);
    scales_file.close();

    // Read qzeros (quantized zeros)
    std::string zeros_filename = proj_name + ".qzeros." + std::to_string(expert_idx) + ".bin";
    std::ifstream zeros_file(prefix / zeros_filename, std::ios::binary);
    if (!zeros_file.is_open()) {
      printf("Failed to open zeros file: %s\n", (prefix / zeros_filename).c_str());
      throw std::runtime_error("Failed to open zeros file: " + zeros_filename);
    }

    zeros_file.seekg(mat_split_idx * zeros_size / mat_split);
    zeros_file.read(reinterpret_cast<char*>(zeros_buf) + mat_split_idx * zeros_size / mat_split,
                    zeros_size / mat_split);
    zeros_file.close();
  }

#ifdef CHECK
  inline void load_check() {
    memcpy(check_bb, (char*)down_bb_[compare_expers]->b,
           T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, config_.quant_config.group_size));
  }

  void verify_load_right() {
    memcpy(verify_bb, (char*)down_bb_[compare_expers]->b,
           T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, config_.quant_config.group_size));
    if (memcmp(verify_bb, check_bb,
               T::BufferB::required_size(config_.hidden_size, config_.intermediate_size,
                                         config_.quant_config.group_size)) != 0) {
      printf("verify error\n");
      for (size_t i = 0; i < T::BufferB::required_size(config_.hidden_size, config_.intermediate_size,
                                                       config_.quant_config.group_size);
           ++i) {
        if (verify_bb[i] != check_bb[i]) {
          printf("Difference at byte %zu: verify_bb_%d[%zu] = %02x, check_bb[%zu] = %02x\n", i, compare_expers, i,
                 (unsigned char)verify_bb[i], i, (unsigned char)check_bb[i]);
          break;
        }
      }
      assert(0);
    } else {
      printf("pass verify\n");
      printf("numa %d, verify_bb_%d:\n", tp_part_idx, compare_expers);
      size_t size =
          T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, config_.quant_config.group_size);
      size_t scale_size = config_.hidden_size * sizeof(float);
      for (size_t i = size - scale_size; i < size - scale_size + 50; ++i) {
        printf("%02x ", (unsigned char)verify_bb[i]);
      }
      printf("\n");
    }
  }
#endif

  // Function to dump Buffer B data for debugging quantization results
  inline void dump_buffer_b(const std::string& quantization_type, int expert_idx, const std::string& matrix_type,
                            typename T::BufferB* buffer) {
    auto& quant_config = config_.quant_config;
    int& group_size = quant_config.group_size;

    printf("[DUMP_BUFFER_B] TP%d %s Expert%d %s:\n", tp_part_idx, quantization_type.c_str(), expert_idx,
           matrix_type.c_str());

    // Calculate dimensions based on matrix type
    int rows, cols, num_groups;
    size_t scale_elem_count;
    if (matrix_type == "gate" || matrix_type == "up") {
      rows = config_.intermediate_size;
      cols = config_.hidden_size;
      num_groups = cols / group_size;
      scale_elem_count = num_groups * rows;
    } else {  // down
      rows = config_.hidden_size;
      cols = config_.intermediate_size;
      num_groups = cols / group_size;
      scale_elem_count = num_groups * rows;
    }

    // Dump scales (as float)
    printf("  Scales[first 16]: ");
    for (int i = 0; i < std::min(16, (int)scale_elem_count); i++) {
      printf("%.6f ", buffer->d[i]);
    }
    printf("\n");

    if (scale_elem_count > 16) {
      printf("  Scales[last 16]: ");
      int start_idx = std::max(0, (int)scale_elem_count - 16);
      for (int i = start_idx; i < (int)scale_elem_count; i++) {
        printf("%.6f ", buffer->d[i]);
      }
      printf("\n");
    }

    // Dump mins (as float) if available
    if (quant_config.zero_point && buffer->mins) {
      printf("  Mins[first 16]: ");
      for (int i = 0; i < std::min(16, (int)scale_elem_count); i++) {
        printf("%.6f ", buffer->mins[i]);
      }
      printf("\n");

      if (scale_elem_count > 16) {
        printf("  Mins[last 16]: ");
        int start_idx = std::max(0, (int)scale_elem_count - 16);
        for (int i = start_idx; i < (int)scale_elem_count; i++) {
          printf("%.6f ", buffer->mins[i]);
        }
        printf("\n");
      }
    }

    // Dump quantized weights (as hex uint8)
    size_t weight_size = (rows * cols) / 2;  // INT4 packed
    uint8_t* weight_ptr = (uint8_t*)buffer->b;

    printf("  Weights[first 32 bytes]: ");
    for (int i = 0; i < std::min(32, (int)weight_size); i++) {
      printf("%02x ", weight_ptr[i]);
    }
    printf("\n");

    if (weight_size > 32) {
      printf("  Weights[last 32 bytes]: ");
      int start_idx = std::max(32, (int)weight_size - 32);
      for (int i = start_idx; i < (int)weight_size; i++) {
        printf("%02x ", weight_ptr[i]);
      }
      printf("\n");
    }

    printf("  Matrix dimensions: %dx%d, Groups: %d, Group size: %d, Scale elements: %zu\n", rows, cols, num_groups,
           group_size, scale_elem_count);
    printf("\n");
  }

  // AVX-optimized function to convert INT4 zeros to float mins
  // mins = -(zeros * scales) (element-wise), where scales is float format
  inline void convert_zeros_to_mins_avx(const uint32_t* zeros_int4_packed, const float* scales, float* mins,
                                        size_t num_elements) {
    constexpr size_t simd_width = 8;  // 每次解 8 个 int4

    for (size_t i = 0; i < num_elements; i += simd_width) {
      uint32_t packed_vals = zeros_int4_packed[i / 8];

      for (int j = 0; j < 8; j++) {
        int v = packed_vals & 0xF;  // 取出4bit
        mins[i + j] = -(scales[i + j] * v);
        packed_vals = packed_vals >> 4;
      }
    }
  }

 public:
  using typename Base::input_t;
  using typename Base::output_t;

  AMX_AWQ_MOE_TP() = default;

  AMX_AWQ_MOE_TP(GeneralMOEConfig config, int tp_part_idx_ = 0) : Base(config, tp_part_idx_) {}

  void derived_init() {
    auto& quant_config = config_.quant_config;
    if (quant_config.group_size == 0 || !quant_config.zero_point) {
      throw std::runtime_error("AWQ-Quantization AMX MoE only support KGroup Int4_1");
    }

    printf("Creating AMX_AWQ_MOE_TP %d at numa %d\n", tp_part_idx, numa_node_of_cpu(sched_getcpu()));

    auto& load = config_.load;
    auto& save = config_.save;

    std::filesystem::path prefix = config_.path;
    prefix = prefix / ("_layer_" + std::to_string(config_.layer_idx)) / ("_numa_" + std::to_string(tp_part_idx));
    if (save) {
      std::cout << "Creating " << prefix << std::endl;
      std::filesystem::create_directories(prefix);
    }
    if (load) {
      if (std::filesystem::exists(prefix)) {
        std::cout << "Loading from " << prefix << std::endl;
      } else {
        throw std::runtime_error("Path not found: " + prefix.string());
      }
    }
  }

  ~AMX_AWQ_MOE_TP() = default;

  // ============================================================================
  // CRTP buffer creation - with group_size (AWQ uses zero-point)
  // ============================================================================

  size_t buffer_a_required_size_impl(size_t m, size_t k) const {
    return T::BufferA::required_size(m, k, config_.quant_config.group_size);
  }
  size_t buffer_b_required_size_impl(size_t n, size_t k) const {
    return T::BufferB::required_size(n, k, config_.quant_config.group_size);
  }
  size_t buffer_c_required_size_impl(size_t m, size_t n) const { return T::BufferC::required_size(m, n); }

  std::shared_ptr<typename T::BufferA> make_buffer_a_impl(size_t m, size_t k, void* data) const {
    return std::make_shared<typename T::BufferA>(m, k, config_.quant_config.group_size, data);
  }
  std::shared_ptr<typename T::BufferB> make_buffer_b_impl(size_t n, size_t k, void* data) const {
    return std::make_shared<typename T::BufferB>(n, k, config_.quant_config.group_size, data);
  }
  std::shared_ptr<typename T::BufferC> make_buffer_c_impl(size_t m, size_t n, void* data) const {
    return std::make_shared<typename T::BufferC>(m, n, data);
  }

  // ============================================================================
  // CRTP virtual points - GEMM dispatch (uses kgroup with zeros)
  // ============================================================================

  void do_gate_up_gemm(bool do_up, int expert_idx, int ith, int nth, int qlen) {
    auto& group_size = config_.quant_config.group_size;
    int m = m_local_num_[expert_idx];
    auto& ba = gate_up_ba_[expert_idx];
    auto& bb = do_up ? up_bb_[expert_idx] : gate_bb_[expert_idx];
    auto& bc = do_up ? up_bc_[expert_idx] : gate_bc_[expert_idx];

    // Dispatch based on qlen threshold
    if (qlen > 4 * config_.expert_num / config_.num_experts_per_tok) {
      amx::mat_mul_kgroup(m, config_.intermediate_size, config_.hidden_size, group_size, ba, bb, bc, ith, nth);
    } else {
      amx::vec_mul_kgroup(m, config_.intermediate_size, config_.hidden_size, group_size, ba, bb, bc, ith, nth);
    }
  }

  void do_down_gemm(int expert_idx, int ith, int nth, int qlen) {
    auto& group_size = config_.quant_config.group_size;
    int m = m_local_num_[expert_idx];

    if (qlen > 4 * config_.expert_num / config_.num_experts_per_tok) {
      amx::mat_mul_kgroup(m, config_.hidden_size, config_.intermediate_size, group_size, down_ba_[expert_idx],
                          down_bb_[expert_idx], down_bc_[expert_idx], ith, nth);
    } else {
      amx::vec_mul_kgroup(m, config_.hidden_size, config_.intermediate_size, group_size, down_ba_[expert_idx],
                          down_bb_[expert_idx], down_bc_[expert_idx], ith, nth);
    }
  }

  /**
   * @brief Load Int4 weights with scales and zero-points
   *
   * AWQ weights include:
   * - Quantized INT4 weights
   * - FP16 scales (converted to FP32)
   * - INT4 zeros (converted to FP32 mins = -scale * zero)
   */
  void load_weights() {
    auto& quant_config = config_.quant_config;
    int& group_size = quant_config.group_size;
    const uint64_t* physical_to_logical_map = (const uint64_t*)config_.physical_to_logical_map;
    if (quant_config.group_size == 0 || !quant_config.zero_point) {
      throw std::runtime_error("AWQ-Quantization AMX MoE only support KGroup Int4_1");
    }

    auto pool = config_.pool->get_subpool(tp_part_idx);
    if (config_.gate_projs.size()) {
      throw std::runtime_error("AMX load weights from gate_projs is not supported");
    } else {
      int nth = T::recommended_nth(config_.intermediate_size);
      std::filesystem::path prefix = config_.path;
      prefix = prefix / ("_layer_" + std::to_string(config_.layer_idx)) / ("_numa_" + std::to_string(tp_part_idx));

      if (config_.load) {
        throw std::runtime_error("AMX load weights from file is not supported");
      }
#ifdef CHECK
      load_check();
#endif
#ifndef CHECK
      else if (config_.gate_scale != nullptr)
#endif
      {
        // Loading quantized weights with scales and zeros
        pool->do_work_stealing_job(
            nth * config_.expert_num, nullptr,
            [this, nth, physical_to_logical_map](int task_id) {
              uint64_t expert_idx = task_id / nth;
              uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
              int ith = task_id % nth;
              // gate part
              gate_bb_[expert_idx]->from_raw_mat(
                  (uint8_t*)config_.gate_proj +
                      ((logical_expert_id * config_.intermediate_size * config_.hidden_size) >> 1),
                  ith, nth);
              // up part
              up_bb_[expert_idx]->from_raw_mat(
                  (uint8_t*)config_.up_proj +
                      ((logical_expert_id * config_.intermediate_size * config_.hidden_size) >> 1),
                  ith, nth);
            },
            nullptr);

        nth = T::recommended_nth(config_.hidden_size);
        pool->do_work_stealing_job(
            nth * config_.expert_num, nullptr,
            [this, nth, physical_to_logical_map](int task_id) {
              uint64_t expert_idx = task_id / nth;
              uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
              int ith = task_id % nth;
              // down part
              down_bb_[expert_idx]->from_raw_mat(
                  (uint8_t*)config_.down_proj +
                      ((logical_expert_id * config_.hidden_size * config_.intermediate_size) >> 1),
                  ith, nth);
            },
            nullptr);

        pool->do_work_stealing_job(
            config_.expert_num, nullptr,
            [this, physical_to_logical_map](int task_id) {
              uint64_t expert_idx = task_id;
              uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
              size_t scale_elem_count =
                  (config_.hidden_size * config_.intermediate_size) / config_.quant_config.group_size;

              // convert scales from FP16 to FP32
              convert_or_copy(gate_bb_[expert_idx]->d,
                              (ggml_fp16_t*)config_.gate_scale + (logical_expert_id * scale_elem_count),
                              scale_elem_count);
              convert_or_copy(up_bb_[expert_idx]->d,
                              (ggml_fp16_t*)config_.up_scale + (logical_expert_id * scale_elem_count),
                              scale_elem_count);
              convert_or_copy(down_bb_[expert_idx]->d,
                              (ggml_fp16_t*)config_.down_scale + (logical_expert_id * scale_elem_count),
                              scale_elem_count);

              // Convert INT4 zeros to FP32 mins: mins = -(scale * zero)
              convert_zeros_to_mins_avx(
                  (const uint32_t*)((uint8_t*)config_.gate_zero + ((logical_expert_id * scale_elem_count) >> 1)),
                  gate_bb_[expert_idx]->d, gate_bb_[expert_idx]->mins, scale_elem_count);
              convert_zeros_to_mins_avx(
                  (const uint32_t*)((uint8_t*)config_.up_zero + ((logical_expert_id * scale_elem_count) >> 1)),
                  up_bb_[expert_idx]->d, up_bb_[expert_idx]->mins, scale_elem_count);
              convert_zeros_to_mins_avx(
                  (const uint32_t*)((uint8_t*)config_.down_zero + ((logical_expert_id * scale_elem_count) >> 1)),
                  down_bb_[expert_idx]->d, down_bb_[expert_idx]->mins, scale_elem_count);
            },
            nullptr);

        // Save offline quantization data if requested
        if (config_.save) {
          for (int expert_idx = 0; expert_idx < config_.expert_num; expert_idx++) {
            write_weights(prefix, "_gate_", gate_bb_[expert_idx].get(), expert_idx, "OFFLINE");
            write_weights(prefix, "_up_", up_bb_[expert_idx].get(), expert_idx, "OFFLINE");
            write_weights(prefix, "_down_", down_bb_[expert_idx].get(), expert_idx, "OFFLINE");
          }
        }
      }
      else {
        // Online Quantization from BF16
        assert(config_.gate_proj != nullptr);

        pool->do_work_stealing_job(
            nth * config_.expert_num, nullptr,
            [this, nth, physical_to_logical_map](int task_id) {
              int64_t expert_idx = task_id / nth;
              uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
              int ith = task_id % nth;
              // gate part
              gate_bb_[logical_expert_id]->from_mat(
                  (ggml_bf16_t*)config_.gate_proj +
                      (logical_expert_id * config_.intermediate_size * config_.hidden_size),
                  ith, nth);
              // up part
              up_bb_[logical_expert_id]->from_mat(
                  (ggml_bf16_t*)config_.up_proj + (logical_expert_id * config_.intermediate_size * config_.hidden_size),
                  ith, nth);
            },
            nullptr);

        nth = T::recommended_nth(config_.hidden_size);
        pool->do_work_stealing_job(
            nth * config_.expert_num, nullptr,
            [this, nth, physical_to_logical_map](int task_id) {
              int64_t expert_idx = task_id / nth;
              uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
              int ith = task_id % nth;
              // down part
              down_bb_[logical_expert_id]->from_mat(
                  (ggml_bf16_t*)config_.down_proj +
                      (logical_expert_id * config_.hidden_size * config_.intermediate_size),
                  ith, nth);
            },
            nullptr);

        // Save online quantization data if requested
        if (config_.save) {
          for (int expert_idx = 0; expert_idx < config_.expert_num; expert_idx++) {
            write_weights(prefix, "_gate_", gate_bb_[expert_idx].get(), expert_idx, "ONLINE");
            write_weights(prefix, "_up_", up_bb_[expert_idx].get(), expert_idx, "ONLINE");
            write_weights(prefix, "_down_", down_bb_[expert_idx].get(), expert_idx, "ONLINE");
          }
        }
      }
#ifdef CHECK
      verify_load_right();
#endif
    }
  }

  // forward, forward_prefill, forward_decode, warm_up are inherited from Base
};

// ============================================================================
// TP_MOE specialization for AMX_AWQ_MOE_TP
// Inherits from TP_MOE<AMX_MOE_BASE<...>> to reuse merge_results implementation
// ============================================================================

template <typename K>
class TP_MOE<AMX_AWQ_MOE_TP<K>> : public TP_MOE<AMX_MOE_BASE<K, AMX_AWQ_MOE_TP<K>>> {
 public:
  using Base = TP_MOE<AMX_MOE_BASE<K, AMX_AWQ_MOE_TP<K>>>;
  using Base::Base;

  void load_weights() override {
    auto& config = this->config;
    auto& tps = this->tps;
    auto& tp_count = this->tp_count;
    auto pool = config.pool;
    const uint64_t* physical_to_logical_map = (const uint64_t*)config.physical_to_logical_map;
    if (config.gate_projs.empty() == false) {
      printf("TP Load from loader\n");
      DO_TPS_LOAD_WEIGHTS(pool);
      this->weights_loaded = true;
    } else if (config.gate_scale != nullptr) {
      printf("From Packed Int4 with KGroup Scale and Zeros\n");
      int& group_size = config.quant_config.group_size;
      for (auto i = 0; i < tp_count; i++) {
        auto& tpc = tps[i]->config_;
        size_t weight_elem_count = tpc.intermediate_size * tpc.hidden_size;
        tpc.gate_proj = new uint8_t[(tpc.expert_num * weight_elem_count) / 2];
        tpc.up_proj = new uint8_t[(tpc.expert_num * weight_elem_count) / 2];
        tpc.down_proj = new uint8_t[(tpc.expert_num * weight_elem_count) / 2];

        size_t scales_elem_count = (tpc.hidden_size / group_size) * tpc.intermediate_size;

        tpc.gate_scale = new ggml_fp16_t[(tpc.expert_num * scales_elem_count)];
        tpc.up_scale = new ggml_fp16_t[(tpc.expert_num * scales_elem_count)];
        tpc.down_scale = new ggml_fp16_t[(tpc.expert_num * scales_elem_count)];

        tpc.gate_zero = new uint8_t[(tpc.expert_num * scales_elem_count) / 2];
        tpc.up_zero = new uint8_t[(tpc.expert_num * scales_elem_count) / 2];
        tpc.down_zero = new uint8_t[(tpc.expert_num * scales_elem_count) / 2];
        if (tps[i]->config_.load == false) {
          pool->get_subpool(i)->do_work_stealing_job(
              tpc.expert_num, nullptr,
              [&](int expert_id_) {
                size_t expert_id = expert_map(physical_to_logical_map, expert_id_);

                // weight TP-slicing
                memcpy((uint8_t*)tpc.gate_proj + ((expert_id * weight_elem_count) >> 1),
                       (uint8_t*)config.gate_proj +
                           ((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
                       ((sizeof(uint8_t) * weight_elem_count) >> 1));

                memcpy((uint8_t*)tpc.up_proj + ((expert_id * weight_elem_count) >> 1),
                       (uint8_t*)config.up_proj +
                           ((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
                       ((sizeof(uint8_t) * weight_elem_count) >> 1));

                // down scales and zeros TP-slicing
                memcpy((ggml_fp16_t*)tpc.down_scale + (expert_id * scales_elem_count),
                       (ggml_fp16_t*)config.down_scale +
                           (expert_id * (config.intermediate_size / group_size) * config.hidden_size +
                            i * scales_elem_count),
                       sizeof(ggml_fp16_t) * scales_elem_count);

                memcpy((uint8_t*)tpc.down_zero + ((expert_id * scales_elem_count) >> 1),
                       (uint8_t*)config.down_zero +
                           ((expert_id * (config.intermediate_size / group_size) * config.hidden_size +
                             i * scales_elem_count) >>
                            1),
                       (sizeof(uint8_t) * scales_elem_count) >> 1);

                for (size_t kg = 0; kg < config.hidden_size / group_size; kg++) {
                  // copy gate/up scales
                  memcpy((ggml_fp16_t*)tpc.gate_scale + (expert_id * scales_elem_count) + kg * tpc.intermediate_size,
                         (ggml_fp16_t*)config.gate_scale +
                             (expert_id * ((config.hidden_size / group_size) * config.intermediate_size) +
                              kg * config.intermediate_size + i * tpc.intermediate_size),
                         (sizeof(ggml_fp16_t) * tpc.intermediate_size));

                  memcpy((ggml_fp16_t*)tpc.up_scale + (expert_id * scales_elem_count) + kg * tpc.intermediate_size,
                         (ggml_fp16_t*)config.up_scale +
                             (expert_id * ((config.hidden_size / group_size) * config.intermediate_size) +
                              kg * config.intermediate_size + i * tpc.intermediate_size),
                         (sizeof(ggml_fp16_t) * tpc.intermediate_size));

                  // copy gate/up zeros TP-slicing
                  memcpy(
                      (uint8_t*)tpc.gate_zero + (((expert_id * scales_elem_count) + kg * tpc.intermediate_size) >> 1),
                      (uint8_t*)config.gate_zero +
                          ((expert_id * ((config.hidden_size / group_size) * config.intermediate_size) +
                            kg * config.intermediate_size + i * tpc.intermediate_size) >>
                           1),
                      ((sizeof(uint8_t) * tpc.intermediate_size) >> 1));

                  memcpy((uint8_t*)tpc.up_zero + (((expert_id * scales_elem_count) + kg * tpc.intermediate_size) >> 1),
                         (uint8_t*)config.up_zero +
                             ((expert_id * ((config.hidden_size / group_size) * config.intermediate_size) +
                               kg * config.intermediate_size + i * tpc.intermediate_size) >>
                              1),
                         ((sizeof(uint8_t) * tpc.intermediate_size) >> 1));
                }

                // down weights TP-slicing (column-wise)
                for (size_t col = 0; col < config.hidden_size; col++) {
                  memcpy((uint8_t*)tpc.down_proj + ((expert_id * weight_elem_count + col * tpc.intermediate_size) >> 1),
                         (uint8_t*)config.down_proj + ((expert_id * config.intermediate_size * config.hidden_size +
                                                        col * config.intermediate_size + i * tpc.intermediate_size) >>
                                                       1),
                         (sizeof(uint8_t) * tpc.intermediate_size) >> 1);
                }
              },
              nullptr);
        }
      }

      DO_TPS_LOAD_WEIGHTS(pool);

      for (auto i = 0; i < tp_count; i++) {
        auto& tpc = tps[i]->config_;
        delete[] (uint8_t*)(tpc.gate_proj);
        delete[] (uint8_t*)(tpc.up_proj);
        delete[] (uint8_t*)(tpc.down_proj);

        delete[] (ggml_fp16_t*)(tpc.gate_scale);
        delete[] (ggml_fp16_t*)(tpc.up_scale);
        delete[] (ggml_fp16_t*)(tpc.down_scale);

        delete[] (uint8_t*)(tpc.gate_zero);
        delete[] (uint8_t*)(tpc.up_zero);
        delete[] (uint8_t*)(tpc.down_zero);
      }

      this->weights_loaded = true;
    } else if (config.gate_proj != nullptr) {
      printf("From BF16 Online Quantization.\n");
      fflush(stdout);
      for (auto i = 0; i < tp_count; i++) {
        auto& tpc = tps[i]->config_;
        size_t gate_up_elcount = tpc.intermediate_size * tpc.hidden_size;
        tpc.gate_proj = new ggml_bf16_t[tpc.expert_num * gate_up_elcount];
        tpc.up_proj = new ggml_bf16_t[tpc.expert_num * gate_up_elcount];
        tpc.down_proj = new ggml_bf16_t[tpc.expert_num * gate_up_elcount];
        if (tps[i]->config_.load == false) {
          pool->get_subpool(i)->do_work_stealing_job(
              tpc.expert_num, nullptr,
              [&](int expert_id_) {
                size_t expert_id = expert_map(physical_to_logical_map, expert_id_);
                memcpy((ggml_bf16_t*)tpc.gate_proj + expert_id * gate_up_elcount,
                       (ggml_bf16_t*)config.gate_proj + expert_id * config.intermediate_size * config.hidden_size +
                           i * gate_up_elcount,
                       sizeof(ggml_bf16_t) * gate_up_elcount);
                memcpy((ggml_bf16_t*)tpc.up_proj + expert_id * gate_up_elcount,
                       (ggml_bf16_t*)config.up_proj + expert_id * config.intermediate_size * config.hidden_size +
                           i * gate_up_elcount,
                       sizeof(ggml_bf16_t) * gate_up_elcount);
                for (size_t col = 0; col < config.hidden_size; col++) {
                  memcpy((ggml_bf16_t*)tpc.down_proj + expert_id * tpc.hidden_size * tpc.intermediate_size +
                             col * tpc.intermediate_size,
                         (ggml_bf16_t*)config.down_proj + expert_id * config.intermediate_size * config.hidden_size +
                             col * config.intermediate_size + i * tpc.intermediate_size,
                         sizeof(ggml_bf16_t) * tpc.intermediate_size);
                }
              },
              nullptr);
        }
      }

      DO_TPS_LOAD_WEIGHTS(pool);

      for (auto i = 0; i < tp_count; i++) {
        auto& tpc = tps[i]->config_;
        delete[] (ggml_bf16_t*)(tpc.gate_proj);
        delete[] (ggml_bf16_t*)(tpc.up_proj);
        delete[] (ggml_bf16_t*)(tpc.down_proj);
      }

      this->weights_loaded = true;
    } else if (config.path != "") {
      printf("TP Load from file\n");
      DO_TPS_LOAD_WEIGHTS(pool);
      this->weights_loaded = true;
    } else {
      throw std::runtime_error("no weight source");
    }
  }

  // merge_results is inherited from TP_MOE<AMX_MOE_BASE<K, AMX_AWQ_MOE_TP<K>>>
};

#endif


================================================
FILE: kt-kernel/operators/amx/bf16-moe.hpp
================================================
/**
 * @Description  : BF16 AMX MoE operator for native BF16 inference
 * @Author       : oql, Codex and Claude
 * @Date         : 2026-01-06
 * @Version      : 1.0.0
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 *
 * This file implements BF16 MoE using CRTP pattern, inheriting from moe_base.hpp.
 * BF16 weights are stored without quantization (no scales).
 **/
#ifndef CPUINFER_OPERATOR_AMX_BF16_MOE_H
#define CPUINFER_OPERATOR_AMX_BF16_MOE_H

// #define DEBUG_BF16_MOE

#include "la/amx_kernels.hpp"  // For vec_mul/mat_mul
#include "la/amx_raw_buffers.hpp"
#include "la/amx_raw_kernels.hpp"
#include "la/amx_utils.hpp"  // For transpose_16x16_32bit
#include "moe_base.hpp"

/**
 * @brief BF16 MoE operator using CRTP pattern
 * @tparam T Kernel type, defaults to GemmKernel224BF16
 *
 * This class provides BF16-specific implementations:
 * - do_gate_up_gemm, do_down_gemm: BF16 weight mat mul (no quantization)
 * - load_weights: Load native BF16 weights (no scales)
 */
template <class T = amx::GemmKernel224BF16>
class AMX_BF16_MOE_TP : public AMX_MOE_BASE<T, AMX_BF16_MOE_TP<T>> {
  using Base = AMX_MOE_BASE<T, AMX_BF16_MOE_TP<T>>;
  using Base::config_;
  using Base::down_ba_;
  using Base::down_bb_;
  using Base::down_bc_;
  using Base::gate_bb_;
  using Base::gate_bc_;
  using Base::gate_up_ba_;
  using Base::m_local_num_;
  using Base::tp_part_idx;
  using Base::up_bb_;
  using Base::up_bc_;

 public:
  using typename Base::input_t;
  using typename Base::output_t;

  AMX_BF16_MOE_TP() = default;

  AMX_BF16_MOE_TP(GeneralMOEConfig config, int tp_part_idx_ = 0) : Base(config, tp_part_idx_) {
    // Initialization now happens in derived_init() which is called by base constructor
  }

  void derived_init() {
    // BF16 has no quantization, no need to check quant_config
    printf("Created AMX_BF16_MOE_TP %d at numa %d\n", tp_part_idx, numa_node_of_cpu(sched_getcpu()));
  }

  ~AMX_BF16_MOE_TP() = default;

  // ============================================================================
  // CRTP buffer creation - without group_size
  // ============================================================================

  size_t buffer_a_required_size_impl(size_t m, size_t k) const { return T::BufferA::required_size(m, k); }

  size_t buffer_b_required_size_impl(size_t n, size_t k) const {
    return T::BufferB::required_size(n, k);  // 2 parameters - no group_size
  }

  size_t buffer_c_required_size_impl(size_t m, size_t n) const { return T::BufferC::required_size(m, n); }

  std::shared_ptr<typename T::BufferA> make_buffer_a_impl(size_t m, size_t k, void* data) const {
    return std::make_shared<typename T::BufferA>(m, k, data);
  }

  std::shared_ptr<typename T::BufferB> make_buffer_b_impl(size_t n, size_t k, void* data) const {
    return std::make_shared<typename T::BufferB>(n, k, data);  // 2 parameters - no group_size
  }

  std::shared_ptr<typename T::BufferC> make_buffer_c_impl(size_t m, size_t n, void* data) const {
    return std::make_shared<typename T::BufferC>(m, n, data);
  }

  // ============================================================================
  // CRTP virtual points - GEMM dispatch
  // ============================================================================

  void do_gate_up_gemm(bool do_up, int expert_idx, int ith, int nth, int qlen) {
    int m = m_local_num_[expert_idx];
    auto& ba = gate_up_ba_[expert_idx];
    auto& bb = do_up ? up_bb_[expert_idx] : gate_bb_[expert_idx];
    auto& bc = do_up ? up_bc_[expert_idx] : gate_bc_[expert_idx];

    // Use vec_mul/mat_mul (no group_size)
    if (qlen > 4 * config_.expert_num / config_.num_experts_per_tok) {
      amx::mat_mul(m, config_.intermediate_size, config_.hidden_size, ba, bb, bc, ith, nth);
    } else {
      amx::vec_mul(m, config_.intermediate_size, config_.hidden_size, ba, bb, bc, ith, nth);
    }
  }

  void do_down_gemm(int expert_idx, int ith, int nth, int qlen) {
    int m = m_local_num_[expert_idx];

    if (qlen > 4 * config_.expert_num / config_.num_experts_per_tok) {
      amx::mat_mul(m, config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx], down_bb_[expert_idx],
                   down_bc_[expert_idx], ith, nth);
    } else {
      amx::vec_mul(m, config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx], down_bb_[expert_idx],
                   down_bc_[expert_idx], ith, nth);
    }
  }

#ifdef DEBUG_BF16_MOE
  // Function to dump Buffer B data for debugging
  inline void dump_buffer_b(int expert_idx, const std::string& matrix_type, typename T::BufferB* buffer) {
    printf("[DUMP_BUFFER_B] TP%d BF16 Expert%d %s:\n", tp_part_idx, expert_idx, matrix_type.c_str());

    // Calculate dimensions based on matrix type
    int rows, cols;
    if (matrix_type == "gate" || matrix_type == "up") {
      rows = config_.intermediate_size;
      cols = config_.hidden_size;
    } else {  // down
      rows = config_.hidden_size;
      cols = config_.intermediate_size;
    }

    // Dump BF16 weights
    size_t weight_size = (size_t)rows * cols;
    ggml_bf16_t* weight_ptr = buffer->b;

    printf("  BF16 Weights[first 16]: ");
    for (int i = 0; i < std::min(16, (int)weight_size); i++) {
      printf("%.6f ", ggml_bf16_to_fp32(weight_ptr[i]));
    }
    printf("\n");

    if (weight_size > 16) {
      printf("  BF16 Weights[last 16]: ");
      int start_idx = std::max(0, (int)weight_size - 16);
      for (int i = start_idx; i < (int)weight_size; i++) {
        printf("%.6f ", ggml_bf16_to_fp32(weight_ptr[i]));
      }
      printf("\n");
    }

    printf("  Matrix dimensions: %dx%d (n x k)\n", rows, cols);
  }
#endif

  /**
   * @brief Load BF16 weights from contiguous memory layout
   *
   * Loads weights from config_.gate_proj, up_proj, down_proj (no scales).
   */
  void load_weights() {
    const uint64_t* physical_to_logical_map = (const uint64_t*)config_.physical_to_logical_map;
    auto pool = config_.pool->get_subpool(tp_part_idx);

    if (config_.gate_proj == nullptr) {
      throw std::runtime_error("BF16 MOE requires native BF16 weight.");
    }

    // Load gate + up weights
    int nth = T::recommended_nth(config_.intermediate_size);
    pool->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [this, nth, physical_to_logical_map](int task_id) {
          uint64_t expert_idx = task_id / nth;
          uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
          int ith = task_id % nth;

          // Gate: from BF16 data (no scale)
          gate_bb_[expert_idx]->from_mat(
              (ggml_bf16_t*)config_.gate_proj + (logical_expert_id * config_.intermediate_size * config_.hidden_size),
              ith, nth);  // 3 parameters: (bf16*, ith, nth)

          // Up: same
          up_bb_[expert_idx]->from_mat(
              (ggml_bf16_t*)config_.up_proj + (logical_expert_id * config_.intermediate_size * config_.hidden_size),
              ith, nth);
        },
        nullptr);

    // Load down weights
    nth = T::recommended_nth(config_.hidden_size);
    pool->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [this, nth, physical_to_logical_map](int task_id) {
          uint64_t expert_idx = task_id / nth;
          uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
          int ith = task_id % nth;

          // Down
          down_bb_[expert_idx]->from_mat(
              (ggml_bf16_t*)config_.down_proj + (logical_expert_id * config_.intermediate_size * config_.hidden_size),
              ith, nth);
        },
        nullptr);

#ifdef DEBUG_BF16_MOE
    dump_buffer_b(0, "gate", gate_bb_[0].get());
    dump_buffer_b(0, "down", down_bb_[0].get());
#endif
  }

  // Fast 64-byte (512-bit) memcpy using AVX512
  static inline void fast_memcpy_64(void* __restrict dst, const void* __restrict src) {
    __m512i data = _mm512_loadu_si512(src);
    _mm512_storeu_si512(dst, data);
  }

  // Fast 64-byte non-temporal store (bypass cache for write-only patterns)
  static inline void fast_stream_64(void* __restrict dst, const void* __restrict src) {
    __m512i data = _mm512_loadu_si512(src);
    _mm512_stream_si512((__m512i*)dst, data);
  }

  // Fast memcpy for arbitrary sizes using AVX512
  static inline void fast_memcpy(void* __restrict dst, const void* __restrict src, size_t bytes) {
    uint8_t* d = (uint8_t*)dst;
    const uint8_t* s = (const uint8_t*)src;
    size_t chunks = bytes / 64;
    for (size_t i = 0; i < chunks; i++) {
      fast_memcpy_64(d, s);
      d += 64;
      s += 64;
    }
    bytes -= chunks * 64;
    if (bytes > 0) {
      std::memcpy(d, s, bytes);
    }
  }

  /**
   * @brief Unpack a single N_STEP x K_STEP block from packed BufferB format to n-major format (BF16 version)
   *
   * This is the inverse of the packing done in BufferBBF16Impl::from_mat.
   * BF16 elements are 2 bytes, and the packed format includes 16x16 32-bit transpose.
   *
   * @param src Pointer to packed data (N_STEP * K_STEP * 2 bytes in packed layout)
   * @param dst Pointer to destination in n-major layout
   * @param dst_row_stride Row stride in destination buffer (number of BF16 elements per row)
   */
  static inline void unpack_nk_block_bf16(const ggml_bf16_t* src, ggml_bf16_t* dst, size_t dst_row_stride) {
    constexpr int N_STEP = T::N_STEP;  // 32
    constexpr int K_STEP = T::K_STEP;  // 32
    constexpr int TILE_N = T::TILE_N;  // 16

    // The packed format has two 16x16 blocks (32-bit view) that were transposed
    // We need to reverse the transpose first, then copy to n-major layout

    // Create aligned temporary buffers for transpose
    alignas(64) __m512i temp_block1[TILE_N];
    alignas(64) __m512i temp_block2[TILE_N];

    // Copy source data to temporary buffers
    const __m512i* src_vec = reinterpret_cast<const __m512i*>(src);
    for (int i = 0; i < TILE_N; i++) {
      temp_block1[i] = src_vec[i];
      temp_block2[i] = src_vec[TILE_N + i];
    }

    // Reverse transpose (transpose is self-inverse)
    amx::transpose_16x16_32bit(temp_block1);
    amx::transpose_16x16_32bit(temp_block2);

    // Copy transposed data to destination in n-major layout using non-temporal stores
    // First 16 rows (block 1)
    for (int i = 0; i < TILE_N; i++) {
      fast_stream_64(dst + i * dst_row_stride, &temp_block1[i]);
    }

    // Next 16 rows (block 2)
    for (int i = 0; i < TILE_N; i++) {
      fast_stream_64(dst + (TILE_N + i) * dst_row_stride, &temp_block2[i]);
    }

    // Ensure all stores complete before returning
    _mm_sfence();
  }

  /**
   * @brief Reconstruct weights for a single expert to the output buffers
   *
   * Directly unpacks from packed BufferB format to n-major GPU buffers without intermediate storage.
   * BF16 version - no scales needed.
   *
   * @param gpu_tp_count Number of GPU TP parts (1, 2, 4, or 8)
   * @param cpu_tp_count Number of CPU TP parts
   * @param expert_id Expert index to process
   * @param full_config Full configuration (before CPU TP split)
   * @param w13_weight_ptrs Pointers to gate+up weight buffers (one per GPU TP)
   * @param w13_scale_ptrs Pointers to gate+up scale buffers (unused for BF16, kept for interface compatibility)
   * @param w2_weight_ptrs Pointers to down weight buffers (one per GPU TP)
   * @param w2_scale_ptrs Pointers to down scale buffers (unused for BF16, kept for interface compatibility)
   */
  void write_weights_to_buffer(int gpu_tp_count, [[maybe_unused]] int cpu_tp_count, int expert_id,
                               const GeneralMOEConfig& full_config, const std::vector<uintptr_t>& w13_weight_ptrs,
                               [[maybe_unused]] const std::vector<uintptr_t>& w13_scale_ptrs,
                               const std::vector<uintptr_t>& w2_weight_ptrs,
                               [[maybe_unused]] const std::vector<uintptr_t>& w2_scale_ptrs) const {
    auto& config = config_;
    auto pool = config.pool->get_subpool(tp_part_idx);

    constexpr int N_STEP = T::N_STEP;
    constexpr int K_STEP = T::K_STEP;
    constexpr int N_BLOCK = T::N_BLOCK;
    constexpr int K_BLOCK = T::K_BLOCK;

    // ========= W13 (gate+up): Shape [intermediate, hidden], split by N only =========
    const int cpu_n_w13 = config.intermediate_size;
    const int cpu_k_w13 = config.hidden_size;
    const int gpu_n_w13 = full_config.intermediate_size / gpu_tp_count;
    const int gpu_k_w13 = full_config.hidden_size;
    const int global_n_offset_w13 = tp_part_idx * cpu_n_w13;

    const size_t gpu_w13_weight_per_mat = (size_t)gpu_n_w13 * gpu_k_w13;

    // ========= W2 (down): Shape [hidden, intermediate], split by K =========
    const int cpu_n_w2 = config.hidden_size;
    const int cpu_k_w2 = config.intermediate_size;
    const int gpu_n_w2 = full_config.hidden_size;
    const int gpu_k_w2 = full_config.intermediate_size / gpu_tp_count;
    const int global_k_offset_w2 = tp_part_idx * cpu_k_w2;

    // ========= Optimized job layout =========
    constexpr int NUM_W13_TASKS = 32;  // Per matrix (gate or up), total 64 for w13
    constexpr int NUM_W2_TASKS = 32;   // For down matrix

    const int total_tasks = NUM_W13_TASKS * 2 + NUM_W2_TASKS;

    // Calculate N_STEP blocks per task
    const int w13_n_steps = div_up(cpu_n_w13, N_STEP);
    const int w13_steps_per_task = div_up(w13_n_steps, NUM_W13_TASKS);
    const int w2_n_steps = div_up(cpu_n_w2, N_STEP);
    const int w2_steps_per_task = div_up(w2_n_steps, NUM_W2_TASKS);

    pool->do_work_stealing_job(
        total_tasks, nullptr,
        [=, &w13_weight_ptrs, &w2_weight_ptrs, this](int task_id) {
          if (task_id < NUM_W13_TASKS * 2) {
            // ========= W13 weight task: process chunk of rows x full K =========
            const bool is_up = task_id >= NUM_W13_TASKS;
            const int chunk_idx = task_id % NUM_W13_TASKS;
            const auto& bb = is_up ? up_bb_[expert_id] : gate_bb_[expert_id];

            const int step_start = chunk_idx * w13_steps_per_task;
            const int step_end = std::min(step_start + w13_steps_per_task, w13_n_steps);
            if (step_start >= w13_n_steps) return;
            const int chunk_n_start = step_start * N_STEP;
            const int chunk_n_end = std::min(step_end * N_STEP, cpu_n_w13);

            for (int local_n_start = chunk_n_start; local_n_start < chunk_n_end; local_n_start += N_STEP) {
              const int global_n = global_n_offset_w13 + local_n_start;
              const int target_gpu = global_n / gpu_n_w13;
              const int n_in_gpu = global_n % gpu_n_w13;

              ggml_bf16_t* weight_base = (ggml_bf16_t*)w13_weight_ptrs[target_gpu];
              const size_t expert_weight_off = is_up ? gpu_w13_weight_per_mat : 0;

              const int n_block_idx = local_n_start / N_BLOCK;
              const int n_block_begin = n_block_idx * N_BLOCK;
              const int n_block_size = std::min(N_BLOCK, cpu_n_w13 - n_block_begin);
              const int n_in_block = local_n_start - n_block_begin;

              for (int k_block_begin = 0; k_block_begin < cpu_k_w13; k_block_begin += K_BLOCK) {
                const int k_block_size = std::min(K_BLOCK, cpu_k_w13 - k_block_begin);

                for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
                  const ggml_bf16_t* src = bb->b + (size_t)n_block_begin * cpu_k_w13 +
                                           (size_t)k_block_begin * n_block_size + (size_t)n_in_block * k_block_size +
                                           (size_t)k_begin * N_STEP;
                  ggml_bf16_t* dst =
                      weight_base + expert_weight_off + (size_t)n_in_gpu * gpu_k_w13 + k_block_begin + k_begin;
                  unpack_nk_block_bf16(src, dst, gpu_k_w13);
                }
              }
            }

          } else {
            // ========= W2 weight task: process chunk of rows x all K slices =========
            const int chunk_idx = task_id - NUM_W13_TASKS * 2;
            const auto& bb = down_bb_[expert_id];

            const int step_start = chunk_idx * w2_steps_per_task;
            const int step_end = std::min(step_start + w2_steps_per_task, w2_n_steps);
            if (step_start >= w2_n_steps) return;
            const int chunk_n_start = step_start * N_STEP;
            const int chunk_n_end = std::min(step_end * N_STEP, cpu_n_w2);

            for (int local_n_start = chunk_n_start; local_n_start < chunk_n_end; local_n_start += N_STEP) {
              const int n_block_idx = local_n_start / N_BLOCK;
              const int n_block_begin = n_block_idx * N_BLOCK;
              const int n_block_size = std::min(N_BLOCK, cpu_n_w2 - n_block_begin);
              const int n_in_block = local_n_start - n_block_begin;

              for (int k_slice_start = 0; k_slice_start < cpu_k_w2; k_slice_start += gpu_k_w2) {
                const int k_slice_end = std::min(k_slice_start + gpu_k_w2, cpu_k_w2);

                const int global_k_start = global_k_offset_w2 + k_slice_start;
                const int target_gpu = global_k_start / gpu_k_w2;
                const int k_in_gpu_base = global_k_start % gpu_k_w2;

                ggml_bf16_t* weight_base = (ggml_bf16_t*)w2_weight_ptrs[target_gpu];

                for (int k_abs = k_slice_start; k_abs < k_slice_end; k_abs += K_STEP) {
                  const int k_block_idx = k_abs / K_BLOCK;
                  const int k_block_begin = k_block_idx * K_BLOCK;
                  const int k_block_size = std::min(K_BLOCK, cpu_k_w2 - k_block_begin);
                  const int k_in_block = k_abs - k_block_begin;
                  const int k_in_gpu = k_in_gpu_base + (k_abs - k_slice_start);

                  const ggml_bf16_t* src = bb->b + (size_t)n_block_begin * cpu_k_w2 +
                                           (size_t)k_block_begin * n_block_size + (size_t)n_in_block * k_block_size +
                                           (size_t)k_in_block * N_STEP;
                  ggml_bf16_t* dst = weight_base + (size_t)local_n_start * gpu_k_w2 + k_in_gpu;
                  unpack_nk_block_bf16(src, dst, gpu_k_w2);
                }
              }
            }
          }
        },
        nullptr);
  }
};

template <typename K>
class TP_MOE<AMX_BF16_MOE_TP<K>> : public TP_MOE<AMX_MOE_BASE<K, AMX_BF16_MOE_TP<K>>> {
 public:
  using Base = TP_MOE<AMX_MOE_BASE<K, AMX_BF16_MOE_TP<K>>>;
  using Base::Base;

  void load_weights() override {
    auto& config = this->config;
    auto& tps = this->tps;
    auto& tp_count = this->tp_count;
    auto pool = config.pool;
    const uint64_t* physical_to_logical_map = (const uint64_t*)config.physical_to_logical_map;

    // BF16 has no quantization check needed
    if (config.gate_projs.empty() && config.gate_proj == nullptr) {
      throw std::runtime_error("no weight source");
    }

    const bool use_per_expert_ptrs = !config.gate_projs.empty();
    const size_t full_weight_elems = (size_t)config.intermediate_size * config.hidden_size;

    pool->dispense_backend()->do_numa_job([&, this](int i) {
      auto& tpc = tps[i]->config_;
      const size_t tp_weight_elems = (size_t)tpc.intermediate_size * tpc.hidden_size;

      // Allocate BF16 weights (2 bytes/element)
      tpc.gate_proj = new ggml_bf16_t[tpc.expert_num * tp_weight_elems];
      tpc.up_proj = new ggml_bf16_t[tpc.expert_num * tp_weight_elems];
      tpc.down_proj = new ggml_bf16_t[tpc.expert_num * tp_weight_elems];

      const size_t tp_idx = (size_t)i;
      const size_t gate_up_weight_src_offset = i * tp_weight_elems;
      const size_t down_weight_src_col_offset = i * (size_t)tpc.intermediate_size;

      pool->get_subpool(i)->do_work_stealing_job(
          tpc.expert_num, nullptr,
          [&, &tpc](int expert_id_) {
            const size_t expert_id = expert_map(physical_to_logical_map, expert_id_);

            ggml_bf16_t* gate_dst = (ggml_bf16_t*)tpc.gate_proj + expert_id * tp_weight_elems;
            ggml_bf16_t* up_dst = (ggml_bf16_t*)tpc.up_proj + expert_id * tp_weight_elems;
            ggml_bf16_t* down_dst = (ggml_bf16_t*)tpc.down_proj + expert_id * tp_weight_elems;

            const ggml_bf16_t* gate_src;
            const ggml_bf16_t* up_src;
            const ggml_bf16_t* down_src;

            if (use_per_expert_ptrs) {
              gate_src = (const ggml_bf16_t*)config.gate_projs[0][expert_id] + gate_up_weight_src_offset;
              up_src = (const ggml_bf16_t*)config.up_projs[0][expert_id] + gate_up_weight_src_offset;
              down_src = (const ggml_bf16_t*)config.down_projs[0][expert_id];
            } else {
              gate_src =
                  (const ggml_bf16_t*)config.gate_proj + expert_id * full_weight_elems + gate_up_weight_src_offset;
              up_src = (const ggml_bf16_t*)config.up_proj + expert_id * full_weight_elems + gate_up_weight_src_offset;
              down_src = (const ggml_bf16_t*)config.down_proj + expert_id * full_weight_elems;
            }

            // Copy gate and up weights
            std::memcpy(gate_dst, gate_src, tp_weight_elems * sizeof(ggml_bf16_t));
            std::memcpy(up_dst, up_src, tp_weight_elems * sizeof(ggml_bf16_t));

            // Copy down weights (row-wise split)
            for (int row = 0; row < config.hidden_size; row++) {
              const size_t src_row_offset = (size_t)row * (size_t)config.intermediate_size + down_weight_src_col_offset;
              const size_t dst_row_offset = (size_t)row * (size_t)tpc.intermediate_size;
              std::memcpy(down_dst + dst_row_offset, down_src + src_row_offset,
                          (size_t)tpc.intermediate_size * sizeof(ggml_bf16_t));
            }
          },
          nullptr);
    });

    DO_TPS_LOAD_WEIGHTS(pool);

    pool->dispense_backend()->do_numa_job([&, this](int i) {
      auto& tpc = tps[i]->config_;
      delete[] (ggml_bf16_t*)tpc.gate_proj;
      delete[] (ggml_bf16_t*)tpc.up_proj;
      delete[] (ggml_bf16_t*)tpc.down_proj;
    });

    this->weights_loaded = true;
  }

  /**
   * @brief Write weights to GPU buffer for all TP parts
   *
   * BF16 version - no scales needed, scale_ptrs parameters are kept for interface compatibility.
   */
  void write_weight_scale_to_buffer(int gpu_tp_count, int expert_id, const std::vector<uintptr_t>& w13_weight_ptrs,
                                    const std::vector<uintptr_t>& w13_scale_ptrs,
                                    const std::vector<uintptr_t>& w2_weight_ptrs,
                                    const std::vector<uintptr_t>& w2_scale_ptrs) {
    if (this->weights_loaded == false) {
      throw std::runtime_error("Not Loaded");
    }
    if (this->tps.empty()) {
      throw std::runtime_error("No TP parts initialized");
    }
    if ((int)w13_weight_ptrs.size() != gpu_tp_count || (int)w2_weight_ptrs.size() != gpu_tp_count) {
      throw std::runtime_error("Weight pointer arrays size must match gpu_tp_count");
    }

    this->config.pool->dispense_backend()->do_numa_job([&, this](int i) {
      this->tps[i]->write_weights_to_buffer(gpu_tp_count, this->tp_count, expert_id, this->config, w13_weight_ptrs,
                                            w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs);
    });
  }
};

#endif  // CPUINFER_OPERATOR_AMX_BF16_MOE_H


================================================
FILE: kt-kernel/operators/amx/fp8-moe.hpp
================================================
/**
 * @Description  : FP8 AMX MoE operator for DeepSeek V3.2 native inference
 * @Author       : oql, Codex and Claude
 * @Date         : 2025-12-09
 * @Version      : 1.0.0
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 *
 * This file implements FP8 MoE using CRTP pattern, inheriting from moe_base.hpp.
 * FP8 weights are stored with 128x128 block-wise scales.
 **/
#ifndef CPUINFER_OPERATOR_AMX_FP8_MOE_H
#define CPUINFER_OPERATOR_AMX_FP8_MOE_H

// #define DEBUG_FP8_MOE

#include "la/amx_raw_buffers.hpp"
#include "la/amx_raw_kernels.hpp"
#include "moe_base.hpp"

/**
 * @brief FP8 MoE operator using CRTP pattern
 * @tparam T Kernel type, defaults to GemmKernel224FP8
 *
 * This class provides FP8-specific implementations:
 * - do_gate_up_gemm, do_down_gemm : FP8 weight -> BF16 conversion mat mul
 * - load_weights: Load FP8 weights with 128x128 block scales
 */
template <class T = amx::GemmKernel224FP8>
class AMX_FP8_MOE_TP : public AMX_MOE_BASE<T, AMX_FP8_MOE_TP<T>> {
  using Base = AMX_MOE_BASE<T, AMX_FP8_MOE_TP<T>>;
  using Base::config_;
  using Base::down_ba_;
  using Base::down_bb_;
  using Base::down_bc_;
  using Base::gate_bb_;
  using Base::gate_bc_;
  using Base::gate_up_ba_;
  using Base::m_local_num_;
  using Base::tp_part_idx;
  using Base::up_bb_;
  using Base::up_bc_;

 public:
  using typename Base::input_t;
  using typename Base::output_t;

  AMX_FP8_MOE_TP() = default;

  AMX_FP8_MOE_TP(GeneralMOEConfig config, int tp_part_idx_ = 0) : Base(config, tp_part_idx_) {
    // Initialization now happens in derived_init() which is called by base constructor
  }

  void derived_init() {
    auto& quant_config = config_.quant_config;
    if (quant_config.group_size == 0 || quant_config.zero_point) {
      throw std::runtime_error("KT-Kernel fp8 MoE only support block-wise FP8");
    }
    printf("Created AMX_FP8_MOE_TP %d at numa %d\n", tp_part_idx, numa_node_of_cpu(sched_getcpu()));
  }

  ~AMX_FP8_MOE_TP() = default;
  // ============================================================================
  // CRTP buffer creation - with group_size
  // ============================================================================

  size_t buffer_a_required_size_impl(size_t m, size_t k) const { return T::BufferA::required_size(m, k); }
  size_t buffer_b_required_size_impl(size_t n, size_t k) const {
    return T::BufferB::required_size(n, k, config_.quant_config.group_size);
  }
  size_t buffer_c_required_size_impl(size_t m, size_t n) const { return T::BufferC::required_size(m, n); }

  std::shared_ptr<typename T::BufferA> make_buffer_a_impl(size_t m, size_t k, void* data) const {
    return std::make_shared<typename T::BufferA>(m, k, data);
  }
  std::shared_ptr<typename T::BufferB> make_buffer_b_impl(size_t n, size_t k, void* data) const {
    return std::make_shared<typename T::BufferB>(n, k, config_.quant_config.group_size, data);
  }
  std::shared_ptr<typename T::BufferC> make_buffer_c_impl(size_t m, size_t n, void* data) const {
    return std::make_shared<typename T::BufferC>(m, n, data);
  }

  // ============================================================================
  // CRTP virtual points - GEMM dispatch
  // ============================================================================

  void do_gate_up_gemm(bool do_up, int expert_idx, int ith, int nth, int qlen) {
    auto& group_size = config_.quant_config.group_size;
    int m = m_local_num_[expert_idx];
    auto& ba = gate_up_ba_[expert_idx];
    auto& bb = do_up ? up_bb_[expert_idx] : gate_bb_[expert_idx];
    auto& bc = do_up ? up_bc_[expert_idx] : gate_bc_[expert_idx];

    amx::vec_mul_kgroup(m, config_.intermediate_size, config_.hidden_size, group_size, ba, bb, bc, ith, nth);
  }
  void do_down_gemm(int expert_idx, int ith, int nth, int qlen) {
    auto& group_size = config_.quant_config.group_size;
    int m = m_local_num_[expert_idx];

    amx::vec_mul_kgroup(m, config_.hidden_size, config_.intermediate_size, group_size, down_ba_[expert_idx],
                        down_bb_[expert_idx], down_bc_[expert_idx], ith, nth);
  }

#ifdef DEBUG_FP8_MOE
  // Function to dump Buffer B data for debugging FP8 quantization results
  inline void dump_buffer_b(const std::string& quantization_type, int expert_idx, const std::string& matrix_type,
                            typename T::BufferB* buffer) {
    auto& quant_config = config_.quant_config;
    int& group_size = quant_config.group_size;

    printf("[DUMP_BUFFER_B] TP%d %s Expert%d %s:\n", tp_part_idx, quantization_type.c_str(), expert_idx,
           matrix_type.c_str());

    // Calculate dimensions based on matrix type
    int rows, cols;
    size_t scale_elem_count;
    if (matrix_type == "gate" || matrix_type == "up") {
      rows = config_.intermediate_size;
      cols = config_.hidden_size;
    } else {  // down
      rows = config_.hidden_size;
      cols = config_.intermediate_size;
    }
    int n_blocks_n = (rows + group_size - 1) / group_size;
    int n_blocks_k = (cols + group_size - 1) / group_size;
    scale_elem_count = n_blocks_n * n_blocks_k;

    // Dump scales (as BF16 converted to float)
    printf("  Scales[first 16]: ");
    for (int i = 0; i < std::min(16, (int)scale_elem_count); i++) {
      printf("%.6f ", buffer->d[i]);
    }
    printf("\n");

    if (scale_elem_count > 16) {
      printf("  Scales[last 16]: ");
      int start_idx = std::max(0, (int)scale_elem_count - 16);
      for (int i = start_idx; i < (int)scale_elem_count; i++) {
        printf("%.6f ", buffer->d[i]);
      }
      printf("\n");
    }

    // Dump FP8 weights (as hex uint8)
    size_t weight_size = (size_t)rows * cols;  // FP8 is 1 byte per element
    uint8_t* weight_ptr = (uint8_t*)buffer->b;

    printf("  FP8 Weights[first 32 bytes]: ");
    for (int i = 0; i < std::min(32, (int)weight_size); i++) {
      printf("%02x ", weight_ptr[i]);
    }
    printf("\n");

    if (weight_size > 32) {
      printf("  FP8 Weights[last 32 bytes]: ");
      int start_idx = std::max(32, (int)weight_size - 32);
      for (int i = start_idx; i < (int)weight_size; i++) {
        printf("%02x ", weight_ptr[i]);
      }
      printf("\n");
    }

    printf("  Matrix dimensions: %dx%d (n x k), Scale blocks: %dx%d, Group size: %d, Scale elements: %zu\n", rows, cols,
           n_blocks_n, n_blocks_k, group_size, scale_elem_count);
  }
#endif

  /**
   * @brief Load FP8 weights from contiguous memory layout
   *
   * Loads weights from config_.gate_proj, up_proj, down_proj with scales
   * from config_.gate_scale, up_scale, down_scale.
   */
  void load_weights() {
    auto& quant_config = config_.quant_config;
    int& group_size = quant_config.group_size;
    const uint64_t* physical_to_logical_map = (const uint64_t*)config_.physical_to_logical_map;
    auto pool = config_.pool->get_subpool(tp_part_idx);

    if (config_.gate_scale == nullptr) {
      throw std::runtime_error("FP8 AVX MOE only support native weight.");
    }

    // load weight
    int nth = T::recommended_nth(config_.intermediate_size);
    pool->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [this, nth, physical_to_logical_map, group_size](int task_id) {
          uint64_t expert_idx = task_id / nth;
          uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
          int ith = task_id % nth;
          // gate part
          gate_bb_[expert_idx]->from_mat(
              (uint8_t*)config_.gate_proj + (logical_expert_id * config_.intermediate_size * config_.hidden_size),
              (float*)config_.gate_scale +
                  (logical_expert_id * (config_.hidden_size / group_size) * (config_.intermediate_size / group_size)),
              ith, nth);
          // up part
          up_bb_[expert_idx]->from_mat(
              (uint8_t*)config_.up_proj + (logical_expert_id * config_.intermediate_size * config_.hidden_size),
              (float*)config_.up_scale +
                  (logical_expert_id * (config_.hidden_size / group_size) * (config_.intermediate_size / group_size)),
              ith, nth);
        },
        nullptr);

    nth = T::recommended_nth(config_.hidden_size);
    pool->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [this, nth, physical_to_logical_map, group_size](int task_id) {
          uint64_t expert_idx = task_id / nth;
          uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
          int ith = task_id % nth;
          // down part
          down_bb_[expert_idx]->from_mat(
              (uint8_t*)config_.down_proj + (logical_expert_id * config_.intermediate_size * config_.hidden_size),
              (float*)config_.down_scale +
                  (logical_expert_id * (config_.hidden_size / group_size) * (config_.intermediate_size / group_size)),
              ith, nth);
        },
        nullptr);
#ifdef DEBUG_FP8_MOE
    dump_buffer_b("Native FP8", 0, "gate", gate_bb_[0].get());
    dump_buffer_b("Native FP8", 0, "down", down_bb_[0].get());
#endif
  }

  // Fast 64-byte (512-bit) memcpy using AVX512
  static inline void fast_memcpy_64(void* __restrict dst, const void* __restrict src) {
    __m512i data = _mm512_loadu_si512(src);
    _mm512_storeu_si512(dst, data);
  }

  // Fast memcpy for arbitrary sizes using AVX512
  static inline void fast_memcpy(void* __restrict dst, const void* __restrict src, size_t bytes) {
    uint8_t* d = (uint8_t*)dst;
    const uint8_t* s = (const uint8_t*)src;
    size_t chunks = bytes / 64;
    for (size_t i = 0; i < chunks; i++) {
      fast_memcpy_64(d, s);
      d += 64;
      s += 64;
    }
    bytes -= chunks * 64;
    if (bytes > 0) {
      std::memcpy(d, s, bytes);
    }
  }

  /**
   * @brief Unpack a single N_STEP x K_STEP block from packed BufferB format to n-major format
   *
   * This is the inverse of the packing done in BufferBFP8Impl::from_mat.
   * Optimized with AVX512 gather for efficient non-contiguous reads.
   *
   * @param src Pointer to packed data (N_STEP * K_STEP bytes in packed layout)
   * @param dst Pointer to destination in n-major layout
   * @param dst_row_stride Row stride in destination buffer (number of columns in full matrix)
   */
  static inline void unpack_nk_block(const uint8_t* src, uint8_t* dst, size_t dst_row_stride) {
    // row_map[packed_i] gives the base row for packed index packed_i
    static constexpr int row_map[8] = {0, 16, 4, 20, 8, 24, 12, 28};
    const uint64_t* src64 = reinterpret_cast<const uint64_t*>(src);

    // Gather indices: src64[8*j + packed_i] for j = 0..7
    // Offsets in uint64 units: 0, 8, 16, 24, 32, 40, 48, 56 (+ packed_i for each group)
    const __m512i gather_offsets = _mm512_set_epi64(56, 48, 40, 32, 24, 16, 8, 0);

    // Process each packed group (8 groups of 4 rows each = 32 rows total)
    for (int packed_i = 0; packed_i < 8; packed_i++) {
      const int base_row = row_map[packed_i];
      const uint64_t* base_src = src64 + packed_i;

      // Gather 8 values for j=0..7 and j=8..15
      __m512i vals_0_7 = _mm512_i64gather_epi64(gather_offsets, base_src, 8);
      __m512i vals_8_15 = _mm512_i64gather_epi64(gather_offsets, base_src + 64, 8);

      // Extract 4 rows from each set of 8 values
      // Row 0: bits 0-15
      __m128i row0_lo = _mm512_cvtepi64_epi16(_mm512_and_si512(vals_0_7, _mm512_set1_epi64(0xFFFF)));
      __m128i row0_hi = _mm512_cvtepi64_epi16(_mm512_and_si512(vals_8_15, _mm512_set1_epi64(0xFFFF)));
      // Row 1: bits 16-31
      __m128i row1_lo =
          _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(vals_0_7, 16), _mm512_set1_epi64(0xFFFF)));
      __m128i row1_hi =
          _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(vals_8_15, 16), _mm512_set1_epi64(0xFFFF)));
      // Row 2: bits 32-47
      __m128i row2_lo =
          _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(vals_0_7, 32), _mm512_set1_epi64(0xFFFF)));
      __m128i row2_hi =
          _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(vals_8_15, 32), _mm512_set1_epi64(0xFFFF)));
      // Row 3: bits 48-63
      __m128i row3_lo = _mm512_cvtepi64_epi16(_mm512_srli_epi64(vals_0_7, 48));
      __m128i row3_hi = _mm512_cvtepi64_epi16(_mm512_srli_epi64(vals_8_15, 48));

      // Store 32 bytes (16 x uint16) to each row
      // Combine two 128-bit values into 256-bit for more efficient stores
      uint8_t* row0_dst = dst + (size_t)base_row * dst_row_stride;
      uint8_t* row1_dst = dst + (size_t)(base_row + 1) * dst_row_stride;
      uint8_t* row2_dst = dst + (size_t)(base_row + 2) * dst_row_stride;
      uint8_t* row3_dst = dst + (size_t)(base_row + 3) * dst_row_stride;

      // Combine lo and hi into 256-bit and store
      __m256i row0_256 = _mm256_set_m128i(row0_hi, row0_lo);
      __m256i row1_256 = _mm256_set_m128i(row1_hi, row1_lo);
      __m256i row2_256 = _mm256_set_m128i(row2_hi, row2_lo);
      __m256i row3_256 = _mm256_set_m128i(row3_hi, row3_lo);

      _mm256_storeu_si256((__m256i*)row0_dst, row0_256);
      _mm256_storeu_si256((__m256i*)row1_dst, row1_256);
      _mm256_storeu_si256((__m256i*)row2_dst, row2_256);
      _mm256_storeu_si256((__m256i*)row3_dst, row3_256);
    }
  }

  /**
   * @brief Unpack 4 consecutive N_STEP x K_STEP blocks to maximize cache line utilization
   *
   * Processing 4 blocks together means each row write is 128 bytes = 2 cache lines,
   * which greatly improves write efficiency compared to 32 bytes per row.
   *
   * @param src Array of 4 source pointers (each pointing to a 32x32 packed block)
   * @param dst Destination pointer in n-major layout
   * @param dst_row_stride Row stride in destination buffer
   */
  static inline void unpack_4nk_blocks(const uint8_t* src[4], uint8_t* dst, size_t dst_row_stride) {
    static constexpr int row_map[8] = {0, 16, 4, 20, 8, 24, 12, 28};
    constexpr int K_STEP = T::K_STEP;  // 32

    // Reinterpret as uint64 arrays for efficient access
    const uint64_t* src0 = reinterpret_cast<const uint64_t*>(src[0]);
    const uint64_t* src1 = reinterpret_cast<const uint64_t*>(src[1]);
    const uint64_t* src2 = reinterpret_cast<const uint64_t*>(src[2]);
    const uint64_t* src3 = reinterpret_cast<const uint64_t*>(src[3]);

    // Process all 32 rows, writing 128 bytes (4 x 32) per row
    for (int packed_i = 0; packed_i < 8; packed_i++) {
      const int base_row = row_map[packed_i];

      // Process 4 rows at a time
      for (int r = 0; r < 4; r++) {
        uint16_t* row_dst = reinterpret_cast<uint16_t*>(dst + (size_t)(base_row + r) * dst_row_stride);
        const int shift = r * 16;

        // Unroll: process all 4 blocks x 16 columns = 64 uint16 values
        // Block 0: columns 0-15
        for (int j = 0; j < 16; j++) {
          row_dst[j] = static_cast<uint16_t>(src0[8 * j + packed_i] >> shift);
        }
        // Block 1: columns 16-31
        for (int j = 0; j < 16; j++) {
          row_dst[16 + j] = static_cast<uint16_t>(src1[8 * j + packed_i] >> shift);
        }
        // Block 2: columns 32-47
        for (int j = 0; j < 16; j++) {
          row_dst[32 + j] = static_cast<uint16_t>(src2[8 * j + packed_i] >> shift);
        }
        // Block 3: columns 48-63
        for (int j = 0; j < 16; j++) {
          row_dst[48 + j] = static_cast<uint16_t>(src3[8 * j + packed_i] >> shift);
        }
      }
    }
  }

  /**
   * @brief Reconstruct weights for a single expert to the output buffers (no temp buffer version)
   *
   * Directly unpacks from packed BufferB format to n-major GPU buffers without intermediate storage.
   * Optimized version with coarse-grained task splitting for better cache utilization.
   *
   * Key optimizations:
   * - Reduced task count (~40 vs ~350) to minimize scheduling overhead
   * - Larger chunks per task for better cache line utilization
   * - Process multiple N_STEPs per task for better write locality
   *
   * @param gpu_tp_count Number of GPU TP parts (1, 2, 4, or 8)
   * @param cpu_tp_count Number of CPU TP parts
   * @param expert_id Expert index to process
   * @param full_config Full configuration (before CPU TP split)
   * @param w13_weight_ptrs Pointers to gate+up weight buffers (one per GPU TP)
   * @param w13_scale_ptrs Pointers to gate+up scale buffers (one per GPU TP)
   * @param w2_weight_ptrs Pointers to down weight buffers (one per GPU TP)
   * @param w2_scale_ptrs Pointers to down scale buffers (one per GPU TP)
   */
  void write_weights_to_buffer(int gpu_tp_count, [[maybe_unused]] int cpu_tp_count, int expert_id,
                               const GeneralMOEConfig& full_config, const std::vector<uintptr_t>& w13_weight_ptrs,
                               const std::vector<uintptr_t>& w13_scale_ptrs,
                               const std::vector<uintptr_t>& w2_weight_ptrs,
                               const std::vector<uintptr_t>& w2_scale_ptrs) const {
    auto& config = config_;
    const int group_size = config.quant_config.group_size;
    auto pool = config.pool->get_subpool(tp_part_idx);

    constexpr int N_STEP = T::N_STEP;
    constexpr int K_STEP = T::K_STEP;
    constexpr int N_BLOCK = T::N_BLOCK;
    constexpr int K_BLOCK = T::K_BLOCK;

    // ========= W13 (gate+up): Shape [intermediate, hidden], split by N only =========
    const int cpu_n_w13 = config.intermediate_size;
    const int cpu_k_w13 = config.hidden_size;
    const int gpu_n_w13 = full_config.intermediate_size / gpu_tp_count;
    const int gpu_k_w13 = full_config.hidden_size;
    const int global_n_offset_w13 = tp_part_idx * cpu_n_w13;

    const size_t gpu_w13_weight_per_mat = (size_t)gpu_n_w13 * gpu_k_w13;
    const size_t gpu_w13_scale_per_mat = (size_t)div_up(gpu_n_w13, group_size) * div_up(gpu_k_w13, group_size);
    const int cpu_scale_k_blocks_w13 = div_up(cpu_k_w13, group_size);
    const int gpu_scale_k_blocks_w13 = div_up(gpu_k_w13, group_size);

    // ========= W2 (down): Shape [hidden, intermediate], split by K =========
    const int cpu_n_w2 = config.hidden_size;
    const int cpu_k_w2 = config.intermediate_size;
    const int gpu_n_w2 = full_config.hidden_size;
    const int gpu_k_w2 = full_config.intermediate_size / gpu_tp_count;
    const int global_k_offset_w2 = tp_part_idx * cpu_k_w2;

    const size_t gpu_w2_weight_per_mat = (size_t)gpu_n_w2 * gpu_k_w2;
    const size_t gpu_w2_scale_per_mat = (size_t)div_up(gpu_n_w2, group_size) * div_up(gpu_k_w2, group_size);
    const int cpu_scale_k_blocks_w2 = div_up(cpu_k_w2, group_size);
    const int gpu_scale_k_blocks_w2 = div_up(gpu_k_w2, group_size);

    // ========= Scale dimensions =========
    const int cpu_scale_n_blocks_w13 = div_up(cpu_n_w13, group_size);
    const int gpu_scale_n_blocks_w13 = div_up(gpu_n_w13, group_size);
    const int cpu_scale_n_blocks_w2 = div_up(cpu_n_w2, group_size);

    // ========= Optimized job layout =========
    // Use task count slightly above CPU core count for good work stealing
    // For 80-core system, ~100 tasks provides good balance
    constexpr int NUM_W13_TASKS = 32;  // Per matrix (gate or up), total 64 for w13
    constexpr int NUM_W2_TASKS = 32;   // For down matrix
    constexpr int SCALE_TASKS = 3;     // gate_scale, up_scale, down_scale

    const int total_tasks = NUM_W13_TASKS * 2 + NUM_W2_TASKS + SCALE_TASKS;

    // Calculate N_STEP blocks per task (must be N_STEP aligned for correct BufferB addressing)
    const int w13_n_steps = div_up(cpu_n_w13, N_STEP);
    const int w13_steps_per_task = div_up(w13_n_steps, NUM_W13_TASKS);
    const int w2_n_steps = div_up(cpu_n_w2, N_STEP);
    const int w2_steps_per_task = div_up(w2_n_steps, NUM_W2_TASKS);

    pool->do_work_stealing_job(
        total_tasks, nullptr,
        [=, &w13_weight_ptrs, &w13_scale_ptrs, &w2_weight_ptrs, &w2_scale_ptrs, this](int task_id) {
          if (task_id < NUM_W13_TASKS * 2) {
            // ========= W13 weight task: process chunk of rows x full K =========
            const bool is_up = task_id >= NUM_W13_TASKS;
            const int chunk_idx = task_id % NUM_W13_TASKS;
            const auto& bb = is_up ? up_bb_[expert_id] : gate_bb_[expert_id];

            // Calculate row range for this task (N_STEP aligned)
            const int step_start = chunk_idx * w13_steps_per_task;
            const int step_end = std::min(step_start + w13_steps_per_task, w13_n_steps);
            if (step_start >= w13_n_steps) return;
            const int chunk_n_start = step_start * N_STEP;
            const int chunk_n_end = std::min(step_end * N_STEP, cpu_n_w13);

            // Process each N_STEP within this chunk
            for (int local_n_start = chunk_n_start; local_n_start < chunk_n_end; local_n_start += N_STEP) {
              // Calculate GPU target and offset for each N_STEP (may cross GPU TP boundaries)
              const int global_n = global_n_offset_w13 + local_n_start;
              const int target_gpu = global_n / gpu_n_w13;
              const int n_in_gpu = global_n % gpu_n_w13;

              uint8_t* weight_base = (uint8_t*)w13_weight_ptrs[target_gpu];
              // Pointer already points to current expert's location, only add offset for up matrix
              const size_t expert_weight_off = is_up ? gpu_w13_weight_per_mat : 0;

              // Calculate N_BLOCK info for source addressing
              const int n_block_idx = local_n_start / N_BLOCK;
              const int n_block_begin = n_block_idx * N_BLOCK;
              const int n_block_size = std::min(N_BLOCK, cpu_n_w13 - n_block_begin);
              const int n_in_block = local_n_start - n_block_begin;

              // Process all K in groups of 4 K_STEPs when possible for cache efficiency
              for (int k_block_begin = 0; k_block_begin < cpu_k_w13; k_block_begin += K_BLOCK) {
                const int k_block_size = std::min(K_BLOCK, cpu_k_w13 - k_block_begin);

                // Try to process 4 K_STEPs at once (128 columns = 2 cache lines per row)
                int k_begin = 0;
                for (; k_begin + 4 * K_STEP <= k_block_size; k_begin += 4 * K_STEP) {
                  const uint8_t* src_ptrs[4];
                  for (int i = 0; i < 4; i++) {
                    src_ptrs[i] = bb->b + (size_t)n_block_begin * cpu_k_w13 + (size_t)k_block_begin * n_block_size +
                                  (size_t)n_in_block * k_block_size + (size_t)(k_begin + i * K_STEP) * N_STEP;
                  }
                  uint8_t* dst =
                      weight_base + expert_weight_off + (size_t)n_in_gpu * gpu_k_w13 + k_block_begin + k_begin;
                  unpack_4nk_blocks(src_ptrs, dst, gpu_k_w13);
                }

                // Handle remaining K_STEPs one by one
                for (; k_begin < k_block_size; k_begin += K_STEP) {
                  const uint8_t* src = bb->b + (size_t)n_block_begin * cpu_k_w13 +
                                       (size_t)k_block_begin * n_block_size + (size_t)n_in_block * k_block_size +
                                       (size_t)k_begin * N_STEP;
                  uint8_t* dst =
                      weight_base + expert_weight_off + (size_t)n_in_gpu * gpu_k_w13 + k_block_begin + k_begin;
                  unpack_nk_block(src, dst, gpu_k_w13);
                }
              }
            }

          } else if (task_id < NUM_W13_TASKS * 2 + NUM_W2_TASKS) {
            // ========= W2 weight task: process chunk of rows x all K slices =========
            const int chunk_idx = task_id - NUM_W13_TASKS * 2;
            const auto& bb = down_bb_[expert_id];

            // Calculate row range for this task (N_STEP aligned)
            const int step_start = chunk_idx * w2_steps_per_task;
            const int step_end = std::min(step_start + w2_steps_per_task, w2_n_steps);
            if (step_start >= w2_n_steps) return;
            const int chunk_n_start = step_start * N_STEP;
            const int chunk_n_end = std::min(step_end * N_STEP, cpu_n_w2);

            // Process each N_STEP within this chunk
            for (int local_n_start = chunk_n_start; local_n_start < chunk_n_end; local_n_start += N_STEP) {
              // Calculate N_BLOCK info for source addressing
              const int n_block_idx = local_n_start / N_BLOCK;
              const int n_block_begin = n_block_idx * N_BLOCK;
              const int n_block_size = std::min(N_BLOCK, cpu_n_w2 - n_block_begin);
              const int n_in_block = local_n_start - n_block_begin;

              // Process all K slices (each slice goes to a different GPU TP)
              for (int k_slice_start = 0; k_slice_start < cpu_k_w2; k_slice_start += gpu_k_w2) {
                const int k_slice_end = std::min(k_slice_start + gpu_k_w2, cpu_k_w2);

                const int global_k_start = global_k_offset_w2 + k_slice_start;
                const int target_gpu = global_k_start / gpu_k_w2;
                const int k_in_gpu_base = global_k_start % gpu_k_w2;

                uint8_t* weight_base = (uint8_t*)w2_weight_ptrs[target_gpu];
                // Pointer already points to current expert's location
                const size_t expert_weight_off = 0;

                // Process K within this slice, trying 4 K_STEPs at once when aligned
                for (int k_abs = k_slice_start; k_abs < k_slice_end;) {
                  const int k_block_idx = k_abs / K_BLOCK;
                  const int k_block_begin = k_block_idx * K_BLOCK;
                  const int k_block_size = std::min(K_BLOCK, cpu_k_w2 - k_block_begin);
                  const int k_in_block = k_abs - k_block_begin;
                  const int k_in_gpu = k_in_gpu_base + (k_abs - k_slice_start);

                  // Check if we can process 4 K_STEPs at once
                  const int remaining_in_block = k_block_size - k_in_block;
                  const int remaining_in_slice = k_slice_end - k_abs;

                  if (remaining_in_block >= 4 * K_STEP && remaining_in_slice >= 4 * K_STEP) {
                    const uint8_t* src_ptrs[4];
                    for (int i = 0; i < 4; i++) {
                      src_ptrs[i] = bb->b + (size_t)n_block_begin * cpu_k_w2 + (size_t)k_block_begin * n_block_size +
                                    (size_t)n_in_block * k_block_size + (size_t)(k_in_block + i * K_STEP) * N_STEP;
                    }
                    uint8_t* dst = weight_base + expert_weight_off + (size_t)local_n_start * gpu_k_w2 + k_in_gpu;
                    unpack_4nk_blocks(src_ptrs, dst, gpu_k_w2);
                    k_abs += 4 * K_STEP;
                  } else {
                    const uint8_t* src = bb->b + (size_t)n_block_begin * cpu_k_w2 +
                                         (size_t)k_block_begin * n_block_size + (size_t)n_in_block * k_block_size +
                                         (size_t)k_in_block * N_STEP;
                    uint8_t* dst = weight_base + expert_weight_off + (size_t)local_n_start * gpu_k_w2 + k_in_gpu;
                    unpack_nk_block(src, dst, gpu_k_w2);
                    k_abs += K_STEP;
                  }
                }
              }
            }

          } else {
            // ========= Scale copy task: simple linear copy with fast_memcpy =========
            const int scale_task_id = task_id - NUM_W13_TASKS * 2 - NUM_W2_TASKS;

            if (scale_task_id < 2) {
              // Gate (0) or Up (1) scale copy
              const bool is_up = scale_task_id == 1;
              const auto& bb = is_up ? up_bb_[expert_id] : gate_bb_[expert_id];

              // W13 scales: copy N blocks corresponding to this CPU TP
              // Note: when gpu_tp > cpu_tp, scale blocks may span multiple GPU TPs
              const int bn_start_global = global_n_offset_w13 / group_size;

              for (int bn = 0; bn < cpu_scale_n_blocks_w13; bn++) {
                const int global_bn = bn_start_global + bn;
                const int target_gpu = global_bn / gpu_scale_n_blocks_w13;
                const int gpu_bn = global_bn % gpu_scale_n_blocks_w13;

                float* scale_dst = (float*)w13_scale_ptrs[target_gpu];
                // Pointer already points to current expert's location, only add offset for up matrix
                const size_t expert_scale_off = is_up ? gpu_w13_scale_per_mat : 0;

                fast_memcpy(scale_dst + expert_scale_off + (size_t)gpu_bn * gpu_scale_k_blocks_w13,
                            bb->d + (size_t)bn * cpu_scale_k_blocks_w13, cpu_scale_k_blocks_w13 * sizeof(float));
              }
            } else {
              // Down scale copy (scale_task_id == 2)
              const auto& bb = down_bb_[expert_id];

              // W2 scales: K dimension is split, copy to each GPU TP
              for (int k_slice_idx = 0; k_slice_idx < div_up(cpu_k_w2, gpu_k_w2); k_slice_idx++) {
                const int k_slice_start = k_slice_idx * gpu_k_w2;
                const int k_slice_end = std::min(k_slice_start + gpu_k_w2, cpu_k_w2);

                const int global_k_start = global_k_offset_w2 + k_slice_start;
                const int target_gpu = global_k_start / gpu_k_w2;
                const int bk_gpu_base = (global_k_start % gpu_k_w2) / group_size;

                float* scale_dst = (float*)w2_scale_ptrs[target_gpu];
                // Pointer already points to current expert's location
                const size_t expert_scale_off = 0;

                const int bk_start = k_slice_start / group_size;
                const int bk_end = div_up(k_slice_end, group_size);
                const int bk_count = bk_end - bk_start;

                for (int bn = 0; bn < cpu_scale_n_blocks_w2; bn++) {
                  fast_memcpy(scale_dst + expert_scale_off + (size_t)bn * gpu_scale_k_blocks_w2 + bk_gpu_base,
                              bb->d + (size_t)bn * cpu_scale_k_blocks_w2 + bk_start, bk_count * sizeof(float));
                }
              }
            }
          }
        },
        nullptr);
  }
};

template <typename K>
class TP_MOE<AMX_FP8_MOE_TP<K>> : public TP_MOE<AMX_MOE_BASE<K, AMX_FP8_MOE_TP<K>>> {
 public:
  using Base = TP_MOE<AMX_MOE_BASE<K, AMX_FP8_MOE_TP<K>>>;
  using Base::Base;

  void load_weights() override {
    auto& config = this->config;
    auto& tps = this->tps;
    auto& tp_count = this->tp_count;
    auto pool = config.pool;
    const uint64_t* physical_to_logical_map = (const uint64_t*)config.physical_to_logical_map;

    const int group_size = config.quant_config.group_size;
    if (group_size == 0 || config.quant_config.zero_point) {
      throw std::runtime_error("FP8 MoE only supports have group_size, zero_point=false");
    }

    if (config.gate_projs.empty() && config.gate_proj == nullptr) {
      throw std::runtime_error("no weight source");
    }
    const bool use_per_expert_ptrs = !config.gate_projs.empty();

    const size_t full_weight_elems = (size_t)config.intermediate_size * config.hidden_size;
    const size_t full_scale_elems =
        (size_t)div_up(config.hidden_size, group_size) * div_up(config.intermediate_size, group_size);

    pool->dispense_backend()->do_numa_job([&, this](int i) {
      auto& tpc = tps[i]->config_;
      const size_t tp_weight_elems = (size_t)tpc.intermediate_size * tpc.hidden_size;
      const size_t tp_scale_elems =
          (size_t)div_up(tpc.intermediate_size, group_size) * div_up(tpc.hidden_size, group_size);

      tpc.gate_proj = new uint8_t[tpc.expert_num * tp_weight_elems];
      tpc.up_proj = new uint8_t[tpc.expert_num * tp_weight_elems];
      tpc.down_proj = new uint8_t[tpc.expert_num * tp_weight_elems];

      tpc.gate_scale = new float[tpc.expert_num * tp_scale_elems];
      tpc.up_scale = new float[tpc.expert_num * tp_scale_elems];
      tpc.down_scale = new float[tpc.expert_num * tp_scale_elems];

      const size_t tp_idx = (size_t)i;
      const size_t gate_up_weight_src_offset = i * tp_weight_elems;
      const size_t gate_up_scale_src_offset = i * tp_scale_elems;

      const size_t down_weight_src_col_offset = i * (size_t)tpc.intermediate_size;
      const size_t down_scale_src_block_k_offset = down_weight_src_col_offset / (size_t)group_size;

      pool->get_subpool(i)->do_work_stealing_job(
          tpc.expert_num, nullptr,
          [&, &tpc](int expert_id_) {
            const size_t expert_id = expert_map(physical_to_logical_map, expert_id_);

            uint8_t* gate_dst = (uint8_t*)tpc.gate_proj + expert_id * tp_weight_elems;
            uint8_t* up_dst = (uint8_t*)tpc.up_proj + expert_id * tp_weight_elems;
            uint8_t* down_dst = (uint8_t*)tpc.down_proj + expert_id * tp_weight_elems;

            float* gate_scale_dst = (float*)tpc.gate_scale + expert_id * tp_scale_elems;
            float* up_scale_dst = (float*)tpc.up_scale + expert_id * tp_scale_elems;
            float* down_scale_dst = (float*)tpc.down_scale + expert_id * tp_scale_elems;

            const uint8_t* gate_src;
            const uint8_t* up_src;
            const uint8_t* down_src;
            const float* gate_scale_src;
            const float* up_scale_src;
            const float* down_scale_src;

            if (use_per_expert_ptrs) {
              gate_src = (const uint8_t*)config.gate_projs[0][expert_id] + gate_up_weight_src_offset;
              up_src = (const uint8_t*)config.up_projs[0][expert_id] + gate_up_weight_src_offset;
              down_src = (const uint8_t*)config.down_projs[0][expert_id];

              gate_scale_src = (const float*)config.gate_scales[0][expert_id] + gate_up_scale_src_offset;
              up_scale_src = (const float*)config.up_scales[0][expert_id] + gate_up_scale_src_offset;
              down_scale_src = (const float*)config.down_scales[0][expert_id];
            } else {
              gate_src = (const uint8_t*)config.gate_proj + expert_id * full_weight_elems + gate_up_weight_src_offset;
              up_src = (const uint8_t*)config.up_proj + expert_id * full_weight_elems + gate_up_weight_src_offset;
              down_src = (const uint8_t*)config.down_proj + expert_id * full_weight_elems;

              gate_scale_src =
                  (const float*)config.gate_scale + expert_id * full_scale_elems + gate_up_scale_src_offset;
              up_scale_src = (const float*)config.up_scale + expert_id * full_scale_elems + gate_up_scale_src_offset;
              down_scale_src = (const float*)config.down_scale + expert_id * full_scale_elems;
            }

            std::memcpy(gate_dst, gate_src, tp_weight_elems);
            std::memcpy(up_dst, up_src, tp_weight_elems);
            std::memcpy(gate_scale_dst, gate_scale_src, sizeof(float) * tp_scale_elems);
            std::memcpy(up_scale_dst, up_scale_src, sizeof(float) * tp_scale_elems);

            for (int row = 0; row < config.hidden_size; row++) {
              const size_t src_row_offset = (size_t)row * (size_t)config.intermediate_size + down_weight_src_col_offset;
              const size_t dst_row_offset = (size_t)row * (size_t)tpc.intermediate_size;
              std::memcpy(down_dst + dst_row_offset, down_src + src_row_offset, (size_t)tpc.intermediate_size);
            }

            const int n_blocks_n = div_up(config.hidden_size, group_size);
            const int full_n_blocks_k = div_up(config.intermediate_size, group_size);
            const int tp_n_blocks_k = div_up(tpc.intermediate_size, group_size);
            for (int bn = 0; bn < n_blocks_n; bn++) {
              const float* src = down_scale_src + (size_t)bn * (size_t)full_n_blocks_k + down_scale_src_block_k_offset;
              float* dst = down_scale_dst + (size_t)bn * (size_t)tp_n_blocks_k;
              std::memcpy(dst, src, sizeof(float) * (size_t)tp_n_blocks_k);
            }
          },
          nullptr);
    });

    DO_TPS_LOAD_WEIGHTS(pool);

    pool->dispense_backend()->do_numa_job([&, this](int i) {
      auto& tpc = tps[i]->config_;
      delete[] (uint8_t*)tpc.gate_proj;
      delete[] (uint8_t*)tpc.up_proj;
      delete[] (uint8_t*)tpc.down_proj;
      delete[] (float*)tpc.gate_scale;
      delete[] (float*)tpc.up_scale;
      delete[] (float*)tpc.down_scale;
    });

    this->weights_loaded = true;
  }

  void write_weight_scale_to_buffer(int gpu_tp_count, int expert_id, const std::vector<uintptr_t>& w13_weight_ptrs,
                                    const std::vector<uintptr_t>& w13_scale_ptrs,
                                    const std::vector<uintptr_t>& w2_weight_ptrs,
                                    const std::vector<uintptr_t>& w2_scale_ptrs) {
    if (this->weights_loaded == false) {
      throw std::runtime_error("Not Loaded");
    }
    if (this->tps.empty()) {
      throw std::runtime_error("No TP parts initialized");
    }
    if ((int)w13_weight_ptrs.size() != gpu_tp_count || (int)w13_scale_ptrs.size() != gpu_tp_count ||
        (int)w2_weight_ptrs.size() != gpu_tp_count || (int)w2_scale_ptrs.size() != gpu_tp_count) {
      throw std::runtime_error("Pointer arrays size must match gpu_tp_count");
    }

    this->config.pool->dispense_backend()->do_numa_job([&, this](int i) {
      this->tps[i]->write_weights_to_buffer(gpu_tp_count, this->tp_count, expert_id, this->config, w13_weight_ptrs,
                                            w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs);
    });
  }
};

#endif  // CPUINFER_OPERATOR_AMX_FP8_MOE_H


================================================
FILE: kt-kernel/operators/amx/fp8-perchannel-moe.hpp
================================================
/**
 * @Description  : FP8 Per-Channel AMX MoE operator for GLM-4.7-FP8 native inference
 * @Author       : Claude
 * @Date         : 2025-01-12
 * @Version      : 1.0.0
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 *
 * This file implements FP8 MoE with per-channel quantization using CRTP pattern.
 * Per-channel quantization: each output channel (row) has one scale factor.
 * This is different from block-wise quantization where each 128x128 block has one scale.
 **/
#ifndef CPUINFER_OPERATOR_AMX_FP8_PERCHANNEL_MOE_H
#define CPUINFER_OPERATOR_AMX_FP8_PERCHANNEL_MOE_H

#include "la/amx_raw_buffers.hpp"
#include "la/amx_raw_kernels.hpp"
#include "moe_base.hpp"

/**
 * @brief FP8 Per-Channel MoE operator using CRTP pattern
 * @tparam T Kernel type, defaults to GemmKernel224FP8PerChannel
 *
 * This class provides FP8 per-channel specific implementations:
 * - do_gate_up_gemm, do_down_gemm : FP8 weight -> BF16 conversion mat mul with per-channel scale
 * - load_weights: Load FP8 weights with per-channel scales (shape: [n])
 */
template <class T = amx::GemmKernel224FP8PerChannel>
class AMX_FP8_PERCHANNEL_MOE_TP : public AMX_MOE_BASE<T, AMX_FP8_PERCHANNEL_MOE_TP<T>> {
  using Base = AMX_MOE_BASE<T, AMX_FP8_PERCHANNEL_MOE_TP<T>>;
  using Base::config_;
  using Base::down_ba_;
  using Base::down_bb_;
  using Base::down_bc_;
  using Base::gate_bb_;
  using Base::gate_bc_;
  using Base::gate_up_ba_;
  using Base::m_local_num_;
  using Base::tp_part_idx;
  using Base::up_bb_;
  using Base::up_bc_;

 public:
  using typename Base::input_t;
  using typename Base::output_t;

  AMX_FP8_PERCHANNEL_MOE_TP() = default;

  AMX_FP8_PERCHANNEL_MOE_TP(GeneralMOEConfig config, int tp_part_idx_ = 0) : Base(config, tp_part_idx_) {
    // Initialization now happens in derived_init() which is called by base constructor
  }

  void derived_init() {
    auto& quant_config = config_.quant_config;
    if (!quant_config.per_channel) {
      throw std::runtime_error("KT-Kernel FP8 Per-Channel MoE requires per_channel=true");
    }
    printf("Created AMX_FP8_PERCHANNEL_MOE_TP %d at numa %d\n", tp_part_idx, numa_node_of_cpu(sched_getcpu()));
  }

  ~AMX_FP8_PERCHANNEL_MOE_TP() = default;

  // ============================================================================
  // CRTP buffer creation - per-channel (no group_size needed)
  // ============================================================================

  size_t buffer_a_required_size_impl(size_t m, size_t k) const { return T::BufferA::required_size(m, k); }
  size_t buffer_b_required_size_impl(size_t n, size_t k) const {
    // Per-channel: weight size + n scales (no group_size)
    return T::BufferB::required_size(n, k);
  }
  size_t buffer_c_required_size_impl(size_t m, size_t n) const { return T::BufferC::required_size(m, n); }

  std::shared_ptr<typename T::BufferA> make_buffer_a_impl(size_t m, size_t k, void* data) const {
    return std::make_shared<typename T::BufferA>(m, k, data);
  }
  std::shared_ptr<typename T::BufferB> make_buffer_b_impl(size_t n, size_t k, void* data) const {
    // Per-channel BufferB doesn't need group_size
    return std::make_shared<typename T::BufferB>(n, k, data);
  }
  std::shared_ptr<typename T::BufferC> make_buffer_c_impl(size_t m, size_t n, void* data) const {
    return std::make_shared<typename T::BufferC>(m, n, data);
  }

  // ============================================================================
  // CRTP virtual points - GEMM dispatch (per-channel)
  // ============================================================================

  void do_gate_up_gemm(bool do_up, int expert_idx, int ith, int nth, int qlen) {
    int m = m_local_num_[expert_idx];
    auto& ba = gate_up_ba_[expert_idx];
    auto& bb = do_up ? up_bb_[expert_idx] : gate_bb_[expert_idx];
    auto& bc = do_up ? up_bc_[expert_idx] : gate_bc_[expert_idx];

    // Per-channel: use vec_mul_perchannel instead of vec_mul_kgroup
    amx::float_mat_vec_perchannel<T>(m, config_.intermediate_size, config_.hidden_size, ba.get(), bb.get(), bc.get(),
                                     ith, nth);
  }

  void do_down_gemm(int expert_idx, int ith, int nth, int qlen) {
    int m = m_local_num_[expert_idx];

    amx::float_mat_vec_perchannel<T>(m, config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx].get(),
                                     down_bb_[expert_idx].get(), down_bc_[expert_idx].get(), ith, nth);
  }

  // Fast 64-byte (512-bit) memcpy using AVX512
  static inline void fast_memcpy_64(void* __restrict dst, const void* __restrict src) {
    __m512i data = _mm512_loadu_si512(src);
    _mm512_storeu_si512(dst, data);
  }

  // Fast memcpy for arbitrary sizes using AVX512
  static inline void fast_memcpy(void* __restrict dst, const void* __restrict src, size_t bytes) {
    uint8_t* d = (uint8_t*)dst;
    const uint8_t* s = (const uint8_t*)src;
    size_t chunks = bytes / 64;
    for (size_t i = 0; i < chunks; i++) {
      fast_memcpy_64(d, s);
      d += 64;
      s += 64;
    }
    bytes -= chunks * 64;
    if (bytes > 0) {
      std::memcpy(d, s, bytes);
    }
  }

  /**
   * @brief Unpack a single N_STEP x K_STEP block from packed BufferB format to n-major format
   *
   * This is the inverse of the packing done in BufferBFP8PerChannelImpl::from_mat.
   * Optimized with AVX512 gather for efficient non-contiguous reads.
   *
   * @param src Pointer to packed data (N_STEP * K_STEP bytes in packed layout)
   * @param dst Pointer to destination in n-major layout
   * @param dst_row_stride Row stride in destination buffer (number of columns in full matrix)
   */
  static inline void unpack_nk_block(const uint8_t* src, uint8_t* dst, size_t dst_row_stride) {
    // row_map[packed_i] gives the base row for packed index packed_i
    static constexpr int row_map[8] = {0, 16, 4, 20, 8, 24, 12, 28};
    const uint64_t* src64 = reinterpret_cast<const uint64_t*>(src);

    // Gather indices: src64[8*j + packed_i] for j = 0..7
    // Offsets in uint64 units: 0, 8, 16, 24, 32, 40, 48, 56 (+ packed_i for each group)
    const __m512i gather_offsets = _mm512_set_epi64(56, 48, 40, 32, 24, 16, 8, 0);

    // Process each packed group (8 groups of 4 rows each = 32 rows total)
    for (int packed_i = 0; packed_i < 8; packed_i++) {
      const int base_row = row_map[packed_i];
      const uint64_t* base_src = src64 + packed_i;

      // Gather 8 values for j=0..7 and j=8..15
      __m512i vals_0_7 = _mm512_i64gather_epi64(gather_offsets, base_src, 8);
      __m512i vals_8_15 = _mm512_i64gather_epi64(gather_offsets, base_src + 64, 8);

      // Extract 4 rows from each set of 8 values
      // Row 0: bits 0-15
      __m128i row0_lo = _mm512_cvtepi64_epi16(_mm512_and_si512(vals_0_7, _mm512_set1_epi64(0xFFFF)));
      __m128i row0_hi = _mm512_cvtepi64_epi16(_mm512_and_si512(vals_8_15, _mm512_set1_epi64(0xFFFF)));
      // Row 1: bits 16-31
      __m128i row1_lo =
          _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(vals_0_7, 16), _mm512_set1_epi64(0xFFFF)));
      __m128i row1_hi =
          _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(vals_8_15, 16), _mm512_set1_epi64(0xFFFF)));
      // Row 2: bits 32-47
      __m128i row2_lo =
          _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(vals_0_7, 32), _mm512_set1_epi64(0xFFFF)));
      __m128i row2_hi =
          _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(vals_8_15, 32), _mm512_set1_epi64(0xFFFF)));
      // Row 3: bits 48-63
      __m128i row3_lo = _mm512_cvtepi64_epi16(_mm512_srli_epi64(vals_0_7, 48));
      __m128i row3_hi = _mm512_cvtepi64_epi16(_mm512_srli_epi64(vals_8_15, 48));

      // Store 32 bytes (16 x uint16) to each row
      // Combine two 128-bit values into 256-bit for more efficient stores
      uint8_t* row0_dst = dst + (size_t)base_row * dst_row_stride;
      uint8_t* row1_dst = dst + (size_t)(base_row + 1) * dst_row_stride;
      uint8_t* row2_dst = dst + (size_t)(base_row + 2) * dst_row_stride;
      uint8_t* row3_dst = dst + (size_t)(base_row + 3) * dst_row_stride;

      // Combine lo and hi into 256-bit and store
      __m256i row0_256 = _mm256_set_m128i(row0_hi, row0_lo);
      __m256i row1_256 = _mm256_set_m128i(row1_hi, row1_lo);
      __m256i row2_256 = _mm256_set_m128i(row2_hi, row2_lo);
      __m256i row3_256 = _mm256_set_m128i(row3_hi, row3_lo);

      _mm256_storeu_si256((__m256i*)row0_dst, row0_256);
      _mm256_storeu_si256((__m256i*)row1_dst, row1_256);
      _mm256_storeu_si256((__m256i*)row2_dst, row2_256);
      _mm256_storeu_si256((__m256i*)row3_dst, row3_256);
    }
  }

  /**
   * @brief Unpack 4 consecutive N_STEP x K_STEP blocks to maximize cache line utilization
   *
   * Processing 4 blocks together means each row write is 128 bytes = 2 cache lines,
   * which greatly improves write efficiency compared to 32 bytes per row.
   *
   * @param src Array of 4 source pointers (each pointing to a 32x32 packed block)
   * @param dst Destination pointer in n-major layout
   * @param dst_row_stride Row stride in destination buffer
   */
  static inline void unpack_4nk_blocks(const uint8_t* src[4], uint8_t* dst, size_t dst_row_stride) {
    static constexpr int row_map[8] = {0, 16, 4, 20, 8, 24, 12, 28};
    constexpr int K_STEP = T::K_STEP;  // 32

    // Reinterpret as uint64 arrays for efficient access
    const uint64_t* src0 = reinterpret_cast<const uint64_t*>(src[0]);
    const uint64_t* src1 = reinterpret_cast<const uint64_t*>(src[1]);
    const uint64_t* src2 = reinterpret_cast<const uint64_t*>(src[2]);
    const uint64_t* src3 = reinterpret_cast<const uint64_t*>(src[3]);

    // Process all 32 rows, writing 128 bytes (4 x 32) per row
    for (int packed_i = 0; packed_i < 8; packed_i++) {
      const int base_row = row_map[packed_i];

      // Process 4 rows at a time
      for (int r = 0; r < 4; r++) {
        uint16_t* row_dst = reinterpret_cast<uint16_t*>(dst + (size_t)(base_row + r) * dst_row_stride);
        const int shift = r * 16;

        // Unroll: process all 4 blocks x 16 columns = 64 uint16 values
        // Block 0: columns 0-15
        for (int j = 0; j < 16; j++) {
          row_dst[j] = static_cast<uint16_t>(src0[8 * j + packed_i] >> shift);
        }
        // Block 1: columns 16-31
        for (int j = 0; j < 16; j++) {
          row_dst[16 + j] = static_cast<uint16_t>(src1[8 * j + packed_i] >> shift);
        }
        // Block 2: columns 32-47
        for (int j = 0; j < 16; j++) {
          row_dst[32 + j] = static_cast<uint16_t>(src2[8 * j + packed_i] >> shift);
        }
        // Block 3: columns 48-63
        for (int j = 0; j < 16; j++) {
          row_dst[48 + j] = static_cast<uint16_t>(src3[8 * j + packed_i] >> shift);
        }
      }
    }
  }

  /**
   * @brief Reconstruct weights for a single expert to the output buffers (per-channel version)
   *
   * Directly unpacks from packed BufferB format to n-major GPU buffers without intermediate storage.
   * Scale handling is simplified for per-channel quantization (linear copy instead of block-wise).
   *
   * @param gpu_tp_count Number of GPU TP parts (1, 2, 4, or 8)
   * @param cpu_tp_count Number of CPU TP parts
   * @param expert_id Expert index to process
   * @param full_config Full configuration (before CPU TP split)
   * @param w13_weight_ptrs Pointers to gate+up weight buffers (one per GPU TP)
   * @param w13_scale_ptrs Pointers to gate+up scale buffers (one per GPU TP)
   * @param w2_weight_ptrs Pointers to down weight buffers (one per GPU TP)
   * @param w2_scale_ptrs Pointers to down scale buffers (one per GPU TP)
   */
  void write_weights_to_buffer(int gpu_tp_count, [[maybe_unused]] int cpu_tp_count, int expert_id,
                               const GeneralMOEConfig& full_config, const std::vector<uintptr_t>& w13_weight_ptrs,
                               const std::vector<uintptr_t>& w13_scale_ptrs,
                               const std::vector<uintptr_t>& w2_weight_ptrs,
                               const std::vector<uintptr_t>& w2_scale_ptrs) const {
    auto& config = config_;
    auto pool = config.pool->get_subpool(tp_part_idx);

    constexpr int N_STEP = T::N_STEP;
    constexpr int K_STEP = T::K_STEP;
    constexpr int N_BLOCK = T::N_BLOCK;
    constexpr int K_BLOCK = T::K_BLOCK;

    // ========= W13 (gate+up): Shape [intermediate, hidden], split by N only =========
    const int cpu_n_w13 = config.intermediate_size;
    const int cpu_k_w13 = config.hidden_size;
    const int gpu_n_w13 = full_config.intermediate_size / gpu_tp_count;
    const int gpu_k_w13 = full_config.hidden_size;
    const int global_n_offset_w13 = tp_part_idx * cpu_n_w13;

    const size_t gpu_w13_weight_per_mat = (size_t)gpu_n_w13 * gpu_k_w13;
    // Per-channel scale: shape [n] for each matrix
    const size_t gpu_w13_scale_per_mat = (size_t)gpu_n_w13;

    // ========= W2 (down): Shape [hidden, intermediate], split by K =========
    const int cpu_n_w2 = config.hidden_size;
    const int cpu_k_w2 = config.intermediate_size;
    const int gpu_n_w2 = full_config.hidden_size;
    const int gpu_k_w2 = full_config.intermediate_size / gpu_tp_count;
    const int global_k_offset_w2 = tp_part_idx * cpu_k_w2;

    const size_t gpu_w2_weight_per_mat = (size_t)gpu_n_w2 * gpu_k_w2;
    // Per-channel scale for down: shape [hidden_size] - not split by K
    const size_t gpu_w2_scale_per_mat = (size_t)gpu_n_w2;

    // ========= Optimized job layout =========
    constexpr int NUM_W13_TASKS = 32;  // Per matrix (gate or up), total 64 for w13
    constexpr int NUM_W2_TASKS = 32;   // For down matrix
    constexpr int SCALE_TASKS = 3;     // gate_scale, up_scale, down_scale

    const int total_tasks = NUM_W13_TASKS * 2 + NUM_W2_TASKS + SCALE_TASKS;

    // Calculate N_STEP blocks per task (must be N_STEP aligned for correct BufferB addressing)
    const int w13_n_steps = div_up(cpu_n_w13, N_STEP);
    const int w13_steps_per_task = div_up(w13_n_steps, NUM_W13_TASKS);
    const int w2_n_steps = div_up(cpu_n_w2, N_STEP);
    const int w2_steps_per_task = div_up(w2_n_steps, NUM_W2_TASKS);

    pool->do_work_stealing_job(
        total_tasks, nullptr,
        [=, &w13_weight_ptrs, &w13_scale_ptrs, &w2_weight_ptrs, &w2_scale_ptrs, this](int task_id) {
          if (task_id < NUM_W13_TASKS * 2) {
            // ========= W13 weight task: process chunk of rows x full K =========
            const bool is_up = task_id >= NUM_W13_TASKS;
            const int chunk_idx = task_id % NUM_W13_TASKS;
            const auto& bb = is_up ? up_bb_[expert_id] : gate_bb_[expert_id];

            // Calculate row range for this task (N_STEP aligned)
            const int step_start = chunk_idx * w13_steps_per_task;
            const int step_end = std::min(step_start + w13_steps_per_task, w13_n_steps);
            if (step_start >= w13_n_steps) return;
            const int chunk_n_start = step_start * N_STEP;
            const int chunk_n_end = std::min(step_end * N_STEP, cpu_n_w13);

            // Process each N_STEP within this chunk
            for (int local_n_start = chunk_n_start; local_n_start < chunk_n_end; local_n_start += N_STEP) {
              // Calculate GPU target and offset for each N_STEP (may cross GPU TP boundaries)
              const int global_n = global_n_offset_w13 + local_n_start;
              const int target_gpu = global_n / gpu_n_w13;
              const int n_in_gpu = global_n % gpu_n_w13;

              uint8_t* weight_base = (uint8_t*)w13_weight_ptrs[target_gpu];
              // Pointer already points to current expert's location, only add offset for up matrix
              const size_t expert_weight_off = is_up ? gpu_w13_weight_per_mat : 0;

              // Calculate N_BLOCK info for source addressing
              const int n_block_idx = local_n_start / N_BLOCK;
              const int n_block_begin = n_block_idx * N_BLOCK;
              const int n_block_size = std::min(N_BLOCK, cpu_n_w13 - n_block_begin);
              const int n_in_block = local_n_start - n_block_begin;

              // Process all K in groups of 4 K_STEPs when possible for cache efficiency
              for (int k_block_begin = 0; k_block_begin < cpu_k_w13; k_block_begin += K_BLOCK) {
                const int k_block_size = std::min(K_BLOCK, cpu_k_w13 - k_block_begin);

                // Try to process 4 K_STEPs at once (128 columns = 2 cache lines per row)
                int k_begin = 0;
                for (; k_begin + 4 * K_STEP <= k_block_size; k_begin += 4 * K_STEP) {
                  const uint8_t* src_ptrs[4];
                  for (int i = 0; i < 4; i++) {
                    src_ptrs[i] = bb->b + (size_t)n_block_begin * cpu_k_w13 + (size_t)k_block_begin * n_block_size +
                                  (size_t)n_in_block * k_block_size + (size_t)(k_begin + i * K_STEP) * N_STEP;
                  }
                  uint8_t* dst =
                      weight_base + expert_weight_off + (size_t)n_in_gpu * gpu_k_w13 + k_block_begin + k_begin;
                  unpack_4nk_blocks(src_ptrs, dst, gpu_k_w13);
                }

                // Handle remaining K_STEPs one by one
                for (; k_begin < k_block_size; k_begin += K_STEP) {
                  const uint8_t* src = bb->b + (size_t)n_block_begin * cpu_k_w13 +
                                       (size_t)k_block_begin * n_block_size + (size_t)n_in_block * k_block_size +
                                       (size_t)k_begin * N_STEP;
                  uint8_t* dst =
                      weight_base + expert_weight_off + (size_t)n_in_gpu * gpu_k_w13 + k_block_begin + k_begin;
                  unpack_nk_block(src, dst, gpu_k_w13);
                }
              }
            }

          } else if (task_id < NUM_W13_TASKS * 2 + NUM_W2_TASKS) {
            // ========= W2 weight task: process chunk of rows x all K slices =========
            const int chunk_idx = task_id - NUM_W13_TASKS * 2;
            const auto& bb = down_bb_[expert_id];

            // Calculate row range for this task (N_STEP aligned)
            const int step_start = chunk_idx * w2_steps_per_task;
            const int step_end = std::min(step_start + w2_steps_per_task, w2_n_steps);
            if (step_start >= w2_n_steps) return;
            const int chunk_n_start = step_start * N_STEP;
            const int chunk_n_end = std::min(step_end * N_STEP, cpu_n_w2);

            // Process each N_STEP within this chunk
            for (int local_n_start = chunk_n_start; local_n_start < chunk_n_end; local_n_start += N_STEP) {
              // Calculate N_BLOCK info for source addressing
              const int n_block_idx = local_n_start / N_BLOCK;
              const int n_block_begin = n_block_idx * N_BLOCK;
              const int n_block_size = std::min(N_BLOCK, cpu_n_w2 - n_block_begin);
              const int n_in_block = local_n_start - n_block_begin;

              // Process all K slices (each slice goes to a different GPU TP)
              for (int k_slice_start = 0; k_slice_start < cpu_k_w2; k_slice_start += gpu_k_w2) {
                const int k_slice_end = std::min(k_slice_start + gpu_k_w2, cpu_k_w2);

                const int global_k_start = global_k_offset_w2 + k_slice_start;
                const int target_gpu = global_k_start / gpu_k_w2;
                const int k_in_gpu_base = global_k_start % gpu_k_w2;

                uint8_t* weight_base = (uint8_t*)w2_weight_ptrs[target_gpu];
                // Pointer already points to current expert's location
                const size_t expert_weight_off = 0;

                // Process K within this slice, trying 4 K_STEPs at once when aligned
                for (int k_abs = k_slice_start; k_abs < k_slice_end;) {
                  const int k_block_idx = k_abs / K_BLOCK;
                  const int k_block_begin = k_block_idx * K_BLOCK;
                  const int k_block_size = std::min(K_BLOCK, cpu_k_w2 - k_block_begin);
                  const int k_in_block = k_abs - k_block_begin;
                  const int k_in_gpu = k_in_gpu_base + (k_abs - k_slice_start);

                  // Check if we can process 4 K_STEPs at once
                  const int remaining_in_block = k_block_size - k_in_block;
                  const int remaining_in_slice = k_slice_end - k_abs;

                  if (remaining_in_block >= 4 * K_STEP && remaining_in_slice >= 4 * K_STEP) {
                    const uint8_t* src_ptrs[4];
                    for (int i = 0; i < 4; i++) {
                      src_ptrs[i] = bb->b + (size_t)n_block_begin * cpu_k_w2 + (size_t)k_block_begin * n_block_size +
                                    (size_t)n_in_block * k_block_size + (size_t)(k_in_block + i * K_STEP) * N_STEP;
                    }
                    uint8_t* dst = weight_base + expert_weight_off + (size_t)local_n_start * gpu_k_w2 + k_in_gpu;
                    unpack_4nk_blocks(src_ptrs, dst, gpu_k_w2);
                    k_abs += 4 * K_STEP;
                  } else {
                    const uint8_t* src = bb->b + (size_t)n_block_begin * cpu_k_w2 +
                                         (size_t)k_block_begin * n_block_size + (size_t)n_in_block * k_block_size +
                                         (size_t)k_in_block * N_STEP;
                    uint8_t* dst = weight_base + expert_weight_off + (size_t)local_n_start * gpu_k_w2 + k_in_gpu;
                    unpack_nk_block(src, dst, gpu_k_w2);
                    k_abs += K_STEP;
                  }
                }
              }
            }

          } else {
            // ========= Scale copy task: per-channel (simple linear copy) =========
            const int scale_task_id = task_id - NUM_W13_TASKS * 2 - NUM_W2_TASKS;

            if (scale_task_id < 2) {
              // Gate (0) or Up (1) scale copy - per-channel: [intermediate_size]
              const bool is_up = scale_task_id == 1;
              const auto& bb = is_up ? up_bb_[expert_id] : gate_bb_[expert_id];

              // W13 per-channel scales: copy N range corresponding to this CPU TP
              // Each GPU TP gets [gpu_n_w13] scales
              const int n_start_global = global_n_offset_w13;

              for (int local_n = 0; local_n < cpu_n_w13;) {
                const int global_n = n_start_global + local_n;
                const int target_gpu = global_n / gpu_n_w13;
                const int n_in_gpu = global_n % gpu_n_w13;

                // Calculate how many scales to copy to this GPU TP
                const int remaining_in_gpu = gpu_n_w13 - n_in_gpu;
                const int remaining_local = cpu_n_w13 - local_n;
                const int copy_count = std::min(remaining_in_gpu, remaining_local);

                float* scale_dst = (float*)w13_scale_ptrs[target_gpu];
                // Pointer already points to current expert's location, only add offset for up matrix
                const size_t expert_scale_off = is_up ? gpu_w13_scale_per_mat : 0;

                fast_memcpy(scale_dst + expert_scale_off + n_in_gpu, bb->d + local_n, copy_count * sizeof(float));

                local_n += copy_count;
              }
            } else {
              // Down scale copy (scale_task_id == 2) - per-channel: [hidden_size]
              const auto& bb = down_bb_[expert_id];

              // W2 per-channel scales: shape [hidden_size], not split by K
              // All GPU TPs get the same scales (full hidden_size)
              // However, since K is split, we need to write to each GPU TP
              for (int gpu_idx = 0; gpu_idx < gpu_tp_count; gpu_idx++) {
                // Check if this CPU TP contributes to this GPU TP's K range
                const int gpu_k_start = gpu_idx * gpu_k_w2;
                const int gpu_k_end = gpu_k_start + gpu_k_w2;
                const int cpu_k_start = global_k_offset_w2;
                const int cpu_k_end = cpu_k_start + cpu_k_w2;

                // Check for overlap
                if (cpu_k_start < gpu_k_end && cpu_k_end > gpu_k_start) {
                  // This CPU TP contributes to this GPU TP
                  // Only the first CPU TP for this GPU should write scales
                  if (cpu_k_start == gpu_k_start || cpu_k_start % gpu_k_w2 == 0) {
                    float* scale_dst = (float*)w2_scale_ptrs[gpu_idx];
                    // Pointer already points to current expert's location
                    fast_memcpy(scale_dst, bb->d, cpu_n_w2 * sizeof(float));
                  }
                }
              }
            }
          }
        },
        nullptr);
  }

  /**
   * @brief Load FP8 weights from contiguous memory layout with per-channel scales
   *
   * Loads weights from config_.gate_proj, up_proj, down_proj with scales
   * from config_.gate_scale, up_scale, down_scale.
   *
   * Per-channel scale shape: [n] (one scale per output channel)
   */
  void load_weights() {
    const uint64_t* physical_to_logical_map = (const uint64_t*)config_.physical_to_logical_map;
    auto pool = config_.pool->get_subpool(tp_part_idx);

    if (config_.gate_scale == nullptr) {
      throw std::runtime_error("FP8 Per-Channel MoE requires scale pointers.");
    }

    // load gate and up weights
    int nth = T::recommended_nth(config_.intermediate_size);
    pool->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [this, nth, physical_to_logical_map](int task_id) {
          uint64_t expert_idx = task_id / nth;
          uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
          int ith = task_id % nth;

          // Per-channel scale: shape [intermediate_size] for gate/up
          const size_t weight_offset = logical_expert_id * config_.intermediate_size * config_.hidden_size;
          const size_t scale_offset = logical_expert_id * config_.intermediate_size;

          // gate part
          gate_bb_[expert_idx]->from_mat((uint8_t*)config_.gate_proj + weight_offset,
                                         (float*)config_.gate_scale + scale_offset, ith, nth);
          // up part
          up_bb_[expert_idx]->from_mat((uint8_t*)config_.up_proj + weight_offset,
                                       (float*)config_.up_scale + scale_offset, ith, nth);
        },
        nullptr);

    // load down weights
    nth = T::recommended_nth(config_.hidden_size);
    pool->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [this, nth, physical_to_logical_map](int task_id) {
          uint64_t expert_idx = task_id / nth;
          uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
          int ith = task_id % nth;

          // Per-channel scale: shape [hidden_size] for down
          const size_t weight_offset = logical_expert_id * config_.intermediate_size * config_.hidden_size;
          const size_t scale_offset = logical_expert_id * config_.hidden_size;

          // down part
          down_bb_[expert_idx]->from_mat((uint8_t*)config_.down_proj + weight_offset,
                                         (float*)config_.down_scale + scale_offset, ith, nth);
        },
        nullptr);
  }
};

/**
 * @brief TP_MOE specialization for FP8 Per-Channel MoE
 */
template <typename K>
class TP_MOE<AMX_FP8_PERCHANNEL_MOE_TP<K>> : public TP_MOE<AMX_MOE_BASE<K, AMX_FP8_PERCHANNEL_MOE_TP<K>>> {
 public:
  using Base = TP_MOE<AMX_MOE_BASE<K, AMX_FP8_PERCHANNEL_MOE_TP<K>>>;
  using Base::Base;

  /**
   * @brief Write weights and scales to GPU buffer for a single expert
   *
   * This method coordinates all CPU TP parts to write their portions
   * of weights and scales to the GPU buffers.
   *
   * @param gpu_tp_count Number of GPU TP parts
   * @param expert_id Expert index to write
   * @param w13_weight_ptrs Pointers to gate+up weight buffers (one per GPU TP)
   * @param w13_scale_ptrs Pointers to gate+up scale buffers (one per GPU TP)
   * @param w2_weight_ptrs Pointers to down weight buffers (one per GPU TP)
   * @param w2_scale_ptrs Pointers to down scale buffers (one per GPU TP)
   */
  void write_weight_scale_to_buffer(int gpu_tp_count, int expert_id, const std::vector<uintptr_t>& w13_weight_ptrs,
                                    const std::vector<uintptr_t>& w13_scale_ptrs,
                                    const std::vector<uintptr_t>& w2_weight_ptrs,
                                    const std::vector<uintptr_t>& w2_scale_ptrs) {
    if (this->weights_loaded == false) {
      throw std::runtime_error("Not Loaded");
    }
    if (this->tps.empty()) {
      throw std::runtime_error("No TP parts initialized");
    }
    if ((int)w13_weight_ptrs.size() != gpu_tp_count || (int)w13_scale_ptrs.size() != gpu_tp_count ||
        (int)w2_weight_ptrs.size() != gpu_tp_count || (int)w2_scale_ptrs.size() != gpu_tp_count) {
      throw std::runtime_error("Pointer arrays size must match gpu_tp_count");
    }

    this->config.pool->dispense_backend()->do_numa_job([&, this](int i) {
      this->tps[i]->write_weights_to_buffer(gpu_tp_count, this->tp_count, expert_id, this->config, w13_weight_ptrs,
                                            w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs);
    });
  }

  void load_weights() override {
    auto& config = this->config;
    auto& tps = this->tps;
    auto& tp_count = this->tp_count;
    auto pool = config.pool;
    const uint64_t* physical_to_logical_map = (const uint64_t*)config.physical_to_logical_map;

    if (!config.quant_config.per_channel) {
      throw std::runtime_error("FP8 Per-Channel MoE requires per_channel=true");
    }

    if (config.gate_projs.empty() && config.gate_proj == nullptr) {
      throw std::runtime_error("no weight source");
    }
    const bool use_per_expert_ptrs = !config.gate_projs.empty();

    const size_t full_weight_elems = (size_t)config.intermediate_size * config.hidden_size;
    // Per-channel: scale count = output dimension
    const size_t gate_up_scale_elems = (size_t)config.intermediate_size;
    const size_t down_scale_elems = (size_t)config.hidden_size;

    pool->dispense_backend()->do_numa_job([&, this](int i) {
      auto& tpc = tps[i]->config_;
      const size_t tp_weight_elems = (size_t)tpc.intermediate_size * tpc.hidden_size;
      // Per-channel scales for TP part
      const size_t tp_gate_up_scale_elems = (size_t)tpc.intermediate_size;
      const size_t tp_down_scale_elems = (size_t)tpc.hidden_size;

      tpc.gate_proj = new uint8_t[tpc.expert_num * tp_weight_elems];
      tpc.up_proj = new uint8_t[tpc.expert_num * tp_weight_elems];
      tpc.down_proj = new uint8_t[tpc.expert_num * tp_weight_elems];

      tpc.gate_scale = new float[tpc.expert_num * tp_gate_up_scale_elems];
      tpc.up_scale = new float[tpc.expert_num * tp_gate_up_scale_elems];
      tpc.down_scale = new float[tpc.expert_num * tp_down_scale_elems];

      const size_t tp_idx = (size_t)i;
      // gate/up: split by N (intermediate_size)
      const size_t gate_up_weight_src_offset = i * tp_weight_elems;
      const size_t gate_up_scale_src_offset = i * tp_gate_up_scale_elems;

      // down: split by K (intermediate_size)
      const size_t down_weight_src_col_offset = i * (size_t)tpc.intermediate_size;

      pool->get_subpool(i)->do_work_stealing_job(
          tpc.expert_num, nullptr,
          [&, &tpc](int expert_id_) {
            const size_t expert_id = expert_map(physical_to_logical_map, expert_id_);

            uint8_t* gate_dst = (uint8_t*)tpc.gate_proj + expert_id * tp_weight_elems;
            uint8_t* up_dst = (uint8_t*)tpc.up_proj + expert_id * tp_weight_elems;
            uint8_t* down_dst = (uint8_t*)tpc.down_proj + expert_id * tp_weight_elems;

            float* gate_scale_dst = (float*)tpc.gate_scale + expert_id * tp_gate_up_scale_elems;
            float* up_scale_dst = (float*)tpc.up_scale + expert_id * tp_gate_up_scale_elems;
            float* down_scale_dst = (float*)tpc.down_scale + expert_id * tp_down_scale_elems;

            const uint8_t* gate_src;
            const uint8_t* up_src;
            const uint8_t* down_src;
            const float* gate_scale_src;
            const float* up_scale_src;
            const float* down_scale_src;

            if (use_per_expert_ptrs) {
              gate_src = (const uint8_t*)config.gate_projs[0][expert_id] + gate_up_weight_src_offset;
              up_src = (const uint8_t*)config.up_projs[0][expert_id] + gate_up_weight_src_offset;
              down_src = (const uint8_t*)config.down_projs[0][expert_id];

              gate_scale_src = (const float*)config.gate_scales[0][expert_id] + gate_up_scale_src_offset;
              up_scale_src = (const float*)config.up_scales[0][expert_id] + gate_up_scale_src_offset;
              down_scale_src = (const float*)config.down_scales[0][expert_id];
            } else {
              gate_src = (const uint8_t*)config.gate_proj + expert_id * full_weight_elems + gate_up_weight_src_offset;
              up_src = (const uint8_t*)config.up_proj + expert_id * full_weight_elems + gate_up_weight_src_offset;
              down_src = (const uint8_t*)config.down_proj + expert_id * full_weight_elems;

              gate_scale_src =
                  (const float*)config.gate_scale + expert_id * gate_up_scale_elems + gate_up_scale_src_offset;
              up_scale_src = (const float*)config.up_scale + expert_id * gate_up_scale_elems + gate_up_scale_src_offset;
              down_scale_src = (const float*)config.down_scale + expert_id * down_scale_elems;
            }

            // Copy gate/up weights and scales (N dimension split)
            std::memcpy(gate_dst, gate_src, tp_weight_elems);
            std::memcpy(up_dst, up_src, tp_weight_elems);
            std::memcpy(gate_scale_dst, gate_scale_src, sizeof(float) * tp_gate_up_scale_elems);
            std::memcpy(up_scale_dst, up_scale_src, sizeof(float) * tp_gate_up_scale_elems);

            // Copy down weights (K dimension split) - row by row
            for (int row = 0; row < config.hidden_size; row++) {
              const size_t src_row_offset = (size_t)row * (size_t)config.intermediate_size + down_weight_src_col_offset;
              const size_t dst_row_offset = (size_t)row * (size_t)tpc.intermediate_size;
              std::memcpy(down_dst + dst_row_offset, down_src + src_row_offset, (size_t)tpc.intermediate_size);
            }

            // Copy down scales (N dimension = hidden_size, full copy for each TP)
            std::memcpy(down_scale_dst, down_scale_src, sizeof(float) * tp_down_scale_elems);
          },
          nullptr);
    });

    DO_TPS_LOAD_WEIGHTS(pool);

    pool->dispense_backend()->do_numa_job([&, this](int i) {
      auto& tpc = tps[i]->config_;
      delete[] (uint8_t*)tpc.gate_proj;
      delete[] (uint8_t*)tpc.up_proj;
      delete[] (uint8_t*)tpc.down_proj;
      delete[] (float*)tpc.gate_scale;
      delete[] (float*)tpc.up_scale;
      delete[] (float*)tpc.down_scale;
    });

    this->weights_loaded = true;
  }
};

#endif  // CPUINFER_OPERATOR_AMX_FP8_PERCHANNEL_MOE_H


================================================
FILE: kt-kernel/operators/amx/k2-moe.hpp
================================================
/**
 * @Description  : K2 AMX MoE operator for Kimi-K2 native inference
 * @Author       : oql, Codex and Claude
 * @Date         : 2025-12-09
 * @Version      : 1.0.0
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 *
 * This file implements K2 Int4 MoE using CRTP pattern, inheriting from moe_base.hpp.
 * K2 weights are stored with group-wise scales (KGroup Int4).
 **/
#ifndef CPUINFER_OPERATOR_AMX_K2_MOE_H
#define CPUINFER_OPERATOR_AMX_K2_MOE_H

// #define LOAD_TIME_PROFILE

#include "moe_base.hpp"

/**
 * @brief K2 Int4 MoE operator using CRTP pattern
 * @tparam T Kernel type, defaults to amx::GemmKernel224Int4SmallKGroup
 *
 * This class provides K2-specific GEMM implementations:
 * - do_gate_up_gemm: Int4 weight with KGroup scale + AMX GEMM
 * - do_down_gemm: Same Int4 KGroup GEMM
 * - load_weights: Load Int4 weights with group-wise scales
 */
template <class T = amx::GemmKernel224Int4SmallKGroup>
class AMX_K2_MOE_TP : public AMX_MOE_BASE<T, AMX_K2_MOE_TP<T>> {
  using Base = AMX_MOE_BASE<T, AMX_K2_MOE_TP<T>>;
  using Base::config_;
  using Base::down_ba_;
  using Base::down_bb_;
  using Base::down_bc_;
  using Base::gate_bb_;
  using Base::gate_bc_;
  using Base::gate_up_ba_;
  using Base::m_local_num_;
  using Base::tp_part_idx;
  using Base::up_bb_;
  using Base::up_bc_;

 public:
  using typename Base::input_t;
  using typename Base::output_t;

  AMX_K2_MOE_TP() = default;

  AMX_K2_MOE_TP(GeneralMOEConfig config, int tp_part_idx_ = 0) : Base(config, tp_part_idx_) {}

  void derived_init() {
    auto& quant_config = config_.quant_config;
    if (quant_config.group_size == 0 || quant_config.zero_point) {
      throw std::runtime_error("Kimi-K2 MoE only support KGroup Int4");
    }
    printf("Creating AMX_K2_MOE_TP %d at numa %d\n", tp_part_idx, numa_node_of_cpu(sched_getcpu()));
  }

  ~AMX_K2_MOE_TP() = default;

  // ============================================================================
  // CRTP buffer creation - with group_size
  // ============================================================================

  size_t buffer_a_required_size_impl(size_t m, size_t k) const {
    return T::BufferA::required_size(m, k, config_.quant_config.group_size);
  }
  size_t buffer_b_required_size_impl(size_t n, size_t k) const {
    return T::BufferB::required_size(n, k, config_.quant_config.group_size);
  }
  size_t buffer_c_required_size_impl(size_t m, size_t n) const { return T::BufferC::required_size(m, n); }

  std::shared_ptr<typename T::BufferA> make_buffer_a_impl(size_t m, size_t k, void* data) const {
    return std::make_shared<typename T::BufferA>(m, k, config_.quant_config.group_size, data);
  }
  std::shared_ptr<typename T::BufferB> make_buffer_b_impl(size_t n, size_t k, void* data) const {
    return std::make_shared<typename T::BufferB>(n, k, config_.quant_config.group_size, data);
  }
  std::shared_ptr<typename T::BufferC> make_buffer_c_impl(size_t m, size_t n, void* data) const {
    return std::make_shared<typename T::BufferC>(m, n, data);
  }

  // ============================================================================
  // CRTP virtual points - GEMM dispatch
  // ============================================================================

  void do_gate_up_gemm(bool do_up, int expert_idx, int ith, int nth, int qlen) {
    auto& group_size = config_.quant_config.group_size;
    int m = m_local_num_[expert_idx];
    auto& ba = gate_up_ba_[expert_idx];
    auto& bb = do_up ? up_bb_[expert_idx] : gate_bb_[expert_idx];
    auto& bc = do_up ? up_bc_[expert_idx] : gate_bc_[expert_idx];

    // Dispatch based on qlen threshold
    if (qlen > 4 * config_.expert_num / config_.num_experts_per_tok) {
      amx::mat_mul_kgroup(m, config_.intermediate_size, config_.hidden_size, group_size, ba, bb, bc, ith, nth);
    } else {
      amx::vec_mul_kgroup(m, config_.intermediate_size, config_.hidden_size, group_size, ba, bb, bc, ith, nth);
    }
  }

  void do_down_gemm(int expert_idx, int ith, int nth, int qlen) {
    auto& group_size = config_.quant_config.group_size;
    int m = m_local_num_[expert_idx];

    if (qlen > 4 * config_.expert_num / config_.num_experts_per_tok) {
      amx::mat_mul_kgroup(m, config_.hidden_size, config_.intermediate_size, group_size, down_ba_[expert_idx],
                          down_bb_[expert_idx], down_bc_[expert_idx], ith, nth);
    } else {
      amx::vec_mul_kgroup(m, config_.hidden_size, config_.intermediate_size, group_size, down_ba_[expert_idx],
                          down_bb_[expert_idx], down_bc_[expert_idx], ith, nth);
    }
  }

  /**
   * @brief Load Int4 weights from contiguous memory layout
   *
   * Loads weights from config_.gate_proj, up_proj, down_proj with scales
   * from config_.gate_scale, up_scale, down_scale.
   */
  void load_weights() {
    auto& quant_config = config_.quant_config;
    int& group_size = quant_config.group_size;
    const uint64_t* physical_to_logical_map = (const uint64_t*)config_.physical_to_logical_map;
    auto pool = config_.pool->get_subpool(tp_part_idx);

    if (quant_config.group_size == 0 || quant_config.zero_point) {
      throw std::runtime_error("Kimi AVX MOE only support KGroup Int4.");
    }
    if (config_.gate_scale == nullptr) {
      throw std::runtime_error("Kimi AVX MOE only support load native weight.");
    }

    // load weight
    int nth = T::recommended_nth(config_.intermediate_size);
    pool->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [this, nth, physical_to_logical_map](int task_id) {
          uint64_t expert_idx = task_id / nth;
          uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
          int ith = task_id % nth;
          // gate part
          gate_bb_[expert_idx]->from_raw_mat(
              (uint8_t*)config_.gate_proj +
                  ((logical_expert_id * config_.intermediate_size * config_.hidden_size) >> 1),
              ith, nth);
          // up part
          up_bb_[expert_idx]->from_raw_mat(
              (uint8_t*)config_.up_proj + ((logical_expert_id * config_.intermediate_size * config_.hidden_size) >> 1),
              ith, nth);
        },
        nullptr);

    nth = T::recommended_nth(config_.hidden_size);
    pool->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [this, nth, physical_to_logical_map](int task_id) {
          uint64_t expert_idx = task_id / nth;
          uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
          int ith = task_id % nth;
          // down part
          down_bb_[expert_idx]->from_raw_mat(
              (uint8_t*)config_.down_proj +
                  ((logical_expert_id * config_.hidden_size * config_.intermediate_size) >> 1),
              ith, nth);
        },
        nullptr);

    pool->do_work_stealing_job(
        config_.expert_num, nullptr,
        [this, physical_to_logical_map](int task_id) {
          uint64_t expert_idx = task_id;
          uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
          size_t scale_elem_count = (config_.hidden_size * config_.intermediate_size) / config_.quant_config.group_size;

          // convert scales from BF16 to FP32
          convert_or_copy(gate_bb_[expert_idx]->d,
                          (ggml_bf16_t*)config_.gate_scale + (logical_expert_id * scale_elem_count), scale_elem_count);
          convert_or_copy(up_bb_[expert_idx]->d,
                          (ggml_bf16_t*)config_.up_scale + (logical_expert_id * scale_elem_count), scale_elem_count);
          convert_or_copy(down_bb_[expert_idx]->d,
                          (ggml_bf16_t*)config_.down_scale + (logical_expert_id * scale_elem_count), scale_elem_count);
        },
        nullptr);
#ifdef DEBUG_K2_MOE
    dump_buffer_b("native", 0, "down", down_bb_[0].get());
#endif
  }

  static inline void fast_memcpy(void* __restrict dst, const void* __restrict src, size_t bytes) {
    uint8_t* d = (uint8_t*)dst;
    const uint8_t* s = (const uint8_t*)src;

    // Main loop: 512-bit (64-byte) SIMD copies
    size_t chunks = bytes / 64;
    for (size_t i = 0; i < chunks; i++) {
      __m512i data = _mm512_loadu_si512((__m512i*)s);
      _mm512_storeu_si512((__m512i*)d, data);
      d += 64;
      s += 64;
    }
    bytes -= chunks * 64;

    // Handle remaining bytes
    if (bytes > 0) {
      std::memcpy(d, s, bytes);
    }
  }

  // Optimized SIMD float32 to bf16 conversion
  static inline void fast_fp32_to_bf16(ggml_bf16_t* __restrict dst, const float* __restrict src, size_t count) {
    size_t i = 0;

    // Process 32 elements at a time (2x __m512, output 1x __m512i = 32 bf16)
    for (; i + 32 <= count; i += 32) {
      __m512 v0 = _mm512_loadu_ps(src + i);
      __m512 v1 = _mm512_loadu_ps(src + i + 16);

      // Convert to bf16 using truncation (shift right 16 bits)
      __m512i i0 = _mm512_srli_epi32(_mm512_castps_si512(v0), 16);
      __m512i i1 = _mm512_srli_epi32(_mm512_castps_si512(v1), 16);

      // Pack 32-bit values to 16-bit
      __m512i packed = _mm512_packus_epi32(i0, i1);

      // Reorder due to packus lane behavior:
      // packus outputs interleaved: [i0[0-3], i1[0-3], i0[4-7], i1[4-7], i0[8-11], i1[8-11], i0[12-15], i1[12-15]]
      // We need sequential: [i0[0-15], i1[0-15]] = [i0[0-3], i0[4-7], i0[8-11], i0[12-15], i1[0-3], i1[4-7], i1[8-11],
      // i1[12-15]] Permutation: [0, 2, 4, 6, 1, 3, 5, 7] (qword indices)
      __m512i permuted = _mm512_permutexvar_epi64(_mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0), packed);

      _mm512_storeu_si512((__m512i*)(dst + i), permuted);
    }

    // Handle remaining elements with scalar conversion
    for (; i < count; i++) {
      dst[i] = ggml_fp32_to_bf16(src[i]);
    }
  }

  // Write a single expert's weights to the output buffers
  // The caller provides pointers that already point to the target expert's location (no offset needed)
  // expert_id: the index of the expert to write
  // Optimized for maximum memory bandwidth using streaming stores
  void write_weights_to_buffer(int gpu_tp_count, int cpu_tp_count, int expert_id, const GeneralMOEConfig& full_config,
                               const std::vector<uintptr_t>& w13_weight_ptrs,
                               const std::vector<uintptr_t>& w13_scale_ptrs,
                               const std::vector<uintptr_t>& w2_weight_ptrs,
                               const std::vector<uintptr_t>& w2_scale_ptrs) const {
    const int group_size = config_.quant_config.group_size;
    auto pool = config_.pool->get_subpool(tp_part_idx);

    // Calculate sizes for CPU TP part (this instance)
    size_t cpu_tp_weight_elem_count = (size_t)config_.intermediate_size * config_.hidden_size;
    size_t cpu_tp_weight_bytes = cpu_tp_weight_elem_count / 2;  // int4 packing
    size_t cpu_tp_scale_elem_count = cpu_tp_weight_elem_count / group_size;

    // Calculate sizes for GPU TP part
    size_t gpu_tp_weight_elem_count = (size_t)full_config.intermediate_size * full_config.hidden_size / gpu_tp_count;
    size_t gpu_tp_weight_bytes = gpu_tp_weight_elem_count / 2;  // int4 packing
    size_t gpu_tp_scale_elem_count = gpu_tp_weight_elem_count / group_size;

    // Determine mapping: which GPU TP parts should this CPU TP part write to?
    // Since weights are col-major and we slice directly by memory order:
    // - If cpu_tp_count >= gpu_tp_count: multiple(or one) CPU TPs write to one GPU TP
    // - If cpu_tp_count < gpu_tp_count: one CPU TP writes to multiple GPU TPs
    if (cpu_tp_count >= gpu_tp_count) {
      // Multiple CPU TPs map to one GPU TP
      int target_gpu_tp = tp_part_idx / (cpu_tp_count / gpu_tp_count);
      int local_idx = tp_part_idx % (cpu_tp_count / gpu_tp_count);

      // Get pointers for this GPU TP part (already pointing to target expert's location)
      uint8_t* w13_weight_dst = (uint8_t*)w13_weight_ptrs[target_gpu_tp];
      ggml_bf16_t* w13_scale_dst = (ggml_bf16_t*)w13_scale_ptrs[target_gpu_tp];
      uint8_t* w2_weight_dst = (uint8_t*)w2_weight_ptrs[target_gpu_tp];
      ggml_bf16_t* w2_scale_dst = (ggml_bf16_t*)w2_scale_ptrs[target_gpu_tp];

      // Calculate offset within the GPU TP buffer (for CPU TP slice within GPU TP)
      size_t offset_in_gpu_weight = local_idx * cpu_tp_weight_bytes;
      size_t offset_in_gpu_scale = local_idx * cpu_tp_scale_elem_count;

      // Optimized task layout for maximum bandwidth:
      // - Larger chunks to reduce task overhead
      // - Separate large contiguous copies (gate_w, up_w) from strided copies (down)
      // - Scale conversions are relatively small, merge with weight tasks

      // Use fewer, larger tasks for better efficiency
      constexpr int NUM_WEIGHT_TASKS = 8;  // Fewer tasks, larger chunks
      constexpr int MIN_COLS_PER_TASK = 128;
      int num_down_tasks = std::max(1, (int)config_.hidden_size / MIN_COLS_PER_TASK);
      num_down_tasks = std::min(num_down_tasks, 32);

      // Total tasks: gate_weight + up_weight + down_weight_scale + gate_scale + up_scale
      int total_tasks = NUM_WEIGHT_TASKS * 2 + num_down_tasks + 2;

      size_t weight_chunk_size = (cpu_tp_weight_bytes + NUM_WEIGHT_TASKS - 1) / NUM_WEIGHT_TASKS;
      // Align chunk size to 64 bytes for optimal streaming stores
      weight_chunk_size = (weight_chunk_size + 63) & ~63ULL;

      pool->do_work_stealing_job(
          total_tasks, nullptr,
          [&, this, num_down_tasks, expert_id, weight_chunk_size](int task_id) {
            if (task_id < NUM_WEIGHT_TASKS) {
              // Gate weight copy - chunked
              int chunk_idx = task_id;
              size_t start = chunk_idx * weight_chunk_size;
              size_t end = std::min(start + weight_chunk_size, cpu_tp_weight_bytes);
              if (start < end) {
                uint8_t* gate_weight_src = (uint8_t*)gate_bb_[expert_id]->b;
                fast_memcpy(w13_weight_dst + offset_in_gpu_weight + start, gate_weight_src + start, end - start);
              }
            } else if (task_id < NUM_WEIGHT_TASKS * 2) {
              // Up weight copy - chunked
              int chunk_idx = task_id - NUM_WEIGHT_TASKS;
              size_t start = chunk_idx * weight_chunk_size;
              size_t end = std::min(start + weight_chunk_size, cpu_tp_weight_bytes);
              if (start < end) {
                uint8_t* up_weight_src = (uint8_t*)up_bb_[expert_id]->b;
                fast_memcpy(w13_weight_dst + offset_in_gpu_weight + gpu_tp_weight_bytes + start, up_weight_src + start,
                            end - start);
              }
            } else if (task_id < NUM_WEIGHT_TASKS * 2 + num_down_tasks) {
              // Down columns - split by column chunks
              // Each task handles multiple consecutive columns for better cache locality
              int chunk_idx = task_id - NUM_WEIGHT_TASKS * 2;
              size_t cols_per_chunk = (config_.hidden_size + num_down_tasks - 1) / num_down_tasks;
              size_t col_start = chunk_idx * cols_per_chunk;
              size_t col_end = std::min(col_start + cols_per_chunk, (size_t)config_.hidden_size);

              size_t weight_per_col = config_.intermediate_size >> 1;
              size_t scale_per_col = config_.intermediate_size / group_size;
              size_t gpu_weight_stride = (full_config.intermediate_size / gpu_tp_count) >> 1;
              size_t gpu_scale_stride = (full_config.intermediate_size / gpu_tp_count) / group_size;
              size_t gpu_weight_slice_offset = local_idx * weight_per_col;
              size_t gpu_scale_slice_offset = local_idx * scale_per_col;

              for (size_t col = col_start; col < col_end; col++) {
                fast_memcpy(w2_weight_dst + col * gpu_weight_stride + gpu_weight_slice_offset,
                            (uint8_t*)down_bb_[expert_id]->b + col * weight_per_col, weight_per_col);

                fast_fp32_to_bf16(w2_scale_dst + col * gpu_scale_stride + gpu_scale_slice_offset,
                                  down_bb_[expert_id]->d + col * scale_per_col, scale_per_col);
              }
            } else if (task_id == NUM_WEIGHT_TASKS * 2 + num_down_tasks) {
              // Gate scale convert
              float* gate_scale_src = gate_bb_[expert_id]->d;
              fast_fp32_to_bf16(w13_scale_dst + offset_in_gpu_scale, gate_scale_src, cpu_tp_scale_elem_count);
            } else {
              // Up scale convert
              float* up_scale_src = up_bb_[expert_id]->d;
              fast_fp32_to_bf16(w13_scale_dst + offset_in_gpu_scale + gpu_tp_scale_elem_count, up_scale_src,
                                cpu_tp_scale_elem_count);
            }
          },
          nullptr);
    } else {
      // cpu_tp_count < gpu_tp_count: one CPU TP writes to multiple GPU TPs
      int gpu_tps_per_cpu_tp = gpu_tp_count / cpu_tp_count;
      int start_gpu_tp = tp_part_idx * gpu_tps_per_cpu_tp;

      // Size of data per GPU TP within this CPU TP
      size_t data_per_gpu_tp_weight = cpu_tp_weight_bytes / gpu_tps_per_cpu_tp;
      size_t data_per_gpu_tp_scale = cpu_tp_scale_elem_count / gpu_tps_per_cpu_tp;

      // Optimized task layout
      constexpr int NUM_WEIGHT_TASKS = 8;
      constexpr int MIN_COLS_PER_TASK = 128;
      int num_down_tasks = std::max(1, (int)config_.hidden_size / MIN_COLS_PER_TASK);
      num_down_tasks = std::min(num_down_tasks, 32);

      int tasks_per_gpu_tp = NUM_WEIGHT_TASKS * 2 + num_down_tasks + 2;
      int total_tasks = tasks_per_gpu_tp * gpu_tps_per_cpu_tp;

      size_t weight_chunk_size = (data_per_gpu_tp_weight + NUM_WEIGHT_TASKS - 1) / NUM_WEIGHT_TASKS;
      weight_chunk_size = (weight_chunk_size + 63) & ~63ULL;

      pool->do_work_stealing_job(
          total_tasks, nullptr,
          [&, this, gpu_tps_per_cpu_tp, start_gpu_tp, data_per_gpu_tp_weight, data_per_gpu_tp_scale, num_down_tasks,
           tasks_per_gpu_tp, expert_id, weight_chunk_size](int task_id) {
            int local_gpu_idx = task_id / tasks_per_gpu_tp;
            int task_type = task_id % tasks_per_gpu_tp;
            int gpu_tp_idx = start_gpu_tp + local_gpu_idx;

            // Get pointers for this GPU TP part
            uint8_t* w13_weight_dst = (uint8_t*)w13_weight_ptrs[gpu_tp_idx];
            ggml_bf16_t* w13_scale_dst = (ggml_bf16_t*)w13_scale_ptrs[gpu_tp_idx];
            uint8_t* w2_weight_dst = (uint8_t*)w2_weight_ptrs[gpu_tp_idx];
            ggml_bf16_t* w2_scale_dst = (ggml_bf16_t*)w2_scale_ptrs[gpu_tp_idx];

            // Calculate offsets within CPU TP buffers
            size_t cpu_offset_weight = local_gpu_idx * data_per_gpu_tp_weight;
            size_t cpu_offset_scale = local_gpu_idx * data_per_gpu_tp_scale;

            if (task_type < NUM_WEIGHT_TASKS) {
              // Gate weight copy - chunked
              int chunk_idx = task_type;
              size_t start = chunk_idx * weight_chunk_size;
              size_t end = std::min(start + weight_chunk_size, data_per_gpu_tp_weight);
              if (start < end) {
                uint8_t* gate_weight_src = (uint8_t*)gate_bb_[expert_id]->b + cpu_offset_weight;
                fast_memcpy(w13_weight_dst + start, gate_weight_src + start, end - start);
              }
            } else if (task_type < NUM_WEIGHT_TASKS * 2) {
              // Up weight copy - chunked
              int chunk_idx = task_type - NUM_WEIGHT_TASKS;
              size_t start = chunk_idx * weight_chunk_size;
              size_t end = std::min(start + weight_chunk_size, data_per_gpu_tp_weight);
              if (start < end) {
                uint8_t* up_weight_src = (uint8_t*)up_bb_[expert_id]->b + cpu_offset_weight;
                fast_memcpy(w13_weight_dst + gpu_tp_weight_bytes + start, up_weight_src + start, end - start);
              }
            } else if (task_type < NUM_WEIGHT_TASKS * 2 + num_down_tasks) {
              // Down columns - split by column chunks
              int chunk_idx = task_type - NUM_WEIGHT_TASKS * 2;
              size_t cols_per_chunk = (config_.hidden_size + num_down_tasks - 1) / num_down_tasks;
              size_t col_start = chunk_idx * cols_per_chunk;
              size_t col_end = std::min(col_start + cols_per_chunk, (size_t)config_.hidden_size);

              size_t weight_per_gpu_col = (config_.intermediate_size / gpu_tps_per_cpu_tp) >> 1;
              size_t scale_per_gpu_col = (config_.intermediate_size / gpu_tps_per_cpu_tp) / group_size;

              for (size_t col = col_start; col < col_end; col++) {
                size_t col_offset_weight = (col * config_.intermediate_size / 2) +
                                           (local_gpu_idx * data_per_gpu_tp_weight / config_.hidden_size);
                size_t col_offset_scale = (col * (config_.intermediate_size / group_size)) +
                                          (local_gpu_idx * data_per_gpu_tp_scale / config_.hidden_size);

                fast_memcpy(w2_weight_dst + col * weight_per_gpu_col,
                            (uint8_t*)down_bb_[expert_id]->b + col_offset_weight, weight_per_gpu_col);

                fast_fp32_to_bf16(w2_scale_dst + col * scale_per_gpu_col, down_bb_[expert_id]->d + col_offset_scale,
                                  scale_per_gpu_col);
              }
            } else if (task_type == NUM_WEIGHT_TASKS * 2 + num_down_tasks) {
              // Gate scale convert
              float* gate_scale_src = gate_bb_[expert_id]->d + cpu_offset_scale;
              fast_fp32_to_bf16(w13_scale_dst, gate_scale_src, data_per_gpu_tp_scale);
            } else {
              // Up scale convert
              float* up_scale_src = up_bb_[expert_id]->d + cpu_offset_scale;
              fast_fp32_to_bf16(w13_scale_dst + gpu_tp_scale_elem_count, up_scale_src, data_per_gpu_tp_scale);
            }
          },
          nullptr);
    }
  }
};

// ============================================================================
// TP_MOE specialization for AMX_K2_MOE_TP
// Inherits from TP_MOE<AMX_MOE_BASE<...>> to reuse merge_results implementation
// ============================================================================

template <typename K>
class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE<AMX_MOE_BASE<K, AMX_K2_MOE_TP<K>>> {
 public:
  using Base = TP_MOE<AMX_MOE_BASE<K, AMX_K2_MOE_TP<K>>>;
  using Base::Base;

  void load_weights() override {
    auto& config = this->config;
    auto& tps = this->tps;
    auto& tp_count = this->tp_count;
    auto pool = config.pool;
    const uint64_t* physical_to_logical_map = (const uint64_t*)config.physical_to_logical_map;

#ifdef LOAD_TIME_PROFILE
    auto load_start_time = std::chrono::high_resolution_clock::now();
    auto load_last = load_start_time;
    long alloc_and_tp_slice_time = 0, tps_load_time = 0, cleanup_time = 0;
#endif

    bool use_per_expert_ptrs = !config.gate_projs.empty();

    if (config.gate_projs.empty() && config.gate_scale == nullptr) {
      throw std::runtime_error("K2 MoE only supports Packed Int4 with KGroup Scale");
    }

    if (use_per_expert_ptrs) {
      printf("From per-expert pointers (gate_projs)\n");
    } else {
      printf("From Packed Int4 with KGroup Scale\n");
    }

    int& group_size = config.quant_config.group_size;

    pool->dispense_backend()->do_numa_job([&, this](int i) {
      auto& tpc = tps[i]->config_;
      size_t weight_elem_count = tpc.intermediate_size * tpc.hidden_size;
      size_t scales_elem_count = (tpc.hidden_size / group_size) * tpc.intermediate_size;

      tpc.gate_proj = new uint8_t[(tpc.expert_num * weight_elem_count) / 2];
      tpc.up_proj = new uint8_t[(tpc.expert_num * weight_elem_count) / 2];
      tpc.down_proj = new uint8_t[(tpc.expert_num * weight_elem_count) / 2];
      tpc.gate_scale = new ggml_bf16_t[(tpc.expert_num * scales_elem_count)];
      tpc.up_scale = new ggml_bf16_t[(tpc.expert_num * scales_elem_count)];
      tpc.down_scale = new ggml_bf16_t[(tpc.expert_num * scales_elem_count)];

      if (use_per_expert_ptrs) {
        pool->get_subpool(i)->do_work_stealing_job(
            tpc.expert_num, nullptr,
            [&, i](int expert_id_) {
              size_t expert_id = expert_map(physical_to_logical_map, expert_id_);

              uint8_t* src_gate = (uint8_t*)config.gate_projs[0][expert_id];
              uint8_t* src_up = (uint8_t*)config.up_projs[0][expert_id];
              uint8_t* src_down = (uint8_t*)config.down_projs[0][expert_id];
              ggml_bf16_t* src_gate_scale = (ggml_bf16_t*)config.gate_scales[0][expert_id];
              ggml_bf16_t* src_up_scale = (ggml_bf16_t*)config.up_scales[0][expert_id];
              ggml_bf16_t* src_down_scale = (ggml_bf16_t*)config.down_scales[0][expert_id];

              memcpy((uint8_t*)tpc.gate_proj + ((expert_id * weight_elem_count) >> 1),
                     src_gate + ((i * weight_elem_count) >> 1), (weight_elem_count >> 1));

              memcpy((uint8_t*)tpc.up_proj + ((expert_id * weight_elem_count) >> 1),
                     src_up + ((i * weight_elem_count) >> 1), (weight_elem_count >> 1));

              memcpy((ggml_bf16_t*)tpc.gate_scale + (expert_id * scales_elem_count),
                     src_gate_scale + (i * scales_elem_count), sizeof(ggml_bf16_t) * scales_elem_count);

              memcpy((ggml_bf16_t*)tpc.up_scale + (expert_id * scales_elem_count),
                     src_up_scale + (i * scales_elem_count), sizeof(ggml_bf16_t) * scales_elem_count);

              for (size_t col = 0; col < config.hidden_size; col++) {
                memcpy((uint8_t*)tpc.down_proj + ((expert_id * weight_elem_count + col * tpc.intermediate_size) >> 1),
                       src_down + ((col * config.intermediate_size + i * tpc.intermediate_size) >> 1),
                       (tpc.intermediate_size >> 1));
                memcpy((ggml_bf16_t*)tpc.down_scale +
                           (expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
                       src_down_scale +
                           (col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
                       sizeof(ggml_bf16_t) * (tpc.intermediate_size / group_size));
              }
            },
            nullptr);
      } else {
        if (tpc.load == false) {
          pool->get_subpool(i)->do_work_stealing_job(
              tpc.expert_num, nullptr,
              [&, i](int expert_id_) {
                size_t expert_id = expert_map(physical_to_logical_map, expert_id_);

                memcpy((uint8_t*)tpc.gate_proj + ((expert_id * weight_elem_count) >> 1),
                       (uint8_t*)config.gate_proj +
                           ((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
                       ((sizeof(uint8_t) * weight_elem_count) >> 1));

                memcpy((uint8_t*)tpc.up_proj + ((expert_id * weight_elem_count) >> 1),
                       (uint8_t*)config.up_proj +
                           ((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
                       ((sizeof(uint8_t) * weight_elem_count) >> 1));

                memcpy((ggml_bf16_t*)tpc.gate_scale + (expert_id * scales_elem_count),
                       (ggml_bf16_t*)config.gate_scale +
                           (expert_id * (config.hidden_size / group_size) * config.intermediate_size +
                            i * scales_elem_count),
                       sizeof(ggml_bf16_t) * scales_elem_count);

                memcpy((ggml_bf16_t*)tpc.up_scale + (expert_id * scales_elem_count),
                       (ggml_bf16_t*)config.up_scale +
                           (expert_id * (config.hidden_size / group_size) * config.intermediate_size +
                            i * scales_elem_count),
                       sizeof(ggml_bf16_t) * scales_elem_count);

                for (size_t col = 0; col < config.hidden_size; col++) {
                  memcpy((uint8_t*)tpc.down_proj + ((expert_id * weight_elem_count + col * tpc.intermediate_size) >> 1),
                         (uint8_t*)config.down_proj + ((expert_id * config.intermediate_size * config.hidden_size +
                                                        col * config.intermediate_size + i * tpc.intermediate_size) >>
                                                       1),
                         (sizeof(uint8_t) * tpc.intermediate_size) >> 1);
                  memcpy((ggml_bf16_t*)tpc.down_scale +
                             (expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
                         (ggml_bf16_t*)config.down_scale +
                             ((expert_id * (config.intermediate_size / group_size) * config.hidden_size) +
                              col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
                         sizeof(ggml_bf16_t) * (tpc.intermediate_size / group_size));
                }
              },
              nullptr);
        }
      }
      printf("TP %d load weight done.\n", i);
    });

#ifdef LOAD_TIME_PROFILE
    {
      auto load_now_time = std::chrono::high_resolution_clock::now();
      alloc_and_tp_slice_time =
          std::chrono::duration_cast<std::chrono::microseconds>(load_now_time - load_last).count();
      load_last = load_now_time;
    }
#endif

    DO_TPS_LOAD_WEIGHTS(pool);

#ifdef LOAD_TIME_PROFILE
    {
      auto load_now_time = std::chrono::high_resolution_clock::now();
      tps_load_time = std::chrono::duration_cast<std::chrono::microseconds>(load_now_time - load_last).count();
      load_last = load_now_time;
    }
#endif

    pool->dispense_backend()->do_numa_job([&, this](int i) {
      auto& tpc = tps[i]->config_;
      delete[] (uint8_t*)(tpc.gate_proj);
      delete[] (uint8_t*)(tpc.up_proj);
      delete[] (uint8_t*)(tpc.down_proj);

      delete[] (ggml_bf16_t*)(tpc.gate_scale);
      delete[] (ggml_bf16_t*)(tpc.up_scale);
      delete[] (ggml_bf16_t*)(tpc.down_scale);
    });

#ifdef LOAD_TIME_PROFILE
    {
      auto load_now_time = std::chrono::high_resolution_clock::now();
      cleanup_time = std::chrono::duration_cast<std::chrono::microseconds>(load_now_time - load_last).count();
    }
    auto load_end_time = std::chrono::high_resolution_clock::now();
    auto load_total_time =
        std::chrono::duration_cast<std::chrono::microseconds>(load_end_time - load_start_time).count();
    printf(
        "[K2 MoE Load Weights] tp_count: %d, alloc_and_tp_slice: %ld us, tps_load_weights: %ld us, cleanup: %ld us, "
        "total: %ld us\n",
        tp_count, alloc_and_tp_slice_time, tps_load_time, cleanup_time, load_total_time);
#endif

    this->weights_loaded = true;
  }

  void write_weight_scale_to_buffer(int gpu_tp_count, int expert_id, const std::vector<uintptr_t>& w13_weight_ptrs,
                                    const std::vector<uintptr_t>& w13_scale_ptrs,
                                    const std::vector<uintptr_t>& w2_weight_ptrs,
                                    const std::vector<uintptr_t>& w2_scale_ptrs) {
    if (this->weights_loaded == false) {
      throw std::runtime_error("Not Loaded");
    }
    if (this->tps.empty()) {
      throw std::runtime_error("No TP parts initialized");
    }

    if (w13_weight_ptrs.size() != gpu_tp_count || w13_scale_ptrs.size() != gpu_tp_count ||
        w2_weight_ptrs.size() != gpu_tp_count || w2_scale_ptrs.size() != gpu_tp_count) {
      throw std::runtime_error("Pointer arrays size must match gpu_tp_count");
    }

    this->config.pool->dispense_backend()->do_numa_job([&, this](int i) {
      this->tps[i]->write_weights_to_buffer(gpu_tp_count, this->tp_count, expert_id, this->config, w13_weight_ptrs,
                                            w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs);
    });
  }

  // merge_results is inherited from TP_MOE<AMX_MOE_BASE<K, AMX_K2_MOE_TP<K>>>
};

#endif  // CPUINFER_OPERATOR_AMX_K2_MOE_H


================================================
FILE: kt-kernel/operators/amx/la/amx-example.cpp
================================================
#include <random>
#include <stdexcept>

#include "amx.hpp"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"

int main() {
  // init GGML
  struct ggml_init_params params = {
      0,
      NULL,
      true,
  };

  auto ctx_eval = ggml_init(params);

  if (!ctx_eval) {
    throw std::runtime_error("Failed to create ggml context");
  }

  // Allocate Memory
  int m = 1000, n = 8, k = 512;
  float* a = new float[m * k];  // m x k, Row Major
  float* b = new float[k * n];  // k x n, Column Major
  size_t c_row_size = n * sizeof(float);
  c_row_size = (c_row_size + 63) / 64 * 64;  // pad C row
  float* c = new (std::align_val_t(64)) float[m * c_row_size];
  memset(c, 0, m * c_row_size * sizeof(float));
  size_t ldc = c_row_size * sizeof(float);

  std::mt19937 gen(123);
  std::uniform_real_distribution<float> dis(0, 16);
  for (int i = 0; i < m * k; i++) {
    a[i] = dis(gen);
  }
  for (int i = 0; i < k * n; i++) {
    b[i] = dis(gen);
  }

  // Convert to BF16
  // QA and QB must be aligned to 64 for BF16
  // k is a multiple of 32, so no need for padding
  ggml_bf16_t* qa = new (std::align_val_t(64)) ggml_bf16_t[m * k];
  size_t lda = k * sizeof(ggml_bf16_t);
  ggml_bf16_t* qb = new (std::align_val_t(64)) ggml_bf16_t[k * n];
  size_t ldb = k * sizeof(ggml_bf16_t);
  ggml_fp32_to_bf16_row(a, qa, m * k);
  ggml_fp32_to_bf16_row(b, qb, k * n);

  // AMX Computation
  amx::init_tile(GGML_TYPE_BF16, GGML_TYPE_BF16, GGML_TYPE_F32);
  int nth = amx::recommended_nth(m, n, k, GGML_TYPE_BF16, GGML_TYPE_BF16, GGML_TYPE_F32);

#pragma omp parallel for
  for (int ith = 0; ith < nth; ith++) {
    amx::gemm(m, n, k, qa, lda, GGML_TYPE_BF16, qb, ldb, GGML_TYPE_BF16, c, ldc, GGML_TYPE_F32, ith, nth);
  }

  // Check
  float* d = new float[m * n];
  memset(d, 0, m * n * sizeof(float));
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      for (int kk = 0; kk < k; kk++) {
        d[i * n + j] += a[i * k + kk] * b[j * k + kk];
      }
    }
  }

  float max_error = 0;
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      max_error = std::max(max_error, std::abs(d[i * n + j] - c[i * c_row_size + j]) / std::abs(d[i * n + j]));
      // printf("%.2f ",c[i*c_row_size+j]);
    }
    // printf("\n");
  }
  printf("Max Error %f%%\n", max_error * 100);

  return 0;
}


================================================
FILE: kt-kernel/operators/amx/la/amx.hpp
================================================
#ifndef AMX_HPP
#define AMX_HPP
#include <emmintrin.h>
#include <immintrin.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <tmmintrin.h>
#include <unistd.h>

#include <cassert>
#include <cstdio>
#include <stdexcept>

#include "llama.cpp/ggml-quants.h"

// Include the split AMX headers
#include "amx_config.hpp"
#include "amx_kernels.hpp"

namespace amx {

static inline __m512 exp_avx512(__m512 x) {
  const __m512 log2e = _mm512_set1_ps(1.44269504089f);
  const __m512 c1 = _mm512_set1_ps(0.69314718056f);

  __m512 y = _mm512_mul_ps(x, log2e);
  __m512i int_part = _mm512_cvtps_epi32(y);
  __m512 frac_part = _mm512_sub_ps(y, _mm512_cvtepi32_ps(int_part));

  const __m512 poly_1 = _mm512_set1_ps(0.9999999995f);
  const __m512 poly_2 = _mm512_set1_ps(0.6931471805f);
  const __m512 poly_3 = _mm512_set1_ps(0.2402265069f);
  const __m512 poly_4 = _mm512_set1_ps(0.0555041087f);
  const __m512 poly_5 = _mm512_set1_ps(0.0096181291f);
  const __m512 poly_6 = _mm512_set1_ps(0.0013333558f);

  __m512 frac_exp = _mm512_fmadd_ps(
      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(poly_6, frac_part, poly_5), frac_part, poly_4),
                                      frac_part, poly_3),
                      frac_part, poly_2),
      frac_part, poly_1);

  __m512 two_pow_i = _mm512_scalef_ps(_mm512_set1_ps(1.0f), _mm512_cvtepi32_ps(int_part));
  return _mm512_mul_ps(two_pow_i, frac_exp);
}

static inline __m512 act_fn(__m512 gate_val, __m512 up_val) {
  __m512 neg_gate_val = _mm512_sub_ps(_mm512_setzero_ps(), gate_val);
  // Clamp neg_gate_val to avoid exp overflow (exp(88) overflows for float32)
  const __m512 max_exp_input = _mm512_set1_ps(88.0f);
  neg_gate_val = _mm512_min_ps(neg_gate_val, max_exp_input);
  __m512 exp_neg_gate = exp_avx512(neg_gate_val);
  __m512 denom = _mm512_add_ps(_mm512_set1_ps(1.0f), exp_neg_gate);
  __m512 act_val = _mm512_div_ps(gate_val, denom);

  return _mm512_mul_ps(act_val, up_val);
}

#define AMX_DISPATCH_QTYPES(QA, QB, ...)                                 \
  [&] {                                                                  \
    switch (QB) {                                                        \
      case GGML_TYPE_Q8_0: {                                             \
        using qb = block_q8_0;                                           \
        switch (QA) {                                                    \
          case GGML_TYPE_Q4_0: {                                         \
            using qa = block_q4_0;                                       \
            return __VA_ARGS__();                                        \
          }                                                              \
          case GGML_TYPE_Q8_0: {                                         \
            using qa = block_q8_0;                                       \
            return __VA_ARGS__();                                        \
          }                                                              \
          default:                                                       \
            throw std::runtime_error("Unsupported quantized data type"); \
        }                                                                \
      }                                                                  \
      case GGML_TYPE_Q8_K: {                                             \
        using qb = block_q8_K;                                           \
        switch (QA) {                                                    \
          case GGML_TYPE_Q4_K: {                                         \
            using qa = block_q4_K;                                       \
            return __VA_ARGS__();                                        \
          }                                                              \
          default:                                                       \
            throw std::runtime_error("Unsupported quantized data type"); \
        }                                                                \
      }                                                                  \
      case GGML_TYPE_BF16: {                                             \
        using qb = ggml_bf16_t;                                          \
        switch (QA) {                                                    \
          case GGML_TYPE_BF16: {                                         \
            using qa = ggml_bf16_t;                                      \
            return __VA_ARGS__();                                        \
          }                                                              \
          default:                                                       \
            throw std::runtime_error("Unsupported quantized data type"); \
        }                                                                \
      }                                                                  \
      default:                                                           \
        throw std::runtime_error("Unsupported quantized data type");     \
    }                                                                    \
  }()

inline void gemm(int m, int n, int k, const void* a, size_t lda, int type_a, const void* b, size_t ldb, int type_b,
                 void* c, size_t ldc, int type_c, int ith, int nth) {
  assert(reinterpret_cast<intptr_t>(c) % 64 == 0);
  assert(ldc % 64 == 0);
  assert(type_c == GGML_TYPE_F32);
  float* cs = (float*)c;
  AMX_DISPATCH_QTYPES(type_a, type_b, [&]() { mat_mul(m, n, k, (qa*)a, lda, (qb*)b, ldb, cs, ldc, ith, nth); });
}

inline void init_tile(int type_a, int type_b, int type_c) {
#ifdef HAVE_AMX
  enable_amx();
  assert(type_c == GGML_TYPE_F32);
  AMX_DISPATCH_QTYPES(type_a, type_b, []() { return GemmKernel<qa, qb, float>::type::config(); });
#endif
}

inline int recommended_nth(int m, int n, int k, int type_a, int type_b, int type_c) {
  assert(type_c == GGML_TYPE_F32);
  return AMX_DISPATCH_QTYPES(type_a, type_b, [&]() { return GemmKernel<qa, qb, float>::type::recommended_nth(m); });
}

}  // namespace amx

#endif  // AMX_HPP

================================================
FILE: kt-kernel/operators/amx/la/amx_buffers.hpp
================================================
#ifndef AMX_BUFFERS_HPP
#define AMX_BUFFERS_HPP
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>
#include <vector>

#include "amx_config.hpp"
#include "amx_utils.hpp"
#include "llama.cpp/ggml-impl.h"
#include "pack.hpp"
#include "utils.hpp"

namespace amx {

template <typename K>
struct BufferAImpl {
  int8_t* a;
  float* d;
  int max_m, k;

  static constexpr int M_STEP = K::M_STEP;
  static constexpr int K_STEP = K::K_STEP;
  static constexpr int K_BLOCK = K::K_BLOCK;

  static size_t required_size(int max_m, int k) { return sizeof(int8_t) * max_m * k + sizeof(float) * max_m; }

  BufferAImpl(int max_m, int k, void* ptr) : max_m(max_m), k(k) {
    assert(max_m % M_STEP == 0);
    assert(k % K_STEP == 0);
    if (max_m % M_STEP || k % K_STEP) {
      printf("max_m = %d, k = %d, M_STEP = %d, K_STEP = %d\n", max_m, k, M_STEP, K_STEP);
      throw std::runtime_error("BufferAImpl: max_m and k must be multiple of M_STEP and K_STEP");
    }
    set_data(ptr);
  }

  void set_data(void* ptr) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    a = reinterpret_cast<int8_t*>(ptr);
    d = reinterpret_cast<float*>(a + max_m * k);
  }

  void from_mat(int m, ggml_bf16_t* src, int ith, int nth) {
    assert(m <= max_m);
    assert(ith == 0 && nth == 1);
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
        __m512 amax_v0 = _mm512_setzero_ps();
        __m512 amax_v1 = _mm512_setzero_ps();
        __m512 amax_v2 = _mm512_setzero_ps();
        __m512 amax_v3 = _mm512_setzero_ps();
        __m512 amax_v4 = _mm512_setzero_ps();
        __m512 amax_v5 = _mm512_setzero_ps();
        __m512 amax_v6 = _mm512_setzero_ps();
        __m512 amax_v7 = _mm512_setzero_ps();
        for (int j = 0; j < k; j += 128) {
          __m512 f0, f1, f2, f3, f4, f5, f6, f7;
          avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j + 0), &f0, &f1);
          avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j + 32), &f2, &f3);
          avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j + 64), &f4, &f5);
          avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j + 96), &f6, &f7);
          amax_v0 = vector_abs_max(amax_v0, f0);
          amax_v1 = vector_abs_max(amax_v1, f1);
          amax_v2 = vector_abs_max(amax_v2, f2);
          amax_v3 = vector_abs_max(amax_v3, f3);
          amax_v4 = vector_abs_max(amax_v4, f4);
          amax_v5 = vector_abs_max(amax_v5, f5);
          amax_v6 = vector_abs_max(amax_v6, f6);
          amax_v7 = vector_abs_max(amax_v7, f7);
        }
        amax_v0 = vector_abs_max(amax_v0, amax_v1);
        amax_v2 = vector_abs_max(amax_v2, amax_v3);
        amax_v4 = vector_abs_max(amax_v4, amax_v5);
        amax_v6 = vector_abs_max(amax_v6, amax_v7);
        amax_v0 = vector_abs_max(amax_v0, amax_v2);
        amax_v4 = vector_abs_max(amax_v4, amax_v6);
        amax_v0 = vector_abs_max(amax_v0, amax_v4);
        float amax = _mm512_reduce_max_ps(amax_v0);
        d[m_begin + i] = amax / ((1 << 7) - 1);
      }
    }
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
          for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
            __m512 id = _mm512_set1_ps(d[m_begin + i] ? 1.0f / d[m_begin + i] : 0.0f);
            int8_t* dst = a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP + i * K_STEP;
            __m512 f0, f1, f2, f3;
            avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + k_block_begin + k_begin), &f0, &f1);
            avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
            __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id));
            __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id));
            __m512i i2 = _mm512_cvtps_epi32(_mm512_mul_ps(f2, id));
            __m512i i3 = _mm512_cvtps_epi32(_mm512_mul_ps(f3, id));
            __m128i s0 = _mm512_cvtsepi32_epi8(i0);
            __m128i s1 = _mm512_cvtsepi32_epi8(i1);
            __m128i s2 = _mm512_cvtsepi32_epi8(i2);
            __m128i s3 = _mm512_cvtsepi32_epi8(i3);
            _mm_store_si128((__m128i*)dst, s0);
            _mm_store_si128((__m128i*)(dst + 16), s1);
            _mm_store_si128((__m128i*)(dst + 32), s2);
            _mm_store_si128((__m128i*)(dst + 48), s3);
          }
        }
      }
    }
  }

  int8_t* get_submat(int m, int k, int m_begin, int k_begin) {
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
    k_begin -= k_block_begin;
    int k_block_size = std::min(K_BLOCK, k - k_block_begin);
    return a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP;
  }

  float* get_scale(int m, int m_begin) { return d + m_begin; }
};

template <typename K>
struct BufferAWithSumImpl {
  int8_t* a;
  float* d;
  float* sum;
  int max_m, k;

  static constexpr int M_STEP = K::M_STEP;
  static constexpr int K_STEP = K::K_STEP;
  static constexpr int K_BLOCK = K::K_BLOCK;

  static size_t required_size(int max_m, int k) { return sizeof(int8_t) * max_m * k + sizeof(float) * max_m * 2; }

  BufferAWithSumImpl(int max_m, int k, void* ptr) : max_m(max_m), k(k) {
    assert(max_m % M_STEP == 0);
    assert(k % K_STEP == 0);
    if (max_m % M_STEP || k % K_STEP) {
      printf("max_m = %d, k = %d, M_STEP = %d, K_STEP = %d\n", max_m, k, M_STEP, K_STEP);
      throw std::runtime_error("BufferAWithSumImpl: max_m and k must be multiple of M_STEP and K_STEP");
    }
    set_data(ptr);
  }

  void set_data(void* ptr) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    a = reinterpret_cast<int8_t*>(ptr);
    d = reinterpret_cast<float*>(a + max_m * k);
    sum = d + max_m;
  }

  void from_mat(int m, ggml_bf16_t* src, int ith, int nth) {
    assert(m <= max_m);
    assert(ith == 0 && nth == 1);
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
        float amax = 0.0f;
        float row_sum = 0.0f;
        for (int j = 0; j < k; j += 32) {
          __m512 f0, f1;
          avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j), &f0, &f1);
          amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
          amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
          row_sum += _mm512_reduce_add_ps(f0);
          row_sum += _mm512_reduce_add_ps(f1);
        }
        d[m_begin + i] = amax / ((1 << 7) - 1);
        sum[m_begin + i] = row_sum;
      }
    }
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
          for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
            __m512 id = _mm512_set1_ps(d[m_begin + i] ? 1.0f / d[m_begin + i] : 0.0f);
            int8_t* dst = a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP + i * K_STEP;
            __m512 f0, f1, f2, f3;
            avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + k_block_begin + k_begin), &f0, &f1);
            avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
            __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id));
            __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id));
            __m512i i2 = _mm512_cvtps_epi32(_mm512_mul_ps(f2, id));
            __m512i i3 = _mm512_cvtps_epi32(_mm512_mul_ps(f3, id));
            __m128i s0 = _mm512_cvtsepi32_epi8(i0);
            __m128i s1 = _mm512_cvtsepi32_epi8(i1);
            __m128i s2 = _mm512_cvtsepi32_epi8(i2);
            __m128i s3 = _mm512_cvtsepi32_epi8(i3);
            _mm_store_si128((__m128i*)dst, s0);
            _mm_store_si128((__m128i*)(dst + 16), s1);
            _mm_store_si128((__m128i*)(dst + 32), s2);
            _mm_store_si128((__m128i*)(dst + 48), s3);
          }
        }
      }
    }
  }

  int8_t* get_submat(int m, int k, int m_begin, int k_begin) {
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
    k_begin -= k_block_begin;
    int k_block_size = std::min(K_BLOCK, k - k_block_begin);
    return a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP;
  }

  float* get_scale(int m, int m_begin) { return d + m_begin; }
  float* get_sum(int m, int m_begin) { return sum + m_begin; }
};

template <typename K>
struct BufferAWithSumKGroupImpl {
  int8_t* a;
  float* d;
  float* sum;
  int max_m, k, k_group_size, k_group_count;

  static constexpr int M_STEP = K::M_STEP;
  static constexpr int K_STEP = K::K_STEP;
  static constexpr int K_BLOCK = K::K_BLOCK;

  static size_t required_size(int max_m, int k, int k_group_size) {
    return sizeof(int8_t) * max_m * k + sizeof(float) * max_m * (k / k_group_size) * 2;
  }

  BufferAWithSumKGroupImpl(int max_m, int k, int k_group_size, void* ptr)
      : max_m(max_m), k(k), k_group_size(k_group_size) {
    if (max_m % M_STEP || k % K_STEP || k % k_group_size) {
      printf("max_m = %d, k = %d, M_STEP = %d, K_STEP = %d, k_group_size = %d\n", max_m, k, M_STEP, K_STEP,
             k_group_size);
      throw std::runtime_error("BufferAWithSumImpl: max_m and k must be multiple of M_STEP and K_STEP");
    }
    k_group_count = k / k_group_size;
    set_data(ptr);
  }

  void set_data(void* ptr) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    a = reinterpret_cast<int8_t*>(ptr);
    d = reinterpret_cast<float*>(a + max_m * k);
    sum = d + max_m * k_group_count;
  }

  void from_mat(int m, ggml_bf16_t* src, int ith, int nth) {
    assert(m <= max_m);
    assert(ith == 0 && nth == 1);
    // for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
    //   for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
    //     for(int kg = 0; kg < k_group_count; kg++){
    //       float amax = 0.0f;
    //       float row_sum = 0.0f;
    //       for (int j = 0; j < k; j += 32) {
    //         __m512 f0, f1;
    //         avx512_32xbf16_to_32xfp32((__m512i *)(src + (m_begin + i) * k + j), &f0, &f1);
    //         amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
    //         amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
    //         row_sum += _mm512_reduce_add_ps(f0);
    //         row_sum += _mm512_reduce_add_ps(f1);
    //       }
    //       d[(m_begin + i) * k_group_count + kg] = amax / ((1 << 7) - 1);
    //       sum[(m_begin + i) * k_group_count + kg] = row_sum;
    //     }
    //   }
    // }
    for (int m_idx = 0; m_idx < m; m_idx++) {
      for (int kg = 0; kg < k_group_count; kg++) {
        float amax = 0.0f;
        float row_sum = 0.0f;
        int k_start = kg * k_group_size;
        int k_end = k_start + k_group_size;
        for (int j = k_start; j < k_end; j += 32) {
          __m512 f0, f1;
          avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_idx)*k + j), &f0, &f1);
          amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
          amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
          row_sum += _mm512_reduce_add_ps(f0);
          row_sum += _mm512_reduce_add_ps(f1);
        }
        d[(m_idx)*k_group_count + kg] = amax / ((1 << 7) - 1);
        sum[(m_idx)*k_group_count + kg] = row_sum;
      }
    }
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
          for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
            int k_group_idx = (k_block_begin + k_begin) / k_group_size;
            float scale = d[(m_begin + i) * k_group_count + k_group_idx];
            __m512 id = _mm512_set1_ps(scale ? 1.0f / scale : 0.0f);
            // __m512 id = _mm512_set1_ps(d[m_begin + i] ? 1.0f / d[m_begin + i] : 0.0f);
            int8_t* dst = a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP + i * K_STEP;
            __m512 f0, f1, f2, f3;
            avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + k_block_begin + k_begin), &f0, &f1);
            avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
            __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id));
            __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id));
            __m512i i2 = _mm512_cvtps_epi32(_mm512_mul_ps(f2, id));
            __m512i i3 = _mm512_cvtps_epi32(_mm512_mul_ps(f3, id));
            __m128i s0 = _mm512_cvtsepi32_epi8(i0);
            __m128i s1 = _mm512_cvtsepi32_epi8(i1);
            __m128i s2 = _mm512_cvtsepi32_epi8(i2);
            __m128i s3 = _mm512_cvtsepi32_epi8(i3);
            _mm_store_si128((__m128i*)dst, s0);
            _mm_store_si128((__m128i*)(dst + 16), s1);
            _mm_store_si128((__m128i*)(dst + 32), s2);
            _mm_store_si128((__m128i*)(dst + 48), s3);
          }
        }
      }
    }
  }

  int8_t* get_submat(int m, int k, int m_begin, int k_begin) {
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
    k_begin -= k_block_begin;
    int k_block_size = std::min(K_BLOCK, k - k_block_begin);
    return a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP;
  }

  float* get_scale(int m, int m_begin, int k, int k_begin) {
    int k_group_idx = k_begin / k_group_size;
    return d + m_begin * k_group_count + k_group_idx;
  }
  float* get_sum(int m, int m_begin, int k, int k_begin) {
    int k_group_idx = k_begin / k_group_size;
    return sum + m_begin * k_group_count + k_group_idx;
  }
};

template <typename K>
struct BufferAKGroupImpl {
  int8_t* a;
  float* d;
  int max_m, k, k_group_size, k_group_count;

  static constexpr int M_STEP = K::M_STEP;
  static constexpr int K_STEP = K::K_STEP;
  static constexpr int K_BLOCK = K::K_BLOCK;

  static size_t required_size(int max_m, int k, int k_group_size) {
    ASSERT_RELEASE(k % k_group_size == 0, "k must be multiple of k_group_size");
    return sizeof(int8_t) * max_m * k + sizeof(float) * max_m * (k / k_group_size);
  }

  BufferAKGroupImpl(int max_m, int k, int k_group_size, void* ptr) : max_m(max_m), k(k), k_group_size(k_group_size) {
    ASSERT_RELEASE(k % k_group_size == 0, "k must be multiple of k_group_size");
    ASSERT_RELEASE(max_m % M_STEP == 0, "max_m must be multiple of M_STEP");
    ASSERT_RELEASE(k % K_STEP == 0, "k must be multiple of K_STEP");
    ASSERT_RELEASE(K_BLOCK % k_group_size == 0, "K_BLOCK must be multiple of k_group_size");
    // ASSERT_RELEASE(k % K_BLOCK == 0, "k must be multiple of K_BLOCK");
    k_group_count = k / k_group_size;

    set_data(ptr);
  }

  void set_data(void* ptr) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    a = reinterpret_cast<int8_t*>(ptr);
    d = reinterpret_cast<float*>(a + max_m * k);
  }

  int8_t* get_submat(int m, int k, int m_begin, int k_begin) {
    // Follow BufferAImpl pattern
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
    k_begin -= k_block_begin;
    int k_block_size = std::min(K_BLOCK, k - k_block_begin);
    return a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP;
  }

  void from_mat(int m, ggml_bf16_t* src, int ith, int nth) {
    assert(m <= max_m);
    assert(ith == 0 && nth == 1);

    // 计算每个 k_group 的 scale
    for (int m_idx = 0; m_idx < m; m_idx++) {
      for (int kg = 0; kg < k_group_count; kg++) {
        float amax = 0.0f;
        int k_start = kg * k_group_size;
        int k_end = k_start + k_group_size;
        // 32 -> M_STEP
        for (int j = k_start; j < k_end; j += 32) {
          __m512 f0, f1;
          avx512_32xbf16_to_32xfp32((__m512i*)(src + m_idx * k + j), &f0, &f1);
          amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
          amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
        }
        d[m_idx * k_group_count + kg] = amax / ((1 << 7) - 1);
      }
    }

    // Simplified quantization following BufferAImpl pattern but with k-group support
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
          for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
            // Get the scale for this k_group
            int k_group_idx = (k_block_begin + k_begin) / k_group_size;
            float scale = d[(m_begin + i) * k_group_count + k_group_idx];
            __m512 id = _mm512_set1_ps(scale ? 1.0f / scale : 0.0f);

            // Calculate destination similar to BufferAImpl but accounting for k-groups
            int8_t* dst = a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP + i * K_STEP;

            __m512 f0, f1, f2, f3;
            avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + k_block_begin + k_begin), &f0, &f1);
            avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
            __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id));
            __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id));
            __m512i i2 = _mm512_cvtps_epi32(_mm512_mul_ps(f2, id));
            __m512i i3 = _mm512_cvtps_epi32(_mm512_mul_ps(f3, id));
            __m128i s0 = _mm512_cvtsepi32_epi8(i0);
            __m128i s1 = _mm512_cvtsepi32_epi8(i1);
            __m128i s2 = _mm512_cvtsepi32_epi8(i2);
            __m128i s3 = _mm512_cvtsepi32_epi8(i3);
            _mm_store_si128((__m128i*)dst, s0);
            _mm_store_si128((__m128i*)(dst + 16), s1);
            _mm_store_si128((__m128i*)(dst + 32), s2);
            _mm_store_si128((__m128i*)(dst + 48), s3);
          }
        }
      }
    }
  }

  float* get_scale(int m, int m_begin, int k, int k_begin) {
    int k_group_idx = k_begin / k_group_size;
    return d + m_begin * k_group_count + k_group_idx;
  }
};

// BufferASmallKGroupImpl: For kernels with K_STEP=32 (e.g., GemmKernel224Int4SmallKGroup)
// This fixes the buffer overflow issue where the base class writes 64 bytes per K_STEP iteration
// but the buffer is only sized for 32-byte steps.
template <typename K>
struct BufferASmallKGroupImpl : public BufferAKGroupImpl<K> {
  using Base = BufferAKGroupImpl<K>;
  using Base::a;
  using Base::d;
  using Base::k;
  using Base::k_group_count;
  using Base::k_group_size;
  using Base::max_m;

  static constexpr int M_STEP = K::M_STEP;
  static constexpr int K_STEP = K::K_STEP;
  static constexpr int K_BLOCK = K::K_BLOCK;

  BufferASmallKGroupImpl(int max_m, int k, int k_group_size, void* ptr) : Base(max_m, k, k_group_size, ptr) {}

  // Override from_mat to write only 32 bytes per K_STEP iteration
  void from_mat(int m, ggml_bf16_t* src, int ith, int nth) {
    assert(m <= max_m);
    assert(ith == 0 && nth == 1);

    // Calculate scale for each k_group (same as base class)
    for (int m_idx = 0; m_idx < m; m_idx++) {
      for (int kg = 0; kg < k_group_count; kg++) {
        float amax = 0.0f;
        int k_start = kg * k_group_size;
        int k_end = k_start + k_group_size;
        for (int j = k_start; j < k_end; j += 32) {
          __m512 f0, f1;
          avx512_32xbf16_to_32xfp32((__m512i*)(src + m_idx * k + j), &f0, &f1);
          amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
          amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
        }
        d[m_idx * k_group_count + kg] = amax / ((1 << 7) - 1);
      }
    }

    // Quantization with 32-byte writes per K_STEP iteration
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
          for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
            // Get the scale for this k_group
            int k_group_idx = (k_block_begin + k_begin) / k_group_size;
            float scale = d[(m_begin + i) * k_group_count + k_group_idx];
            __m512 id = _mm512_set1_ps(scale ? 1.0f / scale : 0.0f);

            // Calculate destination - writes K_STEP (32) bytes
            int8_t* dst = a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP + i * K_STEP;

            // Only process 32 bytes (2 x __m512 -> 2 x __m128i) per iteration
            __m512 f0, f1;
            avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + k_block_begin + k_begin), &f0, &f1);
            __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id));
            __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id));
            __m128i s0 = _mm512_cvtsepi32_epi8(i0);
            __m128i s1 = _mm512_cvtsepi32_epi8(i1);
            _mm_store_si128((__m128i*)dst, s0);
            _mm_store_si128((__m128i*)(dst + 16), s1);
          }
        }
      }
    }
  }
};

template <typename K>
struct BufferBInt4Impl {
  using dt = typename K::dt;
  dt* b;
  float* d;
  int n, k;

  static constexpr int N_STEP = K::N_STEP;
  static constexpr int N_BLOCK = K::N_BLOCK;
  static constexpr int TILE_N = K::TILE_N;

  static constexpr int K_STEP = K::K_STEP;
  static constexpr int K_BLOCK = K::K_BLOCK;
  static const int B_K_STEP = 2 * K_STEP;
  static constexpr bool SCALE = true;

  static size_t required_size(int n, int k) { return sizeof(int8_t) * n * k / 2 + sizeof(float) * n; }

  BufferBInt4Impl(int n, int k, void* ptr) : n(n), k(k) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    assert(n % N_STEP == 0);
    assert(k % B_K_STEP == 0);
    if (n % N_STEP || k % B_K_STEP) {
      printf("n: %d, k: %d, N_STEP: %d, B_K_STEP: %d\n", n, k, N_STEP, B_K_STEP);
      throw std::runtime_error("n or k is not aligned to N_STEP or B_K_STEP");
    }
    b = reinterpret_cast<dt*>(ptr);
    d = reinterpret_cast<float*>(offset_pointer(b, n * k / 2));
  }

  static __m128i round_4bit_s8(__m128i x) {
    __m128i s = _mm_and_si128(x, _mm_set1_epi8(0x80));
    s = _mm_or_si128(s, _mm_srai_epi16(s, 1));
    s = _mm_or_si128(s, _mm_srai_epi16(s, 2));
    s = _mm_or_si128(s, _mm_srai_epi16(s, 4));

    x = _mm_abs_epi8(x);
    x = _mm_add_epi8(x, _mm_set1_epi8(0x08));
    x = _mm_and_si128(x, _mm_set1_epi8(0xF0));
    x = _mm_xor_si128(x, s);
    x = _mm_sub_epi8(x, s);
    return x;
  }

  void from_mat(ggml_bf16_t* src, int ith, int nth) {
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;
    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      for (int i = 0; i < N_STEP; i++) {
        float amax = 0.0f;
        for (int j = 0; j < k; j += 32) {
          __m512 f0, f1;
          avx512_32xbf16_to_32xfp32((__m512i*)(src + (n_block_begin + n_begin + i) * k + j), &f0, &f1);
          amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
          amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
        }
        d[n_block_begin + n_begin + i] = amax / 112.0;  // 7*16
      }
    }
    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += B_K_STEP) {
          for (int i = 0; i < N_STEP; i++) {
            __m512 id = _mm512_set1_ps(d[n_block_begin + n_begin + i] ? 1.0f / d[n_block_begin + n_begin + i] : 0.0f);
            dt* dst = offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                         k_begin * N_STEP + i * B_K_STEP) /
                                            2);
            {
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32((__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin),
                                        &f0, &f1);
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
              __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id));
              __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id));
              __m512i i2 = _mm512_cvtps_epi32(_mm512_mul_ps(f2, id));
              __m512i i3 = _mm512_cvtps_epi32(_mm512_mul_ps(f3, id));
              __m128i s0 = _mm512_cvtsepi32_epi8(i0);
              __m128i s1 = _mm512_cvtsepi32_epi8(i1);
              __m128i s2 = _mm512_cvtsepi32_epi8(i2);
              __m128i s3 = _mm512_cvtsepi32_epi8(i3);
              s0 = _mm_srli_epi16(round_4bit_s8(s0), 4);
              s1 = _mm_srli_epi16(round_4bit_s8(s1), 4);
              s2 = _mm_srli_epi16(round_4bit_s8(s2), 4);
              s3 = _mm_srli_epi16(round_4bit_s8(s3), 4);
              // s0 = _mm_or_si128(round_up4(s0), _mm_srli_epi16(round_up4(s1), 4));
              // s2 = _mm_or_si128(round_up4(s2), _mm_srli_epi16(round_up4(s3), 4));
              _mm_store_si128((__m128i*)dst, s0);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 16)), s1);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 32)), s2);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 48)), s3);
            }

            {
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 2, &f0, &f1);
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 3, &f2, &f3);
              __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id));
              __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id));
              __m512i i2 = _mm512_cvtps_epi32(_mm512_mul_ps(f2, id));
              __m512i i3 = _mm512_cvtps_epi32(_mm512_mul_ps(f3, id));
              __m128i s0 = _mm512_cvtsepi32_epi8(i0);
              __m128i s1 = _mm512_cvtsepi32_epi8(i1);
              __m128i s2 = _mm512_cvtsepi32_epi8(i2);
              __m128i s3 = _mm512_cvtsepi32_epi8(i3);
              s0 = round_4bit_s8(s0);
              s1 = round_4bit_s8(s1);
              s2 = round_4bit_s8(s2);
              s3 = round_4bit_s8(s3);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 0)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 0))), s0));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 16)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 16))), s1));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 32)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 32))), s2));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 48)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 48))), s3));
            }
          }
          transpose_16x16_32bit((__m512i*)(offset_pointer(
              b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP) / 2)));
          transpose_16x16_32bit(
              (__m512i*)(offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                            k_begin * N_STEP + TILE_N * B_K_STEP) /
                                               2)));
        }
      }
    }
  }

  dt* get_submat(int n, int k, int n_begin, int k_begin) {
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    n_begin -= n_block_begin;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
    k_begin -= k_block_begin;
    int k_block_size = std::min(K_BLOCK, k - k_block_begin);
    return offset_pointer(
        b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP) / 2);
  }

  float* get_scale(int n, int n_begin) { return d + n_begin; }
};

template <typename K>
struct BufferBKGroupImpl {
  using dt = typename K::dt;
  dt* b;
  float* d;
  int n, k, k_group_size, k_group_count;

  static constexpr int N_STEP = K::N_STEP;
  static constexpr int N_BLOCK = K::N_BLOCK;
  static constexpr int TILE_N = K::TILE_N;

  static constexpr int K_STEP = K::K_STEP;
  static constexpr int K_BLOCK = K::K_BLOCK;
  static const int B_K_STEP = 2 * K_STEP;
  static constexpr bool SCALE = true;

  static size_t required_size(int n, int k, int k_group_size) {
    ASSERT_RELEASE(k % k_group_size == 0, "k must be multiple of k_group_size");
    return sizeof(int8_t) * n * k / 2 + sizeof(float) * n * (k / k_group_size);
  }

  BufferBKGroupImpl(int n, int k, int k_group_size, void* ptr) : n(n), k(k), k_group_size(k_group_size) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    assert(n % N_STEP == 0);
    assert(k % B_K_STEP == 0);
    ASSERT_RELEASE(k % k_group_size == 0, "k must be multiple of k_group_size");
    ASSERT_RELEASE(K_BLOCK % k_group_size == 0, "K_BLOCK must be multiple of k_group_size");
    if (n % N_STEP || k % B_K_STEP) {
      printf("n: %d, k: %d, N_STEP: %d, B_K_STEP: %d\n", n, k, N_STEP, B_K_STEP);
      throw std::runtime_error("n or k is not aligned to N_STEP or B_K_STEP");
    }
    k_group_count = k / k_group_size;
    b = reinterpret_cast<dt*>(ptr);
    d = reinterpret_cast<float*>(offset_pointer(b, n * k / 2));
  }

  static __m128i round_4bit_s8(__m128i x) {
    __m128i s = _mm_and_si128(x, _mm_set1_epi8(0x80));
    s = _mm_or_si128(s, _mm_srai_epi16(s, 1));
    s = _mm_or_si128(s, _mm_srai_epi16(s, 2));
    s = _mm_or_si128(s, _mm_srai_epi16(s, 4));

    x = _mm_abs_epi8(x);
    x = _mm_add_epi8(x, _mm_set1_epi8(0x08));
    x = _mm_and_si128(x, _mm_set1_epi8(0xF0));
    x = _mm_xor_si128(x, s);
    x = _mm_sub_epi8(x, s);
    return x;
  }

  void from_mat(ggml_bf16_t* src, int ith, int nth) {
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;

    // Compute scales per k-group for each n
    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      for (int i = 0; i < N_STEP; i++) {
        for (int kg = 0; kg < k_group_count; kg++) {
          float amax = 0.0f;
          int k_start = kg * k_group_size;
          int k_end = k_start + k_group_size;

          for (int j = k_start; j < k_end; j += 32) {
            __m512 f0, f1;
            avx512_32xbf16_to_32xfp32((__m512i*)(src + (n_block_begin + n_begin + i) * k + j), &f0, &f1);
            amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
            amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
          }
          d[kg * n + (n_block_begin + n_begin + i)] = amax / 112.0;  // 7*16
          // d[(n_block_begin + n_begin + i) * k_group_count + kg] = amax / 112.0; // 7*16
        }
      }
    }

    // Quantize with per k-group scaling
    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += B_K_STEP) {
          for (int i = 0; i < N_STEP; i++) {
            // Get the scale for this k_group
            int k_group_idx0 = (k_block_begin + k_begin) / k_group_size;
            int k_group_idx1 = (k_block_begin + k_begin + K_STEP) / k_group_size;
            float scale0 = d[k_group_idx0 * n + (n_block_begin + n_begin + i)];
            float scale1 = d[k_group_idx1 * n + (n_block_begin + n_begin + i)];
            // float scale = d[(n_block_begin + n_begin + i) * k_group_count + k_group_idx];
            __m512 id0 = _mm512_set1_ps(scale0 ? 1.0f / scale0 : 0.0f);
            __m512 id1 = _mm512_set1_ps(scale1 ? 1.0f / scale1 : 0.0f);

            dt* dst = offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                         k_begin * N_STEP + i * B_K_STEP) /
                                            2);
            {
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32((__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin),
                                        &f0, &f1);
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
              __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id0));
              __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id0));
              __m512i i2 = _mm512_cvtps_epi32(_mm512_mul_ps(f2, id0));
              __m512i i3 = _mm512_cvtps_epi32(_mm512_mul_ps(f3, id0));
              __m128i s0 = _mm512_cvtsepi32_epi8(i0);
              __m128i s1 = _mm512_cvtsepi32_epi8(i1);
              __m128i s2 = _mm512_cvtsepi32_epi8(i2);
              __m128i s3 = _mm512_cvtsepi32_epi8(i3);
              s0 = _mm_srli_epi16(round_4bit_s8(s0), 4);
              s1 = _mm_srli_epi16(round_4bit_s8(s1), 4);
              s2 = _mm_srli_epi16(round_4bit_s8(s2), 4);
              s3 = _mm_srli_epi16(round_4bit_s8(s3), 4);
              _mm_store_si128((__m128i*)dst, s0);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 16)), s1);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 32)), s2);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 48)), s3);
            }

            {
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 2, &f0, &f1);
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 3, &f2, &f3);
              __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id1));
              __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id1));
              __m512i i2 = _mm512_cvtps_epi32(_mm512_mul_ps(f2, id1));
              __m512i i3 = _mm512_cvtps_epi32(_mm512_mul_ps(f3, id1));
              __m128i s0 = _mm512_cvtsepi32_epi8(i0);
              __m128i s1 = _mm512_cvtsepi32_epi8(i1);
              __m128i s2 = _mm512_cvtsepi32_epi8(i2);
              __m128i s3 = _mm512_cvtsepi32_epi8(i3);
              s0 = round_4bit_s8(s0);
              s1 = round_4bit_s8(s1);
              s2 = round_4bit_s8(s2);
              s3 = round_4bit_s8(s3);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 0)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 0))), s0));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 16)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 16))), s1));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 32)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 32))), s2));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 48)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 48))), s3));
            }
          }
          transpose_16x16_32bit((__m512i*)(offset_pointer(
              b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP) / 2)));
          transpose_16x16_32bit(
              (__m512i*)(offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                            k_begin * N_STEP + TILE_N * B_K_STEP) /
                                               2)));
        }
      }
    }
  }

  dt* get_submat(int n, int k, int n_begin, int k_begin) {
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    n_begin -= n_block_begin;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
    k_begin -= k_block_begin;
    int k_block_size = std::min(K_BLOCK, k - k_block_begin);
    return offset_pointer(
        b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP) / 2);
  }

  float* get_scale(int n, int n_begin, int k, int k_begin) {
    int k_group_idx = k_begin / k_group_size;
    return d + k_group_idx * n + n_begin;
    // return d + n_begin * k_group_count + k_group_idx;
  }
};

template <typename K>
struct BufferBInt4WithZeroImpl {
  using dt = typename K::dt;
  dt* b;
  float *d, *mins;  // scale, mins
  int n, k;

  static constexpr int N_STEP = K::N_STEP;
  static constexpr int N_BLOCK = K::N_BLOCK;
  static constexpr int TILE_N = K::TILE_N;

  static constexpr int K_STEP = K::K_STEP;
  static constexpr int K_BLOCK = K::K_BLOCK;
  static const int B_K_STEP = 2 * K_STEP;
  static constexpr bool SCALE = true;

  static size_t required_size(int n, int k) { return sizeof(int8_t) * n * k / 2 + sizeof(float) * n * 2; }

  BufferBInt4WithZeroImpl(int n, int k, void* ptr) : n(n), k(k) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    assert(n % N_STEP == 0);
    assert(k % B_K_STEP == 0);
    if (n % N_STEP || k % B_K_STEP) {
      printf("n: %d, k: %d, N_STEP: %d, B_K_STEP: %d\n", n, k, N_STEP, B_K_STEP);
      throw std::runtime_error("n or k is not aligned to N_STEP or B_K_STEP");
    }
    b = reinterpret_cast<dt*>(ptr);
    d = reinterpret_cast<float*>(offset_pointer(b, n * k / 2));
    mins = d + n;
  }

  // 对 uint8_t 批量四舍五入到最接近的 16 倍数
  static __m128i round_4bit_u8(__m128i x) {
    // 加 8 做四舍五入，使用 Saturate
    x = _mm_adds_epi8(x, _mm_set1_epi8(0x08));
    // 清除低 4 位（即对 16 对齐）
    x = _mm_and_si128(x, _mm_set1_epi8(0xF0));
    return x;
  }

  void from_mat(ggml_bf16_t* src, int ith, int nth) {
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;
    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      for (int i = 0; i < N_STEP; i++) {
        float amax = std::numeric_limits<float>::lowest();
        float amin = std::numeric_limits<float>::max();
        for (int j = 0; j < k; j += 32) {
          __m512 f0, f1;
          avx512_32xbf16_to_32xfp32((__m512i*)(src + (n_block_begin + n_begin + i) * k + j), &f0, &f1);
          amax = MAX(amax, _mm512_reduce_max_ps(f0));
          amax = MAX(amax, _mm512_reduce_max_ps(f1));
          amin = MIN(amin, _mm512_reduce_min_ps(f0));
          amin = MIN(amin, _mm512_reduce_min_ps(f1));
        }
        d[n_block_begin + n_begin + i] = (amax - amin) / 240.0;  // 15*16
        mins[n_block_begin + n_begin + i] = amin;
      }
    }
    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += B_K_STEP) {
          for (int i = 0; i < N_STEP; i++) {
            __m512 id = _mm512_set1_ps(d[n_block_begin + n_begin + i] ? 1.0f / d[n_block_begin + n_begin + i] : 0.0f);
            __m512 zps = _mm512_set1_ps(-mins[n_block_begin + n_begin + i]);
            dt* dst = offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                         k_begin * N_STEP + i * B_K_STEP) /
                                            2);
            {
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32((__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin),
                                        &f0, &f1);
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
              __m512i i0 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f0, zps), id));
              __m512i i1 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f1, zps), id));
              __m512i i2 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f2, zps), id));
              __m512i i3 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f3, zps), id));
              __m128i s0 = _mm512_cvtusepi32_epi8(i0);
              __m128i s1 = _mm512_cvtusepi32_epi8(i1);
              __m128i s2 = _mm512_cvtusepi32_epi8(i2);
              __m128i s3 = _mm512_cvtusepi32_epi8(i3);
              s0 = _mm_srli_epi16(round_4bit_u8(s0), 4);
              s1 = _mm_srli_epi16(round_4bit_u8(s1), 4);
              s2 = _mm_srli_epi16(round_4bit_u8(s2), 4);
              s3 = _mm_srli_epi16(round_4bit_u8(s3), 4);
              // s0 = _mm_or_si128(round_up4(s0), _mm_srli_epi16(round_up4(s1), 4));
              // s2 = _mm_or_si128(round_up4(s2), _mm_srli_epi16(round_up4(s3), 4));
              _mm_store_si128((__m128i*)dst, s0);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 16)), s1);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 32)), s2);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 48)), s3);
            }

            {
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 2, &f0, &f1);
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 3, &f2, &f3);
              __m512i i0 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f0, zps), id));
              __m512i i1 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f1, zps), id));
              __m512i i2 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f2, zps), id));
              __m512i i3 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f3, zps), id));
              __m128i s0 = _mm512_cvtusepi32_epi8(i0);
              __m128i s1 = _mm512_cvtusepi32_epi8(i1);
              __m128i s2 = _mm512_cvtusepi32_epi8(i2);
              __m128i s3 = _mm512_cvtusepi32_epi8(i3);
              s0 = round_4bit_u8(s0);
              s1 = round_4bit_u8(s1);
              s2 = round_4bit_u8(s2);
              s3 = round_4bit_u8(s3);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 0)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 0))), s0));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 16)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 16))), s1));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 32)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 32))), s2));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 48)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 48))), s3));
            }
          }
          transpose_16x16_32bit((__m512i*)(offset_pointer(
              b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP) / 2)));
          transpose_16x16_32bit(
              (__m512i*)(offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                            k_begin * N_STEP + TILE_N * B_K_STEP) /
                                               2)));
        }
      }
    }
  }

  dt* get_submat(int n, int k, int n_begin, int k_begin) {
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    n_begin -= n_block_begin;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
    k_begin -= k_block_begin;
    int k_block_size = std::min(K_BLOCK, k - k_block_begin);
    return offset_pointer(
        b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP) / 2);
  }

  float* get_scale(int n, int n_begin) { return d + n_begin; }
  float* get_min(int n, int n_begin) { return mins + n_begin; }
};

// BufferB for Signed Int4 with KGroup Scale (no zero point)
// Used for K2 MoE - signed int4 range: [-8, 7]
template <typename K>
struct BufferBInt4KGroupImpl {
  using dt = typename K::dt;
  dt* b;     // packed signed int4 weights, col majored
  float* d;  // scales only (no mins/zero-points), row majored
  int n, k, k_group_size, k_group_count;

  static constexpr int N_STEP = K::N_STEP;
  static constexpr int K_STEP = K::K_STEP;
  static constexpr bool SCALE = true;

  // Size calculation: packed int4 weights + scales (NO mins)
  static size_t required_size(int n, int k, int k_group_size) {
    return sizeof(int8_t) * n * k / 2 + sizeof(float) * n * (k / k_group_size);
  }

  BufferBInt4KGroupImpl(int n, int k, int k_group_size, void* ptr) : n(n), k(k), k_group_size(k_group_size) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    assert(n % N_STEP == 0);
    assert(k % K_STEP == 0);
    if (n % N_STEP || k % K_STEP || k % k_group_size) {
      printf("BufferBInt4KGroupImpl: n: %d, k: %d, N_STEP: %d, K_STEP: %d, k_group_size: %d\n", n, k, N_STEP, K_STEP,
             k_group_size);
      throw std::runtime_error("n or k is not aligned to N_STEP or K_STEP");
    }
    k_group_count = k / k_group_size;
    b = reinterpret_cast<dt*>(ptr);
    d = reinterpret_cast<float*>(offset_pointer(b, n * k / 2));
  }

  // Load from packed signed int4 format
  // Input: proj is packed int4 weights (2 int4 values per byte)
  // Each int4 value is in range [-8, 7] (signed)
  void from_raw_mat(uint8_t* proj, int ith, int nth) {
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    if (n_start >= n_end) {
      return;
    }
    const size_t row_bytes = static_cast<size_t>(k) / 2;
    const size_t rows = static_cast<size_t>(n_end - n_start);
    uint8_t* dst_weights = reinterpret_cast<uint8_t*>(b) + n_start * row_bytes;
    const uint8_t* src_weights = proj + n_start * row_bytes;
    std::memcpy(dst_weights, src_weights, rows * row_bytes);
  }

  // Get pointer to submatrix for computation
  dt* get_submat(int n, int k, int n_begin, int k_begin) {
    const size_t row_bytes = static_cast<size_t>(k) / 2;
    const size_t row_offset = static_cast<size_t>(n_begin) * row_bytes;
    const size_t col_offset = static_cast<size_t>(k_begin) / 2;
    return reinterpret_cast<dt*>(reinterpret_cast<uint8_t*>(b) + row_offset + col_offset);
  }

  // Get scale pointer for a specific row and k_group
  float* get_scale(int n, int n_begin, int k, int k_begin) {
    int k_group_idx = k_begin / k_group_size;
    return d + n_begin * (k / k_group_size) + k_group_idx;
  }

  // Split range for parallel processing
  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_per_thread = (n + nth - 1) / nth;
    n_per_thread = (n_per_thread + N_STEP - 1) / N_STEP * N_STEP;
    int n_start = std::min(ith * n_per_thread, n);
    int n_end = std::min(n_start + n_per_thread, n);
    return {n_start, n_end};
  }
};

template <typename K>
struct BufferBInt4WithZeroKGroupImpl {
  using dt = typename K::dt;
  dt* b;
  float *d, *mins;  // scale, mins
  int n, k, k_group_size, k_group_count;

  static constexpr int N_STEP = K::N_STEP;
  static constexpr int N_BLOCK = K::N_BLOCK;
  static constexpr int TILE_N = K::TILE_N;

  static constexpr int K_STEP = K::K_STEP;
  static constexpr int K_BLOCK = K::K_BLOCK;
  static const int B_K_STEP = 2 * K_STEP;
  static constexpr bool SCALE = true;

  static size_t required_size(int n, int k, int k_group_size) {
    return sizeof(int8_t) * n * k / 2 + sizeof(float) * n * (k / k_group_size) * 2;
  }

  BufferBInt4WithZeroKGroupImpl(int n, int k, int k_group_size, void* ptr) : n(n), k(k), k_group_size(k_group_size) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    assert(n % N_STEP == 0);
    assert(k % B_K_STEP == 0);
    if (n % N_STEP || k % B_K_STEP || k % k_group_size) {
      printf("n: %d, k: %d, N_STEP: %d, B_K_STEP: %d, k_group_size: %d\n", n, k, N_STEP, B_K_STEP, k_group_size);
      throw std::runtime_error("n or k is not aligned to N_STEP or B_K_STEP");
    }
    k_group_count = k / k_group_size;
    b = reinterpret_cast<dt*>(ptr);
    d = reinterpret_cast<float*>(offset_pointer(b, n * k / 2));
    mins = d + n * k_group_count;
  }

  // 对 uint8_t 批量四舍五入到最接近的 16 倍数
  static __m128i round_4bit_u8(__m128i x) {
    // 加 8 做四舍五入，使用 Saturate
    x = _mm_adds_epi8(x, _mm_set1_epi8(0x08));
    // 清除低 4 位（即对 16 对齐）
    x = _mm_and_si128(x, _mm_set1_epi8(0xF0));
    return x;
  }

  void from_raw_mat(uint8_t* proj, int ith, int nth) {
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;

    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += B_K_STEP) {
          for (int i = 0; i < N_STEP; i++) {
            uint8_t* dst = (uint8_t*)offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size +
                                                        n_begin * k_block_size + k_begin * N_STEP + i * B_K_STEP) >>
                                                           1);
            uint32_t* src =
                (uint32_t*)offset_pointer(proj, ((n_block_begin + n_begin + i) * k + k_block_begin + k_begin) >> 1);
            for (int a0 = 0; a0 < 8; a0++) {
              uint32_t src0 = src[a0], src1 = src[a0 + 8];
              for (int a1 = 0; a1 < 8; a1++) {
                uint8_t cur_src0 = src0 & 0x0F, cur_src1 = src1 & 0x0F;
                dst[(a0 * 8) + a1] = (cur_src0 | (cur_src1 << 4));
                src0 = src0 >> 4;
                src1 = src1 >> 4;
              }
            }
          }
          transpose_16x16_32bit((__m512i*)(offset_pointer(
              b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP) / 2)));
          transpose_16x16_32bit(
              (__m512i*)(offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                            k_begin * N_STEP + TILE_N * B_K_STEP) /
                                               2)));
        }
      }
    }
  }

  void from_mat(ggml_bf16_t* src, int ith, int nth) {
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;
    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      for (int i = 0; i < N_STEP; i++) {
        for (int kg = 0; kg < k_group_count; kg++) {
          int k_start = kg * k_group_size;
          int k_end = k_start + k_group_size;

          float amax = std::numeric_limits<float>::lowest();
          float amin = std::numeric_limits<float>::max();
          for (int j = k_start; j < k_end; j += 32) {
            __m512 f0, f1;
            avx512_32xbf16_to_32xfp32((__m512i*)(src + (n_block_begin + n_begin + i) * k + j), &f0, &f1);
            amax = MAX(amax, _mm512_reduce_max_ps(f0));
            amax = MAX(amax, _mm512_reduce_max_ps(f1));
            amin = MIN(amin, _mm512_reduce_min_ps(f0));
            amin = MIN(amin, _mm512_reduce_min_ps(f1));
          }
          d[kg * n + n_block_begin + n_begin + i] = (amax - amin) / 240.0;  // 15*16
          // d[n_block_begin + n_begin + i] = (amax - amin) / 240.0; // 15*16
          mins[kg * n + n_block_begin + n_begin + i] = amin;
        }
      }
    }
    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += B_K_STEP) {
          for (int i = 0; i < N_STEP; i++) {
            int k_group_idx0 = (k_block_begin + k_begin) / k_group_size;
            int k_group_idx1 = (k_block_begin + k_begin + K_STEP) / k_group_size;
            float scale0 = d[k_group_idx0 * n + n_block_begin + n_begin + i];
            float scale1 = d[k_group_idx1 * n + n_block_begin + n_begin + i];
            __m512 id0 = _mm512_set1_ps(scale0 ? 1.0f / scale0 : 0.0f);
            __m512 id1 = _mm512_set1_ps(scale1 ? 1.0f / scale1 : 0.0f);
            __m512 zps0 = _mm512_set1_ps(-mins[k_group_idx0 * n + n_block_begin + n_begin + i]);
            __m512 zps1 = _mm512_set1_ps(-mins[k_group_idx1 * n + n_block_begin + n_begin + i]);
            // __m512 zps = _mm512_set1_ps(-mins[n_block_begin + n_begin + i]);
            dt* dst = offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                         k_begin * N_STEP + i * B_K_STEP) /
                                            2);
            {
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32((__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin),
                                        &f0, &f1);
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
              __m512i i0 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f0, zps0), id0));
              __m512i i1 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f1, zps0), id0));
              __m512i i2 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f2, zps0), id0));
              __m512i i3 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f3, zps0), id0));
              __m128i s0 = _mm512_cvtusepi32_epi8(i0);
              __m128i s1 = _mm512_cvtusepi32_epi8(i1);
              __m128i s2 = _mm512_cvtusepi32_epi8(i2);
              __m128i s3 = _mm512_cvtusepi32_epi8(i3);
              s0 = _mm_srli_epi16(round_4bit_u8(s0), 4);
              s1 = _mm_srli_epi16(round_4bit_u8(s1), 4);
              s2 = _mm_srli_epi16(round_4bit_u8(s2), 4);
              s3 = _mm_srli_epi16(round_4bit_u8(s3), 4);
              // s0 = _mm_or_si128(round_up4(s0), _mm_srli_epi16(round_up4(s1), 4));
              // s2 = _mm_or_si128(round_up4(s2), _mm_srli_epi16(round_up4(s3), 4));
              _mm_store_si128((__m128i*)dst, s0);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 16)), s1);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 32)), s2);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 48)), s3);
            }

            {
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 2, &f0, &f1);
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 3, &f2, &f3);
              __m512i i0 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f0, zps1), id1));
              __m512i i1 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f1, zps1), id1));
              __m512i i2 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f2, zps1), id1));
              __m512i i3 = _mm512_cvtps_epu32(_mm512_mul_ps(_mm512_add_ps(f3, zps1), id1));
              __m128i s0 = _mm512_cvtusepi32_epi8(i0);
              __m128i s1 = _mm512_cvtusepi32_epi8(i1);
              __m128i s2 = _mm512_cvtusepi32_epi8(i2);
              __m128i s3 = _mm512_cvtusepi32_epi8(i3);
              s0 = round_4bit_u8(s0);
              s1 = round_4bit_u8(s1);
              s2 = round_4bit_u8(s2);
              s3 = round_4bit_u8(s3);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 0)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 0))), s0));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 16)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 16))), s1));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 32)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 32))), s2));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 48)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 48))), s3));
            }
          }
          transpose_16x16_32bit((__m512i*)(offset_pointer(
              b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP) / 2)));
          transpose_16x16_32bit(
              (__m512i*)(offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                            k_begin * N_STEP + TILE_N * B_K_STEP) /
                                               2)));
        }
      }
    }
  }

  dt* get_submat(int n, int k, int n_begin, int k_begin) {
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    n_begin -= n_block_begin;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
    k_begin -= k_block_begin;
    int k_block_size = std::min(K_BLOCK, k - k_block_begin);
    return offset_pointer(
        b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP) / 2);
  }

  float* get_scale(int n, int n_begin, int k, int k_begin) {
    int k_group_idx = k_begin / k_group_size;
    return d + k_group_idx * n + n_begin;
  }
  float* get_min(int n, int n_begin, int k, int k_begin) {
    int k_group_idx = k_begin / k_group_size;
    return mins + k_group_idx * n + n_begin;
  }
};

template <typename K>
struct BufferBInt4WithZeroLowKGroupImpl {
  using dt = typename K::dt;
  dt* b;
  float *d, *mins;  // scale, mins
  int n, k, k_group_size, k_group_count;

  static constexpr int N_STEP = K::N_STEP;
  static constexpr int N_BLOCK = K::N_BLOCK;
  static constexpr int TILE_N = K::TILE_N;

  static constexpr int K_STEP = K::K_STEP;
  static constexpr int K_BLOCK = K::K_BLOCK;
  static const int B_K_STEP = 2 * K_STEP;
  static constexpr bool SCALE = true;

  static size_t required_size(int n, int k, int k_group_size) {
    return sizeof(int8_t) * n * k / 2 + sizeof(float) * n * (k / k_group_size) * 2;
  }

  BufferBInt4WithZeroLowKGroupImpl(int n, int k, int k_group_size, void* ptr) : n(n), k(k), k_group_size(k_group_size) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    assert(n % N_STEP == 0);
    assert(k % B_K_STEP == 0);
    if (n % N_STEP || k % B_K_STEP || k % k_group_size) {
      printf("n: %d, k: %d, N_STEP: %d, B_K_STEP: %d, k_group_size: %d\n", n, k, N_STEP, B_K_STEP, k_group_size);
      throw std::runtime_error("n or k is not aligned to N_STEP or B_K_STEP");
    }
    k_group_count = k / k_group_size;
    b = reinterpret_cast<dt*>(ptr);
    d = reinterpret_cast<float*>(offset_pointer(b, n * k / 2));
    mins = d + n * k_group_count;
  }

  // 对 uint8_t 批量四舍五入到最接近的 16 倍数
  static __m128i round_4bit_u8(__m128i x) {
    // 加 8 做四舍五入，使用 Saturate
    x = _mm_adds_epi8(x, _mm_set1_epi8(0x08));
    // 清除低 4 位（即对 16 对齐）
    x = _mm_and_si128(x, _mm_set1_epi8(0xF0));
    return x;
  }

  void from_raw_mat(uint8_t* proj, int ith, int nth) {
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;

    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += B_K_STEP) {
          for (int i = 0; i < N_STEP; i++) {
            uint8_t* dst = (uint8_t*)offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size +
                                                        n_begin * k_block_size + k_begin * N_STEP + i * B_K_STEP) >>
                                                           1);
            uint32_t* src =
                (uint32_t*)offset_pointer(proj, ((n_block_begin + n_begin + i) * k + k_block_begin + k_begin) >> 1);
            for (int a0 = 0; a0 < 8; a0++) {
              uint32_t src0 = src[a0], src1 = src[a0 + 8];
              for (int a1 = 0; a1 < 8; a1++) {
                uint8_t cur_src0 = src0 & 0x0F, cur_src1 = src1 & 0x0F;
                dst[(a0 * 8) + a1] = (cur_src0 | (cur_src1 << 4));
                src0 = src0 >> 4;
                src1 = src1 >> 4;
              }
            }
          }
          transpose_16x16_32bit((__m512i*)(offset_pointer(
              b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP) / 2)));
          transpose_16x16_32bit(
              (__m512i*)(offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                            k_begin * N_STEP + TILE_N * B_K_STEP) /
                                               2)));
        }
      }
    }
  }

  void from_mat(ggml_bf16_t* src, int ith, int nth) {
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;
    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      for (int i = 0; i < N_STEP; i++) {
        for (int kg = 0; kg < k_group_count; kg++) {
          int k_start = kg * k_group_size;
          int k_end = k_start + k_group_size;

          float amax = std::numeric_limits<float>::lowest();
          float amin = std::numeric_limits<float>::max();
          for (int j = k_start; j < k_end; j += 32) {
            __m512 f0, f1;
            avx512_32xbf16_to_32xfp32((__m512i*)(src + (n_block_begin + n_begin + i) * k + j), &f0, &f1);
            amax = MAX(amax, _mm512_reduce_max_ps(f0));
            amax = MAX(amax, _mm512_reduce_max_ps(f1));
            amin = MIN(amin, _mm512_reduce_min_ps(f0));
            amin = MIN(amin, _mm512_reduce_min_ps(f1));
          }
          d[kg * n + n_block_begin + n_begin + i] = (amax - amin) / 15.0;  // 15*16
          // d[n_block_begin + n_begin + i] = (amax - amin) / 240.0; // 15*16
          mins[kg * n + n_block_begin + n_begin + i] = amin;
        }
      }
    }

    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += B_K_STEP) {
          for (int i = 0; i < N_STEP; i++) {
            int k_group_idx0 = (k_block_begin + k_begin) / k_group_size;
            int k_group_idx1 = (k_block_begin + k_begin + K_STEP) / k_group_size;
            float scale0 = d[k_group_idx0 * n + n_block_begin + n_begin + i];
            float scale1 = d[k_group_idx1 * n + n_block_begin + n_begin + i];
            __m512 id0 = _mm512_set1_ps(scale0 ? 1.0f / scale0 : 0.0f);
            __m512 id1 = _mm512_set1_ps(scale1 ? 1.0f / scale1 : 0.0f);
            __m512 zps0 = _mm512_set1_ps(-mins[k_group_idx0 * n + n_block_begin + n_begin + i]);
            __m512 zps1 = _mm512_set1_ps(-mins[k_group_idx1 * n + n_block_begin + n_begin + i]);
            // __m512 zps = _mm512_set1_ps(-mins[n_block_begin + n_begin + i]);
            dt* dst = offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                         k_begin * N_STEP + i * B_K_STEP) /
                                            2);
            {
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32((__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin),
                                        &f0, &f1);
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
              __m512i i0 = _mm512_cvt_roundps_epu32(_mm512_mul_ps(_mm512_add_ps(f0, zps0), id0),
                                                    _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
              __m512i i1 = _mm512_cvt_roundps_epu32(_mm512_mul_ps(_mm512_add_ps(f1, zps0), id0),
                                                    _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
              __m512i i2 = _mm512_cvt_roundps_epu32(_mm512_mul_ps(_mm512_add_ps(f2, zps0), id0),
                                                    _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
              __m512i i3 = _mm512_cvt_roundps_epu32(_mm512_mul_ps(_mm512_add_ps(f3, zps0), id0),
                                                    _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
              __m128i s0 = _mm512_cvtusepi32_epi8(i0);
              __m128i s1 = _mm512_cvtusepi32_epi8(i1);
              __m128i s2 = _mm512_cvtusepi32_epi8(i2);
              __m128i s3 = _mm512_cvtusepi32_epi8(i3);
              // s0 = _mm_srli_epi16(s0, 4);
              // s1 = _mm_srli_epi16(s1, 4);
              // s2 = _mm_srli_epi16(s2, 4);
              // s3 = _mm_srli_epi16(s3, 4);
              // s0 = _mm_or_si128(round_up4(s0), _mm_srli_epi16(round_up4(s1), 4));
              // s2 = _mm_or_si128(round_up4(s2), _mm_srli_epi16(round_up4(s3), 4));
              _mm_store_si128((__m128i*)dst, s0);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 16)), s1);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 32)), s2);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 48)), s3);
            }

            {
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 2, &f0, &f1);
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 3, &f2, &f3);
              __m512i i0 = _mm512_cvt_roundps_epu32(_mm512_mul_ps(_mm512_add_ps(f0, zps1), id1),
                                                    _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
              __m512i i1 = _mm512_cvt_roundps_epu32(_mm512_mul_ps(_mm512_add_ps(f1, zps1), id1),
                                                    _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
              __m512i i2 = _mm512_cvt_roundps_epu32(_mm512_mul_ps(_mm512_add_ps(f2, zps1), id1),
                                                    _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
              __m512i i3 = _mm512_cvt_roundps_epu32(_mm512_mul_ps(_mm512_add_ps(f3, zps1), id1),
                                                    _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
              __m128i s0 = _mm512_cvtusepi32_epi8(i0);
              __m128i s1 = _mm512_cvtusepi32_epi8(i1);
              __m128i s2 = _mm512_cvtusepi32_epi8(i2);
              __m128i s3 = _mm512_cvtusepi32_epi8(i3);
              s0 = _mm_slli_epi16(s0, 4);
              s1 = _mm_slli_epi16(s1, 4);
              s2 = _mm_slli_epi16(s2, 4);
              s3 = _mm_slli_epi16(s3, 4);
              _mm_store_si128((__m128i*)(offset_pointer(dst, 0)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 0))), s0));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 16)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 16))), s1));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 32)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 32))), s2));
              _mm_store_si128((__m128i*)(offset_pointer(dst, 48)),
                              _mm_or_si128(_mm_loadu_si128((__m128i*)(offset_pointer(dst, 48))), s3));
            }
          }
          transpose_16x16_32bit((__m512i*)(offset_pointer(
              b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP) / 2)));
          transpose_16x16_32bit(
              (__m512i*)(offset_pointer(b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                            k_begin * N_STEP + TILE_N * B_K_STEP) /
                                               2)));
        }
      }
    }
  }

  dt* get_submat(int n, int k, int n_begin, int k_begin) {
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    n_begin -= n_block_begin;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
    k_begin -= k_block_begin;
    int k_block_size = std::min(K_BLOCK, k - k_block_begin);
    return offset_pointer(
        b, (n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP) / 2);
  }

  float* get_scale(int n, int n_begin, int k, int k_begin) {
    int k_group_idx = k_begin / k_group_size;
    return d + k_group_idx * n + n_begin;
  }
  float* get_min(int n, int n_begin, int k, int k_begin) {
    int k_group_idx = k_begin / k_group_size;
    return mins + k_group_idx * n + n_begin;
  }
};

template <typename K>
struct BufferCImpl {
  float* c;
  int max_m, n;

  static constexpr int M_STEP = K::M_STEP;
  static constexpr int N_STEP = K::N_STEP;
  static constexpr int N_BLOCK = K::N_BLOCK;

  static size_t required_size(int max_m, int n) { return sizeof(float) * max_m * n; }

  BufferCImpl(int max_m, int n, void* ptr) : max_m(max_m), n(n) {
    assert(max_m % M_STEP == 0);
    assert(n % N_STEP == 0);
    if (max_m % M_STEP || n % N_STEP) {
      printf("max_m = %d, n = %d, M_STEP = %d, N_STEP = %d\n", max_m, n, M_STEP, N_STEP);
      throw std::runtime_error("BufferCImpl: max_m and n must be multiple of M_STEP and N_STEP");
    }
    set_data(ptr);
  }

  void set_data(void* ptr) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    c = reinterpret_cast<float*>(ptr);
  }

  void to_mat(int m, ggml_bf16_t* dst, int ith, int nth) {
    assert(m <= max_m);
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
          __m512* x0 =
              (__m512*)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP + i * N_STEP);
          __m512* x1 =
              (__m512*)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP + i * N_STEP + 16);
          avx512_32xfp32_to_32xbf16(x0, x1, (__m512i*)(dst + (m_begin + i) * n + n_block_begin + n_begin));
        }
      }
    }
  }

  float* get_submat(int m, int n, int m_begin, int n_begin) {
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    n_begin -= n_block_begin;
    return c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP;
  }
};

template <typename K>
struct BufferCReduceImpl {
  float* c;
  int32_t* int_c;  // Additional int32_t buffer, same size as c
  int max_m, n;

  static constexpr int M_STEP = K::M_STEP;
  static constexpr int N_STEP = K::N_STEP;
  static constexpr int N_BLOCK = K::N_BLOCK;

  static size_t required_size(int max_m, int n) {
    // Need space for both float* c and int32_t* int_c
    return sizeof(float) * max_m * n + sizeof(int32_t) * max_m * n;
  }

  BufferCReduceImpl(int max_m, int n, void* ptr) : max_m(max_m), n(n) {
    assert(max_m % M_STEP == 0);
    assert(n % N_STEP == 0);
    if (max_m % M_STEP || n % N_STEP) {
      printf("max_m = %d, n = %d, M_STEP = %d, N_STEP = %d\n", max_m, n, M_STEP, N_STEP);
      throw std::runtime_error("BufferCReduceImpl: max_m and n must be multiple of M_STEP and N_STEP");
    }
    set_data(ptr);
  }

  void set_data(void* ptr) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    c = reinterpret_cast<float*>(ptr);
    // int_c starts after the float buffer
    int_c = reinterpret_cast<int32_t*>(c + max_m * n);
  }

  void to_mat(int m, ggml_bf16_t* dst, int ith, int nth) {
    assert(m <= max_m);
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
          __m512* x0 =
              (__m512*)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP + i * N_STEP);
          __m512* x1 =
              (__m512*)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP + i * N_STEP + 16);
          avx512_32xfp32_to_32xbf16(x0, x1, (__m512i*)(dst + (m_begin + i) * n + n_block_begin + n_begin));
        }
      }
    }
  }

  float* get_submat(int m, int n, int m_begin, int n_begin) {
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    n_begin -= n_block_begin;
    return c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP;
  }

  int32_t* get_int_submat(int m, int n, int m_begin, int n_begin) {
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    n_begin -= n_block_begin;
    return int_c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP;
  }

  // Clear the int_c buffer
  void clear_int_buffer() { std::memset(int_c, 0, sizeof(int32_t) * max_m * n); }

  // Convert int32_t results to float
  void convert_int_to_float(int m) {
    assert(m <= max_m);
    for (int i = 0; i < m * n; i++) {
      c[i] = static_cast<float>(int_c[i]);
    }
  }
};

}  // namespace amx

#endif  // AMX_BUFFERS_HPP

================================================
FILE: kt-kernel/operators/amx/la/amx_config.hpp
================================================
#ifndef AMX_CONFIG_HPP
#define AMX_CONFIG_HPP
#if (defined(_WIN32) || defined(_WIN64))
#define RESTRICT __restrict
#else
#define RESTRICT __restrict__
#endif

#if (defined(_WIN32) || defined(_WIN64))
#define ALWAYS_INLINE __forceinline
#elif __has_attribute(always_inline) || defined(__GNUC__)
#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
#else
#define ALWAYS_INLINE inline
#endif
#include <immintrin.h>
#if defined(__AMX__) || defined(__AMXINT8__) || defined(__AMXBF16__) || defined(__AMX_TILE__) || defined(HAVE_AMX)
#ifndef HAVE_AMX
#define HAVE_AMX
#endif
#include <emmintrin.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <tmmintrin.h>
#include <unistd.h>

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <stdexcept>

namespace amx {

#define ARCH_GET_XCOMP_SUPP 0x1021
#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18
#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG)
#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA)
#define XFEATURE_MASK_XTILE ((1 << XFEATURE_XTILECFG) | (1 << XFEATURE_XTILEDATA))

const int TMMCount = 8;
const int MaxTileHeight = 16;
const int MaxTileWidth = 64;

const int AMX_BLK_SIZE = 32;

#define TMM0 0
#define TMM1 1
#define TMM2 2
#define TMM3 3
#define TMM4 4
#define TMM5 5
#define TMM6 6
#define TMM7 7

inline bool enable_amx() {
  // CHECK: whether this can be removed?
  // static thread_local bool initialized = false;
  // if (initialized) {
  //   return true;
  // }
  // initialized = true;

  // if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
  //   printf("\n Fail to do XFEATURE_XTILEDATA \n\n");
  //   return false;
  // } else {
  //   // printf("\n TILE DATA USE SET - OK \n\n");
  //   return true;
  // }
  // return true;
  unsigned long features;
  long rc;
  rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &features);

  if (!rc && (features & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE) {
    unsigned long bitmask = 0;
    long status = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask);
    if (0 != status) return false;
    if (bitmask & XFEATURE_MASK_XTILEDATA) return true;

    status = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
    if (0 != status) return false;  // XFEATURE_XTILEDATA setup is failed, TMUL usage is not allowed
    status = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask);

    // XFEATURE_XTILEDATA setup is failed, can't use TMUL
    if (0 != status || !(bitmask & XFEATURE_MASK_XTILEDATA)) return false;

    // XFEATURE_XTILEDATA set successfully, TMUL usage is allowed
    // printf("\n TILE DATA USE SET - OK \n\n");
    return true;
  }
  return false;
}

struct alignas(64) TileConfig {
  uint8_t palette;
  uint8_t start_row;
  std::array<uint8_t, 14> __0 = {};
  std::array<uint16_t, 8> colsb;
  std::array<uint8_t, 16> __1 = {};
  std::array<uint8_t, 8> rows;
  std::array<uint8_t, 8> __2 = {};

  TileConfig() {
    palette = 1;
    start_row = 0;
    for (int i = 0; i < 8; i++) {
      set_row_col(i, 0, 0);
    }
  }

  void set_row_col(int i, uint8_t row, uint16_t col) {
    colsb[i] = col;
    rows[i] = row;
  }

  void set_config() { _tile_loadconfig(this); }

  static void load_data(int to, void* from, size_t stride) {
    switch (to) {
      case 0:
        _tile_loadd(0, from, stride);
        break;
      case 1:
        _tile_loadd(1, from, stride);
        break;
      case 2:
        _tile_loadd(2, from, stride);
        break;
      case 3:
        _tile_loadd(3, from, stride);
        break;
      case 4:
        _tile_loadd(4, from, stride);
        break;
      case 5:
        _tile_loadd(5, from, stride);
        break;
      case 6:
        _tile_loadd(6, from, stride);
        break;
      case 7:
        _tile_loadd(7, from, stride);
        break;
      default:
        throw std::runtime_error("no such tile");
    }
  }

  static void store_data(int from, void* to, size_t stride) {
    switch (from) {
      case 0:
        _tile_stored(0, to, stride);
        break;
      case 1:
        _tile_stored(1, to, stride);
        break;
      case 2:
        _tile_stored(2, to, stride);
        break;
      case 3:
        _tile_stored(3, to, stride);
        break;
      case 4:
        _tile_stored(4, to, stride);
        break;
      case 5:
        _tile_stored(5, to, stride);
        break;
      case 6:
        _tile_stored(6, to, stride);
        break;
      case 7:
        _tile_stored(7, to, stride);
        break;
      default:
        throw std::runtime_error("no such tile");
    }
  }
};

static_assert(sizeof(TileConfig) == 64);

}  // namespace amx
#endif  // defined(__AMX__)
#endif  // AMX_CONFIG_HPP

================================================
FILE: kt-kernel/operators/amx/la/amx_kernels.hpp
================================================
#ifndef AMX_KERNELS_HPP
#define AMX_KERNELS_HPP
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <memory>

#include "amx_buffers.hpp"
#include "amx_config.hpp"
#include "amx_quantization.hpp"
#include "amx_utils.hpp"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llamafile/sgemm.h"
#include "utils.hpp"

namespace amx {

// Compile-time detection: true when AMX intrinsics are available
#if defined(__AMX__) || defined(__AMXINT8__) || defined(__AMXBF16__) || defined(__AMX_TILE__) || defined(HAVE_AMX)
inline constexpr bool AMX_AVAILABLE = true;
#ifndef HAVE_AMX
#define HAVE_AMX
#endif
#else
inline constexpr bool AMX_AVAILABLE = false;
#endif

/*
We use 1-3-3
 C = A x B


A is a row major matrix of size M x K, usually an Linear Layer weight matrix
B is a col major vector of size K x N, usually an input vector, N is usually
quite small

   B
 A C
 A C
 A C

  TMM 0-2: A
  TMM 3: B
  TMM 4-6: C

   3
 0 4
 1 5
 2 6
*/

template <class, class>
struct dpb133 {
  static void run();
};

template <>
inline void dpb133<int8_t, int8_t>::run() {
  _tile_dpbssd(4, 0, 3);
  _tile_dpbssd(5, 1, 3);
  _tile_dpbssd(6, 2, 3);
}

template <>
inline void dpb133<int8_t, uint8_t>::run() {
  _tile_dpbsud(4, 0, 3);
  _tile_dpbsud(5, 1, 3);
  _tile_dpbsud(6, 2, 3);
}

template <>
inline void dpb133<uint8_t, int8_t>::run() {
  _tile_dpbusd(4, 0, 3);
  _tile_dpbusd(5, 1, 3);
  _tile_dpbusd(6, 2, 3);
}

template <>
inline void dpb133<uint8_t, uint8_t>::run() {
  _tile_dpbuud(4, 0, 3);
  _tile_dpbuud(5, 1, 3);
  _tile_dpbuud(6, 2, 3);
}

template <int TILE_K = 32>
struct GemmKernel133 {
  static const int TILE_M = 16;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 4;
  static const int OUTPUT_T_SIZE = 4;

  static const int M_STEP = TILE_M * 3;
  static const int N_STEP = TILE_N;
  static const int K_STEP = TILE_K;

  static int recommended_nth(int m) { return (m + M_STEP - 1) / M_STEP; }

  static void config() {
#ifdef HAVE_AMX
    TileConfig tile_config;

    for (int i = 0; i < 3; i++) tile_config.set_row_col(i, TILE_M, TILE_K);

    tile_config.set_row_col(3, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK);

    for (int i = 4; i < 7; i++) tile_config.set_row_col(i, TILE_M, TILE_N * OUTPUT_T_SIZE);

    tile_config.set_config();
#endif
  }

  template <typename TA, typename TB, typename TC>
  static void run_full_tile(const TA* a, size_t lda, const TB* b, size_t ldb, TC* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
    _tile_loadd(2, offset_pointer(a, lda * TILE_M * 2), lda);

    _tile_loadd(3, b, ldb);

    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, ldc * TILE_N), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_N * 2), ldc);

    dpb133<TA, TB>::run();

    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, ldc * TILE_N), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_N * 2), ldc);
#endif
  }

  template <typename TA, typename TB, typename TC>
  static void run_full_tile_zero(const TA* a, size_t lda, const TB* b, size_t ldb, TC* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
    _tile_loadd(2, offset_pointer(a, lda * TILE_M * 2), lda);

    _tile_loadd(3, b, ldb);

    _tile_zero(4);
    _tile_zero(5);
    _tile_zero(6);

    dpb133<TA, TB>::run();

    // debug_tiles(7);

    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, ldc * TILE_N), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_N * 2), ldc);
#endif
  }

  static void convert_full_tile_b_to_vnni_inplace(void* b) { transpose_16x8_32bit((__m256i*)b); }

  template <typename TA>
  struct ATile {
    TA v[3 * TILE_M * TILE_K];
    void partial_load(TA* a, int m, int k, size_t lda) {
      // memset(v, 0, sizeof(TA) * 3 * TILE_M * TILE_K);
      for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
          v[i * TILE_K + j] = a[i * lda + j];
        }
      }
    }

    void partial_load_quant(block_q4_0* a, int m, int k, size_t lda) {
      assert(k == 32);
      // memset(v, 0, sizeof(TA) * 3 * TILE_M * TILE_K);
      __m256i* vv = (__m256i*)v;
      for (int i = 0; i < m; i++) {
        vv[i] = dequant4x32(offset_pointer(a, lda * i)->qs);
        vv[i] = _mm256_sub_epi8(vv[i], _mm256_set1_epi8(8));
      }
    }

    void partial_load_quant(block_q8_0* a, int m, int k, size_t lda) {
      assert(k == 32);
      // memset(v, 0, sizeof(TA) * 3 * TILE_M * TILE_K);
      __m256i* vv = (__m256i*)v;
      for (int i = 0; i < m; i++) {
        vv[i] = unaligned_copy8x32(offset_pointer(a, lda * i)->qs);
      }
    }

    template <typename QA>
    void partial_load_quant(TA* a, int m, size_t lda) {
      // memset(v, 0, sizeof(TA) * 3 * TILE_M * TILE_K);
      if constexpr (std::is_same_v<QA, blocks_aligned_q8_0_ref>) {
        __m512i* vv = (__m512i*)v;
        for (int i = 0; i < m; i++) {
          vv[i] = copy8x64(offset_pointer(a, lda * i));
        }
      } else if constexpr (std::is_same_v<QA, blocks_aligned_q4_0_ref>) {
        assert(0);
      } else {
        assert(0);
      }
    }

    void partial_load_quant(block_q4_K* a, int m, int inner_block_idx, size_t lda) {
      // memset(v, 0, sizeof(TA) * 3 * TILE_M * TILE_K);
      __m256i* vv = (__m256i*)v;

      size_t qs_offset = inner_block_idx / 2 * 32;
      for (int i = 0; i < m; i++) {
        block_q4_K* spa = offset_pointer_row_major(a, i, 0, lda);
        if (inner_block_idx % 2 == 0) {
          vv[i] = lo4bit(spa->qs + qs_offset);
        } else {
          vv[i] = hi4bit(spa->qs + qs_offset);
        }
      }
    }

    void partial_load_quant(blocks_aligned_q8_0_ref a, int m, int k, int blck_stride) {
      // memset(v, 0, sizeof(TA) * 3 * TILE_M * TILE_K);
      __m512i* vv = (__m512i*)v;
      for (int i = 0; i < m; i++) {
        vv[i] = copy8x64(a.offset(blck_stride * i).qs);
      }
    }
  };

  template <typename TB>
  struct alignas(64) BTile {
    TB v[TILE_N * TILE_K];
    __m512 scale = {};

    void partial_load(TB* b, int n, int k, size_t ldb) {
      for (int i = 0; i < n; i++) {
        for (int j = 0; j < k; j++) {
          v[i * TILE_K + j] = b[i * ldb + j];
        }
      }
      transpose_16x8_32bit((__m256i*)v);
    }

    void partial_load_quant(block_q8_0* b, int n, int k, size_t ldb) {
      assert(k == 32);
      memset(v, 0, sizeof(TB) * TILE_K * TILE_N);
      __m256i* vv = (__m256i*)v;
      float* bss = reinterpret_cast<float*>(&scale);
      for (int i = 0; i < n; i++) {
        vv[i] = unaligned_copy8x32(offset_pointer(b, ldb * i)->qs);
        float sb = GGML_FP16_TO_FP32(offset_pointer_col_major(b, 0, i, ldb)->d);
        bss[i] = sb;
      }

      transpose_16x8_32bit(vv);
    }

    void partial_load_quant(blocks_aligned_q8_0_ref b, int n, int k, int blck_stride) {
      assert(k == 64);
      memset(v, 0, sizeof(TB) * TILE_K * TILE_N);
      __m512i* vv = (__m512i*)v;
      float* vs = reinterpret_cast<float*>(&scale);
      for (int i = 0; i < n; i++) {
        auto ref = b.offset(blck_stride * i);
        vv[i] = copy8x64(ref.qs);
        float sb = GGML_FP16_TO_FP32(*ref.d);
        vs[i] = sb;
      }
      transpose_16x16_32bit(vv);
    }

    void load_from(TB* b, size_t ldb) {
      __m256i* vb = (__m256i*)b;
      __m256i* vo = (__m256i*)v;
      for (int i = 0; i < 16; i++) {
        vo[i] = *offset_pointer(vb, ldb * i);
      }
      transpose_16x8_32bit(vo);
    }

    template <typename TA, typename TC>
    void run_full_ac(TA* a, size_t lda, TC* c, size_t ldc) {
      run_full_tile(a, lda, v, TILE_N * VNNI_BLK, c, ldc);
    }
  };

  template <typename TB>
  struct alignas(64) BTileSum {
    TB v[TILE_N * TILE_K];
    __m512 scale = {};
    __m512 sum = {};
    void partial_load_quant(block_q8_K* b, int n, int inner_block_idx, size_t ldb) {
      memset(v, 0, TILE_K * TILE_N);
      __m256i* vv = (__m256i*)v;
      float* scale_s = reinterpret_cast<float*>(&scale);
      float* sum_s = reinterpret_cast<float*>(&sum);
      for (int i = 0; i < n; i++) {
        block_q8_K* spb = offset_pointer_col_major(b, 0, i, ldb);
        vv[i] = unaligned_copy8x32(spb->qs + inner_block_idx * 32);
        scale_s[i] = spb->d;
        sum_s[i] =
            spb->bsums[inner_block_idx * 2] + spb->bsums[inner_block_idx * 2 + 1];  // TODO: may this will be slow
        // printf("scale[%d] = %f, sum_s[%d] = %f\n", i, scale_s[i], i,
        // sum_s[i]);
      }
      transpose_16x8_32bit(vv);
    }
  };
  template <typename TC>
  struct alignas(64) CTile {
    static_assert(sizeof(TC) == 4);
    TC v[3 * TILE_M * TILE_N] = {};

    void partial_load(TC* c, int m, int n, size_t ldc) {
      for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
          v[i * TILE_N + j] = offset_pointer(c, ldc * i)[j];
        }
      }
    }

    void partial_store(TC* c, int m, int n, size_t ldc) {
      for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
          offset_pointer(c, ldc * i)[j] = v[i * TILE_N + j];
        }
      }
    }

    void to_fp32() {
      __m512i* vv = (__m512i*)v;
      __m512* vf = (__m512*)v;
      for (int i = 0; i < 3 * TILE_M; i++) {
        vf[i] = _mm512_cvtepi32_ps(vv[i]);
      }
    }
  };

  template <typename TA, typename TB, typename TC>
  struct PartialTiles {
    ATile<TA> ta;
    BTile<TB> tb;
    CTile<TC> tc;
    void partial_run(int m, int n, int k, TA* a, size_t lda, TB* b, size_t ldb, TC* c, size_t ldc) {
      ta.partial_load(a, m, k, lda);
      tb.partial_load(b, n, k, ldb);
      tc.partial_load(c, m, n, ldc);
      run_full_tile(ta.v, TILE_K, tb.v, TILE_N * VNNI_BLK, tc.v, TILE_N * OUTPUT_T_SIZE);
      tc.partial_store(c, m, n, ldc);
    }

    template <typename QA>
    void partial_run_quant(int m, int n, int k, QA* a, size_t lda, block_q8_0* b, size_t ldb, float* c, size_t ldc) {
      assert(QK4_0 == 32);
      assert(QK8_0 == 32);

      ta.partial_load_quant(a, m, k, lda);
      tb.partial_load_quant(b, n, k, ldb);

      run_full_tile_zero(ta.v, TILE_K, tb.v, TILE_N * VNNI_BLK, tc.v, TILE_N * OUTPUT_T_SIZE);

      __m512i* cs = (__m512i*)tc.v;
      for (int i = 0; i < m; i++) {
        __m512 as = _mm512_set1_ps(GGML_FP16_TO_FP32(offset_pointer_row_major(a, i, 0, lda)->d));
        __m512* now = reinterpret_cast<__m512*>(offset_pointer_row_major(c, i, 0, ldc));
        *now = _mm512_fmadd_ps(_mm512_mul_ps(as, tb.scale), _mm512_cvtepi32_ps(cs[i]), *now);
      }
    }

    template <typename QA>
    void partial_run_quant_ac(int m, int n, int k, QA* a, size_t lda, float* c, size_t ldc) {
      assert(QK4_0 == 32);
      assert(QK8_0 == 32);

      ta.partial_load_quant(a, m, k, lda);

      run_full_tile_zero(ta.v, TILE_K, tb.v, TILE_N * VNNI_BLK, tc.v, TILE_N * OUTPUT_T_SIZE);

      __m512i* cs = (__m512i*)tc.v;
      for (int i = 0; i < m; i++) {
        __m512 as = _mm512_set1_ps(GGML_FP16_TO_FP32(offset_pointer_row_major(a, i, 0, lda)->d));
        __m512* now = reinterpret_cast<__m512*>(offset_pointer_row_major(c, i, 0, ldc));
        *now = _mm512_fmadd_ps(_mm512_mul_ps(as, tb.scale), _mm512_cvtepi32_ps(cs[i]), *now);
      }
    }

    template <typename AQA>
    void partial_run_quant_ac(int m, int n, int k, AQA a, int a_blck_stride, float* c, size_t ldc) {
      assert(AQA::block_size == 64);

      ta.partial_load_quant(a, m, k, a_blck_stride);

      run_full_tile_zero(ta.v, TILE_K, tb.v, TILE_N * VNNI_BLK, tc.v, TILE_N * OUTPUT_T_SIZE);

      __m512i* cs = (__m512i*)tc.v;
      for (int i = 0; i < m; i++) {
        __m512 as = _mm512_set1_ps(GGML_FP16_TO_FP32(*a.offset(i * a_blck_stride).d));
        // printf("%f\n", GGML_FP16_TO_FP32(*a.offset(i * a_blck_stride).d));
        __m512* now = reinterpret_cast<__m512*>(offset_pointer_row_major(c, i, 0, ldc));
        *now = _mm512_fmadd_ps(_mm512_mul_ps(as, tb.scale), _mm512_cvtepi32_ps(cs[i]), *now);
      }
    }
  };

  template <typename TA, typename TB, typename TC>
  struct PartialTilesSum {
    ATile<TA> ta;
    BTileSum<TB> tb;
    CTile<TC> tc;

    void partial_run_quant_ac(int m, int n, int inner_block_idx, block_q4_K* a, size_t lda, float* c, size_t ldc,
                              float a_scale, float a_min) {
      ta.partial_load_quant(a, m, inner_block_idx, lda);

      run_full_tile_zero(ta.v, TILE_K, tb.v, TILE_N * VNNI_BLK, tc.v, TILE_N * OUTPUT_T_SIZE);

      __m512i* cs = (__m512i*)tc.v;
      for (int i = 0; i < m; i++) {
        __m512* now = reinterpret_cast<__m512*>(offset_pointer_row_major(c, i, 0, ldc));
        *now = _mm512_fmadd_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_cvtepi32_ps(cs[i]), _mm512_set1_ps(a_scale)),
                                             _mm512_mul_ps(tb.sum, _mm512_set1_ps(a_min))),
                               tb.scale, *now);
        // C += Bscale * (Ascale * dp - Amin * Bsum)
      }
    }
  };
};

struct GemmKernel133BF {
  using dt = ggml_bf16_t;
  using output_t = float;
  static const int TILE_M = 16;
  static const int TILE_K = 32;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 2;

  static const int M_STEP = TILE_M * 3;
  static const int N_STEP = TILE_N;
  static const int K_STEP = TILE_K;

  static int recommended_nth(int m) { return (m + M_STEP - 1) / M_STEP; }
  static void config() {
#ifdef HAVE_AMX
    enable_amx();
    TileConfig tile_config;

    // size is 16 x 32
    for (int i = 0; i < 3; i++) tile_config.set_row_col(i, TILE_M, TILE_K * sizeof(dt));

    // size is 8 x 64
    tile_config.set_row_col(3, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK * sizeof(dt));

    // size is 16 x 64
    for (int i = 4; i < 7; i++) tile_config.set_row_col(i, TILE_M, TILE_N * sizeof(output_t));

    tile_config.set_config();
#endif
  }

  static void run_full_tile(const dt* a, size_t lda, const dt* b, size_t ldb, output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
    _tile_loadd(2, offset_pointer(a, lda * TILE_M * 2), lda);

    _tile_loadd(3, b, ldb);

    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, ldc * TILE_N), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_N * 2), ldc);

    _tile_dpbf16ps(4, 0, 3);
    _tile_dpbf16ps(5, 1, 3);
    _tile_dpbf16ps(6, 2, 3);

    // debug_tiles(7);

    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, ldc * TILE_N), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_N * 2), ldc);
#endif
  }

  struct ATile {
    dt v[3 * TILE_M * TILE_K];

    void partial_load(dt* a, int m, int k, size_t lda) {
      assert(k == TILE_K);
      __m512* vv = (__m512*)v;
      __m512* va = (__m512*)a;
      for (int i = 0; i < m; i++) {
        vv[i] = *offset_pointer_row_major(va, i, 0, lda);
      }
    }
  };

  struct alignas(64) BTile {
    dt v[TILE_N * TILE_K];

    void full_load(dt* b, size_t ldb) { partial_load(b, TILE_N, TILE_K, ldb); }

    void partial_load(dt* b, int n, int k, size_t ldb) {
      __m512* vv = (__m512*)v;
      __m512* vb = (__m512*)b;
      for (int i = 0; i < n; i++) {
        vv[i] = *offset_pointer_col_major(vb, 0, i, ldb);
      }
      transpose_16x16_32bit((__m512i*)v);
    }

    template <typename TA, typename TC>
    void run_full_ac(TA* a, size_t lda, TC* c, size_t ldc) {
      run_full_tile(a, lda, v, TILE_N * VNNI_BLK * sizeof(dt), c, ldc);
    }
  };

  struct alignas(64) CTile {
    output_t v[3 * TILE_M * TILE_N];
    // c must be 64 aligned, ldc must be 64 aligned
    void partial_load(float* c, int m, int n, size_t ldc) {
      assert(n <= TILE_N);
      __m512* vv = (__m512*)v;
      __m512* vc = (__m512*)c;
      for (int i = 0; i < m; i++) {
        vv[i] = *offset_pointer_row_major(vc, i, 0, ldc);
      }
    }

    void partial_store(float* c, int m, int n, size_t ldc) {
      assert(n <= TILE_N);
      __m512* vv = (__m512*)v;
      __m512* vc = (__m512*)c;
      for (int i = 0; i < m; i++) {
        *offset_pointer_row_major(vc, i, 0, ldc) = vv[i];
      }
    }
  };

  struct PartialTiles {
    ATile ta;
    BTile tb;
    CTile tc;
    void partial_run(int m, int n, int k, dt* a, size_t lda, dt* b, size_t ldb, output_t* c, size_t ldc) {
      ta.partial_load(a, m, k, lda);
      tb.partial_load(b, n, k, ldb);
      tc.partial_load(c, m, n, ldc);
      run_full_tile(ta.v, TILE_K * sizeof(dt), tb.v, TILE_N * VNNI_BLK * sizeof(dt), tc.v, TILE_N * sizeof(output_t));
      tc.partial_store(c, m, n, ldc);
    }
  };
};

template <typename T1, typename T2>
constexpr T2 convert_to(const T1& value) {
  if constexpr (std::is_same<T1, T2>::value) {
    return value;
  } else if constexpr (std::is_same<T1, ggml_bf16_t>::value && std::is_same<T2, float>::value) {
    return GGML_BF16_TO_FP32(value);
  } else if constexpr (std::is_same<T1, float>::value && std::is_same<T2, ggml_bf16_t>::value) {
    return GGML_FP32_TO_BF16(value);
  }
}

struct GemmKernel224BF {
  using dt = ggml_bf16_t;
  using output_t = float;
  static constexpr double ELEMENT_SIZE = 2;
  static const int TILE_M = 16;
  static const int TILE_K = 32;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 2;

  static const int M_STEP = TILE_M * 2;
  static const int N_STEP = TILE_N * 2;
  static const int K_STEP = TILE_K;

  static inline const int N_BLOCK = 256;
  static inline const int K_BLOCK = 1792;
  static std::string name() { return "BF16"; }

  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {
#ifdef HAVE_AMX
    enable_amx();
    TileConfig tile_config;

    // size is 16 x 32
    for (int i = 0; i < 2; i++) tile_config.set_row_col(i, TILE_M, TILE_K * sizeof(dt));

    // size is 16 x 32
    for (int i = 2; i < 4; i++) tile_config.set_row_col(i, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK * sizeof(dt));

    // size is 16 x 16
    for (int i = 4; i < 8; i++) tile_config.set_row_col(i, TILE_M, TILE_N * sizeof(output_t));

    tile_config.set_config();
#endif
  }

  static void load_a(dt* a, size_t lda) {
#ifdef HAVE_AMX
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
#else
    (void)a;
    (void)lda;
#endif
  }

  static void load_b(dt* b, size_t ldb) {
#ifdef HAVE_AMX
    _tile_loadd(2, b, ldb);
    _tile_loadd(3, offset_pointer(b, ldb * TILE_N), ldb);
#else
    (void)b;
    (void)ldb;
#endif
  }

  static void clean_c() {
#ifdef HAVE_AMX
    _tile_zero(4);
    _tile_zero(5);
    _tile_zero(6);
    _tile_zero(7);
#endif
  }

  static void load_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_loadd(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void store_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_stored(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void run_tile() {
#ifdef HAVE_AMX
    _tile_dpbf16ps(4, 0, 2);
    _tile_dpbf16ps(5, 0, 3);
    _tile_dpbf16ps(6, 1, 2);
    _tile_dpbf16ps(7, 1, 3);
#endif
  }

  struct BufferA {
    ggml_bf16_t* a;
    int max_m, k;

    static size_t required_size(int max_m, int k) { return sizeof(ggml_bf16_t) * max_m * k; }

    BufferA(int max_m, int k, void* ptr) : max_m(max_m), k(k) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(max_m % M_STEP == 0);
      assert(k % K_STEP == 0);
      a = reinterpret_cast<ggml_bf16_t*>(ptr);
    }

    void set_data(void* new_ptr) { a = reinterpret_cast<ggml_bf16_t*>(new_ptr); }

    void from_mat(int m, ggml_bf16_t* src, int ith, int nth) {
      assert(m <= max_m);
      assert(ith == 0 && nth == 1);
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
        for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
          int k_block_size = std::min(K_BLOCK, k - k_block_begin);
          for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
            for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
              __m512i* s = (__m512i*)(src + (m_begin + i) * k + k_block_begin + k_begin);
              __m512i* d =
                  (__m512i*)(a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP + i * K_STEP);
              avx512_copy_32xbf16(s, d);
            }
          }
        }
      }
    }

    ggml_bf16_t* get_submat(int m, int k, int m_begin, int k_begin) {
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
      k_begin -= k_block_begin;
      int k_block_size = std::min(K_BLOCK, k - k_block_begin);
      return a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP;
    }
  };

  struct BufferB {
    ggml_bf16_t* b;
    int n, k;
    static constexpr bool SCALE = false;

    static size_t required_size(int n, int k) { return sizeof(ggml_bf16_t) * n * k; }

    BufferB(int n, int k, void* ptr) : n(n), k(k) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(n % N_STEP == 0);
      assert(k % K_STEP == 0);
      b = reinterpret_cast<ggml_bf16_t*>(ptr);
    }

    void set_data(void* new_ptr) { b = reinterpret_cast<ggml_bf16_t*>(new_ptr); }

    void from_mat(ggml_bf16_t* src, int ith, int nth) {
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
          int k_block_size = std::min(K_BLOCK, k - k_block_begin);
          for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
            for (int i = 0; i < N_STEP; i++) {
              __m512i* s = (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin);
              __m512i* d = (__m512i*)(b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                      k_begin * N_STEP + i * K_STEP);
              avx512_copy_32xbf16(s, d);
            }
            transpose_16x16_32bit((__m512i*)(b + n_block_begin * k + k_block_begin * n_block_size +
                                             n_begin * k_block_size + k_begin * N_STEP));
            transpose_16x16_32bit((__m512i*)(b + n_block_begin * k + k_block_begin * n_block_size +
                                             n_begin * k_block_size + k_begin * N_STEP + TILE_N * K_STEP));
          }
        }
      }
    }

    ggml_bf16_t* get_submat(int n, int k, int n_begin, int k_begin) {
      int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
      n_begin -= n_block_begin;
      int n_block_size = std::min(N_BLOCK, n - n_block_begin);
      int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
      k_begin -= k_block_begin;
      int k_block_size = std::min(K_BLOCK, k - k_block_begin);
      return b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP;
    }
  };

  struct BufferC {
    float* c;
    int max_m, n;
    // 物理布局(按 float 元素数)：
    // 逻辑矩阵 C 为 (max_m, n) 行主序，max_m 为 M_STEP 的倍数，
    // n 按 N_BLOCK 分块。
    // 存储顺序：
    //   n_block(N_BLOCK 列) → m_block(M_STEP 行) → n_step(N_STEP 列) → (M_STEP×N_STEP) 行主序 tile。
    // 因此可视为 5D：
    //   c[n_blocks][m_blocks][n_steps][M_STEP][N_STEP]，
    //   n_blocks = ceil(n / N_BLOCK)，m_blocks = max_m / M_STEP，
    //   n_steps = N_BLOCK / N_STEP（尾块可能更小）。
    // get_submat(m_begin, n_begin) 返回连续的 (M_STEP×N_STEP) tile 起始地址。

    static size_t required_size(int max_m, int n) { return sizeof(float) * max_m * n; }

    BufferC(int max_m, int n, void* ptr) : max_m(max_m), n(n) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(max_m % M_STEP == 0);
      assert(n % N_STEP == 0);
      c = reinterpret_cast<float*>(ptr);
    }

    void set_data(void* new_ptr) { c = reinterpret_cast<float*>(new_ptr); }

    void to_mat(int m, ggml_bf16_t* dst, int ith, int nth) {
      assert(m <= max_m);
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
        for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
          for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
            __m512* x0 =
                (__m512*)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP + i * N_STEP);
            __m512* x1 = (__m512*)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP +
                                   i * N_STEP + 16);
            avx512_32xfp32_to_32xbf16(x0, x1, (__m512i*)(dst + (m_begin + i) * n + n_block_begin + n_begin));
          }
        }
      }
    }

    float* get_submat(int m, int n, int m_begin, int n_begin) {
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
      int n_block_size = std::min(N_BLOCK, n - n_block_begin);
      n_begin -= n_block_begin;
      return c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP;
    }
  };
};

struct GemmKernel224Int8 {
  using dt = int8_t;
  using output_t = int32_t;
  static constexpr double ELEMENT_SIZE = 1;
  static const int TILE_M = 16;
  static const int TILE_K = 64;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 4;

  static const int M_STEP = TILE_M * 2;
  static const int N_STEP = TILE_N * 2;
  static const int K_STEP = TILE_K;

  // static inline const int N_BLOCK = 256;
  static inline const int N_BLOCK = 64;
  // static inline const int N_BLOCK = 32;
  static inline const int K_BLOCK = 3584;
  static std::string name() { return "INT8"; }

  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {
#ifdef HAVE_AMX
    enable_amx();
    TileConfig tile_config;

    // size is 16 x 64
    for (int i = 0; i < 2; i++) tile_config.set_row_col(i, TILE_M, TILE_K * sizeof(dt));

    // size is 16 x 64
    for (int i = 2; i < 4; i++) tile_config.set_row_col(i, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK * sizeof(dt));

    // size is 16 x 16
    for (int i = 4; i < 8; i++) tile_config.set_row_col(i, TILE_M, TILE_N * sizeof(output_t));

    tile_config.set_config();
#endif
  }

  static void load_a(dt* a, size_t lda) {
#ifdef HAVE_AMX
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
#else
    (void)a;
    (void)lda;
#endif
  }

  static void load_b(dt* b, size_t ldb) {
#ifdef HAVE_AMX
    _tile_loadd(2, b, ldb);
    _tile_loadd(3, offset_pointer(b, ldb * TILE_N), ldb);
#else
    (void)b;
    (void)ldb;
#endif
  }

  static void clean_c() {
#ifdef HAVE_AMX
    _tile_zero(4);
    _tile_zero(5);
    _tile_zero(6);
    _tile_zero(7);
#endif
  }

  static void load_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_loadd(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void store_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_stored(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void run_tile() {
#ifdef HAVE_AMX
    _tile_dpbssd(4, 0, 2);
    _tile_dpbssd(5, 0, 3);
    _tile_dpbssd(6, 1, 2);
    _tile_dpbssd(7, 1, 3);
#endif
  }

  using BufferA = BufferAImpl<GemmKernel224Int8>;
  using BufferC = BufferCImpl<GemmKernel224Int8>;

  struct BufferB {
    int8_t* b;
    float* d;
    int n, k;
    static constexpr bool SCALE = true;

    static size_t required_size(int n, int k) { return sizeof(int8_t) * n * k + sizeof(float) * n; }

    BufferB(int n, int k, void* ptr) : n(n), k(k) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(n % N_STEP == 0);
      assert(k % K_STEP == 0);
      if (n % N_STEP || k % K_STEP) {
        printf("n: %d, k: %d, N_STEP: %d, K_STEP: %d\n", n, k, N_STEP, K_STEP);
        throw std::runtime_error("BufferB: n and k must be multiples of N_STEP and K_STEP");
      }
      b = reinterpret_cast<int8_t*>(ptr);
      d = reinterpret_cast<float*>(b + n * k);
    }

    void from_mat(ggml_bf16_t* src, int ith, int nth) {  // CHECK: nth has no usage
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int i = 0; i < N_STEP; i++) {
          float amax = 0.0f;
          for (int j = 0; j < k; j += 32) {
            __m512 f0, f1;
            avx512_32xbf16_to_32xfp32((__m512i*)(src + (n_block_begin + n_begin + i) * k + j), &f0, &f1);
            amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
            amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
          }
          d[n_block_begin + n_begin + i] = amax / ((1 << 7) - 1);
        }
      }
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
          int k_block_size = std::min(K_BLOCK, k - k_block_begin);
          for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
            for (int i = 0; i < N_STEP; i++) {
              __m512 id = _mm512_set1_ps(d[n_block_begin + n_begin + i] ? 1.0f / d[n_block_begin + n_begin + i] : 0.0f);
              int8_t* dst = b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                            k_begin * N_STEP + i * K_STEP;
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32((__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin),
                                        &f0, &f1);
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
              __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id));
              __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id));
              __m512i i2 = _mm512_cvtps_epi32(_mm512_mul_ps(f2, id));
              __m512i i3 = _mm512_cvtps_epi32(_mm512_mul_ps(f3, id));
              __m128i s0 = _mm512_cvtsepi32_epi8(i0);
              __m128i s1 = _mm512_cvtsepi32_epi8(i1);
              __m128i s2 = _mm512_cvtsepi32_epi8(i2);
              __m128i s3 = _mm512_cvtsepi32_epi8(i3);
              _mm_store_si128((__m128i*)dst, s0);
              _mm_store_si128((__m128i*)(dst + 16), s1);
              _mm_store_si128((__m128i*)(dst + 32), s2);
              _mm_store_si128((__m128i*)(dst + 48), s3);
            }
            transpose_16x16_32bit((__m512i*)(b + n_block_begin * k + k_block_begin * n_block_size +
                                             n_begin * k_block_size + k_begin * N_STEP));
            transpose_16x16_32bit((__m512i*)(b + n_block_begin * k + k_block_begin * n_block_size +
                                             n_begin * k_block_size + k_begin * N_STEP + TILE_N * K_STEP));
          }
        }
      }
    }

    int8_t* get_submat(int n, int k, int n_begin, int k_begin) {
      int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
      n_begin -= n_block_begin;
      int n_block_size = std::min(N_BLOCK, n - n_block_begin);
      int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
      k_begin -= k_block_begin;
      int k_block_size = std::min(K_BLOCK, k - k_block_begin);
      return b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP;
    }

    float* get_scale(int n, int n_begin) { return d + n_begin; }
  };

  static void amx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, float* c, BufferA* ba,
                         BufferB* bb) {
    using K = GemmKernel224Int8;
    if (k_block_begin == 0) {
      K::clean_c();
    } else {
      K::load_c((int32_t*)c, K::N_STEP * sizeof(int32_t));
    }
    for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::K_STEP) {
      K::load_a(ba->get_submat(m, k, m_begin, k_block_begin + k_begin), K::K_STEP * sizeof(int8_t));
      K::load_b(bb->get_submat(n, k, n_begin, k_block_begin + k_begin), K::K_STEP * sizeof(int8_t));
      K::run_tile();
    }
    K::store_c((int32_t*)c, K::N_STEP * sizeof(int32_t));
  }
  static void avx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, float* c, BufferA* ba,
                         BufferB* bb) {
    __m512i* c512 = (__m512i*)c;
    int m_block_end = std::min(m - m_begin, M_STEP);
    if (k_block_begin == 0) {
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        c512[m_i * 2] = _mm512_setzero_si512();
        c512[m_i * 2 + 1] = _mm512_setzero_si512();
      }
    }

    for (int k_begin = 0; k_begin < K_BLOCK && k_block_begin + k_begin < k; k_begin += K_STEP) {
      static_assert(K_STEP * sizeof(int8_t) == sizeof(__m512i));
      static_assert(N_STEP / TILE_N == 2, "Must be lke this");

      int32_t* a32 = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
      __m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        for (int k_i = 0; k_i < 16; k_i++) {
          __m512i ma = _mm512_set1_epi32(a32[m_i * 16 + k_i]);
          for (int n_i = 0; n_i < 2; n_i++) {
            c512[m_i * 2 + n_i] = _mm512_dpbssd_epi32(c512[m_i * 2 + n_i], ma, b512[n_i * 16 + k_i]);
          }
        }
      }
    }
  }

  static void apply_scale(int m, int n, int m_begin, int n_begin, float* c, BufferA* ba, BufferB* bb) {
    using K = GemmKernel224Int8;
    int to = m - m_begin;
    if (m - m_begin > K::M_STEP) {
      to = K::M_STEP;
    }
    for (int i = 0; i < to; i++) {
      __m512 as = _mm512_set1_ps(*ba->get_scale(m, m_begin + i));
      __m512 bs = _mm512_load_ps(bb->get_scale(n, n_begin));
      __m512i now = _mm512_load_si512((__m512i*)(c + i * K::N_STEP));
      __m512 result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
      _mm512_store_ps((__m512*)(c + i * K::N_STEP), result);
      bs = _mm512_load_ps(bb->get_scale(n, n_begin) + K::TILE_N);
      now = _mm512_load_si512((__m512i*)(c + i * K::N_STEP + K::TILE_N));
      result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
      _mm512_store_ps((__m512*)(c + i * K::N_STEP + K::TILE_N), result);
    }
  }
};

struct GemmKernel224Int4 {
  using dt = void;
  using output_t = int32_t;
  static constexpr double ELEMENT_SIZE = 0.5;
  static const int TILE_M = 16;
  static const int TILE_K = 64;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 4;

  static const int M_STEP = TILE_M * 2;
  static const int N_STEP = TILE_N * 2;
  static const int K_STEP = TILE_K;

  // static inline const int N_BLOCK = 256;
  static inline const int N_BLOCK = 128;
  // static inline const int N_BLOCK = 64;
  // static inline const int K_BLOCK = 7168;
  static inline const int K_BLOCK = 3584;
  // static inline const int K_BLOCK = 2560;

  static std::string name() { return "INT4"; }

  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {
#ifdef HAVE_AMX
    enable_amx();
    TileConfig tile_config;

    // size is 16 x 64
    for (int i = 0; i < 2; i++) tile_config.set_row_col(i, TILE_M, TILE_K);

    // size is 16 x 64
    for (int i = 2; i < 4; i++) tile_config.set_row_col(i, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK);

    // size is 16 x 16
    for (int i = 4; i < 8; i++) tile_config.set_row_col(i, TILE_M, TILE_N * sizeof(output_t));

    tile_config.set_config();
#endif
  }

  alignas(64) static constexpr uint8_t hi_mask_arr[64] = {
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0};

  alignas(64) static constexpr uint8_t lo_mask_arr[64] = {
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F};

  alignas(64) static constexpr uint8_t sign_mask_arr[64] = {
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  };

  static __m512i hi_mask() { return *((__m512i*)(&hi_mask_arr[0])); }
  static __m128i hi_mask_128() { return *((__m128i*)(&hi_mask_arr[0])); }
  static __m512i lo_mask() { return *((__m512i*)(&lo_mask_arr[0])); }
  static __m128i lo_mask_128() { return *((__m128i*)(&lo_mask_arr[0])); }
  static __m128i si_mask_128() { return *((__m128i*)(&sign_mask_arr[0])); }

  static void load_b_hi(dt* b, size_t ldb) {
#ifdef HAVE_AMX
    // 在函数内部分配一个局部(栈上)对齐缓冲区
    alignas(64) int8_t local_buffer[TILE_N * TILE_K];
    __m512i* db = reinterpret_cast<__m512i*>(local_buffer);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_and_si512(hi_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * i)));
    }
    asm volatile("" ::: "memory");
    _tile_loadd(2, db, TILE_K);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_and_si512(hi_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * (i + TILE_N))));
    }
    asm volatile("" ::: "memory");
    _tile_loadd(3, db, TILE_K);
#else
    (void)b;
    (void)ldb;
#endif
  }

  static void load_b_lo(dt* b, size_t ldb) {
#ifdef HAVE_AMX
    // 在函数内部分配一个局部(栈上)对齐缓冲区
    alignas(64) int8_t local_buffer[TILE_N * TILE_K];
    __m512i* db = reinterpret_cast<__m512i*>(local_buffer);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_slli_epi32(_mm512_and_si512(lo_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * i))), 4);
    }
    asm volatile("" ::: "memory");
    _tile_loadd(2, db, TILE_K);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_slli_epi32(
          _mm512_and_si512(lo_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * (i + TILE_N)))), 4);
    }
    asm volatile("" ::: "memory");
    _tile_loadd(3, db, TILE_K);
#else
    (void)b;
    (void)ldb;
#endif
  }

  static void load_a(dt* a, size_t lda) {
#ifdef HAVE_AMX
    _tile_stream_loadd(0, a, lda);
    _tile_stream_loadd(1, offset_pointer(a, lda * TILE_M), lda);
#else
    (void)a;
    (void)lda;
#endif
  }

  static void clean_c() {
#ifdef HAVE_AMX
    _tile_zero(4);
    _tile_zero(5);
    _tile_zero(6);
    _tile_zero(7);
#endif
  }

  static void load_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_loadd(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void store_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_stored(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void run_tile() {
#ifdef HAVE_AMX
    _tile_dpbssd(4, 0, 2);
    _tile_dpbssd(5, 0, 3);
    _tile_dpbssd(6, 1, 2);
    _tile_dpbssd(7, 1, 3);
#endif
  }

  using BufferA = BufferAImpl<GemmKernel224Int4>;
  using BufferB = BufferBInt4Impl<GemmKernel224Int4>;
  using BufferC = BufferCImpl<GemmKernel224Int4>;

  static void avx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, float* c, BufferA* ba,
                         BufferB* bb) {
    using K = GemmKernel224Int4;
    __m512i* c512 = (__m512i*)c;
    int m_block_end = std::min(m - m_begin, M_STEP);
    if (k_block_begin == 0) {
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        c512[m_i * 2] = _mm512_setzero_si512();
        c512[m_i * 2 + 1] = _mm512_setzero_si512();
      }
    }

    for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::BufferB::B_K_STEP) {
      int32_t* a32_lo = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
      int32_t* a32_hi = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin + K::K_STEP);
      __m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        for (int k_i = 0; k_i < 16; k_i++) {
          __m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
          __m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
          for (int n_i = 0; n_i < 2; n_i++) {
            __m512i b512_lo = _mm512_slli_epi32(_mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]), 4);
            c512[m_i * 2 + n_i] = _mm512_dpbssd_epi32(c512[m_i * 2 + n_i], ma_lo, b512_lo);
            __m512i b512_hi = _mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]);
            c512[m_i * 2 + n_i] = _mm512_dpbssd_epi32(c512[m_i * 2 + n_i], ma_hi, b512_hi);
          }
        }
      }
    }
  }
  static void amx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, float* c, BufferA* ba,
                         BufferB* bb) {
    using K = GemmKernel224Int4;
    if (k_block_begin == 0) {
      K::clean_c();
    } else {
      // printf("load from c int4\n");
      K::load_c((int32_t*)c, K::N_STEP * sizeof(int32_t));
    }
    for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::BufferB::B_K_STEP) {
      K::load_a(ba->get_submat(m, k, m_begin, k_block_begin + k_begin), K::K_STEP * sizeof(int8_t));
      K::load_b_lo(bb->get_submat(n, k, n_begin, k_block_begin + k_begin), K::BufferB::B_K_STEP / 2);
      K::run_tile();
      // DEBUG
      // if(m_begin == 0 && n_begin == 0 && k_begin==0){
      //   int8_t *ba_ptr = ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
      //   int8_t *bb_ptr = (int8_t *)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
      //   printf("k_begin:%d,k_block_begin:%d\n",k_begin,k_block_begin);
      //   for(int j=0;j<4096;j++){
      //     printf("a[%d]: %d ", j, ba_ptr[j]);
      //   }
      //   printf("\n");
      //   for(int j=0;j<4096;j++){
      //     printf("b[%d]: %d ", j, bb_ptr[j]);
      //   }
      //   printf("\n");
      // }

      K::load_a(ba->get_submat(m, k, m_begin, k_block_begin + k_begin + K::K_STEP), K::K_STEP * sizeof(int8_t));
      K::load_b_hi(bb->get_submat(n, k, n_begin, k_block_begin + k_begin), K::BufferB::B_K_STEP / 2);
      K::run_tile();
    }

    // debug_tiles_224();
    K::store_c((int32_t*)c, K::N_STEP * sizeof(int32_t));
    // DEBUG c 的值,第一行的前 30 列
    // printf("\nint4, m_begin:%d,n_begin:%d,k_block_begin:%d\n",m_begin,n_begin,k_block_begin);
    // for(int j=0;j<30;j++){
    //   printf("c[%d]: %d ", j, ((int32_t *)c)[j]);
    // }
    // printf("\n");
  }

  static void apply_scale(int m, int n, int m_begin, int n_begin, float* c, BufferA* ba, BufferB* bb) {
    using K = GemmKernel224Int4;
    int to = m - m_begin;
    if (m - m_begin > K::M_STEP) {
      to = K::M_STEP;
    }
    for (int i = 0; i < to; i++) {
      __m512 as = _mm512_set1_ps(*ba->get_scale(m, m_begin + i));
      __m512 bs = _mm512_load_ps(bb->get_scale(n, n_begin));
      __m512i now = _mm512_load_epi32((__m512i*)(c + i * K::N_STEP));
      __m512 result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
      // if(i==0){
      //   printf("\nnormal\n");
      //   printf("m_begin:%d,n_begin:%d\n", m_begin, n_begin);
      //   // 打印 result 结果，16 个 float 数值
      //   for(int j = 0; j < 16; j++) {
      //     float val = *((float *) &result + j);
      //     int32_t now_val = *((int32_t *) &now + j);
      //     printf("result[%d]: %f,now:%d ", j, val, now_val);
      //   }
      //   printf("\n");
      // }
      _mm512_store_ps((__m512*)(c + i * K::N_STEP), result);
      bs = _mm512_load_ps(bb->get_scale(n, n_begin) + K::TILE_N);
      now = _mm512_load_si512((__m512i*)(c + i * K::N_STEP + K::TILE_N));
      result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
      // if(i==0){
      //   printf("\nnormal\n");
      //   printf("m_begin:%d,n_begin:%d\n", m_begin, n_begin);
      //   // 打印 result 结果，16 个 float 数值
      //   for(int j = 0; j < 16; j++) {
      //     float val = *((float *) &result + j);
      //     int32_t now_val = *((int32_t *) &now + j);
      //     printf("result[%d]: %f,now:%d ", j+16, val, now_val);
      //   }
      //   printf("\n");
      // }
      _mm512_store_ps((__m512*)(c + i * K::N_STEP + K::TILE_N), result);
    }
  }
};

struct GemmKernel224Int4_1 {
  using dt = void;
  using output_t = int32_t;
  static constexpr double ELEMENT_SIZE = 0.5;
  static const int TILE_M = 16;
  static const int TILE_K = 64;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 4;

  static const int M_STEP = TILE_M * 2;
  static const int N_STEP = TILE_N * 2;
  static const int K_STEP = TILE_K;

  static inline const int N_BLOCK = 256;
  // static inline const int K_BLOCK = 7168;
  static inline const int K_BLOCK = 3584;
  // static inline const int K_BLOCK = 2560;
  static std::string name() { return "INT4_1"; }

  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {
#ifdef HAVE_AMX
    enable_amx();
    TileConfig tile_config;

    // size is 16 x 64
    for (int i = 0; i < 2; i++) tile_config.set_row_col(i, TILE_M, TILE_K);

    // size is 16 x 64
    for (int i = 2; i < 4; i++) tile_config.set_row_col(i, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK);

    // size is 16 x 16
    for (int i = 4; i < 8; i++) tile_config.set_row_col(i, TILE_M, TILE_N * sizeof(output_t));

    tile_config.set_config();
#endif
  }

  alignas(64) static constexpr uint8_t hi_mask_arr[64] = {
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0};

  alignas(64) static constexpr uint8_t lo_mask_arr[64] = {
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F};

  alignas(64) static constexpr uint8_t sign_mask_arr[64] = {
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  };

  static __m512i hi_mask() { return *((__m512i*)(&hi_mask_arr[0])); }
  static __m128i hi_mask_128() { return *((__m128i*)(&hi_mask_arr[0])); }
  static __m512i lo_mask() { return *((__m512i*)(&lo_mask_arr[0])); }
  static __m128i lo_mask_128() { return *((__m128i*)(&lo_mask_arr[0])); }
  static __m128i si_mask_128() { return *((__m128i*)(&sign_mask_arr[0])); }

  static void load_b_hi(dt* b, size_t ldb) {
#ifdef HAVE_AMX
    // 在函数内部分配一个局部(栈上)对齐缓冲区
    alignas(64) int8_t local_buffer[TILE_N * TILE_K];
    __m512i* db = reinterpret_cast<__m512i*>(local_buffer);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_and_si512(hi_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * i)));
    }
    asm volatile("" ::: "memory");
    _tile_loadd(2, db, TILE_K);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_and_si512(hi_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * (i + TILE_N))));
    }
    asm volatile("" ::: "memory");
    _tile_loadd(3, db, TILE_K);
#else
    (void)b;
    (void)ldb;
#endif
  }

  static void load_b_lo(dt* b, size_t ldb) {
#ifdef HAVE_AMX
    // 在函数内部分配一个局部(栈上)对齐缓冲区
    alignas(64) int8_t local_buffer[TILE_N * TILE_K];
    __m512i* db = reinterpret_cast<__m512i*>(local_buffer);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_slli_epi32(_mm512_and_si512(lo_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * i))), 4);
    }
    asm volatile("" ::: "memory");
    _tile_loadd(2, db, TILE_K);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_slli_epi32(
          _mm512_and_si512(lo_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * (i + TILE_N)))), 4);
    }
    asm volatile("" ::: "memory");
    _tile_loadd(3, db, TILE_K);
#else
    (void)b;
    (void)ldb;
#endif
  }

  static void load_a(dt* a, size_t lda) {
#ifdef HAVE_AMX
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
#else
    (void)a;
    (void)lda;
#endif
  }

  // static void load_b(dt* b, size_t ldb) {
  //   _tile_loadd(2, b, ldb);
  //   _tile_loadd(3, offset_pointer(b, ldb * TILE_N), ldb);
  // }

  static void clean_c() {
#ifdef HAVE_AMX
    _tile_zero(4);
    _tile_zero(5);
    _tile_zero(6);
    _tile_zero(7);
#endif
  }

  static void load_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_loadd(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void store_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_stored(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void run_tile() {
#ifdef HAVE_AMX
    _tile_dpbsud(4, 0, 2);
    _tile_dpbsud(5, 0, 3);
    _tile_dpbsud(6, 1, 2);
    _tile_dpbsud(7, 1, 3);
#endif
  }

  using BufferA = BufferAWithSumImpl<GemmKernel224Int4_1>;

  using BufferB = BufferBInt4WithZeroImpl<GemmKernel224Int4_1>;

  using BufferC = BufferCImpl<GemmKernel224Int4_1>;

  static void avx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, float* c, BufferA* ba,
                         BufferB* bb) {
    using K = GemmKernel224Int4_1;
    __m512i* c512 = (__m512i*)c;
    int m_block_end = std::min(m - m_begin, M_STEP);
    if (k_block_begin == 0) {
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        c512[m_i * 2] = _mm512_setzero_si512();
        c512[m_i * 2 + 1] = _mm512_setzero_si512();
      }
    }
    for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::BufferB::B_K_STEP) {
      int32_t* a32_lo = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
      int32_t* a32_hi = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin + K::K_STEP);
      __m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        for (int k_i = 0; k_i < 16; k_i++) {
          __m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
          __m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
          for (int n_i = 0; n_i < 2; n_i++) {
            __m512i b512_lo = _mm512_slli_epi32(_mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]), 4);
            c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_lo, ma_lo);
            __m512i b512_hi = _mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]);
            c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_hi, ma_hi);
          }
        }
      }
    }
  }
  static void amx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, float* c, BufferA* ba,
                         BufferB* bb) {
    using K = GemmKernel224Int4_1;
    if (k_block_begin == 0) {
      K::clean_c();
    } else {
      K::load_c((int32_t*)c, K::N_STEP * sizeof(int32_t));
    }
    for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::BufferB::B_K_STEP) {
      // printf("offset a %ld\n", pointer_offset(ba->get_submat(m, k, m_begin, k_block_begin + k_begin),
      // ba->a)); printf("offset b %ld\n", pointer_offset(bb->get_submat(n, k, n_begin, k_block_begin +
      // k_begin), bb->b));
      K::load_a(ba->get_submat(m, k, m_begin, k_block_begin + k_begin), K::K_STEP * sizeof(int8_t));
      K::load_b_lo(bb->get_submat(n, k, n_begin, k_block_begin + k_begin), K::BufferB::B_K_STEP / 2);
      K::run_tile();
      // DEBUG
      // if(m_begin == 0 && n_begin == 0 && k_begin==0){
      //   int8_t *ba_ptr = ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
      //   int8_t *bb_ptr = (int8_t *)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
      //   printf("k_begin:%d,k_block_begin:%d\n",k_begin,k_block_begin);
      //   for(int j=0;j<2048;j++){
      //     printf("a[%d]: %d ", j, ba_ptr[j]);
      //   }
      //   printf("\n");
      //   for(int j=0;j<2048;j++){
      //     printf("b[%d]: %d ", j, bb_ptr[j]);
      //   }
      //   printf("\n");
      // }
      K::load_a(ba->get_submat(m, k, m_begin, k_block_begin + k_begin + K::K_STEP), K::K_STEP * sizeof(int8_t));
      K::load_b_hi(bb->get_submat(n, k, n_begin, k_block_begin + k_begin), K::BufferB::B_K_STEP / 2);
      K::run_tile();
    }

    // debug_tiles_224();
    K::store_c((int32_t*)c, K::N_STEP * sizeof(int32_t));
    // DEBUG c 的值,第一行的前 30 列
    // printf("\nint4_1, m_begin:%d,n_begin:%d,k_block_begin:%d\n",m_begin,n_begin,k_block_begin);
    // for(int j=0;j<30;j++){
    //   printf("c[%d]: %d ", j, ((int32_t *)c)[j]);
    // }
    // printf("\n");
  }

  static void apply_scale(int m, int n, int m_begin, int n_begin, float* c, BufferA* ba, BufferB* bb) {
    using K = GemmKernel224Int4_1;
    int to = m - m_begin;
    if (m - m_begin > K::M_STEP) {
      to = K::M_STEP;
    }
    for (int i = 0; i < to; i++) {
      __m512 as = _mm512_set1_ps(*ba->get_scale(m, m_begin + i));
      __m512 asum = _mm512_set1_ps(*ba->get_sum(m, m_begin + i));

      __m512 bs = _mm512_load_ps(bb->get_scale(n, n_begin));
      __m512 b_mins = _mm512_load_ps(bb->get_min(n, n_begin));
      __m512i now = _mm512_load_epi32((__m512i*)(c + i * K::N_STEP));
      __m512 result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
      result = _mm512_add_ps(result, _mm512_mul_ps(asum, b_mins));
      _mm512_store_ps((__m512*)(c + i * K::N_STEP), result);

      bs = _mm512_load_ps(bb->get_scale(n, n_begin) + K::TILE_N);
      b_mins = _mm512_load_ps(bb->get_min(n, n_begin) + K::TILE_N);
      now = _mm512_load_si512((__m512i*)(c + i * K::N_STEP + K::TILE_N));
      result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
      result = _mm512_add_ps(result, _mm512_mul_ps(asum, b_mins));
      _mm512_store_ps((__m512*)(c + i * K::N_STEP + K::TILE_N), result);
    }
  }
};

template <typename TA, typename TB, typename TC>
void mat_mul_single(int m, int n, int k, TA* a, size_t lda, TB* b, size_t ldb, TC* c, size_t ldc);
template <>
inline void mat_mul_single(int m, int n, int k, int8_t* a, size_t lda, int8_t* b, size_t ldb, int32_t* c, size_t ldc) {
  using Kernel = GemmKernel133<32>;
  for (int m_begin = 0; m_begin < m; m_begin += GemmKernel133<32>::M_STEP) {
    int m_end = std::min(m_begin + GemmKernel133<32>::M_STEP, m);
    for (int n_begin = 0; n_begin < n; n_begin += GemmKernel133<32>::N_STEP) {
      int n_end = std::min(n_begin + GemmKernel133<32>::N_STEP, n);
      for (int k_begin = 0; k_begin < k; k_begin += GemmKernel133<32>::K_STEP) {
        int k_end = std::min(k_begin + GemmKernel133<32>::K_STEP, k);
        int8_t* as = offset_pointer_row_major(a, m_begin, k_begin, lda);
        int8_t* bs = offset_pointer_col_major(b, k_begin, n_begin, ldb);
        int32_t* cs = offset_pointer_row_major(c, m_begin, n_begin, ldc);
        GemmKernel133<32>::BTile<int8_t> tb;
        if (n_end - n_begin == GemmKernel133<32>::N_STEP && k_end - k_begin == GemmKernel133<32>::K_STEP) {
          tb.load_from(bs, ldb);
        } else {
          tb.partial_load(bs, n_end - n_begin, k_end - k_begin, ldb);
        }
        if (m_end - m_begin == GemmKernel133<32>::M_STEP && k_end - k_begin == GemmKernel133<32>::K_STEP) {
          // printf("sub mat mul, full tile: (%d,%d),(%d,%d),(%d,%d)\n",
          // m_begin, m_end, n_begin, n_end, k_begin, k_end);
          tb.run_full_ac(as, lda, cs, ldc);
        } else {
          // printf("sub mat mul, partial tile: (%d,%d),(%d,%d),(%d,%d)\n",
          // m_begin, m_end, n_begin, n_end, k_begin, k_end);
          GemmKernel133<32>::PartialTiles<int8_t, int8_t, int32_t> p;
          p.partial_run(m_end - m_begin, n_end - n_begin, k_end - k_begin, as, lda, bs, ldb, cs, ldc);
        }
      }
    }
  }
}

template <>
inline void mat_mul_single(int m, int n, int k, ggml_bf16_t* a, size_t lda, ggml_bf16_t* b, size_t ldb, float* c,
                           size_t ldc) {
  // // GemmKernel133BF::config();

  // for (int m_begin = 0; m_begin < m; m_begin += GemmKernel133BF::M_STEP) {
  //   int m_end = std::min(m_begin + GemmKernel133BF::M_STEP, m);
  //   for (int n_begin = 0; n_begin < n; n_begin += GemmKernel133BF::N_STEP) {
  //     int n_end = std::min(n_begin + GemmKernel133BF::N_STEP, n);

  //     for (int k_begin = 0; k_begin < k; k_begin += GemmKernel133BF::K_STEP)
  //     {
  //       int k_end = std::min(k_begin + GemmKernel133BF::K_STEP, k);

  //       ggml_bf16_t* as = offset_pointer_row_major(a, m_begin, k_begin, lda);
  //       ggml_bf16_t* bs = offset_pointer_col_major(b, k_begin, n_begin, ldb);
  //       GemmKernel133BF::BTile tb;
  //       if (n_end - n_begin == GemmKernel133BF::N_STEP && k_end - k_begin ==
  //       GemmKernel133BF::K_STEP) {
  //         tb.full_load(bs, ldb);
  //       } else {
  //         tb.partial_load(bs, n_end - n_begin, k_end - k_begin, ldb);
  //       }
  //       float* cs = offset_pointer_row_major(c, m_begin, n_begin, ldc);

  //       if (m_end - m_begin == GemmKernel133<32>::M_STEP && k_end - k_begin
  //       == GemmKernel133<32>::K_STEP) {
  //         // printf("sub mat mul, full tile: (%d,%d),(%d,%d),(%d,%d)\n",
  //         m_begin, m_end, n_begin, n_end, k_begin,
  //         // k_end);
  //         tb.run_full_ac(as, lda, cs, ldc);
  //       } else {
  //         // printf("sub mat mul, partial tile: (%d,%d),(%d,%d),(%d,%d)\n",
  //         m_begin, m_end, n_begin, n_end, k_begin,
  //         // k_end);
  //         GemmKernel133BF::PartialTiles p;
  //         p.partial_run(m_end - m_begin, n_end - n_begin, k_end - k_begin,
  //         as, lda, bs, ldb, cs, ldc);
  //       }
  //     }
  //   }
  // }
}

template <typename QA>
void mat_mul_single(int m, int n, int k, QA* a, size_t lda, block_q8_0* b, size_t ldb, float* c, size_t ldc) {
  // amx::init();
  assert(QK8_0 == 32);
  assert(QK4_0 == 32);
  assert(GemmKernel133<32>::K_STEP == 32);
  // assert(reinterpret_cast<intptr_t>(c) % 64 == 0);
  assert(ldc % 64 == 0);

  // GemmKernal133::config();
  for (int n_begin = 0; n_begin < n; n_begin += GemmKernel133<32>::N_STEP) {
    int n_end = std::min(n_begin + GemmKernel133<32>::N_STEP, n);

    for (int k_begin = 0; k_begin < k; k_begin += GemmKernel133<32>::K_STEP) {
      int k_end = std::min(k_begin + GemmKernel133<32>::K_STEP, k);
      int kb = k_begin / GemmKernel133<32>::K_STEP;
      block_q8_0* bs = offset_pointer_col_major(b, kb, n_begin, ldb);
      GemmKernel133<32>::PartialTiles<int8_t, int8_t, int32_t> p;
      p.tb.partial_load_quant(bs, n_end - n_begin, k_end - k_begin, ldb);
      for (int m_begin = 0; m_begin < m; m_begin += GemmKernel133<32>::M_STEP) {
        int m_end = std::min(m_begin + GemmKernel133<32>::M_STEP, m);
        QA* as = offset_pointer_row_major(a, m_begin, kb, lda);

        float* cs = offset_pointer_row_major(c, m_begin, n_begin, ldc);
        // printf("sub mat mul: (%d,%d),(%d,%d),(%d,%d) %ld %ld\n", m_begin,
        // m_end, n_begin, n_end, k_begin, k_end,as-a,bs-b);

        // p.partial_run_quant(m_end - m_begin, n_end - n_begin, k_end -
        // k_begin, as, lda, bs, ldb, cs, ldc);
        p.partial_run_quant_ac(m_end - m_begin, n_end - n_begin, k_end - k_begin, as, lda, cs, ldc);
      }
    }
  }
}

inline void mat_mul_single(int m, int n, int k, block_q4_K* a, size_t lda, block_q8_K* b, size_t ldb, float* c,
                           size_t ldc) {
  assert(QK_K == 256);
  assert(k % QK_K == 0);
  assert(QK_K % GemmKernel133<32>::K_STEP == 0);
  assert(GemmKernel133<32>::K_STEP == 32);
  assert(ldc % 64 == 0);

  for (int m_begin = 0; m_begin < m; m_begin += GemmKernel133<32>::M_STEP) {
    int m_end = std::min(m_begin + GemmKernel133<32>::M_STEP, m);
    for (int n_begin = 0; n_begin < n; n_begin += GemmKernel133<32>::N_STEP) {
      int n_end = std::min(n_begin + GemmKernel133<32>::N_STEP, n);
      float* cs = offset_pointer_row_major(c, m_begin, n_begin, ldc);
      for (int k_bigstart = 0; k_bigstart < k; k_bigstart += QK_K) {
        int k_bigend = k_bigstart + QK_K;
        int super_block_index = k_bigstart / QK_K;

        block_q8_K* super_bs = offset_pointer_col_major(b, super_block_index, n_begin, ldb);

        block_q4_K* super_as = offset_pointer_row_major(a, m_begin, super_block_index, lda);
        float super_scale = GGML_FP16_TO_FP32(super_as->d);
        float super_min = GGML_FP16_TO_FP32(super_as->dmin);
        __m512 a_sm = _mm512_mul_ps(
            _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(make_q4K_scale_and_min(super_as->scales))),
            _mm512_insertf32x8(_mm512_castps256_ps512(_mm256_set1_ps(super_scale)), _mm256_set1_ps(super_min), 1));
        float* a_scale = reinterpret_cast<float*>(&a_sm);
        float* a_min = a_scale + 8;

        for (int inner_idx = 0; inner_idx < 256 / 32; inner_idx++) {
          amx::GemmKernel133<32>::PartialTilesSum<uint8_t, int8_t, float> t;
          // printf("sub mat mul: (%d,%d),(%d,%d),(%d,%d) %d\n", m_begin, m_end,
          // n_begin, n_end, k_bigstart,
          //        k_bigend,inner_idx);
          t.tb.partial_load_quant(super_bs, n_end - n_begin, inner_idx, ldb);
          t.partial_run_quant_ac(m_end - m_begin, n_end - n_begin, inner_idx, super_as, lda, cs, ldc,
                                 a_scale[inner_idx], a_min[inner_idx]);
        }
      }
    }
  }
}

inline void mat_mul_single(int m, int n, int k, blocks_aligned_q8_0_ref a, int a_blck_stride, blocks_aligned_q8_0_ref b,
                           int b_blck_stride, float* c, size_t ldc) {
  using Kernel = GemmKernel133<64>;
  using TA = uint8_t;
  using TB = int8_t;

  for (int m_begin = 0; m_begin < m; m_begin += Kernel::M_STEP) {
    int m_end = std::min(m_begin + Kernel::M_STEP, m);
    for (int n_begin = 0; n_begin < n; n_begin += Kernel::N_STEP) {
      int n_end = std::min(n_begin + Kernel::N_STEP, n);
      for (int k_begin = 0; k_begin < k; k_begin += Kernel::K_STEP) {
        int k_end = std::min(k_begin + Kernel::K_STEP, k);

        int k_block = k_begin / Kernel::K_STEP;

        auto as = a.offset(m_begin * a_blck_stride + k_block);
        auto bs = b.offset(n_begin * b_blck_stride + k_block);
        auto cs = offset_pointer_row_major(c, m_begin, n_begin, ldc);

        // printf("sub mat mul: (%d,%d),(%d,%d),(%d,%d) %ld %ld\n", m_begin,
        // m_end, n_begin, n_end, k_begin, k_end,as.d-a.d,bs.d-b.d);

        Kernel::PartialTiles<TA, TB, int32_t> t;
        t.tb.partial_load_quant(bs, n_end - n_begin, k_end - k_begin, b_blck_stride);
        t.partial_run_quant_ac(m_end - m_begin, n_end - n_begin, k_end - k_begin, as, a_blck_stride, cs, ldc);
      }
    }
  }
}

inline void merge_mat(int d0, int d1, float* a, float* b, size_t ld) {
  __m512* va = (__m512*)a;
  __m512* vb = (__m512*)b;

  size_t d1v = (d1 + 15) / 16;

  for (int i = 0; i < d0; i++) {
    auto ta = offset_pointer_row_major(va, i, 0, ld);
    auto tb = offset_pointer_row_major(vb, i, 0, ld);
    for (int j = 0; j < d1v; j++) {
      ta[j] = _mm512_add_ps(ta[j], tb[j]);
    }
  }
}

inline void merge_mats(int d0, int d1, int cnt, float** data, size_t ld) {
  for (int i = 0; i < cnt; i++) {
    assert((intptr_t)data[i] % 64 == 0);
    assert(ld % 64 == 0);
  }

  while (cnt > 1) {
    int new_cnt = (cnt + 1) / 2;
    for (int i = 0; i < new_cnt; i++) {
      int j = new_cnt + i;
      if (j < cnt) {
        // printf("merge %d %d\n", i, j);
        merge_mat(d0, d1, data[i], data[j], ld);
      }
    }
    cnt = new_cnt;
  }
}

template <typename TA, typename TB, typename TC>
struct GemmKernel {
  static_assert(sizeof(TA) == -1, "No associated type defined for this type.");
  using type = GemmKernel224BF;
};

template <typename TB>
struct GemmKernel<uint8_t, TB, float> {
  using type = GemmKernel133<32>;
};

template <typename TB>
struct GemmKernel<int8_t, TB, float> {
  using type = GemmKernel133<32>;
};

template <>
struct GemmKernel<block_q4_0, block_q8_0, float> {
  using type = GemmKernel133<32>;
};

template <>
struct GemmKernel<block_q8_0, block_q8_0, float> {
  using type = GemmKernel133<32>;
};

template <>
struct GemmKernel<block_q4_K, block_q8_K, float> {
  using type = GemmKernel133<32>;
};

template <>
struct GemmKernel<ggml_bf16_t, ggml_bf16_t, float> {
  // using type = GemmKernel133BF;
  using type = GemmKernel224BF;
};

// template <typename TA, typename TB, typename TC>
// void mat_mul(int m, int n, int k, TA* a, size_t lda, TB* b, size_t ldb, TC*
// c, size_t ldc, int ith, int nth) {
//   using K = typename GemmKernel<TA, TB, TC>::type;

//   int m_partition_count = (m + K::M_STEP - 1) / K::M_STEP;
//   int partition_count_per_thread = (m_partition_count + nth - 1) / nth;
//   int partition_start = ith * partition_count_per_thread;
//   int partition_end = std::min(partition_start + partition_count_per_thread,
//   m_partition_count); int m_start = partition_start * K::M_STEP; int m_end =
//   std::min(m, partition_end * K::M_STEP);

//   mat_mul_single(m_end - m_start, n, k, offset_pointer(a, m_start * lda),
//   lda, b, ldb, offset_pointer(c, m_start * ldc),
//                  ldc);
// }

template <typename TA, typename TB, typename TC>
void mat_mul(int m, int n, int k, TA* a, size_t lda, TB* b, size_t ldb, TC* c, size_t ldc, int ith, int nth) {
  using K = typename GemmKernel<TA, TB, TC>::type;

  int n_partition_count = (n + K::N_STEP - 1) / K::N_STEP;
  int partition_count_per_thread = (n_partition_count + nth - 1) / nth;
  int partition_start = ith * partition_count_per_thread;
  int partition_end = std::min(partition_start + partition_count_per_thread, n_partition_count);
  int n_start = partition_start * K::N_STEP;
  int n_end = std::min(n, partition_end * K::N_STEP);

  mat_mul_single(m, n_end - n_start, k, a, lda, offset_pointer_col_major(b, 0, n_start, ldb), ldb,
                 offset_pointer_row_major(c, 0, n_start, ldc), ldc);
}

inline void mat_mul(int m, int n, int k, std::shared_ptr<GemmKernel224BF::BufferA> ba,
                    std::shared_ptr<GemmKernel224BF::BufferB> bb, std::shared_ptr<GemmKernel224BF::BufferC> bc, int ith,
                    int nth) {
  using K = GemmKernel224BF;
  assert(n % K::N_STEP == 0);
  assert(k % K::K_STEP == 0);

  auto [n_start, n_end] = K::split_range_n(n, ith, nth);

  // printf("n_start %d n_end %d\n", n_start, n_end);
  for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K::K_BLOCK) {
    for (int m_begin = 0; m_begin < m; m_begin += K::M_STEP) {
      for (int n_begin = n_start; n_begin < n_end; n_begin += K::N_STEP) {
        float* c = bc->get_submat(m, n, m_begin, n_begin);
        // if (m - m_begin == 1) {
        if (false) {
          // if(k_block_begin==0&&m_begin==0&&n_begin==n_start)
          // printf("AVX");
          __m512* c512 = (__m512*)c;
          if (k_block_begin == 0) {
            for (int m_i = 0; m_i < m; m_i++) {
              c512[m_i * 2] = _mm512_setzero_ps();
              c512[m_i * 2 + 1] = _mm512_setzero_ps();
            }
          }

          for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::K_STEP) {
            int32_t* a32 = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
            __m512bh* b512 = (__m512bh*)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
            for (int m_i = 0; m_i < m; m_i++) {
              for (int k_i = 0; k_i < 16; k_i++) {
                __m512bh ma = (__m512bh)_mm512_set1_epi32(a32[m_i * 16 + k_i]);
                for (int n_i = 0; n_i < 2; n_i++) {
                  c512[m_i * 2 + n_i] = _mm512_dpbf16_ps(c512[m_i * 2 + n_i], ma, b512[n_i * 16 + k_i]);
                }
              }
            }
          }

        } else {
          if (k_block_begin == 0) {
            K::clean_c();
          } else {
            K::load_c(c, K::N_STEP * sizeof(float));
          }
          for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::K_STEP) {
            K::load_a(ba->get_submat(m, k, m_begin, k_block_begin + k_begin), K::K_STEP * sizeof(ggml_bf16_t));
            K::load_b(bb->get_submat(n, k, n_begin, k_block_begin + k_begin), K::K_STEP * sizeof(ggml_bf16_t));
            K::run_tile();
          }
          K::store_c(c, K::N_STEP * sizeof(float));
        }
      }
    }
  }
}

inline void vec_mul(int m, int n, int k, std::shared_ptr<GemmKernel224BF::BufferA> ba,
                    std::shared_ptr<GemmKernel224BF::BufferB> bb, std::shared_ptr<GemmKernel224BF::BufferC> bc, int ith,
                    int nth) {
  mat_mul(m, n, k, ba, bb, bc, ith, nth);
}

template <typename K, bool amx_or_avx = true>
void integer_mat_mul(int m, int n, int k, typename K::BufferA* ba, typename K::BufferB* bb, typename K::BufferC* bc,
                     int ith, int nth) {
  assert(n % K::N_STEP == 0);
  assert(k % K::K_STEP == 0);

  auto [n_start, n_end] = K::split_range_n(n, ith, nth);

  for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K::K_BLOCK) {
    for (int m_begin = 0; m_begin < m; m_begin += K::M_STEP) {
      for (int n_begin = n_start; n_begin < n_end; n_begin += K::N_STEP) {
        float* c = bc->get_submat(m, n, m_begin, n_begin);
        if constexpr (amx_or_avx && AMX_AVAILABLE) {
          K::amx_kernel(m, n, k, m_begin, n_begin, k_block_begin, c, ba, bb);
        } else {
          K::avx_kernel(m, n, k, m_begin, n_begin, k_block_begin, c, ba, bb);
        }

        if (k_block_begin + K::K_BLOCK >= k) {
          K::apply_scale(m, n, m_begin, n_begin, c, ba, bb);
        }
      }
    }
  }
}

inline void vec_mul(int m, int n, int k, std::shared_ptr<GemmKernel224Int8::BufferA> ba,
                    std::shared_ptr<GemmKernel224Int8::BufferB> bb, std::shared_ptr<GemmKernel224Int8::BufferC> bc,
                    int ith, int nth) {
  integer_mat_mul<GemmKernel224Int8, false>(m, n, k, ba.get(), bb.get(), bc.get(), ith, nth);
}

inline void mat_mul(int m, int n, int k, std::shared_ptr<GemmKernel224Int8::BufferA> ba,
                    std::shared_ptr<GemmKernel224Int8::BufferB> bb, std::shared_ptr<GemmKernel224Int8::BufferC> bc,
                    int ith, int nth) {
  integer_mat_mul<GemmKernel224Int8, true>(m, n, k, ba.get(), bb.get(), bc.get(), ith, nth);
}

inline void vec_mul(int m, int n, int k, std::shared_ptr<GemmKernel224Int4::BufferA> ba,
                    std::shared_ptr<GemmKernel224Int4::BufferB> bb, std::shared_ptr<GemmKernel224Int4::BufferC> bc,
                    int ith, int nth) {
  integer_mat_mul<GemmKernel224Int4, false>(m, n, k, ba.get(), bb.get(), bc.get(), ith, nth);
}

inline void mat_mul(int m, int n, int k, std::shared_ptr<GemmKernel224Int4::BufferA> ba,
                    std::shared_ptr<GemmKernel224Int4::BufferB> bb, std::shared_ptr<GemmKernel224Int4::BufferC> bc,
                    int ith, int nth) {
  integer_mat_mul<GemmKernel224Int4, true>(m, n, k, ba.get(), bb.get(), bc.get(), ith, nth);
}

inline void vec_mul(int m, int n, int k, std::shared_ptr<GemmKernel224Int4_1::BufferA> ba,
                    std::shared_ptr<GemmKernel224Int4_1::BufferB> bb, std::shared_ptr<GemmKernel224Int4_1::BufferC> bc,
                    int ith, int nth) {
  integer_mat_mul<GemmKernel224Int4_1, false>(m, n, k, ba.get(), bb.get(), bc.get(), ith, nth);
}

inline void mat_mul(int m, int n, int k, std::shared_ptr<GemmKernel224Int4_1::BufferA> ba,
                    std::shared_ptr<GemmKernel224Int4_1::BufferB> bb, std::shared_ptr<GemmKernel224Int4_1::BufferC> bc,
                    int ith, int nth) {
  integer_mat_mul<GemmKernel224Int4_1, true>(m, n, k, ba.get(), bb.get(), bc.get(), ith, nth);
}

inline void mat_mul(int m, int n, int k, blocks_aligned_q8_0_ref aref, int a_blck_stride, blocks_aligned_q8_0_ref bref,
                    int b_blck_stride, float* c, size_t ldc, int ith, int nth) {
  using K = GemmKernel133<64>;

  int m_partition_count = (m + K::M_STEP - 1) / K::M_STEP;
  int partition_count_per_thread = (m_partition_count + nth - 1) / nth;
  int partition_start = ith * partition_count_per_thread;
  int partition_end = std::min(partition_start + partition_count_per_thread, m_partition_count);
  int m_start = partition_start * K::M_STEP;
  int m_end = std::min(m, partition_end * K::M_STEP);

  mat_mul_single(m_end - m_start, n, k, aref.offset(m_start * a_blck_stride), a_blck_stride, bref, b_blck_stride,
                 offset_pointer(c, m_start * ldc), ldc);
}

// K-group quantization kernel with intermediate int32 accumulation
struct GemmKernel224Int4KGroup {
  using dt = void;
  using output_t = int32_t;
  static constexpr double ELEMENT_SIZE = 0.5;
  static const int TILE_M = 16;
  static const int TILE_K = 64;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 4;
  static const int M_STEP = TILE_M * 2;
  static const int N_STEP = TILE_N * 2;
  static const int K_STEP = TILE_K;
  static inline const int N_BLOCK = 256;
  // K_BLOCK should match k_group_size for proper scaling
  static inline const int K_BLOCK = 7168;  // Will be overridden by k_group_size

  static std::string name() { return "INT4_KGROUP"; }
  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }
  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {
#ifdef HAVE_AMX
    enable_amx();
    TileConfig tile_config;
    // size is 16 x 64
    for (int i = 0; i < 2; i++) tile_config.set_row_col(i, TILE_M, TILE_K);
    // size is 16 x 64
    for (int i = 2; i < 4; i++) tile_config.set_row_col(i, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK);
    // size is 16 x 16
    for (int i = 4; i < 8; i++) tile_config.set_row_col(i, TILE_M, TILE_N * sizeof(output_t));
    tile_config.set_config();
#endif
  }

  alignas(64) static constexpr uint8_t hi_mask_arr[64] = {
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0};

  alignas(64) static constexpr uint8_t lo_mask_arr[64] = {
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F};

  static __m512i hi_mask() { return *((__m512i*)(&hi_mask_arr[0])); }
  static __m512i lo_mask() { return *((__m512i*)(&lo_mask_arr[0])); }

  static void clean_c() {
    _tile_zero(4);
    _tile_zero(5);
    _tile_zero(6);
    _tile_zero(7);
  }

  static void load_c(output_t* c, size_t ldc) {
    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_loadd(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
  }

  static void store_c(output_t* c, size_t ldc) {
    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_stored(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
  }

  static void load_a(dt* a, size_t lda) {
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
  }

  static void load_b_lo(dt* b, size_t ldb) {
    alignas(64) int8_t local_buffer[TILE_N * TILE_K];
    __m512i* db = reinterpret_cast<__m512i*>(local_buffer);

    for (size_t i = 0; i < TILE_N; i++) {
      // __m512i temp = _mm512_and_si512(lo_mask(), *static_cast<__m512i *>(offset_pointer(b, ldb * i)));
      // db[i] = _mm512_slli_epi32(temp, 4);
      db[i] = _mm512_slli_epi32(_mm512_and_si512(lo_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * i))), 4);
    }
    asm volatile("" ::: "memory");
    _tile_loadd(2, db, TILE_K);

    for (size_t i = 0; i < TILE_N; i++) {
      // __m512i temp = _mm512_and_si512(lo_mask(), *static_cast<__m512i *>(offset_pointer(b, ldb * (i + TILE_N))));
      // db[i] = _mm512_slli_epi32(temp, 4);
      db[i] = _mm512_slli_epi32(
          _mm512_and_si512(lo_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * (i + TILE_N)))), 4);
    }
    asm volatile("" ::: "memory");
    _tile_loadd(3, db, TILE_K);
  }

  static void load_b_hi(dt* b, size_t ldb) {
    alignas(64) int8_t local_buffer[TILE_N * TILE_K];
    __m512i* db = reinterpret_cast<__m512i*>(local_buffer);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_and_si512(hi_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * i)));
    }
    asm volatile("" ::: "memory");
    _tile_loadd(2, db, TILE_K);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_and_si512(hi_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * (i + TILE_N))));
    }
    asm volatile("" ::: "memory");
    _tile_loadd(3, db, TILE_K);
  }

  static void run_tile() {
#ifdef HAVE_AMX
    _tile_dpbssd(4, 0, 2);
    _tile_dpbssd(5, 0, 3);
    _tile_dpbssd(6, 1, 2);
    _tile_dpbssd(7, 1, 3);
#endif
  }

  using BufferA = BufferAKGroupImpl<GemmKernel224Int4KGroup>;
  using BufferB = BufferBKGroupImpl<GemmKernel224Int4KGroup>;
  using BufferC = BufferCReduceImpl<GemmKernel224Int4KGroup>;

  // K-group aware AVX kernel - processes a single B_K_STEP chunk
  static void avx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, int32_t* int_c, BufferA* ba,
                         BufferB* bb, int k_group_size) {
    using K = GemmKernel224Int4KGroup;
    __m512i* c512 = (__m512i*)int_c;
    int m_block_end = std::min(m - m_begin, M_STEP);

    // Initialize int_c to zero at the start of k_group
    if (k_block_begin % k_group_size == 0) {
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        c512[m_i * 2] = _mm512_setzero_si512();
        c512[m_i * 2 + 1] = _mm512_setzero_si512();
      }
    }
    int k_offset = k_block_begin % K::BufferB::B_K_STEP;
    if (k_offset == 0) {
      int32_t* a32_lo = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin);
      __m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin);
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        for (int k_i = 0; k_i < 16; k_i++) {
          __m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
          for (int n_i = 0; n_i < 2; n_i++) {
            __m512i b512_lo = _mm512_slli_epi32(_mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]), 4);
            c512[m_i * 2 + n_i] = _mm512_dpbssd_epi32(c512[m_i * 2 + n_i], ma_lo, b512_lo);
          }
        }
      }
    } else {
      int32_t* a32_hi = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin);
      __m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin - K::K_STEP);
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        for (int k_i = 0; k_i < 16; k_i++) {
          __m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
          for (int n_i = 0; n_i < 2; n_i++) {
            __m512i b512_hi = _mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]);
            c512[m_i * 2 + n_i] = _mm512_dpbssd_epi32(c512[m_i * 2 + n_i], ma_hi, b512_hi);
          }
        }
      }
    }
  }

  // K-group aware AMX kernel - processes a single K_STEP chunk (lo or hi nibble)
  static void amx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, int32_t* int_c, BufferA* ba,
                         BufferB* bb, int k_group_size) {
    using K = GemmKernel224Int4KGroup;
    // Initialize or load int_c at start of k_group
    if (k_block_begin % k_group_size == 0) {
      K::clean_c();
    } else {
      K::load_c(int_c, K::N_STEP * sizeof(int32_t));
    }

    // Determine if we're processing lo or hi nibble based on position within B_K_STEP
    int k_offset = k_block_begin % K::BufferB::B_K_STEP;
    if (k_offset == 0) {
      // Process lo nibble
      K::load_a(ba->get_submat(m, k, m_begin, k_block_begin), K::K_STEP * sizeof(int8_t));
      K::load_b_lo(bb->get_submat(n, k, n_begin, k_block_begin), K::BufferB::B_K_STEP / 2);
      K::run_tile();
    } else {
      // Process hi nibble (k_offset == K_STEP)
      K::load_a(ba->get_submat(m, k, m_begin, k_block_begin), K::K_STEP * sizeof(int8_t));
      K::load_b_hi(bb->get_submat(n, k, n_begin, k_block_begin - K::K_STEP), K::BufferB::B_K_STEP / 2);
      K::run_tile();
    }

    K::store_c(int_c, K::N_STEP * sizeof(int32_t));
  }

  // K-group aware scale application
  static void apply_scale_kgroup(int m, int n, int m_begin, int n_begin, int k_begin, float* c, int32_t* int_c,
                                 BufferA* ba, BufferB* bb, int k, int k_group_size) {
    using K = GemmKernel224Int4KGroup;
    int to = m - m_begin;
    if (m - m_begin > K::M_STEP) {
      to = K::M_STEP;
    }

    for (int i = 0; i < to; i++) {
      // Get scale for this k_group
      __m512 as = _mm512_set1_ps(*ba->get_scale(m, m_begin + i, k, k_begin));
      __m512 bs = _mm512_load_ps(bb->get_scale(n, n_begin, k, k_begin));
      __m512i now = _mm512_load_epi32((__m512i*)(int_c + i * K::N_STEP));
      __m512 result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
      // Load existing float value from c and add
      __m512 existing = _mm512_load_ps((__m512*)(c + i * K::N_STEP));
      result = _mm512_add_ps(existing, result);
      _mm512_store_ps((__m512*)(c + i * K::N_STEP), result);

      // Second half
      bs = _mm512_load_ps(bb->get_scale(n, n_begin, k, k_begin) + K::TILE_N);
      now = _mm512_load_si512((__m512i*)(int_c + i * K::N_STEP + K::TILE_N));
      result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
      existing = _mm512_load_ps((__m512*)(c + i * K::N_STEP + K::TILE_N));
      result = _mm512_add_ps(existing, result);
      _mm512_store_ps((__m512*)(c + i * K::N_STEP + K::TILE_N), result);
    }
  }
};
struct GemmKernel224Int4_1KGroup {
  using dt = void;
  using output_t = int32_t;
  static constexpr double ELEMENT_SIZE = 0.5;
  static const int TILE_M = 16;
  static const int TILE_K = 64;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 4;

  static const int M_STEP = TILE_M * 2;
  static const int N_STEP = TILE_N * 2;
  static const int K_STEP = TILE_K;

  static inline const int N_BLOCK = 256;
  // static inline const int K_BLOCK = 7168;
  static inline const int K_BLOCK = 3584;
  // static inline const int K_BLOCK = 2560;
  static std::string name() { return "INT4_1K"; }

  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {
#ifdef HAVE_AMX
    enable_amx();
    TileConfig tile_config;

    // size is 16 x 64
    for (int i = 0; i < 2; i++) tile_config.set_row_col(i, TILE_M, TILE_K);

    // size is 16 x 64
    for (int i = 2; i < 4; i++) tile_config.set_row_col(i, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK);

    // size is 16 x 16
    for (int i = 4; i < 8; i++) tile_config.set_row_col(i, TILE_M, TILE_N * sizeof(output_t));

    tile_config.set_config();
#endif
  }

  alignas(64) static constexpr uint8_t hi_mask_arr[64] = {
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0};

  alignas(64) static constexpr uint8_t lo_mask_arr[64] = {
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F};

  alignas(64) static constexpr uint8_t sign_mask_arr[64] = {
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  };

  static __m512i hi_mask() { return *((__m512i*)(&hi_mask_arr[0])); }
  static __m128i hi_mask_128() { return *((__m128i*)(&hi_mask_arr[0])); }
  static __m512i lo_mask() { return *((__m512i*)(&lo_mask_arr[0])); }
  static __m128i lo_mask_128() { return *((__m128i*)(&lo_mask_arr[0])); }
  static __m128i si_mask_128() { return *((__m128i*)(&sign_mask_arr[0])); }

  static void load_b_hi(dt* b, size_t ldb) {
#ifdef HAVE_AMX
    // 在函数内部分配一个局部(栈上)对齐缓冲区
    alignas(64) int8_t local_buffer[TILE_N * TILE_K];
    __m512i* db = reinterpret_cast<__m512i*>(local_buffer);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_and_si512(hi_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * i)));
    }
    asm volatile("" ::: "memory");
    _tile_loadd(2, db, TILE_K);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_and_si512(hi_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * (i + TILE_N))));
    }
    asm volatile("" ::: "memory");
    _tile_loadd(3, db, TILE_K);
#else
    (void)b;
    (void)ldb;
#endif
  }

  static void load_b_lo(dt* b, size_t ldb) {
#ifdef HAVE_AMX
    // 在函数内部分配一个局部(栈上)对齐缓冲区
    alignas(64) int8_t local_buffer[TILE_N * TILE_K];
    __m512i* db = reinterpret_cast<__m512i*>(local_buffer);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_slli_epi32(_mm512_and_si512(lo_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * i))), 4);
    }
    asm volatile("" ::: "memory");
    _tile_loadd(2, db, TILE_K);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_slli_epi32(
          _mm512_and_si512(lo_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * (i + TILE_N)))), 4);
    }
    asm volatile("" ::: "memory");
    _tile_loadd(3, db, TILE_K);
#else
    (void)b;
    (void)ldb;
#endif
  }

  static void load_a(dt* a, size_t lda) {
#ifdef HAVE_AMX
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
#else
    (void)a;
    (void)lda;
#endif
  }

  // static void load_b(dt* b, size_t ldb) {
  //   _tile_loadd(2, b, ldb);
  //   _tile_loadd(3, offset_pointer(b, ldb * TILE_N), ldb);
  // }

  static void clean_c() {
#ifdef HAVE_AMX
    _tile_zero(4);
    _tile_zero(5);
    _tile_zero(6);
    _tile_zero(7);
#endif
  }

  static void load_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_loadd(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void store_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_stored(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void run_tile() {
#ifdef HAVE_AMX
    _tile_dpbsud(4, 0, 2);
    _tile_dpbsud(5, 0, 3);
    _tile_dpbsud(6, 1, 2);
    _tile_dpbsud(7, 1, 3);
#endif
  }

  using BufferA = BufferAWithSumKGroupImpl<GemmKernel224Int4_1KGroup>;

  using BufferB = BufferBInt4WithZeroKGroupImpl<GemmKernel224Int4_1KGroup>;

  using BufferC = BufferCReduceImpl<GemmKernel224Int4_1KGroup>;

  static void avx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, int32_t* int_c, BufferA* ba,
                         BufferB* bb, int k_group_size) {
    using K = GemmKernel224Int4_1KGroup;
    __m512i* c512 = (__m512i*)int_c;
    int m_block_end = std::min(m - m_begin, M_STEP);
    if (k_block_begin % k_group_size == 0) {
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        c512[m_i * 2] = _mm512_setzero_si512();
        c512[m_i * 2 + 1] = _mm512_setzero_si512();
      }
    }
    int k_offset = k_block_begin % K::BufferB::B_K_STEP;
    if (k_offset == 0) {
      int32_t* a32_lo = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin);
      __m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin);
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        for (int k_i = 0; k_i < 16; k_i++) {
          __m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
          for (int n_i = 0; n_i < 2; n_i++) {
            __m512i b512_lo = _mm512_slli_epi32(_mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]), 4);
            c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_lo, ma_lo);
          }
        }
      }
    } else {
      int32_t* a32_hi = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin);
      __m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin - K::K_STEP);
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        for (int k_i = 0; k_i < 16; k_i++) {
          __m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
          for (int n_i = 0; n_i < 2; n_i++) {
            __m512i b512_hi = _mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]);
            c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_hi, ma_hi);
          }
        }
      }
    }
  }
  static void amx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, int32_t* int_c, BufferA* ba,
                         BufferB* bb, int k_group_size) {
    using K = GemmKernel224Int4_1KGroup;
    if (k_block_begin % k_group_size == 0) {
      K::clean_c();
    } else {
      K::load_c(int_c, K::N_STEP * sizeof(int32_t));
    }

    // Determine if we're processing lo or hi nibble based on position within B_K_STEP
    int k_offset = k_block_begin % K::BufferB::B_K_STEP;
    if (k_offset == 0) {
      // Process lo nibble
      K::load_a(ba->get_submat(m, k, m_begin, k_block_begin), K::K_STEP * sizeof(int8_t));
      K::load_b_lo(bb->get_submat(n, k, n_begin, k_block_begin), K::BufferB::B_K_STEP / 2);
      K::run_tile();
    } else {
      // Process hi nibble (k_offset == K_STEP)
      K::load_a(ba->get_submat(m, k, m_begin, k_block_begin), K::K_STEP * sizeof(int8_t));
      K::load_b_hi(bb->get_submat(n, k, n_begin, k_block_begin - K::K_STEP), K::BufferB::B_K_STEP / 2);
      K::run_tile();
    }

    K::store_c(int_c, K::N_STEP * sizeof(int32_t));
  }

  static void apply_scale_kgroup(int m, int n, int m_begin, int n_begin, int k_begin, float* c, int32_t* int_c,
                                 BufferA* ba, BufferB* bb, int k, int k_group_size) {
    using K = GemmKernel224Int4_1KGroup;
    int to = m - m_begin;
    if (m - m_begin > K::M_STEP) {
      to = K::M_STEP;
    }
    for (int i = 0; i < to; i++) {
      __m512 as = _mm512_set1_ps(*ba->get_scale(m, m_begin + i, k, k_begin));
      __m512 asum = _mm512_set1_ps(*ba->get_sum(m, m_begin + i, k, k_begin));

      __m512 bs = _mm512_load_ps(bb->get_scale(n, n_begin, k, k_begin));
      __m512 b_mins = _mm512_load_ps(bb->get_min(n, n_begin, k, k_begin));
      __m512i now = _mm512_load_epi32((__m512i*)(int_c + i * K::N_STEP));
      __m512 result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
      result = _mm512_add_ps(result, _mm512_mul_ps(asum, b_mins));
      __m512 existing = _mm512_load_ps((__m512*)(c + i * K::N_STEP));
      result = _mm512_add_ps(result, existing);
      _mm512_store_ps((__m512*)(c + i * K::N_STEP), result);

      bs = _mm512_load_ps(bb->get_scale(n, n_begin, k, k_begin) + K::TILE_N);
      b_mins = _mm512_load_ps(bb->get_min(n, n_begin, k, k_begin) + K::TILE_N);
      now = _mm512_load_si512((__m512i*)(int_c + i * K::N_STEP + K::TILE_N));
      result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
      result = _mm512_add_ps(result, _mm512_mul_ps(asum, b_mins));
      existing = _mm512_load_ps((__m512*)(c + i * K::N_STEP + K::TILE_N));
      result = _mm512_add_ps(result, existing);
      _mm512_store_ps((__m512*)(c + i * K::N_STEP + K::TILE_N), result);
    }
  }
};

struct GemmKernel224Int4_1_LowKGroup {
  using dt = void;
  using output_t = int32_t;
  static constexpr double ELEMENT_SIZE = 0.5;
  static const int TILE_M = 16;
  static const int TILE_K = 64;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 4;

  static const int M_STEP = TILE_M * 2;
  static const int N_STEP = TILE_N * 2;
  static const int K_STEP = TILE_K;

  static inline const int N_BLOCK = 256;
  // static inline const int K_BLOCK = 7168;
  static inline const int K_BLOCK = 3584;
  // static inline const int K_BLOCK = 2560;
  static std::string name() { return "INT4_1K"; }

  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {
#ifdef HAVE_AMX
    enable_amx();
    TileConfig tile_config;

    // size is 16 x 64
    for (int i = 0; i < 2; i++) tile_config.set_row_col(i, TILE_M, TILE_K);

    // size is 16 x 64
    for (int i = 2; i < 4; i++) tile_config.set_row_col(i, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK);

    // size is 16 x 16
    for (int i = 4; i < 8; i++) tile_config.set_row_col(i, TILE_M, TILE_N * sizeof(output_t));

    tile_config.set_config();
#endif
  }

  alignas(64) static constexpr uint8_t hi_mask_arr[64] = {
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0};

  alignas(64) static constexpr uint8_t lo_mask_arr[64] = {
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F};

  alignas(64) static constexpr uint8_t sign_mask_arr[64] = {
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  };

  static __m512i hi_mask() { return *((__m512i*)(&hi_mask_arr[0])); }
  static __m128i hi_mask_128() { return *((__m128i*)(&hi_mask_arr[0])); }
  static __m512i lo_mask() { return *((__m512i*)(&lo_mask_arr[0])); }
  static __m128i lo_mask_128() { return *((__m128i*)(&lo_mask_arr[0])); }
  static __m128i si_mask_128() { return *((__m128i*)(&sign_mask_arr[0])); }

  static void load_b_hi(dt* b, size_t ldb) {
#ifdef HAVE_AMX
    // 在函数内部分配一个局部(栈上)对齐缓冲区
    alignas(64) int8_t local_buffer[TILE_N * TILE_K];
    __m512i* db = reinterpret_cast<__m512i*>(local_buffer);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_srli_epi32(_mm512_and_si512(hi_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * i))), 4);
    }
    asm volatile("" ::: "memory");
    _tile_loadd(2, db, TILE_K);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_srli_epi32(
          _mm512_and_si512(hi_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * (i + TILE_N)))), 4);
    }
    asm volatile("" ::: "memory");
    _tile_loadd(3, db, TILE_K);
#else
    (void)b;
    (void)ldb;
#endif
  }

  static void load_b_lo(dt* b, size_t ldb) {
#ifdef HAVE_AMX
    // 在函数内部分配一个局部(栈上)对齐缓冲区
    alignas(64) int8_t local_buffer[TILE_N * TILE_K];
    __m512i* db = reinterpret_cast<__m512i*>(local_buffer);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_and_si512(lo_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * i)));
    }
    asm volatile("" ::: "memory");
    _tile_loadd(2, db, TILE_K);

    for (size_t i = 0; i < TILE_N; i++) {
      db[i] = _mm512_and_si512(lo_mask(), *static_cast<__m512i*>(offset_pointer(b, ldb * (i + TILE_N))));
    }
    asm volatile("" ::: "memory");
    _tile_loadd(3, db, TILE_K);
#else
    (void)b;
    (void)ldb;
#endif
  }

  static void load_a(dt* a, size_t lda) {
#ifdef HAVE_AMX
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
#else
    (void)a;
    (void)lda;
#endif
  }

  // static void load_b(dt* b, size_t ldb) {
  //   _tile_loadd(2, b, ldb);
  //   _tile_loadd(3, offset_pointer(b, ldb * TILE_N), ldb);
  // }

  static void clean_c() {
#ifdef HAVE_AMX
    _tile_zero(4);
    _tile_zero(5);
    _tile_zero(6);
    _tile_zero(7);
#endif
  }

  static void load_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_loadd(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void store_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_stored(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void run_tile() {
#ifdef HAVE_AMX
    _tile_dpbsud(4, 0, 2);
    _tile_dpbsud(5, 0, 3);
    _tile_dpbsud(6, 1, 2);
    _tile_dpbsud(7, 1, 3);
#endif
  }

  using BufferA = BufferAWithSumKGroupImpl<GemmKernel224Int4_1_LowKGroup>;

  using BufferB = BufferBInt4WithZeroLowKGroupImpl<GemmKernel224Int4_1_LowKGroup>;

  using BufferC = BufferCReduceImpl<GemmKernel224Int4_1_LowKGroup>;

  static void avx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, int32_t* int_c, BufferA* ba,
                         BufferB* bb, int k_group_size) {
    using K = GemmKernel224Int4_1_LowKGroup;
    __m512i* c512 = (__m512i*)int_c;
    int m_block_end = std::min(m - m_begin, M_STEP);
    if (k_block_begin % k_group_size == 0) {
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        c512[m_i * 2] = _mm512_setzero_si512();
        c512[m_i * 2 + 1] = _mm512_setzero_si512();
      }
    }
    int k_offset = k_block_begin % K::BufferB::B_K_STEP;
    if (k_offset == 0) {
      int32_t* a32_lo = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin);
      __m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin);
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        for (int k_i = 0; k_i < 16; k_i++) {
          __m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
          for (int n_i = 0; n_i < 2; n_i++) {
            __m512i b512_lo = _mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]);
            c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_lo, ma_lo);
          }
        }
      }
    } else {
      int32_t* a32_hi = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin);
      __m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin - K::K_STEP);
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        for (int k_i = 0; k_i < 16; k_i++) {
          __m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
          for (int n_i = 0; n_i < 2; n_i++) {
            __m512i b512_hi = _mm512_srli_epi32(_mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]), 4);
            c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_hi, ma_hi);
          }
        }
      }
    }
  }
  static void amx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, int32_t* int_c, BufferA* ba,
                         BufferB* bb, int k_group_size) {
    using K = GemmKernel224Int4_1_LowKGroup;
    if (k_block_begin % k_group_size == 0) {
      K::clean_c();
    } else {
      K::load_c(int_c, K::N_STEP * sizeof(int32_t));
    }

    // Determine if we're processing lo or hi nibble based on position within B_K_STEP
    int k_offset = k_block_begin % K::BufferB::B_K_STEP;
    if (k_offset == 0) {
      // Process lo nibble
      K::load_a(ba->get_submat(m, k, m_begin, k_block_begin), K::K_STEP * sizeof(int8_t));
      K::load_b_lo(bb->get_submat(n, k, n_begin, k_block_begin), K::BufferB::B_K_STEP / 2);
      K::run_tile();
    } else {
      // Process hi nibble (k_offset == K_STEP)
      K::load_a(ba->get_submat(m, k, m_begin, k_block_begin), K::K_STEP * sizeof(int8_t));
      K::load_b_hi(bb->get_submat(n, k, n_begin, k_block_begin - K::K_STEP), K::BufferB::B_K_STEP / 2);
      K::run_tile();
    }

    K::store_c(int_c, K::N_STEP * sizeof(int32_t));
  }

  static void apply_scale_kgroup(int m, int n, int m_begin, int n_begin, int k_begin, float* c, int32_t* int_c,
                                 BufferA* ba, BufferB* bb, int k, int k_group_size) {
    using K = GemmKernel224Int4_1_LowKGroup;
    int to = m - m_begin;
    if (m - m_begin > K::M_STEP) {
      to = K::M_STEP;
    }
    for (int i = 0; i < to; i++) {
      __m512 as = _mm512_set1_ps(*ba->get_scale(m, m_begin + i, k, k_begin));
      __m512 asum = _mm512_set1_ps(*ba->get_sum(m, m_begin + i, k, k_begin));

      __m512 bs = _mm512_load_ps(bb->get_scale(n, n_begin, k, k_begin));
      __m512 b_mins = _mm512_load_ps(bb->get_min(n, n_begin, k, k_begin));
      __m512i now = _mm512_load_epi32((__m512i*)(int_c + i * K::N_STEP));
      __m512 result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
      result = _mm512_add_ps(result, _mm512_mul_ps(asum, b_mins));
      __m512 existing = _mm512_load_ps((__m512*)(c + i * K::N_STEP));
      result = _mm512_add_ps(result, existing);
      _mm512_store_ps((__m512*)(c + i * K::N_STEP), result);

      bs = _mm512_load_ps(bb->get_scale(n, n_begin, k, k_begin) + K::TILE_N);
      b_mins = _mm512_load_ps(bb->get_min(n, n_begin, k, k_begin) + K::TILE_N);
      now = _mm512_load_si512((__m512i*)(int_c + i * K::N_STEP + K::TILE_N));
      result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
      result = _mm512_add_ps(result, _mm512_mul_ps(asum, b_mins));
      existing = _mm512_load_ps((__m512*)(c + i * K::N_STEP + K::TILE_N));
      result = _mm512_add_ps(result, existing);
      _mm512_store_ps((__m512*)(c + i * K::N_STEP + K::TILE_N), result);
    }
  }
};

// K2 Signed Int4 K-group quantization kernel (AVX only, no AMX)
// For K2 MoE - signed int4 range: [-8, 7]
struct GemmKernel224Int4SmallKGroup {
  using dt = uint8_t;  // packed int4 type
  using output_t = int32_t;
  static constexpr double ELEMENT_SIZE = 0.5;
  static const int VNNI_BLK = 4;

  static const int M_STEP = 1;
  static const int N_STEP = 32;
  static const int K_STEP = 32;

  static inline const int N_BLOCK = 256;
  // K_BLOCK should match k_group_size for proper scaling
  static inline const int K_BLOCK = 7168;  // Will be overridden by k_group_size

  static std::string name() { return "K2_INT4_KGROUP"; }
  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }
  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }
  static void config() {}

  alignas(64) static constexpr uint8_t hi_mask_arr[32] = {
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
      0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0};

  alignas(64) static constexpr uint8_t lo_mask_arr[32] = {
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
      0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F};

  alignas(64) static constexpr uint8_t sign_xor_arr[32] = {
      0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88,
      0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88};
  static __m256i hi_mask() { return *((__m256i*)(&hi_mask_arr[0])); }
  static __m256i lo_mask() { return *((__m256i*)(&lo_mask_arr[0])); }
  static __m256i sign_xor_mask() { return *((__m256i*)(&sign_xor_arr[0])); }

  using BufferA = BufferASmallKGroupImpl<GemmKernel224Int4SmallKGroup>;
  using BufferB = BufferBInt4KGroupImpl<GemmKernel224Int4SmallKGroup>;  // Use new signed int4 buffer
  using BufferC = BufferCReduceImpl<GemmKernel224Int4SmallKGroup>;

  // K-group aware AVX kernel for signed int4
  static inline __m512i compressed_int4_to_int8_avx512(__m256i b256) {
    b256 = _mm256_xor_si256(b256, sign_xor_mask());
    __m256i b_hi = _mm256_and_si256(b256, hi_mask());
    __m256i b_lo = _mm256_slli_epi16(_mm256_andnot_si256(hi_mask(), b256), 4);

    __m256i unpack_lo = _mm256_unpacklo_epi8(b_lo, b_hi);
    __m256i unpack_hi = _mm256_unpackhi_epi8(b_lo, b_hi);
    __m512i result = _mm512_inserti64x4(_mm512_castsi256_si512(unpack_lo), unpack_hi, 1);
    const __m512i lane_shuffle = _mm512_set_epi64(7, 6, 3, 2, 5, 4, 1, 0);
    return _mm512_permutexvar_epi64(lane_shuffle, result);
  }
  static inline void integer_mat_vec_kgroup(int m, int n, int k, int k_group_size, BufferA* ba, BufferB* bb,
                                            BufferC* bc, int ith, int nth) {
    auto [n_start, n_end] = split_range_n(n, ith, nth);
    for (int m_begin = 0; m_begin < m; m_begin++) {
      float* c = bc->get_submat(m, n, m_begin, n_start);
      __m512i* a512 = (__m512i*)ba->get_submat(m, k, m_begin, 0);

      for (int n_block_begin = n_start; n_block_begin < n_end; n_block_begin++) {
        __m256i* b256 = (__m256i*)bb->get_submat(n, k, n_block_begin, 0);
        float* as = (float*)ba->get_scale(m, m_begin, k, 0);
        float* bs = (float*)bb->get_scale(n, n_block_begin, k, 0);

        __m512 sum = _mm512_setzero_ps();
#define WORK_K_BLOCK(k_block)                                                                     \
  {                                                                                               \
    __m256 abscale0 = _mm256_set1_ps(as[(k_block) * 2] * bs[(k_block) * 2]);                      \
    __m256 abscale1 = _mm256_set1_ps(as[(k_block) * 2 + 1] * bs[(k_block) * 2 + 1]);              \
    __m512 abscale = _mm512_insertf32x8(_mm512_castps256_ps512(abscale0), abscale1, 1);           \
    __m512i mul = _mm512_setzero_si512();                                                         \
    mul = _mm512_dpbssd_epi32(mul, a512[k_block], compressed_int4_to_int8_avx512(b256[k_block])); \
    sum = _mm512_add_ps(sum, _mm512_mul_ps(abscale, _mm512_cvtepi32_ps(mul)));                    \
  }

        for (int k_block = 0; k_block < k / 64; k_block += 2) {
          WORK_K_BLOCK(k_block);
          WORK_K_BLOCK(k_block + 1);
        }

        c[n_block_begin - n_start] = _mm512_reduce_add_ps(sum) / 16;
      }
    }
  }
};

inline void vec_mul_kgroup(int m, int n, int k, int k_group_size,
                           std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
                           std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferB> bb,
                           std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferC> bc, int ith, int nth) {
  GemmKernel224Int4SmallKGroup::integer_mat_vec_kgroup(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);
}

inline void mat_mul_kgroup(int m, int n, int k, int k_group_size,
                           std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
                           std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferB> bb,
                           std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferC> bc, int ith, int nth) {
  GemmKernel224Int4SmallKGroup::integer_mat_vec_kgroup(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);
}

// New k-group aware matrix multiplication function
template <typename K, bool amx_or_avx = true>
void integer_mat_mul_kgroup(int m, int n, int k, int k_group_size, typename K::BufferA* ba, typename K::BufferB* bb,
                            typename K::BufferC* bc, int ith, int nth) {
  assert(n % K::N_STEP == 0);
  assert(k % K::K_STEP == 0);
  assert(k % k_group_size == 0);

  auto [n_start, n_end] = K::split_range_n(n, ith, nth);
  // Process by k_groups
  for (int k_group_begin = 0; k_group_begin < k; k_group_begin += k_group_size) {
    for (int m_begin = 0; m_begin < m; m_begin += K::M_STEP) {
      for (int n_begin = n_start; n_begin < n_end; n_begin += K::N_STEP) {
        float* c = bc->get_submat(m, n, m_begin, n_begin);
        int32_t* int_c = bc->get_int_submat(m, n, m_begin, n_begin);

        // Initialize float c to zero at the very beginning
        if (k_group_begin == 0) {
          for (int i = 0; i < K::M_STEP && m_begin + i < m; i++) {
            for (int j = 0; j < K::N_STEP; j++) {
              c[i * K::N_STEP + j] = 0.0f;
            }
          }
        }
        for (int k_begin = k_group_begin; k_begin < std::min(k, k_group_begin + k_group_size); k_begin += K::K_STEP) {
          if constexpr (amx_or_avx && AMX_AVAILABLE) {
            K::amx_kernel(m, n, k, m_begin, n_begin, k_begin, int_c, ba, bb, k_group_size);
          } else {
            K::avx_kernel(m, n, k, m_begin, n_begin, k_begin, int_c, ba, bb, k_group_size);
          }
        }
        // }

        // Apply scale and accumulate to float buffer at end of k_group
        K::apply_scale_kgroup(m, n, m_begin, n_begin, k_group_begin, c, int_c, ba, bb, k, k_group_size);
      }
    }
  }
}

// Convenience functions for k-group kernels
inline void vec_mul_kgroup(int m, int n, int k, int k_group_size, std::shared_ptr<GemmKernel224Int4KGroup::BufferA> ba,
                           std::shared_ptr<GemmKernel224Int4KGroup::BufferB> bb,
                           std::shared_ptr<GemmKernel224Int4KGroup::BufferC> bc, int ith, int nth) {
  integer_mat_mul_kgroup<GemmKernel224Int4KGroup, false>(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);
}

inline void mat_mul_kgroup(int m, int n, int k, int k_group_size, std::shared_ptr<GemmKernel224Int4KGroup::BufferA> ba,
                           std::shared_ptr<GemmKernel224Int4KGroup::BufferB> bb,
                           std::shared_ptr<GemmKernel224Int4KGroup::BufferC> bc, int ith, int nth) {
  integer_mat_mul_kgroup<GemmKernel224Int4KGroup, true>(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);
}

// Convenience functions for k-group kernels
inline void vec_mul_kgroup(int m, int n, int k, int k_group_size,
                           std::shared_ptr<GemmKernel224Int4_1KGroup::BufferA> ba,
                           std::shared_ptr<GemmKernel224Int4_1KGroup::BufferB> bb,
                           std::shared_ptr<GemmKernel224Int4_1KGroup::BufferC> bc, int ith, int nth) {
  integer_mat_mul_kgroup<GemmKernel224Int4_1KGroup, false>(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith,
                                                           nth);
}

inline void mat_mul_kgroup(int m, int n, int k, int k_group_size,
                           std::shared_ptr<GemmKernel224Int4_1KGroup::BufferA> ba,
                           std::shared_ptr<GemmKernel224Int4_1KGroup::BufferB> bb,
                           std::shared_ptr<GemmKernel224Int4_1KGroup::BufferC> bc, int ith, int nth) {
  integer_mat_mul_kgroup<GemmKernel224Int4_1KGroup, true>(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith,
                                                          nth);
}

// Convenience functions for k-group kernels
inline void vec_mul_kgroup(int m, int n, int k, int k_group_size,
                           std::shared_ptr<GemmKernel224Int4_1_LowKGroup::BufferA> ba,
                           std::shared_ptr<GemmKernel224Int4_1_LowKGroup::BufferB> bb,
                           std::shared_ptr<GemmKernel224Int4_1_LowKGroup::BufferC> bc, int ith, int nth) {
  integer_mat_mul_kgroup<GemmKernel224Int4_1_LowKGroup, false>(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith,
                                                               nth);
}

inline void mat_mul_kgroup(int m, int n, int k, int k_group_size,
                           std::shared_ptr<GemmKernel224Int4_1_LowKGroup::BufferA> ba,
                           std::shared_ptr<GemmKernel224Int4_1_LowKGroup::BufferB> bb,
                           std::shared_ptr<GemmKernel224Int4_1_LowKGroup::BufferC> bc, int ith, int nth) {
  integer_mat_mul_kgroup<GemmKernel224Int4_1_LowKGroup, true>(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith,
                                                              nth);
}

}  // namespace amx

#endif  // AMX_KERNELS_HPP


================================================
FILE: kt-kernel/operators/amx/la/amx_quantization.hpp
================================================
#ifndef AMX_QUANTIZATION_HPP
#define AMX_QUANTIZATION_HPP
#include <algorithm>
#include <cmath>

#include "amx_config.hpp"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "utils.hpp"

namespace amx {

struct blocks_aligned_q4_0_ref {
  static constexpr int block_size = 64;
  static constexpr double bytes_per_element = double(sizeof(ggml_half) + double(block_size) / 2) / block_size;

  ggml_half* d;
  uint8_t* qs;

  blocks_aligned_q4_0_ref offset(size_t blck_cnt) const {
    blocks_aligned_q4_0_ref re;
    re.d = &d[blck_cnt];
    re.qs = &qs[blck_cnt * block_size / 2];
    return re;
  }

  static size_t expected_data_size(int64_t k) {
    assert(k % block_size == 0);
    return (sizeof(ggml_half) + block_size / 2) * (k / block_size);
  }

  uint8_t* get_qs(int block_idx) { return offset_pointer(qs, block_idx * (block_size / 2)); }

  static blocks_aligned_q4_0_ref quantize(const float* RESTRICT x, void* RESTRICT data, int64_t k) {
    assert(reinterpret_cast<intptr_t>(data) % 64 == 0);

    blocks_aligned_q4_0_ref re;
    re.qs = reinterpret_cast<uint8_t*>(data);
    re.d = reinterpret_cast<ggml_half*>(offset_pointer(re.qs, k / 2));

    static const int qk = block_size;

    assert(k % qk == 0);

    const int nb = k / qk;

    for (int i = 0; i < nb; i++) {
      float amax = 0.0f;  // absolute max
      float max = 0.0f;

      for (int j = 0; j < qk; j++) {
        const float v = x[i * qk + j];
        if (amax < fabsf(v)) {
          amax = fabsf(v);
          max = v;
        }
      }

      const float d = max / -8;
      const float id = d ? 1.0f / d : 0.0f;

      re.d[i] = GGML_FP32_TO_FP16(d);

      for (int j = 0; j < qk / 2; ++j) {
        const float x0 = x[i * qk + 0 + j] * id;
        const float x1 = x[i * qk + qk / 2 + j] * id;

        const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
        const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));

        re.get_qs(i)[j] = xi0;
        re.get_qs(i)[j] |= xi1 << 4;
      }
    }
    return re;
  }

  void dequantize(float* y, int64_t k) {
    static const int qk = block_size;
    assert(k % qk == 0);

    const int nb = k / qk;

    for (int i = 0; i < nb; i++) {
      const float d = GGML_FP16_TO_FP32(this->d[i]);

      for (int j = 0; j < qk / 2; ++j) {
        const int x0 = (get_qs(i)[j] & 0x0F) - 8;
        const int x1 = (get_qs(i)[j] >> 4) - 8;

        y[i * qk + j + 0] = x0 * d;
        y[i * qk + j + qk / 2] = x1 * d;
      }
    }
  }
};

struct blocks_aligned_q8_0_ref {
  static constexpr int block_size = 64;
  static constexpr double bytes_per_element = double(sizeof(ggml_half) + block_size) / block_size;

  ggml_half* d;
  int8_t* qs;

  blocks_aligned_q8_0_ref offset(size_t blck_cnt) const {
    blocks_aligned_q8_0_ref re;
    re.d = &d[blck_cnt];
    re.qs = &qs[blck_cnt * block_size];
    return re;
  }

  static size_t expected_data_size(int64_t k) {
    assert(k % block_size == 0);
    return (sizeof(ggml_half) + block_size) * (k / block_size);
  }
  int8_t* get_qs(int block_idx) { return offset_pointer(qs, block_idx * block_size); }

  static blocks_aligned_q8_0_ref quantize(const float* RESTRICT x, void* RESTRICT data, int64_t k) {
    assert(k % block_size == 0);
    assert(reinterpret_cast<intptr_t>(data) % 64 == 0);

    blocks_aligned_q8_0_ref re;
    re.qs = reinterpret_cast<int8_t*>(data);
    re.d = reinterpret_cast<ggml_half*>(offset_pointer(re.qs, k));
    const int nb = k / block_size;

    for (int i = 0; i < nb; i++) {
      float amax = 0.0f;  // absolute max

      for (int j = 0; j < block_size; j++) {
        const float v = x[i * block_size + j];
        amax = MAX(amax, fabsf(v));
      }

      const float d = amax / ((1 << 7) - 1);
      const float id = d ? 1.0f / d : 0.0f;

      re.d[i] = GGML_FP32_TO_FP16(d);

      for (int j = 0; j < block_size; ++j) {
        const float x0 = x[i * block_size + j] * id;
        re.get_qs(i)[j] = roundf(x0);
      }
    }
    return re;
  }

  void dequantize(float* y, int64_t k) {
    static const int qk = block_size;

    assert(k % qk == 0);

    const int nb = k / qk;

    for (int i = 0; i < nb; i++) {
      const float d = GGML_FP16_TO_FP32(this->d[i]);

      for (int j = 0; j < qk; ++j) {
        y[i * qk + j] = get_qs(i)[j] * d;
      }
    }
  }
};

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

template <typename Block>
struct Dequantizer {};

const __m256i MASK256_LO = _mm256_set1_epi8(0x0f);
const __m256i MASK256_4HI = _mm256_set1_epi8(0xf0);
const __m256i MASK256_8 = _mm256_set1_epi8(8);

const __m512i MASK512_LO = _mm512_set1_epi8(0x0f);
const __m512i MASK512_4HI = _mm512_set1_epi8(0xf0);
const __m512i MASK512_8 = _mm512_set1_epi8(8);

inline __m256i dequant4x32(const uint8_t* qs) {
  const __m128i aux128 = _mm_loadu_si128((const __m128i*)qs);
  return _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), MASK256_LO);
}

inline __m256i unaligned_copy8x32(const int8_t* qs) { return _mm256_loadu_si256((const __m256i*)qs); }

inline __m512i copy8x64(const int8_t* qs) { return _mm512_load_si512((const __m512i*)qs); }

inline __m256i lo4bit(const uint8_t* qs) {
  return _mm256_and_si256(_mm256_loadu_si256((const __m256i*)qs), MASK256_LO);
}
inline __m256i hi4bit(const uint8_t* qs) {
  return _mm256_srli_epi16(_mm256_and_si256(_mm256_loadu_si256((const __m256i*)qs), MASK256_4HI), 4);
}

inline __m128i make_q4K_scale_and_min(const uint8_t* scales8) {
  __m128i re;
  uint32_t* aux32 = (uint32_t*)&re;
  const uint16_t* scales = (const uint16_t*)scales8;
  const uint32_t a0 = scales[0] | (scales[1] << 16);
  const uint32_t a1 = scales[2] | (scales[3] << 16);
  const uint32_t a2 = scales[4] | (scales[5] << 16);
  aux32[3] = ((a2 >> 4) & 0x0f0f0f0f) | ((a1 >> 2) & 0x30303030);
  aux32[1] = ((a2 >> 0) & 0x0f0f0f0f) | ((a0 >> 2) & 0x30303030);
  aux32[2] = a1 & 0x3f3f3f3f;
  aux32[0] = a0 & 0x3f3f3f3f;
  // aux32[1:0] is scale
  // aux32[3:2] is min
  return re;
}

inline __m256i merge_q8K_bsum(block_q8_K* b) {
  return _mm256_madd_epi16(_mm256_loadu_si256((__m256i*)b->bsums), _mm256_set1_epi16(1));
}

inline __m512i _mm512_dpbusd_epi32_compat(__m512i src, __m512i a, __m512i b) {
#if defined(__AVX512VNNI__)
  return _mm512_dpbusd_epi32(src, a, b);
#else
  const __m512i mask_lo = _mm512_set1_epi16(0x00FF);
  const __m512i ones16 = _mm512_set1_epi16(1);

  __m512i a_even = _mm512_and_si512(a, mask_lo);
  __m512i b_even = _mm512_srai_epi16(_mm512_slli_epi16(b, 8), 8);

  __m512i a_odd = _mm512_srli_epi16(a, 8);
  __m512i b_odd = _mm512_srai_epi16(b, 8);

  __m512i prod_even = _mm512_mullo_epi16(a_even, b_even);
  __m512i prod_odd = _mm512_mullo_epi16(a_odd, b_odd);

  __m512i sum_even = _mm512_madd_epi16(prod_even, ones16);
  __m512i sum_odd = _mm512_madd_epi16(prod_odd, ones16);

  return _mm512_add_epi32(src, _mm512_add_epi32(sum_even, sum_odd));
#endif
}

inline __m512i _mm512_dpbssd_epi32(__m512i src, __m512i a, __m512i b) {
  __m256i a_lo = _mm512_extracti64x4_epi64(a, 0);
  __m256i a_hi = _mm512_extracti64x4_epi64(a, 1);
  __m256i b_lo = _mm512_extracti64x4_epi64(b, 0);
  __m256i b_hi = _mm512_extracti64x4_epi64(b, 1);

  b_lo = _mm256_sign_epi8(b_lo, a_lo);
  b_hi = _mm256_sign_epi8(b_hi, a_hi);

  b = _mm512_inserti64x4(b, b_lo, 0);
  b = _mm512_inserti64x4(b, b_hi, 1);

  a = _mm512_abs_epi8(a);

  return _mm512_dpbusd_epi32_compat(src, a, b);
}

}  // namespace amx

#endif  // AMX_QUANTIZATION_HPP

================================================
FILE: kt-kernel/operators/amx/la/amx_raw_buffers.hpp
================================================
#ifndef AMX_RAW_BUFFERS_HPP
#define AMX_RAW_BUFFERS_HPP

/**
 * @file amx_raw_buffers.hpp
 * @brief Raw data format buffer management (FP8, BF16, etc.)
 *
 * 本文件实现原精度格式的缓冲区管理，用于 DeepSeek V3.2 等原精度推理。
 *
 * 缓冲区类型：
 * - BufferAFP8Impl: 输入激活缓冲区，支持动态 FP8 量化
 * - BufferBFP8Impl: 权重缓冲区，FP8 格式 + 128x128 块缩放
 * - BufferBFP8BlockImpl: 优化的块量化权重缓冲区
 *
 * 内存布局：
 * - FP8 数据：1 字节/元素
 * - Scale：4 字节/块（BufferB 每 128x128 块一个，BufferA 每 128 行一个）
 */

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>
#include <vector>

#include "amx_config.hpp"
#include "amx_utils.hpp"
#include "llama.cpp/ggml-impl.h"
#include "pack.hpp"
#include "utils.hpp"

namespace amx {

// ============================================================================
// BufferAFP8Impl: FP8 激活缓冲区（支持动态量化）
// ============================================================================
/* 物理布局(按 bf16 元素数)：
 * 逻辑矩阵 A 为 (m, k) 行主序，m pad 到 max_m(=m_block_size，M_STEP 的倍数)。
 * 存储顺序：
 *   k_block(K_BLOCK 列) → m_block(M_STEP 行) → k_step(K_STEP 列) → (M_STEP×K_STEP) 行主序 tile。
 * 因此可视为 5D：
 *   a[k_blocks][m_blocks][k_steps][M_STEP][K_STEP]，
 *   k_blocks = ceil(k / K_BLOCK)，m_blocks = max_m / M_STEP，
 *   k_steps = K_BLOCK / K_STEP（最后一个 k_block 可能更小）。
 * get_submat(m_begin, k_begin) 返回连续的 (M_STEP×K_STEP) tile。
 */
template <typename K>
struct BufferABF16Impl {
  ggml_bf16_t* a;
  int max_m, k;
  static constexpr int M_STEP = K::M_STEP;
  static constexpr int K_STEP = K::K_STEP;
  static constexpr int K_BLOCK = K::K_BLOCK;

  static size_t required_size(int max_m, int k) { return sizeof(ggml_bf16_t) * max_m * k; }

  BufferABF16Impl(int max_m, int k, void* ptr) : max_m(max_m), k(k) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    assert(max_m % M_STEP == 0);
    assert(k % K_STEP == 0);
    a = reinterpret_cast<ggml_bf16_t*>(ptr);
  }

  void set_data(void* new_ptr) { a = reinterpret_cast<ggml_bf16_t*>(new_ptr); }

  void from_mat(int m, ggml_bf16_t* src, int ith, int nth) {
    assert(m <= max_m);
    assert(ith == 0 && nth == 1);
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
          for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
            __m512i* s = (__m512i*)(src + (m_begin + i) * k + k_block_begin + k_begin);
            __m512i* d =
                (__m512i*)(a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP + i * K_STEP);
            avx512_copy_32xbf16(s, d);
          }
        }
      }
    }
  }

  ggml_bf16_t* get_submat(int m, int k, int m_begin, int k_begin) {
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
    k_begin -= k_block_begin;
    int k_block_size = std::min(K_BLOCK, k - k_block_begin);
    return a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP;
  }
};

// ============================================================================
// BufferB
// ============================================================================

/**
 * @brief BF16 BufferB
 * 物理布局(按 bf16 元素数)：
 * 逻辑矩阵 B 为 (n, k) 行主序（用于 NT GEMM），n 按 N_BLOCK 分块。
 * 存储顺序：
 *   n_block(N_BLOCK 行) → k_block(K_BLOCK 列) → n_step(N_STEP 行) → k_step(K_STEP 列)
 *   → (N_STEP×K_STEP) tile；每个 tile 内部再对两个 16×16 子块做 transpose，
 *   以匹配 AMX BTile 的 VNNI 布局（TILE_K/VNNI_BLK × TILE_N*VNNI_BLK）。
 * 因此可视为 6D：
 *   b[n_blocks][k_blocks][n_steps][k_steps][N_STEP][K_STEP]，
 *   n_blocks = ceil(n / N_BLOCK)，k_blocks = ceil(k / K_BLOCK)，
 *   n_steps = N_BLOCK / N_STEP，k_steps = K_BLOCK / K_STEP（尾块可能更小）。
 * get_submat(n_begin, k_begin) 返回连续的 (N_STEP×K_STEP) tile 起始地址。
 * @tparam K Kernel 类型
 */

template <typename K>
struct BufferBBF16Impl {
  ggml_bf16_t* b;
  int n, k;
  static constexpr bool SCALE = false;
  static constexpr int N_STEP = K::N_STEP;
  static constexpr int K_STEP = K::K_STEP;
  static constexpr int N_BLOCK = K::N_BLOCK;
  static constexpr int K_BLOCK = K::K_BLOCK;
  static constexpr int TILE_N = K::TILE_N;
  static size_t required_size(int n, int k) { return sizeof(ggml_bf16_t) * n * k; }

  BufferBBF16Impl(int n, int k, void* ptr) : n(n), k(k) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    assert(n % N_STEP == 0);
    assert(k % K_STEP == 0);
    b = reinterpret_cast<ggml_bf16_t*>(ptr);
  }
  void set_data(void* new_ptr) { b = reinterpret_cast<ggml_bf16_t*>(new_ptr); }

  void from_mat(ggml_bf16_t* src, int ith, int nth) {
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;
    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
          for (int i = 0; i < N_STEP; i++) {
            __m512i* s = (__m512i*)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin);
            __m512i* d = (__m512i*)(b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                    k_begin * N_STEP + i * K_STEP);
            avx512_copy_32xbf16(s, d);
          }
          transpose_16x16_32bit((__m512i*)(b + n_block_begin * k + k_block_begin * n_block_size +
                                           n_begin * k_block_size + k_begin * N_STEP));
          transpose_16x16_32bit((__m512i*)(b + n_block_begin * k + k_block_begin * n_block_size +
                                           n_begin * k_block_size + k_begin * N_STEP + TILE_N * K_STEP));
        }
      }
    }
  }
  ggml_bf16_t* get_submat(int n, int k, int n_begin, int k_begin) {
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    n_begin -= n_block_begin;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
    k_begin -= k_block_begin;
    int k_block_size = std::min(K_BLOCK, k - k_block_begin);
    return b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP;
  }
};

/**
 * @brief FP8 权重缓冲区
 *
 * 存储 FP8 格式的权重矩阵，每个 128x128 块有一个缩放因子。
 * 这与 DeepSeek V3.2 的原精度格式匹配。
 *
 * @tparam K Kernel 类型
 */
template <typename K>
struct BufferBFP8Impl {
  uint8_t* b;              // FP8 weight
  float* d;                // scale_inv [n / k_group_size, k / k_group_size]
  int n, k, k_group_size;  // k_group_size = 128 in DeepSeek

  static constexpr int N_STEP = K::N_STEP;
  static constexpr int K_STEP = K::K_STEP;
  static constexpr int N_BLOCK = K::N_BLOCK;
  static constexpr int K_BLOCK = K::K_BLOCK;
  static constexpr bool SCALE = true;

  /**
   * @brief 计算所需内存大小
   */
  static size_t required_size(int n, int k, int k_group_size) {
    int n_blocks_n = (n + k_group_size - 1) / k_group_size;
    int n_blocks_k = (k + k_group_size - 1) / k_group_size;
    return sizeof(uint8_t) * n * k + sizeof(float) * n_blocks_n * n_blocks_k;
  }

  /**
   * @brief 构造函数
   */
  BufferBFP8Impl(int n, int k, int k_group_size, void* ptr) : n(n), k(k), k_group_size(k_group_size) { set_data(ptr); }

  void set_data(void* ptr) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    b = reinterpret_cast<uint8_t*>(ptr);
    d = reinterpret_cast<float*>(b + (size_t)n * k);
  }

  static constexpr int mat_offset[8] = {0, 2, 4, 6, 1, 3, 5, 7};  // fp8 matrix offset for reordering
  /**
   * @brief 从原始 FP8 权重加载（已经是量化格式）
   *
   * @param b_src FP8 权重源数据 (n-major, n×k)
   * @param d_src FP32 scale_inv 源数据 (n-major, ceil(n/128)×ceil(k/128))
   */
  void from_mat(const uint8_t* b_src, const float* d_src, int ith, int nth) {
    assert(b != nullptr && d != nullptr);
    assert(N_STEP == 32 && K_STEP == 32);  // from mat block copy assumes this

    // Copy scales (per 128x128 block). Each thread copies its own n-block range.
    const int n_blocks_k = (k + k_group_size - 1) / k_group_size;
    if (d_src != nullptr) {
      auto [n_start, n_end] = K::split_range_n(n, ith, nth);
      int bn_start = n_start / k_group_size;
      int bn_end = (n_end + k_group_size - 1) / k_group_size;
      memcpy(d + bn_start * n_blocks_k, d_src + bn_start * n_blocks_k,
             sizeof(float) * (bn_end - bn_start) * n_blocks_k);
    }

    // Reorder FP8 weights into KT block-major layout (same panel->tile order as BF16 BufferB).
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;
    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      int n_step_size = std::min(N_STEP, n_block_size - n_begin);
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
          int k_step_size = std::min(K_STEP, k_block_size - k_begin);
          // [k_step_size, n_step_size] block copy
          const uint8_t* block_b_src = b_src + (size_t)(n_block_begin + n_begin) * k + k_block_begin + k_begin;
          uint64_t* block_b_dst =
              reinterpret_cast<uint64_t*>(b + (size_t)n_block_begin * k + (size_t)k_block_begin * n_block_size +
                                          (size_t)n_begin * k_block_size + (size_t)k_begin * N_STEP);
          for (int i = 0; i < 8; i++) {
            const uint16_t* s = reinterpret_cast<const uint16_t*>(block_b_src + (size_t)i * k * 4);
            for (int j = 0; j < 16; j++) {
              uint64_t val = (((uint64_t)s[j])) | (((uint64_t)s[j + (k / 2) * 1]) << 16) |
                             (((uint64_t)s[j + (k / 2) * 2]) << 32) | (((uint64_t)s[j + (k / 2) * 3]) << 48);
              block_b_dst[8 * j + mat_offset[i]] = val;
            }
          }
        }
      }
    }
  }

  /**
   * @brief get scale_inv
   */
  float* get_scale(int n, int n_begin, int k, int k_begin) {
    int n_blocks_k = (k + k_group_size - 1) / k_group_size;
    int bn = n_begin / k_group_size;
    int bk = k_begin / k_group_size;
    return d + bn * n_blocks_k + bk;
  }

  /**
   * @brief 获取子矩阵指针
   */
  uint8_t* get_submat(int n, int k, int n_begin, int k_begin) {
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    n_begin -= n_block_begin;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
    k_begin -= k_block_begin;
    int k_block_size = std::min(K_BLOCK, k - k_block_begin);
    return b + (size_t)n_block_begin * k + (size_t)k_block_begin * n_block_size + (size_t)n_begin * k_block_size +
           (size_t)k_begin * N_STEP;
  }

  /**
   * @brief Inverse mapping for mat_offset used in to_mat
   * mat_offset = {0, 2, 4, 6, 1, 3, 5, 7}
   * inv_mat_offset[mat_offset[i]] = i
   */
  static constexpr int inv_mat_offset[8] = {0, 4, 1, 5, 2, 6, 3, 7};

  /**
   * @brief Unpack FP8 weights from KT block-major layout back to n-major layout
   *
   * This is the inverse operation of from_mat.
   *
   * @param b_dst FP8 输出缓冲区 (n-major, n×k)
   * @param d_dst FP32 scale_inv 输出缓冲区 (n-major, ceil(n/128)×ceil(k/128))
   * @param ith Thread index
   * @param nth Total number of threads
   */
  void to_mat(uint8_t* b_dst, float* d_dst, int ith, int nth) const {
    assert(b != nullptr && d != nullptr);
    assert(N_STEP == 32 && K_STEP == 32);

    // Calculate N_BLOCK range for this thread
    // Unlike split_range_n which gives one N_BLOCK per thread, we need to handle
    // the case where nth < n/N_BLOCK (fewer threads than blocks)
    int total_n_blocks = (n + N_BLOCK - 1) / N_BLOCK;
    int blocks_per_thread = (total_n_blocks + nth - 1) / nth;
    int start_n_block_idx = ith * blocks_per_thread;
    int end_n_block_idx = std::min((ith + 1) * blocks_per_thread, total_n_blocks);

    // Copy scales (per 128x128 block). Each thread copies its own n-block range.
    const int n_blocks_k = (k + k_group_size - 1) / k_group_size;
    if (d_dst != nullptr) {
      int bn_start = start_n_block_idx;
      int bn_end = end_n_block_idx;
      memcpy(d_dst + bn_start * n_blocks_k, d + bn_start * n_blocks_k,
             sizeof(float) * (bn_end - bn_start) * n_blocks_k);
    }

    // Reorder FP8 weights back to n-major layout (inverse of from_mat)
    // Process each N_BLOCK assigned to this thread
    for (int n_block_idx = start_n_block_idx; n_block_idx < end_n_block_idx; n_block_idx++) {
      int n_block_begin = n_block_idx * N_BLOCK;
      int n_block_size = std::min(N_BLOCK, n - n_block_begin);

      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
          int k_block_size = std::min(K_BLOCK, k - k_block_begin);
          for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
            // Source: packed layout (KT block-major)
            const uint64_t* block_b_src =
                reinterpret_cast<const uint64_t*>(b + (size_t)n_block_begin * k + (size_t)k_block_begin * n_block_size +
                                                  (size_t)n_begin * k_block_size + (size_t)k_begin * N_STEP);

            // Destination: n-major layout
            uint8_t* block_b_dst = b_dst + (size_t)(n_block_begin + n_begin) * k + k_block_begin + k_begin;

            // Inverse of from_mat transformation
            for (int packed_i = 0; packed_i < 8; packed_i++) {
              int i = inv_mat_offset[packed_i];
              uint16_t* d_row = reinterpret_cast<uint16_t*>(block_b_dst + (size_t)i * k * 4);
              for (int j = 0; j < 16; j++) {
                uint64_t val = block_b_src[8 * j + packed_i];
                d_row[j] = (uint16_t)(val & 0xFFFF);
                d_row[j + (k / 2) * 1] = (uint16_t)((val >> 16) & 0xFFFF);
                d_row[j + (k / 2) * 2] = (uint16_t)((val >> 32) & 0xFFFF);
                d_row[j + (k / 2) * 3] = (uint16_t)((val >> 48) & 0xFFFF);
              }
            }
          }
        }
      }
    }
  }
};

// ============================================================================
// BufferCFP8Impl: FP32 输出缓冲区
// ============================================================================

/**
 * @brief FP32 输出缓冲区
 *
 * 存储 FP32 格式的累加器，支持转换为 BF16 输出
 *
 * @tparam K Kernel 类型
 */
template <typename K>
struct BufferCFP32Impl {
  float* c;
  int max_m, n;
  static constexpr int M_STEP = K::M_STEP;
  static constexpr int N_STEP = K::N_STEP;
  static constexpr int N_BLOCK = K::N_BLOCK;
  // 物理布局(按 float 元素数)：
  // 逻辑矩阵 C 为 (max_m, n) 行主序，max_m 为 M_STEP 的倍数，
  // n 按 N_BLOCK 分块。
  // 存储顺序：
  //   n_block(N_BLOCK 列) → m_block(M_STEP 行) → n_step(N_STEP 列) → (M_STEP×N_STEP) 行主序 tile。
  // 因此可视为 5D：
  //   c[n_blocks][m_blocks][n_steps][M_STEP][N_STEP]，
  //   n_blocks = ceil(n / N_BLOCK)，m_blocks = max_m / M_STEP，
  //   n_steps = N_BLOCK / N_STEP（尾块可能更小）。
  // get_submat(m_begin, n_begin) 返回连续的 (M_STEP×N_STEP) tile 起始地址。

  static size_t required_size(int max_m, int n) { return sizeof(float) * max_m * n; }

  BufferCFP32Impl(int max_m, int n, void* ptr) : max_m(max_m), n(n) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    assert(max_m % M_STEP == 0);
    assert(n % N_STEP == 0);
    c = reinterpret_cast<float*>(ptr);
  }

  void set_data(void* new_ptr) { c = reinterpret_cast<float*>(new_ptr); }

  void to_mat(int m, ggml_bf16_t* dst, int ith, int nth) {
    assert(m <= max_m);
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
          __m512* x0 =
              (__m512*)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP + i * N_STEP);
          __m512* x1 =
              (__m512*)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP + i * N_STEP + 16);
          avx512_32xfp32_to_32xbf16(x0, x1, (__m512i*)(dst + (m_begin + i) * n + n_block_begin + n_begin));
        }
      }
    }
  }

  float* get_submat(int m, int n, int m_begin, int n_begin) {
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    n_begin -= n_block_begin;
    return c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP;
  }
};

template <typename K>
struct BufferCFP32ReduceImpl {
  float* c;
  float* reduce_buf;
  int max_m, n;

  static constexpr int M_STEP = K::M_STEP;
  static constexpr int N_STEP = K::N_STEP;
  static constexpr int N_BLOCK = K::N_BLOCK;

  static size_t required_size(int max_m, int n) { return sizeof(float) * (size_t)max_m * n * 2; }

  BufferCFP32ReduceImpl(int max_m, int n, void* ptr) : max_m(max_m), n(n) {
    assert(max_m % M_STEP == 0);
    assert(n % N_STEP == 0);
    set_data(ptr);
  }

  void set_data(void* ptr) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    c = reinterpret_cast<float*>(ptr);
    reduce_buf = c + (size_t)max_m * n;
  }

  void to_mat(int m, ggml_bf16_t* dst, int ith, int nth) {
    assert(m <= max_m);
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
          __m512* x0 =
              (__m512*)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP + i * N_STEP);
          __m512* x1 =
              (__m512*)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP + i * N_STEP + 16);
          avx512_32xfp32_to_32xbf16(x0, x1, (__m512i*)(dst + (m_begin + i) * n + n_block_begin + n_begin));
        }
      }
    }
  }

  float* get_submat(int m, int n, int m_begin, int n_begin) {
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    n_begin -= n_block_begin;
    return c + (size_t)m_block_size * n_block_begin + (size_t)m_begin * n_block_size + (size_t)n_begin * M_STEP;
  }

  float* get_reduce_submat(int m, int n, int m_begin, int n_begin) {
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    n_begin -= n_block_begin;
    return reduce_buf + (size_t)m_block_size * n_block_begin + (size_t)m_begin * n_block_size +
           (size_t)n_begin * M_STEP;
  }
};

// ============================================================================
// BufferBFP8PerChannelImpl: FP8 权重缓冲区（Per Channel 量化）
// ============================================================================

/**
 * @brief FP8 Per-Channel 权重缓冲区
 *
 * 存储 FP8 格式的权重矩阵，每个输出通道（行）有一个缩放因子。
 * 这与 GLM-4.7-FP8 的 per-channel 量化格式匹配。
 *
 * 与 BufferBFP8Impl (block-wise) 的区别：
 * - Block-wise: scale shape = [n/128, k/128], 每 128x128 块一个 scale
 * - Per-channel: scale shape = [n], 每行一个 scale
 *
 * @tparam K Kernel 类型
 */
template <typename K>
struct BufferBFP8PerChannelImpl {
  uint8_t* b;  // FP8 weight [n, k]
  float* d;    // per-channel scale [n]
  int n, k;

  static constexpr int N_STEP = K::N_STEP;
  static constexpr int K_STEP = K::K_STEP;
  static constexpr int N_BLOCK = K::N_BLOCK;
  static constexpr int K_BLOCK = K::K_BLOCK;
  static constexpr bool SCALE = true;
  static constexpr bool PER_CHANNEL = true;

  /**
   * @brief 计算所需内存大小
   * weight: n * k bytes (FP8)
   * scale: n * sizeof(float) bytes
   */
  static size_t required_size(int n, int k) { return sizeof(uint8_t) * n * k + sizeof(float) * n; }

  /**
   * @brief 构造函数
   */
  BufferBFP8PerChannelImpl(int n, int k, void* ptr) : n(n), k(k) { set_data(ptr); }

  void set_data(void* ptr) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    b = reinterpret_cast<uint8_t*>(ptr);
    d = reinterpret_cast<float*>(b + (size_t)n * k);
  }

  static constexpr int mat_offset[8] = {0, 2, 4, 6, 1, 3, 5, 7};  // fp8 matrix offset for reordering

  /**
   * @brief 从原始 FP8 权重加载（per-channel 量化格式）
   *
   * @param b_src FP8 权重源数据 (n-major, n×k)
   * @param d_src FP32 per-channel scale 源数据 (shape: [n] or [n, 1])
   */
  void from_mat(const uint8_t* b_src, const float* d_src, int ith, int nth) {
    assert(b != nullptr && d != nullptr);
    assert(N_STEP == 32 && K_STEP == 32);

    // Copy per-channel scales. Each thread copies its own n-block range.
    if (d_src != nullptr) {
      auto [n_start, n_end] = K::split_range_n(n, ith, nth);
      memcpy(d + n_start, d_src + n_start, sizeof(float) * (n_end - n_start));
    }

    // Reorder FP8 weights into KT block-major layout (same as BufferBFP8Impl)
    auto [n_start, n_end] = K::split_range_n(n, ith, nth);
    int n_block_begin = n_start;
    int n_block_size = n_end - n_block_begin;
    for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
      int n_step_size = std::min(N_STEP, n_block_size - n_begin);
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
        int k_block_size = std::min(K_BLOCK, k - k_block_begin);
        for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
          int k_step_size = std::min(K_STEP, k_block_size - k_begin);
          // [k_step_size, n_step_size] block copy
          const uint8_t* block_b_src = b_src + (size_t)(n_block_begin + n_begin) * k + k_block_begin + k_begin;
          uint64_t* block_b_dst =
              reinterpret_cast<uint64_t*>(b + (size_t)n_block_begin * k + (size_t)k_block_begin * n_block_size +
                                          (size_t)n_begin * k_block_size + (size_t)k_begin * N_STEP);
          for (int i = 0; i < 8; i++) {
            const uint16_t* s = reinterpret_cast<const uint16_t*>(block_b_src + (size_t)i * k * 4);
            for (int j = 0; j < 16; j++) {
              uint64_t val = (((uint64_t)s[j])) | (((uint64_t)s[j + (k / 2) * 1]) << 16) |
                             (((uint64_t)s[j + (k / 2) * 2]) << 32) | (((uint64_t)s[j + (k / 2) * 3]) << 48);
              block_b_dst[8 * j + mat_offset[i]] = val;
            }
          }
        }
      }
    }
  }

  /**
   * @brief 获取行 n_begin 开始的 per-channel scale 指针
   */
  float* get_scale(int n_begin) { return d + n_begin; }

  /**
   * @brief 获取子矩阵指针
   */
  uint8_t* get_submat(int n, int k, int n_begin, int k_begin) {
    int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
    n_begin -= n_block_begin;
    int n_block_size = std::min(N_BLOCK, n - n_block_begin);
    int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
    k_begin -= k_block_begin;
    int k_block_size = std::min(K_BLOCK, k - k_block_begin);
    return b + (size_t)n_block_begin * k + (size_t)k_block_begin * n_block_size + (size_t)n_begin * k_block_size +
           (size_t)k_begin * N_STEP;
  }
};

}  // namespace amx

#endif  // AMX_RAW_BUFFERS_HPP


================================================
FILE: kt-kernel/operators/amx/la/amx_raw_kernels.hpp
================================================
#ifndef AMX_RAW_KERNELS_HPP
#define AMX_RAW_KERNELS_HPP

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <string>

#include "amx_config.hpp"
#include "amx_raw_buffers.hpp"
#include "amx_utils.hpp"
#include "llama.cpp/ggml-impl.h"

namespace amx {

struct GemmKernel224BF16 {
  using dt = ggml_bf16_t;
  using output_t = float;
  static constexpr double ELEMENT_SIZE = 2;
  static const int TILE_M = 16;
  static const int TILE_K = 32;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 2;

  static const int M_STEP = TILE_M * 2;
  static const int N_STEP = TILE_N * 2;
  static const int K_STEP = TILE_K;

  static inline const int N_BLOCK = 256;
  static inline const int K_BLOCK = 1792;
  static std::string name() { return "BF16"; }

  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {
#ifdef HAVE_AMX
    enable_amx();
    TileConfig tile_config;

    // size is 16 x 32
    for (int i = 0; i < 2; i++) tile_config.set_row_col(i, TILE_M, TILE_K * sizeof(dt));

    // size is 16 x 32
    for (int i = 2; i < 4; i++) tile_config.set_row_col(i, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK * sizeof(dt));

    // size is 16 x 16
    for (int i = 4; i < 8; i++) tile_config.set_row_col(i, TILE_M, TILE_N * sizeof(output_t));

    tile_config.set_config();
#endif
  }

  static void load_a(dt* a, size_t lda) {
#ifdef HAVE_AMX
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
#else
    (void)a;
    (void)lda;
#endif
  }

  static void load_b(dt* b, size_t ldb) {
#ifdef HAVE_AMX
    _tile_loadd(2, b, ldb);
    _tile_loadd(3, offset_pointer(b, ldb * TILE_N), ldb);
#else
    (void)b;
    (void)ldb;
#endif
  }

  static void clean_c() {
#ifdef HAVE_AMX
    _tile_zero(4);
    _tile_zero(5);
    _tile_zero(6);
    _tile_zero(7);
#endif
  }

  static void load_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_loadd(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void store_c(output_t* c, size_t ldc) {
#ifdef HAVE_AMX
    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_stored(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
#else
    (void)c;
    (void)ldc;
#endif
  }

  static void run_tile() {
#ifdef HAVE_AMX
    _tile_dpbf16ps(4, 0, 2);
    _tile_dpbf16ps(5, 0, 3);
    _tile_dpbf16ps(6, 1, 2);
    _tile_dpbf16ps(7, 1, 3);
#endif
  }
  using BufferA = BufferABF16Impl<GemmKernel224BF16>;
  using BufferB = BufferBBF16Impl<GemmKernel224BF16>;
  using BufferC = BufferCFP32Impl<GemmKernel224BF16>;

  // Basic AVX kernel for BF16: process entire K_BLOCK
  static void avx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, float* c, BufferA* ba,
                         BufferB* bb) {
    __m512* c512 = (__m512*)c;
    int m_block_end = std::min(m - m_begin, M_STEP);

    // Zero out accumulator at the start of k_block
    if (k_block_begin == 0) {
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        c512[m_i * 2] = _mm512_setzero_ps();
        c512[m_i * 2 + 1] = _mm512_setzero_ps();
      }
    }

    // Process entire K_BLOCK
    for (int k_begin = 0; k_begin < K_BLOCK && k_block_begin + k_begin < k; k_begin += K_STEP) {
      int32_t* a32 = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
      __m512bh* b512 = (__m512bh*)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);

      for (int m_i = 0; m_i < m_block_end; m_i++) {
        for (int k_i = 0; k_i < 16; k_i++) {
          __m512bh ma = (__m512bh)_mm512_set1_epi32(a32[m_i * 16 + k_i]);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma, b512[k_i]);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma, b512[16 + k_i]);
        }
      }
    }
  }

  // Optimized AVX kernel: process 4 k_i at once, unroll m rows by 2
  static void avx_kernel_4(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, float* c, BufferA* ba,
                           BufferB* bb) {
    __m512* c512 = (__m512*)c;
    int m_block_end = std::min(m - m_begin, M_STEP);

    // Zero out accumulator at the start of k_block
    if (k_block_begin == 0) {
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        c512[m_i * 2] = _mm512_setzero_ps();
        c512[m_i * 2 + 1] = _mm512_setzero_ps();
      }
    }

    // Process entire K_BLOCK
    for (int k_begin = 0; k_begin < K_BLOCK && k_block_begin + k_begin < k; k_begin += K_STEP) {
      int32_t* a32 = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
      __m512bh* b512 = (__m512bh*)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);

      // Process 4 k_i at once - load B vectors and reuse across all m rows
      for (int k_i = 0; k_i < 16; k_i += 4) {
        // Load 4 B vector pairs (lo and hi for each k_i)
        __m512bh b0_lo = b512[k_i];
        __m512bh b0_hi = b512[16 + k_i];
        __m512bh b1_lo = b512[k_i + 1];
        __m512bh b1_hi = b512[16 + k_i + 1];
        __m512bh b2_lo = b512[k_i + 2];
        __m512bh b2_hi = b512[16 + k_i + 2];
        __m512bh b3_lo = b512[k_i + 3];
        __m512bh b3_hi = b512[16 + k_i + 3];

        // Process m rows - unroll by 2 for better ILP
        int m_i = 0;
        for (; m_i + 1 < m_block_end; m_i += 2) {
          // Load A values for 2 rows, 4 k_i each
          __m512bh ma0_0 = (__m512bh)_mm512_set1_epi32(a32[m_i * 16 + k_i]);
          __m512bh ma1_0 = (__m512bh)_mm512_set1_epi32(a32[m_i * 16 + k_i + 1]);
          __m512bh ma2_0 = (__m512bh)_mm512_set1_epi32(a32[m_i * 16 + k_i + 2]);
          __m512bh ma3_0 = (__m512bh)_mm512_set1_epi32(a32[m_i * 16 + k_i + 3]);
          __m512bh ma0_1 = (__m512bh)_mm512_set1_epi32(a32[(m_i + 1) * 16 + k_i]);
          __m512bh ma1_1 = (__m512bh)_mm512_set1_epi32(a32[(m_i + 1) * 16 + k_i + 1]);
          __m512bh ma2_1 = (__m512bh)_mm512_set1_epi32(a32[(m_i + 1) * 16 + k_i + 2]);
          __m512bh ma3_1 = (__m512bh)_mm512_set1_epi32(a32[(m_i + 1) * 16 + k_i + 3]);

          // Process row 0
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma0_0, b0_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma0_0, b0_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma1_0, b1_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma1_0, b1_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma2_0, b2_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma2_0, b2_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma3_0, b3_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma3_0, b3_hi);

          // Process row 1
          c512[(m_i + 1) * 2] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2], ma0_1, b0_lo);
          c512[(m_i + 1) * 2 + 1] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2 + 1], ma0_1, b0_hi);
          c512[(m_i + 1) * 2] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2], ma1_1, b1_lo);
          c512[(m_i + 1) * 2 + 1] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2 + 1], ma1_1, b1_hi);
          c512[(m_i + 1) * 2] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2], ma2_1, b2_lo);
          c512[(m_i + 1) * 2 + 1] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2 + 1], ma2_1, b2_hi);
          c512[(m_i + 1) * 2] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2], ma3_1, b3_lo);
          c512[(m_i + 1) * 2 + 1] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2 + 1], ma3_1, b3_hi);
        }
        // Handle remaining row
        for (; m_i < m_block_end; m_i++) {
          __m512bh ma0 = (__m512bh)_mm512_set1_epi32(a32[m_i * 16 + k_i]);
          __m512bh ma1 = (__m512bh)_mm512_set1_epi32(a32[m_i * 16 + k_i + 1]);
          __m512bh ma2 = (__m512bh)_mm512_set1_epi32(a32[m_i * 16 + k_i + 2]);
          __m512bh ma3 = (__m512bh)_mm512_set1_epi32(a32[m_i * 16 + k_i + 3]);

          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma0, b0_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma0, b0_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma1, b1_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma1, b1_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma2, b2_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma2, b2_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma3, b3_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma3, b3_hi);
        }
      }
    }
  }

  // AMX kernel for BF16: process entire K_BLOCK using AMX tiles
  static void amx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, float* c, BufferA* ba,
                         BufferB* bb) {
    if (k_block_begin == 0) {
      clean_c();
    } else {
      load_c(c, N_STEP * sizeof(float));
    }

    for (int k_begin = 0; k_begin < K_BLOCK && k_block_begin + k_begin < k; k_begin += K_STEP) {
      load_a(ba->get_submat(m, k, m_begin, k_block_begin + k_begin), K_STEP * sizeof(ggml_bf16_t));
      load_b(bb->get_submat(n, k, n_begin, k_block_begin + k_begin), K_STEP * sizeof(ggml_bf16_t));
      run_tile();
    }

    store_c(c, N_STEP * sizeof(float));
  }
};

// FP8 (e4m3) AMX kernel that mirrors the GemmKernel224BF16 interface.
struct GemmKernel224FP8 {
  using fp8_t = uint8_t;
  using output_t = float;

  static constexpr double ELEMENT_SIZE = 1.0;
  static const int TILE_M = 16;
  static const int TILE_K = 32;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 2;

  static const int M_STEP = TILE_M * 2;
  static const int N_STEP = TILE_N * 2;
  static const int K_STEP = TILE_K;

  static inline const int BLOCK_SIZE = 128;  // 128 x 128 block quantization
  static inline const int N_BLOCK = 128;
  static inline const int K_BLOCK = 7168;

  static std::string name() { return "FP8"; }

  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {}

  // FP8->BF16 conversion lookup tables (public for reuse by GemmKernel224FP8PerChannel)
  alignas(64) static constexpr uint8_t bf16_hi_0_val[64] = {
      0x00, 0x3b, 0x3b, 0x3b, 0x3c, 0x3c, 0x3c, 0x3c, 0x3c, 0x3c, 0x3c, 0x3c, 0x3c, 0x3c, 0x3c, 0x3c,
      0x3d, 0x3d, 0x3d, 0x3d, 0x3d, 0x3d, 0x3d, 0x3d, 0x3d, 0x3d, 0x3d, 0x3d, 0x3d, 0x3d, 0x3d, 0x3d,
      0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e,
      0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
  };
  alignas(64) static constexpr uint8_t bf16_hi_1_val[64] = {
      0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
      0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41,
      0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42,
      0x43, 0x43, 0x43, 0x43, 0x43, 0x43, 0x43, 0x43, 0x43, 0x43, 0x43, 0x43, 0x43, 0x43, 0x43, 0x43,
  };
  alignas(64) static constexpr uint8_t bf16_lo_0_val[64] = {
      0x00, 0x00, 0x80, 0xc0, 0x00, 0x20, 0x40, 0x60, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0,
      0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0,
      0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0,
      0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0,
  };
  alignas(64) static constexpr uint8_t bf16_lo_1_val[64] = {
      0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0,
      0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0,
      0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0,
      0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0,
  };
  // _mm512_set1_epi8 is not constexpr; keep it as a static cached value
  alignas(64) static const __m512i sign_mask_val;
  static inline __m512i bf16_hi_0_mask() { return _mm512_load_si512((__m512i const*)bf16_hi_0_val); }
  static inline __m512i bf16_hi_1_mask() { return _mm512_load_si512((__m512i const*)bf16_hi_1_val); }
  static inline __m512i bf16_lo_0_mask() { return _mm512_load_si512((__m512i const*)bf16_lo_0_val); }
  static inline __m512i bf16_lo_1_mask() { return _mm512_load_si512((__m512i const*)bf16_lo_1_val); }
  static inline __m512i sign_mask() { return _mm512_set1_epi8(0x80); }
  using BufferA = BufferABF16Impl<GemmKernel224FP8>;
  using BufferB = BufferBFP8Impl<GemmKernel224FP8>;
  using BufferC = BufferCFP32ReduceImpl<GemmKernel224FP8>;

  static inline std::pair<__m512i, __m512i> fp8x64_to_bf16x64(__m512i bfp8_512) {
    // fp8->bf16
    __m512i b_hi = _mm512_permutex2var_epi8(bf16_hi_0_mask(), bfp8_512, bf16_hi_1_mask());
    __m512i b_lo = _mm512_permutex2var_epi8(bf16_lo_0_mask(), bfp8_512, bf16_lo_1_mask());
    b_hi = _mm512_or_si512(_mm512_and_si512(sign_mask(), bfp8_512), b_hi);
    __m512i bbf16_0 = _mm512_unpacklo_epi8(b_lo, b_hi);
    __m512i bbf16_1 = _mm512_unpackhi_epi8(b_lo, b_hi);
    return {bbf16_0, bbf16_1};
  }
  // Optimized AVX kernel: process entire k_group_size
  // Load all data first, then convert all, then compute all
  // This gives compiler more freedom to schedule instructions
  static void avx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_group_begin, float* c, BufferA* ba,
                         BufferB* bb, int k_group_size) {
    const __m512i bf16_hi_0_val = bf16_hi_0_mask();
    const __m512i bf16_hi_1_val = bf16_hi_1_mask();
    const __m512i bf16_lo_0_val = bf16_lo_0_mask();
    const __m512i bf16_lo_1_val = bf16_lo_1_mask();
    const __m512i sign_mask_val = sign_mask();

    __m512* c512 = (__m512*)c;
    int m_block_end = std::min(m - m_begin, M_STEP);

    // Zero out accumulator at the start
    for (int m_i = 0; m_i < m_block_end; m_i++) {
      c512[m_i * 2] = _mm512_setzero_ps();
      c512[m_i * 2 + 1] = _mm512_setzero_ps();
    }

    // Process entire k_group_size
    for (int k_begin = 0; k_begin < k_group_size && k_group_begin + k_begin < k; k_begin += K_STEP) {
      ggml_bf16_t* abf16 = (ggml_bf16_t*)ba->get_submat(m, k, m_begin, k_group_begin + k_begin);
      __m512i* bfp8_512 = (__m512i*)bb->get_submat(n, k, n_begin, k_group_begin + k_begin);

      for (int m_i = 0; m_i < m_block_end; m_i++) {
        // Process 2 k_i per iteration
        for (int k_i = 0; k_i < 16; k_i += 2) {
          // Load A vectors
          __m512bh ma0 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + k_i * 2]);
          __m512bh ma1 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + (k_i + 1) * 2]);

          // Load B matrices
          __m512i bfp8_0 = bfp8_512[k_i];
          __m512i bfp8_1 = bfp8_512[k_i + 1];

          // Convert FP8 -> BF16 for all
          __m512i b_hi_0 = _mm512_permutex2var_epi8(bf16_hi_0_val, bfp8_0, bf16_hi_1_val);
          __m512i b_lo_0 = _mm512_permutex2var_epi8(bf16_lo_0_val, bfp8_0, bf16_lo_1_val);
          b_hi_0 = _mm512_or_si512(_mm512_and_si512(sign_mask_val, bfp8_0), b_hi_0);

          __m512i b_hi_1 = _mm512_permutex2var_epi8(bf16_hi_0_val, bfp8_1, bf16_hi_1_val);
          __m512i b_lo_1 = _mm512_permutex2var_epi8(bf16_lo_0_val, bfp8_1, bf16_lo_1_val);
          b_hi_1 = _mm512_or_si512(_mm512_and_si512(sign_mask_val, bfp8_1), b_hi_1);

          // Compute dpbf16 for all
          __m512bh bbf16_0_0 = (__m512bh)_mm512_unpacklo_epi8(b_lo_0, b_hi_0);
          __m512bh bbf16_1_0 = (__m512bh)_mm512_unpackhi_epi8(b_lo_0, b_hi_0);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma0, bbf16_0_0);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma0, bbf16_1_0);

          __m512bh bbf16_0_1 = (__m512bh)_mm512_unpacklo_epi8(b_lo_1, b_hi_1);
          __m512bh bbf16_1_1 = (__m512bh)_mm512_unpackhi_epi8(b_lo_1, b_hi_1);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma1, bbf16_0_1);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma1, bbf16_1_1);
        }
      }
    }
  }

  // Optimized AVX kernel: process 4 k_i at once, convert B once and reuse for all m rows
  // This version achieved ~493 GB/s - restoring as baseline for further optimization
  static void avx_kernel_4(int m, int n, int k, int m_begin, int n_begin, int k_group_begin, float* c, BufferA* ba,
                           BufferB* bb, int k_group_size) {
    const __m512i bf16_hi_0 = bf16_hi_0_mask();
    const __m512i bf16_hi_1 = bf16_hi_1_mask();
    const __m512i bf16_lo_0 = bf16_lo_0_mask();
    const __m512i bf16_lo_1 = bf16_lo_1_mask();
    const __m512i sign_mask_v = sign_mask();

    __m512* c512 = (__m512*)c;
    int m_block_end = std::min(m - m_begin, M_STEP);

    // Zero out accumulator
    for (int m_i = 0; m_i < m_block_end; m_i++) {
      c512[m_i * 2] = _mm512_setzero_ps();
      c512[m_i * 2 + 1] = _mm512_setzero_ps();
    }

    // Process entire k_group_size
    for (int k_begin = 0; k_begin < k_group_size && k_group_begin + k_begin < k; k_begin += K_STEP) {
      ggml_bf16_t* abf16 = (ggml_bf16_t*)ba->get_submat(m, k, m_begin, k_group_begin + k_begin);
      __m512i* bfp8_512 = (__m512i*)bb->get_submat(n, k, n_begin, k_group_begin + k_begin);

      // Process 4 k_i at once - convert B and reuse across all m rows
      for (int k_i = 0; k_i < 16; k_i += 4) {
        // Load 4 B vectors
        __m512i bfp8_0 = bfp8_512[k_i];
        __m512i bfp8_1 = bfp8_512[k_i + 1];
        __m512i bfp8_2 = bfp8_512[k_i + 2];
        __m512i bfp8_3 = bfp8_512[k_i + 3];

        // Convert all 4 FP8 -> BF16
        __m512i b_hi, b_lo;

        b_hi = _mm512_or_si512(_mm512_and_si512(sign_mask_v, bfp8_0),
                               _mm512_permutex2var_epi8(bf16_hi_0, bfp8_0, bf16_hi_1));
        b_lo = _mm512_permutex2var_epi8(bf16_lo_0, bfp8_0, bf16_lo_1);
        __m512bh bbf16_0_lo = (__m512bh)_mm512_unpacklo_epi8(b_lo, b_hi);
        __m512bh bbf16_0_hi = (__m512bh)_mm512_unpackhi_epi8(b_lo, b_hi);

        b_hi = _mm512_or_si512(_mm512_and_si512(sign_mask_v, bfp8_1),
                               _mm512_permutex2var_epi8(bf16_hi_0, bfp8_1, bf16_hi_1));
        b_lo = _mm512_permutex2var_epi8(bf16_lo_0, bfp8_1, bf16_lo_1);
        __m512bh bbf16_1_lo = (__m512bh)_mm512_unpacklo_epi8(b_lo, b_hi);
        __m512bh bbf16_1_hi = (__m512bh)_mm512_unpackhi_epi8(b_lo, b_hi);

        b_hi = _mm512_or_si512(_mm512_and_si512(sign_mask_v, bfp8_2),
                               _mm512_permutex2var_epi8(bf16_hi_0, bfp8_2, bf16_hi_1));
        b_lo = _mm512_permutex2var_epi8(bf16_lo_0, bfp8_2, bf16_lo_1);
        __m512bh bbf16_2_lo = (__m512bh)_mm512_unpacklo_epi8(b_lo, b_hi);
        __m512bh bbf16_2_hi = (__m512bh)_mm512_unpackhi_epi8(b_lo, b_hi);

        b_hi = _mm512_or_si512(_mm512_and_si512(sign_mask_v, bfp8_3),
                               _mm512_permutex2var_epi8(bf16_hi_0, bfp8_3, bf16_hi_1));
        b_lo = _mm512_permutex2var_epi8(bf16_lo_0, bfp8_3, bf16_lo_1);
        __m512bh bbf16_3_lo = (__m512bh)_mm512_unpacklo_epi8(b_lo, b_hi);
        __m512bh bbf16_3_hi = (__m512bh)_mm512_unpackhi_epi8(b_lo, b_hi);

        // Process m rows - unroll by 2 for better ILP
        int m_i = 0;
        for (; m_i + 1 < m_block_end; m_i += 2) {
          // Load A values for 2 rows
          __m512bh ma0_0 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + k_i * 2]);
          __m512bh ma1_0 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + (k_i + 1) * 2]);
          __m512bh ma2_0 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + (k_i + 2) * 2]);
          __m512bh ma3_0 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + (k_i + 3) * 2]);
          __m512bh ma0_1 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[(m_i + 1) * K_STEP + k_i * 2]);
          __m512bh ma1_1 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[(m_i + 1) * K_STEP + (k_i + 1) * 2]);
          __m512bh ma2_1 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[(m_i + 1) * K_STEP + (k_i + 2) * 2]);
          __m512bh ma3_1 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[(m_i + 1) * K_STEP + (k_i + 3) * 2]);

          // Process row 0, then row 1 - sequential to avoid dependencies
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma0_0, bbf16_0_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma0_0, bbf16_0_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma1_0, bbf16_1_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma1_0, bbf16_1_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma2_0, bbf16_2_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma2_0, bbf16_2_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma3_0, bbf16_3_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma3_0, bbf16_3_hi);

          c512[(m_i + 1) * 2] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2], ma0_1, bbf16_0_lo);
          c512[(m_i + 1) * 2 + 1] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2 + 1], ma0_1, bbf16_0_hi);
          c512[(m_i + 1) * 2] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2], ma1_1, bbf16_1_lo);
          c512[(m_i + 1) * 2 + 1] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2 + 1], ma1_1, bbf16_1_hi);
          c512[(m_i + 1) * 2] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2], ma2_1, bbf16_2_lo);
          c512[(m_i + 1) * 2 + 1] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2 + 1], ma2_1, bbf16_2_hi);
          c512[(m_i + 1) * 2] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2], ma3_1, bbf16_3_lo);
          c512[(m_i + 1) * 2 + 1] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2 + 1], ma3_1, bbf16_3_hi);
        }
        // Handle remaining row
        for (; m_i < m_block_end; m_i++) {
          __m512bh ma0 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + k_i * 2]);
          __m512bh ma1 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + (k_i + 1) * 2]);
          __m512bh ma2 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + (k_i + 2) * 2]);
          __m512bh ma3 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + (k_i + 3) * 2]);

          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma0, bbf16_0_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma0, bbf16_0_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma1, bbf16_1_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma1, bbf16_1_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma2, bbf16_2_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma2, bbf16_2_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma3, bbf16_3_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma3, bbf16_3_hi);
        }
      }
    }
  }

  static void apply_scale_kgroup(int m, int n, int m_begin, int n_begin, int k_block_begin, float* c, float* reduce_c,
                                 BufferA* ba, BufferB* bb, int k, int k_group_size) {
    using K = GemmKernel224FP8;
    int to = std::min(m - m_begin, K::M_STEP);

    for (int i = 0; i < to; i++) {
      // Get scale for this k_group
      __m512 bs = _mm512_set1_ps(*bb->get_scale(n, n_begin, k, k_block_begin));
      __m512 now = _mm512_load_ps(reduce_c + i * K::N_STEP);
      __m512 result = _mm512_mul_ps(now, bs);
      __m512 existing = _mm512_load_ps(c + i * K::N_STEP);
      result = _mm512_add_ps(result, existing);
      _mm512_store_ps(c + i * K::N_STEP, result);

      now = _mm512_load_ps(reduce_c + i * K::N_STEP + K::TILE_N);
      result = _mm512_mul_ps(now, bs);
      existing = _mm512_load_ps(c + i * K::N_STEP + K::TILE_N);
      result = _mm512_add_ps(result, existing);
      _mm512_store_ps(c + i * K::N_STEP + K::TILE_N, result);
    }
  }
};

// all step = 32
template <typename K, bool amx_or_avx = false>
void float_mat_vec_kgroup(int m, int n, int k, int k_group_size, typename K::BufferA* ba, typename K::BufferB* bb,
                          typename K::BufferC* bc, int ith, int nth) {
  assert(n % K::N_STEP == 0);
  assert(k % k_group_size == 0);
  assert(k_group_size % K::K_STEP == 0);

  auto [n_start, n_end] = K::split_range_n(n, ith, nth);

  // Process by k_groups
  for (int k_group_begin = 0; k_group_begin < k; k_group_begin += k_group_size) {
    for (int m_begin = 0; m_begin < m; m_begin += K::M_STEP) {
      for (int n_begin = n_start; n_begin < n_end; n_begin += K::N_STEP) {
        float* c = bc->get_submat(m, n, m_begin, n_begin);
        float* reduce_c = bc->get_reduce_submat(m, n, m_begin, n_begin);

        if (k_group_begin == 0) {
          for (int i = 0; i < K::M_STEP && m_begin + i < m; i++) {
            for (int j = 0; j < K::N_STEP; j++) {
              c[i * K::N_STEP + j] = 0.0f;
            }
          }
        }

        // avx_kernel_4 now processes entire k_group_size internally (like INT8's avx_kernel)
        if constexpr (amx_or_avx && AMX_AVAILABLE) {
          for (int k_begin = k_group_begin; k_begin < std::min(k, k_group_begin + k_group_size); k_begin += K::K_STEP) {
            K::amx_kernel(m, n, k, m_begin, n_begin, k_begin, reduce_c, ba, bb, k_group_size);
          }
        } else {
          // Single call processes entire k_group
          K::avx_kernel(m, n, k, m_begin, n_begin, k_group_begin, reduce_c, ba, bb, k_group_size);
        }
        K::apply_scale_kgroup(m, n, m_begin, n_begin, k_group_begin, c, reduce_c, ba, bb, k, k_group_size);
      }
    }
  }
}

// ============================================================================
// GemmKernel224BF16 vec_mul/mat_mul
// ============================================================================

// Template function for BF16 mat_mul/vec_mul with AMX or AVX backend
template <typename K, bool amx_or_avx = true>
void float_mat_vec(int m, int n, int k, typename K::BufferA* ba, typename K::BufferB* bb, typename K::BufferC* bc,
                   int ith, int nth) {
  assert(n % K::N_STEP == 0);
  assert(k % K::K_STEP == 0);

  auto [n_start, n_end] = K::split_range_n(n, ith, nth);

  for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K::K_BLOCK) {
    for (int m_begin = 0; m_begin < m; m_begin += K::M_STEP) {
      for (int n_begin = n_start; n_begin < n_end; n_begin += K::N_STEP) {
        float* c = bc->get_submat(m, n, m_begin, n_begin);

        if constexpr (amx_or_avx && AMX_AVAILABLE) {
          K::amx_kernel(m, n, k, m_begin, n_begin, k_block_begin, c, ba, bb);
        } else {
          K::avx_kernel_4(m, n, k, m_begin, n_begin, k_block_begin, c, ba, bb);
        }
      }
    }
  }
}

inline void mat_mul(int m, int n, int k, std::shared_ptr<GemmKernel224BF16::BufferA> ba,
                    std::shared_ptr<GemmKernel224BF16::BufferB> bb, std::shared_ptr<GemmKernel224BF16::BufferC> bc,
                    int ith, int nth) {
  float_mat_vec<GemmKernel224BF16, true>(m, n, k, ba.get(), bb.get(), bc.get(), ith, nth);
}

inline void vec_mul(int m, int n, int k, std::shared_ptr<GemmKernel224BF16::BufferA> ba,
                    std::shared_ptr<GemmKernel224BF16::BufferB> bb, std::shared_ptr<GemmKernel224BF16::BufferC> bc,
                    int ith, int nth) {
  float_mat_vec<GemmKernel224BF16, false>(m, n, k, ba.get(), bb.get(), bc.get(), ith, nth);
}

inline void vec_mul_kgroup(int m, int n, int k, int k_group_size, std::shared_ptr<GemmKernel224FP8::BufferA> ba,
                           std::shared_ptr<GemmKernel224FP8::BufferB> bb, std::shared_ptr<GemmKernel224FP8::BufferC> bc,
                           int ith, int nth) {
  float_mat_vec_kgroup<GemmKernel224FP8, false>(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);
}

inline void mat_mul_kgroup(int m, int n, int k, int k_group_size, std::shared_ptr<GemmKernel224FP8::BufferA> ba,
                           std::shared_ptr<GemmKernel224FP8::BufferB> bb, std::shared_ptr<GemmKernel224FP8::BufferC> bc,
                           int ith, int nth) {
  float_mat_vec_kgroup<GemmKernel224FP8, false>(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);
}

// ============================================================================
// Per-Channel FP8 GEMM (for GLM-4.7-FP8 style quantization)
// ============================================================================

/**
 * @brief FP8 Per-Channel Kernel
 *
 * Similar to GemmKernel224FP8 but with per-channel scaling instead of block-wise scaling.
 * - Block-wise: scale shape = [n/128, k/128], one scale per 128x128 block
 * - Per-channel: scale shape = [n], one scale per output row
 */
struct GemmKernel224FP8PerChannel {
  using fp8_t = uint8_t;
  using output_t = float;

  static constexpr double ELEMENT_SIZE = 1.0;
  static const int TILE_M = 16;
  static const int TILE_K = 32;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 2;

  static const int M_STEP = TILE_M * 2;
  static const int N_STEP = TILE_N * 2;
  static const int K_STEP = TILE_K;

  // Use smaller N_BLOCK for per-channel to allow efficient scale application
  static inline const int N_BLOCK = 128;
  static inline const int K_BLOCK = 7168;

  static std::string name() { return "FP8PerChannel"; }

  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {}

  using BufferA = BufferABF16Impl<GemmKernel224FP8PerChannel>;
  using BufferB = BufferBFP8PerChannelImpl<GemmKernel224FP8PerChannel>;
  using BufferC = BufferCFP32Impl<GemmKernel224FP8PerChannel>;

  // Reuse FP8->BF16 conversion from GemmKernel224FP8
  static inline std::pair<__m512i, __m512i> fp8x64_to_bf16x64(__m512i bfp8_512) {
    return GemmKernel224FP8::fp8x64_to_bf16x64(bfp8_512);
  }

  /**
   * @brief Apply per-channel scale to result
   *
   * Unlike block-wise scaling, per-channel scaling applies a different scale to each column
   * of the result (each output channel).
   *
   * @param m Total rows
   * @param n Total columns
   * @param m_begin Starting row
   * @param n_begin Starting column
   * @param c Output buffer (M_STEP x N_STEP)
   * @param bb BufferB containing per-channel scales
   */
  static void apply_scale_perchannel(int m, [[maybe_unused]] int n, int m_begin, int n_begin, float* c, BufferB* bb) {
    int to = std::min(m - m_begin, M_STEP);

    // Load N_STEP per-channel scales (32 floats)
    __m512 bs_lo = _mm512_loadu_ps(bb->get_scale(n_begin));           // scale[n_begin..n_begin+15]
    __m512 bs_hi = _mm512_loadu_ps(bb->get_scale(n_begin + TILE_N));  // scale[n_begin+16..n_begin+31]

    for (int i = 0; i < to; i++) {
      // Each row gets multiplied by the same set of per-channel scales
      __m512 c_lo = _mm512_load_ps(c + i * N_STEP);
      __m512 c_hi = _mm512_load_ps(c + i * N_STEP + TILE_N);
      _mm512_store_ps(c + i * N_STEP, _mm512_mul_ps(c_lo, bs_lo));
      _mm512_store_ps(c + i * N_STEP + TILE_N, _mm512_mul_ps(c_hi, bs_hi));
    }
  }

  // AVX kernel for per-channel FP8 GEMM - processes entire K dimension
  static void avx_kernel_4(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, float* c, BufferA* ba,
                           BufferB* bb) {
    const __m512i bf16_hi_0 = GemmKernel224FP8::bf16_hi_0_mask();
    const __m512i bf16_hi_1 = GemmKernel224FP8::bf16_hi_1_mask();
    const __m512i bf16_lo_0 = GemmKernel224FP8::bf16_lo_0_mask();
    const __m512i bf16_lo_1 = GemmKernel224FP8::bf16_lo_1_mask();
    const __m512i sign_mask_v = GemmKernel224FP8::sign_mask();

    __m512* c512 = (__m512*)c;
    int m_block_end = std::min(m - m_begin, M_STEP);

    // Zero out accumulator at start of K_BLOCK
    if (k_block_begin == 0) {
      for (int m_i = 0; m_i < m_block_end; m_i++) {
        c512[m_i * 2] = _mm512_setzero_ps();
        c512[m_i * 2 + 1] = _mm512_setzero_ps();
      }
    }

    // Process K_BLOCK
    for (int k_begin = 0; k_begin < K_BLOCK && k_block_begin + k_begin < k; k_begin += K_STEP) {
      ggml_bf16_t* abf16 = (ggml_bf16_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
      __m512i* bfp8_512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);

      // Process 4 k_i at once
      for (int k_i = 0; k_i < 16; k_i += 4) {
        // Load 4 B vectors
        __m512i bfp8_0 = bfp8_512[k_i];
        __m512i bfp8_1 = bfp8_512[k_i + 1];
        __m512i bfp8_2 = bfp8_512[k_i + 2];
        __m512i bfp8_3 = bfp8_512[k_i + 3];

        // Convert all 4 FP8 -> BF16
        __m512i b_hi, b_lo;

        b_hi = _mm512_or_si512(_mm512_and_si512(sign_mask_v, bfp8_0),
                               _mm512_permutex2var_epi8(bf16_hi_0, bfp8_0, bf16_hi_1));
        b_lo = _mm512_permutex2var_epi8(bf16_lo_0, bfp8_0, bf16_lo_1);
        __m512bh bbf16_0_lo = (__m512bh)_mm512_unpacklo_epi8(b_lo, b_hi);
        __m512bh bbf16_0_hi = (__m512bh)_mm512_unpackhi_epi8(b_lo, b_hi);

        b_hi = _mm512_or_si512(_mm512_and_si512(sign_mask_v, bfp8_1),
                               _mm512_permutex2var_epi8(bf16_hi_0, bfp8_1, bf16_hi_1));
        b_lo = _mm512_permutex2var_epi8(bf16_lo_0, bfp8_1, bf16_lo_1);
        __m512bh bbf16_1_lo = (__m512bh)_mm512_unpacklo_epi8(b_lo, b_hi);
        __m512bh bbf16_1_hi = (__m512bh)_mm512_unpackhi_epi8(b_lo, b_hi);

        b_hi = _mm512_or_si512(_mm512_and_si512(sign_mask_v, bfp8_2),
                               _mm512_permutex2var_epi8(bf16_hi_0, bfp8_2, bf16_hi_1));
        b_lo = _mm512_permutex2var_epi8(bf16_lo_0, bfp8_2, bf16_lo_1);
        __m512bh bbf16_2_lo = (__m512bh)_mm512_unpacklo_epi8(b_lo, b_hi);
        __m512bh bbf16_2_hi = (__m512bh)_mm512_unpackhi_epi8(b_lo, b_hi);

        b_hi = _mm512_or_si512(_mm512_and_si512(sign_mask_v, bfp8_3),
                               _mm512_permutex2var_epi8(bf16_hi_0, bfp8_3, bf16_hi_1));
        b_lo = _mm512_permutex2var_epi8(bf16_lo_0, bfp8_3, bf16_lo_1);
        __m512bh bbf16_3_lo = (__m512bh)_mm512_unpacklo_epi8(b_lo, b_hi);
        __m512bh bbf16_3_hi = (__m512bh)_mm512_unpackhi_epi8(b_lo, b_hi);

        // Process m rows
        int m_i = 0;
        for (; m_i + 1 < m_block_end; m_i += 2) {
          __m512bh ma0_0 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + k_i * 2]);
          __m512bh ma1_0 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + (k_i + 1) * 2]);
          __m512bh ma2_0 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + (k_i + 2) * 2]);
          __m512bh ma3_0 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + (k_i + 3) * 2]);
          __m512bh ma0_1 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[(m_i + 1) * K_STEP + k_i * 2]);
          __m512bh ma1_1 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[(m_i + 1) * K_STEP + (k_i + 1) * 2]);
          __m512bh ma2_1 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[(m_i + 1) * K_STEP + (k_i + 2) * 2]);
          __m512bh ma3_1 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[(m_i + 1) * K_STEP + (k_i + 3) * 2]);

          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma0_0, bbf16_0_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma0_0, bbf16_0_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma1_0, bbf16_1_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma1_0, bbf16_1_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma2_0, bbf16_2_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma2_0, bbf16_2_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma3_0, bbf16_3_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma3_0, bbf16_3_hi);

          c512[(m_i + 1) * 2] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2], ma0_1, bbf16_0_lo);
          c512[(m_i + 1) * 2 + 1] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2 + 1], ma0_1, bbf16_0_hi);
          c512[(m_i + 1) * 2] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2], ma1_1, bbf16_1_lo);
          c512[(m_i + 1) * 2 + 1] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2 + 1], ma1_1, bbf16_1_hi);
          c512[(m_i + 1) * 2] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2], ma2_1, bbf16_2_lo);
          c512[(m_i + 1) * 2 + 1] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2 + 1], ma2_1, bbf16_2_hi);
          c512[(m_i + 1) * 2] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2], ma3_1, bbf16_3_lo);
          c512[(m_i + 1) * 2 + 1] = _mm512_dpbf16_ps(c512[(m_i + 1) * 2 + 1], ma3_1, bbf16_3_hi);
        }
        // Handle remaining row
        for (; m_i < m_block_end; m_i++) {
          __m512bh ma0 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + k_i * 2]);
          __m512bh ma1 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + (k_i + 1) * 2]);
          __m512bh ma2 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + (k_i + 2) * 2]);
          __m512bh ma3 = (__m512bh)_mm512_set1_epi32(*(int32_t*)&abf16[m_i * K_STEP + (k_i + 3) * 2]);

          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma0, bbf16_0_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma0, bbf16_0_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma1, bbf16_1_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma1, bbf16_1_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma2, bbf16_2_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma2, bbf16_2_hi);
          c512[m_i * 2] = _mm512_dpbf16_ps(c512[m_i * 2], ma3, bbf16_3_lo);
          c512[m_i * 2 + 1] = _mm512_dpbf16_ps(c512[m_i * 2 + 1], ma3, bbf16_3_hi);
        }
      }
    }
  }
};

/**
 * @brief Per-channel FP8 GEMM function
 *
 * Unlike block-wise FP8 which applies scale per 128x128 block during computation,
 * per-channel FP8 processes entire K dimension first, then applies per-channel scale at the end.
 */
template <typename K>
void float_mat_vec_perchannel(int m, int n, int k, typename K::BufferA* ba, typename K::BufferB* bb,
                              typename K::BufferC* bc, int ith, int nth) {
  assert(n % K::N_STEP == 0);
  assert(k % K::K_STEP == 0);

  auto [n_start, n_end] = K::split_range_n(n, ith, nth);

  for (int m_begin = 0; m_begin < m; m_begin += K::M_STEP) {
    for (int n_begin = n_start; n_begin < n_end; n_begin += K::N_STEP) {
      float* c = bc->get_submat(m, n, m_begin, n_begin);

      // Process entire K dimension with K_BLOCKs
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K::K_BLOCK) {
        K::avx_kernel_4(m, n, k, m_begin, n_begin, k_block_begin, c, ba, bb);
      }

      // Apply per-channel scale once after all K is processed
      K::apply_scale_perchannel(m, n, m_begin, n_begin, c, bb);
    }
  }
}

inline void vec_mul_perchannel(int m, int n, int k, std::shared_ptr<GemmKernel224FP8PerChannel::BufferA> ba,
                               std::shared_ptr<GemmKernel224FP8PerChannel::BufferB> bb,
                               std::shared_ptr<GemmKernel224FP8PerChannel::BufferC> bc, int ith, int nth) {
  float_mat_vec_perchannel<GemmKernel224FP8PerChannel>(m, n, k, ba.get(), bb.get(), bc.get(), ith, nth);
}

}  // namespace amx

#endif  // AMX_RAW_KERNELS_HPP


================================================
FILE: kt-kernel/operators/amx/la/amx_utils.hpp
================================================
#ifndef AMX_UTILS_HPP
#define AMX_UTILS_HPP

#include <cstdio>
#include <iostream>

#include "../../common.hpp"
#include "amx_config.hpp"

namespace amx {
#if defined(HAVE_AMX)
// Debug functions
inline void debug_tile(int t) {
  printf("Tile %d\n", t);
  int8_t data[16][64] = {};
  TileConfig::store_data(t, data, 64);
  for (int i = 0; i < 16; i++) {
    for (int j = 0; j < 64; j++) {
      printf("%4d ", data[i][j]);
    }
    printf("\n");
  }
  printf("\n");
}

inline void debug_tile_int32(int t) {
  printf("Tile %d\n", t);
  int32_t data[16][16] = {};
  TileConfig::store_data(t, data, 64);
  for (int i = 0; i < 16; i++) {
    for (int j = 0; j < 16; j++) {
      printf("%10d ", data[i][j]);
    }
    printf("\n");
  }
  printf("\n");
}

inline void debug_tiles(int to = 8) {
  for (int i = 0; i < to; i++) {
    debug_tile(i);
  }
}

inline void debug_tiles_int32(int to = 8) {
  for (int i = 0; i < to; i++) {
    debug_tile_int32(i);
  }
}

inline void debug_tiles_224() {
  for (int i = 0; i < 4; i++) {
    debug_tile(i);
  }
  for (int i = 4; i < 8; i++) {
    debug_tile_int32(i);
  }
}

inline void debug_m512(__m512 x) {
  float data[16];
  _mm512_storeu_ps(data, x);
  for (int i = 0; i < 16; i++) {
    printf("%f ", data[i]);
  }
  printf("\n");
}

inline void debug_m512i(__m512i x) {
  int32_t data[16];
  _mm512_storeu_epi32(data, x);
  for (int i = 0; i < 16; i++) {
    printf("0x%08x ", data[i]);
  }
  printf("\n");
}

inline void debug_m128i(__m128i x) {
  int32_t data[16];
  _mm_storeu_epi32(data, x);
  for (int i = 0; i < 4; i++) {
    printf("0x%08x ", data[i]);
  }
  printf("\n");
}
#endif
// transpose utils
#define SHUFFLE_EPI32(a, b, mask) \
  _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), mask))

inline void transpose_8x8_32bit(__m256i* v, __m256i* v1) {
  // unpacking and 32-bit elements
  v1[0] = _mm256_unpacklo_epi32(v[0], v[1]);
  v1[1] = _mm256_unpackhi_epi32(v[0], v[1]);
  v1[2] = _mm256_unpacklo_epi32(v[2], v[3]);
  v1[3] = _mm256_unpackhi_epi32(v[2], v[3]);
  v1[4] = _mm256_unpacklo_epi32(v[4], v[5]);
  v1[5] = _mm256_unpackhi_epi32(v[4], v[5]);
  v1[6] = _mm256_unpacklo_epi32(v[6], v[7]);
  v1[7] = _mm256_unpackhi_epi32(v[6], v[7]);

  // shuffling the 32-bit elements
  v[0] = SHUFFLE_EPI32(v1[0], v1[2], 0x44);
  v[1] = SHUFFLE_EPI32(v1[0], v1[2], 0xee);
  v[2] = SHUFFLE_EPI32(v1[4], v1[6], 0x44);
  v[3] = SHUFFLE_EPI32(v1[4], v1[6], 0xee);
  v[4] = SHUFFLE_EPI32(v1[1], v1[3], 0x44);
  v[5] = SHUFFLE_EPI32(v1[1], v1[3], 0xee);
  v[6] = SHUFFLE_EPI32(v1[5], v1[7], 0x44);
  v[7] = SHUFFLE_EPI32(v1[5], v1[7], 0xee);

  // shuffling 128-bit elements
  v1[0] = _mm256_permute2f128_si256(v[2], v[0], 0x02);
  v1[1] = _mm256_permute2f128_si256(v[3], v[1], 0x02);
  v1[2] = _mm256_permute2f128_si256(v[6], v[4], 0x02);
  v1[3] = _mm256_permute2f128_si256(v[7], v[5], 0x02);
  v1[4] = _mm256_permute2f128_si256(v[2], v[0], 0x13);
  v1[5] = _mm256_permute2f128_si256(v[3], v[1], 0x13);
  v1[6] = _mm256_permute2f128_si256(v[6], v[4], 0x13);
  v1[7] = _mm256_permute2f128_si256(v[7], v[5], 0x13);
}

inline void transpose_8x8_32bit(__m256i* v) {
  __m256i v1[8];
  transpose_8x8_32bit(v, v1);

  v[0] = v1[0];
  v[1] = v1[1];
  v[2] = v1[2];
  v[3] = v1[3];
  v[4] = v1[4];
  v[5] = v1[5];
  v[6] = v1[6];
  v[7] = v1[7];
}

inline void transpose_16x4_32bit(__m512i* r, __m512i* d) {
  static const __m512i index1 =
      _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02, 0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00);

  d[0] = _mm512_permutexvar_epi32(index1, r[0]);
  d[1] = _mm512_permutexvar_epi32(index1, r[1]);
  d[2] = _mm512_permutexvar_epi32(index1, r[2]);
  d[3] = _mm512_permutexvar_epi32(index1, r[3]);

  r[0] = _mm512_shuffle_i32x4(d[0], d[1], 0x44);
  r[1] = _mm512_shuffle_i32x4(d[0], d[1], 0xee);
  r[2] = _mm512_shuffle_i32x4(d[2], d[3], 0x44);
  r[3] = _mm512_shuffle_i32x4(d[2], d[3], 0xee);

  d[0] = _mm512_shuffle_i32x4(r[0], r[2], 0x88);
  d[1] = _mm512_shuffle_i32x4(r[0], r[2], 0xdd);
  d[2] = _mm512_shuffle_i32x4(r[1], r[3], 0x88);
  d[3] = _mm512_shuffle_i32x4(r[1], r[3], 0xdd);
}

inline void transpose_16x16_32bit(__m512i* v) {
  __m512i v1[16];
  v1[0] = _mm512_unpacklo_epi32(v[0], v[1]);
  v1[1] = _mm512_unpackhi_epi32(v[0], v[1]);
  v1[2] = _mm512_unpacklo_epi32(v[2], v[3]);
  v1[3] = _mm512_unpackhi_epi32(v[2], v[3]);
  v1[4] = _mm512_unpacklo_epi32(v[4], v[5]);
  v1[5] = _mm512_unpackhi_epi32(v[4], v[5]);
  v1[6] = _mm512_unpacklo_epi32(v[6], v[7]);
  v1[7] = _mm512_unpackhi_epi32(v[6], v[7]);
  v1[8] = _mm512_unpacklo_epi32(v[8], v[9]);
  v1[9] = _mm512_unpackhi_epi32(v[8], v[9]);
  v1[10] = _mm512_unpacklo_epi32(v[10], v[11]);
  v1[11] = _mm512_unpackhi_epi32(v[10], v[11]);
  v1[12] = _mm512_unpacklo_epi32(v[12], v[13]);
  v1[13] = _mm512_unpackhi_epi32(v[12], v[13]);
  v1[14] = _mm512_unpacklo_epi32(v[14], v[15]);
  v1[15] = _mm512_unpackhi_epi32(v[14], v[15]);

  v[0] = _mm512_unpacklo_epi64(v1[0], v1[2]);
  v[1] = _mm512_unpackhi_epi64(v1[0], v1[2]);
  v[2] = _mm512_unpacklo_epi64(v1[1], v1[3]);
  v[3] = _mm512_unpackhi_epi64(v1[1], v1[3]);
  v[4] = _mm512_unpacklo_epi64(v1[4], v1[6]);
  v[5] = _mm512_unpackhi_epi64(v1[4], v1[6]);
  v[6] = _mm512_unpacklo_epi64(v1[5], v1[7]);
  v[7] = _mm512_unpackhi_epi64(v1[5], v1[7]);
  v[8] = _mm512_unpacklo_epi64(v1[8], v1[10]);
  v[9] = _mm512_unpackhi_epi64(v1[8], v1[10]);
  v[10] = _mm512_unpacklo_epi64(v1[9], v1[11]);
  v[11] = _mm512_unpackhi_epi64(v1[9], v1[11]);
  v[12] = _mm512_unpacklo_epi64(v1[12], v1[14]);
  v[13] = _mm512_unpackhi_epi64(v1[12], v1[14]);
  v[14] = _mm512_unpacklo_epi64(v1[13], v1[15]);
  v[15] = _mm512_unpackhi_epi64(v1[13], v1[15]);

  v1[0] = _mm512_shuffle_i32x4(v[0], v[4], 0x88);
  v1[1] = _mm512_shuffle_i32x4(v[1], v[5], 0x88);
  v1[2] = _mm512_shuffle_i32x4(v[2], v[6], 0x88);
  v1[3] = _mm512_shuffle_i32x4(v[3], v[7], 0x88);
  v1[4] = _mm512_shuffle_i32x4(v[0], v[4], 0xdd);
  v1[5] = _mm512_shuffle_i32x4(v[1], v[5], 0xdd);
  v1[6] = _mm512_shuffle_i32x4(v[2], v[6], 0xdd);
  v1[7] = _mm512_shuffle_i32x4(v[3], v[7], 0xdd);
  v1[8] = _mm512_shuffle_i32x4(v[8], v[12], 0x88);
  v1[9] = _mm512_shuffle_i32x4(v[9], v[13], 0x88);
  v1[10] = _mm512_shuffle_i32x4(v[10], v[14], 0x88);
  v1[11] = _mm512_shuffle_i32x4(v[11], v[15], 0x88);
  v1[12] = _mm512_shuffle_i32x4(v[8], v[12], 0xdd);
  v1[13] = _mm512_shuffle_i32x4(v[9], v[13], 0xdd);
  v1[14] = _mm512_shuffle_i32x4(v[10], v[14], 0xdd);
  v1[15] = _mm512_shuffle_i32x4(v[11], v[15], 0xdd);

  v[0] = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88);
  v[1] = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88);
  v[2] = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88);
  v[3] = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88);
  v[4] = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88);
  v[5] = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88);
  v[6] = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88);
  v[7] = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88);
  v[8] = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd);
  v[9] = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd);
  v[10] = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd);
  v[11] = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd);
  v[12] = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd);
  v[13] = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd);
  v[14] = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd);
  v[15] = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd);
}

inline void transpose_16x8_32bit(__m256i* v) {
  transpose_8x8_32bit(v);
  transpose_8x8_32bit(v + 8);
  __m256i v1[16];
  for (int i = 0; i < 16; i++) v1[i] = v[i];

  for (int i = 0; i < 8; i++) {
    v[i * 2] = v1[i];
    v[i * 2 + 1] = v1[8 + i];
  }
}

/*
  Transpose 16x16 32-bit elements
  Note that v must be 64 byte aligned
*/
inline void transpose_16x16_32bit(__m512i* v, size_t stride) {
  assert(reinterpret_cast<intptr_t>(v) % 64 == 0 && "v must be 64 aligned");

  auto stride_v = [=](int i) { return offset_pointer(v, i * stride); };
  __m512i v1[16];

  v1[0] = _mm512_unpacklo_epi32(*stride_v(0), *stride_v(1));
  v1[1] = _mm512_unpackhi_epi32(*stride_v(0), *stride_v(1));
  v1[2] = _mm512_unpacklo_epi32(*stride_v(2), *stride_v(3));
  v1[3] = _mm512_unpackhi_epi32(*stride_v(2), *stride_v(3));
  v1[4] = _mm512_unpacklo_epi32(*stride_v(4), *stride_v(5));
  v1[5] = _mm512_unpackhi_epi32(*stride_v(4), *stride_v(5));
  v1[6] = _mm512_unpacklo_epi32(*stride_v(6), *stride_v(7));
  v1[7] = _mm512_unpackhi_epi32(*stride_v(6), *stride_v(7));
  v1[8] = _mm512_unpacklo_epi32(*stride_v(8), *stride_v(9));
  v1[9] = _mm512_unpackhi_epi32(*stride_v(8), *stride_v(9));
  v1[10] = _mm512_unpacklo_epi32(*stride_v(10), *stride_v(11));
  v1[11] = _mm512_unpackhi_epi32(*stride_v(10), *stride_v(11));
  v1[12] = _mm512_unpacklo_epi32(*stride_v(12), *stride_v(13));
  v1[13] = _mm512_unpackhi_epi32(*stride_v(12), *stride_v(13));
  v1[14] = _mm512_unpacklo_epi32(*stride_v(14), *stride_v(15));
  v1[15] = _mm512_unpackhi_epi32(*stride_v(14), *stride_v(15));

  *stride_v(0) = _mm512_unpacklo_epi64(v1[0], v1[2]);
  *stride_v(1) = _mm512_unpackhi_epi64(v1[0], v1[2]);
  *stride_v(2) = _mm512_unpacklo_epi64(v1[1], v1[3]);
  *stride_v(3) = _mm512_unpackhi_epi64(v1[1], v1[3]);
  *stride_v(4) = _mm512_unpacklo_epi64(v1[4], v1[6]);
  *stride_v(5) = _mm512_unpackhi_epi64(v1[4], v1[6]);
  *stride_v(6) = _mm512_unpacklo_epi64(v1[5], v1[7]);
  *stride_v(7) = _mm512_unpackhi_epi64(v1[5], v1[7]);
  *stride_v(8) = _mm512_unpacklo_epi64(v1[8], v1[10]);
  *stride_v(9) = _mm512_unpackhi_epi64(v1[8], v1[10]);
  *stride_v(10) = _mm512_unpacklo_epi64(v1[9], v1[11]);
  *stride_v(11) = _mm512_unpackhi_epi64(v1[9], v1[11]);
  *stride_v(12) = _mm512_unpacklo_epi64(v1[12], v1[14]);
  *stride_v(13) = _mm512_unpackhi_epi64(v1[12], v1[14]);
  *stride_v(14) = _mm512_unpacklo_epi64(v1[13], v1[15]);
  *stride_v(15) = _mm512_unpackhi_epi64(v1[13], v1[15]);

  v1[0] = _mm512_shuffle_i32x4(*stride_v(0), *stride_v(4), 0x88);
  v1[1] = _mm512_shuffle_i32x4(*stride_v(1), *stride_v(5), 0x88);
  v1[2] = _mm512_shuffle_i32x4(*stride_v(2), *stride_v(6), 0x88);
  v1[3] = _mm512_shuffle_i32x4(*stride_v(3), *stride_v(7), 0x88);
  v1[4] = _mm512_shuffle_i32x4(*stride_v(0), *stride_v(4), 0xdd);
  v1[5] = _mm512_shuffle_i32x4(*stride_v(1), *stride_v(5), 0xdd);
  v1[6] = _mm512_shuffle_i32x4(*stride_v(2), *stride_v(6), 0xdd);
  v1[7] = _mm512_shuffle_i32x4(*stride_v(3), *stride_v(7), 0xdd);
  v1[8] = _mm512_shuffle_i32x4(*stride_v(8), *stride_v(12), 0x88);
  v1[9] = _mm512_shuffle_i32x4(*stride_v(9), *stride_v(13), 0x88);
  v1[10] = _mm512_shuffle_i32x4(*stride_v(10), *stride_v(14), 0x88);
  v1[11] = _mm512_shuffle_i32x4(*stride_v(11), *stride_v(15), 0x88);
  v1[12] = _mm512_shuffle_i32x4(*stride_v(8), *stride_v(12), 0xdd);
  v1[13] = _mm512_shuffle_i32x4(*stride_v(9), *stride_v(13), 0xdd);
  v1[14] = _mm512_shuffle_i32x4(*stride_v(10), *stride_v(14), 0xdd);
  v1[15] = _mm512_shuffle_i32x4(*stride_v(11), *stride_v(15), 0xdd);

  *stride_v(0) = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88);
  *stride_v(1) = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88);
  *stride_v(2) = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88);
  *stride_v(3) = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88);
  *stride_v(4) = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88);
  *stride_v(5) = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88);
  *stride_v(6) = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88);
  *stride_v(7) = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88);
  *stride_v(8) = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd);
  *stride_v(9) = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd);
  *stride_v(10) = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd);
  *stride_v(11) = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd);
  *stride_v(12) = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd);
  *stride_v(13) = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd);
  *stride_v(14) = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd);
  *stride_v(15) = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd);
}

}  // namespace amx

#endif  // AMX_UTILS_HPP

================================================
FILE: kt-kernel/operators/amx/la/pack.hpp
================================================
#ifndef PACK_HPP
#define PACK_HPP

#pragma once
#include <cassert>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

class Packed2DLayout {
 public:
  using index_t = std::size_t;

  struct Dim {
    index_t size;  // > 0
    char dir;      // 'r' or 'c'
  };

  // 构造：dims 必须按从低维到高维给出
  explicit Packed2DLayout(std::vector<Dim> dims) : dims_(std::move(dims)) {
    if (dims_.empty()) throw std::invalid_argument("dims must not be empty");
    rows_ = 1;
    cols_ = 1;

    // 预计算行/列 stride（混合进位权重）
    r_stride_for_dim_.assign(dims_.size(), 0);
    c_stride_for_dim_.assign(dims_.size(), 0);

    index_t r_stride = 1, c_stride = 1;
    for (index_t i = 0; i < dims_.size(); ++i) {
      const auto& d = dims_[i];
      if (d.size == 0) throw std::invalid_argument("dim size must be > 0");
      if (d.dir == 'r') {
        r_stride_for_dim_[i] = r_stride;
        r_stride *= d.size;
        rows_ *= d.size;
      } else if (d.dir == 'c') {
        c_stride_for_dim_[i] = c_stride;
        c_stride *= d.size;
        cols_ *= d.size;
      } else {
        throw std::invalid_argument("dim dir must be 'r' or 'c'");
      }
    }
    numel_ = rows_ * cols_;
  }

  // 基本信息
  index_t dims() const { return static_cast<index_t>(dims_.size()); }
  index_t rows() const { return rows_; }
  index_t cols() const { return cols_; }
  index_t numel() const { return numel_; }
  const std::vector<Dim>& spec() const { return dims_; }
  const std::vector<index_t>& r_strides() const { return r_stride_for_dim_; }
  const std::vector<index_t>& c_strides() const { return c_stride_for_dim_; }

  // ---------- 高维坐标 <-> 2D ----------
  std::pair<index_t, index_t> hd_to_rc(const std::vector<index_t>& hd_idx) const {
    check_hd_index(hd_idx);
    index_t row = 0, col = 0;
    for (index_t i = 0; i < dims(); ++i) {
      const auto& d = dims_[i];
      auto v = hd_idx[i];
      if (v >= d.size) throw std::out_of_range(err_dim(i, v, d.size));
      if (d.dir == 'r')
        row += v * r_stride_for_dim_[i];
      else
        col += v * c_stride_for_dim_[i];
    }
    return {row, col};
  }

  std::vector<index_t> rc_to_hd(index_t row, index_t col) const {
    if (row >= rows_ || col >= cols_)
      throw std::out_of_range("rc out of range: (" + std::to_string(row) + "," + std::to_string(col) +
                              "), expect rows<" + std::to_string(rows_) + ", cols<" + std::to_string(cols_) + ")");
    std::vector<index_t> hd_idx(dims(), 0);
    for (index_t i = 0; i < dims(); ++i) {
      const auto& d = dims_[i];
      if (d.dir == 'r') {
        auto stride = r_stride_for_dim_[i];
        hd_idx[i] = (row / stride) % d.size;
      } else {
        auto stride = c_stride_for_dim_[i];
        hd_idx[i] = (col / stride) % d.size;
      }
    }
    return hd_idx;
  }

  // ---------- 2D <-> offset（行主序），支持自定义 ld ----------
  index_t rc_to_offset(index_t row, index_t col, index_t ld = 0) const {
    if (ld == 0) ld = cols_;
    if (row >= rows_ || col >= cols_) throw std::out_of_range("rc out of range for rc_to_offset");
    return row * ld + col;
  }

  std::pair<index_t, index_t> offset_to_rc(index_t offset, index_t ld = 0) const {
    if (ld == 0) ld = cols_;
    index_t row = offset / ld;
    index_t col = offset % ld;
    if (row >= rows_ || col >= cols_) throw std::out_of_range("offset out of range for given ld");
    return {row, col};
  }

  // ---------- 高维坐标 <-> offset（组合/分解） ----------
  index_t hd_to_offset(const std::vector<index_t>& hd_idx, index_t ld = 0) const {
    auto [r, c] = hd_to_rc(hd_idx);
    return rc_to_offset(r, c, ld);
  }

  std::vector<index_t> offset_to_hd(index_t offset, index_t ld = 0) const {
    auto [r, c] = offset_to_rc(offset, ld);
    return rc_to_hd(r, c);
  }

  // ---------- 工具：把某一组 r/c 维做“混合进位”分解/合成 ----------
  // 给定行坐标 row，分解到所有 'r' 维的 digits（低维在前）
  std::vector<index_t> decompose_row(index_t row) const {
    if (row >= rows_) throw std::out_of_range("row out of range in decompose_row");
    std::vector<index_t> res(dims(), 0);
    for (index_t i = 0; i < dims(); ++i) {
      if (dims_[i].dir == 'r') {
        auto stride = r_stride_for_dim_[i];
        res[i] = (row / stride) % dims_[i].size;
      }
    }
    return res;  // 只有 'r' 维位置含有有效 digit
  }
  // 给定列坐标 col，分解到所有 'c' 维的 digits（低维在前）
  std::vector<index_t> decompose_col(index_t col) const {
    if (col >= cols_) throw std::out_of_range("col out of range in decompose_col");
    std::vector<index_t> res(dims(), 0);
    for (index_t i = 0; i < dims(); ++i) {
      if (dims_[i].dir == 'c') {
        auto stride = c_stride_for_dim_[i];
        res[i] = (col / stride) % dims_[i].size;
      }
    }
    return res;  // 只有 'c' 维位置含有有效 digit
  }
  // 合成行坐标（仅读取 'r' 维的位置）
  index_t compose_row(const std::vector<index_t>& digits) const {
    if (digits.size() != dims()) throw std::invalid_argument("digits dim mismatch");
    index_t row = 0;
    for (index_t i = 0; i < dims(); ++i)
      if (dims_[i].dir == 'r') {
        if (digits[i] >= dims_[i].size) throw std::out_of_range(err_dim(i, digits[i], dims_[i].size));
        row += digits[i] * r_stride_for_dim_[i];
      }
    return row;
  }
  // 合成列坐标（仅读取 'c' 维的位置）
  index_t compose_col(const std::vector<index_t>& digits) const {
    if (digits.size() != dims()) throw std::invalid_argument("digits dim mismatch");
    index_t col = 0;
    for (index_t i = 0; i < dims(); ++i)
      if (dims_[i].dir == 'c') {
        if (digits[i] >= dims_[i].size) throw std::out_of_range(err_dim(i, digits[i], dims_[i].size));
        col += digits[i] * c_stride_for_dim_[i];
      }
    return col;
  }

 private:
  void check_hd_index(const std::vector<index_t>& hd_idx) const {
    if (hd_idx.size() != dims())
      throw std::invalid_argument("hd index dim mismatch: got " + std::to_string(hd_idx.size()) + ", expect " +
                                  std::to_string(dims()));
  }
  static std::string err_dim(index_t i, index_t v, index_t sz) {
    return "hd index out of range at dim " + std::to_string(i) + ": got " + std::to_string(v) + ", expect < " +
           std::to_string(sz);
  }

  std::vector<Dim> dims_;
  std::vector<index_t> r_stride_for_dim_;
  std::vector<index_t> c_stride_for_dim_;
  index_t rows_{1}, cols_{1}, numel_{0};
};

// ===== 示例与自测（可选） =====
// g++ -O2 test.cpp -DPACKED2D_DEMO && ./a.out
#ifdef PACKED2D_DEMO
int main() {
  // 任意数量与顺序的 r/c 维；低 -> 高
  Packed2DLayout p({
      {4, 'r'}, {8, 'c'}, {2, 'r'}, {3, 'c'}  // rows=4*2=8, cols=8*3=24, numel=192
  });

  std::cout << "rows=" << p.rows() << " cols=" << p.cols() << " numel=" << p.numel() << "\n";

  // 高维 -> rc -> offset
  std::vector<std::size_t> hd = {3, 5, 1, 2};
  auto [r, c] = p.hd_to_rc(hd);
  auto off = p.hd_to_offset(hd);
  std::cout << "hd -> rc=(" << r << "," << c << "), off=" << off << "\n";

  // 反向
  auto hd2 = p.offset_to_hd(off);
  std::cout << "offset->hd: ";
  for (auto v : hd2) std::cout << v << " ";
  std::cout << "\n";

  // 只分解/合成行、列
  auto rdigits = p.decompose_row(r);
  auto cdigits = p.decompose_col(c);
  auto r2 = p.compose_row(rdigits);
  auto c2 = p.compose_col(cdigits);
  std::cout << "compose row=" << r2 << " col=" << c2 << "\n";
  return 0;
}
#endif

#endif

================================================
FILE: kt-kernel/operators/amx/la/utils.hpp
================================================
#ifndef UTILS_HPP
#define UTILS_HPP
#include <immintrin.h>

#include <cstddef>
#include <cstdint>

static inline void avx512_copy_32xbf16(__m512i* src, __m512i* dst) {
  _mm512_storeu_si512(dst, _mm512_loadu_si512(src));
}

// FP32 to BF16 conversion (32 floats -> 32 bf16)
// This requires AVX512BF16 for the fast path, with a fallback for CPUs without it
static inline void avx512_32xfp32_to_32xbf16(__m512* src0, __m512* src1, __m512i* dst) {
#if defined(HAVE_AVX512BF16) || defined(__AVX512BF16__)
  // Fast path: use native AVX512BF16 instruction
  _mm512_storeu_si512(dst, __m512i(_mm512_cvtne2ps_pbh(*src1, *src0)));
#else
  // Fallback: manual BF16 conversion using bit manipulation
  // BF16 is the upper 16 bits of FP32 (with rounding)
  __m512i i0 = _mm512_castps_si512(*src0);
  __m512i i1 = _mm512_castps_si512(*src1);

  // Round to nearest even: add 0x7FFF + ((val >> 16) & 1)
  __m512i round0 =
      _mm512_add_epi32(_mm512_set1_epi32(0x7FFF), _mm512_and_epi32(_mm512_srli_epi32(i0, 16), _mm512_set1_epi32(1)));
  __m512i round1 =
      _mm512_add_epi32(_mm512_set1_epi32(0x7FFF), _mm512_and_epi32(_mm512_srli_epi32(i1, 16), _mm512_set1_epi32(1)));

  i0 = _mm512_add_epi32(i0, round0);
  i1 = _mm512_add_epi32(i1, round1);

  // Extract upper 16 bits (BF16)
  i0 = _mm512_srli_epi32(i0, 16);
  i1 = _mm512_srli_epi32(i1, 16);

  // Pack 32-bit values to 16-bit
  __m512i result = _mm512_packus_epi32(i0, i1);
  // Fix the interleaving from packus
  result = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), result);

  _mm512_storeu_si512(dst, result);
#endif
}

// BF16 to FP32 conversion (32 bf16 -> 32 floats)
// This does NOT require AVX512BF16 - uses basic AVX512 bit manipulation
static inline void avx512_32xbf16_to_32xfp32(__m512i* src, __m512* dst0, __m512* dst1) {
  _mm512_storeu_ps(dst0, _mm512_castsi512_ps(
                             _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)(src))), 16)));
  _mm512_storeu_ps(dst1, _mm512_castsi512_ps(_mm512_slli_epi32(
                             _mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)(src) + 1)), 16)));
}

static inline __m512 vector_abs_max(__m512 a, __m512 b) {
  __m512 a_abs = _mm512_abs_ps(a);
  __m512 b_abs = _mm512_abs_ps(b);

  __mmask16 mask = _mm512_cmp_ps_mask(a_abs, b_abs, _CMP_GT_OS);

  return _mm512_mask_blend_ps(mask, b_abs, a_abs);
}

#endif  // UTILS_HPP

================================================
FILE: kt-kernel/operators/amx/moe.hpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:22
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:35:10
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_AMX_MOE_H
#define CPUINFER_OPERATOR_AMX_MOE_H

// #define CHECK
// #define FORWARD_TIME_PROFILE
// #define FORWARD_TIME_REPORT

#include "moe_base.hpp"

template <class T>
class AMX_MOE_TP : public AMX_MOE_BASE<T, AMX_MOE_TP<T>> {
 private:
  using Base = AMX_MOE_BASE<T, AMX_MOE_TP<T>>;
  using Base::config_;
  using Base::down_ba_;
  using Base::down_bb_;
  using Base::down_bc_;
  using Base::gate_bb_;
  using Base::gate_bc_;
  using Base::gate_up_ba_;
  using Base::m_local_num_;
  using Base::tp_part_idx;
  using Base::up_bb_;
  using Base::up_bc_;

#ifdef CHECK
  char verify_bb[100000000];
  char check_bb[100000000];
  uint8_t compare_expers = 3;
#endif

  inline void write_weights(std::filesystem::path prefix, std::string mat_class, char* bb, int expert_idx, size_t size,
                            size_t scale_size) {
    // printf("expert %d, size %ld, scale size %ld\n", expert_idx, size, scale_size);
    // std::ofstream of(prefix / (T::name() + mat_class + std::to_string(expert_idx)  + "_quant_" + ".kt"));
    std::ofstream of(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" +
                               std::to_string(size - scale_size) + "Byte" + "_quant_" + ".kt"));
    if (of.is_open() == false) {
      printf("no such file: %s", (prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" +
                                            std::to_string(size - scale_size) + "Byte" + "_quant_" + ".kt"))
                                     .c_str());
      // throw std::runtime_error("No such file");
    }
    of.write((char*)bb, size - scale_size);
    of.close();
    // of.open(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_scale_" + ".kt"));
    of.open(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" + std::to_string(scale_size) + "Byte" +
                      "_scale_" + ".kt"));
    if (of.is_open() == false) {
      printf("no such file\n");
      // throw std::runtime_error("No such file");
    }
    of.write(((char*)bb) + size - scale_size, scale_size);
  }

  inline void read_weights(std::filesystem::path prefix, std::string mat_class, char* bb, int expert_idx, size_t size,
                           size_t scale_size, uint8_t mat_split, uint8_t mat_split_idex) {
    // std::ifstream f(prefix / (T::name() + mat_class + std::to_string(expert_idx)  + "_quant_" + ".kt"));
    std::ifstream f(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" +
                              std::to_string(size - scale_size) + "Byte" + "_quant_" + ".kt"));
    if (f.is_open() == false) {
      printf("no such file: %s\n", (prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" +
                                              std::to_string(size - scale_size) + "Byte" + "_quant_" + ".kt"))
                                       .c_str());
      // throw std::runtime_error("No such file");
    }
    f.seekg(mat_split_idex * (size - scale_size) / mat_split);
    f.read(((char*)bb) + mat_split_idex * (size - scale_size) / mat_split, (size - scale_size) / mat_split);
    f.close();
    // f.open(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_scale_" + ".kt"));
    f.open(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" + std::to_string(scale_size) + "Byte" +
                     "_scale_" + ".kt"));
    if (f.is_open() == false) {
      printf("no such file: %s\n", (prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" +
                                              std::to_string(scale_size) + "Byte" + "_scale_" + ".kt"))
                                       .c_str());
      // throw std::runtime_error("No such file");
    }
    f.seekg(mat_split_idex * scale_size / mat_split);
    f.read((((char*)bb) + size - scale_size) + mat_split_idex * scale_size / mat_split, scale_size / mat_split);
  }
#ifdef CHECK
  inline void load_check() {
    memcpy(check_bb, (char*)down_bb_[compare_expers]->b,
           T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
  }

  void verify_load_right() {
    // printf("varify down bb_0 %d\n", tp_part_idx);
    memcpy(verify_bb, (char*)down_bb_[compare_expers]->b,
           T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
    // check if verify_bb_0 equal to check_bb_0
    if (memcmp(verify_bb, check_bb, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size)) != 0) {
      printf("verify error\n");
      for (size_t i = 0; i < T::BufferB::required_size(config_.hidden_size, config_.intermediate_size); ++i) {
        if (verify_bb[i] != check_bb[i]) {
          printf("Difference at byte %zu: verify_bb_%d[%zu] = %02x, check_bb[%zu] = %02x\n", i, compare_expers, i,
                 (unsigned char)verify_bb[i], i, (unsigned char)check_bb[i]);
          break;  // find the first difference and exit
        }
      }
      assert(0);
    } else {
      printf("pass verify\n");
      // pick out the 100th~150th byte of scale to see
      printf("numa %d, verify_bb_%d:\n", tp_part_idx, compare_expers);
      size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size);
      size_t scale_size = config_.hidden_size * sizeof(float);
      for (size_t i = size - scale_size; i < size - scale_size + 50; ++i) {
        printf("%02x ", (unsigned char)verify_bb[i]);
      }
      printf("\n");
    }
  }
#endif

#ifdef FORWARD_TIME_REPORT
  std::chrono::time_point<std::chrono::high_resolution_clock> last_now;
#endif

 public:
  AMX_MOE_TP() = default;

  AMX_MOE_TP(GeneralMOEConfig config, int tp_part_idx = 0) : Base(config, tp_part_idx) {
    // Initialization now happens in derived_init() which is called by base constructor
  }

  void derived_init() {
    printf("Creating AMX_MOE_TP %d at numa %d\n", tp_part_idx, numa_node_of_cpu(sched_getcpu()));
    auto& load = config_.load;
    auto& save = config_.save;

    std::filesystem::path prefix = config_.path;
    prefix = prefix / ("_layer_" + std::to_string(config_.layer_idx)) / ("_numa_" + std::to_string(tp_part_idx));
    if (save) {
      std::cout << "Creating " << prefix << std::endl;
      std::filesystem::create_directories(prefix);
    }
    if (load) {
      if (std::filesystem::exists(prefix)) {
        std::cout << "Loading from " << prefix << std::endl;
      } else {
        throw std::runtime_error("Path not found: " + prefix.string());
      }
    }
  }

  ~AMX_MOE_TP() = default;

  // ============================================================================
  // CRTP buffer creation - no group_size
  // ============================================================================

  size_t buffer_a_required_size_impl(size_t m, size_t k) const { return T::BufferA::required_size(m, k); }
  size_t buffer_b_required_size_impl(size_t n, size_t k) const { return T::BufferB::required_size(n, k); }
  size_t buffer_c_required_size_impl(size_t m, size_t n) const { return T::BufferC::required_size(m, n); }

  std::shared_ptr<typename T::BufferA> make_buffer_a_impl(size_t m, size_t k, void* data) const {
    return std::make_shared<typename T::BufferA>(m, k, data);
  }
  std::shared_ptr<typename T::BufferB> make_buffer_b_impl(size_t n, size_t k, void* data) const {
    return std::make_shared<typename T::BufferB>(n, k, data);
  }
  std::shared_ptr<typename T::BufferC> make_buffer_c_impl(size_t m, size_t n, void* data) const {
    return std::make_shared<typename T::BufferC>(m, n, data);
  }

  // ============================================================================
  // CRTP virtual points - GEMM dispatch
  // ============================================================================

  void do_gate_up_gemm(bool do_up, int expert_idx, int ith, int nth, int qlen) {
    int m = m_local_num_[expert_idx];
    auto& ba = gate_up_ba_[expert_idx];
    auto& bb = do_up ? up_bb_[expert_idx] : gate_bb_[expert_idx];
    auto& bc = do_up ? up_bc_[expert_idx] : gate_bc_[expert_idx];

    if (qlen > 4 * config_.expert_num / config_.num_experts_per_tok) {
      amx::mat_mul(m, config_.intermediate_size, config_.hidden_size, ba, bb, bc, ith, nth);
    } else {
      amx::vec_mul(m, config_.intermediate_size, config_.hidden_size, ba, bb, bc, ith, nth);
    }
  }

  void do_down_gemm(int expert_idx, int ith, int nth, int qlen) {
    int m = m_local_num_[expert_idx];
    auto& ba = down_ba_[expert_idx];
    auto& bb = down_bb_[expert_idx];
    auto& bc = down_bc_[expert_idx];

    if (qlen > 4 * config_.expert_num / config_.num_experts_per_tok) {
      amx::mat_mul(m, config_.hidden_size, config_.intermediate_size, ba, bb, bc, ith, nth);
    } else {
      amx::vec_mul(m, config_.hidden_size, config_.intermediate_size, ba, bb, bc, ith, nth);
    }
  }
  void load_weights() {
    auto pool = config_.pool->get_subpool(tp_part_idx);
    const uint64_t* physical_to_logical_map = (const uint64_t*)config_.physical_to_logical_map;
    if (config_.gate_projs.size()) {
      pool->do_work_stealing_job(
          config_.expert_num, nullptr,
          [this, physical_to_logical_map](int expert_id) {
            // printf("Load layer %d [%d/%d]\n", config_.layer_idx, expert_id, config_.expert_num);
            uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_id);
            {
              size_t scale_size = config_.intermediate_size * sizeof(float);
              size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size) - scale_size;

              memcpy(gate_bb_[expert_id]->b, config_.gate_projs[tp_part_idx][logical_expert_id], size);

              if constexpr (T::BufferB::SCALE) {
                memcpy(gate_bb_[expert_id]->d, config_.gate_scales[tp_part_idx][logical_expert_id], scale_size);
              }

              memcpy(up_bb_[expert_id]->b, config_.up_projs[tp_part_idx][logical_expert_id], size);

              if constexpr (T::BufferB::SCALE) {
                memcpy(up_bb_[expert_id]->d, config_.up_scales[tp_part_idx][logical_expert_id], scale_size);
              }
            }

            {
              size_t scale_size = config_.hidden_size * sizeof(float);
              size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size) - scale_size;

              memcpy(down_bb_[expert_id]->b, config_.down_projs[tp_part_idx][logical_expert_id], size);

              if constexpr (T::BufferB::SCALE) {
                memcpy(down_bb_[expert_id]->d, config_.down_scales[tp_part_idx][logical_expert_id], scale_size);
              }
            }
          },
          nullptr);

    } else {
      int nth = T::recommended_nth(config_.intermediate_size);
      static uint8_t mat_type_all = 3, mat_split = 1;
      std::filesystem::path prefix = config_.path;
      prefix = prefix / ("_layer_" + std::to_string(config_.layer_idx)) / ("_numa_" + std::to_string(tp_part_idx));

      if (config_.load) {
        std::cout << "Loading from " << prefix << std::endl;
        for (int task_id = 0; task_id < config_.expert_num * mat_type_all * mat_split; task_id++) {
          int64_t expert_idx = task_id / (mat_type_all * mat_split);
          uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
          uint8_t mat_class = (task_id % (mat_type_all * mat_split)) / mat_split;
          uint8_t mat_split_idex = task_id % mat_split;
          if (mat_class == 0) {  // the up matrix
            size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size);
            size_t scale_size = config_.intermediate_size * sizeof(float);
            read_weights(prefix, "_up_", (char*)up_bb_[expert_idx]->b, logical_expert_id, size, scale_size, mat_split,
                         mat_split_idex);
          } else if (mat_class == 1) {
            size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size);
            size_t scale_size = config_.intermediate_size * sizeof(float);
            read_weights(prefix, "_gate_", (char*)gate_bb_[expert_idx]->b, logical_expert_id, size, scale_size,
                         mat_split, mat_split_idex);
          } else {
            size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size);
            size_t scale_size = config_.hidden_size * sizeof(float);
            read_weights(prefix, "_down_", (char*)down_bb_[expert_idx]->b, logical_expert_id, size, scale_size,
                         mat_split, mat_split_idex);
          }
        }
      }
// check process, store down matrix to check
#ifdef CHECK
      load_check();
#endif
#ifndef CHECK
      else
#endif
      {
        if (tp_part_idx == 0) {
          std::cout << "  online quant from bf16" << std::endl;
        }
        pool->do_work_stealing_job(
            nth * config_.expert_num, nullptr,
            [this, nth, physical_to_logical_map](int task_id) {
              int64_t expert_idx = task_id / nth;
              uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
              int ith = task_id % nth;
              // gate part
              gate_bb_[logical_expert_id]->from_mat(
                  (ggml_bf16_t*)config_.gate_proj + logical_expert_id * config_.intermediate_size * config_.hidden_size,
                  ith, nth);
              // up part
              up_bb_[logical_expert_id]->from_mat(
                  (ggml_bf16_t*)config_.up_proj + logical_expert_id * config_.intermediate_size * config_.hidden_size,
                  ith, nth);
            },
            nullptr);

        nth = T::recommended_nth(config_.hidden_size);
        pool->do_work_stealing_job(
            nth * config_.expert_num, nullptr,
            [this, nth, physical_to_logical_map](int task_id) {
              int64_t expert_idx = task_id / nth;
              uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
              int ith = task_id % nth;
              // down part
              down_bb_[logical_expert_id]->from_mat(
                  (ggml_bf16_t*)config_.down_proj + logical_expert_id * config_.hidden_size * config_.intermediate_size,
                  ith, nth);
              // printf("load idown, expert %ld, ith %d, total nth %d\n", expert_idx, ith, nth);
            },
            nullptr);
      }
#ifdef CHECK
      verify_load_right();
#endif
      // save process
      if (config_.save) {
        pool->do_work_stealing_job(
            config_.expert_num * mat_type_all, nullptr,
            [this, physical_to_logical_map, prefix](int task_id) {
              int64_t expert_idx = task_id / mat_type_all;
              expert_idx = expert_map(physical_to_logical_map, expert_idx);
              uint8_t mat_class = task_id % mat_type_all;
              if (mat_class == 0) {  // the up matrix
                size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size);
                size_t scale_size = config_.intermediate_size * sizeof(float);
                write_weights(prefix, "_up_", (char*)up_bb_[expert_idx]->b, expert_idx, size, scale_size);
              } else if (mat_class == 1) {
                size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size);
                size_t scale_size = config_.intermediate_size * sizeof(float);
                write_weights(prefix, "_gate_", (char*)gate_bb_[expert_idx]->b, expert_idx, size, scale_size);
              } else if (mat_class == 2) {
                size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size);
                size_t scale_size = config_.hidden_size * sizeof(float);
                write_weights(prefix, "_down_", (char*)down_bb_[expert_idx]->b, expert_idx, size, scale_size);
              }
            },
            nullptr);
      }
    }
  }

  // forward, forward_prefill, forward_decode, warm_up are inherited from Base
};

// ============================================================================
// TP_MOE specialization for AMX_MOE_TP
// Inherits from TP_MOE<AMX_MOE_BASE<...>> to reuse merge_results implementation
// ============================================================================

template <typename K>
class TP_MOE<AMX_MOE_TP<K>> : public TP_MOE<AMX_MOE_BASE<K, AMX_MOE_TP<K>>> {
 public:
  using Base = TP_MOE<AMX_MOE_BASE<K, AMX_MOE_TP<K>>>;
  using Base::Base;

  void load_weights() override {
    auto& config = this->config;
    auto& tps = this->tps;
    auto& tp_count = this->tp_count;
    auto pool = config.pool;
    const uint64_t* physical_to_logical_map = (const uint64_t*)config.physical_to_logical_map;
    if (config.gate_projs.empty() == false) {
      printf("TP Load from loader\n");
      DO_TPS_LOAD_WEIGHTS(pool);
      this->weights_loaded = true;
    } else if (config.gate_proj != nullptr) {
      printf("From BF16\n");
      for (auto i = 0; i < tp_count; i++) {
        auto& tpc = tps[i]->config_;
        size_t gate_up_elcount = tpc.intermediate_size * tpc.hidden_size;
        tpc.gate_proj = new ggml_bf16_t[tpc.expert_num * gate_up_elcount];
        tpc.up_proj = new ggml_bf16_t[tpc.expert_num * gate_up_elcount];
        tpc.down_proj = new ggml_bf16_t[tpc.expert_num * gate_up_elcount];
        if (tps[i]->config_.load == false) {
          pool->get_subpool(i)->do_work_stealing_job(
              tpc.expert_num, nullptr,
              [&](int expert_id_) {
                size_t expert_id = expert_map(physical_to_logical_map, expert_id_);
                memcpy((ggml_bf16_t*)tpc.gate_proj + expert_id * gate_up_elcount,
                       (ggml_bf16_t*)config.gate_proj + expert_id * config.intermediate_size * config.hidden_size +
                           i * gate_up_elcount,
                       sizeof(ggml_bf16_t) * gate_up_elcount);
                memcpy((ggml_bf16_t*)tpc.up_proj + expert_id * gate_up_elcount,
                       (ggml_bf16_t*)config.up_proj + expert_id * config.intermediate_size * config.hidden_size +
                           i * gate_up_elcount,
                       sizeof(ggml_bf16_t) * gate_up_elcount);
                for (size_t col = 0; col < config.hidden_size; col++) {
                  memcpy((ggml_bf16_t*)tpc.down_proj + expert_id * tpc.hidden_size * tpc.intermediate_size +
                             col * tpc.intermediate_size,
                         (ggml_bf16_t*)config.down_proj + expert_id * config.intermediate_size * config.hidden_size +
                             col * config.intermediate_size + i * tpc.intermediate_size,
                         sizeof(ggml_bf16_t) * tpc.intermediate_size);
                }
              },
              nullptr);
        }
      }

      DO_TPS_LOAD_WEIGHTS(pool);

      for (auto i = 0; i < tp_count; i++) {
        auto& tpc = tps[i]->config_;
        delete[] (ggml_bf16_t*)(tpc.gate_proj);
        delete[] (ggml_bf16_t*)(tpc.up_proj);
        delete[] (ggml_bf16_t*)(tpc.down_proj);
      }

      this->weights_loaded = true;
    } else if (config.path != "") {
      printf("TP Load from file %s\n", config.path.c_str());
      DO_TPS_LOAD_WEIGHTS(pool);
      this->weights_loaded = true;
    } else {
      throw std::runtime_error("no weight source");
    }
  }

  // merge_results is inherited from TP_MOE<AMX_MOE_BASE<K, AMX_MOE_TP<K>>>
};

#endif


================================================
FILE: kt-kernel/operators/amx/moe_base.hpp
================================================
/**
 * @Description  : Common AMX MoE base class extracted from K2 implementation.
 * @Author       : oql, Codex and Claude
 * @Date         : 2025-12-09
 * @Version      : 0.1.0
 * @LastEditors  : oql, Codex and Claude
 * @LastEditTime : 2025-12-09
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_AMX_MOE_BASE_H
#define CPUINFER_OPERATOR_AMX_MOE_BASE_H

// #define FORWARD_TIME_PROFILE

#include <immintrin.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "../../cpu_backend/shared_mem_buffer.h"
#include "../../cpu_backend/worker_pool.h"
#include "../common.hpp"
#include "../moe-tp.hpp"
#include "la/amx.hpp"
#include "llama.cpp/ggml.h"

template <class T, class Derived>
class AMX_MOE_BASE {
 public:
  int tp_part_idx = 0;

  ggml_bf16_t* m_local_input_ = nullptr;
  ggml_bf16_t* m_local_gate_output_ = nullptr;
  ggml_bf16_t* m_local_up_output_ = nullptr;
  ggml_bf16_t* m_local_down_output_ = nullptr;

  std::vector<std::vector<int>> m_local_pos_;
  std::vector<int> m_local_num_;
  std::vector<int> m_expert_id_map_;
  std::vector<ggml_bf16_t*> m_local_input_ptr_;
  std::vector<ggml_bf16_t*> m_local_gate_output_ptr_;
  std::vector<ggml_bf16_t*> m_local_up_output_ptr_;
  std::vector<ggml_bf16_t*> m_local_down_output_ptr_;

  std::vector<std::shared_ptr<typename T::BufferA>> gate_up_ba_;
  std::vector<std::shared_ptr<typename T::BufferB>> gate_bb_;
  std::vector<std::shared_ptr<typename T::BufferC>> gate_bc_;
  std::vector<std::shared_ptr<typename T::BufferB>> up_bb_;
  std::vector<std::shared_ptr<typename T::BufferC>> up_bc_;
  std::vector<std::shared_ptr<typename T::BufferA>> down_ba_;
  std::vector<std::shared_ptr<typename T::BufferB>> down_bb_;
  std::vector<std::shared_ptr<typename T::BufferC>> down_bc_;

  size_t pool_count_ = 0;
  size_t gate_up_ba_pool_bytes_ = 0;
  size_t gate_bc_pool_bytes_ = 0;
  size_t up_bc_pool_bytes_ = 0;
  size_t down_ba_pool_bytes_ = 0;
  size_t down_bc_pool_bytes_ = 0;
  void* gate_up_ba_pool_ = nullptr;
  void* gate_bc_pool_ = nullptr;
  void* up_bc_pool_ = nullptr;
  void* down_ba_pool_ = nullptr;
  void* down_bc_pool_ = nullptr;

  GeneralMOEConfig config_;
  using input_t = ggml_bf16_t;
  using output_t = float;
  static constexpr double ELEMENT_SIZE = T::ELEMENT_SIZE;

  AMX_MOE_BASE(GeneralMOEConfig config, int tp_part_idx_) : tp_part_idx(tp_part_idx_), config_(config) {
    init();
    derived()->derived_init();
  }

  void init() {
    if (config_.load && config_.path == "") {
      config_.load = false;
    }

    MemoryRequest mem_requests;
    mem_requests.append_pointer(
        &m_local_input_, sizeof(ggml_bf16_t) * config_.num_experts_per_tok * config_.max_len * config_.hidden_size);
    mem_requests.append_pointer(&m_local_gate_output_, sizeof(ggml_bf16_t) * config_.num_experts_per_tok *
                                                           config_.max_len * config_.intermediate_size);
    mem_requests.append_pointer(&m_local_up_output_, sizeof(ggml_bf16_t) * config_.num_experts_per_tok *
                                                         config_.max_len * config_.intermediate_size);
    mem_requests.append_pointer(&m_local_down_output_, sizeof(ggml_bf16_t) * config_.num_experts_per_tok *
                                                           config_.max_len * config_.hidden_size);

    m_local_pos_.resize(config_.max_len);
    for (int i = 0; i < config_.max_len; i++) {
      m_local_pos_[i].resize(config_.num_experts_per_tok);
    }
    m_expert_id_map_.resize(config_.expert_num);
    m_local_num_.resize(config_.expert_num);
    m_local_input_ptr_.resize(config_.expert_num);
    m_local_gate_output_ptr_.resize(config_.expert_num);
    m_local_up_output_ptr_.resize(config_.expert_num);
    m_local_down_output_ptr_.resize(config_.expert_num);

    for (size_t i = 0; i < config_.expert_num; i++) {
      gate_up_ba_.push_back(make_buffer_a(config_.max_len, config_.hidden_size, nullptr));
      gate_bc_.push_back(make_buffer_c(config_.max_len, config_.intermediate_size, nullptr));
      up_bc_.push_back(make_buffer_c(config_.max_len, config_.intermediate_size, nullptr));
      down_ba_.push_back(make_buffer_a(config_.max_len, config_.intermediate_size, nullptr));
      down_bc_.push_back(make_buffer_c(config_.max_len, config_.hidden_size, nullptr));

      void* gate_bb_ptr =
          std::aligned_alloc(64, buffer_b_required_size(config_.intermediate_size, config_.hidden_size));
      gate_bb_.push_back(make_buffer_b(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));

      void* up_bb_ptr = std::aligned_alloc(64, buffer_b_required_size(config_.intermediate_size, config_.hidden_size));
      up_bb_.push_back(make_buffer_b(config_.intermediate_size, config_.hidden_size, up_bb_ptr));

      void* down_bb_ptr =
          std::aligned_alloc(64, buffer_b_required_size(config_.hidden_size, config_.intermediate_size));
      down_bb_.push_back(make_buffer_b(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
    }
    // TODO: need update to all *.hpp
    // (config_.expert_num * T::M_STEP) in pool_count_ is to ensure padding for each experts.
    pool_count_ = config_.max_len * config_.num_experts_per_tok + config_.expert_num * T::M_STEP;

    gate_up_ba_pool_bytes_ = buffer_a_required_size(pool_count_, config_.hidden_size) + pool_count_ * 64;
    gate_bc_pool_bytes_ = buffer_c_required_size(pool_count_, config_.intermediate_size) + pool_count_ * 64;
    up_bc_pool_bytes_ = buffer_c_required_size(pool_count_, config_.intermediate_size) + pool_count_ * 64;
    down_ba_pool_bytes_ = buffer_a_required_size(pool_count_, config_.intermediate_size) + pool_count_ * 64;
    down_bc_pool_bytes_ = buffer_c_required_size(pool_count_, config_.hidden_size) + pool_count_ * 64;

    mem_requests.append_pointer(&gate_up_ba_pool_, gate_up_ba_pool_bytes_);
    mem_requests.append_pointer(&gate_bc_pool_, gate_bc_pool_bytes_);
    mem_requests.append_pointer(&up_bc_pool_, up_bc_pool_bytes_);
    mem_requests.append_pointer(&down_ba_pool_, down_ba_pool_bytes_);
    mem_requests.append_pointer(&down_bc_pool_, down_bc_pool_bytes_);

    shared_mem_buffer_numa.alloc(tp_part_idx, this, mem_requests);
  }

  ~AMX_MOE_BASE() = default;

  void warm_up() {
    int qlen = config_.max_len;
    std::vector<uint8_t> input(sizeof(ggml_bf16_t) * qlen * config_.hidden_size);
    std::vector<uint8_t> output(sizeof(ggml_bf16_t) * qlen * config_.hidden_size);
    std::vector<int64_t> expert_ids(qlen * config_.num_experts_per_tok);
    std::vector<float> weights(qlen * config_.num_experts_per_tok);
    for (int i = 0; i < qlen * config_.num_experts_per_tok; i++) {
      expert_ids[i] = i % config_.expert_num;
      weights[i] = 0.01;
    }
    forward(qlen, config_.num_experts_per_tok, expert_ids.data(), weights.data(), input.data(), output.data());
  }

  void forward(int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input, void* output) {
    if (qlen > 1) {
      forward_prefill(qlen, k, expert_ids, weights, input, output);
    } else {
      forward_decode(k, expert_ids, weights, input, output);
    }
  }

  template <typename... Args>
  void load_weights(Args&&... args) {
    derived()->load_weights(std::forward<Args>(args)...);
  }

  template <typename... Args>
  void write_weights_to_buffer(Args&&... args) const {
    derived_const()->write_weights_to_buffer(std::forward<Args>(args)...);
  }

  void forward_prefill(int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input,
                       void* output) {
    auto pool = config_.pool->get_subpool(tp_part_idx);
#ifdef FORWARD_TIME_PROFILE
    auto start_time = std::chrono::high_resolution_clock::now();
    auto last = start_time;
    long prepare_time = 0, cpy_input_time = 0, q_input_time = 0, up_gate_time = 0;
    long act_time = 0, q_down_time = 0, down_time = 0, weight_time = 0;
    int max_local_num = 0;
#endif

    int activated_expert = 0;
    std::fill(m_local_num_.begin(), m_local_num_.end(), 0);
    for (int i = 0; i < qlen; i++) {
      for (int j = 0; j < k; j++) {
        if (config_.should_skip_expert(expert_ids[i * k + j])) {
          continue;
        }
        m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
      }
    }

    for (int i = 0; i < config_.expert_num; i++) {
      if (m_local_num_[i] > 0) {
#ifdef FORWARD_TIME_PROFILE
        max_local_num = std::max(max_local_num, m_local_num_[i]);
#endif
        m_expert_id_map_[activated_expert] = i;
        activated_expert++;
      }
    }

    size_t offset = 0;
    void* gate_up_ba_pool_ptr = gate_up_ba_pool_;
    void* gate_bc_pool_ptr = gate_bc_pool_;
    void* up_bc_pool_ptr = up_bc_pool_;
    void* down_ba_pool_ptr = down_ba_pool_;
    void* down_bc_pool_ptr = down_bc_pool_;
    constexpr size_t M_STEP = T::M_STEP;
    auto align64 = [](size_t v) { return (v + 63) & (~(size_t)63); };
    size_t used_pool_m = 0;
    size_t used_pool_bytes_a = 0, used_pool_bytes_bc_gate = 0, used_pool_bytes_bc_up = 0, used_pool_bytes_ba_down = 0,
           used_pool_bytes_bc_down = 0;

    for (int i = 0; i < config_.expert_num; i++) {
      m_local_input_ptr_[i] = m_local_input_ + offset * config_.hidden_size;
      m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
      m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
      m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
      offset += m_local_num_[i];

      if (m_local_num_[i] == 0) {
        continue;
      }

      size_t max_m = (m_local_num_[i] + M_STEP - 1) / M_STEP * M_STEP;
      gate_up_ba_[i]->max_m = max_m;
      gate_up_ba_[i]->set_data(gate_up_ba_pool_ptr);
      size_t ba_size = align64(buffer_a_required_size(max_m, config_.hidden_size));
      gate_up_ba_pool_ptr = (void*)((uintptr_t)gate_up_ba_pool_ptr + ba_size);

      gate_bc_[i]->max_m = max_m;
      gate_bc_[i]->set_data(gate_bc_pool_ptr);
      size_t bc_gate_size = align64(buffer_c_required_size(max_m, config_.intermediate_size));
      gate_bc_pool_ptr = (void*)((uintptr_t)gate_bc_pool_ptr + bc_gate_size);

      up_bc_[i]->max_m = max_m;
      up_bc_[i]->set_data(up_bc_pool_ptr);
      size_t bc_up_size = align64(buffer_c_required_size(max_m, config_.intermediate_size));
      up_bc_pool_ptr = (void*)((uintptr_t)up_bc_pool_ptr + bc_up_size);

      down_ba_[i]->max_m = max_m;
      down_ba_[i]->set_data(down_ba_pool_ptr);
      size_t ba_down_size = align64(buffer_a_required_size(max_m, config_.intermediate_size));
      down_ba_pool_ptr = (void*)((uintptr_t)down_ba_pool_ptr + ba_down_size);

      down_bc_[i]->max_m = max_m;
      down_bc_[i]->set_data(down_bc_pool_ptr);
      size_t bc_down_size = align64(buffer_c_required_size(max_m, config_.hidden_size));
      down_bc_pool_ptr = (void*)((uintptr_t)down_bc_pool_ptr + bc_down_size);

      used_pool_m += max_m;
      used_pool_bytes_a += ba_size;
      used_pool_bytes_bc_gate += bc_gate_size;
      used_pool_bytes_bc_up += bc_up_size;
      used_pool_bytes_ba_down += ba_down_size;
      used_pool_bytes_bc_down += bc_down_size;
    }

    assert(used_pool_m <= pool_count_);
    assert(used_pool_bytes_a <= gate_up_ba_pool_bytes_);
    assert(used_pool_bytes_bc_gate <= gate_bc_pool_bytes_);
    assert(used_pool_bytes_bc_up <= up_bc_pool_bytes_);
    assert(used_pool_bytes_ba_down <= down_ba_pool_bytes_);
    assert(used_pool_bytes_bc_down <= down_bc_pool_bytes_);

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      prepare_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    auto direct_or_pool = [&](int count, auto&& fn) {
      if (qlen < 10) {
        for (int i = 0; i < count; i++) {
          fn(i);
        }
      } else {
        pool->do_work_stealing_job(count, nullptr, fn, nullptr);
      }
    };

    direct_or_pool(qlen, [&](int i) {
      for (int j = 0; j < k; j++) {
        if (config_.should_skip_expert(expert_ids[i * k + j])) {
          continue;
        }
        memcpy(m_local_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size,
               (ggml_bf16_t*)input + i * config_.hidden_size, sizeof(ggml_bf16_t) * config_.hidden_size);
      }
    });

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      cpy_input_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    direct_or_pool(activated_expert, [this](int task_id) {
      int expert_idx = m_expert_id_map_[task_id];
      gate_up_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_input_ptr_[expert_idx], 0, 1);
    });

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      q_input_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    int nth = T::recommended_nth(config_.intermediate_size);
    pool->do_work_stealing_job(
        nth * activated_expert * 2, [](int _) { T::config(); },
        [this, nth, qlen](int task_id2) {
          int task_id = task_id2 / 2;
          bool do_up = task_id2 % 2;
          int expert_idx = m_expert_id_map_[task_id / nth];

          int ith = task_id % nth;
          derived()->do_gate_up_gemm(do_up, expert_idx, ith, nth, qlen);
          if (do_up) {
            up_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_up_output_ptr_[expert_idx], ith, nth);
          } else {
            gate_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], ith, nth);
          }
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      up_gate_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    apply_activation(activated_expert, nth, qlen);

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      act_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    pool->do_work_stealing_job(
        activated_expert, nullptr,
        [this](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          down_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], 0, 1);
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      q_down_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    nth = T::recommended_nth(config_.hidden_size);
    pool->do_work_stealing_job(
        nth * activated_expert, [](int _) { T::config(); },
        [this, nth, qlen](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
          derived()->do_down_gemm(expert_idx, ith, nth, qlen);
          down_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_down_output_ptr_[expert_idx], ith, nth);
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      down_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    pool->do_work_stealing_job(
        qlen, nullptr,
        [this, output, k, expert_ids, weights](int i) {
          for (int e = 0; e < config_.hidden_size; e += 32) {
            __m512 x0 = _mm512_setzero_ps();
            __m512 x1 = _mm512_setzero_ps();
            for (int j = 0; j < k; j++) {
              if (config_.should_skip_expert(expert_ids[i * k + j])) {
                continue;
              }
              __m512 weight = _mm512_set1_ps(weights[i * k + j]);
              __m512 down_output0, down_output1;
              avx512_32xbf16_to_32xfp32((__m512i*)(m_local_down_output_ptr_[expert_ids[i * k + j]] +
                                                   m_local_pos_[i][j] * config_.hidden_size + e),
                                        &down_output0, &down_output1);
              x0 = _mm512_fmadd_ps(down_output0, weight, x0);
              x1 = _mm512_fmadd_ps(down_output1, weight, x1);
            }
            auto f32out = (__m512*)((float*)output + i * config_.hidden_size + e);
            f32out[0] = x0;
            f32out[1] = x1;
          }
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      weight_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
    auto end_time = std::chrono::high_resolution_clock::now();
    auto forward_total_time = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    printf(
        "Profiling Results (numa[%d]): activated_expert: %d, prepare: %ld us, cpy_input: %ld us, q_input: %ld us, "
        "up_gate: %ld us, act: %ld us, q_down: %ld us, down: %ld us, weight: %ld us, total: %ld us, max_local_num: "
        "%d, qlen: %d\n",
        tp_part_idx, activated_expert, prepare_time, cpy_input_time, q_input_time, up_gate_time, act_time, q_down_time,
        down_time, weight_time, forward_total_time, max_local_num, qlen);
#endif
  }

  void forward_decode(int k, const int64_t* expert_ids, const float* weights, const void* input, void* output) {
    int qlen = 1;
    auto pool = config_.pool->get_subpool(tp_part_idx);
#ifdef FORWARD_TIME_PROFILE
    auto start_time = std::chrono::high_resolution_clock::now();
    auto last = start_time;
    long q_input_time = 0, up_gate_time = 0, act_time = 0, q_down_time = 0, down_time = 0, weight_time = 0;
#endif

    int activated_expert = 0;
    std::fill(m_local_num_.begin(), m_local_num_.end(), 0);
    for (int i = 0; i < k; i++) {
      if (config_.should_skip_expert(expert_ids[i])) {
        continue;
      }
      m_expert_id_map_[activated_expert] = expert_ids[i];
      m_local_pos_[0][i] = 0;
      m_local_num_[expert_ids[i]] = qlen;
      activated_expert++;
    }

    size_t offset = 0;
    for (int i = 0; i < activated_expert; i++) {
      auto expert_idx = m_expert_id_map_[i];
      m_local_gate_output_ptr_[expert_idx] = m_local_gate_output_ + offset * config_.intermediate_size;
      m_local_up_output_ptr_[expert_idx] = m_local_up_output_ + offset * config_.intermediate_size;
      m_local_down_output_ptr_[expert_idx] = m_local_down_output_ + offset * config_.hidden_size;
      offset += qlen;
    }

    void* gate_bc_pool_ptr = gate_bc_pool_;
    void* up_bc_pool_ptr = up_bc_pool_;
    void* down_ba_pool_ptr = down_ba_pool_;
    void* down_bc_pool_ptr = down_bc_pool_;
    constexpr size_t M_STEP = T::M_STEP;
    auto align64 = [](size_t v) { return (v + 63) & (~(size_t)63); };
    size_t used_pool_m = 0;
    size_t used_pool_bytes_bc_gate = 0, used_pool_bytes_bc_up = 0, used_pool_bytes_ba_down = 0,
           used_pool_bytes_bc_down = 0;
    for (int i = 0; i < activated_expert; i++) {
      auto expert_idx = m_expert_id_map_[i];
      size_t max_m = (qlen + M_STEP - 1) / M_STEP * M_STEP;

      gate_bc_[expert_idx]->max_m = max_m;
      gate_bc_[expert_idx]->set_data(gate_bc_pool_ptr);
      size_t bc_gate_size = align64(buffer_c_required_size(max_m, config_.intermediate_size));
      gate_bc_pool_ptr = (void*)((uintptr_t)gate_bc_pool_ptr + bc_gate_size);

      up_bc_[expert_idx]->max_m = max_m;
      up_bc_[expert_idx]->set_data(up_bc_pool_ptr);
      size_t bc_up_size = align64(buffer_c_required_size(max_m, config_.intermediate_size));
      up_bc_pool_ptr = (void*)((uintptr_t)up_bc_pool_ptr + bc_up_size);

      down_ba_[expert_idx]->max_m = max_m;
      down_ba_[expert_idx]->set_data(down_ba_pool_ptr);
      size_t ba_down_size = align64(buffer_a_required_size(max_m, config_.intermediate_size));
      down_ba_pool_ptr = (void*)((uintptr_t)down_ba_pool_ptr + ba_down_size);

      down_bc_[expert_idx]->max_m = max_m;
      down_bc_[expert_idx]->set_data(down_bc_pool_ptr);
      size_t bc_down_size = align64(buffer_c_required_size(max_m, config_.hidden_size));
      down_bc_pool_ptr = (void*)((uintptr_t)down_bc_pool_ptr + bc_down_size);

      used_pool_m += max_m;
      used_pool_bytes_bc_gate += bc_gate_size;
      used_pool_bytes_bc_up += bc_up_size;
      used_pool_bytes_ba_down += ba_down_size;
      used_pool_bytes_bc_down += bc_down_size;
    }
    assert(used_pool_m <= pool_count_);
    assert(used_pool_bytes_bc_gate <= gate_bc_pool_bytes_);
    assert(used_pool_bytes_bc_up <= up_bc_pool_bytes_);
    assert(used_pool_bytes_ba_down <= down_ba_pool_bytes_);
    assert(used_pool_bytes_bc_down <= down_bc_pool_bytes_);

    void* gate_up_ba_pool_ptr = gate_up_ba_pool_;
    for (int i = 0; i < activated_expert; i++) {
      auto expert_idx = m_expert_id_map_[i];
      size_t max_m = (qlen + M_STEP - 1) / M_STEP * M_STEP;
      gate_up_ba_[expert_idx]->max_m = max_m;
      gate_up_ba_[expert_idx]->set_data(gate_up_ba_pool_ptr);
      size_t ba_size = align64(buffer_a_required_size(max_m, config_.hidden_size));
      gate_up_ba_pool_ptr = (void*)((uintptr_t)gate_up_ba_pool_ptr + ba_size);
      gate_up_ba_[expert_idx]->from_mat(qlen, (ggml_bf16_t*)input, 0, 1);
    }

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      q_input_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    int nth = T::recommended_nth(config_.intermediate_size);
    pool->do_work_stealing_job(
        nth * activated_expert * 2, [](int _) { T::config(); },
        [this, nth, qlen](int task_id2) {
          int task_id = task_id2 / 2;
          bool do_up = task_id2 % 2;
          int expert_idx = m_expert_id_map_[task_id / nth];

          int ith = task_id % nth;
          derived()->do_gate_up_gemm(do_up, expert_idx, ith, nth, qlen);
          if (do_up) {
            up_bc_[expert_idx]->to_mat(qlen, m_local_up_output_ptr_[expert_idx], ith, nth);
          } else {
            gate_bc_[expert_idx]->to_mat(qlen, m_local_gate_output_ptr_[expert_idx], ith, nth);
          }
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      up_gate_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    apply_activation(activated_expert, nth, qlen);

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      act_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    pool->do_work_stealing_job(
        activated_expert, nullptr,
        [this, qlen](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          down_ba_[expert_idx]->from_mat(qlen, m_local_gate_output_ptr_[expert_idx], 0, 1);
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      q_down_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    nth = T::recommended_nth(config_.hidden_size);
    pool->do_work_stealing_job(
        nth * activated_expert, [](int _) { T::config(); },
        [this, nth, qlen](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
          derived()->do_down_gemm(expert_idx, ith, nth, qlen);
          down_bc_[expert_idx]->to_mat(qlen, m_local_down_output_ptr_[expert_idx], ith, nth);
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      down_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    for (int e = 0; e < config_.hidden_size; e += 32) {
      __m512 x0 = _mm512_setzero_ps();
      __m512 x1 = _mm512_setzero_ps();
      for (int j = 0; j < k; j++) {
        if (config_.should_skip_expert(expert_ids[j])) {
          continue;
        }
        __m512 weight = _mm512_set1_ps(weights[j]);
        __m512 down_output0, down_output1;
        avx512_32xbf16_to_32xfp32(
            (__m512i*)(m_local_down_output_ptr_[expert_ids[j]] + m_local_pos_[0][j] * config_.hidden_size + e),
            &down_output0, &down_output1);
        x0 = _mm512_fmadd_ps(down_output0, weight, x0);
        x1 = _mm512_fmadd_ps(down_output1, weight, x1);
      }
      auto f32out = (__m512*)((float*)output + e);
      f32out[0] = x0;
      f32out[1] = x1;
    }

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      weight_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
    auto end_time = std::chrono::high_resolution_clock::now();
    auto forward_total_time = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    printf(
        "Profiling Results (numa[%d]): activated_expert: %d, q_input: %ld us, "
        "up_gate: %ld us, act: %ld us, q_down: %ld us, down: %ld us, weight: %ld us, total: %ld us\n",
        tp_part_idx, activated_expert, q_input_time, up_gate_time, act_time, q_down_time, down_time, weight_time,
        forward_total_time);
#endif
  }

 protected:
  Derived* derived() { return static_cast<Derived*>(this); }
  const Derived* derived_const() const { return static_cast<const Derived*>(this); }

  // ============================================================================
  // Derived class initialization hook
  // Called after base class init() completes, allows derived classes to perform
  // their own initialization that depends on base class being fully initialized
  // ============================================================================
  void derived_init() {
    // Default implementation does nothing - derived classes can override
  }

  // ============================================================================
  // Virtual points for buffer creation and size calculation
  // Default implementations use group_size (for KGroup quantization like K2)
  // Derived classes (like moe.hpp) can override to not use group_size
  // ============================================================================

  size_t buffer_a_required_size(size_t m, size_t k) const { return derived_const()->buffer_a_required_size_impl(m, k); }
  size_t buffer_b_required_size(size_t n, size_t k) const { return derived_const()->buffer_b_required_size_impl(n, k); }
  size_t buffer_c_required_size(size_t m, size_t n) const { return derived_const()->buffer_c_required_size_impl(m, n); }

  std::shared_ptr<typename T::BufferA> make_buffer_a(size_t m, size_t k, void* data) const {
    return derived_const()->make_buffer_a_impl(m, k, data);
  }
  std::shared_ptr<typename T::BufferB> make_buffer_b(size_t n, size_t k, void* data) const {
    return derived_const()->make_buffer_b_impl(n, k, data);
  }
  std::shared_ptr<typename T::BufferC> make_buffer_c(size_t m, size_t n, void* data) const {
    return derived_const()->make_buffer_c_impl(m, n, data);
  }

  void apply_activation(int activated_expert, int nth, int qlen) {
    auto pool = config_.pool->get_subpool(tp_part_idx);
    auto fn = [this, nth](int task_id) {
      int expert_idx = m_expert_id_map_[task_id / nth];
      int ith = task_id % nth;
      auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
      for (int i = 0; i < m_local_num_[expert_idx]; i++) {
        ggml_bf16_t* gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
        ggml_bf16_t* up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
        for (int j = n_start; j < n_end; j += 32) {
          __m512 gate_val0, gate_val1, up_val0, up_val1;
          avx512_32xbf16_to_32xfp32((__m512i*)(gate_output_ptr + j), &gate_val0, &gate_val1);
          avx512_32xbf16_to_32xfp32((__m512i*)(up_output_ptr + j), &up_val0, &up_val1);
          __m512 result0 = amx::act_fn(gate_val0, up_val0);
          __m512 result1 = amx::act_fn(gate_val1, up_val1);
          avx512_32xfp32_to_32xbf16(&result0, &result1, (__m512i*)(gate_output_ptr + j));
        }
      }
    };

    if (activated_expert == 0) {
      return;
    }

    if (qlen < 10) {
      for (int task_id = 0; task_id < nth * activated_expert; task_id++) {
        fn(task_id);
      }
    } else {
      pool->do_work_stealing_job(nth * activated_expert, nullptr, fn, nullptr);
    }
  }
};

// ============================================================================
// TP_MOE specialization for AMX_MOE_BASE derived classes
// ============================================================================

template <class T, class Derived>
class TP_MOE<AMX_MOE_BASE<T, Derived>> : public TP_MOE_Common<AMX_MOE_BASE<T, Derived>> {
 public:
  using TP_MOE_Common<AMX_MOE_BASE<T, Derived>>::TP_MOE_Common;

  // Default load_weights implementation - can be overridden by derived TP_MOE classes
  void load_weights() override { throw std::runtime_error("Not Implemented"); }

  void write_weight_scale_to_buffer(int gpu_tp_count, int gpu_experts_num,
                                    const std::vector<uintptr_t>& w13_weight_ptrs,
                                    const std::vector<uintptr_t>& w13_scale_ptrs,
                                    const std::vector<uintptr_t>& w2_weight_ptrs,
                                    const std::vector<uintptr_t>& w2_scale_ptrs) {
    throw std::runtime_error("Not Implemented");
  }

  void merge_results(int qlen, void* output, bool incremental) override {
    auto& config = this->config;
    auto& tp_count = this->tp_count;
    auto& local_output_numa = this->local_output_numa;
    auto& tp_configs = this->tp_configs;

    auto merge_fn = [this, output, incremental, &config, &tp_count, &local_output_numa, &tp_configs](int token_nth) {
      float* merge_to = local_output_numa[0] + token_nth * tp_configs[0].hidden_size;
      if (incremental) {
        for (int e = 0; e < config.hidden_size; e += 32) {
          __m512 x0, x1;
          avx512_32xbf16_to_32xfp32((__m512i*)((ggml_bf16_t*)output + token_nth * config.hidden_size + e), &x0, &x1);
          *((__m512*)(merge_to + e)) = _mm512_add_ps(*((__m512*)(merge_to + e)), x0);
          *((__m512*)(merge_to + e + 16)) = _mm512_add_ps(*((__m512*)(merge_to + e + 16)), x1);
        }
      }
      for (int i = 1; i < tp_count; i++) {
        float* merge_from = local_output_numa[i] + token_nth * tp_configs[i].hidden_size;
        for (int e = 0; e < tp_configs[i].hidden_size; e += 16) {
          *((__m512*)(merge_to + e)) = _mm512_add_ps(*((__m512*)(merge_to + e)), *((__m512*)(merge_from + e)));
        }
      }
      for (int e = 0; e < config.hidden_size; e += 32) {
        __m512 x0 = *(__m512*)(merge_to + e);
        __m512 x1 = *(__m512*)(merge_to + e + 16);
        avx512_32xfp32_to_32xbf16(&x0, &x1, (__m512i*)((ggml_bf16_t*)output + token_nth * config.hidden_size + e));
      }
    };

    auto pool = config.pool;

    auto direct_or_pool = [&](int count, auto&& fn) {
      if (qlen < 10) {
        for (int i = 0; i < count; i++) {
          fn(i);
        }
      } else {
        pool->do_work_stealing_job(count, nullptr, fn, nullptr);
      }
    };

    direct_or_pool(qlen, merge_fn);
  }

  void merge_results(int qlen, void* output) override { merge_results(qlen, output, false); }
};

#endif  // CPUINFER_OPERATOR_AMX_MOE_BASE_H


================================================
FILE: kt-kernel/operators/amx/test/amx-bkgroup-test.cpp
================================================
#include <omp.h>

#include "../la/amx.hpp"
#define FMT_HEADER_ONLY
#include <fmt/core.h>

#include <cmath>
#include <iostream>
#include <memory>

// Test kernel configuration for k-group testing
struct TestKernelKGroupB {
  static constexpr int M_STEP = 32;
  static constexpr int K_STEP = 64;
  static constexpr int K_BLOCK = 512;
  static constexpr int N_STEP = 32;
  static constexpr int N_BLOCK = 512;
  static constexpr int TILE_N = 16;
  using dt = int8_t;

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_per_thread = (n + nth - 1) / nth;
    int n_start = ith * n_per_thread;
    int n_end = std::min(n_start + n_per_thread, n);
    return {n_start, n_end};
  }
};

void test_buffer_bkgroup_basic() {
  std::cout << "=== Testing BufferBKGroupImpl Basic Functionality ===" << std::endl;

  // Test parameters
  const int k = 2048;            // Must be multiple of K_STEP and K_BLOCK
  const int n = 1024;            // Must be multiple of TILE_N
  const int k_group_size = 128;  // Must divide K_BLOCK evenly

  std::cout << fmt::format("Parameters: k={}, n={}, k_group_size={}\n", k, n, k_group_size);

  // Calculate and allocate buffer
  size_t buffer_size = amx::BufferBKGroupImpl<TestKernelKGroupB>::required_size(k, n, k_group_size);
  void* buffer = std::aligned_alloc(64, buffer_size);
  std::memset(buffer, 0, buffer_size);

  std::cout << fmt::format("Buffer size: {} bytes\n", buffer_size);

  // Create BufferBKGroupImpl instance
  auto buf = std::make_unique<amx::BufferBKGroupImpl<TestKernelKGroupB>>(k, n, k_group_size, buffer);

  // Create test input data (bf16)
  std::vector<ggml_bf16_t> input(k * n);
  std::mt19937 gen(42);
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);

  for (int i = 0; i < k * n; i++) {
    float val = dist(gen);
    input[i] = ggml_compute_fp32_to_bf16(val);
  }

  // Test from_mat
  std::cout << "Testing from_mat..." << std::endl;
  buf->from_mat(input.data(), 0, 1);
  std::cout << "✓ from_mat completed successfully" << std::endl;

  // Test get_submat
  std::cout << "Testing get_submat..." << std::endl;
  for (int k_begin = 0; k_begin < k; k_begin += TestKernelKGroupB::K_STEP) {
    for (int n_begin = 0; n_begin < n; n_begin += TestKernelKGroupB::TILE_N) {
      int8_t* submat = buf->get_submat(k, n, k_begin, n_begin);
      if (submat == nullptr) {
        std::cerr << fmt::format("ERROR: get_submat returned null for k_begin={}, n_begin={}\n", k_begin, n_begin);
        free(buffer);
        return;
      }
    }
  }
  std::cout << "✓ get_submat tested for all valid positions" << std::endl;

  // Test get_scale
  std::cout << "Testing get_scale..." << std::endl;
  int k_group_count = k / k_group_size;
  for (int n_idx = 0; n_idx < n; n_idx++) {
    for (int kg_idx = 0; kg_idx < k_group_count; kg_idx++) {
      float* scale = buf->get_scale(n, n_idx, k, kg_idx * k_group_size);
      if (scale == nullptr) {
        std::cerr << fmt::format("ERROR: get_scale returned null for n_idx={}, k_group={}\n", n_idx, kg_idx);
        free(buffer);
        return;
      }
      // Verify scale is non-zero (should be set by from_mat)
      if (*scale == 0.0f) {
        std::cerr << fmt::format("WARNING: scale is zero for n_idx={}, k_group={}\n", n_idx, kg_idx);
      }
    }
  }
  std::cout << "✓ get_scale tested for all k-groups" << std::endl;

  // Print some scale values for verification
  std::cout << "\nSample scale values:" << std::endl;
  for (int kg = 0; kg < std::min(4, k_group_count); kg++) {
    float* scale = buf->get_scale(n, 0, k, kg * k_group_size);
    std::cout << fmt::format("  k_group[{}] (k={}): scale = {:.6f}\n", kg, kg * k_group_size, *scale);
  }

  // Clean up
  free(buffer);
  std::cout << "\n✓ All basic tests passed!" << std::endl;
}

void test_buffer_bkgroup_correctness() {
  std::cout << "\n=== Testing BufferBKGroupImpl Quantization Correctness ===" << std::endl;

  const int k = 512;
  const int n = 256;
  const int k_group_size = 128;

  size_t buffer_size = amx::BufferBKGroupImpl<TestKernelKGroupB>::required_size(k, n, k_group_size);
  void* buffer = std::aligned_alloc(64, buffer_size);

  auto buf = std::make_unique<amx::BufferBKGroupImpl<TestKernelKGroupB>>(k, n, k_group_size, buffer);

  // Create test input matrix with known patterns
  std::vector<float> original(k * n);
  std::vector<ggml_bf16_t> input(k * n);

  // Fill with different patterns for each k-group to test group-wise quantization
  for (int k_idx = 0; k_idx < k; k_idx++) {
    for (int n_idx = 0; n_idx < n; n_idx++) {
      int kg = k_idx / k_group_size;
      // Different magnitude for each k-group
      float base_val = (kg + 1) * 0.1f;
      float val = base_val * std::sin(k_idx * 0.01f + n_idx * 0.1f);
      original[k_idx * n + n_idx] = val;
      input[k_idx * n + n_idx] = ggml_compute_fp32_to_bf16(val);
    }
  }

  // Quantize
  buf->from_mat(input.data(), 0, 1);

  // Calculate quantization error statistics
  float max_error = 0.0f;
  float total_error = 0.0f;
  float avg_magnitude = 0.0f;

  for (int i = 0; i < k * n; i++) {
    avg_magnitude += std::abs(original[i]);
  }
  avg_magnitude /= (k * n);

  // Since we're using 4-bit quantization, expect higher error than int8
  // Just verify that scales are being computed correctly
  std::cout << fmt::format("Quantization Analysis:\n");
  std::cout << fmt::format("  Average magnitude: {:.6f}\n", avg_magnitude);
  std::cout << fmt::format("  Using 4-bit quantization (INT4)\n");

  // Test that different k-groups have different scales
  std::cout << "\nVerifying k-group scales are computed independently:" << std::endl;
  bool scales_differ = false;
  for (int n_idx = 0; n_idx < std::min(4, n); n_idx++) {
    float* scale0 = buf->get_scale(n, n_idx, k, 0);
    for (int kg = 1; kg < k / k_group_size; kg++) {
      float* scale_kg = buf->get_scale(n, n_idx, k, kg * k_group_size);
      if (std::abs(*scale0 - *scale_kg) > 1e-6f) {
        scales_differ = true;
        break;
      }
    }
    if (scales_differ) break;
  }

  if (scales_differ) {
    std::cout << "✓ Different k-groups have independent scales" << std::endl;
  } else {
    std::cout << "✗ Warning: All k-groups have the same scale (might be correct for uniform data)" << std::endl;
  }

  free(buffer);
}

void test_buffer_bkgroup_comparison() {
  std::cout << "\n=== Comparing BufferBInt4Impl vs BufferBKGroupImpl ===" << std::endl;

  const int k = 2048;
  const int n = 512;
  const int k_group_size = 256;

  // Create test data
  std::vector<ggml_bf16_t> input(k * n);
  std::mt19937 gen(456);
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
  for (int i = 0; i < k * n; i++) {
    input[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }

  // Test original BufferBInt4Impl
  {
    size_t buffer_size = amx::BufferBInt4Impl<TestKernelKGroupB>::required_size(k, n);
    void* buffer = std::aligned_alloc(64, buffer_size);
    auto buf_b = std::make_unique<amx::BufferBInt4Impl<TestKernelKGroupB>>(k, n, buffer);

    buf_b->from_mat(input.data(), 0, 1);

    // Print some scales
    std::cout << "BufferBInt4Impl scales (per-column):" << std::endl;
    for (int n_idx = 0; n_idx < std::min(4, n); n_idx++) {
      float* scale = buf_b->get_scale(n, n_idx);
      std::cout << fmt::format("  col[{}]: scale = {:.6f}\n", n_idx, *scale);
    }

    free(buffer);
  }

  // Test BufferBKGroupImpl
  {
    size_t buffer_size = amx::BufferBKGroupImpl<TestKernelKGroupB>::required_size(k, n, k_group_size);
    void* buffer = std::aligned_alloc(64, buffer_size);
    auto buf_kg = std::make_unique<amx::BufferBKGroupImpl<TestKernelKGroupB>>(k, n, k_group_size, buffer);

    buf_kg->from_mat(input.data(), 0, 1);

    // Print some scales
    std::cout << "\nBufferBKGroupImpl scales (per k-group):" << std::endl;
    for (int n_idx = 0; n_idx < std::min(2, n); n_idx++) {
      std::cout << fmt::format("  col[{}]:\n", n_idx);
      for (int kg = 0; kg < std::min(4, k / k_group_size); kg++) {
        float* scale = buf_kg->get_scale(n, n_idx, k, kg * k_group_size);
        std::cout << fmt::format("    k_group[{}]: scale = {:.6f}\n", kg, *scale);
      }
    }

    free(buffer);
  }

  std::cout << "\n✓ Comparison test completed" << std::endl;
}

int main(int argc, char** argv) {
  std::cout << "Starting BufferBKGroupImpl Tests\n" << std::endl;

  try {
    // Run basic functionality tests
    test_buffer_bkgroup_basic();

    // Run correctness tests
    test_buffer_bkgroup_correctness();

    // Run comparison tests
    test_buffer_bkgroup_comparison();

    std::cout << "\n=== All tests completed successfully! ===" << std::endl;
  } catch (const std::exception& e) {
    std::cerr << "Test failed with exception: " << e.what() << std::endl;
    return 1;
  }

  return 0;
}

================================================
FILE: kt-kernel/operators/amx/test/amx-c-reduce-test.cpp
================================================
#include <omp.h>

#include "../la/amx.hpp"
#define FMT_HEADER_ONLY
#include <fmt/core.h>

#include <cmath>
#include <iostream>
#include <memory>
#include <random>

// Test kernel configuration
struct TestKernelC {
  static constexpr int M_STEP = 32;
  static constexpr int K_STEP = 64;
  static constexpr int K_BLOCK = 512;
  static constexpr int N_STEP = 32;
  static constexpr int N_BLOCK = 512;
  static constexpr int TILE_N = 16;
  using dt = int8_t;

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_per_thread = (n + nth - 1) / nth;
    int n_start = ith * n_per_thread;
    int n_end = std::min(n_start + n_per_thread, n);
    return {n_start, n_end};
  }
};

void test_buffer_c_reduce_basic() {
  std::cout << "=== Testing BufferCReduceImpl Basic Functionality ===" << std::endl;

  // Test parameters
  const int max_m = 64;  // Must be multiple of M_STEP
  const int n = 512;     // Must be multiple of N_STEP

  std::cout << fmt::format("Parameters: max_m={}, n={}\n", max_m, n);

  // Calculate and allocate buffer for BufferCReduceImpl
  size_t buffer_size = amx::BufferCReduceImpl<TestKernelC>::required_size(max_m, n);
  void* buffer = std::aligned_alloc(64, buffer_size);
  std::memset(buffer, 0, buffer_size);

  std::cout << fmt::format("Buffer size: {} bytes\n", buffer_size);
  std::cout << fmt::format("  Float buffer: {} bytes\n", sizeof(float) * max_m * n);
  std::cout << fmt::format("  Int32 buffer: {} bytes\n", sizeof(int32_t) * max_m * n);

  // Create BufferCReduceImpl instance
  auto buf = std::make_unique<amx::BufferCReduceImpl<TestKernelC>>(max_m, n, buffer);

  // Test 1: Verify buffer pointers are set correctly
  std::cout << "\nTest 1: Buffer pointer verification" << std::endl;
  if (buf->c == nullptr) {
    std::cerr << "ERROR: Float buffer pointer is null" << std::endl;
    free(buffer);
    return;
  }
  if (buf->int_c == nullptr) {
    std::cerr << "ERROR: Int32 buffer pointer is null" << std::endl;
    free(buffer);
    return;
  }

  // Verify int_c starts after c
  size_t expected_offset = max_m * n;
  size_t actual_offset = buf->int_c - reinterpret_cast<int32_t*>(buf->c);
  if (actual_offset != expected_offset) {
    std::cerr << fmt::format("ERROR: int_c offset incorrect. Expected: {}, Got: {}\n", expected_offset, actual_offset)
              << std::endl;
    free(buffer);
    return;
  }
  std::cout << "✓ Buffer pointers are correctly set" << std::endl;

  // Test 2: Write to float buffer and verify
  std::cout << "\nTest 2: Float buffer write/read" << std::endl;
  std::mt19937 gen(42);
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);

  // Fill float buffer with test data
  for (int i = 0; i < max_m * n; i++) {
    buf->c[i] = dist(gen);
  }

  // Verify get_submat works
  for (int m_begin = 0; m_begin < max_m; m_begin += TestKernelC::M_STEP) {
    for (int n_begin = 0; n_begin < n; n_begin += TestKernelC::N_STEP) {
      float* submat = buf->get_submat(max_m, n, m_begin, n_begin);
      if (submat == nullptr) {
        std::cerr << fmt::format("ERROR: get_submat returned null for m_begin={}, n_begin={}\n", m_begin, n_begin)
                  << std::endl;
        free(buffer);
        return;
      }
    }
  }
  std::cout << "✓ Float buffer read/write works correctly" << std::endl;

  // Test 3: Write to int32 buffer and verify
  std::cout << "\nTest 3: Int32 buffer write/read" << std::endl;
  std::uniform_int_distribution<int32_t> int_dist(-1000, 1000);

  // Fill int32 buffer with test data
  for (int i = 0; i < max_m * n; i++) {
    buf->int_c[i] = int_dist(gen);
  }

  // Verify get_int_submat works
  for (int m_begin = 0; m_begin < max_m; m_begin += TestKernelC::M_STEP) {
    for (int n_begin = 0; n_begin < n; n_begin += TestKernelC::N_STEP) {
      int32_t* submat = buf->get_int_submat(max_m, n, m_begin, n_begin);
      if (submat == nullptr) {
        std::cerr << fmt::format("ERROR: get_int_submat returned null for m_begin={}, n_begin={}\n", m_begin, n_begin)
                  << std::endl;
        free(buffer);
        return;
      }
    }
  }
  std::cout << "✓ Int32 buffer read/write works correctly" << std::endl;

  // Test 4: Clear int buffer
  std::cout << "\nTest 4: Clear int buffer" << std::endl;
  buf->clear_int_buffer();
  bool all_zero = true;
  for (int i = 0; i < max_m * n; i++) {
    if (buf->int_c[i] != 0) {
      all_zero = false;
      break;
    }
  }
  if (!all_zero) {
    std::cerr << "ERROR: clear_int_buffer failed to zero the buffer" << std::endl;
    free(buffer);
    return;
  }
  std::cout << "✓ clear_int_buffer works correctly" << std::endl;

  // Test 5: Convert int to float
  std::cout << "\nTest 5: Convert int32 to float" << std::endl;
  // Set some test values in int buffer
  for (int i = 0; i < max_m * n; i++) {
    buf->int_c[i] = i % 100 - 50;  // Values from -50 to 49
  }

  // Convert
  buf->convert_int_to_float(max_m);

  // Verify conversion
  bool conversion_correct = true;
  for (int i = 0; i < max_m * n; i++) {
    float expected = static_cast<float>(i % 100 - 50);
    if (std::abs(buf->c[i] - expected) > 1e-6) {
      std::cerr << fmt::format("ERROR: Conversion mismatch at index {}. Expected: {}, Got: {}\n", i, expected,
                               buf->c[i])
                << std::endl;
      conversion_correct = false;
      break;
    }
  }
  if (!conversion_correct) {
    free(buffer);
    return;
  }
  std::cout << "✓ convert_int_to_float works correctly" << std::endl;

  // Test 6: to_mat functionality
  std::cout << "\nTest 6: to_mat conversion" << std::endl;
  // Fill buffer using proper blocked layout via get_submat
  for (int m_idx = 0; m_idx < max_m; m_idx += TestKernelC::M_STEP) {
    for (int n_idx = 0; n_idx < n; n_idx += TestKernelC::N_STEP) {
      float* submat = buf->get_submat(max_m, n, m_idx, n_idx);
      // Fill this submat block
      for (int i = 0; i < TestKernelC::M_STEP && m_idx + i < max_m; i++) {
        for (int j = 0; j < TestKernelC::N_STEP && n_idx + j < n; j++) {
          submat[i * TestKernelC::N_STEP + j] = (m_idx + i) * 0.1f + (n_idx + j) * 0.01f;
        }
      }
    }
  }

  // Convert to bf16
  std::vector<ggml_bf16_t> output(max_m * n);
  buf->to_mat(max_m, output.data(), 0, 1);

  // Verify some values
  bool to_mat_correct = true;
  for (int i = 0; i < std::min(10, max_m); i++) {
    for (int j = 0; j < std::min(10, n); j++) {
      float original = i * 0.1f + j * 0.01f;
      float converted = ggml_compute_bf16_to_fp32(output[i * n + j]);
      // BF16 has limited precision, allow for some error
      if (std::abs(original - converted) > 0.02f) {  // Increased tolerance for BF16
        std::cerr << fmt::format("ERROR: to_mat mismatch at ({},{}). Original: {}, Converted: {}\n", i, j, original,
                                 converted)
                  << std::endl;
        to_mat_correct = false;
        break;
      }
    }
    if (!to_mat_correct) break;
  }

  if (!to_mat_correct) {
    free(buffer);
    return;
  }
  std::cout << "✓ to_mat works correctly" << std::endl;

  // Clean up
  free(buffer);
  std::cout << "\n✓ All basic tests passed!" << std::endl;
}

void test_buffer_c_reduce_comparison() {
  std::cout << "\n=== Comparing BufferCImpl vs BufferCReduceImpl ===" << std::endl;

  const int max_m = 128;
  const int n = 1024;

  // Test original BufferCImpl
  {
    size_t buffer_size = amx::BufferCImpl<TestKernelC>::required_size(max_m, n);
    void* buffer = std::aligned_alloc(64, buffer_size);
    auto buf_c = std::make_unique<amx::BufferCImpl<TestKernelC>>(max_m, n, buffer);

    std::cout << fmt::format("BufferCImpl size: {} bytes\n", buffer_size);

    // Fill with test data
    for (int i = 0; i < max_m * n; i++) {
      buf_c->c[i] = static_cast<float>(i % 1000) / 100.0f;
    }

    // Test to_mat
    std::vector<ggml_bf16_t> output(max_m * n);
    buf_c->to_mat(max_m, output.data(), 0, 1);

    std::cout << "  Sample values from BufferCImpl:" << std::endl;
    for (int i = 0; i < 3; i++) {
      std::cout << fmt::format("    c[{}] = {:.4f}\n", i, buf_c->c[i]);
    }

    free(buffer);
  }

  // Test BufferCReduceImpl
  {
    size_t buffer_size = amx::BufferCReduceImpl<TestKernelC>::required_size(max_m, n);
    void* buffer = std::aligned_alloc(64, buffer_size);
    auto buf_cr = std::make_unique<amx::BufferCReduceImpl<TestKernelC>>(max_m, n, buffer);

    std::cout << fmt::format("\nBufferCReduceImpl size: {} bytes ({}x larger)\n", buffer_size,
                             buffer_size / (sizeof(float) * max_m * n));

    // Fill float buffer
    for (int i = 0; i < max_m * n; i++) {
      buf_cr->c[i] = static_cast<float>(i % 1000) / 100.0f;
    }

    // Fill int buffer
    for (int i = 0; i < max_m * n; i++) {
      buf_cr->int_c[i] = i % 1000;
    }

    // Test to_mat
    std::vector<ggml_bf16_t> output(max_m * n);
    buf_cr->to_mat(max_m, output.data(), 0, 1);

    std::cout << "  Sample values from BufferCReduceImpl:" << std::endl;
    for (int i = 0; i < 3; i++) {
      std::cout << fmt::format("    c[{}] = {:.4f}, int_c[{}] = {}\n", i, buf_cr->c[i], i, buf_cr->int_c[i]);
    }

    free(buffer);
  }

  std::cout << "\n✓ Comparison test completed" << std::endl;
}

void test_buffer_c_reduce_performance() {
  std::cout << "\n=== Testing BufferCReduceImpl Performance Characteristics ===" << std::endl;

  const int max_m = 256;
  const int n = 2048;
  const int iterations = 1000;

  size_t buffer_size = amx::BufferCReduceImpl<TestKernelC>::required_size(max_m, n);
  void* buffer = std::aligned_alloc(64, buffer_size);
  auto buf = std::make_unique<amx::BufferCReduceImpl<TestKernelC>>(max_m, n, buffer);

  std::cout << fmt::format("Testing with max_m={}, n={}\n", max_m, n);
  std::cout << fmt::format("Total elements: {}\n", max_m * n);
  std::cout << fmt::format("Buffer size: {:.2f} MB\n", buffer_size / (1024.0 * 1024.0));

  // Test clear_int_buffer performance
  std::cout << "\nTesting clear_int_buffer..." << std::endl;
  auto start = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < iterations; i++) {
    buf->clear_int_buffer();
  }
  auto end = std::chrono::high_resolution_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
  std::cout << fmt::format("  Average time: {:.3f} us\n", duration / (double)iterations);

  // Test convert_int_to_float performance
  std::cout << "\nTesting convert_int_to_float..." << std::endl;
  // Fill int buffer with test data
  for (int i = 0; i < max_m * n; i++) {
    buf->int_c[i] = i;
  }

  start = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < iterations; i++) {
    buf->convert_int_to_float(max_m);
  }
  end = std::chrono::high_resolution_clock::now();
  duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
  std::cout << fmt::format("  Average time: {:.3f} us\n", duration / (double)iterations);

  free(buffer);
  std::cout << "\n✓ Performance tests completed" << std::endl;
}

int main(int argc, char** argv) {
  std::cout << "Starting BufferCReduceImpl Tests\n" << std::endl;

  try {
    // Run basic functionality tests
    test_buffer_c_reduce_basic();

    // Run comparison tests
    test_buffer_c_reduce_comparison();

    // Run performance tests
    test_buffer_c_reduce_performance();

    std::cout << "\n=== All tests completed successfully! ===" << std::endl;
  } catch (const std::exception& e) {
    std::cerr << "Test failed with exception: " << e.what() << std::endl;
    return 1;
  }

  return 0;
}

================================================
FILE: kt-kernel/operators/amx/test/amx-kgroup-test.cpp
================================================
#include <omp.h>

#include "../la/amx.hpp"
#define FMT_HEADER_ONLY
#include <fmt/core.h>

#include <cmath>
#include <iostream>
#include <memory>

// Test kernel configuration for k-group testing
struct TestKernelKGroup {
  static constexpr int M_STEP = 32;
  static constexpr int K_STEP = 64;
  static constexpr int K_BLOCK = 512;
  static constexpr int N_STEP = 32;
  static constexpr int N_BLOCK = 512;
  static constexpr int TILE_N = 16;
  using dt = int8_t;

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_per_thread = (n + nth - 1) / nth;
    int n_start = ith * n_per_thread;
    int n_end = std::min(n_start + n_per_thread, n);
    return {n_start, n_end};
  }
};

void test_buffer_kgroup_basic() {
  std::cout << "=== Testing BufferAKGroupImpl Basic Functionality ===" << std::endl;

  // Test parameters
  const int max_m = 64;          // Must be multiple of M_STEP
  const int k = 2048;            // Must be multiple of K_STEP and K_BLOCK
  const int k_group_size = 128;  // Must divide K_BLOCK evenly

  std::cout << fmt::format("Parameters: max_m={}, k={}, k_group_size={}\n", max_m, k, k_group_size);

  // Calculate and allocate buffer
  size_t buffer_size = amx::BufferAKGroupImpl<TestKernelKGroup>::required_size(max_m, k, k_group_size);
  void* buffer = std::aligned_alloc(64, buffer_size);
  std::memset(buffer, 0, buffer_size);

  std::cout << fmt::format("Buffer size: {} bytes\n", buffer_size);

  // Create BufferAKGroupImpl instance
  auto buf = std::make_unique<amx::BufferAKGroupImpl<TestKernelKGroup>>(max_m, k, k_group_size, buffer);

  // Create test input data (bf16)
  std::vector<ggml_bf16_t> input(max_m * k);
  std::mt19937 gen(42);
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);

  for (int i = 0; i < max_m * k; i++) {
    float val = dist(gen);
    input[i] = ggml_compute_fp32_to_bf16(val);
  }

  // Test from_mat
  std::cout << "Testing from_mat..." << std::endl;
  buf->from_mat(max_m, input.data(), 0, 1);
  std::cout << "✓ from_mat completed successfully" << std::endl;

  // Test get_submat
  std::cout << "Testing get_submat..." << std::endl;
  for (int m_begin = 0; m_begin < max_m; m_begin += TestKernelKGroup::M_STEP) {
    for (int k_begin = 0; k_begin < k; k_begin += TestKernelKGroup::K_STEP) {
      int8_t* submat = buf->get_submat(max_m, k, m_begin, k_begin);
      if (submat == nullptr) {
        std::cerr << fmt::format("ERROR: get_submat returned null for m_begin={}, k_begin={}\n", m_begin, k_begin);
        free(buffer);
        return;
      }
    }
  }
  std::cout << "✓ get_submat tested for all valid positions" << std::endl;

  // Test get_scale
  std::cout << "Testing get_scale..." << std::endl;
  int k_group_count = k / k_group_size;
  for (int m_idx = 0; m_idx < max_m; m_idx++) {
    for (int kg_idx = 0; kg_idx < k_group_count; kg_idx++) {
      float* scale = buf->get_scale(max_m, m_idx, k, kg_idx * k_group_size);
      if (scale == nullptr) {
        std::cerr << fmt::format("ERROR: get_scale returned null for m_idx={}, k_group={}\n", m_idx, kg_idx);
        free(buffer);
        return;
      }
      // Verify scale is non-zero (should be set by from_mat)
      if (*scale == 0.0f) {
        std::cerr << fmt::format("WARNING: scale is zero for m_idx={}, k_group={}\n", m_idx, kg_idx);
      }
    }
  }
  std::cout << "✓ get_scale tested for all k-groups" << std::endl;

  // Print some scale values for verification
  std::cout << "\nSample scale values:" << std::endl;
  for (int kg = 0; kg < std::min(4, k_group_count); kg++) {
    float* scale = buf->get_scale(max_m, 0, k, kg * k_group_size);
    std::cout << fmt::format("  k_group[{}] (k={}): scale = {:.6f}\n", kg, kg * k_group_size, *scale);
  }

  // Clean up
  free(buffer);
  std::cout << "\n✓ All basic tests passed!" << std::endl;
}

void test_buffer_kgroup_correctness() {
  std::cout << "\n=== Testing BufferAKGroupImpl Quantization Correctness ===" << std::endl;

  const int max_m = 32;
  const int k = 512;
  const int k_group_size = 128;

  size_t buffer_size = amx::BufferAKGroupImpl<TestKernelKGroup>::required_size(max_m, k, k_group_size);
  void* buffer = std::aligned_alloc(64, buffer_size);

  auto buf = std::make_unique<amx::BufferAKGroupImpl<TestKernelKGroup>>(max_m, k, k_group_size, buffer);

  // Create test input matrix with known patterns
  std::vector<float> original(max_m * k);
  std::vector<ggml_bf16_t> input(max_m * k);

  // Fill with different patterns for each k-group to test group-wise quantization
  for (int m = 0; m < max_m; m++) {
    for (int k_idx = 0; k_idx < k; k_idx++) {
      int kg = k_idx / k_group_size;
      // Different magnitude for each k-group
      float base_val = (kg + 1) * 0.1f;
      float val = base_val * std::sin(m * 0.1f + k_idx * 0.01f);
      original[m * k + k_idx] = val;
      input[m * k + k_idx] = ggml_compute_fp32_to_bf16(val);
    }
  }

  // Quantize
  buf->from_mat(max_m, input.data(), 0, 1);

  // Dequantize and check error
  std::vector<float> dequantized(max_m * k);
  float max_error = 0.0f;
  float total_error = 0.0f;
  int num_elements = 0;

  for (int m = 0; m < max_m; m++) {
    for (int k_idx = 0; k_idx < k; k_idx++) {
      int kg = k_idx / k_group_size;

      // Get the scale for this k-group
      float* scale_ptr = buf->get_scale(max_m, m, k, kg * k_group_size);
      float scale = *scale_ptr;

      // Get quantized value (simplified access for testing)
      // In real use, this would go through get_submat
      int m_block_size = (max_m + TestKernelKGroup::M_STEP - 1) / TestKernelKGroup::M_STEP * TestKernelKGroup::M_STEP;
      int k_block_begin = (k_idx / TestKernelKGroup::K_BLOCK) * TestKernelKGroup::K_BLOCK;
      int k_in_block = k_idx - k_block_begin;
      int k_block_size = std::min(TestKernelKGroup::K_BLOCK, k - k_block_begin);

      // Locate the quantized data
      int m_step_idx = m / TestKernelKGroup::M_STEP;
      int m_in_step = m % TestKernelKGroup::M_STEP;
      int k_step_idx = k_in_block / TestKernelKGroup::K_STEP;
      int k_in_step = k_in_block % TestKernelKGroup::K_STEP;

      int8_t* base = buf->a + k_block_begin * m_block_size + m_step_idx * TestKernelKGroup::M_STEP * k_block_size +
                     k_step_idx * TestKernelKGroup::K_STEP * TestKernelKGroup::M_STEP +
                     m_in_step * TestKernelKGroup::K_STEP + k_in_step;

      int8_t quantized_val = *base;

      // Dequantize
      float deq = quantized_val * scale;
      dequantized[m * k + k_idx] = deq;

      // Calculate error
      float error = std::abs(original[m * k + k_idx] - deq);
      max_error = std::max(max_error, error);
      total_error += error;
      num_elements++;
    }
  }

  float avg_error = total_error / num_elements;
  float avg_magnitude = 0.0f;
  for (int i = 0; i < max_m * k; i++) {
    avg_magnitude += std::abs(original[i]);
  }
  avg_magnitude /= (max_m * k);

  float relative_error = avg_error / (avg_magnitude + 1e-8f);

  std::cout << fmt::format("Quantization Error Analysis:\n");
  std::cout << fmt::format("  Max absolute error: {:.6f}\n", max_error);
  std::cout << fmt::format("  Average absolute error: {:.6f}\n", avg_error);
  std::cout << fmt::format("  Average magnitude: {:.6f}\n", avg_magnitude);
  std::cout << fmt::format("  Relative error: {:.2f}%\n", relative_error * 100);

  // Check that relative error is reasonable (typically < 5% for int8 quantization)
  if (relative_error < 0.05f) {
    std::cout << "✓ Quantization error is within acceptable range" << std::endl;
  } else {
    std::cerr << "WARNING: Quantization error is higher than expected!" << std::endl;
  }

  // Test that different k-groups have different scales
  std::cout << "\nVerifying k-group scales are computed independently:" << std::endl;
  bool scales_differ = false;
  for (int m = 0; m < std::min(4, max_m); m++) {
    float* scale0 = buf->get_scale(max_m, m, k, 0);
    for (int kg = 1; kg < k / k_group_size; kg++) {
      float* scale_kg = buf->get_scale(max_m, m, k, kg * k_group_size);
      if (std::abs(*scale0 - *scale_kg) > 1e-6f) {
        scales_differ = true;
        break;
      }
    }
    if (scales_differ) break;
  }

  if (scales_differ) {
    std::cout << "✓ Different k-groups have independent scales" << std::endl;
  } else {
    std::cout << "✗ Warning: All k-groups have the same scale (might be correct for uniform data)" << std::endl;
  }

  free(buffer);
}

void test_buffer_kgroup_comparison() {
  std::cout << "\n=== Comparing BufferAImpl vs BufferAKGroupImpl ===" << std::endl;

  const int max_m = 128;
  const int k = 2048;
  const int k_group_size = 256;

  // Create test data
  std::vector<ggml_bf16_t> input(max_m * k);
  std::mt19937 gen(456);
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
  for (int i = 0; i < max_m * k; i++) {
    input[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }

  // Test original BufferAImpl
  {
    size_t buffer_size = amx::BufferAImpl<TestKernelKGroup>::required_size(max_m, k);
    void* buffer = std::aligned_alloc(64, buffer_size);
    auto buf_a = std::make_unique<amx::BufferAImpl<TestKernelKGroup>>(max_m, k, buffer);

    buf_a->from_mat(max_m, input.data(), 0, 1);

    // Print some scales
    std::cout << "BufferAImpl scales (per-row):" << std::endl;
    for (int m = 0; m < std::min(4, max_m); m++) {
      float* scale = buf_a->get_scale(max_m, m);
      std::cout << fmt::format("  row[{}]: scale = {:.6f}\n", m, *scale);
    }

    free(buffer);
  }

  // Test BufferAKGroupImpl
  {
    size_t buffer_size = amx::BufferAKGroupImpl<TestKernelKGroup>::required_size(max_m, k, k_group_size);
    void* buffer = std::aligned_alloc(64, buffer_size);
    auto buf_kg = std::make_unique<amx::BufferAKGroupImpl<TestKernelKGroup>>(max_m, k, k_group_size, buffer);

    buf_kg->from_mat(max_m, input.data(), 0, 1);

    // Print some scales
    std::cout << "\nBufferAKGroupImpl scales (per k-group):" << std::endl;
    for (int m = 0; m < std::min(2, max_m); m++) {
      std::cout << fmt::format("  row[{}]:\n", m);
      for (int kg = 0; kg < std::min(4, k / k_group_size); kg++) {
        float* scale = buf_kg->get_scale(max_m, m, k, kg * k_group_size);
        std::cout << fmt::format("    k_group[{}]: scale = {:.6f}\n", kg, *scale);
      }
    }

    free(buffer);
  }

  std::cout << "\n✓ Comparison test completed" << std::endl;
}

int main(int argc, char** argv) {
  std::cout << "Starting BufferAKGroupImpl Tests\n" << std::endl;

  try {
    // Run basic functionality tests
    test_buffer_kgroup_basic();

    // Run correctness tests
    test_buffer_kgroup_correctness();

    // Run comparison tests
    test_buffer_kgroup_comparison();

    std::cout << "\n=== All tests completed successfully! ===" << std::endl;
  } catch (const std::exception& e) {
    std::cerr << "Test failed with exception: " << e.what() << std::endl;
    return 1;
  }

  return 0;
}

================================================
FILE: kt-kernel/operators/amx/test/amx-test.cpp
================================================
#include "../la/amx.hpp"

#include <omp.h>

#include "mat-test.hpp"
#define FMT_HEADER_ONLY
#include <fmt/core.h>

const int test_iter = 100;
const bool mt = true;
const bool cache_hit = false;

void q_latency_test_bf16(int m, int n, int k, ggml_bf16_t* qa, ggml_bf16_t* qb) {
  int nth = amx::GemmKernel224BF::recommended_nth(n);
  int m_ = (m + 31) / 32 * 32;
  Mat<float> d(m_, n, Layout::RowMajor);
  {
    int repeat = 100;
    std::vector<ggml_bf16_t*> vec_a;
    std::vector<ggml_bf16_t*> vec_b;
    std::vector<float*> vec_c;
    std::vector<std::shared_ptr<amx::GemmKernel224BF::BufferA>> vec_ba;
    std::vector<std::shared_ptr<amx::GemmKernel224BF::BufferB>> vec_bb;
    std::vector<std::shared_ptr<amx::GemmKernel224BF::BufferC>> vec_bc;
    for (int i = 0; i < repeat * 2; i++) {
      ggml_bf16_t* a = (ggml_bf16_t*)std::aligned_alloc(64, amx::GemmKernel224BF::BufferA::required_size(m_, k));
      std::shared_ptr<amx::GemmKernel224BF::BufferA> ba = std::make_shared<amx::GemmKernel224BF::BufferA>(m_, k, a);
      ggml_bf16_t* b = (ggml_bf16_t*)std::aligned_alloc(64, amx::GemmKernel224BF::BufferB::required_size(n, k));
      std::shared_ptr<amx::GemmKernel224BF::BufferB> bb = std::make_shared<amx::GemmKernel224BF::BufferB>(n, k, b);
      float* c = (float*)std::aligned_alloc(64, amx::GemmKernel224BF::BufferC::required_size(m_, n));
      std::shared_ptr<amx::GemmKernel224BF::BufferC> bc = std::make_shared<amx::GemmKernel224BF::BufferC>(m_, n, c);
      ba->from_mat(m, qa, 0, 1);
      int nth = amx::GemmKernel224BF::recommended_nth(n);
      for (int i = 0; i < nth; i++) {
        bb->from_mat(qb, i, nth);
      }
      vec_a.push_back(a);
      vec_b.push_back(b);
      vec_c.push_back(c);
      vec_ba.push_back(ba);
      vec_bb.push_back(bb);
      vec_bc.push_back(bc);
    }
    Timer t(fmt::format("m:{} n:{} k:{} t:{} repeat:{}, latency", m, n, k, test_iter, repeat));
    for (int t = 0; t < test_iter; t++) {
#pragma omp parallel for schedule(dynamic, 1)
      for (int ti = 0; ti < nth * repeat; ti++) {
        int mat_id = ti / nth + repeat * (t % 2);
        int ith = ti % nth;
        if (cache_hit) {
          mat_id = 0;
        }
        amx::mat_mul(m, n, k, vec_ba[mat_id], vec_bb[mat_id], vec_bc[mat_id], ith, nth);
      }
    }
    for (int i = 0; i < repeat * 2; i++) {
      free(vec_a[i]);
      free(vec_b[i]);
      free(vec_c[i]);
    }
  }
  d.dealloc();
}

void group_q_latency_test_bf16(int n_max, int k_max) {
  amx::GemmKernel224BF::config();

  int m_max = 1024;
  int m_start = 32;
  int m_step = 32;

  Mat<float> a(m_max, k_max, Layout::RowMajor), b(k_max, n_max, Layout::ColumnMajor);
  std::mt19937 gen(123);
  a.random(gen);
  b.random(gen);
  a.quant(GGML_TYPE_BF16);
  b.quant(GGML_TYPE_BF16);

  std::string method_name = "BF16";
  if (mt) {
    method_name += fmt::format("_mt{}", omp_get_max_threads());
  }
  if (cache_hit) {
    method_name += "-cache-hit";
  }

  auto output = fmt::format("{}-m:{}:{}:{}-n:{}-k:{}-x{}x{}.txt", method_name, m_start, m_max, m_step, n_max, k_max,
                            amx::GemmKernel224BF::N_BLOCK, amx::GemmKernel224BF::K_BLOCK);
  // std::cout << "Output to: " << output << std::endl;
  auto x = freopen(output.c_str(), "w", stdout);
  assert(x);

  for (int m = m_start; m <= m_max; m *= 2) {
    q_latency_test_bf16(m, n_max, k_max, a.quant_data<ggml_bf16_t>(), b.quant_data<ggml_bf16_t>());
  }
}

void q_latency_test_int8(int m, int n, int k, ggml_bf16_t* qa, ggml_bf16_t* qb) {
  int nth = amx::GemmKernel224Int8::recommended_nth(n);
  int m_ = (m + 31) / 32 * 32;
  Mat<float> d(m_, n, Layout::RowMajor);
  {
    int repeat = 100;
    std::vector<int8_t*> vec_a;
    std::vector<int8_t*> vec_b;
    std::vector<float*> vec_c;
    std::vector<std::shared_ptr<amx::GemmKernel224Int8::BufferA>> vec_ba;
    std::vector<std::shared_ptr<amx::GemmKernel224Int8::BufferB>> vec_bb;
    std::vector<std::shared_ptr<amx::GemmKernel224Int8::BufferC>> vec_bc;
    for (int i = 0; i < repeat * 2; i++) {
      int8_t* a = (int8_t*)std::aligned_alloc(64, amx::GemmKernel224Int8::BufferA::required_size(m_, k));
      std::shared_ptr<amx::GemmKernel224Int8::BufferA> ba = std::make_shared<amx::GemmKernel224Int8::BufferA>(m_, k, a);
      int8_t* b = (int8_t*)std::aligned_alloc(64, amx::GemmKernel224Int8::BufferB::required_size(n, k));
      std::shared_ptr<amx::GemmKernel224Int8::BufferB> bb = std::make_shared<amx::GemmKernel224Int8::BufferB>(n, k, b);
      float* c = (float*)std::aligned_alloc(64, amx::GemmKernel224Int8::BufferC::required_size(m_, n));
      std::shared_ptr<amx::GemmKernel224Int8::BufferC> bc = std::make_shared<amx::GemmKernel224Int8::BufferC>(m_, n, c);
      ba->from_mat(m, qa, 0, 1);
      int nth = amx::GemmKernel224Int8::recommended_nth(n);
      for (int i = 0; i < nth; i++) {
        bb->from_mat(qb, i, nth);
      }
      vec_a.push_back(a);
      vec_b.push_back(b);
      vec_c.push_back(c);
      vec_ba.push_back(ba);
      vec_bb.push_back(bb);
      vec_bc.push_back(bc);
    }
    Timer t(fmt::format("m:{} n:{} k:{} t:{} repeat:{}, latency", m, n, k, test_iter, repeat));
    for (int t = 0; t < test_iter; t++) {
#pragma omp parallel for schedule(dynamic, 1)
      for (int ti = 0; ti < nth * repeat; ti++) {
        int mat_id = ti / nth + repeat * (t % 2);
        int ith = ti % nth;
        if (cache_hit) {
          mat_id = 0;
        }
        amx::mat_mul(m, n, k, vec_ba[mat_id], vec_bb[mat_id], vec_bc[mat_id], ith, nth);
      }
    }
    for (int i = 0; i < repeat * 2; i++) {
      free(vec_a[i]);
      free(vec_b[i]);
      free(vec_c[i]);
    }
  }
  d.dealloc();
}

void group_q_latency_test_int8(int n_max, int k_max) {
  amx::GemmKernel224Int8::config();

  int m_max = 1024;
  int m_start = 32;
  int m_step = 32;

  Mat<float> a(m_max, k_max, Layout::RowMajor), b(k_max, n_max, Layout::ColumnMajor);
  std::mt19937 gen(123);
  a.random(gen);
  b.random(gen);
  a.quant(GGML_TYPE_BF16);
  b.quant(GGML_TYPE_BF16);

  std::string method_name = "INT8";
  if (mt) {
    method_name += fmt::format("_mt{}", omp_get_max_threads());
  }
  if (cache_hit) {
    method_name += "-cache-hit";
  }

  auto output = fmt::format("{}-m:{}:{}:{}-n:{}-k:{}-x{}x{}.txt", method_name, m_start, m_max, m_step, n_max, k_max,
                            amx::GemmKernel224Int8::N_BLOCK, amx::GemmKernel224Int8::K_BLOCK);
  // std::cout << "Output to: " << output << std::endl;
  auto x = freopen(output.c_str(), "w", stdout);
  assert(x);
  for (int m = m_start; m <= m_max; m *= 2) {
    q_latency_test_int8(m, n_max, k_max, a.quant_data<ggml_bf16_t>(), b.quant_data<ggml_bf16_t>());
  }
}

void correction_test_int4(int m, int n, int k) {
  amx::GemmKernel224Int4::config();

  int m_max = 1024;
  int m_start = 32;
  int m_step = 32;

  Mat<float> ma(m, k, Layout::RowMajor), mb(k, n, Layout::ColumnMajor);
  // std::mt19937 gen(123);

  // for(size_t i=0;i<m;i++){
  //   for(size_t j=0;j<k;j++){
  //     // ma.at(i,j) = std::max(int(-i+j),0);
  //     ma.at(i,j) = (i+j)%25/25.0;
  //   }
  // }
  // for (size_t i = 0; i < k; i++) {
  //   for (size_t j = 0; j < n; j++) {
  //     // mb.at(i,j) = std::max(int(-i+j),0);
  //     mb.at(i,j) = (i+j)%25/25.0;
  //   }
  // }
  std::mt19937 gena(123);
  std::mt19937 genb(312);
  ma.random(gena);
  mb.random(genb);
  // ma.random(gen);
  // mb.random(gen);

  auto mc = ma.mul_check(mb);
  // ma.print();
  // mb.print();

  ma.quant(GGML_TYPE_BF16);
  mb.quant(GGML_TYPE_BF16);

  using K = amx::GemmKernel224Int4;
  int8_t* a = (int8_t*)std::aligned_alloc(64, K::BufferA::required_size(m, k));
  std::shared_ptr<K::BufferA> ba = std::make_shared<K::BufferA>(m, k, a);
  int8_t* b = (int8_t*)std::aligned_alloc(64, K::BufferB::required_size(n, k));
  std::shared_ptr<K::BufferB> bb = std::make_shared<K::BufferB>(n, k, b);
  float* c = (float*)std::aligned_alloc(64, K::BufferC::required_size(m, n));
  std::shared_ptr<K::BufferC> bc = std::make_shared<K::BufferC>(m, n, c);

  ba->from_mat(m, ma.quant_data<ggml_bf16_t>(), 0, 1);
  // printf("%d\n",amx::GemmKernel224Int4::BufferA::required_size(m, k));
  // for(size_t i=0;i<amx::GemmKernel224Int4::BufferA::required_size(m, k);i++){
  //   if((i*2)%k==0)
  //     printf("\n");

  //   printf("%02x ", (unsigned char)(a[i]));
  // }
  // printf("\n");

  // int nth = amx::GemmKernel224Int4::recommended_nth(n);
  bb->from_mat(mb.quant_data<ggml_bf16_t>(), 0, 1);

  // for(size_t i=0;i<amx::GemmKernel224Int4::BufferB::required_size(n, k);i++){
  //  if((i*2)%k==0)
  //     printf("\n");

  //  printf("%02x ", (unsigned char)(b[i]));
  // }
  // printf("\n");

  amx::mat_mul(m, n, k, ba, bb, bc, 0, 1);

  // for(size_t i=0;i<m;i++){
  //   for(size_t j=0;j<n;j++){
  //     printf("%.2f ",c[i*n+j]);
  //   }
  //   printf("\n");
  // }

  // printf("\n");
  Mat<float> tc(m, n, Layout::RowMajor);
  tc.data = c;
  // std::cout<<"AMX OUTPUT:"<<std::endl;
  // tc.print_all();
  // std::cout<<"STD OUTPUT:"<<std::endl;
  // mc.print_all();

  mc.cmp(tc);

  // for(size_t i=0;i<m/32;i++){
  //   for(size_t j=0;j<n/32;j++){
  //     Mat<float> stdre(32,32,Layout::RowMajor);
  //     Mat<float> amxre(32,32,Layout::RowMajor);
  //     for(size_t ii=i*32;ii<i*32+32;ii++){
  //       for(size_t jj=j*32;jj<j*32+32;jj++){
  //         stdre.at(ii-i*32,jj-j*32) = mc.at(ii,jj);
  //         amxre.at(ii-i*32,jj-j*32) = tc.at(ii,jj);
  //       }
  //     }
  //     printf("%d %d ",i,j);
  //     stdre.cmp(amxre);
  //     // if(i==0&&j==0){
  //       std::cout<<"STD"<<std::endl;
  //       stdre.print_all();
  //       std::cout<<"AMX"<<std::endl;
  //       amxre.print_all();
  //     // }
  //   }
  // }
}

void correction_test_int4_1(int m, int n, int k) {
  using K = amx::GemmKernel224Int4_1;
  K::config();

  int m_max = 1024;
  int m_start = 32;
  int m_step = 32;

  Mat<float> ma(m, k, Layout::RowMajor), mb(k, n, Layout::ColumnMajor);
  // std::mt19937 gen(123);

  // for(size_t i=0;i<m;i++){
  //   for(size_t j=0;j<k;j++){
  //     // ma.at(i,j) = std::max(int(-i+j),0);
  //     ma.at(i,j) = (i+j)%25/25.0;
  //   }
  // }
  // for (size_t i = 0; i < k; i++) {
  //   for (size_t j = 0; j < n; j++) {
  //     // mb.at(i,j) = std::max(int(-i+j),0);
  //     mb.at(i,j) = (i+j)%25/25.0;
  //   }
  // }
  std::mt19937 gena(123);
  std::mt19937 genb(312);
  ma.random(gena);
  mb.random(genb);
  // ma.random(gen);
  // mb.random(gen);

  auto mc = ma.mul_check(mb);
  // ma.print();
  // mb.print();

  ma.quant(GGML_TYPE_BF16);
  mb.quant(GGML_TYPE_BF16);

  int8_t* a = (int8_t*)std::aligned_alloc(64, K::BufferA::required_size(m, k));
  std::shared_ptr<K::BufferA> ba = std::make_shared<K::BufferA>(m, k, a);
  int8_t* b = (int8_t*)std::aligned_alloc(64, K::BufferB::required_size(n, k));
  std::shared_ptr<K::BufferB> bb = std::make_shared<K::BufferB>(n, k, b);
  float* c = (float*)std::aligned_alloc(64, K::BufferC::required_size(m, n));
  std::shared_ptr<K::BufferC> bc = std::make_shared<K::BufferC>(m, n, c);

  ba->from_mat(m, ma.quant_data<ggml_bf16_t>(), 0, 1);
  // printf("%d\n",amx::GemmKernel224Int4::BufferA::required_size(m, k));
  // for(size_t i=0;i<amx::GemmKernel224Int4::BufferA::required_size(m, k);i++){
  //   if((i*2)%k==0)
  //     printf("\n");

  //   printf("%02x ", (unsigned char)(a[i]));
  // }
  // printf("\n");

  // int nth = amx::GemmKernel224Int4::recommended_nth(n);
  bb->from_mat(mb.quant_data<ggml_bf16_t>(), 0, 1);

  // for(size_t i=0;i<amx::GemmKernel224Int4::BufferB::required_size(n, k);i++){
  //  if((i*2)%k==0)
  //     printf("\n");

  //  printf("%02x ", (unsigned char)(b[i]));
  // }
  // printf("\n");

  amx::mat_mul(m, n, k, ba, bb, bc, 0, 1);

  // for(size_t i=0;i<m;i++){
  //   for(size_t j=0;j<n;j++){
  //     printf("%.2f ",c[i*n+j]);
  //   }
  //   printf("\n");
  // }

  // printf("\n");
  Mat<float> tc(m, n, Layout::RowMajor);
  tc.data = c;
  std::cout << "AMX OUTPUT:" << std::endl;
  tc.print_all();
  std::cout << "STD OUTPUT:" << std::endl;
  mc.print_all();

  mc.cmp(tc);

  // for(size_t i=0;i<m/32;i++){
  //   for(size_t j=0;j<n/32;j++){
  //     Mat<float> stdre(32,32,Layout::RowMajor);
  //     Mat<float> amxre(32,32,Layout::RowMajor);
  //     for(size_t ii=i*32;ii<i*32+32;ii++){
  //       for(size_t jj=j*32;jj<j*32+32;jj++){
  //         stdre.at(ii-i*32,jj-j*32) = mc.at(ii,jj);
  //         amxre.at(ii-i*32,jj-j*32) = tc.at(ii,jj);
  //       }
  //     }
  //     printf("%d %d ",i,j);
  //     stdre.cmp(amxre);
  //     // if(i==0&&j==0){
  //       std::cout<<"STD"<<std::endl;
  //       stdre.print_all();
  //       std::cout<<"AMX"<<std::endl;
  //       amxre.print_all();
  //     // }
  //   }
  // }
}

void q_latency_test_int4(int m, int n, int k, ggml_bf16_t* qa, ggml_bf16_t* qb) {
  int nth = amx::GemmKernel224Int4::recommended_nth(n);
  int m_ = (m + 31) / 32 * 32;
  Mat<float> d(m_, n, Layout::RowMajor);
  {
    int repeat = 100;
    std::vector<int8_t*> vec_a;
    std::vector<int8_t*> vec_b;
    std::vector<float*> vec_c;
    std::vector<std::shared_ptr<amx::GemmKernel224Int4::BufferA>> vec_ba;
    std::vector<std::shared_ptr<amx::GemmKernel224Int4::BufferB>> vec_bb;
    std::vector<std::shared_ptr<amx::GemmKernel224Int4::BufferC>> vec_bc;
    for (int i = 0; i < repeat * 2; i++) {
      int8_t* a = (int8_t*)std::aligned_alloc(64, amx::GemmKernel224Int4::BufferA::required_size(m_, k));
      std::shared_ptr<amx::GemmKernel224Int4::BufferA> ba = std::make_shared<amx::GemmKernel224Int4::BufferA>(m_, k, a);
      int8_t* b = (int8_t*)std::aligned_alloc(64, amx::GemmKernel224Int4::BufferB::required_size(n, k));
      std::shared_ptr<amx::GemmKernel224Int4::BufferB> bb = std::make_shared<amx::GemmKernel224Int4::BufferB>(n, k, b);
      float* c = (float*)std::aligned_alloc(64, amx::GemmKernel224Int4::BufferC::required_size(m_, n));
      std::shared_ptr<amx::GemmKernel224Int4::BufferC> bc = std::make_shared<amx::GemmKernel224Int4::BufferC>(m_, n, c);
      ba->from_mat(m, qa, 0, 1);
      int nth = amx::GemmKernel224Int4::recommended_nth(n);
      for (int i = 0; i < nth; i++) {
        bb->from_mat(qb, i, nth);
      }
      vec_a.push_back(a);
      vec_b.push_back(b);
      vec_c.push_back(c);
      vec_ba.push_back(ba);
      vec_bb.push_back(bb);
      vec_bc.push_back(bc);
    }
    Timer t(fmt::format("m:{} n:{} k:{} t:{} repeat:{}, latency", m, n, k, test_iter, repeat));
    for (int t = 0; t < test_iter; t++) {
#pragma omp parallel for schedule(dynamic, 1)
      for (int ti = 0; ti < nth * repeat; ti++) {
        int mat_id = ti / nth + repeat * (t % 2);
        int ith = ti % nth;
        if (cache_hit) {
          mat_id = 0;
        }
        amx::mat_mul(m, n, k, vec_ba[mat_id], vec_bb[mat_id], vec_bc[mat_id], ith, nth);
      }
    }
    for (int i = 0; i < repeat * 2; i++) {
      free(vec_a[i]);
      free(vec_b[i]);
      free(vec_c[i]);
    }
  }
  d.dealloc();
}

void group_q_latency_test_int4(int n_max, int k_max) {
  amx::GemmKernel224Int4::config();

  int m_max = 1024;
  int m_start = 32;
  int m_step = 32;

  Mat<float> a(m_max, k_max, Layout::RowMajor), b(k_max, n_max, Layout::ColumnMajor);
  std::mt19937 gen(123);
  a.random(gen);
  b.random(gen);
  a.quant(GGML_TYPE_BF16);
  b.quant(GGML_TYPE_BF16);

  std::string method_name = "INT4";
  if (mt) {
    method_name += fmt::format("_mt{}", omp_get_max_threads());
  }
  if (cache_hit) {
    method_name += "-cache-hit";
  }

  auto output = fmt::format("{}-m:{}:{}:{}-n:{}-k:{}-x{}x{}.txt", method_name, m_start, m_max, m_step, n_max, k_max,
                            amx::GemmKernel224Int4::N_BLOCK, amx::GemmKernel224Int4::K_BLOCK);
  // std::cout << "Output to: " << output << std::endl;
  auto x = freopen(output.c_str(), "w", stdout);
  assert(x);

  for (int m = m_start; m <= m_max; m *= 2) {
    q_latency_test_int4(m, n_max, k_max, a.quant_data<ggml_bf16_t>(), b.quant_data<ggml_bf16_t>());
  }
}

void q_latency_test_int4_1(int m, int n, int k, ggml_bf16_t* qa, ggml_bf16_t* qb) {
  int nth = amx::GemmKernel224Int4_1::recommended_nth(n);
  int m_ = (m + 31) / 32 * 32;
  Mat<float> d(m_, n, Layout::RowMajor);
  {
    int repeat = 100;
    std::vector<int8_t*> vec_a;
    std::vector<int8_t*> vec_b;
    std::vector<float*> vec_c;
    std::vector<std::shared_ptr<amx::GemmKernel224Int4_1::BufferA>> vec_ba;
    std::vector<std::shared_ptr<amx::GemmKernel224Int4_1::BufferB>> vec_bb;
    std::vector<std::shared_ptr<amx::GemmKernel224Int4_1::BufferC>> vec_bc;
    for (int i = 0; i < repeat * 2; i++) {
      int8_t* a = (int8_t*)std::aligned_alloc(64, amx::GemmKernel224Int4_1::BufferA::required_size(m_, k));
      std::shared_ptr<amx::GemmKernel224Int4_1::BufferA> ba =
          std::make_shared<amx::GemmKernel224Int4_1::BufferA>(m_, k, a);
      int8_t* b = (int8_t*)std::aligned_alloc(64, amx::GemmKernel224Int4_1::BufferB::required_size(n, k));
      std::shared_ptr<amx::GemmKernel224Int4_1::BufferB> bb =
          std::make_shared<amx::GemmKernel224Int4_1::BufferB>(n, k, b);
      float* c = (float*)std::aligned_alloc(64, amx::GemmKernel224Int4_1::BufferC::required_size(m_, n));
      std::shared_ptr<amx::GemmKernel224Int4_1::BufferC> bc =
          std::make_shared<amx::GemmKernel224Int4_1::BufferC>(m_, n, c);
      ba->from_mat(m, qa, 0, 1);
      int nth = amx::GemmKernel224Int4_1::recommended_nth(n);
      for (int i = 0; i < nth; i++) {
        bb->from_mat(qb, i, nth);
      }
      vec_a.push_back(a);
      vec_b.push_back(b);
      vec_c.push_back(c);
      vec_ba.push_back(ba);
      vec_bb.push_back(bb);
      vec_bc.push_back(bc);
    }
    Timer t(fmt::format("m:{} n:{} k:{} t:{} repeat:{}, latency", m, n, k, test_iter, repeat));
    for (int t = 0; t < test_iter; t++) {
#pragma omp parallel for schedule(dynamic, 1)
      for (int ti = 0; ti < nth * repeat; ti++) {
        int mat_id = ti / nth + repeat * (t % 2);
        int ith = ti % nth;
        if (cache_hit) {
          mat_id = 0;
        }
        amx::mat_mul(m, n, k, vec_ba[mat_id], vec_bb[mat_id], vec_bc[mat_id], ith, nth);
      }
    }
    for (int i = 0; i < repeat * 2; i++) {
      free(vec_a[i]);
      free(vec_b[i]);
      free(vec_c[i]);
    }
  }
  d.dealloc();
}

void group_q_latency_test_int4_1(int n_max, int k_max) {
  amx::GemmKernel224Int4_1::config();

  int m_max = 1024;
  int m_start = 32;
  int m_step = 32;

  Mat<float> a(m_max, k_max, Layout::RowMajor), b(k_max, n_max, Layout::ColumnMajor);
  std::mt19937 gen(123);
  a.random(gen);
  b.random(gen);
  a.quant(GGML_TYPE_BF16);
  b.quant(GGML_TYPE_BF16);

  std::string method_name = "INT4_1";
  if (mt) {
    method_name += fmt::format("_mt{}", omp_get_max_threads());
  }
  if (cache_hit) {
    method_name += "-cache-hit";
  }

  auto output = fmt::format("{}-m:{}:{}:{}-n:{}-k:{}-x{}x{}.txt", method_name, m_start, m_max, m_step, n_max, k_max,
                            amx::GemmKernel224Int4_1::N_BLOCK, amx::GemmKernel224Int4_1::K_BLOCK);
  // std::cout << "Output to: " << output << std::endl;
  auto x = freopen(output.c_str(), "w", stdout);
  assert(x);

  for (int m = m_start; m <= m_max; m *= 2) {
    q_latency_test_int4_1(m, n_max, k_max, a.quant_data<ggml_bf16_t>(), b.quant_data<ggml_bf16_t>());
  }
}

int main() {
  amx::enable_amx();
  init();

  // group_q_latency_test_bf16(5120, 1536);
  // group_q_latency_test_bf16(3584, 2560);
  // group_q_latency_test_bf16(2560, 3584);
  // group_q_latency_test_bf16(1536, 5120);
  // group_q_latency_test_bf16(7168, 2048);
  // group_q_latency_test_bf16(2048, 7168);

  // group_q_latency_test_int8(5120, 1536);
  // group_q_latency_test_int8(3584, 2560);
  // group_q_latency_test_int8(2560, 3584);
  // group_q_latency_test_int8(1536, 5120);
  // group_q_latency_test_int8(7168, 2048);
  // group_q_latency_test_int8(2048, 7168);

  group_q_latency_test_int4(5120, 1536);
  group_q_latency_test_int4(3584, 2560);
  group_q_latency_test_int4(2560, 3584);
  group_q_latency_test_int4(1536, 5120);
  group_q_latency_test_int4(7168, 2048);
  group_q_latency_test_int4(2048, 7168);

  // group_q_latency_test_int4_1(5120, 1536);
  // group_q_latency_test_int4_1(3584, 2560);
  // group_q_latency_test_int4_1(2560, 3584);
  // group_q_latency_test_int4_1(1536, 5120);
  // group_q_latency_test_int4_1(7168, 2048);
  // group_q_latency_test_int4_1(2048, 7168);

  // int k = 2048;
  // correction_test_int4_1(32, 32, k);
  // correction_test_int4(256, 256, 2048);
  // correction_test_int4(32, 32, 4096);
  // correction_test_int4(256, 256, 4096);
  // correction_test_int4(32, 32, k);
  // correction_test_int4(256, 32, 128);
  // correction_test_int4(32, 64, 128);
  // correction_test_int4(64, 32, 128);
  // correction_test_int4(256, 256, 128);
}


================================================
FILE: kt-kernel/operators/amx/test/analyze-error.cpp
================================================
#include <cmath>
#include <iostream>
#include <memory>
#include <random>
#include <vector>

#include "../la/amx.hpp"

void analyze_error_patterns() {
  std::cout << "=== Analyzing Error Patterns in K-Group Quantization ===" << std::endl;

  const int m = 32;
  const int n = 32;
  const int k = 512;
  const int k_group_size = 128;

  using Kernel = amx::GemmKernel224Int4KGroup;
  using BufferA = Kernel::BufferA;
  using BufferB = Kernel::BufferB;
  using BufferC = Kernel::BufferC;

  void* buffer_a = std::aligned_alloc(64, BufferA::required_size(m, k, k_group_size));
  void* buffer_b = std::aligned_alloc(64, BufferB::required_size(n, k, k_group_size));
  void* buffer_c = std::aligned_alloc(64, BufferC::required_size(m, n));

  auto ba = std::make_shared<BufferA>(m, k, k_group_size, buffer_a);
  auto bb = std::make_shared<BufferB>(n, k, k_group_size, buffer_b);
  auto bc = std::make_shared<BufferC>(m, n, buffer_c);

  Kernel::config();

  std::cout << "\n1. Testing with very small values (prone to quantization loss):" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    // Very small values - will mostly quantize to 0
    for (int i = 0; i < m * k; i++) {
      input_a[i] = ggml_compute_fp32_to_bf16(0.0001f * (i % 10));
    }
    for (int i = 0; i < k * n; i++) {
      input_b[i] = ggml_compute_fp32_to_bf16(0.0001f * (i % 10));
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);

    // Check scales
    float a_scale = *ba->get_scale(m, 0, k, 0);
    float b_scale = *bb->get_scale(n, 0, k, 0);
    std::cout << "  A scale: " << a_scale << ", B scale: " << b_scale << std::endl;

    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    float first_val = ggml_compute_bf16_to_fp32(output[0]);
    std::cout << "  Result[0,0]: " << first_val << std::endl;
  }

  std::cout << "\n2. Testing with values near quantization boundaries:" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    // Values at quantization boundaries (multiples of 1/127 for int8)
    for (int i = 0; i < m * k; i++) {
      float val = (i % 16) / 127.0f;  // INT4 has 16 levels
      input_a[i] = ggml_compute_fp32_to_bf16(val);
    }
    for (int i = 0; i < k * n; i++) {
      float val = (i % 16) / 127.0f;
      input_b[i] = ggml_compute_fp32_to_bf16(val);
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    std::cout << "  First row results: ";
    for (int j = 0; j < 5; j++) {
      float val = ggml_compute_bf16_to_fp32(output[j]);
      std::cout << val << " ";
    }
    std::cout << std::endl;
  }

  std::cout << "\n3. Testing with different scale ranges per k-group:" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    // Different magnitude for each k-group
    for (int i = 0; i < m; i++) {
      for (int j = 0; j < k; j++) {
        int kg = j / k_group_size;
        float scale = std::pow(10.0f, -kg);  // 1.0, 0.1, 0.01, 0.001
        input_a[i * k + j] = ggml_compute_fp32_to_bf16(scale * 0.5f);
      }
    }

    for (int i = 0; i < k; i++) {
      for (int j = 0; j < n; j++) {
        int kg = i / k_group_size;
        float scale = std::pow(10.0f, -kg);
        input_b[i * n + j] = ggml_compute_fp32_to_bf16(scale * 0.5f);
      }
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);

    // Print scales for each k-group
    std::cout << "  A scales per k-group: ";
    for (int kg = 0; kg < k / k_group_size; kg++) {
      float scale = *ba->get_scale(m, 0, k, kg * k_group_size);
      std::cout << scale << " ";
    }
    std::cout << std::endl;

    std::cout << "  B scales per k-group: ";
    for (int kg = 0; kg < k / k_group_size; kg++) {
      float scale = *bb->get_scale(n, 0, k, kg * k_group_size);
      std::cout << scale << " ";
    }
    std::cout << std::endl;

    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    // Compute reference
    float ref = 0.0f;
    for (int kg = 0; kg < k / k_group_size; kg++) {
      float scale = std::pow(10.0f, -kg);
      ref += k_group_size * scale * scale * 0.25f;  // 0.5 * 0.5
    }

    float actual = ggml_compute_bf16_to_fp32(output[0]);
    std::cout << "  Expected: " << ref << ", Actual: " << actual << std::endl;
    std::cout << "  Error: " << std::abs(ref - actual) / ref * 100 << "%" << std::endl;
  }

  std::cout << "\n4. Testing with sparse patterns (many zeros):" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    // Sparse pattern - 90% zeros
    std::mt19937 gen(42);
    std::uniform_real_distribution<float> dist(0.0f, 1.0f);

    for (int i = 0; i < m * k; i++) {
      float val = (dist(gen) < 0.1f) ? 0.5f : 0.0f;
      input_a[i] = ggml_compute_fp32_to_bf16(val);
    }
    for (int i = 0; i < k * n; i++) {
      float val = (dist(gen) < 0.1f) ? 0.5f : 0.0f;
      input_b[i] = ggml_compute_fp32_to_bf16(val);
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    // Compute statistics
    float max_val = 0.0f;
    float avg_val = 0.0f;
    int non_zero = 0;

    for (int i = 0; i < m * n; i++) {
      float val = std::abs(ggml_compute_bf16_to_fp32(output[i]));
      max_val = std::max(max_val, val);
      avg_val += val;
      if (val > 1e-6) non_zero++;
    }
    avg_val /= (m * n);

    std::cout << "  Max value: " << max_val << std::endl;
    std::cout << "  Avg value: " << avg_val << std::endl;
    std::cout << "  Non-zero outputs: " << non_zero << "/" << m * n << std::endl;
  }

  std::cout << "\n5. Testing with gradual value changes (worst case for k-group):" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    // Gradual increase across k dimension - worst case for k-group quantization
    for (int i = 0; i < m; i++) {
      for (int j = 0; j < k; j++) {
        float val = j * 0.001f;  // Gradual increase
        input_a[i * k + j] = ggml_compute_fp32_to_bf16(val);
      }
    }

    for (int i = 0; i < k; i++) {
      for (int j = 0; j < n; j++) {
        float val = 0.1f;  // Constant
        input_b[i * n + j] = ggml_compute_fp32_to_bf16(val);
      }
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);

    // Check how scales vary
    std::cout << "  A scales (should increase): ";
    for (int kg = 0; kg < k / k_group_size; kg++) {
      float scale = *ba->get_scale(m, 0, k, kg * k_group_size);
      std::cout << scale << " ";
    }
    std::cout << std::endl;

    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    // Reference calculation
    float ref = 0.0f;
    for (int j = 0; j < k; j++) {
      ref += j * 0.001f * 0.1f;
    }

    float actual = ggml_compute_bf16_to_fp32(output[0]);
    std::cout << "  Expected: " << ref << ", Actual: " << actual << std::endl;
    std::cout << "  Error: " << std::abs(ref - actual) / ref * 100 << "%" << std::endl;
  }

  free(buffer_a);
  free(buffer_b);
  free(buffer_c);
}

int main() {
  analyze_error_patterns();
  return 0;
}

================================================
FILE: kt-kernel/operators/amx/test/avx-test.cpp
================================================

#include <immintrin.h>
#include <omp.h>

#include <chrono>
#include <cstdlib>
#include <iostream>
#include <random>

constexpr size_t DATA_SIZE = 100ULL * 1024 * 1024 * 1024;  // 100 GB
constexpr size_t ALIGNMENT = 64;                           // alignment for AVX-512
constexpr int TEST_ITERATIONS = 100;
constexpr int INNER_TEST_ITERATIONS = 100;

void generate_data(uint8_t* data, size_t size) {
  size_t size_int64 = size / sizeof(int64_t);

#pragma omp parallel
  {
    std::mt19937_64 engine(omp_get_thread_num());
    std::uniform_int_distribution<int64_t> dist;

    int64_t* data64 = reinterpret_cast<int64_t*>(data);

#pragma omp for
    for (size_t i = 0; i < size_int64; ++i) {
      data64[i] = dist(engine);
    }
  }
}

void dpbusd_test(const uint8_t* data_a, const uint8_t* data_b, int32_t* result, size_t size) {
  constexpr size_t simd_width = 64;  // 512 bits = 64 bytes
  size_t vec_count = size / simd_width;

#pragma omp parallel for
  for (size_t x = 0; x < vec_count * INNER_TEST_ITERATIONS; ++x) {
    auto i = x % vec_count;
    __m512i va = _mm512_load_si512(reinterpret_cast<const __m512i*>(data_a + i * simd_width));
    __m512i vb = _mm512_load_si512(reinterpret_cast<const __m512i*>(data_b + i * simd_width));
    __m512i vc = _mm512_setzero_si512();

    vc = _mm512_dpbusd_epi32(vc, va, vb);

    _mm512_store_si512(reinterpret_cast<__m512i*>(result + i * (simd_width / 4)), vc);
  }
}

int main() {
  std::cout << "Allocating aligned memory...\n";
  uint8_t* data_a = reinterpret_cast<uint8_t*>(aligned_alloc(ALIGNMENT, DATA_SIZE));
  uint8_t* data_b = reinterpret_cast<uint8_t*>(aligned_alloc(ALIGNMENT, DATA_SIZE));
  int32_t* result = reinterpret_cast<int32_t*>(aligned_alloc(ALIGNMENT, DATA_SIZE));

  std::cout << "Generating random data...\n";
  generate_data(data_a, DATA_SIZE);
  generate_data(data_b, DATA_SIZE);

  for (int iter = 0; iter < TEST_ITERATIONS; ++iter) {
    std::cout << "Starting computation iteration " << iter + 1 << "...\n";
    auto start = std::chrono::high_resolution_clock::now();

    dpbusd_test(data_a, data_b, result, DATA_SIZE);

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;

    double bandwidth = (3 * DATA_SIZE * INNER_TEST_ITERATIONS) / (1e9) / diff.count();  // GB/s

    std::cout << "Iteration " << iter + 1 << " execution time: " << diff.count() << " s\n";
    std::cout << "Iteration " << iter + 1 << " estimated memory bandwidth: " << bandwidth << " GB/s\n";
  }

  free(data_a);
  free(data_b);
  free(result);

  return 0;
}


================================================
FILE: kt-kernel/operators/amx/test/debug-kgroup-details.cpp
================================================
#include <cmath>
#include <iostream>
#include <memory>
#include <vector>

#include "../la/amx.hpp"

void debug_kgroup_details() {
  std::cout << "=== Debugging K-Group Details ===\n" << std::endl;

  const int m = 32;  // Minimum size for AMX
  const int n = 32;
  const int k = 512;  // 4 k-groups, must be >= K_BLOCK
  const int k_group_size = 128;

  using Kernel = amx::GemmKernel224Int4KGroup;
  using BufferA = Kernel::BufferA;
  using BufferB = Kernel::BufferB;
  using BufferC = Kernel::BufferC;

  void* buffer_a = std::aligned_alloc(64, BufferA::required_size(m, k, k_group_size));
  void* buffer_b = std::aligned_alloc(64, BufferB::required_size(n, k, k_group_size));
  void* buffer_c = std::aligned_alloc(64, BufferC::required_size(m, n));

  auto ba = std::make_shared<BufferA>(m, k, k_group_size, buffer_a);
  auto bb = std::make_shared<BufferB>(n, k, k_group_size, buffer_b);
  auto bc = std::make_shared<BufferC>(m, n, buffer_c);

  // Test with specific values to debug quantization
  std::cout << "Test: Specific values with normal distribution\n" << std::endl;

  std::vector<ggml_bf16_t> input_a(m * k);
  std::vector<ggml_bf16_t> input_b(k * n);

  std::mt19937 gen(42);
  std::normal_distribution<float> dist(0.0f, 0.1f);

  // Fill with random normal values and print some
  std::cout << "Sample A values (first 8):" << std::endl;
  for (int i = 0; i < 8; i++) {
    float val = dist(gen);
    input_a[i] = ggml_compute_fp32_to_bf16(val);
    std::cout << "  A[" << i << "] = " << val << std::endl;
  }

  // Fill rest of A
  for (int i = 8; i < m * k; i++) {
    input_a[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }

  std::cout << "\nSample B values (first 8):" << std::endl;
  for (int i = 0; i < 8; i++) {
    float val = dist(gen);
    input_b[i] = ggml_compute_fp32_to_bf16(val);
    std::cout << "  B[" << i << "] = " << val << std::endl;
  }

  // Fill rest of B
  for (int i = 8; i < k * n; i++) {
    input_b[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }

  // Quantize
  ba->from_mat(m, input_a.data(), 0, 1);
  bb->from_mat(input_b.data(), 0, 1);

  // Print scales for debugging
  std::cout << "\nA scales (per k-group):" << std::endl;
  for (int row = 0; row < m; row++) {
    std::cout << "  Row " << row << ": ";
    for (int kg = 0; kg < k / k_group_size; kg++) {
      float scale = *ba->get_scale(m, row, k, kg * k_group_size);
      std::cout << "kg" << kg << "=" << scale << " ";
    }
    std::cout << std::endl;
  }

  std::cout << "\nB scales (per k-group):" << std::endl;
  for (int col = 0; col < n; col++) {
    std::cout << "  Col " << col << ": ";
    for (int kg = 0; kg < k / k_group_size; kg++) {
      float scale = *bb->get_scale(n, col, k, kg * k_group_size);
      std::cout << "kg" << kg << "=" << scale << " ";
    }
    std::cout << std::endl;
  }

  // Test dequantization to check if quantization is working
  std::cout << "\nDequantization test (first row of A):" << std::endl;
  // We need to manually dequantize to check
  // Get quantized values and scale
  int8_t* a_data = (int8_t*)ba->get_submat(m, k, 0, 0);
  float scale0 = *ba->get_scale(m, 0, k, 0);

  std::cout << "  First 8 quantized values: ";
  for (int i = 0; i < 8; i++) {
    std::cout << (int)a_data[i] << " ";
  }
  std::cout << std::endl;

  std::cout << "  Dequantized (q * scale): ";
  for (int i = 0; i < 8; i++) {
    float dequant = a_data[i] * scale0;
    float original = ggml_compute_bf16_to_fp32(input_a[i]);
    std::cout << dequant << " (orig=" << original << ") ";
  }
  std::cout << std::endl;

  // Compute reference
  std::cout << "\nComputing reference result..." << std::endl;
  std::vector<float> ref_result(m * n, 0.0f);
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      float sum = 0.0f;
      for (int l = 0; l < k; l++) {
        float a_val = ggml_compute_bf16_to_fp32(input_a[i * k + l]);
        float b_val = ggml_compute_bf16_to_fp32(input_b[l * n + j]);
        sum += a_val * b_val;
      }
      ref_result[i * n + j] = sum;
    }
  }

  // Run k-group multiplication
  Kernel::config();
  amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

  std::vector<ggml_bf16_t> output(m * n);
  bc->to_mat(m, output.data(), 0, 1);

  // Compare results
  std::cout << "\nResults comparison:" << std::endl;
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      int idx = i * n + j;
      float actual = ggml_compute_bf16_to_fp32(output[idx]);
      float ref = ref_result[idx];
      float error = std::abs(actual - ref) / (std::abs(ref) + 1e-8) * 100;
      std::cout << "  [" << i << "," << j << "]: actual=" << actual << ", ref=" << ref << ", error=" << error << "%"
                << std::endl;
    }
  }

  // Test a simple case to verify the mechanism
  std::cout << "\n--- Simple test with k_group boundaries ---" << std::endl;

  // Clear buffers
  for (int i = 0; i < m * k; i++) {
    input_a[i] = ggml_compute_fp32_to_bf16(0.0f);
  }
  for (int i = 0; i < k * n; i++) {
    input_b[i] = ggml_compute_fp32_to_bf16(0.0f);
  }

  // Set specific values for each k-group
  for (int i = 0; i < m; i++) {
    // First k-group (0-127): value = 0.5
    for (int j = 0; j < 128; j++) {
      input_a[i * k + j] = ggml_compute_fp32_to_bf16(0.5f);
    }
    // Second k-group (128-255): value = 0.25
    for (int j = 128; j < 256; j++) {
      input_a[i * k + j] = ggml_compute_fp32_to_bf16(0.25f);
    }
    // Remaining k-groups: value = 0.1
    for (int j = 256; j < k; j++) {
      input_a[i * k + j] = ggml_compute_fp32_to_bf16(0.1f);
    }
  }

  // B matrix: all 0.4
  for (int i = 0; i < k * n; i++) {
    input_b[i] = ggml_compute_fp32_to_bf16(0.4f);
  }

  ba->from_mat(m, input_a.data(), 0, 1);
  bb->from_mat(input_b.data(), 0, 1);

  // Expected: 0.5 * 0.4 * 128 + 0.25 * 0.4 * 128 + 0.1 * 0.4 * 256 = 25.6 + 12.8 + 10.24 = 48.64
  float expected = 0.5f * 0.4f * 128 + 0.25f * 0.4f * 128 + 0.1f * 0.4f * 256;
  std::cout << "Expected value: " << expected << std::endl;

  amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);
  bc->to_mat(m, output.data(), 0, 1);

  float actual = ggml_compute_bf16_to_fp32(output[0]);
  std::cout << "Actual value: " << actual << std::endl;
  std::cout << "Error: " << std::abs(actual - expected) / expected * 100 << "%" << std::endl;

  free(buffer_a);
  free(buffer_b);
  free(buffer_c);
}

int main() {
  debug_kgroup_details();
  return 0;
}

================================================
FILE: kt-kernel/operators/amx/test/debug-kgroup.cpp
================================================
#include <omp.h>

#include "../la/amx.hpp"
#define FMT_HEADER_ONLY
#include <fmt/core.h>

#include <cmath>
#include <iostream>
#include <memory>
#include <random>

void debug_simple_multiplication() {
  std::cout << "=== Debug Simple K-Group Multiplication ===" << std::endl;

  // Very small test case for debugging
  const int m = 32;   // 1 M_STEP
  const int n = 32;   // 1 N_STEP
  const int k = 512;  // Must be at least K_BLOCK (512)
  const int k_group_size = 128;

  std::cout << fmt::format("Parameters: m={}, n={}, k={}, k_group_size={}\n", m, n, k, k_group_size);

  using Kernel = amx::GemmKernel224Int4KGroup;
  using BufferA = Kernel::BufferA;
  using BufferB = Kernel::BufferB;
  using BufferC = Kernel::BufferC;

  // Allocate buffers
  void* buffer_a = std::aligned_alloc(64, BufferA::required_size(m, k, k_group_size));
  void* buffer_b = std::aligned_alloc(64, BufferB::required_size(n, k, k_group_size));
  void* buffer_c = std::aligned_alloc(64, BufferC::required_size(m, n));

  auto ba = std::make_shared<BufferA>(m, k, k_group_size, buffer_a);
  auto bb = std::make_shared<BufferB>(n, k, k_group_size, buffer_b);
  auto bc = std::make_shared<BufferC>(m, n, buffer_c);

  // Create identity-like matrices for easy verification
  std::vector<ggml_bf16_t> input_a(m * k);
  std::vector<ggml_bf16_t> input_b(k * n);

  // Initialize A as mostly zeros with a few ones
  for (int i = 0; i < m * k; i++) {
    input_a[i] = ggml_compute_fp32_to_bf16(0.0f);
  }
  // Set A[0,0] = 1
  input_a[0] = ggml_compute_fp32_to_bf16(1.0f);

  // Initialize B as mostly zeros with a few ones
  for (int i = 0; i < k * n; i++) {
    input_b[i] = ggml_compute_fp32_to_bf16(0.0f);
  }
  // Set B[0,0] = 1
  input_b[0] = ggml_compute_fp32_to_bf16(1.0f);

  // Expected result: C[0,0] = 1*1 = 1, rest = 0
  std::cout << "\nExpected result: C[0,0] = 1.0, rest = 0.0\n" << std::endl;

  // Quantize inputs
  ba->from_mat(m, input_a.data(), 0, 1);
  bb->from_mat(input_b.data(), 0, 1);

  // Print scales for debugging
  std::cout << "BufferA scales for row 0:" << std::endl;
  for (int kg = 0; kg < k / k_group_size; kg++) {
    float scale = *ba->get_scale(m, 0, k, kg * k_group_size);
    std::cout << fmt::format("  k_group[{}]: scale = {:.6f}\n", kg, scale);
  }

  std::cout << "\nBufferB scales for col 0:" << std::endl;
  for (int kg = 0; kg < k / k_group_size; kg++) {
    float scale = *bb->get_scale(n, 0, k, kg * k_group_size);
    std::cout << fmt::format("  k_group[{}]: scale = {:.6f}\n", kg, scale);
  }

  // Configure AMX
  Kernel::config();

  // Run matrix multiplication
  amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

  // Get output
  std::vector<ggml_bf16_t> output(m * n);
  bc->to_mat(m, output.data(), 0, 1);

  // Print results
  std::cout << "\nActual result (first 5x5):" << std::endl;
  for (int i = 0; i < std::min(5, m); i++) {
    for (int j = 0; j < std::min(5, n); j++) {
      float val = ggml_compute_bf16_to_fp32(output[i * n + j]);
      std::cout << fmt::format("{:8.4f} ", val);
    }
    std::cout << std::endl;
  }

  free(buffer_a);
  free(buffer_b);
  free(buffer_c);
}

void debug_pattern_multiplication() {
  std::cout << "\n=== Debug Pattern Multiplication ===" << std::endl;

  const int m = 32;
  const int n = 32;
  const int k = 512;  // Must be at least K_BLOCK (512)
  const int k_group_size = 128;

  using Kernel = amx::GemmKernel224Int4KGroup;
  using BufferA = Kernel::BufferA;
  using BufferB = Kernel::BufferB;
  using BufferC = Kernel::BufferC;

  void* buffer_a = std::aligned_alloc(64, BufferA::required_size(m, k, k_group_size));
  void* buffer_b = std::aligned_alloc(64, BufferB::required_size(n, k, k_group_size));
  void* buffer_c = std::aligned_alloc(64, BufferC::required_size(m, n));

  auto ba = std::make_shared<BufferA>(m, k, k_group_size, buffer_a);
  auto bb = std::make_shared<BufferB>(n, k, k_group_size, buffer_b);
  auto bc = std::make_shared<BufferC>(m, n, buffer_c);

  // Create constant matrices
  std::vector<ggml_bf16_t> input_a(m * k);
  std::vector<ggml_bf16_t> input_b(k * n);

  // Fill A with 0.1 and B with 0.1
  for (int i = 0; i < m * k; i++) {
    input_a[i] = ggml_compute_fp32_to_bf16(0.1f);
  }
  for (int i = 0; i < k * n; i++) {
    input_b[i] = ggml_compute_fp32_to_bf16(0.1f);
  }

  // Expected: Each element should be 0.1 * 0.1 * k = 0.01 * 512 = 5.12
  float expected = 0.1f * 0.1f * k;
  std::cout << fmt::format("\nExpected result: all elements = {:.4f}\n", expected);

  // Quantize
  ba->from_mat(m, input_a.data(), 0, 1);
  bb->from_mat(input_b.data(), 0, 1);

  // Run
  Kernel::config();
  amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

  // Get output
  std::vector<ggml_bf16_t> output(m * n);
  bc->to_mat(m, output.data(), 0, 1);

  // Check results
  float max_error = 0.0f;
  float avg_error = 0.0f;
  for (int i = 0; i < m * n; i++) {
    float actual = ggml_compute_bf16_to_fp32(output[i]);
    float error = std::abs(actual - expected);
    max_error = std::max(max_error, error);
    avg_error += error;
  }
  avg_error /= (m * n);

  std::cout << fmt::format("Max error: {:.6f}\n", max_error);
  std::cout << fmt::format("Avg error: {:.6f}\n", avg_error);
  std::cout << fmt::format("Relative error: {:.2f}%\n", (max_error / expected) * 100);

  // Print sample values
  std::cout << "\nSample values (first 5x5):" << std::endl;
  for (int i = 0; i < std::min(5, m); i++) {
    for (int j = 0; j < std::min(5, n); j++) {
      float val = ggml_compute_bf16_to_fp32(output[i * n + j]);
      std::cout << fmt::format("{:8.4f} ", val);
    }
    std::cout << std::endl;
  }

  free(buffer_a);
  free(buffer_b);
  free(buffer_c);
}

void compare_with_regular_int4() {
  std::cout << "\n=== Compare K-Group vs Regular INT4 ===" << std::endl;

  const int m = 32;
  const int n = 32;
  const int k = 512;
  const int k_group_size = 128;

  // Create test data
  std::vector<ggml_bf16_t> input_a(m * k);
  std::vector<ggml_bf16_t> input_b(k * n);

  std::mt19937 gen(42);
  std::uniform_real_distribution<float> dist(-0.1f, 0.1f);

  for (int i = 0; i < m * k; i++) {
    input_a[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }
  for (int i = 0; i < k * n; i++) {
    input_b[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }

  // Test with regular INT4
  {
    using Kernel = amx::GemmKernel224Int4;
    using BufferA = Kernel::BufferA;
    using BufferB = Kernel::BufferB;
    using BufferC = Kernel::BufferC;

    void* buffer_a = std::aligned_alloc(64, BufferA::required_size(m, k));
    void* buffer_b = std::aligned_alloc(64, BufferB::required_size(n, k));  // Fixed: n, k not k, n
    void* buffer_c = std::aligned_alloc(64, BufferC::required_size(m, n));

    auto ba = std::make_shared<BufferA>(m, k, buffer_a);
    auto bb = std::make_shared<BufferB>(n, k, buffer_b);  // Fixed: n, k not k, n
    auto bc = std::make_shared<BufferC>(m, n, buffer_c);

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);

    Kernel::config();
    amx::mat_mul(m, n, k, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output_regular(m * n);
    bc->to_mat(m, output_regular.data(), 0, 1);

    std::cout << "Regular INT4 results (first 3x3):" << std::endl;
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 3; j++) {
        float val = ggml_compute_bf16_to_fp32(output_regular[i * n + j]);
        std::cout << fmt::format("{:8.4f} ", val);
      }
      std::cout << std::endl;
    }

    free(buffer_a);
    free(buffer_b);
    free(buffer_c);
  }

  // Test with K-Group INT4
  {
    using Kernel = amx::GemmKernel224Int4KGroup;
    using BufferA = Kernel::BufferA;
    using BufferB = Kernel::BufferB;
    using BufferC = Kernel::BufferC;

    void* buffer_a = std::aligned_alloc(64, BufferA::required_size(m, k, k_group_size));
    void* buffer_b = std::aligned_alloc(64, BufferB::required_size(n, k, k_group_size));
    void* buffer_c = std::aligned_alloc(64, BufferC::required_size(m, n));

    auto ba = std::make_shared<BufferA>(m, k, k_group_size, buffer_a);
    auto bb = std::make_shared<BufferB>(n, k, k_group_size, buffer_b);
    auto bc = std::make_shared<BufferC>(m, n, buffer_c);

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);

    Kernel::config();
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output_kgroup(m * n);
    bc->to_mat(m, output_kgroup.data(), 0, 1);

    std::cout << "\nK-Group INT4 results (first 3x3):" << std::endl;
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 3; j++) {
        float val = ggml_compute_bf16_to_fp32(output_kgroup[i * n + j]);
        std::cout << fmt::format("{:8.4f} ", val);
      }
      std::cout << std::endl;
    }

    free(buffer_a);
    free(buffer_b);
    free(buffer_c);
  }
}

int main() {
  std::cout << "Starting K-Group Debugging\n" << std::endl;

  debug_simple_multiplication();
  debug_pattern_multiplication();
  compare_with_regular_int4();

  return 0;
}

================================================
FILE: kt-kernel/operators/amx/test/debug-specific-dims.cpp
================================================
#include <cmath>
#include <iostream>
#include <memory>
#include <vector>

#include "../la/amx.hpp"

void debug_specific_dimensions() {
  std::cout << "=== Debugging Specific Dimensions Issue ===\n" << std::endl;

  const int m_original = 200;
  const int n = 2048;
  const int k = 7168;
  const int k_group_size = 128;

  const int M_STEP = 32;
  const int m = ((m_original + M_STEP - 1) / M_STEP) * M_STEP;  // Round up to 224

  std::cout << "Original dimensions: " << m_original << " x " << n << " x " << k << std::endl;
  std::cout << "Padded dimensions: " << m << " x " << n << " x " << k << std::endl;
  std::cout << "K-group size: " << k_group_size << std::endl;
  std::cout << "Number of k-groups: " << k / k_group_size << std::endl;

  using Kernel = amx::GemmKernel224Int4KGroup;
  using BufferA = Kernel::BufferA;
  using BufferB = Kernel::BufferB;
  using BufferC = Kernel::BufferC;

  void* buffer_a = std::aligned_alloc(64, BufferA::required_size(m, k, k_group_size));
  void* buffer_b = std::aligned_alloc(64, BufferB::required_size(n, k, k_group_size));
  void* buffer_c = std::aligned_alloc(64, BufferC::required_size(m, n));

  auto ba = std::make_shared<BufferA>(m, k, k_group_size, buffer_a);
  auto bb = std::make_shared<BufferB>(n, k, k_group_size, buffer_b);
  auto bc = std::make_shared<BufferC>(m, n, buffer_c);

  // Test 1: Simple pattern - all ones
  std::cout << "\n--- Test 1: All ones (should give k = 7168) ---" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    for (int i = 0; i < m * k; i++) {
      input_a[i] = ggml_compute_fp32_to_bf16(1.0f);
    }
    for (int i = 0; i < k * n; i++) {
      input_b[i] = ggml_compute_fp32_to_bf16(1.0f);
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);

    // Check some scales
    std::cout << "A scales (first 3 k-groups): ";
    for (int kg = 0; kg < 3; kg++) {
      float scale = *ba->get_scale(m, 0, k, kg * k_group_size);
      std::cout << scale << " ";
    }
    std::cout << std::endl;

    std::cout << "B scales (first 3 k-groups): ";
    for (int kg = 0; kg < 3; kg++) {
      float scale = *bb->get_scale(n, 0, k, kg * k_group_size);
      std::cout << scale << " ";
    }
    std::cout << std::endl;

    Kernel::config();
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    float expected = 7168.0f;
    float actual = ggml_compute_bf16_to_fp32(output[0]);
    std::cout << "Expected: " << expected << ", Actual: " << actual << std::endl;
    std::cout << "Error: " << std::abs(actual - expected) / expected * 100 << "%" << std::endl;
  }

  // Test 2: Small values
  std::cout << "\n--- Test 2: Small values (0.01) ---" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    for (int i = 0; i < m * k; i++) {
      input_a[i] = ggml_compute_fp32_to_bf16(0.01f);
    }
    for (int i = 0; i < k * n; i++) {
      input_b[i] = ggml_compute_fp32_to_bf16(0.01f);
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);

    Kernel::config();
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    float expected = 0.01f * 0.01f * 7168.0f;  // 0.7168
    float actual = ggml_compute_bf16_to_fp32(output[0]);
    std::cout << "Expected: " << expected << ", Actual: " << actual << std::endl;
    std::cout << "Error: " << std::abs(actual - expected) / expected * 100 << "%" << std::endl;
  }

  // Test 3: Identity-like pattern
  std::cout << "\n--- Test 3: Identity pattern ---" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    // Initialize to zeros
    for (int i = 0; i < m * k; i++) {
      input_a[i] = ggml_compute_fp32_to_bf16(0.0f);
    }
    for (int i = 0; i < k * n; i++) {
      input_b[i] = ggml_compute_fp32_to_bf16(0.0f);
    }

    // Set diagonal to 1
    int min_dim = std::min(std::min(m, n), k);
    for (int i = 0; i < min_dim; i++) {
      input_a[i * k + i] = ggml_compute_fp32_to_bf16(1.0f);
      input_b[i * n + i] = ggml_compute_fp32_to_bf16(1.0f);
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);

    Kernel::config();
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    // Check diagonal elements
    std::cout << "Diagonal elements (should be 1): ";
    for (int i = 0; i < std::min(5, min_dim); i++) {
      float val = ggml_compute_bf16_to_fp32(output[i * n + i]);
      std::cout << val << " ";
    }
    std::cout << std::endl;
  }

  // Test 4: Pattern with different values per k-group
  std::cout << "\n--- Test 4: Different values per k-group ---" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    // Each k-group has different value
    for (int i = 0; i < m; i++) {
      for (int j = 0; j < k; j++) {
        int kg = j / k_group_size;
        float val = (kg + 1) * 0.1f;  // 0.1, 0.2, 0.3, ...
        input_a[i * k + j] = ggml_compute_fp32_to_bf16(val);
      }
    }

    for (int i = 0; i < k; i++) {
      for (int j = 0; j < n; j++) {
        input_b[i * n + j] = ggml_compute_fp32_to_bf16(0.1f);
      }
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);

    // Check scales for different k-groups
    std::cout << "A scales (first 5 k-groups): ";
    for (int kg = 0; kg < std::min(5, k / k_group_size); kg++) {
      float scale = *ba->get_scale(m, 0, k, kg * k_group_size);
      std::cout << scale << " ";
    }
    std::cout << std::endl;

    Kernel::config();
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    // Expected: sum of (kg+1)*0.1 * 0.1 * k_group_size for all k-groups
    float expected = 0.0f;
    for (int kg = 0; kg < k / k_group_size; kg++) {
      expected += (kg + 1) * 0.1f * 0.1f * k_group_size;
    }

    float actual = ggml_compute_bf16_to_fp32(output[0]);
    std::cout << "Expected: " << expected << ", Actual: " << actual << std::endl;
    std::cout << "Error: " << std::abs(actual - expected) / expected * 100 << "%" << std::endl;
  }

  free(buffer_a);
  free(buffer_b);
  free(buffer_c);
}

int main() {
  debug_specific_dimensions();
  return 0;
}

================================================
FILE: kt-kernel/operators/amx/test/mat-test.hpp
================================================
#ifndef AMX_MAT_TEST_HPP
#define AMX_MAT_TEST_HPP

#include <cassert>
#include <iostream>
#include <limits>
#include <random>

#include "../../common.hpp"
#include "../la/utils.hpp"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "timer.hh"

template <typename T>
struct DotProductImpl {
  static_assert(sizeof(T) == -1, "No associated type defined for this type.");
  using type = void;
};

template <typename T>
using DotProductType = typename DotProductImpl<T>::type;

template <>
struct DotProductImpl<uint8_t> {
  using type = uint32_t;
};
template <>
struct DotProductImpl<int8_t> {
  using type = int32_t;
};
template <>
struct DotProductImpl<uint32_t> {
  using type = uint32_t;
};
template <>
struct DotProductImpl<int32_t> {
  using type = int32_t;
};

template <>
struct DotProductImpl<float> {
  using type = float;
};

enum class Layout {
  RowMajor,
  ColumnMajor,
  VNNIColumnMajor,
};

template <typename T>
struct Mat {
  int rows, cols;
  size_t size() { return rows * cols; }
  T* data;
  size_t stride_in_bytes;

  void* qdata = nullptr;
  ggml_type q_type;
  size_t q_stride;

  Layout layout = Layout::RowMajor;

  Mat() {};

  Mat(int rows, int cols, Layout layout) : rows(rows), cols(cols), layout(layout) {
    size_t total_size;
    if (layout == Layout::RowMajor) {
      stride_in_bytes = cols * sizeof(T);
      stride_in_bytes = (stride_in_bytes + 63) / 64 * 64;
      total_size = stride_in_bytes * rows;
    } else if (layout == Layout::ColumnMajor) {
      stride_in_bytes = rows * sizeof(T);
      stride_in_bytes = (stride_in_bytes + 63) / 64 * 64;
      total_size = stride_in_bytes * cols;
    } else {
      assert(0);
    }

    // data = new(std::align_val_t(64)) T[rows * cols];
    data = reinterpret_cast<T*>(aligned_alloc(64, total_size));
    memset(data, 0, total_size);
  }

  Mat<T> sub_mat(int r, int c) {
    Mat<T> re;
    re.rows = r;
    re.cols = c;
    re.data = data;
    re.layout = layout;
    re.stride_in_bytes = stride_in_bytes;
    re.qdata = qdata;
    re.q_stride = q_stride;
    re.q_type = q_type;
  }

  void dealloc() {
    delete[] data;
    if (qdata) {
      delete[] reinterpret_cast<char*>(qdata);
    }
  }

  void row_major_increase() {
    int x = 0;
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        at(i, j) = x++;
      }
    }
  }

  void dis_to_00() {
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        at(i, j) = i + j;
      }
    }
  }

  void random(std::mt19937& gen) {
    if constexpr (std::is_integral_v<T>) {
      std::uniform_int_distribution<T> dist(0, 100);
      for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
          at(i, j) = dist(gen);
        }
      }
    } else if constexpr (std::is_floating_point_v<T>) {
      std::uniform_real_distribution<T> dist(-1.0, 1.0);
      for (int i = 0; i < rows; i++) {
        std::mt19937 gen_row(gen());
        for (int j = 0; j < cols; j++) {
          at(i, j) = dist(gen_row);
        }
      }
    } else {
      throw std::runtime_error("Unsupported type");
    }
  }

  size_t stride() { return stride_in_bytes; }

  int line_element_count() {
    if (layout == Layout::RowMajor) {
      return cols;
    } else if (layout == Layout::ColumnMajor) {
      return rows;
    } else {
      assert(0);
    }
    assert(0);
    return 0;
  }

  T& at(int r, int c) {
    switch (layout) {
      case Layout::RowMajor:
        return *offset_pointer_row_major(data, r, c, stride());
      case Layout::ColumnMajor:
        return *offset_pointer_col_major(data, r, c, stride());
      // case Layout::VNNIColumnMajor:
      // return data[c*rows+r];
      default: {
        assert(0);
      }
    }
    throw std::runtime_error("Unsupported layout");
    // assert(0);
  }

  void print() {
    int limit = 10;      // 设置阈值
    int print_rows = 3;  // 开头和结尾打印的行数和列数

    for (int i = 0; i < rows; i++) {
      // 当行数过多时，跳过中间的行
      if (rows > limit && (i >= print_rows && i < rows - print_rows)) {
        if (i == print_rows) {
          std::cout << "...\n...\n";
        }
        continue;
      }

      for (int j = 0; j < cols; j++) {
        // 当列数过多时，跳过中间的列
        if (cols > limit && (j >= print_rows && j < cols - print_rows)) {
          if (j == print_rows) {
            std::cout << "... ";
          }
          continue;
        }

        if constexpr (std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>) {
          std::cout << (int)at(i, j) << " ";
        } else {
          std::cout << at(i, j) << " ";
        }
      }
      std::cout << std::endl;
    }
    std::cout << std::endl;
  }

  void print_all() {
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        if constexpr (std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>) {
          std::cout << (int)at(i, j) << " ";
        } else if constexpr (std::is_floating_point_v<T>) {
          // std::cout << std::setw(6) << std::scientific << std::setprecision(2) << at(i, j) << "  ";
          printf("%6.2f ", at(i, j));
        } else {
          std::cout << at(i, j) << " ";
        }
      }
      std::cout << std::endl;
    }
    std::cout << std::endl;
  }

  Mat<DotProductType<T>> mul_check(Mat<T>& b) {
    assert(cols == b.rows);
    Mat<DotProductType<T>> c(rows, b.cols, Layout::RowMajor);
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < b.cols; j++) {
        c.at(i, j) = 0;
        for (int k = 0; k < cols; k++) {
          c.at(i, j) += static_cast<DotProductType<T>>(at(i, k)) * static_cast<DotProductType<T>>(b.at(k, j));
        }
      }
    }
    return c;
  }

  bool cmp(Mat<T>& b) {
    if constexpr (std::is_integral_v<T>) {
      assert(rows == b.rows && cols == b.cols);
      for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
          if (at(i, j) != b.at(i, j)) {
            std::cout << "Error at " << i << " " << j << " " << at(i, j) << ", " << b.at(i, j) << std::endl;
            // std::cout << "Error at " << i << " " << j << std::endl;
            // std::cout << "Other: " << b.at(i, j) << std::endl;
            // std::cout << "Me: " << at(i, j) << std::endl;
            // assert(0);
            // break;
            // return false;
          }
        }
      }
      std::cout << "Check passed" << std::endl;
      return true;
    }

    if constexpr (std::is_floating_point_v<T>) {
      T rel_error_sum = 0;
      T error_sum = 0;
      T max_error = 0;
      T max_rel_error = 0;
      int max_i = 0, max_j = 0;
      assert(rows == b.rows && cols == b.cols);
      for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
          T error = std::abs(at(i, j) - b.at(i, j));
          error_sum += error;
          rel_error_sum += error / std::abs(at(i, j));
          if (error / std::abs(at(i, j)) > max_rel_error) {
            max_rel_error = error / std::abs(at(i, j));
          }
          if (error > max_error) {
            max_i = i;
            max_j = j;
            max_error = error;
          }
        }
      }
      if (rel_error_sum / size() > 1e-2 || max_error / at(max_i, max_j) > 1e-2) {
        std::cout << "Max Error: " << std::fixed << max_error << "(" << max_error / at(max_i, max_j) << ")"
                  << " at " << max_i << " " << max_j << ", Max Rel Error " << max_rel_error
                  << ", Average Relative: " << rel_error_sum / size() << ", Average Error: " << error_sum / size()
                  << std::endl;
      } else {
        std::cout << "Error Less Than 1%" << std::endl;
      }

      return true;
    }
  }

  void quant(ggml_type to) {
    if constexpr (std::is_same<T, float>::value == false) {
      throw std::runtime_error("Quantization only supported for f32 matrices");
    }
    // Timer t(std::string("to ") + ggml_type_name(to));
    assert(line_element_count() * sizeof(T) == stride());
    assert(line_element_count() % ggml_blck_size(to) == 0);
    int blck_cnt_per_row = line_element_count() / ggml_blck_size(to);
    q_stride = blck_cnt_per_row * ggml_type_size(to);

    size_t qdata_size = size() * ggml_type_size(to) / ggml_blck_size(to);
    qdata_size += 512 - q_stride % 512;

    qdata = new (std::align_val_t(512)) char[qdata_size];
    q_type = to;

    switch (to) {
      case GGML_TYPE_F32: {
        return;
      }
      case GGML_TYPE_F16: {
        ggml_fp32_to_fp16_row(data, reinterpret_cast<ggml_fp16_t*>(qdata), size());
        return;
      }
      case GGML_TYPE_BF16: {
        ggml_fp32_to_bf16_row(data, reinterpret_cast<ggml_bf16_t*>(qdata), size());
        return;
      }
      case GGML_TYPE_Q4_0: {
        quantize_row_q4_0(data, reinterpret_cast<block_q4_0*>(qdata), size());
        return;
      }
      case GGML_TYPE_Q4_1: {
        quantize_row_q4_1(data, reinterpret_cast<block_q4_1*>(qdata), size());
        return;
      }
      case GGML_TYPE_Q5_0: {
        quantize_row_q5_0(data, reinterpret_cast<block_q5_0*>(qdata), size());
        return;
      }
      case GGML_TYPE_Q5_1: {
        quantize_row_q5_1(data, reinterpret_cast<block_q5_1*>(qdata), size());
        return;
      }
      case GGML_TYPE_Q8_0: {
        quantize_row_q8_0(data, reinterpret_cast<block_q8_0*>(qdata), size());
        return;
      }
      case GGML_TYPE_Q8_1: {
        quantize_row_q8_1(data, reinterpret_cast<block_q8_1*>(qdata), size());
        return;
      }
      case GGML_TYPE_Q2_K: {
        quantize_row_q2_K(data, reinterpret_cast<block_q2_K*>(qdata), size());
        return;
      }
      case GGML_TYPE_Q3_K: {
        quantize_row_q3_K(data, reinterpret_cast<block_q3_K*>(qdata), size());
        return;
      }
      case GGML_TYPE_Q4_K: {
        quantize_row_q4_K(data, reinterpret_cast<block_q4_K*>(qdata), size());
        return;
      }
      case GGML_TYPE_Q5_K: {
        quantize_row_q5_K(data, reinterpret_cast<block_q5_K*>(qdata), size());
        return;
      }
      case GGML_TYPE_Q6_K: {
        quantize_row_q6_K(data, reinterpret_cast<block_q6_K*>(qdata), size());
        return;
      }
      case GGML_TYPE_Q8_K: {
        quantize_row_q8_K(data, reinterpret_cast<block_q8_K*>(qdata), size());
        return;
      }
      case GGML_TYPE_IQ2_XXS:
      case GGML_TYPE_IQ2_XS:
      case GGML_TYPE_IQ3_XXS:
      case GGML_TYPE_IQ1_S:
      case GGML_TYPE_IQ4_NL:
      case GGML_TYPE_IQ3_S:
      case GGML_TYPE_IQ2_S:
      case GGML_TYPE_IQ4_XS:
      case GGML_TYPE_I8:
      case GGML_TYPE_I16:
      case GGML_TYPE_I32:
      case GGML_TYPE_I64:
      case GGML_TYPE_F64:
      case GGML_TYPE_IQ1_M:
      case GGML_TYPE_COUNT:
      default:
        throw std::runtime_error("Unsupported quantization type");
    }
    throw std::runtime_error("Unsupported quantization type");
  }

  template <typename Block>
  Block* quant_data() {
    return reinterpret_cast<Block*>(qdata);
  }

  void dequant() {
    auto x = q_type;
    switch (x) {
      case GGML_TYPE_F32: {
        return;
      }
      case GGML_TYPE_F16: {
        ggml_fp16_to_fp32_row(reinterpret_cast<ggml_fp16_t*>(qdata), data, size());
        return;
      }
      case GGML_TYPE_Q4_0: {
        dequantize_row_q4_0(reinterpret_cast<block_q4_0*>(qdata), data, size());
        return;
      }
      case GGML_TYPE_Q4_1: {
        dequantize_row_q4_1(reinterpret_cast<block_q4_1*>(qdata), data, size());

        return;
      }
      case GGML_TYPE_Q5_0: {
        dequantize_row_q5_0(reinterpret_cast<block_q5_0*>(qdata), data, size());
        return;
      }
      case GGML_TYPE_Q5_1: {
        dequantize_row_q5_1(reinterpret_cast<block_q5_1*>(qdata), data, size());
        return;
      }
      case GGML_TYPE_Q8_0: {
        dequantize_row_q8_0(reinterpret_cast<block_q8_0*>(qdata), data, size());
        return;
      }
      case GGML_TYPE_Q8_1: {
        throw std::runtime_error("not supported");
      }
      case GGML_TYPE_Q2_K: {
        dequantize_row_q2_K(reinterpret_cast<block_q2_K*>(qdata), data, size());
        return;
      }
      case GGML_TYPE_Q3_K: {
        dequantize_row_q3_K(reinterpret_cast<block_q3_K*>(qdata), data, size());
        return;
      }
      case GGML_TYPE_Q4_K: {
        dequantize_row_q4_K(reinterpret_cast<block_q4_K*>(qdata), data, size());
        return;
      }
      case GGML_TYPE_Q5_K: {
        dequantize_row_q5_K(reinterpret_cast<block_q5_K*>(qdata), data, size());
        return;
      }
      case GGML_TYPE_Q6_K: {
        dequantize_row_q6_K(reinterpret_cast<block_q6_K*>(qdata), data, size());
        return;
      }
      case GGML_TYPE_Q8_K: {
        dequantize_row_q8_K(reinterpret_cast<block_q8_K*>(qdata), data, size());
        return;
      }
      case GGML_TYPE_IQ2_XXS:
      case GGML_TYPE_IQ2_XS:
      case GGML_TYPE_IQ3_XXS:
      case GGML_TYPE_IQ1_S:
      case GGML_TYPE_IQ4_NL:
      case GGML_TYPE_IQ3_S:
      case GGML_TYPE_IQ2_S:
      case GGML_TYPE_IQ4_XS:
      case GGML_TYPE_I8:
      case GGML_TYPE_I16:
      case GGML_TYPE_I32:
      case GGML_TYPE_I64:
      case GGML_TYPE_F64:
      case GGML_TYPE_IQ1_M:
      case GGML_TYPE_BF16: {
        ggml_bf16_to_fp32_row(reinterpret_cast<ggml_bf16_t*>(qdata), data, size());
        return;
      }
      case GGML_TYPE_COUNT:
      default:
        throw std::runtime_error("Unsupported quantization type");
    }
    throw std::runtime_error("Unsupported quantization type");
  }
};

inline void init() {
  struct ggml_init_params params = {
      0,
      NULL,
      true,
  };

  auto ctx_eval = ggml_init(params);

  if (!ctx_eval) {
    throw std::runtime_error("Failed to create ggml context");
  }
}
#endif

================================================
FILE: kt-kernel/operators/amx/test/mmq-test.cpp
================================================

#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wpedantic"
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#endif

#include "mmq.h"

#include <algorithm>
#include <type_traits>

#include "ggml-impl.h"
#include "ggml-quants.h"
#include "mat-test.hpp"

#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif

#if defined(_OPENMP)
#include <omp.h>
#endif

#if (defined(_WIN32) || defined(_WIN64))
#define RESTRICT __restrict
#else
#define RESTRICT __restrict__
#endif

#if (defined(_WIN32) || defined(_WIN64))
#define ALWAYS_INLINE __forceinline
#elif __has_attribute(always_inline) || defined(__GNUC__)
#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
#else
#define ALWAYS_INLINE inline
#endif

#if defined(__AMX_INT8__)

namespace {

#define TILE_M 16
#define TILE_N 16
#define TILE_K 32
#define VNNI_BLK 4

#define AMX_BLK_SIZE 32

#define TMM0 0
#define TMM1 1
#define TMM2 2
#define TMM3 3
#define TMM4 4
#define TMM5 5
#define TMM6 6
#define TMM7 7

// parallel routines
// template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0> inline T div_up(T x, T y) {
//   return (x + y - 1) / y;
// }

template <typename T>
void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
#if 0
  // onednn partition pattern
  T& n_my = n_end;
  if (nth <= 1 || n == 0) {
    n_start = 0;
    n_my = n;
  } else {
    T n1 = div_up(n, nth);
    T n2 = n1 - 1;
    T T1 = n - n2 * nth;
    n_my = ith < T1 ? n1 : n2;
    n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
  }
  n_end += n_start;
#else
  // pytorch aten partition pattern
  T n_my = div_up(n, nth);
  n_start = ith * n_my;
  n_end = std::min(n_start + n_my, n);
#endif
}

template <typename func_t>
inline void parallel_for(int nth, int ith, int n, const func_t& f) {
  // int nth = omp_get_num_threads();
  // int ith = omp_get_thread_num();
  int tbegin, tend;
  balance211(n, nth, ith, tbegin, tend);
  f(tbegin, tend);
}

// Forced unrolling
template <int n>
struct Unroll {
  template <typename Func, typename... Args>
  ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
    Unroll<n - 1>{}(f, args...);
    f(std::integral_constant<int, n - 1>{}, args...);
  }
};

template <>
struct Unroll<1> {
  template <typename Func, typename... Args>
  ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
    f(std::integral_constant<int, 0>{}, args...);
  }
};

// type traits
template <typename T>
struct PackedTypes {};
template <>
struct PackedTypes<block_q4_0> {
  using type = int8_t;
};
template <>
struct PackedTypes<block_q4_1> {
  using type = uint8_t;
};
template <>
struct PackedTypes<block_q8_0> {
  using type = int8_t;
};
template <typename T>
using packed_B_type = typename PackedTypes<T>::type;

template <typename T>
struct do_compensate : std::integral_constant<bool, std::is_same<T, block_q8_0>::value> {};

template <typename T>
struct do_unpack
    : std::integral_constant<bool, std::is_same<T, block_q4_0>::value || std::is_same<T, block_q4_1>::value> {};

template <typename T>
struct is_type_qkk
    : std::integral_constant<bool, std::is_same<T, block_q4_K>::value || std::is_same<T, block_q5_K>::value ||
                                       std::is_same<T, block_q6_K>::value || std::is_same<T, block_iq4_xs>::value> {};

#define GGML_DISPATCH_FLOATING_TYPES(TYPE, ...)              \
  [&] {                                                      \
    switch (TYPE) {                                          \
      case GGML_TYPE_F16: {                                  \
        using type = ggml_fp16_t;                            \
        constexpr int blck_size = 16;                        \
        return __VA_ARGS__();                                \
      }                                                      \
      case GGML_TYPE_BF16: {                                 \
        using type = ggml_bf16_t;                            \
        constexpr int blck_size = 32;                        \
        return __VA_ARGS__();                                \
      }                                                      \
      default:                                               \
        fprintf(stderr, "Unsupported floating data type\n"); \
    }                                                        \
  }()

#define GGML_DISPATCH_QTYPES(QT, ...)                         \
  [&] {                                                       \
    switch (QT) {                                             \
      case GGML_TYPE_Q4_0: {                                  \
        using type = block_q4_0;                              \
        using vec_dot_type = block_q8_0;                      \
        constexpr int blck_size = QK4_0;                      \
        return __VA_ARGS__();                                 \
      }                                                       \
      case GGML_TYPE_Q4_1: {                                  \
        using type = block_q4_1;                              \
        using vec_dot_type = block_q8_1;                      \
        constexpr int blck_size = QK4_1;                      \
        return __VA_ARGS__();                                 \
      }                                                       \
      case GGML_TYPE_Q8_0: {                                  \
        using type = block_q8_0;                              \
        using vec_dot_type = block_q8_0;                      \
        constexpr int blck_size = QK8_0;                      \
        return __VA_ARGS__();                                 \
      }                                                       \
      case GGML_TYPE_Q4_K: {                                  \
        using type = block_q4_K;                              \
        using vec_dot_type = block_q8_K;                      \
        constexpr int blck_size = QK_K;                       \
        return __VA_ARGS__();                                 \
      }                                                       \
      case GGML_TYPE_Q5_K: {                                  \
        using type = block_q5_K;                              \
        using vec_dot_type = block_q8_K;                      \
        constexpr int blck_size = QK_K;                       \
        return __VA_ARGS__();                                 \
      }                                                       \
      case GGML_TYPE_Q6_K: {                                  \
        using type = block_q6_K;                              \
        using vec_dot_type = block_q8_K;                      \
        constexpr int blck_size = QK_K;                       \
        return __VA_ARGS__();                                 \
      }                                                       \
      case GGML_TYPE_IQ4_XS: {                                \
        using type = block_iq4_xs;                            \
        using vec_dot_type = block_q8_K;                      \
        constexpr int blck_size = QK_K;                       \
        return __VA_ARGS__();                                 \
      }                                                       \
      default:                                                \
        fprintf(stderr, "Unsupported quantized data type\n"); \
    }                                                         \
  }()

#define GGML_DISPATCH_BOOL(BOOL_V, BOOL_NAME, ...) \
  [&] {                                            \
    if (BOOL_V) {                                  \
      constexpr bool BOOL_NAME = true;             \
      return __VA_ARGS__();                        \
    } else {                                       \
      constexpr bool BOOL_NAME = false;            \
      return __VA_ARGS__();                        \
    }                                              \
  }()

// define amx tile config data structure
struct tile_config_t {
  uint8_t palette_id = 0;
  uint8_t start_row = 0;
  uint8_t reserved_0[14] = {0};
  uint16_t colsb[16] = {0};
  uint8_t rows[16] = {0};
};

// Notes: amx tile config
//
// Typically, TMUL calculates A and B of size 16 x 64 containing INT8 values,
// and accumulate the result to a 16 x 16 matrix C containing INT32 values,
//
// As many GGUF quantized types as `block_size` of 32, so a 16-16-32 config is used
// instead of the normally used 16-16-64 config.
//
//   Block A: {16, 32}, dtype = int8_t
//   Block B: {16, 32}, dtype = uint8_t/int8_t
//   Block C: {16, 16}, dtype = int32_t
//
// Block B needs to be prepacked to vnni format before feeding into  TMUL:
//   packed_B: from {n, k} to {k/vnni_blk, n, vnni_blck}, viewed in 2d, we get {8, 64}
//
// Therefore, we get tileconfig:
//             A    B    C
//    rows    16    8   16
//    colsb   32   64   16
//
// For tile distribution, follow a 2-2-4 pattern, e.g. A used TMM2-TMM3, B used TMM0-TMM1,
// C used TMM4-TMM7:
//            B TMM0  B TMM1
//    A TMM2  C TMM4  C TMM6
//    A TMM3  C TMM5  C TMM7
//
// Each `amx` kernel handles 4 blocks at a time: 2MB * 2NB, when m < 2 * BLOCK_M, unpack A
// will be needed.
//
// Here another commonly used pattern 1-3-3 is skipped, as it is mostly used when m <=16;
// and the sinlge batch gemm (m=1) has a special fast path with `avx512-vnni`.
//
// ref: https://www.intel.com/content/www/us/en/developer/articles/code-sample/
//   advanced-matrix-extensions-intrinsics-functions.html
//

#define TC_CONFIG_TILE(i, r, cb) \
  tc.rows[i] = r;                \
  tc.colsb[i] = cb
void ggml_tile_config_init(void) {
  static thread_local tile_config_t tc;
  tile_config_t current_tc;
  _tile_storeconfig(&current_tc);

  // load only when config changes
  if (tc.palette_id == 0 || (memcmp(&current_tc.colsb, &tc.colsb, sizeof(uint16_t) * 8) != 0 &&
                             memcmp(&current_tc.rows, &tc.rows, sizeof(uint8_t) * 8) != 0)) {
    tc.palette_id = 1;
    tc.start_row = 0;
    TC_CONFIG_TILE(TMM0, 8, 64);
    TC_CONFIG_TILE(TMM1, 8, 64);
    TC_CONFIG_TILE(TMM2, 16, 32);
    TC_CONFIG_TILE(TMM3, 16, 32);
    TC_CONFIG_TILE(TMM4, 16, 64);
    TC_CONFIG_TILE(TMM5, 16, 64);
    TC_CONFIG_TILE(TMM6, 16, 64);
    TC_CONFIG_TILE(TMM7, 16, 64);
    _tile_loadconfig(&tc);
  }
}

// we need an extra 16 * 4B (TILE_N * int32_t) for each NB/KB block for compensation.
// See the notes `s8s8 igemm compensation in avx512-vnni` for detail.
template <typename TB>
int get_tile_size() {
  int tile_size = TILE_N * sizeof(TB);
  if (do_compensate<TB>::value) {
    tile_size += TILE_N * sizeof(int32_t);
  }
  if (std::is_same<TB, block_q4_K>::value || std::is_same<TB, block_q5_K>::value) {
    tile_size += TILE_N * 4;
  }
  if (std::is_same<TB, block_iq4_xs>::value) {
    tile_size += TILE_N * 2;
  }
  return tile_size;
}

template <typename TB, int BLOCK_K>
int get_row_size(int K) {
  int KB = K / BLOCK_K;
  int row_size = KB * sizeof(TB);
  if (do_compensate<TB>::value) {
    row_size += KB * sizeof(int32_t);
  }
  if (std::is_same<TB, block_q4_K>::value || std::is_same<TB, block_q5_K>::value) {
    row_size += KB * 4;
  }
  if (std::is_same<TB, block_iq4_xs>::value) {
    row_size += KB * 2;
  }
  return row_size;
}

// vectorized dtype conversion
inline float FP16_TO_FP32(ggml_half val) {
  __m256i v = _mm256_setr_epi16(val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  __m512 o = _mm512_cvtph_ps(v);
  return _mm512_cvtss_f32(o);
}

inline __m512 FP16_TO_FP32_VEC(ggml_half val) {
  __m256i v = _mm256_set1_epi16(val);
  return _mm512_cvtph_ps(v);
}

// horizontal reduce
inline float _mm512_reduce_max_ps(const __m512 x) {
  __m512 v = x;
  __m512 v1 = _mm512_shuffle_f32x4(v, v, 0x4E);
  v = _mm512_max_ps(v, v1);
  v1 = _mm512_shuffle_f32x4(v, v, 0xB1);
  v = _mm512_max_ps(v, v1);
  v1 = _mm512_shuffle_ps(v, v, 0x4E);
  v = _mm512_max_ps(v, v1);
  v1 = _mm512_shuffle_ps(v, v, 0xB1);
  v = _mm512_max_ps(v, v1);
  return _mm512_cvtss_f32(v);
}

// transpose utils
#define SHUFFLE_EPI32(a, b, mask) \
  _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), mask))

// transpose 8x8 32-bit element from v to v1
inline void transpose_8x8_32bit(__m256i* v, __m256i* v1) {
  // unpacking and 32-bit elements
  v1[0] = _mm256_unpacklo_epi32(v[0], v[1]);
  v1[1] = _mm256_unpackhi_epi32(v[0], v[1]);
  v1[2] = _mm256_unpacklo_epi32(v[2], v[3]);
  v1[3] = _mm256_unpackhi_epi32(v[2], v[3]);
  v1[4] = _mm256_unpacklo_epi32(v[4], v[5]);
  v1[5] = _mm256_unpackhi_epi32(v[4], v[5]);
  v1[6] = _mm256_unpacklo_epi32(v[6], v[7]);
  v1[7] = _mm256_unpackhi_epi32(v[6], v[7]);

  // shuffling the 32-bit elements
  v[0] = SHUFFLE_EPI32(v1[0], v1[2], 0x44);
  v[1] = SHUFFLE_EPI32(v1[0], v1[2], 0xee);
  v[2] = SHUFFLE_EPI32(v1[4], v1[6], 0x44);
  v[3] = SHUFFLE_EPI32(v1[4], v1[6], 0xee);
  v[4] = SHUFFLE_EPI32(v1[1], v1[3], 0x44);
  v[5] = SHUFFLE_EPI32(v1[1], v1[3], 0xee);
  v[6] = SHUFFLE_EPI32(v1[5], v1[7], 0x44);
  v[7] = SHUFFLE_EPI32(v1[5], v1[7], 0xee);

  // shuffling 128-bit elements
  v1[0] = _mm256_permute2f128_si256(v[2], v[0], 0x02);
  v1[1] = _mm256_permute2f128_si256(v[3], v[1], 0x02);
  v1[2] = _mm256_permute2f128_si256(v[6], v[4], 0x02);
  v1[3] = _mm256_permute2f128_si256(v[7], v[5], 0x02);
  v1[4] = _mm256_permute2f128_si256(v[2], v[0], 0x13);
  v1[5] = _mm256_permute2f128_si256(v[3], v[1], 0x13);
  v1[6] = _mm256_permute2f128_si256(v[6], v[4], 0x13);
  v1[7] = _mm256_permute2f128_si256(v[7], v[5], 0x13);
}

// transpose 16x4 32-bit element to 4x16 from r to d
inline void transpose_16x4_32bit(__m512i* r, __m512i* d) {
  static const __m512i index1 =
      _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02, 0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00);

  d[0] = _mm512_permutexvar_epi32(index1, r[0]);
  d[1] = _mm512_permutexvar_epi32(index1, r[1]);
  d[2] = _mm512_permutexvar_epi32(index1, r[2]);
  d[3] = _mm512_permutexvar_epi32(index1, r[3]);

  r[0] = _mm512_shuffle_i32x4(d[0], d[1], 0x44);
  r[1] = _mm512_shuffle_i32x4(d[0], d[1], 0xee);
  r[2] = _mm512_shuffle_i32x4(d[2], d[3], 0x44);
  r[3] = _mm512_shuffle_i32x4(d[2], d[3], 0xee);

  d[0] = _mm512_shuffle_i32x4(r[0], r[2], 0x88);
  d[1] = _mm512_shuffle_i32x4(r[0], r[2], 0xdd);
  d[2] = _mm512_shuffle_i32x4(r[1], r[3], 0x88);
  d[3] = _mm512_shuffle_i32x4(r[1], r[3], 0xdd);
}

// transpose 16x16 32-bit element in place
inline void transpose_16x16_32bit(__m512i* v) {
  __m512i v1[16];
  v1[0] = _mm512_unpacklo_epi32(v[0], v[1]);
  v1[1] = _mm512_unpackhi_epi32(v[0], v[1]);
  v1[2] = _mm512_unpacklo_epi32(v[2], v[3]);
  v1[3] = _mm512_unpackhi_epi32(v[2], v[3]);
  v1[4] = _mm512_unpacklo_epi32(v[4], v[5]);
  v1[5] = _mm512_unpackhi_epi32(v[4], v[5]);
  v1[6] = _mm512_unpacklo_epi32(v[6], v[7]);
  v1[7] = _mm512_unpackhi_epi32(v[6], v[7]);
  v1[8] = _mm512_unpacklo_epi32(v[8], v[9]);
  v1[9] = _mm512_unpackhi_epi32(v[8], v[9]);
  v1[10] = _mm512_unpacklo_epi32(v[10], v[11]);
  v1[11] = _mm512_unpackhi_epi32(v[10], v[11]);
  v1[12] = _mm512_unpacklo_epi32(v[12], v[13]);
  v1[13] = _mm512_unpackhi_epi32(v[12], v[13]);
  v1[14] = _mm512_unpacklo_epi32(v[14], v[15]);
  v1[15] = _mm512_unpackhi_epi32(v[14], v[15]);

  v[0] = _mm512_unpacklo_epi64(v1[0], v1[2]);
  v[1] = _mm512_unpackhi_epi64(v1[0], v1[2]);
  v[2] = _mm512_unpacklo_epi64(v1[1], v1[3]);
  v[3] = _mm512_unpackhi_epi64(v1[1], v1[3]);
  v[4] = _mm512_unpacklo_epi64(v1[4], v1[6]);
  v[5] = _mm512_unpackhi_epi64(v1[4], v1[6]);
  v[6] = _mm512_unpacklo_epi64(v1[5], v1[7]);
  v[7] = _mm512_unpackhi_epi64(v1[5], v1[7]);
  v[8] = _mm512_unpacklo_epi64(v1[8], v1[10]);
  v[9] = _mm512_unpackhi_epi64(v1[8], v1[10]);
  v[10] = _mm512_unpacklo_epi64(v1[9], v1[11]);
  v[11] = _mm512_unpackhi_epi64(v1[9], v1[11]);
  v[12] = _mm512_unpacklo_epi64(v1[12], v1[14]);
  v[13] = _mm512_unpackhi_epi64(v1[12], v1[14]);
  v[14] = _mm512_unpacklo_epi64(v1[13], v1[15]);
  v[15] = _mm512_unpackhi_epi64(v1[13], v1[15]);

  v1[0] = _mm512_shuffle_i32x4(v[0], v[4], 0x88);
  v1[1] = _mm512_shuffle_i32x4(v[1], v[5], 0x88);
  v1[2] = _mm512_shuffle_i32x4(v[2], v[6], 0x88);
  v1[3] = _mm512_shuffle_i32x4(v[3], v[7], 0x88);
  v1[4] = _mm512_shuffle_i32x4(v[0], v[4], 0xdd);
  v1[5] = _mm512_shuffle_i32x4(v[1], v[5], 0xdd);
  v1[6] = _mm512_shuffle_i32x4(v[2], v[6], 0xdd);
  v1[7] = _mm512_shuffle_i32x4(v[3], v[7], 0xdd);
  v1[8] = _mm512_shuffle_i32x4(v[8], v[12], 0x88);
  v1[9] = _mm512_shuffle_i32x4(v[9], v[13], 0x88);
  v1[10] = _mm512_shuffle_i32x4(v[10], v[14], 0x88);
  v1[11] = _mm512_shuffle_i32x4(v[11], v[15], 0x88);
  v1[12] = _mm512_shuffle_i32x4(v[8], v[12], 0xdd);
  v1[13] = _mm512_shuffle_i32x4(v[9], v[13], 0xdd);
  v1[14] = _mm512_shuffle_i32x4(v[10], v[14], 0xdd);
  v1[15] = _mm512_shuffle_i32x4(v[11], v[15], 0xdd);

  v[0] = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88);
  v[1] = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88);
  v[2] = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88);
  v[3] = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88);
  v[4] = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88);
  v[5] = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88);
  v[6] = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88);
  v[7] = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88);
  v[8] = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd);
  v[9] = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd);
  v[10] = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd);
  v[11] = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd);
  v[12] = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd);
  v[13] = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd);
  v[14] = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd);
  v[15] = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd);
}

void quantize_row_q8_K_vnni(const float* RESTRICT x, void* RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
  const int KB = k / QK_K;
  constexpr int kVecs = QK_K / 16;

  block_q8_K* y = reinterpret_cast<block_q8_K*>(vy);

  // hold 16 float vecs from x
  __m512 v[kVecs];

  // hold the quants vecs
  __m512i vq[kVecs / 4];

  // hold the packed quants vecs
  __m512i vq_packed[kVecs / 4];

  const __m512 signBit = _mm512_set1_ps(-0.f);

  for (int i = 0; i < KB; ++i) {
    // Compute max(abs(e)) for the block
    __m512 vamax = _mm512_set1_ps(0.f);
    for (int j = 0; j < kVecs; ++j) {
      v[j] = _mm512_loadu_ps(x);
      x += 16;
      vamax = _mm512_max_ps(vamax, _mm512_andnot_ps(signBit, v[j]));
    }
    const float amax = _mm512_reduce_max_ps(vamax);

    // Quantize these floats
    const float iscale = 127.f / amax;
    y[i].d = GGML_FP32_TO_FP16(1 / iscale);
    const float id = (amax != 0.0f) ? iscale : 0.f;
    const __m512 vscale = _mm512_set1_ps(id);

    // Apply multiplier and round to nearest integer
    for (int j = 0; j < kVecs; ++j) {
      v[j] = _mm512_mul_ps(v[j], vscale);
      v[j] = _mm512_roundscale_ps(v[j], (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
    }

    // Pack to epi8 vecs
    for (int j = 0; j < kVecs / 4; ++j) {
      __m128i q8_0 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 0]));
      __m128i q8_1 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 1]));
      __m128i q8_2 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 2]));
      __m128i q8_3 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 3]));

      __m256i q8_01 = _mm256_insertf128_si256(_mm256_castsi128_si256(q8_0), (q8_1), 1);
      __m256i q8_23 = _mm256_insertf128_si256(_mm256_castsi128_si256(q8_2), (q8_3), 1);

      vq[j] = _mm512_inserti32x8(_mm512_castsi256_si512(q8_01), q8_23, 1);
      _mm512_storeu_si512((__m512i*)(y[i].qs + j * 64), vq[j]);
    }

    // Compute the bsums with vnni
    transpose_16x4_32bit(vq, vq_packed);

    const __m512i one = _mm512_set1_epi8(1);
    __m512i sum = _mm512_setzero_si512();
    for (int k = 0; k < 4; ++k) {
      sum = _mm512_dpbusd_epi32(sum, one, vq_packed[k]);
    }
    _mm256_storeu_si256((__m256i*)(y[i].bsums), _mm512_cvtepi32_epi16(sum));
  }
}

// quantize A from float to `vec_dot_type`
template <typename T>
inline void from_float(const float* x, char* vy, int64_t k);

template <>
inline void from_float<block_q8_0>(const float* x, char* vy, int64_t k) {
  quantize_row_q8_0(x, vy, k);
}

template <>
inline void from_float<block_q8_1>(const float* x, char* vy, int64_t k) {
  quantize_row_q8_1(x, vy, k);
}

template <>
inline void from_float<block_q8_K>(const float* x, char* vy, int64_t k) {
#if 1
  // TODO: this is reference impl!
  quantize_row_q8_K(x, vy, k);
#else
  quantize_row_q8_K_vnni(x, vy, k);
#endif
}

// load A from memory to array when nrows can not fill in whole tile
void unpack_A(int8_t* RESTRICT tile, const block_q8_0* RESTRICT A, int lda, int nr) {
  assert(nr != TILE_M);
  for (int m = 0; m < nr; ++m) {
    const __m256i v = _mm256_loadu_si256((const __m256i*)(A[m * lda].qs));
    _mm256_storeu_si256((__m256i*)(tile + m * TILE_K), v);
  }
}

void unpack_A(int8_t* RESTRICT tile, const block_q8_1* RESTRICT A, int lda, int nr) {
  assert(nr != TILE_M);
  for (int m = 0; m < nr; ++m) {
    const __m256i v = _mm256_loadu_si256((const __m256i*)(A[m * lda].qs));
    _mm256_storeu_si256((__m256i*)(tile + m * TILE_K), v);
  }
}

template <typename TB>
void unpack_A(int8_t* RESTRICT tile, const block_q8_K* RESTRICT A, int lda, int k, int nr) {
  assert(nr <= TILE_M);
  for (int m = 0; m < nr; ++m) {
    const __m256i v = _mm256_loadu_si256((const __m256i*)(A[m * lda].qs + k * 32));
    _mm256_storeu_si256((__m256i*)(tile + m * TILE_K), v);
  }
}

template <>
void unpack_A<block_q6_K>(int8_t* RESTRICT tile, const block_q8_K* RESTRICT A, int lda, int k, int nr) {
  assert(nr <= TILE_M);
  // zero padding k from 16 to 32, so that we don't have to re-config amx
  const __m128i zero = _mm_setzero_si128();
  for (int m = 0; m < nr; ++m) {
    const __m128i v = _mm_loadu_si128((const __m128i*)(A[m * lda].qs + k * 16));
    const __m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(v), zero, 1);
    _mm256_storeu_si256((__m256i*)(tile + m * TILE_K), r);
  }
}

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
inline __m256i bytes_from_nibbles_32(const uint8_t* rsi) {
  const __m128i tmp = _mm_loadu_si128((const __m128i*)rsi);
  const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
  const __m256i lowMask = _mm256_set1_epi8(0xF);
  return _mm256_and_si256(lowMask, bytes);
}

// used for block_q4_K
inline __m512i bytes_from_nibbles_64(const uint8_t* rsi) {
  const __m256i tmp = _mm256_loadu_si256((const __m256i*)rsi);
  const __m256i lowMask = _mm256_set1_epi8(0xF);
  const __m256i q4l = _mm256_and_si256(tmp, lowMask);
  const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(tmp, 4), lowMask);
  return _mm512_inserti32x8(_mm512_castsi256_si512(q4l), q4h, 1);
}

// used for block_q5_K
inline __m512i bytes_from_nibbles_64(const uint8_t* qs, const uint8_t* qh, int k) {
  const __m256i lowMask = _mm256_set1_epi8(0xF);
  __m256i hmask = _mm256_set1_epi8(1);
  hmask = _mm256_slli_epi16(hmask, k);

  const __m256i q5bits = _mm256_loadu_si256((const __m256i*)qs);
  const __m256i hbits = _mm256_loadu_si256((const __m256i*)qh);

  const __m256i q5l_0 = _mm256_and_si256(q5bits, lowMask);
  const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), k + 0), 4);
  const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0);
  hmask = _mm256_slli_epi16(hmask, 1);

  const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), lowMask);
  const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), k + 1), 4);
  const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1);

  return _mm512_inserti32x8(_mm512_castsi256_si512(q5_0), q5_1, 1);
}

// used for block_q6_K
inline void bytes_from_nibbles_128(__m512i& r0, __m512i& r1, const uint8_t* qs, const uint8_t* qh) {
  const __m256i m4 = _mm256_set1_epi8(0xF);
  const __m256i m2 = _mm256_set1_epi8(0x3);

  const __m256i q6bits1 = _mm256_loadu_si256((const __m256i*)qs);
  const __m256i q6bits2 = _mm256_loadu_si256((const __m256i*)(qs + 32));
  const __m256i q6bitsH = _mm256_loadu_si256((const __m256i*)qh);

  const __m256i q6h_0 = _mm256_slli_epi16(_mm256_and_si256(q6bitsH, m2), 4);
  const __m256i q6h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 2), m2), 4);
  const __m256i q6h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 4), m2), 4);
  const __m256i q6h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 6), m2), 4);

  const __m256i q6_0 = _mm256_or_si256(_mm256_and_si256(q6bits1, m4), q6h_0);
  const __m256i q6_1 = _mm256_or_si256(_mm256_and_si256(q6bits2, m4), q6h_1);
  const __m256i q6_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q6bits1, 4), m4), q6h_2);
  const __m256i q6_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q6bits2, 4), m4), q6h_3);

  r0 = _mm512_inserti32x8(_mm512_castsi256_si512(q6_0), q6_1, 1);
  r1 = _mm512_inserti32x8(_mm512_castsi256_si512(q6_2), q6_3, 1);
}

inline __m512i packNibbles(__m512i r0, __m512i r1) { return _mm512_or_si512(r0, _mm512_slli_epi16(r1, 4)); }

template <typename TB>
inline void pack_qs(void* RESTRICT packed_B, const TB* RESTRICT B, int KB) {
  int8_t tmp[8 * 64];
  __m256i v[8], v2[8];
  for (int n = 0; n < 8; ++n) {
    v[n] = bytes_from_nibbles_32(B[n * KB].qs);
  }
  transpose_8x8_32bit(v, v2);
  for (int n = 0; n < 8; ++n) {
    _mm256_storeu_si256((__m256i*)(tmp + n * 64), v2[n]);
  }
  for (int n = 0; n < 8; ++n) {
    v[n] = bytes_from_nibbles_32(B[(n + 8) * KB].qs);
  }
  transpose_8x8_32bit(v, v2);
  for (int n = 0; n < 8; ++n) {
    _mm256_storeu_si256((__m256i*)(tmp + n * 64 + 32), v2[n]);
  }

  // pack again with 128 to fully utilize vector length
  for (int n = 0; n < 8; n += 2) {
    __m512i r0 = _mm512_loadu_si512((const __m512i*)(tmp + n * 64));
    __m512i r1 = _mm512_loadu_si512((const __m512i*)(tmp + n * 64 + 64));
    __m512i r1r0 = packNibbles(r0, r1);
    _mm512_storeu_si512((__m512i*)((char*)packed_B + n * 32), r1r0);
  }
}

template <>
inline void pack_qs<block_q8_0>(void* RESTRICT packed_B, const block_q8_0* RESTRICT B, int KB) {
  __m256i v[8], v2[8];
  for (int n = 0; n < 8; ++n) {
    v[n] = _mm256_loadu_si256((const __m256i*)(B[n * KB].qs));
  }
  transpose_8x8_32bit(v, v2);
  for (int n = 0; n < 8; ++n) {
    _mm256_storeu_si256((__m256i*)((char*)packed_B + n * 64), v2[n]);
  }
  for (int n = 0; n < 8; ++n) {
    v[n] = _mm256_loadu_si256((const __m256i*)(B[(n + 8) * KB].qs));
  }
  transpose_8x8_32bit(v, v2);
  for (int n = 0; n < 8; ++n) {
    _mm256_storeu_si256((__m256i*)((char*)packed_B + n * 64 + 32), v2[n]);
  }
}

template <>
inline void pack_qs<block_q4_K>(void* RESTRICT packed_B, const block_q4_K* RESTRICT B, int KB) {
  __m512i v[16];
  // QK_K 256 with 8 groups, handle 2 groups at a time
  char* pb = (char*)packed_B;
  for (int k = 0; k < QK_K / 64; ++k) {
    // pack 2 groups { n, g,  k} to {g, k/4, 4n}
    //          e.g. {16, 2, 32} to {2,   8, 64}
    for (int n = 0; n < TILE_N; ++n) {
      v[n] = bytes_from_nibbles_64(B[n * KB].qs + k * 32);
    }

    transpose_16x16_32bit(v);

    // pack again with 128 to fully utilize vector length
    for (int n = 0; n < TILE_N; n += 2) {
      _mm512_storeu_si512((__m512i*)pb, packNibbles(v[n], v[n + 1]));
      pb += 64;
    }
  }
}

template <>
inline void pack_qs<block_q5_K>(void* RESTRICT packed_B, const block_q5_K* RESTRICT B, int KB) {
  __m512i v[16];
  const __m512i lowMask = _mm512_set1_epi8(0xF);
  // QK_K 256 with 8 groups, handle 2 groups at a time
  char* pb = (char*)packed_B;
  char* ph = (char*)packed_B + (QK_K / 2) * TILE_N;
  for (int k = 0; k < QK_K / 64; ++k) {
    // pack 2 groups { n, g,  k} to {g, k/4, 4n}
    //          e.g. {16, 2, 32} to {2,   8, 64}
    for (int n = 0; n < TILE_N; ++n) {
      v[n] = bytes_from_nibbles_64(B[n * KB].qs + k * 32, B[n * KB].qh, /* group */ 2 * k);
    }

    transpose_16x16_32bit(v);

    // 1. pack lower 4bits with 2 groups
    for (int n = 0; n < TILE_N; n += 2) {
      // get lower 4 bits
      const __m512i r0 = _mm512_and_si512(v[n], lowMask);
      const __m512i r1 = _mm512_and_si512(v[n + 1], lowMask);
      _mm512_storeu_si512((__m512i*)pb, packNibbles(r0, r1));
      pb += 64;
    }

    // 2. pack higher 1bit with 2 groups
    const __m512i hmask = _mm512_set1_epi8(0x10);
    for (int g = 0; g < 2; ++g) {
      __m512i hbits = _mm512_setzero_si512();
      hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 0], hmask), 4));
      hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 1], hmask), 3));
      hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 2], hmask), 2));
      hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 3], hmask), 1));
      hbits = _mm512_add_epi8(hbits, _mm512_and_si512(v[g * 8 + 4], hmask));
      hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 5], hmask), 1));
      hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 6], hmask), 2));
      hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 7], hmask), 3));
      _mm512_storeu_si512((__m512i*)ph, hbits);
      ph += 64;
    }
  }
}

template <>
inline void pack_qs<block_q6_K>(void* RESTRICT packed_B, const block_q6_K* RESTRICT B, int KB) {
  __m512i v[32];
  const __m512i lowMask = _mm512_set1_epi8(0xF);
  // QK_K 256 with 8 groups, handle 4 groups at a time
  char* pb = (char*)packed_B;
  char* ph = (char*)packed_B + (QK_K / 2) * TILE_N;
  for (int k = 0; k < QK_K / 128; ++k) {
    for (int n = 0; n < TILE_N; ++n) {
      bytes_from_nibbles_128(v[n], v[n + 16], B[n * KB].ql + k * 64, B[n * KB].qh + k * 32);
    }

    // top half: group 0,1 or 4,5; bottom half: group 2,3 or 6,7
    transpose_16x16_32bit(v);
    transpose_16x16_32bit(v + 16);

    // 1. pack lower 4bits with 4 groups
    for (int n = 0; n < 32; n += 2) {
      const __m512i r0 = _mm512_and_si512(v[n], lowMask);
      const __m512i r1 = _mm512_and_si512(v[n + 1], lowMask);
      _mm512_storeu_si512((__m512i*)pb, packNibbles(r0, r1));
      pb += 64;
    }

    // 2. pack higher 2bit with 4 groups
    const __m512i hmask = _mm512_set1_epi8(0x30);
    for (int g = 0; g < 8; ++g) {
      __m512i hbits = _mm512_setzero_si512();
      hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 4 + 0], hmask), 4));
      hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 4 + 1], hmask), 2));
      hbits = _mm512_add_epi8(hbits, _mm512_and_si512(v[g * 4 + 2], hmask));
      hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 4 + 3], hmask), 2));
      _mm512_storeu_si512((__m512i*)ph, hbits);
      ph += 64;
    }
  }
}

template <>
inline void pack_qs<block_iq4_xs>(void* RESTRICT packed_B, const block_iq4_xs* RESTRICT B, int KB) {
  __m512i v[16];
  char* pb = (char*)packed_B;
  for (int k = 0; k < QK_K / 64; ++k) {
    for (int n = 0; n < TILE_N; ++n) {
      __m256i r0 = bytes_from_nibbles_32(B[n * KB].qs + k * 32 + 0);
      __m256i r1 = bytes_from_nibbles_32(B[n * KB].qs + k * 32 + 16);
      v[n] = _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
    }

    transpose_16x16_32bit(v);

    // pack again with 128 to fully utilize vector length
    for (int n = 0; n < TILE_N; n += 2) {
      _mm512_storeu_si512((__m512i*)pb, packNibbles(v[n], v[n + 1]));
      pb += 64;
    }
  }
}

// pack B to vnni formats in 4bits or 8 bits
void pack_B(void* RESTRICT packed_B, const block_q4_0* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);
  ggml_half* d0 = reinterpret_cast<ggml_half*>((char*)packed_B + TILE_N * TILE_K / 2);
  for (int n = 0; n < TILE_N; ++n) {
    d0[n] = B[n * KB].d;
  }
}

void pack_B(void* RESTRICT packed_B, const block_q4_1* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);
  ggml_half* d0 = reinterpret_cast<ggml_half*>((char*)packed_B + TILE_N * TILE_K / 2);
  ggml_half* m0 = d0 + TILE_N;
  for (int n = 0; n < TILE_N; ++n) {
    d0[n] = B[n * KB].d;
    m0[n] = B[n * KB].m;
  }
}

inline void s8s8_compensation(void* RESTRICT packed_B) {
  // packed_B layout:
  //   quants {TILE_N, TILEK}  int8_t
  //   d0     {TILE_N}      ggml_half
  //   comp   {TILE_N}        int32_t
  const int offset = TILE_N * TILE_K + TILE_N * sizeof(ggml_half);
  __m512i vcomp = _mm512_setzero_si512();
  const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
  for (int k = 0; k < 8; ++k) {
    __m512i vb = _mm512_loadu_si512((const __m512i*)((const char*)packed_B + k * 64));
    vcomp = _mm512_dpbusd_epi32(vcomp, off, vb);
  }
  _mm512_storeu_si512((__m512i*)((char*)(packed_B) + offset), vcomp);
}

void pack_B(void* RESTRICT packed_B, const block_q8_0* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);
  ggml_half* d0 = reinterpret_cast<ggml_half*>((char*)packed_B + TILE_N * TILE_K);
  for (int n = 0; n < TILE_N; ++n) {
    d0[n] = B[n * KB].d;
  }
  s8s8_compensation(packed_B);
}

// convert 8 * {min, scale} from int6 to int8
inline void unpack_mins_and_scales(const uint8_t* scales, uint32_t* utmp) {
  const uint32_t kmask1 = 0x3f3f3f3f;
  const uint32_t kmask2 = 0x0f0f0f0f;
  const uint32_t kmask3 = 0x03030303;

  memcpy(utmp, scales, 12);
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
  const uint32_t uaux = utmp[1] & kmask1;
  utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
  utmp[2] = uaux;
  utmp[0] &= kmask1;
}

// packed_B layout:
//   quants {8, TILE_N, 16}  uint8
//   scales {8, TILE_N}      uint8
//   mins   {8, TILE_N}      uint8
//   d      {TILE_N}     ggml_half
//   dmin   {TILE_N}     ggml_half
void pack_B(void* RESTRICT packed_B, const block_q4_K* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);

  uint8_t* scales = reinterpret_cast<uint8_t*>((char*)packed_B + (QK_K / 2) * TILE_N);
  uint8_t* mins = scales + 8 * TILE_N;
  ggml_half* d = reinterpret_cast<ggml_half*>(mins + 8 * TILE_N);
  ggml_half* dmin = d + TILE_N;

  union {
    uint32_t u32[4];
    uint8_t u8[16];
  } s;

  for (int n = 0; n < TILE_N; ++n) {
    unpack_mins_and_scales(B[n * KB].scales, s.u32);
    for (int k = 0; k < 8; ++k) {
      scales[k * TILE_N + n] = s.u8[k];
      mins[(k >> 1) * TILE_N * 2 + n * 2 + (k & 0x1)] = s.u8[k + 8];
    }
    d[n] = B[n * KB].d;
    dmin[n] = B[n * KB].dmin;
  }
}

// packed_B layout:
//   quants {8, TILE_N, 16}  uint8
//   qh     {8, TILE_N,  4}  uint8
//   scales {8, TILE_N}      uint8
//   mins   {8, TILE_N}      uint8
//   d      {TILE_N}     ggml_half
//   dmin   {TILE_N}     ggml_half
void pack_B(void* RESTRICT packed_B, const block_q5_K* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);

  uint8_t* scales = reinterpret_cast<uint8_t*>((char*)packed_B + (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N);
  uint8_t* mins = scales + 8 * TILE_N;
  ggml_half* d = reinterpret_cast<ggml_half*>(mins + 8 * TILE_N);
  ggml_half* dmin = d + TILE_N;

  union {
    uint32_t u32[4];
    uint8_t u8[16];
  } s;

  for (int n = 0; n < TILE_N; ++n) {
    unpack_mins_and_scales(B[n * KB].scales, s.u32);
    for (int k = 0; k < 8; ++k) {
      scales[k * TILE_N + n] = s.u8[k];
      mins[(k >> 1) * TILE_N * 2 + n * 2 + (k & 0x1)] = s.u8[k + 8];
    }
    d[n] = B[n * KB].d;
    dmin[n] = B[n * KB].dmin;
  }
}

// packed_B layout:
//   quants {16, TILE_N, 8}  uint8
//   qh     {16, TILE_N, 4}  uint8
//   scales {16, TILE_N}      uint8
//   d      {TILE_N}     ggml_half
void pack_B(void* RESTRICT packed_B, const block_q6_K* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);

  uint8_t* scales = reinterpret_cast<uint8_t*>((char*)packed_B + (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N);
  ggml_half* d = reinterpret_cast<ggml_half*>(scales + 16 * TILE_N);
  for (int n = 0; n < TILE_N; ++n) {
    const int8_t* ps = B[n * KB].scales;
    for (int k = 0; k < 16; ++k) {
      scales[k * TILE_N + n] = ps[k];
    }
    d[n] = B[n * KB].d;
  }
}

// packed_B layout:
//   quants {8, TILE_N, 16}  uint8
//   scales {8, TILE_N}       int8
//   d      {TILE_N}     ggml_half
void pack_B(void* RESTRICT packed_B, const block_iq4_xs* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);

  int8_t* scales = reinterpret_cast<int8_t*>((char*)packed_B + (QK_K / 2) * TILE_N);
  ggml_half* d = reinterpret_cast<ggml_half*>(scales + 8 * TILE_N);

  // pack the scales
  for (int n = 0; n < TILE_N; ++n) {
    uint16_t sh = B[n * KB].scales_h;
    for (int k = 0; k < 8; k += 2) {
      const int16_t ls1 = ((B[n * KB].scales_l[k / 2] & 0xf) | ((sh << 4) & 0x30)) - 32;
      const int16_t ls2 = ((B[n * KB].scales_l[k / 2] >> 4) | ((sh << 2) & 0x30)) - 32;
      scales[(k + 0) * TILE_N + n] = ls1;
      scales[(k + 1) * TILE_N + n] = ls2;
      sh >>= 4;
    }
    d[n] = B[n * KB].d;
  }
}

template <typename TB, typename packed_B_t = packed_B_type<TB>>
void unpack_B(packed_B_t* RESTRICT tile, const void* RESTRICT packed_B) {
  GGML_UNUSED(tile);
  GGML_UNUSED(packed_B);
};

template <>
void unpack_B<block_q4_0>(int8_t* RESTRICT tile, const void* RESTRICT packed_B) {
  const __m512i off = _mm512_set1_epi8(8);
  const __m512i lowMask = _mm512_set1_epi8(0xF);
  for (int n = 0; n < 8; n += 2) {
    __m512i bytes = _mm512_loadu_si512((const __m512i*)((const char*)packed_B + n * 32));
    const __m512i r0 = _mm512_sub_epi8(_mm512_and_si512(bytes, lowMask), off);
    const __m512i r1 = _mm512_sub_epi8(_mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask), off);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 0), r0);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 64), r1);
  }
}

template <>
void unpack_B<block_q4_1>(uint8_t* RESTRICT tile, const void* RESTRICT packed_B) {
  const __m512i lowMask = _mm512_set1_epi8(0xF);
  for (int n = 0; n < 8; n += 2) {
    __m512i bytes = _mm512_loadu_si512((const __m512i*)((const char*)packed_B + n * 32));
    const __m512i r0 = _mm512_and_si512(bytes, lowMask);
    const __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 0), r0);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 64), r1);
  }
}

// packed_B_t for QKK is int8_t
template <typename TB>
void unpack_B(int8_t* RESTRICT tile, const void* RESTRICT packed_B, int k) {
  const int packed_B_group_size = QK_K / 2 * TILE_N / 8;
  const char* packed_B_group = (const char*)packed_B + k * packed_B_group_size;
  const __m512i lowMask = _mm512_set1_epi8(0xF);
  for (int n = 0; n < 8; n += 2) {
    __m512i bytes = _mm512_loadu_si512(packed_B_group + n * 32);
    const __m512i r0 = _mm512_and_si512(bytes, lowMask);
    const __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 0), r0);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 64), r1);
  }
}

template <>
void unpack_B<block_q5_K>(int8_t* RESTRICT tile, const void* RESTRICT packed_B, int k) {
  // lower 4bits, stride 256 bytes
  const int packed_l4_group_size = QK_K / 2 * TILE_N / 8;
  const char* pb = (const char*)packed_B + k * packed_l4_group_size;

  // higher 1bit, stride 64 bytes
  const int packed_h1_group_size = QK_K / 8 * TILE_N / 8;
  const char* ph = (const char*)packed_B + (QK_K / 2) * TILE_N + k * packed_h1_group_size;
  const __m512i hbits = _mm512_loadu_si512(ph);

  const __m512i lowMask = _mm512_set1_epi8(0xF);
  __m512i hmask0 = _mm512_set1_epi8(0x1);
  __m512i hmask1 = _mm512_set1_epi8(0x2);

  for (int n = 0; n < 8; n += 2) {
    __m512i bytes = _mm512_loadu_si512(pb + n * 32);
    __m512i r0 = _mm512_and_si512(bytes, lowMask);
    __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
    __m512i h0 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask0), n), 4);
    __m512i h1 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), n + 1), 4);

    hmask0 = _mm512_slli_epi16(hmask0, 2);
    hmask1 = _mm512_slli_epi16(hmask1, 2);
    r0 = _mm512_add_epi8(r0, h0);
    r1 = _mm512_add_epi8(r1, h1);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 0), r0);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 64), r1);
  }
}

template <>
void unpack_B<block_q6_K>(int8_t* RESTRICT tile, const void* RESTRICT packed_B, int k) {
  // lower 4bits, stride 128 bytes
  const int packed_l4_group_size = QK_K / 2 * TILE_N / 16;
  const char* pb = (const char*)packed_B + k * packed_l4_group_size;

  // higher 2bits, stride 64 bytes
  const int packed_h2_group_size = QK_K / 4 * TILE_N / 16;
  const char* ph = (const char*)packed_B + (QK_K / 2) * TILE_N + k * packed_h2_group_size;
  const __m512i hbits = _mm512_loadu_si512(ph);

  const __m512i off = _mm512_set1_epi8(32);
  const __m512i lowMask = _mm512_set1_epi8(0xF);
  __m512i hmask0 = _mm512_set1_epi8(0x3);  // 0011
  __m512i hmask1 = _mm512_set1_epi8(0xC);  // 1100

  // notes: skip zero padding from row4 to row7 as we have done so in `unpack_A`
  __m512i bytes = _mm512_loadu_si512(pb);
  __m512i r0 = _mm512_and_si512(bytes, lowMask);
  __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
  __m512i h0 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask0), 4);
  __m512i h1 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask1), 2);
  _mm512_storeu_si512((__m512i*)(tile + 0), _mm512_sub_epi8(_mm512_add_epi8(r0, h0), off));
  _mm512_storeu_si512((__m512i*)(tile + 64), _mm512_sub_epi8(_mm512_add_epi8(r1, h1), off));

  hmask0 = _mm512_slli_epi16(hmask0, 4);
  hmask1 = _mm512_slli_epi16(hmask1, 4);

  bytes = _mm512_loadu_si512(pb + 64);
  r0 = _mm512_and_si512(bytes, lowMask);
  r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
  h0 = _mm512_and_si512(hbits, hmask0);
  h1 = _mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), 2);
  _mm512_storeu_si512((__m512i*)(tile + 128), _mm512_sub_epi8(_mm512_add_epi8(r0, h0), off));
  _mm512_storeu_si512((__m512i*)(tile + 192), _mm512_sub_epi8(_mm512_add_epi8(r1, h1), off));
}

template <>
void unpack_B<block_iq4_xs>(int8_t* RESTRICT tile, const void* RESTRICT packed_B, int k) {
  static const __m512i values128 = _mm512_set_epi8(
      113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, 113, 89, 69, 53, 38, 25, 13, 1, -10,
      -22, -35, -49, -65, -83, -104, -127, 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
      113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127);

  const int packed_B_group_size = QK_K / 2 * TILE_N / 8;
  const char* pb = (const char*)packed_B + k * packed_B_group_size;
  const __m512i lowMask = _mm512_set1_epi8(0xF);

  for (int n = 0; n < 8; n += 2) {
    __m512i bytes = _mm512_loadu_si512(pb + n * 32);
    const __m512i r0 = _mm512_shuffle_epi8(values128, _mm512_and_si512(bytes, lowMask));
    const __m512i r1 = _mm512_shuffle_epi8(values128, _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask));
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 0), r0);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 64), r1);
  }
}

template <typename TA, typename TB, bool is_acc>
struct acc_C {};

template <bool is_acc>
struct acc_C<block_q8_0, block_q4_0, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_0* A, int lda,
                    const void* packed_B, int nr) {
    const int offset = TILE_N * TILE_K / 2;
    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)((const char*)packed_B + offset)));

    for (int m = 0; m < nr; ++m) {
      const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }
      vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <bool is_acc>
struct acc_C<block_q8_1, block_q4_1, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_1* A, int lda,
                    const void* packed_B, int nr) {
    const int offset = TILE_N * TILE_K / 2;
    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)((const char*)packed_B + offset)));
    const __m512 vm0 = _mm512_cvtph_ps(
        _mm256_loadu_si256((const __m256i*)((const char*)packed_B + offset + TILE_N * sizeof(ggml_half))));

    for (int m = 0; m < nr; ++m) {
      const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
      const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s));
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }
      vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum);
      vsum = _mm512_fmadd_ps(vm0, vs1, vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <bool is_acc>
struct acc_C<block_q8_0, block_q8_0, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_0* A, int lda,
                    const void* packed_B, int nr) {
    const int offset = TILE_N * TILE_K;
    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)((const char*)packed_B + offset)));

    for (int m = 0; m < nr; ++m) {
      const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }
      vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <bool is_acc>
struct acc_C<block_q8_K, block_q4_K, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_K* A, int lda,
                    const void* packed_B, int nr) {
    const uint8_t* scales = reinterpret_cast<const uint8_t*>((const char*)packed_B + (QK_K / 2) * TILE_N);
    const uint8_t* mins = scales + 8 * TILE_N;
    const ggml_half* d0 = reinterpret_cast<const ggml_half*>(mins + 8 * TILE_N);
    const ggml_half* dmin = d0 + TILE_N;

    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)d0));
    const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)dmin));

    for (int m = 0; m < nr; ++m) {
      const float d1 = A[m * lda].d;
      const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
      const __m512 vdm = _mm512_mul_ps(_mm512_set1_ps(-d1), vdmin);
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }

      const __m256i q8sums = _mm256_loadu_si256((const __m256i*)A[m * lda].bsums);
      const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));

      __m512i acc_m = _mm512_setzero_si512();
      for (int k = 0; k < 4; ++k) {
        __m512i vmask = _mm512_set1_epi32(k);
        __m512i va = _mm512_permutexvar_epi32(vmask, _mm512_castsi128_si512(q8s));
        __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i*)(mins + k * 32)));
        acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
      }

      vsum = _mm512_fmadd_ps(vtile, vd, vsum);
      vsum = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc_m), vdm, vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <bool is_acc>
struct acc_C<block_q8_K, block_q5_K, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_K* A, int lda,
                    const void* packed_B, int nr) {
    const uint8_t* scales =
        reinterpret_cast<const uint8_t*>((const char*)packed_B + (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N);
    const uint8_t* mins = scales + 8 * TILE_N;
    const ggml_half* d0 = reinterpret_cast<const ggml_half*>(mins + 8 * TILE_N);
    const ggml_half* dmin = d0 + TILE_N;

    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)d0));
    const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)dmin));

    for (int m = 0; m < nr; ++m) {
      const float d1 = A[m * lda].d;
      const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
      const __m512 vdm = _mm512_mul_ps(_mm512_set1_ps(-d1), vdmin);
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }

      const __m256i q8sums = _mm256_loadu_si256((const __m256i*)A[m * lda].bsums);
      const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));

      __m512i acc_m = _mm512_setzero_si512();
      for (int k = 0; k < 4; ++k) {
        __m512i vmask = _mm512_set1_epi32(k);
        __m512i va = _mm512_permutexvar_epi32(vmask, _mm512_castsi128_si512(q8s));
        __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i*)(mins + k * 32)));
        acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
      }

      vsum = _mm512_fmadd_ps(vtile, vd, vsum);
      vsum = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc_m), vdm, vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <bool is_acc>
struct acc_C<block_q8_K, block_q6_K, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_K* A, int lda,
                    const void* packed_B, int nr) {
    const uint8_t* scales =
        reinterpret_cast<const uint8_t*>((const char*)packed_B + (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N);
    const ggml_half* d0 = reinterpret_cast<const ggml_half*>(scales + 16 * TILE_N);

    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)d0));

    for (int m = 0; m < nr; ++m) {
      const float d1 = A[m * lda].d;
      const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }

      vsum = _mm512_fmadd_ps(vtile, vd, vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <bool is_acc>
struct acc_C<block_q8_K, block_iq4_xs, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_K* A, int lda,
                    const void* packed_B, int nr) {
    const int8_t* scales = reinterpret_cast<const int8_t*>((const char*)packed_B + (QK_K / 2) * TILE_N);
    const ggml_half* d0 = reinterpret_cast<const ggml_half*>(scales + 8 * TILE_N);

    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)d0));

    for (int m = 0; m < nr; ++m) {
      const float d1 = A[m * lda].d;
      const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }

      vsum = _mm512_fmadd_ps(vtile, vd, vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <typename TB>
constexpr int get_quants_size();
template <>
constexpr int get_quants_size<block_q4_K>() {
  return (QK_K / 2) * TILE_N;
}
template <>
constexpr int get_quants_size<block_q5_K>() {
  return (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N;
}
template <>
constexpr int get_quants_size<block_q6_K>() {
  return (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N;
}
template <>
constexpr int get_quants_size<block_iq4_xs>() {
  return (QK_K / 2) * TILE_N;
}

// used for QKK format
template <typename TB, bool is_acc, typename std::enable_if<is_type_qkk<TB>::value, int>::type = 0>
inline void scale_C(const int32_t* RESTRICT tile, int32_t* RESTRICT sumi, const void* packed_B, int k, int nr) {
  const uint8_t* scales = reinterpret_cast<const uint8_t*>((const char*)packed_B + get_quants_size<TB>());
  const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)(scales + k * TILE_N)));

  for (int m = 0; m < nr; ++m) {
    __m512i vsumi;
    if (is_acc) {
      vsumi = _mm512_loadu_si512(sumi + m * TILE_N);
    } else {
      vsumi = _mm512_setzero_si512();
    }
    __m512i vtile = _mm512_loadu_si512(tile + m * TILE_N);
    vsumi = _mm512_add_epi32(vsumi, _mm512_mullo_epi32(vtile, vscale));
    _mm512_storeu_si512((__m512i*)(sumi + m * TILE_N), vsumi);
  }
}

template <typename TA, typename TB, typename TC, int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_avx {
  static void apply(int K, const TA* RESTRICT A, const TB* RESTRICT B, TC* RESTRICT C, int ldc) {
    GGML_UNUSED(K);
    GGML_UNUSED(A);
    GGML_UNUSED(B);
    GGML_UNUSED(C);
    GGML_UNUSED(ldc);
  }
};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int K, const float* RESTRICT A, const ggml_fp16_t* RESTRICT B, float* RESTRICT C, int ldc) {
    constexpr int ROWS = BLOCK_M;
    constexpr int COLS = BLOCK_N;
    assert(BLOCK_K == 16);

    __m512 va;
    __m512 vb[COLS];
    __m512 vc[ROWS * COLS];

    auto loadc = [&](int idx) { vc[idx] = _mm512_setzero_ps(); };
    Unroll<ROWS * COLS>{}(loadc);

    auto compute = [&](int idx, int k) {
      // TODO: use `constexpr` here to get rid of interger div
      // when upgraded to C++17
      const int row = idx / COLS;
      const int col = idx % COLS;

      if (col == 0) {
        va = _mm512_loadu_ps(A + row * K + k);
      }
      if (row == 0) {
        vb[col] = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(B + col * K + k)));
      }
      vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]);
    };

    for (int k = 0; k < K; k += 16) {
      Unroll<ROWS * COLS>{}(compute, k);
    }

    auto storec = [&](int idx) {
      const int row = idx / COLS;
      const int col = idx % COLS;
      C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]);
    };
    Unroll<ROWS * COLS>{}(storec);
  }
};

#define LAUNCH_TINYGEMM_KERNEL_AVX(MB_SIZE, NB_SIZE)                                      \
  tinygemm_kernel_avx<float, type, float, MB_SIZE, NB_SIZE, blck_size>::apply(            \
      K, (const float*)src1->data + mb_start * K, (const type*)src0->data + nb_start * K, \
      (float*)dst->data + mb_start * ldc + nb_start, ldc);

// re-organize in the format {NB, KB, TILE_SIZE}:
#define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size

template <typename TB, int BLOCK_K>
void convert_B_packed_format(void* RESTRICT packed_B, const TB* RESTRICT B, int N, int K) {
  const int NB = N / TILE_N;
  const int KB = K / BLOCK_K;
  const int TILE_SIZE = get_tile_size<TB>();

  // parallel on NB should be enough
  parallel_for(1, 0, NB, [&](int begin, int end) {
    for (int n = begin; n < end; ++n) {
      for (int k = 0; k < KB; ++k) {
        int n0 = n * TILE_N;
        pack_B((char*)packed_B + PACKED_INDEX(n, k, KB, TILE_SIZE), &B[n0 * KB + k], KB);
      }
    }
  });
}

template <typename TA, typename TB, typename TC, int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni {};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_q4_0);

    const block_q8_0* RESTRICT A = static_cast<const block_q8_0*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    __m512i va[8];
    __m512 vc[COLS];
    __m512 vd1;

    // sum of offsets, shared across COLS
    //
    // avx512-vnni does not have `_mm512_dpbssd_epi32`,
    // need to transfrom ss to us:
    //   a * (b - 8) is equavilent to b * a - 8 * a
    //   s    u   u                   u   s   u   s
    //
    __m512i vcomp;

    const __m512i off = _mm512_set1_epi8(8);
    const __m512i lowMask = _mm512_set1_epi8(0xF);

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    auto compute = [&](int col, int i) {
      // load a and compute compensation
      if (col == 0) {
        const int32_t* a_ptr = reinterpret_cast<const int32_t*>(A[0 * KB + i].qs);
        vcomp = _mm512_setzero_si512();
        for (int k = 0; k < 8; ++k) {
          va[k] = _mm512_set1_epi32(a_ptr[k]);
          vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
        }
        vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
      }

      // load b
      __m512i vsum = _mm512_setzero_si512();
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      for (int k = 0; k < 8; k += 2) {
        __m512i bytes = _mm512_loadu_si512((const __m512i*)(b_ptr + k * 32));
        __m512i vb0 = _mm512_and_si512(bytes, lowMask);
        vsum = _mm512_dpbusd_epi32(vsum, vb0, va[k + 0]);
        __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
        vsum = _mm512_dpbusd_epi32(vsum, vb1, va[k + 1]);
      }
      const int offset = TILE_N * TILE_K / 2;
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset)));
      vsum = _mm512_sub_epi32(vsum, vcomp);

      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

template <int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_q4_1);

    const block_q8_1* RESTRICT A = static_cast<const block_q8_1*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    __m512i va[8];
    __m512i vb[8];
    __m512 vc[COLS];
    __m512 vd1, vs1;

    const __m512i lowMask = _mm512_set1_epi8(0xF);

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    auto compute = [&](int col, int i) {
      // load a
      if (col == 0) {
        const int32_t* a_ptr = reinterpret_cast<const int32_t*>(A[0 * KB + i].qs);
        for (int k = 0; k < 8; ++k) {
          va[k] = _mm512_set1_epi32(a_ptr[k]);
        }
        vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
        vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s));
      }

      // load b
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      for (int k = 0; k < 8; k += 2) {
        __m512i bytes = _mm512_loadu_si512((const __m512i*)(b_ptr + k * 32));
        vb[k + 0] = _mm512_and_si512(bytes, lowMask);
        vb[k + 1] = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
      }
      const int offset = TILE_N * TILE_K / 2;
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset)));
      const __m512 vm0 =
          _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset + TILE_N * sizeof(ggml_half))));

      __m512i vsum = _mm512_setzero_si512();
      for (int k = 0; k < 8; ++k) {
        vsum = _mm512_dpbusd_epi32(vsum, vb[k], va[k]);
      }

      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]);
      vc[col] = _mm512_fmadd_ps(vm0, vs1, vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_q8_0) + TILE_N * sizeof(int32_t);

    const block_q8_0* RESTRICT A = static_cast<const block_q8_0*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    __m512i va[8];
    __m512i vb[8];
    __m512 vc[COLS];
    __m512 vd1;

    // Notes: s8s8 igemm compensation in avx512-vnni
    // change s8s8 to u8s8 with compensate
    //   a * b = (a + 128) * b - 128 * b
    //   s   s       u       s    u    s
    //
    // (128 * b is pre-computed when packing B to vnni formats)
    //
    const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    auto compute = [&](int col, int i) {
      // load a and add offset 128
      if (col == 0) {
        const int32_t* a_ptr = reinterpret_cast<const int32_t*>(A[0 * KB + i].qs);
        for (int k = 0; k < 8; ++k) {
          va[k] = _mm512_set1_epi32(a_ptr[k]);
          va[k] = _mm512_add_epi8(va[k], off);
        }
        vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
      }

      // load b
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      for (int k = 0; k < 8; ++k) {
        vb[k] = _mm512_loadu_si512((const __m512i*)(b_ptr + k * 64));
      }
      const int offset = TILE_N * TILE_K;
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset)));
      const int offset2 = TILE_N * TILE_K + TILE_N * sizeof(ggml_half);
      const __m512i vcomp = _mm512_loadu_si512((const __m512i*)(b_ptr + offset2));

      __m512i vsum = _mm512_setzero_si512();
      for (int k = 0; k < 8; ++k) {
        vsum = _mm512_dpbusd_epi32(vsum, va[k], vb[k]);
      }
      vsum = _mm512_sub_epi32(vsum, vcomp);

      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_q4_K) + TILE_N * 4;

    const block_q8_K* RESTRICT A = static_cast<const block_q8_K*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    // a.qs:   8 groups, 32 bytes each group (m256i)
    __m512i va[8];
    // a.bsum: 8 groups,  2 bytes each group (m128i)
    __m512i va_bsum;
    __m512 vc[COLS];
    __m512 vd1;

    // packed_B:
    const int offset_scales = (QK_K / 2) * TILE_N;
    const int offset_mins = (QK_K / 2) * TILE_N + 8 * TILE_N;
    const int offset_d0 = (QK_K / 2) * TILE_N + 16 * TILE_N;
    const int offset_dmin = (QK_K / 2) * TILE_N + 16 * TILE_N + TILE_N * sizeof(ggml_half);

    const __m512i lowMask = _mm512_set1_epi8(0xF);

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    // Notes: vnni formats in QK_K
    //   a) quants vnni format
    //     int8  {k/4, n, 4}, viewed as 2d {k/4, 4n}, k = 32
    //     from {16, 32} to {8, 64}
    //
    //   b) min vnni format
    //     int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8
    //     from {16,  8} to {4, 32}
    //
    auto compute = [&](int col, int i) {
      // load a
      if (col == 0) {
        for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
          va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(A[0 * KB + i].qs + k_group * 32)));
        }
        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)A[0 * KB + i].bsums);
        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
        va_bsum = _mm512_castsi128_si512(q8s);
        vd1 = _mm512_set1_ps(A[0 * KB + i].d);
      }

      // step 1: accumultate the quants
      __m512i acc = _mm512_setzero_si512();
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      const char* b_qs = b_ptr;
      for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
        __m512i vsum = _mm512_setzero_si512();
        for (int k = 0; k < 8; k += 2) {
          __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 0), va[k_group]);
          __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 1), va[k_group]);

          __m512i bytes = _mm512_loadu_si512((const __m512i*)b_qs);
          __m512i vb0 = _mm512_and_si512(bytes, lowMask);
          vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
          __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
          vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);

          b_qs += 64;
        }
        // vacc += scale * (q8 @ q4)
        const __m512i vscale =
            _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)(b_ptr + offset_scales + k_group * TILE_N)));
        acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
      }
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_d0)));
      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);

      // step 2: accumulate the mins
      __m512i acc_m = _mm512_setzero_si512();
      for (int k = 0; k < 4; ++k) {
        __m512i vmask = _mm512_set1_epi32(k);
        __m512i va = _mm512_permutexvar_epi32(vmask, va_bsum);
        __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_mins + k * 32)));
        acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
      }
      const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_dmin)));
      vc[col] = _mm512_fnmadd_ps(_mm512_cvtepi32_ps(acc_m), _mm512_mul_ps(vdmin, vd1), vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_q5_K) + TILE_N * 4;

    const block_q8_K* RESTRICT A = static_cast<const block_q8_K*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    // a.qs:   8 groups, 32 bytes each group (m256i)
    __m512i va[8];
    // a.bsum: 8 groups,  2 bytes each group (m128i)
    __m512i va_bsum;
    __m512 vc[COLS];
    __m512 vd1;

    // packed_B:
    const int offset_qh = (QK_K / 2) * TILE_N;
    const int offset_scales = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N;
    const int offset_mins = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 8 * TILE_N;
    const int offset_d0 = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 16 * TILE_N;
    const int offset_dmin = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 16 * TILE_N + TILE_N * sizeof(ggml_half);

    const __m512i lowMask = _mm512_set1_epi8(0xF);

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    // Q5_K and Q4_K shares the same vnni formats, refer to notes above.
    auto compute = [&](int col, int i) {
      // load a
      if (col == 0) {
        for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
          va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(A[0 * KB + i].qs + k_group * 32)));
        }
        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)A[0 * KB + i].bsums);
        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
        va_bsum = _mm512_castsi128_si512(q8s);
        vd1 = _mm512_set1_ps(A[0 * KB + i].d);
      }

      // step 1: accumultate the quants
      __m512i acc = _mm512_setzero_si512();
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      const char* b_qs = b_ptr;
      const char* b_qh = b_ptr + offset_qh;
      for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
        __m512i vsum = _mm512_setzero_si512();
        __m512i hmask0 = _mm512_set1_epi8(0x1);
        __m512i hmask1 = _mm512_set1_epi8(0x2);
        __m512i hbits = _mm512_loadu_si512((const __m512i*)(b_qh + k_group * 64));
        for (int k = 0; k < 8; k += 2) {
          __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 0), va[k_group]);
          __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 1), va[k_group]);

          __m512i bytes = _mm512_loadu_si512((const __m512i*)b_qs);
          __m512i vb0 = _mm512_and_si512(bytes, lowMask);
          __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);

          __m512i vh0 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask0), k), 4);
          __m512i vh1 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), k + 1), 4);

          hmask0 = _mm512_slli_epi16(hmask0, 2);
          hmask1 = _mm512_slli_epi16(hmask1, 2);
          vb0 = _mm512_add_epi8(vb0, vh0);
          vb1 = _mm512_add_epi8(vb1, vh1);

          vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
          vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);

          b_qs += 64;
        }
        // vacc += scale * (q8 @ q5)
        const __m512i vscale =
            _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)(b_ptr + offset_scales + k_group * TILE_N)));
        acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
      }
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_d0)));
      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);

      // step 2: accumulate the mins
      __m512i acc_m = _mm512_setzero_si512();
      for (int k = 0; k < 4; ++k) {
        __m512i vmask = _mm512_set1_epi32(k);
        __m512i va = _mm512_permutexvar_epi32(vmask, va_bsum);
        __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_mins + k * 32)));
        acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
      }
      const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_dmin)));
      vc[col] = _mm512_fnmadd_ps(_mm512_cvtepi32_ps(acc_m), _mm512_mul_ps(vdmin, vd1), vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_K, block_q6_K, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_q6_K);

    const block_q8_K* RESTRICT A = static_cast<const block_q8_K*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    // load the 256 bytes from A to 4 avx512 vectors
    __m512i va[4];
    __m512 vc[COLS];
    __m512 vd1;

    // packed_B:
    const int offset_qh = (QK_K / 2) * TILE_N;
    const int offset_scales = (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N;
    const int offset_d0 = (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N + 16 * TILE_N;

    // compensation
    __m512i vcomp;

    const __m512i m32s = _mm512_set1_epi32(32);
    const __m512i lowMask = _mm512_set1_epi8(0xF);

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    auto compute = [&](int col, int i) {
      if (col == 0) {
        // load a
        va[0] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 0));
        va[1] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 64));
        va[2] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 128));
        va[3] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 192));

        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)A[0 * KB + i].bsums);
        vcomp = _mm512_mullo_epi32(_mm512_cvtepi16_epi32(q8sums), m32s);
        vd1 = _mm512_set1_ps(A[0 * KB + i].d);
      }

      // accmulate the quants
      __m512i acc = _mm512_setzero_si512();
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      const char* b_qs = b_ptr;
      const char* b_qh = b_ptr + offset_qh;
      int mask = 0;
      for (int k_group = 0; k_group < QK_K / 16; ++k_group) {
        int r = k_group >> 2;
        __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
        __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);

        __m512i vsum = _mm512_setzero_si512();
        __m512i hmask = _mm512_set1_epi8(0x3);

        __m512i bytes = _mm512_loadu_si512(b_qs);
        __m512i hbits = _mm512_loadu_si512(b_qh);
        __m512i vb0 = _mm512_and_si512(bytes, lowMask);
        __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
        __m512i vh0 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask), 4);
        __m512i vh1 = _mm512_slli_epi16(_mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 2)), 2);

        vb0 = _mm512_add_epi8(vb0, vh0);
        vb1 = _mm512_add_epi8(vb1, vh1);
        vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
        vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
        b_qs += 64;

        va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
        va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);

        bytes = _mm512_loadu_si512(b_qs);
        vb0 = _mm512_and_si512(bytes, lowMask);
        vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
        vh0 = _mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 4));
        vh1 = _mm512_srli_epi16(_mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 6)), 2);
        vb0 = _mm512_add_epi8(vb0, vh0);
        vb1 = _mm512_add_epi8(vb1, vh1);
        vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
        vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
        b_qs += 64;
        b_qh += 64;

        // B * A - 32 * A
        __m512i vmask = _mm512_set1_epi32(k_group);
        vsum = _mm512_sub_epi32(vsum, _mm512_permutexvar_epi32(vmask, vcomp));

        // vacc += scale * (q8 @ q6)
        const __m512i vscale =
            _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)(b_ptr + offset_scales + k_group * TILE_N)));
        acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
      }
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_d0)));
      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_iq4_xs) + TILE_N * 2;

    const block_q8_K* RESTRICT A = static_cast<const block_q8_K*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    // load the 256 bytes from A to 4 avx512 vectors
    __m512i va[4];
    __m512 vc[COLS];
    __m512 vd1;

    // packed_B:
    const int offset_scales = (QK_K / 2) * TILE_N;
    const int offset_d0 = (QK_K / 2) * TILE_N + 8 * TILE_N;

    // compensation
    __m512i vcomp;

    const __m256i m128s = _mm256_set1_epi16(128);
    const __m512i lowMask = _mm512_set1_epi8(0xF);

    const __m512i values128 = _mm512_set_epi8(113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
                                              113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
                                              113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
                                              113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127);
    const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
    const __m512i values256 = _mm512_add_epi8(values128, off);

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    auto compute = [&](int col, int i) {
      if (col == 0) {
        // load a
        va[0] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 0));
        va[1] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 64));
        va[2] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 128));
        va[3] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 192));

        // compensation: 128 * A
        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)A[0 * KB + i].bsums);
        vcomp = _mm512_castsi256_si512(_mm256_madd_epi16(q8sums, m128s));
        vd1 = _mm512_set1_ps(A[0 * KB + i].d);
      }

      // accmulate the quants
      __m512i acc = _mm512_setzero_si512();
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      const char* b_qs = b_ptr;
      int mask = 0;
      for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
        int r = k_group >> 1;
        __m512i vmask = _mm512_set1_epi32(k_group);
        __m512i vsum = _mm512_setzero_si512();
        for (int k = 0; k < 8; k += 2) {
          __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
          __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);

          __m512i bytes = _mm512_loadu_si512(b_qs);
          __m512i vb0 = _mm512_shuffle_epi8(values256, _mm512_and_si512(bytes, lowMask));
          __m512i vb1 = _mm512_shuffle_epi8(values256, _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask));

          vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
          vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
          b_qs += 64;
        }
        // (B + 128) * A - 128 * A
        vsum = _mm512_sub_epi32(vsum, _mm512_permutexvar_epi32(vmask, vcomp));

        // vacc += scale * (q8 @ q4)
        const __m512i vscale =
            _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)(b_ptr + offset_scales + k_group * TILE_N)));
        acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
      }
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_d0)));
      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

#define LAUNCH_TINYGEMM_KERNEL_VNNI(NB_SIZE)                                                                         \
  tinygemm_kernel_vnni<vec_dot_type, type, float, 1, NB_SIZE, blck_size>::apply(                                     \
      KB, (const char*)wdata + 0 * row_size_A,                                                                       \
      (const char*)src0->extra + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE), (float*)dst->data + 0 * N + nb_start, \
      ldc)

template <typename TA, typename TB, typename TC, int BLOCK_K,
          typename std::enable_if<!is_type_qkk<TB>::value, int>::type = 0>
void tinygemm_kernel_amx(int M, int N, int KB, const void* RESTRICT _A, const void* RESTRICT _B, TC* RESTRICT C,
                         int ldc) {
  using packed_B_t = packed_B_type<TB>;
  const int TILE_SIZE = get_tile_size<TB>();
  const bool need_unpack = do_unpack<TB>::value;

  GGML_ASSERT(M <= 2 * TILE_M && N == 2 * TILE_N);
  const TA* RESTRICT A = static_cast<const TA*>(_A);
  const char* RESTRICT B = static_cast<const char*>(_B);

  const int m0 = std::min(M, TILE_M);
  const int m1 = std::max(M - TILE_M, 0);
  const int lda = KB * sizeof(TA);
  // const int ldb = KB * sizeof(TB);

  static thread_local packed_B_t Tile0[TILE_N * TILE_K];
  static thread_local packed_B_t Tile1[TILE_N * TILE_K];
  static thread_local int8_t Tile23[TILE_M * TILE_K];

  static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
  static thread_local int32_t TileC1[TILE_M * TILE_N * 4];

  // double buffering C to interleave avx512 and amx
  int32_t* C_cur = TileC0;
  int32_t* C_pre = TileC1;

  auto Tile4 = [&](int32_t* base) { return base; };
  auto Tile5 = [&](int32_t* base) { return base + TILE_M * TILE_N; };
  auto Tile6 = [&](int32_t* base) { return base + 2 * TILE_M * TILE_N; };
  auto Tile7 = [&](int32_t* base) { return base + 3 * TILE_M * TILE_N; };

  if (M == 2 * TILE_M) {
    // i = 0
    const char* B_blk0 = B + PACKED_INDEX(0, 0, KB, TILE_SIZE);
    const char* B_blk1 = B + PACKED_INDEX(1, 0, KB, TILE_SIZE);
    if (need_unpack) {
      unpack_B<TB>(Tile0, B_blk0);
      _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
    } else {
      _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK);
    }

    _tile_zero(TMM4);
    _tile_loadd(TMM2, A[0].qs, lda);
    _tile_dpbssd(TMM4, TMM2, TMM0);
    _tile_stored(TMM4, Tile4(C_pre), TILE_N * sizeof(int32_t));

    _tile_zero(TMM5);
    _tile_loadd(TMM3, A[TILE_M * KB + 0].qs, lda);
    _tile_dpbssd(TMM5, TMM3, TMM0);
    _tile_stored(TMM5, Tile5(C_pre), TILE_N * sizeof(int32_t));

    if (need_unpack) {
      unpack_B<TB>(Tile1, B_blk0);
      _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
    } else {
      _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
    }

    _tile_zero(TMM6);
    _tile_dpbssd(TMM6, TMM2, TMM1);
    _tile_stored(TMM6, Tile6(C_pre), TILE_N * sizeof(int32_t));

    _tile_zero(TMM7);
    _tile_dpbssd(TMM7, TMM3, TMM1);
    _tile_stored(TMM7, Tile7(C_pre), TILE_N * sizeof(int32_t));

    for (int i = 1; i < KB; ++i) {
      // index of previous iter
      const int ii = i - 1;
      const char* B_blk0 = B + PACKED_INDEX(0, i, KB, TILE_SIZE);
      const char* B_blk1 = B + PACKED_INDEX(1, i, KB, TILE_SIZE);
      GGML_DISPATCH_BOOL(ii > 0, is_acc, [&] {
        if (need_unpack) {
          unpack_B<TB>(Tile0, B_blk0);
          _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
        } else {
          _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK);
        }
        _tile_zero(TMM4);
        _tile_loadd(TMM2, A[i].qs, lda);
        acc_C<TA, TB, is_acc>::apply(C, ldc, Tile4(C_pre), &A[ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);

        _tile_dpbssd(TMM4, TMM2, TMM0);
        _tile_stored(TMM4, Tile4(C_cur), TILE_N * sizeof(int32_t));

        _tile_zero(TMM5);
        _tile_loadd(TMM3, A[TILE_M * KB + i].qs, lda);
        acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc, ldc, Tile5(C_pre), &A[TILE_M * KB + ii], KB,
                                     B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);

        _tile_dpbssd(TMM5, TMM3, TMM0);
        _tile_stored(TMM5, Tile5(C_cur), TILE_N * sizeof(int32_t));

        if (need_unpack) {
          unpack_B<TB>(Tile1, B_blk1);
          _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
        } else {
          _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
        }
        _tile_zero(TMM6);
        acc_C<TA, TB, is_acc>::apply(C + TILE_N, ldc, Tile6(C_pre), &A[ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE),
                                     TILE_M);

        _tile_dpbssd(TMM6, TMM2, TMM1);
        _tile_stored(TMM6, Tile6(C_cur), TILE_N * sizeof(int32_t));

        _tile_zero(TMM7);
        acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_pre), &A[TILE_M * KB + ii], KB,
                                     B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M);

        _tile_dpbssd(TMM7, TMM3, TMM1);
        _tile_stored(TMM7, Tile7(C_cur), TILE_N * sizeof(int32_t));

        std::swap(C_cur, C_pre);
      });
    }
    // final accumulation
    {
      int ii = KB - 1;
      acc_C<TA, TB, true>::apply(C, ldc, Tile4(C_pre), &A[ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);
      acc_C<TA, TB, true>::apply(C + TILE_M * ldc, ldc, Tile5(C_pre), &A[TILE_M * KB + ii], KB,
                                 B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);
      acc_C<TA, TB, true>::apply(C + TILE_N, ldc, Tile6(C_pre), &A[ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE),
                                 TILE_M);
      acc_C<TA, TB, true>::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_pre), &A[TILE_M * KB + ii], KB,
                                 B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M);
    }
  } else {
    for (int i = 0; i < KB; ++i) {
      _tile_zero(TMM4);
      _tile_zero(TMM6);
      if (m1 != 0) {
        _tile_zero(TMM5);
        _tile_zero(TMM7);
      }

      const char* B_blk0 = B + PACKED_INDEX(0, i, KB, TILE_SIZE);
      const char* B_blk1 = B + PACKED_INDEX(1, i, KB, TILE_SIZE);
      if (need_unpack) {
        unpack_B<TB>(Tile0, B_blk0);
        _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
      } else {
        _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK);
      }

      if (need_unpack) {
        unpack_B<TB>(Tile1, B_blk1);
        _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
      } else {
        _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
      }

      if (m0 == TILE_M) {
        _tile_loadd(TMM2, A[i].qs, lda);
      } else {
        unpack_A(Tile23, &A[i], KB, m0);
        _tile_loadd(TMM2, Tile23, TILE_K);
      }

      _tile_dpbssd(TMM4, TMM2, TMM0);
      _tile_dpbssd(TMM6, TMM2, TMM1);

      _tile_stored(TMM4, Tile4(C_cur), TILE_N * sizeof(int32_t));
      _tile_stored(TMM6, Tile6(C_cur), TILE_N * sizeof(int32_t));

      GGML_DISPATCH_BOOL(i > 0, is_acc, [&] {
        acc_C<TA, TB, is_acc>::apply(C, ldc, Tile4(C_cur), &A[i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m0);
        acc_C<TA, TB, is_acc>::apply(C + TILE_N, ldc, Tile6(C_cur), &A[i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE),
                                     m0);
      });
      if (m1 != 0) {
        unpack_A(Tile23, &A[TILE_M * KB + i], KB, m1);
        _tile_loadd(TMM3, Tile23, TILE_K);

        _tile_dpbssd(TMM5, TMM3, TMM0);
        _tile_dpbssd(TMM7, TMM3, TMM1);
        _tile_stored(TMM5, Tile5(C_cur), TILE_N * sizeof(int32_t));
        _tile_stored(TMM7, Tile7(C_cur), TILE_N * sizeof(int32_t));
        GGML_DISPATCH_BOOL(i > 0, is_acc, [&] {
          acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc, ldc, Tile5(C_cur), &A[TILE_M * KB + i], KB,
                                       B + PACKED_INDEX(0, i, KB, TILE_SIZE), m1);
          acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_cur), &A[TILE_M * KB + i], KB,
                                       B + PACKED_INDEX(1, i, KB, TILE_SIZE), m1);
        });
      }
    }
  }
  return;
}

template <typename TA, typename TB, typename TC, int BLOCK_K,
          typename std::enable_if<is_type_qkk<TB>::value, int>::type = 0>
void tinygemm_kernel_amx(int M, int N, int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C,
                         int ldc) {
  static_assert(std::is_same<TA, block_q8_K>::value);
  const int TILE_SIZE = get_tile_size<TB>();

  GGML_ASSERT(M <= 2 * TILE_M && N == 2 * TILE_N);
  const TA* RESTRICT A = static_cast<const TA*>(_A);
  const char* RESTRICT B = static_cast<const char*>(_B);

  const int m0 = std::min(M, TILE_M);
  const int m1 = std::max(M - TILE_M, 0);
  // const int lda = KB * sizeof(TA);

  static thread_local int8_t Tile0[TILE_N * TILE_K];
  static thread_local int8_t Tile1[TILE_N * TILE_K];
  static thread_local int8_t Tile23[TILE_M * TILE_K];

  // mat mul result for each group
  static thread_local int32_t Tile4[TILE_M * TILE_N];
  static thread_local int32_t Tile5[TILE_M * TILE_N];
  static thread_local int32_t Tile6[TILE_M * TILE_N];
  static thread_local int32_t Tile7[TILE_M * TILE_N];

  // sum of each QK_K block, contains 8 groups, int32
  static thread_local int32_t Sumi4[TILE_M * TILE_N];
  static thread_local int32_t Sumi5[TILE_M * TILE_N];
  static thread_local int32_t Sumi6[TILE_M * TILE_N];
  static thread_local int32_t Sumi7[TILE_M * TILE_N];

  const int k_group_size = std::is_same<TB, block_q6_K>::value ? 16 : 32;
  for (int i = 0; i < KB; ++i) {
    // step 1: accumulate the quants across 8 groups, each group with 32
    for (int k = 0; k < QK_K / k_group_size; ++k) {
      GGML_DISPATCH_BOOL(k > 0, is_acc, [&] {
        _tile_zero(TMM4);
        _tile_zero(TMM6);

        unpack_B<TB>(Tile0, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k);
        _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);

        unpack_B<TB>(Tile1, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k);
        _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);

        unpack_A<TB>(Tile23, &A[i], KB, k, m0);
        _tile_loadd(TMM2, Tile23, TILE_K);

        _tile_dpbssd(TMM4, TMM2, TMM0);
        _tile_dpbssd(TMM6, TMM2, TMM1);

        _tile_stored(TMM4, Tile4, TILE_N * sizeof(int32_t));
        _tile_stored(TMM6, Tile6, TILE_N * sizeof(int32_t));

        scale_C<TB, is_acc>(Tile4, Sumi4, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k, m0);
        scale_C<TB, is_acc>(Tile6, Sumi6, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k, m0);

        if (m1 != 0) {
          _tile_zero(TMM5);
          _tile_zero(TMM7);

          unpack_A<TB>(Tile23, &A[TILE_M * KB + i], KB, k, m1);
          _tile_loadd(TMM3, Tile23, TILE_K);

          _tile_dpbssd(TMM5, TMM3, TMM0);
          _tile_dpbssd(TMM7, TMM3, TMM1);

          _tile_stored(TMM5, Tile5, TILE_N * sizeof(int32_t));
          _tile_stored(TMM7, Tile7, TILE_N * sizeof(int32_t));

          scale_C<TB, is_acc>(Tile5, Sumi5, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k, m1);
          scale_C<TB, is_acc>(Tile7, Sumi7, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k, m1);
        }
      });
    }

    // step 2: accmulate the mins
    GGML_DISPATCH_BOOL(i > 0, is_acc, [&] {
      acc_C<TA, TB, is_acc>::apply(C, ldc, Sumi4, &A[i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m0);
      acc_C<TA, TB, is_acc>::apply(C + TILE_N, ldc, Sumi6, &A[i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m0);
      if (m1 != 0) {
        acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc, ldc, Sumi5, &A[TILE_M * KB + i], KB,
                                     B + PACKED_INDEX(0, i, KB, TILE_SIZE), m1);
        acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc + TILE_N, ldc, Sumi7, &A[TILE_M * KB + i], KB,
                                     B + PACKED_INDEX(1, i, KB, TILE_SIZE), m1);
      }
    });
  }
  return;
}

}  // anonymous namespace

#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18

bool ggml_amx_init() {
#if defined(__gnu_linux__)
  if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
    fprintf(stderr, "AMX is not ready to be used!\n");
    return false;
  }
  return true;
#elif defined(_WIN32)
  return true;
#endif
}

bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor* dst) {
  static thread_local bool is_first_time = true;
  if (is_first_time) {
#pragma omp single
    { ggml_amx_init(); }

    // load tile config
    ggml_tile_config_init();
  }
  is_first_time = false;

  const struct ggml_tensor* src0 = dst->src[0];
  const struct ggml_tensor* src1 = dst->src[1];

  const enum ggml_type type = src0->type;
  const int64_t ne0 = dst->ne[0];

  bool is_training = src0->grad || src1->grad;

  // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
  // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
  bool has_amx_kernels = (type == GGML_TYPE_Q4_0) || (type == GGML_TYPE_Q4_1) || (type == GGML_TYPE_Q8_0) ||
#ifndef GGML_QKK_64
                         // only enabled for QK_K == 256
                         (type == GGML_TYPE_Q4_K) || (type == GGML_TYPE_Q5_K) || (type == GGML_TYPE_Q6_K) ||
                         (type == GGML_TYPE_IQ4_XS) ||
#endif
                         (type == GGML_TYPE_F16);

  // handle only 2d gemm for now
  auto is_contiguous_2d = [](const struct ggml_tensor* t) {
    return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
  };

  return dst->op != GGML_OP_MUL_MAT_ID && is_contiguous_2d(src0) && is_contiguous_2d(src1) && !is_training &&
         src1->type == GGML_TYPE_F32 && has_amx_kernels &&
         // out features is 32x
         ne0 % (TILE_N * 2) == 0;
}

// NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX)
//
// src0: weight in shape of {N, K}, quantized
// src1: input  in shape of {M, K}, float32
// dst:  output in shape of {M, N}, float32
//
// the function performs: dst = src1 @ src0.T
//
void ggml_mul_mat_amx(struct ggml_tensor* dst, int nth, int ith, void* wdata, int wsize) {
  struct ggml_tensor* src0 = dst->src[0];
  struct ggml_tensor* src1 = dst->src[1];

  const enum ggml_type TYPE = src0->type;

  // f16 only has avx512 kernels for now,
  // amx kernels will be added once 6th gen xeon is released.
  const bool is_floating_type = TYPE == GGML_TYPE_F16;

  const int M = dst->ne[1];
  const int N = dst->ne[0];
  const int K = src0->ne[0];
  const int ldc = dst->nb[1] / dst->nb[0];

  if (is_floating_type) {
    constexpr int BLOCK_M = 4;
    constexpr int BLOCK_N = 6;
    const int MB = div_up(M, BLOCK_M);
    const int NB = div_up(N, BLOCK_N);

    parallel_for(nth, ith, MB * NB, [&](int begin, int end) {
      GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] {
        for (int i = begin; i < end; ++i) {
          int mb = i / NB;
          int nb = i % NB;

          int mb_start = mb * BLOCK_M;
          int mb_size = std::min(BLOCK_M, M - mb_start);
          int nb_start = nb * BLOCK_N;
          int nb_size = std::min(BLOCK_N, N - nb_start);

          switch (mb_size << 4 | nb_size) {
            case 0x12:
              LAUNCH_TINYGEMM_KERNEL_AVX(1, 2);
              break;
            case 0x14:
              LAUNCH_TINYGEMM_KERNEL_AVX(1, 4);
              break;
            case 0x16:
              LAUNCH_TINYGEMM_KERNEL_AVX(1, 6);
              break;
            case 0x22:
              LAUNCH_TINYGEMM_KERNEL_AVX(2, 2);
              break;
            case 0x24:
              LAUNCH_TINYGEMM_KERNEL_AVX(2, 4);
              break;
            case 0x26:
              LAUNCH_TINYGEMM_KERNEL_AVX(2, 6);
              break;
            case 0x32:
              LAUNCH_TINYGEMM_KERNEL_AVX(3, 2);
              break;
            case 0x34:
              LAUNCH_TINYGEMM_KERNEL_AVX(3, 4);
              break;
            case 0x36:
              LAUNCH_TINYGEMM_KERNEL_AVX(3, 6);
              break;
            case 0x42:
              LAUNCH_TINYGEMM_KERNEL_AVX(4, 2);
              break;
            case 0x44:
              LAUNCH_TINYGEMM_KERNEL_AVX(4, 4);
              break;
            case 0x46:
              LAUNCH_TINYGEMM_KERNEL_AVX(4, 6);
              break;
            default:
              fprintf(stderr, "Unexpected block size!\n");
          }
        }
      });
    });
    return;
  }

#pragma omp single
  {
    GGML_DISPATCH_QTYPES(TYPE, [&] {
      const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
      GGML_ASSERT(wsize >= int(M * row_size_A));

      // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
      // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
      GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
      // pack mat B to vnni format
      if (src0->extra == nullptr) {
        const size_t row_size_B = get_row_size<type, blck_size>(K);
        src0->extra = aligned_alloc(64, N * row_size_B);
        convert_B_packed_format<type, blck_size>((void*)src0->extra, (const type*)src0->data, N, K);
      }

      const float* A_data = static_cast<const float*>(src1->data);
      for (int m = 0; m < M; ++m) {
        from_float<vec_dot_type>(A_data + m * K, (char*)wdata + m * row_size_A, K);
      }
    });
  }

  GGML_ASSERT(src0->extra != nullptr);
  if (M == 1) {
    // MB = 1 and handle 8 tiles in each block
    constexpr int kTilesN = 4;
    constexpr int BLOCK_N = TILE_N * kTilesN;
    const int NB = div_up(N, BLOCK_N);

    parallel_for(nth, ith, NB, [&](int begin, int end) {
      GGML_DISPATCH_QTYPES(TYPE, [&] {
        const int KB = K / blck_size;
        const int TILE_SIZE = get_tile_size<type>();
        const int row_size_A = KB * sizeof(vec_dot_type);
        for (int i = begin; i < end; ++i) {
          int nb = i;
          int nb_start = nb * BLOCK_N;
          int nb_size = std::min(BLOCK_N, N - nb_start);  // 32, 64, 96

          switch (nb_size) {
            // case 160: LAUNCH_TINYGEMM_KERNEL_VNNI(160); break;
            case 128:
              LAUNCH_TINYGEMM_KERNEL_VNNI(128);
              break;
            case 96:
              LAUNCH_TINYGEMM_KERNEL_VNNI(96);
              break;
            case 64:
              LAUNCH_TINYGEMM_KERNEL_VNNI(64);
              break;
            case 32:
              LAUNCH_TINYGEMM_KERNEL_VNNI(32);
              break;
            default:
              fprintf(stderr, "Unexpected n block size!\n");
          }
        }
      });
    });
    return;
  }

  // handle 4 tiles at a tile
  constexpr int BLOCK_M = TILE_M * 2;
  constexpr int BLOCK_N = TILE_N * 2;
  const int MB = div_up(M, BLOCK_M);
  const int NB = div_up(N, BLOCK_N);

  parallel_for(nth, ith, MB * NB, [&](int begin, int end) {
    GGML_DISPATCH_QTYPES(TYPE, [&] {
      const int KB = K / blck_size;
      const int TILE_SIZE = get_tile_size<type>();
      const int row_size_A = KB * sizeof(vec_dot_type);

      for (int i = begin; i < end; ++i) {
        int mb = i / NB;
        int nb = i % NB;

        int mb_start = mb * BLOCK_M;
        int mb_size = std::min(BLOCK_M, M - mb_start);
        int nb_start = nb * BLOCK_N;
        int nb_size = BLOCK_N;

        tinygemm_kernel_amx<vec_dot_type, type, float, blck_size>(
            mb_size, nb_size, KB, (const char*)wdata + mb_start * row_size_A,
            (const char*)src0->extra + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE),
            (float*)dst->data + mb_start * N + nb_start, ldc);
      }
    });
  });
}

#else  // if defined(__AMX_INT8__)

bool ggml_amx_init() {
  fprintf(stderr, "GGML is not compiled with AMX support!\n");
  return false;
}

bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor* dst) {
  GGML_UNUSED(dst);
  fprintf(stderr, "GGML is not compiled with AMX support!\n");
  return false;
}

void ggml_mul_mat_amx(struct ggml_tensor* dst, int nth, int ith, void* wdata, int wsize) {
  GGML_UNUSED(dst);
  GGML_UNUSED(nth);
  GGML_UNUSED(ith);
  GGML_UNUSED(wdata);
  GGML_UNUSED(wsize);
  fprintf(stderr, "GGML is not compiled with AMX support!\n");
}

#endif  // if defined(__AMX_INT8__)

void test_gemm() {
  std::mt19937 gen(123);
  // const int m=10,n=10,k=10;
  const int m = 100, n = 100, k = 1024;
  Mat<float> a(m, k, Layout::RowMajor), b(k, n, Layout::ColumnMajor);
  a.random(gen);
  b.random(gen);

  a.print();
  b.print();

  ggml_type a_type = GGML_TYPE_Q4_K;
  a.quant(a_type);
  b.quant(ggml_internal_get_type_traits(a_type).vec_dot_type);

  auto c = a.mul_check(b);

  // quantize_row_q4_K_reference(a.data, block_q4_K *restrict y, int64_t k)

  c.print();
}

int main() {
  // int32_t x[1000]={};
  // int32_t y[1000]={};
  // for(int i=0;i<1000;i++){
  //   x[i] = i;
  // }
  // // transpose_16x16_32bit(reinterpret_cast<__m512i*>(x));
  // // transpose_16x4_32bit(reinterpret_cast<__m512i*>(x),(__m512i*)y);
  // transpose_8x8_32bit((__m256i*)x,  (__m256i*)y);
  // for(int i=0;i<300;i++){
  //   if(i%8==0) printf("\n");
  //   printf("%d ",x[i]);
  // }
  // for (int i = 0; i < 300; i++) {
  //   if (i % 8 == 0)
  //     printf("\n");
  //   printf("%d ", y[i]);
  // }

  // block_q8_0 test[20] = {};
  // for(int i=0;i<20;i++){
  //   for(int j=0;j<32;j++){
  //     test[i].qs[j] = i*32+j;
  //   }
  //   test[i].d = 0xffff;
  // }
  // uint8_t test_out[1000];

  // for (int i = 0; i < 512; i++) {
  //   if (i % 32 == 0)
  //     printf("\n");
  //   printf("%d ", test[i/32].qs[i%32]);

  // }

  // pack_B(test_out, test, 1);

  // for(int i=0;i<512;i++){
  //   if(i%32==0) printf("\n");
  //   printf("%d ",test_out[i]);
  // }

  test_gemm();

  return 0;
}


================================================
FILE: kt-kernel/operators/amx/test/mmq.cpp
================================================

#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wpedantic"
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#endif

#include "mmq.h"

#include <algorithm>
#include <type_traits>

#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"

#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif

#if defined(_OPENMP)
#include <omp.h>
#endif

#if (defined(_WIN32) || defined(_WIN64))
#define RESTRICT __restrict
#else
#define RESTRICT __restrict__
#endif

#if (defined(_WIN32) || defined(_WIN64))
#define ALWAYS_INLINE __forceinline
#elif __has_attribute(always_inline) || defined(__GNUC__)
#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
#else
#define ALWAYS_INLINE inline
#endif

#if defined(__AMX_INT8__)

namespace {

#define TILE_M 16
#define TILE_N 16
#define TILE_K 32
#define VNNI_BLK 4

#define AMX_BLK_SIZE 32

#define TMM0 0
#define TMM1 1
#define TMM2 2
#define TMM3 3
#define TMM4 4
#define TMM5 5
#define TMM6 6
#define TMM7 7

// parallel routines
template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
inline T div_up(T x, T y) {
  return (x + y - 1) / y;
}

template <typename T>
void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
#if 0
  // onednn partition pattern
  T& n_my = n_end;
  if (nth <= 1 || n == 0) {
    n_start = 0;
    n_my = n;
  } else {
    T n1 = div_up(n, nth);
    T n2 = n1 - 1;
    T T1 = n - n2 * nth;
    n_my = ith < T1 ? n1 : n2;
    n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
  }
  n_end += n_start;
#else
  // pytorch aten partition pattern
  T n_my = div_up(n, nth);
  n_start = ith * n_my;
  n_end = std::min(n_start + n_my, n);
#endif
}

template <typename func_t>
inline void parallel_for(int nth, int ith, int n, const func_t& f) {
  // int nth = omp_get_num_threads();
  // int ith = omp_get_thread_num();
  int tbegin, tend;
  balance211(n, nth, ith, tbegin, tend);
  f(tbegin, tend);
}

// Forced unrolling
template <int n>
struct Unroll {
  template <typename Func, typename... Args>
  ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
    Unroll<n - 1>{}(f, args...);
    f(std::integral_constant<int, n - 1>{}, args...);
  }
};

template <>
struct Unroll<1> {
  template <typename Func, typename... Args>
  ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
    f(std::integral_constant<int, 0>{}, args...);
  }
};

// type traits
template <typename T>
struct PackedTypes {};
template <>
struct PackedTypes<block_q4_0> {
  using type = int8_t;
};
template <>
struct PackedTypes<block_q4_1> {
  using type = uint8_t;
};
template <>
struct PackedTypes<block_q8_0> {
  using type = int8_t;
};
template <typename T>
using packed_B_type = typename PackedTypes<T>::type;

template <typename T>
struct do_compensate : std::integral_constant<bool, std::is_same<T, block_q8_0>::value> {};

template <typename T>
struct do_unpack
    : std::integral_constant<bool, std::is_same<T, block_q4_0>::value || std::is_same<T, block_q4_1>::value> {};

template <typename T>
struct is_type_qkk
    : std::integral_constant<bool, std::is_same<T, block_q4_K>::value || std::is_same<T, block_q5_K>::value ||
                                       std::is_same<T, block_q6_K>::value || std::is_same<T, block_iq4_xs>::value> {};

#define GGML_DISPATCH_FLOATING_TYPES(TYPE, ...)              \
  [&] {                                                      \
    switch (TYPE) {                                          \
      case GGML_TYPE_F16: {                                  \
        using type = ggml_fp16_t;                            \
        constexpr int blck_size = 16;                        \
        return __VA_ARGS__();                                \
      }                                                      \
      case GGML_TYPE_BF16: {                                 \
        using type = ggml_bf16_t;                            \
        constexpr int blck_size = 32;                        \
        return __VA_ARGS__();                                \
      }                                                      \
      default:                                               \
        fprintf(stderr, "Unsupported floating data type\n"); \
    }                                                        \
  }()

#define GGML_DISPATCH_QTYPES(QT, ...)                         \
  [&] {                                                       \
    switch (QT) {                                             \
      case GGML_TYPE_Q4_0: {                                  \
        using type = block_q4_0;                              \
        using vec_dot_type = block_q8_0;                      \
        constexpr int blck_size = QK4_0;                      \
        return __VA_ARGS__();                                 \
      }                                                       \
      case GGML_TYPE_Q4_1: {                                  \
        using type = block_q4_1;                              \
        using vec_dot_type = block_q8_1;                      \
        constexpr int blck_size = QK4_1;                      \
        return __VA_ARGS__();                                 \
      }                                                       \
      case GGML_TYPE_Q8_0: {                                  \
        using type = block_q8_0;                              \
        using vec_dot_type = block_q8_0;                      \
        constexpr int blck_size = QK8_0;                      \
        return __VA_ARGS__();                                 \
      }                                                       \
      case GGML_TYPE_Q4_K: {                                  \
        using type = block_q4_K;                              \
        using vec_dot_type = block_q8_K;                      \
        constexpr int blck_size = QK_K;                       \
        return __VA_ARGS__();                                 \
      }                                                       \
      case GGML_TYPE_Q5_K: {                                  \
        using type = block_q5_K;                              \
        using vec_dot_type = block_q8_K;                      \
        constexpr int blck_size = QK_K;                       \
        return __VA_ARGS__();                                 \
      }                                                       \
      case GGML_TYPE_Q6_K: {                                  \
        using type = block_q6_K;                              \
        using vec_dot_type = block_q8_K;                      \
        constexpr int blck_size = QK_K;                       \
        return __VA_ARGS__();                                 \
      }                                                       \
      case GGML_TYPE_IQ4_XS: {                                \
        using type = block_iq4_xs;                            \
        using vec_dot_type = block_q8_K;                      \
        constexpr int blck_size = QK_K;                       \
        return __VA_ARGS__();                                 \
      }                                                       \
      default:                                                \
        fprintf(stderr, "Unsupported quantized data type\n"); \
    }                                                         \
  }()

#define GGML_DISPATCH_BOOL(BOOL_V, BOOL_NAME, ...) \
  [&] {                                            \
    if (BOOL_V) {                                  \
      constexpr bool BOOL_NAME = true;             \
      return __VA_ARGS__();                        \
    } else {                                       \
      constexpr bool BOOL_NAME = false;            \
      return __VA_ARGS__();                        \
    }                                              \
  }()

// define amx tile config data structure
struct tile_config_t {
  uint8_t palette_id = 0;
  uint8_t start_row = 0;
  uint8_t reserved_0[14] = {0};
  uint16_t colsb[16] = {0};
  uint8_t rows[16] = {0};
};

// Notes: amx tile config
//
// Typically, TMUL calculates A and B of size 16 x 64 containing INT8 values,
// and accumulate the result to a 16 x 16 matrix C containing INT32 values,
//
// As many GGUF quantized types as `block_size` of 32, so a 16-16-32 config is used
// instead of the normally used 16-16-64 config.
//
//   Block A: {16, 32}, dtype = int8_t
//   Block B: {16, 32}, dtype = uint8_t/int8_t
//   Block C: {16, 16}, dtype = int32_t
//
// Block B needs to be prepacked to vnni format before feeding into  TMUL:
//   packed_B: from {n, k} to {k/vnni_blk, n, vnni_blck}, viewed in 2d, we get {8, 64}
//
// Therefore, we get tileconfig:
//             A    B    C
//    rows    16    8   16
//    colsb   32   64   16
//
// For tile distribution, follow a 2-2-4 pattern, e.g. A used TMM2-TMM3, B used TMM0-TMM1,
// C used TMM4-TMM7:
//            B TMM0  B TMM1
//    A TMM2  C TMM4  C TMM6
//    A TMM3  C TMM5  C TMM7
//
// Each `amx` kernel handles 4 blocks at a time: 2MB * 2NB, when m < 2 * BLOCK_M, unpack A
// will be needed.
//
// Here another commonly used pattern 1-3-3 is skipped, as it is mostly used when m <=16;
// and the sinlge batch gemm (m=1) has a special fast path with `avx512-vnni`.
//
// ref: https://www.intel.com/content/www/us/en/developer/articles/code-sample/
//   advanced-matrix-extensions-intrinsics-functions.html
//

#define TC_CONFIG_TILE(i, r, cb) \
  tc.rows[i] = r;                \
  tc.colsb[i] = cb
void ggml_tile_config_init(void) {
  static thread_local tile_config_t tc;
  tile_config_t current_tc;
  _tile_storeconfig(&current_tc);

  // load only when config changes
  if (tc.palette_id == 0 || (memcmp(&current_tc.colsb, &tc.colsb, sizeof(uint16_t) * 8) != 0 &&
                             memcmp(&current_tc.rows, &tc.rows, sizeof(uint8_t) * 8) != 0)) {
    tc.palette_id = 1;
    tc.start_row = 0;
    TC_CONFIG_TILE(TMM0, 8, 64);
    TC_CONFIG_TILE(TMM1, 8, 64);
    TC_CONFIG_TILE(TMM2, 16, 32);
    TC_CONFIG_TILE(TMM3, 16, 32);
    TC_CONFIG_TILE(TMM4, 16, 64);
    TC_CONFIG_TILE(TMM5, 16, 64);
    TC_CONFIG_TILE(TMM6, 16, 64);
    TC_CONFIG_TILE(TMM7, 16, 64);
    _tile_loadconfig(&tc);
  }
}

// we need an extra 16 * 4B (TILE_N * int32_t) for each NB/KB block for compensation.
// See the notes `s8s8 igemm compensation in avx512-vnni` for detail.
template <typename TB>
int get_tile_size() {
  int tile_size = TILE_N * sizeof(TB);
  if (do_compensate<TB>::value) {
    tile_size += TILE_N * sizeof(int32_t);
  }
  if (std::is_same<TB, block_q4_K>::value || std::is_same<TB, block_q5_K>::value) {
    tile_size += TILE_N * 4;
  }
  if (std::is_same<TB, block_iq4_xs>::value) {
    tile_size += TILE_N * 2;
  }
  return tile_size;
}

template <typename TB, int BLOCK_K>
int get_row_size(int K) {
  int KB = K / BLOCK_K;
  int row_size = KB * sizeof(TB);
  if (do_compensate<TB>::value) {
    row_size += KB * sizeof(int32_t);
  }
  if (std::is_same<TB, block_q4_K>::value || std::is_same<TB, block_q5_K>::value) {
    row_size += KB * 4;
  }
  if (std::is_same<TB, block_iq4_xs>::value) {
    row_size += KB * 2;
  }
  return row_size;
}

// vectorized dtype conversion
inline float FP16_TO_FP32(ggml_half val) {
  __m256i v = _mm256_setr_epi16(val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  __m512 o = _mm512_cvtph_ps(v);
  return _mm512_cvtss_f32(o);
}

inline __m512 FP16_TO_FP32_VEC(ggml_half val) {
  __m256i v = _mm256_set1_epi16(val);
  return _mm512_cvtph_ps(v);
}

// horizontal reduce
inline float _mm512_reduce_max_ps(const __m512 x) {
  __m512 v = x;
  __m512 v1 = _mm512_shuffle_f32x4(v, v, 0x4E);
  v = _mm512_max_ps(v, v1);
  v1 = _mm512_shuffle_f32x4(v, v, 0xB1);
  v = _mm512_max_ps(v, v1);
  v1 = _mm512_shuffle_ps(v, v, 0x4E);
  v = _mm512_max_ps(v, v1);
  v1 = _mm512_shuffle_ps(v, v, 0xB1);
  v = _mm512_max_ps(v, v1);
  return _mm512_cvtss_f32(v);
}

// transpose utils
#define SHUFFLE_EPI32(a, b, mask) \
  _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), mask))
inline void transpose_8x8_32bit(__m256i* v, __m256i* v1) {
  // unpacking and 32-bit elements
  v1[0] = _mm256_unpacklo_epi32(v[0], v[1]);
  v1[1] = _mm256_unpackhi_epi32(v[0], v[1]);
  v1[2] = _mm256_unpacklo_epi32(v[2], v[3]);
  v1[3] = _mm256_unpackhi_epi32(v[2], v[3]);
  v1[4] = _mm256_unpacklo_epi32(v[4], v[5]);
  v1[5] = _mm256_unpackhi_epi32(v[4], v[5]);
  v1[6] = _mm256_unpacklo_epi32(v[6], v[7]);
  v1[7] = _mm256_unpackhi_epi32(v[6], v[7]);

  // shuffling the 32-bit elements
  v[0] = SHUFFLE_EPI32(v1[0], v1[2], 0x44);
  v[1] = SHUFFLE_EPI32(v1[0], v1[2], 0xee);
  v[2] = SHUFFLE_EPI32(v1[4], v1[6], 0x44);
  v[3] = SHUFFLE_EPI32(v1[4], v1[6], 0xee);
  v[4] = SHUFFLE_EPI32(v1[1], v1[3], 0x44);
  v[5] = SHUFFLE_EPI32(v1[1], v1[3], 0xee);
  v[6] = SHUFFLE_EPI32(v1[5], v1[7], 0x44);
  v[7] = SHUFFLE_EPI32(v1[5], v1[7], 0xee);

  // shuffling 128-bit elements
  v1[0] = _mm256_permute2f128_si256(v[2], v[0], 0x02);
  v1[1] = _mm256_permute2f128_si256(v[3], v[1], 0x02);
  v1[2] = _mm256_permute2f128_si256(v[6], v[4], 0x02);
  v1[3] = _mm256_permute2f128_si256(v[7], v[5], 0x02);
  v1[4] = _mm256_permute2f128_si256(v[2], v[0], 0x13);
  v1[5] = _mm256_permute2f128_si256(v[3], v[1], 0x13);
  v1[6] = _mm256_permute2f128_si256(v[6], v[4], 0x13);
  v1[7] = _mm256_permute2f128_si256(v[7], v[5], 0x13);
}

inline void transpose_16x4_32bit(__m512i* r, __m512i* d) {
  static const __m512i index1 =
      _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02, 0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00);

  d[0] = _mm512_permutexvar_epi32(index1, r[0]);
  d[1] = _mm512_permutexvar_epi32(index1, r[1]);
  d[2] = _mm512_permutexvar_epi32(index1, r[2]);
  d[3] = _mm512_permutexvar_epi32(index1, r[3]);

  r[0] = _mm512_shuffle_i32x4(d[0], d[1], 0x44);
  r[1] = _mm512_shuffle_i32x4(d[0], d[1], 0xee);
  r[2] = _mm512_shuffle_i32x4(d[2], d[3], 0x44);
  r[3] = _mm512_shuffle_i32x4(d[2], d[3], 0xee);

  d[0] = _mm512_shuffle_i32x4(r[0], r[2], 0x88);
  d[1] = _mm512_shuffle_i32x4(r[0], r[2], 0xdd);
  d[2] = _mm512_shuffle_i32x4(r[1], r[3], 0x88);
  d[3] = _mm512_shuffle_i32x4(r[1], r[3], 0xdd);
}

inline void transpose_16x16_32bit(__m512i* v) {
  __m512i v1[16];
  v1[0] = _mm512_unpacklo_epi32(v[0], v[1]);
  v1[1] = _mm512_unpackhi_epi32(v[0], v[1]);
  v1[2] = _mm512_unpacklo_epi32(v[2], v[3]);
  v1[3] = _mm512_unpackhi_epi32(v[2], v[3]);
  v1[4] = _mm512_unpacklo_epi32(v[4], v[5]);
  v1[5] = _mm512_unpackhi_epi32(v[4], v[5]);
  v1[6] = _mm512_unpacklo_epi32(v[6], v[7]);
  v1[7] = _mm512_unpackhi_epi32(v[6], v[7]);
  v1[8] = _mm512_unpacklo_epi32(v[8], v[9]);
  v1[9] = _mm512_unpackhi_epi32(v[8], v[9]);
  v1[10] = _mm512_unpacklo_epi32(v[10], v[11]);
  v1[11] = _mm512_unpackhi_epi32(v[10], v[11]);
  v1[12] = _mm512_unpacklo_epi32(v[12], v[13]);
  v1[13] = _mm512_unpackhi_epi32(v[12], v[13]);
  v1[14] = _mm512_unpacklo_epi32(v[14], v[15]);
  v1[15] = _mm512_unpackhi_epi32(v[14], v[15]);

  v[0] = _mm512_unpacklo_epi64(v1[0], v1[2]);
  v[1] = _mm512_unpackhi_epi64(v1[0], v1[2]);
  v[2] = _mm512_unpacklo_epi64(v1[1], v1[3]);
  v[3] = _mm512_unpackhi_epi64(v1[1], v1[3]);
  v[4] = _mm512_unpacklo_epi64(v1[4], v1[6]);
  v[5] = _mm512_unpackhi_epi64(v1[4], v1[6]);
  v[6] = _mm512_unpacklo_epi64(v1[5], v1[7]);
  v[7] = _mm512_unpackhi_epi64(v1[5], v1[7]);
  v[8] = _mm512_unpacklo_epi64(v1[8], v1[10]);
  v[9] = _mm512_unpackhi_epi64(v1[8], v1[10]);
  v[10] = _mm512_unpacklo_epi64(v1[9], v1[11]);
  v[11] = _mm512_unpackhi_epi64(v1[9], v1[11]);
  v[12] = _mm512_unpacklo_epi64(v1[12], v1[14]);
  v[13] = _mm512_unpackhi_epi64(v1[12], v1[14]);
  v[14] = _mm512_unpacklo_epi64(v1[13], v1[15]);
  v[15] = _mm512_unpackhi_epi64(v1[13], v1[15]);

  v1[0] = _mm512_shuffle_i32x4(v[0], v[4], 0x88);
  v1[1] = _mm512_shuffle_i32x4(v[1], v[5], 0x88);
  v1[2] = _mm512_shuffle_i32x4(v[2], v[6], 0x88);
  v1[3] = _mm512_shuffle_i32x4(v[3], v[7], 0x88);
  v1[4] = _mm512_shuffle_i32x4(v[0], v[4], 0xdd);
  v1[5] = _mm512_shuffle_i32x4(v[1], v[5], 0xdd);
  v1[6] = _mm512_shuffle_i32x4(v[2], v[6], 0xdd);
  v1[7] = _mm512_shuffle_i32x4(v[3], v[7], 0xdd);
  v1[8] = _mm512_shuffle_i32x4(v[8], v[12], 0x88);
  v1[9] = _mm512_shuffle_i32x4(v[9], v[13], 0x88);
  v1[10] = _mm512_shuffle_i32x4(v[10], v[14], 0x88);
  v1[11] = _mm512_shuffle_i32x4(v[11], v[15], 0x88);
  v1[12] = _mm512_shuffle_i32x4(v[8], v[12], 0xdd);
  v1[13] = _mm512_shuffle_i32x4(v[9], v[13], 0xdd);
  v1[14] = _mm512_shuffle_i32x4(v[10], v[14], 0xdd);
  v1[15] = _mm512_shuffle_i32x4(v[11], v[15], 0xdd);

  v[0] = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88);
  v[1] = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88);
  v[2] = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88);
  v[3] = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88);
  v[4] = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88);
  v[5] = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88);
  v[6] = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88);
  v[7] = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88);
  v[8] = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd);
  v[9] = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd);
  v[10] = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd);
  v[11] = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd);
  v[12] = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd);
  v[13] = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd);
  v[14] = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd);
  v[15] = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd);
}

void quantize_row_q8_K_vnni(const float* RESTRICT x, void* RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
  const int KB = k / QK_K;
  constexpr int kVecs = QK_K / 16;

  block_q8_K* y = reinterpret_cast<block_q8_K*>(vy);

  // hold 16 float vecs from x
  __m512 v[kVecs];

  // hold the quants vecs
  __m512i vq[kVecs / 4];

  // hold the packed quants vecs
  __m512i vq_packed[kVecs / 4];

  const __m512 signBit = _mm512_set1_ps(-0.f);

  for (int i = 0; i < KB; ++i) {
    // Compute max(abs(e)) for the block
    __m512 vamax = _mm512_set1_ps(0.f);
    for (int j = 0; j < kVecs; ++j) {
      v[j] = _mm512_loadu_ps(x);
      x += 16;
      vamax = _mm512_max_ps(vamax, _mm512_andnot_ps(signBit, v[j]));
    }
    const float amax = _mm512_reduce_max_ps(vamax);

    // Quantize these floats
    const float iscale = 127.f / amax;
    y[i].d = GGML_FP32_TO_FP16(1 / iscale);
    const float id = (amax != 0.0f) ? iscale : 0.f;
    const __m512 vscale = _mm512_set1_ps(id);

    // Apply multiplier and round to nearest integer
    for (int j = 0; j < kVecs; ++j) {
      v[j] = _mm512_mul_ps(v[j], vscale);
      v[j] = _mm512_roundscale_ps(v[j], (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
    }

    // Pack to epi8 vecs
    for (int j = 0; j < kVecs / 4; ++j) {
      __m128i q8_0 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 0]));
      __m128i q8_1 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 1]));
      __m128i q8_2 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 2]));
      __m128i q8_3 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 3]));

      __m256i q8_01 = _mm256_insertf128_si256(_mm256_castsi128_si256(q8_0), (q8_1), 1);
      __m256i q8_23 = _mm256_insertf128_si256(_mm256_castsi128_si256(q8_2), (q8_3), 1);

      vq[j] = _mm512_inserti32x8(_mm512_castsi256_si512(q8_01), q8_23, 1);
      _mm512_storeu_si512((__m512i*)(y[i].qs + j * 64), vq[j]);
    }

    // Compute the bsums with vnni
    transpose_16x4_32bit(vq, vq_packed);

    const __m512i one = _mm512_set1_epi8(1);
    __m512i sum = _mm512_setzero_si512();
    for (int k = 0; k < 4; ++k) {
      sum = _mm512_dpbusd_epi32(sum, one, vq_packed[k]);
    }
    _mm256_storeu_si256((__m256i*)(y[i].bsums), _mm512_cvtepi32_epi16(sum));
  }
}

// quantize A from float to `vec_dot_type`
template <typename T>
inline void from_float(const float* x, char* vy, int64_t k);

template <>
inline void from_float<block_q8_0>(const float* x, char* vy, int64_t k) {
  quantize_row_q8_0(x, vy, k);
}

template <>
inline void from_float<block_q8_1>(const float* x, char* vy, int64_t k) {
  quantize_row_q8_1(x, vy, k);
}

template <>
inline void from_float<block_q8_K>(const float* x, char* vy, int64_t k) {
#if 1
  // TODO: this is reference impl!
  quantize_row_q8_K(x, vy, k);
#else
  quantize_row_q8_K_vnni(x, vy, k);
#endif
}

// load A from memory to array when nrows can not fill in whole tile
void unpack_A(int8_t* RESTRICT tile, const block_q8_0* RESTRICT A, int lda, int nr) {
  assert(nr != TILE_M);
  for (int m = 0; m < nr; ++m) {
    const __m256i v = _mm256_loadu_si256((const __m256i*)(A[m * lda].qs));
    _mm256_storeu_si256((__m256i*)(tile + m * TILE_K), v);
  }
}

void unpack_A(int8_t* RESTRICT tile, const block_q8_1* RESTRICT A, int lda, int nr) {
  assert(nr != TILE_M);
  for (int m = 0; m < nr; ++m) {
    const __m256i v = _mm256_loadu_si256((const __m256i*)(A[m * lda].qs));
    _mm256_storeu_si256((__m256i*)(tile + m * TILE_K), v);
  }
}

template <typename TB>
void unpack_A(int8_t* RESTRICT tile, const block_q8_K* RESTRICT A, int lda, int k, int nr) {
  assert(nr <= TILE_M);
  for (int m = 0; m < nr; ++m) {
    const __m256i v = _mm256_loadu_si256((const __m256i*)(A[m * lda].qs + k * 32));
    _mm256_storeu_si256((__m256i*)(tile + m * TILE_K), v);
  }
}

template <>
void unpack_A<block_q6_K>(int8_t* RESTRICT tile, const block_q8_K* RESTRICT A, int lda, int k, int nr) {
  assert(nr <= TILE_M);
  // zero padding k from 16 to 32, so that we don't have to re-config amx
  const __m128i zero = _mm_setzero_si128();
  for (int m = 0; m < nr; ++m) {
    const __m128i v = _mm_loadu_si128((const __m128i*)(A[m * lda].qs + k * 16));
    const __m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(v), zero, 1);
    _mm256_storeu_si256((__m256i*)(tile + m * TILE_K), r);
  }
}

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
inline __m256i bytes_from_nibbles_32(const uint8_t* rsi) {
  const __m128i tmp = _mm_loadu_si128((const __m128i*)rsi);
  const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
  const __m256i lowMask = _mm256_set1_epi8(0xF);
  return _mm256_and_si256(lowMask, bytes);
}

// used for block_q4_K
inline __m512i bytes_from_nibbles_64(const uint8_t* rsi) {
  const __m256i tmp = _mm256_loadu_si256((const __m256i*)rsi);
  const __m256i lowMask = _mm256_set1_epi8(0xF);
  const __m256i q4l = _mm256_and_si256(tmp, lowMask);
  const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(tmp, 4), lowMask);
  return _mm512_inserti32x8(_mm512_castsi256_si512(q4l), q4h, 1);
}

// used for block_q5_K
inline __m512i bytes_from_nibbles_64(const uint8_t* qs, const uint8_t* qh, int k) {
  const __m256i lowMask = _mm256_set1_epi8(0xF);
  __m256i hmask = _mm256_set1_epi8(1);
  hmask = _mm256_slli_epi16(hmask, k);

  const __m256i q5bits = _mm256_loadu_si256((const __m256i*)qs);
  const __m256i hbits = _mm256_loadu_si256((const __m256i*)qh);

  const __m256i q5l_0 = _mm256_and_si256(q5bits, lowMask);
  const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), k + 0), 4);
  const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0);
  hmask = _mm256_slli_epi16(hmask, 1);

  const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), lowMask);
  const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), k + 1), 4);
  const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1);

  return _mm512_inserti32x8(_mm512_castsi256_si512(q5_0), q5_1, 1);
}

// used for block_q6_K
inline void bytes_from_nibbles_128(__m512i& r0, __m512i& r1, const uint8_t* qs, const uint8_t* qh) {
  const __m256i m4 = _mm256_set1_epi8(0xF);
  const __m256i m2 = _mm256_set1_epi8(0x3);

  const __m256i q6bits1 = _mm256_loadu_si256((const __m256i*)qs);
  const __m256i q6bits2 = _mm256_loadu_si256((const __m256i*)(qs + 32));
  const __m256i q6bitsH = _mm256_loadu_si256((const __m256i*)qh);

  const __m256i q6h_0 = _mm256_slli_epi16(_mm256_and_si256(q6bitsH, m2), 4);
  const __m256i q6h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 2), m2), 4);
  const __m256i q6h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 4), m2), 4);
  const __m256i q6h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 6), m2), 4);

  const __m256i q6_0 = _mm256_or_si256(_mm256_and_si256(q6bits1, m4), q6h_0);
  const __m256i q6_1 = _mm256_or_si256(_mm256_and_si256(q6bits2, m4), q6h_1);
  const __m256i q6_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q6bits1, 4), m4), q6h_2);
  const __m256i q6_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q6bits2, 4), m4), q6h_3);

  r0 = _mm512_inserti32x8(_mm512_castsi256_si512(q6_0), q6_1, 1);
  r1 = _mm512_inserti32x8(_mm512_castsi256_si512(q6_2), q6_3, 1);
}

inline __m512i packNibbles(__m512i r0, __m512i r1) { return _mm512_or_si512(r0, _mm512_slli_epi16(r1, 4)); }

template <typename TB>
inline void pack_qs(void* RESTRICT packed_B, const TB* RESTRICT B, int KB) {
  int8_t tmp[8 * 64];
  __m256i v[8], v2[8];
  for (int n = 0; n < 8; ++n) {
    v[n] = bytes_from_nibbles_32(B[n * KB].qs);
  }
  transpose_8x8_32bit(v, v2);
  for (int n = 0; n < 8; ++n) {
    _mm256_storeu_si256((__m256i*)(tmp + n * 64), v2[n]);
  }
  for (int n = 0; n < 8; ++n) {
    v[n] = bytes_from_nibbles_32(B[(n + 8) * KB].qs);
  }
  transpose_8x8_32bit(v, v2);
  for (int n = 0; n < 8; ++n) {
    _mm256_storeu_si256((__m256i*)(tmp + n * 64 + 32), v2[n]);
  }

  // pack again with 128 to fully utilize vector length
  for (int n = 0; n < 8; n += 2) {
    __m512i r0 = _mm512_loadu_si512((const __m512i*)(tmp + n * 64));
    __m512i r1 = _mm512_loadu_si512((const __m512i*)(tmp + n * 64 + 64));
    __m512i r1r0 = packNibbles(r0, r1);
    _mm512_storeu_si512((__m512i*)((char*)packed_B + n * 32), r1r0);
  }
}

template <>
inline void pack_qs<block_q8_0>(void* RESTRICT packed_B, const block_q8_0* RESTRICT B, int KB) {
  __m256i v[8], v2[8];
  for (int n = 0; n < 8; ++n) {
    v[n] = _mm256_loadu_si256((const __m256i*)(B[n * KB].qs));
  }
  transpose_8x8_32bit(v, v2);
  for (int n = 0; n < 8; ++n) {
    _mm256_storeu_si256((__m256i*)((char*)packed_B + n * 64), v2[n]);
  }
  for (int n = 0; n < 8; ++n) {
    v[n] = _mm256_loadu_si256((const __m256i*)(B[(n + 8) * KB].qs));
  }
  transpose_8x8_32bit(v, v2);
  for (int n = 0; n < 8; ++n) {
    _mm256_storeu_si256((__m256i*)((char*)packed_B + n * 64 + 32), v2[n]);
  }
}

template <>
inline void pack_qs<block_q4_K>(void* RESTRICT packed_B, const block_q4_K* RESTRICT B, int KB) {
  __m512i v[16];
  // QK_K 256 with 8 groups, handle 2 groups at a time
  char* pb = (char*)packed_B;
  for (int k = 0; k < QK_K / 64; ++k) {
    // pack 2 groups { n, g,  k} to {g, k/4, 4n}
    //          e.g. {16, 2, 32} to {2,   8, 64}
    for (int n = 0; n < TILE_N; ++n) {
      v[n] = bytes_from_nibbles_64(B[n * KB].qs + k * 32);
    }

    transpose_16x16_32bit(v);

    // pack again with 128 to fully utilize vector length
    for (int n = 0; n < TILE_N; n += 2) {
      _mm512_storeu_si512((__m512i*)pb, packNibbles(v[n], v[n + 1]));
      pb += 64;
    }
  }
}

template <>
inline void pack_qs<block_q5_K>(void* RESTRICT packed_B, const block_q5_K* RESTRICT B, int KB) {
  __m512i v[16];
  const __m512i lowMask = _mm512_set1_epi8(0xF);
  // QK_K 256 with 8 groups, handle 2 groups at a time
  char* pb = (char*)packed_B;
  char* ph = (char*)packed_B + (QK_K / 2) * TILE_N;
  for (int k = 0; k < QK_K / 64; ++k) {
    // pack 2 groups { n, g,  k} to {g, k/4, 4n}
    //          e.g. {16, 2, 32} to {2,   8, 64}
    for (int n = 0; n < TILE_N; ++n) {
      v[n] = bytes_from_nibbles_64(B[n * KB].qs + k * 32, B[n * KB].qh, /* group */ 2 * k);
    }

    transpose_16x16_32bit(v);

    // 1. pack lower 4bits with 2 groups
    for (int n = 0; n < TILE_N; n += 2) {
      // get lower 4 bits
      const __m512i r0 = _mm512_and_si512(v[n], lowMask);
      const __m512i r1 = _mm512_and_si512(v[n + 1], lowMask);
      _mm512_storeu_si512((__m512i*)pb, packNibbles(r0, r1));
      pb += 64;
    }

    // 2. pack higher 1bit with 2 groups
    const __m512i hmask = _mm512_set1_epi8(0x10);
    for (int g = 0; g < 2; ++g) {
      __m512i hbits = _mm512_setzero_si512();
      hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 0], hmask), 4));
      hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 1], hmask), 3));
      hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 2], hmask), 2));
      hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 3], hmask), 1));
      hbits = _mm512_add_epi8(hbits, _mm512_and_si512(v[g * 8 + 4], hmask));
      hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 5], hmask), 1));
      hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 6], hmask), 2));
      hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 7], hmask), 3));
      _mm512_storeu_si512((__m512i*)ph, hbits);
      ph += 64;
    }
  }
}

template <>
inline void pack_qs<block_q6_K>(void* RESTRICT packed_B, const block_q6_K* RESTRICT B, int KB) {
  __m512i v[32];
  const __m512i lowMask = _mm512_set1_epi8(0xF);
  // QK_K 256 with 8 groups, handle 4 groups at a time
  char* pb = (char*)packed_B;
  char* ph = (char*)packed_B + (QK_K / 2) * TILE_N;
  for (int k = 0; k < QK_K / 128; ++k) {
    for (int n = 0; n < TILE_N; ++n) {
      bytes_from_nibbles_128(v[n], v[n + 16], B[n * KB].ql + k * 64, B[n * KB].qh + k * 32);
    }

    // top half: group 0,1 or 4,5; bottom half: group 2,3 or 6,7
    transpose_16x16_32bit(v);
    transpose_16x16_32bit(v + 16);

    // 1. pack lower 4bits with 4 groups
    for (int n = 0; n < 32; n += 2) {
      const __m512i r0 = _mm512_and_si512(v[n], lowMask);
      const __m512i r1 = _mm512_and_si512(v[n + 1], lowMask);
      _mm512_storeu_si512((__m512i*)pb, packNibbles(r0, r1));
      pb += 64;
    }

    // 2. pack higher 2bit with 4 groups
    const __m512i hmask = _mm512_set1_epi8(0x30);
    for (int g = 0; g < 8; ++g) {
      __m512i hbits = _mm512_setzero_si512();
      hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 4 + 0], hmask), 4));
      hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 4 + 1], hmask), 2));
      hbits = _mm512_add_epi8(hbits, _mm512_and_si512(v[g * 4 + 2], hmask));
      hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 4 + 3], hmask), 2));
      _mm512_storeu_si512((__m512i*)ph, hbits);
      ph += 64;
    }
  }
}

template <>
inline void pack_qs<block_iq4_xs>(void* RESTRICT packed_B, const block_iq4_xs* RESTRICT B, int KB) {
  __m512i v[16];
  char* pb = (char*)packed_B;
  for (int k = 0; k < QK_K / 64; ++k) {
    for (int n = 0; n < TILE_N; ++n) {
      __m256i r0 = bytes_from_nibbles_32(B[n * KB].qs + k * 32 + 0);
      __m256i r1 = bytes_from_nibbles_32(B[n * KB].qs + k * 32 + 16);
      v[n] = _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
    }

    transpose_16x16_32bit(v);

    // pack again with 128 to fully utilize vector length
    for (int n = 0; n < TILE_N; n += 2) {
      _mm512_storeu_si512((__m512i*)pb, packNibbles(v[n], v[n + 1]));
      pb += 64;
    }
  }
}

// pack B to vnni formats in 4bits or 8 bits
void pack_B(void* RESTRICT packed_B, const block_q4_0* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);
  ggml_half* d0 = reinterpret_cast<ggml_half*>((char*)packed_B + TILE_N * TILE_K / 2);
  for (int n = 0; n < TILE_N; ++n) {
    d0[n] = B[n * KB].d;
  }
}

void pack_B(void* RESTRICT packed_B, const block_q4_1* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);
  ggml_half* d0 = reinterpret_cast<ggml_half*>((char*)packed_B + TILE_N * TILE_K / 2);
  ggml_half* m0 = d0 + TILE_N;
  for (int n = 0; n < TILE_N; ++n) {
    d0[n] = B[n * KB].d;
    m0[n] = B[n * KB].m;
  }
}

inline void s8s8_compensation(void* RESTRICT packed_B) {
  // packed_B layout:
  //   quants {TILE_N, TILEK}  int8_t
  //   d0     {TILE_N}      ggml_half
  //   comp   {TILE_N}        int32_t
  const int offset = TILE_N * TILE_K + TILE_N * sizeof(ggml_half);
  __m512i vcomp = _mm512_setzero_si512();
  const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
  for (int k = 0; k < 8; ++k) {
    __m512i vb = _mm512_loadu_si512((const __m512i*)((const char*)packed_B + k * 64));
    vcomp = _mm512_dpbusd_epi32(vcomp, off, vb);
  }
  _mm512_storeu_si512((__m512i*)((char*)(packed_B) + offset), vcomp);
}

void pack_B(void* RESTRICT packed_B, const block_q8_0* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);
  ggml_half* d0 = reinterpret_cast<ggml_half*>((char*)packed_B + TILE_N * TILE_K);
  for (int n = 0; n < TILE_N; ++n) {
    d0[n] = B[n * KB].d;
  }
  s8s8_compensation(packed_B);
}

// convert 8 * {min, scale} from int6 to int8
inline void unpack_mins_and_scales(const uint8_t* scales, uint32_t* utmp) {
  const uint32_t kmask1 = 0x3f3f3f3f;
  const uint32_t kmask2 = 0x0f0f0f0f;
  const uint32_t kmask3 = 0x03030303;

  memcpy(utmp, scales, 12);
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
  const uint32_t uaux = utmp[1] & kmask1;
  utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
  utmp[2] = uaux;
  utmp[0] &= kmask1;
}

// packed_B layout:
//   quants {8, TILE_N, 16}  uint8
//   scales {8, TILE_N}      uint8
//   mins   {8, TILE_N}      uint8
//   d      {TILE_N}     ggml_half
//   dmin   {TILE_N}     ggml_half
void pack_B(void* RESTRICT packed_B, const block_q4_K* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);

  uint8_t* scales = reinterpret_cast<uint8_t*>((char*)packed_B + (QK_K / 2) * TILE_N);
  uint8_t* mins = scales + 8 * TILE_N;
  ggml_half* d = reinterpret_cast<ggml_half*>(mins + 8 * TILE_N);
  ggml_half* dmin = d + TILE_N;

  union {
    uint32_t u32[4];
    uint8_t u8[16];
  } s;

  for (int n = 0; n < TILE_N; ++n) {
    unpack_mins_and_scales(B[n * KB].scales, s.u32);
    for (int k = 0; k < 8; ++k) {
      scales[k * TILE_N + n] = s.u8[k];
      mins[(k >> 1) * TILE_N * 2 + n * 2 + (k & 0x1)] = s.u8[k + 8];
    }
    d[n] = B[n * KB].d;
    dmin[n] = B[n * KB].dmin;
  }
}

// packed_B layout:
//   quants {8, TILE_N, 16}  uint8
//   qh     {8, TILE_N,  4}  uint8
//   scales {8, TILE_N}      uint8
//   mins   {8, TILE_N}      uint8
//   d      {TILE_N}     ggml_half
//   dmin   {TILE_N}     ggml_half
void pack_B(void* RESTRICT packed_B, const block_q5_K* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);

  uint8_t* scales = reinterpret_cast<uint8_t*>((char*)packed_B + (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N);
  uint8_t* mins = scales + 8 * TILE_N;
  ggml_half* d = reinterpret_cast<ggml_half*>(mins + 8 * TILE_N);
  ggml_half* dmin = d + TILE_N;

  union {
    uint32_t u32[4];
    uint8_t u8[16];
  } s;

  for (int n = 0; n < TILE_N; ++n) {
    unpack_mins_and_scales(B[n * KB].scales, s.u32);
    for (int k = 0; k < 8; ++k) {
      scales[k * TILE_N + n] = s.u8[k];
      mins[(k >> 1) * TILE_N * 2 + n * 2 + (k & 0x1)] = s.u8[k + 8];
    }
    d[n] = B[n * KB].d;
    dmin[n] = B[n * KB].dmin;
  }
}

// packed_B layout:
//   quants {16, TILE_N, 8}  uint8
//   qh     {16, TILE_N, 4}  uint8
//   scales {16, TILE_N}      uint8
//   d      {TILE_N}     ggml_half
void pack_B(void* RESTRICT packed_B, const block_q6_K* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);

  uint8_t* scales = reinterpret_cast<uint8_t*>((char*)packed_B + (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N);
  ggml_half* d = reinterpret_cast<ggml_half*>(scales + 16 * TILE_N);
  for (int n = 0; n < TILE_N; ++n) {
    const int8_t* ps = B[n * KB].scales;
    for (int k = 0; k < 16; ++k) {
      scales[k * TILE_N + n] = ps[k];
    }
    d[n] = B[n * KB].d;
  }
}

// packed_B layout:
//   quants {8, TILE_N, 16}  uint8
//   scales {8, TILE_N}       int8
//   d      {TILE_N}     ggml_half
void pack_B(void* RESTRICT packed_B, const block_iq4_xs* RESTRICT B, int KB) {
  pack_qs(packed_B, B, KB);

  int8_t* scales = reinterpret_cast<int8_t*>((char*)packed_B + (QK_K / 2) * TILE_N);
  ggml_half* d = reinterpret_cast<ggml_half*>(scales + 8 * TILE_N);

  // pack the scales
  for (int n = 0; n < TILE_N; ++n) {
    uint16_t sh = B[n * KB].scales_h;
    for (int k = 0; k < 8; k += 2) {
      const int16_t ls1 = ((B[n * KB].scales_l[k / 2] & 0xf) | ((sh << 4) & 0x30)) - 32;
      const int16_t ls2 = ((B[n * KB].scales_l[k / 2] >> 4) | ((sh << 2) & 0x30)) - 32;
      scales[(k + 0) * TILE_N + n] = ls1;
      scales[(k + 1) * TILE_N + n] = ls2;
      sh >>= 4;
    }
    d[n] = B[n * KB].d;
  }
}

template <typename TB, typename packed_B_t = packed_B_type<TB>>
void unpack_B(packed_B_t* RESTRICT tile, const void* RESTRICT packed_B) {
  GGML_UNUSED(tile);
  GGML_UNUSED(packed_B);
};

template <>
void unpack_B<block_q4_0>(int8_t* RESTRICT tile, const void* RESTRICT packed_B) {
  const __m512i off = _mm512_set1_epi8(8);
  const __m512i lowMask = _mm512_set1_epi8(0xF);
  for (int n = 0; n < 8; n += 2) {
    __m512i bytes = _mm512_loadu_si512((const __m512i*)((const char*)packed_B + n * 32));
    const __m512i r0 = _mm512_sub_epi8(_mm512_and_si512(bytes, lowMask), off);
    const __m512i r1 = _mm512_sub_epi8(_mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask), off);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 0), r0);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 64), r1);
  }
}

template <>
void unpack_B<block_q4_1>(uint8_t* RESTRICT tile, const void* RESTRICT packed_B) {
  const __m512i lowMask = _mm512_set1_epi8(0xF);
  for (int n = 0; n < 8; n += 2) {
    __m512i bytes = _mm512_loadu_si512((const __m512i*)((const char*)packed_B + n * 32));
    const __m512i r0 = _mm512_and_si512(bytes, lowMask);
    const __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 0), r0);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 64), r1);
  }
}

// packed_B_t for QKK is int8_t
template <typename TB>
void unpack_B(int8_t* RESTRICT tile, const void* RESTRICT packed_B, int k) {
  const int packed_B_group_size = QK_K / 2 * TILE_N / 8;
  const char* packed_B_group = (const char*)packed_B + k * packed_B_group_size;
  const __m512i lowMask = _mm512_set1_epi8(0xF);
  for (int n = 0; n < 8; n += 2) {
    __m512i bytes = _mm512_loadu_si512(packed_B_group + n * 32);
    const __m512i r0 = _mm512_and_si512(bytes, lowMask);
    const __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 0), r0);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 64), r1);
  }
}

template <>
void unpack_B<block_q5_K>(int8_t* RESTRICT tile, const void* RESTRICT packed_B, int k) {
  // lower 4bits, stride 256 bytes
  const int packed_l4_group_size = QK_K / 2 * TILE_N / 8;
  const char* pb = (const char*)packed_B + k * packed_l4_group_size;

  // higher 1bit, stride 64 bytes
  const int packed_h1_group_size = QK_K / 8 * TILE_N / 8;
  const char* ph = (const char*)packed_B + (QK_K / 2) * TILE_N + k * packed_h1_group_size;
  const __m512i hbits = _mm512_loadu_si512(ph);

  const __m512i lowMask = _mm512_set1_epi8(0xF);
  __m512i hmask0 = _mm512_set1_epi8(0x1);
  __m512i hmask1 = _mm512_set1_epi8(0x2);

  for (int n = 0; n < 8; n += 2) {
    __m512i bytes = _mm512_loadu_si512(pb + n * 32);
    __m512i r0 = _mm512_and_si512(bytes, lowMask);
    __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
    __m512i h0 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask0), n), 4);
    __m512i h1 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), n + 1), 4);

    hmask0 = _mm512_slli_epi16(hmask0, 2);
    hmask1 = _mm512_slli_epi16(hmask1, 2);
    r0 = _mm512_add_epi8(r0, h0);
    r1 = _mm512_add_epi8(r1, h1);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 0), r0);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 64), r1);
  }
}

template <>
void unpack_B<block_q6_K>(int8_t* RESTRICT tile, const void* RESTRICT packed_B, int k) {
  // lower 4bits, stride 128 bytes
  const int packed_l4_group_size = QK_K / 2 * TILE_N / 16;
  const char* pb = (const char*)packed_B + k * packed_l4_group_size;

  // higher 2bits, stride 64 bytes
  const int packed_h2_group_size = QK_K / 4 * TILE_N / 16;
  const char* ph = (const char*)packed_B + (QK_K / 2) * TILE_N + k * packed_h2_group_size;
  const __m512i hbits = _mm512_loadu_si512(ph);

  const __m512i off = _mm512_set1_epi8(32);
  const __m512i lowMask = _mm512_set1_epi8(0xF);
  __m512i hmask0 = _mm512_set1_epi8(0x3);  // 0011
  __m512i hmask1 = _mm512_set1_epi8(0xC);  // 1100

  // notes: skip zero padding from row4 to row7 as we have done so in `unpack_A`
  __m512i bytes = _mm512_loadu_si512(pb);
  __m512i r0 = _mm512_and_si512(bytes, lowMask);
  __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
  __m512i h0 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask0), 4);
  __m512i h1 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask1), 2);
  _mm512_storeu_si512((__m512i*)(tile + 0), _mm512_sub_epi8(_mm512_add_epi8(r0, h0), off));
  _mm512_storeu_si512((__m512i*)(tile + 64), _mm512_sub_epi8(_mm512_add_epi8(r1, h1), off));

  hmask0 = _mm512_slli_epi16(hmask0, 4);
  hmask1 = _mm512_slli_epi16(hmask1, 4);

  bytes = _mm512_loadu_si512(pb + 64);
  r0 = _mm512_and_si512(bytes, lowMask);
  r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
  h0 = _mm512_and_si512(hbits, hmask0);
  h1 = _mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), 2);
  _mm512_storeu_si512((__m512i*)(tile + 128), _mm512_sub_epi8(_mm512_add_epi8(r0, h0), off));
  _mm512_storeu_si512((__m512i*)(tile + 192), _mm512_sub_epi8(_mm512_add_epi8(r1, h1), off));
}

template <>
void unpack_B<block_iq4_xs>(int8_t* RESTRICT tile, const void* RESTRICT packed_B, int k) {
  static const __m512i values128 = _mm512_set_epi8(
      113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127, 113, 89, 69, 53, 38, 25, 13, 1, -10,
      -22, -35, -49, -65, -83, -104, -127, 113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
      113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127);

  const int packed_B_group_size = QK_K / 2 * TILE_N / 8;
  const char* pb = (const char*)packed_B + k * packed_B_group_size;
  const __m512i lowMask = _mm512_set1_epi8(0xF);

  for (int n = 0; n < 8; n += 2) {
    __m512i bytes = _mm512_loadu_si512(pb + n * 32);
    const __m512i r0 = _mm512_shuffle_epi8(values128, _mm512_and_si512(bytes, lowMask));
    const __m512i r1 = _mm512_shuffle_epi8(values128, _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask));
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 0), r0);
    _mm512_storeu_si512((__m512i*)(tile + n * 64 + 64), r1);
  }
}

template <typename TA, typename TB, bool is_acc>
struct acc_C {};

template <bool is_acc>
struct acc_C<block_q8_0, block_q4_0, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_0* A, int lda,
                    const void* packed_B, int nr) {
    const int offset = TILE_N * TILE_K / 2;
    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)((const char*)packed_B + offset)));

    for (int m = 0; m < nr; ++m) {
      const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }
      vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <bool is_acc>
struct acc_C<block_q8_1, block_q4_1, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_1* A, int lda,
                    const void* packed_B, int nr) {
    const int offset = TILE_N * TILE_K / 2;
    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)((const char*)packed_B + offset)));
    const __m512 vm0 = _mm512_cvtph_ps(
        _mm256_loadu_si256((const __m256i*)((const char*)packed_B + offset + TILE_N * sizeof(ggml_half))));

    for (int m = 0; m < nr; ++m) {
      const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
      const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s));
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }
      vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum);
      vsum = _mm512_fmadd_ps(vm0, vs1, vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <bool is_acc>
struct acc_C<block_q8_0, block_q8_0, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_0* A, int lda,
                    const void* packed_B, int nr) {
    const int offset = TILE_N * TILE_K;
    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)((const char*)packed_B + offset)));

    for (int m = 0; m < nr; ++m) {
      const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }
      vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <bool is_acc>
struct acc_C<block_q8_K, block_q4_K, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_K* A, int lda,
                    const void* packed_B, int nr) {
    const uint8_t* scales = reinterpret_cast<const uint8_t*>((const char*)packed_B + (QK_K / 2) * TILE_N);
    const uint8_t* mins = scales + 8 * TILE_N;
    const ggml_half* d0 = reinterpret_cast<const ggml_half*>(mins + 8 * TILE_N);
    const ggml_half* dmin = d0 + TILE_N;

    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)d0));
    const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)dmin));

    for (int m = 0; m < nr; ++m) {
      const float d1 = A[m * lda].d;
      const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
      const __m512 vdm = _mm512_mul_ps(_mm512_set1_ps(-d1), vdmin);
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }

      const __m256i q8sums = _mm256_loadu_si256((const __m256i*)A[m * lda].bsums);
      const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));

      __m512i acc_m = _mm512_setzero_si512();
      for (int k = 0; k < 4; ++k) {
        __m512i vmask = _mm512_set1_epi32(k);
        __m512i va = _mm512_permutexvar_epi32(vmask, _mm512_castsi128_si512(q8s));
        __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i*)(mins + k * 32)));
        acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
      }

      vsum = _mm512_fmadd_ps(vtile, vd, vsum);
      vsum = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc_m), vdm, vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <bool is_acc>
struct acc_C<block_q8_K, block_q5_K, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_K* A, int lda,
                    const void* packed_B, int nr) {
    const uint8_t* scales =
        reinterpret_cast<const uint8_t*>((const char*)packed_B + (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N);
    const uint8_t* mins = scales + 8 * TILE_N;
    const ggml_half* d0 = reinterpret_cast<const ggml_half*>(mins + 8 * TILE_N);
    const ggml_half* dmin = d0 + TILE_N;

    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)d0));
    const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)dmin));

    for (int m = 0; m < nr; ++m) {
      const float d1 = A[m * lda].d;
      const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
      const __m512 vdm = _mm512_mul_ps(_mm512_set1_ps(-d1), vdmin);
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }

      const __m256i q8sums = _mm256_loadu_si256((const __m256i*)A[m * lda].bsums);
      const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));

      __m512i acc_m = _mm512_setzero_si512();
      for (int k = 0; k < 4; ++k) {
        __m512i vmask = _mm512_set1_epi32(k);
        __m512i va = _mm512_permutexvar_epi32(vmask, _mm512_castsi128_si512(q8s));
        __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i*)(mins + k * 32)));
        acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
      }

      vsum = _mm512_fmadd_ps(vtile, vd, vsum);
      vsum = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc_m), vdm, vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <bool is_acc>
struct acc_C<block_q8_K, block_q6_K, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_K* A, int lda,
                    const void* packed_B, int nr) {
    const uint8_t* scales =
        reinterpret_cast<const uint8_t*>((const char*)packed_B + (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N);
    const ggml_half* d0 = reinterpret_cast<const ggml_half*>(scales + 16 * TILE_N);

    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)d0));

    for (int m = 0; m < nr; ++m) {
      const float d1 = A[m * lda].d;
      const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }

      vsum = _mm512_fmadd_ps(vtile, vd, vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <bool is_acc>
struct acc_C<block_q8_K, block_iq4_xs, is_acc> {
  static void apply(float* RESTRICT C, int ldc, const int32_t* RESTRICT tile, const block_q8_K* A, int lda,
                    const void* packed_B, int nr) {
    const int8_t* scales = reinterpret_cast<const int8_t*>((const char*)packed_B + (QK_K / 2) * TILE_N);
    const ggml_half* d0 = reinterpret_cast<const ggml_half*>(scales + 8 * TILE_N);

    const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)d0));

    for (int m = 0; m < nr; ++m) {
      const float d1 = A[m * lda].d;
      const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
      const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

      __m512 vsum;
      if (is_acc) {
        vsum = _mm512_loadu_ps(C + m * ldc);
      } else {
        vsum = _mm512_set1_ps(0.f);
      }

      vsum = _mm512_fmadd_ps(vtile, vd, vsum);
      _mm512_storeu_ps(C + m * ldc, vsum);
    }
  }
};

template <typename TB>
constexpr int get_quants_size();
template <>
constexpr int get_quants_size<block_q4_K>() {
  return (QK_K / 2) * TILE_N;
}
template <>
constexpr int get_quants_size<block_q5_K>() {
  return (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N;
}
template <>
constexpr int get_quants_size<block_q6_K>() {
  return (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N;
}
template <>
constexpr int get_quants_size<block_iq4_xs>() {
  return (QK_K / 2) * TILE_N;
}

// used for QKK format
template <typename TB, bool is_acc, typename std::enable_if<is_type_qkk<TB>::value, int>::type = 0>
inline void scale_C(const int32_t* RESTRICT tile, int32_t* RESTRICT sumi, const void* packed_B, int k, int nr) {
  const uint8_t* scales = reinterpret_cast<const uint8_t*>((const char*)packed_B + get_quants_size<TB>());
  const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)(scales + k * TILE_N)));

  for (int m = 0; m < nr; ++m) {
    __m512i vsumi;
    if (is_acc) {
      vsumi = _mm512_loadu_si512(sumi + m * TILE_N);
    } else {
      vsumi = _mm512_setzero_si512();
    }
    __m512i vtile = _mm512_loadu_si512(tile + m * TILE_N);
    vsumi = _mm512_add_epi32(vsumi, _mm512_mullo_epi32(vtile, vscale));
    _mm512_storeu_si512((__m512i*)(sumi + m * TILE_N), vsumi);
  }
}

template <typename TA, typename TB, typename TC, int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_avx {
  static void apply(int K, const TA* RESTRICT A, const TB* RESTRICT B, TC* RESTRICT C, int ldc) {
    GGML_UNUSED(K);
    GGML_UNUSED(A);
    GGML_UNUSED(B);
    GGML_UNUSED(C);
    GGML_UNUSED(ldc);
  }
};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int K, const float* RESTRICT A, const ggml_fp16_t* RESTRICT B, float* RESTRICT C, int ldc) {
    constexpr int ROWS = BLOCK_M;
    constexpr int COLS = BLOCK_N;
    assert(BLOCK_K == 16);

    __m512 va;
    __m512 vb[COLS];
    __m512 vc[ROWS * COLS];

    auto loadc = [&](int idx) { vc[idx] = _mm512_setzero_ps(); };
    Unroll<ROWS * COLS>{}(loadc);

    auto compute = [&](int idx, int k) {
      // TODO: use `constexpr` here to get rid of interger div
      // when upgraded to C++17
      const int row = idx / COLS;
      const int col = idx % COLS;

      if (col == 0) {
        va = _mm512_loadu_ps(A + row * K + k);
      }
      if (row == 0) {
        vb[col] = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(B + col * K + k)));
      }
      vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]);
    };

    for (int k = 0; k < K; k += 16) {
      Unroll<ROWS * COLS>{}(compute, k);
    }

    auto storec = [&](int idx) {
      const int row = idx / COLS;
      const int col = idx % COLS;
      C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]);
    };
    Unroll<ROWS * COLS>{}(storec);
  }
};

#define LAUNCH_TINYGEMM_KERNEL_AVX(MB_SIZE, NB_SIZE)                                      \
  tinygemm_kernel_avx<float, type, float, MB_SIZE, NB_SIZE, blck_size>::apply(            \
      K, (const float*)src1->data + mb_start * K, (const type*)src0->data + nb_start * K, \
      (float*)dst->data + mb_start * ldc + nb_start, ldc);

// re-organize in the format {NB, KB, TILE_SIZE}:
#define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size

template <typename TB, int BLOCK_K>
void convert_B_packed_format(void* RESTRICT packed_B, const TB* RESTRICT B, int N, int K) {
  const int NB = N / TILE_N;
  const int KB = K / BLOCK_K;
  const int TILE_SIZE = get_tile_size<TB>();

  // parallel on NB should be enough
  parallel_for(1, 0, NB, [&](int begin, int end) {
    for (int n = begin; n < end; ++n) {
      for (int k = 0; k < KB; ++k) {
        int n0 = n * TILE_N;
        pack_B((char*)packed_B + PACKED_INDEX(n, k, KB, TILE_SIZE), &B[n0 * KB + k], KB);
      }
    }
  });
}

template <typename TA, typename TB, typename TC, int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni {};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_q4_0);

    const block_q8_0* RESTRICT A = static_cast<const block_q8_0*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    __m512i va[8];
    __m512 vc[COLS];
    __m512 vd1;

    // sum of offsets, shared across COLS
    //
    // avx512-vnni does not have `_mm512_dpbssd_epi32`,
    // need to transfrom ss to us:
    //   a * (b - 8) is equavilent to b * a - 8 * a
    //   s    u   u                   u   s   u   s
    //
    __m512i vcomp;

    const __m512i off = _mm512_set1_epi8(8);
    const __m512i lowMask = _mm512_set1_epi8(0xF);

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    auto compute = [&](int col, int i) {
      // load a and compute compensation
      if (col == 0) {
        const int32_t* a_ptr = reinterpret_cast<const int32_t*>(A[0 * KB + i].qs);
        vcomp = _mm512_setzero_si512();
        for (int k = 0; k < 8; ++k) {
          va[k] = _mm512_set1_epi32(a_ptr[k]);
          vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
        }
        vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
      }

      // load b
      __m512i vsum = _mm512_setzero_si512();
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      for (int k = 0; k < 8; k += 2) {
        __m512i bytes = _mm512_loadu_si512((const __m512i*)(b_ptr + k * 32));
        __m512i vb0 = _mm512_and_si512(bytes, lowMask);
        vsum = _mm512_dpbusd_epi32(vsum, vb0, va[k + 0]);
        __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
        vsum = _mm512_dpbusd_epi32(vsum, vb1, va[k + 1]);
      }
      const int offset = TILE_N * TILE_K / 2;
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset)));
      vsum = _mm512_sub_epi32(vsum, vcomp);

      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

template <int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_q4_1);

    const block_q8_1* RESTRICT A = static_cast<const block_q8_1*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    __m512i va[8];
    __m512i vb[8];
    __m512 vc[COLS];
    __m512 vd1, vs1;

    const __m512i lowMask = _mm512_set1_epi8(0xF);

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    auto compute = [&](int col, int i) {
      // load a
      if (col == 0) {
        const int32_t* a_ptr = reinterpret_cast<const int32_t*>(A[0 * KB + i].qs);
        for (int k = 0; k < 8; ++k) {
          va[k] = _mm512_set1_epi32(a_ptr[k]);
        }
        vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
        vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s));
      }

      // load b
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      for (int k = 0; k < 8; k += 2) {
        __m512i bytes = _mm512_loadu_si512((const __m512i*)(b_ptr + k * 32));
        vb[k + 0] = _mm512_and_si512(bytes, lowMask);
        vb[k + 1] = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
      }
      const int offset = TILE_N * TILE_K / 2;
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset)));
      const __m512 vm0 =
          _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset + TILE_N * sizeof(ggml_half))));

      __m512i vsum = _mm512_setzero_si512();
      for (int k = 0; k < 8; ++k) {
        vsum = _mm512_dpbusd_epi32(vsum, vb[k], va[k]);
      }

      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]);
      vc[col] = _mm512_fmadd_ps(vm0, vs1, vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_q8_0) + TILE_N * sizeof(int32_t);

    const block_q8_0* RESTRICT A = static_cast<const block_q8_0*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    __m512i va[8];
    __m512i vb[8];
    __m512 vc[COLS];
    __m512 vd1;

    // Notes: s8s8 igemm compensation in avx512-vnni
    // change s8s8 to u8s8 with compensate
    //   a * b = (a + 128) * b - 128 * b
    //   s   s       u       s    u    s
    //
    // (128 * b is pre-computed when packing B to vnni formats)
    //
    const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    auto compute = [&](int col, int i) {
      // load a and add offset 128
      if (col == 0) {
        const int32_t* a_ptr = reinterpret_cast<const int32_t*>(A[0 * KB + i].qs);
        for (int k = 0; k < 8; ++k) {
          va[k] = _mm512_set1_epi32(a_ptr[k]);
          va[k] = _mm512_add_epi8(va[k], off);
        }
        vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
      }

      // load b
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      for (int k = 0; k < 8; ++k) {
        vb[k] = _mm512_loadu_si512((const __m512i*)(b_ptr + k * 64));
      }
      const int offset = TILE_N * TILE_K;
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset)));
      const int offset2 = TILE_N * TILE_K + TILE_N * sizeof(ggml_half);
      const __m512i vcomp = _mm512_loadu_si512((const __m512i*)(b_ptr + offset2));

      __m512i vsum = _mm512_setzero_si512();
      for (int k = 0; k < 8; ++k) {
        vsum = _mm512_dpbusd_epi32(vsum, va[k], vb[k]);
      }
      vsum = _mm512_sub_epi32(vsum, vcomp);

      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_q4_K) + TILE_N * 4;

    const block_q8_K* RESTRICT A = static_cast<const block_q8_K*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    // a.qs:   8 groups, 32 bytes each group (m256i)
    __m512i va[8];
    // a.bsum: 8 groups,  2 bytes each group (m128i)
    __m512i va_bsum;
    __m512 vc[COLS];
    __m512 vd1;

    // packed_B:
    const int offset_scales = (QK_K / 2) * TILE_N;
    const int offset_mins = (QK_K / 2) * TILE_N + 8 * TILE_N;
    const int offset_d0 = (QK_K / 2) * TILE_N + 16 * TILE_N;
    const int offset_dmin = (QK_K / 2) * TILE_N + 16 * TILE_N + TILE_N * sizeof(ggml_half);

    const __m512i lowMask = _mm512_set1_epi8(0xF);

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    // Notes: vnni formats in QK_K
    //   a) quants vnni format
    //     int8  {k/4, n, 4}, viewed as 2d {k/4, 4n}, k = 32
    //     from {16, 32} to {8, 64}
    //
    //   b) min vnni format
    //     int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8
    //     from {16,  8} to {4, 32}
    //
    auto compute = [&](int col, int i) {
      // load a
      if (col == 0) {
        for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
          va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(A[0 * KB + i].qs + k_group * 32)));
        }
        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)A[0 * KB + i].bsums);
        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
        va_bsum = _mm512_castsi128_si512(q8s);
        vd1 = _mm512_set1_ps(A[0 * KB + i].d);
      }

      // step 1: accumultate the quants
      __m512i acc = _mm512_setzero_si512();
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      const char* b_qs = b_ptr;
      for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
        __m512i vsum = _mm512_setzero_si512();
        for (int k = 0; k < 8; k += 2) {
          __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 0), va[k_group]);
          __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 1), va[k_group]);

          __m512i bytes = _mm512_loadu_si512((const __m512i*)b_qs);
          __m512i vb0 = _mm512_and_si512(bytes, lowMask);
          vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
          __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
          vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);

          b_qs += 64;
        }
        // vacc += scale * (q8 @ q4)
        const __m512i vscale =
            _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)(b_ptr + offset_scales + k_group * TILE_N)));
        acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
      }
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_d0)));
      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);

      // step 2: accumulate the mins
      __m512i acc_m = _mm512_setzero_si512();
      for (int k = 0; k < 4; ++k) {
        __m512i vmask = _mm512_set1_epi32(k);
        __m512i va = _mm512_permutexvar_epi32(vmask, va_bsum);
        __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_mins + k * 32)));
        acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
      }
      const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_dmin)));
      vc[col] = _mm512_fnmadd_ps(_mm512_cvtepi32_ps(acc_m), _mm512_mul_ps(vdmin, vd1), vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_q5_K) + TILE_N * 4;

    const block_q8_K* RESTRICT A = static_cast<const block_q8_K*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    // a.qs:   8 groups, 32 bytes each group (m256i)
    __m512i va[8];
    // a.bsum: 8 groups,  2 bytes each group (m128i)
    __m512i va_bsum;
    __m512 vc[COLS];
    __m512 vd1;

    // packed_B:
    const int offset_qh = (QK_K / 2) * TILE_N;
    const int offset_scales = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N;
    const int offset_mins = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 8 * TILE_N;
    const int offset_d0 = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 16 * TILE_N;
    const int offset_dmin = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 16 * TILE_N + TILE_N * sizeof(ggml_half);

    const __m512i lowMask = _mm512_set1_epi8(0xF);

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    // Q5_K and Q4_K shares the same vnni formats, refer to notes above.
    auto compute = [&](int col, int i) {
      // load a
      if (col == 0) {
        for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
          va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(A[0 * KB + i].qs + k_group * 32)));
        }
        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)A[0 * KB + i].bsums);
        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
        va_bsum = _mm512_castsi128_si512(q8s);
        vd1 = _mm512_set1_ps(A[0 * KB + i].d);
      }

      // step 1: accumultate the quants
      __m512i acc = _mm512_setzero_si512();
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      const char* b_qs = b_ptr;
      const char* b_qh = b_ptr + offset_qh;
      for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
        __m512i vsum = _mm512_setzero_si512();
        __m512i hmask0 = _mm512_set1_epi8(0x1);
        __m512i hmask1 = _mm512_set1_epi8(0x2);
        __m512i hbits = _mm512_loadu_si512((const __m512i*)(b_qh + k_group * 64));
        for (int k = 0; k < 8; k += 2) {
          __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 0), va[k_group]);
          __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 1), va[k_group]);

          __m512i bytes = _mm512_loadu_si512((const __m512i*)b_qs);
          __m512i vb0 = _mm512_and_si512(bytes, lowMask);
          __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);

          __m512i vh0 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask0), k), 4);
          __m512i vh1 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), k + 1), 4);

          hmask0 = _mm512_slli_epi16(hmask0, 2);
          hmask1 = _mm512_slli_epi16(hmask1, 2);
          vb0 = _mm512_add_epi8(vb0, vh0);
          vb1 = _mm512_add_epi8(vb1, vh1);

          vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
          vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);

          b_qs += 64;
        }
        // vacc += scale * (q8 @ q5)
        const __m512i vscale =
            _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)(b_ptr + offset_scales + k_group * TILE_N)));
        acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
      }
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_d0)));
      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);

      // step 2: accumulate the mins
      __m512i acc_m = _mm512_setzero_si512();
      for (int k = 0; k < 4; ++k) {
        __m512i vmask = _mm512_set1_epi32(k);
        __m512i va = _mm512_permutexvar_epi32(vmask, va_bsum);
        __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_mins + k * 32)));
        acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
      }
      const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_dmin)));
      vc[col] = _mm512_fnmadd_ps(_mm512_cvtepi32_ps(acc_m), _mm512_mul_ps(vdmin, vd1), vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_K, block_q6_K, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_q6_K);

    const block_q8_K* RESTRICT A = static_cast<const block_q8_K*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    // load the 256 bytes from A to 4 avx512 vectors
    __m512i va[4];
    __m512 vc[COLS];
    __m512 vd1;

    // packed_B:
    const int offset_qh = (QK_K / 2) * TILE_N;
    const int offset_scales = (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N;
    const int offset_d0 = (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N + 16 * TILE_N;

    // compensation
    __m512i vcomp;

    const __m512i m32s = _mm512_set1_epi32(32);
    const __m512i lowMask = _mm512_set1_epi8(0xF);

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    auto compute = [&](int col, int i) {
      if (col == 0) {
        // load a
        va[0] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 0));
        va[1] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 64));
        va[2] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 128));
        va[3] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 192));

        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)A[0 * KB + i].bsums);
        vcomp = _mm512_mullo_epi32(_mm512_cvtepi16_epi32(q8sums), m32s);
        vd1 = _mm512_set1_ps(A[0 * KB + i].d);
      }

      // accmulate the quants
      __m512i acc = _mm512_setzero_si512();
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      const char* b_qs = b_ptr;
      const char* b_qh = b_ptr + offset_qh;
      int mask = 0;
      for (int k_group = 0; k_group < QK_K / 16; ++k_group) {
        int r = k_group >> 2;
        __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
        __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);

        __m512i vsum = _mm512_setzero_si512();
        __m512i hmask = _mm512_set1_epi8(0x3);

        __m512i bytes = _mm512_loadu_si512(b_qs);
        __m512i hbits = _mm512_loadu_si512(b_qh);
        __m512i vb0 = _mm512_and_si512(bytes, lowMask);
        __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
        __m512i vh0 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask), 4);
        __m512i vh1 = _mm512_slli_epi16(_mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 2)), 2);

        vb0 = _mm512_add_epi8(vb0, vh0);
        vb1 = _mm512_add_epi8(vb1, vh1);
        vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
        vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
        b_qs += 64;

        va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
        va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);

        bytes = _mm512_loadu_si512(b_qs);
        vb0 = _mm512_and_si512(bytes, lowMask);
        vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
        vh0 = _mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 4));
        vh1 = _mm512_srli_epi16(_mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 6)), 2);
        vb0 = _mm512_add_epi8(vb0, vh0);
        vb1 = _mm512_add_epi8(vb1, vh1);
        vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
        vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
        b_qs += 64;
        b_qh += 64;

        // B * A - 32 * A
        __m512i vmask = _mm512_set1_epi32(k_group);
        vsum = _mm512_sub_epi32(vsum, _mm512_permutexvar_epi32(vmask, vcomp));

        // vacc += scale * (q8 @ q6)
        const __m512i vscale =
            _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)(b_ptr + offset_scales + k_group * TILE_N)));
        acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
      }
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_d0)));
      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, BLOCK_K> {
  static void apply(int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C, int ldc) {
    constexpr int COLS = BLOCK_N / 16;
    const int TILE_SIZE = TILE_N * sizeof(block_iq4_xs) + TILE_N * 2;

    const block_q8_K* RESTRICT A = static_cast<const block_q8_K*>(_A);
    const char* RESTRICT B = static_cast<const char*>(_B);

    // load the 256 bytes from A to 4 avx512 vectors
    __m512i va[4];
    __m512 vc[COLS];
    __m512 vd1;

    // packed_B:
    const int offset_scales = (QK_K / 2) * TILE_N;
    const int offset_d0 = (QK_K / 2) * TILE_N + 8 * TILE_N;

    // compensation
    __m512i vcomp;

    const __m256i m128s = _mm256_set1_epi16(128);
    const __m512i lowMask = _mm512_set1_epi8(0xF);

    const __m512i values128 = _mm512_set_epi8(113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
                                              113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
                                              113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
                                              113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127);
    const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
    const __m512i values256 = _mm512_add_epi8(values128, off);

    auto loadc = [&](int col) { vc[col] = _mm512_setzero_ps(); };
    Unroll<COLS>{}(loadc);

    auto compute = [&](int col, int i) {
      if (col == 0) {
        // load a
        va[0] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 0));
        va[1] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 64));
        va[2] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 128));
        va[3] = _mm512_loadu_si512((const __m512i*)(A[0 * KB + i].qs + 192));

        // compensation: 128 * A
        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)A[0 * KB + i].bsums);
        vcomp = _mm512_castsi256_si512(_mm256_madd_epi16(q8sums, m128s));
        vd1 = _mm512_set1_ps(A[0 * KB + i].d);
      }

      // accmulate the quants
      __m512i acc = _mm512_setzero_si512();
      const char* b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
      const char* b_qs = b_ptr;
      int mask = 0;
      for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
        int r = k_group >> 1;
        __m512i vmask = _mm512_set1_epi32(k_group);
        __m512i vsum = _mm512_setzero_si512();
        for (int k = 0; k < 8; k += 2) {
          __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
          __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);

          __m512i bytes = _mm512_loadu_si512(b_qs);
          __m512i vb0 = _mm512_shuffle_epi8(values256, _mm512_and_si512(bytes, lowMask));
          __m512i vb1 = _mm512_shuffle_epi8(values256, _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask));

          vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
          vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
          b_qs += 64;
        }
        // (B + 128) * A - 128 * A
        vsum = _mm512_sub_epi32(vsum, _mm512_permutexvar_epi32(vmask, vcomp));

        // vacc += scale * (q8 @ q4)
        const __m512i vscale =
            _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)(b_ptr + offset_scales + k_group * TILE_N)));
        acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
      }
      const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(b_ptr + offset_d0)));
      vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);
    };

    for (int i = 0; i < KB; ++i) {
      Unroll<COLS>{}(compute, i);
    }

    // store to C
    auto storec = [&](int col) { _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); };
    Unroll<COLS>{}(storec);
  }
};

#define LAUNCH_TINYGEMM_KERNEL_VNNI(NB_SIZE)                                                                         \
  tinygemm_kernel_vnni<vec_dot_type, type, float, 1, NB_SIZE, blck_size>::apply(                                     \
      KB, (const char*)wdata + 0 * row_size_A,                                                                       \
      (const char*)src0->extra + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE), (float*)dst->data + 0 * N + nb_start, \
      ldc)

template <typename TA, typename TB, typename TC, int BLOCK_K,
          typename std::enable_if<!is_type_qkk<TB>::value, int>::type = 0>
void tinygemm_kernel_amx(int M, int N, int KB, const void* RESTRICT _A, const void* RESTRICT _B, TC* RESTRICT C,
                         int ldc) {
  using packed_B_t = packed_B_type<TB>;
  const int TILE_SIZE = get_tile_size<TB>();
  const bool need_unpack = do_unpack<TB>::value;

  GGML_ASSERT(M <= 2 * TILE_M && N == 2 * TILE_N);
  const TA* RESTRICT A = static_cast<const TA*>(_A);
  const char* RESTRICT B = static_cast<const char*>(_B);

  const int m0 = std::min(M, TILE_M);
  const int m1 = std::max(M - TILE_M, 0);
  const int lda = KB * sizeof(TA);
  // const int ldb = KB * sizeof(TB);

  static thread_local packed_B_t Tile0[TILE_N * TILE_K];
  static thread_local packed_B_t Tile1[TILE_N * TILE_K];
  static thread_local int8_t Tile23[TILE_M * TILE_K];

  static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
  static thread_local int32_t TileC1[TILE_M * TILE_N * 4];

  // double buffering C to interleave avx512 and amx
  int32_t* C_cur = TileC0;
  int32_t* C_pre = TileC1;

  auto Tile4 = [&](int32_t* base) { return base; };
  auto Tile5 = [&](int32_t* base) { return base + TILE_M * TILE_N; };
  auto Tile6 = [&](int32_t* base) { return base + 2 * TILE_M * TILE_N; };
  auto Tile7 = [&](int32_t* base) { return base + 3 * TILE_M * TILE_N; };

  if (M == 2 * TILE_M) {
    // i = 0
    const char* B_blk0 = B + PACKED_INDEX(0, 0, KB, TILE_SIZE);
    const char* B_blk1 = B + PACKED_INDEX(1, 0, KB, TILE_SIZE);
    if (need_unpack) {
      unpack_B<TB>(Tile0, B_blk0);
      _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
    } else {
      _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK);
    }

    _tile_zero(TMM4);
    _tile_loadd(TMM2, A[0].qs, lda);
    _tile_dpbssd(TMM4, TMM2, TMM0);
    _tile_stored(TMM4, Tile4(C_pre), TILE_N * sizeof(int32_t));

    _tile_zero(TMM5);
    _tile_loadd(TMM3, A[TILE_M * KB + 0].qs, lda);
    _tile_dpbssd(TMM5, TMM3, TMM0);
    _tile_stored(TMM5, Tile5(C_pre), TILE_N * sizeof(int32_t));

    if (need_unpack) {
      unpack_B<TB>(Tile1, B_blk0);
      _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
    } else {
      _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
    }

    _tile_zero(TMM6);
    _tile_dpbssd(TMM6, TMM2, TMM1);
    _tile_stored(TMM6, Tile6(C_pre), TILE_N * sizeof(int32_t));

    _tile_zero(TMM7);
    _tile_dpbssd(TMM7, TMM3, TMM1);
    _tile_stored(TMM7, Tile7(C_pre), TILE_N * sizeof(int32_t));

    for (int i = 1; i < KB; ++i) {
      // index of previous iter
      const int ii = i - 1;
      const char* B_blk0 = B + PACKED_INDEX(0, i, KB, TILE_SIZE);
      const char* B_blk1 = B + PACKED_INDEX(1, i, KB, TILE_SIZE);
      GGML_DISPATCH_BOOL(ii > 0, is_acc, [&] {
        if (need_unpack) {
          unpack_B<TB>(Tile0, B_blk0);
          _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
        } else {
          _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK);
        }
        _tile_zero(TMM4);
        _tile_loadd(TMM2, A[i].qs, lda);
        acc_C<TA, TB, is_acc>::apply(C, ldc, Tile4(C_pre), &A[ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);

        _tile_dpbssd(TMM4, TMM2, TMM0);
        _tile_stored(TMM4, Tile4(C_cur), TILE_N * sizeof(int32_t));

        _tile_zero(TMM5);
        _tile_loadd(TMM3, A[TILE_M * KB + i].qs, lda);
        acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc, ldc, Tile5(C_pre), &A[TILE_M * KB + ii], KB,
                                     B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);

        _tile_dpbssd(TMM5, TMM3, TMM0);
        _tile_stored(TMM5, Tile5(C_cur), TILE_N * sizeof(int32_t));

        if (need_unpack) {
          unpack_B<TB>(Tile1, B_blk1);
          _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
        } else {
          _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
        }
        _tile_zero(TMM6);
        acc_C<TA, TB, is_acc>::apply(C + TILE_N, ldc, Tile6(C_pre), &A[ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE),
                                     TILE_M);

        _tile_dpbssd(TMM6, TMM2, TMM1);
        _tile_stored(TMM6, Tile6(C_cur), TILE_N * sizeof(int32_t));

        _tile_zero(TMM7);
        acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_pre), &A[TILE_M * KB + ii], KB,
                                     B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M);

        _tile_dpbssd(TMM7, TMM3, TMM1);
        _tile_stored(TMM7, Tile7(C_cur), TILE_N * sizeof(int32_t));

        std::swap(C_cur, C_pre);
      });
    }
    // final accumulation
    {
      int ii = KB - 1;
      acc_C<TA, TB, true>::apply(C, ldc, Tile4(C_pre), &A[ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);
      acc_C<TA, TB, true>::apply(C + TILE_M * ldc, ldc, Tile5(C_pre), &A[TILE_M * KB + ii], KB,
                                 B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);
      acc_C<TA, TB, true>::apply(C + TILE_N, ldc, Tile6(C_pre), &A[ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE),
                                 TILE_M);
      acc_C<TA, TB, true>::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_pre), &A[TILE_M * KB + ii], KB,
                                 B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M);
    }
  } else {
    for (int i = 0; i < KB; ++i) {
      _tile_zero(TMM4);
      _tile_zero(TMM6);
      if (m1 != 0) {
        _tile_zero(TMM5);
        _tile_zero(TMM7);
      }

      const char* B_blk0 = B + PACKED_INDEX(0, i, KB, TILE_SIZE);
      const char* B_blk1 = B + PACKED_INDEX(1, i, KB, TILE_SIZE);
      if (need_unpack) {
        unpack_B<TB>(Tile0, B_blk0);
        _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
      } else {
        _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK);
      }

      if (need_unpack) {
        unpack_B<TB>(Tile1, B_blk1);
        _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
      } else {
        _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
      }

      if (m0 == TILE_M) {
        _tile_loadd(TMM2, A[i].qs, lda);
      } else {
        unpack_A(Tile23, &A[i], KB, m0);
        _tile_loadd(TMM2, Tile23, TILE_K);
      }

      _tile_dpbssd(TMM4, TMM2, TMM0);
      _tile_dpbssd(TMM6, TMM2, TMM1);

      _tile_stored(TMM4, Tile4(C_cur), TILE_N * sizeof(int32_t));
      _tile_stored(TMM6, Tile6(C_cur), TILE_N * sizeof(int32_t));

      GGML_DISPATCH_BOOL(i > 0, is_acc, [&] {
        acc_C<TA, TB, is_acc>::apply(C, ldc, Tile4(C_cur), &A[i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m0);
        acc_C<TA, TB, is_acc>::apply(C + TILE_N, ldc, Tile6(C_cur), &A[i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE),
                                     m0);
      });
      if (m1 != 0) {
        unpack_A(Tile23, &A[TILE_M * KB + i], KB, m1);
        _tile_loadd(TMM3, Tile23, TILE_K);

        _tile_dpbssd(TMM5, TMM3, TMM0);
        _tile_dpbssd(TMM7, TMM3, TMM1);
        _tile_stored(TMM5, Tile5(C_cur), TILE_N * sizeof(int32_t));
        _tile_stored(TMM7, Tile7(C_cur), TILE_N * sizeof(int32_t));
        GGML_DISPATCH_BOOL(i > 0, is_acc, [&] {
          acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc, ldc, Tile5(C_cur), &A[TILE_M * KB + i], KB,
                                       B + PACKED_INDEX(0, i, KB, TILE_SIZE), m1);
          acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_cur), &A[TILE_M * KB + i], KB,
                                       B + PACKED_INDEX(1, i, KB, TILE_SIZE), m1);
        });
      }
    }
  }
  return;
}

template <typename TA, typename TB, typename TC, int BLOCK_K,
          typename std::enable_if<is_type_qkk<TB>::value, int>::type = 0>
void tinygemm_kernel_amx(int M, int N, int KB, const void* RESTRICT _A, const void* RESTRICT _B, float* RESTRICT C,
                         int ldc) {
  static_assert(std::is_same<TA, block_q8_K>::value);
  const int TILE_SIZE = get_tile_size<TB>();

  GGML_ASSERT(M <= 2 * TILE_M && N == 2 * TILE_N);
  const TA* RESTRICT A = static_cast<const TA*>(_A);
  const char* RESTRICT B = static_cast<const char*>(_B);

  const int m0 = std::min(M, TILE_M);
  const int m1 = std::max(M - TILE_M, 0);
  // const int lda = KB * sizeof(TA);

  static thread_local int8_t Tile0[TILE_N * TILE_K];
  static thread_local int8_t Tile1[TILE_N * TILE_K];
  static thread_local int8_t Tile23[TILE_M * TILE_K];

  // mat mul result for each group
  static thread_local int32_t Tile4[TILE_M * TILE_N];
  static thread_local int32_t Tile5[TILE_M * TILE_N];
  static thread_local int32_t Tile6[TILE_M * TILE_N];
  static thread_local int32_t Tile7[TILE_M * TILE_N];

  // sum of each QK_K block, contains 8 groups, int32
  static thread_local int32_t Sumi4[TILE_M * TILE_N];
  static thread_local int32_t Sumi5[TILE_M * TILE_N];
  static thread_local int32_t Sumi6[TILE_M * TILE_N];
  static thread_local int32_t Sumi7[TILE_M * TILE_N];

  const int k_group_size = std::is_same<TB, block_q6_K>::value ? 16 : 32;
  for (int i = 0; i < KB; ++i) {
    // step 1: accumulate the quants across 8 groups, each group with 32
    for (int k = 0; k < QK_K / k_group_size; ++k) {
      GGML_DISPATCH_BOOL(k > 0, is_acc, [&] {
        _tile_zero(TMM4);
        _tile_zero(TMM6);

        unpack_B<TB>(Tile0, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k);
        _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);

        unpack_B<TB>(Tile1, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k);
        _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);

        unpack_A<TB>(Tile23, &A[i], KB, k, m0);
        _tile_loadd(TMM2, Tile23, TILE_K);

        _tile_dpbssd(TMM4, TMM2, TMM0);
        _tile_dpbssd(TMM6, TMM2, TMM1);

        _tile_stored(TMM4, Tile4, TILE_N * sizeof(int32_t));
        _tile_stored(TMM6, Tile6, TILE_N * sizeof(int32_t));

        scale_C<TB, is_acc>(Tile4, Sumi4, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k, m0);
        scale_C<TB, is_acc>(Tile6, Sumi6, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k, m0);

        if (m1 != 0) {
          _tile_zero(TMM5);
          _tile_zero(TMM7);

          unpack_A<TB>(Tile23, &A[TILE_M * KB + i], KB, k, m1);
          _tile_loadd(TMM3, Tile23, TILE_K);

          _tile_dpbssd(TMM5, TMM3, TMM0);
          _tile_dpbssd(TMM7, TMM3, TMM1);

          _tile_stored(TMM5, Tile5, TILE_N * sizeof(int32_t));
          _tile_stored(TMM7, Tile7, TILE_N * sizeof(int32_t));

          scale_C<TB, is_acc>(Tile5, Sumi5, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k, m1);
          scale_C<TB, is_acc>(Tile7, Sumi7, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k, m1);
        }
      });
    }

    // step 2: accmulate the mins
    GGML_DISPATCH_BOOL(i > 0, is_acc, [&] {
      acc_C<TA, TB, is_acc>::apply(C, ldc, Sumi4, &A[i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m0);
      acc_C<TA, TB, is_acc>::apply(C + TILE_N, ldc, Sumi6, &A[i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m0);
      if (m1 != 0) {
        acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc, ldc, Sumi5, &A[TILE_M * KB + i], KB,
                                     B + PACKED_INDEX(0, i, KB, TILE_SIZE), m1);
        acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc + TILE_N, ldc, Sumi7, &A[TILE_M * KB + i], KB,
                                     B + PACKED_INDEX(1, i, KB, TILE_SIZE), m1);
      }
    });
  }
  return;
}

}  // anonymous namespace

#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18

bool ggml_amx_init() {
#if defined(__gnu_linux__)
  if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
    fprintf(stderr, "AMX is not ready to be used!\n");
    return false;
  }
  return true;
#elif defined(_WIN32)
  return true;
#endif
}

bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor* dst) {
  static thread_local bool is_first_time = true;
  if (is_first_time) {
#pragma omp single
    { ggml_amx_init(); }

    // load tile config
    ggml_tile_config_init();
  }
  is_first_time = false;

  const struct ggml_tensor* src0 = dst->src[0];
  const struct ggml_tensor* src1 = dst->src[1];

  const enum ggml_type type = src0->type;
  const int64_t ne0 = dst->ne[0];

  bool is_training = src0->grad || src1->grad;

  // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
  // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
  bool has_amx_kernels = (type == GGML_TYPE_Q4_0) || (type == GGML_TYPE_Q4_1) || (type == GGML_TYPE_Q8_0) ||
#ifndef GGML_QKK_64
                         // only enabled for QK_K == 256
                         (type == GGML_TYPE_Q4_K) || (type == GGML_TYPE_Q5_K) || (type == GGML_TYPE_Q6_K) ||
                         (type == GGML_TYPE_IQ4_XS) ||
#endif
                         (type == GGML_TYPE_F16);

  // handle only 2d gemm for now
  auto is_contiguous_2d = [](const struct ggml_tensor* t) {
    return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
  };

  return dst->op != GGML_OP_MUL_MAT_ID && is_contiguous_2d(src0) && is_contiguous_2d(src1) && !is_training &&
         src1->type == GGML_TYPE_F32 && has_amx_kernels &&
         // out features is 32x
         ne0 % (TILE_N * 2) == 0;
}

// NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX)
//
// src0: weight in shape of {N, K}, quantized
// src1: input  in shape of {M, K}, float32
// dst:  output in shape of {M, N}, float32
//
// the function performs: dst = src1 @ src0.T
//
void ggml_mul_mat_amx(struct ggml_tensor* dst, int nth, int ith, void* wdata, int wsize) {
  struct ggml_tensor* src0 = dst->src[0];
  struct ggml_tensor* src1 = dst->src[1];

  const enum ggml_type TYPE = src0->type;

  // f16 only has avx512 kernels for now,
  // amx kernels will be added once 6th gen xeon is released.
  const bool is_floating_type = TYPE == GGML_TYPE_F16;

  const int M = dst->ne[1];
  const int N = dst->ne[0];
  const int K = src0->ne[0];
  const int ldc = dst->nb[1] / dst->nb[0];

  if (is_floating_type) {
    constexpr int BLOCK_M = 4;
    constexpr int BLOCK_N = 6;
    const int MB = div_up(M, BLOCK_M);
    const int NB = div_up(N, BLOCK_N);

    parallel_for(nth, ith, MB * NB, [&](int begin, int end) {
      GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] {
        for (int i = begin; i < end; ++i) {
          int mb = i / NB;
          int nb = i % NB;

          int mb_start = mb * BLOCK_M;
          int mb_size = std::min(BLOCK_M, M - mb_start);
          int nb_start = nb * BLOCK_N;
          int nb_size = std::min(BLOCK_N, N - nb_start);

          switch (mb_size << 4 | nb_size) {
            case 0x12:
              LAUNCH_TINYGEMM_KERNEL_AVX(1, 2);
              break;
            case 0x14:
              LAUNCH_TINYGEMM_KERNEL_AVX(1, 4);
              break;
            case 0x16:
              LAUNCH_TINYGEMM_KERNEL_AVX(1, 6);
              break;
            case 0x22:
              LAUNCH_TINYGEMM_KERNEL_AVX(2, 2);
              break;
            case 0x24:
              LAUNCH_TINYGEMM_KERNEL_AVX(2, 4);
              break;
            case 0x26:
              LAUNCH_TINYGEMM_KERNEL_AVX(2, 6);
              break;
            case 0x32:
              LAUNCH_TINYGEMM_KERNEL_AVX(3, 2);
              break;
            case 0x34:
              LAUNCH_TINYGEMM_KERNEL_AVX(3, 4);
              break;
            case 0x36:
              LAUNCH_TINYGEMM_KERNEL_AVX(3, 6);
              break;
            case 0x42:
              LAUNCH_TINYGEMM_KERNEL_AVX(4, 2);
              break;
            case 0x44:
              LAUNCH_TINYGEMM_KERNEL_AVX(4, 4);
              break;
            case 0x46:
              LAUNCH_TINYGEMM_KERNEL_AVX(4, 6);
              break;
            default:
              fprintf(stderr, "Unexpected block size!\n");
          }
        }
      });
    });
    return;
  }

#pragma omp single
  {
    GGML_DISPATCH_QTYPES(TYPE, [&] {
      const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
      GGML_ASSERT(wsize >= int(M * row_size_A));

      // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
      // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
      GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
      // pack mat B to vnni format
      if (src0->extra == nullptr) {
        const size_t row_size_B = get_row_size<type, blck_size>(K);
        src0->extra = aligned_alloc(64, N * row_size_B);
        convert_B_packed_format<type, blck_size>((void*)src0->extra, (const type*)src0->data, N, K);
      }

      const float* A_data = static_cast<const float*>(src1->data);
      for (int m = 0; m < M; ++m) {
        from_float<vec_dot_type>(A_data + m * K, (char*)wdata + m * row_size_A, K);
      }
    });
  }

  GGML_ASSERT(src0->extra != nullptr);
  if (M == 1) {
    // MB = 1 and handle 8 tiles in each block
    constexpr int kTilesN = 4;
    constexpr int BLOCK_N = TILE_N * kTilesN;
    const int NB = div_up(N, BLOCK_N);

    parallel_for(nth, ith, NB, [&](int begin, int end) {
      GGML_DISPATCH_QTYPES(TYPE, [&] {
        const int KB = K / blck_size;
        const int TILE_SIZE = get_tile_size<type>();
        const int row_size_A = KB * sizeof(vec_dot_type);
        for (int i = begin; i < end; ++i) {
          int nb = i;
          int nb_start = nb * BLOCK_N;
          int nb_size = std::min(BLOCK_N, N - nb_start);  // 32, 64, 96

          switch (nb_size) {
            // case 160: LAUNCH_TINYGEMM_KERNEL_VNNI(160); break;
            case 128:
              LAUNCH_TINYGEMM_KERNEL_VNNI(128);
              break;
            case 96:
              LAUNCH_TINYGEMM_KERNEL_VNNI(96);
              break;
            case 64:
              LAUNCH_TINYGEMM_KERNEL_VNNI(64);
              break;
            case 32:
              LAUNCH_TINYGEMM_KERNEL_VNNI(32);
              break;
            default:
              fprintf(stderr, "Unexpected n block size!\n");
          }
        }
      });
    });
    return;
  }

  // handle 4 tiles at a tile
  constexpr int BLOCK_M = TILE_M * 2;
  constexpr int BLOCK_N = TILE_N * 2;
  const int MB = div_up(M, BLOCK_M);
  const int NB = div_up(N, BLOCK_N);

  parallel_for(nth, ith, MB * NB, [&](int begin, int end) {
    GGML_DISPATCH_QTYPES(TYPE, [&] {
      const int KB = K / blck_size;
      const int TILE_SIZE = get_tile_size<type>();
      const int row_size_A = KB * sizeof(vec_dot_type);

      for (int i = begin; i < end; ++i) {
        int mb = i / NB;
        int nb = i % NB;

        int mb_start = mb * BLOCK_M;
        int mb_size = std::min(BLOCK_M, M - mb_start);
        int nb_start = nb * BLOCK_N;
        int nb_size = BLOCK_N;

        tinygemm_kernel_amx<vec_dot_type, type, float, blck_size>(
            mb_size, nb_size, KB, (const char*)wdata + mb_start * row_size_A,
            (const char*)src0->extra + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE),
            (float*)dst->data + mb_start * N + nb_start, ldc);
      }
    });
  });
}

#else  // if defined(__AMX_INT8__)

bool ggml_amx_init() {
  fprintf(stderr, "GGML is not compiled with AMX support!\n");
  return false;
}

bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor* dst) {
  GGML_UNUSED(dst);
  fprintf(stderr, "GGML is not compiled with AMX support!\n");
  return false;
}

void ggml_mul_mat_amx(struct ggml_tensor* dst, int nth, int ith, void* wdata, int wsize) {
  GGML_UNUSED(dst);
  GGML_UNUSED(nth);
  GGML_UNUSED(ith);
  GGML_UNUSED(wdata);
  GGML_UNUSED(wsize);
  fprintf(stderr, "GGML is not compiled with AMX support!\n");
}

#endif  // if defined(__AMX_INT8__)
int main() {
  // to be written
}

================================================
FILE: kt-kernel/operators/amx/test/mmq.h
================================================
#ifndef MMQ_H
#define MMQ_H
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

bool ggml_amx_init(void);

bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor* dst);

void ggml_mul_mat_amx(struct ggml_tensor* dst, int nth, int ith, void* wdata, int wsize);

/**
 * @param m
 * @param n
 * @param k
 * @param a
 * @param a_type
 * @param b
 * @param b_type
 * @param c
 * @param c_type
 * @param ldc c stride in elements
 * @param ith
 * @param nth
 * @param wdata auxillary data area
 * @param wsize size of auxillary data size
 */

void mat_mul_amx(int m, int n, int k, const void* a, int a_type, const void* b, int b_type, void* c, int c_type,
                 int ldc, int ith, int nth, void* wdata, int wsize);

#ifdef __cplusplus
}
#endif

#endif  // MMQ_H


================================================
FILE: kt-kernel/operators/amx/test/test-kgroup-128.cpp
================================================
#include <cmath>
#include <iostream>
#include <memory>
#include <vector>

#include "../la/amx.hpp"

void test_kgroup_128() {
  std::cout << "=== Testing K-Group with k_group_size = 128 ===\n" << std::endl;

  const int m = 32;  // Simple case
  const int n = 32;
  const int k = 512;  // Multiple of 128
  const int k_group_size = 128;

  std::cout << "Matrix dimensions: " << m << " x " << n << " x " << k << std::endl;
  std::cout << "K-group size: " << k_group_size << std::endl;
  std::cout << "Number of k-groups: " << k / k_group_size << std::endl;

  using Kernel = amx::GemmKernel224Int4KGroup;
  using BufferA = Kernel::BufferA;
  using BufferB = Kernel::BufferB;
  using BufferC = Kernel::BufferC;

  void* buffer_a = std::aligned_alloc(64, BufferA::required_size(m, k, k_group_size));
  void* buffer_b = std::aligned_alloc(64, BufferB::required_size(n, k, k_group_size));
  void* buffer_c = std::aligned_alloc(64, BufferC::required_size(m, n));

  auto ba = std::make_shared<BufferA>(m, k, k_group_size, buffer_a);
  auto bb = std::make_shared<BufferB>(n, k, k_group_size, buffer_b);
  auto bc = std::make_shared<BufferC>(m, n, buffer_c);

  Kernel::config();

  // Test 1: All ones
  std::cout << "\n--- Test 1: All ones (expected = " << k << ") ---" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    for (int i = 0; i < m * k; i++) {
      input_a[i] = ggml_compute_fp32_to_bf16(1.0f);
    }
    for (int i = 0; i < k * n; i++) {
      input_b[i] = ggml_compute_fp32_to_bf16(1.0f);
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    float actual = ggml_compute_bf16_to_fp32(output[0]);
    float error = std::abs(actual - k) / k * 100;
    std::cout << "Result[0,0]: " << actual << " (error: " << error << "%)" << std::endl;
  }

  // Test 2: Values in quantization sweet spot (0.5)
  std::cout << "\n--- Test 2: All 0.5 (expected = " << 0.5f * 0.5f * k << ") ---" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    for (int i = 0; i < m * k; i++) {
      input_a[i] = ggml_compute_fp32_to_bf16(0.5f);
    }
    for (int i = 0; i < k * n; i++) {
      input_b[i] = ggml_compute_fp32_to_bf16(0.5f);
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    float expected = 0.5f * 0.5f * k;
    float actual = ggml_compute_bf16_to_fp32(output[0]);
    float error = std::abs(actual - expected) / expected * 100;
    std::cout << "Result[0,0]: " << actual << " (expected: " << expected << ", error: " << error << "%)" << std::endl;
  }

  // Test 3: Different values per k-group
  std::cout << "\n--- Test 3: Different values per k-group ---" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    // Each k-group has different value
    for (int i = 0; i < m; i++) {
      for (int j = 0; j < k; j++) {
        int kg = j / k_group_size;
        float val = (kg + 1) * 0.25f;  // 0.25, 0.5, 0.75, 1.0
        input_a[i * k + j] = ggml_compute_fp32_to_bf16(val);
      }
    }

    for (int i = 0; i < k * n; i++) {
      input_b[i] = ggml_compute_fp32_to_bf16(0.5f);
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    // Expected: sum of (kg+1)*0.25 * 0.5 * k_group_size for all k-groups
    float expected = 0.0f;
    for (int kg = 0; kg < k / k_group_size; kg++) {
      expected += (kg + 1) * 0.25f * 0.5f * k_group_size;
    }

    float actual = ggml_compute_bf16_to_fp32(output[0]);
    float error = std::abs(actual - expected) / expected * 100;
    std::cout << "Expected: " << expected << ", Actual: " << actual << std::endl;
    std::cout << "Error: " << error << "%" << std::endl;
  }

  // Test 4: Pattern test
  std::cout << "\n--- Test 4: Pattern with alternating values ---" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    // Alternating pattern in A
    for (int i = 0; i < m * k; i++) {
      float val = (i % 2 == 0) ? 0.25f : 0.75f;
      input_a[i] = ggml_compute_fp32_to_bf16(val);
    }

    // Constant in B
    for (int i = 0; i < k * n; i++) {
      input_b[i] = ggml_compute_fp32_to_bf16(0.4f);
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    // Expected: average of 0.25 and 0.75 is 0.5, so 0.5 * 0.4 * k
    float expected = 0.5f * 0.4f * k;
    float actual = ggml_compute_bf16_to_fp32(output[0]);
    float error = std::abs(actual - expected) / expected * 100;
    std::cout << "Expected: " << expected << ", Actual: " << actual << std::endl;
    std::cout << "Error: " << error << "%" << std::endl;
  }

  // Test 5: Check all output elements
  std::cout << "\n--- Test 5: Verify all output elements (0.1 × 0.1) ---" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    for (int i = 0; i < m * k; i++) {
      input_a[i] = ggml_compute_fp32_to_bf16(0.1f);
    }
    for (int i = 0; i < k * n; i++) {
      input_b[i] = ggml_compute_fp32_to_bf16(0.1f);
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    float expected = 0.1f * 0.1f * k;
    float max_error = 0.0f;
    float avg_error = 0.0f;
    int error_count = 0;

    for (int i = 0; i < m * n; i++) {
      float actual = ggml_compute_bf16_to_fp32(output[i]);
      float error = std::abs(actual - expected) / expected * 100;
      max_error = std::max(max_error, error);
      avg_error += error;
      if (error > 5.0f) error_count++;
    }
    avg_error /= (m * n);

    std::cout << "Expected value: " << expected << std::endl;
    std::cout << "Max error: " << max_error << "%" << std::endl;
    std::cout << "Average error: " << avg_error << "%" << std::endl;
    std::cout << "Elements with >5% error: " << error_count << "/" << m * n << std::endl;
  }

  // Test 6: Random normal distribution (like real model weights)
  std::cout << "\n--- Test 6: Random normal distribution ---" << std::endl;
  {
    std::vector<ggml_bf16_t> input_a(m * k);
    std::vector<ggml_bf16_t> input_b(k * n);

    std::mt19937 gen(42);
    std::normal_distribution<float> dist(0.0f, 0.1f);

    for (int i = 0; i < m * k; i++) {
      input_a[i] = ggml_compute_fp32_to_bf16(dist(gen));
    }
    for (int i = 0; i < k * n; i++) {
      input_b[i] = ggml_compute_fp32_to_bf16(dist(gen));
    }

    // Compute reference with float32
    std::vector<float> ref_result(m * n, 0.0f);
    for (int i = 0; i < m; i++) {
      for (int j = 0; j < n; j++) {
        float sum = 0.0f;
        for (int l = 0; l < k; l++) {
          float a_val = ggml_compute_bf16_to_fp32(input_a[i * k + l]);
          float b_val = ggml_compute_bf16_to_fp32(input_b[l * n + j]);
          sum += a_val * b_val;
        }
        ref_result[i * n + j] = sum;
      }
    }

    ba->from_mat(m, input_a.data(), 0, 1);
    bb->from_mat(input_b.data(), 0, 1);
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

    std::vector<ggml_bf16_t> output(m * n);
    bc->to_mat(m, output.data(), 0, 1);

    // Compute errors
    float max_abs_error = 0.0f;
    float max_rel_error = 0.0f;
    float avg_rel_error = 0.0f;
    int large_error_count = 0;

    for (int i = 0; i < m * n; i++) {
      float actual = ggml_compute_bf16_to_fp32(output[i]);
      float ref = ref_result[i];
      float abs_error = std::abs(actual - ref);
      float rel_error = std::abs(ref) > 1e-6 ? abs_error / std::abs(ref) : 0.0f;

      max_abs_error = std::max(max_abs_error, abs_error);
      max_rel_error = std::max(max_rel_error, rel_error);
      avg_rel_error += rel_error;

      if (rel_error > 0.2f) {  // 20% error
        large_error_count++;
        if (large_error_count <= 5) {
          std::cout << "  [" << i / n << "," << i % n << "]: actual=" << actual << ", ref=" << ref
                    << ", rel_error=" << (rel_error * 100) << "%" << std::endl;
        }
      }
    }
    avg_rel_error /= (m * n);

    std::cout << "Max absolute error: " << max_abs_error << std::endl;
    std::cout << "Max relative error: " << (max_rel_error * 100) << "%" << std::endl;
    std::cout << "Average relative error: " << (avg_rel_error * 100) << "%" << std::endl;
    std::cout << "Elements with >20% error: " << large_error_count << "/" << m * n << std::endl;
  }

  free(buffer_a);
  free(buffer_b);
  free(buffer_c);
}

int main() {
  test_kgroup_128();
  return 0;
}

================================================
FILE: kt-kernel/operators/amx/test/test-kgroup-kernel.cpp
================================================
#include <omp.h>

#include "../la/amx.hpp"
#define FMT_HEADER_ONLY
#include <fmt/core.h>

#include <chrono>
#include <cmath>
#include <iostream>
#include <memory>
#include <random>

void test_kgroup_kernel_basic() {
  std::cout << "=== Testing GemmKernel224Int4KGroup Basic Functionality ===" << std::endl;

  // Test parameters - must match kernel requirements
  const int m = 64;              // Must be multiple of M_STEP (32)
  const int n = 64;              // Must be multiple of N_STEP (32)
  const int k = 1024;            // Must be multiple of K_STEP (64)
  const int k_group_size = 256;  // Must divide k evenly

  std::cout << fmt::format("Parameters: m={}, n={}, k={}, k_group_size={}\n", m, n, k, k_group_size);

  using Kernel = amx::GemmKernel224Int4KGroup;
  using BufferA = Kernel::BufferA;
  using BufferB = Kernel::BufferB;
  using BufferC = Kernel::BufferC;

  // Allocate buffers
  size_t size_a = BufferA::required_size(m, k, k_group_size);
  size_t size_b = BufferB::required_size(n, k, k_group_size);  // Fixed: n, k not k, n
  size_t size_c = BufferC::required_size(m, n);

  void* buffer_a = std::aligned_alloc(64, size_a);
  void* buffer_b = std::aligned_alloc(64, size_b);
  void* buffer_c = std::aligned_alloc(64, size_c);

  std::cout << fmt::format("Buffer sizes: A={} KB, B={} KB, C={} KB\n", size_a / 1024, size_b / 1024, size_c / 1024);

  auto ba = std::make_shared<BufferA>(m, k, k_group_size, buffer_a);
  auto bb = std::make_shared<BufferB>(n, k, k_group_size, buffer_b);  // Fixed: n, k not k, n
  auto bc = std::make_shared<BufferC>(m, n, buffer_c);

  // Create test input data
  std::vector<ggml_bf16_t> input_a(m * k);
  std::vector<ggml_bf16_t> input_b(k * n);

  std::mt19937 gen(42);
  std::uniform_real_distribution<float> dist(-0.5f, 0.5f);

  // Fill with small values to avoid overflow
  for (int i = 0; i < m * k; i++) {
    input_a[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }
  for (int i = 0; i < k * n; i++) {
    input_b[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }

  // Quantize inputs
  std::cout << "Quantizing inputs..." << std::endl;
  ba->from_mat(m, input_a.data(), 0, 1);
  bb->from_mat(input_b.data(), 0, 1);

  // Configure AMX
  Kernel::config();

  // Run matrix multiplication with k-group quantization
  std::cout << "Running k-group matrix multiplication..." << std::endl;
  auto start = std::chrono::high_resolution_clock::now();

  amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

  auto end = std::chrono::high_resolution_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
  std::cout << fmt::format("Time: {} ms\n", duration / 1000.0);

  // Convert output to bf16
  std::vector<ggml_bf16_t> output(m * n);
  bc->to_mat(m, output.data(), 0, 1);

  // Print sample output values
  std::cout << "\nSample output values:" << std::endl;
  for (int i = 0; i < std::min(5, m); i++) {
    for (int j = 0; j < std::min(5, n); j++) {
      float val = ggml_compute_bf16_to_fp32(output[i * n + j]);
      std::cout << fmt::format("{:8.4f} ", val);
    }
    std::cout << std::endl;
  }

  // Clean up
  free(buffer_a);
  free(buffer_b);
  free(buffer_c);

  std::cout << "\n✓ Basic test completed!" << std::endl;
}

void test_kgroup_kernel_correctness() {
  std::cout << "\n=== Testing GemmKernel224Int4KGroup Correctness ===" << std::endl;

  const int m = 32;
  const int n = 32;
  const int k = 512;
  const int k_group_size = 128;

  using Kernel = amx::GemmKernel224Int4KGroup;
  using BufferA = Kernel::BufferA;
  using BufferB = Kernel::BufferB;
  using BufferC = Kernel::BufferC;

  // Allocate buffers
  void* buffer_a = std::aligned_alloc(64, BufferA::required_size(m, k, k_group_size));
  void* buffer_b = std::aligned_alloc(64, BufferB::required_size(n, k, k_group_size));  // Fixed: n, k not k, n
  void* buffer_c = std::aligned_alloc(64, BufferC::required_size(m, n));

  auto ba = std::make_shared<BufferA>(m, k, k_group_size, buffer_a);
  auto bb = std::make_shared<BufferB>(n, k, k_group_size, buffer_b);  // Fixed: n, k not k, n
  auto bc = std::make_shared<BufferC>(m, n, buffer_c);

  // Create simple test pattern
  std::vector<ggml_bf16_t> input_a(m * k);
  std::vector<ggml_bf16_t> input_b(k * n);
  std::vector<float> expected(m * n, 0.0f);

  // Fill A with row indices and B with column indices (scaled down)
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < k; j++) {
      input_a[i * k + j] = ggml_compute_fp32_to_bf16((i + 1) * 0.001f);
    }
  }

  for (int i = 0; i < k; i++) {
    for (int j = 0; j < n; j++) {
      input_b[i * n + j] = ggml_compute_fp32_to_bf16((j + 1) * 0.001f);
    }
  }

  // Compute expected result (naive)
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      float sum = 0.0f;
      for (int l = 0; l < k; l++) {
        float a_val = ggml_compute_bf16_to_fp32(input_a[i * k + l]);
        float b_val = ggml_compute_bf16_to_fp32(input_b[l * n + j]);
        sum += a_val * b_val;
      }
      expected[i * n + j] = sum;
    }
  }

  // Quantize and run
  ba->from_mat(m, input_a.data(), 0, 1);
  bb->from_mat(input_b.data(), 0, 1);

  Kernel::config();
  amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

  // Get output
  std::vector<ggml_bf16_t> output(m * n);
  bc->to_mat(m, output.data(), 0, 1);

  // Compare results
  float max_error = 0.0f;
  float total_error = 0.0f;
  int count = 0;

  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      float actual = ggml_compute_bf16_to_fp32(output[i * n + j]);
      float exp = expected[i * n + j];
      float error = std::abs(actual - exp);
      max_error = std::max(max_error, error);
      total_error += error;
      count++;
    }
  }

  float avg_error = total_error / count;
  float relative_error = max_error / (*std::max_element(expected.begin(), expected.end()) + 1e-8f);

  std::cout << fmt::format("Error Analysis:\n");
  std::cout << fmt::format("  Max absolute error: {:.6f}\n", max_error);
  std::cout << fmt::format("  Average absolute error: {:.6f}\n", avg_error);
  std::cout << fmt::format("  Relative error: {:.2f}%\n", relative_error * 100);

  // Check acceptability (INT4 quantization + k-group should have reasonable error)
  if (relative_error < 0.10f) {  // 10% relative error threshold for INT4
    std::cout << "✓ Error is within acceptable range for INT4 quantization" << std::endl;
  } else {
    std::cout << "✗ Error is higher than expected!" << std::endl;
  }

  // Print first few values for comparison
  std::cout << "\nFirst 5x5 values comparison:" << std::endl;
  std::cout << "Expected vs Actual:" << std::endl;
  for (int i = 0; i < std::min(5, m); i++) {
    for (int j = 0; j < std::min(5, n); j++) {
      float actual = ggml_compute_bf16_to_fp32(output[i * n + j]);
      float exp = expected[i * n + j];
      std::cout << fmt::format("({:.4f},{:.4f}) ", exp, actual);
    }
    std::cout << std::endl;
  }

  free(buffer_a);
  free(buffer_b);
  free(buffer_c);

  std::cout << "\n✓ Correctness test completed!" << std::endl;
}

void test_kgroup_kernel_performance() {
  std::cout << "\n=== Testing GemmKernel224Int4KGroup Performance ===" << std::endl;

  const int m = 256;
  const int n = 256;
  const int k = 2048;
  const int k_group_size = 512;
  const int iterations = 100;

  using Kernel = amx::GemmKernel224Int4KGroup;
  using BufferA = Kernel::BufferA;
  using BufferB = Kernel::BufferB;
  using BufferC = Kernel::BufferC;

  // Allocate buffers
  void* buffer_a = std::aligned_alloc(64, BufferA::required_size(m, k, k_group_size));
  void* buffer_b = std::aligned_alloc(64, BufferB::required_size(n, k, k_group_size));  // Fixed: n, k not k, n
  void* buffer_c = std::aligned_alloc(64, BufferC::required_size(m, n));

  auto ba = std::make_shared<BufferA>(m, k, k_group_size, buffer_a);
  auto bb = std::make_shared<BufferB>(n, k, k_group_size, buffer_b);  // Fixed: n, k not k, n
  auto bc = std::make_shared<BufferC>(m, n, buffer_c);

  // Create random input
  std::vector<ggml_bf16_t> input_a(m * k);
  std::vector<ggml_bf16_t> input_b(k * n);

  std::mt19937 gen(42);
  std::uniform_real_distribution<float> dist(-0.1f, 0.1f);

  for (int i = 0; i < m * k; i++) {
    input_a[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }
  for (int i = 0; i < k * n; i++) {
    input_b[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }

  // Quantize
  ba->from_mat(m, input_a.data(), 0, 1);
  bb->from_mat(input_b.data(), 0, 1);

  Kernel::config();

  // Warm up
  for (int i = 0; i < 10; i++) {
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);
  }

  // Benchmark
  auto start = std::chrono::high_resolution_clock::now();

  for (int i = 0; i < iterations; i++) {
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);
  }

  auto end = std::chrono::high_resolution_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();

  double avg_time_ms = duration / (1000.0 * iterations);
  double ops = 2.0 * m * n * k;
  double gflops = (ops * iterations) / (duration * 1000.0);

  std::cout << fmt::format("Matrix size: {}x{}x{}\n", m, n, k);
  std::cout << fmt::format("K-group size: {}\n", k_group_size);
  std::cout << fmt::format("Average time per multiplication: {:.3f} ms\n", avg_time_ms);
  std::cout << fmt::format("Performance: {:.2f} GFLOPS\n", gflops);

  free(buffer_a);
  free(buffer_b);
  free(buffer_c);

  std::cout << "\n✓ Performance test completed!" << std::endl;
}

int main(int argc, char** argv) {
  std::cout << "Starting GemmKernel224Int4KGroup Tests\n" << std::endl;

  try {
    test_kgroup_kernel_basic();
    test_kgroup_kernel_correctness();
    test_kgroup_kernel_performance();

    std::cout << "\n=== All tests completed successfully! ===" << std::endl;
  } catch (const std::exception& e) {
    std::cerr << "Test failed with exception: " << e.what() << std::endl;
    return 1;
  }

  return 0;
}

================================================
FILE: kt-kernel/operators/amx/test/test-specific-dims.cpp
================================================
#include <chrono>
#include <cmath>
#include <iostream>
#include <memory>
#include <random>
#include <vector>

#include "../la/amx.hpp"
#include "../la/amx_buffers.hpp"
#include "../la/amx_kernels.hpp"

void test_specific_dimensions() {
  std::cout << "=== Testing Specific Dimensions ===\n" << std::endl;

  const int m_original = 200;
  const int n = 512;
  const int k = 7168;
  const int k_group_size = 64;

  // Pad m to nearest multiple of 32 (M_STEP)
  const int M_STEP = 32;
  const int m = ((m_original + M_STEP - 1) / M_STEP) * M_STEP;  // Round up to 224

  std::cout << "Original dimensions: " << m_original << " x " << n << " x " << k << std::endl;
  std::cout << "Padded dimensions: " << m << " x " << n << " x " << k << std::endl;
  std::cout << "K-group size is: " << k_group_size << std::endl;
  std::cout << "Number of k-groups: " << k / k_group_size << std::endl;

  using Kernel = amx::GemmKernel224Int4KGroup;
  using Kernel_int4_1 = amx::GemmKernel224Int4_1;
  using Kernel_int4 = amx::GemmKernel224Int4;
  using Kernel_k_int4_1 = amx::GemmKernel224Int4_1KGroup;
  using Kernel_k_int4_1_low = amx::GemmKernel224Int4_1_LowKGroup;
  using BufferA = Kernel::BufferA;
  using BufferB = Kernel::BufferB;
  using BufferC = Kernel::BufferC;
  using BufferA_int4_1 = Kernel_int4_1::BufferA;
  using BufferB_int4_1 = Kernel_int4_1::BufferB;
  using BufferC_int4_1 = Kernel_int4_1::BufferC;
  using BufferA_int4 = Kernel_int4::BufferA;
  using BufferB_int4 = Kernel_int4::BufferB;
  using BufferC_int4 = Kernel_int4::BufferC;
  using BufferA_k_int4_1 = Kernel_k_int4_1::BufferA;
  using BufferB_k_int4_1 = Kernel_k_int4_1::BufferB;
  using BufferC_k_int4_1 = Kernel_k_int4_1::BufferC;
  using BufferA_k_int4_1_low = Kernel_k_int4_1_low::BufferA;
  using BufferB_k_int4_1_low = Kernel_k_int4_1_low::BufferB;
  using BufferC_k_int4_1_low = Kernel_k_int4_1_low::BufferC;

  void* buffer_a = std::aligned_alloc(64, BufferA::required_size(m, k, k_group_size));
  void* buffer_b = std::aligned_alloc(64, BufferB::required_size(n, k, k_group_size));
  void* buffer_c = std::aligned_alloc(64, BufferC::required_size(m, n));

  void* buffer_a_int4_1 = std::aligned_alloc(64, BufferA_int4_1::required_size(m, k));
  void* buffer_b_int4_1 = std::aligned_alloc(64, BufferB_int4_1::required_size(n, k));
  void* buffer_c_int4_1 = std::aligned_alloc(64, BufferC_int4_1::required_size(m, n));

  void* buffer_a_int4 = std::aligned_alloc(64, BufferA_int4::required_size(m, k));
  void* buffer_b_int4 = std::aligned_alloc(64, BufferB_int4::required_size(n, k));
  void* buffer_c_int4 = std::aligned_alloc(64, BufferC_int4::required_size(m, n));

  void* buffer_a_k_int4_1 = std::aligned_alloc(64, BufferA_k_int4_1::required_size(m, k, k_group_size));
  void* buffer_b_k_int4_1 = std::aligned_alloc(64, BufferB_k_int4_1::required_size(n, k, k_group_size));
  void* buffer_c_k_int4_1 = std::aligned_alloc(64, BufferC_k_int4_1::required_size(m, n));

  void* buffer_a_k_int4_1_low = std::aligned_alloc(64, BufferA_k_int4_1_low::required_size(m, k, k_group_size));
  void* buffer_b_k_int4_1_low = std::aligned_alloc(64, BufferB_k_int4_1_low::required_size(n, k, k_group_size));
  void* buffer_c_k_int4_1_low = std::aligned_alloc(64, BufferC_k_int4_1_low::required_size(m, n));

  auto ba = std::make_shared<BufferA>(m, k, k_group_size, buffer_a);
  printf("buffer_b ptr:%p\n", buffer_b);
  auto bb = std::make_shared<BufferB>(n, k, k_group_size, buffer_b);
  auto bc = std::make_shared<BufferC>(m, n, buffer_c);

  auto ba_int4_1 = std::make_shared<BufferA_int4_1>(m, k, buffer_a_int4_1);
  auto bb_int4_1 = std::make_shared<BufferB_int4_1>(n, k, buffer_b_int4_1);
  auto bc_int4_1 = std::make_shared<BufferC_int4_1>(m, n, buffer_c_int4_1);

  auto ba_int4 = std::make_shared<BufferA_int4>(m, k, buffer_a_int4);
  auto bb_int4 = std::make_shared<BufferB_int4>(n, k, buffer_b_int4);
  auto bc_int4 = std::make_shared<BufferC_int4>(m, n, buffer_c_int4);

  auto ba_k_int4_1 = std::make_shared<BufferA_k_int4_1>(m, k, k_group_size, buffer_a_k_int4_1);
  auto bb_k_int4_1 = std::make_shared<BufferB_k_int4_1>(n, k, k_group_size, buffer_b_k_int4_1);
  auto bc_k_int4_1 = std::make_shared<BufferC_k_int4_1>(m, n, buffer_c_k_int4_1);

  auto ba_k_int4_1_low = std::make_shared<BufferA_k_int4_1_low>(m, k, k_group_size, buffer_a_k_int4_1_low);
  auto bb_k_int4_1_low = std::make_shared<BufferB_k_int4_1_low>(n, k, k_group_size, buffer_b_k_int4_1_low);
  auto bc_k_int4_1_low = std::make_shared<BufferC_k_int4_1_low>(m, n, buffer_c_k_int4_1_low);

  // Create input matrices with realistic values
  std::vector<ggml_bf16_t> input_a(m * k);
  std::vector<ggml_bf16_t> input_b(k * n);

  std::mt19937 gen(42);
  std::normal_distribution<float> dist(0.0f, 0.1f);  // Normal distribution, mean=0, std=0.1

  std::cout << "\nGenerating input matrices..." << std::endl;
  // print input mat(first 10)
  // for (int i = 0; i < std::min(10, m * k); i++) {
  //   std::cout << "input_a[" << i << "] = " << ggml_compute_bf16_to_fp32(input_a[i]) << std::endl;
  // }
  // for (int i = 0; i < std::min(10, k * n); i++) {
  //   std::cout << "input_b[" << i << "] = " << ggml_compute_bf16_to_fp32(input_b[i]) << std::endl;
  // }
  for (int i = 0; i < m * k; i++) {
    input_a[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }
  for (int i = 0; i < k * n; i++) {
    input_b[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }

  // Compute reference result with float32 (sampling for speed, only use original m rows)
  std::cout << "Computing reference (sampling)..." << std::endl;
  const int sample_m = std::min(50, m_original);  // Use original m for reference
  const int sample_n = std::min(50, n);
  std::vector<float> ref_result(sample_m * sample_n, 0.0f);

  for (int i = 0; i < sample_m; i++) {
    for (int j = 0; j < sample_n; j++) {
      float sum = 0.0f;
      for (int l = 0; l < k; l++) {
        float a_val = ggml_compute_bf16_to_fp32(input_a[i * k + l]);
        float b_val = ggml_compute_bf16_to_fp32(input_b[j * k + l]);
        sum += a_val * b_val;
      }
      ref_result[i * sample_n + j] = sum;
    }
  }

  // Quantize and compute with k-group
  std::cout << "Quantizing matrices..." << std::endl;
  ba->from_mat(m, input_a.data(), 0, 1);
  int nth = Kernel::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    bb->from_mat(input_b.data(), i, nth);
  }

  ba_int4_1->from_mat(m, input_a.data(), 0, 1);
  nth = Kernel_int4_1::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    bb_int4_1->from_mat(input_b.data(), i, nth);
  }

  ba_int4->from_mat(m, input_a.data(), 0, 1);
  nth = Kernel_int4::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    bb_int4->from_mat(input_b.data(), i, nth);
  }

  ba_k_int4_1->from_mat(m, input_a.data(), 0, 1);
  nth = Kernel_k_int4_1::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    bb_k_int4_1->from_mat(input_b.data(), i, nth);
  }

  ba_k_int4_1_low->from_mat(m, input_a.data(), 0, 1);
  nth = Kernel_k_int4_1_low::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    bb_k_int4_1_low->from_mat(input_b.data(), i, nth);
  }

  // Print some scale statistics
  std::cout << "\nScale statistics:" << std::endl;
  float min_a_scale = 1e10f, max_a_scale = 0.0f;
  float min_b_scale = 1e10f, max_b_scale = 0.0f;
  float min_a_scale_int4_1 = 1e10f, max_a_scale_int4_1 = 0.0f;
  float min_b_scale_int4_1 = 1e10f, max_b_scale_int4_1 = 0.0f;
  float min_b_min_int4_1 = 1e10f, max_b_min_int4_1 = -1e10f;
  float min_a_scale_int4 = 1e10f, max_a_scale_int4 = 0.0f;
  float min_b_scale_int4 = 1e10f, max_b_scale_int4 = 0.0f;
  float min_a_scale_k_int4_1 = 1e10f, max_a_scale_k_int4_1 = 0.0f;
  float min_b_scale_k_int4_1 = 1e10f, max_b_scale_k_int4_1 = 0.0f;
  float min_b_min_k_int4_1 = 1e10f, max_b_min_k_int4_1 = -1e10f;
  float min_a_scale_k_int4_1_low = 1e10f, max_a_scale_k_int4_1_low = 0.0f;
  float min_b_scale_k_int4_1_low = 1e10f, max_b_scale_k_int4_1_low = 0.0f;
  float min_b_min_k_int4_1_low = 1e10f, max_b_min_k_int4_1_low = -1e10f;

  for (int i = 0; i < std::min(10, m); i++) {
    for (int kg = 0; kg < k / k_group_size; kg++) {
      float scale = *ba->get_scale(m, i, k, kg * k_group_size);
      min_a_scale = std::min(min_a_scale, scale);
      max_a_scale = std::max(max_a_scale, scale);
    }
  }

  for (int j = 0; j < std::min(10, n); j++) {
    for (int kg = 0; kg < k / k_group_size; kg++) {
      float scale = *bb->get_scale(n, j, k, kg * k_group_size);
      min_b_scale = std::min(min_b_scale, scale);
      max_b_scale = std::max(max_b_scale, scale);
    }
  }
  for (int i = 0; i < std::min(10, m); i++) {
    float scale = *ba_int4_1->get_scale(m, i);
    min_a_scale_int4_1 = std::min(min_a_scale_int4_1, scale);
    max_a_scale_int4_1 = std::max(max_a_scale_int4_1, scale);
  }
  for (int j = 0; j < std::min(10, n); j++) {
    float scale = *bb_int4_1->get_scale(n, j);
    min_b_scale_int4_1 = std::min(min_b_scale_int4_1, scale);
    max_b_scale_int4_1 = std::max(max_b_scale_int4_1, scale);
    float b_min = *bb_int4_1->get_min(n, j);
    min_b_min_int4_1 = std::min(min_b_min_int4_1, b_min);
    max_b_min_int4_1 = std::max(max_b_min_int4_1, b_min);
  }

  for (int i = 0; i < std::min(10, m); i++) {
    float scale = *ba_int4->get_scale(m, i);
    min_a_scale_int4 = std::min(min_a_scale_int4, scale);
    max_a_scale_int4 = std::max(max_a_scale_int4, scale);
  }

  for (int j = 0; j < std::min(10, n); j++) {
    float scale = *bb_int4->get_scale(n, j);
    min_b_scale_int4 = std::min(min_b_scale_int4, scale);
    max_b_scale_int4 = std::max(max_b_scale_int4, scale);
  }

  for (int i = 0; i < std::min(10, m); i++) {
    for (int kg = 0; kg < k / k_group_size; kg++) {
      float scale = *ba_k_int4_1->get_scale(m, i, k, kg * k_group_size);
      min_a_scale_k_int4_1 = std::min(min_a_scale_k_int4_1, scale);
      max_a_scale_k_int4_1 = std::max(max_a_scale_k_int4_1, scale);
    }
  }

  for (int j = 0; j < std::min(10, n); j++) {
    for (int kg = 0; kg < k / k_group_size; kg++) {
      float scale = *bb_k_int4_1->get_scale(n, j, k, kg * k_group_size);
      min_b_scale_k_int4_1 = std::min(min_b_scale_k_int4_1, scale);
      max_b_scale_k_int4_1 = std::max(max_b_scale_k_int4_1, scale);
      float b_min = *bb_k_int4_1->get_min(n, j, k, kg * k_group_size);
      min_b_min_k_int4_1 = std::min(min_b_min_k_int4_1, b_min);
      max_b_min_k_int4_1 = std::max(max_b_min_k_int4_1, b_min);
    }
  }

  for (int i = 0; i < std::min(10, m); i++) {
    for (int kg = 0; kg < k / k_group_size; kg++) {
      float scale = *ba_k_int4_1_low->get_scale(m, i, k, kg * k_group_size);
      min_a_scale_k_int4_1_low = std::min(min_a_scale_k_int4_1_low, scale);
      max_a_scale_k_int4_1_low = std::max(max_a_scale_k_int4_1_low, scale);
    }
  }

  for (int j = 0; j < std::min(10, n); j++) {
    for (int kg = 0; kg < k / k_group_size; kg++) {
      float scale = *bb_k_int4_1_low->get_scale(n, j, k, kg * k_group_size);
      min_b_scale_k_int4_1_low = std::min(min_b_scale_k_int4_1_low, scale);
      max_b_scale_k_int4_1_low = std::max(max_b_scale_k_int4_1_low, scale);
      float b_min = *bb_k_int4_1_low->get_min(n, j, k, kg * k_group_size);
      min_b_min_k_int4_1_low = std::min(min_b_min_k_int4_1_low, b_min);
      max_b_min_k_int4_1_low = std::max(max_b_min_k_int4_1_low, b_min);
    }
  }
  std::cout << "  B_int4_1 scales: min=" << min_b_scale_int4_1 << ", max=" << max_b_scale_int4_1 << std::endl;
  std::cout << "  B_int4_1 min: min=" << min_b_min_int4_1 << ", max=" << max_b_min_int4_1 << std::endl;

  std::cout << "  A_int4 scales: min=" << min_a_scale_int4 << ", max=" << max_a_scale_int4 << std::endl;
  std::cout << "  B_int4 scales: min=" << min_b_scale_int4 << ", max=" << max_b_scale_int4 << std::endl;

  std::cout << "  A_k_int4_1 scales: min=" << min_a_scale_k_int4_1 << ", max=" << max_a_scale_k_int4_1 << std::endl;
  std::cout << "  B_k_int4_1 scales: min=" << min_b_scale_k_int4_1 << ", max=" << max_b_scale_k_int4_1 << std::endl;
  std::cout << "  B_k_int4_1 min: min=" << min_b_min_k_int4_1 << ", max=" << max_b_min_k_int4_1 << std::endl;

  std::cout << "  A_k_int4_1_low scales: min=" << min_a_scale_k_int4_1_low << ", max=" << max_a_scale_k_int4_1_low
            << std::endl;
  std::cout << "  B_k_int4_1_low scales: min=" << min_b_scale_k_int4_1_low << ", max=" << max_b_scale_k_int4_1_low
            << std::endl;
  std::cout << "  B_k_int4_1_low min: min=" << min_b_min_k_int4_1_low << ", max=" << max_b_min_k_int4_1_low
            << std::endl;

  Kernel::config();

  std::cout << "\nRunning k-group matrix multiplication..." << std::endl;
  auto start = std::chrono::high_resolution_clock::now();

  nth = Kernel::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, i, nth);
  }

  nth = Kernel_int4_1::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    amx::mat_mul(m, n, k, ba_int4_1, bb_int4_1, bc_int4_1, i, nth);
  }

  nth = Kernel_int4::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    amx::mat_mul(m, n, k, ba_int4, bb_int4, bc_int4, i, nth);
  }

  nth = Kernel_k_int4_1::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    amx::vec_mul_kgroup(m, n, k, k_group_size, ba_k_int4_1, bb_k_int4_1, bc_k_int4_1, i, nth);
  }

  nth = Kernel_k_int4_1_low::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    amx::vec_mul_kgroup(m, n, k, k_group_size, ba_k_int4_1_low, bb_k_int4_1_low, bc_k_int4_1_low, i, nth);
  }
  auto end = std::chrono::high_resolution_clock::now();

  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
  std::cout << "Computation time: " << duration.count() / 1000.0 << " ms" << std::endl;

  // Calculate GFLOPS
  double ops = 2.0 * m * n * k;
  double gflops = ops / (duration.count() * 1000.0);
  std::cout << "Performance: " << gflops << " GFLOPS" << std::endl;

  std::vector<ggml_bf16_t> output(m * n);
  std::vector<ggml_bf16_t> output_int4_1(m * n);
  std::vector<ggml_bf16_t> output_int4(m * n);
  std::vector<ggml_bf16_t> output_k_int4_1(m * n);
  std::vector<ggml_bf16_t> output_k_int4_1_low(m * n);
  nth = Kernel::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    bc->to_mat(m, output.data(), i, nth);
  }
  nth = Kernel_int4_1::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    bc_int4_1->to_mat(m, output_int4_1.data(), i, nth);
  }
  nth = Kernel_int4::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    bc_int4->to_mat(m, output_int4.data(), i, nth);
  }
  nth = Kernel_k_int4_1::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    bc_k_int4_1->to_mat(m, output_k_int4_1.data(), i, nth);
  }
  nth = Kernel_k_int4_1_low::recommended_nth(n);
  for (int i = 0; i <= nth; i++) {
    bc_k_int4_1_low->to_mat(m, output_k_int4_1_low.data(), i, nth);
  }
  float thresh_hold = 2.0f;
  // Compute errors for sampled elements
  std::cout << "\nError analysis (sampled):" << std::endl;
  float max_abs_error = 0.0f;
  float total_abs_error = 0.0f;
  float max_rel_error = 0.0f;
  float total_rel_error = 0.0f;
  int count = 0;

  for (int i = 0; i < sample_m; i++) {
    for (int j = 0; j < sample_n; j++) {
      float actual = ggml_compute_bf16_to_fp32(output[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float abs_error = std::abs(actual - ref);
      float rel_error = std::abs(ref) > 1e-6 ? abs_error / std::abs(ref) : 0.0f;
      if (rel_error >= thresh_hold) {
        rel_error = thresh_hold;
      }
      max_abs_error = std::max(max_abs_error, abs_error);
      total_abs_error += abs_error;
      max_rel_error = std::max(max_rel_error, rel_error);
      total_rel_error += rel_error;
      count++;
    }
  }

  float avg_abs_error = total_abs_error / count;
  float avg_rel_error = total_rel_error / count;

  std::cout << "  Max absolute error: " << max_abs_error << std::endl;
  std::cout << "  Average absolute error: " << avg_abs_error << std::endl;
  std::cout << "  Max relative error: " << (max_rel_error * 100) << "%" << std::endl;
  std::cout << "  Average relative error: " << (avg_rel_error * 100) << "%" << std::endl;

  float max_abs_error_int4_1 = 0.0f;
  float total_abs_error_int4_1 = 0.0f;
  float max_rel_error_int4_1 = 0.0f;
  float total_rel_error_int4_1 = 0.0f;
  int count_int4_1 = 0;

  for (int i = 0; i < sample_m; i++) {
    for (int j = 0; j < sample_n; j++) {
      float actual = ggml_compute_bf16_to_fp32(output_int4_1[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float abs_error = std::abs(actual - ref);
      float rel_error = std::abs(ref) > 1e-6 ? abs_error / std::abs(ref) : 0.0f;
      if (rel_error >= thresh_hold) {
        rel_error = thresh_hold;
      }

      max_abs_error_int4_1 = std::max(max_abs_error_int4_1, abs_error);
      total_abs_error_int4_1 += abs_error;
      max_rel_error_int4_1 = std::max(max_rel_error_int4_1, rel_error);
      total_rel_error_int4_1 += rel_error;
      count_int4_1++;
    }
  }

  float avg_abs_error_int4_1 = total_abs_error_int4_1 / count_int4_1;
  float avg_rel_error_int4_1 = total_rel_error_int4_1 / count_int4_1;
  std::cout << "\nINT4_1 Error analysis (sampled):" << std::endl;
  std::cout << "  Max absolute error: " << max_abs_error_int4_1 << std::endl;
  std::cout << "  Average absolute error: " << avg_abs_error_int4_1 << std::endl;
  std::cout << "  Max relative error: " << (max_rel_error_int4_1 * 100) << "%" << std::endl;
  std::cout << "  Average relative error: " << (avg_rel_error_int4_1 * 100) << "%" << std::endl;

  float max_abs_error_int4 = 0.0f;
  float total_abs_error_int4 = 0.0f;
  float max_rel_error_int4 = 0.0f;
  float total_rel_error_int4 = 0.0f;
  int count_int4 = 0;

  for (int i = 0; i < sample_m; i++) {
    for (int j = 0; j < sample_n; j++) {
      float actual = ggml_compute_bf16_to_fp32(output_int4[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float abs_error = std::abs(actual - ref);
      float rel_error = std::abs(ref) > 1e-6 ? abs_error / std::abs(ref) : 0.0f;
      if (rel_error >= thresh_hold) {
        rel_error = thresh_hold;
      }

      max_abs_error_int4 = std::max(max_abs_error_int4, abs_error);
      total_abs_error_int4 += abs_error;
      max_rel_error_int4 = std::max(max_rel_error_int4, rel_error);
      total_rel_error_int4 += rel_error;
      count_int4++;
    }
  }

  float avg_abs_error_int4 = total_abs_error_int4 / count_int4;
  float avg_rel_error_int4 = total_rel_error_int4 / count_int4;
  std::cout << "\nINT4 Error analysis (sampled):" << std::endl;
  std::cout << "  Max absolute error: " << max_abs_error_int4 << std::endl;
  std::cout << "  Average absolute error: " << avg_abs_error_int4 << std::endl;
  std::cout << "  Max relative error: " << (max_rel_error_int4 * 100) << "%" << std::endl;
  std::cout << "  Average relative error: " << (avg_rel_error_int4 * 100) << "%" << std::endl;

  float max_abs_error_k_int4_1 = 0.0f;
  float total_abs_error_k_int4_1 = 0.0f;
  float max_rel_error_k_int4_1 = 0.0f;
  float total_rel_error_k_int4_1 = 0.0f;
  int count_k_int4_1 = 0;

  for (int i = 0; i < sample_m; i++) {
    for (int j = 0; j < sample_n; j++) {
      float actual = ggml_compute_bf16_to_fp32(output_k_int4_1[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float abs_error = std::abs(actual - ref);
      float rel_error = std::abs(ref) > 1e-6 ? abs_error / std::abs(ref) : 0.0f;
      if (rel_error >= thresh_hold) {
        rel_error = thresh_hold;
      }

      max_abs_error_k_int4_1 = std::max(max_abs_error_k_int4_1, abs_error);
      total_abs_error_k_int4_1 += abs_error;
      max_rel_error_k_int4_1 = std::max(max_rel_error_k_int4_1, rel_error);
      total_rel_error_k_int4_1 += rel_error;
      count_k_int4_1++;
    }
  }
  float avg_abs_error_k_int4_1 = total_abs_error_k_int4_1 / count_k_int4_1;
  float avg_rel_error_k_int4_1 = total_rel_error_k_int4_1 / count_k_int4_1;
  std::cout << "\nINT4_1_k Error analysis (sampled):" << std::endl;
  std::cout << "  Max absolute error: " << max_abs_error_k_int4_1 << std::endl;
  std::cout << "  Average absolute error: " << avg_abs_error_k_int4_1 << std::endl;
  std::cout << "  Max relative error: " << (max_rel_error_k_int4_1 * 100) << "%" << std::endl;
  std::cout << "  Average relative error: " << (avg_rel_error_k_int4_1 * 100) << "%" << std::endl;

  float max_abs_error_k_int4_1_low = 0.0f;
  float total_abs_error_k_int4_1_low = 0.0f;
  float max_rel_error_k_int4_1_low = 0.0f;
  float total_rel_error_k_int4_1_low = 0.0f;
  int count_k_int4_1_low = 0;

  for (int i = 0; i < sample_m; i++) {
    for (int j = 0; j < sample_n; j++) {
      float actual = ggml_compute_bf16_to_fp32(output_k_int4_1_low[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float abs_error = std::abs(actual - ref);
      float rel_error = std::abs(ref) > 1e-6 ? abs_error / std::abs(ref) : 0.0f;
      if (rel_error >= thresh_hold) {
        rel_error = thresh_hold;
      }

      max_abs_error_k_int4_1_low = std::max(max_abs_error_k_int4_1_low, abs_error);
      total_abs_error_k_int4_1_low += abs_error;
      max_rel_error_k_int4_1_low = std::max(max_rel_error_k_int4_1_low, rel_error);
      total_rel_error_k_int4_1_low += rel_error;
      count_k_int4_1_low++;
    }
  }

  float avg_abs_error_k_int4_1_low = total_abs_error_k_int4_1_low / count_k_int4_1_low;
  float avg_rel_error_k_int4_1_low = total_rel_error_k_int4_1_low / count_k_int4_1_low;
  std::cout << "\nINT4_1_k_low Error analysis (sampled):" << std::endl;
  std::cout << "  Max absolute error: " << max_abs_error_k_int4_1_low << std::endl;
  std::cout << "  Average absolute error: " << avg_abs_error_k_int4_1_low << std::endl;
  std::cout << "  Max relative error: " << (max_rel_error_k_int4_1_low * 100) << "%" << std::endl;
  std::cout << "  Average relative error: " << (avg_rel_error_k_int4_1_low * 100) << "%" << std::endl;

  // Print sample comparison
  std::cout << "\nSample comparison (first 10x10):" << std::endl;
  std::cout << "Format: actual (reference) [error%]" << std::endl;
  for (int i = 10; i < std::min(20, sample_m); i++) {
    for (int j = 10; j < std::min(20, sample_n); j++) {
      float actual = ggml_compute_bf16_to_fp32(output[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float error_pct = std::abs(ref) > 1e-6 ? (actual - ref) / ref * 100 : 0.0f;
      printf("%7.4f (%7.4f) [%+6.1f%%]  ", actual, ref, error_pct);
    }
    std::cout << std::endl;
  }
  std::cout << "\nint4_1 Sample comparison (first 10x10):" << std::endl;
  std::cout << "Format: actual (reference) [error%]" << std::endl;
  for (int i = 10; i < std::min(20, sample_m); i++) {
    for (int j = 10; j < std::min(20, sample_n); j++) {
      float actual = ggml_compute_bf16_to_fp32(output_int4_1[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float error_pct = std::abs(ref) > 1e-6 ? (actual - ref) / ref * 100 : 0.0f;
      printf("%7.4f (%7.4f) [%+6.1f%%]  ", actual, ref, error_pct);
    }
    std::cout << std::endl;
  }
  std::cout << "\nint4 Sample comparison (first 10x10):" << std::endl;
  std::cout << "Format: actual (reference) [error%]" << std::endl;
  for (int i = 10; i < std::min(20, sample_m); i++) {
    for (int j = 10; j < std::min(20, sample_n); j++) {
      float actual = ggml_compute_bf16_to_fp32(output_int4[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float error_pct = std::abs(ref) > 1e-6 ? (actual - ref) / ref * 100 : 0.0f;
      printf("%7.4f (%7.4f) [%+6.1f%%]  ", actual, ref, error_pct);
    }
    std::cout << std::endl;
  }

  std::cout << "\nint4_1_k Sample comparison (first 10x10):" << std::endl;
  std::cout << "Format: actual (reference) [error%]" << std::endl;
  for (int i = 10; i < std::min(20, sample_m); i++) {
    for (int j = 10; j < std::min(20, sample_n); j++) {
      float actual = ggml_compute_bf16_to_fp32(output_k_int4_1[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float error_pct = std::abs(ref) > 1e-6 ? (actual - ref) / ref * 100 : 0.0f;
      printf("%7.4f (%7.4f) [%+6.1f%%]  ", actual, ref, error_pct);
    }
    std::cout << std::endl;
  }

  std::cout << "\nint4_1_k_low Sample comparison (first 10x10):" << std::endl;
  std::cout << "Format: actual (reference) [error%]" << std::endl;
  for (int i = 10; i < std::min(20, sample_m); i++) {
    for (int j = 10; j < std::min(20, sample_n); j++) {
      float actual = ggml_compute_bf16_to_fp32(output_k_int4_1_low[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float error_pct = std::abs(ref) > 1e-6 ? (actual - ref) / ref * 100 : 0.0f;
      printf("%7.4f (%7.4f) [%+6.1f%%]  ", actual, ref, error_pct);
    }
    std::cout << std::endl;
  }

  std::cout << "\nint4 Sample comparison:" << std::endl;
  std::cout << "Format: actual (reference) [error%]" << std::endl;
  for (int i = 0; i < 1; i++) {
    for (int j = 0; j < n; j++) {
      float actual = ggml_compute_bf16_to_fp32(output_int4[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float error_pct = std::abs(ref) > 1e-6 ? (actual - ref) / ref * 100 : 0.0f;
      printf("j:%d, %7.4f (%7.4f) [%+6.1f%%]  ", j, actual, ref, error_pct);
    }
    std::cout << std::endl;
  }

  std::cout << "\nSample comparison:" << std::endl;
  std::cout << "Format: actual (reference) [error%]" << std::endl;
  for (int i = 0; i < 1; i++) {
    for (int j = 0; j < n; j++) {
      float actual = ggml_compute_bf16_to_fp32(output[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float error_pct = std::abs(ref) > 1e-6 ? (actual - ref) / ref * 100 : 0.0f;
      printf("j:%d, %7.4f (%7.4f) [%+6.1f%%]  ", j, actual, ref, error_pct);
    }
    std::cout << std::endl;
  }

  std::cout << "\nint4_1_k Sample comparison:" << std::endl;
  std::cout << "Format: actual (reference) [error%]" << std::endl;
  for (int i = 0; i < 1; i++) {
    for (int j = 0; j < n; j++) {
      float actual = ggml_compute_bf16_to_fp32(output_k_int4_1[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float error_pct = std::abs(ref) > 1e-6 ? (actual - ref) / ref * 100 : 0.0f;
      printf("j:%d, %7.4f (%7.4f) [%+6.1f%%]  ", j, actual, ref, error_pct);
    }
    std::cout << std::endl;
  }

  std::cout << "\nint4_1 Sample comparison:" << std::endl;
  std::cout << "Format: actual (reference) [error%]" << std::endl;
  for (int i = 0; i < 1; i++) {
    for (int j = 0; j < n; j++) {
      float actual = ggml_compute_bf16_to_fp32(output_int4_1[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float error_pct = std::abs(ref) > 1e-6 ? (actual - ref) / ref * 100 : 0.0f;
      printf("j:%d, %7.4f (%7.4f) [%+6.1f%%]  ", j, actual, ref, error_pct);
    }
    std::cout << std::endl;
  }

  std::cout << "\nint4_1_k_low Sample comparison:" << std::endl;
  std::cout << "Format: actual (reference) [error%]" << std::endl;
  for (int i = 0; i < 1; i++) {
    for (int j = 0; j < n; j++) {
      float actual = ggml_compute_bf16_to_fp32(output_k_int4_1_low[i * n + j]);
      float ref = ref_result[i * sample_n + j];
      float error_pct = std::abs(ref) > 1e-6 ? (actual - ref) / ref * 100 : 0.0f;
      printf("j:%d, %7.4f (%7.4f) [%+6.1f%%]  ", j, actual, ref, error_pct);
    }
    std::cout << std::endl;
  }

  // Check if accuracy is acceptable for INT4
  if (avg_rel_error < 0.2f) {
    std::cout << "\n✓ Excellent accuracy (<20% average error)" << std::endl;
  } else if (avg_rel_error < 0.3f) {
    std::cout << "\n✓ Acceptable accuracy (20-30% average error)" << std::endl;
  } else if (avg_rel_error < 0.4f) {
    std::cout << "\n⚠ Marginal accuracy (30-40% average error)" << std::endl;
  } else {
    std::cout << "\n✗ Poor accuracy (>40% average error)" << std::endl;
  }

  if (avg_rel_error_int4_1 < 0.2f) {
    std::cout << "\n✓ Excellent accuracy for INT4 quantization (<20% average error)" << std::endl;
  } else if (avg_rel_error_int4_1 < 0.3f) {
    std::cout << "\n✓ Acceptable accuracy for INT4 quantization (20-30% average error)" << std::endl;
  } else if (avg_rel_error_int4_1 < 0.4f) {
    std::cout << "\n⚠ Marginal accuracy for INT4 quantization (30-40% average error)" << std::endl;
  } else {
    std::cout << "\n✗ Poor accuracy for INT4 quantization (>40% average error)" << std::endl;
  }

  if (avg_rel_error_int4 < 0.2f) {
    std::cout << "\n✓ Excellent accuracy for INT4 quantization (<20% average error)" << std::endl;
  } else if (avg_rel_error_int4 < 0.3f) {
    std::cout << "\n✓ Acceptable accuracy for INT4 quantization (20-30% average error)" << std::endl;
  } else if (avg_rel_error_int4 < 0.4f) {
    std::cout << "\n⚠ Marginal accuracy for INT4 quantization (30-40% average error)" << std::endl;
  } else {
    std::cout << "\n✗ Poor accuracy for INT4 quantization (>40% average error)" << std::endl;
  }

  if (avg_rel_error_k_int4_1 < 0.2f) {
    std::cout << "\n✓ Excellent accuracy for INT4 k-group quantization (<20% average error)" << std::endl;
  } else if (avg_rel_error_k_int4_1 < 0.3f) {
    std::cout << "\n✓ Acceptable accuracy for INT4 k-group quantization (20-30% average error)" << std::endl;
  } else if (avg_rel_error_k_int4_1 < 0.4f) {
    std::cout << "\n⚠ Marginal accuracy for INT4 k-group quantization (30-40% average error)" << std::endl;
  } else {
    std::cout << "\n✗ Poor accuracy for INT4 k-group quantization (>40% average error)" << std::endl;
  }

  if (avg_rel_error_k_int4_1_low < 0.2f) {
    std::cout << "\n✓ Excellent accuracy for INT4 k-group low quantization (<20% average error)" << std::endl;
  } else if (avg_rel_error_k_int4_1_low < 0.3f) {
    std::cout << "\n✓ Acceptable accuracy for INT4 k-group low quantization (20-30% average error)" << std::endl;
  } else if (avg_rel_error_k_int4_1_low < 0.4f) {
    std::cout << "\n⚠ Marginal accuracy for INT4 k-group low quantization (30-40% average error)" << std::endl;
  } else {
    std::cout << "\n✗ Poor accuracy for INT4 k-group low quantization (>40% average error)" << std::endl;
  }

  free(buffer_a);
  free(buffer_b);
  free(buffer_c);
}

int main() {
  test_specific_dimensions();
  return 0;
}

================================================
FILE: kt-kernel/operators/amx/test/thread_test.sh
================================================
#!/bin/bash

# 进入脚本所在的目录
cd "$(dirname "$0")" || { echo "Failed to enter the script's directory"; exit 1; }

# 进入 ../build 目录
cd ../build || { echo "Failed to enter ../build directory"; exit 1; }

# 设置线程数列表
threads=(1 2 4 8 16 24 36 48 72)

# 遍历每个线程数并运行命令
for t in "${threads[@]}"; do
    echo "Running with OMP_NUM_THREADS=$t"
    OMP_NUM_THREADS=$t numactl -N 0 ./la/amx-test
    sleep 1s
done


================================================
FILE: kt-kernel/operators/amx/test/timer.hh
================================================
#ifndef TIMER_HH
#define TIMER_HH

#include <cassert>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <array>

inline std::string doubleToStringR2(double value) {
  std::stringstream stream;
  stream << std::fixed << std::setprecision(2) << value;
  return stream.str();
}

inline std::array<std::string, 7> units = {"", "K", "M", "G", "T", "P", "E"};

inline std::string readable_number(size_t size) {
  size_t unit_index = 0;
  double readable_size = size;
  while (readable_size >= 1000 && unit_index < units.size() - 1) {
    readable_size /= 1000;
    unit_index++;
  }
  std::ostringstream ss;
  ss << std::fixed << std::setprecision(2) << readable_size;
  std::string str = ss.str();
  return str + "" + units[unit_index];
}

class Timer {
public:
  std::string name;
  bool tmp_timer = false;

  Timer() {}
  Timer(std::string name) : name(name), tmp_timer(true) { start(); }
  ~Timer() {
    if (tmp_timer) {
      std::cout << name << " " << elapsedTime() << std::endl;
    }
  }

  void start() {
    m_startTime = std::chrono::high_resolution_clock::now();
    assert(m_isRunning == false);
    m_isRunning = true;
  }

  void stop() {
    m_endTime = std::chrono::high_resolution_clock::now();
    assert(m_isRunning == true);
    m_isRunning = false;
    m_runningNs += elapsedNs();
  }

  double elapsedNs() {
    std::chrono::time_point<std::chrono::high_resolution_clock> endTime;

    if (m_isRunning) {
      endTime = std::chrono::high_resolution_clock::now();
    } else {
      endTime = m_endTime;
    }

    return std::chrono::duration_cast<std::chrono::nanoseconds>(endTime -
                                                                m_startTime)
        .count();
  }

  void printElapsedMilliseconds() {
    std::cout << elapsedNs() / 1e6 << " ms" << std::endl;
  }

  static std::string ns_to_string(double duration) {
    auto nano_sec = duration;
    if (nano_sec >= 1000) {
      auto mirco_sec = nano_sec / 1000.0;
      if (mirco_sec >= 1000) {
        auto milli_sec = mirco_sec / 1000.0;
        if (milli_sec >= 1000) {
          auto seconds = milli_sec / 1000.0;

          if (seconds >= 60.0) {
            auto minutes = seconds / 60.0;

            if (minutes >= 60.0) {
              auto hours = minutes / 60.0;
              return doubleToStringR2(hours) + " h";
            } else {
              return doubleToStringR2(minutes) + " min";
            }
          } else {
            return doubleToStringR2(seconds) + " sec";
          }
        } else {
          return doubleToStringR2(milli_sec) + " ms";
        }
      } else {
        return doubleToStringR2(mirco_sec) + " us";
      }
    } else {
      return doubleToStringR2(nano_sec) + " ns";
    }
  }

  double runningTimeNs() { return m_runningNs; }

  std::string runningTime() {
    auto duration = m_runningNs;
    return ns_to_string(duration);
  }

  std::string elapsedTime() { return ns_to_string(elapsedNs()); }
  double elapsedMs() { return elapsedNs() / 1e6; }
  std::string report_throughput(size_t op_cnt) {
    double ops = op_cnt / elapsedMs() * 1000;
    return readable_number(ops) + "op/s";
  }

  void merge(Timer &other) {
    assert(m_isRunning == false);
    assert(other.m_isRunning == false);
    m_runningNs += other.runningTimeNs();
  }

private:
  std::chrono::time_point<std::chrono::high_resolution_clock> m_startTime;
  std::chrono::time_point<std::chrono::high_resolution_clock> m_endTime;
  bool m_isRunning = false;
  double m_runningNs = 0.0;
};

class Counter {
public:
  Counter() {}

  std::map<std::string, size_t> counters;

  void inc(const char *name, size_t num) { counters[name] += num; };
  void print() {
    for (auto &p : counters) {
      std::cout << p.first << " : " << p.second << std::endl;
    }
  };
};

#endif // TIMER_HH


================================================
FILE: kt-kernel/operators/amx/test/verify-kgroup.cpp
================================================
#include <cmath>
#include <iostream>
#include <memory>
#include <random>

#include "../la/amx.hpp"

void verify_kgroup_accuracy() {
  std::cout << "=== Verifying K-Group Accuracy ===" << std::endl;

  const int m = 32;
  const int n = 32;
  const int k = 1024;
  const int k_group_size = 256;

  using Kernel = amx::GemmKernel224Int4KGroup;
  using BufferA = Kernel::BufferA;
  using BufferB = Kernel::BufferB;
  using BufferC = Kernel::BufferC;

  void* buffer_a = std::aligned_alloc(64, BufferA::required_size(m, k, k_group_size));
  void* buffer_b = std::aligned_alloc(64, BufferB::required_size(n, k, k_group_size));
  void* buffer_c = std::aligned_alloc(64, BufferC::required_size(m, n));

  auto ba = std::make_shared<BufferA>(m, k, k_group_size, buffer_a);
  auto bb = std::make_shared<BufferB>(n, k, k_group_size, buffer_b);
  auto bc = std::make_shared<BufferC>(m, n, buffer_c);

  // Create input matrices with values in the quantization sweet spot
  std::vector<ggml_bf16_t> input_a(m * k);
  std::vector<ggml_bf16_t> input_b(k * n);

  std::mt19937 gen(12345);
  std::uniform_real_distribution<float> dist(-0.5f, 0.5f);

  for (int i = 0; i < m * k; i++) {
    input_a[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }
  for (int i = 0; i < k * n; i++) {
    input_b[i] = ggml_compute_fp32_to_bf16(dist(gen));
  }

  // Compute reference result with float32
  std::vector<float> ref_result(m * n, 0.0f);
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      float sum = 0.0f;
      for (int l = 0; l < k; l++) {
        float a_val = ggml_compute_bf16_to_fp32(input_a[i * k + l]);
        float b_val = ggml_compute_bf16_to_fp32(input_b[l * n + j]);
        sum += a_val * b_val;
      }
      ref_result[i * n + j] = sum;
    }
  }

  // Quantize and compute with k-group
  ba->from_mat(m, input_a.data(), 0, 1);
  bb->from_mat(input_b.data(), 0, 1);

  Kernel::config();
  amx::mat_mul_kgroup(m, n, k, k_group_size, ba, bb, bc, 0, 1);

  std::vector<ggml_bf16_t> output(m * n);
  bc->to_mat(m, output.data(), 0, 1);

  // Compute errors
  float max_abs_error = 0.0f;
  float total_abs_error = 0.0f;
  float max_ref_value = 0.0f;

  for (int i = 0; i < m * n; i++) {
    float actual = ggml_compute_bf16_to_fp32(output[i]);
    float ref = ref_result[i];
    float error = std::abs(actual - ref);

    max_abs_error = std::max(max_abs_error, error);
    total_abs_error += error;
    max_ref_value = std::max(max_ref_value, std::abs(ref));
  }

  float avg_abs_error = total_abs_error / (m * n);
  float relative_error = max_abs_error / (max_ref_value + 1e-8f);

  std::cout << "Matrix dimensions: " << m << "x" << n << "x" << k << std::endl;
  std::cout << "K-group size: " << k_group_size << std::endl;
  std::cout << "Max absolute error: " << max_abs_error << std::endl;
  std::cout << "Average absolute error: " << avg_abs_error << std::endl;
  std::cout << "Max reference value: " << max_ref_value << std::endl;
  std::cout << "Relative error: " << (relative_error * 100) << "%" << std::endl;

  // Check if accuracy is acceptable for INT4
  // INT4 quantization typically has 5-10% error
  if (relative_error < 0.15f) {
    std::cout << "✓ Accuracy is acceptable for INT4 quantization" << std::endl;
  } else {
    std::cout << "✗ Accuracy needs improvement" << std::endl;
  }

  free(buffer_a);
  free(buffer_b);
  free(buffer_c);
}

int main() {
  verify_kgroup_accuracy();
  return 0;
}

================================================
FILE: kt-kernel/operators/common.hpp
================================================
#ifndef CPUINFER_OPERATOR_COMMON_HPP
#define CPUINFER_OPERATOR_COMMON_HPP

#include <map>

#include "../cpu_backend/worker_pool.h"
#include "ggml.h"

#if defined(__aarch64__) && defined(CPU_USE_KML)
#include <arm_sve.h>
#endif

#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <stdexcept>
#include <type_traits>

// #define FORWARD_TIME_PROFILE
// #define FORWARD_TIME_REPORT

#define ASSERT_RELEASE(x, text)                                                            \
  do {                                                                                     \
    if (!(x)) {                                                                            \
      fprintf(stderr, "Assertion failed: %s, file %s, line %d\n", #x, __FILE__, __LINE__); \
      fprintf(stderr, "Error message: %s\n", (text));                                      \
      throw std::runtime_error((text));                                                    \
    }                                                                                      \
  } while (0)

#define PUSH_MEM_REQ(ptr, size) mem_requests.append_pointer(&(ptr), (size))

#define PROFILE_RECORD_TIME_STAMP(name)                                                             \
  do {                                                                                              \
    auto end_time = std::chrono::high_resolution_clock::now();                                      \
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - last).count(); \
    time_map[(name)] = duration;                                                                    \
    last = end_time;                                                                                \
  } while (0)

#define DO_TPS_LOAD_WEIGHTS(pool)                                                         \
  (pool)->dispense_backend()->do_numa_job([this, pool, config](int numa_id) {             \
    this->tps[numa_id]->config_.physical_to_logical_map = config.physical_to_logical_map; \
    this->tps[numa_id]->load_weights();                                                   \
  })

#define expert_map(m, x) (m != nullptr ? m[(x)] : (x))

template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
inline T div_up(T x, T y) {
  return (x + y - 1) / y;
}

template <typename T>
T* offset_pointer(T* ptr, size_t byte_offset) {
  return reinterpret_cast<T*>(reinterpret_cast<char*>(ptr) + byte_offset);
}

template <typename T>
size_t pointer_offset(T* ptr, T* b) {
  return reinterpret_cast<size_t>(b) - reinterpret_cast<size_t>(ptr);
}

template <typename T>
const T* offset_pointer(const T* ptr, size_t byte_offset) {
  return reinterpret_cast<const T*>(reinterpret_cast<const char*>(ptr) + byte_offset);
}

template <typename T>
T* offset_pointer_row_major(T* t, int row, int col, size_t ld) {
  return offset_pointer(t, row * ld) + col;
}

template <typename T>
T* offset_pointer_col_major(T* t, int row, int col, size_t ld) {
  return offset_pointer(t, col * ld) + row;
}

class TimePerf {
 protected:
  std::string time_perf_name;
  std::map<std::string, long> time_map;
  std::chrono::time_point<std::chrono::high_resolution_clock> last;
  std::chrono::time_point<std::chrono::high_resolution_clock> start_time;

  void forward_perf_start() {
    start_time = std::chrono::high_resolution_clock::now();
    last = start_time;
  }

  void perf_report() {
    auto end_time = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
    std::string output = time_perf_name + ", forward time: " + std::to_string(duration.count()) + " us";
    // for (auto [name, t] : time_map) {
    //   double p = 100.0 * t / duration.count();
    //   // if (p < 1.0) {
    //   //   continue; // Skip if the percentage is less than 1%
    //   // }
    //   output += ", " + name + ": " + std::to_string(t) + " us(" + std::to_string(size_t(round(p))) + "%)";
    // }
    // 反向遍历
    for (auto it = time_map.rbegin(); it != time_map.rend(); ++it) {
      const std::string& name = it->first;
      long t = it->second;
      double p = 100.0 * t / duration.count();
      // if (p < 1.0) {
      //   continue; // Skip if the percentage is less than 1%
      // }
      output += ", " + name + ": " + std::to_string(t) + " us(" + std::to_string(size_t(round(p))) + "%)";
    }
    printf("%s\n", output.c_str());
  }
};

struct TaskCounter {
  std::vector<size_t> fold = {}, card = {};

  TaskCounter(std::initializer_list<size_t> i) {
    card.push_back(1);
    for (auto j : i) {
      push_back(j);
    }
  }

  void push_back(size_t i) {
    fold.push_back(i);
    for (auto& c : card) {
      c *= i;
    }
    card.push_back(1);
  }
  void push_back(std::vector<size_t> i) {
    for (auto j : i) {
      push_back(j);
    }
  }
  size_t count() { return card[0]; }
  size_t at(size_t id, size_t which) { return id % card.at(which) / card.at(which + 1); }
};

struct GeneralConfig {
  size_t vocab_size;
  size_t hidden_size;

  size_t num_experts_per_tok;
  size_t n_routed_experts;
  size_t n_shared_experts;
  size_t max_qlen = 4096;

  void* lm_heads_ptr;
  ggml_type lm_heads_type;
  void* norm_weights_ptr;
  ggml_type norm_weights_type;
  void* token_embd_ptr;
  ggml_type token_embd_type;
  WorkerPool* pool = nullptr;
  GeneralConfig() {}
};

struct GeneralMLAConfig {
  size_t hidden_size;
  size_t q_lora_rank;
  size_t num_heads;
  size_t nope_size;
  size_t rope_size;
  size_t kv_lora_rank;

  int layer_idx = 0;
  WorkerPool* pool = nullptr;
  size_t token_count_in_page = 256;  // token count in a page
  size_t max_qlen = 1024;
  size_t max_kvlen = 4096;

  // rope
  size_t max_position_embeddings;
  double rope_scaling_factor = 1.0;
  double rope_theta = 10000.0;
  double rope_scaling_beta_fast;
  double rope_scaling_beta_slow;
  double rope_scaling_mscale;
  double rope_scaling_mscale_all_dim;
  double rope_scaling_original_max_position_embeddings;

  void* q_a_proj;
  void* q_a_norm = nullptr;
  void* q_b_proj;
  void* kv_a_proj_with_mqa;
  void* kv_a_norm = nullptr;
  void* kv_b_proj;
  void* o_proj;

  // for llamafile
  ggml_type q_a_proj_type;
  ggml_type q_a_norm_type;
  ggml_type q_b_proj_type;
  ggml_type kv_a_proj_with_mqa_type;
  ggml_type kv_a_norm_type;
  ggml_type kv_b_proj_type;
  ggml_type w_o_type;

  ggml_type input_type = GGML_TYPE_F32;
  ggml_type output_type = GGML_TYPE_F32;

  size_t m_block = 4;
  size_t n_block = 4;
  // for kvcache
  size_t page_count = 200;  // page count for kv cache

  GeneralMLAConfig() {}
  GeneralMLAConfig(size_t hidden_size, size_t q_lora_rank, size_t kv_lora_rank, size_t num_heads, size_t nope_size,
                   size_t rope_size)
      : hidden_size(hidden_size),
        q_lora_rank(q_lora_rank),
        kv_lora_rank(kv_lora_rank),
        num_heads(num_heads),
        nope_size(nope_size),
        rope_size(rope_size) {}
};

struct QuantConfig {
  std::string quant_method = "";
  int bits = 0;
  int group_size = 0;
  bool zero_point = false;
  bool per_channel = false;  // Per-channel quantization (GLM-4.7-FP8 style)
};

struct GeneralMOEConfig {
  // Basic Config
  int expert_num;
  int num_experts_per_tok;
  int hidden_size;
  int intermediate_size;

  int layer_idx = 0;
  WorkerPool* pool = nullptr;

  // SGLang offload
  int num_gpu_experts = 0;              // Computed from gpu_experts_mask
  uint8_t* gpu_experts_mask = nullptr;  // Bool mask: true = expert on GPU
  void* physical_to_logical_map = nullptr;

  // Compute num_gpu_experts from gpu_experts_mask
  void compute_num_gpu_experts() {
    num_gpu_experts = 0;
    if (gpu_experts_mask) {
      for (int i = 0; i < expert_num; i++) {
        if (gpu_experts_mask[i]) num_gpu_experts++;
      }
    }
  }

  // Check if expert should be skipped (invalid, out of range, or on GPU)
  inline bool should_skip_expert(int64_t expert_id) const {
    return expert_id < 0 || expert_id >= expert_num || (gpu_experts_mask && gpu_experts_mask[expert_id]);
  }

  void* gate_proj;
  void* up_proj;
  void* down_proj;

  void* gate_scale;
  void* up_scale;
  void* down_scale;

  void* gate_zero;
  void* up_zero;
  void* down_zero;

  QuantConfig quant_config;

  // for amx
  int max_len = 0;
  std::vector<std::vector<void*>> gate_projs;
  std::vector<std::vector<void*>> up_projs;
  std::vector<std::vector<void*>> down_projs;
  std::vector<std::vector<void*>> gate_scales;
  std::vector<std::vector<void*>> up_scales;
  std::vector<std::vector<void*>> down_scales;
  std::vector<std::vector<void*>> gate_zeros;
  std::vector<std::vector<void*>> up_zeros;
  std::vector<std::vector<void*>> down_zeros;

  std::string path;
  bool save = false;
  bool load = false;

  // for llamafile
  int m_block = 4;
  int group_min_len = 0;
  int group_max_len = 0;
  int gate_type;
  int up_type;
  int down_type;
  int hidden_type;

  GeneralMOEConfig() {}

  GeneralMOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size)
      : expert_num(expert_num),
        num_experts_per_tok(routed_expert_num),
        hidden_size(hidden_size),
        intermediate_size(intermediate_size) {}

  int max_possible_qlen() { return std::max(max_len, group_max_len); }
};

struct GeneralGateConfig {
  size_t hidden_size;
  size_t num_experts_per_tok;
  size_t n_routed_experts;
  size_t n_group;
  size_t topk_group;

  bool norm_topk_prob = true;
  float routed_scaling_factor = 2.5f;

  std::string scoring_func = "sigmoid";
  std::string topk_method = "noaux_tc";

  int layer_idx = 0;
  WorkerPool* pool = nullptr;

  void* weight = nullptr;
  ggml_type weight_type;
  void* e_score_correction_bias = nullptr;
  ggml_type e_score_correction_bias_type;

  size_t max_seqlen = 25600;

  GeneralGateConfig() = default;

  GeneralGateConfig(int hidden_size, int num_experts_per_tok, int n_routed_experts, int n_group, int topk_group)
      : hidden_size(hidden_size),
        num_experts_per_tok(num_experts_per_tok),
        n_routed_experts(n_routed_experts),
        n_group(n_group),
        topk_group(topk_group) {}
};

class MLA_Interface {
 public:
  virtual void forward(std::vector<int> qlens, std::vector<std::vector<int>> page_tables, std::vector<int> kv_lens,
                       const void* input, void* output) = 0;
};

class MoE_Interface {
 public:
  virtual void forward(int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input,
                       void* output, bool incremental = false) = 0;
};
inline void init_ggml() {
  static bool inited = false;
  if (inited) {
    return;
  }
  struct ggml_init_params params = {
      0,
      NULL,
      true,
  };

  auto ctx_eval = ggml_init(params);

  if (!ctx_eval) {
    throw std::runtime_error("Failed to create ggml context");
  }
  inited = true;
}

template <typename A, typename B>
void convert_or_copy(A* dst, const B* src, size_t count) {
  if constexpr (std::is_same_v<A, B>) {
    // printf("Direct copy\n");
    memcpy(dst, src, sizeof(A) * count);
  } else {
    if constexpr (std::is_same_v<A, float>) {
      if constexpr (std::is_same_v<B, ggml_bf16_t>) {
        // printf("Converting ggml_bf16_t to float\n");
        ggml_bf16_to_fp32_row(src, dst, count);
      } else if constexpr (std::is_same_v<B, ggml_fp16_t>) {
        ggml_fp16_to_fp32_row(src, dst, count);
      } else {
        throw std::runtime_error("Unsupported conversion");
      }
    } else if constexpr (std::is_same_v<A, ggml_bf16_t>) {
      if constexpr (std::is_same_v<B, float>) {
        // printf("Converting float to ggml_bf16_t\n");
        ggml_fp32_to_bf16_row(src, dst, count);
      } else {
        throw std::runtime_error("Unsupported conversion");
      }
    }

    else {
      throw std::runtime_error("Unsupported conversion");
    }
  }
}

template <typename A>
void convert_or_copy(A* dst, void* src, ggml_type type, size_t count) {
  switch (type) {
    case GGML_TYPE_BF16: {
      auto src_bf16 = (ggml_bf16_t*)src;
      convert_or_copy(dst, src_bf16, count);
      break;
    }
    case GGML_TYPE_F16: {
#if defined(__aarch64__) && defined(CPU_USE_KML)
      auto src_fp16 = (float16_t*)src;
      convert_or_copy(dst, src_fp16, count);
#else
      throw std::runtime_error("GGML_TYPE_F16 is not supported on this platform");
#endif
      break;
    }
    case GGML_TYPE_F32: {
      auto src_f32 = (float*)src;
      convert_or_copy(dst, src_f32, count);
      break;
    }
    default:
      throw std::runtime_error("Unsupported type for conversion");
  }
}

template <typename A>
void check_numerics(A* data, size_t count) {
  for (size_t i = 0; i < count; i++) {
    if (std::isnan(data[i]) || std::isinf(data[i])) {
      printf("Numerics check failed at index %zu: value = %f\n", i, data[i]);
      throw std::runtime_error("Numerics check failed");
    }
  }
  printf("Numerics check passed for %zu elements.\n", count);
}

inline void debug_bf16(ggml_bf16_t* x) {
  for (int i = 0; i < 10; i++) {
    printf("%f ", ggml_bf16_to_fp32(x[i]));
  }
  printf("\n");
}
inline void debug_f32(float* x) {
  for (int i = 0; i < 10; i++) {
    printf("%f ", x[i]);
  }
  printf("\n");
}

inline void debug_f32(float* x, size_t count) {
  if (count < 10) {
    for (size_t i = 0; i < count; i++) {
      printf("%f ", x[i]);
    }
  } else {
    for (size_t i = 0; i < 3; i++) {
      printf("%f ", x[i]);
    }
    printf("...");
    for (size_t i = count - 3; i < count; i++) {
      printf("%f ", x[i]);
    }
    printf("\n");
  }
}

#endif


================================================
FILE: kt-kernel/operators/kvcache/kvcache.h
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#ifndef CPUINFER_OPERATOR_KVCACHE_H
#define CPUINFER_OPERATOR_KVCACHE_H

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <functional>
#include <memory>
#include <mutex>
#include <queue>
#include <vector>

#include "../../cpu_backend/worker_pool.h"
#include "llama.cpp/ggml-common.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"

#define CHUNK_SIZE 32

/**
 * @brief Converts a ggml_type enum value to its corresponding string
 * representation.
 *
 * This function provides a human-readable string representation for a given
 * ggml_type enum value. The string can be used for logging, debugging, or
 * displaying information in a user interface.
 *
 * @param type The ggml_type enum value to convert.
 * @return A string representation of the enum value.
 */
std::string ggml_type_to_string(ggml_type type);

/**
 * @enum AnchorType
 * @brief Defines the types of anchors used in attention mechanisms.
 *
 * This enum specifies different types of anchors that can be used in attention
 * mechanisms, such as fixed anchors, dynamic anchors, or special anchors like
 * QUEST, BLOCK_MEAN, or BLOCK_MAX.
 */
enum AnchorType {
  FIXED_ANCHOR, /**< A fixed anchor that does not change. */
  DYNAMIC,      /**< A dynamic anchor that can change over time. */
  QUEST,        /**< A special anchor type used for QUEST (Query and Embedding Space
                   Transformation). */
  BLOCK_MEAN,   /**< An anchor based on the mean of a block of data. */
  BLOCK_MAX     /**< An anchor based on the maximum value within a block of data.
                 */
};

/**
 * @brief Converts an AnchorType enum value to its corresponding string
 * representation.
 *
 * This function provides a human-readable string representation for a given
 * AnchorType enum value. The string can be used for logging, debugging, or
 * displaying information in a user interface.
 *
 * @param anchor_type The AnchorType enum value to convert.
 * @return A string representation of the enum value.
 */
std::string AnchorTypeToString(AnchorType anchor_type);

/**
 * @enum RetrievalType
 * @brief Defines the types of retrieval strategies in attention mechanisms.
 *
 * This enum specifies different retrieval strategies that can be used in
 * attention mechanisms, such as layer-level retrieval, key-value head-level
 * retrieval, or query head-level retrieval.
 */
enum RetrievalType {
  LAYER,  /**< Retrieval at the layer level. */
  KVHEAD, /**< Retrieval at the key-value head level. */
  QHEAD   /**< Retrieval at the query head level. */
};

/**
 * @brief Converts a RetrievalType enum value to its corresponding string
 * representation.
 *
 * This function provides a human-readable string representation for a given
 * RetrievalType enum value. The string can be used for logging, debugging, or
 * displaying information in a user interface.
 *
 * @param retrieval_type The RetrievalType enum value to convert.
 * @return A string representation of the enum value.
 */
std::string RetrievalTypeToString(RetrievalType retrieval_type);

/**
 * @struct KVCacheConfig
 * @brief Configuration structure for Key-Value (KV) Cache.
 *
 * This structure holds configuration parameters for setting up and managing
 * a Key-Value (KV) Cache used in various attention mechanisms. It includes
 * parameters such as the number of layers, the number of heads, the dimension
 * of each head, block length, anchor information, and memory-related settings.
 */
struct KVCacheConfig {
  int layer_num;   /**< Number of layers in the model. */
  int kv_head_num; /**< Number of heads in the KV Cache. */
  int q_head_num;  /**< Number of heads in the query. */
  int head_dim;    /**< Dimension of each head. */
  int block_len;   /**< Length of each block in the cache. */
  int anchor_num;  /**< Number of anchors used in attention. */

  ggml_type kv_type; /**< Data type of the KV Cache (e.g., fp16, q8_0). */

  // Controls the pre-allocated memory size
  int max_block_num;  /**< Maximum number of blocks that can be allocated. */
  int max_batch_size; /**< Maximum batch size that can be processed. */
  int max_thread_num; /**< Maximum number of threads that can be used. */

  AnchorType anchor_type;       /**< Type of anchors used in the attention mechanism. */
  RetrievalType retrieval_type; /**< Type of retrieval strategy used in the cache. */

  int layer_step;   /**< Step size between layers. */
  int token_step;   /**< Step size between tokens. */
  int layer_offset; /**< Offset value for layers. */

  /**
   * @brief Default constructor for KVCacheConfig.
   *
   * Initializes the configuration with default values. This constructor
   * does not initialize any member variables explicitly.
   */
  KVCacheConfig() = default;

  /**
   * @brief Parameterized constructor for KVCacheConfig.
   *
   * This constructor initializes the configuration with specific values
   * for all member variables.
   *
   * @param layer_num The number of layers in the model.
   * @param kv_head_num The number of heads in the KV Cache.
   * @param q_head_num The number of heads in the query.
   * @param head_dim The dimension of each head.
   * @param block_len The length of each block in the cache.
   * @param anchor_num The number of anchors used in attention.
   * @param anchor_type The type of anchors used in the attention mechanism.
   * @param kv_type The data type of the KV Cache (e.g., fp16, q8_0).
   * @param retrieval_type The type of retrieval strategy used in the cache.
   * @param layer_step The step size between layers.
   * @param token_step The step size between tokens.
   * @param layer_offset The offset value for layers.
   * @param max_block_num The maximum number of blocks that can be allocated.
   * @param max_batch_size The maximum batch size that can be processed.
   * @param max_thread_num The maximum number of threads that can be used.
   */
  KVCacheConfig(int layer_num, int kv_head_num, int q_head_num, int head_dim, int block_len, int anchor_num,
                AnchorType anchor_type, ggml_type kv_type, RetrievalType retrieval_type, int layer_step, int token_step,
                int layer_offset, int max_block_num, int max_batch_size, int max_thread_num);
};

/**
 * @class KVCache
 * @brief Manages the Key-Value (KV) Cache used in attention mechanisms.
 *
 * The KVCache class provides functionality for managing the Key-Value Cache,
 * including resizing the cache, retrieving configuration parameters, and
 * updating internal states. This class is typically used in transformer models
 * to store and manage past key and value states for efficient attention
 * computations.
 */
class KVCache {
 public:
  /**
   * @brief Constructs a KVCache object with the given configuration.
   *
   * Initializes the KVCache with the specified configuration parameters,
   * such as the number of layers, heads, head dimensions, and other
   * relevant settings.
   *
   * @param config The configuration object containing initialization
   * parameters.
   */
  KVCache(KVCacheConfig config);

  /**
   * @brief Resizes the number of threads used by the cache.
   *
   * This function adjusts the number of threads that the cache can utilize.
   * It allows dynamic reconfiguration of the parallel processing capabilities
   * based on the current workload or system resources.
   *
   * @param thread_num The new number of threads to use.
   */
  void ThreadResize(int thread_num);

  /**
   * @brief Resizes the batch size managed by the cache.
   *
   * This function adjusts the batch size that the cache can handle. It
   * is useful when the input batch size changes dynamically, allowing
   * the cache to be reconfigured accordingly.
   *
   * @param batch_size The new batch size.
   */
  void BatchResize(int batch_size);

  /**
   * @brief Resizes the number of blocks managed by the cache.
   *
   * This function adjusts the number of blocks that the cache can manage.
   * It allows dynamic reconfiguration of the block structure based on the
   * current sequence length or other factors.
   *
   * @param block_num The new number of blocks.
   */
  void BlockResize(int block_num);

  /**
   * @brief Gets the number of layers in the cache.
   *
   * @return The number of layers configured in the cache.
   */
  int get_layer_num() { return config_.layer_num; }

  /**
   * @brief Gets the number of KV heads in the cache.
   *
   * @return The number of KV heads configured in the cache.
   */
  int get_kv_head_num() { return config_.kv_head_num; }

  /**
   * @brief Gets the number of query heads in the cache.
   *
   * @return The number of query heads configured in the cache.
   */
  int get_q_head_num() { return config_.q_head_num; }

  /**
   * @brief Gets the dimension of each head in the cache.
   *
   * @return The dimension of each head.
   */
  int get_head_dim() { return config_.head_dim; }

  /**
   * @brief Gets the length of each block in the cache.
   *
   * @return The length of each block.
   */
  int get_block_len() { return config_.block_len; }

  /**
   * @brief Gets the number of blocks for a specific layer.
   *
   * @param layer_id The ID of the layer for which to retrieve the block
   * number.
   * @return The number of blocks in the specified layer.
   */
  int get_block_num(int layer_id) { return past_block_num_[layer_id]; }

  /**
   * @brief Gets the number of anchors in the cache.
   *
   * @return The number of anchors configured in the cache.
   */
  int get_anchor_num() { return config_.anchor_num; }

  /**
   * @brief Gets the total length of the cache.
   *
   * @return The total length of the cache.
   */
  int get_cache_total_len() { return cache_total_len_; }

  /**
   * @brief Gets the total number of blocks in the cache.
   *
   * This function computes and returns the total number of blocks in the
   * cache based on the total cache length and the block length configuration.
   *
   * @return The total number of blocks in the cache.
   */
  int get_cache_total_block_num() { return (cache_total_len_ + config_.block_len - 1) / config_.block_len; }

  /**
   * @brief Updates the total length of the cache.
   *
   * This function sets a new total length for the cache, allowing dynamic
   * adjustment of the cache size during runtime.
   *
   * @param cache_total_len The new total length of the cache.
   */
  void update_cache_total_len(int cache_total_len) { cache_total_len_ = cache_total_len; }
  void attn(const ggml_fp16_t* q_in, ggml_fp16_t* output, float* attn_lse, int layer_idx, int generate_token_idx,
            int q_len, int batch_size, int max_block_num, int* block_table, int* cache_seqlens, int pick_block_num,
            int init_block_num, int local_block_num, WorkerPool* backend);

  void update_kvcache_one_block_fp16(const ggml_fp16_t* k_in, const ggml_fp16_t* v_in, int layer_id, int block_idx,
                                     WorkerPool* backend);

  void get_kvcache_one_block_fp16(ggml_fp16_t* k_in, ggml_fp16_t* v_in, int layer_id, int block_idx,
                                  WorkerPool* backend);

  void update_importance_one_block(const ggml_fp16_t* importance, int layer_id, int block_idx, WorkerPool* backend);
  void get_importance_one_block(ggml_fp16_t* importance, int layer_id, int block_idx, WorkerPool* backend);

  void get_anchor_one_block(ggml_fp16_t* anchor, int layer_id, int block_idx, WorkerPool* backend);

  void update_anchor_one_block(const ggml_fp16_t* anchor, int layer_id, int block_idx, WorkerPool* backend);

  void calc_anchor_all_layers(int* block_table, int* cache_seqlens, int batch_size, int max_block_num,
                              WorkerPool* backend);

  void load_kvcache(std::string tensor_file_path, WorkerPool* backend);
  void dump_kvcache(int* block_table, int cache_total_len, std::string tensor_file_path, WorkerPool* backend);

  void get_and_update_kvcache_fp16(ggml_fp16_t* k_in, ggml_fp16_t* v_in, int layer_id, int* block_table, int batch_size,
                                   int max_block_num, int* cache_seqlens, int q_len, WorkerPool* backend);

  void get_kvcache_fp16(ggml_fp16_t* k_in, ggml_fp16_t* v_in, int layer_id, int* block_table, int batch_size,
                        int max_block_num, int* cache_seqlens, WorkerPool* backend);

  void update_kvcache_fp16(const ggml_fp16_t* k_in, const ggml_fp16_t* v_in, int layer_id, int* block_table,
                           int batch_size, int max_block_num, int* cache_seqlens, int q_len, WorkerPool* backend);

  void update_importance(const ggml_fp16_t* importance, int layer_id, int* block_table, int batch_size,
                         int max_block_num, int* offset, int width, WorkerPool* backend);

  void attn_with_kvcache(const ggml_fp16_t* q_in, const ggml_fp16_t* k_in, const ggml_fp16_t* v_in, ggml_fp16_t* output,
                         float* attn_lse, int layer_idx, int generate_token_idx, int q_len, int batch_size,
                         int max_block_num, int* block_table, int* cache_seqlens, int topk, int local,
                         WorkerPool* backend);

  void clear_importance_all_layers(int* block_table, int* cache_seqlens, int batch_size, int max_block_num,
                                   WorkerPool* backend);

  void clear_kvcache_all_layers(int* block_table, int* cache_seqlens, int batch_size, int max_block_num,
                                WorkerPool* backend);

  void get_sincos(ggml_fp16_t* sin, ggml_fp16_t* cos, int seqlen);

  void get_attn_sparsity(const ggml_fp16_t* q_in, float* attn_sparsity, int layer_idx, int generate_token_idx,
                         int q_len, int batch_size, int max_block_num, int* block_table, int* cache_seqlens,
                         int* block_table_origin, int* cache_seqlens_origin, int max_block_num_origin, int topk,
                         int local, WorkerPool* backend);

  void get_all_kvcache_one_layer(int layer_id, ggml_fp16_t* k_in, ggml_fp16_t* v_in, WorkerPool* backend);

 private:
  // Persistent data
  KVCacheConfig config_;
  int n_gqa_;                             // q_head_num / kv_head_num
  int cache_total_len_;                   // Number of tokens in cache
  std::vector<uint64_t> past_block_num_;  // [layer_num]
  std::vector<std::vector<std::vector<std::vector<block_q4_0>>>>
      k_cache_q4;  // [layer_num, kv_head_num, past_block_num,
                   // block_len * (head_dim / QK_4)]
  std::vector<std::vector<std::vector<std::vector<block_q4_0>>>>
      v_cache_q4;  // [layer_num, kv_head_num, past_block_num,
                   // head_dim * (block_len / QK_4)]
  std::vector<std::vector<std::vector<std::vector<block_q8_0>>>>
      k_cache_q8;  // [layer_num, kv_head_num, past_block_num,
                   // block_len * (head_dim / QK_8)]
  std::vector<std::vector<std::vector<std::vector<block_q8_0>>>>
      v_cache_q8;  // [layer_num, kv_head_num, past_block_num,
                   // head_dim * (block_len / QK_8)]

  std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
      k_cache_fp16_;  // [layer_num, kv_head_num, past_block_num, block_len *
                      // head_dim]
  std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
      v_cache_fp16_;  // [layer_num, kv_head_num, past_block_num, head_dim *
                      // block_len]

  std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>> importance_;  // [layer_num, past_block_num,
                                                                                // block_len, attention_head_num]

  std::vector<ggml_fp16_t> anchor_;  // [layer_num * past_block_num * anchor_num *
                                     // attention_head_num * head_dim]

  // Runtime data
  int64_t layer_id_;
  int64_t block_idx_;
  int* block_table_;
  uint64_t block_num_;
  int max_block_num_after_retrieval_;

  // Rotary positional embeddings
  std::vector<std::vector<ggml_fp16_t>> sin_;  // [seq_len, head_dim]
  std::vector<std::vector<ggml_fp16_t>> cos_;  // [seq_len, head_dim]

  // update/get
  int seq_len_;
  uint16_t* k_scales_;         // q4_0
  uint8_t* k_in_;              // q4_0
  uint16_t* v_scales_;         // q4_0
  uint8_t* v_in_;              // q4_0
  uint16_t* k_data_;           // fp16
  uint16_t* v_data_;           // fp16
  uint16_t* importance_data_;  // fp16
  uint16_t* anchor_data_;      // fp16

  // sparsity = (sigma(block lse / lse))
  std::vector<std::vector<std::vector<float>>> block_lse_;  // [batch_size, max_block_num, q_head_num]
  std::vector<std::vector<float>> attn_sparsity_;           // [batch_size, q_head_num]

  // attn
  std::vector<std::vector<float>> avg_q;  // [batch_size, q_head_num * head_dim]

  std::vector<std::vector<ggml_fp16_t>> avg_q_fp16;  // [batch_size, q_head_num * head_dim]
  std::vector<std::priority_queue<std::pair<float, int>, std::vector<std::pair<float, int>>, std::greater<>>>
      top_similar_block_;

  std::vector<std::vector<float>> block_similar_;
  std::vector<std::vector<std::vector<float>>> block_similar_kv_head_;
  std::vector<std::vector<std::vector<float>>> block_similar_q_head_;

  std::vector<int> cache_seqlens_;                // [batch_size]
  std::vector<int> selected_blocks_num_history_;  // [layer_num // layer_step]

  std::vector<std::vector<std::vector<int>>> selected_blocks_history_;
  // [layer_num // layer_step, batch_size, max_block_num]

  std::vector<std::vector<std::vector<std::vector<int>>>>
      selected_blocks_history_kvhead_;  // [layer_num // layer_step,
                                        // batch_size, max_block_num,
                                        // kv_head_num]

  std::vector<std::vector<int>> block_table_before_retrieval_;  // [batch_size, max_block_num]
  std::vector<std::vector<int>> block_table_after_retrieval_;   // [batch_size, pick_block_num]

  std::vector<std::vector<std::vector<int>>> block_table_before_retrieval_qhead_;  // [batch_size, max_block_num,
                                                                                   // q_head_num]
  std::vector<std::vector<std::vector<int>>> block_table_after_retrieval_qhead_;   // [batch_size, pick_block_num,
                                                                                   // q_head_num]

  std::vector<std::vector<std::vector<int>>> block_table_before_retrieval_kvhead_;  // [batch_size, max_block_num,
                                                                                    // kv_head_num]
  std::vector<std::vector<std::vector<int>>> block_table_after_retrieval_kvhead_;   // [batch_size, pick_block_num,
                                                                                    // kv_head_num]

  std::vector<std::vector<std::unique_ptr<std::mutex>>> mutex_;  // [batch_size, kv_head_num]
  std::vector<std::vector<std::vector<block_q8_0>>> q_q8_0_;     // [batch_size, kv_head_num, n_gqa * head_dim / QK8_0]
  std::vector<std::vector<std::vector<float>>> q_fp32_;          // [batch_size, kv_head_num, n_gqa * head_dim]

  std::vector<std::vector<std::vector<float>>> output_fp32_;  // [batch_size, kv_head_num, n_gqa * head_dim]
  std::vector<std::vector<std::vector<float>>> attn_lse_;     // [batch_size, kv_head_num, n_gqa]

  std::vector<std::pair<int, int>> thread_cur_head_idx_;  // [thread_num]

  std::vector<std::vector<block_q8_0>> thread_local_output_q8_0_;  // [thread_num, n_gqa * head_dim / QK8_0]
  std::vector<std::vector<float>> thread_local_attn_score_;        // [thread_num, n_gqa * block_len]
  std::vector<std::vector<float>> thread_local_output_fp32_;       // [thread_num, n_gqa * head_dim]
  std::vector<std::vector<float>> thread_local_attn_lse_;          // [thread_num, n_gqa]
  std::vector<std::vector<float>> thread_local_cur_output_fp32_;   // [thread_num, n_gqa * head_dim]
  std::vector<std::vector<float>> thread_local_cur_attn_lse_;      // [thread_num, n_gqa]
  std::vector<std::vector<uint8_t>> thread_local_attn_mask_;       // [thread_num, block_len // 8]
  std::vector<std::vector<char>> thread_local_draft_;              // [thread_num, 2 * n_gqa * block_len + 6 * n_gqa *
                                                                   // head_dim + 2 * block_len * head_dim]

  // tmp space
  std::vector<float> q_fp32;  // [n_gqa * head_dim]

  void quantize_q_(const uint16_t* q_in_data, int batch_size);
  void attn_initialize_layer_(int batch_size, int layer_idx, int* block_table, int& max_block_num, int* cache_seqlens);
  void attn_initialize_kvhead_(int batch_size, int layer_idx, int* block_table, int& max_block_num, int* cache_seqlens);
  void retrieval_kvcache_layer_(const uint16_t* q_in_data, int init_block_num, int local_block_num, int pick_block_num,
                                int q_len, int generate_token_idx, int batch_size, int layer_idx, int* cache_seqlens,
                                int& max_block_num, WorkerPool* backend);
  void retrieval_kvcache_kvhead_(const uint16_t* q_in_data, int init_block_num, int local_block_num, int pick_block_num,
                                 int q_len, int generate_token_idx, int batch_size, int layer_idx, int* cache_seqlens,
                                 int& max_block_num, WorkerPool* backend);

  void calculate_block_similarity_layer_(const uint16_t* q_in_data, int batch_size, int layer_idx, int q_len,
                                         int max_block_num, int* cache_seqlens, int init_block_num, int local_block_num,
                                         int pick_block_num, WorkerPool* backend);
  void calculate_block_similarity_kvhead_(const uint16_t* q_in_data, int batch_size, int layer_idx, int q_len,
                                          int max_block_num, int* cache_seqlens, int init_block_num,
                                          int local_block_num, int pick_block_num, WorkerPool* backend);

  void select_block_layer_(int batch_size, int layer_idx, int max_block_num, int init_block_num, int local_block_num,
                           int pick_block_num);
  void select_block_kvhead_(int batch_size, int layer_idx, int max_block_num, int init_block_num, int local_block_num,
                            int pick_block_num);

  void calculate_sparsity_layer_(const uint16_t* q_in_data, float* attn_sparsity, int batch_size, int max_block_num,
                                 int* block_table, int* cache_seqlens, WorkerPool* backend);
  void calculate_sparsity_kvhead_(const uint16_t* q_in_data, float* attn_sparsity, int batch_size, int max_block_num,
                                  int* block_table, int* cache_seqlens, WorkerPool* backend);

  void attention_kvhead_(const uint16_t* q_in_data, ggml_fp16_t* output, float* attn_lse, int batch_size,
                         WorkerPool* backend);
  void attention_layer_(const uint16_t* q_in_data, ggml_fp16_t* output, float* attn_lse, int batch_size,
                        WorkerPool* backend);

  /**
   * @brief Computes attention with KV cache for one block.
   *
   * This function performs attention computation for one block using KV
   * cache. The function supports different data types for Q, K, and V caches,
   * and provides options for quantization. The function does not perform any
   * dynamic memory allocation internally, so all necessary buffers must be
   * pre-allocated externally.
   *
   * @param head_dim The dimension of the head.
   * @param bsz The batch size.
   * @param q_type The data type of Q (GGML data type). Only supports fp16 and
   * q8_0.
   * @param q Pointer to the Q tensor [bsz, head_dim]. The quantization is
   *          always applied along the head_dim dimension. The size must be
   *          bsz * head_dim/32 * qtype_size. If head_dim % 32 != 0, an error
   *          will be raised.
   * @param past_kv_len The length of the past KV cache.
   * @param past_kv_offset The offset in the past KV cache.
   * @param is_full_attn Boolean flag indicating whether to use full attention
   *                     (true for full 1 mask).
   * @param attn_mask Pointer to the attention mask [bsz, past_kv_len]. If
   *                  is_full_attn = false, a bit matrix is passed to
   * represent the mask.
   * @param k_type The data type of K cache (GGML data type). Only supports
   *               fp16, q4_0, and q8_0.
   * @param k_quant_type Quantization type for K cache. 0 for per_token, 1 for
   *                     per_channel. Other values will raise an error.
   * @param k_cache Pointer to the K cache tensor [seq_len, head_dim]. If
   *                quant_type == 0, head_dim % 32 must be 0. If quant_type ==
   * 1, seq_len % 32 must be 0.
   * @param num_k_anchor The number of K anchors. If num_k_anchor == 0, it
   * means no anchor is present.
   * @param k_cache_anchors Pointer to the K cache anchors [num_k_anchor,
   * head_dim]. The k_anchor_type must be fp16.
   * @param k_cache_anchor_pos Pointer to the K cache anchor positions. Each
   * token is associated with the nearest previous anchor position.
   * @param v_type The data type of V cache (GGML data type).
   * @param v_quant_type Quantization type for V cache.
   * @param v_cache Pointer to the V cache tensor [head_dim, seq_len].
   * @param num_v_anchor The number of V anchors.
   * @param v_cache_anchors Pointer to the V cache anchors.
   * @param v_cache_anchor_pos Pointer to the V cache anchor positions.
   * @param attn_score Pre-allocated buffer for attention scores [bsz,
   * past_kv_len].
   * @param output Output tensor [bsz, head_dim] with the same type as q_type.
   * @param lse Pre-allocated buffer [bsz] for the log-sum-exp of the
   * attention scores.
   * @param draft Pre-allocated temporary buffer. The buffer size should be
   * enough to hold (2 * bsz * past_kv_len + 6 * bsz * head_dim + 2 *
   *              past_kv_len * head_dim + past_kv_len * head_dim / 32) bytes.
   * @param rotary_angle Pointer to the rotary angle tensor.
   * @param rotary_cos Pointer to the cosine values for rotary embedding.
   * @param rotary_sin Pointer to the sine values for rotary embedding.
   */
  void attn_with_kvcache_one_block_(int head_dim, int bsz,
                                    ggml_type q_type,  // GGML data type of `Q`, only supports fp16 and q8_0
                                    // [bsz, head_dim]
                                    // Quantization is always on the head_dim dimension (per_token). If
                                    // head_dim % 32 != 0, an error will be raised. The size must be bsz *
                                    // head_dim/32 * qtype_size.
                                    const void* q,

                                    int past_kv_len, int past_kv_offset,
                                    bool is_full_attn,  // true indicates a full 1 mask
                                    // If is_full_attn = false, a bit matrix representing the mask is
                                    // passed. [bsz, past_kv_len]
                                    const uint8_t* attn_mask,

                                    ggml_type k_type,  // GGML data type of `K Cache`, only supports fp16,
                                                       // q4_0, q8_0
                                    int k_quant_type,  // 0 for per_token, 1 for per_channel, others raise an
                                                       // error
                                    // [seq_len, head_dim]
                                    // If quant_type == 0, head_dim % 32 must be 0.
                                    // If quant_type == 1, seq_len % 32 must be 0.
                                    const void* k_cache,

                                    // k_anchor_type must be fp16
                                    int num_k_anchor,  // num_k_anchor == 0 indicates no anchor
                                    // [num_k_anchor, head_dim]
                                    const void* k_cache_anchors,
                                    // Each token is associated with the nearest previous position's anchor,
                                    // with the same distance.
                                    const int* k_cache_anchor_pos,

                                    // v_cache similar to k_cache
                                    ggml_type v_type, int v_quant_type,
                                    // [head_dim, seq_len]
                                    const void* v_cache, int num_v_anchor, const void* v_cache_anchors,
                                    const int* v_cache_anchor_pos,

                                    // Pre-allocated buffer for intermediate calculations [bsz,
                                    // past_kv_len]. No malloc is performed inside this function.
                                    float* attn_score,

                                    // Output: [bsz, head_dim], with the same type as q_type
                                    void* output,
                                    // [bsz]
                                    float* lse,

                                    // Pre-allocated temporary buffer with sufficient size:
                                    // (2 * bsz * past_kv_len + 6 * bsz * head_dim + 2 * past_kv_len *
                                    // head_dim + past_kv_len * head_dim / 32) bytes.
                                    void* draft,

                                    // Apply rotary embedding online
                                    const int* rotary_angle, const void* rotary_cos, const void* rotary_sin
                                    // rotary_cos=None,
                                    // rotary_sin=None,
                                    // cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
                                    // cache_batch_idx: Optional[torch.Tensor] = None,
                                    // rotary_interleaved=True,

                                    // // Not supported for now
                                    // window_size=(-1, -1),  # -1 means infinite context window
                                    // alibi_slopes=None,
  );
};

/**
 * @brief Scales a float32 vector by a given scalar value.
 *
 * This function multiplies each element of the input vector `y` by a scalar
 * `v`. It uses platform-specific optimizations if available, such as Apple's
 * Accelerate framework or SIMD instructions. If no specific optimization is
 * available, the function falls back to a simple scalar multiplication loop.
 *
 * @param n The number of elements in the vector `y`.
 * @param y The input vector to be scaled. The result will be stored in the same
 * vector.
 * @param v The scalar value by which to scale the vector.
 */
void ggml_vec_scale_f32(const int n, float* y, const float v);
#endif

================================================
FILE: kt-kernel/operators/kvcache/kvcache_attn.cpp
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include <chrono>
#include <cmath>

#include "ggml-impl.h"
#include "kvcache.h"
#include "llamafile/sgemm.h"

void KVCache::attention_kvhead_(const uint16_t* q_in_data, ggml_fp16_t* output, float* attn_lse, int batch_size,
                                WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  seq_len_ = config_.block_len;

  backend->do_work_stealing_job(
      batch_size * config_.kv_head_num * max_block_num_after_retrieval_,
      [&](int thread_id) {
        thread_cur_head_idx_[thread_id].first = -1;
        thread_cur_head_idx_[thread_id].second = -1;
      },
      [&](int task_id) {
        int batch_id = task_id / (config_.kv_head_num * max_block_num_after_retrieval_);
        int head_id =
            (task_id % (config_.kv_head_num * max_block_num_after_retrieval_)) / max_block_num_after_retrieval_;
        int block_id = task_id % max_block_num_after_retrieval_;
        int thread_id = WorkerPool::thread_local_id;

        // If the block is out of the sequence length, skip it.
        if (cache_seqlens_[batch_id] / config_.block_len < block_id) {
          return;
        }
        int block_idx = block_table_after_retrieval_kvhead_[batch_id][block_id][head_id];
        if (cache_seqlens_[batch_id] / config_.block_len == block_id) {
          int seq_len = cache_seqlens_[batch_id] % config_.block_len;
          if (seq_len == 0) return;

          // Prepare the attention mask for the last block.
          int full_blocks = seq_len / 8;
          int remaining_bits = seq_len % 8;
          // Fill full blocks with 1s
          for (int i = 0; i < full_blocks; ++i) {
            thread_local_attn_mask_[thread_id][i] = 0xFF;
          }
          // Fill the remaining bits in the next block
          if (remaining_bits > 0 && full_blocks < seq_len_ / 8) {
            thread_local_attn_mask_[thread_id][full_blocks] = (1 << remaining_bits) - 1;
          } else {
            thread_local_attn_mask_[thread_id][full_blocks] = 0;
          }

          for (int i = full_blocks + 1; i < seq_len_ / 8; ++i) {
            thread_local_attn_mask_[thread_id][i] = 0;
          }
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            attn_with_kvcache_one_block_(config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                                         (void*)&q_in_data[batch_id * config_.kv_head_num * n_gqa_ * config_.head_dim +
                                                           head_id * n_gqa_ * config_.head_dim],
                                         seq_len_, 0, false, thread_local_attn_mask_[thread_id].data(), GGML_TYPE_F16,
                                         0, k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                                         GGML_TYPE_F16, 1, v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                                         nullptr, nullptr, thread_local_attn_score_[thread_id].data(),
                                         thread_local_output_fp32_[thread_id].data(),
                                         thread_local_attn_lse_[thread_id].data(),
                                         thread_local_draft_[thread_id].data(), nullptr, cos_.data(), sin_.data());
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            attn_with_kvcache_one_block_(
                config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                q_q8_0_[batch_id][head_id].data(), seq_len_, 0, false, thread_local_attn_mask_[thread_id].data(),
                GGML_TYPE_Q4_0, 0, k_cache_q4[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                GGML_TYPE_Q4_0, 1, v_cache_q4[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                thread_local_attn_score_[thread_id].data(), thread_local_output_q8_0_[thread_id].data(),
                thread_local_attn_lse_[thread_id].data(), thread_local_draft_[thread_id].data(), nullptr, cos_.data(),
                sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            attn_with_kvcache_one_block_(
                config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                q_q8_0_[batch_id][head_id].data(), seq_len_, 0, false, thread_local_attn_mask_[thread_id].data(),
                GGML_TYPE_Q8_0, 0, k_cache_q8[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                GGML_TYPE_Q8_0, 1, v_cache_q8[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                thread_local_attn_score_[thread_id].data(), thread_local_output_q8_0_[thread_id].data(),
                thread_local_attn_lse_[thread_id].data(), thread_local_draft_[thread_id].data(), nullptr, cos_.data(),
                sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          }
        } else {
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            attn_with_kvcache_one_block_(
                config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                (void*)&q_in_data[batch_id * config_.kv_head_num * n_gqa_ * config_.head_dim +
                                  head_id * n_gqa_ * config_.head_dim],
                seq_len_, 0, true, nullptr, GGML_TYPE_F16, 0, k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                nullptr, nullptr, GGML_TYPE_F16, 1, v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0, nullptr,
                nullptr, thread_local_attn_score_[thread_id].data(), thread_local_output_fp32_[thread_id].data(),
                thread_local_attn_lse_[thread_id].data(), thread_local_draft_[thread_id].data(), nullptr, cos_.data(),
                sin_.data());

          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            attn_with_kvcache_one_block_(config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                                         q_q8_0_[batch_id][head_id].data(), seq_len_, 0, true, nullptr, GGML_TYPE_Q4_0,
                                         0, k_cache_q4[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                                         GGML_TYPE_Q4_0, 1, v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                                         nullptr, nullptr, thread_local_attn_score_[thread_id].data(),
                                         thread_local_output_q8_0_[thread_id].data(),
                                         thread_local_attn_lse_[thread_id].data(),
                                         thread_local_draft_[thread_id].data(), nullptr, cos_.data(), sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            attn_with_kvcache_one_block_(config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                                         q_q8_0_[batch_id][head_id].data(), seq_len_, 0, true, nullptr, GGML_TYPE_Q8_0,
                                         0, k_cache_q8[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                                         GGML_TYPE_Q8_0, 1, v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                                         nullptr, nullptr, thread_local_attn_score_[thread_id].data(),
                                         thread_local_output_q8_0_[thread_id].data(),
                                         thread_local_attn_lse_[thread_id].data(),
                                         thread_local_draft_[thread_id].data(), nullptr, cos_.data(), sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          }
        }
        int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
        int cur_head_id = thread_cur_head_idx_[thread_id].second;
        if (batch_id == cur_batch_idx && head_id == cur_head_id) {
          for (int i = 0; i < n_gqa_; i++) {
            float new_attn_lse = thread_local_cur_attn_lse_[thread_id][i] +
                                 std::log(1.0 + std::exp(thread_local_attn_lse_[thread_id][i] -
                                                         thread_local_cur_attn_lse_[thread_id][i]));
            ggml_vec_scale_f32(config_.head_dim, thread_local_cur_output_fp32_[thread_id].data() + i * config_.head_dim,
                               std::exp(thread_local_cur_attn_lse_[thread_id][i] - new_attn_lse));
            ggml_vec_scale_f32(config_.head_dim, thread_local_output_fp32_[thread_id].data() + i * config_.head_dim,
                               std::exp(thread_local_attn_lse_[thread_id][i] - new_attn_lse));
            for (int j = 0; j < config_.head_dim; j++) {
              thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j] +=
                  thread_local_output_fp32_[thread_id][i * config_.head_dim + j];
            }
            thread_local_cur_attn_lse_[thread_id][i] = new_attn_lse;
          }
        } else {
          if (cur_batch_idx != -1) {
            mutex_[cur_batch_idx][cur_head_id]->lock();
            for (int i = 0; i < n_gqa_; i++) {
              if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) < 1e-6) {
                attn_lse_[cur_batch_idx][cur_head_id][i] = thread_local_cur_attn_lse_[thread_id][i];
                for (int j = 0; j < config_.head_dim; j++) {
                  output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] =
                      thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
                }
                continue;
              }
              float new_attn_lse = attn_lse_[cur_batch_idx][cur_head_id][i] +
                                   std::log(1.0 + std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                                           attn_lse_[cur_batch_idx][cur_head_id][i]));
              ggml_vec_scale_f32(config_.head_dim,
                                 output_fp32_[cur_batch_idx][cur_head_id].data() + i * config_.head_dim,
                                 std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] - new_attn_lse));
              ggml_vec_scale_f32(config_.head_dim,
                                 thread_local_cur_output_fp32_[thread_id].data() + i * config_.head_dim,
                                 std::exp(thread_local_cur_attn_lse_[thread_id][i] - new_attn_lse));
              for (int j = 0; j < config_.head_dim; j++) {
                output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] +=
                    thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
              }
              attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
            }
            mutex_[cur_batch_idx][cur_head_id]->unlock();
          }
          thread_cur_head_idx_[thread_id].first = batch_id;
          thread_cur_head_idx_[thread_id].second = head_id;
          for (int i = 0; i < n_gqa_; i++) {
            thread_local_cur_attn_lse_[thread_id][i] = thread_local_attn_lse_[thread_id][i];
            for (int j = 0; j < config_.head_dim; j++) {
              thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j] =
                  thread_local_output_fp32_[thread_id][i * config_.head_dim + j];
            }
          }
        }
      },
      // Merge the results of the remaining blocks.
      [&](int thread_id) {
        int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
        int cur_head_id = thread_cur_head_idx_[thread_id].second;
        if (cur_head_id != -1) {
          mutex_[cur_batch_idx][cur_head_id]->lock();
          for (int i = 0; i < n_gqa_; i++) {
            float new_attn_lse;
            if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) < 1e-6) {
              attn_lse_[cur_batch_idx][cur_head_id][i] = thread_local_cur_attn_lse_[thread_id][i];
              for (int j = 0; j < config_.head_dim; j++) {
                output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] =
                    thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
              }
              continue;
            }
            new_attn_lse = attn_lse_[cur_batch_idx][cur_head_id][i] +
                           std::log(1.0 + std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                                   attn_lse_[cur_batch_idx][cur_head_id][i]));
            ggml_vec_scale_f32(config_.head_dim, output_fp32_[cur_batch_idx][cur_head_id].data() + i * config_.head_dim,
                               std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] - new_attn_lse));
            ggml_vec_scale_f32(config_.head_dim, thread_local_cur_output_fp32_[thread_id].data() + i * config_.head_dim,
                               std::exp(thread_local_cur_attn_lse_[thread_id][i] - new_attn_lse));
            for (int j = 0; j < config_.head_dim; j++) {
              output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] +=
                  thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
            }
            attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
          }
          mutex_[cur_batch_idx][cur_head_id]->unlock();
        }
      });
  // move the results to output and attn_lse
  uint16_t* output_data = reinterpret_cast<uint16_t*>(output);
  float* attn_lse_data = attn_lse;
  for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
    for (int i = 0; i < config_.kv_head_num; i++) {
      for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
        output_data[batch_idx * config_.kv_head_num * n_gqa_ * config_.head_dim + i * n_gqa_ * config_.head_dim + j] =
            GGML_FP32_TO_FP16(output_fp32_[batch_idx][i][j]);
      }
      for (int j = 0; j < n_gqa_; j++) {
        attn_lse_data[batch_idx * config_.kv_head_num * n_gqa_ + i * n_gqa_ + j] = attn_lse_[batch_idx][i][j];
      }
    }
  }

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  // printf("layer %d time of computing attention: %f s\n", layer_idx,
  //        diff.count());
}

void KVCache::attention_layer_(const uint16_t* q_in_data, ggml_fp16_t* output, float* attn_lse, int batch_size,
                               WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  seq_len_ = config_.block_len;
  backend->do_work_stealing_job(
      batch_size * config_.kv_head_num * max_block_num_after_retrieval_,
      [&](int thread_id) {
        thread_cur_head_idx_[thread_id].first = -1;
        thread_cur_head_idx_[thread_id].second = -1;
      },
      [&](int task_id) {
        int batch_id = task_id / (config_.kv_head_num * max_block_num_after_retrieval_);
        int head_id =
            (task_id % (config_.kv_head_num * max_block_num_after_retrieval_)) / max_block_num_after_retrieval_;
        int block_id = task_id % max_block_num_after_retrieval_;
        int thread_id = WorkerPool::thread_local_id;
        // If the block is out of the sequence length, skip it.
        if (cache_seqlens_[batch_id] / config_.block_len < block_id) {
          return;
        }
        int block_idx = block_table_after_retrieval_[batch_id][block_id];
        if (cache_seqlens_[batch_id] / config_.block_len == block_id) {
          int seq_len = cache_seqlens_[batch_id] % config_.block_len;
          if (seq_len == 0) return;

          // Prepare the attention mask for the last block.
          int full_blocks = seq_len / 8;
          int remaining_bits = seq_len % 8;

          // Fill full blocks with 1s
          for (int i = 0; i < full_blocks; ++i) {
            thread_local_attn_mask_[thread_id][i] = 0xFF;
          }
          // Fill the remaining bits in the next block
          if (remaining_bits > 0 && full_blocks < seq_len_ / 8) {
            thread_local_attn_mask_[thread_id][full_blocks] = (1 << remaining_bits) - 1;
          } else {
            thread_local_attn_mask_[thread_id][full_blocks] = 0;
          }

          for (int i = full_blocks + 1; i < seq_len_ / 8; ++i) {
            thread_local_attn_mask_[thread_id][i] = 0;
          }
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            attn_with_kvcache_one_block_(config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                                         (void*)&q_in_data[batch_id * config_.kv_head_num * n_gqa_ * config_.head_dim +
                                                           head_id * n_gqa_ * config_.head_dim],
                                         seq_len_, 0, false, thread_local_attn_mask_[thread_id].data(), GGML_TYPE_F16,
                                         0, k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                                         GGML_TYPE_F16, 1, v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                                         nullptr, nullptr, thread_local_attn_score_[thread_id].data(),
                                         thread_local_output_fp32_[thread_id].data(),
                                         thread_local_attn_lse_[thread_id].data(),
                                         thread_local_draft_[thread_id].data(), nullptr, cos_.data(), sin_.data());
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            attn_with_kvcache_one_block_(
                config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                q_q8_0_[batch_id][head_id].data(), seq_len_, 0, false, thread_local_attn_mask_[thread_id].data(),
                GGML_TYPE_Q4_0, 0, k_cache_q4[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                GGML_TYPE_Q4_0, 1, v_cache_q4[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                thread_local_attn_score_[thread_id].data(), thread_local_output_q8_0_[thread_id].data(),
                thread_local_attn_lse_[thread_id].data(), thread_local_draft_[thread_id].data(), nullptr, cos_.data(),
                sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            attn_with_kvcache_one_block_(
                config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                q_q8_0_[batch_id][head_id].data(), seq_len_, 0, false, thread_local_attn_mask_[thread_id].data(),
                GGML_TYPE_Q8_0, 0, k_cache_q8[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                GGML_TYPE_Q8_0, 1, v_cache_q8[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                thread_local_attn_score_[thread_id].data(), thread_local_output_q8_0_[thread_id].data(),
                thread_local_attn_lse_[thread_id].data(), thread_local_draft_[thread_id].data(), nullptr, cos_.data(),
                sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          }
        } else {
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            attn_with_kvcache_one_block_(
                config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                (void*)&q_in_data[batch_id * config_.kv_head_num * n_gqa_ * config_.head_dim +
                                  head_id * n_gqa_ * config_.head_dim],
                seq_len_, 0, true, nullptr, GGML_TYPE_F16, 0, k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                nullptr, nullptr, GGML_TYPE_F16, 1, v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0, nullptr,
                nullptr, thread_local_attn_score_[thread_id].data(), thread_local_output_fp32_[thread_id].data(),
                thread_local_attn_lse_[thread_id].data(), thread_local_draft_[thread_id].data(), nullptr, cos_.data(),
                sin_.data());

          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            attn_with_kvcache_one_block_(config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                                         q_q8_0_[batch_id][head_id].data(), seq_len_, 0, true, nullptr, GGML_TYPE_Q4_0,
                                         0, k_cache_q4[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                                         GGML_TYPE_Q4_0, 1, v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                                         nullptr, nullptr, thread_local_attn_score_[thread_id].data(),
                                         thread_local_output_q8_0_[thread_id].data(),
                                         thread_local_attn_lse_[thread_id].data(),
                                         thread_local_draft_[thread_id].data(), nullptr, cos_.data(), sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            attn_with_kvcache_one_block_(config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                                         q_q8_0_[batch_id][head_id].data(), seq_len_, 0, true, nullptr, GGML_TYPE_Q8_0,
                                         0, k_cache_q8[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                                         GGML_TYPE_Q8_0, 1, v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                                         nullptr, nullptr, thread_local_attn_score_[thread_id].data(),
                                         thread_local_output_q8_0_[thread_id].data(),
                                         thread_local_attn_lse_[thread_id].data(),
                                         thread_local_draft_[thread_id].data(), nullptr, cos_.data(), sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          }
        }
        int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
        int cur_head_id = thread_cur_head_idx_[thread_id].second;
        if (batch_id == cur_batch_idx && head_id == cur_head_id) {
          for (int i = 0; i < n_gqa_; i++) {
            float new_attn_lse = thread_local_cur_attn_lse_[thread_id][i] +
                                 std::log(1.0 + std::exp(thread_local_attn_lse_[thread_id][i] -
                                                         thread_local_cur_attn_lse_[thread_id][i]));
            ggml_vec_scale_f32(config_.head_dim, thread_local_cur_output_fp32_[thread_id].data() + i * config_.head_dim,
                               std::exp(thread_local_cur_attn_lse_[thread_id][i] - new_attn_lse));
            ggml_vec_scale_f32(config_.head_dim, thread_local_output_fp32_[thread_id].data() + i * config_.head_dim,
                               std::exp(thread_local_attn_lse_[thread_id][i] - new_attn_lse));
            for (int j = 0; j < config_.head_dim; j++) {
              thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j] +=
                  thread_local_output_fp32_[thread_id][i * config_.head_dim + j];
            }
            thread_local_cur_attn_lse_[thread_id][i] = new_attn_lse;
          }
        } else {
          if (cur_batch_idx != -1) {
            mutex_[cur_batch_idx][cur_head_id]->lock();
            for (int i = 0; i < n_gqa_; i++) {
              if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) < 1e-6) {
                attn_lse_[cur_batch_idx][cur_head_id][i] = thread_local_cur_attn_lse_[thread_id][i];
                for (int j = 0; j < config_.head_dim; j++) {
                  output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] =
                      thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
                }
                continue;
              }
              float new_attn_lse = attn_lse_[cur_batch_idx][cur_head_id][i] +
                                   std::log(1.0 + std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                                           attn_lse_[cur_batch_idx][cur_head_id][i]));
              ggml_vec_scale_f32(config_.head_dim,
                                 output_fp32_[cur_batch_idx][cur_head_id].data() + i * config_.head_dim,
                                 std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] - new_attn_lse));
              ggml_vec_scale_f32(config_.head_dim,
                                 thread_local_cur_output_fp32_[thread_id].data() + i * config_.head_dim,
                                 std::exp(thread_local_cur_attn_lse_[thread_id][i] - new_attn_lse));
              for (int j = 0; j < config_.head_dim; j++) {
                output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] +=
                    thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
              }
              attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
            }
            mutex_[cur_batch_idx][cur_head_id]->unlock();
          }
          thread_cur_head_idx_[thread_id].first = batch_id;
          thread_cur_head_idx_[thread_id].second = head_id;
          for (int i = 0; i < n_gqa_; i++) {
            thread_local_cur_attn_lse_[thread_id][i] = thread_local_attn_lse_[thread_id][i];
            for (int j = 0; j < config_.head_dim; j++) {
              thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j] =
                  thread_local_output_fp32_[thread_id][i * config_.head_dim + j];
            }
          }
        }
      },
      // Merge the results of the remaining blocks.
      [&](int thread_id) {
        int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
        int cur_head_id = thread_cur_head_idx_[thread_id].second;
        if (cur_head_id != -1) {
          mutex_[cur_batch_idx][cur_head_id]->lock();
          for (int i = 0; i < n_gqa_; i++) {
            float new_attn_lse;
            if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) < 1e-6) {
              attn_lse_[cur_batch_idx][cur_head_id][i] = thread_local_cur_attn_lse_[thread_id][i];
              for (int j = 0; j < config_.head_dim; j++) {
                output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] =
                    thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
              }
              continue;
            }
            new_attn_lse = attn_lse_[cur_batch_idx][cur_head_id][i] +
                           std::log(1.0 + std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                                   attn_lse_[cur_batch_idx][cur_head_id][i]));
            ggml_vec_scale_f32(config_.head_dim, output_fp32_[cur_batch_idx][cur_head_id].data() + i * config_.head_dim,
                               std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] - new_attn_lse));
            ggml_vec_scale_f32(config_.head_dim, thread_local_cur_output_fp32_[thread_id].data() + i * config_.head_dim,
                               std::exp(thread_local_cur_attn_lse_[thread_id][i] - new_attn_lse));
            for (int j = 0; j < config_.head_dim; j++) {
              output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] +=
                  thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
            }
            attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
          }
          mutex_[cur_batch_idx][cur_head_id]->unlock();
        }
      });

  // move the results to output and attn_lse
  uint16_t* output_data = reinterpret_cast<uint16_t*>(output);
  float* attn_lse_data = attn_lse;
  for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
    for (int i = 0; i < config_.kv_head_num; i++) {
      for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
        output_data[batch_idx * config_.kv_head_num * n_gqa_ * config_.head_dim + i * n_gqa_ * config_.head_dim + j] =
            GGML_FP32_TO_FP16(output_fp32_[batch_idx][i][j]);
      }
      for (int j = 0; j < n_gqa_; j++) {
        attn_lse_data[batch_idx * config_.kv_head_num * n_gqa_ + i * n_gqa_ + j] = attn_lse_[batch_idx][i][j];
      }
    }
  }
  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  //     printf("layer %d time of computing attention: %f s\n", layer_id_,
  //     diff.count());
}

void KVCache::attn(const ggml_fp16_t* q_in, ggml_fp16_t* output, float* attn_lse, int layer_idx, int generate_token_idx,
                   int q_len, int batch_size, int max_block_num, int* block_table, int* cache_seqlens,
                   int pick_block_num, int init_block_num, int local_block_num, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  layer_id_ = layer_idx;
  batch_size = batch_size * q_len;

  const uint16_t* q_in_data = const_cast<const uint16_t*>(q_in);

  quantize_q_(q_in_data, batch_size);
  if (config_.retrieval_type == RetrievalType::LAYER) {
    attn_initialize_layer_(batch_size, layer_idx, block_table, max_block_num, cache_seqlens);
    retrieval_kvcache_layer_(q_in_data, init_block_num, local_block_num, pick_block_num, q_len, generate_token_idx,
                             batch_size, layer_idx, cache_seqlens, max_block_num, backend);
    attention_layer_(q_in_data, output, attn_lse, batch_size, backend);
  } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
    attn_initialize_kvhead_(batch_size, layer_idx, block_table, max_block_num, cache_seqlens);
    retrieval_kvcache_kvhead_(q_in_data, init_block_num, local_block_num, pick_block_num, q_len, generate_token_idx,
                              batch_size, layer_idx, cache_seqlens, max_block_num, backend);
    attention_kvhead_(q_in_data, output, attn_lse, batch_size, backend);
  }

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  // printf("layer %d time of computing attention: %f s\n", layer_idx,
  //        diff.count());
}

void KVCache::attn_with_kvcache(const ggml_fp16_t* q_in, const ggml_fp16_t* k_in, const ggml_fp16_t* v_in,
                                ggml_fp16_t* output, float* attn_lse, int layer_idx, int generate_token_idx, int q_len,
                                int batch_size, int max_block_num, int* block_table, int* cache_seqlens, int topk,
                                int local, WorkerPool* backend) {
  //    printf("attn_with_kvcache start\n");
  assert(q_len == 1);
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  layer_id_ = layer_idx;

  update_kvcache_fp16(k_in, v_in, layer_idx, block_table, batch_size, max_block_num, cache_seqlens, q_len, backend);
  //    printf("update finished.\n");

  // cache_seqlens memory is modified.
  for (int i = 0; i < batch_size; i++) {
    cache_seqlens[i] += q_len;
  }
  int init_block_num = 1;
  if (config_.block_len <= 32) {
    init_block_num = 64 / config_.block_len;
  }

  attn(q_in, output, attn_lse, layer_idx, generate_token_idx, q_len, batch_size, max_block_num, block_table,
       cache_seqlens, topk, init_block_num, local, backend);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  //     printf("layer %d time of computing attention with kvcache: %f s\n",
  //     layer_idx, diff.count());
}

void KVCache::quantize_q_(const uint16_t* q_in_data, int batch_size) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
    if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
      // quantize q
      for (int i = 0; i < config_.kv_head_num; i++) {
        for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
          q_fp32_[batch_idx][i][j] =
              GGML_FP16_TO_FP32(q_in_data[batch_idx * config_.kv_head_num * n_gqa_ * config_.head_dim +
                                          i * n_gqa_ * config_.head_dim + j]);
        }
      }
    } else {
      // quantize q
      for (int i = 0; i < config_.kv_head_num; i++) {
        for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
          q_fp32[j] = GGML_FP16_TO_FP32(q_in_data[batch_idx * config_.kv_head_num * n_gqa_ * config_.head_dim +
                                                  i * n_gqa_ * config_.head_dim + j]);
        }
        quantize_row_q8_0(q_fp32.data(), q_q8_0_[batch_idx][i].data(), n_gqa_ * config_.head_dim);
      }
    }
  }
  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  // printf("time of quantizing q: %f s\n",
  //        std::chrono::duration<double>(end - start).count());
}
void KVCache::attn_initialize_layer_(int batch_size, int layer_idx, int* block_table, int& max_block_num,
                                     int* cache_seqlens) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
    // initialize output_fp32_ and attn_lse_
    for (int i = 0; i < config_.kv_head_num; i++) {
      for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
        output_fp32_[batch_idx][i][j] = 0;
      }
      for (int j = 0; j < n_gqa_; j++) {
        attn_lse_[batch_idx][i][j] = 0;
      }
    }
    // clear top_similar_block_

    while (!top_similar_block_[batch_idx].empty()) top_similar_block_[batch_idx].pop();
  }

  // get block_table_before_retrieval_ and cache_seqlens_
  if (block_table == nullptr) {
    max_block_num = past_block_num_[layer_idx];
    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
      if (cache_total_len_ != 0)
        cache_seqlens_[batch_idx] = cache_total_len_;
      else
        cache_seqlens_[batch_idx] = max_block_num * config_.block_len;
      for (int i = 0; i < max_block_num; i++) {
        block_table_before_retrieval_[batch_idx][i] = i;
        block_similar_[batch_idx][i] = 0;
      }
    }
  } else {
    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
      cache_seqlens_[batch_idx] = cache_seqlens[batch_idx];
      for (int i = 0; i < max_block_num; i++) {
        block_table_before_retrieval_[batch_idx][i] = block_table[batch_idx * max_block_num + i];
        block_similar_[batch_idx][i] = 0;
      }
    }
  }
  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  // printf("layer %d time of initializing attention: %f s\n", layer_idx,
  //        std::chrono::duration<double>(end - start).count());
}

void KVCache::calculate_block_similarity_layer_(const uint16_t* q_in_data, int batch_size, int layer_idx, int q_len,
                                                int max_block_num, int* cache_seqlens, int init_block_num,
                                                int local_block_num, int pick_block_num, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  if (batch_size == 1 && config_.anchor_num == 1) {  // TODO: improve batch_size > 1
    for (int batch_id = 0; batch_id < batch_size; batch_id++) {
      if (q_len == 1) {
        for (int j = 0; j < config_.head_dim * config_.q_head_num; j++) {
          avg_q[batch_id][j] =
              GGML_FP16_TO_FP32(q_in_data[batch_id * q_len * config_.q_head_num * config_.head_dim + j]);
          avg_q_fp16[batch_id][j] = q_in_data[batch_id * q_len * config_.q_head_num * config_.head_dim + j];
        }
      } else {
        for (int j = 0; j < config_.head_dim * config_.q_head_num; j++) {
          avg_q[batch_id][j] = 0;
        }
        for (int i = 0; i < q_len; i++) {
          for (int j = 0; j < config_.head_dim; j++) {
            avg_q[batch_id][j] += GGML_FP16_TO_FP32(q_in_data[batch_id * q_len * config_.q_head_num * config_.head_dim +
                                                              i * config_.q_head_num * config_.head_dim + j]);
          }
        }
        for (int j = 0; j < config_.head_dim * config_.q_head_num; j++) {
          avg_q[batch_id][j] /= q_len;
          avg_q_fp16[batch_id][j] = GGML_FP32_TO_FP16(avg_q[batch_id][j]);
        }
      }
      int seq_len = cache_seqlens_[batch_id];
      int block_num = (seq_len / config_.block_len) - local_block_num - init_block_num;
      if (block_num <= 0) {
        continue;
      }
      bool is_seq = true;
      for (int i = init_block_num + 1; i < (seq_len / config_.block_len) - local_block_num; i++) {
        if (block_table_before_retrieval_[batch_id][i] != block_table_before_retrieval_[batch_id][i - 1] + 1) {
          is_seq = false;
          break;
        }
      }
      if (is_seq) {
        int nth = backend->get_thread_num();
        backend->do_work_stealing_job(
            nth, nullptr,
            [&](int task_id) {
              int ith = task_id;
              bool ok = llamafile_sgemm(
                  block_num, 1, config_.q_head_num * config_.head_dim,
                  anchor_.data() +
                      (layer_idx * config_.max_block_num + block_table_before_retrieval_[batch_id][init_block_num]) *
                          config_.anchor_num * config_.q_head_num * config_.head_dim,
                  config_.q_head_num * config_.head_dim, avg_q_fp16[batch_id].data(),
                  config_.q_head_num * config_.head_dim, block_similar_[batch_id].data() + init_block_num, block_num,
                  ith, nth, GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F16, GGML_TYPE_F16, GGML_TYPE_F32, GGML_PREC_DEFAULT);
              if (!ok) {
                printf("llamafile_sgemm failed\n");
              }
            },
            nullptr);
      } else {
        backend->do_work_stealing_job(
            block_num, nullptr,
            [&](int task_id) {
              int block_id = task_id + init_block_num;
              int block_idx = block_table_before_retrieval_[batch_id][block_id];
              bool ok = llamafile_sgemm(
                  1, 1, config_.q_head_num * config_.head_dim,
                  anchor_.data() +
                      (layer_idx * config_.max_block_num + block_table_before_retrieval_[batch_id][block_idx]) *
                          config_.anchor_num * config_.q_head_num * config_.head_dim,
                  config_.q_head_num * config_.head_dim, avg_q_fp16[batch_id].data(),
                  config_.q_head_num * config_.head_dim, block_similar_[batch_id].data() + block_id, 1, 0, 1,
                  GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F16, GGML_TYPE_F16, GGML_TYPE_F32, GGML_PREC_DEFAULT);
              if (!ok) {
                printf("llamafile_sgemm failed\n");
              }
            },
            nullptr);
      }
    }
  } else {
    backend->do_work_stealing_job(
        batch_size * max_block_num, nullptr,
        [&](int task_id) {
          int batch_id = task_id / max_block_num;
          int block_id = task_id % max_block_num;
          int seq_len = cache_seqlens_[batch_id];

          if (block_id < init_block_num || block_id >= (seq_len / config_.block_len) - local_block_num) {
            return;
          }

          int block_idx = block_table_before_retrieval_[batch_id][block_id];
          float sim = 0;

          for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
            for (int i = 0; i < config_.head_dim; i++) {
              float q_i = 0, qa_i = std::numeric_limits<float>::lowest();
              for (int q_id = 0; q_id < q_len; q_id++) {
                q_i += GGML_FP16_TO_FP32(
                    q_in_data[batch_id * q_len * config_.q_head_num * config_.head_dim +
                              q_id * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + i]);
              }
              q_i /= q_len;
              for (int anchor_id = 0; anchor_id < config_.anchor_num; anchor_id++) {
                qa_i = std::max(
                    qa_i,
                    GGML_FP16_TO_FP32(
                        anchor_[(long long)layer_idx * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                    config_.head_dim +
                                block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                                anchor_id * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + i]) *
                        q_i);
              }
              sim += qa_i;
            }
          }
          block_similar_[batch_id][block_id] = sim;
        },
        nullptr);
  }
  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  // printf("layer %d time of calculating similarity: %f s\n", layer_idx,
  //        diff.count());
}

void KVCache::select_block_layer_(int batch_size, int layer_idx, int max_block_num, int init_block_num,
                                  int local_block_num, int pick_block_num) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
    if (cache_seqlens_[batch_idx] / config_.block_len <= init_block_num + pick_block_num + local_block_num) {
      block_table_after_retrieval_[batch_idx].swap(block_table_before_retrieval_[batch_idx]);
      selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step] = 0;
      continue;
    }

    for (int block_id = init_block_num; block_id < (cache_seqlens_[batch_idx] / config_.block_len) - local_block_num;
         block_id++) {
      top_similar_block_[batch_idx].push(
          std::make_pair(block_similar_[batch_idx][block_id], block_table_before_retrieval_[batch_idx][block_id]));
      if (top_similar_block_[batch_idx].size() > pick_block_num) {
        top_similar_block_[batch_idx].pop();
      }
    }

    int i = 0;
    for (; i < init_block_num; i++) {
      block_table_after_retrieval_[batch_idx][i] = block_table_before_retrieval_[batch_idx][i];
    }
    while (!top_similar_block_[batch_idx].empty()) {
      block_table_after_retrieval_[batch_idx][i] = top_similar_block_[batch_idx].top().second;
      top_similar_block_[batch_idx].pop();
      i++;
    }
    for (; i < init_block_num + pick_block_num + local_block_num; i++) {
      block_table_after_retrieval_[batch_idx][i] =
          block_table_before_retrieval_[batch_idx][(cache_seqlens_[batch_idx] / config_.block_len) - local_block_num +
                                                   i - init_block_num - pick_block_num];
    }
    if (cache_seqlens_[batch_idx] % config_.block_len != 0) {
      block_table_after_retrieval_[batch_idx][i] =
          block_table_before_retrieval_[batch_idx][(cache_seqlens_[batch_idx] / config_.block_len)];
      cache_seqlens_[batch_idx] = (cache_seqlens_[batch_idx] % config_.block_len) + i * config_.block_len;
      i++;
    } else {
      cache_seqlens_[batch_idx] = (cache_seqlens_[batch_idx] % config_.block_len) + i * config_.block_len;
    }
    for (int j = 0; j < i; j++) {
      selected_blocks_history_[(layer_idx - config_.layer_offset) / config_.layer_step][batch_idx][j] =
          block_table_after_retrieval_[batch_idx][j];
    }
    selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step] = i;
  }

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  // printf("layer %d time of selecting blocks: %f s\n", layer_idx,
  //        diff.count());
}

// retrieval kvcache, get the init_block_num block at beginning, top
// pick_block_num similar and last local_block_num blocks. Each task
// calculates the simlarity of a certain block with the query, then push
// the block into the priority queue. Finally, the required blocks are
// pushed into the block_table_after_retrieval_.
void KVCache::retrieval_kvcache_layer_(const uint16_t* q_in_data, int init_block_num, int local_block_num,
                                       int pick_block_num, int q_len, int generate_token_idx, int batch_size,
                                       int layer_idx, int* cache_seqlens, int& max_block_num, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  max_block_num_after_retrieval_ = 0;
  if (pick_block_num != -1 &&
      (generate_token_idx % config_.token_step != 0 || (layer_idx % config_.layer_step != config_.layer_offset))) {
    if (selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step] == 0) {
      max_block_num_after_retrieval_ = max_block_num;
      block_table_after_retrieval_.swap(block_table_before_retrieval_);
    } else {
      max_block_num_after_retrieval_ =
          selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step];
      for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        for (int i = 0; i < max_block_num_after_retrieval_; i++) {
          block_table_after_retrieval_[batch_idx][i] =
              selected_blocks_history_[(layer_idx - config_.layer_offset) / config_.layer_step][batch_idx][i];
        }

        if (cache_seqlens[batch_idx] % config_.block_len == 1) {
          selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step] += 1;
          int x = selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step];
          int last_block_idx = block_table_before_retrieval_[batch_idx][cache_seqlens[batch_idx] / config_.block_len];
          selected_blocks_history_[(layer_idx - config_.layer_offset) / config_.layer_step][batch_idx][x - 1] =
              last_block_idx;
          block_table_after_retrieval_[batch_idx][x - 1] = last_block_idx;
        }
        cache_seqlens_[batch_idx] =
            (cache_seqlens_[batch_idx] % config_.block_len) +
            selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step] * config_.block_len -
            config_.block_len;
      }
    }
  } else if (pick_block_num != -1) {
    max_block_num_after_retrieval_ = std::min(max_block_num, init_block_num + pick_block_num + local_block_num + 1);
    calculate_block_similarity_layer_(q_in_data, batch_size, layer_idx, q_len, max_block_num, cache_seqlens,
                                      init_block_num, local_block_num, pick_block_num, backend);
    select_block_layer_(batch_size, layer_idx, max_block_num, init_block_num, local_block_num, pick_block_num);
  } else {
    selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step] = 0;
    max_block_num_after_retrieval_ = max_block_num;
    block_table_after_retrieval_.swap(block_table_before_retrieval_);
  }

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  //     printf("layer %d time of retrieval kvcache: %f s\n", layer_idx,
  //     std::chrono::duration<double>(end - start).count());
}
void KVCache::calculate_sparsity_layer_(const uint16_t* q_in_data, float* attn_sparsity, int batch_size,
                                        int max_block_num, int* block_table, int* cache_seqlens, WorkerPool* backend

) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  seq_len_ = config_.block_len;
  backend->do_work_stealing_job(
      batch_size * config_.kv_head_num * max_block_num,
      [&](int thread_id) {
        thread_cur_head_idx_[thread_id].first = -1;
        thread_cur_head_idx_[thread_id].second = -1;
      },
      [&](int task_id) {
        int batch_id = task_id / (config_.kv_head_num * max_block_num);
        int head_id = (task_id % (config_.kv_head_num * max_block_num)) / max_block_num;
        int block_id = task_id % max_block_num;
        int thread_id = WorkerPool::thread_local_id;
        // If the block is out of the sequence length, skip it.
        if (cache_seqlens[batch_id] / config_.block_len < block_id) {
          return;
        }
        int block_idx = block_table[batch_id * max_block_num + block_id];
        if (cache_seqlens_[batch_id] / config_.block_len == block_id) {
          int seq_len = cache_seqlens_[batch_id] % config_.block_len;
          if (seq_len == 0) return;

          // Prepare the attention mask for the last block.
          int full_blocks = seq_len / 8;
          int remaining_bits = seq_len % 8;
          // Fill full blocks with 1s
          for (int i = 0; i < full_blocks; ++i) {
            thread_local_attn_mask_[thread_id][i] = 0xFF;
          }
          // Fill the remaining bits in the next block
          if (remaining_bits > 0 && full_blocks < seq_len_ / 8) {
            thread_local_attn_mask_[thread_id][full_blocks] = (1 << remaining_bits) - 1;
          } else {
            thread_local_attn_mask_[thread_id][full_blocks] = 0;
          }

          for (int i = full_blocks + 1; i < seq_len_ / 8; ++i) {
            thread_local_attn_mask_[thread_id][i] = 0;
          }
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            attn_with_kvcache_one_block_(config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                                         (void*)&q_in_data[batch_id * config_.kv_head_num * n_gqa_ * config_.head_dim +
                                                           head_id * n_gqa_ * config_.head_dim],
                                         seq_len_, 0, false, thread_local_attn_mask_[thread_id].data(), GGML_TYPE_F16,
                                         0, k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                                         GGML_TYPE_F16, 1, v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                                         nullptr, nullptr, thread_local_attn_score_[thread_id].data(),
                                         thread_local_output_fp32_[thread_id].data(),
                                         thread_local_attn_lse_[thread_id].data(),
                                         thread_local_draft_[thread_id].data(), nullptr, cos_.data(), sin_.data());
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            attn_with_kvcache_one_block_(
                config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                q_q8_0_[batch_id][head_id].data(), seq_len_, 0, false, thread_local_attn_mask_[thread_id].data(),
                GGML_TYPE_Q4_0, 0, k_cache_q4[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                GGML_TYPE_Q4_0, 1, v_cache_q4[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                thread_local_attn_score_[thread_id].data(), thread_local_output_q8_0_[thread_id].data(),
                thread_local_attn_lse_[thread_id].data(), thread_local_draft_[thread_id].data(), nullptr, cos_.data(),
                sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            attn_with_kvcache_one_block_(
                config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                q_q8_0_[batch_id][head_id].data(), seq_len_, 0, false, thread_local_attn_mask_[thread_id].data(),
                GGML_TYPE_Q8_0, 0, k_cache_q8[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                GGML_TYPE_Q8_0, 1, v_cache_q8[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                thread_local_attn_score_[thread_id].data(), thread_local_output_q8_0_[thread_id].data(),
                thread_local_attn_lse_[thread_id].data(), thread_local_draft_[thread_id].data(), nullptr, cos_.data(),
                sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          }
        } else {
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            attn_with_kvcache_one_block_(
                config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                (void*)&q_in_data[batch_id * config_.kv_head_num * n_gqa_ * config_.head_dim +
                                  head_id * n_gqa_ * config_.head_dim],
                seq_len_, 0, true, nullptr, GGML_TYPE_F16, 0, k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                nullptr, nullptr, GGML_TYPE_F16, 1, v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0, nullptr,
                nullptr, thread_local_attn_score_[thread_id].data(), thread_local_output_fp32_[thread_id].data(),
                thread_local_attn_lse_[thread_id].data(), thread_local_draft_[thread_id].data(), nullptr, cos_.data(),
                sin_.data());

          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            attn_with_kvcache_one_block_(config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                                         q_q8_0_[batch_id][head_id].data(), seq_len_, 0, true, nullptr, GGML_TYPE_Q4_0,
                                         0, k_cache_q4[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                                         GGML_TYPE_Q4_0, 1, v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                                         nullptr, nullptr, thread_local_attn_score_[thread_id].data(),
                                         thread_local_output_q8_0_[thread_id].data(),
                                         thread_local_attn_lse_[thread_id].data(),
                                         thread_local_draft_[thread_id].data(), nullptr, cos_.data(), sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            attn_with_kvcache_one_block_(config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                                         q_q8_0_[batch_id][head_id].data(), seq_len_, 0, true, nullptr, GGML_TYPE_Q8_0,
                                         0, k_cache_q8[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                                         GGML_TYPE_Q8_0, 1, v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                                         nullptr, nullptr, thread_local_attn_score_[thread_id].data(),
                                         thread_local_output_q8_0_[thread_id].data(),
                                         thread_local_attn_lse_[thread_id].data(),
                                         thread_local_draft_[thread_id].data(), nullptr, cos_.data(), sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          }
        }
        for (int i = 0; i < n_gqa_; i++) {
          block_lse_[batch_id][block_idx][head_id * n_gqa_ + i] = thread_local_attn_lse_[thread_id][i];
        }
        int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
        int cur_head_id = thread_cur_head_idx_[thread_id].second;
        if (batch_id == cur_batch_idx && head_id == cur_head_id) {
          for (int i = 0; i < n_gqa_; i++) {
            float new_attn_lse = thread_local_cur_attn_lse_[thread_id][i] +
                                 std::log(1.0 + std::exp(thread_local_attn_lse_[thread_id][i] -
                                                         thread_local_cur_attn_lse_[thread_id][i]));
            ggml_vec_scale_f32(config_.head_dim, thread_local_cur_output_fp32_[thread_id].data() + i * config_.head_dim,
                               std::exp(thread_local_cur_attn_lse_[thread_id][i] - new_attn_lse));
            ggml_vec_scale_f32(config_.head_dim, thread_local_output_fp32_[thread_id].data() + i * config_.head_dim,
                               std::exp(thread_local_attn_lse_[thread_id][i] - new_attn_lse));
            for (int j = 0; j < config_.head_dim; j++) {
              thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j] +=
                  thread_local_output_fp32_[thread_id][i * config_.head_dim + j];
            }
            thread_local_cur_attn_lse_[thread_id][i] = new_attn_lse;
          }
        } else {
          if (cur_batch_idx != -1) {
            mutex_[cur_batch_idx][cur_head_id]->lock();
            for (int i = 0; i < n_gqa_; i++) {
              if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) < 1e-6) {
                attn_lse_[cur_batch_idx][cur_head_id][i] = thread_local_cur_attn_lse_[thread_id][i];
                for (int j = 0; j < config_.head_dim; j++) {
                  output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] =
                      thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
                }
                continue;
              }
              float new_attn_lse = attn_lse_[cur_batch_idx][cur_head_id][i] +
                                   std::log(1.0 + std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                                           attn_lse_[cur_batch_idx][cur_head_id][i]));
              ggml_vec_scale_f32(config_.head_dim,
                                 output_fp32_[cur_batch_idx][cur_head_id].data() + i * config_.head_dim,
                                 std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] - new_attn_lse));
              ggml_vec_scale_f32(config_.head_dim,
                                 thread_local_cur_output_fp32_[thread_id].data() + i * config_.head_dim,
                                 std::exp(thread_local_cur_attn_lse_[thread_id][i] - new_attn_lse));
              for (int j = 0; j < config_.head_dim; j++) {
                output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] +=
                    thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
              }
              attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
            }
            mutex_[cur_batch_idx][cur_head_id]->unlock();
          }
          thread_cur_head_idx_[thread_id].first = batch_id;
          thread_cur_head_idx_[thread_id].second = head_id;
          for (int i = 0; i < n_gqa_; i++) {
            thread_local_cur_attn_lse_[thread_id][i] = thread_local_attn_lse_[thread_id][i];
            for (int j = 0; j < config_.head_dim; j++) {
              thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j] =
                  thread_local_output_fp32_[thread_id][i * config_.head_dim + j];
            }
          }
        }
      },
      // Merge the results of the remaining blocks.
      [&](int thread_id) {
        int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
        int cur_head_id = thread_cur_head_idx_[thread_id].second;
        if (cur_head_id != -1) {
          mutex_[cur_batch_idx][cur_head_id]->lock();
          for (int i = 0; i < n_gqa_; i++) {
            float new_attn_lse;
            if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) < 1e-6) {
              attn_lse_[cur_batch_idx][cur_head_id][i] = thread_local_cur_attn_lse_[thread_id][i];
              for (int j = 0; j < config_.head_dim; j++) {
                output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] =
                    thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
              }
              continue;
            }
            new_attn_lse = attn_lse_[cur_batch_idx][cur_head_id][i] +
                           std::log(1.0 + std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                                   attn_lse_[cur_batch_idx][cur_head_id][i]));
            ggml_vec_scale_f32(config_.head_dim, output_fp32_[cur_batch_idx][cur_head_id].data() + i * config_.head_dim,
                               std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] - new_attn_lse));
            ggml_vec_scale_f32(config_.head_dim, thread_local_cur_output_fp32_[thread_id].data() + i * config_.head_dim,
                               std::exp(thread_local_cur_attn_lse_[thread_id][i] - new_attn_lse));
            for (int j = 0; j < config_.head_dim; j++) {
              output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] +=
                  thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
            }
            attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
          }
          mutex_[cur_batch_idx][cur_head_id]->unlock();
        }
      });

  for (int i = 0; i < batch_size; i++) {
    for (int j = 0; j < max_block_num_after_retrieval_; j++) {
      int block_idx = block_table_after_retrieval_[i][j];
      for (int k = 0; k < config_.q_head_num; k++) {
        attn_sparsity[i * config_.q_head_num + k] +=
            std::exp(block_lse_[i][block_idx][k] - attn_lse_[i][k / n_gqa_][k % n_gqa_]);
      }
    }
  }

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  // printf("layer %d time of calculating sparsity: %f s\n", layer_id_,
  //        diff.count());
}

void KVCache::attn_initialize_kvhead_(int batch_size, int layer_idx, int* block_table, int& max_block_num,
                                      int* cache_seqlens) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
    // initialize output_fp32_ and attn_lse_
    for (int i = 0; i < config_.kv_head_num; i++) {
      for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
        output_fp32_[batch_idx][i][j] = 0;
      }
      for (int j = 0; j < n_gqa_; j++) {
        attn_lse_[batch_idx][i][j] = 0;
      }
    }

    // clear top_similar_block_
    while (!top_similar_block_[batch_idx].empty()) top_similar_block_[batch_idx].pop();
  }

  for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
    cache_seqlens_[batch_idx] = cache_seqlens[batch_idx];
    for (int i = 0; i < max_block_num; i++) {
      for (int j = 0; j < config_.kv_head_num; j++) {
        block_table_before_retrieval_kvhead_[batch_idx][i][j] = block_table[batch_idx * max_block_num + i];
        block_similar_kv_head_[batch_idx][i][j] = 0;
      }
    }
  }

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  // printf("layer %d time of initializing attn: %f s\n", layer_idx,
  //        std::chrono::duration<double>(end - start).count());
}
void KVCache::retrieval_kvcache_kvhead_(const uint16_t* q_in_data, int init_block_num, int local_block_num,
                                        int pick_block_num, int q_len, int generate_token_idx, int batch_size,
                                        int layer_idx, int* cache_seqlens, int& max_block_num, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  max_block_num_after_retrieval_ = 0;
  if (pick_block_num != -1 &&
      (generate_token_idx % config_.token_step != 0 || (layer_idx % config_.layer_step != config_.layer_offset))) {
    if (selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step] == 0) {
      max_block_num_after_retrieval_ = max_block_num;
      for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        for (int i = 0; i < max_block_num; i++) {
          for (int j = 0; j < config_.kv_head_num; j++) {
            block_table_after_retrieval_kvhead_[batch_idx][i][j] =
                block_table_before_retrieval_kvhead_[batch_idx][i][j];
          }
        }
      }
    } else {
      max_block_num_after_retrieval_ =
          selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step];

      for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        for (int i = 0; i < max_block_num_after_retrieval_; i++) {
          for (int j = 0; j < config_.kv_head_num; j++) {
            block_table_after_retrieval_kvhead_[batch_idx][i][j] =
                selected_blocks_history_kvhead_[(layer_idx - config_.layer_offset) / config_.layer_step][batch_idx][i]
                                               [j];
          }
        }

        if (cache_seqlens[batch_idx] % config_.block_len == 1) {
          selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step] += 1;
          int x = selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step];
          for (int i = 0; i < config_.kv_head_num; i++) {
            int last_block_idx =
                block_table_before_retrieval_kvhead_[batch_idx][cache_seqlens[batch_idx] / config_.block_len][i];
            selected_blocks_history_kvhead_[(layer_idx - config_.layer_offset) / config_.layer_step][batch_idx][x - 1]
                                           [i] = last_block_idx;
            block_table_after_retrieval_kvhead_[batch_idx][x - 1][i] = last_block_idx;
          }
        }
        cache_seqlens_[batch_idx] = std::min(
            cache_seqlens_[batch_idx], (cache_seqlens_[batch_idx] % config_.block_len) +
                                           (init_block_num + pick_block_num + local_block_num) * config_.block_len);
      }
    }
  } else if (pick_block_num != -1) {
    max_block_num_after_retrieval_ = std::min(max_block_num, init_block_num + pick_block_num + local_block_num + 1);
    calculate_block_similarity_kvhead_(q_in_data, batch_size, layer_idx, q_len, max_block_num, cache_seqlens,
                                       init_block_num, local_block_num, pick_block_num, backend);
    select_block_kvhead_(batch_size, layer_idx, max_block_num, init_block_num, local_block_num, pick_block_num);
  } else {
    selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step] = 0;
    max_block_num_after_retrieval_ = max_block_num;
    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
      for (int i = 0; i < max_block_num; i++) {
        for (int j = 0; j < config_.kv_head_num; j++) {
          block_table_after_retrieval_kvhead_[batch_idx][i][j] = block_table_before_retrieval_kvhead_[batch_idx][i][j];
        }
      }
    }
  }

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  // printf("layer %d time of retrieval kvcache: %f s\n", layer_idx,
  //        std::chrono::duration<double>(end - start).count());
}
void KVCache::calculate_sparsity_kvhead_(const uint16_t* q_in_data, float* attn_sparsity, int batch_size,
                                         int max_block_num, int* block_table, int* cache_seqlens, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  seq_len_ = config_.block_len;
  backend->do_work_stealing_job(
      batch_size * config_.kv_head_num * max_block_num,
      [&](int thread_id) {
        thread_cur_head_idx_[thread_id].first = -1;
        thread_cur_head_idx_[thread_id].second = -1;
      },
      [&](int task_id) {
        int batch_id = task_id / (config_.kv_head_num * max_block_num);
        int head_id = (task_id % (config_.kv_head_num * max_block_num)) / max_block_num;
        int block_id = task_id % max_block_num;
        int thread_id = WorkerPool::thread_local_id;
        // If the block is out of the sequence length, skip it.
        if (cache_seqlens[batch_id] / config_.block_len < block_id) {
          return;
        }
        int block_idx = block_table[batch_id * max_block_num + block_id];
        if (cache_seqlens_[batch_id] / config_.block_len == block_id) {
          int seq_len = cache_seqlens_[batch_id] % config_.block_len;
          if (seq_len == 0) return;

          // Prepare the attention mask for the last block.
          int full_blocks = seq_len / 8;
          int remaining_bits = seq_len % 8;

          // Fill full blocks with 1s
          for (int i = 0; i < full_blocks; ++i) {
            thread_local_attn_mask_[thread_id][i] = 0xFF;
          }
          // Fill the remaining bits in the next block
          if (remaining_bits > 0 && full_blocks < seq_len_ / 8) {
            thread_local_attn_mask_[thread_id][full_blocks] = (1 << remaining_bits) - 1;
          } else {
            thread_local_attn_mask_[thread_id][full_blocks] = 0;
          }

          for (int i = full_blocks + 1; i < seq_len_ / 8; ++i) {
            thread_local_attn_mask_[thread_id][i] = 0;
          }
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            attn_with_kvcache_one_block_(config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                                         (void*)&q_in_data[batch_id * config_.kv_head_num * n_gqa_ * config_.head_dim +
                                                           head_id * n_gqa_ * config_.head_dim],
                                         seq_len_, 0, false, thread_local_attn_mask_[thread_id].data(), GGML_TYPE_F16,
                                         0, k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                                         GGML_TYPE_F16, 1, v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                                         nullptr, nullptr, thread_local_attn_score_[thread_id].data(),
                                         thread_local_output_fp32_[thread_id].data(),
                                         thread_local_attn_lse_[thread_id].data(),
                                         thread_local_draft_[thread_id].data(), nullptr, cos_.data(), sin_.data());
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            attn_with_kvcache_one_block_(
                config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                q_q8_0_[batch_id][head_id].data(), seq_len_, 0, false, thread_local_attn_mask_[thread_id].data(),
                GGML_TYPE_Q4_0, 0, k_cache_q4[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                GGML_TYPE_Q4_0, 1, v_cache_q4[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                thread_local_attn_score_[thread_id].data(), thread_local_output_q8_0_[thread_id].data(),
                thread_local_attn_lse_[thread_id].data(), thread_local_draft_[thread_id].data(), nullptr, cos_.data(),
                sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            attn_with_kvcache_one_block_(
                config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                q_q8_0_[batch_id][head_id].data(), seq_len_, 0, false, thread_local_attn_mask_[thread_id].data(),
                GGML_TYPE_Q8_0, 0, k_cache_q8[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                GGML_TYPE_Q8_0, 1, v_cache_q8[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                thread_local_attn_score_[thread_id].data(), thread_local_output_q8_0_[thread_id].data(),
                thread_local_attn_lse_[thread_id].data(), thread_local_draft_[thread_id].data(), nullptr, cos_.data(),
                sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          }
        } else {
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            attn_with_kvcache_one_block_(
                config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                (void*)&q_in_data[batch_id * config_.kv_head_num * n_gqa_ * config_.head_dim +
                                  head_id * n_gqa_ * config_.head_dim],
                seq_len_, 0, true, nullptr, GGML_TYPE_F16, 0, k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                nullptr, nullptr, GGML_TYPE_F16, 1, v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0, nullptr,
                nullptr, thread_local_attn_score_[thread_id].data(), thread_local_output_fp32_[thread_id].data(),
                thread_local_attn_lse_[thread_id].data(), thread_local_draft_[thread_id].data(), nullptr, cos_.data(),
                sin_.data());

          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            attn_with_kvcache_one_block_(config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                                         q_q8_0_[batch_id][head_id].data(), seq_len_, 0, true, nullptr, GGML_TYPE_Q4_0,
                                         0, k_cache_q4[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                                         GGML_TYPE_Q4_0, 1, v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                                         nullptr, nullptr, thread_local_attn_score_[thread_id].data(),
                                         thread_local_output_q8_0_[thread_id].data(),
                                         thread_local_attn_lse_[thread_id].data(),
                                         thread_local_draft_[thread_id].data(), nullptr, cos_.data(), sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            attn_with_kvcache_one_block_(config_.head_dim, config_.q_head_num / config_.kv_head_num, GGML_TYPE_Q8_0,
                                         q_q8_0_[batch_id][head_id].data(), seq_len_, 0, true, nullptr, GGML_TYPE_Q8_0,
                                         0, k_cache_q8[layer_id_][head_id][block_idx].data(), 0, nullptr, nullptr,
                                         GGML_TYPE_Q8_0, 1, v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                                         nullptr, nullptr, thread_local_attn_score_[thread_id].data(),
                                         thread_local_output_q8_0_[thread_id].data(),
                                         thread_local_attn_lse_[thread_id].data(),
                                         thread_local_draft_[thread_id].data(), nullptr, cos_.data(), sin_.data());
            dequantize_row_q8_0(thread_local_output_q8_0_[thread_id].data(),
                                thread_local_output_fp32_[thread_id].data(), n_gqa_ * config_.head_dim);
          }
        }
        for (int i = 0; i < n_gqa_; i++) {
          block_lse_[batch_id][block_idx][head_id * n_gqa_ + i] = thread_local_attn_lse_[thread_id][i];
        }
        int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
        int cur_head_id = thread_cur_head_idx_[thread_id].second;
        if (batch_id == cur_batch_idx && head_id == cur_head_id) {
          for (int i = 0; i < n_gqa_; i++) {
            float new_attn_lse = thread_local_cur_attn_lse_[thread_id][i] +
                                 std::log(1.0 + std::exp(thread_local_attn_lse_[thread_id][i] -
                                                         thread_local_cur_attn_lse_[thread_id][i]));
            ggml_vec_scale_f32(config_.head_dim, thread_local_cur_output_fp32_[thread_id].data() + i * config_.head_dim,
                               std::exp(thread_local_cur_attn_lse_[thread_id][i] - new_attn_lse));
            ggml_vec_scale_f32(config_.head_dim, thread_local_output_fp32_[thread_id].data() + i * config_.head_dim,
                               std::exp(thread_local_attn_lse_[thread_id][i] - new_attn_lse));
            for (int j = 0; j < config_.head_dim; j++) {
              thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j] +=
                  thread_local_output_fp32_[thread_id][i * config_.head_dim + j];
            }
            thread_local_cur_attn_lse_[thread_id][i] = new_attn_lse;
          }
        } else {
          if (cur_batch_idx != -1) {
            mutex_[cur_batch_idx][cur_head_id]->lock();
            for (int i = 0; i < n_gqa_; i++) {
              if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) < 1e-6) {
                attn_lse_[cur_batch_idx][cur_head_id][i] = thread_local_cur_attn_lse_[thread_id][i];
                for (int j = 0; j < config_.head_dim; j++) {
                  output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] =
                      thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
                }
                continue;
              }
              float new_attn_lse = attn_lse_[cur_batch_idx][cur_head_id][i] +
                                   std::log(1.0 + std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                                           attn_lse_[cur_batch_idx][cur_head_id][i]));
              ggml_vec_scale_f32(config_.head_dim,
                                 output_fp32_[cur_batch_idx][cur_head_id].data() + i * config_.head_dim,
                                 std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] - new_attn_lse));
              ggml_vec_scale_f32(config_.head_dim,
                                 thread_local_cur_output_fp32_[thread_id].data() + i * config_.head_dim,
                                 std::exp(thread_local_cur_attn_lse_[thread_id][i] - new_attn_lse));
              for (int j = 0; j < config_.head_dim; j++) {
                output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] +=
                    thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
              }
              attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
            }
            mutex_[cur_batch_idx][cur_head_id]->unlock();
          }
          thread_cur_head_idx_[thread_id].first = batch_id;
          thread_cur_head_idx_[thread_id].second = head_id;
          for (int i = 0; i < n_gqa_; i++) {
            thread_local_cur_attn_lse_[thread_id][i] = thread_local_attn_lse_[thread_id][i];
            for (int j = 0; j < config_.head_dim; j++) {
              thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j] =
                  thread_local_output_fp32_[thread_id][i * config_.head_dim + j];
            }
          }
        }
      },
      // Merge the results of the remaining blocks.
      [&](int thread_id) {
        int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
        int cur_head_id = thread_cur_head_idx_[thread_id].second;
        if (cur_head_id != -1) {
          mutex_[cur_batch_idx][cur_head_id]->lock();
          for (int i = 0; i < n_gqa_; i++) {
            float new_attn_lse;
            if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) < 1e-6) {
              attn_lse_[cur_batch_idx][cur_head_id][i] = thread_local_cur_attn_lse_[thread_id][i];
              for (int j = 0; j < config_.head_dim; j++) {
                output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] =
                    thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
              }
              continue;
            }
            new_attn_lse = attn_lse_[cur_batch_idx][cur_head_id][i] +
                           std::log(1.0 + std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                                   attn_lse_[cur_batch_idx][cur_head_id][i]));
            ggml_vec_scale_f32(config_.head_dim, output_fp32_[cur_batch_idx][cur_head_id].data() + i * config_.head_dim,
                               std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] - new_attn_lse));
            ggml_vec_scale_f32(config_.head_dim, thread_local_cur_output_fp32_[thread_id].data() + i * config_.head_dim,
                               std::exp(thread_local_cur_attn_lse_[thread_id][i] - new_attn_lse));
            for (int j = 0; j < config_.head_dim; j++) {
              output_fp32_[cur_batch_idx][cur_head_id][i * config_.head_dim + j] +=
                  thread_local_cur_output_fp32_[thread_id][i * config_.head_dim + j];
            }
            attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
          }
          mutex_[cur_batch_idx][cur_head_id]->unlock();
        }
      });

  for (int i = 0; i < batch_size; i++) {
    for (int j = 0; j < max_block_num_after_retrieval_; j++) {
      for (int k = 0; k < config_.q_head_num; k++) {
        int block_idx = block_table_after_retrieval_kvhead_[i][j][k / n_gqa_];
        attn_sparsity[i * config_.q_head_num + k] +=
            std::exp(block_lse_[i][block_idx][k] - attn_lse_[i][k / n_gqa_][k % n_gqa_]);
      }
    }
  }

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  // printf("layer %d time of calculating sparsity: %f s\n", layer_id_,
  //        diff.count());
}
void KVCache::calculate_block_similarity_kvhead_(const uint16_t* q_in_data, int batch_size, int layer_idx, int q_len,
                                                 int max_block_num, int* cache_seqlens, int init_block_num,
                                                 int local_block_num, int pick_block_num, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  backend->do_work_stealing_job(
      batch_size * max_block_num, nullptr,
      [&](int task_id) {
        int batch_id = task_id / max_block_num;
        int block_id = task_id % max_block_num;
        int seq_len = cache_seqlens_[batch_id];

        if (block_id < init_block_num || block_id >= (seq_len / config_.block_len) - local_block_num) {
          return;
        }
        int block_idx = block_table_before_retrieval_kvhead_[batch_id][block_id][0];

        for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
          for (int i = 0; i < config_.head_dim; i++) {
            float q_i = 0, qa_i = std::numeric_limits<float>::lowest();
            for (int q_id = 0; q_id < q_len; q_id++) {
              q_i += GGML_FP16_TO_FP32(
                  q_in_data[batch_id * q_len * config_.q_head_num * config_.head_dim +
                            q_id * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + i]);
            }
            q_i /= q_len;
            for (int anchor_id = 0; anchor_id < config_.anchor_num; anchor_id++) {
              qa_i = std::max(
                  qa_i,
                  GGML_FP16_TO_FP32(
                      anchor_[layer_idx * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                  config_.head_dim +
                              block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                              anchor_id * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + i]) *
                      q_i);
            }
            block_similar_kv_head_[batch_id][block_id][head_id / n_gqa_] += qa_i;
          }
        }
      },
      nullptr);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  // printf("layer %d time of calculating similarity: %f s\n", layer_idx,
  //        diff.count());
}
void KVCache::select_block_kvhead_(int batch_size, int layer_idx, int max_block_num, int init_block_num,
                                   int local_block_num, int pick_block_num) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
    int cache_len_after_retrieval = 0;
    if (cache_seqlens_[batch_idx] / config_.block_len <= init_block_num + pick_block_num + local_block_num) {
      selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step] = 0;
      for (int i = 0; i < max_block_num; i++) {
        for (int j = 0; j < config_.kv_head_num; j++) {
          block_table_after_retrieval_kvhead_[batch_idx][i][j] = block_table_before_retrieval_kvhead_[batch_idx][i][j];
        }
      }
      continue;
    }
    for (int head_id = 0; head_id < config_.kv_head_num; head_id++) {
      for (int block_id = init_block_num; block_id < (cache_seqlens_[batch_idx] / config_.block_len) - local_block_num;
           block_id++) {
        top_similar_block_[batch_idx].push(
            std::make_pair(block_similar_kv_head_[batch_idx][block_id][head_id],
                           block_table_before_retrieval_kvhead_[batch_idx][block_id][head_id]));
        if (top_similar_block_[batch_idx].size() > pick_block_num) {
          top_similar_block_[batch_idx].pop();
        }
      }

      int i = 0;
      for (; i < init_block_num; i++) {
        block_table_after_retrieval_kvhead_[batch_idx][i][head_id] =
            block_table_before_retrieval_kvhead_[batch_idx][i][head_id];
      }
      while (!top_similar_block_[batch_idx].empty()) {
        block_table_after_retrieval_kvhead_[batch_idx][i][head_id] = top_similar_block_[batch_idx].top().second;
        top_similar_block_[batch_idx].pop();
        i++;
      }
      for (; i < init_block_num + pick_block_num + local_block_num; i++) {
        block_table_after_retrieval_kvhead_[batch_idx][i][head_id] =
            block_table_before_retrieval_kvhead_[batch_idx][(cache_seqlens_[batch_idx] / config_.block_len) -
                                                            local_block_num + i - init_block_num - pick_block_num]
                                                [head_id];
      }
      if (cache_seqlens_[batch_idx] % config_.block_len != 0) {
        block_table_after_retrieval_kvhead_[batch_idx][i][head_id] =
            block_table_before_retrieval_kvhead_[batch_idx][(cache_seqlens_[batch_idx] / config_.block_len)][head_id];
        cache_len_after_retrieval = (cache_seqlens_[batch_idx] % config_.block_len) + i * config_.block_len;
        i++;
      } else {
        cache_len_after_retrieval = (cache_seqlens_[batch_idx] % config_.block_len) + i * config_.block_len;
      }
      for (int j = 0; j < i; j++) {
        selected_blocks_history_kvhead_[(layer_idx - config_.layer_offset) / config_.layer_step][batch_idx][j]
                                       [head_id] = block_table_after_retrieval_kvhead_[batch_idx][j][head_id];
      }
    }
    cache_seqlens_[batch_idx] = cache_len_after_retrieval;
    selected_blocks_num_history_[(layer_idx - config_.layer_offset) / config_.layer_step] =
        (cache_len_after_retrieval + config_.block_len - 1) / config_.block_len;
  }

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  // printf("layer %d time of selecting block: %f s\n", layer_idx,
  //        diff.count())
}

void KVCache::get_attn_sparsity(const ggml_fp16_t* q_in, float* attn_sparsity, int layer_idx, int generate_token_idx,
                                int q_len, int batch_size, int max_block_num, int* block_table, int* cache_seqlens,
                                int* block_table_origin, int* cache_seqlens_origin, int max_block_num_origin, int topk,
                                int local, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  layer_id_ = layer_idx;
  int thread_num = backend->get_thread_num();
  batch_size = 1;

  const uint16_t* q_in_data = const_cast<const uint16_t*>(q_in);

  quantize_q_(q_in_data, batch_size);
  if (config_.retrieval_type == RetrievalType::LAYER) {
    attn_initialize_layer_(batch_size, layer_idx, block_table, max_block_num, cache_seqlens);
    retrieval_kvcache_layer_(q_in_data, 1, local, topk, q_len, generate_token_idx, batch_size, layer_idx, cache_seqlens,
                             max_block_num, backend);
    calculate_sparsity_layer_(q_in_data, attn_sparsity, batch_size, max_block_num_origin, block_table_origin,
                              cache_seqlens_origin, backend);
  } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
    attn_initialize_kvhead_(batch_size, layer_idx, block_table, max_block_num, cache_seqlens);
    retrieval_kvcache_kvhead_(q_in_data, 1, local, topk, q_len, generate_token_idx, batch_size, layer_idx,
                              cache_seqlens, max_block_num, backend);
    calculate_sparsity_kvhead_(q_in_data, attn_sparsity, batch_size, max_block_num_origin, block_table_origin,
                               cache_seqlens_origin, backend);
  }
}

void KVCache::attn_with_kvcache_one_block_(int head_dim, int bsz,
                                           ggml_type q_type,  // GGML data type of `Q`, only supports fp16 and q8_0
                                           // [bsz, head_dim]
                                           // Quantization is always on the head_dim dimension (per_token). If
                                           // head_dim % 32 != 0, an error will be raised. The size must be bsz *
                                           // head_dim/32 * qtype_size.
                                           const void* q,

                                           int past_kv_len, int past_kv_offset,
                                           bool is_full_attn,  // true indicates a full 1 mask
                                           // If is_full_attn = false, a bit matrix representing the mask is
                                           // passed. [bsz, past_kv_len]
                                           const uint8_t* attn_mask,

                                           ggml_type k_type,  // GGML data type of `K Cache`, only supports fp16,
                                                              // q4_0, q8_0
                                           int k_quant_type,  // 0 for per_token, 1 for per_channel, others raise an
                                                              // error
                                           // [seq_len, head_dim]
                                           // If quant_type == 0, head_dim % 32 must be 0.
                                           // If quant_type == 1, seq_len % 32 must be 0.
                                           const void* k_cache,

                                           // k_anchor_type must be fp16
                                           int num_k_anchor,  // num_k_anchor == 0 indicates no anchor
                                           // [num_k_anchor, head_dim]
                                           const void* k_cache_anchors,
                                           // Each token is associated with the nearest previous position's anchor,
                                           // with the same distance.
                                           const int* k_cache_anchor_pos,

                                           // v_cache similar to k_cache
                                           ggml_type v_type, int v_quant_type,
                                           // [head_dim, seq_len]
                                           const void* v_cache, int num_v_anchor, const void* v_cache_anchors,
                                           const int* v_cache_anchor_pos,

                                           // Pre-allocated buffer for intermediate calculations [bsz,
                                           // past_kv_len]. No malloc is performed inside this function.
                                           float* attn_score,

                                           // Output: [bsz, head_dim], with the same type as q_type
                                           void* output,
                                           // [bsz]
                                           float* lse,

                                           // Pre-allocated temporary buffer with sufficient size:
                                           // (2 * bsz * past_kv_len + 6 * bsz * head_dim + 2 * past_kv_len *
                                           // head_dim + past_kv_len * head_dim / 32) bytes.
                                           void* draft,

                                           // Apply rotary embedding online
                                           const int* rotary_angle, const void* rotary_cos, const void* rotary_sin
                                           // rotary_cos=None,
                                           // rotary_sin=None,
                                           // cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
                                           // cache_batch_idx: Optional[torch.Tensor] = None,
                                           // rotary_interleaved=True,

                                           // // Not supported for now
                                           // window_size=(-1, -1),  # -1 means infinite context window
                                           // alibi_slopes=None,
) {
  assert(head_dim % 32 == 0);
  assert(k_quant_type == 0);
  assert(v_quant_type == 1);
  assert(q_type == GGML_TYPE_F16 || q_type == GGML_TYPE_Q8_0);
  if (q_type == GGML_TYPE_F16) {
    assert(k_type == GGML_TYPE_F16);
    assert(v_type == GGML_TYPE_F16);

    // attn = q * k + q * k_anchor
    // TODO: anchor
    assert(num_k_anchor == 0);

    if (rotary_angle != nullptr) {
      ggml_fp16_t* k_cache_with_rope_fp16 =
          (reinterpret_cast<ggml_fp16_t*>(draft) + sizeof(block_q8_0) * bsz * past_kv_len / QK8_0 +
           sizeof(float) * bsz * head_dim);
      // dequant k_cache and apply rope
      // k_rope(i) = k(i) * cos(i) - k(i+l) * sin(i)
      // k_rope(i+l) = k(i+l) * cos(i+l) + k(i) * sin(i)

      // k(i)cos(i) -> k_rope(i)
      // k(i)sin(i+l) -> k_rope(i+l)

      // k(i)cos(i) -> k_rope(i)
      // -k(i)sin(i-l) -> k_rope(i-l)

      std::vector<float> block_fp32(32);
      for (int k = 0; k < past_kv_len; k++) {
        int angle = rotary_angle[k];
        for (int l = 0; l < head_dim / 32; l++) {
          for (int m = 0; m < 32; m++) {
            float x = GGML_FP16_TO_FP32(((ggml_fp16_t*)k_cache)[k * head_dim + l * 32 + m]);
            float sin_val = GGML_FP16_TO_FP32(((ggml_fp16_t*)rotary_sin)[angle * head_dim + l * 32 + m]);
            float cos_val = GGML_FP16_TO_FP32(((ggml_fp16_t*)rotary_cos)[angle * head_dim + l * 32 + m]);

            if (l * 32 + m < head_dim / 2) {
              k_cache_with_rope_fp16[k * head_dim + l * 32 + m] = GGML_FP32_TO_FP16(x * cos_val);
              k_cache_with_rope_fp16[k * head_dim + l * 32 + m + head_dim / 2] = GGML_FP32_TO_FP16(-x * sin_val);
            } else {
              k_cache_with_rope_fp16[k * head_dim + l * 32 + m] =
                  GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(k_cache_with_rope_fp16[k * head_dim + l * 32 + m]) + x * sin_val);
              k_cache_with_rope_fp16[k * head_dim + l * 32 + m - head_dim / 2] = GGML_FP32_TO_FP16(
                  GGML_FP16_TO_FP32(k_cache_with_rope_fp16[k * head_dim + l * 32 + m - head_dim / 2]) - x * cos_val);
            }
          }
        }
      }

      llamafile_sgemm(past_kv_len, bsz, head_dim, (ggml_fp16_t*)k_cache_with_rope_fp16, head_dim, (ggml_fp16_t*)q,
                      head_dim, attn_score, past_kv_len, 0, 1, GGML_TASK_TYPE_COMPUTE, k_type, GGML_TYPE_F16,
                      GGML_TYPE_F32, GGML_PREC_DEFAULT);
    } else {
      bool ok = llamafile_sgemm(past_kv_len, bsz, head_dim, (ggml_fp16_t*)k_cache, head_dim, (ggml_fp16_t*)q, head_dim,
                                attn_score, past_kv_len, 0, 1, GGML_TASK_TYPE_COMPUTE, k_type, GGML_TYPE_F16,
                                GGML_TYPE_F32, GGML_PREC_DEFAULT);

      if (!ok) {
        printf("llamafile_sgemm failed\n");
      }
    }
    // attn = attn * scale
    float scale_factor = 1.0 / std::sqrt(float(head_dim));
    ggml_vec_scale_f32(bsz * past_kv_len, attn_score, scale_factor);

    // attn = attn & mask
    if (!is_full_attn) {
      for (int i = 0; i < bsz; i++) {
        for (int j = 0; j < past_kv_len; j++) {
          int index = i * past_kv_len + j;
          if (!(attn_mask[j / 8] & (1 << (j % 8)))) {
            attn_score[index] = std::numeric_limits<float>::lowest();
          }
        }
      }
    }

    // attn = softmax(attn)
    for (int i = 0; i < bsz; i++) {
      float sum_exp = 0;
      for (int j = 0; j < past_kv_len; j++) {
        attn_score[i * past_kv_len + j] = std::exp(attn_score[i * past_kv_len + j]);
        sum_exp += attn_score[i * past_kv_len + j];
      }
      for (int j = 0; j < past_kv_len; j++) {
        attn_score[i * past_kv_len + j] /= sum_exp;
      }
      if (lse != nullptr) {
        lse[i] = std::log(sum_exp);
      }
    }

    // output = attn * v + attn * v_anchor
    // std::vector<float> sum(bsz * head_dim);
    float* sum =
        reinterpret_cast<float*>(reinterpret_cast<char*>(draft) + sizeof(block_q8_0) * bsz * past_kv_len / QK8_0);

    // float* attn_score_fp16(bsz, past_kv_len)
    ggml_fp16_t* attn_score_fp16 = (reinterpret_cast<ggml_fp16_t*>(reinterpret_cast<char*>(draft) +
                                                                   sizeof(block_q8_0) * bsz * past_kv_len / QK8_0 +
                                                                   sizeof(float) * bsz * head_dim));

    for (int i = 0; i < bsz * past_kv_len; i++) {
      attn_score_fp16[i] = GGML_FP32_TO_FP16(attn_score[i]);
    }

    // TODO: anchor
    assert(num_v_anchor == 0);
    bool ok = llamafile_sgemm(head_dim, bsz, past_kv_len, (ggml_fp16_t*)v_cache, past_kv_len,
                              (ggml_fp16_t*)attn_score_fp16, past_kv_len, sum, head_dim, 0, 1, GGML_TASK_TYPE_COMPUTE,
                              v_type, GGML_TYPE_F16, GGML_TYPE_F32, GGML_PREC_DEFAULT);
    if (!ok) {
      printf("llamafile_sgemm failed\n");
    }

    // copy to output
    for (int i = 0; i < bsz; i++) {
      for (int j = 0; j < head_dim; j++) {
        ((float*)output)[i * head_dim + j] = sum[i * head_dim + j];
      }
    }
  } else {
    assert(k_type == GGML_TYPE_Q4_0 || k_type == GGML_TYPE_Q8_0);
    assert(v_type == GGML_TYPE_Q4_0 || v_type == GGML_TYPE_Q8_0);

    // attn = q * k + q * k_anchor
    // TODO: anchor
    assert(num_k_anchor == 0);

    if (rotary_angle != nullptr) {
      ggml_fp16_t* k_cache_with_rope_fp16 =
          (reinterpret_cast<ggml_fp16_t*>(draft) + sizeof(block_q8_0) * bsz * past_kv_len / QK8_0 +
           sizeof(float) * bsz * head_dim);
      block_q4_0* k_cache_with_rope_q4 =
          (reinterpret_cast<block_q4_0*>(draft) + sizeof(block_q8_0) * bsz * past_kv_len / QK8_0 +
           sizeof(float) * bsz * head_dim) +
          sizeof(ggml_fp16_t) * bsz * head_dim;
      // dequant k_cache and apply rope
      // k_rope(i) = k(i) * cos(i) - k(i+l) * sin(i)
      // k_rope(i+l) = k(i+l) * cos(i+l) + k(i) * sin(i)

      // k(i)cos(i) -> k_rope(i)
      // k(i)sin(i+l) -> k_rope(i+l)

      // k(i)cos(i) -> k_rope(i)
      // -k(i)sin(i-l) -> k_rope(i-l)

      std::vector<float> block_fp32(32);
      for (int k = 0; k < past_kv_len; k++) {
        int angle = rotary_angle[k];
        for (int l = 0; l < head_dim / 32; l++) {
          block_q4_0 block = ((block_q4_0*)k_cache)[k * head_dim / 32 + l];
          dequantize_row_q4_0(&block, block_fp32.data(), 32);
          for (int m = 0; m < 32; m++) {
            float sin_val = GGML_FP16_TO_FP32(((ggml_fp16_t*)rotary_sin)[angle * head_dim + l * 32 + m]);
            float cos_val = GGML_FP16_TO_FP32(((ggml_fp16_t*)rotary_cos)[angle * head_dim + l * 32 + m]);

            if (l * 32 + m < head_dim / 2) {
              k_cache_with_rope_fp16[k * head_dim + l * 32 + m] = GGML_FP32_TO_FP16(block_fp32[m] * cos_val);
              k_cache_with_rope_fp16[k * head_dim + l * 32 + m + head_dim / 2] =
                  GGML_FP32_TO_FP16(-block_fp32[m] * sin_val);
            } else {
              k_cache_with_rope_fp16[k * head_dim + l * 32 + m] += GGML_FP32_TO_FP16(block_fp32[m] * sin_val);
              k_cache_with_rope_fp16[k * head_dim + l * 32 + m - head_dim / 2] -=
                  GGML_FP32_TO_FP16(block_fp32[m] * cos_val);
            }
          }
        }
      }
      // quantize k_cache_with_rope_fp16
      for (int k = 0; k < past_kv_len; k++) {
        for (int l = 0; l < head_dim / 32; l++) {
          for (int m = 0; m < 32; m++) {
            block_fp32[m] = GGML_FP16_TO_FP32(k_cache_with_rope_fp16[k * head_dim + l * 32 + m]);
          }
          quantize_row_q4_0(block_fp32.data(), &k_cache_with_rope_q4[k * head_dim / 32 + l], 32);
        }
      }

      llamafile_sgemm(past_kv_len, bsz, head_dim / 32, (block_q4_0*)k_cache_with_rope_q4, head_dim / 32, (block_q8_0*)q,
                      head_dim / 32, attn_score, past_kv_len, 0, 1, GGML_TASK_TYPE_COMPUTE, k_type, GGML_TYPE_Q8_0,
                      GGML_TYPE_F32, GGML_PREC_DEFAULT);
    } else {
      llamafile_sgemm(past_kv_len, bsz, head_dim / 32, (block_q4_0*)k_cache, head_dim / 32, (block_q8_0*)q,
                      head_dim / 32, attn_score, past_kv_len, 0, 1, GGML_TASK_TYPE_COMPUTE, k_type, GGML_TYPE_Q8_0,
                      GGML_TYPE_F32, GGML_PREC_DEFAULT);
    }

    // attn = attn * scale
    float scale_factor = 1.0 / std::sqrt(float(head_dim));
    ggml_vec_scale_f32(bsz * past_kv_len, attn_score, scale_factor);

    // attn = attn & mask
    if (!is_full_attn) {
      for (int i = 0; i < bsz; i++) {
        for (int j = 0; j < past_kv_len; j++) {
          int index = i * past_kv_len + j;
          if (!(attn_mask[j / 8] & (1 << (j % 8)))) {
            attn_score[index] = std::numeric_limits<float>::lowest();
          }
        }
      }
    }

    // attn = softmax(attn)
    for (int i = 0; i < bsz; i++) {
      float sum_exp = 0;
      for (int j = 0; j < past_kv_len; j++) {
        attn_score[i * past_kv_len + j] = std::exp(attn_score[i * past_kv_len + j]);
        sum_exp += attn_score[i * past_kv_len + j];
      }
      for (int j = 0; j < past_kv_len; j++) {
        attn_score[i * past_kv_len + j] /= sum_exp;
      }
      if (lse != nullptr) {
        lse[i] = std::log(sum_exp);
      }
    }

    // output = attn * v + attn * v_anchor
    // std::vector<block_q8_0> attn_q8_0(bsz * past_kv_len / QK8_0);
    block_q8_0* attn_q8_0 = reinterpret_cast<block_q8_0*>(draft);
    quantize_row_q8_0(attn_score, attn_q8_0, bsz * past_kv_len);
    // std::vector<float> sum(bsz * head_dim);
    float* sum =
        reinterpret_cast<float*>(reinterpret_cast<char*>(draft) + sizeof(block_q8_0) * bsz * past_kv_len / QK8_0);
    // TODO: anchor
    assert(num_v_anchor == 0);
    llamafile_sgemm(head_dim, bsz, past_kv_len / 32, (block_q4_0*)v_cache, past_kv_len / 32, attn_q8_0,
                    past_kv_len / 32, sum, head_dim, 0, 1, GGML_TASK_TYPE_COMPUTE, v_type, GGML_TYPE_Q8_0,
                    GGML_TYPE_F32, GGML_PREC_DEFAULT);

    quantize_row_q8_0(sum, (block_q8_0*)output, bsz * head_dim);
  }
}


================================================
FILE: kt-kernel/operators/kvcache/kvcache_load_dump.cpp
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include <chrono>
#include <fstream>
#include <iostream>

#include "kvcache.h"

void KVCache::load_kvcache(std::string tensor_file_path, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  std::ifstream ifs_tensor(tensor_file_path, std::ios::binary);
  if (!ifs_tensor) {
    throw std::runtime_error("Failed to open tensor file");
  }
  ifs_tensor.read(reinterpret_cast<char*>(&cache_total_len_), sizeof(cache_total_len_));
  int past_block_num = (cache_total_len_ + config_.block_len - 1) / config_.block_len;
  printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len_, past_block_num);
  for (int i = 0; i < config_.layer_num; ++i) {
    past_block_num_[i] = past_block_num;
  }
  ifs_tensor.read(reinterpret_cast<char*>(anchor_.data()), anchor_.size() * sizeof(ggml_fp16_t));
  for (int i = 0; i < config_.layer_num; ++i) {
    for (int j = 0; j < config_.kv_head_num; ++j) {
      for (int k = 0; k < past_block_num_[i]; ++k) {
        if (config_.kv_type == GGML_TYPE_F16) {
          ifs_tensor.read(reinterpret_cast<char*>(k_cache_fp16_[i][j][k].data()),
                          k_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
          ifs_tensor.read(reinterpret_cast<char*>(v_cache_fp16_[i][j][k].data()),
                          v_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
        } else if (config_.kv_type == GGML_TYPE_Q4_0) {
          ifs_tensor.read(reinterpret_cast<char*>(k_cache_q4[i][j][k].data()),
                          k_cache_q4[i][j][k].size() * sizeof(block_q4_0));
          ifs_tensor.read(reinterpret_cast<char*>(v_cache_q4[i][j][k].data()),
                          v_cache_q4[i][j][k].size() * sizeof(block_q4_0));
        }
      }
    }
    for (int k = 0; k < past_block_num_[i]; ++k) {
      for (int l = 0; l < config_.block_len; l++) {
        ifs_tensor.read(reinterpret_cast<char*>(importance_[i][k][l].data()),
                        importance_[i][k][l].size() * sizeof(ggml_fp16_t));
      }
    }
  }
  ifs_tensor.close();
  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  printf("time of load: %f s\n", diff.count());
}
void KVCache::dump_kvcache(int* block_table, int cache_total_len, std::string tensor_file_path, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();
  std::ofstream ofs(tensor_file_path, std::ios::binary);
  printf("dump_kvcache: %s\n", tensor_file_path.c_str());
  if (!ofs.is_open()) {
    std::cerr << "Cannot open file " << tensor_file_path << std::endl;
    return;
  }
  ofs.write(reinterpret_cast<const char*>(&cache_total_len), sizeof(cache_total_len));
  int past_block_num = (cache_total_len + config_.block_len - 1) / config_.block_len;
  printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len, past_block_num);
  ofs.write(reinterpret_cast<const char*>(anchor_.data()), anchor_.size() * sizeof(ggml_fp16_t));
  for (int i = 0; i < config_.layer_num; ++i) {
    for (int j = 0; j < config_.kv_head_num; ++j) {
      for (int k = 0; k < past_block_num; ++k) {
        int block_idx = block_table[k];
        if (config_.kv_type == GGML_TYPE_F16) {
          ofs.write(reinterpret_cast<const char*>(k_cache_fp16_[i][j][block_idx].data()),
                    k_cache_fp16_[i][j][block_idx].size() * sizeof(ggml_fp16_t));
          ofs.write(reinterpret_cast<const char*>(v_cache_fp16_[i][j][block_idx].data()),
                    v_cache_fp16_[i][j][block_idx].size() * sizeof(ggml_fp16_t));

        } else if (config_.kv_type == GGML_TYPE_Q4_0) {
          ofs.write(reinterpret_cast<const char*>(k_cache_q4[i][j][block_idx].data()),
                    k_cache_q4[i][j][block_idx].size() * sizeof(block_q4_0));
          ofs.write(reinterpret_cast<const char*>(v_cache_q4[i][j][block_idx].data()),
                    v_cache_q4[i][j][block_idx].size() * sizeof(block_q4_0));
        }
      }
    }
    for (int k = 0; k < past_block_num; ++k) {
      int block_idx = block_table[k];
      for (int l = 0; l < config_.block_len; l++) {
        ofs.write(reinterpret_cast<const char*>(importance_[i][block_idx][l].data()),
                  importance_[i][block_idx][l].size() * sizeof(ggml_fp16_t));
      }
    }
  }
  ofs.close();
  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  printf("time of dump: %f s\n", diff.count());
}

================================================
FILE: kt-kernel/operators/kvcache/kvcache_read_write.cpp
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include <chrono>

#include "ggml-impl.h"
#include "kvcache.h"

void KVCache::get_anchor_one_block(ggml_fp16_t* anchor, int layer_id, int block_idx, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  layer_id_ = layer_id;
  block_idx = block_idx;
  seq_len_ = config_.block_len;
  anchor_data_ = const_cast<uint16_t*>(anchor);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;
  printf("layer %d block %d time of reading anchor: %f s\n", layer_id, block_idx, duration.count());
}

void KVCache::update_anchor_one_block(const ggml_fp16_t* anchor, int layer_id, int block_idx, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  layer_id_ = layer_id;
  block_idx = block_idx;
  seq_len_ = config_.block_len;
  anchor_data_ = const_cast<uint16_t*>(anchor);

  // Each task updates the anchor of a certain position
  // backend->do_work_stealing_job(config_.anchor_num, [&](int task_id) {
  //     int k = task_id % config_.anchor_num;
  //     int head_id = task_id / config_.anchor_num;
  //     memcpy(anchor_[layer_id_][head_id][block_idx].data() +
  //                k * config_.head_dim,
  //            anchor_data_ + k * config_.head_dim,
  //            sizeof(uint16_t) * config_.head_dim);
  // });

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;
  printf("layer %d block %d time of writting anchor: %f s\n", layer_id, block_idx, duration.count());
}

void KVCache::update_importance_one_block(const ggml_fp16_t* importance, int layer_id, int block_idx,
                                          WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  layer_id_ = layer_id;
  block_idx = block_idx;
  seq_len_ = config_.block_len;
  importance_data_ = const_cast<uint16_t*>(importance);

  // Each task updates the importance of a certain position
  backend->do_work_stealing_job(
      config_.block_len, nullptr,
      [&](int task_id) {
        int k = task_id;
        memcpy(importance_[layer_id_][block_idx].data() + k, importance_data_ + k, sizeof(uint16_t));
      },
      nullptr);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;
  printf("layer %d block %d time of writting importance: %f s\n", layer_id, block_idx, duration.count());
}

void KVCache::get_importance_one_block(ggml_fp16_t* importance, int layer_id, int block_idx, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  layer_id_ = layer_id;
  block_idx = block_idx;
  seq_len_ = config_.block_len;
  importance_data_ = const_cast<uint16_t*>(importance);

  // Each task updates the importance of a certain position
  backend->do_work_stealing_job(
      config_.block_len, nullptr,
      [&](int task_id) {
        int k = task_id;
        memcpy(importance_data_ + k, importance_[layer_id_][block_idx].data() + k, sizeof(uint16_t));
      },
      nullptr);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;
  printf("layer %d block %d time of reading importance: %f s\n", layer_id, block_idx, duration.count());
}

void KVCache::update_kvcache_one_block_fp16(const ggml_fp16_t* k_in, const ggml_fp16_t* v_in, int layer_id,
                                            int block_idx, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  layer_id_ = layer_id;
  block_idx = block_idx;
  seq_len_ = config_.block_len;
  k_data_ = const_cast<uint16_t*>(k_in);
  v_data_ = const_cast<uint16_t*>(v_in);

  int new_block_num = std::max((int)past_block_num_[layer_id], block_idx + 1);

  importance_[layer_id_].resize(new_block_num);

  for (int i = 0; i < config_.kv_head_num; i++) {
    k_cache_q4[layer_id][i].resize(new_block_num);
    v_cache_q4[layer_id][i].resize(new_block_num);
    // anchor_[layer_id][i].resize(new_block_num);
  }

  for (int i = 0; i < new_block_num; i++) {
    importance_[layer_id][i].resize(config_.block_len);
  }

  // Each task updates the k cache or v cache of a certain header
  backend->do_work_stealing_job(
      config_.kv_head_num * 2, nullptr,
      [&](int task_id) {
        std::vector<float> block_fp32(32);
        int head_id = task_id / 2;
        if (task_id & 1) {
          // fill k_cache_
          k_cache_q4[layer_id_][head_id][block_idx].resize(config_.block_len * config_.head_dim / 32);
          for (int k = 0; k < config_.block_len; k++) {
            for (int l = 0; l < config_.head_dim / 32; l++) {
              block_q4_0 block;
              for (int m = 0; m < 32; m++) {
                block_fp32[m] = GGML_FP16_TO_FP32(
                    k_data_[((0 * config_.kv_head_num + head_id) * seq_len_ + 0 * config_.block_len + k) *
                                config_.head_dim +
                            l * 32 + m]);
              }
              quantize_row_q4_0(block_fp32.data(), &block, 32);
              k_cache_q4[layer_id_][head_id][block_idx][k * config_.head_dim / 32 + l] = block;
            }
          }
        } else {
          // fill v_cache_
          v_cache_q4[layer_id_][head_id][block_idx].resize(config_.head_dim * config_.block_len / 32);
          for (int k = 0; k < config_.block_len / 32; k++) {
            for (int l = 0; l < config_.head_dim; l++) {
              block_q4_0 block;
              for (int m = 0; m < 32; m++) {
                block_fp32[m] = GGML_FP16_TO_FP32(
                    v_data_[((0 * config_.kv_head_num + head_id) * seq_len_ + 0 * config_.block_len + k * 32 + m) *
                                config_.head_dim +
                            l]);
              }
              quantize_row_q4_0(block_fp32.data(), &block, 32);
              v_cache_q4[layer_id_][head_id][block_idx][l * config_.block_len / 32 + k] = block;
            }
          }
        }
      },
      nullptr);
  past_block_num_[layer_id] = new_block_num;

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;
  printf("layer %d block %d time of writting KV Cache: %f s\n", layer_id, block_idx, duration.count());
  // printf("get_one_block_fp16 duration: %ld\n", duration);
}

void KVCache::get_kvcache_one_block_fp16(ggml_fp16_t* k_in, ggml_fp16_t* v_in, int layer_id, int block_idx,
                                         WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  layer_id_ = layer_id;
  seq_len_ = config_.block_len;
  k_data_ = reinterpret_cast<uint16_t*>(k_in);
  v_data_ = reinterpret_cast<uint16_t*>(v_in);

  // printf("layer_id: %d, block_idx: %d\n", layer_id, block_idx);
  // Each task gets the k cache or v cache of a certain header
  backend->do_work_stealing_job(
      config_.kv_head_num * 2, nullptr,
      [&](int task_id) {
        std::vector<float> block_fp32(32);
        int head_id = task_id / 2;
        if (task_id & 1) {
          // get k_cache_
          for (int k = 0; k < config_.block_len; k++) {
            for (int l = 0; l < config_.head_dim / 32; l++) {
              block_q4_0 block = k_cache_q4[layer_id_][head_id][block_idx][k * config_.head_dim / 32 + l];
              dequantize_row_q4_0(&block, block_fp32.data(), 32);
              for (int m = 0; m < 32; m++) {
                k_data_[((0 * config_.kv_head_num + head_id) * seq_len_ + 0 * config_.block_len + k) *
                            config_.head_dim +
                        l * 32 + m] = GGML_FP32_TO_FP16(block_fp32[m]);
              }
            }
          }
        } else {
          // get v_cache_
          for (int k = 0; k < config_.block_len / 32; k++) {
            for (int l = 0; l < config_.head_dim; l++) {
              block_q4_0 block = v_cache_q4[layer_id_][head_id][block_idx][l * config_.block_len / 32 + k];
              dequantize_row_q4_0(&block, block_fp32.data(), 32);
              for (int m = 0; m < 32; m++) {
                v_data_[((0 * config_.kv_head_num + head_id) * seq_len_ + 0 * config_.block_len + k * 32 + m) *
                            config_.head_dim +
                        l] = GGML_FP32_TO_FP16(block_fp32[m]);
              }
            }
          }
        }
      },
      nullptr);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;
  printf("layer %d block %d time of reading KV Cache: %f s\n", layer_id, block_idx, duration.count());
  // printf("get_one_block_fp16 duration: %ld\n", duration);
}

// k_in: (batch_size, seq_len, head_num, head_dim)
// v_in: (batch_size, seq_len, head_num, head_dim)
void KVCache::get_and_update_kvcache_fp16(ggml_fp16_t* k_in, ggml_fp16_t* v_in, int layer_id, int* block_table,
                                          int batch_size, int max_block_num, int* cache_seqlens, int q_len,
                                          WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  layer_id_ = layer_id;
  k_data_ = const_cast<uint16_t*>(k_in);
  v_data_ = const_cast<uint16_t*>(v_in);

  // Each task updates the k cache and v cache of a certain header
  backend->do_work_stealing_job(
      config_.kv_head_num * max_block_num * batch_size, nullptr,
      [&](int task_id) {
        // printf("block_idx: %d, task_id: %d\n", block_idx, task_id);
        std::vector<float> block_fp32(32);
        int batch_id = task_id / (config_.kv_head_num * max_block_num);
        int block_id = (task_id / config_.kv_head_num) % max_block_num;
        int head_id = task_id % config_.kv_head_num;
        int block_idx = block_table[batch_id * max_block_num + block_id];
        int seq_len = cache_seqlens[batch_id];
        int block_l = block_id * config_.block_len;
        int block_r = block_id * config_.block_len + config_.block_len;

        if (block_l < seq_len) {
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            for (int k = 0; k < config_.block_len; k++) {
              if (block_id * config_.block_len + k >= seq_len) break;
              for (int l = 0; l < config_.head_dim; l++) {
                k_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                        block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                        k * (config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l] =
                    k_cache_fp16_[layer_id_][head_id][block_idx][k * config_.head_dim + l];
                v_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                        block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                        k * (config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l] =
                    v_cache_fp16_[layer_id_][head_id][block_idx][l * config_.block_len + k];
              }
            }
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            // get k_cache_
            for (int k = 0; k < config_.block_len; k++) {
              if (block_id * config_.block_len + k >= seq_len) break;
              for (int l = 0; l < config_.head_dim / 32; l++) {
                block_q4_0 block = k_cache_q4[layer_id_][head_id][block_idx][k * config_.head_dim / 32 + l];
                dequantize_row_q4_0(&block, block_fp32.data(), 32);
                for (int m = 0; m < 32; m++) {
                  k_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                          block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                          k * (config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l * 32 + m] =
                      GGML_FP32_TO_FP16(block_fp32[m]);
                }
              }
            }
            // get v_cache_
            for (int k = 0; k < config_.block_len / 32; k++) {
              for (int l = 0; l < config_.head_dim; l++) {
                block_q4_0 block = v_cache_q4[layer_id_][head_id][block_idx][l * config_.block_len / 32 + k];
                dequantize_row_q4_0(&block, block_fp32.data(), 32);
                for (int m = 0; m < 32; m++) {
                  if (block_id * config_.block_len + k * 32 + m >= seq_len) break;
                  v_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                          block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                          (k * 32 + m) * config_.kv_head_num * config_.head_dim + head_id * config_.head_dim + l] =
                      GGML_FP32_TO_FP16(block_fp32[m]);
                }
              }
            }
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            // get k_cache_
            for (int k = 0; k < config_.block_len; k++) {
              if (block_id * config_.block_len + k >= seq_len) break;
              for (int l = 0; l < config_.head_dim / 32; l++) {
                block_q8_0 block = k_cache_q8[layer_id_][head_id][block_idx][k * config_.head_dim / 32 + l];
                dequantize_row_q8_0(&block, block_fp32.data(), 32);
                for (int m = 0; m < 32; m++) {
                  k_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                          block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                          k * (config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l * 32 + m] =
                      GGML_FP32_TO_FP16(block_fp32[m]);
                }
              }
            }
            // get v_cache_
            for (int k = 0; k < config_.block_len / 32; k++) {
              for (int l = 0; l < config_.head_dim; l++) {
                block_q8_0 block = v_cache_q8[layer_id_][head_id][block_idx][l * config_.block_len / 32 + k];
                dequantize_row_q8_0(&block, block_fp32.data(), 32);
                for (int m = 0; m < 32; m++) {
                  if (block_id * config_.block_len + k * 32 + m >= seq_len) break;
                  v_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                          block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                          (k * 32 + m) * config_.kv_head_num * config_.head_dim + head_id * config_.head_dim + l] =
                      GGML_FP32_TO_FP16(block_fp32[m]);
                }
              }
            }
          }
        }
        if (block_r > seq_len && block_l < seq_len + q_len) {
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            for (int k = 0; k < config_.block_len; k++) {
              if (block_id * config_.block_len + k >= seq_len + q_len || block_id * config_.block_len + k < seq_len)
                continue;
              for (int l = 0; l < config_.head_dim; l++) {
                k_cache_fp16_[layer_id_][head_id][block_idx][k * config_.head_dim + l] =
                    k_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                            block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                            k * (config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l];
                v_cache_fp16_[layer_id_][head_id][block_idx][l * config_.block_len + k] =
                    v_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                            block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                            k * (config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l];
              }
            }
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            // fill k_cache_
            for (int k = 0; k < config_.block_len; k++) {
              if (block_id * config_.block_len + k >= seq_len + q_len || block_id * config_.block_len + k < seq_len)
                continue;
              for (int l = 0; l < config_.head_dim / 32; l++) {
                block_q4_0 block;
                for (int m = 0; m < 32; m++) {
                  block_fp32[m] = GGML_FP16_TO_FP32(
                      k_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                              block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                              k * (config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l * 32 + m]);
                }
                quantize_row_q4_0(block_fp32.data(), &block, 32);
                k_cache_q4[layer_id_][head_id][block_idx][k * config_.head_dim / 32 + l] = block;
              }
            }

            // fill v_cache_
            for (int k = 0; k < config_.block_len / 32; k++) {
              for (int l = 0; l < config_.head_dim; l++) {
                block_q4_0 block;
                for (int m = 0; m < 32; m++) {
                  if (block_id * config_.block_len + k * 32 + m >= seq_len + q_len) {
                    block_fp32[m] = 0;
                    continue;
                  }
                  block_fp32[m] = GGML_FP16_TO_FP32(
                      v_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                              block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                              (k * 32 + m) * config_.kv_head_num * config_.head_dim + head_id * config_.head_dim + l]);
                }
                quantize_row_q4_0(block_fp32.data(), &block, 32);
                v_cache_q4[layer_id_][head_id][block_idx][l * config_.block_len / 32 + k] = block;
              }
            }
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            // fill k_cache_
            for (int k = 0; k < config_.block_len; k++) {
              if (block_id * config_.block_len + k >= seq_len + q_len || block_id * config_.block_len + k < seq_len)
                continue;
              for (int l = 0; l < config_.head_dim / 32; l++) {
                block_q8_0 block;
                for (int m = 0; m < 32; m++) {
                  block_fp32[m] = GGML_FP16_TO_FP32(
                      k_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                              block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                              k * (config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l * 32 + m]);
                }
                quantize_row_q8_0(block_fp32.data(), &block, 32);
                k_cache_q8[layer_id_][head_id][block_idx][k * config_.head_dim / 32 + l] = block;
              }
            }

            // fill v_cache_
            for (int k = 0; k < config_.block_len / 32; k++) {
              for (int l = 0; l < config_.head_dim; l++) {
                block_q8_0 block;
                for (int m = 0; m < 32; m++) {
                  if (block_id * config_.block_len + k * 32 + m >= seq_len + q_len) {
                    block_fp32[m] = 0;
                    continue;
                  }
                  block_fp32[m] = GGML_FP16_TO_FP32(
                      v_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                              block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                              (k * 32 + m) * config_.kv_head_num * config_.head_dim + head_id * config_.head_dim + l]);
                }
                quantize_row_q8_0(block_fp32.data(), &block, 32);
                v_cache_q8[layer_id_][head_id][block_idx][l * config_.block_len / 32 + k] = block;
              }
            }
          }
        }
      },
      nullptr);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;

  // printf("layer %d time of reading and updating KV Cache: %f s\n",
  // layer_id,
  //        duration.count());
}

void KVCache::update_importance(const ggml_fp16_t* importance, int layer_id, int* block_table, int batch_size,
                                int max_block_num, int* offset, int width, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  layer_id_ = layer_id;
  importance_data_ = const_cast<uint16_t*>(importance);

  // Each task updates the importance of a certain position
  backend->do_work_stealing_job(
      max_block_num * batch_size, nullptr,
      [&](int task_id) {
        int block_id = task_id % max_block_num;
        int batch_id = task_id / max_block_num;
        int block_idx = block_table[batch_id * max_block_num + block_id];
        if (block_id > (offset[batch_id] + width) / config_.block_len) {
          return;
        }
        for (int k = 0; k < config_.block_len; k++) {
          for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
            importance_[layer_id_][block_idx][k][head_id] = GGML_FP32_TO_FP16(
                GGML_FP16_TO_FP32(importance_data_[batch_id * max_block_num * config_.block_len * config_.q_head_num +
                                                   (block_id * config_.block_len + k) * config_.q_head_num + head_id]) +
                GGML_FP16_TO_FP32(importance_[layer_id_][block_idx][k][head_id]));
          }
        }
      },
      nullptr);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;

  // printf("layer %d time of updating importance: %f s\n", layer_id,
  //        duration.count());
}

void KVCache::get_kvcache_fp16(ggml_fp16_t* k_in, ggml_fp16_t* v_in, int layer_id, int* block_table, int batch_size,
                               int max_block_num, int* cache_seqlens, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  layer_id_ = layer_id;
  k_data_ = const_cast<uint16_t*>(k_in);
  v_data_ = const_cast<uint16_t*>(v_in);

  // Each task updates the k cache and v cache of a certain header
  backend->do_work_stealing_job(
      config_.kv_head_num * max_block_num * batch_size, nullptr,
      [&](int task_id) {
        // printf("block_idx: %d, task_id: %d\n", block_idx, task_id);
        std::vector<float> block_fp32(32);
        int batch_id = task_id / (config_.kv_head_num * max_block_num);
        int block_id = (task_id / config_.kv_head_num) % max_block_num;
        int head_id = task_id % config_.kv_head_num;
        int block_idx = block_table[batch_id * max_block_num + block_id];
        int seq_len = cache_seqlens[batch_id];
        int block_l = block_id * config_.block_len;
        int block_r = block_id * config_.block_len + config_.block_len;

        if (block_l < seq_len) {
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            for (int k = 0; k < config_.block_len; k++) {
              if (block_id * config_.block_len + k >= seq_len) break;
              for (int l = 0; l < config_.head_dim; l++) {
                k_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                        block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                        k * (config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l] =
                    k_cache_fp16_[layer_id_][head_id][block_idx][k * config_.head_dim + l];
                v_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                        block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                        k * (config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l] =
                    v_cache_fp16_[layer_id_][head_id][block_idx][l * config_.block_len + k];
              }
            }
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            // get k_cache_
            for (int k = 0; k < config_.block_len; k++) {
              if (block_id * config_.block_len + k >= seq_len) break;
              for (int l = 0; l < config_.head_dim / 32; l++) {
                block_q4_0 block = k_cache_q4[layer_id_][head_id][block_idx][k * config_.head_dim / 32 + l];
                dequantize_row_q4_0(&block, block_fp32.data(), 32);
                for (int m = 0; m < 32; m++) {
                  k_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                          block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                          k * (config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l * 32 + m] =
                      GGML_FP32_TO_FP16(block_fp32[m]);
                }
              }
            }
            // get v_cache_
            for (int k = 0; k < config_.block_len / 32; k++) {
              for (int l = 0; l < config_.head_dim; l++) {
                block_q4_0 block = v_cache_q4[layer_id_][head_id][block_idx][l * config_.block_len / 32 + k];
                dequantize_row_q4_0(&block, block_fp32.data(), 32);
                for (int m = 0; m < 32; m++) {
                  if (block_id * config_.block_len + k * 32 + m >= seq_len) break;
                  v_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                          block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                          (k * 32 + m) * config_.kv_head_num * config_.head_dim + head_id * config_.head_dim + l] =
                      GGML_FP32_TO_FP16(block_fp32[m]);
                }
              }
            }
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            // get k_cache_
            for (int k = 0; k < config_.block_len; k++) {
              if (block_id * config_.block_len + k >= seq_len) break;
              for (int l = 0; l < config_.head_dim / 32; l++) {
                block_q8_0 block = k_cache_q8[layer_id_][head_id][block_idx][k * config_.head_dim / 32 + l];
                dequantize_row_q8_0(&block, block_fp32.data(), 32);
                for (int m = 0; m < 32; m++) {
                  k_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                          block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                          k * (config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l * 32 + m] =
                      GGML_FP32_TO_FP16(block_fp32[m]);
                }
              }
            }
            // get v_cache_
            for (int k = 0; k < config_.block_len / 32; k++) {
              for (int l = 0; l < config_.head_dim; l++) {
                block_q8_0 block = v_cache_q8[layer_id_][head_id][block_idx][l * config_.block_len / 32 + k];
                dequantize_row_q8_0(&block, block_fp32.data(), 32);
                for (int m = 0; m < 32; m++) {
                  if (block_id * config_.block_len + k * 32 + m >= seq_len) break;
                  v_data_[batch_id * (max_block_num * config_.block_len * config_.kv_head_num * config_.head_dim) +
                          block_id * (config_.block_len * config_.kv_head_num * config_.head_dim) +
                          (k * 32 + m) * config_.kv_head_num * config_.head_dim + head_id * config_.head_dim + l] =
                      GGML_FP32_TO_FP16(block_fp32[m]);
                }
              }
            }
          }
        }
      },
      nullptr);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;
}

void KVCache::update_kvcache_fp16(const ggml_fp16_t* k_in, const ggml_fp16_t* v_in, int layer_id, int* block_table,
                                  int batch_size, int max_block_num, int* cache_seqlens, int q_len,
                                  WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  layer_id_ = layer_id;
  k_data_ = const_cast<uint16_t*>(k_in);
  v_data_ = const_cast<uint16_t*>(v_in);
  // Each task updates the k cache and v cache of a certain header
  backend->do_work_stealing_job(
      batch_size * config_.kv_head_num * q_len, nullptr,
      [&](int task_id) {
        int batch_id = task_id / (config_.kv_head_num * q_len);
        int head_id = task_id / q_len % config_.kv_head_num;
        int seq_len = cache_seqlens[batch_id] + task_id % q_len;
        int q_offset = task_id % q_len;

        int block_id = seq_len / config_.block_len;
        int block_idx = block_table[batch_id * max_block_num + block_id];
        int pos_in_block = seq_len % config_.block_len;

        if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
          for (int l = 0; l < config_.head_dim; l++) {
            k_cache_fp16_[layer_id_][head_id][block_idx][pos_in_block * config_.head_dim + l] =
                k_data_[batch_id * (q_len * config_.kv_head_num * config_.head_dim) +
                        q_offset * config_.kv_head_num * config_.head_dim + head_id * config_.head_dim + l];
            v_cache_fp16_[layer_id_][head_id][block_idx][l * config_.block_len + pos_in_block] =
                v_data_[batch_id * (q_len * config_.kv_head_num * config_.head_dim) +
                        q_offset * config_.kv_head_num * config_.head_dim + head_id * config_.head_dim + l];
          }
        } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
          std::vector<float> block_fp32(32);
          // fill k_cache_
          for (int l = 0; l < config_.head_dim / 32; l++) {
            block_q4_0 block;
            for (int m = 0; m < 32; m++) {
              block_fp32[m] = GGML_FP16_TO_FP32(k_data_[batch_id * (q_len * config_.kv_head_num * config_.head_dim) +
                                                        head_id * config_.head_dim + l * 32 + m]);
            }
            quantize_row_q4_0(block_fp32.data(), &block, 32);

            k_cache_q4[layer_id_][head_id][block_idx][pos_in_block * config_.head_dim / 32 + l] = block;
          }

          // fill v_cache_
          for (int l = 0; l < config_.head_dim; l++) {
            block_q4_0 block =
                v_cache_q4[layer_id_][head_id][block_idx][l * config_.block_len / 32 + pos_in_block / 32];
            dequantize_row_q4_0(&block, block_fp32.data(), 32);
            block_fp32[pos_in_block % 32] = GGML_FP16_TO_FP32(
                v_data_[batch_id * (q_len * config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l]);
            quantize_row_q4_0(block_fp32.data(), &block, 32);
            v_cache_q4[layer_id_][head_id][block_idx][l * config_.block_len / 32 + pos_in_block / 32] = block;
          }
        } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
          std::vector<float> block_fp32(32);
          // fill k_cache_
          for (int l = 0; l < config_.head_dim / 32; l++) {
            block_q8_0 block;
            for (int m = 0; m < 32; m++) {
              block_fp32[m] = GGML_FP16_TO_FP32(k_data_[batch_id * (q_len * config_.kv_head_num * config_.head_dim) +
                                                        head_id * config_.head_dim + l * 32 + m]);
            }
            quantize_row_q8_0(block_fp32.data(), &block, 32);

            k_cache_q8[layer_id_][head_id][block_idx][pos_in_block * config_.head_dim / 32 + l] = block;
          }

          // fill v_cache_
          for (int l = 0; l < config_.head_dim; l++) {
            block_q8_0 block =
                v_cache_q8[layer_id_][head_id][block_idx][l * config_.block_len / 32 + pos_in_block / 32];
            dequantize_row_q8_0(&block, block_fp32.data(), 32);
            block_fp32[pos_in_block % 32] = GGML_FP16_TO_FP32(
                v_data_[batch_id * (q_len * config_.kv_head_num * config_.head_dim) + head_id * config_.head_dim + l]);
            quantize_row_q8_0(block_fp32.data(), &block, 32);
            v_cache_q8[layer_id_][head_id][block_idx][l * config_.block_len / 32 + pos_in_block / 32] = block;
          }
        }
      },
      nullptr);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;
  // printf("layer %d time of reading KV Cache: %f s\n", layer_id,
  //        duration.count());
}

void KVCache::get_all_kvcache_one_layer(int layer_id, ggml_fp16_t* k_in, ggml_fp16_t* v_in, WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  layer_id_ = layer_id;
  seq_len_ = config_.block_len;
  block_num_ = get_cache_total_block_num();
  k_data_ = reinterpret_cast<uint16_t*>(k_in);
  v_data_ = reinterpret_cast<uint16_t*>(v_in);

  // Each task gets the k cache or v cache of a certain header
  backend->do_work_stealing_job(
      config_.kv_head_num * past_block_num_[layer_id] * 2, nullptr,
      [&](int task_id) {
        std::vector<float> block_fp32(32);
        int head_id = task_id / 2 / past_block_num_[layer_id];
        int block_idx = task_id / 2 % past_block_num_[layer_id];
        if (block_idx >= block_num_) return;

        int max_offset = 0;
        if (task_id & 1) {
          // get k_cache_
          for (int k = 0; k < config_.block_len; k++) {
            if (block_idx * seq_len_ + k >= cache_total_len_) break;
            for (int l = 0; l < config_.head_dim / 32; l++) {
              block_q4_0 block = k_cache_q4[layer_id_][head_id][block_idx][k * config_.head_dim / 32 + l];
              dequantize_row_q4_0(&block, block_fp32.data(), 32);
              for (int m = 0; m < 32; m++) {
                k_data_[(head_id * cache_total_len_ + block_idx * config_.block_len + k) * config_.head_dim + l * 32 +
                        m] = GGML_FP32_TO_FP16(block_fp32[m]);
                max_offset =
                    std::max(max_offset,
                             (int)(head_id * cache_total_len_ + block_idx * config_.block_len + k) * config_.head_dim +
                                 l * 32 + m);
              }
            }
          }
        } else {
          // get v_cache_
          for (int k = 0; k < config_.block_len / 32; k++) {
            for (int l = 0; l < config_.head_dim; l++) {
              block_q4_0 block = v_cache_q4[layer_id_][head_id][block_idx][l * config_.block_len / 32 + k];
              dequantize_row_q4_0(&block, block_fp32.data(), 32);
              for (int m = 0; m < 32; m++) {
                if (block_idx * seq_len_ + k * 32 + m >= cache_total_len_) break;
                v_data_[(head_id * cache_total_len_ + block_idx * config_.block_len + k * 32 + m) * config_.head_dim +
                        l] = GGML_FP32_TO_FP16(block_fp32[m]);
                max_offset = std::max(
                    max_offset,
                    (int)((head_id * cache_total_len_ + block_idx * config_.block_len + k * 32 + m) * config_.head_dim +
                          l));
              }
            }
          }
        }
      },
      nullptr);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;
  // printf("layer %d block num %d time of reading all KV Cache: %f s\n",
  //        layer_id, block_num_, duration.count());
}


================================================
FILE: kt-kernel/operators/kvcache/kvcache_utils.cpp
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include <chrono>

#include "ggml-impl.h"
#include "kvcache.h"

std::string ggml_type_to_string(ggml_type type) {
  switch (type) {
    case GGML_TYPE_F32:
      return "GGML_TYPE_F32";
    case GGML_TYPE_F16:
      return "GGML_TYPE_F16";
    case GGML_TYPE_Q4_0:
      return "GGML_TYPE_Q4_0";
    case GGML_TYPE_Q8_0:
      return "GGML_TYPE_Q8_0";
  }
  return "UNDIFINED";
}
std::string AnchorTypeToString(AnchorType type) {
  switch (type) {
    case AnchorType::DYNAMIC:
      return "DYNAMIC";
    case AnchorType::BLOCK_MEAN:
      return "BLOCK_MEAN";
    case AnchorType::BLOCK_MAX:
      return "BLOCK_MAX";
    case AnchorType::FIXED_ANCHOR:
      return "FIXED_ANCHOR";
    case AnchorType::QUEST:
      return "QUEST";
  }
  return "UNDIFINED";
}
std::string RetrievalTypeToString(RetrievalType type) {
  switch (type) {
    case RetrievalType::LAYER:
      return "SHARED";
    case RetrievalType::KVHEAD:
      return "SEPARATE";
    case RetrievalType::QHEAD:
      return "INDIVIDUAL";
  }
  return "UNDIFINED";
}
KVCacheConfig::KVCacheConfig(int layer_num, int kv_head_num, int q_head_num, int head_dim, int block_len,
                             int anchor_num, AnchorType anchor_type, ggml_type kv_type, RetrievalType retrieval_type,
                             int layer_step, int token_step, int layer_offset, int max_block_num, int max_batch_size,
                             int max_thread_num)
    : layer_num(layer_num),
      kv_head_num(kv_head_num),
      q_head_num(q_head_num),
      head_dim(head_dim),
      block_len(block_len),
      anchor_num(anchor_num),
      anchor_type(anchor_type),
      kv_type(kv_type),
      retrieval_type(retrieval_type),
      layer_step(layer_step),
      token_step(token_step),
      layer_offset(layer_offset),
      max_block_num(max_block_num),
      max_batch_size(max_batch_size),
      max_thread_num(max_thread_num) {
  printf(
      "layer_num: %d, kv_head_num: %d, q_head_num: %d, head_dim: %d, "
      "block_len: %d, anchor_num: %d, anchor_type: %s, kv_type: %s, "
      "retrieval_type: %s, layer_step: %d, token_step: %d, layer_offset: %d,"
      "max_block_num: %d, max_batch_size: %d, max_thread_num: %d\n",
      layer_num, kv_head_num, q_head_num, head_dim, block_len, anchor_num, AnchorTypeToString(anchor_type).c_str(),
      ggml_type_to_string(kv_type).c_str(), RetrievalTypeToString(retrieval_type).c_str(), layer_step, token_step,
      layer_offset, max_block_num, max_batch_size, max_thread_num);
  assert(q_head_num % kv_head_num == 0);
}
KVCache::KVCache(KVCacheConfig config) {
  this->config_ = config;

  n_gqa_ = config_.q_head_num / config_.kv_head_num;
  if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
    // TODO: Elegant implement
    k_cache_fp16_.resize(config_.layer_num);
    v_cache_fp16_.resize(config_.layer_num);
    selected_blocks_num_history_.resize(config_.layer_num / config_.layer_step);
    if (config_.retrieval_type == RetrievalType::LAYER) {
      selected_blocks_history_.resize(config_.layer_num / config_.layer_step);
    } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
      selected_blocks_history_kvhead_.resize(config_.layer_num / config_.layer_step);
    } else if (config_.retrieval_type == RetrievalType::QHEAD) {
    }
  } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
    k_cache_q4.resize(config.layer_num);
    v_cache_q4.resize(config.layer_num);
  } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
    k_cache_q8.resize(config.layer_num);
    v_cache_q8.resize(config.layer_num);
  } else {
    assert(false);
  }
  anchor_.resize(config.layer_num * config.max_block_num * config.anchor_num * config.q_head_num * config.head_dim);
  importance_.resize(config.layer_num);
  past_block_num_.resize(config.layer_num);
  for (int i = 0; i < config.layer_num; i++) {
    past_block_num_[i] = 0;
  }

  ThreadResize(config.max_thread_num);
  BatchResize(config.max_batch_size);
  BlockResize(config.max_block_num);
  q_fp32.resize(n_gqa_ * config.head_dim);
}

void KVCache::ThreadResize(int thread_num) {
  thread_local_output_q8_0_.resize(thread_num);
  thread_local_attn_score_.resize(thread_num);
  thread_local_output_fp32_.resize(thread_num);
  thread_local_attn_lse_.resize(thread_num);
  thread_local_cur_output_fp32_.resize(thread_num);
  thread_local_cur_attn_lse_.resize(thread_num);
  thread_local_draft_.resize(thread_num);
  thread_cur_head_idx_.resize(thread_num);
  thread_local_attn_mask_.resize(thread_num);
  for (int i = 0; i < thread_num; i++) {
    thread_local_output_q8_0_[i].resize(n_gqa_ * config_.head_dim / QK8_0);
    thread_local_attn_score_[i].resize(n_gqa_ * config_.block_len);
    thread_local_output_fp32_[i].resize(n_gqa_ * config_.head_dim);
    thread_local_attn_lse_[i].resize(n_gqa_);
    thread_local_cur_output_fp32_[i].resize(n_gqa_ * config_.head_dim);
    thread_local_cur_attn_lse_[i].resize(n_gqa_);
    thread_local_draft_[i].resize(2 * n_gqa_ * config_.block_len + 6 * n_gqa_ * config_.head_dim +
                                  2 * config_.block_len * config_.head_dim +
                                  config_.block_len * config_.head_dim / QK4_0);
    thread_local_attn_mask_[i].resize(config_.block_len / 8);
  }
}
void KVCache::BatchResize(int batch_size) {
  mutex_.resize(batch_size);
  q_q8_0_.resize(batch_size);
  q_fp32_.resize(batch_size);
  output_fp32_.resize(batch_size);
  attn_lse_.resize(batch_size);
  block_lse_.resize(batch_size);
  attn_sparsity_.resize(batch_size);

  if (config_.retrieval_type == RetrievalType::LAYER) {
    block_table_before_retrieval_.resize(batch_size);
    block_table_after_retrieval_.resize(batch_size);

    for (int i = 0; i < config_.layer_num / config_.layer_step; i++) {
      selected_blocks_history_[i].resize(batch_size);
    }

  } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
    block_table_before_retrieval_kvhead_.resize(batch_size);
    block_table_after_retrieval_kvhead_.resize(batch_size);
    for (int i = 0; i < config_.layer_num / config_.layer_step; i++) {
      selected_blocks_history_kvhead_[i].resize(batch_size);
    }
  } else if (config_.retrieval_type == RetrievalType::QHEAD) {
    block_table_before_retrieval_qhead_.resize(batch_size);
    block_table_after_retrieval_qhead_.resize(batch_size);
  }
  cache_seqlens_.resize(batch_size);
  if (config_.retrieval_type == RetrievalType::LAYER) {
    block_similar_.resize(batch_size);
  } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
    block_similar_kv_head_.resize(batch_size);
  } else if (config_.retrieval_type == RetrievalType::QHEAD) {
    block_similar_q_head_.resize(batch_size);
  }
  for (int i = 0; i < batch_size; i++) {
    top_similar_block_.resize(batch_size);

    mutex_[i].resize(config_.kv_head_num);
    q_q8_0_[i].resize(config_.kv_head_num);
    q_fp32_[i].resize(config_.kv_head_num);
    output_fp32_[i].resize(config_.kv_head_num);
    attn_lse_[i].resize(config_.kv_head_num);

    for (int j = 0; j < config_.kv_head_num; j++) {
      if (!mutex_[i][j]) {
        mutex_[i][j] = std::make_unique<std::mutex>();
      }
      q_q8_0_[i][j].resize(n_gqa_ * config_.head_dim / QK8_0);
      q_fp32_[i][j].resize(n_gqa_ * config_.head_dim);
      output_fp32_[i][j].resize(n_gqa_ * config_.head_dim);
      attn_lse_[i][j].resize(n_gqa_);
    }
  }
  avg_q.resize(batch_size);
  avg_q_fp16.resize(batch_size);
  for (int i = 0; i < batch_size; i++) {
    attn_sparsity_[i].resize(config_.q_head_num);
    avg_q[i].resize(config_.q_head_num * config_.head_dim);
    avg_q_fp16[i].resize(config_.q_head_num * config_.head_dim);
  }
}

void KVCache::BlockResize(int max_block_num) {
  sin_.resize(max_block_num * config_.block_len);
  cos_.resize(max_block_num * config_.block_len);
  for (int i = 0; i < max_block_num * config_.block_len; i++) {
    sin_[i].resize(config_.head_dim);
    cos_[i].resize(config_.head_dim);
  }

  for (int i = 0; i < config_.layer_num / config_.layer_step; i++) {
    for (int j = 0; j < config_.max_batch_size; j++) {
      if (config_.retrieval_type == RetrievalType::LAYER) {
        selected_blocks_history_[i][j].resize(max_block_num);
      } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
        selected_blocks_history_kvhead_[i][j].resize(max_block_num);
        for (int k = 0; k < config_.max_block_num; k++) {
          selected_blocks_history_kvhead_[i][j][k].resize(config_.kv_head_num);
        }
      } else if (config_.retrieval_type == RetrievalType::QHEAD) {
      }
    }
  }

  for (int layer_id = 0; layer_id < config_.layer_num; layer_id++) {
    importance_[layer_id].resize(max_block_num);

    if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
      // TODO: Elegant implement
      k_cache_fp16_[layer_id].resize(config_.kv_head_num);
      v_cache_fp16_[layer_id].resize(config_.kv_head_num);

      for (int i = 0; i < config_.kv_head_num; i++) {
        k_cache_fp16_[layer_id][i].resize(max_block_num);
        v_cache_fp16_[layer_id][i].resize(max_block_num);

        for (int j = 0; j < max_block_num; j++) {
          k_cache_fp16_[layer_id][i][j].resize(config_.block_len * config_.head_dim);
          v_cache_fp16_[layer_id][i][j].resize(config_.block_len * config_.head_dim);
        }
      }

    } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
      k_cache_q4[layer_id].resize(config_.kv_head_num);
      v_cache_q4[layer_id].resize(config_.kv_head_num);
      for (int i = 0; i < config_.kv_head_num; i++) {
        k_cache_q4[layer_id][i].resize(max_block_num);
        v_cache_q4[layer_id][i].resize(max_block_num);

        for (int j = 0; j < max_block_num; j++) {
          k_cache_q4[layer_id][i][j].resize(config_.block_len * config_.head_dim / 32);
          v_cache_q4[layer_id][i][j].resize(config_.block_len * config_.head_dim / 32);
        }
      }
    } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
      k_cache_q8[layer_id].resize(config_.kv_head_num);
      v_cache_q8[layer_id].resize(config_.kv_head_num);
      for (int i = 0; i < config_.kv_head_num; i++) {
        k_cache_q8[layer_id][i].resize(max_block_num);
        v_cache_q8[layer_id][i].resize(max_block_num);

        for (int j = 0; j < max_block_num; j++) {
          k_cache_q8[layer_id][i][j].resize(config_.block_len * config_.head_dim / 32);
          v_cache_q8[layer_id][i][j].resize(config_.block_len * config_.head_dim / 32);
        }
      }
    } else {
      assert(false);
    }
    for (int i = 0; i < config_.max_batch_size; i++) {
      if (config_.retrieval_type == RetrievalType::LAYER) {
        block_similar_[i].resize(max_block_num);
        block_table_before_retrieval_[i].resize(max_block_num);
        block_table_after_retrieval_[i].resize(max_block_num);
      } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
        block_similar_kv_head_[i].resize(max_block_num);
        block_table_before_retrieval_kvhead_[i].resize(max_block_num);
        block_table_after_retrieval_kvhead_[i].resize(max_block_num);
        for (int j = 0; j < max_block_num; j++) {
          block_similar_kv_head_[i][j].resize(config_.kv_head_num);
          block_table_before_retrieval_kvhead_[i][j].resize(config_.kv_head_num);
          block_table_after_retrieval_kvhead_[i][j].resize(config_.kv_head_num);
        }
      } else if (config_.retrieval_type == RetrievalType::QHEAD) {
        block_similar_q_head_[i].resize(max_block_num);
        block_table_before_retrieval_qhead_[i].resize(max_block_num);
        block_table_after_retrieval_qhead_[i].resize(max_block_num);
        for (int j = 0; j < max_block_num; j++) {
          block_similar_q_head_[i][j].resize(config_.q_head_num);
          block_table_before_retrieval_qhead_[i][j].resize(config_.q_head_num);
          block_table_after_retrieval_qhead_[i][j].resize(config_.q_head_num);
        }
      }
      block_lse_[i].resize(max_block_num);
      for (int j = 0; j < max_block_num; j++) {
        block_lse_[i][j].resize(config_.q_head_num);
      }
    }

    for (int i = 0; i < max_block_num; i++) {
      importance_[layer_id][i].resize(config_.block_len);
      for (int j = 0; j < config_.block_len; j++) {
        importance_[layer_id][i][j].resize(config_.q_head_num);
      }
    }
  }
}

void KVCache::calc_anchor_all_layers(int* block_table, int* cache_seqlens, int batch_size, int max_block_num,
                                     WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  // Each task updates the importance of a certain block
  seq_len_ = config_.block_len;
  backend->do_work_stealing_job(
      config_.layer_num * batch_size * max_block_num, nullptr,
      [&](int task_id) {
        int layer_id = task_id / (batch_size * max_block_num);
        int batch_id = (task_id / max_block_num) % batch_size;
        int block_id = task_id % max_block_num;
        // If the block is out of the sequence length, skip it. In
        // particular, the last block of the sequence that is shorter than
        // the block length should be skipped.

        if (cache_seqlens[batch_id] / config_.block_len < block_id) {
          return;
        }
        int block_idx = block_table[batch_id * max_block_num + block_id];

        std::vector<float> block_fp32(32);
        if (config_.anchor_type == AnchorType::DYNAMIC) {
          // clear anchor_
          for (int anchor_id = 0; anchor_id < 1; anchor_id++) {
            for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
              for (int l = 0; l < config_.head_dim; l++) {
                anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim +
                        block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                        anchor_id * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = 0;
              }
            }
          }

          // find top anchor_num importances and their corresponding
          // positions in the importance_ tensor
          // TODO: Move top_importances to the class member to avoid
          // repeated memory allocation
          std::priority_queue<std::pair<float, std::pair<int, int>>, std::vector<std::pair<float, std::pair<int, int>>>,
                              std::greater<>>
              top_importances;
          for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
            for (int k = 0; k < seq_len_; k++) {
              top_importances.push(std::make_pair(GGML_FP16_TO_FP32(importance_[layer_id][block_idx][k][head_id]),
                                                  std::make_pair(block_idx, k)));
              // TODO: change to config_ item
              if (top_importances.size() > config_.anchor_num) {
                top_importances.pop();
              }
            }

            // fill anchor_

            for (int l = 0; l < config_.head_dim; l++) {
              anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim +
                      block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                      0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = 0;
            }
            for (int k = 0; k < config_.anchor_num; k++) {
              int top_indice = top_importances.top().second.second;
              int top_block_idx = top_importances.top().second.first;

              if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                for (int l = 0; l < config_.head_dim; l++) {
                  anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                              config_.head_dim +
                          top_block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                          0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] =
                      GGML_FP32_TO_FP16(
                          GGML_FP16_TO_FP32(
                              anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                          config_.head_dim +
                                      top_block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                                      0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l]) +
                          GGML_FP16_TO_FP32(k_cache_fp16_[layer_id][head_id / n_gqa_][top_block_idx]
                                                         [top_indice * config_.head_dim + l]));
                }

              } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                for (int l = 0; l < config_.head_dim / 32; l++) {
                  block_q4_0 block =
                      k_cache_q4[layer_id][head_id / n_gqa_][top_block_idx][top_indice * config_.head_dim / 32 + l];
                  dequantize_row_q4_0(&block, block_fp32.data(), 32);
                  for (int m = 0; m < 32; m++) {
                    anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                config_.head_dim +
                            top_block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                            0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m] =
                        GGML_FP32_TO_FP16(
                            block_fp32[m] / 4 +
                            GGML_FP16_TO_FP32(
                                anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                            config_.head_dim +
                                        top_block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                                        0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim +
                                        l * 32 + m]));
                  }
                }
              } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                for (int l = 0; l < config_.head_dim / 32; l++) {
                  block_q8_0 block =
                      k_cache_q8[layer_id][head_id / n_gqa_][top_block_idx][top_indice * config_.head_dim / 32 + l];
                  dequantize_row_q8_0(&block, block_fp32.data(), 32);
                  for (int m = 0; m < 32; m++) {
                    anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                config_.head_dim +
                            top_block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                            0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m] =
                        GGML_FP32_TO_FP16(
                            block_fp32[m] / 4 +
                            GGML_FP16_TO_FP32(
                                anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                            config_.head_dim +
                                        top_block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                                        0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim +
                                        l * 32 + m]));
                  }
                }
              }
              top_importances.pop();
            }
          }
        } else if (config_.anchor_type == AnchorType::BLOCK_MEAN) {
          // clear anchor_
          for (int anchor_id = 0; anchor_id < config_.anchor_num; anchor_id++) {
            for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
              for (int l = 0; l < config_.head_dim; l++) {
                anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim +
                        block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                        anchor_id * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = 0;
              }
            }
          }

          // fill anchor_
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
              for (int k = 0; k < config_.block_len; k++) {
                for (int l = 0; l < config_.head_dim; l++) {
                  anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                              config_.head_dim +
                          block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                          0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] =
                      GGML_FP32_TO_FP16(
                          GGML_FP16_TO_FP32(
                              anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                          config_.head_dim +
                                      block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                                      0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l]) +
                          GGML_FP16_TO_FP32(
                              k_cache_fp16_[layer_id][head_id / n_gqa_][block_idx][k * config_.head_dim + l]) /
                              config_.block_len);
                }
              }
            }
          }
        } else if (config_.anchor_type == AnchorType::BLOCK_MAX) {
          // clear anchor_
          for (int anchor_id = 0; anchor_id < config_.anchor_num; anchor_id++) {
            for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
              for (int l = 0; l < config_.head_dim; l++) {
                anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim +
                        block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                        anchor_id * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = 0;
              }
            }
          }

          // fill anchor_
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
              for (int k = 0; k < config_.block_len; k++) {
                for (int l = 0; l < config_.head_dim; l++) {
                  anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                              config_.head_dim +
                          block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                          0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] =
                      GGML_FP32_TO_FP16(std::max(
                          GGML_FP16_TO_FP32(
                              anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                          config_.head_dim +
                                      block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                                      0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l]),
                          GGML_FP16_TO_FP32(
                              k_cache_fp16_[layer_id][head_id / n_gqa_][block_idx][k * config_.head_dim + l])));
                }
              }
            }
          }
        } else if (config_.anchor_type == AnchorType::FIXED_ANCHOR) {
          // clear anchor_
          for (int anchor_id = 0; anchor_id < 1; anchor_id++) {
            for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
              for (int l = 0; l < config_.head_dim; l++) {
                anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim +
                        block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                        anchor_id * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = 0;
              }
            }
          }

          // fill anchor_
          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            int stride = config_.block_len / config_.anchor_num;
            for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
              for (int k = 0, tot = 0; k < config_.block_len, tot < config_.anchor_num; k += stride, tot++) {
                for (int l = 0; l < config_.head_dim; l++) {
                  anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                              config_.head_dim +
                          block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                          0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] =
                      GGML_FP32_TO_FP16(
                          GGML_FP16_TO_FP32(
                              anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                          config_.head_dim +
                                      block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                                      0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l]) +
                          GGML_FP16_TO_FP32(
                              k_cache_fp16_[layer_id][head_id / n_gqa_][block_idx][k * config_.head_dim + l]) /
                              config_.anchor_num);
                }
              }
            }
          }

        } else if (config_.anchor_type == AnchorType::QUEST) {
          // clear anchor_
          for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
            for (int l = 0; l < config_.head_dim; l++) {
              anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim +
                      block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                      1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] =
                  GGML_FP32_TO_FP16(std::numeric_limits<float>::max());

              anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim +
                      block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                      0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] =
                  GGML_FP32_TO_FP16(std::numeric_limits<float>::min());
            }
          }

          // fill anchor_

          if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            for (int indice = 0; indice < seq_len_; indice++) {
              for (int head_id = 0; head_id < config_.kv_head_num; head_id++) {
                for (int l = 0; l < config_.head_dim; l++) {
                  anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                              config_.head_dim +
                          block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                          0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] =
                      GGML_FP32_TO_FP16(std::max(
                          GGML_FP16_TO_FP32(k_cache_fp16_[layer_id][head_id][block_idx][indice * config_.head_dim + l]),
                          GGML_FP16_TO_FP32(
                              anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                          config_.head_dim +
                                      block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                                      0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l])));

                  anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                              config_.head_dim +
                          block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                          1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] =
                      GGML_FP32_TO_FP16(std::min(
                          GGML_FP16_TO_FP32(k_cache_fp16_[layer_id][head_id][block_idx][indice * config_.head_dim + l]),
                          GGML_FP16_TO_FP32(
                              anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                          config_.head_dim +
                                      block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                                      1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l])));
                }
              }
            }

          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            for (int indice = 0; indice < seq_len_; indice++) {
              for (int head_id = 0; head_id < config_.kv_head_num; head_id++) {
                for (int l = 0; l < config_.head_dim / 32; l++) {
                  block_q4_0 block = k_cache_q4[layer_id][head_id][block_idx][indice * config_.head_dim / 32 + l];
                  dequantize_row_q4_0(&block, block_fp32.data(), 32);

                  for (int m = 0; m < 32; m++) {
                    for (int gqa_idx = 0; gqa_idx < n_gqa_; gqa_idx++) {
                      anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                  config_.head_dim +
                              block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                              0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m] =
                          GGML_FP32_TO_FP16(std::max(
                              block_fp32[m],
                              GGML_FP16_TO_FP32(
                                  anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                              config_.head_dim +
                                          block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                                          0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim +
                                          l * 32 + m])));

                      anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                  config_.head_dim +
                              block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                              1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m] =
                          GGML_FP32_TO_FP16(std::min(
                              block_fp32[m],
                              GGML_FP16_TO_FP32(
                                  anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                              config_.head_dim +
                                          block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                                          1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim +
                                          l * 32 + m])));
                    }
                  }
                }
              }
            }
          } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            for (int indice = 0; indice < seq_len_; indice++) {
              for (int head_id = 0; head_id < config_.kv_head_num; head_id++) {
                for (int l = 0; l < config_.head_dim / 32; l++) {
                  block_q8_0 block = k_cache_q8[layer_id][head_id][block_idx][indice * config_.head_dim / 32 + l];
                  dequantize_row_q8_0(&block, block_fp32.data(), 32);

                  for (int m = 0; m < 32; m++) {
                    for (int gqa_idx = 0; gqa_idx < n_gqa_; gqa_idx++) {
                      anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                  config_.head_dim +
                              block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                              0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m] =
                          GGML_FP32_TO_FP16(std::max(
                              block_fp32[m],
                              GGML_FP16_TO_FP32(
                                  anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                              config_.head_dim +
                                          block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                                          0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim +
                                          l * 32 + m])));

                      anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                  config_.head_dim +
                              block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                              1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m] =
                          GGML_FP32_TO_FP16(std::min(
                              block_fp32[m],
                              GGML_FP16_TO_FP32(
                                  anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num *
                                              config_.head_dim +
                                          block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim +
                                          1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim +
                                          l * 32 + m])));
                    }
                  }
                }
              }
            }
          }
        } else {
          assert(false);
        }
      },
      nullptr);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;
  //    printf("time of calc_anchor_all_layers: %f s\n", duration.count());
}

void KVCache::clear_importance_all_layers(int* block_table, int* cache_seqlens, int batch_size, int max_block_num,
                                          WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  // Each task updates the importance of a certain block
  seq_len_ = config_.block_len;
  backend->do_work_stealing_job(
      config_.layer_num * batch_size * max_block_num, nullptr,
      [&](int task_id) {
        int layer_id = task_id / (batch_size * max_block_num);
        int batch_id = (task_id / max_block_num) % batch_size;
        int block_id = task_id % max_block_num;
        // If the block is out of the sequence length, skip it. In
        // particular, the last block of the sequence that is shorter than
        // the block length should be skipped.

        if (cache_seqlens[batch_id] / config_.block_len < block_id) {
          return;
        }
        int block_idx = block_table[batch_id * max_block_num + block_id];

        if (config_.anchor_type == AnchorType::DYNAMIC) {
          // clear anchor_
          for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
            for (int l = 0; l < config_.block_len; l++) {
              importance_[layer_id][block_idx][l][head_id] = 0;
            }
          }
        }
      },
      nullptr);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;
  //    printf("time of clear_importance_all_layerssssss: %f s\n",
  //    duration.count());
}

void KVCache::clear_kvcache_all_layers(int* block_table, int* cache_seqlens, int batch_size, int max_block_num,
                                       WorkerPool* backend) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  // Each task updates the importance of a certain block
  seq_len_ = config_.block_len;
  backend->do_work_stealing_job(
      config_.layer_num * batch_size * max_block_num * config_.kv_head_num, nullptr,
      [&](int task_id) {
        int layer_id = task_id / (batch_size * max_block_num * config_.kv_head_num);
        int batch_id = (task_id / (max_block_num * config_.kv_head_num)) % batch_size;
        int block_id = task_id / config_.kv_head_num % max_block_num;
        int head_id = task_id % config_.kv_head_num;
        // If the block is out of the sequence length, skip it. In
        // particular, the last block of the sequence that is shorter than
        // the block length should be skipped.
        if (cache_seqlens[batch_id] / config_.block_len < block_id) {
          return;
        }
        int block_idx = block_table[batch_id * max_block_num + block_id];

        if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
          for (int l = 0; l < config_.block_len * config_.head_dim; l++) {
            k_cache_fp16_[layer_id][head_id][block_idx][l] = 0;
            v_cache_fp16_[layer_id][head_id][block_idx][l] = 0;
          }
        } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
          for (int l = 0; l < config_.block_len * config_.head_dim / 32; l++) {
            k_cache_q4[layer_id][head_id][block_idx][l].d = 0;
            v_cache_q4[layer_id][head_id][block_idx][l].d = 0;
          }
        } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
          for (int l = 0; l < config_.block_len * config_.head_dim / 32; l++) {
            k_cache_q8[layer_id][head_id][block_idx][l].d = 0;
            v_cache_q8[layer_id][head_id][block_idx][l].d = 0;
          }
        }
      },
      nullptr);

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;
  //    printf("time of clear_kvcache_all_layers: %f s\n", duration.count());
}

void KVCache::get_sincos(ggml_fp16_t* sin, ggml_fp16_t* cos, int seqlen) {
  // Timer start
  auto start = std::chrono::high_resolution_clock::now();

  const uint16_t* sin_data = const_cast<const uint16_t*>(sin);
  const uint16_t* cos_data = const_cast<const uint16_t*>(cos);

  for (int i = 0; i < seqlen; i++) {
    for (int j = 0; j < config_.head_dim; j++) {
      sin_[i][j] = sin_data[i * config_.head_dim + j];
      cos_[i][j] = cos_data[i * config_.head_dim + j];
    }
  }

  // Timer end
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> duration = end - start;
  printf("time of get_sincos: %f s\n", duration.count());
}

void ggml_vec_scale_f32(const int n, float* y, const float v) {
#if defined(GGML_USE_ACCELERATE)
  vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
  const int np = (n & ~(GGML_F32_STEP - 1));

  GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

  GGML_F32_VEC ay[GGML_F32_ARR];

  for (int i = 0; i < np; i += GGML_F32_STEP) {
    for (int j = 0; j < GGML_F32_ARR; j++) {
      ay[j] = GGML_F32_VEC_LOAD(y + i + j * GGML_F32_EPR);
      ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

      GGML_F32_VEC_STORE(y + i + j * GGML_F32_EPR, ay[j]);
    }
  }

  // leftovers
  for (int i = np; i < n; ++i) {
    y[i] *= v;
  }
#else
  // scalar
  for (int i = 0; i < n; ++i) {
    y[i] *= v;
  }
#endif
}

================================================
FILE: kt-kernel/operators/llamafile/conversion.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-12 10:07:58
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:34:55
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_CONVERSION_H
#define CPUINFER_CONVERSION_H

#include <memory.h>

#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"

inline void to_float(const void* input, float* output, int size, ggml_type type) {
  if (type == ggml_type::GGML_TYPE_F32) {
    memcpy(output, input, size * sizeof(float));
  } else {
    if (type == ggml_type::GGML_TYPE_Q8_K) {
      dequantize_row_q8_K((block_q8_K*)input, output, size);
    } else {
      ggml_internal_get_type_traits(type).to_float(input, output, size);
    }
  }
}

inline void from_float(const float* input, void* output, int size, ggml_type type) {
  if (type == ggml_type::GGML_TYPE_F32) {
    memcpy(output, input, size * sizeof(float));
  } else {
    ggml_internal_get_type_traits(type).from_float(input, output, size);
  }
}

#endif

================================================
FILE: kt-kernel/operators/llamafile/linear.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-12 10:07:58
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-15 07:45:18
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "linear.h"

Linear::Linear(LinearConfig config) {
  config_ = config;
  proj_ = config_.proj;

  MemoryRequest mem_requests;
  mem_requests.append_pointer(&input_fp32_, sizeof(float) * config_.group_max_len * config_.input_size);
  mem_requests.append_pointer(&proj_input_,
                              config_.group_max_len * config_.input_size *
                                  ggml_type_size(ggml_internal_get_type_traits(config_.proj_type).vec_dot_type) /
                                  ggml_blck_size(ggml_internal_get_type_traits(config_.proj_type).vec_dot_type));
  mem_requests.append_pointer(&proj_output_, sizeof(float) * config_.group_max_len * config_.output_size);
  shared_mem_buffer.alloc(this, mem_requests);
}

Linear::~Linear() {}

void Linear::warm_up(WorkerPool* backend) {
  std::vector<float> input_fp32(config_.input_size);
  std::vector<uint8_t> input(config_.input_size * ggml_type_size(config_.hidden_type) /
                             ggml_blck_size(config_.hidden_type));
  std::vector<uint8_t> output(config_.output_size * ggml_type_size(config_.hidden_type) /
                              ggml_blck_size(config_.hidden_type));
  for (int i = 0; i < config_.input_size; i++) {
    input_fp32[i] = 0;
  }
  from_float(input_fp32.data(), input.data(), config_.input_size, config_.hidden_type);
  forward_many(1, input.data(), output.data(), backend);
}

void Linear::forward_many(int qlen, const void* input, void* output, WorkerPool* backend) {
  const void* proj_input_ptr;
  if (config_.hidden_type == ggml_internal_get_type_traits(config_.proj_type).vec_dot_type) {
    proj_input_ptr = input;
  } else {
    to_float(input, input_fp32_, qlen * config_.input_size, config_.hidden_type);
    from_float(input_fp32_, proj_input_, qlen * config_.input_size,
               ggml_internal_get_type_traits(config_.proj_type).vec_dot_type);
    proj_input_ptr = proj_input_;
  }
  int nth = config_.output_size / config_.stride;
  backend->do_work_stealing_job(
      nth, nullptr,
      [&](int task_id) {
        int ith = task_id;
        void* proj_ptr = (uint8_t*)proj_ + ith * config_.stride * config_.input_size *
                                               ggml_type_size(config_.proj_type) / ggml_blck_size(config_.proj_type);
        float* proj_output_ptr = proj_output_ + ith * config_.stride;
        llamafile_sgemm(config_.stride, qlen, config_.input_size / ggml_blck_size(config_.proj_type), proj_ptr,
                        config_.input_size / ggml_blck_size(config_.proj_type), proj_input_ptr,
                        config_.input_size / ggml_blck_size(config_.proj_type), proj_output_ptr, config_.output_size, 0,
                        1, GGML_TASK_TYPE_COMPUTE, config_.proj_type,
                        ggml_internal_get_type_traits(config_.proj_type).vec_dot_type, GGML_TYPE_F32,
                        GGML_PREC_DEFAULT);
        if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
          for (int i = 0; i < qlen; i++) {
            float* output_fp32_ptr = proj_output_ + i * config_.output_size + ith * config_.stride;
            void* output_ptr =
                (uint8_t*)output +
                i * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) +
                ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
            from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
          }
        }
      },
      nullptr);
  if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
    from_float(proj_output_, output, qlen * config_.output_size, config_.hidden_type);
  }
}

void Linear::forward(int qlen, const void* input, void* output, WorkerPool* backend) {
  if (qlen <= 0) {
    return;
  }
  int forward_len = std::min(qlen, config_.group_max_len);
  forward_many(forward_len, input, output, backend);
  forward(qlen - forward_len,
          (uint8_t*)input + forward_len * config_.input_size * ggml_type_size(config_.hidden_type) /
                                ggml_blck_size(config_.hidden_type),
          (uint8_t*)output + forward_len * config_.output_size * ggml_type_size(config_.hidden_type) /
                                 ggml_blck_size(config_.hidden_type),
          backend);
}

================================================
FILE: kt-kernel/operators/llamafile/linear.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-12 10:07:58
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:35:00
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_LINEAR_H
#define CPUINFER_OPERATOR_LINEAR_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/shared_mem_buffer.h"
#include "../../cpu_backend/worker_pool.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

struct LinearConfig {
  int input_size;
  int output_size;
  int stride;
  int group_max_len;
  void* proj;
  ggml_type proj_type;
  ggml_type hidden_type;

  LinearConfig() {}

  LinearConfig(int input_size, int output_size, int stride, int group_max_len, void* proj, ggml_type proj_type,
               ggml_type hidden_type)
      : input_size(input_size),
        output_size(output_size),
        stride(stride),
        group_max_len(group_max_len),
        proj(proj),
        proj_type(proj_type),
        hidden_type(hidden_type) {}
};

class Linear {
 public:
  Linear(LinearConfig);
  ~Linear();
  void warm_up(WorkerPool* backend);
  void forward_many(int qlen, const void* input, void* output, WorkerPool* backend);
  void forward(int qlen, const void* input, void* output, WorkerPool* backend);

 private:
  LinearConfig config_;
  void* proj_;  // [output_size * input_size ( /32 if quantized)]

  float* input_fp32_;    // [group_max_len * input_size]
  uint8_t* proj_input_;  // [group_max_len * input_size *
                         // ggml_type_size(ggml_internal_get_type_traits(proj_type).vec_dot_type) /
                         // ggml_blck_size(ggml_internal_get_type_traits(proj_type).vec_dot_type)]
  float* proj_output_;   // [group_max_len * output_size]
};

#endif

================================================
FILE: kt-kernel/operators/llamafile/mla.hpp
================================================
// #ifndef LLAMAFILE_MLA_HPP
// #define LLAMAFILE_MLA_HPP

// #include "../common.hpp"
// #include "../mla-tp.hpp"
// #include "../rms-norm.hpp"
// #include "../rope.hpp"
// #include "ggml-quants.h"
// #include "ggml.h"
// #include "llamafile/sgemm.h"

// #include <algorithm>
// #include <cstddef>
// #include <utility>
// #include <vector>

// #define DIRECT_OR_POOL_BY(what, threshold, var, fn) \
//   do { \
//     if ((what) < (threshold)) { \
//       for (int i = 0; i < (var); i++) { \
//         (fn)(i); \
//       } \
//     } else { \
//       pool->do_work_stealing_job((var), nullptr, (fn), nullptr); \
//     } \
//   } while (0)

// #define VEC_DOT_TYPE(type) (ggml_internal_get_type_traits((ggml_type)(type)).vec_dot_type)
// #define QUANT_BLCK_COUNT(size, type) (((size_t)(size)) / (ggml_blck_size((ggml_type)(type))))
// #define QUANT_BLCK_SIZE(size, type) (QUANT_BLCK_COUNT(size, type) * (ggml_type_size((ggml_type)(type))))
// #define QUANT_OFFSET(ptr, type, n, n_elements) \
//   (offset_pointer((ptr), (size_t)(n) * QUANT_BLCK_SIZE((n_elements), (type))))

// #define LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(m, n, k, a, a_type, b, b_col, c, c_col) \
//   do { \
//     llamafile_sgemm((m), (n), QUANT_BLCK_COUNT((k), (a_type)), (a), QUANT_BLCK_COUNT((k), (a_type)), \
//                     QUANT_OFFSET((b), VEC_DOT_TYPE((a_type)), (b_col), (k)), \
//                     QUANT_BLCK_COUNT((k), VEC_DOT_TYPE((a_type))), offset_pointer((c), (c_col) * (m) *
//                     sizeof(float)), \
//                     (k), 0, 1, GGML_TASK_TYPE_COMPUTE, (a_type), VEC_DOT_TYPE((a_type)), GGML_TYPE_F32, \
//                     GGML_PREC_DEFAULT); \
//   } while (0)

// #define LLAMAFILE_SGEMM_MATMUL_F32(m, n, k, a, lda, b, ldb, c, ldc) \
//   do { \
//     llamafile_sgemm((m), (n), (k), (a), (lda), (b), (ldb), (c), (ldc), 0, 1, GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F32, \
//                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_PREC_DEFAULT); \
//   } while (0)

// // bool decide_absorb(size_t a,int a_type,size_t b,int b_type,size_t c,int c_type,size_t d,int d_type){
// //   size_t flops1 = ;

// // }

// inline void transpose(void *start, size_t dim0, size_t stride, size_t dim1) {
//   // static_assert(false, "TODO");
// }

// template <RMS_NORM T_RMSNorm = RMSNorm, ROPE_APPLIER T_RopeApplier = Rope, ROPE_ANGLE T_RopeAngle = Yarn>
// class LLAMA_MLA_TP {
// private:
//   GeneralMLAConfig config;
//   int tp_part_idx;
//   std::vector<void *> nope_pages;     // [page_count * page_token_count * nope]
//   std::vector<void *> rope_pages;     // [page_count * page_token_count * nope]

//   // weights
//   void *local_q_a_proj;               // [hidden_size * q_lora_rank]
//   void *local_q_a_norm;               // [q_lora_rank]
//   std::vector<void *> local_q_b_proj; // [num_heads * (nope_size + rope_size))]
//   void *local_kv_a_proj_with_mqa;     // [hidden_size * (kv_lora_rank + rope)]
//   void *local_kv_a_norm_with_mqa;
//   void *local_kv_b_proj;                   // [(num_heads * (nope_size + nope_size) * kv_lora_rank)],
//                                            // q_absorb:   [num_heads * nope_size * kv_lora_rank]
//                                            // out_absorb: [num_heads * nope_size * kv_lora_rank]
//   std::vector<void *> local_k_b_proj_nope; // [(num_heads * kv_lora_rank * nope)],
//   void *local_w_o; // [(num_heads * nope_size) * hidden_size]
//   T_RopeAngle rope_angle;

//   // intermediate

//   void *quant_input;           // [qlen, hidden size(Q)]
//   void *q_a_proj_output;       // [qlen, q_lora_rank]
//   void *quant_q_a_proj_output; // [qlen, q_lora_rank(Q)]

//   // for each query
//   std::vector<void *> q_pe;              // [num_heads * max_qlen * rope_size]
//   std::vector<void *> k_pe;              // [num_threads * rope_size]
//   std::vector<void *> q_nope;            // [num_heads * max_qlen * nope_size]
//   std::vector<void *> attention_weights; // [num_heads * max_qlen * max_klen];
//   std::vector<void *> q_absorb;          // [num_heads, max_qlen, kv_lora_rank],  or [num_heads, kv_lora_rank,
//   max_qlen] std::vector<void *> o_absorb;          // [num_heads, max_qlen, kv_lora_rank],  or [num_heads,
//   kv_lora_rank, max_qlen] std::vector<void *> compressed_kv_tmp; // [num_threads * token_count_in_page *
//   kv_lora_rank] std::vector<void *> quant_o_absorb;    // [num_heads, max_qlen, kv_lora_rank],  or [num_heads,
//   kv_lora_rank, max_qlen] std::vector<void *> attention_output;  // [num_threads * max_qlen * nope] std::vector<void
//   *> quant_attention_output; // [num_threads * max_qlen * nope]

// public:
//   using output_t = float;

//   LLAMA_MLA_TP(GeneralMLAConfig config, int tp_part_idx) : config(config), tp_part_idx(tp_part_idx) {
//     std::vector<std::pair<void **, uint64_t>> s_mem_requests;
//   }

//   void set_pages(std::vector<void *> cache_pages) { this->nope_pages = cache_pages; }
//   void set_pages(std::vector<void *> cache_pages, std::vector<void *> pe_pages) {
//     this->nope_pages = cache_pages;
//     this->rope_pages = pe_pages;
//   }

//   void forward(std::vector<int> qlens, std::vector<std::vector<int>> page_tables, std::vector<int> kv_lens,
//                const void *input, void *output) {}

//   void forward_prefill(std::vector<int> qlens, std::vector<std::vector<int>> page_tables, std::vector<int> kvlens,
//                        const void *input_raw, void *output) {
//     auto pool = config.pool->get_subpool(tp_part_idx);

//     float *input = (float *)input_raw;
//     std::vector<int> qlen_split, total_len_split;
//     qlen_split.reserve(qlens.size() + 1);
//     qlen_split.push_back(0);
//     total_len_split.reserve(qlens.size() + 1);
//     int qlen_sum = 0;
//     int total_len_sum = 0;
//     for (size_t i = 0; i < qlens.size(); i++) {
//       qlen_sum += qlens[i];
//       qlen_split.push_back(qlen_sum);

//       total_len_sum += qlens[i] + kvlens[i];
//       total_len_split.push_back(total_len_sum);
//     }

//     auto which_query_by_qlen_sum = [&](int token_nth) -> std::pair<size_t, size_t> {
//       auto query_idx = std::upper_bound(qlen_split.begin(), qlen_split.end(), token_nth) - qlen_split.begin() - 1;
//       auto token_nth_from_start = token_nth - qlen_split.at(query_idx) + kvlens.at(query_idx);
//       return {query_idx, token_nth_from_start};
//     };
//     auto which_query_by_total_sum = [&](int token_nth) -> std::pair<size_t, size_t> {
//       auto query_idx =
//           std::upper_bound(total_len_split.begin(), total_len_split.end(), token_nth) - total_len_split.begin() - 1;
//       auto token_nth_from_start = token_nth - total_len_split.at(query_idx);
//       return {query_idx, token_nth_from_start};
//     };

//     auto which_page = [&](int query, int token_nth_from_start) -> std::pair<size_t, size_t> {
//       size_t page_idx = page_tables.at(query).at(div_up((size_t)token_nth_from_start, config.token_count_in_page));

//       size_t token_at_in_page = token_nth_from_start % config.token_count_in_page;
//       return {page_idx, token_at_in_page};
//     };

//     ggml_type vec_dot_type = ggml_internal_get_type_traits((ggml_type)config.q_a_proj_type).vec_dot_type;
//     size_t hidden_size_float_bytes = config.hidden_size * sizeof(float);
//     size_t hidden_size_quant_blck_count = config.hidden_size / ggml_blck_size(vec_dot_type);
//     size_t hidden_size_quant_bytes = hidden_size_quant_blck_count * ggml_type_size(vec_dot_type);
//     // quant to q8 0

//     DIRECT_OR_POOL_BY(qlen_sum, 10, qlen_sum, [&](int token_at_i) {
//       size_t token_at = token_at_i;
//       quantize_q8_0(offset_pointer(input, token_at * config.hidden_size * sizeof(float)),
//                     offset_pointer(quant_input,
//                                    token_at * QUANT_BLCK_SIZE(config.hidden_size,
//                                    VEC_DOT_TYPE(config.q_a_proj_type))),
//                     1, config.hidden_size, nullptr);
//     });

//     {
//       // q lora rank
//       // maybe this should be up to non tp
//       auto proj_lora_a = [&](int task_id) {
//         size_t token_at = task_id % qlen_sum;
//         bool do_q_or_kv = (task_id / qlen_sum) == 0;
//         if (do_q_or_kv) {
//           auto this_q_a_proj_output =
//               (float *)offset_pointer(q_a_proj_output, token_at * config.hidden_size * sizeof(float));

//           LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(config.q_lora_rank, 1, config.hidden_size, local_q_a_proj,
//                                             config.q_a_proj_type, quant_input, token_at, this_q_a_proj_output, 0);

//           T_RMSNorm::rms_norm_single(config.q_lora_rank, (float *)local_q_a_norm, this_q_a_proj_output);

//           quantize_q8_0(
//               this_q_a_proj_output,
//               offset_pointer(quant_q_a_proj_output,
//                              token_at * QUANT_BLCK_SIZE(config.q_lora_rank, VEC_DOT_TYPE(config.q_b_proj_type))),
//               1, config.q_lora_rank, nullptr);

//         } else {
//           auto [query, token_from_start] = which_query_by_qlen_sum(token_at);
//           auto [page_idx, token_at_in_page] = which_page(query, token_from_start);
//           LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(config.kv_lora_rank, 1, config.hidden_size, local_kv_a_proj_with_mqa,
//                                             config.kv_a_proj_with_mqa_type, quant_input, token_at,
//                                             rope_pages.at(page_idx), token_at_in_page);
//           T_RMSNorm::rms_norm_single(
//               config.kv_lora_rank, (float *)local_kv_a_norm_with_mqa,
//               (float *)offset_pointer(rope_pages.at(page_idx), token_at_in_page * config.kv_lora_rank *
//               sizeof(float)));
//           LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(config.rope_size, 1, config.hidden_size,
//                                             QUANT_OFFSET(local_kv_a_proj_with_mqa, config.kv_a_proj_with_mqa_type,
//                                                          config.kv_lora_rank, config.hidden_size),
//                                             config.kv_a_proj_with_mqa_type, quant_input, token_at,
//                                             nope_pages.at(page_idx), token_at_in_page);
//         }
//       };
//       DIRECT_OR_POOL_BY(qlen_sum, 10, qlen_sum * 2, proj_lora_a);
//     }

//     {
//       int task_count = config.num_heads * 2 * qlen_sum; // head, rope/nope, qlen
//       auto q_proj_lora_b = [&](int task_id) {
//         size_t head_idx = task_id / (2 * qlen_sum);
//         task_id %= (2 * qlen_sum);
//         bool nope_or_rope = (task_id / qlen_sum) == 0;
//         task_id %= qlen_sum;
//         size_t token_at = task_id;
//         auto [query, token_from_start] = which_query_by_qlen_sum(token_at);
//         auto [page_idx, token_at_in_page] = which_page(query, token_from_start);

//         if (nope_or_rope) {
//           LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(
//               config.nope_size, 1, config.q_lora_rank,
//               QUANT_OFFSET(local_q_b_proj.at(head_idx), config.q_b_proj_type,
//                            head_idx * (config.nope_size + config.rope_size), config.q_lora_rank),
//               config.q_b_proj_type, quant_q_a_proj_output, token_at, q_nope.at(head_idx), token_at);
//         } else {
//           LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(
//               config.rope_size, 1, config.q_lora_rank,
//               QUANT_OFFSET(local_q_b_proj.at(head_idx), config.q_b_proj_type,
//                            head_idx * (config.nope_size + config.rope_size) + config.nope_size, config.q_lora_rank),
//               config.q_b_proj_type, quant_q_a_proj_output, token_at, q_pe.at(head_idx), token_at);
//           T_RopeApplier::apply_single(config.rope_size,
//                                       offset_pointer(q_pe.at(head_idx), token_at * config.rope_size * sizeof(float)),
//                                       rope_angle.cos(token_at), rope_angle.sin(token_at));
//         }
//       };
//       pool->do_work_stealing_job(task_count, nullptr, q_proj_lora_b, nullptr);
//     }

//     for (int query = 0; query < qlens.size(); query++) {
//       {
//         // pe attention
//         // apply k pe online
//         int task_count = config.num_heads * (qlens[query] + kvlens[query]); // by kvlen
//         auto pe_attn = [&](int task_id) {
//           size_t head_idx = task_id / (qlens[query] + kvlens[query]);
//           size_t token_from_start = task_id % (qlens[query] + kvlens[query]);

//           // auto q_token_at = qlen_split[query] + qlens[query];

//           auto [page_idx, token_at_in_page] = which_page(query, token_from_start);
//           memcpy(k_pe[WorkerPool::thread_local_id],
//                  offset_pointer(rope_pages.at(page_idx), token_at_in_page * config.rope_size * sizeof(float)),
//                  sizeof(float) * config.rope_size);
//           T_RopeApplier::apply_single(config.rope_size, k_pe[WorkerPool::thread_local_id],
//                                       rope_angle.cos(token_from_start), rope_angle.sin(token_from_start));

//           LLAMAFILE_SGEMM_MATMUL_F32(1, qlens[query], config.rope_size, k_pe[WorkerPool::thread_local_id],
//                                      config.rope_size, q_pe.at(head_idx), config.rope_size,
//                                      attention_weights[head_idx], config.max_kvlen);
//         };
//         pool->do_work_stealing_job(task_count, pe_attn);
//       }
//       {
//         // clear q absorb
//         pool->do_work_stealing_job(config.num_heads, [&](int task_id) {
//           memset(q_absorb[task_id], 0, config.kv_lora_rank * config.max_qlen * sizeof(float));
//         });

//         // aborb W_uk
//         int task_count = config.num_heads * qlens[query];
//         auto task = [&](int task_id) {
//           size_t head_idx = task_id / qlens[query];
//           size_t token_at = task_id % qlens[query];

//           // q_absorb now [kvrank, max_qlen]
//           LLAMAFILE_SGEMM_MATMUL_F32(qlens[query], config.kv_lora_rank, config.nope_size, q_nope[head_idx],
//                                      config.nope_size, local_k_b_proj_nope[head_idx], config.nope_size,
//                                      q_absorb[head_idx], config.max_qlen);
//           transpose(q_nope[head_idx], config.kv_lora_rank, config.max_qlen, qlens[query]);
//         };
//         pool->do_work_stealing_job(task_count, task);
//       }

//       {
//         // nope attention weights
//         size_t page_count = div_up((size_t)kvlens[query], config.token_count_in_page);
//         int task_count = config.num_heads * page_count;
//         auto task = [&](int task_id) {
//           size_t head_idx = task_id / page_count;
//           size_t page_idx = task_id % page_count;
//           void *page_ptr = nope_pages[page_tables[query][page_idx]]; // mla no head

//           size_t kvlen =
//               page_idx == (page_count - 1) ? (kvlens[query] % config.token_count_in_page) :
//               config.token_count_in_page;

//           LLAMAFILE_SGEMM_MATMUL_F32(
//               kvlen, qlens[query], config.kv_lora_rank, page_ptr, config.kv_lora_rank, q_absorb[head_idx],
//               config.max_qlen,
//               offset_pointer(attention_weights[head_idx], page_idx * config.token_count_in_page * sizeof(float)),
//               config.max_kvlen);
//           // static_assert(false, "soft max todo");
//         };
//         pool->do_work_stealing_job(task_count, task);
//       }

//       {
//         // clear o absorb
//         pool->do_work_stealing_job(config.num_heads, [&](int task_id) {
//           memset(o_absorb[task_id], 0, config.kv_lora_rank * config.max_qlen * sizeof(float));
//         });

//         // o absorb
//         size_t page_count = div_up((size_t)kvlens[query], config.token_count_in_page);
//         int task_count = config.num_heads * page_count;
//         auto task = [&](int task_id) {
//           size_t head_idx = task_id / page_count;
//           size_t page_idx = task_id % page_count;
//           void *page_ptr = nope_pages[page_tables[query][page_idx]]; // mla no head
//           size_t kvlen =
//               page_idx == (page_count - 1) ? (kvlens[query] % config.token_count_in_page) :
//               config.token_count_in_page;

//           memcpy(compressed_kv_tmp[WorkerPool::thread_local_id], page_ptr,
//                  config.token_count_in_page * config.kv_lora_rank * sizeof(float));
//           transpose(compressed_kv_tmp[WorkerPool::thread_local_id], config.token_count_in_page, config.kv_lora_rank,
//                     kvlen);

//           LLAMAFILE_SGEMM_MATMUL_F32(
//               config.kv_lora_rank, qlens[query], kvlen, compressed_kv_tmp[WorkerPool::thread_local_id],
//               config.token_count_in_page,
//               offset_pointer(attention_weights[head_idx], page_idx * config.token_count_in_page * sizeof(float)),
//               config.max_kvlen, o_absorb[head_idx], config.kv_lora_rank);
//         };
//         pool->do_work_stealing_job(task_count, task);
//       }

//       {

//         // clear
//         pool->do_work_stealing_job(config.num_heads, [&](int task_id) {
//           memset(attention_output[task_id], 0, config.nope_size * config.max_qlen * sizeof(float));
//         });

//         // attention output
//         int task_count = config.num_heads * qlens[query];
//         auto task = [&](int task_id) {
//           size_t head_idx = task_id / qlens[query];
//           size_t token_at = task_id % qlens[query];

//           quantize_q8_0((float *)offset_pointer(o_absorb[head_idx], config.kv_lora_rank * token_at * sizeof(float)),
//                         offset_pointer(quant_o_absorb[head_idx],
//                                        QUANT_BLCK_SIZE(config.kv_lora_rank, VEC_DOT_TYPE(config.kv_b_proj_type))),
//                         1, config.kv_lora_rank, nullptr);

//           auto kv_b_proj_ptr =
//               offset_pointer(local_kv_b_proj, ((head_idx * 2 + 1) * config.nope_size) *
//                                                   QUANT_BLCK_SIZE(config.kv_lora_rank, config.kv_b_proj_type));

//           LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(config.nope_size, 1, config.kv_lora_rank, kv_b_proj_ptr,
//                                             config.kv_b_proj_type, quant_o_absorb[head_idx], token_at,
//                                             attention_output[head_idx], token_at);
//         };
//         pool->do_work_stealing_job(task_count, task);
//       }

//       {
//         // quant attention output
//         // static_assert(false,"TODO" );
//       }

//       {
//         // get final output
//         // static_assert(false,"TODO" );
//       }
//     }
//   }

//   void load_weights(int complete_num_heads, int offset) {}
// };
// template <typename Norm, typename Rope, typename RopeAngle>
// class TP_MLA<LLAMA_MLA_TP<Norm, Rope, RopeAngle>> : public TP_MLA_Common<LLAMA_MLA_TP<Norm, Rope, RopeAngle>> {
// public:
//   using TP_MLA_Common<LLAMA_MLA_TP<Norm, Rope, RopeAngle>>::TP_MLA_Common;

//   void load_weights() {
//     auto pool = this->config.pool;
//     auto tp_num_heads = this->config.num_heads / this->tp_count;
//     pool->dispense_backend()->do_numa_job([this, pool, tp_num_heads](int tp_id) {
//       this->tps[tp_id]->load_weights(this->config.num_heads, tp_id * tp_num_heads);
//     });
//     this->weights_loaded = true;
//   }

//   void merge_results(int qlen, void *output) {}
// };

// #endif


================================================
FILE: kt-kernel/operators/llamafile/mlp.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-16 10:43:18
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-15 07:44:38
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "mlp.h"

MLP::MLP(MLPConfig config) {
  config_ = config;
  gate_proj_ = config_.gate_proj;
  up_proj_ = config_.up_proj;
  down_proj_ = config_.down_proj;

  MemoryRequest mem_requests;
  mem_requests.append_pointer(&input_fp32_, sizeof(float) * config_.group_max_len * config_.hidden_size);
  mem_requests.append_pointer(&gate_input_,
                              config_.group_max_len * config_.hidden_size *
                                  ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) /
                                  ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type));
  mem_requests.append_pointer(&up_input_,
                              config_.group_max_len * config_.hidden_size *
                                  ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) /
                                  ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type));
  mem_requests.append_pointer(&gate_output_, sizeof(float) * config_.group_max_len * config_.intermediate_size);
  mem_requests.append_pointer(&up_output_, sizeof(float) * config_.group_max_len * config_.intermediate_size);
  mem_requests.append_pointer(&intermediate_fp32_, sizeof(float) * config_.group_max_len * config_.intermediate_size);
  mem_requests.append_pointer(&down_input_,
                              config_.group_max_len * config_.intermediate_size *
                                  ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) /
                                  ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type));
  mem_requests.append_pointer(&down_output_, sizeof(float) * config_.group_max_len * config_.hidden_size);
  shared_mem_buffer.alloc(this, mem_requests);
}

MLP::~MLP() {}

void MLP::warm_up(WorkerPool* backend) {
  std::vector<float> input_fp32(config_.hidden_size);
  std::vector<uint8_t> input(config_.hidden_size * ggml_type_size(config_.hidden_type) /
                             ggml_blck_size(config_.hidden_type));
  std::vector<uint8_t> output(config_.hidden_size * ggml_type_size(config_.hidden_type) /
                              ggml_blck_size(config_.hidden_type));
  for (int i = 0; i < config_.hidden_size; i++) {
    input_fp32[i] = 0;
  }
  from_float(input_fp32.data(), input.data(), config_.hidden_size, config_.hidden_type);
  forward_many(1, input.data(), output.data(), backend);
}

static float act_fn(float x) { return x / (1.0f + expf(-x)); }

void MLP::forward_many(int qlen, const void* input, void* output, WorkerPool* backend) {
  const void* gate_input_ptr;
  const void* up_input_ptr;
  if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type &&
      config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
    gate_input_ptr = up_input_ptr = input;
  } else {
    to_float(input, input_fp32_, qlen * config_.hidden_size, config_.hidden_type);
    if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type ==
        ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
      from_float(input_fp32_, gate_input_, qlen * config_.hidden_size,
                 ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
      gate_input_ptr = up_input_ptr = gate_input_;
    } else {
      if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
        from_float(input_fp32_, gate_input_, qlen * config_.hidden_size,
                   ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
        gate_input_ptr = gate_input_;
      } else {
        gate_input_ptr = input;
      }
      if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
        from_float(input_fp32_, up_input_, qlen * config_.hidden_size,
                   ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
        up_input_ptr = up_input_;
      } else {
        up_input_ptr = input;
      }
    }
  }
  int nth = config_.intermediate_size / config_.stride;
  backend->do_work_stealing_job(
      nth, nullptr,
      [&](int task_id) {
        int ith = task_id;
        void* gate_proj_ptr = (uint8_t*)gate_proj_ + ith * config_.stride * config_.hidden_size *
                                                         ggml_type_size(config_.gate_type) /
                                                         ggml_blck_size(config_.gate_type);
        float* gate_output_ptr = gate_output_ + ith * config_.stride;
        llamafile_sgemm(config_.stride, qlen, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr,
                        config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr,
                        config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr,
                        config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type,
                        ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32,
                        GGML_PREC_DEFAULT);
        void* up_proj_ptr = (uint8_t*)up_proj_ + ith * config_.stride * config_.hidden_size *
                                                     ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        float* up_output_ptr = up_output_ + ith * config_.stride;
        llamafile_sgemm(config_.stride, qlen, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr,
                        config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr,
                        config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size,
                        0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type,
                        ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        for (int i = 0; i < qlen; i++) {
          for (int j = ith * config_.stride; j < (ith + 1) * config_.stride; j++) {
            intermediate_fp32_[i * config_.intermediate_size + j] =
                act_fn(gate_output_[i * config_.intermediate_size + j]) * up_output_[i * config_.intermediate_size + j];
          }
          if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
            float* intermediate_fp32_ptr = intermediate_fp32_ + i * config_.intermediate_size + ith * config_.stride;
            void* down_input_ptr = (uint8_t*)down_input_ +
                                   i * config_.intermediate_size *
                                       ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) /
                                       ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) +
                                   ith * config_.stride *
                                       ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) /
                                       ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
            from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride,
                       ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
          }
        }
      },
      nullptr);
  if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
    from_float(intermediate_fp32_, down_input_, qlen * config_.intermediate_size,
               ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
  }
  nth = config_.hidden_size / config_.stride;
  backend->do_work_stealing_job(
      nth, nullptr,
      [&](int task_id) {
        int ith = task_id;
        void* down_proj_ptr = (uint8_t*)down_proj_ + ith * config_.stride * config_.intermediate_size *
                                                         ggml_type_size(config_.down_type) /
                                                         ggml_blck_size(config_.down_type);
        float* down_output_ptr = down_output_ + ith * config_.stride;
        llamafile_sgemm(config_.stride, qlen, config_.intermediate_size / ggml_blck_size(config_.down_type),
                        down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_,
                        config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr,
                        config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type,
                        ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32,
                        GGML_PREC_DEFAULT);
        if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
          for (int i = 0; i < qlen; i++) {
            float* output_fp32_ptr = down_output_ + i * config_.hidden_size + ith * config_.stride;
            void* output_ptr =
                (uint8_t*)output +
                i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) +
                ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
            from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
          }
        }
      },
      nullptr);
  if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
    from_float(down_output_, output, qlen * config_.hidden_size, config_.hidden_type);
  }
}

void MLP::forward(int qlen, const void* input, void* output, WorkerPool* backend) {
  if (qlen <= 0) {
    return;
  }
  int forward_len = std::min(qlen, config_.group_max_len);
  forward_many(forward_len, input, output, backend);
  forward(qlen - forward_len,
          (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) /
                                ggml_blck_size(config_.hidden_type),
          (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) /
                                 ggml_blck_size(config_.hidden_type),
          backend);
}

================================================
FILE: kt-kernel/operators/llamafile/mlp.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-12 10:07:58
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:35:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_MLP_H
#define CPUINFER_OPERATOR_MLP_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/shared_mem_buffer.h"
#include "../../cpu_backend/worker_pool.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

struct MLPConfig {
  int hidden_size;
  int intermediate_size;
  int stride;
  int group_max_len;
  void* gate_proj;
  void* up_proj;
  void* down_proj;
  ggml_type gate_type;
  ggml_type up_type;
  ggml_type down_type;
  ggml_type hidden_type;

  MLPConfig() {}

  MLPConfig(int hidden_size, int intermediate_size, int stride, int group_max_len, void* gate_proj, void* up_proj,
            void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type)
      : hidden_size(hidden_size),
        intermediate_size(intermediate_size),
        stride(stride),
        group_max_len(group_max_len),
        gate_proj(gate_proj),
        up_proj(up_proj),
        down_proj(down_proj),
        gate_type(gate_type),
        up_type(up_type),
        down_type(down_type),
        hidden_type(hidden_type) {}
};

class MLP {
 public:
  MLP(MLPConfig);
  ~MLP();
  void warm_up(WorkerPool* backend);
  void forward_many(int qlen, const void* input, void* output, WorkerPool* backend);
  void forward(int qlen, const void* input, void* output, WorkerPool* backend);

 private:
  MLPConfig config_;
  void* gate_proj_;  // [intermediate_size * hidden_size ( /32 if quantized)]
  void* up_proj_;    // [intermediate_size * hidden_size ( /32 if quantized)]
  void* down_proj_;  // [hidden_size * intermediate_size ( /32 if quantized)]

  float* input_fp32_;    // [group_max_len * hidden_size]
  uint8_t* gate_input_;  // [group_max_len * hidden_size *
                         // ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) /
                         // ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
  uint8_t*
      up_input_;  // [group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type)
                  // / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
  float* gate_output_;        // [group_max_len * intermediate_size]
  float* up_output_;          // [group_max_len * intermediate_size]
  float* intermediate_fp32_;  // [group_max_len * intermediate_size]
  uint8_t* down_input_;       // [group_max_len * intermediate_size *
                              // ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) /
                              // ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
  float* down_output_;        // [group_max_len * hidden_size]
};

#endif

================================================
FILE: kt-kernel/operators/llamafile/moe.hpp
================================================
#ifndef LLAMAFILE_MOE_HPP
#define LLAMAFILE_MOE_HPP
#ifdef FORWARD_TIME_PROFILE
#include <fmt/format.h>
#endif
#include <numa.h>
#include <numaif.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

#include "../../cpu_backend/shared_mem_buffer.h"
#include "../../cpu_backend/worker_pool.h"
#include "../moe-tp.hpp"
#include "conversion.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

inline void debug_quant(void* input, ggml_type type) {
  std::vector<float> output(ggml_blck_size(type));
  to_float(input, output.data(), ggml_blck_size(type), type);
  for (size_t i = 0; i < 10; i++) {
    printf("%f ", output[i]);
  }
  printf("\n");
}

class LLAMA_MOE_TP {
 private:
  GeneralMOEConfig config_;
  int tp_part_idx;

  uint8_t* m_local_gate_proj_;  // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  uint8_t* m_local_up_proj_;    // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  uint8_t* m_local_down_proj_;  // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]

  float* s_input_fp32_;    // [hidden_size]
  uint8_t* s_gate_input_;  // [hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) /
                           // ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
  uint8_t* s_up_input_;    // [hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) /
                           // ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
  std::vector<float*> s_gate_output_;        // [routed_expert_num, intermediate_size]
  std::vector<float*> s_up_output_;          // [routed_expert_num, intermediate_size]
  std::vector<float*> s_intermediate_fp32_;  // [routed_expert_num, intermediate_size]
  std::vector<uint8_t*> s_down_input_;       // [routed_expert_num, intermediate_size *
                                             // ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) /
                                             // ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
  std::vector<float*> s_down_output_;        // [routed_expert_num, hidden_size]
  float* s_output_fp32_;                     // [hidden_size]

  std::vector<float*> m_input_fp32_;    // [group_max_len, hidden_size]
  std::vector<uint8_t*> m_gate_input_;  // [group_max_len, hidden_size *
                                        // ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) /
                                        // ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
  std::vector<uint8_t*>
      m_up_input_;  // [group_max_len, hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type)
                    // / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
  uint8_t* m_local_gate_input_;        // [routed_expert_num * group_max_len * hidden_size *
                                       // ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) /
                                       // ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
  uint8_t* m_local_up_input_;          // [routed_expert_num * group_max_len * hidden_size *
                                       // ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) /
                                       // ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
  float* m_local_gate_output_;         // [routed_expert_num * group_max_len * intermediate_size]
  float* m_local_up_output_;           // [routed_expert_num * group_max_len * intermediate_size]
  float* m_local_intermediate_fp32_;   // [routed_expert_num * group_max_len * intermediate_size]
  uint8_t* m_local_down_input_;        // [routed_expert_num * group_max_len * intermediate_size *
                                       // ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) /
                                       // ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
  float* m_local_down_output_;         // [routed_expert_num * group_max_len * hidden_size]
  std::vector<float*> m_output_fp32_;  // [group_max_len, hidden_size]

  std::vector<std::vector<int>> m_local_pos_;          // [group_max_len, routed_expert_num]
  std::vector<int> m_local_num_;                       // [expert_num]
  std::vector<int> m_expert_id_map_;                   // [expert_num]
  std::vector<uint8_t*> m_local_gate_input_ptr_;       // [expert_num]
  std::vector<uint8_t*> m_local_up_input_ptr_;         // [expert_num]
  std::vector<float*> m_local_gate_output_ptr_;        // [expert_num]
  std::vector<float*> m_local_up_output_ptr_;          // [expert_num]
  std::vector<float*> m_local_intermediate_fp32_ptr_;  // [expert_num]
  std::vector<uint8_t*> m_local_down_input_ptr_;       // [expert_num]
  std::vector<float*> m_local_down_output_ptr_;        // [expert_num]
 public:
  using input_t = ggml_bf16_t;
  using output_t = float;

  LLAMA_MOE_TP(GeneralMOEConfig config, int tp_part_idx) : config_(config), tp_part_idx(tp_part_idx) {
    MemoryRequest mem_requests;
    mem_requests.append_pointer(&s_input_fp32_, sizeof(float) * config_.hidden_size);
    mem_requests.append_pointer(
        &s_gate_input_, config_.hidden_size *
                            ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type) /
                            ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type));
    mem_requests.append_pointer(
        &s_up_input_, config_.hidden_size *
                          ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type) /
                          ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type));
    s_gate_output_.resize(config_.num_experts_per_tok);
    s_up_output_.resize(config_.num_experts_per_tok);
    s_intermediate_fp32_.resize(config_.num_experts_per_tok);
    s_down_input_.resize(config_.num_experts_per_tok);
    s_down_output_.resize(config_.num_experts_per_tok);
    for (int i = 0; i < config_.num_experts_per_tok; i++) {
      mem_requests.append_pointer(&s_gate_output_[i], sizeof(float) * config_.intermediate_size);
      mem_requests.append_pointer(&s_up_output_[i], sizeof(float) * config_.intermediate_size);
      mem_requests.append_pointer(&s_intermediate_fp32_[i], sizeof(float) * config_.intermediate_size);
      mem_requests.append_pointer(
          &s_down_input_[i],
          config_.intermediate_size *
              ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type) /
              ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type));
      mem_requests.append_pointer(&s_down_output_[i], sizeof(float) * config_.hidden_size);
    }
    mem_requests.append_pointer(&s_output_fp32_, sizeof(float) * config_.hidden_size);
    shared_mem_buffer_numa.alloc(tp_part_idx, this, mem_requests);
    // shared_mem_buffer.alloc(this, mem_requests);

    m_input_fp32_.resize(config_.group_max_len);
    m_gate_input_.resize(config_.group_max_len);
    m_up_input_.resize(config_.group_max_len);
    for (int i = 0; i < config_.group_max_len; i++) {
      mem_requests.append_pointer(&m_input_fp32_[i], sizeof(float) * config_.hidden_size);
      mem_requests.append_pointer(
          &m_gate_input_[i],
          config_.hidden_size *
              ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type) /
              ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type));
      mem_requests.append_pointer(
          &m_up_input_[i], config_.hidden_size *
                               ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type) /
                               ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type));
    }
    mem_requests.append_pointer(
        &m_local_gate_input_,
        config_.num_experts_per_tok * config_.group_max_len * config_.hidden_size *
            ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type) /
            ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type));
    mem_requests.append_pointer(
        &m_local_up_input_, config_.num_experts_per_tok * config_.group_max_len * config_.hidden_size *
                                ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type) /
                                ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type));
    mem_requests.append_pointer(&m_local_gate_output_, sizeof(float) * config_.num_experts_per_tok *
                                                           config_.group_max_len * config_.intermediate_size);
    mem_requests.append_pointer(&m_local_up_output_, sizeof(float) * config_.num_experts_per_tok *
                                                         config_.group_max_len * config_.intermediate_size);
    mem_requests.append_pointer(&m_local_intermediate_fp32_, sizeof(float) * config_.num_experts_per_tok *
                                                                 config_.group_max_len * config_.intermediate_size);
    mem_requests.append_pointer(
        &m_local_down_input_,
        config_.num_experts_per_tok * config_.group_max_len * config_.intermediate_size *
            ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type) /
            ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type));
    mem_requests.append_pointer(&m_local_down_output_, sizeof(float) * config_.num_experts_per_tok *
                                                           config_.group_max_len * config_.hidden_size);
    m_output_fp32_.resize(config_.group_max_len);
    for (int i = 0; i < config_.group_max_len; i++) {
      mem_requests.append_pointer(&m_output_fp32_[i], sizeof(float) * config_.hidden_size);
    }
    shared_mem_buffer_numa.alloc(tp_part_idx, this, mem_requests);
    // shared_mem_buffer.alloc(this, m_mem_requests);

    m_local_pos_.resize(config_.group_max_len);
    for (int i = 0; i < config_.group_max_len; i++) {
      m_local_pos_[i].resize(config_.num_experts_per_tok);
    }
    m_expert_id_map_.resize(config_.expert_num);
    m_local_num_.resize(config_.expert_num);
    m_local_gate_input_ptr_.resize(config_.expert_num);
    m_local_up_input_ptr_.resize(config_.expert_num);
    m_local_gate_output_ptr_.resize(config_.expert_num);
    m_local_up_output_ptr_.resize(config_.expert_num);
    m_local_intermediate_fp32_ptr_.resize(config_.expert_num);
    m_local_down_input_ptr_.resize(config_.expert_num);
    m_local_down_output_ptr_.resize(config_.expert_num);

    auto size = 1ll * config.expert_num * config.intermediate_size * config.hidden_size;
    m_local_up_proj_ =
        new uint8_t[size * ggml_type_size((ggml_type)config.up_type) / ggml_blck_size((ggml_type)config.up_type)];

    m_local_gate_proj_ =
        new uint8_t[size * ggml_type_size((ggml_type)config.gate_type) / ggml_blck_size((ggml_type)config.gate_type)];
    m_local_down_proj_ =
        new uint8_t[size * ggml_type_size((ggml_type)config.down_type) / ggml_blck_size((ggml_type)config.down_type)];
  }

  void load_weights(int complete_intermediate_size, int offset) {
    auto local_gate_proj = m_local_gate_proj_;
    auto local_up_proj = m_local_up_proj_;
    auto local_down_proj = m_local_down_proj_;
    auto& config = config_;
    // printf("gate load weights:");
    // debug_quant(config.gate_proj, (ggml_type)config.gate_type);
    // we need to make sure the blck size is correct for size.
    if (config.intermediate_size % ggml_blck_size((ggml_type)config.down_type) != 0) {
      printf("intermediate_size: %d, down_type blck size: %d\n", config.intermediate_size,
             ggml_blck_size((ggml_type)config.down_type));
      throw std::runtime_error("intermediate_size must be a multiple of gate_type blck size");
    }
    if (config.intermediate_size * config.hidden_size % ggml_blck_size((ggml_type)config.up_type) != 0) {
      printf("intermediate_size: %d, up_type blck size: %d\n", config.intermediate_size,
             ggml_blck_size((ggml_type)config.up_type));
      throw std::runtime_error("intermediate_size * hidden_size must be a multiple of up_type blck size");
    }
    if (config.intermediate_size * config.hidden_size % ggml_blck_size((ggml_type)config.gate_type) != 0) {
      printf("intermediate_size: %d, gate_type blck size: %d\n", config.intermediate_size,
             ggml_blck_size((ggml_type)config.gate_type));
      throw std::runtime_error("intermediate_size * hidden_size must be a multiple of gate_type blck size");
    }
    uint8_t* gate_proj = (uint8_t*)config.gate_proj + offset * config.hidden_size *
                                                          ggml_type_size((ggml_type)config.gate_type) /
                                                          ggml_blck_size((ggml_type)config.gate_type);
    uint8_t* up_proj = (uint8_t*)config.up_proj + offset * config.hidden_size *
                                                      ggml_type_size((ggml_type)config.up_type) /
                                                      ggml_blck_size((ggml_type)config.up_type);
    uint8_t* down_proj = (uint8_t*)config.down_proj + offset * ggml_type_size((ggml_type)config.down_type) /
                                                          ggml_blck_size((ggml_type)config.down_type);

    for (int i = 0; i < config.expert_num; ++i) {
      memcpy(local_gate_proj, gate_proj,
             config.intermediate_size * config.hidden_size * ggml_type_size((ggml_type)config.gate_type) /
                 ggml_blck_size((ggml_type)config.gate_type));
      memcpy(local_up_proj, up_proj,
             config.intermediate_size * config.hidden_size * ggml_type_size((ggml_type)config.up_type) /
                 ggml_blck_size((ggml_type)config.up_type));
      for (int j = 0; j < config.hidden_size; ++j) {
        memcpy(local_down_proj, down_proj,
               config.intermediate_size * ggml_type_size((ggml_type)config.down_type) /
                   ggml_blck_size((ggml_type)config.down_type));
        local_down_proj += config.intermediate_size * ggml_type_size((ggml_type)config.down_type) /
                           ggml_blck_size((ggml_type)config.down_type);
        down_proj += complete_intermediate_size * ggml_type_size((ggml_type)config.down_type) /
                     ggml_blck_size((ggml_type)config.down_type);
      }
      local_gate_proj += config.intermediate_size * config.hidden_size * ggml_type_size((ggml_type)config.gate_type) /
                         ggml_blck_size((ggml_type)config.gate_type);
      local_up_proj += config.intermediate_size * config.hidden_size * ggml_type_size((ggml_type)config.up_type) /
                       ggml_blck_size((ggml_type)config.up_type);
      gate_proj += complete_intermediate_size * config.hidden_size * ggml_type_size((ggml_type)config.gate_type) /
                   ggml_blck_size((ggml_type)config.gate_type);
      up_proj += complete_intermediate_size * config.hidden_size * ggml_type_size((ggml_type)config.up_type) /
                 ggml_blck_size((ggml_type)config.up_type);
    }
  }

  void warm_up() {
    std::vector<float> input_fp32(config_.hidden_size);
    std::vector<uint8_t> input(config_.hidden_size * ggml_type_size((ggml_type)config_.hidden_type) /
                               ggml_blck_size((ggml_type)config_.hidden_type));
    std::vector<float> output(config_.hidden_size);
    for (int i = 0; i < config_.hidden_size; i++) {
      input_fp32[i] = 0;
    }
    from_float(input_fp32.data(), input.data(), config_.hidden_size, (ggml_type)config_.hidden_type);
    for (int i = 0; i < config_.expert_num; i++) {
      int64_t expert_ids = i;
      float weights = 0;
      forward_one(1, &expert_ids, &weights, input.data(), output.data());
    }
  }

  static float act_fn(float x) { return x / (1.0f + expf(-x)); }

  void forward_one(int k, const int64_t* expert_ids, const float* weights, const void* input, float* output) {
    auto pool = config_.pool->get_subpool(tp_part_idx);
#ifdef FORWARD_TIME_PROFILE
    auto t0 = std::chrono::high_resolution_clock::now();
#endif
    const void* gate_input_ptr;
    const void* up_input_ptr;
    if ((ggml_type)config_.hidden_type == ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type &&
        (ggml_type)config_.hidden_type == ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type) {
      gate_input_ptr = up_input_ptr = input;
    } else {
      to_float(input, s_input_fp32_, config_.hidden_size, (ggml_type)config_.hidden_type);
      if (ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type ==
          ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type) {
        from_float(s_input_fp32_, s_gate_input_, config_.hidden_size,
                   ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type);
        gate_input_ptr = up_input_ptr = s_gate_input_;
      } else {
        if ((ggml_type)config_.hidden_type !=
            ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type) {
          from_float(s_input_fp32_, s_gate_input_, config_.hidden_size,
                     ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type);
          gate_input_ptr = s_gate_input_;
        } else {
          gate_input_ptr = input;
        }
        if ((ggml_type)config_.hidden_type != ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type) {
          from_float(s_input_fp32_, s_up_input_, config_.hidden_size,
                     ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type);
          up_input_ptr = s_up_input_;
        } else {
          up_input_ptr = input;
        }
      }
    }

#ifdef FORWARD_TIME_PROFILE
    // printf("gate_input: ");
    // debug_quant(const_cast<void *>(gate_input_ptr),
    // ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type);
    // printf("up_input: ");
    // debug_quant(const_cast<void *>(up_input_ptr),
    // ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type);
    auto t1 = std::chrono::high_resolution_clock::now();
    fmt::print("numa_node: {}, convert time: {}\n", tp_part_idx,
               std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count());

#endif

    int activated_expert = 0;
    for (int i = 0; i < k; i++) {
      if (config_.should_skip_expert(expert_ids[i])) {
        continue;
      }
      m_expert_id_map_[activated_expert] = expert_ids[i];
      activated_expert++;
    }

    int nth = config_.intermediate_size / config_.m_block;

    // Only process activated (CPU) experts; skip GPU experts entirely to keep buffers aligned.
    if (activated_expert > 0) {
      pool->do_work_stealing_job(
          nth * activated_expert, nullptr,
          [&](int task_id) {
            int act_idx = task_id / nth;
            int64_t expert_id = m_expert_id_map_[act_idx];
            if (expert_id == -1) {
              return;
            }
            int ith = task_id % nth;

            void* gate_proj_ptr =
                (uint8_t*)m_local_gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.m_block) *
                                                   config_.hidden_size * ggml_type_size((ggml_type)config_.gate_type) /
                                                   ggml_blck_size((ggml_type)config_.gate_type);

            float* gate_output_ptr = s_gate_output_[act_idx] + ith * config_.m_block;
            auto ok = llamafile_sgemm(
                config_.m_block, 1, config_.hidden_size / ggml_blck_size((ggml_type)config_.gate_type), gate_proj_ptr,
                config_.hidden_size / ggml_blck_size((ggml_type)config_.gate_type), gate_input_ptr,
                config_.hidden_size / ggml_blck_size((ggml_type)config_.gate_type), gate_output_ptr, config_.m_block, 0,
                1, GGML_TASK_TYPE_COMPUTE, (ggml_type)config_.gate_type,
                ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type, GGML_TYPE_F32,
                GGML_PREC_DEFAULT);
            if (ok == false) [[unlikely]] {
              throw std::runtime_error("llamafile not supported");
            }

            void* up_proj_ptr =
                (uint8_t*)m_local_up_proj_ + (expert_id * config_.intermediate_size + ith * config_.m_block) *
                                                 config_.hidden_size * ggml_type_size((ggml_type)config_.up_type) /
                                                 ggml_blck_size((ggml_type)config_.up_type);

            float* up_output_ptr = s_up_output_[act_idx] + ith * config_.m_block;
            llamafile_sgemm(config_.m_block, 1, config_.hidden_size / ggml_blck_size((ggml_type)config_.up_type),
                            up_proj_ptr, config_.hidden_size / ggml_blck_size((ggml_type)config_.up_type), up_input_ptr,
                            config_.hidden_size / ggml_blck_size((ggml_type)config_.up_type), up_output_ptr,
                            config_.m_block, 0, 1, GGML_TASK_TYPE_COMPUTE, (ggml_type)config_.up_type,
                            ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type, GGML_TYPE_F32,
                            GGML_PREC_DEFAULT);

            for (int i = ith * config_.m_block; i < (ith + 1) * config_.m_block; i++) {
              s_intermediate_fp32_[act_idx][i] = act_fn(s_gate_output_[act_idx][i]) * s_up_output_[act_idx][i];
            }
            if (config_.m_block %
                    ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type) ==
                0) {
              float* intermediate_fp32_ptr = s_intermediate_fp32_[act_idx] + ith * config_.m_block;
              void* down_input_ptr =
                  s_down_input_[act_idx] +
                  ith * config_.m_block *
                      ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type) /
                      ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type);
              from_float(intermediate_fp32_ptr, down_input_ptr, config_.m_block,
                         ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type);
            }
          },
          nullptr);
    }

    if (config_.m_block % ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type) !=
        0) {
      for (int i = 0; i < activated_expert; i++) {
        from_float(s_intermediate_fp32_[i], s_down_input_[i], config_.intermediate_size,
                   ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type);
      }
    }

#ifdef FORWARD_TIME_PROFILE
    // printf("sinter:");
    // debug_f32(s_intermediate_fp32_[expert_ids[0]]);
    auto t2 = std::chrono::high_resolution_clock::now();
    fmt::print("numa_node: {}, gate/up time: {}\n", tp_part_idx,
               std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count());
#endif

    nth = config_.hidden_size / config_.m_block;
    pool->do_work_stealing_job(
        nth, nullptr,
        [&](int task_id) {
          int ith = task_id;
          for (int i = ith * config_.m_block; i < (ith + 1) * config_.m_block; i++) {
            output[i] = 0;
          }
          for (int expert_idx = 0; expert_idx < activated_expert; expert_idx++) {
            int64_t expert_id = m_expert_id_map_[expert_idx];
            if (expert_id == -1) {
              continue;
            }

            auto expert_offset = expert_id * config_.hidden_size * config_.intermediate_size;
            auto m_block_offset = ith * config_.m_block * config_.intermediate_size;
            void* down_proj_ptr = (uint8_t*)m_local_down_proj_ + (expert_offset + m_block_offset) *
                                                                     ggml_type_size((ggml_type)config_.down_type) /
                                                                     ggml_blck_size((ggml_type)config_.down_type);

            float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.m_block;
            llamafile_sgemm(
                config_.m_block, 1, config_.intermediate_size / ggml_blck_size((ggml_type)config_.down_type),
                down_proj_ptr, config_.intermediate_size / ggml_blck_size((ggml_type)config_.down_type),
                s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size((ggml_type)config_.down_type),
                down_output_ptr, config_.m_block, 0, 1, GGML_TASK_TYPE_COMPUTE, (ggml_type)config_.down_type,
                ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type, GGML_TYPE_F32,
                GGML_PREC_DEFAULT);

            float expert_weight = 0.0f;
            for (int j = 0; j < k; j++) {
              if (expert_ids[j] == expert_id) {
                expert_weight = weights[j];
                break;
              }
            }

            for (int i = ith * config_.m_block; i < (ith + 1) * config_.m_block; i++) {
              output[i] += s_down_output_[expert_idx][i] * expert_weight;
            }
          }
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    auto t3 = std::chrono::high_resolution_clock::now();
    fmt::print("numa_node: {}, down time: {}\n", tp_part_idx,
               std::chrono::duration_cast<std::chrono::nanoseconds>(t3 - t2).count());
    fmt::print("numa_node: {}, total time: {}\n", tp_part_idx,
               std::chrono::duration_cast<std::chrono::nanoseconds>(t3 - t0).count());
#endif
  }

  void forward_many(int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input,
                    float* output) {
    auto pool = config_.pool->get_subpool(tp_part_idx);
#ifdef FORWARD_TIME_PROFILE
    auto start_time = std::chrono::high_resolution_clock::now();
    auto last = start_time;
    // 用于保存各阶段耗时（单位：微秒）
    long prepare_time = 0, cpy_input_time = 0, q_input_time = 0, up_gate_time = 0;
    long act_time = 0, q_down_time = 0, down_time = 0, weight_time = 0;
    int max_local_num = 0;  // 记录最大的 local num
#endif

    int activated_expert = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
      for (int j = 0; j < k; j++) {
        if (config_.should_skip_expert(expert_ids[i * k + j])) {
          continue;
        }
        m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
      }
    }
    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_gate_input_ptr_[i] =
          m_local_gate_input_ +
          offset * config_.hidden_size *
              ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type) /
              ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type);
      m_local_up_input_ptr_[i] =
          m_local_up_input_ +
          offset * config_.hidden_size *
              ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type) /
              ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type);
      m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
      m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
      m_local_intermediate_fp32_ptr_[i] = m_local_intermediate_fp32_ + offset * config_.intermediate_size;
      m_local_down_input_ptr_[i] =
          m_local_down_input_ +
          offset * config_.intermediate_size *
              ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type) /
              ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type);
      m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
      offset += m_local_num_[i];
      if (m_local_num_[i] > 0) {
#ifdef FORWARD_TIME_PROFILE
        max_local_num = std::max(max_local_num, m_local_num_[i]);
#endif
        m_expert_id_map_[activated_expert] = i;
        activated_expert++;
      }
    }

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      prepare_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    pool->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          const void* gate_input_ptr;
          const void* up_input_ptr;
          if ((ggml_type)config_.hidden_type ==
                  ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type &&
              (ggml_type)config_.hidden_type ==
                  ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type) {
            gate_input_ptr = up_input_ptr = (uint8_t*)input + i * config_.hidden_size *
                                                                  ggml_type_size((ggml_type)config_.hidden_type) /
                                                                  ggml_blck_size((ggml_type)config_.hidden_type);
          } else {
            to_float((uint8_t*)input + i * config_.hidden_size * ggml_type_size((ggml_type)config_.hidden_type) /
                                           ggml_blck_size((ggml_type)config_.hidden_type),
                     m_input_fp32_[i], config_.hidden_size, (ggml_type)config_.hidden_type);
            if (ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type ==
                ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type) {
              from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size,
                         ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type);
              gate_input_ptr = up_input_ptr = m_gate_input_[i];
            } else {
              if ((ggml_type)config_.hidden_type !=
                  ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type) {
                from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size,
                           ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type);
                gate_input_ptr = m_gate_input_[i];
              } else {
                gate_input_ptr = (uint8_t*)input + i * config_.hidden_size *
                                                       ggml_type_size((ggml_type)config_.hidden_type) /
                                                       ggml_blck_size((ggml_type)config_.hidden_type);
              }
              if ((ggml_type)config_.hidden_type !=
                  ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type) {
                from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size,
                           ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type);
                up_input_ptr = m_up_input_[i];
              } else {
                up_input_ptr = (uint8_t*)input + i * config_.hidden_size *
                                                     ggml_type_size((ggml_type)config_.hidden_type) /
                                                     ggml_blck_size((ggml_type)config_.hidden_type);
              }
            }
          }
          for (int j = 0; j < k; j++) {
            if (config_.should_skip_expert(expert_ids[i * k + j])) {
              continue;
            }
            memcpy(m_local_gate_input_ptr_[expert_ids[i * k + j]] +
                       m_local_pos_[i][j] * config_.hidden_size *
                           ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type) /
                           ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type),
                   gate_input_ptr,
                   config_.hidden_size *
                       ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type) /
                       ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type));
            memcpy(m_local_up_input_ptr_[expert_ids[i * k + j]] +
                       m_local_pos_[i][j] * config_.hidden_size *
                           ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type) /
                           ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type),
                   up_input_ptr,
                   config_.hidden_size *
                       ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type) /
                       ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type));
          }
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      cpy_input_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    int m_block = QK_K;
    int nth = config_.intermediate_size / m_block;
    // printf("nth: %d, m_block: %d, activated_expert: %d\n", nth, m_block, activated_expert);
    // printf("config_.hidden_size: %d, config_.intermediate_size: %d\n", config_.hidden_size,
    // config_.intermediate_size);
    pool->do_work_stealing_job(
        nth * activated_expert, nullptr,
        [&](int task_id) {
          int64_t expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
          void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];

          void* gate_proj_ptr =
              (uint8_t*)m_local_gate_proj_ + (expert_idx * config_.intermediate_size + ith * m_block) *
                                                 config_.hidden_size * ggml_type_size((ggml_type)config_.gate_type) /
                                                 ggml_blck_size((ggml_type)config_.gate_type);

          float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * m_block;

          // if (ith == 0) {
          //   printf("matrix size: m:%d, n:%d, k:%d\n", m_block, m_local_num_[expert_idx],
          //          config_.hidden_size / ggml_blck_size((ggml_type)config_.gate_type));
          // }
          llamafile_sgemm(m_block, m_local_num_[expert_idx],
                          config_.hidden_size / ggml_blck_size((ggml_type)config_.gate_type), gate_proj_ptr,
                          config_.hidden_size / ggml_blck_size((ggml_type)config_.gate_type), gate_input_ptr,
                          config_.hidden_size / ggml_blck_size((ggml_type)config_.gate_type), gate_output_ptr,
                          config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, (ggml_type)config_.gate_type,
                          ggml_internal_get_type_traits((ggml_type)config_.gate_type).vec_dot_type, GGML_TYPE_F32,
                          GGML_PREC_DEFAULT);
          void* up_input_ptr = m_local_up_input_ptr_[expert_idx];

          void* up_proj_ptr = (uint8_t*)m_local_up_proj_ + (expert_idx * config_.intermediate_size + ith * m_block) *
                                                               config_.hidden_size *
                                                               ggml_type_size((ggml_type)config_.up_type) /
                                                               ggml_blck_size((ggml_type)config_.up_type);

          float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * m_block;
          llamafile_sgemm(
              m_block, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size((ggml_type)config_.up_type),
              up_proj_ptr, config_.hidden_size / ggml_blck_size((ggml_type)config_.up_type), up_input_ptr,
              config_.hidden_size / ggml_blck_size((ggml_type)config_.up_type), up_output_ptr,
              config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, (ggml_type)config_.up_type,
              ggml_internal_get_type_traits((ggml_type)config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
          for (int i = 0; i < m_local_num_[expert_idx]; i++) {
            for (int j = ith * m_block; j < (ith + 1) * m_block; j++) {
              m_local_intermediate_fp32_ptr_[expert_idx][i * config_.intermediate_size + j] =
                  act_fn(m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size + j]) *
                  m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size + j];
            }
            float* intermediate_fp32_ptr =
                m_local_intermediate_fp32_ptr_[expert_idx] + i * config_.intermediate_size + ith * m_block;
            void* down_input_ptr =
                m_local_down_input_ptr_[expert_idx] +
                i * config_.intermediate_size *
                    ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type) /
                    ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type) +
                ith * m_block *
                    ggml_type_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type) /
                    ggml_blck_size(ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type);
            from_float(intermediate_fp32_ptr, down_input_ptr, m_block,
                       ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type);
          }
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      up_gate_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    m_block = QK_K;
    nth = config_.hidden_size / m_block;
    pool->do_work_stealing_job(
        nth * activated_expert, nullptr,
        [&](int task_id) {
          int64_t expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
          void* down_input_ptr = m_local_down_input_ptr_[expert_idx];

          auto expert_offset = expert_idx * config_.hidden_size * config_.intermediate_size;
          auto m_block_offset = ith * m_block * config_.intermediate_size;

          void* down_proj_ptr = (uint8_t*)m_local_down_proj_ + (expert_offset + m_block_offset) *
                                                                   ggml_type_size((ggml_type)config_.down_type) /
                                                                   ggml_blck_size((ggml_type)config_.down_type);

          float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * m_block;
          llamafile_sgemm(m_block, m_local_num_[expert_idx],
                          config_.intermediate_size / ggml_blck_size((ggml_type)config_.down_type), down_proj_ptr,
                          config_.intermediate_size / ggml_blck_size((ggml_type)config_.down_type), down_input_ptr,
                          config_.intermediate_size / ggml_blck_size((ggml_type)config_.down_type), down_output_ptr,
                          config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, (ggml_type)config_.down_type,
                          ggml_internal_get_type_traits((ggml_type)config_.down_type).vec_dot_type, GGML_TYPE_F32,
                          GGML_PREC_DEFAULT);
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      down_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
#endif

    pool->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int e = 0; e < config_.hidden_size; e++) {
            m_output_fp32_[i][e] = 0;
          }
          for (int j = 0; j < k; j++) {
            if (config_.should_skip_expert(expert_ids[i * k + j])) {
              continue;
            }
            for (int e = 0; e < config_.hidden_size; e++) {
              m_output_fp32_[i][e] +=
                  m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] *
                  weights[i * k + j];
            }
          }
          for (int e = 0; e < config_.hidden_size; e++) {
            output[i * config_.hidden_size + e] = m_output_fp32_[i][e];
          }
        },
        nullptr);
#ifdef FORWARD_TIME_PROFILE
    {
      auto now_time = std::chrono::high_resolution_clock::now();
      weight_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
      last = now_time;
    }
    auto end_time = std::chrono::high_resolution_clock::now();
    auto forward_total_time = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
    // 在函数末尾一次性打印所有阶段的耗时，并附带 max_local_num 和 qlen
    printf(
        "Profiling Results (numa[%d]): activated_expert: %d, prepare: %ld us, cpy_input: %ld us, q_input: %ld us, "
        "up_gate: %ld us, act: %ld us, q_down: %ld us, down: %ld us, weight: %ld us, total: %ld us, max_local_num: "
        "%d, qlen: %d\n",
        tp_part_idx, activated_expert, prepare_time, cpy_input_time, q_input_time, up_gate_time, act_time, q_down_time,
        down_time, weight_time, forward_total_time, max_local_num, qlen);
#endif
  }

  void forward(int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input, void* output_in) {
    auto output = (float*)output_in;
    if (qlen < config_.group_min_len) {
      for (int i = 0; i < qlen; i++) {
        forward_one(k, expert_ids + i * k, weights + i * k,
                    (uint8_t*)input + i * config_.hidden_size * ggml_type_size((ggml_type)config_.hidden_type) /
                                          ggml_blck_size((ggml_type)config_.hidden_type),
                    output + i * config_.hidden_size);
      }
      return;
    }
    int forward_len = std::min(config_.group_max_len, qlen);
    forward_many(forward_len, k, expert_ids, weights, input, output);
    forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k,
            (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size((ggml_type)config_.hidden_type) /
                                  ggml_blck_size((ggml_type)config_.hidden_type),
            output + forward_len * config_.hidden_size);
  }
};

template <>
class TP_MOE<LLAMA_MOE_TP> : public TP_MOE_Common<LLAMA_MOE_TP> {
 public:
  using TP_MOE_Common<LLAMA_MOE_TP>::TP_MOE_Common;

  void load_weights() {
    auto pool = this->config.pool;

    std::vector<int> tp_offsets(this->tp_count);
    int accumulated_offset = 0;
    for (int i = 0; i < this->tp_count; i++) {
      tp_offsets[i] = accumulated_offset;
      accumulated_offset += this->tp_configs[i].intermediate_size;
    }

    pool->dispense_backend()->do_numa_job([this, pool, tp_offsets](int tp_id) {
      this->tps[tp_id]->load_weights(this->config.intermediate_size, tp_offsets[tp_id]);
    });
    this->weights_loaded = true;
  }

  void merge_results(int qlen, void* output) { merge_results(qlen, output, false); }

  void merge_results(int qlen, void* output, bool incremental) {
    auto pool = this->config.pool;
    pool->do_work_stealing_job(
        qlen, nullptr,
        [this, output, incremental](int token_nth) {
          if (incremental) {
            to_float((uint8_t*)output + token_nth * config.hidden_size * ggml_type_size((ggml_type)config.hidden_type) /
                                            ggml_blck_size((ggml_type)config.hidden_type),
                     local_output + token_nth * config.hidden_size, config.hidden_size, (ggml_type)config.hidden_type);
            for (int e = 0; e < config.hidden_size; e++) {
              local_output_numa[0][token_nth * config.hidden_size + e] +=
                  local_output[token_nth * config.hidden_size + e];
            }
          }
          auto& tp_count = this->tp_count;
          for (int i = 1; i < tp_count; i++) {
            for (int e = 0; e < config.hidden_size; e++) {
              local_output_numa[0][token_nth * config.hidden_size + e] +=
                  local_output_numa[i][token_nth * config.hidden_size + e];
            }
          }
          from_float(local_output_numa[0] + token_nth * config.hidden_size,
                     (uint8_t*)output + token_nth * config.hidden_size * ggml_type_size((ggml_type)config.hidden_type) /
                                            ggml_blck_size((ggml_type)config.hidden_type),
                     config.hidden_size, (ggml_type)config.hidden_type);
        },
        nullptr);
  }
};
#endif


================================================
FILE: kt-kernel/operators/mla-tp.hpp
================================================
#ifndef CPUINFER_OPERATOR_MLA_HPP
#define CPUINFER_OPERATOR_MLA_HPP

#include "common.hpp"

template <typename T>
// qlens: token count for each query
// cache_pages: kv_cache for all queries in the current layer
// page_tables: kv_cache page table for each query ([query_idx][page_idx])
// kv_lens: kv_cache length for each query
// input: input tensor, shape [qlen, hidden_size]
// output: output tensor, shape [qlen, hidden_size]
// config: GeneralMLAConfig
// tp_idx: thread pool index
// T must have the following methods:
concept MLA_TP_PART =
    requires(T t, std::vector<int> qlens, std::vector<void*> kv_lora_pages, std::vector<void*> pe_pages,
             std::vector<std::vector<int>> page_tables, std::vector<int> kv_lens, const void* input, void* output,
             GeneralMLAConfig config, int tp_idx, int page_count, std::vector<void*> attention_masks) {
      typename T::output_t;
      { new T(config, tp_idx) } -> std::same_as<T*>;
      { t.set_pages(kv_lora_pages, pe_pages) } -> std::same_as<void>;
      { t.set_local_pages(page_count) } -> std::same_as<void>;
      { t.forward(qlens, page_tables, kv_lens, input, output) } -> std::same_as<void>;
      { t.forward(qlens, page_tables, kv_lens, attention_masks, input, output) } -> std::same_as<void>;
    };

template <MLA_TP_PART T>
class TP_MLA_Common : public MLA_Interface {
 protected:
  GeneralMLAConfig config;
  std::vector<GeneralMLAConfig> tp_configs;
  int tp_count;
  int me_numa_id;
  std::vector<std::unique_ptr<T>> tps;

  std::vector<typename T::output_t*> local_output_numa;

  bool weights_loaded = false;
#ifdef FORWARD_TIME_REPORT
  size_t forward_time_sum_ns = 0;
  size_t forward_count = 0;
#endif

 public:
  TP_MLA_Common(GeneralMLAConfig config) : config(config) {
    printf("TP MLA layer %d, pool: 0x%lx\n", config.layer_idx, (intptr_t)config.pool);
    if (config.pool == nullptr) {
      printf("TP MLA layer %d, no worker pool\n", config.layer_idx);
      throw std::runtime_error("no worker pool");
    }

    this->config = config;
    tp_count = config.pool->config.subpool_count;
    if (config.hidden_size % tp_count != 0) {
      printf("hidden_size %d, tp count %d\n", config.hidden_size, tp_count);
      throw std::runtime_error(
          "For TP, hidden_size must be a "
          "multiple of NUMA node count");
    }

    for (auto i = 0; i < tp_count; i++) {
      tps.push_back(nullptr);
    }

    tp_configs.resize(tp_count);
    config.pool->dispense_backend()->do_numa_job([this, config](int i) {
      tp_configs[i] = config;
      tp_configs[i].num_heads /= tp_count;
      tps[i] = std::move(std::unique_ptr<T>(new T(tp_configs[i], i)));
    });

    local_output_numa.resize(tp_count, nullptr);
    MemoryRequest mem_requests;
    for (auto i = 0; i < tp_count; i++) {
      mem_requests.append_pointer(&local_output_numa[i],
                                  sizeof(typename T::output_t) * tp_configs[i].max_qlen * tp_configs[i].hidden_size);
    }
    shared_mem_buffer.alloc(this, mem_requests);
  }

  void forward(std::vector<int> qlens, std::vector<std::vector<int>> page_tables, std::vector<int> kv_lens,
               const void* input, void* output) override {
    if (weights_loaded == false) [[unlikely]] {
      throw std::runtime_error("Not Loaded");
    }
#ifdef FORWARD_TIME_REPORT
    auto start = std::chrono::high_resolution_clock::now();
#endif

    auto pool = config.pool;
    pool->dispense_backend()->do_numa_job([this, pool, qlens, page_tables, kv_lens, input](int numa_id) {
      tps[numa_id]->forward(qlens, page_tables, kv_lens, input, this->local_output_numa[numa_id]);
    });
    int qlen_sum = 0;
    for (auto i = 0; i < qlens.size(); i++) {
      qlen_sum += qlens[i];
    }

    merge_results(qlen_sum, output);

#ifdef FORWARD_TIME_REPORT
    auto end = std::chrono::high_resolution_clock::now();
    auto forward_time = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    auto band_width = (1.0 * config.routed_expert_num * config.hidden_size * config.intermediate_size * 3 / 1e9) /
                      (1.0 * forward_time / 1e6);
    auto GFLOPS =
        (1.0 * config.hidden_size * config.intermediate_size * qlen * 3 * config.routed_expert_num * 2 / 1e9) /
        (1.0 * forward_time / 1e6);
    if (qlen <= 10) {
      forward_time_sum_ns += forward_time;
      forward_count++;
    }
    auto average_bandwidth =
        (1.0 * forward_count * config.routed_expert_num * config.hidden_size * config.intermediate_size * 3 / 1e9) /
        (1.0 * forward_time_sum_ns / 1e6);
    printf(
        "forward time %ld, time stamp:%ld, band width %f GElement/s, ave bandwidth %f GElement/s (only "
        "decode), %f GFLOPS, me numa: %d\n",
        forward_time, end.time_since_epoch().count() / 1000 % 100000000, band_width, average_bandwidth, GFLOPS,
        numa_node_of_cpu(sched_getcpu()));
#endif
  }

  void set_pages(std::vector<std::vector<void*>> kv_lora_pages, std::vector<std::vector<void*>> pe_pages) {
    for (auto i = 0; i < tp_count; i++) {
      tps[i]->set_pages(kv_lora_pages[i], pe_pages[i]);
    }
  }

  void set_local_pages(int page_count) {
    config.pool->dispense_backend()->do_numa_job(
        [this, page_count](int tp_idx) { tps[tp_idx]->set_local_pages(page_count); });
  }

  virtual void load_weights() = 0;
  virtual void merge_results(int qlen, void* output) = 0;
};

template <MLA_TP_PART T>
class TP_MLA : public TP_MLA_Common<T> {
 public:
  using TP_MLA_Common<T>::TP_MLA_Common;
  void load_weights() { throw std::runtime_error("Not Implemented"); }
  void merge_results(int qlen, void* output) { throw std::runtime_error("Not Implemented"); }
};

#endif

================================================
FILE: kt-kernel/operators/moe-tp.hpp
================================================
#ifndef CPUINFER_OPERATOR_MOE_HPP
#define CPUINFER_OPERATOR_MOE_HPP

// #define CHECK

#include <cstdint>
#include <cstdio>
#include <type_traits>

#include "../cpu_backend/shared_mem_buffer.h"
#include "common.hpp"

// Forward declaration for Llamafile backend type checking
class LLAMA_MOE_TP;

template <typename T>
concept MOE_TP_PART = requires(T t, int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input,
                               void* output, GeneralMOEConfig config, int tp_idx) {
  typename T::output_t;
  { new T(config, tp_idx) } -> std::same_as<T*>;
  { t.forward(qlen, k, expert_ids, weights, input, output) } -> std::same_as<void>;
  // { t.load_weights() } -> std::same_as<void>;
};

template <MOE_TP_PART T>
class TP_MOE_Common : public MoE_Interface {
 protected:
  std::vector<GeneralMOEConfig> tp_configs;
  int tp_count;
  int me_numa_id;
  std::vector<std::unique_ptr<T>> tps;

  std::vector<typename T::output_t*> local_output_numa;
  T::output_t* local_output = nullptr;

  bool weights_loaded = false;

#ifdef FORWARD_TIME_REPORT
  size_t forward_time_sum_ns = 0;
  size_t forward_count = 0;
#endif
 public:
  GeneralMOEConfig config;
  using input_t = typename T::input_t;
  TP_MOE_Common(GeneralMOEConfig config) : config(config) {
    printf("TP MOE layer %d, pool: 0x%lx, expert num: %d, num_experts_per_tok: %d\n", config.layer_idx,
           (intptr_t)config.pool, config.expert_num, config.num_experts_per_tok);
    if (config.pool == nullptr) {
      printf("TP MOE layer %d, no worker pool\n", config.layer_idx);
      throw std::runtime_error("no worker pool");
    }

    this->config = config;
    tp_count = config.pool->config.subpool_count;
    if (config.intermediate_size % tp_count != 0) {
      printf("intermediate_size %d, tp count %d\n", config.intermediate_size, tp_count);
      throw std::runtime_error(
          "For TP, intermediate_size must be a "
          "multiple of NUMA node count");
    }

    // Check if this is Llamafile backend using compile-time type checking
    constexpr bool is_llamafile = std::is_same<T, LLAMA_MOE_TP>::value;
#ifndef QK_K
#define QK_K 256
#endif

    if (is_llamafile) {
      // For Llamafile backend: use QK_K-aligned TP splitting
      if (config.intermediate_size % QK_K != 0) {
        printf("intermediate_size %d must be divisible by QK_K %d for Llamafile backend\n", config.intermediate_size,
               QK_K);
        throw std::runtime_error("intermediate_size must be divisible by QK_K (256) for Llamafile backend");
      }

      int num_blocks = config.intermediate_size / QK_K;
      int base_blocks = num_blocks / tp_count;
      int extra_blocks = num_blocks % tp_count;

      if (base_blocks == 0) {
        printf("intermediate_size %d is too small for tp_count %d (num_blocks=%d)\n", config.intermediate_size,
               tp_count, num_blocks);
        throw std::runtime_error("intermediate_size too small: cannot distribute blocks to all TP instances");
      }

      printf("Llamafile TP splitting: intermediate_size=%d, tp_count=%d, QK_K=%d\n", config.intermediate_size, tp_count,
             QK_K);
      printf("  num_blocks=%d, base_blocks=%d, extra_blocks=%d\n", num_blocks, base_blocks, extra_blocks);

      int current_offset = 0;
      for (auto i = 0; i < tp_count; i++) {
        tps.push_back(nullptr);
        GeneralMOEConfig tp_config = config;

        // First extra_blocks TPs get one more block
        int num_blocks_for_this_tp = base_blocks + (i < extra_blocks ? 1 : 0);
        tp_config.intermediate_size = num_blocks_for_this_tp * QK_K;

        printf("  TP %d: intermediate_size=%d, offset=%d, blocks=%d\n", i, tp_config.intermediate_size, current_offset,
               num_blocks_for_this_tp);

        tp_configs.push_back(tp_config);
        current_offset += tp_config.intermediate_size;
      }
    } else {
      // For non-Llamafile backends: use simple equal division
      if (config.intermediate_size % tp_count != 0) {
        printf("intermediate_size %d, tp count %d\n", config.intermediate_size, tp_count);
        throw std::runtime_error(
            "For TP, intermediate_size must be a "
            "multiple of NUMA node count");
      }

      for (auto i = 0; i < tp_count; i++) {
        tps.push_back(nullptr);
        GeneralMOEConfig tp_config = config;
        tp_config.intermediate_size /= tp_count;
        tp_configs.push_back(tp_config);
      }
    }

    config.pool->dispense_backend()->do_numa_job(
        [this, config](int i) { tps[i] = std::move(std::unique_ptr<T>(new T(tp_configs[i], i))); });

    local_output_numa.resize(tp_count, nullptr);
    MemoryRequest mem_requests;
    for (auto i = 0; i < tp_count; i++) {
      mem_requests.append_pointer(
          &local_output_numa[i],
          (size_t)sizeof(typename T::output_t) * tp_configs[i].max_possible_qlen() * tp_configs[i].hidden_size);
    }
    mem_requests.append_pointer(
        (void**)&local_output,
        sizeof(typename T::output_t) * tp_configs[0].max_possible_qlen() * tp_configs[0].hidden_size);
    // printf("local output tp, %d,\n", tp_configs[0].max_possible_qlen());
    shared_mem_buffer.alloc(this, mem_requests);
  }

  void warm_up() {
    int qlen = config.max_possible_qlen();
    std::vector<uint8_t> input(sizeof(ggml_bf16_t) * qlen * config.hidden_size);
    std::vector<uint8_t> output(sizeof(ggml_bf16_t) * qlen * config.hidden_size);
    std::vector<int64_t> expert_ids(qlen * config.num_experts_per_tok);
    std::vector<float> weights(qlen * config.num_experts_per_tok);
    for (int i = 0; i < qlen * config.num_experts_per_tok; i++) {
      expert_ids[i] = i % config.expert_num;
      weights[i] = 0.01;
    }
    forward(&qlen, config.num_experts_per_tok, expert_ids.data(), weights.data(), input.data(), output.data(), false);
  }

  void forward(int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input, void* output,
               bool incremental = false) {
    int qlen_local = qlen;
    forward(&qlen_local, k, expert_ids, weights, input, output, incremental);
  }

  void forward(int* qlen_ptr, int k, const int64_t* expert_ids, const float* weights, const void* input, void* output) {
    forward(qlen_ptr, k, expert_ids, weights, input, output, false);
  }

  void forward_binding(intptr_t qlen_ptr, int k, intptr_t expert_ids, intptr_t weights, intptr_t input, intptr_t output,
                       bool incremental) {
    forward((int*)qlen_ptr, k, (const int64_t*)expert_ids, (const float*)weights, (const void*)input, (void*)output,
            incremental);
  }

  void forward(int* qlen_ptr, int k, const int64_t* expert_ids, const float* weights, const void* input, void* output,
               bool incremental) {
    if (weights_loaded == false) [[unlikely]] {
      throw std::runtime_error("Not Loaded");
    }
#ifdef FORWARD_TIME_REPORT
    auto start = std::chrono::high_resolution_clock::now();
#endif
    int qlen = *qlen_ptr;

    auto pool = config.pool;
    pool->dispense_backend()->do_numa_job([this, pool, qlen, k, expert_ids, input, weights](int numa_id) {
      tps[numa_id]->forward(qlen, k, expert_ids, weights, input, this->local_output_numa[numa_id]);
    });

    merge_results(qlen, output, incremental);
#ifdef FORWARD_TIME_REPORT
    auto end = std::chrono::high_resolution_clock::now();
    auto forward_time = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    int unique_experts = 0;
    {
      std::unordered_set<int64_t> expert_set;
      for (int i = 0; i < qlen * config.num_experts_per_tok; i++) {
        expert_set.insert(expert_ids[i]);
      }
      unique_experts = expert_set.size();
    }
    auto band_width =
        (1.0 * unique_experts * config.hidden_size * config.intermediate_size * 3 / 1e9) / (1.0 * forward_time / 1e6);
    auto GFLOPS =
        (1.0 * config.hidden_size * config.intermediate_size * qlen * 3 * config.num_experts_per_tok * 2 / 1e9) /
        (1.0 * forward_time / 1e6);
    if (qlen <= 10) {
      forward_time_sum_ns += forward_time;
      forward_count++;
    }
    auto average_bandwidth =
        (1.0 * forward_count * unique_experts * config.hidden_size * config.intermediate_size * 3 / 1e9) /
        (1.0 * forward_time_sum_ns / 1e6);
    printf(
        "forward time %ld, time stamp:%ld, band width %f GElement/s, ave bandwidth %f GElement/s (only "
        "decode), %f GFLOPS, me numa: %d\n",
        forward_time, end.time_since_epoch().count() / 1000 % 100000000, band_width, average_bandwidth, GFLOPS,
        numa_node_of_cpu(sched_getcpu()));
#endif
  }

  virtual void load_weights() = 0;

  virtual void merge_results(int qlen, void* output) = 0;

  virtual void merge_results(int qlen, void* output, bool incremental) {
    if (incremental == false) {
      merge_results(qlen, output);
    } else {
      throw std::runtime_error("Not Implemented");
    }
  };
};

template <MOE_TP_PART T>
class TP_MOE : public TP_MOE_Common<T> {
 public:
  using TP_MOE_Common<T>::TP_MOE_Common;
  void load_weights(const uint64_t* physical_to_logical_map) { throw std::runtime_error("Not Implemented"); }
  // void merge_results(int qlen, void *output, bool incremental) { throw std::runtime_error("Not Implemented"); }
};

#endif


================================================
FILE: kt-kernel/operators/moe_kernel/api/common.h
================================================
// BOOST_STRONG_TYPEDEF(int8_t, int4_2_t);
#pragma once
#include <cstdint>

#include "llama.cpp/ggml.h"
#if !defined(CPUINFER_HAS_FLOAT16_T)
using float16_t = ggml_fp16_t;
#define CPUINFER_HAS_FLOAT16_T 1
#endif

#if !defined(CPUINFER_HAS_BFLOAT16_T)
using bfloat16_t = ggml_bf16_t;
#define CPUINFER_HAS_BFLOAT16_T 1
#endif  // CPUINFER_HAS_BFLOAT16_T
const bool PACKED = true;
#if defined(__aarch64__) || defined(__arm__) || defined(CPU_USE_KML)
#ifndef CPU_USE_KML
#define CPU_USE_KML
#endif
#endif  // USE_MOE_KERNEL_AMD or CPU_USE_KML

#define STRONG_TYPEDEF(T, D)                                   \
  struct D {                                                   \
    T t;                                                       \
    explicit D(const T &v) : t(v) {}                           \
    D() = default;                                             \
    D(const D &) = default;                                    \
    D &operator=(const D &) = default;                         \
    D &operator=(const T &rhs) {                               \
      t = rhs;                                                 \
      return *this;                                            \
    }                                                          \
    operator const T &() const { return t; }                   \
    operator T &() { return t; }                               \
    bool operator==(const D &rhs) const { return t == rhs.t; } \
    bool operator!=(const D &rhs) const { return t != rhs.t; } \
    bool operator<(const D &rhs) const { return t < rhs.t; }   \
  };
STRONG_TYPEDEF(int8_t, int4_2_t)
typedef int8_t BLASINT8;

/* matrix transpose or conjugate transpose */
typedef enum KERNEL_CBLAS_TRANSPOSE {
  KernelCblasNoTrans = 111,
  KernelCblasTrans = 112,
  KernelCblasConjTrans = 113,
  KernelCblasConjNoTrans = 114
} KERNEL_CBLAS_TRANSPOSE;
/* matrix stored in rows or cols */
typedef enum KERNEL_CBLAS_ORDER { KernelCblasRowMajor = 101, KernelCblasColMajor = 102 } KERNEL_CBLAS_ORDER;
/* matrix position is left or right */
typedef enum KERNEL_CBLAS_SIDE { KernelCblasLeft = 141, KernelCblasRight = 142 } KERNEL_CBLAS_SIDE;
typedef KERNEL_CBLAS_ORDER KERNEL_CBLAS_LAYOUT;
typedef enum KERNEL_CBLAS_OFFSET {
  KernelCblasRowOffset = 171,
  KernelCblasColOffset = 172,
  KernelCblasFixOffset = 173
} KERNEL_CBLAS_OFFSET;

enum class MatKernelVariant {
  Decode,
  Prefill,
};

================================================
FILE: kt-kernel/operators/moe_kernel/api/mat_kernel.h
================================================
#pragma once

#include <cstddef>
#include <cstdint>
#include <type_traits>

#include "common.h"

using GemmFn = void (*)(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transa,
                        const KERNEL_CBLAS_TRANSPOSE transb, const KERNEL_CBLAS_OFFSET offsetc, const size_t m,
                        const size_t n, const size_t k, const float alpha, const void* a, const size_t lda,
                        const int8_t oa, const void* b, const size_t ldb, const int8_t ob, const float beta, int32_t* c,
                        const size_t ldc, const int32_t* oc);

struct MatKernelSelection {
  GemmFn fn;
  int divide_elements_size;
};

MatKernelSelection select_kernel_for_int4(MatKernelVariant variant);
MatKernelSelection select_kernel_for_int8(MatKernelVariant variant);

template <typename T>
MatKernelSelection select_mat_kernel(MatKernelVariant variant) {
  if constexpr (std::is_same_v<typename T::dt, int4_2_t>) {
    return select_kernel_for_int4(variant);
  } else {
    return select_kernel_for_int8(variant);
  }
}


================================================
FILE: kt-kernel/operators/moe_kernel/la/kernel.hpp
================================================
#ifndef CPUINFER_OPERATOR_KERNEL_LA_HPP
#define CPUINFER_OPERATOR_KERNEL_LA_HPP

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <string>
#include <vector>

#include "../api/common.h"
#include "../mat_kernel/batch_gemm_api.hpp"
#include "llama.cpp/ggml.h"
static const size_t MAX_Nth_B = 1024, MAX_N_B = 1024, MAX_K_B = 10240;
namespace moe_kernel {
template <typename T>
T *offset_pointer(T *ptr, size_t byte_offset) {
  return reinterpret_cast<T *>(reinterpret_cast<char *>(ptr) + byte_offset);
}

inline float bf16_to_fp32(ggml_bf16_t src) {
  // 将 bfloat16 的 16 位移到 float32 的高 16 位，低 16 位填充 0
  uint16_t *src_16 = reinterpret_cast<uint16_t *>(&src);
  uint32_t packed = (uint32_t)*src_16 << 16;

  // 使用 union 将 uint32 解释为 float
  union {
    uint32_t u;
    float f;
  } converter;

  converter.u = packed;
  return converter.f;
}

inline float fp16_to_fp32(ggml_fp16_t src) { return ggml_fp16_to_fp32(src); }

template <typename K>
struct BufferAImpl {
  int8_t *a;
  float *d;
  int max_m, k;
  bool if_pack = false;

  static constexpr int M_STEP = K::M_STEP;
  static constexpr int K_STEP = K::K_STEP;
  // K_BLOCK is runtime-configurable via kernel tiling; expose as function to avoid constexpr requirements
  static inline int K_BLOCK() { return K::K_BLOCK; }
  static constexpr int PACK_SIZE_M = K::PACK_SIZE_M;
  static constexpr int PACK_SIZE_K = K::PACK_SIZE_K;

  static size_t required_size(int max_m, int k) { return sizeof(int8_t) * max_m * k + sizeof(float) * max_m; }

  BufferAImpl(int max_m, int k, void *ptr, bool if_pack = false) : max_m(max_m), k(k), if_pack(if_pack) {
    set_data(ptr);
  }

  BufferAImpl(int max_m, int k, bool if_pack = false) : max_m(max_m), k(k), if_pack(if_pack) {
    if (max_m % M_STEP != 0 || k % K_STEP != 0) {
      throw std::runtime_error("max_m and k must be multiples of M_STEP and K_STEP respectively");
    }
  }

  void set_data(void *ptr) {
    a = reinterpret_cast<int8_t *>(ptr);
    d = reinterpret_cast<float *>(a + max_m * k);
  }

  size_t required_size() const { return sizeof(int8_t) * max_m * k + sizeof(float) * max_m; }

  BufferAImpl<K> offset_row(size_t row_begin, size_t row_block) {
    auto buffera = BufferAImpl<K>(row_block, k, a + row_begin * k, if_pack);
    buffera.d = d + row_begin;
    return buffera;
  }

  // 将输入的 A 矩阵转换成 int8_t 的形式，
  // 这里的 A 矩阵是 m * k 的矩阵，存储在 a 中, 是行主序的 (row major)
  void from_mat(int m, ggml_bf16_t *src, int ith, int mth) {
    // printf("in A from_mat, m = %d, ith = %d, nth = %d\n", m, ith, nth);
    auto [m_start, m_end] = K::split_range_m(m, ith, mth);
    int m_block_begin = m_start;
    int m_block_size = m_end - m_block_begin;
    if (m_block_size < 0) {
      throw std::runtime_error("m_block_size is negative, this should not happen");
    }
    for (int m_begin = 0; m_begin < m_block_size; m_begin += M_STEP) {
      for (int i = 0; i < M_STEP && m_begin + i < m_block_size; i++) {
        float amax = 0;
        // TODO: 后续用 SVE 来加速
        for (int j = 0; j < k; j++) {
          // 先把 src 转换成 float
          float f = bf16_to_fp32(src[(m_block_begin + m_begin + i) * k + j]);
          f = f < 0 ? -f : f;
          if (f > amax) {
            amax = f;
          }
        }
        d[m_block_begin + m_begin + i] = amax / ((1 << 7) - 1);
        // TODO: 后续用 SVE 来加速
        // 通过这个 amax 来量化这一行
        for (int j = 0; j < k; j++) {
          // 先把 src 转换成 float
          float f = bf16_to_fp32(src[(m_block_begin + m_begin + i) * k + j]);
          if (if_pack) {
            throw std::runtime_error("Packing is deprecated in this function");
            size_t split_m = (m_begin + i) / PACK_SIZE_M;
            size_t m_idx = (m_begin + i) % PACK_SIZE_M;
            size_t split_k = j / PACK_SIZE_K;
            size_t k_idx = j % PACK_SIZE_K;
            size_t buff_idx = m_block_begin * k + split_m * PACK_SIZE_M * k + split_k * PACK_SIZE_K * PACK_SIZE_M +
                              m_idx * PACK_SIZE_K + k_idx;
            a[buff_idx] = static_cast<int8_t>(std::round(f / d[m_block_begin + m_begin + i]));
          } else {
            // 这里的 amax 是当前行的最大值
            a[(m_block_begin + m_begin + i) * k + j] =
                static_cast<int8_t>(std::round(f / d[m_block_begin + m_begin + i]));
          }
        }
      }
    }
  }

  void from_mat(int m, ggml_fp16_t *src, int ith, int mth) {
    // printf("in A from_mat, m = %d, ith = %d, nth = %d\n", m, ith, nth);
    auto [m_start, m_end] = K::split_range_m(m, ith, mth);
    int m_block_begin = m_start;
    int m_block_size = m_end - m_block_begin;
    if (m_block_size < 0) {
      throw std::runtime_error("m_block_size is negative, this should not happen");
    }
    for (int m_begin = 0; m_begin < m_block_size; m_begin += M_STEP) {
      for (int i = 0; i < M_STEP && m_begin + i < m_block_size; i++) {
        float amax = 0;
        // TODO: 后续用 SVE 来加速
        for (int j = 0; j < k; j++) {
          // 先把 src 转换成 float
          float f = fp16_to_fp32(src[(m_block_begin + m_begin + i) * k + j]);
          f = f < 0 ? -f : f;
          if (f > amax) {
            amax = f;
          }
        }
        d[m_block_begin + m_begin + i] = amax / ((1 << 7) - 1);
        // TODO: 后续用 SVE 来加速
        // 通过这个 amax 来量化这一行
        for (int j = 0; j < k; j++) {
          // 先把 src 转换成 float
          float f = fp16_to_fp32(src[(m_block_begin + m_begin + i) * k + j]);
          if (if_pack) {
            throw std::runtime_error("Packing is deprecated in this function");
            size_t split_m = (m_begin + i) / PACK_SIZE_M;
            size_t m_idx = (m_begin + i) % PACK_SIZE_M;
            size_t split_k = j / PACK_SIZE_K;
            size_t k_idx = j % PACK_SIZE_K;
            size_t buff_idx = m_block_begin * k + split_m * PACK_SIZE_M * k + split_k * PACK_SIZE_K * PACK_SIZE_M +
                              m_idx * PACK_SIZE_K + k_idx;
            a[buff_idx] = static_cast<int8_t>(std::round(f / d[m_block_begin + m_begin + i]));
          } else {
            // 这里的 amax 是当前行的最大值
            a[(m_block_begin + m_begin + i) * k + j] =
                static_cast<int8_t>(std::round(f / d[m_block_begin + m_begin + i]));
          }
        }
      }
    }
  }

  // 这里是针对 gate_output 作为 fp32 的形式，量化到 int8_t 的形式
  // 这里的 A 矩阵是 m * n (intermediate_size) 的矩阵，存储在 a 中, 是行主序的 (row major)
  void from_mat(int m, float *src, int ith, int mth) {
    assert(m <= max_m);
    // assert(ith == 0 && nth == 1);
    auto [m_start, m_end] = K::split_range_m(m, ith, mth);
    int m_block_begin = m_start;
    int m_block_size = m_end - m_block_begin;
    for (int m_begin = 0; m_begin < m_block_size; m_begin += M_STEP) {
      for (int i = 0; i < M_STEP && m_begin + i < m_block_size; i++) {
        float amax = 0;
        // TODO: 后续用 SVE 来加速
        for (int j = 0; j < k; j++) {
          // 先把 src 转换成 float
          float f = src[(m_block_begin + m_begin + i) * k + j];
          f = f < 0 ? -f : f;
          if (f > amax) {
            amax = f;
          }
        }
        d[m_block_begin + m_begin + i] = amax / ((1 << 7) - 1);
        // TODO: 后续用 SVE 来加速
        // 通过这个 amax 来量化这一行
        for (int j = 0; j < k; j++) {
          // 先把 src 转换成 float
          float f = src[(m_block_begin + m_begin + i) * k + j];
          if (if_pack) {
            throw std::runtime_error("Packing is deprecated in this function");
            size_t split_m = (m_begin + i) / PACK_SIZE_M;
            size_t m_idx = (m_begin + i) % PACK_SIZE_M;
            size_t split_k = j / PACK_SIZE_K;
            size_t k_idx = j % PACK_SIZE_K;
            size_t buff_idx = m_block_begin * k + split_m * PACK_SIZE_M * k + split_k * PACK_SIZE_K * PACK_SIZE_M +
                              m_idx * PACK_SIZE_K + k_idx;
            a[buff_idx] = static_cast<int8_t>(std::round(f / d[m_block_begin + m_begin + i]));
          } else {
            // 这里的 amax 是当前行的最大值
            a[(m_block_begin + m_begin + i) * k + j] =
                static_cast<int8_t>(std::round(f / d[m_block_begin + m_begin + i]));
          }
        }
      }
    }
  }

  void from_mat(int m, float *src) {
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
        float amax = 0;
        // TODO: 后续用 SVE 来加速
        for (int j = 0; j < k; j++) {
          // 先把 src 转换成 float
          float f = src[(m_begin + i) * k + j];
          f = f < 0 ? -f : f;
          if (f > amax) {
            amax = f;
          }
        }
        d[m_begin + i] = amax / ((1 << 7) - 1);
        // TODO: 后续用 SVE 来加速
        // 通过这个 amax 来量化这一行
        for (int j = 0; j < k; j++) {
          // 先把 src 转换成 float
          float f = src[(m_begin + i) * k + j];
          // 这里的 amax 是当前行的最大值
          a[(m_begin + i) * k + j] = static_cast<int8_t>(std::round(f / d[m_begin + i]));
        }
      }
    }
  }

  // 反量化
  void to_mat(int m, float *dst, int ith, int mth) {
    auto [m_start, m_end] = K::split_range_m(m, ith, mth);
    int m_block_begin = m_start;
    int m_block_size = m_end - m_block_begin;
    for (int m_begin = 0; m_begin < m_block_size; m_begin += M_STEP) {
      for (int i = 0; i < M_STEP && m_begin + i < m_block_size; i++) {
        for (int j = 0; j < k; j++) {
          float f = static_cast<float>(a[(m_block_begin + m_begin + i) * k + j]);
          f *= d[m_block_begin + m_begin + i];
          dst[(m_block_begin + m_begin + i) * k + j] = f;
        }
      }
    }
  }

  float *get_scale(int m, int m_begin) { return d + m_begin; }
};

template <typename K>
struct BufferCImpl {
  int32_t *c;
  int max_m, n;
  bool if_row_major;

  static constexpr int M_STEP = K::M_STEP;
  static constexpr int N_STEP = K::N_STEP;
  // N_BLOCK is runtime-configurable via kernel tiling; expose as function to avoid constexpr requirements
  static inline int N_BLOCK() { return K::N_BLOCK; }

  static size_t required_size(int max_m, int n) { return sizeof(int32_t) * max_m * n; }

  BufferCImpl(int max_m, int n, void *ptr, bool if_row_major = false) : max_m(max_m), n(n), if_row_major(if_row_major) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    assert(max_m % M_STEP == 0);
    assert(n % N_STEP == 0);
    c = reinterpret_cast<int *>(ptr);
  }

  BufferCImpl(int max_m, int n, bool if_row_major = false) : max_m(max_m), n(n), if_row_major(if_row_major) {}

  void set_data(void *ptr) {
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    c = reinterpret_cast<int32_t *>(ptr);
  }
  size_t required_size() const { return sizeof(int32_t) * max_m * n; }

  // void to_mat(int m, float **dst, int ith, int nth) {
  //   *dst = c + ith * N_BLOCK;
  // }
};

struct GemmKernelInt8 {
  using dt = int8_t;
  using output_t = int32_t;
  static constexpr double ELEMENT_SIZE = 1;
  static const int TILE_M = 16;
  static const int TILE_K = 64;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 4;

  // static const int M_STEP = TILE_M * 2;
  // static const int N_STEP = TILE_N * 2;
  // static const int K_STEP = TILE_K;
  static const int M_STEP = 1;
  static const int N_STEP = 1;
  static const int K_STEP = 1;

  // static inline const int N_BLOCK = 1024;
  // Make tiling params runtime-configurable (modifiable via Python bindings)
  static inline int N_BLOCK_UP_GATE = 32;
  static inline int N_BLOCK_DOWN = 64;
  static inline int N_BLOCK_UP_GATE_PREFI = 32;
  static inline int N_BLOCK_DOWN_PREFI = 64;
  static inline int N_BLOCK = 64;
  static inline int M_BLOCK = 320;
  // static inline const int N_BLOCK = 32;
  static inline int K_BLOCK = 7168;

  // Setter/getter for runtime tiling configuration
  static void set_tiling(int n_block_up_gate, int n_block_down, int n_block, int m_block, int k_block,
                         int n_block_up_gate_prefi, int n_block_down_prefi) {
    N_BLOCK_UP_GATE = n_block_up_gate;
    N_BLOCK_DOWN = n_block_down;
    N_BLOCK = n_block;
    M_BLOCK = m_block;
    K_BLOCK = k_block;
    N_BLOCK_UP_GATE_PREFI = n_block_up_gate_prefi;
    N_BLOCK_DOWN_PREFI = n_block_down_prefi;
  }
  static std::tuple<int, int, int, int, int, int, int> get_tiling() {
    return std::make_tuple(N_BLOCK_UP_GATE, N_BLOCK_DOWN, N_BLOCK, M_BLOCK, K_BLOCK, N_BLOCK_UP_GATE_PREFI,
                           N_BLOCK_DOWN_PREFI);
  }

  static inline const int PACK_SIZE_N = 8;
  static inline const int PACK_SIZE_M = 8;
  static inline const int PACK_SIZE_K = 32;

  static std::string name() { return "MOE_INT8"; }
  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }
  // type_: d for decode, p for prefill
  static int recommended_nth_down(int n, char type_ = 'd') {
    if (type_ == 'p') {
      if (n % N_BLOCK_DOWN_PREFI != 0) {
        throw std::invalid_argument("n must be multiple of N_BLOCK_DOWN_PREFI in prefill");
      }
      return n / N_BLOCK_DOWN_PREFI;
    } else {
      if (n % N_BLOCK_DOWN != 0) {
        throw std::invalid_argument("n must be multiple of N_BLOCK_DOWN in decode");
      }
      return n / N_BLOCK_DOWN;
    }
  }

  static int recommended_nth_up_gate(int n, char type_ = 'd') {
    if (type_ == 'p') {
      if (n % N_BLOCK_UP_GATE_PREFI != 0) {
        throw std::invalid_argument("n must be multiple of N_BLOCK_UP_GATE_PREFI in prefill");
      }
      return n / N_BLOCK_UP_GATE_PREFI;
    } else {
      if (n % N_BLOCK_UP_GATE != 0) {
        throw std::invalid_argument("n must be multiple of N_BLOCK_UP_GATE in decode");
      }
      return n / N_BLOCK_UP_GATE;
    }
  }

  static int recommended_mth(int m) { return (m + M_BLOCK - 1) / M_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth, int block_size = N_BLOCK) {
    int n_start = block_size * ith;
    int n_end = std::min(n, block_size * (ith + 1));
    return {n_start, n_end};
  }

  static std::pair<int, int> split_range_m(int m, int ith, int mth = 0) {
    int m_start = M_BLOCK * ith;
    int m_end = std::min(m, M_BLOCK * (ith + 1));
    return {m_start, m_end};
  }

  static std::pair<int, int> split_range_n_block(int n, int ith, int nth, int block) {
    int n_start = block * ith;
    int n_end = std::min(n, block * (ith + 1));
    return {n_start, n_end};
  }

  using BufferA = BufferAImpl<GemmKernelInt8>;
  using BufferC = BufferCImpl<GemmKernelInt8>;

  struct BufferB {
    int8_t *b;
    std::vector<int8_t *> b_pack;  // b_pack[i] -> the ith block (the ith packed matrix of B)
    size_t reorder_B_size;
    size_t nth_B;       // number of blocks of B
    size_t block_size;  // size of each block of B
    float *d;
    int n, k;
    static constexpr bool SCALE = true;
    bool if_pack = false;
    // n for normal, u for up_gate, d for down
    static size_t required_size(int n, int k, bool if_pack = false, char mat_type = 'n', bool plain = true) {
      int nth, n_block;
      if (if_pack && !plain) {
        switch (mat_type) {
          case 'n':
            nth = recommended_nth(n);
            n_block = N_BLOCK;
            break;
          case 'u':
            nth = recommended_nth_up_gate(n);
            n_block = N_BLOCK_UP_GATE;
            break;
          case 'd':
            nth = recommended_nth_down(n);
            n_block = N_BLOCK_DOWN;
            break;
          default:
            throw std::invalid_argument("Invalid mat_type");
        }
        size_t reorder_B_size = get_reorder_B_size(KernelCblasRowMajor, KernelCblasNoTrans, k, n_block);
        return sizeof(int8_t) * nth * reorder_B_size + sizeof(float) * n;
      } else {
        return sizeof(int8_t) * n * k + sizeof(float) * n;
      }
    }
    BufferB(int n, int k, bool if_pack = false, char mat_type = 'n', bool plain = true) : n(n), k(k), if_pack(if_pack) {
      int nth, n_block;
      if (if_pack && !plain) {
        switch (mat_type) {
          case 'n':
            nth = recommended_nth(n);
            n_block = N_BLOCK;
            break;
          case 'u':
            nth = recommended_nth_up_gate(n);
            n_block = N_BLOCK_UP_GATE;
            break;
          case 'd':
            nth = recommended_nth_down(n);
            n_block = N_BLOCK_DOWN;
            break;
          default:
            throw std::invalid_argument("Invalid mat_type");
        }
        reorder_B_size = get_reorder_B_size(KernelCblasRowMajor, KernelCblasNoTrans, k, n_block);
        nth_B = nth;
        block_size = n_block;
        b_pack.resize(nth);
      }
      if (n % N_STEP != 0 || k % K_STEP != 0) {
        throw std::runtime_error("n and k must be multiples of N_STEP and K_STEP respectively");
      }
    }
    BufferB(int n, int k, void *ptr, bool if_pack = false, char mat_type = 'n', bool plain = true)
        : BufferB(n, k, if_pack, mat_type, plain) {
      set_data(ptr, plain);
      // printf("mat_type:%c,nth_B:%zu,b_pack_ptr[0]:%p,d_ptr:%p,ptr:%p\n", mat_type, nth_B, b_pack[0], d, ptr);
    }
    void set_data(void *ptr, bool plain = true) {
      if (if_pack && !plain) {
        for (size_t i = 0; i < nth_B; i++) {
          b_pack[i] = reinterpret_cast<int8_t *>(ptr) + i * reorder_B_size;
        }
        d = reinterpret_cast<float *>((int8_t *)ptr + nth_B * reorder_B_size);
      } else {
        b = reinterpret_cast<int8_t *>(ptr);
        d = reinterpret_cast<float *>(b + n * k);
      }
    }
    size_t required_size() const { return sizeof(int8_t) * n * k + sizeof(float) * n; }
    BufferB offset_col(size_t col_begin, size_t col_block) {
      auto bufferb = BufferB(col_block, k, b + col_begin * k, if_pack);
      bufferb.d = d + col_begin;
      return bufferb;
    }
    // B 矩阵是 K * N 的矩阵，存储在 b 中, 是列主序的 (column major)
    void from_mat(ggml_bf16_t *src, int ith, int nth, int n_new = -1, bool if_pack = false,
                  bool plain = true) {  // CHECK: nth has no usage
      if (n_new > 0) {
        n = n_new;  // 如果 n_new 大于 0，则使用 n_new
      }
      // 这里将 src 转换成 int8_t 的形式，按照k 维度量化  (也就是按列量化)
      int8_t *b_t = nullptr;
      if ((if_pack || this->if_pack) && !plain) {
        b_t = (int8_t *)malloc(sizeof(int8_t) * n * k);
      }
      auto [n_start, n_end] = split_range_n(n, ith, nth, block_size);
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int i = 0; i < N_STEP && n_begin + i < n_block_size; i++) {
          float amax = 0;
          // TODO: 后续用 SVE 来加速
          for (int j = 0; j < k; j++) {
            // 先把 src 转换成 float
            float f = bf16_to_fp32(src[(n_block_begin + n_begin + i) * k + j]);
            f = f < 0 ? -f : f;
            if (f > amax) {
              amax = f;
            }
          }
          d[n_block_begin + n_begin + i] = amax / ((1 << 7) - 1);
          // TODO: 后续用 SVE 来加速
          // 通过这个 amax 来量化这一列
          for (int j = 0; j < k; j++) {
            // 先把 src 转换成 float
            float f = bf16_to_fp32(src[(n_block_begin + n_begin + i) * k + j]);
            if ((if_pack || this->if_pack) && plain) {
              size_t split_n = (n_begin + i) / PACK_SIZE_N;
              size_t n_idx = (n_begin + i) % PACK_SIZE_N;
              size_t split_k = j / PACK_SIZE_K;
              size_t k_idx = j % PACK_SIZE_K;

              size_t buff_idx = n_block_begin * k + split_n * PACK_SIZE_N * k + split_k * PACK_SIZE_N * PACK_SIZE_K +
                                n_idx * PACK_SIZE_K + k_idx;
              b[buff_idx] = static_cast<int8_t>(std::round(f / d[n_block_begin + n_begin + i]));
            } else if ((if_pack || this->if_pack) && !plain) {
              // 这里的 amax 是当前列的最大值
              b_t[(n_begin + i) * k + j] = static_cast<int8_t>(std::round(f / d[n_block_begin + n_begin + i]));
            } else {
              b[(n_block_begin + n_begin + i) * k + j] =
                  static_cast<int8_t>(std::round(f / d[n_block_begin + n_begin + i]));
            }
          }
        }
      }
      if ((if_pack || this->if_pack) && !plain) {
        // 在这里调用 AMD 的reorder函数
        reorder_B_gemm(KernelCblasColMajor, KernelCblasNoTrans, k, n_block_size, k, b_t, b_pack[ith]);
        free(b_t);
      }
    }

    void from_mat(float *src, int ith, int nth, int n_new = -1, bool if_pack = false) {  // CHECK: nth has no usage
      if (n_new > 0) {
        n = n_new;  // 如果 n_new 大于 0，则使用 n_new
      }
      // 这里将 src 转换成 int8_t 的形式，按照k 维度量化  (也就是按列量化)
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      // printf("n_start = %d, n_end = %d, n = %d\n", n_start, n_end, n);
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      float average = 0;
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int i = 0; i < N_STEP && n_begin + i < n_block_size; i++) {
          float amax = 0;
          // TODO: 后续用 SVE 来加速
          for (int j = 0; j < k; j++) {
            // 先把 src 转换成 float
            float f = src[(n_block_begin + n_begin + i) * k + j];
            f = f < 0 ? -f : f;
            average += f;
            if (f > amax) {
              amax = f;
            }
          }
          average /= k;
          d[n_block_begin + n_begin + i] = amax / ((1 << 7) - 1);
          // printf("amax: %f,average: %f\n", amax, average);
          // TODO: 后续用 SVE 来加速
          // 通过这个 amax 来量化这一列
          for (int j = 0; j < k; j++) {
            // 先把 src 转换成 float
            float f = src[(n_block_begin + n_begin + i) * k + j];
            // 这里的 amax 是当前列的最大值
            if (if_pack || this->if_pack) {
              size_t split_n = (n_begin + i) / PACK_SIZE_N;
              size_t n_idx = (n_begin + i) % PACK_SIZE_N;
              size_t split_k = j / PACK_SIZE_K;
              size_t k_idx = j % PACK_SIZE_K;

              size_t buff_idx = n_block_begin * k + split_n * PACK_SIZE_N * k + split_k * PACK_SIZE_N * PACK_SIZE_K +
                                n_idx * PACK_SIZE_K + k_idx;
              b[buff_idx] = static_cast<int8_t>(std::round(f / d[n_block_begin + n_begin + i]));
            } else {
              b[(n_block_begin + n_begin + i) * k + j] =
                  static_cast<int8_t>(std::round(f / d[n_block_begin + n_begin + i]));
            }
          }
        }
      }
    }

    void from_mat_row_major(float *src, int ld, int ith, int nth, int n_new = -1) {  // CHECK: nth has no usage
      if (n_new > 0) {
        n = n_new;  // 如果 n_new 大于 0，则使用 n_new
      }
      // 这里将 src 转换成 int8_t 的形式，按照k 维度量化  (也就是按列量化),但是 src 是行主序的
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int i = 0; i < N_STEP && n_begin + i < n_block_size; i++) {
          float amax = 0;
          for (int j = 0; j < k; j++) {
            float f = src[j * ld + (n_block_begin + n_begin + i)];
            f = f < 0 ? -f : f;
            if (f > amax) {
              amax = f;
            }
          }
          d[n_block_begin + n_begin + i] = amax / ((1 << 7) - 1);
          for (int j = 0; j < k; j++) {
            float f = src[j * ld + (n_block_begin + n_begin + i)];
            // 这里的 amax 是当前列的最大值
            b[(n_block_begin + n_begin + i) * k + j] =
                static_cast<int8_t>(std::round(f / d[n_block_begin + n_begin + i]));
          }
        }
      }
    }

    // 将内容解量化为 float
    void to_mat(float *dst, int ith, int nth, int n_new = -1) {
      if (n_new > 0) {
        n = n_new;  // 如果 n_new 大于 0，则使用 n_new
      }
      // 这里将 b 转换成 float 的形式，按照k 维度解量化
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int i = 0; i < N_STEP && n_begin + i < n_block_size; i++) {
          // 通过这个 amax 来解量化这一列
          for (int j = 0; j < k; j++) {
            // 先把 b 转换成 float
            int8_t b_val = b[(n_block_begin + n_begin + i) * k + j];
            float d_val = d[n_block_begin + n_begin + i];
            dst[(n_block_begin + n_begin + i) * k + j] = b_val * d_val;
          }
        }
      }
    }

    float *get_scale(int n, int n_begin) { return d + n_begin; }
  };
  /* 将 buffer A 转为 buffer B, [m,k](row major) -> [k,n](column major) (n = m)
    而量化部分没变化，直接 buffer A 的 d = buffer B 的 d，校验 m 和 n 以及 k是否相等，才能转换
  */
  static void convert_buffer_a_to_buffer_b(BufferA *ba, BufferB *bb) {
    if (bb->n != ba->max_m || bb->k != ba->k || bb->if_pack != ba->if_pack) {
      throw std::runtime_error(
          "BufferA and BufferB dimensions do not match for conversion, or they are not the same pack.");
    }
    bb->b = ba->a;
    bb->d = ba->d;
  }

  static void convert_buffer_b_to_buffer_a(BufferB *bb, BufferA *ba) {
    if (ba->max_m != bb->n || ba->k != bb->k || ba->if_pack != bb->if_pack) {
      throw std::runtime_error(
          "BufferB and BufferA dimensions do not match for conversion, or they are not the same pack.");
    }
    ba->a = bb->b;
    ba->d = bb->d;
  }
  // 改变当前 C 的 view
  static void change_view(BufferC *c_src, BufferC *c_dst) {
    if (c_src->max_m != c_dst->n || c_src->n != c_dst->max_m || c_src->if_row_major == c_dst->if_row_major) {
      throw std::runtime_error("C buffer size mismatch or they are the same major");
    }
    c_dst->c = c_src->c;
  }
  // 此函数作用是，对 int32结果的 c 矩阵应用 A和 B 矩阵的scale（反量化）
  // 这里的 c 矩阵是 m * n 的矩阵，存储在 c 中, 是行主序的 (row major)
  // A 矩阵是 m * k 的矩阵，按照行量化，其 scale 是 d 是 m 维度的，对应每一行的量化系数
  // B 矩阵是 k * n 的矩阵，按照列量化，其 scale 是 d 是 n 维度的，对应每一列的量化系数
  // C 的第 i 行第 j 列的缩放值就是 A 的第 i 行的缩放值 * B 的第 j 列的缩放值
  static void apply_scale(int m, int n, float *c, BufferA *ba, BufferB *bb, BufferC *bc) {
    // TODO: 后续用 SVE 来加速
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
        float *scale_a = ba->get_scale(m, m_begin + i);
        for (int n_begin = 0; n_begin < n; n_begin += N_STEP) {
          for (int j = 0; j < N_STEP && n_begin + j < n; j++) {
            float *scale_b = bb->get_scale(n, n_begin + j);
            c[(m_begin + i) * n + (n_begin + j)] = (*scale_a) * (*scale_b) * bc->c[(m_begin + i) * n + (n_begin + j)];
          }
        }
      }
    }
  }

  // 对第二个维度分块的 apply scale
  static void apply_scale(int m, int n, float *c, BufferA *ba, BufferB *bb, BufferC *bc, int ith, int nth, int block,
                          int jth = -1) {
    // printf("use split apply scale\n");
    auto [n_start, n_end] = split_range_n_block(n, ith, nth, block);
    int m_start = 0, m_end = m;
    if (jth != -1) {
      auto tmp = split_range_m(m, jth);
      m_start = tmp.first;
      m_end = tmp.second;
    }
    // TODO: 后续用 SVE 来加速
    for (int m_begin = m_start; m_begin < m_end; m_begin += M_STEP) {
      for (int i = 0; i < M_STEP && m_begin + i < m_end; i++) {
        float *scale_a = ba->get_scale(m, m_begin + i);
        for (int n_begin = n_start; n_begin < n_end; n_begin += N_STEP) {
          for (int j = 0; j < N_STEP && n_begin + j < n_end; j++) {
            float *scale_b = bb->get_scale(n, n_begin + j);
            c[(m_begin + i) * n + (n_begin + j)] = (*scale_a) * (*scale_b) * bc->c[(m_begin + i) * n + (n_begin + j)];
          }
        }
      }
    }
  }

  // 两个维度均有分块的 apply scale
  // C 矩阵区分是 row major 还是 column major
  static void apply_scale(float *c, int ldc, BufferA *ba, BufferB *bb, BufferC *bc, int m_start, int m_end, int n_start,
                          int n_end, bool if_row_major = true, long long c_row_idx_offset = 0,
                          long long c_col_idx_offset = 0) {
    if (if_row_major) {
      for (int m_begin = m_start; m_begin < m_end; m_begin += M_STEP) {
        for (int i = 0; i < M_STEP && m_begin + i < m_end; i++) {
          float *scale_a = ba->get_scale(m_end, m_begin + i);
          for (int n_begin = n_start; n_begin < n_end; n_begin += N_STEP) {
            for (int j = 0; j < N_STEP && n_begin + j < n_end; j++) {
              float *scale_b = bb->get_scale(n_end, n_begin + j);
              c[(m_begin + i + c_row_idx_offset) * ldc + (n_begin + j + c_col_idx_offset)] =
                  (*scale_a) * (*scale_b) *
                  bc->c[(m_begin + i + c_row_idx_offset) * ldc + (n_begin + j + c_col_idx_offset)];
            }
          }
        }
      }
    } else {
      for (int n_begin = n_start; n_begin < n_end; n_begin += N_STEP) {
        for (int j = 0; j < N_STEP && n_begin + j < n_end; j++) {
          float *scale_b = bb->get_scale(n_end, n_begin + j);
          for (int m_begin = m_start; m_begin < m_end; m_begin += M_STEP) {
            for (int i = 0; i < M_STEP && m_begin + i < m_end; i++) {
              float *scale_a = ba->get_scale(m_end, m_begin + i);
              c[(n_begin + j + c_col_idx_offset) * ldc + (m_begin + i + c_row_idx_offset)] =
                  (*scale_a) * (*scale_b) *
                  bc->c[(n_begin + j + c_col_idx_offset) * ldc + (m_begin + i + c_row_idx_offset)];
            }
          }
        }
      }
    }
  }

  // 两个维度均有分块的 apply scale
  // C 矩阵区分是 row major 还是 column major
  static void apply_scale(float *c, int ldc, BufferA *ba, BufferB *bb, int32_t *bc, int m_start, int m_end, int n_start,
                          int n_end, bool if_row_major = true, long long c_row_idx_offset = 0,
                          long long c_col_idx_offset = 0) {
    if (if_row_major) {
      for (int m_begin = m_start; m_begin < m_end; m_begin += M_STEP) {
        for (int i = 0; i < M_STEP && m_begin + i < m_end; i++) {
          float *scale_a = ba->get_scale(m_end, m_begin + i);
          for (int n_begin = n_start; n_begin < n_end; n_begin += N_STEP) {
            for (int j = 0; j < N_STEP && n_begin + j < n_end; j++) {
              float *scale_b = bb->get_scale(n_end, n_begin + j);
              c[(m_begin + i + c_row_idx_offset) * ldc + (n_begin + j + c_col_idx_offset)] =
                  (*scale_a) * (*scale_b) *
                  bc[(m_begin + i + c_row_idx_offset) * ldc + (n_begin + j + c_col_idx_offset)];
            }
          }
        }
      }
    } else {
      for (int n_begin = n_start; n_begin < n_end; n_begin += N_STEP) {
        for (int j = 0; j < N_STEP && n_begin + j < n_end; j++) {
          float *scale_b = bb->get_scale(n_end, n_begin + j);
          for (int m_begin = m_start; m_begin < m_end; m_begin += M_STEP) {
            for (int i = 0; i < M_STEP && m_begin + i < m_end; i++) {
              float *scale_a = ba->get_scale(m_end, m_begin + i);
              c[(n_begin + j + c_col_idx_offset) * ldc + (m_begin + i + c_row_idx_offset)] =
                  (*scale_a) * (*scale_b) *
                  bc[(n_begin + j + c_col_idx_offset) * ldc + (m_begin + i + c_row_idx_offset)];
            }
          }
        }
      }
    }
  }
};

struct GemmKernelInt4 {
  using dt = int4_2_t;
  using output_t = int32_t;
  static constexpr double ELEMENT_SIZE = 0.5;
  static const int TILE_M = 16;
  static const int TILE_K = 64;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 4;

  // static const int M_STEP = TILE_M * 2;
  // static const int N_STEP = TILE_N * 2;
  // static const int K_STEP = TILE_K;
  static const int M_STEP = 1;
  static const int N_STEP = 1;
  static const int K_STEP = 1;

  // static inline const int N_BLOCK = 1024;
  // Make tiling params runtime-configurable (modifiable via Python bindings)
  static inline int N_BLOCK_UP_GATE = 256;
  static inline int N_BLOCK_DOWN = 1024;
  static inline int N_BLOCK_UP_GATE_PREFI = 256;
  static inline int N_BLOCK_DOWN_PREFI = 1024;
  static inline int N_BLOCK = 64;
  static inline int M_BLOCK = 320;
  // static inline const int N_BLOCK = 32;
  static inline int K_BLOCK = 7168;

  // Setter/getter for runtime tiling configuration
  static void set_tiling(int n_block_up_gate, int n_block_down, int n_block, int m_block, int k_block,
                         int n_block_up_gate_prefi, int n_block_down_prefi) {
    N_BLOCK_UP_GATE = n_block_up_gate;
    N_BLOCK_DOWN = n_block_down;
    N_BLOCK = n_block;
    M_BLOCK = m_block;
    K_BLOCK = k_block;
    N_BLOCK_UP_GATE_PREFI = n_block_up_gate_prefi;
    N_BLOCK_DOWN_PREFI = n_block_down_prefi;
  }
  static std::tuple<int, int, int, int, int, int, int> get_tiling() {
    return std::make_tuple(N_BLOCK_UP_GATE, N_BLOCK_DOWN, N_BLOCK, M_BLOCK, K_BLOCK, N_BLOCK_UP_GATE_PREFI,
                           N_BLOCK_DOWN_PREFI);
  }

  static inline const int PACK_SIZE_N = 8;
  static inline const int PACK_SIZE_K = 32;
  static inline const int PACK_SIZE_M = 8;

  static std::string name() { return "MOE_INT4"; }
  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static int recommended_nth_down(int n, char type_ = 'd') {
    if (type_ == 'p') {
      if (n % N_BLOCK_DOWN_PREFI != 0) {
        throw std::invalid_argument("n must be multiple of N_BLOCK_DOWN_PREFI in prefill");
      }
      return n / N_BLOCK_DOWN_PREFI;
    } else {
      if (n % N_BLOCK_DOWN != 0) {
        throw std::invalid_argument("n must be multiple of N_BLOCK_DOWN in decode");
      }
      return n / N_BLOCK_DOWN;
    }
  }
  static int recommended_mth(int m) { return (m + M_BLOCK - 1) / M_BLOCK; }

  static int recommended_nth_up_gate(int n, char type_ = 'd') {
    if (type_ == 'p') {
      if (n % N_BLOCK_UP_GATE_PREFI != 0) {
        throw std::invalid_argument("n must be multiple of N_BLOCK_UP_GATE_PREFI in prefill");
      }
      return n / N_BLOCK_UP_GATE_PREFI;
    } else {
      if (n % N_BLOCK_UP_GATE != 0) {
        throw std::invalid_argument("n must be multiple of N_BLOCK_UP_GATE in decode");
      }
      return n / N_BLOCK_UP_GATE;
    }
  }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }
  static std::pair<int, int> split_range_m(int m, int ith, int mth) {
    int n_start = M_BLOCK * ith;
    int n_end = std::min(m, M_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static std::pair<int, int> split_range_n_block(int n, int ith, int nth, int block) {
    int n_start = block * ith;
    int n_end = std::min(n, block * (ith + 1));
    return {n_start, n_end};
  }

  using BufferA = BufferAImpl<GemmKernelInt4>;
  using BufferC = BufferCImpl<GemmKernelInt4>;

  struct BufferB {
    dt *b;
    float *d;
    int n, k;
    std::vector<int8_t *> b_pack;  // b_pack[i] -> the ith block (the ith packed matrix of B)
    static constexpr bool SCALE = true;
    bool if_pack = false;

    // static size_t required_size(int n, int k) { return sizeof(int8_t) * n * k / 2 + sizeof(float) * n; }
    static size_t required_size(int n, int k, bool if_pack = false, char mat_type = 'n', bool plain = true) {
      int nth, n_block;
      if (if_pack && !plain) {
        switch (mat_type) {
          case 'n':
            nth = recommended_nth(n);
            n_block = N_BLOCK;
            break;
          case 'u':
            nth = recommended_nth_up_gate(n);
            n_block = N_BLOCK_UP_GATE;
            break;
          case 'd':
            nth = recommended_nth_down(n);
            n_block = N_BLOCK_DOWN;
            break;
          default:
            throw std::invalid_argument("Invalid mat_type");
        }
        size_t reorder_B_size = get_reorder_B_size(KernelCblasRowMajor, KernelCblasNoTrans, k, n_block);
        return sizeof(int8_t) * nth * reorder_B_size + sizeof(float) * n;
      } else {
        return sizeof(int8_t) * n * k / 2 + sizeof(float) * n;
      }
    }

    // BufferB(int n, int k, void *ptr, bool if_pack = false) : n(n), k(k), if_pack(if_pack) {
    //   b = reinterpret_cast<dt *>(ptr);
    //   d = reinterpret_cast<float *>(moe_kernel::offset_pointer(b, n * k / 2));
    // }
    BufferB(int n, int k, bool if_pack = false, char mat_type = 'n', bool plain = true) : n(n), k(k), if_pack(if_pack) {
      if (n % N_STEP != 0 || k % K_STEP != 0) {
        throw std::runtime_error("n and k must be multiples of N_STEP and K_STEP respectively");
      }
    }
    BufferB(int n, int k, void *ptr, bool if_pack = false, char mat_type = 'n', bool plain = true)
        : BufferB(n, k, if_pack, mat_type, plain) {
      set_data(ptr, plain);
    }
    void set_data(void *ptr, bool plain = true) {
      b = reinterpret_cast<dt *>(ptr);
      d = reinterpret_cast<float *>(moe_kernel::offset_pointer(b, n * k / 2));
    }
    size_t required_size() const { return sizeof(int8_t) * n * k / 2 + sizeof(float) * n; }
    BufferB offset_col(size_t col_begin, size_t col_block) {
      auto bufferb = BufferB(col_block, k, moe_kernel::offset_pointer(b, (col_begin * k) / 2), if_pack);
      bufferb.d = d + col_begin;
      return bufferb;
    }
    // B 矩阵是 K * N 的矩阵，存储在 b 中, 是列主序的 (column major)
    void from_mat(ggml_bf16_t *src, int ith, int nth, int n_new = -1, bool if_pack = false,
                  bool plain = true) {  // CHECK: nth has no usage
      if (!if_pack && !this->if_pack) throw std::runtime_error("from mat for buffer should be packed");
      if (n_new > 0) {
        n = n_new;  // 如果 n_new 大于 0，则使用 n_new
      }
      // 这里将 src 转换成 int8_t 的形式，按照k 维度量化  (也就是按列量化)
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int i = 0; i < N_STEP && n_begin + i < n_block_size; i++) {
          float amax = 0;
          // TODO: 后续用 SVE 来加速
          for (int j = 0; j < k; j++) {
            // 先把 src 转换成 float
            float f = bf16_to_fp32(src[(n_block_begin + n_begin + i) * k + j]);
            f = f < 0 ? -f : f;
            if (f > amax) {
              amax = f;
            }
          }
          d[n_block_begin + n_begin + i] = amax / 112.0;
          // TODO: 后续用 SVE 来加速
          for (int k_start = 0; k_start < k; k_start += (PACK_SIZE_K * 2)) {
            for (int j = 0; j < PACK_SIZE_K; j++) {
              size_t split_n = (n_begin + i) / PACK_SIZE_N;
              size_t n_idx = (n_begin + i) % PACK_SIZE_N;
              size_t split_k = k_start / (PACK_SIZE_K * 2);
              size_t k_idx = j;

              size_t buff_idx = n_block_begin * k / 2 + split_n * PACK_SIZE_N * k / 2 +
                                split_k * PACK_SIZE_N * PACK_SIZE_K + n_idx * PACK_SIZE_K + k_idx;

              float f0 = bf16_to_fp32(src[(n_block_begin + n_begin + i) * k + k_start + j]);
              float f1 = bf16_to_fp32(src[(n_block_begin + n_begin + i) * k + k_start + j + PACK_SIZE_K]);
              // static_cast<int8_t>(std::round(f / d[n_block_begin + n_begin + i]));
              int8_t b0 = static_cast<int8_t>(std::round((f0 / (d[n_block_begin + n_begin + i] * 16.0))) * 16);
              int8_t b1 = static_cast<int8_t>(std::round((f1 / (d[n_block_begin + n_begin + i] * 16.0))) * 16);
              int8_t b01 = (b0 & 0xF0) | ((b1 >> 4) & 0x0F);
              // int8_t b01 = ((b0 << 4) & 0xF0) | ((b1)&0x0F);

              b[buff_idx] = b01;
            }
          }
        }
      }
    }

    void from_mat(float *src, int ith, int nth, int n_new = -1, bool if_pack = false) {  // CHECK: nth has no usage
      if (!if_pack && !this->if_pack) throw std::runtime_error("from mat for buffer should be packed");
      if (n_new > 0) {
        n = n_new;  // 如果 n_new 大于 0，则使用 n_new
      }
      // 这里将 src 转换成 int8_t 的形式，按照k 维度量化  (也就是按列量化)
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      // DEBUG: 查看 average 值
      float average = 0;
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int i = 0; i < N_STEP && n_begin + i < n_block_size; i++) {
          float amax = 0;
          // TODO: 后续用 SVE 来加速
          for (int j = 0; j < k; j++) {
            // 先把 src 转换成 float
            float f = src[(n_block_begin + n_begin + i) * k + j];
            f = f < 0 ? -f : f;
            average += f;
            if (f > amax) {
              amax = f;
            }
          }
          average /= k;
          d[n_block_begin + n_begin + i] = amax / 112.0;
          // printf("amax: %f,average: %f\n", amax, average);
          // TODO: 后续用 SVE 来加速
          // 通过这个 amax 来量化这一列
          for (int k_start = 0; k_start < k; k_start += (PACK_SIZE_K * 2)) {
            for (int j = 0; j < PACK_SIZE_K; j++) {
              size_t split_n = (n_begin + i) / PACK_SIZE_N;
              size_t n_idx = (n_begin + i) % PACK_SIZE_N;
              size_t split_k = k_start / (PACK_SIZE_K * 2);
              size_t k_idx = j;

              size_t buff_idx = n_block_begin * k / 2 + split_n * PACK_SIZE_N * k / 2 +
                                split_k * PACK_SIZE_N * PACK_SIZE_K + n_idx * PACK_SIZE_K + k_idx;

              float f0 = (src[(n_block_begin + n_begin + i) * k + k_start + j]);
              float f1 = (src[(n_block_begin + n_begin + i) * k + k_start + j + PACK_SIZE_K]);
              // static_cast<int8_t>(std::round(f / d[n_block_begin + n_begin + i]));
              int8_t b0 = static_cast<int8_t>(std::round((f0 / (d[n_block_begin + n_begin + i] * 16.0))) * 16);
              int8_t b1 = static_cast<int8_t>(std::round((f1 / (d[n_block_begin + n_begin + i] * 16.0))) * 16);
              int8_t b01 = (b0 & 0xF0) | ((b1 >> 4) & 0x0F);
              // int8_t b01 = ((b0 << 4) & 0xF0) | ((b1)&0x0F);
              // if (n_begin == 0 && i == 0 && k_start == 0 && j <= 10) {
              //   printf("b0: %d, b1: %d, b01: %d,f0: %f, f1: %f, scale: %f\n", b0, b1, b01, f0, f1,
              //          d[n_block_begin + n_begin + i]);
              // }

              b[buff_idx] = b01;
            }
          }
        }
      }
      // printf("from_mat done, n: %d, k: %d, if_pack: %d\n", n, k, if_pack);
    }

    float *get_scale(int n, int n_begin) { return d + n_begin; }
  };
  /* 将 buffer A 转为 buffer B, [m,k](row major) -> [k,n](column major) (n = m)
    而量化部分没变化，直接 buffer A 的 d = buffer B 的 d，校验 m 和 n 以及 k是否相等，才能转换
  */
  static void convert_buffer_a_to_buffer_b(BufferA *ba, BufferB *bb) {
    if (bb->n != ba->max_m || bb->k != ba->k || bb->if_pack != ba->if_pack) {
      throw std::runtime_error(
          "BufferA and BufferB dimensions do not match for conversion, or they are not the same pack.");
    }
    throw std::runtime_error("int4 not support convert");
    // bb->b = ba->a;
    // bb->d = ba->d;
  }

  static void convert_buffer_b_to_buffer_a(BufferB *bb, BufferA *ba) {
    if (ba->max_m != bb->n || ba->k != bb->k || ba->if_pack != bb->if_pack) {
      throw std::runtime_error(
          "BufferB and BufferA dimensions do not match for conversion, or they are not the same pack.");
    }
    throw std::runtime_error("int4 not support convert");

    // ba->a = bb->b;
    // ba->d = bb->d;
  }
  // 改变当前 C 的 view
  static void change_view(BufferC *c_src, BufferC *c_dst) {
    if (c_src->max_m != c_dst->n || c_src->n != c_dst->max_m || c_src->if_row_major == c_dst->if_row_major) {
      throw std::runtime_error("C buffer size mismatch or they are the same major");
    }
    throw std::runtime_error("int4 not support convert");

    // c_dst->c = c_src->c;
  }
  // 此函数作用是，对 int32结果的 c 矩阵应用 A和 B 矩阵的scale（反量化）
  // 这里的 c 矩阵是 m * n 的矩阵，存储在 c 中, 是行主序的 (row major)
  // A 矩阵是 m * k 的矩阵，按照行量化，其 scale 是 d 是 m 维度的，对应每一行的量化系数
  // B 矩阵是 k * n 的矩阵，按照列量化，其 scale 是 d 是 n 维度的，对应每一列的量化系数
  // C 的第 i 行第 j 列的缩放值就是 A 的第 i 行的缩放值 * B 的第 j 列的缩放值
  static void apply_scale(int m, int n, float *c, BufferA *ba, BufferB *bb, BufferC *bc) {
    // TODO: 后续用 SVE 来加速
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
        float *scale_a = ba->get_scale(m, m_begin + i);
        for (int n_begin = 0; n_begin < n; n_begin += N_STEP) {
          for (int j = 0; j < N_STEP && n_begin + j < n; j++) {
            float *scale_b = bb->get_scale(n, n_begin + j);
            c[(m_begin + i) * n + (n_begin + j)] = (*scale_a) * (*scale_b) * bc->c[(m_begin + i) * n + (n_begin + j)];
          }
        }
      }
    }
  }
  // 对第二个维度分块的 apply scale
  static void apply_scale(int m, int n, float *c, BufferA *ba, BufferB *bb, BufferC *bc, int ith, int nth, int block) {
    // printf("use split apply scale\n");
    auto [n_start, n_end] = split_range_n_block(n, ith, nth, block);
    // TODO: 后续用 SVE 来加速
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
        float *scale_a = ba->get_scale(m, m_begin + i);
        for (int n_begin = n_start; n_begin < n_end; n_begin += N_STEP) {
          for (int j = 0; j < N_STEP && n_begin + j < n_end; j++) {
            float *scale_b = bb->get_scale(n, n_begin + j);
            c[(m_begin + i) * n + (n_begin + j)] = (*scale_a) * (*scale_b) * bc->c[(m_begin + i) * n + (n_begin + j)];
          }
        }
      }
    }
  }
  // 两个维度均有分块的 apply scale
  // C 矩阵区分是 row major 还是 column major
  static void apply_scale(float *c, int ldc, BufferA *ba, BufferB *bb, BufferC *bc, int m_start, int m_end, int n_start,
                          int n_end, bool if_row_major = true, long long c_row_idx_offset = 0,
                          long long c_col_idx_offset = 0) {
    if (if_row_major) {
      for (int m_begin = m_start; m_begin < m_end; m_begin += M_STEP) {
        for (int i = 0; i < M_STEP && m_begin + i < m_end; i++) {
          float *scale_a = ba->get_scale(m_end, m_begin + i);
          for (int n_begin = n_start; n_begin < n_end; n_begin += N_STEP) {
            for (int j = 0; j < N_STEP && n_begin + j < n_end; j++) {
              float *scale_b = bb->get_scale(n_end, n_begin + j);
              c[(m_begin + i + c_row_idx_offset) * ldc + (n_begin + j + c_col_idx_offset)] =
                  (*scale_a) * (*scale_b) *
                  bc->c[(m_begin + i + c_row_idx_offset) * ldc + (n_begin + j + c_col_idx_offset)];
            }
          }
        }
      }
    } else {
      for (int n_begin = n_start; n_begin < n_end; n_begin += N_STEP) {
        for (int j = 0; j < N_STEP && n_begin + j < n_end; j++) {
          float *scale_b = bb->get_scale(n_end, n_begin + j);
          for (int m_begin = m_start; m_begin < m_end; m_begin += M_STEP) {
            for (int i = 0; i < M_STEP && m_begin + i < m_end; i++) {
              float *scale_a = ba->get_scale(m_end, m_begin + i);
              c[(n_begin + j + c_col_idx_offset) * ldc + (m_begin + i + c_row_idx_offset)] =
                  (*scale_a) * (*scale_b) *
                  bc->c[(n_begin + j + c_col_idx_offset) * ldc + (m_begin + i + c_row_idx_offset)];
            }
          }
        }
      }
    }
  }

  // 两个维度均有分块的 apply scale
  // C 矩阵区分是 row major 还是 column major
  static void apply_scale(float *c, int ldc, BufferA *ba, BufferB *bb, int32_t *bc, int m_start, int m_end, int n_start,
                          int n_end, bool if_row_major = true, long long c_row_idx_offset = 0,
                          long long c_col_idx_offset = 0) {
    if (if_row_major) {
      for (int m_begin = m_start; m_begin < m_end; m_begin += M_STEP) {
        for (int i = 0; i < M_STEP && m_begin + i < m_end; i++) {
          float *scale_a = ba->get_scale(m_end, m_begin + i);
          for (int n_begin = n_start; n_begin < n_end; n_begin += N_STEP) {
            for (int j = 0; j < N_STEP && n_begin + j < n_end; j++) {
              float *scale_b = bb->get_scale(n_end, n_begin + j);
              c[(m_begin + i + c_row_idx_offset) * ldc + (n_begin + j + c_col_idx_offset)] =
                  (*scale_a) * (*scale_b) *
                  bc[(m_begin + i + c_row_idx_offset) * ldc + (n_begin + j + c_col_idx_offset)];
            }
          }
        }
      }
    } else {
      for (int n_begin = n_start; n_begin < n_end; n_begin += N_STEP) {
        for (int j = 0; j < N_STEP && n_begin + j < n_end; j++) {
          float *scale_b = bb->get_scale(n_end, n_begin + j);
          for (int m_begin = m_start; m_begin < m_end; m_begin += M_STEP) {
            for (int i = 0; i < M_STEP && m_begin + i < m_end; i++) {
              float *scale_a = ba->get_scale(m_end, m_begin + i);
              c[(n_begin + j + c_col_idx_offset) * ldc + (m_begin + i + c_row_idx_offset)] =
                  (*scale_a) * (*scale_b) *
                  bc[(n_begin + j + c_col_idx_offset) * ldc + (m_begin + i + c_row_idx_offset)];
            }
          }
        }
      }
    }
  }
};

}  // namespace moe_kernel

#endif

================================================
FILE: kt-kernel/operators/moe_kernel/la/mat_kernel.cpp
================================================
#include "../api/mat_kernel.h"

#include <cassert>

namespace {
constexpr int kInt4ElementDivisor = 2;
constexpr int kInt8ElementDivisor = 1;
}  // namespace
extern "C" {
void decode_cblas_gemm_s8s8s32(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transa,
                               const KERNEL_CBLAS_TRANSPOSE transb, const KERNEL_CBLAS_OFFSET offsetc, const size_t m,
                               const size_t n, const size_t k, const float alpha, const void* a, const size_t lda,
                               const int8_t oa, const void* b, const size_t ldb, const int8_t ob, const float beta,
                               int32_t* c, const size_t ldc, const int32_t* oc);

void prefill_cblas_gemm_s8s8s32(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transa,
                                const KERNEL_CBLAS_TRANSPOSE transb, const KERNEL_CBLAS_OFFSET offsetc, const size_t m,
                                const size_t n, const size_t k, const float alpha, const void* a, const size_t lda,
                                const int8_t oa, const void* b, const size_t ldb, const int8_t ob, const float beta,
                                int32_t* c, const size_t ldc, const int32_t* oc);

void decode_int4_cblas_gemm_s8s8s32(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transa,
                                    const KERNEL_CBLAS_TRANSPOSE transb, const KERNEL_CBLAS_OFFSET offsetc,
                                    const size_t m, const size_t n, const size_t k, const float alpha, const void* a,
                                    const size_t lda, const int8_t oa, const void* b, const size_t ldb, const int8_t ob,
                                    const float beta, int32_t* c, const size_t ldc, const int32_t* oc);

void prefill_int4_cblas_gemm_s8s8s32(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transa,
                                     const KERNEL_CBLAS_TRANSPOSE transb, const KERNEL_CBLAS_OFFSET offsetc,
                                     const size_t m, const size_t n, const size_t k, const float alpha, const void* a,
                                     const size_t lda, const int8_t oa, const void* b, const size_t ldb,
                                     const int8_t ob, const float beta, int32_t* c, const size_t ldc,
                                     const int32_t* oc);
}

MatKernelSelection select_kernel_for_int4(MatKernelVariant variant) {
  switch (variant) {
    case MatKernelVariant::Decode:
      return {decode_int4_cblas_gemm_s8s8s32, kInt4ElementDivisor};
    case MatKernelVariant::Prefill:
      return {prefill_int4_cblas_gemm_s8s8s32, kInt4ElementDivisor};
  }
  return {nullptr, 0};
}

MatKernelSelection select_kernel_for_int8(MatKernelVariant variant) {
  switch (variant) {
    case MatKernelVariant::Decode:
      return {decode_cblas_gemm_s8s8s32, kInt8ElementDivisor};
    case MatKernelVariant::Prefill:
      return {prefill_cblas_gemm_s8s8s32, kInt8ElementDivisor};
  }
  return {nullptr, 0};
}

================================================
FILE: kt-kernel/operators/moe_kernel/la/utils.hpp
================================================
#pragma once
// #include <arm_sve.h>
#include <cstdint>
#include <cstring>

// 简单截断模式：直接丢弃低 16 位
static inline uint16_t float_to_bf16_trunc(float f) {
  uint32_t u;
  // 按位拷贝，避免 strict‑aliasing UB
  memcpy(&u, &f, sizeof(u));   // :contentReference[oaicite:3]{index=3}
  return (uint16_t)(u >> 16);  // 截断得到高 16 位 :contentReference[oaicite:4]{index=4}
}

static inline void convert_32fp32_to_32bf16_pure_c(const float* src, uint16_t* dst) {
  // src 已偏移至 token_nth * hidden_size
  for (int e = 0; e < 32; e++) {  // 共 32 个元素
    // 选择截断或四舍五入
    dst[e] = float_to_bf16_trunc(src[e]);
  }
}

// 把 32 个 bf16 元素转换成 32 个 fp32 元素

static inline void convert_32bf16_to_32fp32_pure_c(const uint16_t* src, float* dst) {
  for (int e = 0; e < 32; e++) {
    uint32_t temp = ((uint32_t)src[e]) << 16;  // 将 BF16 左移 16 位
    memcpy(&dst[e], &temp, sizeof(float));     // 将结果复制到 FP32 变量中
  }
}

================================================
FILE: kt-kernel/operators/moe_kernel/mat_kernel/aocl_kernel/kernel.cpp
================================================
#include <stdexcept>

#include "../batch_gemm_api.hpp"
#include "blis.h"

namespace {

char ToAoclOrder(KERNEL_CBLAS_LAYOUT layout) {
  switch (layout) {
    case KernelCblasRowMajor:
      return 'r';
    case KernelCblasColMajor:
      return 'c';
  }
  throw std::invalid_argument("Unsupported KERNEL_CBLAS_LAYOUT value");
}

char ToAoclTranspose(KERNEL_CBLAS_TRANSPOSE transpose) {
  switch (transpose) {
    case KernelCblasNoTrans:
      return 'n';
    case KernelCblasTrans:
      return 't';
    case KernelCblasConjTrans:
    case KernelCblasConjNoTrans:
      break;
  }
  throw std::invalid_argument("Unsupported KERNEL_CBLAS_TRANSPOSE value");
}

}  // namespace

// 映射表，layout 从KERNEL_CBLAS_ORDER 映射到'r'或者'c',以及将KERNEL_CBLAS_TRANSPOSE映射到'n'或者't'
#ifdef __cplusplus
extern "C" {
#endif
void decode_cblas_gemm_s8s8s32(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transa,
                               const KERNEL_CBLAS_TRANSPOSE transb, const KERNEL_CBLAS_OFFSET offsetc, const size_t m,
                               const size_t n, const size_t k, const float alpha, const void* a, const size_t lda,
                               const BLASINT8 oa, const void* b, const size_t ldb, const BLASINT8 ob, const float beta,
                               int32_t* c, const size_t ldc, const int32_t* oc) {
  const char order = ToAoclOrder(layout);
  const char op_a = ToAoclTranspose(transa);
  const char op_b = ToAoclTranspose(transb);
  (void)offsetc;
  aocl_gemm_s8s8s32os32(order, op_a, op_b, static_cast<dim_t>(m), static_cast<dim_t>(n), static_cast<dim_t>(k),
                        static_cast<int32_t>(alpha), static_cast<const int8_t*>(a), static_cast<dim_t>(lda), 'n',
                        static_cast<const int8_t*>(b), static_cast<dim_t>(ldb), 'r', static_cast<int32_t>(beta), c,
                        static_cast<dim_t>(ldc), nullptr);
}

void prefill_cblas_gemm_s8s8s32(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transa,
                                const KERNEL_CBLAS_TRANSPOSE transb, const KERNEL_CBLAS_OFFSET offsetc, const size_t m,
                                const size_t n, const size_t k, const float alpha, const void* a, const size_t lda,
                                const BLASINT8 oa, const void* b, const size_t ldb, const BLASINT8 ob, const float beta,
                                int32_t* c, const size_t ldc, const int32_t* oc) {
  const char order = ToAoclOrder(layout);
  const char op_a = ToAoclTranspose(transa);
  const char op_b = ToAoclTranspose(transb);
  (void)offsetc;
  aocl_gemm_s8s8s32os32(order, op_a, op_b, static_cast<dim_t>(m), static_cast<dim_t>(n), static_cast<dim_t>(k),
                        static_cast<int32_t>(alpha), static_cast<const int8_t*>(a), static_cast<dim_t>(lda), 'n',
                        static_cast<const int8_t*>(b), static_cast<dim_t>(ldb), 'r', static_cast<int32_t>(beta), c,
                        static_cast<dim_t>(ldc), nullptr);
}

void prefill_int4_cblas_gemm_s8s8s32(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transa,
                                     const KERNEL_CBLAS_TRANSPOSE transb, const KERNEL_CBLAS_OFFSET offsetc,
                                     const size_t m, const size_t n, const size_t k, const float alpha, const void* a,
                                     const size_t lda, const BLASINT8 oa, const void* b, const size_t ldb,
                                     const BLASINT8 ob, const float beta, int32_t* c, const size_t ldc,
                                     const int32_t* oc) {
  throw std::runtime_error("int4 not support prefill");
}

void decode_int4_cblas_gemm_s8s8s32(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transa,
                                    const KERNEL_CBLAS_TRANSPOSE transb, const KERNEL_CBLAS_OFFSET offsetc,
                                    const size_t m, const size_t n, const size_t k, const float alpha, const void* a,
                                    const size_t lda, const BLASINT8 oa, const void* b, const size_t ldb,
                                    const BLASINT8 ob, const float beta, int32_t* c, const size_t ldc,
                                    const int32_t* oc) {
  throw std::runtime_error("int4 not support decode");
}

void reorder_B_gemm(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transb, const size_t k,
                    const size_t n, const size_t ldb, const void* b, void* b_reordered) {
  const char order = ToAoclOrder(layout);
  const char op_b = ToAoclTranspose(transb);
  aocl_reorder_s8s8s32os32(order, op_b, 'B', static_cast<const int8_t*>(b), static_cast<int8_t*>(b_reordered), k, n,
                           ldb);
}

size_t get_reorder_B_size(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transb, const size_t k,
                          const size_t n) {
  return aocl_get_reorder_buf_size_s8s8s32os32(ToAoclOrder(layout), ToAoclTranspose(transb), 'B', k, n);
}

#ifdef __cplusplus
}
#endif

================================================
FILE: kt-kernel/operators/moe_kernel/mat_kernel/batch_gemm_api.hpp
================================================
#pragma once
#include <cstddef>
#ifndef _BATCH_GEMM_KERNEL_API_
#define _BATCH_GEMM_KERNEL_API_
#include "../api/common.h"
#ifdef __cplusplus
extern "C" {
#endif
void decode_cblas_gemm_s8s8s32(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transa,
                               const KERNEL_CBLAS_TRANSPOSE transb, const KERNEL_CBLAS_OFFSET offsetc, const size_t m,
                               const size_t n, const size_t k, const float alpha, const void* a, const size_t lda,
                               const BLASINT8 oa, const void* b, const size_t ldb, const BLASINT8 ob, const float beta,
                               int32_t* c, const size_t ldc, const int32_t* oc);

void prefill_cblas_gemm_s8s8s32(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transa,
                                const KERNEL_CBLAS_TRANSPOSE transb, const KERNEL_CBLAS_OFFSET offsetc, const size_t m,
                                const size_t n, const size_t k, const float alpha, const void* a, const size_t lda,
                                const BLASINT8 oa, const void* b, const size_t ldb, const BLASINT8 ob, const float beta,
                                int32_t* c, const size_t ldc, const int32_t* oc);

void decode_int4_cblas_gemm_s8s8s32(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transa,
                                    const KERNEL_CBLAS_TRANSPOSE transb, const KERNEL_CBLAS_OFFSET offsetc,
                                    const size_t m, const size_t n, const size_t k, const float alpha, const void* a,
                                    const size_t lda, const BLASINT8 oa, const void* b, const size_t ldb,
                                    const BLASINT8 ob, const float beta, int32_t* c, const size_t ldc,
                                    const int32_t* oc);

void prefill_int4_cblas_gemm_s8s8s32(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transa,
                                     const KERNEL_CBLAS_TRANSPOSE transb, const KERNEL_CBLAS_OFFSET offsetc,
                                     const size_t m, const size_t n, const size_t k, const float alpha, const void* a,
                                     const size_t lda, const BLASINT8 oa, const void* b, const size_t ldb,
                                     const BLASINT8 ob, const float beta, int32_t* c, const size_t ldc,
                                     const int32_t* oc);
void reorder_B_gemm(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transb, const size_t k,
                    const size_t n, const size_t ldb, const void* b, void* b_reordered);
size_t get_reorder_B_size(const KERNEL_CBLAS_LAYOUT layout, const KERNEL_CBLAS_TRANSPOSE transb, const size_t k,
                          const size_t n);

#ifdef __cplusplus
}
#endif
#endif /*** _BATCH_GEMM_KERNEL_API_ ***/

================================================
FILE: kt-kernel/operators/moe_kernel/moe.hpp
================================================
#ifndef MOE_KERNEL_HPP
#define MOE_KERNEL_HPP

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <vector>

#include "../../cpu_backend/shared_mem_buffer.h"
#include "../common.hpp"
#include "../moe-tp.hpp"
#include "api/common.h"
#include "api/mat_kernel.h"
#include "llama.cpp/ggml.h"
template <class T, bool PLAIN = true>
class MOE_KERNEL_TP
#ifdef FORWARD_TIME_PROFILE
    : protected TimePerf
#endif
{
 private:
  int tp_part_idx;
  std::filesystem::path prefix;

  void* gate_proj_;  // [expert_num * intermediate_size * hidden_size ( /32 if
                     // quantized)]
  void* up_proj_;    // [expert_num * intermediate_size * hidden_size ( /32 if
                     // quantized)]
  void* down_proj_;  // [expert_num * hidden_size * intermediate_size ( /32 if
                     // quantized)]

  ggml_bf16_t* m_local_input_;  // [routed_expert_num * max_len * hidden_size]
  float* m_local_gate_output_;  // [routed_expert_num * max_len * intermediate_size]
  float* m_local_up_output_;    // [routed_expert_num * max_len * intermediate_size]
  float* m_local_down_output_;  // [routed_expert_num * max_len * hidden_size]

  std::vector<std::vector<int>> m_local_pos_;    // [max_len, routed_expert_num]
  std::vector<int> m_local_num_;                 // [expert_num]
  std::vector<int> m_expert_id_map_;             // [expert_num]
  std::vector<ggml_bf16_t*> m_local_input_ptr_;  // [expert_num]
  std::vector<float*> m_local_gate_output_ptr_;  // [expert_num]
  std::vector<float*> m_local_up_output_ptr_;    // [expert_num]
  std::vector<float*> m_local_down_output_ptr_;  // [expert_num]

  std::vector<std::shared_ptr<typename T::BufferA>> gate_up_ba_;
  std::vector<std::shared_ptr<typename T::BufferB>> gate_bb_;
  std::vector<std::shared_ptr<typename T::BufferC>> gate_bc_;
  std::vector<std::shared_ptr<typename T::BufferB>> up_bb_;
  std::vector<std::shared_ptr<typename T::BufferC>> up_bc_;
  std::vector<std::shared_ptr<typename T::BufferA>> down_ba_;
  std::vector<std::shared_ptr<typename T::BufferB>> down_bb_;
  std::vector<std::shared_ptr<typename T::BufferC>> down_bc_;

  std::vector<void*> gate_up_owner_ptr_;
  std::vector<void*> down_owner_ptr_;

  inline void write_weights(std::filesystem::path prefix, std::string mat_class, char* bb, int expert_idx, size_t size,
                            size_t scale_size) {
    // printf("expert %d, size %ld, scale size %ld\n", expert_idx, size, scale_size);
    // std::ofstream of(prefix / (T::name() + mat_class + std::to_string(expert_idx)  + "_quant_" + ".kt"));
    std::ofstream of(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" +
                               std::to_string(size - scale_size) + "Byte" + "_quant_" + ".kt"));
    if (of.is_open() == false) {
      printf("no such file: %s", (prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" +
                                            std::to_string(size - scale_size) + "Byte" + "_quant_" + ".kt"))
                                     .c_str());
      // throw std::runtime_error("No such file");
    }
    of.write((char*)bb, size - scale_size);
    of.close();
    // of.open(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_scale_" + ".kt"));
    of.open(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" + std::to_string(scale_size) + "Byte" +
                      "_scale_" + ".kt"));
    if (of.is_open() == false) {
      printf("no such file\n");
      // throw std::runtime_error("No such file");
    }
    of.write(((char*)bb) + size - scale_size, scale_size);
  }

  inline void read_weights(std::filesystem::path prefix, std::string mat_class, char* bb, int expert_idx, size_t size,
                           size_t scale_size, uint8_t mat_split, uint8_t mat_split_idex) {
    // std::ifstream f(prefix / (T::name() + mat_class + std::to_string(expert_idx)  + "_quant_" + ".kt"));
    std::ifstream f(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" +
                              std::to_string(size - scale_size) + "Byte" + "_quant_" + ".kt"));
    if (f.is_open() == false) {
      printf("no such file: %s\n", (prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" +
                                              std::to_string(size - scale_size) + "Byte" + "_quant_" + ".kt"))
                                       .c_str());
      // throw std::runtime_error("No such file");
    }
    f.seekg(mat_split_idex * (size - scale_size) / mat_split);
    f.read(((char*)bb) + mat_split_idex * (size - scale_size) / mat_split, (size - scale_size) / mat_split);
    f.close();
    // f.open(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_scale_" + ".kt"));
    f.open(prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" + std::to_string(scale_size) + "Byte" +
                     "_scale_" + ".kt"));
    if (f.is_open() == false) {
      printf("no such file: %s\n", (prefix / (T::name() + mat_class + std::to_string(expert_idx) + "_" +
                                              std::to_string(scale_size) + "Byte" + "_scale_" + ".kt"))
                                       .c_str());
      // throw std::runtime_error("No such file");
    }
    f.seekg(mat_split_idex * scale_size / mat_split);
    f.read((((char*)bb) + size - scale_size) + mat_split_idex * scale_size / mat_split, scale_size / mat_split);
  }

 public:
  using input_t = ggml_bf16_t;
  using output_t = float;

  GeneralMOEConfig config_;
  static constexpr double ELEMENT_SIZE = T::ELEMENT_SIZE;

  MOE_KERNEL_TP(GeneralMOEConfig config, int tp_part_idx) {
    printf("  Creating AMD_MOE_TP %d at numa %d\n", tp_part_idx, numa_node_of_cpu(sched_getcpu()));
    auto& load = config.load;
    auto& save = config.save;
    if (load && config.path == "") {
      load = false;
    }

    prefix = config.path;
    prefix = prefix / ("_layer_" + std::to_string(config.layer_idx)) / ("_numa_" + std::to_string(tp_part_idx));
    if (save) {
      std::cout << "Creating " << prefix << std::endl;
      std::filesystem::create_directories(prefix);
    }
    if (load) {
      if (std::filesystem::exists(prefix)) {
        std::cout << "Loading from " << prefix << std::endl;
      } else {
        throw std::runtime_error("Path not found: " + prefix.string());
      }
    }

    this->tp_part_idx = tp_part_idx;
    config_ = config;
    gate_proj_ = config_.gate_proj;
    up_proj_ = config_.up_proj;
    down_proj_ = config_.down_proj;

    MemoryRequest mem_requests;
    mem_requests.append_pointer(&m_local_input_,
                                sizeof(input_t) * config_.num_experts_per_tok * config_.max_len * config_.hidden_size);

    mem_requests.append_pointer(&m_local_gate_output_, sizeof(float) * config_.num_experts_per_tok * config_.max_len *
                                                           config_.intermediate_size);
    mem_requests.append_pointer(
        &m_local_up_output_, sizeof(float) * config_.num_experts_per_tok * config_.max_len * config_.intermediate_size);
    mem_requests.append_pointer(&m_local_down_output_,
                                sizeof(float) * config_.num_experts_per_tok * config_.max_len * config_.hidden_size);

    m_local_pos_.resize(config_.max_len);
    for (int i = 0; i < config_.max_len; i++) {
      m_local_pos_[i].resize(config_.num_experts_per_tok);
    }
    m_expert_id_map_.resize(config_.expert_num);
    m_local_num_.resize(config_.expert_num);
    m_local_input_ptr_.resize(config_.expert_num);
    m_local_gate_output_ptr_.resize(config_.expert_num);
    m_local_up_output_ptr_.resize(config_.expert_num);
    m_local_down_output_ptr_.resize(config_.expert_num);

    // printf("tp part %d alloc layer %d, %f GB, on numa %d\n", tp_part_idx, config_.layer_idx,
    //        1e-9 * config_.expert_num *
    //            (T::BufferB::required_size(config_.intermediate_size, config_.hidden_size) * 2 +
    //             T::BufferB::required_size(config_.hidden_size, config_.intermediate_size)),
    //        numa_node_of_cpu(sched_getcpu()));
    // 统一分配一块巨大的内存用于权重：
    size_t gate_up_exp_size =
        T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN) +
        T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN);

    for (uint64_t i = 0; i < config_.expert_num; i++) {
      gate_up_ba_.push_back(std::make_shared<typename T::BufferA>(config_.max_len, config_.hidden_size, nullptr));
      gate_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, nullptr));
      up_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, nullptr));
      down_ba_.push_back(std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, nullptr));
      down_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, nullptr));
      void* gate_up_down_per_exp_ptr = std::aligned_alloc(64, gate_up_exp_size);
      gate_up_owner_ptr_.push_back(gate_up_down_per_exp_ptr);

      gate_bb_.push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size,
                                                               gate_up_down_per_exp_ptr, PACKED, 'u', PLAIN));
      up_bb_.push_back(std::make_shared<typename T::BufferB>(
          config_.intermediate_size, config_.hidden_size,
          offset_pointer(gate_up_down_per_exp_ptr,
                         T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN)),
          PACKED, 'u', PLAIN));

      void* down_bb_ptr = std::aligned_alloc(
          64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN));
      down_owner_ptr_.push_back(down_bb_ptr);
      down_bb_.push_back(std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size,
                                                               down_bb_ptr, PACKED, 'd', PLAIN));
    }

    for (int i = 0; i < config_.expert_num; i++) {
      mem_requests.append_function([this, i](void* new_ptr) { gate_up_ba_[i]->set_data(new_ptr); },
                                   T::BufferA::required_size(config_.max_len, config_.hidden_size));
      mem_requests.append_function([this, i](void* new_ptr) { gate_bc_[i]->set_data(new_ptr); },
                                   T::BufferC::required_size(config_.max_len, config_.intermediate_size));
      mem_requests.append_function([this, i](void* new_ptr) { up_bc_[i]->set_data(new_ptr); },
                                   T::BufferC::required_size(config_.max_len, config_.intermediate_size));
      mem_requests.append_function([this, i](void* new_ptr) { down_ba_[i]->set_data(new_ptr); },
                                   T::BufferA::required_size(config_.max_len, config_.intermediate_size));
      mem_requests.append_function([this, i](void* new_ptr) { down_bc_[i]->set_data(new_ptr); },
                                   T::BufferC::required_size(config_.max_len, config_.hidden_size));
    }

    shared_mem_buffer_numa.alloc(tp_part_idx, this, mem_requests);
  }

  MOE_KERNEL_TP(const MOE_KERNEL_TP&) = delete;
  MOE_KERNEL_TP& operator=(const MOE_KERNEL_TP&) = delete;
  MOE_KERNEL_TP(MOE_KERNEL_TP&&) = delete;
  MOE_KERNEL_TP& operator=(MOE_KERNEL_TP&&) = delete;

  ~MOE_KERNEL_TP() {
    // printf("  Destroying KML_MOE_TP %lx\n", (intptr_t)(this));
    for (void* ptr : gate_up_owner_ptr_) {
      std::free(ptr);
    }
    for (void* ptr : down_owner_ptr_) {
      std::free(ptr);
    }
  }

  void load_weights() {
    auto pool = config_.pool->get_subpool(tp_part_idx);
    const uint64_t* physical_to_logical_map = (const uint64_t*)config_.physical_to_logical_map;
    if (config_.gate_projs.size()) {
      printf("load from safetensor");
      pool->do_work_stealing_job(
          config_.expert_num, nullptr,
          [this, physical_to_logical_map](int expert_id) {
            uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_id);
            {
              size_t scale_size = config_.intermediate_size * sizeof(float);
              size_t whole_size_ =
                  T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN);
              size_t size = whole_size_ - scale_size;
              void* dst_ = PLAIN ? gate_bb_[expert_id]->b : gate_bb_[expert_id]->b_pack[0];

              memcpy(dst_, config_.gate_projs[tp_part_idx][logical_expert_id], size);

              if constexpr (T::BufferB::SCALE) {
                memcpy(gate_bb_[expert_id]->d, config_.gate_scales[tp_part_idx][logical_expert_id], scale_size);
              }

              whole_size_ =
                  T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN);
              size = whole_size_ - scale_size;
              dst_ = PLAIN ? up_bb_[expert_id]->b : up_bb_[expert_id]->b_pack[0];
              memcpy(dst_, config_.up_projs[tp_part_idx][logical_expert_id], size);

              if constexpr (T::BufferB::SCALE) {
                memcpy(up_bb_[expert_id]->d, config_.up_scales[tp_part_idx][logical_expert_id], scale_size);
              }
            }

            {
              size_t scale_size = config_.hidden_size * sizeof(float);
              size_t whole_size_ =
                  T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN);
              size_t size = whole_size_ - scale_size;
              void* dst_ = PLAIN ? down_bb_[expert_id]->b : down_bb_[expert_id]->b_pack[0];
              memcpy(dst_, config_.down_projs[tp_part_idx][logical_expert_id], size);

              if constexpr (T::BufferB::SCALE) {
                memcpy(down_bb_[expert_id]->d, config_.down_scales[tp_part_idx][logical_expert_id], scale_size);
              }
            }
          },
          nullptr);

    } else {
      static uint8_t mat_type_all = 3, mat_split = 1;
      if (config_.load) {
        std::cout << "Loading from " << prefix << std::endl;
        for (int task_id = 0; task_id < config_.expert_num * mat_type_all * mat_split; task_id++) {
          int64_t expert_idx = task_id / (mat_type_all * mat_split);
          uint8_t mat_class = (task_id % (mat_type_all * mat_split)) / mat_split;
          uint8_t mat_split_idex = task_id % mat_split;
          uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
          void* src_;
          if (mat_class == 0) {  // the up matrix
            src_ = PLAIN ? up_bb_[expert_idx]->b : up_bb_[expert_idx]->b_pack[0];
            size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN);
            size_t scale_size = config_.intermediate_size * sizeof(float);
            read_weights(prefix, "_up_", (char*)src_, logical_expert_id, size, scale_size, mat_split, mat_split_idex);
          } else if (mat_class == 1) {
            void* src_ = PLAIN ? gate_bb_[expert_idx]->b : gate_bb_[expert_idx]->b_pack[0];
            size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN);
            size_t scale_size = config_.intermediate_size * sizeof(float);
            read_weights(prefix, "_gate_", (char*)src_, logical_expert_id, size, scale_size, mat_split, mat_split_idex);
          } else {
            void* src_ = PLAIN ? down_bb_[expert_idx]->b : down_bb_[expert_idx]->b_pack[0];
            size_t size = T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN);
            size_t scale_size = config_.hidden_size * sizeof(float);
            read_weights(prefix, "_down_", (char*)src_, logical_expert_id, size, scale_size, mat_split, mat_split_idex);
          }
        }
      }
// check process, store down matrix to check
#ifdef CHECK
      load_check();
#endif
#ifndef CHECK
      else
#endif
      {
        if (tp_part_idx == 0) {
          std::cout << "  online quant from bf16" << std::endl;
        }
        int nth = T::recommended_nth_up_gate(config_.intermediate_size);
        pool->do_work_stealing_job(
            nth * config_.expert_num, nullptr,
            [this, nth, physical_to_logical_map](int task_id) {
              int64_t expert_idx = task_id / nth;
              uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
              int ith = task_id % nth;
              // gate part
              gate_bb_[logical_expert_id]->from_mat(
                  (ggml_bf16_t*)config_.gate_proj + logical_expert_id * config_.intermediate_size * config_.hidden_size,
                  ith, nth, -1, PACKED, PLAIN);
              // up part
              up_bb_[logical_expert_id]->from_mat(
                  (ggml_bf16_t*)config_.up_proj + logical_expert_id * config_.intermediate_size * config_.hidden_size,
                  ith, nth, -1, PACKED, PLAIN);
            },
            nullptr);

        nth = T::recommended_nth_down(config_.hidden_size);
        pool->do_work_stealing_job(
            nth * config_.expert_num, nullptr,
            [this, nth, physical_to_logical_map](int task_id) {
              int64_t expert_idx = task_id / nth;
              int ith = task_id % nth;
              uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
              // down part
              down_bb_[logical_expert_id]->from_mat(
                  (ggml_bf16_t*)config_.down_proj + logical_expert_id * config_.hidden_size * config_.intermediate_size,
                  ith, nth, -1, PACKED, PLAIN);
            },
            nullptr);
      }
#ifdef CHECK
      verify_load_right();
#endif
      // save process
      if (config_.save) {
        pool->do_work_stealing_job(
            config_.expert_num * mat_type_all, nullptr,
            [this, physical_to_logical_map](int task_id) {
              int64_t expert_idx = task_id / mat_type_all;
              expert_idx = expert_map(physical_to_logical_map, expert_idx);
              uint8_t mat_class = task_id % mat_type_all;
              if (mat_class == 0) {  // the up matrix
                size_t size =
                    T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN);
                size_t scale_size = config_.intermediate_size * sizeof(float);
                write_weights(prefix, "_up_", (char*)up_bb_[expert_idx]->b_pack[0], expert_idx, size, scale_size);
              } else if (mat_class == 1) {
                size_t size =
                    T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, PACKED, 'u', PLAIN);
                size_t scale_size = config_.intermediate_size * sizeof(float);
                write_weights(prefix, "_gate_", (char*)gate_bb_[expert_idx]->b_pack[0], expert_idx, size, scale_size);
              } else if (mat_class == 2) {
                size_t size =
                    T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, PACKED, 'd', PLAIN);
                size_t scale_size = config_.hidden_size * sizeof(float);
                write_weights(prefix, "_down_", (char*)down_bb_[expert_idx]->b_pack[0], expert_idx, size, scale_size);
              }
            },
            nullptr);
      }
    }
  }

  void warm_up() {
    int qlen = config_.max_len;
    std::vector<uint8_t> input(sizeof(input_t) * qlen * config_.hidden_size);
    std::vector<uint8_t> output(sizeof(output_t) * qlen * config_.hidden_size);
    std::vector<int64_t> expert_ids(qlen * config_.num_experts_per_tok);
    std::vector<float> weights(qlen * config_.num_experts_per_tok);
    for (int i = 0; i < qlen * config_.num_experts_per_tok; i++) {
      expert_ids[i] = i % config_.expert_num;
      weights[i] = 0.01;
    }
    forward(qlen, config_.num_experts_per_tok, expert_ids.data(), weights.data(), input.data(), output.data());
  }

#define MOE_DIRECT_OR_POOL_BY_VAR(var, fn)                       \
  do {                                                           \
    if (var < 5) {                                               \
      for (int i = 0; i < (var); i++) {                          \
        (fn)(i);                                                 \
      }                                                          \
    } else {                                                     \
      pool->do_work_stealing_job((var), nullptr, (fn), nullptr); \
    }                                                            \
  } while (0)
  static float act_fn(float x) { return x / (1.0f + expf(-x)); }

  void forward(int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input, void* output) {
    // Unified forward path: 'd' for decode (qlen<=1), 'p' for prefill (qlen>1)
    char mode = (qlen <= 1) ? 'd' : 'p';
    forward_unified(mode, qlen, k, expert_ids, weights, input, output);
  }

  // Helper to select B pointer for up or gate mat based on packing
  inline int8_t* select_up_or_gate_B_ptr_(bool do_up, int expert_idx, int ith, int devide_elements_size) {
    if constexpr (PLAIN) {
      int8_t* base = do_up ? (int8_t*)up_bb_[expert_idx]->b : (int8_t*)gate_bb_[expert_idx]->b;
      return base + ith * config_.hidden_size * T::N_BLOCK_UP_GATE / devide_elements_size;
    } else {
      return do_up ? (int8_t*)up_bb_[expert_idx]->b_pack[ith] : (int8_t*)gate_bb_[expert_idx]->b_pack[ith];
    }
  }

  // Helper to select B pointer for down mat based on packing
  inline int8_t* select_down_B_ptr_(int expert_idx, int ith, int devide_elements_size) {
    if constexpr (PLAIN) {
      return ((int8_t*)down_bb_[expert_idx]->b) +
             ith * config_.intermediate_size * T::N_BLOCK_DOWN / devide_elements_size;
    } else {
      return (int8_t*)down_bb_[expert_idx]->b_pack[ith];
    }
  }

  // Unified implementation for decode/prefill using mode 'd' or 'p'
  void forward_unified(char mode, int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input,
                       void* output) {
    MatKernelVariant var = (mode == 'p') ? MatKernelVariant::Prefill : MatKernelVariant::Decode;
    MatKernelSelection kernel = select_mat_kernel<T>(var);
    GemmFn cblas_gemm_s8s8s32 = kernel.fn;
    int devide_elements_size = kernel.divide_elements_size;

#ifdef FORWARD_TIME_PROFILE
    forward_perf_start();
#endif
    int max_local_num = 0;

    auto pool = config_.pool->get_subpool(tp_part_idx);

    int activated_expert = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
      for (int j = 0; j < k; j++) {
        if (config_.should_skip_expert(expert_ids[i * k + j])) {
          continue;
        }
        m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
      }
    }

    for (int i = 0; i < config_.expert_num; i++) {
      if (m_local_num_[i] > 0) {
        max_local_num = std::max(max_local_num, m_local_num_[i]);
        m_expert_id_map_[activated_expert] = i;
        activated_expert++;
      }
    }

    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_input_ptr_[i] = m_local_input_ + offset * config_.hidden_size;
      m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
      m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
      m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
      offset += m_local_num_[i];
    }

#ifdef FORWARD_TIME_PROFILE
    PROFILE_RECORD_TIME_STAMP("prepare");
#endif

    // Copy inputs into expert-local buffers
    MOE_DIRECT_OR_POOL_BY_VAR(qlen, [&](int i) {
      for (int j = 0; j < k; j++) {
        if (config_.should_skip_expert(expert_ids[i * k + j])) {
          continue;
        }
        memcpy(m_local_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size,
               (input_t*)input + i * config_.hidden_size, sizeof(input_t) * config_.hidden_size);
      }
    });

#ifdef FORWARD_TIME_PROFILE
    PROFILE_RECORD_TIME_STAMP("copy_input");
#endif

    // Quantize expert inputs (row-wise)
    {
      size_t mth = T::recommended_mth(max_local_num);
      MOE_DIRECT_OR_POOL_BY_VAR(activated_expert * mth, [&](int task_id) {
        int task_id_expert = task_id / mth;
        int ith = task_id % mth;
        int expert_idx = m_expert_id_map_[task_id_expert];
        if (ith * T::M_BLOCK >= m_local_num_[expert_idx]) return;
        gate_up_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_input_ptr_[expert_idx], ith, mth);
      });
    }

#ifdef FORWARD_TIME_PROFILE
    PROFILE_RECORD_TIME_STAMP("quant_input");
#endif

    int nth_up = T::recommended_nth_up_gate(config_.intermediate_size, mode);
    int mth = T::recommended_mth(max_local_num);
    int32_t oc = 0;

    // Up and Gate GEMMs + dequant scale
    pool->do_work_stealing_job(
        mth * nth_up * activated_expert * 2, nullptr,
        [this, qlen, nth_up, oc, &cblas_gemm_s8s8s32, devide_elements_size, mth](int task_id2) {
          int task_id = task_id2 / 2;
          bool do_up = task_id2 % 2;
          int expert_idx = m_expert_id_map_[task_id / (nth_up * mth)];
          task_id = task_id % (nth_up * mth);
          int ith = task_id % nth_up;
          int jth = task_id / nth_up;
          if (jth * T::M_BLOCK >= m_local_num_[expert_idx]) return;
          int m_block = T::M_BLOCK;
          if ((jth + 1) * T::M_BLOCK > m_local_num_[expert_idx]) {
            m_block = m_local_num_[expert_idx] - jth * T::M_BLOCK;
          }
          int8_t* a_ptr = (int8_t*)gate_up_ba_[expert_idx]->a + jth * T::M_BLOCK * config_.hidden_size;
          int8_t* b_ptr = select_up_or_gate_B_ptr_(do_up, expert_idx, ith, devide_elements_size);
          int32_t* c_ptr = (do_up ? (int32_t*)up_bc_[expert_idx]->c : (int32_t*)gate_bc_[expert_idx]->c) +
                           ith * T::N_BLOCK_UP_GATE + jth * T::M_BLOCK * config_.intermediate_size;

          cblas_gemm_s8s8s32(KernelCblasRowMajor, KernelCblasNoTrans, KernelCblasTrans, KernelCblasFixOffset, m_block,
                             T::N_BLOCK_UP_GATE, config_.hidden_size, 1.0, a_ptr, config_.hidden_size, 0, b_ptr,
                             config_.hidden_size, 0, 0.0, c_ptr, config_.intermediate_size, &oc);

          if (do_up) {
            T::apply_scale(m_local_num_[expert_idx], config_.intermediate_size, m_local_up_output_ptr_[expert_idx],
                           gate_up_ba_[expert_idx].get(), up_bb_[expert_idx].get(), up_bc_[expert_idx].get(), ith,
                           nth_up, T::N_BLOCK_UP_GATE, jth);
          } else {
            T::apply_scale(m_local_num_[expert_idx], config_.intermediate_size, m_local_gate_output_ptr_[expert_idx],
                           gate_up_ba_[expert_idx].get(), gate_bb_[expert_idx].get(), gate_bc_[expert_idx].get(), ith,
                           nth_up, T::N_BLOCK_UP_GATE, jth);
          }
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    PROFILE_RECORD_TIME_STAMP("up_gate");
#endif

    // Activate gate and multiply by up
    {
      int nth = T::recommended_nth(config_.intermediate_size);
      auto up_gate_fn = [this, nth](int task_id) {
        int expert_idx = m_expert_id_map_[task_id / nth];
        int ith = task_id % nth;
        auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
        for (int i = 0; i < m_local_num_[expert_idx]; i++) {
          float* gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
          float* up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
          for (int j = n_start; j < n_end; j++) {
            gate_output_ptr[j] = act_fn(gate_output_ptr[j]) * up_output_ptr[j];
          }
        }
      };
      MOE_DIRECT_OR_POOL_BY_VAR(nth * activated_expert, up_gate_fn);
    }

#ifdef FORWARD_TIME_PROFILE
    PROFILE_RECORD_TIME_STAMP("act");
#endif

    pool->do_work_stealing_job(
        activated_expert, nullptr,
        [this](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          down_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx]);
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    PROFILE_RECORD_TIME_STAMP("quant_down_input");
#endif

    int nth_down = T::recommended_nth_down(config_.hidden_size, mode);
    pool->do_work_stealing_job(
        mth * nth_down * activated_expert, nullptr,
        [this, qlen, nth_down, oc, &cblas_gemm_s8s8s32, devide_elements_size, mth](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / (nth_down * mth)];
          task_id = task_id % (nth_down * mth);
          int ith = task_id % nth_down;
          int jth = task_id / nth_down;
          if (jth * T::M_BLOCK >= m_local_num_[expert_idx]) return;
          int m_block = T::M_BLOCK;
          if ((jth + 1) * T::M_BLOCK > m_local_num_[expert_idx]) {
            m_block = m_local_num_[expert_idx] - jth * T::M_BLOCK;
          }
          int8_t* a_ptr = ((int8_t*)down_ba_[expert_idx]->a) + jth * T::M_BLOCK * config_.intermediate_size;
          int8_t* b_ptr = select_down_B_ptr_(expert_idx, ith, devide_elements_size);
          int32_t* c_ptr =
              ((int32_t*)down_bc_[expert_idx]->c) + ith * T::N_BLOCK_DOWN + jth * T::M_BLOCK * config_.hidden_size;
          cblas_gemm_s8s8s32(KernelCblasRowMajor, KernelCblasNoTrans, KernelCblasTrans, KernelCblasFixOffset, m_block,
                             T::N_BLOCK_DOWN, config_.intermediate_size, 1.0, a_ptr, config_.intermediate_size, 0,
                             b_ptr, config_.intermediate_size, 0, 0.0, c_ptr, config_.hidden_size, &oc);

          T::apply_scale(m_local_num_[expert_idx], config_.hidden_size, m_local_down_output_ptr_[expert_idx],
                         down_ba_[expert_idx].get(), down_bb_[expert_idx].get(), down_bc_[expert_idx].get(), ith,
                         nth_down, T::N_BLOCK_DOWN, jth);
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    PROFILE_RECORD_TIME_STAMP("down");
#endif

    // Merge k experts per token with weights
    size_t block_dim = 512;
    size_t block_num = (config_.hidden_size + block_dim - 1) / block_dim;
    pool->do_work_stealing_job(
        qlen * block_num, nullptr,
        [this, k, expert_ids, weights, output, block_dim, block_num](int i) {
          int q_idx = i / block_num;
          int block_idx = i % block_num;
          int e_start = block_idx * block_dim;
          int e_end =
              ((block_idx + 1) * block_dim) < config_.hidden_size ? ((block_idx + 1) * block_dim) : config_.hidden_size;
          for (int e = e_start; e < e_end; e++) {
            float sum = 0;
            for (int j = 0; j < k; j++) {
              if (config_.should_skip_expert(expert_ids[q_idx * k + j])) {
                continue;
              }
              sum += weights[q_idx * k + j] * ((float*)m_local_down_output_ptr_[expert_ids[q_idx * k + j]])
                                                  [m_local_pos_[q_idx][j] * config_.hidden_size + e];
            }
            ((float*)output)[q_idx * config_.hidden_size + e] = sum;
          }
        },
        nullptr);

#ifdef FORWARD_TIME_PROFILE
    time_perf_name = std::string("[moe] ") + ((mode == 'p') ? "layer prefill" : "decode layer ") +
                     std::to_string(config_.layer_idx) + " tp_part_idx: " + std::to_string(tp_part_idx);
    perf_report();
#endif
  }

  /* merged into forward_unified */
  void forward_decode(int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input,
                      void* output) {
    forward_unified('d', qlen, k, expert_ids, weights, input, output);
  }

  void forward_prefill(int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input,
                       void* output) {
    forward_unified('p', qlen, k, expert_ids, weights, input, output);
  }
};

template <typename K, bool T>
class TP_MOE<MOE_KERNEL_TP<K, T>> : public TP_MOE_Common<MOE_KERNEL_TP<K, T>> {
 public:
  using TP_MOE_Common<MOE_KERNEL_TP<K, T>>::TP_MOE_Common;

  void load_weights() {
    auto& config = this->config;
    auto& tps = this->tps;
    auto& tp_count = this->tp_count;
    auto pool = config.pool;
    const uint64_t* physical_to_logical_map = (const uint64_t*)config.physical_to_logical_map;
    if (config.gate_projs.empty() == false) {
      printf("TP Load from loader\n");
      pool->dispense_backend()->do_numa_job([this, pool](int numa_id) { this->tps[numa_id]->load_weights(); });
      this->weights_loaded = true;
    } else if (config.gate_proj != nullptr) {
      printf("From BF16\n");
      for (auto i = 0; i < tp_count; i++) {
        auto& tpc = tps[i]->config_;
        size_t gate_up_elcount = tpc.intermediate_size * tpc.hidden_size;
        tpc.gate_proj = new ggml_bf16_t[tpc.expert_num * gate_up_elcount];
        tpc.up_proj = new ggml_bf16_t[tpc.expert_num * gate_up_elcount];
        tpc.down_proj = new ggml_bf16_t[tpc.expert_num * gate_up_elcount];
        if (tps[i]->config_.load == false) {
          pool->get_subpool(i)->do_work_stealing_job(
              tpc.expert_num, nullptr,
              [&](int expert_id_) {
                size_t expert_id = expert_map(physical_to_logical_map, expert_id_);
                memcpy((ggml_bf16_t*)tpc.gate_proj + expert_id * gate_up_elcount,
                       (ggml_bf16_t*)config.gate_proj + expert_id * config.intermediate_size * config.hidden_size +
                           i * gate_up_elcount,
                       sizeof(ggml_bf16_t) * gate_up_elcount);
                memcpy((ggml_bf16_t*)tpc.up_proj + expert_id * gate_up_elcount,
                       (ggml_bf16_t*)config.up_proj + expert_id * config.intermediate_size * config.hidden_size +
                           i * gate_up_elcount,
                       sizeof(ggml_bf16_t) * gate_up_elcount);
                for (size_t col = 0; col < config.hidden_size; col++) {
                  memcpy((ggml_bf16_t*)tpc.down_proj + expert_id * tpc.hidden_size * tpc.intermediate_size +
                             col * tpc.intermediate_size,
                         (ggml_bf16_t*)config.down_proj + expert_id * config.intermediate_size * config.hidden_size +
                             col * config.intermediate_size + i * tpc.intermediate_size,
                         sizeof(ggml_bf16_t) * tpc.intermediate_size);
                }
              },
              nullptr);
        }
      }

      pool->dispense_backend()->do_numa_job([this, pool](int numa_id) { this->tps[numa_id]->load_weights(); });

      for (auto i = 0; i < tp_count; i++) {
        auto& tpc = tps[i]->config_;
        delete[] (ggml_bf16_t*)(tpc.gate_proj);
        delete[] (ggml_bf16_t*)(tpc.up_proj);
        delete[] (ggml_bf16_t*)(tpc.down_proj);
      }
      if (config.save) {
        // free the bf16 weights after saving
        tps.clear();
      }

      this->weights_loaded = true;
    } else if (config.path != "") {
      printf("TP Load from file\n");
      pool->dispense_backend()->do_numa_job([this, pool](int numa_id) { this->tps[numa_id]->load_weights(); });
      this->weights_loaded = true;
    } else {
      throw std::runtime_error("no weight source");
    }
  }

  void merge_results(int qlen, void* output, bool incremental) {
    // #ifdef FORWARD_TIME_PROFILE
    //     forward_perf_start();
    // #endif
    auto pool = this->config.pool;
    auto merge_fn = [this, output, incremental](int token_nth) {
      auto& local_output_numa = this->local_output_numa;
      auto& tp_configs = this->tp_configs;
      auto& tp_count = this->tp_count;
      auto& config = this->config;
      float* merge_to = local_output_numa[0] + token_nth * tp_configs[0].hidden_size;
      if (incremental) {
        for (int e = 0; e < config.hidden_size; e++) {
          merge_to[e] += ggml_bf16_to_fp32(((ggml_bf16_t*)output + token_nth * config.hidden_size)[e]);
        }
      }

      for (int i = 1; i < tp_count; i++) {
        float* merge_from = local_output_numa[i] + token_nth * tp_configs[i].hidden_size;
        // TODO: 后续用 SVE 来加速
        // for (int e = 0; e < tp_configs[i].hidden_size; e += 16) {
        //   *((__m512 *)(merge_to + e)) = _mm512_add_ps(*((__m512 *)(merge_to + e)), *((__m512 *)(merge_from + e)));
        // }
        // CHECK: 目前用普通的纯 C++ 来实现
        for (int e = 0; e < tp_configs[i].hidden_size; e++) {
          merge_to[e] += merge_from[e];
        }
      }

      convert_or_copy((ggml_bf16_t*)output + token_nth * config.hidden_size, merge_to, config.hidden_size);

      // for (int e = 0; e < config.hidden_size; e += 32) {
      // TODO: 这里需要用 SVE 来加速，实现 fp32 到 bf16 的转换
      // __m512 x0 = *(__m512 *)(merge_to + e);
      // __m512 x1 = *(__m512 *)(merge_to + e + 16);
      // avx512_32xfp32_to_32xbf16(&x0, &x1, (__m512i *)((ggml_bf16_t *)output + token_nth * config.hidden_size + e));

      // CHECK: 目前用普通的纯 C++ 来实现 fp32 到 bf16 的转换

      // convert_32fp32_to_32bf16_pure_c(merge_to + e,
      // (uint16_t *)((ggml_bf16_t *)output + token_nth * config.hidden_size + e));

      // }
    };
    MOE_DIRECT_OR_POOL_BY_VAR(qlen, merge_fn);
    // #ifdef FORWARD_TIME_PROFILE
    //     PROFILE_RECORD_TIME_STAMP("moe merge done");
    // #endif
    // #ifdef FORWARD_TIME_PROFILE
    //     time_perf_name = "[moe merge] decode layer " + std::to_string(this->config.layer_idx);
    //     perf_report();
    // #endif
  }

  void merge_results(int qlen, void* output) { merge_results(qlen, output, false); }
};

#endif

================================================
FILE: kt-kernel/operators/moe_kernel/test/convert-test.cpp
================================================
#include <arm_sve.h>

#include <cmath>
#include <cstdlib>
#include <fstream>
#include <stdexcept>

#include "../../reduce.hpp"
#include "../../rms-norm.hpp"
#include "../../rope.hpp"
#include "../../softmax.hpp"
#include "../la/arm_kml.hpp"
#include "llama.cpp/ggml-common.h"
#include "llama.cpp/ggml.h"

void bf16_to_fp16(const ggml_bf16_t* src, ggml_fp16_t* dst, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    float x = ggml_bf16_to_fp32(src[i]);
    dst[i] = ggml_fp32_to_fp16(x);
  }
}

void debug_rope() {
  float16_t* fp16 = new float16_t[1024 * 64];

  for (size_t i = 0; i < 1024 * 64; i++) {
    fp16[i] = static_cast<double>(std::rand()) / RAND_MAX;
  }
  std::ofstream("before_rope", std::ios::binary).write((char*)fp16, 1024 * 64 * sizeof(float16_t));

  DeepseekV3YarnRotaryEmbedding rope(64, 163840, 10000, 40, 4096, 32, 1, 1, 1);

  rope.init(1024);

  Rope<DeepseekV3YarnRotaryEmbedding, float16_t> rope_applier;
  rope_applier.apply_multiple(rope, fp16, 64, 64, 0, 1024);

  std::ofstream("cos", std::ios::binary).write((char*)rope.cos(0), 1024 * 32 * sizeof(float));
  std::ofstream("sin", std::ios::binary).write((char*)rope.sin(0), 1024 * 32 * sizeof(float));

  std::ofstream("after_rope", std::ios::binary).write((char*)fp16, 1024 * 64 * sizeof(float16_t));
}

void debug_softmax() {
  float16_t* fp16 = new float16_t[64 * 1024];

  for (size_t i = 0; i < 1024 * 64; i++) {
    fp16[i] = static_cast<double>(std::rand()) / RAND_MAX * 10;
    if (i % 12 == 0) {
      fp16[i] -= std::numeric_limits<float16_t>::infinity();
    }
  }
  std::ofstream("before_softmax", std::ios::binary).write((char*)fp16, 1024 * 64 * sizeof(float16_t));

  Softmax<float16_t>::apply_multiple(64, fp16, 1024, 1024);
  std::ofstream("after_softmax", std::ios::binary).write((char*)fp16, 1024 * 64 * sizeof(float16_t));
}

void debug_inf() {
  float16_t x, y;
  // x = std::numeric_limits<float16_t>::infinity(); // 0.00
  // y = -std::numeric_limits<float16_t>::infinity(); // -0.00
  // x = 1e10;
  x = std::numeric_limits<float>::infinity();   // inf
  y = -std::numeric_limits<float>::infinity();  // -inf
  printf("x = %f, y = %f\n", x, y);
}

void debug_reduce() {
  std::vector<float16_t*> fp16s(128);
  for (size_t i = 0; i < 128; i++) {
    fp16s[i] = new float16_t[1024];
    for (size_t j = 0; j < 1024; j++) {
      fp16s[i][j] = i;
    }
  }

  reduce_sum(fp16s.data(), 128, 0, 10);
  for (int i = 0; i < 10; i++) {
    printf("%f ", fp16s[0][i]);
  }
}

int main() {
  debug_reduce();

  return 0;
}


================================================
FILE: kt-kernel/operators/moe_kernel/test/debug.hpp
================================================
#ifndef KML_DEBUG_HPP
#define KML_DEBUG_HPP

#include <arm_sve.h>

#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <string>

inline std::string get_env_or_default(const char* var_name, const std::string& default_value) {
  const char* value = std::getenv(var_name);
  return (value != nullptr) ? std::string(value) : default_value;
}

inline void dump_bin(std::string file_name, float16_t* data, size_t count) {
  file_name = get_env_or_default("KML_DEBUG_PATH", "debug") + "/" + file_name + ".f16";
  std::ofstream f(file_name, std::ios::binary);
  f.write(reinterpret_cast<const char*>(data), count * sizeof(*data));
  f.close();
}
inline void dump_bin(std::string file_name, float* data, size_t count) {
  file_name = get_env_or_default("KML_DEBUG_PATH", "debug") + "/" + file_name + ".f32";
  std::ofstream f(file_name, std::ios::binary);
  f.write(reinterpret_cast<const char*>(data), count * sizeof(*data));
  f.close();
}
inline void dump_bin(std::string file_name, int64_t* data, size_t count) {
  file_name = get_env_or_default("KML_DEBUG_PATH", "debug") + "/" + file_name + ".int64";
  std::ofstream f(file_name, std::ios::binary);
  f.write(reinterpret_cast<const char*>(data), count * sizeof(*data));
  f.close();
}

inline void dump_bin(std::string file_name, int8_t* data, size_t count) {
  file_name = get_env_or_default("KML_DEBUG_PATH", "debug") + "/" + file_name + ".int8";
  std::ofstream f(file_name, std::ios::binary);
  f.write(reinterpret_cast<const char*>(data), count * sizeof(*data));
  f.close();
}

inline void dump_bin(std::string file_name, int32_t* data, size_t count) {
  file_name = get_env_or_default("KML_DEBUG_PATH", "debug") + "/" + file_name + ".int32";
  std::ofstream f(file_name, std::ios::binary);
  f.write(reinterpret_cast<const char*>(data), count * sizeof(*data));
  f.close();
}

inline void load_bin(std::string file_name, float* data, size_t count) {
  file_name = get_env_or_default("KML_DEBUG_PATH", "debug") + "/" + file_name + ".f32";
  std::ifstream f(file_name, std::ios::binary);
  if (!f.is_open()) {
    throw std::runtime_error("Failed to open file: " + file_name);
  }
  f.read(reinterpret_cast<char*>(data), count * sizeof(*data));
  f.close();
}

#endif


================================================
FILE: kt-kernel/operators/moe_kernel/test/int4_mul-test.cpp
================================================
#include <cstdint>
#include <cstdio>
#include <cstdlib>

#include "../la/arm_kml.hpp"
#include "debug.hpp"
#include "kblas.h"
const int M = 1, K = 7168, N = 8;

int main() {
  // 随机生成a, b, c矩阵
  arm_kml::GemmKernelInt4::BufferA buffer_a(M, K);
  arm_kml::GemmKernelInt4::BufferB buffer_b(N, K, true);
  arm_kml::GemmKernelInt4::BufferC buffer_c(M, N);

  arm_kml::GemmKernelInt8::BufferA buffer_a_check(M, K);
  arm_kml::GemmKernelInt8::BufferB buffer_b_check(N, K, true);
  arm_kml::GemmKernelInt8::BufferC buffer_c_check(M, N);

  float* a = (float*)aligned_alloc(64, sizeof(float) * M * K);
  float* b = (float*)aligned_alloc(64, sizeof(float) * K * N);
  float* c = (float*)aligned_alloc(64, sizeof(float) * M * N);
  float* c_check = (float*)aligned_alloc(64, sizeof(float) * M * N);
  int8_t* buffer_a_data = (int8_t*)aligned_alloc(64, buffer_a.required_size());
  int4_2_t* buffer_b_data = (int4_2_t*)aligned_alloc(64, buffer_b.required_size());
  int32_t* c_data = (int32_t*)aligned_alloc(64, buffer_c.required_size());
  int8_t* buffer_a_data_check = (int8_t*)aligned_alloc(64, buffer_a_check.required_size());
  int8_t* buffer_b_data_check = (int8_t*)aligned_alloc(64, buffer_b_check.required_size());
  int32_t* c_data_check = (int32_t*)aligned_alloc(64, buffer_c_check.required_size());
  // 初始化元素内容
  load_bin("input.bin", a, M * K);
  load_bin("local_q_a_proj_quant.bin", b, N * K);

  // for (int i = 0; i < M * K; i++) {
  //   // 随机浮点数
  //   // a[i] = (static_cast<float>(rand()) / (float)RAND_MAX) / 25 - 0.02;
  //   a[i] = -(static_cast<float>(rand()) / (float)RAND_MAX) / 25;
  //   // a[i] = i % 10;
  //   // a[i] = 1;
  // }
  // for (int i = 0; i < K * N; i++) {
  //   // 随机浮点数
  //   // b[i] = (static_cast<float>(rand()) / (float)RAND_MAX) / 25 - 0.02;
  //   b[i] = -(static_cast<float>(rand()) / (float)RAND_MAX) / 25;
  //   // b[i] = i % 10;
  //   // b[i] = 1;
  // }
  // // // // 设置离群值
  // for (int i = 0; i < N; i++) {
  //   b[i * K] = 0.06f; // 设置第一列为离群值
  // }
  // // 打印一下输入矩阵和权重矩阵
  // printf("Input matrix a:\n");
  // for (int i = 0; i < M; i++) {
  //   for (int j = 0; j < K; j++) {
  //     printf("%f ", a[i * K + j]);
  //   }
  //   printf("\n");
  // }
  // printf("Weight matrix b:\n");
  // for (int i = 0; i < N; i++) {
  //   for (int j = 0; j < K; j++) {
  //     printf("%f ", b[i * K + j]);
  //   }
  //   printf("\n");
  // }
  buffer_a.set_data(buffer_a_data);
  buffer_b.set_data(buffer_b_data);
  buffer_c.set_data(c_data);
  buffer_a_check.set_data(buffer_a_data_check);
  buffer_b_check.set_data(buffer_b_data_check);
  buffer_c_check.set_data(c_data_check);
  //   调用 from mat 进行量化
  buffer_a.from_mat(M, a, 0, M);
  for (int i = 0; i <= arm_kml::GemmKernelInt4::recommended_nth(N); i++) {
    buffer_b.from_mat(b, i, arm_kml::GemmKernelInt4::recommended_nth(N));
  }
  buffer_a_check.from_mat(M, a, 0, M);
  for (int i = 0; i <= arm_kml::GemmKernelInt8::recommended_nth(N); i++) {
    buffer_b_check.from_mat(b, i, arm_kml::GemmKernelInt8::recommended_nth(N));
  }
  // 进行乘法
  arm_kml::MatRef<int8_t> a_ref(buffer_a.a, M, K, K, CblasRowMajor);
  arm_kml::MatRef<int4_2_t> b_ref(buffer_b.b, K, N, K, CblasColMajor, CblasNoTrans, buffer_b.if_pack);
  arm_kml::MatRef<int32_t> c_ref(buffer_c.c, M, N, N, CblasRowMajor);
  b_ref = b_ref.offset_col(0, N);

  arm_kml::MatRef<int8_t> a_ref_check(buffer_a_check.a, M, K, K, CblasRowMajor);
  arm_kml::MatRef<int8_t> b_ref_check(buffer_b_check.b, K, N, K, CblasColMajor, CblasNoTrans, buffer_b_check.if_pack);
  arm_kml::MatRef<int32_t> c_ref_check(buffer_c_check.c, M, N, N, CblasRowMajor);

  arm_kml::decode_mul_mat_clearc(a_ref, b_ref, c_ref);
  arm_kml::decode_mul_mat_clearc(a_ref_check, b_ref_check, c_ref_check);
  //   反量化，apply scale
  arm_kml::GemmKernelInt4::apply_scale(c, N, &buffer_a, &buffer_b, &buffer_c, 0, M, 0, N, true);
  arm_kml::GemmKernelInt8::apply_scale(c_check, N, &buffer_a_check, &buffer_b_check, &buffer_c_check, 0, M, 0, N, true);
  // 打印结果,比较 c 和 c_check
  const float threashold = 0.05;
  for (int i = 0; i < M * N; i++) {
    float diff_relative = (c[i] - c_check[i]) / (c_check[i] + 1e-6);

    if (diff_relative > threashold || diff_relative < -threashold) {
      printf("diff_relative: %f\n", diff_relative);
      printf("Mismatch at index %d: c = %f, c_check = %f\n", i, c[i], c_check[i]);
    } else {
      printf("Match at index %d: c = %f, c_check = %f\n", i, c[i], c_check[i]);
    }
  }
  return 0;
}

================================================
FILE: kt-kernel/operators/moe_kernel/test/mat_test.cpp
================================================
#include "arm_kml.hpp"

int main() {
  const size_t M = 128, N = 64;
  float16_t* a = new float16_t[M * N];
  float16_t* b = new float16_t[M * N];
  float16_t* c = new float16_t[M * M];
  float16_t* c_check = new float16_t[M * M];
  for (size_t i = 0; i < M * N; i++) {
    a[i] = static_cast<double>(std::rand()) / RAND_MAX / 10.0;
    b[i] = static_cast<double>(std::rand()) / RAND_MAX / 10.0;
  }

  arm_kml::MatRef<float16_t> aref(a, M, N, M, CblasColMajor);
  arm_kml::MatRef<float16_t> bref(b, N, M, M, CblasColMajor);
  arm_kml::MatRef<float16_t> cref(c, M, M, M, CblasColMajor);
  {
    memset(c, 0, M * M * sizeof(float16_t));
    memset(c_check, 0, M * M * sizeof(float16_t));
    arm_kml::mul_mat(aref, bref, cref);
  }
}


================================================
FILE: kt-kernel/operators/moe_kernel/test/utils_test.cpp
================================================
// #pragma once
#ifdef TEST_UTIL
#include <arm_neon.h>
#include <arm_sve.h>
#include <stdio.h>

static inline void sve_32xbf16_to_32xfp32(const bfloat16_t* src, float* dst0, float* dst1) {
#ifdef __ARM_FEATURE_SVE
  // 全真谓词，对应每个 16‑bit 元素
#else
// fallback: scalar or NEON
#endif
}

static inline void neon_32xbf16_to_32xfp32(const uint16_t* src, float* dst0, float* dst1) {
  // src 指向 32 个连续的 BF16（uint16_t）
  // dst0、dst1 各指向 16 个 float 的缓冲

  for (int block = 0; block < 4; ++block) {
    // 每次处理 8 个 BF16 → 8 个 FP32（拆为两次 4→4 存储）
    uint16x8_t v_bf16 = vld1q_u16(src + block * 8);  // load 8×BF16 :contentReference[oaicite:6]{index=6}

    // 拆低半、高半各 4 个到 u32
    uint32x4_t lo_u32 = vmovl_u16(vget_low_u16(v_bf16));   // lower 4 → u32 :contentReference[oaicite:7]{index=7}
    uint32x4_t hi_u32 = vmovl_u16(vget_high_u16(v_bf16));  // upper 4 → u32 :contentReference[oaicite:8]{index=8}

    // 左移 16 位，相当于将 BF16 的 16 位 mantissa+exp 放到 FP32 高位
    lo_u32 = vshlq_n_u32(lo_u32, 16);  // shift left 16 :contentReference[oaicite:9]{index=9}
    hi_u32 = vshlq_n_u32(hi_u32, 16);  // shift left 16 :contentReference[oaicite:10]{index=10}

    // 重新解释为 float32x4_t
    float32x4_t lo_f32 = vreinterpretq_f32_u32(lo_u32);  // bits → FP32 :contentReference[oaicite:11]{index=11}
    float32x4_t hi_f32 = vreinterpretq_f32_u32(hi_u32);  // bits → FP32 :contentReference[oaicite:12]{index=12}

    // 存储到 dst0 或 dst1，每次存 8 个
    if (block < 2) {
      vst1q_f32(dst0 + block * 4, lo_f32);      // store 4 floats :contentReference[oaicite:13]{index=13}
      vst1q_f32(dst0 + block * 4 + 4, hi_f32);  // store next 4 floats :contentReference[oaicite:14]{index=14}
    } else {
      int b = block - 2;
      vst1q_f32(dst1 + b * 4, lo_f32);      // store 4 floats :contentReference[oaicite:15]{index=15}
      vst1q_f32(dst1 + b * 4 + 4, hi_f32);  // store next 4 floats :contentReference[oaicite:16]{index=16}
    }
  }
}

int main() {
  // 测试代码
  uint16_t bf16_data[32] = {0};  // 假设这里填充了一些 BF16 数据
  float f32_data0[16] = {0};
  float f32_data1[16] = {0};

  neon_32xbf16_to_32xfp32(bf16_data, f32_data0, f32_data1);

  // 打印结果
  for (int i = 0; i < 16; ++i) {
    printf("f32_data0[%d]: %f\n", i, f32_data0[i]);
    printf("f32_data1[%d]: %f\n", i, f32_data1[i]);
  }

  return 0;
}
#endif


================================================
FILE: kt-kernel/operators/reduce.hpp
================================================
#ifndef CPUINFER_REDUCE_HPP
#define CPUINFER_REDUCE_HPP

#include <cmath>

template <typename T>
void reduce_sum(T** data, size_t data_groups_count, size_t begin, size_t end) {
  if (data_groups_count <= 1) {
  } else if (data_groups_count == 2) {
    for (size_t i = begin; i < end; i++) {
      data[0][i] += data[1][i];
    }
  } else {
    int part1 = data_groups_count / 2;
    reduce_sum(data, part1, begin, end);
    reduce_sum(data + part1, data_groups_count - part1, begin, end);
    for (size_t i = begin; i < end; i++) {
      data[0][i] += data[part1][i];
    }
  }
}

#endif

================================================
FILE: kt-kernel/operators/rms-norm.hpp
================================================
#ifndef CPUINFER_RMS_NORM_HPP
#define CPUINFER_RMS_NORM_HPP

#include <cmath>

template <typename T, typename A>
concept RMS_NORM = requires(T t, int size, int hidden_size, int qlen, A* weights, A* input) {
  { T::rms_norm(hidden_size, qlen, input) } -> std::same_as<void>;
  { T::rms_norm_single(size, input) } -> std::same_as<void>;
  { T::rms_norm_with_weights(hidden_size, qlen, weights, input) } -> std::same_as<void>;
  { T::rms_norm_single_with_weights(size, weights, input) } -> std::same_as<void>;
};

template <typename A>
struct RMSNorm {
  static void rms_norm_single(int size, A* input) {
    const float epsilon = 1e-6;
    float sum = 0;
    for (int i = 0; i < size; i++) {
      sum += (float)input[i] * (float)input[i];
    }
    sum = sqrt(sum / size + epsilon);
    for (int i = 0; i < size; i++) {
      input[i] = (float)input[i] / sum;
    }
  }

  static void rms_norm(int hidden_size, int qlen, A* input) {
    const A epsilon = 1e-6;
    for (int t = 0; t < qlen; t++) {
      rms_norm_single(hidden_size, input + t * hidden_size);
    }
  }

  static void rms_norm_with_weights(int hidden_size, int qlen, A* weights, A* input) {
    const A epsilon = 1e-6;
    for (int t = 0; t < qlen; t++) {
      rms_norm_single_with_weights(hidden_size, input + t * hidden_size);
    }
  }
  static void rms_norm_single_with_weights(int size, A* weights, A* input) {
    const float epsilon = 1e-6;
    float sum = 0;
    for (int i = 0; i < size; i++) {
      sum += (float)input[i] * (float)input[i];
    }
    sum = sqrt(sum / size + epsilon);
    for (int i = 0; i < size; i++) {
      input[i] = (float)weights[i] * (float)input[i] / sum;
    }
  }
};

#endif

================================================
FILE: kt-kernel/operators/rope.hpp
================================================
#ifndef CPUINFER_ROPE_HPP
#define CPUINFER_ROPE_HPP

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <stdexcept>
#include <vector>

template <typename T, typename E, typename A>
concept ROPE_APPLIER = requires(T t, E* emb, int size, int pos_start, int pos_len, A* v) {
  // must be thread safe and efficient

  // apply embeddings with pos_start to v, v is vector of size
  { T::apply_single(emb, v, size, pos_start) } -> std::same_as<void>;

  // for every v i, apply embeddings with pos_start + i to v[i], v is vector of size
  { T::apply_multiple(emb, v, size, pos_start, pos_len) } -> std::same_as<void>;
};

template <typename T, typename A>
concept ROPE_ANGLE = requires(T t, size_t at) {
  { t.cos(at) } -> std::same_as<float*>;
  { t.sin(at) } -> std::same_as<float*>;
  { t.init(at) } -> std::same_as<void>;
};

template <typename E, typename A>
  requires ROPE_ANGLE<E, A>
struct Rope {
 public:
  static void apply_single(E& emb, A* v, int size, int pos_start) {
    if (size == 0) {
      return;
    }
    if (size % 2 != 0) {
      throw std::invalid_argument("Rope::apply_single: 'size' (head_dim) must be even for LLaMA-style RoPE.");
    }

    const float* cos = emb.cos(pos_start);
    const float* sin = emb.sin(pos_start);

    thread_local static std::vector<float> v2;
    if (v2.size() < size) {
      v2.resize(size);
    }

    for (int i = 0; i < size / 2; i++) {
      float a = v[2 * i], b = v[2 * i + 1];
      v2[i] = cos[i] * a - sin[i] * b;
      v2[i + size / 2] = sin[i] * a + cos[i] * b;
    }

    for (int i = 0; i < size; i++) {
      v[i] = v2[i];
    }
  }

  static void apply_multiple(E& emb, A* v_block_start, int size_per_vector, int pos_start, int pos_len) {
    if (size_per_vector == 0 || pos_len == 0) {
      return;
    }
    if (size_per_vector % 2 != 0) {
      throw std::invalid_argument("Rope::apply_multiple: 'size_per_vector' (head_dim) must be even.");
    }

    for (int i = 0; i < pos_len; ++i) {
      apply_single(emb, v_block_start + size_per_vector * i, size_per_vector, pos_start + i);
    }
  }
};

class RotaryEmbeddingBase {
 public:
  virtual ~RotaryEmbeddingBase() = default;
  virtual void init(size_t seq_len) {
    calculate_inv_freq();
    set_cos_sin_cache(seq_len);
    this->max_seq_len_cached_ = seq_len;
  }

 protected:
  RotaryEmbeddingBase(size_t dim, size_t max_pos_embeddings, double base_val)
      : dim_(dim), max_position_embeddings_(max_pos_embeddings), base_(base_val), max_seq_len_cached_(0) {}

  virtual void calculate_inv_freq() = 0;
  virtual void set_cos_sin_cache(size_t seq_len) = 0;

  size_t dim_;
  size_t max_position_embeddings_;
  double base_;
  std::vector<double> inv_freq_;
  size_t max_seq_len_cached_;
};

class DeepseekV3RotaryEmbedding : public RotaryEmbeddingBase {
 public:
  DeepseekV3RotaryEmbedding(size_t dim, size_t max_position_embeddings = 2048, double base = 10000.0f)
      : RotaryEmbeddingBase(dim, max_position_embeddings, base) {
    if (this->dim_ % 2 != 0 || this->dim_ < 0) {
      throw std::invalid_argument("Dimension must be even for RotaryEmbedding and >= 0.");
    }

    if (this->max_position_embeddings_ < 0) {
      throw std::invalid_argument("DeepseekV3RotaryEmbedding max_position_embeddings_ must be >= 0.");
    }

    calculate_inv_freq();
    set_cos_sin_cache(this->max_position_embeddings_);
  }

  float* sin(size_t at) { return sin_cached_.data() + at * this->dim_ / 2; }
  float* cos(size_t at) { return cos_cached_.data() + at * this->dim_ / 2; }

 protected:
  void calculate_inv_freq() override {
    this->inv_freq_.resize(this->dim_ / 2);
    for (size_t i = 0; i < this->dim_ / 2; ++i) {
      this->inv_freq_[i] = 1.0 / std::pow(this->base_, 2.0 * i / this->dim_);
    }
  }

  void set_cos_sin_cache(size_t seq_len) override {
    if (this->inv_freq_.empty()) {
      calculate_inv_freq();
    }

    cos_cached_.resize(seq_len * this->dim_ / 2);
    sin_cached_.resize(seq_len * this->dim_ / 2);

    for (size_t i = 0; i < seq_len; ++i) {
      for (size_t j = 0; j < this->inv_freq_.size(); ++j) {
        double freq = static_cast<double>(i) * this->inv_freq_[j];
        double cos_val = std::cos(freq);
        double sin_val = std::sin(freq);
        size_t idx1 = i * this->dim_ / 2 + j;

        cos_cached_.at(idx1) = cos_val;
        sin_cached_.at(idx1) = sin_val;
      }
    }
    this->max_seq_len_cached_ = seq_len;
  }

  std::vector<float> cos_cached_;
  std::vector<float> sin_cached_;
};

inline double yarn_find_correction_dim(double num_rotations, double dim, double base, double max_position_embeddings) {
  return (dim * std::log(max_position_embeddings / (num_rotations * static_cast<double>(2.0f) * M_PI))) /
         (static_cast<double>(2.0f) * std::log(base));
}

inline std::pair<size_t, size_t> yarn_find_correction_range(double low_rot, double high_rot, size_t dim,
                                                            double base = 10000,
                                                            double max_position_embeddings = 2048) {
  double low_f = std::floor(yarn_find_correction_dim(low_rot, static_cast<double>(dim), base, max_position_embeddings));
  double high_f =
      std::ceil(yarn_find_correction_dim(high_rot, static_cast<double>(dim), base, max_position_embeddings));

  size_t low = static_cast<size_t>(std::max(0.0, low_f));
  size_t high = static_cast<size_t>(std::min(static_cast<double>(dim - 1), high_f));
  return std::pair{low, high};
}

inline std::vector<double> yarn_linear_ramp_mask(double min_val, double max_val, size_t dim) {
  if (std::abs(min_val - max_val) < 1e-6f) {
    max_val += 0.001;
  }
  std::vector<double> ramp_func(dim);
  for (size_t i = 0; i < dim; ++i) {
    double linear_func = (static_cast<double>(i) - min_val) / (max_val - min_val);
    ramp_func[i] = std::clamp(linear_func, 0.0, 1.0);
  }
  return ramp_func;
}

inline double yarn_get_mscale(double scale = 1.0, double mscale = 1.0) {
  if (scale <= 1.0) {
    return 1.0;
  }
  return 0.1 * mscale * std::log(scale) + 1.0;
}

class DeepseekV3YarnRotaryEmbedding : public DeepseekV3RotaryEmbedding {
 public:
  DeepseekV3YarnRotaryEmbedding(size_t dim, size_t max_position_embeddings = 2048, double base = 10000.0f,
                                double scaling_factor = 1.0, size_t original_max_position_embeddings = 4096,
                                double beta_fast = 32.0, double beta_slow = 1.0, double mscale_val = 1.0,
                                double mscale_all_dim_val = 0.0)
      : DeepseekV3RotaryEmbedding(dim, 0, base),
        scaling_factor_(scaling_factor),
        original_max_position_embeddings_(original_max_position_embeddings),
        beta_fast_(beta_fast),
        beta_slow_(beta_slow),
        mscale_(mscale_val),
        mscale_all_dim_(mscale_all_dim_val) {
    if (this->dim_ % 2 != 0 || this->dim_ < 0) {
      throw std::invalid_argument("Dimension must be even for RotaryEmbedding and >= 0.");
    }

    if (this->max_position_embeddings_ < 0) {
      throw std::invalid_argument("DeepseekV3YarnRotaryEmbedding: max_position_embeddings_ must be >= 0.");
    }
    calculate_inv_freq();
    set_cos_sin_cache(max_position_embeddings);
  }

 protected:
  void calculate_inv_freq() override {
    if (this->dim_ == 0) {
      this->inv_freq_.clear();
      return;
    }
    size_t dim_half = this->dim_ / 2;
    this->inv_freq_.resize(dim_half);

    std::vector<double> freq_extra(dim_half);
    std::vector<double> freq_inter(dim_half);
    for (size_t i = 0; i < dim_half; ++i) {
      double freq_index = 2.0 * i / this->dim_;
      freq_extra[i] = 1.0 / std::pow(this->base_, freq_index);
      freq_inter[i] = 1.0f / (scaling_factor_ * std::pow(this->base_, freq_index));
    }

    auto [low_idx_f, high_idx_f] =
        yarn_find_correction_range(beta_fast_, beta_slow_, this->dim_, this->base_, original_max_position_embeddings_);

    size_t low_idx = static_cast<size_t>(low_idx_f);
    size_t high_idx = static_cast<size_t>(high_idx_f);

    std::vector<double> inv_freq_mask_ramp;
    inv_freq_mask_ramp = yarn_linear_ramp_mask(low_idx, high_idx, dim_half);

    for (size_t i = 0; i < dim_half; ++i) {
      double mask_val = 1.0 - inv_freq_mask_ramp[i];
      this->inv_freq_[i] = freq_inter[i] * (1.0 - mask_val) + freq_extra[i] * mask_val;
    }
  }

  void set_cos_sin_cache(size_t seq_len) override {
    if (this->inv_freq_.empty() || this->inv_freq_.size() != this->dim_ / 2) {
      calculate_inv_freq();
    }

    this->cos_cached_.resize(seq_len * this->dim_ / 2);
    this->sin_cached_.resize(seq_len * this->dim_ / 2);

    // printf("scaling_factor %f, mscale %f, mscale all dim %f\n", scaling_factor_, mscale_, mscale_all_dim_);
    double scale_factor_val = yarn_get_mscale(scaling_factor_, mscale_);
    double scale_all_dim_factor_val = yarn_get_mscale(scaling_factor_, mscale_all_dim_);
    double actual_mscale = 1.0;
    if (std::abs(scale_all_dim_factor_val) > 1e-6f) {
      actual_mscale = scale_factor_val / scale_all_dim_factor_val;
    }
    // printf("actual_mscale: %f, %f, %f\n", actual_mscale, scale_factor_val, scale_all_dim_factor_val);

    for (size_t i = 0; i < seq_len; ++i) {
      for (size_t j = 0; j < this->inv_freq_.size(); ++j) {
        double freq = static_cast<double>(i) * this->inv_freq_[j];
        double cos_val = std::cos(freq) * actual_mscale;
        double sin_val = std::sin(freq) * actual_mscale;
        size_t idx1 = i * this->dim_ / 2 + j;

        this->cos_cached_.at(idx1) = cos_val;
        this->sin_cached_.at(idx1) = sin_val;
      }
    }
    this->max_seq_len_cached_ = seq_len;
  }

 private:
  double scaling_factor_;
  size_t original_max_position_embeddings_;
  double beta_fast_;
  double beta_slow_;
  double mscale_;
  double mscale_all_dim_;
};

#endif

================================================
FILE: kt-kernel/operators/softmax.hpp
================================================
#ifndef CPUINFER_OPERATOR_SOFTMAX_HPP
#define CPUINFER_OPERATOR_SOFTMAX_HPP

#include <algorithm>  // max_element
#include <cmath>      // exp
#include <cstddef>
#ifdef __aarch64__
#include <arm_sve.h>
#endif

#include <type_traits>

template <typename T, typename A>
concept SOFTMAX_APPLIER = requires(T t, A* v, size_t size, size_t count, size_t ld) {
  { T::apply_single(v, size) } -> std::same_as<void>;
  { T::apply_multiple(count, v, size, ld) } -> std::same_as<void>;
};

template <typename A>
class Softmax {
 public:
  /// 对单个向量做 softmax，就地写回
  static void apply_single(A* v, size_t size) {
    static thread_local std::vector<float> v2(100000);
    if (size == 0 || v == nullptr) return;
    if (size > v2.size()) {
      v2.resize(size);
    }

    for (int i = 0; i < size; i++) {
      v2[i] = v[i];
    }

    const float max_val = *std::max_element(v2.begin(), v2.begin() + size);

    float sum = 0;
    for (size_t i = 0; i < size; ++i) {
      v2[i] = std::exp(v2[i] - max_val);
      sum += v2[i];
    }
    if (sum == 0) return;  // 理论上不会发生，但防御一下
    const float inv_sum = 1.0 / sum;
    for (size_t i = 0; i < size; ++i) {
      v[i] = v2[i] * inv_sum;
    }
  }

  static void apply_multiple(size_t count, A* v, size_t size, size_t ld) {
    for (size_t i = 0; i < count; ++i) {
      apply_single(v + i * ld, size);
    }
  }
};

#endif  // CPUINFER_OPERATOR_SOFTMAX_HPP


================================================
FILE: kt-kernel/operators/tp.hpp
================================================


================================================
FILE: kt-kernel/pyproject.toml
================================================
[build-system]
# Minimum versions: setuptools for setup.py declarative usage, wheel for bdist_wheel
requires = ["setuptools>=61", "wheel", "cmake>=3.16", "pybind11"]
build-backend = "setuptools.build_meta"

[project]
name = "kt-kernel"
# Version is dynamically read from ../version.py via setup.py
dynamic = ["version"]
description = "KT-Kernel: High-performance kernel operations for KTransformers (AMX/AVX/KML optimizations)"
readme = "README.md"
authors = [{ name = "kvcache-ai" }]
# Use SPDX string form (table form deprecated in newer setuptools)
license = "Apache-2.0"
classifiers = [
  "Programming Language :: Python :: 3",
  "Programming Language :: C++",
  "Operating System :: POSIX :: Linux",
  "Operating System :: MacOS",
]
requires-python = ">=3.8"
dependencies = [
  # Core dependencies
  "torch>=2.0.0",
  "safetensors>=0.4.0",
  "compressed-tensors>=0.7.0",
  "numpy>=1.24.0",
  "triton>=2.0.0",
  "gguf>=0.17.0",
  # CLI dependencies
  "typer[all]>=0.9.0",
  "rich>=13.0.0",
  "pyyaml>=6.0",
  "httpx>=0.25.0",
  "packaging>=23.0",
  # SGLang (kvcache-ai fork)
  "sglang-kt",
  # Development dependencies
  "black>=25.9.0",
]

[project.optional-dependencies]
test = [
  "pytest>=7.0.0",
  "psutil>=5.9.0",
]

[project.scripts]
kt = "kt_kernel.cli.main:main"

[project.urls]
Homepage = "https://github.com/kvcache-ai"

[tool.setuptools]
packages = [
  "kt_kernel",
  "kt_kernel.utils",
  "kt_kernel.cli",
  "kt_kernel.cli.commands",
  "kt_kernel.cli.config",
  "kt_kernel.cli.utils",
  "kt_kernel.cli.completions",
]
include-package-data = true

[tool.setuptools.package-dir]
kt_kernel = "python"
"kt_kernel.utils" = "python/utils"
"kt_kernel.cli" = "python/cli"
"kt_kernel.cli.commands" = "python/cli/commands"
"kt_kernel.cli.config" = "python/cli/config"
"kt_kernel.cli.utils" = "python/cli/utils"
"kt_kernel.cli.completions" = "python/cli/completions"

[tool.setuptools.package-data]
"kt_kernel.cli.completions" = ["*.bash", "*.fish", "_kt"]

[tool.setuptools.exclude-package-data]
# (empty)

[tool.cpuinfer]
# Custom section (example). You can place build options documentation here.
# CPUINFER_CPU_INSTRUCT: NATIVE|FANCY|AVX512|AVX2
# CPUINFER_ENABLE_AMX: ON/OFF
# CPUINFER_VERBOSE: 1/0

[tool.black]
# Code style for Black formatter
line-length = 120
target-version = ["py311"]
exclude = '''
(
  /(\.
    | build
    | dist
    | temp
    | __pycache__
    | kt_kernel\.egg-info
    | third_party
  )/
)
'''


================================================
FILE: kt-kernel/pytest.ini
================================================
[pytest]
# Test paths
testpaths = test/per_commit

# File and function naming conventions
python_files = test_*.py
python_classes = Test*
python_functions = test_*

# Markers for hardware backends
markers =
    cpu: CPU backend tests (Intel AMX/AVX512/AVX2)
    cuda: CUDA backend tests (NVIDIA GPUs)
    amd: AMD backend tests (ROCm)
    slow: Slow-running tests (>60 seconds)
    requires_model: Tests requiring model files

# Output options
addopts =
    -v
    --tb=short
    --strict-markers

# Filter warnings
filterwarnings =
    ignore::DeprecationWarning
    ignore::PendingDeprecationWarning


================================================
FILE: kt-kernel/python/__init__.py
================================================
# KT-Kernel: High-performance kernel operations for KTransformers
# SPDX-License-Identifier: Apache-2.0

"""
KT-Kernel provides high-performance kernel operations for KTransformers,
including CPU-optimized MoE inference with AMX, AVX, and KML support.

The package automatically detects your CPU capabilities and loads the optimal
kernel variant (AMX, AVX512, or AVX2) at runtime.

Example usage:
    >>> from kt_kernel import KTMoEWrapper
    >>> wrapper = KTMoEWrapper(
    ...     layer_idx=0,
    ...     num_experts=8,
    ...     num_experts_per_tok=2,
    ...     hidden_size=4096,
    ...     moe_intermediate_size=14336,
    ...     num_gpu_experts=2,
    ...     cpuinfer_threads=32,
    ...     threadpool_count=2,
    ...     weight_path="/path/to/weights",
    ...     chunked_prefill_size=512,
    ...     method="AMXINT4"
    ... )

    Check which CPU variant is loaded:
    >>> import kt_kernel
    >>> print(kt_kernel.__cpu_variant__)  # 'amx', 'avx512', or 'avx2'

Environment Variables:
    KT_KERNEL_CPU_VARIANT: Override automatic detection ('amx', 'avx512', 'avx2')
    KT_KERNEL_DEBUG: Enable debug output ('1' to enable)
"""

from __future__ import annotations

# Detect CPU and load optimal extension variant
from ._cpu_detect import initialize as _initialize_cpu

_kt_kernel_ext, __cpu_variant__ = _initialize_cpu()

# Make the extension module available to other modules in this package
import sys

sys.modules["kt_kernel_ext"] = _kt_kernel_ext

# Also expose kt_kernel_ext as an attribute for backward compatibility
kt_kernel_ext = _kt_kernel_ext

# Import main API
from .experts import KTMoEWrapper
from .experts_base import generate_gpu_experts_masks

# Read version from package metadata (preferred) or fallback to project root
try:
    # Try to get version from installed package metadata (works in installed environment)
    from importlib.metadata import version, PackageNotFoundError

    try:
        __version__ = version("kt-kernel")
    except PackageNotFoundError:
        # Package not installed, try to read from source tree version.py
        import os

        _root_version_file = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "version.py")
        if os.path.exists(_root_version_file):
            _version_ns = {}
            with open(_root_version_file, "r", encoding="utf-8") as f:
                exec(f.read(), _version_ns)
            __version__ = _version_ns.get("__version__", "0.4.3")
        else:
            __version__ = "0.4.3"
except ImportError:
    # Python < 3.8, fallback to pkg_resources or hardcoded version
    try:
        from pkg_resources import get_distribution, DistributionNotFound

        try:
            __version__ = get_distribution("kt-kernel").version
        except DistributionNotFound:
            __version__ = "0.4.3"
    except ImportError:
        __version__ = "0.4.3"

__all__ = ["KTMoEWrapper", "generate_gpu_experts_masks", "kt_kernel_ext", "__cpu_variant__", "__version__"]


================================================
FILE: kt-kernel/python/_cpu_detect.py
================================================
"""
CPU feature detection and optimal kernel loader for kt-kernel.

This module automatically detects CPU capabilities and loads the best available
kernel variant (AMX, AVX512, or AVX2) at runtime.

Environment Variables:
    KT_KERNEL_CPU_VARIANT: Override automatic detection ('amx', 'avx512', 'avx2')
    KT_KERNEL_DEBUG: Enable debug output ('1' to enable)

Example:
    >>> import kt_kernel
    >>> print(kt_kernel.__cpu_variant__)  # Shows detected variant

    # Override detection
    >>> import os
    >>> os.environ['KT_KERNEL_CPU_VARIANT'] = 'avx2'
    >>> import kt_kernel  # Will use AVX2 variant
"""

import os
import sys
from pathlib import Path


def detect_cpu_features():
    """
    Detect CPU features and determine the best kernel variant using progressive matching.

    Progressive variant hierarchy (from most to least advanced):
        1. AMX: amx_tile, amx_int8, amx_bf16 + full AVX512
        2. AVX512_BF16: avx512f, avx512bw, avx512_vnni, avx512_vbmi, avx512_bf16
        3. AVX512_VBMI: avx512f, avx512bw, avx512_vnni, avx512_vbmi
        4. AVX512_VNNI: avx512f, avx512bw, avx512_vnni
        5. AVX512_BASE: avx512f, avx512bw
        6. AVX2: avx2 (fallback)

    Returns:
        str: Variant name - one of: 'amx', 'avx512_bf16', 'avx512_vbmi',
             'avx512_vnni', 'avx512_base', 'avx2'
    """
    # Check environment override
    variant = os.environ.get("KT_KERNEL_CPU_VARIANT", "").lower()
    valid_variants = ["amx", "avx512_bf16", "avx512_vbmi", "avx512_vnni", "avx512_base", "avx2"]
    if variant in valid_variants:
        if os.environ.get("KT_KERNEL_DEBUG") == "1":
            print(f"[kt-kernel] Using environment override: {variant}")
        return variant

    # Try to read /proc/cpuinfo on Linux
    try:
        with open("/proc/cpuinfo", "r") as f:
            cpuinfo = f.read().lower()

        # Extract CPU flags into a set for fast lookup
        cpu_flags = set()
        for line in cpuinfo.split("\n"):
            if line.startswith("flags"):
                flags_str = line.split(":", 1)[1]
                cpu_flags = set(flags_str.split())
                break

        # Define variant requirements in priority order (best to worst)
        variant_requirements = [
            (
                "amx",
                [
                    "amx_tile",
                    "amx_int8",
                    "amx_bf16",
                    "avx512f",
                    "avx512bw",
                    "avx512_vnni",
                    "avx512_vbmi",
                    "avx512_bf16",
                ],
            ),
            ("avx512_bf16", ["avx512f", "avx512bw", "avx512_vnni", "avx512_vbmi", "avx512_bf16"]),
            ("avx512_vbmi", ["avx512f", "avx512bw", "avx512_vnni", "avx512_vbmi"]),
            ("avx512_vnni", ["avx512f", "avx512bw", "avx512_vnni"]),
            ("avx512_base", ["avx512f", "avx512bw"]),
            ("avx2", ["avx2"]),
        ]

        # Find the best matching variant
        for variant_name, required_flags in variant_requirements:
            # Check if all required flags are present
            # Handle flag name variations (e.g., avx512_bf16 vs avx512bf16)
            has_all_flags = True
            for flag in required_flags:
                # Try exact match first, then without underscore
                flag_alt = flag.replace("_", "")
                if flag not in cpu_flags and flag_alt not in cpu_flags:
                    has_all_flags = False
                    break

            if has_all_flags:
                if os.environ.get("KT_KERNEL_DEBUG") == "1":
                    print(f"[kt-kernel] Detected {variant_name} support via /proc/cpuinfo")
                    print(f"[kt-kernel] Matched flags: {', '.join(required_flags)}")
                return variant_name

        # Fallback to AVX2 (should be rare on modern CPUs)
        if os.environ.get("KT_KERNEL_DEBUG") == "1":
            print("[kt-kernel] No supported features detected, using AVX2 fallback")
        return "avx2"

    except FileNotFoundError:
        # /proc/cpuinfo doesn't exist (not Linux or in container)
        # Try cpufeature package as fallback
        if os.environ.get("KT_KERNEL_DEBUG") == "1":
            print("[kt-kernel] /proc/cpuinfo not found, trying cpufeature package")

        try:
            import cpufeature

            # Define variant requirements in priority order (using cpufeature naming)
            cpufeature_requirements = [
                (
                    "amx",
                    [
                        "AMX_TILE",
                        "AMX_INT8",
                        "AMX_BF16",
                        "AVX512F",
                        "AVX512BW",
                        "AVX512_VNNI",
                        "AVX512_VBMI",
                        "AVX512_BF16",
                    ],
                ),
                ("avx512_bf16", ["AVX512F", "AVX512BW", "AVX512_VNNI", "AVX512_VBMI", "AVX512_BF16"]),
                ("avx512_vbmi", ["AVX512F", "AVX512BW", "AVX512_VNNI", "AVX512_VBMI"]),
                ("avx512_vnni", ["AVX512F", "AVX512BW", "AVX512_VNNI"]),
                ("avx512_base", ["AVX512F", "AVX512BW"]),
                ("avx2", ["AVX2"]),
            ]

            # Find the best matching variant
            for variant_name, required_features in cpufeature_requirements:
                has_all_features = all(cpufeature.CPUFeature.get(feat, False) for feat in required_features)
                if has_all_features:
                    if os.environ.get("KT_KERNEL_DEBUG") == "1":
                        print(f"[kt-kernel] Detected {variant_name} support via cpufeature")
                    return variant_name

            # Fallback to AVX2
            if os.environ.get("KT_KERNEL_DEBUG") == "1":
                print("[kt-kernel] Using AVX2 fallback via cpufeature")
            return "avx2"

        except ImportError:
            # cpufeature not available - ultimate fallback
            if os.environ.get("KT_KERNEL_DEBUG") == "1":
                print("[kt-kernel] cpufeature not available, using AVX2 fallback")
            return "avx2"

    except Exception as e:
        # Any other error - safe fallback
        if os.environ.get("KT_KERNEL_DEBUG") == "1":
            print(f"[kt-kernel] Error during CPU detection: {e}, using AVX2 fallback")
        return "avx2"


def load_extension(variant):
    """
    Load the appropriate kt_kernel_ext variant.

    Tries to import the specified variant, with automatic fallback to
    lower-performance variants if the requested one is not available.

    Supports both multi-variant builds (_kt_kernel_ext_amx.*.so) and
    single-variant builds (kt_kernel_ext.*.so).

    Fallback chain (each variant falls back to the next in line):
        amx -> avx512_bf16 -> avx512_vbmi -> avx512_vnni -> avx512_base -> avx2 -> single-variant

    Args:
        variant (str): One of 'amx', 'avx512_bf16', 'avx512_vbmi', 'avx512_vnni', 'avx512_base', 'avx2'

    Returns:
        module: The loaded extension module

    Raises:
        ImportError: If all variants fail to load
    """
    import importlib.util
    import glob

    # The .so files can be named in two ways:
    # Multi-variant: _kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so
    # Single-variant: kt_kernel_ext.cpython-311-x86_64-linux-gnu.so
    # Both export PyInit_kt_kernel_ext (the original module name)

    try:
        # Find the kt_kernel package directory
        # We can't import kt_kernel here (circular import), so use __file__
        kt_kernel_dir = os.path.dirname(os.path.abspath(__file__))

        # Try multi-variant naming first
        pattern = os.path.join(kt_kernel_dir, f"_kt_kernel_ext_{variant}.*.so")
        so_files = glob.glob(pattern)

        if not so_files:
            # Try single-variant naming (fallback for builds without CPUINFER_BUILD_ALL_VARIANTS)
            pattern = os.path.join(kt_kernel_dir, "kt_kernel_ext.*.so")
            so_files = glob.glob(pattern)

            if so_files:
                if os.environ.get("KT_KERNEL_DEBUG") == "1":
                    print(f"[kt-kernel] Multi-variant {variant} not found, using single-variant build")
            else:
                raise ImportError(
                    f"No .so file found for variant {variant} (tried patterns: {kt_kernel_dir}/_kt_kernel_ext_{variant}.*.so and {kt_kernel_dir}/kt_kernel_ext.*.so)"
                )

        so_file = so_files[0]

        if os.environ.get("KT_KERNEL_DEBUG") == "1":
            print(f"[kt-kernel] Loading {variant} from: {so_file}")

        # Load the module manually
        # The module exports PyInit_kt_kernel_ext, so we use that as the module name
        spec = importlib.util.spec_from_file_location("kt_kernel_ext", so_file)
        if spec is None or spec.loader is None:
            raise ImportError(f"Failed to create spec for {so_file}")

        ext = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(ext)

        if os.environ.get("KT_KERNEL_DEBUG") == "1":
            print(f"[kt-kernel] Successfully loaded {variant.upper()} variant")
        return ext

    except (ImportError, ModuleNotFoundError, FileNotFoundError) as e:
        if os.environ.get("KT_KERNEL_DEBUG") == "1":
            print(f"[kt-kernel] Failed to load {variant} variant: {e}")

        # Define fallback chain: each variant falls back to the next lower one
        fallback_chain = {
            "amx": "avx512_bf16",
            "avx512_bf16": "avx512_vbmi",
            "avx512_vbmi": "avx512_vnni",
            "avx512_vnni": "avx512_base",
            "avx512_base": "avx2",
            "avx2": None,  # No fallback - terminal variant
        }

        # Get next fallback variant
        next_variant = fallback_chain.get(variant)

        if next_variant:
            # Try next variant in the chain
            if os.environ.get("KT_KERNEL_DEBUG") == "1":
                print(f"[kt-kernel] Falling back from {variant} to {next_variant}")
            return load_extension(next_variant)
        else:
            # AVX2 is the last fallback - if this fails, we can't continue
            raise ImportError(
                f"Failed to load kt_kernel extension (variant: {variant}). "
                f"Original error: {e}\n"
                f"This usually means the kt_kernel package is not properly installed."
            )


def initialize():
    """
    Detect CPU capabilities and load the optimal extension variant.

    This is the main entry point called by kt_kernel.__init__.py.

    Returns:
        tuple: (extension_module, variant_name)
    - extension_module: The loaded C++ extension module
            - variant_name: String indicating which variant was loaded ('amx', 'avx512', 'avx2')

    Example:
        >>> ext, variant = initialize()
        >>> print(f"Loaded {variant} variant")
        >>> wrapper = ext.AMXMoEWrapper(...)
    """
    # Detect CPU features
    variant = detect_cpu_features()

    if os.environ.get("KT_KERNEL_DEBUG") == "1":
        print(f"[kt-kernel] Selected CPU variant: {variant}")

    # Load the appropriate extension
    ext = load_extension(variant)

    if os.environ.get("KT_KERNEL_DEBUG") == "1":
        print(f"[kt-kernel] Extension module loaded: {ext.__name__}")

    return ext, variant


================================================
FILE: kt-kernel/python/cli/__init__.py
================================================
"""
KTransformers CLI - A unified command-line interface for KTransformers.

This CLI provides a user-friendly interface to all KTransformers functionality,
including model inference, fine-tuning, benchmarking, and more.
"""

__version__ = "0.1.0"


================================================
FILE: kt-kernel/python/cli/commands/__init__.py
================================================
"""
Command modules for kt-cli.
"""


================================================
FILE: kt-kernel/python/cli/commands/bench.py
================================================
"""
Bench commands for kt-cli.

Runs benchmarks for performance testing.
"""

import subprocess
import sys
from enum import Enum
from pathlib import Path
from typing import Optional

import typer

from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import (
    console,
    print_error,
    print_info,
    print_step,
    print_success,
)


class BenchType(str, Enum):
    """Benchmark type."""

    INFERENCE = "inference"
    MLA = "mla"
    MOE = "moe"
    LINEAR = "linear"
    ATTENTION = "attention"
    ALL = "all"


def bench(
    type: BenchType = typer.Option(
        BenchType.ALL,
        "--type",
        "-t",
        help="Benchmark type",
    ),
    model: Optional[str] = typer.Option(
        None,
        "--model",
        "-m",
        help="Model to benchmark",
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Output file for results (JSON)",
    ),
    iterations: int = typer.Option(
        10,
        "--iterations",
        "-n",
        help="Number of iterations",
    ),
) -> None:
    """Run full benchmark suite."""
    console.print()
    print_step(t("bench_starting"))
    print_info(t("bench_type", type=type.value))
    console.print()

    if type == BenchType.ALL:
        _run_all_benchmarks(model, output, iterations)
    elif type == BenchType.INFERENCE:
        _run_inference_benchmark(model, output, iterations)
    elif type == BenchType.MLA:
        _run_component_benchmark("mla", output, iterations)
    elif type == BenchType.MOE:
        _run_component_benchmark("moe", output, iterations)
    elif type == BenchType.LINEAR:
        _run_component_benchmark("linear", output, iterations)
    elif type == BenchType.ATTENTION:
        _run_component_benchmark("attention", output, iterations)

    console.print()
    print_success(t("bench_complete"))
    if output:
        console.print(f"  Results saved to: {output}")
    console.print()


def microbench(
    component: str = typer.Argument(
        "moe",
        help="Component to benchmark (moe, mla, linear, attention)",
    ),
    batch_size: int = typer.Option(
        1,
        "--batch-size",
        "-b",
        help="Batch size",
    ),
    seq_len: int = typer.Option(
        1,
        "--seq-len",
        "-s",
        help="Sequence length",
    ),
    iterations: int = typer.Option(
        100,
        "--iterations",
        "-n",
        help="Number of iterations",
    ),
    warmup: int = typer.Option(
        10,
        "--warmup",
        "-w",
        help="Warmup iterations",
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Output file for results (JSON)",
    ),
) -> None:
    """Run micro-benchmark for specific components."""
    console.print()
    console.print(f"[yellow]{t('feature_coming_soon')}[/yellow]")
    console.print()
    raise typer.Exit(0)

    # Try to find the benchmark script
    kt_kernel_path = _find_kt_kernel_path()

    if kt_kernel_path is None:
        print_error("kt-kernel not found. Install with: kt install inference")
        raise typer.Exit(1)

    bench_dir = kt_kernel_path / "bench"

    # Map component to script
    component_scripts = {
        "moe": "bench_moe.py",
        "mla": "bench_mla.py",
        "linear": "bench_linear.py",
        "attention": "bench_attention.py",
        "mlp": "bench_mlp.py",
    }

    script_name = component_scripts.get(component.lower())
    if script_name is None:
        print_error(f"Unknown component: {component}")
        console.print(f"Available: {', '.join(component_scripts.keys())}")
        raise typer.Exit(1)

    script_path = bench_dir / script_name
    if not script_path.exists():
        print_error(f"Benchmark script not found: {script_path}")
        raise typer.Exit(1)

    # Run benchmark
    cmd = [
        sys.executable,
        str(script_path),
        "--batch-size",
        str(batch_size),
        "--seq-len",
        str(seq_len),
        "--iterations",
        str(iterations),
        "--warmup",
        str(warmup),
    ]

    if output:
        cmd.extend(["--output", str(output)])

    console.print(f"[dim]$ {' '.join(cmd)}[/dim]")
    console.print()

    try:
        process = subprocess.run(cmd)

        if process.returncode == 0:
            console.print()
            print_success(t("bench_complete"))
            if output:
                console.print(f"  Results saved to: {output}")
        else:
            print_error(f"Benchmark failed with exit code {process.returncode}")
            raise typer.Exit(process.returncode)

    except FileNotFoundError as e:
        print_error(f"Failed to run benchmark: {e}")
        raise typer.Exit(1)


def _find_kt_kernel_path() -> Optional[Path]:
    """Find the kt-kernel installation path."""
    try:
        import kt_kernel

        return Path(kt_kernel.__file__).parent.parent
    except ImportError:
        pass

    # Check common locations
    possible_paths = [
        Path.home() / "Projects" / "ktransformers" / "kt-kernel",
        Path("/opt/ktransformers/kt-kernel"),
        Path.cwd() / "kt-kernel",
    ]

    for path in possible_paths:
        if path.exists() and (path / "bench").exists():
            return path

    return None


def _run_all_benchmarks(model: Optional[str], output: Optional[Path], iterations: int) -> None:
    """Run all benchmarks."""
    components = ["moe", "mla", "linear", "attention"]

    for component in components:
        console.print(f"\n[bold]Running {component} benchmark...[/bold]")
        _run_component_benchmark(component, None, iterations)


def _run_inference_benchmark(model: Optional[str], output: Optional[Path], iterations: int) -> None:
    """Run inference benchmark."""
    if model is None:
        print_error("Model required for inference benchmark. Use --model flag.")
        raise typer.Exit(1)

    print_info(f"Running inference benchmark on {model}...")
    console.print()
    console.print("[dim]This will start the server and run test requests.[/dim]")
    console.print()

    # TODO: Implement actual inference benchmarking
    print_error("Inference benchmarking not yet implemented.")


def _run_component_benchmark(component: str, output: Optional[Path], iterations: int) -> None:
    """Run a component benchmark."""
    kt_kernel_path = _find_kt_kernel_path()

    if kt_kernel_path is None:
        print_error("kt-kernel not found.")
        return

    bench_dir = kt_kernel_path / "bench"
    script_map = {
        "moe": "bench_moe.py",
        "mla": "bench_mla.py",
        "linear": "bench_linear.py",
        "attention": "bench_attention.py",
    }

    script_name = script_map.get(component)
    if script_name is None:
        print_error(f"Unknown component: {component}")
        return

    script_path = bench_dir / script_name
    if not script_path.exists():
        print_error(f"Script not found: {script_path}")
        return

    cmd = [sys.executable, str(script_path), "--iterations", str(iterations)]

    try:
        subprocess.run(cmd)
    except Exception as e:
        print_error(f"Benchmark failed: {e}")


================================================
FILE: kt-kernel/python/cli/commands/chat.py
================================================
"""
Chat command for kt-cli.

Provides interactive chat interface with running model server.
"""

import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

import typer
from rich.console import Console
from rich.markdown import Markdown
from rich.panel import Panel
from rich.prompt import Prompt, Confirm

from kt_kernel.cli.config.settings import get_settings
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import (
    console,
    print_error,
    print_info,
    print_success,
    print_warning,
)

# Try to import OpenAI SDK
try:
    from openai import OpenAI

    HAS_OPENAI = True
except ImportError:
    HAS_OPENAI = False


def chat(
    host: Optional[str] = typer.Option(
        None,
        "--host",
        "-H",
        help="Server host address",
    ),
    port: Optional[int] = typer.Option(
        None,
        "--port",
        "-p",
        help="Server port",
    ),
    model: Optional[str] = typer.Option(
        None,
        "--model",
        "-m",
        help="Model name (if server hosts multiple models)",
    ),
    temperature: float = typer.Option(
        0.7,
        "--temperature",
        "-t",
        help="Sampling temperature (0.0 to 2.0)",
    ),
    max_tokens: int = typer.Option(
        2048,
        "--max-tokens",
        help="Maximum tokens to generate",
    ),
    system_prompt: Optional[str] = typer.Option(
        None,
        "--system",
        "-s",
        help="System prompt",
    ),
    save_history: bool = typer.Option(
        True,
        "--save-history/--no-save-history",
        help="Save conversation history",
    ),
    history_file: Optional[Path] = typer.Option(
        None,
        "--history-file",
        help="Path to save conversation history",
    ),
    stream: bool = typer.Option(
        True,
        "--stream/--no-stream",
        help="Enable streaming output",
    ),
) -> None:
    """Start interactive chat with a running model server.

    Examples:
        kt chat                          # Connect to default server
        kt chat --host 127.0.0.1 -p 8080 # Connect to specific server
        kt chat -t 0.9 --max-tokens 4096 # Adjust generation parameters
    """
    if not HAS_OPENAI:
        print_error(t("chat_openai_required"))
        console.print()
        console.print(t("chat_install_hint"))
        console.print("  pip install openai")
        raise typer.Exit(1)

    settings = get_settings()

    # Resolve server connection
    final_host = host or settings.get("server.host", "127.0.0.1")
    final_port = port or settings.get("server.port", 30000)

    # Construct base URL for OpenAI-compatible API
    base_url = f"http://{final_host}:{final_port}/v1"

    console.print()
    console.print(
        Panel.fit(
            f"[bold cyan]{t('chat_title')}[/bold cyan]\n\n"
            f"{t('chat_server')}: [yellow]{final_host}:{final_port}[/yellow]\n"
            f"{t('chat_temperature')}: [cyan]{temperature}[/cyan] | {t('chat_max_tokens')}: [cyan]{max_tokens}[/cyan]\n\n"
            f"[dim]{t('chat_help_hint')}[/dim]",
            border_style="cyan",
        )
    )
    console.print()

    # Check for proxy environment variables
    proxy_vars = ["HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy", "ALL_PROXY", "all_proxy"]
    detected_proxies = {var: os.environ.get(var) for var in proxy_vars if os.environ.get(var)}

    if detected_proxies:
        proxy_info = ", ".join(f"{k}={v}" for k, v in detected_proxies.items())
        console.print()
        print_warning(t("chat_proxy_detected"))
        console.print(f"  [dim]{proxy_info}[/dim]")
        console.print()

        use_proxy = Confirm.ask(t("chat_proxy_confirm"), default=False)

        if not use_proxy:
            # Temporarily disable proxy for this connection
            for var in proxy_vars:
                if var in os.environ:
                    del os.environ[var]
            print_info(t("chat_proxy_disabled"))
            console.print()

    # Initialize OpenAI client
    try:
        client = OpenAI(
            base_url=base_url,
            api_key="EMPTY",  # SGLang doesn't require API key
        )

        # Test connection
        print_info(t("chat_connecting"))
        models = client.models.list()
        available_models = [m.id for m in models.data]

        if not available_models:
            print_error(t("chat_no_models"))
            raise typer.Exit(1)

        # Select model
        if model:
            if model not in available_models:
                print_warning(t("chat_model_not_found", model=model, available=", ".join(available_models)))
                selected_model = available_models[0]
            else:
                selected_model = model
        else:
            selected_model = available_models[0]

        print_success(t("chat_connected", model=selected_model))
        console.print()

        # Load tokenizer for accurate token counting
        tokenizer = None
        try:
            from transformers import AutoTokenizer

            # selected_model is the model path
            tokenizer = AutoTokenizer.from_pretrained(selected_model, trust_remote_code=True)
            console.print(f"[dim]Loaded tokenizer from {selected_model}[/dim]")
            console.print()
        except Exception as e:
            console.print(f"[dim yellow]Warning: Could not load tokenizer, token counts will be estimated[/dim]")
            console.print()

    except Exception as e:
        print_error(t("chat_connect_failed", error=str(e)))
        console.print()
        console.print(t("chat_server_not_running"))
        console.print("  kt run <model>")
        raise typer.Exit(1)

    # Initialize conversation history
    messages = []

    # Add system prompt if provided
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # Setup history file
    if save_history:
        if history_file is None:
            history_dir = settings.config_dir / "chat_history"
            history_dir.mkdir(parents=True, exist_ok=True)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            history_file = history_dir / f"chat_{timestamp}.json"
        else:
            history_file = Path(history_file)
            history_file.parent.mkdir(parents=True, exist_ok=True)

    # Main chat loop
    try:
        while True:
            # Get user input - use console.input() for better CJK character support
            try:
                user_input = console.input(f"[bold green]{t('chat_user_prompt')}[/bold green]: ")
            except (EOFError, KeyboardInterrupt):
                console.print()
                print_info(t("chat_goodbye"))
                break

            if not user_input.strip():
                continue

            # Handle special commands
            if user_input.startswith("/"):
                if _handle_command(user_input, messages, temperature, max_tokens):
                    continue
                else:
                    break  # Exit command

            # Add user message to history
            messages.append({"role": "user", "content": user_input})

            # Generate response
            console.print()
            console.print(f"[bold cyan]{t('chat_assistant_prompt')}[/bold cyan]")

            try:
                if stream:
                    # Streaming response
                    response_content = _stream_response(
                        client, selected_model, messages, temperature, max_tokens, tokenizer
                    )
                else:
                    # Non-streaming response
                    response_content = _generate_response(
                        client, selected_model, messages, temperature, max_tokens, tokenizer
                    )

                # Add assistant response to history
                messages.append({"role": "assistant", "content": response_content})

                console.print()

            except Exception as e:
                print_error(t("chat_generation_error", error=str(e)))
                # Remove the user message that caused the error
                messages.pop()
                continue

            # Save history if enabled
            if save_history:
                _save_history(history_file, messages, selected_model)

    except KeyboardInterrupt:
        console.print()
        console.print()
        print_info(t("chat_interrupted"))

    # Final history save
    if save_history and messages:
        _save_history(history_file, messages, selected_model)
        console.print(f"[dim]{t('chat_history_saved', path=str(history_file))}[/dim]")
        console.print()


def _stream_response(
    client: "OpenAI",
    model: str,
    messages: list,
    temperature: float,
    max_tokens: int,
    tokenizer=None,
) -> str:
    """Generate streaming response and display in real-time."""
    import time

    response_content = ""
    reasoning_content = ""

    # Performance tracking
    first_token_time = None
    chunk_count = 0

    try:
        # Start timing before sending request
        start_time = time.time()

        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True,
        )

        for chunk in stream:
            delta = chunk.choices[0].delta if chunk.choices else None
            if delta:
                reasoning_delta = getattr(delta, "reasoning_content", None)
                if reasoning_delta:
                    if first_token_time is None:
                        first_token_time = time.time()
                    reasoning_content += reasoning_delta
                    console.print(reasoning_delta, end="", style="dim")
                    chunk_count += 1

                if delta.content:
                    if first_token_time is None:
                        first_token_time = time.time()
                    content = delta.content
                    response_content += content
                    console.print(content, end="")
                    chunk_count += 1

        console.print()  # Newline after streaming

        # Display performance metrics
        end_time = time.time()
        if first_token_time and chunk_count > 0:
            ttft = first_token_time - start_time
            total_time = end_time - start_time

            # Calculate TPOT based on chunks
            if chunk_count > 1:
                generation_time = total_time - ttft
                tpot = generation_time / (chunk_count - 1)
            else:
                tpot = 0

            # Calculate accurate token counts using tokenizer
            if tokenizer:
                input_tokens = _count_tokens_with_tokenizer(messages, tokenizer)
                output_tokens = _count_tokens_with_tokenizer(
                    [{"role": "assistant", "content": response_content}], tokenizer
                )
                token_prefix = ""
            else:
                # Fallback to estimation
                input_tokens = _estimate_tokens(messages)
                output_tokens = _estimate_tokens([{"role": "assistant", "content": response_content}])
                token_prefix = "~"

            # Build metrics display
            metrics = f"[dim]Total: {total_time*1000:.0f}ms | TTFT: {ttft*1000:.0f}ms"
            if tpot > 0:
                metrics += f" | TPOT: {tpot*1000:.1f}ms"
            metrics += f" | In: {token_prefix}{input_tokens} | Out: {token_prefix}{output_tokens}"
            metrics += "[/dim]"

            console.print(metrics)

    except Exception as e:
        raise Exception(f"Streaming error: {e}")

    return response_content


def _count_tokens_with_tokenizer(messages: list, tokenizer) -> int:
    """Count tokens accurately using the model's tokenizer."""
    try:
        # Concatenate all message content
        text = ""
        for msg in messages:
            role = msg.get("role", "")
            content = msg.get("content", "")
            # Simple format: role + content
            text += f"{role}: {content}\n"

        # Encode and count tokens - suppress any debug output from custom tokenizers
        import os
        import sys
        from contextlib import redirect_stdout, redirect_stderr

        with open(os.devnull, "w") as devnull:
            with redirect_stdout(devnull), redirect_stderr(devnull):
                tokens = tokenizer.encode(text, add_special_tokens=True)
        return len(tokens)
    except Exception:
        # Fallback to estimation if tokenizer fails
        return _estimate_tokens(messages)


def _estimate_tokens(messages: list) -> int:
    """Estimate token count for messages (rough approximation)."""
    total_chars = 0
    for msg in messages:
        content = msg.get("content", "")
        total_chars += len(content)

    # Rough estimation:
    # - English: ~4 chars per token
    # - Chinese: ~1.5 chars per token
    # Use 2.5 as average
    return max(1, int(total_chars / 2.5))


def _generate_response(
    client: "OpenAI",
    model: str,
    messages: list,
    temperature: float,
    max_tokens: int,
    tokenizer=None,
) -> str:
    """Generate non-streaming response."""
    import time

    try:
        start_time = time.time()

        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=False,
        )

        end_time = time.time()
        total_time = end_time - start_time

        content = response.choices[0].message.content

        # Display as markdown
        md = Markdown(content)
        console.print(md)

        # Calculate accurate token counts using tokenizer
        if tokenizer:
            input_tokens = _count_tokens_with_tokenizer(messages, tokenizer)
            output_tokens = _count_tokens_with_tokenizer([{"role": "assistant", "content": content}], tokenizer)
            token_prefix = ""
        else:
            # Fallback to API usage or estimation
            input_tokens = response.usage.prompt_tokens if response.usage else _estimate_tokens(messages)
            output_tokens = (
                response.usage.completion_tokens
                if response.usage
                else _estimate_tokens([{"role": "assistant", "content": content}])
            )
            token_prefix = "" if response.usage else "~"

        # Display performance metrics
        console.print(
            f"[dim]Time: {total_time*1000:.0f}ms | "
            f"In: {token_prefix}{input_tokens} | Out: {token_prefix}{output_tokens}[/dim]"
        )

        return content

    except Exception as e:
        raise Exception(f"Generation error: {e}")


def _handle_command(command: str, messages: list, temperature: float, max_tokens: int) -> bool:
    """Handle special commands. Returns True to continue chat, False to exit."""
    cmd = command.lower().strip()

    if cmd in ["/quit", "/exit", "/q"]:
        console.print()
        print_info(t("chat_goodbye"))
        return False

    elif cmd in ["/help", "/h"]:
        console.print()
        console.print(
            Panel(
                f"[bold]{t('chat_help_title')}[/bold]\n\n{t('chat_help_content')}",
                title="Help",
                border_style="cyan",
            )
        )
        console.print()
        return True

    elif cmd in ["/clear", "/c"]:
        messages.clear()
        console.print()
        print_success(t("chat_history_cleared"))
        console.print()
        return True

    elif cmd in ["/history", "/hist"]:
        console.print()
        if not messages:
            print_info(t("chat_no_history"))
        else:
            console.print(
                Panel(
                    _format_history(messages),
                    title=t("chat_history_title", count=len(messages)),
                    border_style="cyan",
                )
            )
        console.print()
        return True

    elif cmd in ["/info", "/i"]:
        console.print()
        console.print(
            Panel(
                f"[bold]{t('chat_info_title')}[/bold]\n\n{t('chat_info_content', temperature=temperature, max_tokens=max_tokens, messages=len(messages))}",
                title="Info",
                border_style="cyan",
            )
        )
        console.print()
        return True

    elif cmd in ["/retry", "/r"]:
        if len(messages) >= 2 and messages[-1]["role"] == "assistant":
            # Remove last assistant response
            messages.pop()
            print_info(t("chat_retrying"))
            console.print()
        else:
            print_warning(t("chat_no_retry"))
            console.print()
        return True

    else:
        print_warning(t("chat_unknown_command", command=command))
        console.print(f"[dim]{t('chat_unknown_hint')}[/dim]")
        console.print()
        return True


def _format_history(messages: list) -> str:
    """Format conversation history for display."""
    lines = []
    for i, msg in enumerate(messages, 1):
        role = msg["role"].capitalize()
        content = msg["content"]

        # Truncate long messages
        if len(content) > 200:
            content = content[:200] + "..."

        lines.append(f"[bold]{i}. {role}:[/bold] {content}")

    return "\n\n".join(lines)


def _save_history(file_path: Path, messages: list, model: str) -> None:
    """Save conversation history to file."""
    try:
        history_data = {
            "model": model,
            "timestamp": datetime.now().isoformat(),
            "messages": messages,
        }

        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(history_data, f, indent=2, ensure_ascii=False)

    except Exception as e:
        print_warning(f"Failed to save history: {e}")


================================================
FILE: kt-kernel/python/cli/commands/config.py
================================================
"""
Config command for kt-cli.

Manages kt-cli configuration.
"""

from typing import Optional

import typer
import yaml
from rich.syntax import Syntax

from kt_kernel.cli.config.settings import get_settings
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import confirm, console, print_error, print_success

app = typer.Typer(help="Manage kt-cli configuration")


@app.command(name="init")
def init() -> None:
    """Initialize or re-run the first-time setup wizard."""
    from kt_kernel.cli.main import _show_first_run_setup
    from kt_kernel.cli.config.settings import get_settings

    settings = get_settings()
    _show_first_run_setup(settings)


@app.command(name="show")
def show(
    key: Optional[str] = typer.Argument(None, help="Configuration key to show (e.g., server.port)"),
) -> None:
    """Show current configuration."""
    settings = get_settings()

    if key:
        value = settings.get(key)
        if value is not None:
            if isinstance(value, (dict, list)):
                console.print(yaml.dump({key: value}, default_flow_style=False, allow_unicode=True))
            else:
                console.print(t("config_get_value", key=key, value=value))
        else:
            print_error(t("config_get_not_found", key=key))
            raise typer.Exit(1)
    else:
        console.print(f"\n[bold]{t('config_show_title')}[/bold]\n")
        console.print(f"[dim]{t('config_file_location', path=str(settings.config_path))}[/dim]\n")

        config_yaml = yaml.dump(settings.get_all(), default_flow_style=False, allow_unicode=True)
        syntax = Syntax(config_yaml, "yaml", theme="monokai", line_numbers=False)
        console.print(syntax)


@app.command(name="set")
def set_config(
    key: str = typer.Argument(..., help="Configuration key (e.g., server.port)"),
    value: str = typer.Argument(..., help="Value to set"),
) -> None:
    """Set a configuration value."""
    settings = get_settings()

    # Try to parse value as JSON/YAML for complex types
    parsed_value = _parse_value(value)

    settings.set(key, parsed_value)
    print_success(t("config_set_success", key=key, value=parsed_value))


@app.command(name="get")
def get_config(
    key: str = typer.Argument(..., help="Configuration key (e.g., server.port)"),
) -> None:
    """Get a configuration value."""
    settings = get_settings()
    value = settings.get(key)

    if value is not None:
        if isinstance(value, (dict, list)):
            console.print(yaml.dump(value, default_flow_style=False, allow_unicode=True))
        else:
            console.print(str(value))
    else:
        print_error(t("config_get_not_found", key=key))
        raise typer.Exit(1)


@app.command(name="reset")
def reset(
    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation"),
) -> None:
    """Reset configuration to defaults."""
    if not yes:
        if not confirm(t("config_reset_confirm"), default=False):
            raise typer.Abort()

    settings = get_settings()
    settings.reset()
    print_success(t("config_reset_success"))


@app.command(name="path")
def path() -> None:
    """Show configuration file path."""
    settings = get_settings()
    console.print(str(settings.config_path))


@app.command(name="model-path-list", deprecated=True, hidden=True)
def model_path_list() -> None:
    """[Deprecated] Use 'kt model path-list' instead."""
    console.print("[yellow]⚠ This command is deprecated. Use 'kt model path-list' instead.[/yellow]\n")
    import subprocess
    subprocess.run(["kt", "model", "path-list"])


@app.command(name="model-path-add", deprecated=True, hidden=True)
def model_path_add(
    path: str = typer.Argument(..., help="Path to add"),
) -> None:
    """[Deprecated] Use 'kt model path-add' instead."""
    console.print("[yellow]⚠ This command is deprecated. Use 'kt model path-add' instead.[/yellow]\n")
    import subprocess
    subprocess.run(["kt", "model", "path-add", path])


@app.command(name="model-path-remove", deprecated=True, hidden=True)
def model_path_remove(
    path: str = typer.Argument(..., help="Path to remove"),
) -> None:
    """[Deprecated] Use 'kt model path-remove' instead."""
    console.print("[yellow]⚠ This command is deprecated. Use 'kt model path-remove' instead.[/yellow]\n")
    import subprocess
    subprocess.run(["kt", "model", "path-remove", path])


def _parse_value(value: str):
    """Parse a string value into appropriate Python type."""
    # Try boolean
    if value.lower() in ("true", "yes", "on", "1"):
        return True
    if value.lower() in ("false", "no", "off", "0"):
        return False

    # Try integer
    try:
        return int(value)
    except ValueError:
        pass

    # Try float
    try:
        return float(value)
    except ValueError:
        pass

    # Try YAML/JSON parsing for lists/dicts
    try:
        parsed = yaml.safe_load(value)
        if isinstance(parsed, (dict, list)):
            return parsed
    except yaml.YAMLError:
        pass

    # Return as string
    return value


================================================
FILE: kt-kernel/python/cli/commands/doctor.py
================================================
"""
Doctor command for kt-cli.

Diagnoses environment issues and provides recommendations.
"""

import glob
import os
import platform
import shutil
from pathlib import Path
from typing import Optional

import typer
from rich.table import Table

from kt_kernel.cli.config.settings import get_settings
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import console, print_error, print_info, print_success, print_warning
from kt_kernel.cli.utils.environment import (
    check_docker,
    detect_available_ram_gb,
    detect_cpu_info,
    detect_cuda_version,
    detect_disk_space_gb,
    detect_env_managers,
    detect_gpus,
    detect_memory_info,
    detect_ram_gb,
    get_installed_package_version,
)


def _get_kt_kernel_info() -> dict:
    """Get kt-kernel installation information."""
    info = {
        "installed": False,
        "version": None,
        "cpu_variant": None,
        "install_path": None,
        "available_variants": [],
        "extension_file": None,
    }

    try:
        import kt_kernel

        info["installed"] = True
        info["version"] = getattr(kt_kernel, "__version__", "unknown")
        info["cpu_variant"] = getattr(kt_kernel, "__cpu_variant__", "unknown")

        # Get installation path
        info["install_path"] = os.path.dirname(kt_kernel.__file__)

        # Find available .so files
        kt_kernel_dir = info["install_path"]
        so_files = glob.glob(os.path.join(kt_kernel_dir, "_kt_kernel_ext_*.so"))
        so_files.extend(glob.glob(os.path.join(kt_kernel_dir, "kt_kernel_ext*.so")))

        # Parse variant names from filenames
        variants = set()
        for so_file in so_files:
            basename = os.path.basename(so_file)
            if "_kt_kernel_ext_" in basename:
                # Extract variant from _kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so
                parts = basename.split("_")
                if len(parts) >= 4:
                    variant = parts[3]  # "amx" from "_kt_kernel_ext_amx..."
                    if variant.startswith("avx"):
                        # Normalize avx variants
                        if variant in ["avx512", "avx512_bf16", "avx512_vbmi", "avx512_vnni", "avx512_base"]:
                            variants.add("avx512")
                        else:
                            variants.add(variant)
                    else:
                        variants.add(variant)
            elif "kt_kernel_ext" in basename:
                variants.add("default")

        info["available_variants"] = sorted(list(variants))

        # Get current extension file
        if hasattr(kt_kernel, "kt_kernel_ext"):
            ext_module = kt_kernel.kt_kernel_ext
            info["extension_file"] = getattr(ext_module, "__file__", None)

    except ImportError:
        info["installed"] = False
    except Exception as e:
        info["error"] = str(e)

    return info


def doctor(
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed diagnostics"),
) -> None:
    """Diagnose environment issues."""
    console.print(f"\n[bold]{t('doctor_title')}[/bold]\n")

    issues_found = False
    checks = []

    # 1. Python version
    python_version = platform.python_version()
    python_ok = _check_python_version(python_version)
    checks.append(
        {
            "name": t("doctor_check_python"),
            "status": "ok" if python_ok else "error",
            "value": python_version,
            "hint": "Python 3.10+ required" if not python_ok else None,
        }
    )
    if not python_ok:
        issues_found = True

    # 2. CUDA availability
    cuda_version = detect_cuda_version()
    checks.append(
        {
            "name": t("doctor_check_cuda"),
            "status": "ok" if cuda_version else "warning",
            "value": cuda_version or t("version_cuda_not_found"),
            "hint": "CUDA is optional but recommended for GPU acceleration" if not cuda_version else None,
        }
    )

    # 3. GPU detection
    gpus = detect_gpus()
    if gpus:
        gpu_names = ", ".join(g.name for g in gpus)
        total_vram = sum(g.vram_gb for g in gpus)
        checks.append(
            {
                "name": t("doctor_check_gpu"),
                "status": "ok",
                "value": t("doctor_gpu_found", count=len(gpus), names=gpu_names),
                "hint": f"Total VRAM: {total_vram}GB",
            }
        )
    else:
        checks.append(
            {
                "name": t("doctor_check_gpu"),
                "status": "warning",
                "value": t("doctor_gpu_not_found"),
                "hint": "GPU recommended for best performance",
            }
        )

    # 4. CPU information
    cpu_info = detect_cpu_info()
    checks.append(
        {
            "name": t("doctor_check_cpu"),
            "status": "ok",
            "value": t("doctor_cpu_info", name=cpu_info.name, cores=cpu_info.cores, threads=cpu_info.threads),
            "hint": None,
        }
    )

    # 5. CPU instruction sets (critical for kt-kernel)
    isa_list = cpu_info.instruction_sets
    # Check for recommended instruction sets
    recommended_isa = {"AVX2", "AVX512F", "AMX-INT8"}
    has_recommended = bool(set(isa_list) & recommended_isa)
    has_avx2 = "AVX2" in isa_list
    has_avx512 = any(isa.startswith("AVX512") for isa in isa_list)
    has_amx = any(isa.startswith("AMX") for isa in isa_list)

    # Determine status and build display string
    if has_amx:
        isa_status = "ok"
        isa_hint = "AMX available - best performance for INT4/INT8"
    elif has_avx512:
        isa_status = "ok"
        isa_hint = "AVX512 available - good performance"
    elif has_avx2:
        isa_status = "warning"
        isa_hint = "AVX2 only - consider upgrading CPU for better performance"
    else:
        isa_status = "error"
        isa_hint = "AVX2 required for kt-kernel"

    # Show top instruction sets (prioritize important ones)
    display_isa = isa_list[:8] if len(isa_list) > 8 else isa_list
    isa_display = ", ".join(display_isa)
    if len(isa_list) > 8:
        isa_display += f" (+{len(isa_list) - 8} more)"

    checks.append(
        {
            "name": t("doctor_check_cpu_isa"),
            "status": isa_status,
            "value": isa_display if isa_display else "None detected",
            "hint": isa_hint,
        }
    )

    # 6. NUMA topology
    numa_detail = []
    for node, cpus in sorted(cpu_info.numa_info.items()):
        if len(cpus) > 6:
            cpu_str = f"{cpus[0]}-{cpus[-1]}"
        else:
            cpu_str = ",".join(str(c) for c in cpus)
        numa_detail.append(f"{node}: {cpu_str}")

    numa_value = t("doctor_numa_info", nodes=cpu_info.numa_nodes)
    if verbose and numa_detail:
        numa_value += " (" + "; ".join(numa_detail) + ")"

    checks.append(
        {
            "name": t("doctor_check_numa"),
            "status": "ok",
            "value": numa_value,
            "hint": f"{cpu_info.threads // cpu_info.numa_nodes} threads per node" if cpu_info.numa_nodes > 1 else None,
        }
    )

    # 6b. kt-kernel installation check
    kt_info = _get_kt_kernel_info()

    if kt_info["installed"]:
        # Build display string for kt-kernel
        variant = kt_info["cpu_variant"]
        version = kt_info["version"]
        available_variants = kt_info["available_variants"]

        # Determine status based on CPU variant
        if variant == "amx":
            kt_status = "ok"
            kt_hint = "AMX variant loaded - optimal performance"
        elif variant.startswith("avx512"):
            kt_status = "ok"
            kt_hint = "AVX512 variant loaded - good performance"
        elif variant == "avx2":
            kt_status = "warning"
            kt_hint = "AVX2 variant - consider upgrading CPU for AMX/AVX512"
        else:
            kt_status = "warning"
            kt_hint = f"Unknown variant: {variant}"

        kt_value = f"v{version} ({variant.upper()})"
        if verbose and available_variants:
            kt_value += f" [dim] - available: {', '.join(available_variants)}[/dim]"

        checks.append(
            {
                "name": "kt-kernel",
                "status": kt_status,
                "value": kt_value,
                "hint": kt_hint,
            }
        )

        # Show extension file path in verbose mode
        if verbose and kt_info.get("extension_file"):
            ext_file = os.path.basename(kt_info["extension_file"])
            checks.append(
                {
                    "name": "  └─ Extension",
                    "status": "ok",
                    "value": ext_file,
                    "hint": None,
                }
            )

        # Show installation path in verbose mode
        if verbose and kt_info.get("install_path"):
            checks.append(
                {
                    "name": "  └─ Path",
                    "status": "ok",
                    "value": kt_info["install_path"],
                    "hint": None,
                }
            )
    else:
        error_msg = kt_info.get("error", "Not installed")
        checks.append(
            {
                "name": "kt-kernel",
                "status": "error",
                "value": error_msg,
                "hint": "kt-kernel is required - run: pip install kt-kernel",
            }
        )
        issues_found = True

    # 7. System memory (with frequency if available)
    mem_info = detect_memory_info()
    if mem_info.frequency_mhz and mem_info.type:
        mem_value = t(
            "doctor_memory_freq",
            available=f"{mem_info.available_gb}GB",
            total=f"{mem_info.total_gb}GB",
            freq=mem_info.frequency_mhz,
            type=mem_info.type,
        )
    else:
        mem_value = t("doctor_memory_info", available=f"{mem_info.available_gb}GB", total=f"{mem_info.total_gb}GB")

    ram_ok = mem_info.total_gb >= 32
    checks.append(
        {
            "name": t("doctor_check_memory"),
            "status": "ok" if ram_ok else "warning",
            "value": mem_value,
            "hint": "32GB+ RAM recommended for large models" if not ram_ok else None,
        }
    )

    # 8. Disk space - check all model paths
    settings = get_settings()
    model_paths = settings.get_model_paths()

    # Check all configured model paths
    for i, disk_path in enumerate(model_paths):
        available_disk, total_disk = detect_disk_space_gb(str(disk_path))
        disk_ok = available_disk >= 100

        # For multiple paths, add index to name
        path_label = f"Model Path {i+1}" if len(model_paths) > 1 else t("doctor_check_disk")

        checks.append(
            {
                "name": path_label,
                "status": "ok" if disk_ok else "warning",
                "value": t("doctor_disk_info", available=f"{available_disk}GB", path=str(disk_path)),
                "hint": "100GB+ free space recommended for model storage" if not disk_ok else None,
            }
        )

    # 6. Required packages
    packages = [
        ("kt-kernel", ">=0.4.0", False),  # name, version_req, required
        ("sglang", ">=0.4.0", False),
        ("torch", ">=2.4.0", True),
        ("transformers", ">=4.45.0", True),
    ]

    package_issues = []
    for pkg_name, version_req, required in packages:
        version = get_installed_package_version(pkg_name)
        if version:
            package_issues.append((pkg_name, version, "ok"))
        elif required:
            package_issues.append((pkg_name, t("version_not_installed"), "error"))
            issues_found = True
        else:
            package_issues.append((pkg_name, t("version_not_installed"), "warning"))

    if verbose:
        checks.append(
            {
                "name": t("doctor_check_packages"),
                "status": "ok" if not any(p[2] == "error" for p in package_issues) else "error",
                "value": f"{sum(1 for p in package_issues if p[2] == 'ok')}/{len(package_issues)} installed",
                "packages": package_issues,
            }
        )

    # 7. SGLang installation source check
    from kt_kernel.cli.utils.sglang_checker import check_sglang_installation, check_sglang_kt_kernel_support

    sglang_info = check_sglang_installation()

    if sglang_info["installed"]:
        if sglang_info.get("is_kvcache_fork"):
            # Package name is sglang-kt — this is definitively the kvcache-ai fork
            if sglang_info["from_source"] and sglang_info["git_info"]:
                git_remote = sglang_info["git_info"].get("remote", "unknown")
                git_branch = sglang_info["git_info"].get("branch", "unknown")
                sglang_source_value = f"sglang-kt (Source: {git_remote}, branch: {git_branch})"
            elif sglang_info["editable"]:
                sglang_source_value = "sglang-kt (editable)"
            else:
                sglang_source_value = "sglang-kt"
            sglang_source_status = "ok"
            sglang_source_hint = None
        elif sglang_info["from_source"]:
            if sglang_info["git_info"]:
                git_remote = sglang_info["git_info"].get("remote", "unknown")
                git_branch = sglang_info["git_info"].get("branch", "unknown")
                sglang_source_value = f"Source (GitHub: {git_remote}, branch: {git_branch})"
                sglang_source_status = "ok"
                sglang_source_hint = None
            else:
                sglang_source_value = "Source (editable)"
                sglang_source_status = "ok"
                sglang_source_hint = None
        else:
            sglang_source_value = "PyPI sglang (not kvcache-ai fork)"
            sglang_source_status = "warning"
            sglang_source_hint = t("sglang_pypi_hint")
    else:
        sglang_source_value = "Not installed"
        sglang_source_status = "warning"
        sglang_source_hint = t("sglang_install_hint")

    checks.append(
        {
            "name": "SGLang Source",
            "status": sglang_source_status,
            "value": sglang_source_value,
            "hint": sglang_source_hint,
        }
    )

    # 7b. SGLang kt-kernel support check (only if SGLang is installed)
    kt_kernel_support = {"supported": True}  # Default to True if not checked
    if sglang_info["installed"]:
        # Use cache=False to force re-check in doctor, but silent=True since we show in table
        kt_kernel_support = check_sglang_kt_kernel_support(use_cache=False, silent=True)

        if kt_kernel_support["supported"]:
            kt_kernel_value = t("sglang_kt_kernel_supported")
            kt_kernel_status = "ok"
            kt_kernel_hint = None
        else:
            kt_kernel_value = t("sglang_kt_kernel_not_supported")
            kt_kernel_status = "error"
            kt_kernel_hint = "Reinstall SGLang: pip uninstall sglang -y && pip install sglang-kt (or run ./install.sh from ktransformers root)"
            issues_found = True

        checks.append(
            {
                "name": "SGLang kt-kernel",
                "status": kt_kernel_status,
                "value": kt_kernel_value,
                "hint": kt_kernel_hint,
            }
        )

    # 8. Environment managers
    env_managers = detect_env_managers()
    docker = check_docker()
    env_list = [f"{m.name} {m.version}" for m in env_managers]
    if docker:
        env_list.append(f"docker {docker.version}")

    checks.append(
        {
            "name": "Environment Managers",
            "status": "ok" if env_list else "warning",
            "value": ", ".join(env_list) if env_list else "None found",
            "hint": "conda or docker recommended for installation" if not env_list else None,
        }
    )

    # Display results
    _display_results(checks, verbose)

    # Show SGLang installation instructions if not installed
    if not sglang_info["installed"]:
        from kt_kernel.cli.utils.sglang_checker import print_sglang_install_instructions

        console.print()
        print_sglang_install_instructions()
    # Show kt-kernel installation instructions if SGLang is installed but doesn't support kt-kernel
    elif sglang_info["installed"] and not kt_kernel_support.get("supported", True):
        from kt_kernel.cli.utils.sglang_checker import print_sglang_kt_kernel_instructions

        console.print()
        print_sglang_kt_kernel_instructions()

    # Summary
    console.print()
    if issues_found:
        print_warning(t("doctor_has_issues"))
    else:
        print_success(t("doctor_all_ok"))
    console.print()


def _check_python_version(version: str) -> bool:
    """Check if Python version meets requirements."""
    parts = version.split(".")
    try:
        major, minor = int(parts[0]), int(parts[1])
        return major >= 3 and minor >= 10
    except (IndexError, ValueError):
        return False


def _display_results(checks: list[dict], verbose: bool) -> None:
    """Display diagnostic results."""
    table = Table(show_header=True, header_style="bold")
    table.add_column("Check", style="bold")
    table.add_column("Status", width=8)
    table.add_column("Value")
    if verbose:
        table.add_column("Notes", style="dim")

    for check in checks:
        status = check["status"]
        if status == "ok":
            status_str = f"[green]{t('doctor_status_ok')}[/green]"
        elif status == "warning":
            status_str = f"[yellow]{t('doctor_status_warning')}[/yellow]"
        else:
            status_str = f"[red]{t('doctor_status_error')}[/red]"

        if verbose:
            table.add_row(
                check["name"],
                status_str,
                check["value"],
                check.get("hint", ""),
            )
        else:
            table.add_row(
                check["name"],
                status_str,
                check["value"],
            )

        # Show package details if verbose
        if verbose and "packages" in check:
            for pkg_name, pkg_version, pkg_status in check["packages"]:
                if pkg_status == "ok":
                    pkg_status_str = "[green]✓[/green]"
                elif pkg_status == "warning":
                    pkg_status_str = "[yellow]○[/yellow]"
                else:
                    pkg_status_str = "[red]✗[/red]"

                table.add_row(
                    f"  └─ {pkg_name}",
                    pkg_status_str,
                    pkg_version,
                    "",
                )

    console.print(table)


================================================
FILE: kt-kernel/python/cli/commands/model.py
================================================
"""
Model command for kt-cli.

Manages models: download, list, and storage paths.
"""

import os
from pathlib import Path
from typing import Optional, List

import typer

from kt_kernel.cli.config.settings import get_settings
from kt_kernel.cli.i18n import t, get_lang
from kt_kernel.cli.utils.console import (
    confirm,
    console,
    print_error,
    print_info,
    print_step,
    print_success,
    print_warning,
    prompt_choice,
)


# Common SHA256 status display mapping used across multiple commands
SHA256_STATUS_MAP = {
    "not_checked": "[dim]Not Checked[/dim]",
    "checking": "[yellow]Checking...[/yellow]",
    "passed": "[green]✓ Passed[/green]",
    "failed": "[red]✗ Failed[/red]",
    "no_repo": "[dim]-[/dim]",
}

# Plain text version for panels and verbose output
SHA256_STATUS_MAP_PLAIN = {
    "not_checked": "Not Checked",
    "checking": "Checking...",
    "passed": "✓ Passed",
    "failed": "✗ Failed",
    "no_repo": "-",
}


def is_amx_weights(model_path) -> tuple[bool, int]:
    """
    Determine if a model uses AMX weights and count NUMA nodes.

    Returns:
        (is_amx, numa_count): Tuple where is_amx indicates AMX weights,
        and numa_count is the number of NUMA nodes (0 if not AMX).
    """
    import re
    from pathlib import Path
    from safetensors import safe_open

    model_path = Path(model_path)
    safetensors_files = sorted(model_path.glob("*.safetensors"))

    if not safetensors_files:
        return False, 0

    numa_indices = set()
    numa_pattern = re.compile(r"\.numa\.(\d+)\.")

    # Check first 3 files for NUMA keys
    for file_path in safetensors_files[:3]:
        try:
            with safe_open(file_path, framework="pt", device="cpu") as f:
                for key in f.keys():
                    if ".numa." in key:
                        match = numa_pattern.search(key)
                        if match:
                            numa_indices.add(int(match.group(1)))
        except Exception:
            continue

    if not numa_indices:
        return False, 0

    return True, len(numa_indices)


app = typer.Typer(
    help="Manage models and storage paths",
    invoke_without_command=True,
    no_args_is_help=False,
)


@app.callback()
def callback(ctx: typer.Context) -> None:
    """
    Model management commands.

    Run without arguments to see available models.
    """
    # If no subcommand is provided, show the full model list
    if ctx.invoked_subcommand is None:
        list_models(verbose=False, all_models=False, show_moe=True, no_cache=False)


@app.command(name="download")
def download(
    repo: Optional[str] = typer.Argument(None, help="Repository ID (optional, interactive mode if not provided)"),
    local_dir: Optional[str] = typer.Option(
        None,
        "--local-dir",
        "-d",
        help="Local directory to download to (default: auto-detect from config)",
    ),
    repo_type: Optional[str] = typer.Option(
        None,
        "--repo-type",
        "-t",
        help="Repository type: huggingface or modelscope",
    ),
    resume: bool = typer.Option(
        True,
        "--resume/--no-resume",
        help="Resume incomplete downloads",
    ),
    yes: bool = typer.Option(
        False,
        "--yes",
        "-y",
        help="Skip all prompts and use defaults",
    ),
) -> None:
    """Download model from HuggingFace or ModelScope (interactive mode)."""
    import subprocess
    import os
    from pathlib import Path
    from rich.prompt import Prompt, Confirm
    from rich.table import Table
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry, UserModel
    from kt_kernel.cli.utils.model_scanner import scan_single_path, format_size
    from kt_kernel.cli.utils.model_verifier import check_huggingface_connectivity
    from kt_kernel.cli.utils.download_helper import (
        list_remote_files_hf,
        list_remote_files_ms,
        filter_files_by_pattern,
        calculate_total_size,
        format_file_list_table,
        verify_repo_exists,
    )

    settings = get_settings()
    user_registry = UserModelRegistry()

    console.print()

    # ========== Step 1: Select repository type ==========
    if not repo_type and not yes:
        console.print("[bold cyan]Step 1: Select Repository Source[/bold cyan]\n")
        console.print("  1. HuggingFace")
        console.print("  2. ModelScope")
        console.print()

        choice = Prompt.ask("Select source", choices=["1", "2"], default="1")
        repo_type = "huggingface" if choice == "1" else "modelscope"
        console.print()
    elif not repo_type:
        repo_type = "huggingface"  # Default for --yes mode

    # Validate repo_type
    if repo_type not in ["huggingface", "modelscope"]:
        print_error(f"Invalid repo type: {repo_type}. Must be 'huggingface' or 'modelscope'")
        raise typer.Exit(1)

    # Check HuggingFace connectivity and auto-switch to mirror if needed
    use_mirror = False
    if repo_type == "huggingface":
        with console.status("[dim]Checking HuggingFace connectivity...[/dim]"):
            is_accessible, message = check_huggingface_connectivity(timeout=5)

        if not is_accessible:
            print_warning("HuggingFace Connection Failed")
            console.print()
            console.print(f"  {message}")
            console.print()
            console.print("  [yellow]Auto-switching to HuggingFace mirror:[/yellow] [cyan]hf-mirror.com[/cyan]")
            console.print()
            use_mirror = True

    # ========== Step 2: Input repository ID ==========
    while True:
        if not repo and not yes:
            console.print("[bold cyan]Step 2: Enter Repository ID[/bold cyan]\n")
            console.print("  Examples:")
            console.print("    • HuggingFace: deepseek-ai/DeepSeek-V3")
            console.print("    • ModelScope: Qwen/Qwen3-Coder-480B-A35B-Instruct")
            console.print()

            repo = Prompt.ask("Repository ID")
            console.print()
        elif not repo:
            print_error("Repository ID is required")
            raise typer.Exit(1)

        # Verify repository exists
        with console.status(f"[dim]Verifying repository: {repo}...[/dim]"):
            exists, msg = verify_repo_exists(repo, repo_type, use_mirror)

        if exists:
            print_success(f"✓ Repository found: {repo}")
            console.print()
            break
        else:
            print_error(msg)
            console.print()
            if yes:
                raise typer.Exit(1)
            repo = None  # Reset to ask again

    # ========== Step 3: Input file pattern and preview files ==========
    files_to_download = []
    file_pattern = "*"

    while True:
        if not yes:
            console.print("[bold cyan]Step 3: Select Files to Download[/bold cyan]\n")
            console.print("  File pattern (glob syntax):")
            console.print("    • *                  - All files (default)")
            console.print("    • *.safetensors      - Only safetensors files")
            console.print("    • *.gguf             - Only GGUF files")
            console.print("    • *Q4_K_M.gguf       - Specific GGUF quant")
            console.print()

            pattern_input = Prompt.ask("File pattern", default="*")
            file_pattern = pattern_input
            console.print()

        # Fetch remote file list
        with console.status(f"[dim]Fetching file list from {repo_type}...[/dim]"):
            try:
                if repo_type == "huggingface":
                    all_files = list_remote_files_hf(repo, use_mirror)
                else:
                    all_files = list_remote_files_ms(repo)

                files_to_download = filter_files_by_pattern(all_files, file_pattern)
            except Exception as e:
                print_error(f"Failed to fetch file list: {e}")
                raise typer.Exit(1)

        if not files_to_download:
            print_warning(f"No files match pattern: {file_pattern}")
            console.print()
            if yes:
                raise typer.Exit(1)
            continue  # Ask for pattern again

        # Display matched files
        total_size = calculate_total_size(files_to_download)
        print_success(f"Found {len(files_to_download)} files (Total: {format_size(total_size)})")
        console.print()

        file_table = format_file_list_table(files_to_download, max_display=10)
        console.print(file_table)
        console.print()

        # Confirm or retry
        if yes:
            break

        action = Prompt.ask("Action", choices=["continue", "retry", "cancel"], default="continue")

        if action == "continue":
            console.print()
            break
        elif action == "cancel":
            console.print()
            print_info("Download cancelled")
            console.print()
            return
        # else retry - loop continues

    # ========== Step 4: Select download path ==========
    download_path = None

    if local_dir:
        download_path = Path(os.path.expanduser(local_dir)).resolve()
    elif not yes:
        console.print("[bold cyan]Step 4: Select Download Location[/bold cyan]\n")

        # Get configured model paths
        model_paths = settings.get_model_paths()
        if not model_paths:
            print_error("No model storage paths configured.")
            console.print()
            console.print(f"  Add a path with: [cyan]kt model path-add <path>[/cyan]")
            console.print()
            raise typer.Exit(1)

        # Display configured paths
        console.print("  Configured storage paths:")
        for i, path in enumerate(model_paths, 1):
            console.print(f"    {i}. {path}")
        console.print(f"    {len(model_paths) + 1}. Custom path (manual input)")
        console.print()

        path_choice = Prompt.ask("Select path", choices=[str(i) for i in range(1, len(model_paths) + 2)], default="1")

        if int(path_choice) <= len(model_paths):
            base_path = model_paths[int(path_choice) - 1]
        else:
            custom = Prompt.ask("Enter custom path")
            base_path = Path(os.path.expanduser(custom)).resolve()

        console.print()

        # Ask for folder name
        default_folder = repo.split("/")[-1]
        folder_name = Prompt.ask("Folder name", default=default_folder)

        download_path = base_path / folder_name
        console.print()
    else:
        # --yes mode: use default
        model_paths = settings.get_model_paths()
        if not model_paths:
            print_error("No model storage paths configured.")
            raise typer.Exit(1)

        default_folder = repo.split("/")[-1]
        download_path = model_paths[0] / default_folder

    # ========== Step 5: Confirm and download ==========
    print_info(f"Download destination: {download_path}")
    console.print()

    # Check if path exists
    if download_path.exists():
        existing = user_registry.find_by_path(str(download_path))
        if existing:
            print_warning(f"Model already registered as: {existing.name}")
            console.print()
            if not yes and not Confirm.ask("Re-download anyway?", default=False):
                return
        else:
            print_warning(f"Directory already exists: {download_path}")
            if not yes and not Confirm.ask("Overwrite?", default=False):
                return
        console.print()

    # Final confirmation
    if not yes:
        console.print("[bold]Download Summary:[/bold]")
        console.print(f"  Source:      {repo_type}:{repo}")
        console.print(
            f"  Files:       {len(files_to_download)} files ({format_size(calculate_total_size(files_to_download))})"
        )
        console.print(f"  Pattern:     {file_pattern}")
        console.print(f"  Destination: {download_path}")
        console.print()

        if not Confirm.ask("Start download?", default=True):
            console.print()
            print_info("Download cancelled")
            console.print()
            return

    # Download
    console.print()
    print_step("Downloading model files...")
    console.print()

    # Set mirror for HuggingFace if needed
    original_hf_endpoint = os.environ.get("HF_ENDPOINT")
    if use_mirror and repo_type == "huggingface" and not original_hf_endpoint:
        os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

    try:
        if repo_type == "huggingface":
            from huggingface_hub import snapshot_download

            snapshot_download(
                repo_id=repo,
                local_dir=str(download_path),
                allow_patterns=file_pattern if file_pattern != "*" else None,
                local_dir_use_symlinks=False,
                resume_download=resume,
            )

        else:  # modelscope
            from modelscope.hub.snapshot_download import snapshot_download

            snapshot_download(
                model_id=repo,
                local_dir=str(download_path),
                allow_file_pattern=file_pattern if file_pattern != "*" else None,
            )

    except ImportError as e:
        pkg = "huggingface_hub" if repo_type == "huggingface" else "modelscope"
        print_error(f"{pkg} not installed. Install: pip install {pkg}")
        raise typer.Exit(1)
    except Exception as e:
        print_error(f"Download failed: {e}")
        raise typer.Exit(1)
    finally:
        # Restore HF_ENDPOINT
        if use_mirror and repo_type == "huggingface" and not original_hf_endpoint:
            os.environ.pop("HF_ENDPOINT", None)
        elif original_hf_endpoint:
            os.environ["HF_ENDPOINT"] = original_hf_endpoint

    # ========== Step 6: Scan and register ==========
    console.print()
    print_success("Download complete!")

    console.print()
    print_step("Scanning downloaded model...")

    try:
        scanned = scan_single_path(download_path)
    except Exception as e:
        print_error(f"Failed to scan model: {e}")
        console.print()
        console.print(f"  You can manually add it: [cyan]kt model add {download_path}[/cyan]")
        console.print()
        raise typer.Exit(1)

    if not scanned:
        print_warning("No model files found in downloaded directory.")
        console.print()
        console.print("  Supported formats: .safetensors, .gguf")
        console.print()
        return

    # Auto-generate model name
    model_name = download_path.name
    if user_registry.check_name_conflict(model_name):
        model_name = user_registry.suggest_name(model_name)

    # Create and register model
    user_model = UserModel(
        name=model_name,
        path=str(download_path),
        format=scanned.format,
        repo_type=repo_type,
        repo_id=repo,
        sha256_status="not_checked",
    )

    try:
        user_registry.add_model(user_model)
        console.print()
        print_success(f"Model registered as: {model_name}")
        console.print()
        console.print(f"  View details:     [cyan]kt model info {model_name}[/cyan]")
        console.print(f"  Run model:        [cyan]kt run {model_name}[/cyan]")
        console.print(f"  Verify integrity: [cyan]kt model verify {model_name}[/cyan]")
        console.print()
    except Exception as e:
        print_error(f"Failed to register model: {e}")
        console.print()
        console.print(f"  You can manually add it: [cyan]kt model add {download_path}[/cyan]")
        console.print()
        raise typer.Exit(1)


@app.command(name="list")
def list_models(
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed info including paths"),
    all_models: bool = typer.Option(False, "--all", help="Show all models (reserved for future use)"),
    show_moe: bool = typer.Option(True, "--moe/--no-moe", help="Show MoE model information (default: enabled)"),
    no_cache: bool = typer.Option(False, "--no-cache", help="Force re-analyze all models, ignore cache"),
) -> None:
    """List user-registered models."""
    from rich.table import Table
    from rich.panel import Panel
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
    from kt_kernel.cli.utils.model_scanner import format_size
    import sys
    from pathlib import Path as PathLib

    # Try to import analyze_moe_model from multiple locations
    analyze_moe_model = None
    try:
        # Try 1: From kt_kernel.cli.utils
        from kt_kernel.cli.utils.analyze_moe_model import analyze_moe_model
    except ImportError:
        try:
            # Try 2: From parent directories
            analyze_moe_path = PathLib(__file__).parent.parent.parent.parent.parent.parent / "analyze_moe_model.py"
            if analyze_moe_path.exists():
                sys.path.insert(0, str(analyze_moe_path.parent))
                from analyze_moe_model import analyze_moe_model
        except (ImportError, Exception):
            try:
                # Try 3: Absolute path
                sys.path.insert(0, "/mnt/data2/ljq/ktransformers")
                from analyze_moe_model import analyze_moe_model
            except (ImportError, Exception):
                analyze_moe_model = None

    registry = UserModelRegistry()
    models = registry.list_models()

    console.print()

    if not models:
        print_warning(t("model_no_registered_models"))
        console.print()
        console.print(f"  {t('model_scan_hint')} [cyan]kt model scan[/cyan]")
        console.print(f"  {t('model_add_hint')} [cyan]kt model add <path>[/cyan]")
        console.print()
        return

    # Check for models with non-existent paths and remove them automatically
    models_to_remove = []
    for model in models:
        if not model.path_exists():
            models_to_remove.append(model)

    if models_to_remove:
        console.print(f"[yellow]Found {len(models_to_remove)} model(s) with non-existent paths:[/yellow]")
        for model in models_to_remove:
            console.print(f"  [dim]✗ {model.name}: {model.path}[/dim]")
            registry.remove_model(model.name)
        console.print(f"[green]✓ Automatically removed {len(models_to_remove)} model(s) with missing paths[/green]")
        console.print()

        # Refresh the models list
        models = registry.list_models()

        if not models:
            console.print(f"[dim]No models remaining after cleanup.[/dim]")
            console.print()
            console.print(f"  {t('model_scan_hint')} [cyan]kt model scan[/cyan]")
            console.print(f"  {t('model_add_hint')} [cyan]kt model add <path>[/cyan]")
            console.print()
            return

    if verbose:
        # Verbose mode: detailed cards
        console.print(f"[bold cyan]{t('model_registered_models_title')}[/bold cyan]\n")

        for i, model in enumerate(models, 1):
            # Check if path exists
            path_status = "[green]✓ Exists[/green]" if model.path_exists() else "[red]✗ Missing[/red]"

            # Format repo info
            if model.repo_id:
                repo_abbr = "hf" if model.repo_type == "huggingface" else "ms"
                repo_info = f"{repo_abbr}:{model.repo_id}"
            else:
                repo_info = "-"

            # Format SHA256 status
            sha256_display = SHA256_STATUS_MAP_PLAIN.get(model.sha256_status, model.sha256_status)

            # Calculate folder size if exists
            if model.path_exists():
                from pathlib import Path

                path_obj = Path(model.path)
                try:
                    if model.format == "safetensors":
                        files = list(path_obj.glob("*.safetensors"))
                    else:
                        files = list(path_obj.glob("*.gguf"))

                    total_size = sum(f.stat().st_size for f in files if f.exists())
                    size_str = format_size(total_size)
                    file_count = len(files)
                    size_info = f"{size_str} ({file_count} files)"
                except:
                    size_info = "Unknown"
            else:
                size_info = "-"

            # Create panel content
            content = f"""[bold]Path:[/bold]   {model.path}
[bold]Format:[/bold] {model.format}
[bold]Repo:[/bold]   {repo_info}
[bold]SHA256:[/bold] {sha256_display}
[bold]Size:[/bold]   {size_info}
[bold]Status:[/bold] {path_status}"""

            panel = Panel(content, title=f"[cyan]{model.name}[/cyan]", border_style="cyan", padding=(0, 1))
            console.print(panel)

        console.print()
        console.print(f"[dim]Total: {len(models)} model(s)[/dim]\n")
    else:
        # Compact mode: separate tables by model type
        from rich.align import Align
        from pathlib import Path

        # Categorize models
        gguf_models = []
        amx_models = []
        gpu_models = []

        for model in models:
            if model.format == "gguf":
                gguf_models.append(model)
            elif model.format == "safetensors" and model.path_exists():
                is_amx, numa_count = is_amx_weights(model.path)
                if is_amx:
                    amx_models.append((model, numa_count))
                else:
                    gpu_models.append(model)
            else:
                gpu_models.append(model)

        # Pre-analyze GPU MoE models concurrently if enabled
        moe_results = {}
        moe_failed_models = []  # Track models that failed MoE analysis
        if show_moe and analyze_moe_model and gpu_models:
            from concurrent.futures import ThreadPoolExecutor, as_completed
            import threading

            # Collect GPU models that need MoE analysis
            # Priority: use cached MoE info from UserModel, only analyze if is_moe is None
            models_to_analyze = []
            models_need_update = []  # Track models that need registry update

            for model in gpu_models:
                # Check if MoE info is already cached in UserModel (and not using --no-cache)
                if not no_cache and model.is_moe is not None:
                    # Use cached info from UserModel
                    if model.is_moe:
                        moe_results[model.name] = {
                            "is_moe": True,
                            "num_experts": model.moe_num_experts,
                            "num_experts_per_tok": model.moe_num_experts_per_tok,
                            "cached": True,
                        }
                    # If is_moe is False, don't add to moe_results
                else:
                    # Need to analyze (is_moe is None or --no-cache)
                    path_obj = Path(model.path)
                    models_to_analyze.append((model.name, str(path_obj)))
                    models_need_update.append(model)

            if models_to_analyze:
                # Use lock for thread-safe console output
                print_lock = threading.Lock()
                completed_count = [0]  # Use list to allow modification in nested function

                def analyze_with_progress(model_info):
                    model_name, model_path = model_info
                    try:
                        with print_lock:
                            console.print(f"[dim]Analyzing MoE: {model_name}...[/dim]")
                        result = analyze_moe_model(model_path, use_cache=not no_cache)

                        # Check if analysis returned valid results
                        if result is None or result.get("num_experts", 0) == 0:
                            with print_lock:
                                completed_count[0] += 1
                                console.print(
                                    f"[dim]✗ [{completed_count[0]}/{len(models_to_analyze)}] {model_name} - Not a MoE model or analysis failed[/dim]"
                                )
                            return (model_name, None, "Not a MoE model or analysis failed")

                        with print_lock:
                            completed_count[0] += 1
                            cached_tag = "[green](cached)[/green]" if result and result.get("cached") else ""
                            console.print(
                                f"[dim]✓ [{completed_count[0]}/{len(models_to_analyze)}] {model_name} {cached_tag}[/dim]"
                            )
                        return (model_name, result, None)
                    except Exception as e:
                        with print_lock:
                            completed_count[0] += 1
                            error_msg = str(e)[:80]
                            console.print(
                                f"[dim]✗ [{completed_count[0]}/{len(models_to_analyze)}] {model_name} - Error: {error_msg}[/dim]"
                            )
                        return (model_name, None, error_msg)

                if no_cache:
                    console.print(f"\n[yellow]Force re-analyzing (--no-cache): ignoring cached results[/yellow]")
                console.print(
                    f"\n[cyan]Analyzing {len(models_to_analyze)} MoE model(s) with {min(16, len(models_to_analyze))} threads...[/cyan]\n"
                )

                # Analyze concurrently with up to 16 workers
                with ThreadPoolExecutor(max_workers=16) as executor:
                    futures = {
                        executor.submit(analyze_with_progress, model_info): model_info
                        for model_info in models_to_analyze
                    }

                    for future in as_completed(futures):
                        model_name, result, error = future.result()
                        if error:
                            # Find the model object
                            failed_model = next((m for m in gpu_models if m.name == model_name), None)
                            if failed_model:
                                moe_failed_models.append((failed_model, error))
                                # Update model registry: mark as non-MoE
                                registry.update_model(model_name, {"is_moe": False})
                        else:
                            moe_results[model_name] = result
                            # Update model registry with MoE info
                            if result and result.get("is_moe"):
                                registry.update_model(
                                    model_name,
                                    {
                                        "is_moe": True,
                                        "moe_num_experts": result.get("num_experts"),
                                        "moe_num_experts_per_tok": result.get("num_experts_per_tok"),
                                    },
                                )
                            else:
                                registry.update_model(model_name, {"is_moe": False})

                console.print(f"\n[green]✓ MoE analysis complete[/green]\n")

                # Remove failed models from gpu_models list
                if moe_failed_models:
                    failed_names = {m.name for m, _ in moe_failed_models}
                    gpu_models = [m for m in gpu_models if m.name not in failed_names]

        # Separate MoE and non-MoE GPU models
        moe_gpu_models = []
        non_moe_gpu_models = []
        for model in gpu_models:
            if model.name in moe_results:
                moe_gpu_models.append(model)
            else:
                non_moe_gpu_models.append(model)

        # Count failed MoE models (these are also non-MoE)
        total_non_moe_count = len(non_moe_gpu_models) + len(moe_failed_models)

        # Filter display based on --all flag
        if not all_models:
            # Default: only show MoE models
            gpu_models_to_display = moe_gpu_models
            show_failed_table = False
        else:
            # --all: show all GPU models including non-MoE and failed
            gpu_models_to_display = gpu_models
            show_failed_table = True
            total_non_moe_count = 0  # Don't show hint when displaying all

        # Helper function to create table rows
        def format_model_row(model, moe_info=None, numa_count=None):
            from kt_kernel.cli.utils.model_scanner import format_size

            # Calculate size
            if model.path_exists():
                path_obj = Path(model.path)
                try:
                    if model.format == "safetensors":
                        files = list(path_obj.glob("*.safetensors"))
                    else:
                        files = list(path_obj.glob("*.gguf"))

                    total_size = sum(f.stat().st_size for f in files if f.exists())
                    size_display = format_size(total_size)
                except:
                    size_display = "[dim]-[/dim]"
            else:
                size_display = "[dim]-[/dim]"

            # Format repo info
            if model.repo_id:
                repo_abbr = "hf" if model.repo_type == "huggingface" else "ms"
                repo_display = f"{repo_abbr}:{model.repo_id}"
            else:
                repo_display = "[dim]-[/dim]"

            # Format SHA256 status
            sha256_display = SHA256_STATUS_MAP.get(model.sha256_status, model.sha256_status)

            row = [model.name, model.path, size_display]

            # Add type-specific columns
            if numa_count is not None:
                # AMX model
                row.append(f"[yellow]{numa_count} NUMA[/yellow]")
            elif moe_info:
                # GPU MoE model
                experts_display = f"[yellow]{moe_info['num_experts']}[/yellow]"
                activated_display = f"[green]{moe_info['num_experts_per_tok']}[/green]"
                moe_total_display = f"[cyan]{size_display}[/cyan]"
                row.extend([experts_display, activated_display, moe_total_display])
            elif show_moe and analyze_moe_model and model.format == "safetensors":
                # GPU non-MoE model
                row.extend(["[dim]-[/dim]", "[dim]-[/dim]", "[dim]-[/dim]"])

            row.extend([repo_display, sha256_display])
            return row

        # Display tables
        title = Align.center(f"[bold cyan]{t('model_registered_models_title')}[/bold cyan]")
        console.print(title)
        console.print()

        # Table 1: GGUF Models (Llamafile)
        if gguf_models:
            console.print("[bold yellow]GGUF Models (Llamafile)[/bold yellow]")
            table = Table(show_header=True, header_style="bold")
            table.add_column("#", justify="right", style="cyan", no_wrap=True)
            table.add_column(t("model_column_name"), style="cyan", no_wrap=True)
            table.add_column("Path", style="dim", overflow="fold")
            table.add_column("Total", justify="right")
            table.add_column(t("model_column_repo"), style="dim", overflow="fold")
            table.add_column(t("model_column_sha256"), justify="center")

            for i, model in enumerate(gguf_models, 1):
                row = [str(i)] + format_model_row(model)
                table.add_row(*row)

            console.print(table)
            console.print()

        # Table 2: AMX Models
        if amx_models:
            from kt_kernel.cli.utils.model_scanner import format_size
            import json

            console.print("[bold magenta]AMX Models (CPU)[/bold magenta]")
            table = Table(show_header=True, header_style="bold", show_lines=False)
            table.add_column("#", justify="right", style="cyan", no_wrap=True)
            table.add_column(t("model_column_name"), style="cyan", no_wrap=True)
            table.add_column("Path", style="dim", overflow="fold")
            table.add_column("Total", justify="right")
            table.add_column("Method", justify="center", style="yellow")
            table.add_column("NUMA", justify="center", style="green")
            table.add_column("Source", style="dim", overflow="fold")

            # Build reverse map: AMX model ID -> GPU models using it
            amx_used_by_gpu = {}  # {amx_model_id: [gpu_model_names]}
            for model, _ in amx_models:
                if model.gpu_model_ids:
                    # This AMX is linked to these GPU models
                    gpu_names = []
                    for gpu_id in model.gpu_model_ids:
                        # Find GPU model by ID
                        for gpu_model in gpu_models:
                            if gpu_model.id == gpu_id:
                                gpu_names.append(gpu_model.name)
                                break
                    if gpu_names:
                        amx_used_by_gpu[model.id] = gpu_names

            for i, (model, numa_count) in enumerate(amx_models, 1):
                # Calculate size
                if model.path_exists():
                    path_obj = Path(model.path)
                    try:
                        files = list(path_obj.glob("*.safetensors"))
                        total_size = sum(f.stat().st_size for f in files if f.exists())
                        size_display = format_size(total_size)
                    except:
                        size_display = "[dim]-[/dim]"
                else:
                    size_display = "[dim]-[/dim]"

                # Read AMX metadata from config.json (fallback if not in UserModel)
                method_from_config = None
                numa_from_config = None
                if model.path_exists():
                    config_path = Path(model.path) / "config.json"
                    if config_path.exists():
                        try:
                            with open(config_path, "r", encoding="utf-8") as f:
                                config = json.load(f)
                                amx_quant = config.get("amx_quantization", {})
                                if amx_quant.get("converted"):
                                    method_from_config = amx_quant.get("method")
                                    numa_from_config = amx_quant.get("numa_count")
                        except:
                            pass

                # AMX-specific metadata (priority: UserModel > config.json > detected numa_count)
                method_display = (
                    model.amx_quant_method.upper()
                    if model.amx_quant_method
                    else method_from_config.upper() if method_from_config else "[dim]?[/dim]"
                )
                numa_display = (
                    str(model.amx_numa_nodes)
                    if model.amx_numa_nodes
                    else (
                        str(numa_from_config) if numa_from_config else str(numa_count) if numa_count else "[dim]?[/dim]"
                    )
                )
                source_display = model.amx_source_model if model.amx_source_model else "[dim]-[/dim]"

                table.add_row(
                    str(i), model.name, model.path, size_display, method_display, numa_display, source_display
                )

                # Add linked GPU models info below this AMX model
                if model.id in amx_used_by_gpu:
                    gpu_list = amx_used_by_gpu[model.id]
                    gpu_names_str = ", ".join([f"[dim]{name}[/dim]" for name in gpu_list])
                    # Create a sub-row with empty cells except for the first column (7 columns total with #)
                    sub_row = ["", f"  [dim]↳ GPU: {gpu_names_str}[/dim]", "", "", "", "", ""]
                    table.add_row(*sub_row, style="dim")

            console.print(table)
            console.print()

        # Table 3: GPU Models (Safetensors)
        if gpu_models_to_display:
            console.print("[bold green]GPU Models (Safetensors)[/bold green]")
            table = Table(show_header=True, header_style="bold", show_lines=False)
            table.add_column("#", justify="right", style="cyan", no_wrap=True)
            table.add_column(t("model_column_name"), style="cyan", no_wrap=True)
            table.add_column("Path", style="dim", overflow="fold")
            table.add_column("Total", justify="right")

            if show_moe and analyze_moe_model:
                table.add_column("Exps", justify="center", style="yellow")
                table.add_column("Act", justify="center", style="green")
                table.add_column("MoE Size", justify="right", style="cyan")

            table.add_column(t("model_column_repo"), style="dim", overflow="fold")
            table.add_column(t("model_column_sha256"), justify="center")

            # Build a map of GPU model UUID -> attached CPU models
            attached_cpu_models = {}  # {gpu_model_id: [(cpu_model, type)]}
            for model in gguf_models:
                if model.gpu_model_ids:
                    for gpu_id in model.gpu_model_ids:
                        if gpu_id not in attached_cpu_models:
                            attached_cpu_models[gpu_id] = []
                        attached_cpu_models[gpu_id].append((model, "GGUF"))

            for model, numa_count in amx_models:
                if model.gpu_model_ids:
                    for gpu_id in model.gpu_model_ids:
                        if gpu_id not in attached_cpu_models:
                            attached_cpu_models[gpu_id] = []
                        attached_cpu_models[gpu_id].append((model, "AMX"))

            for i, model in enumerate(gpu_models_to_display, 1):
                moe_info = moe_results.get(model.name) if show_moe and analyze_moe_model else None
                row = [str(i)] + format_model_row(model, moe_info=moe_info)
                table.add_row(*row)

                # Add attached CPU models info below this GPU model (using UUID matching)
                if model.id in attached_cpu_models:
                    cpu_list = attached_cpu_models[model.id]
                    cpu_names = ", ".join([f"[dim]{m.name} ({t})[/dim]" for m, t in cpu_list])
                    # Create a sub-row with empty cells except for the first column
                    num_cols = len(row)
                    sub_row = ["", f"  [dim]↳ CPU: {cpu_names}[/dim]"] + [""] * (num_cols - 2)
                    table.add_row(*sub_row, style="dim")

            console.print(table)
            console.print()

        # Table 4: Failed MoE Analysis (only show with --all)
        if show_failed_table and moe_failed_models:
            console.print("[bold red]Failed MoE Analysis[/bold red]")
            console.print("[yellow]These models may not be MoE models or have analysis errors:[/yellow]\n")
            table = Table(show_header=True, header_style="bold")
            table.add_column("#", justify="right", style="cyan", no_wrap=True)
            table.add_column(t("model_column_name"), style="red", no_wrap=True)
            table.add_column("Path", style="dim", overflow="fold")
            table.add_column("Total", justify="right")
            table.add_column("Error", style="yellow", overflow="fold")

            for i, (model, error) in enumerate(moe_failed_models, 1):
                from kt_kernel.cli.utils.model_scanner import format_size

                if model.path_exists():
                    path_obj = Path(model.path)
                    try:
                        files = list(path_obj.glob("*.safetensors"))
                        total_size = sum(f.stat().st_size for f in files if f.exists())
                        size_display = format_size(total_size)
                    except:
                        size_display = "[dim]-[/dim]"
                else:
                    size_display = "[dim]-[/dim]"

                table.add_row(str(i), model.name, model.path, size_display, error)

            console.print(table)
            console.print()

        # Show hint if non-MoE models are hidden (display before summary)
        if total_non_moe_count > 0:
            hint_text = t("model_non_moe_hidden_hint", count=total_non_moe_count)
            console.print(f"[dim]{hint_text}[/dim]")
            console.print()

        # Summary
        total_count = len(gguf_models) + len(amx_models) + len(gpu_models)
        failed_count = len(moe_failed_models)
        if failed_count > 0:
            console.print(
                f"[dim]Total: {total_count} model(s) | GGUF: {len(gguf_models)} | AMX: {len(amx_models)} | GPU: {len(gpu_models)} | [red]Failed: {failed_count}[/red][/dim]\n"
            )
        else:
            console.print(
                f"[dim]Total: {total_count} model(s) | GGUF: {len(gguf_models)} | AMX: {len(amx_models)} | GPU: {len(gpu_models)}[/dim]\n"
            )

        # Show usage hints (only in non-verbose mode)
        if not verbose and models:
            console.print(f"[bold cyan]{t('model_usage_title')}[/bold cyan]")
            console.print(f"  {t('model_usage_info'):<17} [cyan]kt model info <name>[/cyan]")
            console.print(f"  {t('model_usage_edit'):<17} [cyan]kt model edit <name>[/cyan]")
            console.print(f"  {t('model_usage_verify'):<17} [cyan]kt model verify <name>[/cyan]")
            console.print(f"  {t('model_usage_quant'):<17} [cyan]kt quant <name>[/cyan]")
            console.print(f"  {t('model_usage_run'):<17} [cyan]kt run <name>[/cyan]")
            console.print()
            console.print(f"  {t('model_usage_scan'):<17} [cyan]kt model scan[/cyan]")
            console.print(f"  {t('model_usage_add'):<17} [cyan]kt model add <path>[/cyan]")
            console.print()


@app.command(name="clear-cache")
def clear_cache() -> None:
    """Clear MoE analysis cache."""
    from pathlib import Path
    import json

    cache_file = Path.home() / ".ktransformers" / "cache" / "moe_analysis.json"

    if not cache_file.exists():
        console.print()
        console.print("[dim]No MoE cache found.[/dim]")
        console.print()
        return

    # Read cache to count entries
    try:
        with open(cache_file, "r") as f:
            cache_data = json.load(f)
        cache_count = len(cache_data)
    except Exception:
        cache_count = 0

    if cache_count == 0:
        console.print()
        console.print("[dim]MoE cache is empty.[/dim]")
        console.print()
        return

    console.print()
    console.print(f"[yellow]Found {cache_count} cached model(s) in:[/yellow]")
    console.print(f"  {cache_file}")
    console.print()

    if confirm("Clear all MoE analysis cache?", default=False):
        cache_file.unlink()
        console.print(f"[green]✓ Cleared cache for {cache_count} model(s)[/green]")
    else:
        console.print("[dim]Cache clear cancelled.[/dim]")

    console.print()


@app.command(name="path-list")
def path_list() -> None:
    """List all configured model storage paths."""
    settings = get_settings()
    model_paths = settings.get_model_paths()

    console.print()
    console.print(f"[bold]{t('model_storage_paths_title')}:[/bold]\n")

    for i, path in enumerate(model_paths, 1):
        marker = "[green]✓[/green]" if path.exists() else "[red]✗[/red]"
        console.print(f"  {marker} [{i}] {path}")

    console.print()


@app.command(name="link-cpu")
def link_cpu(
    cpu_model: str = typer.Argument(..., help="Name of the CPU model (GGUF/AMX)"),
    gpu_models: List[str] = typer.Argument(..., help="Name(s) of GPU model(s) to link with"),
) -> None:
    """Link a CPU model (GGUF/AMX) with one or more GPU models for joint startup."""
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry

    registry = UserModelRegistry()

    # Check if CPU model exists
    cpu_model_obj = registry.get_model(cpu_model)
    if not cpu_model_obj:
        print_error(f"CPU model '{cpu_model}' not found in registry.")
        console.print()
        console.print(f"  Use [cyan]kt model list[/cyan] to see registered models")
        console.print()
        raise typer.Exit(1)

    # Check if it's actually a CPU model (GGUF or AMX)
    if cpu_model_obj.format == "safetensors":
        # Check if it's AMX by looking for .numa. pattern
        is_amx, _ = is_amx_weights(cpu_model_obj.path)
        if not is_amx:
            print_error(f"Model '{cpu_model}' is a GPU model (safetensors), not a CPU model.")
            console.print()
            console.print(f"  Only GGUF and AMX models can be linked to GPU models")
            console.print()
            raise typer.Exit(1)

    # Verify all GPU models exist and collect their UUIDs
    gpu_model_uuids = []
    missing_models = []
    for gpu_name in gpu_models:
        gpu_model_obj = registry.get_model(gpu_name)
        if not gpu_model_obj:
            missing_models.append(gpu_name)
        else:
            gpu_model_uuids.append(gpu_model_obj.id)

    if missing_models:
        print_error(f"GPU model(s) not found: {', '.join(missing_models)}")
        console.print()
        console.print(f"  Use [cyan]kt model list[/cyan] to see registered models")
        console.print()
        raise typer.Exit(1)

    # Update the CPU model with GPU links (using UUIDs for stability)
    registry.update_model(cpu_model, {"gpu_model_ids": gpu_model_uuids})

    console.print()
    print_success(f"Linked CPU model '{cpu_model}' with GPU model(s):")
    for gpu_name in gpu_models:
        console.print(f"  [green]✓[/green] {gpu_name}")
    console.print()
    console.print(f"  View the relationship with [cyan]kt model list[/cyan]")
    console.print()


@app.command(name="unlink-cpu")
def unlink_cpu(
    cpu_model: str = typer.Argument(..., help="Name of the CPU model to unlink"),
) -> None:
    """Remove GPU model links from a CPU model."""
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry

    registry = UserModelRegistry()

    # Check if model exists
    model = registry.get_model(cpu_model)
    if not model:
        print_error(f"Model '{cpu_model}' not found in registry.")
        console.print()
        raise typer.Exit(1)

    if not model.gpu_model_ids:
        console.print()
        console.print(f"[yellow]Model '{cpu_model}' has no GPU links.[/yellow]")
        console.print()
        return

    # Remove links
    registry.update_model(cpu_model, {"gpu_model_ids": None})

    console.print()
    print_success(f"Removed all GPU links from '{cpu_model}'")
    console.print()


@app.command(name="path-add")
def path_add(
    path: str = typer.Argument(..., help="Path to add"),
) -> None:
    """Add a new model storage path."""
    # Expand user home directory
    path = os.path.expanduser(path)

    # Check if path exists or can be created
    path_obj = Path(path)
    if not path_obj.exists():
        console.print(f"[yellow]{t('model_path_not_exist', path=path)}[/yellow]")
        if confirm(t("model_create_directory", path=path), default=True):
            try:
                path_obj.mkdir(parents=True, exist_ok=True)
                console.print(f"[green]✓[/green] {t('model_created_directory', path=path)}")
            except (OSError, PermissionError) as e:
                print_error(t("model_create_dir_failed", error=str(e)))
                raise typer.Exit(1)
        else:
            raise typer.Abort()

    # Add to configuration
    settings = get_settings()
    settings.add_model_path(path)
    print_success(t("model_path_added", path=path))


@app.command(name="path-remove")
def path_remove(
    path: str = typer.Argument(..., help="Path to remove"),
) -> None:
    """Remove a model storage path from configuration."""
    # Expand user home directory
    path = os.path.expanduser(path)

    settings = get_settings()
    if settings.remove_model_path(path):
        print_success(t("model_path_removed", path=path))
    else:
        print_error(t("model_path_not_found", path=path))
        raise typer.Exit(1)


@app.command(name="scan")
def scan(
    min_size: float = typer.Option(2.0, "--min-size", help="Minimum model file size in GB (default: 2.0)"),
    max_depth: int = typer.Option(6, "--max-depth", help="Maximum search depth (default: 6)"),
) -> None:
    """Perform global scan for models and add new ones to registry."""
    from kt_kernel.cli.utils.model_discovery import discover_and_register_global, format_discovery_summary
    from kt_kernel.cli.config.settings import get_settings

    settings = get_settings()
    lang = settings.get("general.language", "en")

    console.print()
    if lang == "zh":
        print_info("全局扫描模型权重")
        console.print()
    else:
        print_info("Global Model Scan")
        console.print()

    try:
        total_found, new_found, registered = discover_and_register_global(
            min_size_gb=min_size, max_depth=max_depth, show_progress=True, lang=lang
        )

        format_discovery_summary(
            total_found=total_found,
            new_found=new_found,
            registered=registered,
            lang=lang,
            show_models=True,
            max_show=20,
        )

        if new_found > 0:
            console.print()
            if lang == "zh":
                console.print("[dim]下一步:[/dim]")
                console.print(f"  • 查看模型列表: [cyan]kt model list[/cyan]")
                console.print(f"  • 编辑模型信息: [cyan]kt model edit <name>[/cyan]")
                console.print(f"  • 验证模型: [cyan]kt model verify <name>[/cyan]")
            else:
                console.print("[dim]Next steps:[/dim]")
                console.print(f"  • View model list: [cyan]kt model list[/cyan]")
                console.print(f"  • Edit model info: [cyan]kt model edit <name>[/cyan]")
                console.print(f"  • Verify models: [cyan]kt model verify <name>[/cyan]")
            console.print()

    except Exception as e:
        print_error(f"Scan failed: {e}")
        raise typer.Exit(1)


@app.command(name="add")
def add_model(
    path: str = typer.Argument(..., help="Path to scan for models"),
) -> None:
    """Scan a directory and add all found models to the registry."""
    from pathlib import Path
    from kt_kernel.cli.utils.model_discovery import discover_and_register_path
    from kt_kernel.cli.config.settings import get_settings

    settings = get_settings()
    lang = settings.get("general.language", "en")

    # Expand and validate path
    path_obj = Path(os.path.expanduser(path)).resolve()

    if not path_obj.exists():
        print_error(f"Path does not exist: {path_obj}")
        raise typer.Exit(1)

    if not path_obj.is_dir():
        print_error(f"Not a directory: {path_obj}")
        raise typer.Exit(1)

    # Scan and register models
    console.print()
    try:
        total_found, new_found, registered = discover_and_register_path(
            path=str(path_obj), min_size_gb=2.0, existing_paths=None, show_progress=True, lang=lang
        )

        console.print()
        if new_found == 0:
            if total_found > 0:
                if lang == "zh":
                    console.print(f"[yellow]在此路径找到 {total_found} 个模型，但所有模型均已在列表中[/yellow]")
                else:
                    console.print(
                        f"[yellow]Found {total_found} models in this path, but all already in the list[/yellow]"
                    )
            else:
                if lang == "zh":
                    console.print("[yellow]未找到模型[/yellow]")
                    console.print()
                    console.print("  支持的格式: *.gguf, *.safetensors (需要 config.json)")
                else:
                    console.print("[yellow]No models found[/yellow]")
                    console.print()
                    console.print("  Supported formats: *.gguf, *.safetensors (with config.json)")
        else:
            if lang == "zh":
                console.print(
                    f"[green]✓[/green] 在此路径找到 {total_found} 个模型，成功添加 {len(registered)} 个新模型"
                )
            else:
                console.print(
                    f"[green]✓[/green] Found {total_found} models in this path, added {len(registered)} new models"
                )

            if registered:
                console.print()
                if lang == "zh":
                    console.print("[dim]新添加的模型:[/dim]")
                else:
                    console.print("[dim]Newly added models:[/dim]")

                for model in registered:
                    console.print(f"  • {model.name} ({model.format})")
                    console.print(f"    [dim]{model.path}[/dim]")

        console.print()

    except Exception as e:
        print_error(f"Failed to scan path: {e}")
        raise typer.Exit(1)


@app.command(name="edit")
def edit_model(
    name: Optional[str] = typer.Argument(
        None, help="Name of model to edit (optional - will show selection if not provided)"
    ),
) -> None:
    """Edit model information interactively."""
    from rich.prompt import Prompt, Confirm
    from rich.panel import Panel
    from rich.table import Table
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry

    registry = UserModelRegistry()

    # If no name provided, show interactive selection
    if name is None:
        all_models = registry.list_models()

        # Filter to only show MoE GPU models (safetensors that are not AMX)
        moe_models = []
        for m in all_models:
            if m.format == "safetensors":
                is_amx_model, _ = is_amx_weights(m.path)
                if not is_amx_model:
                    moe_models.append(m)

        if not moe_models:
            print_error(t("model_edit_no_models"))
            console.print()
            console.print(f"  {t('model_edit_add_hint_scan')} [cyan]kt model scan[/cyan]")
            console.print(f"  {t('model_edit_add_hint_add')} [cyan]kt model add <path>[/cyan]")
            console.print()
            raise typer.Exit(1)

        # Display models table with # column
        console.print()
        console.print(f"[bold cyan]{t('model_edit_select_title')}[/bold cyan]")
        console.print()

        table = Table(show_header=True, header_style="bold", show_lines=False)
        table.add_column("#", justify="right", style="cyan", no_wrap=True)
        table.add_column("Name", style="cyan", no_wrap=True)
        table.add_column("Format", style="dim")
        table.add_column("Path", style="dim", overflow="fold")

        for i, model_item in enumerate(moe_models, 1):
            table.add_row(str(i), model_item.name, model_item.format, model_item.path)

        console.print(table)
        console.print()

        from rich.prompt import IntPrompt

        choice = IntPrompt.ask(t("model_edit_select_model"), default=1, show_choices=False)

        if choice < 1 or choice > len(moe_models):
            print_error(t("model_edit_invalid_choice"))
            raise typer.Exit(1)

        model = moe_models[choice - 1]
    else:
        # Load model by name
        model = registry.get_model(name)
        if not model:
            print_error(t("model_edit_not_found", name=name))
            console.print()
            console.print(f"  {t('model_edit_list_hint')} [cyan]kt model list[/cyan]")
            console.print()
            raise typer.Exit(1)

    # Keep track of original values to detect changes
    original_name = model.name
    original_repo_type = model.repo_type
    original_repo_id = model.repo_id
    original_gpu_model_ids = model.gpu_model_ids.copy() if model.gpu_model_ids else None

    # Working copy for edits (not saved until user confirms)
    edited_name = model.name
    edited_repo_type = model.repo_type
    edited_repo_id = model.repo_id
    edited_gpu_model_ids = model.gpu_model_ids.copy() if model.gpu_model_ids else None

    has_changes = False

    while True:
        # Display current configuration (show edited values)
        console.print()
        console.print(f"[bold cyan]{t('model_edit_current_config')}[/bold cyan]\n")

        # Format SHA256 status (from original model)
        sha256_display = SHA256_STATUS_MAP_PLAIN.get(model.sha256_status, model.sha256_status)

        # Check if this is a CPU model (GGUF or AMX)
        is_cpu_model = model.format == "gguf"
        if not is_cpu_model and model.format == "safetensors":
            is_amx, _ = is_amx_weights(model.path)
            is_cpu_model = is_amx

        # Format GPU links info (for CPU models)
        gpu_links_info = ""
        if is_cpu_model and edited_gpu_model_ids:
            gpu_names = []
            for gpu_id in edited_gpu_model_ids:
                gpu_obj = registry.get_model_by_id(gpu_id)
                if gpu_obj:
                    gpu_names.append(gpu_obj.name)
                else:
                    gpu_names.append(f"[dim red]{gpu_id[:8]}... (deleted)[/dim red]")
            gpu_links_info = f"\n[bold]{t('model_edit_gpu_links')}[/bold]  {', '.join(gpu_names)}"

        content = f"""[bold]Name:[/bold]       {edited_name}
[bold]Path:[/bold]       {model.path}
[bold]Format:[/bold]     {model.format}
[bold]Repo Type:[/bold]  {edited_repo_type or '-'}
[bold]Repo ID:[/bold]    {edited_repo_id or '-'}
[bold]SHA256:[/bold]     {sha256_display}{gpu_links_info}"""

        panel = Panel(content, border_style="cyan", padding=(0, 1))
        console.print(panel)
        console.print()

        # Check if there are any changes
        has_changes = (
            edited_name != original_name
            or edited_repo_type != original_repo_type
            or edited_repo_id != original_repo_id
            or edited_gpu_model_ids != original_gpu_model_ids
        )

        # Show menu
        console.print(f"[bold]{t('model_edit_what_to_edit')}[/bold]")
        console.print("  [1] " + t("model_edit_option_name"))
        console.print("  [2] " + t("model_edit_option_repo"))
        console.print("  [3] " + t("model_edit_option_delete"))
        if is_cpu_model:
            console.print("  [4] " + t("model_edit_manage_gpu_links"))
            save_option = "5"
            cancel_option = "6"
            console.print(
                f"  [{save_option}] {t('model_edit_save_changes')}"
                + (
                    f" [cyan]{t('model_edit_has_changes')}[/cyan]"
                    if has_changes
                    else f" [dim]{t('model_edit_no_changes')}[/dim]"
                )
            )
            console.print(f"  [{cancel_option}] " + t("model_edit_option_cancel"))
            console.print()
            choice = Prompt.ask(t("model_edit_choice_prompt"), choices=["1", "2", "3", "4", "5", "6"], default="6")
        else:
            save_option = "4"
            cancel_option = "5"
            console.print(
                f"  [{save_option}] {t('model_edit_save_changes')}"
                + (
                    f" [cyan]{t('model_edit_has_changes')}[/cyan]"
                    if has_changes
                    else f" [dim]{t('model_edit_no_changes')}[/dim]"
                )
            )
            console.print(f"  [{cancel_option}] " + t("model_edit_option_cancel"))
            console.print()
            choice = Prompt.ask(t("model_edit_choice_prompt"), choices=["1", "2", "3", "4", "5"], default="5")

        if choice == "1":
            # Edit name (update working copy only)
            console.print()
            new_name = Prompt.ask(t("model_edit_new_name"), default=edited_name)

            if new_name != edited_name:
                # Check for conflict (excluding both original and edited names)
                if new_name != original_name and registry.check_name_conflict(new_name, exclude_name=original_name):
                    print_error(t("model_edit_name_conflict", name=new_name))
                    continue

                edited_name = new_name
                console.print()
                print_info(f"[dim]{t('model_edit_name_pending')}[/dim]")

        elif choice == "2":
            # Edit repo configuration (update working copy only)
            console.print()
            console.print(t("model_edit_repo_type_prompt"))
            console.print("  [1] HuggingFace")
            console.print("  [2] ModelScope")
            console.print("  [3] " + t("model_edit_repo_remove"))
            console.print()

            repo_choice = Prompt.ask(t("model_edit_choice_prompt"), choices=["1", "2", "3"], default="3")

            if repo_choice == "3":
                # Remove repo
                edited_repo_type = None
                edited_repo_id = None
                console.print()
                print_info(f"[dim]{t('model_edit_repo_remove_pending')}[/dim]")
            else:
                # Set repo
                repo_type = "huggingface" if repo_choice == "1" else "modelscope"
                example = "deepseek-ai/DeepSeek-V3" if repo_choice == "1" else "deepseek/DeepSeek-V3"

                current_default = edited_repo_id if edited_repo_id and edited_repo_type == repo_type else ""
                repo_id = Prompt.ask(
                    t("model_edit_repo_id_prompt", example=example),
                    default=current_default if current_default else None,
                )

                edited_repo_type = repo_type
                edited_repo_id = repo_id
                console.print()
                print_info(f"[dim]{t('model_edit_repo_update_pending')}[/dim]")

        elif choice == "3":
            # Delete model
            console.print()
            console.print(f"[bold yellow]{t('model_edit_delete_warning')}[/bold yellow]")
            console.print(f"  {t('model_edit_delete_note')}")
            console.print()

            if Confirm.ask(t("model_edit_delete_confirm", name=model.name), default=False):
                registry.remove_model(model.name)
                console.print()
                print_success(t("model_edit_deleted", name=model.name))
                console.print()
                return
            else:
                console.print()
                print_info(t("model_edit_delete_cancelled"))

        elif choice == "4" and is_cpu_model:
            # Manage GPU Links (only for CPU models) - update working copy
            console.print()
            console.print(f"[bold cyan]{t('model_edit_gpu_links_title', name=edited_name)}[/bold cyan]")
            console.print()

            # Show current links (from edited values)
            if edited_gpu_model_ids:
                console.print(f"[bold]{t('model_edit_current_gpu_links')}[/bold]")
                for i, gpu_id in enumerate(edited_gpu_model_ids, 1):
                    gpu_obj = registry.get_model_by_id(gpu_id)
                    if gpu_obj:
                        console.print(f"  [{i}] {gpu_obj.name}")
                    else:
                        console.print(f"  [{i}] [red]{gpu_id[:8]}... (deleted)[/red]")
                console.print()
            else:
                console.print(f"[dim]{t('model_edit_no_gpu_links')}[/dim]")
                console.print()

            console.print(f"{t('model_edit_gpu_options')}")
            console.print(f"  [1] {t('model_edit_gpu_add')}")
            console.print(f"  [2] {t('model_edit_gpu_remove')}")
            console.print(f"  [3] {t('model_edit_gpu_clear')}")
            console.print(f"  [4] {t('model_edit_gpu_back')}")
            console.print()

            link_choice = Prompt.ask(t("model_edit_gpu_choose_option"), choices=["1", "2", "3", "4"], default="4")

            if link_choice == "1":
                # Add GPU link
                # Get all GPU models (safetensors that are not AMX)
                all_models = registry.list_models()
                available_gpu_models = []
                for m in all_models:
                    if m.format == "safetensors":
                        is_amx_model, _ = is_amx_weights(m.path)
                        if not is_amx_model:
                            available_gpu_models.append(m)

                if not available_gpu_models:
                    console.print()
                    console.print(f"[yellow]{t('model_edit_gpu_none_available')}[/yellow]")
                    console.print()
                else:
                    console.print()
                    console.print(f"{t('model_edit_gpu_available_models')}")
                    for i, gpu_m in enumerate(available_gpu_models, 1):
                        already_linked = edited_gpu_model_ids and gpu_m.id in edited_gpu_model_ids
                        status = f" [dim]{t('model_edit_gpu_already_linked')}[/dim]" if already_linked else ""
                        console.print(f"  [{i}] {gpu_m.name}{status}")
                    console.print()

                    gpu_choice = Prompt.ask(t("model_edit_gpu_enter_number"), default="0")
                    try:
                        gpu_idx = int(gpu_choice) - 1
                        if 0 <= gpu_idx < len(available_gpu_models):
                            selected_gpu = available_gpu_models[gpu_idx]

                            # Add to edited_gpu_model_ids
                            current_ids = list(edited_gpu_model_ids) if edited_gpu_model_ids else []
                            if selected_gpu.id not in current_ids:
                                current_ids.append(selected_gpu.id)
                                edited_gpu_model_ids = current_ids
                                console.print()
                                print_info(f"[dim]{t('model_edit_gpu_link_pending', name=selected_gpu.name)}[/dim]")
                            else:
                                console.print()
                                console.print(f"[yellow]{t('model_edit_gpu_already_exists')}[/yellow]")
                        else:
                            console.print()
                            console.print(f"[red]{t('model_edit_gpu_invalid_choice')}[/red]")
                    except ValueError:
                        console.print()
                        console.print(f"[red]{t('model_edit_gpu_invalid_input')}[/red]")

            elif link_choice == "2":
                # Remove GPU link
                if not edited_gpu_model_ids:
                    console.print()
                    console.print(f"[yellow]{t('model_edit_gpu_none_to_remove')}[/yellow]")
                    console.print()
                else:
                    console.print()
                    console.print(f"{t('model_edit_gpu_choose_to_remove')}")
                    gpu_list = []
                    for i, gpu_id in enumerate(edited_gpu_model_ids, 1):
                        gpu_obj = registry.get_model_by_id(gpu_id)
                        gpu_name = gpu_obj.name if gpu_obj else f"{gpu_id[:8]}... (deleted)"
                        gpu_list.append((gpu_id, gpu_name))
                        console.print(f"  [{i}] {gpu_name}")
                    console.print()

                    remove_choice = Prompt.ask(t("model_edit_gpu_enter_to_remove"), default="0")
                    try:
                        remove_idx = int(remove_choice) - 1
                        if 0 <= remove_idx < len(gpu_list):
                            removed_id, removed_name = gpu_list[remove_idx]
                            new_ids = [gid for gid in edited_gpu_model_ids if gid != removed_id]
                            edited_gpu_model_ids = new_ids if new_ids else None
                            console.print()
                            print_info(f"[dim]{t('model_edit_gpu_remove_pending', name=removed_name)}[/dim]")
                        else:
                            console.print()
                            console.print(f"[red]{t('model_edit_gpu_invalid_choice')}[/red]")
                    except ValueError:
                        console.print()
                        console.print(f"[red]{t('model_edit_gpu_invalid_input')}[/red]")

            elif link_choice == "3":
                # Clear all GPU links
                if not edited_gpu_model_ids:
                    console.print()
                    console.print(f"[yellow]{t('model_edit_gpu_none_to_clear')}[/yellow]")
                    console.print()
                else:
                    if Confirm.ask(t("model_edit_gpu_clear_confirm"), default=False):
                        edited_gpu_model_ids = None
                        console.print()
                        print_info(f"[dim]{t('model_edit_gpu_clear_pending')}[/dim]")
                    else:
                        console.print()
                        print_info(t("model_edit_cancelled_short"))

        elif choice == save_option:
            # Save changes
            if not has_changes:
                console.print()
                print_info(f"[dim]{t('model_edit_no_changes_to_save')}[/dim]")
                continue

            console.print()
            console.print(f"[bold cyan]{t('model_edit_saving')}[/bold cyan]")
            console.print()

            # Determine if repo info changed (for verification prompt)
            repo_changed = (original_repo_id is None and edited_repo_id is not None) or (
                original_repo_id != edited_repo_id
            )

            # Build updates dict
            updates = {}
            if edited_name != original_name:
                updates["name"] = edited_name
            if edited_repo_type != original_repo_type:
                updates["repo_type"] = edited_repo_type
            if edited_repo_id != original_repo_id:
                updates["repo_id"] = edited_repo_id
                # Update SHA256 status when repo changes
                if edited_repo_id is None:
                    updates["sha256_status"] = "no_repo"
                else:
                    updates["sha256_status"] = "not_checked"
            if edited_gpu_model_ids != original_gpu_model_ids:
                updates["gpu_model_ids"] = edited_gpu_model_ids

            # Save to registry
            registry.update_model(original_name, updates)
            print_success(t("model_edit_saved"))

            # Update local model object
            if "name" in updates:
                model.name = edited_name
            if "repo_type" in updates:
                model.repo_type = edited_repo_type
            if "repo_id" in updates:
                model.repo_id = edited_repo_id
            if "sha256_status" in updates:
                model.sha256_status = updates["sha256_status"]
            if "gpu_model_ids" in updates:
                model.gpu_model_ids = edited_gpu_model_ids

            # Update original values for next iteration
            original_name = edited_name
            original_repo_type = edited_repo_type
            original_repo_id = edited_repo_id
            original_gpu_model_ids = edited_gpu_model_ids.copy() if edited_gpu_model_ids else None

            # Display updated configuration
            console.print()
            console.print(f"[bold cyan]{t('model_edit_updated_config')}[/bold cyan]\n")

            sha256_display = SHA256_STATUS_MAP_PLAIN.get(model.sha256_status, model.sha256_status)
            gpu_links_info = ""
            if is_cpu_model and model.gpu_model_ids:
                gpu_names = []
                for gpu_id in model.gpu_model_ids:
                    gpu_obj = registry.get_model_by_id(gpu_id)
                    if gpu_obj:
                        gpu_names.append(gpu_obj.name)
                    else:
                        gpu_names.append(f"[dim red]{gpu_id[:8]}... (deleted)[/dim red]")
                gpu_links_info = f"\n[bold]{t('model_edit_gpu_links')}[/bold]  {', '.join(gpu_names)}"

            content = f"""[bold]Name:[/bold]       {model.name}
[bold]Path:[/bold]       {model.path}
[bold]Format:[/bold]     {model.format}
[bold]Repo Type:[/bold]  {model.repo_type or '-'}
[bold]Repo ID:[/bold]    {model.repo_id or '-'}
[bold]SHA256:[/bold]     {sha256_display}{gpu_links_info}"""

            panel = Panel(content, border_style="green", padding=(0, 1))
            console.print(panel)
            console.print()

            # If repo changed, suggest verification
            if repo_changed and model.repo_id:
                console.print()
                console.print(f"[bold yellow]{t('model_edit_repo_changed_warning')}[/bold yellow]")
                console.print()
                console.print(f"  {t('model_edit_verify_hint')}")
                console.print()

            return

        elif choice == cancel_option:
            # Cancel
            console.print()
            if has_changes:
                if Confirm.ask(f"[yellow]{t('model_edit_discard_changes')}[/yellow]", default=False):
                    print_info(t("model_edit_cancelled"))
                    console.print()
                    return
                else:
                    # Go back to menu
                    continue
            else:
                print_info(t("model_edit_cancelled"))
                console.print()
                return


@app.command(name="info")
def info_model(
    name: str = typer.Argument(..., help="Name of model to display"),
) -> None:
    """Display detailed information about a model."""
    from rich.panel import Panel
    from pathlib import Path
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
    from kt_kernel.cli.utils.model_scanner import format_size

    registry = UserModelRegistry()

    # Load model
    model = registry.get_model(name)
    if not model:
        print_error(t("model_info_not_found", name=name))
        console.print()
        console.print(f"  {t('model_info_list_hint')} [cyan]kt model list[/cyan]")
        console.print()
        raise typer.Exit(1)

    console.print()

    # Check if path exists
    path_status = "[green]✓ Exists[/green]" if model.path_exists() else "[red]✗ Missing[/red]"

    # Format repo info
    if model.repo_id:
        repo_abbr = "hf" if model.repo_type == "huggingface" else "ms"
        repo_info = f"{repo_abbr}:{model.repo_id}"
    else:
        repo_info = "-"

    # Format SHA256 status
    sha256_display = SHA256_STATUS_MAP_PLAIN.get(model.sha256_status, model.sha256_status)

    # Calculate folder size and list files if exists
    moe_info = ""
    amx_info = ""

    if model.path_exists():
        path_obj = Path(model.path)
        try:
            if model.format == "safetensors":
                files = list(path_obj.glob("*.safetensors"))

                # Check for AMX weights
                is_amx, numa_count = is_amx_weights(str(path_obj))
                if is_amx:
                    amx_info = f"\n[bold]AMX Format:[/bold]   Yes (NUMA: {numa_count})"
                else:
                    # Check for MOE model
                    try:
                        from kt_kernel.cli.utils.analyze_moe_model import analyze_moe_model

                        moe_result = analyze_moe_model(str(path_obj))
                        if moe_result and moe_result.get("num_experts", 0) > 0:
                            moe_info = f"""
[bold]MoE Info:[/bold]
  • Total Experts:     {moe_result['num_experts']}
  • Activated Experts: {moe_result['num_experts_per_tok']} experts/token
  • Hidden Layers:     {moe_result['num_hidden_layers']}
  • Total Model Size:  {moe_result['total_size_gb']:.2f} GB"""
                    except Exception:
                        pass  # Not a MoE model or analysis failed
            else:
                files = list(path_obj.glob("*.gguf"))

            total_size = sum(f.stat().st_size for f in files if f.exists())
            size_str = format_size(total_size)
            file_count = len(files)
            size_info = f"{size_str} ({file_count} files)"

            # List first few files
            file_list = "\n".join([f"  • {f.name}" for f in sorted(files)[:5]])
            if len(files) > 5:
                file_list += f"\n  ... and {len(files) - 5} more files"
        except Exception as e:
            size_info = f"Error calculating size: {e}"
            file_list = "-"
    else:
        size_info = "-"
        file_list = "[red]Path does not exist[/red]"

    # Format created/verified dates
    from datetime import datetime

    try:
        created_date = datetime.fromisoformat(model.created_at).strftime("%Y-%m-%d %H:%M:%S")
    except:
        created_date = model.created_at

    if model.last_verified:
        try:
            verified_date = datetime.fromisoformat(model.last_verified).strftime("%Y-%m-%d %H:%M:%S")
        except:
            verified_date = model.last_verified
    else:
        verified_date = "-"

    # Create detailed panel
    content = f"""[bold]Name:[/bold]         {model.name}
[bold]Path:[/bold]         {model.path}
[bold]Path Status:[/bold]  {path_status}
[bold]Format:[/bold]       {model.format}
[bold]Size:[/bold]         {size_info}{amx_info}{moe_info}
[bold]Repo Type:[/bold]    {model.repo_type or '-'}
[bold]Repo ID:[/bold]      {model.repo_id or '-'}
[bold]SHA256:[/bold]       {sha256_display}
[bold]Created:[/bold]      {created_date}
[bold]Last Verified:[/bold] {verified_date}

[bold]Files:[/bold]
{file_list}"""

    panel = Panel(content, title=f"[cyan]Model Information: {model.name}[/cyan]", border_style="cyan", padding=(1, 2))
    console.print(panel)
    console.print()


@app.command(name="remove")
def remove_model(
    name: str = typer.Argument(..., help="Name of model to remove"),
    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation"),
) -> None:
    """Remove a model from the registry (does not delete files)."""
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry

    registry = UserModelRegistry()

    # Check if model exists
    model = registry.get_model(name)
    if not model:
        print_error(t("model_remove_not_found", name=name))
        console.print()
        console.print(f"  {t('model_remove_list_hint')} [cyan]kt model list[/cyan]")
        console.print()
        raise typer.Exit(1)

    console.print()
    console.print(f"[bold yellow]{t('model_remove_warning')}[/bold yellow]")
    console.print(f"  {t('model_remove_note')}")
    console.print(f"  [dim]Path: {model.path}[/dim]")
    console.print()

    # Check if this GPU model is linked by any CPU models
    model_uuid = model.id
    affected_cpu_models = []

    # Only check for GPU models (safetensors that are not AMX)
    if model.format == "safetensors":
        is_amx, _ = is_amx_weights(model.path)
        if not is_amx:
            # This is a GPU model, check for CPU models that link to it
            for m in registry.list_models():
                if m.gpu_model_ids and model_uuid in m.gpu_model_ids:
                    affected_cpu_models.append(m)

    # If there are affected CPU models, inform the user
    if affected_cpu_models:
        console.print(f"[yellow]This GPU model is linked by {len(affected_cpu_models)} CPU model(s):[/yellow]")
        for cpu_model in affected_cpu_models:
            console.print(f"  • {cpu_model.name}")
        console.print()
        console.print(f"[dim]These links will be automatically removed.[/dim]")
        console.print()

    # Confirm deletion
    if not yes:
        if not confirm(t("model_remove_confirm", name=name), default=False):
            print_info(t("model_remove_cancelled"))
            console.print()
            return

    # Clean up references in CPU models before removing
    if affected_cpu_models:
        for cpu_model in affected_cpu_models:
            # Remove this GPU model's UUID from the cpu_model's gpu_model_ids list
            new_gpu_ids = [gid for gid in cpu_model.gpu_model_ids if gid != model_uuid]
            registry.update_model(cpu_model.name, {"gpu_model_ids": new_gpu_ids if new_gpu_ids else None})

    # Remove from registry
    if registry.remove_model(name):
        console.print()
        print_success(t("model_removed", name=name))
        console.print()
    else:
        print_error(t("model_remove_failed", name=name))
        raise typer.Exit(1)


@app.command(name="refresh")
def refresh_models() -> None:
    """Check all registered models and identify missing ones."""
    from rich.table import Table
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry

    registry = UserModelRegistry()
    models = registry.list_models()

    if not models:
        print_warning(t("model_no_registered_models"))
        console.print()
        return

    console.print()
    print_info(t("model_refresh_checking"))

    # Refresh status
    status = registry.refresh_status()

    # Check relationship integrity
    broken_relationships = []  # [(cpu_model, gpu_uuid, gpu_name_or_none)]
    for model in models:
        if model.gpu_model_ids:
            for gpu_uuid in model.gpu_model_ids:
                gpu_obj = registry.get_model_by_id(gpu_uuid)
                if not gpu_obj:
                    broken_relationships.append((model.name, gpu_uuid, None))
                elif not gpu_obj.path_exists():
                    broken_relationships.append((model.name, gpu_uuid, gpu_obj.name))

    console.print()

    # Show results
    has_issues = status["missing"] or broken_relationships

    if not has_issues:
        print_success(t("model_refresh_all_valid", count=len(models)))
        console.print(f"  {t('model_refresh_total', total=len(models))}")
        console.print()
        return

    # Show broken relationships
    if broken_relationships:
        print_warning(f"Found {len(broken_relationships)} broken GPU link(s)")
        console.print()

        from rich.table import Table

        rel_table = Table(show_header=True, header_style="bold yellow")
        rel_table.add_column("CPU Model", style="cyan")
        rel_table.add_column("GPU Model", style="dim")
        rel_table.add_column("Issue", style="red")

        for cpu_name, gpu_uuid, gpu_name in broken_relationships:
            if gpu_name is None:
                gpu_display = f"{gpu_uuid[:8]}..."
                issue = "Deleted"
            else:
                gpu_display = gpu_name
                issue = "Path Missing"
            rel_table.add_row(cpu_name, gpu_display, issue)

        console.print(rel_table)
        console.print()
        console.print(f"[dim]Use [cyan]kt model edit <cpu-model>[/cyan] to fix GPU links[/dim]")
        console.print()

    if not status["missing"]:
        # Only broken relationships, no missing models
        return

    # Show missing models
    print_warning(t("model_refresh_missing_found", count=len(status["missing"])))
    console.print()

    table = Table(show_header=True, header_style="bold")
    table.add_column(t("model_column_name"), style="cyan")
    table.add_column(t("model_column_path"), style="dim")
    table.add_column(t("model_column_status"), justify="center")

    for model in models:
        if model.name in status["missing"]:
            status_text = "[red]✗ Missing[/red]"
        else:
            status_text = "[green]✓ Valid[/green]"

        table.add_row(model.name, model.path, status_text)

    console.print(table)
    console.print()

    # Suggest actions
    console.print(f"[bold]{t('model_refresh_suggestions')}:[/bold]")
    console.print(f"  • {t('model_refresh_remove_hint')} [cyan]kt model remove <name>[/cyan]")
    console.print(f"  • {t('model_refresh_rescan_hint')} [cyan]kt model scan[/cyan]")
    console.print()


@app.command(name="verify")
def verify_model(
    name: str = typer.Argument(None, help="Name of model to verify (interactive if not provided)"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed SHA256 comparison for each file"),
) -> None:
    """Verify model integrity using SHA256 checksums with interactive repair."""
    from pathlib import Path
    from rich.prompt import Prompt, Confirm
    from rich.table import Table
    from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn, MofNCompleteColumn
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
    from kt_kernel.cli.utils.model_verifier import verify_model_integrity_with_progress, check_huggingface_connectivity

    registry = UserModelRegistry()

    # Helper function to display model selection table
    def show_model_table():
        from kt_kernel.cli.utils.model_scanner import format_size
        from pathlib import Path

        # Import MoE analyzer
        analyze_moe_model = None
        try:
            from kt_kernel.cli.utils.analyze_moe_model import analyze_moe_model
        except ImportError:
            pass

        all_models = registry.list_models()

        # Filter: only safetensors models with repo_id
        verifiable_models = [m for m in all_models if m.repo_id and m.format == "safetensors"]

        if not verifiable_models:
            print_warning(t("model_verify_all_no_repos"))
            console.print()
            console.print(f"  {t('model_verify_all_config_hint')}")
            console.print()
            return None

        # Analyze MoE models
        moe_results = {}
        if analyze_moe_model:
            for model in verifiable_models:
                try:
                    result = analyze_moe_model(model.path, use_cache=True)
                    if result and result.get("num_experts", 0) > 0:
                        moe_results[model.name] = result
                except Exception:
                    pass

        # Filter to only show MoE models
        moe_verifiable_models = [m for m in verifiable_models if m.name in moe_results]

        if not moe_verifiable_models:
            console.print()
            console.print("[yellow]No MoE models with repo_id found for verification.[/yellow]")
            console.print()
            console.print(
                f"[dim]Only MoE models can be verified. Use [cyan]kt model list[/cyan] to see all models.[/dim]"
            )
            console.print()
            return None

        console.print()
        console.print("[bold]Select a MoE model to verify:[/bold]\n")

        table = Table(show_header=True, header_style="bold", show_lines=False)
        table.add_column("#", justify="right", style="dim", width=4)
        table.add_column(t("model_column_name"), style="cyan", no_wrap=True)
        table.add_column("Path", style="dim", overflow="fold")
        table.add_column("Total", justify="right")
        table.add_column("Exps", justify="center", style="yellow")
        table.add_column("Act", justify="center", style="green")
        table.add_column(t("model_column_repo"), style="dim", overflow="fold")
        table.add_column(t("model_column_sha256"), justify="center")

        for i, model in enumerate(moe_verifiable_models, 1):
            # Calculate size
            if model.path_exists():
                path_obj = Path(model.path)
                try:
                    files = list(path_obj.glob("*.safetensors"))
                    total_size = sum(f.stat().st_size for f in files if f.exists())
                    size_display = format_size(total_size)
                except:
                    size_display = "[dim]-[/dim]"
            else:
                size_display = "[dim]-[/dim]"

            # Get MoE info
            moe_info = moe_results.get(model.name)
            experts_display = f"[yellow]{moe_info['num_experts']}[/yellow]" if moe_info else "[dim]-[/dim]"
            activated_display = f"[green]{moe_info['num_experts_per_tok']}[/green]" if moe_info else "[dim]-[/dim]"

            # Repo info
            repo_abbr = "hf" if model.repo_type == "huggingface" else "ms"
            repo_display = f"{repo_abbr}:{model.repo_id}"

            # SHA256 status
            status_icon = {
                "not_checked": "[dim]○[/dim]",
                "checking": "[yellow]◐[/yellow]",
                "passed": "[green]✓[/green]",
                "failed": "[red]✗[/red]",
                "no_repo": "[dim]-[/dim]",
            }.get(model.sha256_status, "[dim]?[/dim]")

            table.add_row(
                str(i),
                model.name,
                model.path,
                size_display,
                experts_display,
                activated_display,
                repo_display,
                status_icon,
            )

        console.print(table)
        console.print()
        console.print("[dim]SHA256 Status: ○ Not checked | ✓ Passed | ✗ Failed[/dim]")
        console.print()

        return moe_verifiable_models

    # Main verification loop
    # Track files to verify (None = all files, list = specific files for re-verification)
    files_to_verify = None

    while True:
        selected_model = None

        # If name provided directly, use it once then switch to interactive
        if name:
            selected_model = registry.get_model(name)
            if not selected_model:
                print_error(t("model_verify_not_found", name=name))
                console.print()
                console.print(f"  {t('model_verify_list_hint')} [cyan]kt model list[/cyan]")
                console.print()
                raise typer.Exit(1)
            name = None  # Clear so next loop is interactive
        else:
            # Show interactive selection
            verifiable_models = show_model_table()
            if not verifiable_models:
                return

            choice = Prompt.ask("Enter model number to verify (or 'q' to quit)", default="1")

            if choice.lower() == "q":
                return

            try:
                idx = int(choice) - 1
                if 0 <= idx < len(verifiable_models):
                    selected_model = verifiable_models[idx]
                    # Reset files_to_verify when selecting a new model
                    files_to_verify = None
                else:
                    print_error(f"Invalid selection: {choice}")
                    console.print()
                    continue
            except ValueError:
                print_error(f"Invalid input: {choice}")
                console.print()
                continue

        # Check model prerequisites
        console.print()

        if not selected_model.repo_id:
            print_warning(t("model_verify_no_repo", name=selected_model.name))
            console.print()
            console.print(f"  {t('model_verify_config_hint', name=selected_model.name)}")
            console.print()
            continue

        if not selected_model.path_exists():
            print_error(t("model_verify_path_missing", path=selected_model.path))
            console.print()
            continue

        # Check HuggingFace connectivity and decide whether to use mirror
        use_mirror = False
        if selected_model.repo_type == "huggingface":
            with console.status("[dim]Checking HuggingFace connectivity...[/dim]"):
                is_accessible, message = check_huggingface_connectivity(timeout=5)

            if not is_accessible:
                print_warning("HuggingFace Connection Failed")
                console.print()
                console.print(f"  {message}")
                console.print()
                console.print("  [yellow]Auto-switching to HuggingFace mirror:[/yellow] [cyan]hf-mirror.com[/cyan]")
                console.print()
                use_mirror = True

        # Perform verification with progress bar
        if files_to_verify:
            print_info(f"Re-verifying {len(files_to_verify)} repaired files: {selected_model.name}")
        else:
            print_info(f"Verifying: {selected_model.name}")
        console.print(f"  Repository: [yellow]{selected_model.repo_type}[/yellow]:{selected_model.repo_id}")
        console.print(f"  Local path: {selected_model.path}")
        console.print()

        # Helper function to fetch remote hashes with timeout (using console.status like connectivity check)
        def fetch_remote_hashes_with_timeout(repo_type, repo_id, use_mirror, timeout_seconds):
            """Fetch remote hashes with timeout, returns (hashes_dict, timed_out)."""
            from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
            from kt_kernel.cli.utils.model_verifier import fetch_model_sha256

            def fetch_hashes():
                platform = "hf" if repo_type == "huggingface" else "ms"
                return fetch_model_sha256(repo_id, platform, use_mirror=use_mirror, timeout=timeout_seconds)

            executor = ThreadPoolExecutor(max_workers=1)
            try:
                future = executor.submit(fetch_hashes)
                hashes = future.result(timeout=timeout_seconds)
                executor.shutdown(wait=False)
                return (hashes, False)
            except (FutureTimeoutError, Exception):
                executor.shutdown(wait=False)
                return (None, True)

        # Step 1: Fetch remote hashes with timeout and fallback
        official_hashes = None

        if selected_model.repo_type == "huggingface":
            # HF fallback chain: HF → HF-mirror → MS

            # Try 1: HuggingFace (or HF-mirror if already set)
            status = console.status(
                "[dim]Fetching remote hashes from HuggingFace{}...[/dim]".format(" mirror" if use_mirror else "")
            )
            status.start()
            official_hashes, timed_out = fetch_remote_hashes_with_timeout(
                repo_type="huggingface", repo_id=selected_model.repo_id, use_mirror=use_mirror, timeout_seconds=10
            )
            status.stop()

            # Try 2: If timed out and not already using mirror, try HF-mirror
            if timed_out and not use_mirror:
                print_warning("HuggingFace Fetch Timeout (10s)")
                console.print()
                console.print("  [yellow]Auto-switching to HuggingFace mirror:[/yellow] [cyan]hf-mirror.com[/cyan]")
                console.print()

                status = console.status("[dim]Fetching remote hashes from HuggingFace mirror...[/dim]")
                status.start()
                official_hashes, timed_out = fetch_remote_hashes_with_timeout(
                    repo_type="huggingface",
                    repo_id=selected_model.repo_id,
                    use_mirror=True,  # Use mirror
                    timeout_seconds=10,
                )
                status.stop()

            # Try 3: If still timed out, try ModelScope with same repo_id
            if timed_out:
                print_warning("HuggingFace Mirror Timeout (10s)")
                console.print()
                console.print("  [yellow]Fallback to ModelScope mirror with same repo_id...[/yellow]")
                console.print()

                status = console.status("[dim]Fetching remote hashes from ModelScope...[/dim]")
                status.start()
                official_hashes, timed_out = fetch_remote_hashes_with_timeout(
                    repo_type="modelscope",
                    repo_id=selected_model.repo_id,  # Use same repo_id
                    use_mirror=False,
                    timeout_seconds=10,
                )
                status.stop()

                if official_hashes:
                    # Success with ModelScope
                    console.print("  [green]✓ Successfully fetched from ModelScope[/green]")
                    console.print()
                elif timed_out:
                    # All failed
                    print_error("All sources timed out (HuggingFace and ModelScope)")
                    console.print()
                    console.print("  Please check your network connection or try again later")
                    console.print()
                    continue

        elif selected_model.repo_type == "modelscope":
            # ModelScope: no fallback, just timeout
            status = console.status("[dim]Fetching remote hashes from ModelScope...[/dim]")
            status.start()
            official_hashes, timed_out = fetch_remote_hashes_with_timeout(
                repo_type="modelscope", repo_id=selected_model.repo_id, use_mirror=False, timeout_seconds=10
            )
            status.stop()

            if timed_out:
                print_error("ModelScope Fetch Timeout (10s)")
                console.print()
                console.print("  Please check your network connection or try again later")
                console.print()
                continue

        # Check if we successfully fetched remote hashes
        if not official_hashes:
            # Already printed error message above, skip to next model
            continue

        # Success - print confirmation
        console.print(f"  [green]✓ Fetched {len(official_hashes)} file hashes from remote[/green]")
        console.print()

        # Step 2 & 3: Calculate local SHA256 and compare (with Progress bar)
        from kt_kernel.cli.utils.model_verifier import calculate_local_sha256

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            TimeElapsedColumn(),
            console=console,
        ) as progress:
            # Step 2: Calculate local SHA256 hashes (no timeout)
            local_dir_path = Path(selected_model.path)

            # Determine which files to hash
            if files_to_verify:
                # Only hash files that need re-verification
                clean_filenames = {
                    Path(f.replace(" (missing)", "").replace(" (hash mismatch)", "").strip()).name
                    for f in files_to_verify
                }
                # Collect files matching *.safetensors, *.json, *.py
                files_to_hash = []
                for pattern in ["*.safetensors", "*.json", "*.py"]:
                    files_to_hash.extend(
                        [f for f in local_dir_path.glob(pattern) if f.is_file() and f.name in clean_filenames]
                    )
            else:
                # Collect all important files: *.safetensors, *.json, *.py
                files_to_hash = []
                for pattern in ["*.safetensors", "*.json", "*.py"]:
                    files_to_hash.extend([f for f in local_dir_path.glob(pattern) if f.is_file()])

            total_files = len(files_to_hash)

            # Create progress task for local hashing
            hash_task_id = progress.add_task("[yellow]Calculating local SHA256...", total=total_files)
            completed_count = [0]

            def local_hash_callback(msg: str):
                if "Using" in msg and "workers" in msg:
                    # Show parallel worker info
                    console.print(f"  [dim]{msg}[/dim]")
                elif "[" in msg and "/" in msg and "]" in msg:
                    # Progress update
                    completed_count[0] += 1
                    if "✓" in msg:
                        filename = msg.split("✓")[1].strip().split("(")[0].strip()
                        progress.update(hash_task_id, advance=1, description=f"[yellow]Hashing: {filename[:40]}...")

            local_hashes = calculate_local_sha256(
                local_dir_path,
                "*.safetensors",
                progress_callback=local_hash_callback,
                files_list=files_to_hash if files_to_verify else None,
            )

            progress.remove_task(hash_task_id)
            console.print(f"  [green]✓ Calculated {len(local_hashes)} local file hashes[/green]")

            # Step 3: Compare hashes
            # If re-verifying specific files, only compare those files
            if files_to_verify:
                # Build set of clean filenames to verify
                clean_verify_filenames = {
                    Path(f.replace(" (missing)", "").replace(" (hash mismatch)", "").strip()).name
                    for f in files_to_verify
                }
                # Filter official_hashes to only include files we're re-verifying
                hashes_to_compare = {
                    filename: hash_value
                    for filename, hash_value in official_hashes.items()
                    if Path(filename).name in clean_verify_filenames
                }
            else:
                # First-time verification: compare all files
                hashes_to_compare = official_hashes

            compare_task_id = progress.add_task("[blue]Comparing hashes...", total=len(hashes_to_compare))

            files_failed = []
            files_missing = []
            files_passed = 0

            for filename, official_hash in hashes_to_compare.items():
                file_basename = Path(filename).name

                # Find matching local file
                local_hash = None
                for local_file, local_hash_value in local_hashes.items():
                    if Path(local_file).name == file_basename:
                        local_hash = local_hash_value
                        break

                if local_hash is None:
                    files_missing.append(filename)
                    if verbose:
                        console.print(f"  [red]✗ {file_basename} (missing)[/red]")
                elif local_hash.lower() != official_hash.lower():
                    files_failed.append(f"{filename} (hash mismatch)")
                    if verbose:
                        console.print(f"  [red]✗ {file_basename} (hash mismatch)[/red]")
                else:
                    files_passed += 1
                    if verbose:
                        console.print(f"  [green]✓ {file_basename}[/green]")

                progress.update(compare_task_id, advance=1)

            progress.remove_task(compare_task_id)

            # Build result
            total_checked = len(hashes_to_compare)  # Use actual compared count
            if files_failed or files_missing:
                all_failed = files_failed + [f"{f} (missing)" for f in files_missing]
                result = {
                    "status": "failed",
                    "files_checked": total_checked,
                    "files_passed": files_passed,
                    "files_failed": all_failed,
                }
            else:
                result = {
                    "status": "passed",
                    "files_checked": total_checked,
                    "files_passed": files_passed,
                    "files_failed": [],
                }

        # Update registry status and display results
        if result["status"] == "passed":
            registry.update_model(selected_model.name, {"sha256_status": "passed"})
            console.print()
            print_success(t("model_verify_passed"))
            console.print()
            console.print(f"  ✓ Files checked: [bold green]{result['files_checked']}[/bold green]")
            console.print(f"  ✓ All files passed SHA256 verification")
            console.print()
        elif result["status"] == "failed":
            registry.update_model(selected_model.name, {"sha256_status": "failed"})
            console.print()
            print_error(f"Verification failed! {len(result['files_failed'])} file(s) have issues")
            console.print()
            console.print(f"  Total files: {result['files_checked']}")
            console.print(f"  ✓ Passed: [green]{result['files_passed']}[/green]")
            console.print(f"  ✗ Failed: [red]{len(result['files_failed'])}[/red]")
            console.print()

            # Show failed files (only if not already shown in verbose mode)
            if not verbose:
                console.print("  [bold red]Failed files:[/bold red]")
                for failed_file in result["files_failed"]:
                    console.print(f"    ✗ {failed_file}")
                console.print()

            # Ask if user wants to repair
            if Confirm.ask("[yellow]Do you want to repair (re-download) the failed files?[/yellow]", default=True):
                console.print()
                print_info("Repairing failed files...")

                # Extract clean filenames by removing status suffixes
                files_to_download = [
                    f.replace(" (missing)", "").replace(" (hash mismatch)", "").strip() for f in result["files_failed"]
                ]

                # Download each failed file
                success_count = 0

                # Set mirror for downloads if needed
                import os

                original_hf_endpoint = os.environ.get("HF_ENDPOINT")
                if use_mirror and selected_model.repo_type == "huggingface" and not original_hf_endpoint:
                    os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
                    console.print(f"  [dim]Using HuggingFace mirror for downloads[/dim]")

                try:
                    for file_to_repair in files_to_download:
                        console.print(f"  Repairing: [cyan]{file_to_repair}[/cyan]")

                        # Step 1: Delete the corrupted/missing file if it exists
                        local_file_path = Path(selected_model.path) / file_to_repair
                        if local_file_path.exists():
                            try:
                                local_file_path.unlink()
                                console.print(f"    [dim]✓ Deleted corrupted file[/dim]")
                            except Exception as e:
                                console.print(f"    [yellow]⚠ Could not delete file: {e}[/yellow]")

                        # Step 2: Download the fresh file
                        if selected_model.repo_type == "huggingface":
                            # Use hf_hub_download for HuggingFace (inherits HF_ENDPOINT env var)
                            try:
                                from huggingface_hub import hf_hub_download

                                hf_hub_download(
                                    repo_id=selected_model.repo_id,
                                    filename=file_to_repair,
                                    local_dir=selected_model.path,
                                    local_dir_use_symlinks=False,
                                )
                                console.print(f"    [green]✓ Downloaded successfully[/green]")
                                success_count += 1
                            except ImportError:
                                print_error("huggingface_hub not installed. Install: pip install huggingface_hub")
                                break
                            except Exception as e:
                                console.print(f"    [red]✗ Download failed: {e}[/red]")
                        else:
                            # Use modelscope download for ModelScope
                            try:
                                from modelscope.hub.snapshot_download import snapshot_download

                                # Download directly to local_dir
                                snapshot_download(
                                    model_id=selected_model.repo_id,
                                    local_dir=selected_model.path,
                                    allow_file_pattern=file_to_repair,
                                )
                                console.print(f"    [green]✓ Downloaded successfully[/green]")
                                success_count += 1
                            except ImportError:
                                print_error("modelscope not installed. Install: pip install modelscope")
                                break
                            except Exception as e:
                                console.print(f"    [red]✗ Download failed: {e}[/red]")
                finally:
                    # Restore original HF_ENDPOINT
                    if use_mirror and selected_model.repo_type == "huggingface" and not original_hf_endpoint:
                        os.environ.pop("HF_ENDPOINT", None)
                    elif original_hf_endpoint:
                        os.environ["HF_ENDPOINT"] = original_hf_endpoint

                console.print()
                if success_count > 0:
                    print_success(f"Repaired {success_count}/{len(files_to_download)} files")
                    console.print()

                    # Ask if user wants to re-verify
                    if Confirm.ask("Re-verify the model now?", default=True):
                        # Re-verify by continuing the loop with the same model
                        # Only verify the files that were repaired
                        name = selected_model.name
                        files_to_verify = files_to_download
                        continue


@app.command(name="verify-all")
def verify_all_models() -> None:
    """Verify all models with repo configuration (not yet implemented)."""
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry

    registry = UserModelRegistry()
    models = registry.list_models()

    # Filter models with repo configuration
    models_with_repo = [m for m in models if m.repo_id]

    if not models_with_repo:
        print_warning(t("model_verify_all_no_repos"))
        console.print()
        console.print(f"  {t('model_verify_all_config_hint')} [cyan]kt model edit <name>[/cyan]")
        console.print()
        return

    console.print()
    print_warning(t("model_verify_not_implemented"))
    console.print()
    console.print(f"  {t('model_verify_all_found', count=len(models_with_repo))}")
    console.print()

    for model in models_with_repo:
        console.print(f"  • {model.name} ({model.repo_type}:{model.repo_id})")

    console.print()
    console.print(f"  [dim]{t('model_verify_future_note')}[/dim]")
    console.print()
    console.print(f"  {t('model_verify_all_manual_hint')} [cyan]kt model verify <name>[/cyan]")
    console.print()


@app.command(name="auto-repo")
def auto_detect_repo(
    apply: bool = typer.Option(
        False, "--apply", "-a", help="Automatically apply detected repo information without confirmation"
    ),
    dry_run: bool = typer.Option(
        False, "--dry-run", "-d", help="Show what would be detected without making any changes"
    ),
) -> None:
    """
    Auto-detect repository information from model README.md files.

    Scans all models without repo_id (safetensors/gguf only) and attempts to
    extract repository information from README.md metadata (license_link field).

    Examples:
        kt model auto-repo              # Scan and ask for confirmation
        kt model auto-repo --apply      # Scan and apply automatically
        kt model auto-repo --dry-run    # Scan only, no changes
    """
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
    from kt_kernel.cli.utils.repo_detector import scan_models_for_repo, format_detection_report, apply_detection_results
    from rich.table import Table

    console.print()
    print_info("Scanning models for repository information...")
    console.print()

    # Get all models
    registry = UserModelRegistry()
    models = registry.list_models()

    if not models:
        print_warning("No models found in registry")
        console.print()
        return

    # Scan for repo information
    print_step("Analyzing README.md files...")
    results = scan_models_for_repo(models)

    # Show results
    console.print()

    if not results["detected"] and not results["not_detected"]:
        print_info("All models already have repository information configured")
        console.print()
        return

    # Create results table
    if results["detected"]:
        console.print("[bold green]✓ Detected Repository Information[/bold green]")
        console.print()

        table = Table(show_header=True, header_style="bold cyan")
        table.add_column("Model Name", style="yellow")
        table.add_column("Repository", style="cyan")
        table.add_column("Type", style="magenta")

        for model, repo_id, repo_type in results["detected"]:
            table.add_row(model.name, repo_id, repo_type)

        console.print(table)
        console.print()

    if results["not_detected"]:
        console.print(
            f"[bold yellow]✗ No Repository Information Found ({len(results['not_detected'])} models)[/bold yellow]"
        )
        console.print()

        for model in results["not_detected"][:5]:  # Show first 5
            console.print(f"  • {model.name}")

        if len(results["not_detected"]) > 5:
            console.print(f"  ... and {len(results['not_detected']) - 5} more")

        console.print()

    if results["skipped"]:
        console.print(
            f"[dim]⊘ Skipped {len(results['skipped'])} models (already configured or not safetensors/gguf)[/dim]"
        )
        console.print()

    # Summary
    console.print("[bold]Summary:[/bold]")
    console.print(f"  • [green]{len(results['detected'])}[/green] detected")
    console.print(f"  • [yellow]{len(results['not_detected'])}[/yellow] not detected")
    console.print(f"  • [dim]{len(results['skipped'])}[/dim] skipped")
    console.print()

    # Exit if dry run or no detections
    if dry_run:
        print_info("Dry run mode - no changes made")
        console.print()
        return

    if not results["detected"]:
        console.print()
        return

    # Ask for confirmation (unless --apply flag)
    if not apply:
        console.print()
        if not confirm(f"Apply repository information to {len(results['detected'])} model(s)?", default=False):
            print_warning("Cancelled - no changes made")
            console.print()
            return

    # Apply changes
    console.print()
    print_step("Applying changes...")

    updated_count = apply_detection_results(results, registry)

    console.print()
    if updated_count > 0:
        print_success(f"✓ Updated {updated_count} model(s) with repository information")
        console.print()
        console.print("  You can now:")
        console.print("  • Run [cyan]kt model verify <name>[/cyan] to verify model integrity")
        console.print("  • Check status with [cyan]kt model list[/cyan]")
        console.print()
    else:
        print_error("Failed to update models")
        console.print()


================================================
FILE: kt-kernel/python/cli/commands/quant.py
================================================
"""
Quant command for kt-cli.

Quantizes model weights for CPU inference.
"""

import subprocess
import sys
from enum import Enum
from pathlib import Path
from typing import Optional

import typer

from kt_kernel.cli.config.settings import get_settings
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import (
    confirm,
    console,
    create_progress,
    print_error,
    print_info,
    print_step,
    print_success,
    print_warning,
)
from kt_kernel.cli.utils.environment import detect_cpu_info


class QuantMethod(str, Enum):
    """Quantization method."""

    INT4 = "int4"
    INT8 = "int8"


def quant(
    model: Optional[str] = typer.Argument(
        None,
        help="Model name or path to quantize",
    ),
    method: Optional[QuantMethod] = typer.Option(
        None,
        "--method",
        "-m",
        help="Quantization method",
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Output path for quantized weights",
    ),
    input_type: Optional[str] = typer.Option(
        None,
        "--input-type",
        "-i",
        help="Input weight type (fp8, fp16, bf16)",
    ),
    cpu_threads: Optional[int] = typer.Option(
        None,
        "--cpu-threads",
        help="Number of CPU threads for quantization",
    ),
    numa_nodes: Optional[int] = typer.Option(
        None,
        "--numa-nodes",
        help="Number of NUMA nodes",
    ),
    no_merge: bool = typer.Option(
        False,
        "--no-merge",
        help="Don't merge safetensor files",
    ),
    gpu: bool = typer.Option(
        False,
        "--gpu",
        help="Use GPU for conversion (faster)",
    ),
    yes: bool = typer.Option(
        False,
        "--yes",
        "-y",
        help="Skip confirmation prompts",
    ),
) -> None:
    """Quantize model weights for CPU inference.

    If no model is specified, interactive mode will be activated.
    """
    settings = get_settings()

    # Check if we should use interactive mode
    # Interactive mode triggers when: no model, or missing critical parameters
    needs_interactive = model is None or method is None or cpu_threads is None or numa_nodes is None
    is_interactive = False

    if needs_interactive and sys.stdin.isatty():
        # Use interactive configuration (includes verification in Step 1.5)
        from kt_kernel.cli.utils.quant_interactive import interactive_quant_config

        console.print()
        console.print(f"[bold cyan]═══ {t('quant_interactive_title')} ═══[/bold cyan]")
        console.print()
        console.print(f"[yellow]{t('quant_new_model_notice')}[/yellow]")
        console.print()

        config = interactive_quant_config()
        if config is None:
            # User cancelled
            raise typer.Exit(0)

        # Extract configuration
        model_obj = config["model"]
        model = model_obj.id
        input_path = Path(model_obj.path)
        method = QuantMethod(config["method"])
        input_type = config["input_type"]
        cpu_threads = config["cpu_threads"]
        numa_nodes = config["numa_nodes"]
        output = config["output_path"]
        gpu = config["use_gpu"]
        is_interactive = True

        console.print()
        print_success(t("quant_config_complete"))
        console.print()
    else:
        # Non-interactive mode - require model parameter
        if model is None:
            print_error("Model argument is required in non-interactive mode")
            console.print()
            console.print("Usage: kt quant <model>")
            console.print("   Or: kt quant  (for interactive mode)")
            raise typer.Exit(1)

        # Set defaults for optional parameters
        method = method or QuantMethod.INT4
        input_type = input_type or "fp8"

        console.print()

        # Resolve input path
        input_path = _resolve_input_path(model, settings)
        if input_path is None:
            print_error(t("quant_input_not_found", path=model))
            raise typer.Exit(1)

        # Pre-quantization verification (only in non-interactive mode)
        # Interactive mode already did verification in interactive_quant_config()
        from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
        from kt_kernel.cli.utils.model_verifier import pre_operation_verification

        user_registry = UserModelRegistry()
        user_model_obj = user_registry.find_by_path(str(input_path))

        if user_model_obj and user_model_obj.format == "safetensors":
            pre_operation_verification(user_model_obj, user_registry, operation_name="quantizing")

    # Get user model info for both modes (needed later for registering quantized model)
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry

    user_registry = UserModelRegistry()
    user_model_obj = user_registry.find_by_path(str(input_path))

    # Validate that it's a MoE model (not AMX or GGUF)
    from kt_kernel.cli.commands.model import is_amx_weights

    # Check if it's AMX (already quantized)
    is_amx, _ = is_amx_weights(str(input_path))
    if is_amx:
        print_error("Cannot quantize AMX models (already quantized)")
        console.print()
        console.print(f"  The model at {input_path} is already in AMX format.")
        raise typer.Exit(1)

    # Check if it's a MoE model
    from kt_kernel.cli.utils.analyze_moe_model import analyze_moe_model

    moe_result = None  # Store for later use when registering quantized model
    try:
        moe_result = analyze_moe_model(str(input_path), use_cache=True)
        if not moe_result or not moe_result.get("is_moe"):
            print_error("Only MoE models can be quantized to AMX format")
            console.print()
            console.print(f"  The model at {input_path} is not a MoE model.")
            console.print("  AMX quantization is designed for MoE models (e.g., DeepSeek-V3).")
            raise typer.Exit(1)
    except Exception as e:
        print_warning(f"Could not detect MoE information: {e}")
        console.print()
        if not yes:
            if not confirm("Continue quantization anyway?", default=False):
                raise typer.Exit(1)

    # Detect CPU configuration and resolve output path (only needed in non-interactive mode)
    if not is_interactive:
        print_info(t("quant_input_path", path=str(input_path)))

        # Detect CPU configuration (needed for output path)
        cpu = detect_cpu_info()
        final_cpu_threads = cpu_threads or cpu.cores
        final_numa_nodes = numa_nodes or cpu.numa_nodes

        # Resolve output path
        if output is None:
            # Priority: paths.weights > paths.models[0] > model's parent directory
            weights_dir = settings.weights_dir

            if weights_dir and weights_dir.exists():
                # Use configured weights directory (highest priority)
                output = weights_dir / f"{input_path.name}-AMX{method.value.upper()}-NUMA{final_numa_nodes}"
            else:
                # Use first model storage path
                model_paths = settings.get_model_paths()
                if model_paths and model_paths[0].exists():
                    output = model_paths[0] / f"{input_path.name}-AMX{method.value.upper()}-NUMA{final_numa_nodes}"
                else:
                    # Fallback to model's parent directory
                    output = input_path.parent / f"{input_path.name}-AMX{method.value.upper()}-NUMA{final_numa_nodes}"

        print_info(t("quant_output_path", path=str(output)))
        print_info(t("quant_method", method=method.value.upper()))
        print_info(t("quant_cpu_threads", threads=final_cpu_threads))
        print_info(t("quant_numa_nodes", nodes=final_numa_nodes))

        # Calculate space requirements
        console.print()
        console.print(f"[bold cyan]{t('quant_disk_analysis')}[/bold cyan]")
        console.print()

        # Calculate source model size
        try:
            total_bytes = sum(f.stat().st_size for f in input_path.glob("*.safetensors") if f.is_file())
            source_size_gb = total_bytes / (1024**3)
        except Exception:
            source_size_gb = 0.0

        # Estimate quantized size
        input_bits = {"fp8": 8, "fp16": 16, "bf16": 16}
        quant_bits = {"int4": 4, "int8": 8}
        input_bit = input_bits.get(input_type, 16)
        quant_bit = quant_bits.get(method.value, 4)
        ratio = quant_bit / input_bit
        estimated_size_gb = source_size_gb * ratio

        # Check available space
        import shutil

        try:
            check_path = output.parent if not output.exists() else output
            while not check_path.exists() and check_path != check_path.parent:
                check_path = check_path.parent
            stat = shutil.disk_usage(check_path)
            available_gb = stat.free / (1024**3)
        except Exception:
            available_gb = 0.0

        is_sufficient = available_gb >= (estimated_size_gb * 1.2)

        console.print(f"  {t('quant_source_size'):<26} {source_size_gb:.2f} GB")
        console.print(f"  {t('quant_estimated_size'):<26} {estimated_size_gb:.2f} GB")
        console.print(f"  {t('quant_available_space'):<26} {available_gb:.2f} GB")
        console.print()

        if not is_sufficient:
            required_with_buffer = estimated_size_gb * 1.2
            print_warning(t("quant_insufficient_space"))
            console.print()
            console.print(f"  {t('quant_required_space'):<26} {required_with_buffer:.2f} GB")
            console.print(f"  {t('quant_available_space'):<26} {available_gb:.2f} GB")
            console.print(f"  {t('quant_shortage'):<26} {required_with_buffer - available_gb:.2f} GB")
            console.print()
            console.print(f"  {t('quant_may_fail')}")
            console.print()

            if not yes:
                if not confirm(t("quant_continue_anyway"), default=False):
                    raise typer.Abort()
            console.print()

        # Check if output exists and generate unique name
        if output.exists():
            print_warning(t("quant_output_exists", path=str(output)))
            console.print()

            # Generate unique name by adding suffix
            original_name = output.name
            parent_dir = output.parent
            counter = 2

            while output.exists():
                new_name = f"{original_name}-{counter}"
                output = parent_dir / new_name
                counter += 1

            print_success(t("quant_using_unique", path=str(output)))
            console.print()

        # Confirm (only show if not using --yes flag)
        if not yes:
            console.print()
            print_warning(t("quant_time_warning"))
            console.print()

            if not confirm(t("prompt_continue")):
                raise typer.Abort()
    else:
        # Interactive mode: cpu_threads and numa_nodes already set
        final_cpu_threads = cpu_threads
        final_numa_nodes = numa_nodes

    # Find conversion script
    kt_kernel_path = _find_kt_kernel_path()
    if kt_kernel_path is None:
        print_error("kt-kernel not found. Install with: kt install inference")
        raise typer.Exit(1)

    script_path = kt_kernel_path / "scripts" / "convert_cpu_weights.py"
    if not script_path.exists():
        print_error(f"Conversion script not found: {script_path}")
        raise typer.Exit(1)

    # Build command
    cmd = [
        sys.executable,
        str(script_path),
        "--input-path",
        str(input_path),
        "--input-type",
        input_type,
        "--output",
        str(output),
        "--quant-method",
        method.value,
        "--cpuinfer-threads",
        str(final_cpu_threads),
        "--threadpool-count",
        str(final_numa_nodes),
    ]

    if no_merge:
        cmd.append("--no-merge-safetensor")

    if gpu:
        cmd.append("--gpu")

    # Run quantization
    console.print()
    print_step(t("quant_starting"))
    console.print()
    console.print(f"[dim]$ {' '.join(cmd)}[/dim]")
    console.print()
    console.print("[dim]" + "=" * 80 + "[/dim]")
    console.print()

    try:
        # Run with real-time stdout/stderr output
        import os
        import time

        env = os.environ.copy()
        env["PYTHONUNBUFFERED"] = "1"  # Disable Python output buffering

        # Record start time
        start_time = time.time()

        process = subprocess.run(
            cmd,
            stdout=None,  # Inherit parent's stdout (real-time output)
            stderr=None,  # Inherit parent's stderr (real-time output)
            env=env,
        )

        # Calculate elapsed time
        elapsed_time = time.time() - start_time
        hours = int(elapsed_time // 3600)
        minutes = int((elapsed_time % 3600) // 60)
        seconds = int(elapsed_time % 60)

        console.print()
        console.print("[dim]" + "=" * 80 + "[/dim]")
        console.print()

        if process.returncode == 0:
            print_success(t("quant_complete"))
            console.print()

            # Display elapsed time
            if hours > 0:
                time_str = f"{hours}h {minutes}m {seconds}s"
            elif minutes > 0:
                time_str = f"{minutes}m {seconds}s"
            else:
                time_str = f"{seconds}s"
            console.print(f"  [cyan]{t('quant_time_elapsed')} {time_str}[/cyan]")
            console.print()
            console.print(f"  Quantized weights saved to: {output}")
            console.print()

            # Auto-register the quantized model
            try:
                from kt_kernel.cli.utils.user_model_registry import UserModel

                # Generate model name from output path
                base_name = output.name
                suggested_name = user_registry.suggest_name(base_name)

                # Determine MoE information and source model name
                if user_model_obj:
                    is_moe_val = user_model_obj.is_moe
                    num_experts = user_model_obj.moe_num_experts
                    num_active = user_model_obj.moe_num_experts_per_tok
                    repo_type_val = user_model_obj.repo_type
                    repo_id_val = user_model_obj.repo_id
                    source_model_name = user_model_obj.name  # Store source model name
                elif moe_result:
                    is_moe_val = moe_result.get("is_moe", True)
                    num_experts = moe_result.get("num_experts")
                    num_active = moe_result.get("num_experts_per_tok")
                    repo_type_val = None
                    repo_id_val = None
                    source_model_name = input_path.name  # Use folder name as fallback
                else:
                    is_moe_val = None
                    num_experts = None
                    num_active = None
                    repo_type_val = None
                    repo_id_val = None
                    source_model_name = input_path.name  # Use folder name as fallback

                # Create new model entry (AMX format uses "safetensors" format, detected by is_amx_weights())
                new_model = UserModel(
                    name=suggested_name,
                    path=str(output),
                    format="safetensors",  # AMX files are safetensors format
                    repo_type=repo_type_val,
                    repo_id=repo_id_val,
                    sha256_status="not_checked",  # AMX weights don't need verification
                    # Inherit MoE information from source model
                    is_moe=is_moe_val,
                    moe_num_experts=num_experts,
                    moe_num_experts_per_tok=num_active,
                    # AMX quantization metadata
                    amx_source_model=source_model_name,
                    amx_quant_method=method.value,  # "int4" or "int8"
                    amx_numa_nodes=final_numa_nodes,
                )

                user_registry.add_model(new_model)
                console.print()
                print_success(t("quant_registered", name=suggested_name))
                console.print()
                console.print(f"  {t('quant_view_with')} [cyan]kt model list[/cyan]")
                console.print(f"  {t('quant_use_with')}  [cyan]kt run {suggested_name}[/cyan]")
                console.print()
            except Exception as e:
                # Non-fatal error - quantization succeeded but registration failed
                console.print()
                print_warning(t("quant_register_failed", error=str(e)))
                console.print()
                console.print(f"  {t('quant_use_with')}")
                console.print(f"    kt run {model} --weights-path {output}")
                console.print()
        else:
            print_error(f"Quantization failed with exit code {process.returncode}")
            raise typer.Exit(process.returncode)

    except FileNotFoundError as e:
        print_error(f"Failed to run quantization: {e}")
        raise typer.Exit(1)
    except KeyboardInterrupt:
        console.print()
        print_warning("Quantization interrupted.")
        raise typer.Exit(130)


def _resolve_input_path(model: str, settings) -> Optional[Path]:
    """Resolve the input model path."""
    # Check if it's already a path
    path = Path(model)
    if path.exists() and (path / "config.json").exists():
        return path

    # Search in models directory
    from kt_kernel.cli.utils.model_registry import get_registry

    registry = get_registry()
    matches = registry.search(model)

    if matches:
        model_info = matches[0]
        # Try to find in all configured model directories
        model_paths = settings.get_model_paths()

        for models_dir in model_paths:
            possible_paths = [
                models_dir / model_info.name,
                models_dir / model_info.name.lower(),
                models_dir / model_info.hf_repo.split("/")[-1],
            ]

            for p in possible_paths:
                if p.exists() and (p / "config.json").exists():
                    return p

    return None


def _find_kt_kernel_path() -> Optional[Path]:
    """Find the kt-kernel installation path."""
    try:
        import kt_kernel

        return Path(kt_kernel.__file__).parent.parent
    except ImportError:
        pass

    # Check common locations
    possible_paths = [
        Path.home() / "Projects" / "ktransformers" / "kt-kernel",
        Path.cwd().parent / "kt-kernel",
        Path.cwd() / "kt-kernel",
    ]

    for path in possible_paths:
        if path.exists() and (path / "scripts").exists():
            return path

    return None


================================================
FILE: kt-kernel/python/cli/commands/run.py
================================================
"""
Run command for kt-cli.

Starts the model inference server using SGLang + kt-kernel.
"""

import os
import subprocess
import sys
from pathlib import Path
from typing import Optional

import click
import typer

from kt_kernel.cli.config.settings import get_settings
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import (
    confirm,
    console,
    print_api_info,
    print_error,
    print_info,
    print_server_info,
    print_step,
    print_success,
    print_warning,
    prompt_choice,
)
from kt_kernel.cli.utils.environment import detect_cpu_info, detect_gpus, detect_ram_gb
from kt_kernel.cli.utils.user_model_registry import UserModelRegistry


@click.command(
    context_settings={"ignore_unknown_options": True, "allow_extra_args": True},
    add_help_option=False,  # We'll handle help manually to avoid conflicts
)
@click.argument("model", required=False, default=None)
@click.option("--host", "-H", default=None, help="Server host address")
@click.option("--port", "-p", type=int, default=None, help="Server port")
@click.option("--gpu-experts", type=int, default=None, help="Number of GPU experts per layer")
@click.option("--cpu-threads", type=int, default=None, help="Number of CPU inference threads")
@click.option("--numa-nodes", type=int, default=None, help="Number of NUMA nodes")
@click.option(
    "--tensor-parallel-size", "--tp", "tensor_parallel_size", type=int, default=None, help="Tensor parallel size"
)
@click.option("--model-path", type=click.Path(), default=None, help="Custom model path")
@click.option("--weights-path", type=click.Path(), default=None, help="Custom quantized weights path")
@click.option("--kt-method", default=None, help="KT quantization method")
@click.option(
    "--kt-gpu-prefill-threshold", "kt_gpu_prefill_threshold", type=int, default=None, help="GPU prefill token threshold"
)
@click.option("--attention-backend", default=None, help="Attention backend")
@click.option("--max-total-tokens", "max_total_tokens", type=int, default=None, help="Maximum total tokens")
@click.option("--max-running-requests", "max_running_requests", type=int, default=None, help="Maximum running requests")
@click.option("--chunked-prefill-size", "chunked_prefill_size", type=int, default=None, help="Chunked prefill size")
@click.option("--mem-fraction-static", "mem_fraction_static", type=float, default=None, help="Memory fraction static")
@click.option("--watchdog-timeout", "watchdog_timeout", type=int, default=None, help="Watchdog timeout")
@click.option("--served-model-name", "served_model_name", default=None, help="Served model name")
@click.option(
    "--disable-shared-experts-fusion",
    "disable_shared_experts_fusion",
    is_flag=True,
    default=None,
    help="Disable shared experts fusion",
)
@click.option(
    "--enable-shared-experts-fusion",
    "enable_shared_experts_fusion",
    is_flag=True,
    default=False,
    help="Enable shared experts fusion",
)
@click.option("--quantize", "-q", is_flag=True, default=False, help="Quantize model")
@click.option("--advanced", is_flag=True, default=False, help="Show advanced options")
@click.option("--dry-run", "dry_run", is_flag=True, default=False, help="Show command without executing")
@click.pass_context
def run(
    ctx: click.Context,
    model: Optional[str],
    host: Optional[str],
    port: Optional[int],
    gpu_experts: Optional[int],
    cpu_threads: Optional[int],
    numa_nodes: Optional[int],
    tensor_parallel_size: Optional[int],
    model_path: Optional[str],
    weights_path: Optional[str],
    kt_method: Optional[str],
    kt_gpu_prefill_threshold: Optional[int],
    attention_backend: Optional[str],
    max_total_tokens: Optional[int],
    max_running_requests: Optional[int],
    chunked_prefill_size: Optional[int],
    mem_fraction_static: Optional[float],
    watchdog_timeout: Optional[int],
    served_model_name: Optional[str],
    disable_shared_experts_fusion: Optional[bool],
    enable_shared_experts_fusion: bool,
    quantize: bool,
    advanced: bool,
    dry_run: bool,
) -> None:
    """Start model inference server.

    \b
    Examples: kt run deepseek-v3 | kt run m2 --tensor-parallel-size 2 | kt run /path/to/model --gpu-experts 4

    \b
    Custom Options: Pass any SGLang server option directly (e.g., kt run m2 --fp8-gemm-backend triton).
    Common: --fp8-gemm-backend, --tool-call-parser, --reasoning-parser, --dp-size, --enable-ma
    For full list: python -m sglang.launch_server --help
    """
    # Handle --help manually since we disabled it
    # Check sys.argv for --help or -h since ctx.args may not be set yet
    if "--help" in sys.argv or "-h" in sys.argv:
        click.echo(ctx.get_help())
        return

    # Handle disable/enable shared experts fusion flags
    if enable_shared_experts_fusion:
        disable_shared_experts_fusion = False

    # Convert Path objects from click
    model_path_obj = Path(model_path) if model_path else None
    weights_path_obj = Path(weights_path) if weights_path else None

    # Get extra args that weren't parsed (unknown options)
    # click stores these in ctx.args when ignore_unknown_options=True
    extra_cli_args = list(ctx.args) if ctx.args else []

    # Remove --help from extra args if present (already handled)
    extra_cli_args = [arg for arg in extra_cli_args if arg not in ["--help", "-h"]]

    # Call the actual run function implementation
    _run_impl(
        model=model,
        host=host,
        port=port,
        gpu_experts=gpu_experts,
        cpu_threads=cpu_threads,
        numa_nodes=numa_nodes,
        tensor_parallel_size=tensor_parallel_size,
        model_path=model_path_obj,
        weights_path=weights_path_obj,
        kt_method=kt_method,
        kt_gpu_prefill_threshold=kt_gpu_prefill_threshold,
        attention_backend=attention_backend,
        max_total_tokens=max_total_tokens,
        max_running_requests=max_running_requests,
        chunked_prefill_size=chunked_prefill_size,
        mem_fraction_static=mem_fraction_static,
        watchdog_timeout=watchdog_timeout,
        served_model_name=served_model_name,
        disable_shared_experts_fusion=disable_shared_experts_fusion,
        quantize=quantize,
        advanced=advanced,
        dry_run=dry_run,
        extra_cli_args=extra_cli_args,
    )


def _run_impl(
    model: Optional[str],
    host: Optional[str],
    port: Optional[int],
    gpu_experts: Optional[int],
    cpu_threads: Optional[int],
    numa_nodes: Optional[int],
    tensor_parallel_size: Optional[int],
    model_path: Optional[Path],
    weights_path: Optional[Path],
    kt_method: Optional[str],
    kt_gpu_prefill_threshold: Optional[int],
    attention_backend: Optional[str],
    max_total_tokens: Optional[int],
    max_running_requests: Optional[int],
    chunked_prefill_size: Optional[int],
    mem_fraction_static: Optional[float],
    watchdog_timeout: Optional[int],
    served_model_name: Optional[str],
    disable_shared_experts_fusion: Optional[bool],
    quantize: bool,
    advanced: bool,
    dry_run: bool,
    extra_cli_args: list[str],
) -> None:
    """Actual implementation of run command."""
    # Check if SGLang is installed before proceeding
    from kt_kernel.cli.utils.sglang_checker import (
        check_sglang_installation,
        check_sglang_kt_kernel_support,
        print_sglang_install_instructions,
        print_sglang_kt_kernel_instructions,
    )

    sglang_info = check_sglang_installation()
    if not sglang_info["installed"]:
        console.print()
        print_error(t("sglang_not_found"))
        console.print()
        print_sglang_install_instructions()
        raise typer.Exit(1)

    # Check if SGLang supports kt-kernel (has --kt-gpu-prefill-token-threshold parameter)
    kt_kernel_support = check_sglang_kt_kernel_support()
    if not kt_kernel_support["supported"]:
        console.print()
        print_error(t("sglang_kt_kernel_not_supported"))
        console.print()
        print_sglang_kt_kernel_instructions()
        raise typer.Exit(1)

    settings = get_settings()
    user_registry = UserModelRegistry()

    # Check if we should use interactive mode
    # Interactive mode triggers when:
    # 1. No model specified, OR
    # 2. Model specified but missing critical parameters (gpu_experts, tensor_parallel_size, etc.)
    use_interactive = False

    if model is None:
        use_interactive = True
    elif (
        gpu_experts is None
        or tensor_parallel_size is None
        or cpu_threads is None
        or numa_nodes is None
        or max_total_tokens is None
    ):
        # Model specified but some parameters missing - use interactive
        use_interactive = True

    if use_interactive and sys.stdin.isatty():
        # Use new interactive configuration flow
        from kt_kernel.cli.utils.run_interactive import interactive_run_config

        console.print()
        console.print("[bold cyan]═══ Interactive Run Configuration ═══[/bold cyan]")
        console.print()

        config = interactive_run_config()
        if config is None:
            # User cancelled
            raise typer.Exit(0)

        # Extract configuration from new format
        user_model_obj = config["model"]
        model = user_model_obj.id
        resolved_model_path = Path(config["model_path"])
        resolved_weights_path = Path(config["weights_path"])

        # Extract parameters
        gpu_experts = config["gpu_experts"]
        cpu_threads = config["cpu_threads"]
        numa_nodes = config["numa_nodes"]
        tensor_parallel_size = config["tp_size"]

        # Get kt-method and other method-specific settings
        kt_method = config["kt_method"]

        # KV cache settings (may be None for non-raw methods)
        max_total_tokens = config.get("kv_cache", 32768)
        chunked_prefill_size = config.get("chunk_prefill", 32768)
        kt_gpu_prefill_threshold = config.get("gpu_prefill_threshold", 500)

        # Memory settings
        mem_fraction_static = config["mem_fraction_static"]

        # Parser settings (optional)
        tool_call_parser = config.get("tool_call_parser")
        reasoning_parser = config.get("reasoning_parser")

        # Server settings
        host = config.get("host", "0.0.0.0")
        port = config.get("port", 30000)

        # Set CUDA_VISIBLE_DEVICES for selected GPUs
        selected_gpus = config["selected_gpus"]
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gpu_id) for gpu_id in selected_gpus)

        # Detect hardware for parameter resolution (needed for resolve() function later)
        gpus = detect_gpus()
        cpu = detect_cpu_info()

        console.print()
        print_info(f"[green]✓[/green] Configuration complete")
        console.print()
    else:
        # Non-interactive mode - use traditional flow
        console.print()

        # Initialize variables that may have been set by interactive mode
        # These will be None in non-interactive mode and will use defaults via resolve()

        # If no model specified, show old interactive selection
        if model is None:
            model = _interactive_model_selection(user_registry, settings)
            if model is None:
                raise typer.Exit(0)

        # Detect hardware (needed for defaults)
        gpus = detect_gpus()
        cpu = detect_cpu_info()
        ram = detect_ram_gb()

        if gpus:
            gpu_info = f"{gpus[0].name} ({gpus[0].vram_gb}GB VRAM)"
            if len(gpus) > 1:
                gpu_info += f" + {len(gpus) - 1} more"
            print_info(t("run_gpu_info", name=gpus[0].name, vram=gpus[0].vram_gb))
        else:
            print_warning(t("doctor_gpu_not_found"))
            gpu_info = "None"

        print_info(t("run_cpu_info", name=cpu.name, cores=cpu.cores, numa=cpu.numa_nodes))
        print_info(t("run_ram_info", total=int(ram)))

        # Step 2: Resolve model
        console.print()
        print_step(t("run_checking_model"))

        user_model_obj = None
        resolved_model_path = model_path

        # Check if model is a path
        if Path(model).exists():
            resolved_model_path = Path(model)
            print_info(t("run_model_path", path=str(resolved_model_path)))

            # Try to find in user registry by path
            user_model_obj = user_registry.find_by_path(str(resolved_model_path))
            if user_model_obj:
                print_info(f"Using registered model: {user_model_obj.name}")
            else:
                print_warning("Using unregistered model path. Consider adding it with 'kt model add'")
        else:
            # Search in user registry by name
            user_model_obj = user_registry.get_model(model)

            if not user_model_obj:
                print_error(t("run_model_not_found", name=model))
                console.print()

                # Show available models
                all_models = user_registry.list_models()
                if all_models:
                    console.print("Available registered models:")
                    for m in all_models[:5]:
                        console.print(f"  - {m.name}")
                    if len(all_models) > 5:
                        console.print(f"  ... and {len(all_models) - 5} more")
                else:
                    console.print("No models registered yet.")

                console.print()
                console.print(f"Add your model with: [cyan]kt model add /path/to/model[/cyan]")
                console.print(f"Or scan for models: [cyan]kt model scan[/cyan]")
                raise typer.Exit(1)

            # Use model path from registry
            resolved_model_path = Path(user_model_obj.path)

            # Verify path exists
            if not resolved_model_path.exists():
                print_error(f"Model path does not exist: {resolved_model_path}")
                console.print()
                console.print(f"Run 'kt model refresh' to check all models")
                raise typer.Exit(1)

            print_info(t("run_model_path", path=str(resolved_model_path)))

        # Step 2.5: Pre-run verification (optional integrity check)
        if user_model_obj and user_model_obj.format == "safetensors":
            from kt_kernel.cli.utils.model_verifier import pre_operation_verification

            pre_operation_verification(user_model_obj, user_registry, operation_name="running")

        # Step 3: Check quantized weights (only if explicitly requested)
        resolved_weights_path = None

        # Only use quantized weights if explicitly specified by user
        if weights_path is not None:
            # User explicitly specified weights path
            resolved_weights_path = weights_path
            if not resolved_weights_path.exists():
                print_error(t("run_weights_not_found"))
                console.print(f"  Path: {resolved_weights_path}")
                raise typer.Exit(1)
            print_info(f"Using quantized weights: {resolved_weights_path}")
        elif quantize:
            # User requested quantization
            console.print()
            print_step(t("run_quantizing"))
            # TODO: Implement quantization
            print_warning("Quantization not yet implemented. Please run 'kt quant' manually.")
            raise typer.Exit(1)
        else:
            # Default: use original precision model without quantization
            console.print()
            print_info("Using original precision model (no quantization)")

    # Step 4: Build command
    # Helper to resolve parameter with fallback chain: CLI > config > default
    def resolve(cli_val, config_key, default):
        if cli_val is not None:
            return cli_val
        config_val = settings.get(config_key)
        return config_val if config_val is not None else default

    # Server configuration
    final_host = resolve(host, "server.host", "0.0.0.0")
    final_port = resolve(port, "server.port", 30000)

    # Tensor parallel size: CLI > config > auto-detect from GPUs
    final_tensor_parallel_size = resolve(
        tensor_parallel_size, "inference.tensor_parallel_size", len(gpus) if gpus else 1
    )

    # CPU/GPU configuration with smart defaults
    total_threads = cpu.threads  # Use logical threads instead of physical cores
    final_cpu_threads = resolve(cpu_threads, "inference.cpu_threads", int(total_threads * 0.8))
    final_numa_nodes = resolve(numa_nodes, "inference.numa_nodes", cpu.numa_nodes)
    final_gpu_experts = resolve(gpu_experts, "inference.gpu_experts", 1)

    # KT-kernel options
    final_kt_method = resolve(kt_method, "inference.kt_method", "AMXINT4")
    final_kt_gpu_prefill_threshold = resolve(kt_gpu_prefill_threshold, "inference.kt_gpu_prefill_token_threshold", 4096)

    # SGLang options
    final_attention_backend = resolve(attention_backend, "inference.attention_backend", "flashinfer")
    final_max_total_tokens = resolve(max_total_tokens, "inference.max_total_tokens", 40000)
    final_max_running_requests = resolve(max_running_requests, "inference.max_running_requests", 32)
    final_chunked_prefill_size = resolve(chunked_prefill_size, "inference.chunked_prefill_size", 4096)
    final_mem_fraction_static = resolve(mem_fraction_static, "inference.mem_fraction_static", 0.98)
    final_watchdog_timeout = resolve(watchdog_timeout, "inference.watchdog_timeout", 3000)
    final_served_model_name = resolve(served_model_name, "inference.served_model_name", "")

    # Performance flags
    final_disable_shared_experts_fusion = resolve(
        disable_shared_experts_fusion, "inference.disable_shared_experts_fusion", True
    )

    # Pass extra CLI parameters
    extra_params = {}

    # Parser parameters (from interactive mode or None in non-interactive mode)
    final_tool_call_parser = None
    final_reasoning_parser = None
    if "tool_call_parser" in locals() and tool_call_parser:
        final_tool_call_parser = tool_call_parser
    if "reasoning_parser" in locals() and reasoning_parser:
        final_reasoning_parser = reasoning_parser

    cmd = _build_sglang_command(
        model_path=resolved_model_path,
        weights_path=resolved_weights_path,
        host=final_host,
        port=final_port,
        gpu_experts=final_gpu_experts,
        cpu_threads=final_cpu_threads,
        numa_nodes=final_numa_nodes,
        tensor_parallel_size=final_tensor_parallel_size,
        kt_method=final_kt_method,
        kt_gpu_prefill_threshold=final_kt_gpu_prefill_threshold,
        attention_backend=final_attention_backend,
        max_total_tokens=final_max_total_tokens,
        max_running_requests=final_max_running_requests,
        chunked_prefill_size=final_chunked_prefill_size,
        mem_fraction_static=final_mem_fraction_static,
        watchdog_timeout=final_watchdog_timeout,
        served_model_name=final_served_model_name,
        disable_shared_experts_fusion=final_disable_shared_experts_fusion,
        tool_call_parser=final_tool_call_parser,
        reasoning_parser=final_reasoning_parser,
        settings=settings,
        extra_model_params=extra_params,
        extra_cli_args=extra_cli_args,
    )

    # Prepare environment variables
    env = os.environ.copy()
    # Add environment variables from advanced.env
    env.update(settings.get_env_vars())
    # Add environment variables from inference.env
    inference_env = settings.get("inference.env", {})
    if isinstance(inference_env, dict):
        env.update({k: str(v) for k, v in inference_env.items()})

    # Step 5: Show configuration summary
    console.print()
    print_step("Configuration")

    # Display model name
    model_display_name = user_model_obj.name if user_model_obj else resolved_model_path.name
    console.print(f"  Model: [bold]{model_display_name}[/bold]")

    console.print(f"  Path: [dim]{resolved_model_path}[/dim]")

    # Key parameters
    console.print()
    console.print(f"  GPU Experts: [cyan]{final_gpu_experts}[/cyan] per layer")
    console.print(f"  CPU Threads (kt-cpuinfer): [cyan]{final_cpu_threads}[/cyan]")
    console.print(f"  NUMA Nodes (kt-threadpool-count): [cyan]{final_numa_nodes}[/cyan]")
    console.print(f"  Tensor Parallel: [cyan]{final_tensor_parallel_size}[/cyan]")
    console.print(f"  Method: [cyan]{final_kt_method}[/cyan]")
    console.print(f"  Attention: [cyan]{final_attention_backend}[/cyan]")

    # Weights info
    if resolved_weights_path:
        console.print()
        console.print(f"  Quantized weights: [yellow]{resolved_weights_path}[/yellow]")

    console.print()
    console.print(f"  Server: [green]http://{final_host}:{final_port}[/green]")
    console.print()

    # Step 6: Show or execute
    if dry_run:
        console.print()
        console.print("[bold]Command:[/bold]")
        console.print()
        console.print(f"  [dim]{' '.join(cmd)}[/dim]")
        console.print()
        return

    # Execute with prepared environment variables
    # Don't print "Server started" or API info here - let sglang's logs speak for themselves
    # The actual startup takes time and these messages are misleading

    # Print the command being executed
    console.print()
    console.print("[bold]Launching server with command:[/bold]")
    console.print()
    console.print(f"  [dim]{' '.join(cmd)}[/dim]")
    console.print()

    try:
        # Execute directly without intercepting output or signals
        # This allows direct output to terminal and Ctrl+C to work naturally
        process = subprocess.run(cmd, env=env)
        sys.exit(process.returncode)

    except FileNotFoundError:
        from kt_kernel.cli.utils.sglang_checker import print_sglang_install_instructions

        print_error(t("sglang_not_found"))
        console.print()
        print_sglang_install_instructions()
        raise typer.Exit(1)
    except Exception as e:
        print_error(f"Failed to start server: {e}")
        raise typer.Exit(1)


# Dead code removed: _find_model_path() and _find_weights_path()
# These functions were part of the old builtin model system


def _build_sglang_command(
    model_path: Path,
    weights_path: Optional[Path],
    host: str,
    port: int,
    gpu_experts: int,
    cpu_threads: int,
    numa_nodes: int,
    tensor_parallel_size: int,
    kt_method: str,
    kt_gpu_prefill_threshold: int,
    attention_backend: str,
    max_total_tokens: int,
    max_running_requests: int,
    chunked_prefill_size: int,
    mem_fraction_static: float,
    watchdog_timeout: int,
    served_model_name: str,
    disable_shared_experts_fusion: bool,
    tool_call_parser: Optional[str],
    reasoning_parser: Optional[str],
    settings,
    extra_model_params: Optional[dict] = None,  # New parameter for additional params
    extra_cli_args: Optional[list[str]] = None,  # Extra args from CLI to pass to sglang
) -> list[str]:
    """Build the SGLang launch command."""
    cmd = [
        sys.executable,
        "-m",
        "sglang.launch_server",
        "--host",
        host,
        "--port",
        str(port),
        "--model",
        str(model_path),
    ]

    # Add kt-kernel options
    # kt-kernel is needed for:
    # 1. Quantized models (when weights_path is provided)
    # 2. MoE models with CPU offloading (when kt-cpuinfer > 0 or kt-num-gpu-experts is configured)
    use_kt_kernel = False

    # Check if we should use kt-kernel
    if weights_path:
        # Quantized model - always use kt-kernel
        use_kt_kernel = True
    elif cpu_threads > 0 or gpu_experts > 1:
        # CPU offloading configured - use kt-kernel
        use_kt_kernel = True

    if use_kt_kernel:
        # Add kt-weight-path: use quantized weights if available, otherwise use model path
        weight_path_to_use = weights_path if weights_path else model_path

        # Add kt-kernel configuration
        cmd.extend(
            [
                "--kt-weight-path",
                str(weight_path_to_use),
                "--kt-cpuinfer",
                str(cpu_threads),
                "--kt-threadpool-count",
                str(numa_nodes),
                "--kt-num-gpu-experts",
                str(gpu_experts),
                "--kt-method",
                kt_method,
                "--kt-gpu-prefill-token-threshold",
                str(kt_gpu_prefill_threshold),
                "--kt-enable-dynamic-expert-update",  # Enable dynamic expert updates
            ]
        )

    # Add SGLang options
    cmd.extend(
        [
            "--attention-backend",
            attention_backend,
            "--trust-remote-code",
            "--mem-fraction-static",
            str(mem_fraction_static),
            "--chunked-prefill-size",
            str(chunked_prefill_size),
            "--max-running-requests",
            str(max_running_requests),
            "--max-total-tokens",
            str(max_total_tokens),
            "--watchdog-timeout",
            str(watchdog_timeout),
            "--enable-mixed-chunk",
            "--tensor-parallel-size",
            str(tensor_parallel_size),
            "--enable-p2p-check",
        ]
    )

    # Add served model name if specified
    if served_model_name:
        cmd.extend(["--served-model-name", served_model_name])

    # Add performance flags
    if disable_shared_experts_fusion:
        cmd.append("--disable-shared-experts-fusion")

    # Add FP8 backend if using FP8 method
    if "FP8" in kt_method.upper():
        cmd.extend(["--fp8-gemm-backend", "triton"])

    # Add parsers if specified
    if tool_call_parser:
        cmd.extend(["--tool-call-parser", tool_call_parser])
    if reasoning_parser:
        cmd.extend(["--reasoning-parser", reasoning_parser])

    # Add any extra parameters from model defaults that weren't explicitly handled
    if extra_model_params:
        # List of parameters already handled above
        handled_params = {
            "kt-num-gpu-experts",
            "kt-cpuinfer",
            "kt-threadpool-count",
            "kt-method",
            "kt-gpu-prefill-token-threshold",
            "attention-backend",
            "tensor-parallel-size",
            "max-total-tokens",
            "max-running-requests",
            "chunked-prefill-size",
            "mem-fraction-static",
            "watchdog-timeout",
            "served-model-name",
            "disable-shared-experts-fusion",
        }

        for key, value in extra_model_params.items():
            if key not in handled_params:
                # Add unhandled parameters dynamically
                cmd.append(f"--{key}")
                if isinstance(value, bool):
                    # Boolean flags don't need a value
                    if not value:
                        # For False boolean, skip the flag entirely
                        cmd.pop()  # Remove the flag we just added
                else:
                    cmd.append(str(value))

    # Add extra args from settings
    extra_args = settings.get("advanced.sglang_args", [])
    if extra_args:
        cmd.extend(extra_args)

    # Add extra CLI args (user-provided options not defined in kt CLI)
    if extra_cli_args:
        cmd.extend(extra_cli_args)

    return cmd


def _interactive_model_selection(user_registry, settings) -> Optional[str]:
    """Show interactive model selection interface.

    Returns:
        Selected model name or None if cancelled.
    """
    from rich.panel import Panel
    from rich.prompt import Prompt

    # Get all user models
    all_models = user_registry.list_models()

    if not all_models:
        console.print()
        print_warning("No models registered.")
        console.print()
        console.print(f"  Add models with: [cyan]kt model scan[/cyan]")
        console.print(f"  Or manually: [cyan]kt model add /path/to/model[/cyan]")
        console.print()
        return None

    console.print()
    console.print(
        Panel.fit(
            "Select a model to run",
            border_style="cyan",
        )
    )
    console.print()

    # Build choices list
    choices = []
    choice_map = {}  # index -> model name

    # Show all user models
    console.print(f"[bold green]Available Models:[/bold green]")
    console.print()

    for i, model in enumerate(all_models, 1):
        # Check if path exists
        path_status = "✓" if model.path_exists() else "✗ Missing"
        console.print(f"  [cyan][{i}][/cyan] [bold]{model.name}[/bold] [{path_status}]")
        console.print(f"      [dim]{model.format} - {model.path}[/dim]")
        choices.append(str(i))
        choice_map[str(i)] = model.name

    console.print()

    # Add cancel option
    cancel_idx = str(len(choices) + 1)
    console.print(f"  [cyan][{cancel_idx}][/cyan] [dim]Cancel[/dim]")
    choices.append(cancel_idx)
    console.print()

    # Prompt for selection
    try:
        selection = Prompt.ask(
            "Select model",
            choices=choices,
            default="1" if choices else cancel_idx,
        )
    except KeyboardInterrupt:
        console.print()
        return None

    if selection == cancel_idx:
        return None

    return choice_map.get(selection)


================================================
FILE: kt-kernel/python/cli/commands/sft.py
================================================
"""
SFT command for kt-cli.

Fine-tuning with LlamaFactory integration.
"""

import typer

from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import console

app = typer.Typer(help="Fine-tuning with LlamaFactory (coming soon)")


@app.callback(invoke_without_command=True)
def callback(ctx: typer.Context) -> None:
    """Fine-tuning commands (coming soon)."""
    if ctx.invoked_subcommand is None:
        console.print()
        console.print(f"[yellow]{t('feature_coming_soon')}[/yellow]")
        console.print()
        console.print("[dim]kt sft train   - Train a model[/dim]")
        console.print("[dim]kt sft chat    - Chat with a trained model[/dim]")
        console.print("[dim]kt sft export  - Export a trained model[/dim]")
        console.print()


@app.command(name="train")
def train() -> None:
    """Train a model using LlamaFactory (coming soon)."""
    console.print()
    console.print(f"[yellow]{t('feature_coming_soon')}[/yellow]")
    console.print()
    raise typer.Exit(0)


@app.command(name="chat")
def chat() -> None:
    """Chat with a trained model using LlamaFactory (coming soon)."""
    console.print()
    console.print(f"[yellow]{t('feature_coming_soon')}[/yellow]")
    console.print()
    raise typer.Exit(0)


@app.command(name="export")
def export() -> None:
    """Export a trained model using LlamaFactory (coming soon)."""
    console.print()
    console.print(f"[yellow]{t('feature_coming_soon')}[/yellow]")
    console.print()
    raise typer.Exit(0)


================================================
FILE: kt-kernel/python/cli/commands/version.py
================================================
"""
Version command for kt-cli.

Displays version information for kt-cli and related packages.
"""

import platform
from typing import Optional

import typer

from kt_kernel.cli import __version__
from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import console, print_version_table
from kt_kernel.cli.utils.environment import detect_cuda_version, get_installed_package_version


def _get_sglang_info() -> str:
    """Get sglang-kt version and installation source information."""
    from kt_kernel.cli.utils.sglang_checker import check_sglang_installation

    info = check_sglang_installation()

    if not info["installed"]:
        return t("version_not_installed")

    # Get version from package metadata (prefer sglang-kt)
    version = get_installed_package_version("sglang-kt")
    if not version:
        version = get_installed_package_version("sglang")
    if not version:
        version = info.get("version") or "unknown"

    # Determine source label
    if info.get("is_kvcache_fork"):
        if info["from_source"] and info.get("git_info"):
            git_remote = info["git_info"].get("remote", "")
            return f"{version} [dim](Source: {git_remote})[/dim]"
        elif info["editable"]:
            return f"{version} [dim](editable)[/dim]"
        else:
            return f"{version} [dim](sglang-kt)[/dim]"
    elif info["from_source"]:
        if info.get("git_info"):
            git_remote = info["git_info"].get("remote", "")
            return f"{version} [dim](Source: {git_remote})[/dim]"
        return f"{version} [dim](source)[/dim]"
    else:
        return f"{version} [dim](PyPI)[/dim]"


def version(
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed version info"),
) -> None:
    """Show version information."""
    console.print(f"\n[bold]{t('version_info')}[/bold] v{__version__}\n")

    # Basic info
    versions = {
        t("version_python"): platform.python_version(),
        t("version_platform"): f"{platform.system()} {platform.release()}",
    }

    # CUDA version
    cuda_version = detect_cuda_version()
    versions[t("version_cuda")] = cuda_version or t("version_cuda_not_found")

    print_version_table(versions)

    # Always show key packages with installation source
    console.print("\n[bold]Packages:[/bold]\n")

    sglang_info = _get_sglang_info()
    key_packages = {
        t("version_kt_kernel"): get_installed_package_version("kt-kernel") or t("version_not_installed"),
        t("version_sglang"): sglang_info,
    }

    print_version_table(key_packages)

    # Show SGLang installation hint if not installed
    if sglang_info == t("version_not_installed"):
        from kt_kernel.cli.utils.sglang_checker import print_sglang_install_instructions

        console.print()
        print_sglang_install_instructions()

    if verbose:
        console.print("\n[bold]Additional Packages:[/bold]\n")

        package_versions = {
            t("version_ktransformers"): get_installed_package_version("ktransformers") or t("version_not_installed"),
            t("version_llamafactory"): get_installed_package_version("llamafactory") or t("version_not_installed"),
            "typer": get_installed_package_version("typer") or t("version_not_installed"),
            "rich": get_installed_package_version("rich") or t("version_not_installed"),
            "torch": get_installed_package_version("torch") or t("version_not_installed"),
            "transformers": get_installed_package_version("transformers") or t("version_not_installed"),
        }

        print_version_table(package_versions)

    console.print()


================================================
FILE: kt-kernel/python/cli/completions/__init__.py
================================================
"""Shell completion scripts for kt-cli."""


================================================
FILE: kt-kernel/python/cli/completions/_kt
================================================
#compdef kt
# Zsh completion for kt command
# This is a static completion script that doesn't require Python startup

_kt() {
    local -a commands
    commands=(
        'version:Show version information'
        'run:Start model inference server'
        'chat:Interactive chat with running model'
        'quant:Quantize model weights'
        'bench:Run full benchmark'
        'microbench:Run micro-benchmark'
        'doctor:Diagnose environment issues'
        'model:Manage models and storage paths'
        'config:Manage configuration'
        'sft:Fine-tuning with LlamaFactory'
    )

    local -a run_opts
    run_opts=(
        '--host[Server host]:host:'
        '--port[Server port]:port:'
        '--gpu-experts[Number of GPU experts]:count:'
        '--cpu-threads[Number of CPU threads]:count:'
        '--tensor-parallel-size[Tensor parallel size]:size:'
        '--kt-method[KT method]:method:(AMXINT4 FP8 RAWINT4)'
        '--attention-backend[Attention backend]:backend:(triton flashinfer)'
        '--max-total-tokens[Maximum total tokens]:tokens:'
        '--dry-run[Show command without executing]'
        '--help[Show help message]'
    )

    local -a chat_opts
    chat_opts=(
        '--host[Server host]:host:'
        '--port[Server port]:port:'
        '--model[Model name]:model:'
        '--temperature[Sampling temperature]:temp:'
        '--max-tokens[Maximum tokens]:tokens:'
        '--system[System prompt]:prompt:'
        '--save-history[Save conversation history]'
        '--no-save-history[Do not save history]'
        '--history-file[History file path]:path:_files'
        '--stream[Enable streaming output]'
        '--no-stream[Disable streaming output]'
        '--help[Show help message]'
    )

    local -a model_cmds
    model_cmds=(
        'download:Download a model from HuggingFace'
        'list:List available models'
        'path-list:List all model storage paths'
        'path-add:Add a new model storage path'
        'path-remove:Remove a model storage path'
        'search:Search for models in the registry'
    )

    local -a config_cmds
    config_cmds=(
        'show:Show all configuration'
        'get:Get configuration value'
        'set:Set configuration value'
        'reset:Reset to defaults'
        'path:Show configuration file path'
        'init:Re-run first-time setup wizard'
    )

    local -a sft_cmds
    sft_cmds=(
        'train:Train model'
        'chat:Chat with model'
        'export:Export model'
    )

    _arguments -C \
        '1: :->command' \
        '*::arg:->args'

    case $state in
        command)
            _describe 'kt commands' commands
            _arguments \
                '--help[Show help message]' \
                '--version[Show version]'
            ;;
        args)
            case $words[1] in
                run)
                    _arguments $run_opts \
                        '1:model:'
                    ;;
                chat)
                    _arguments $chat_opts
                    ;;
                quant)
                    _arguments \
                        '--method[Quantization method]:method:' \
                        '--output[Output directory]:path:_files -/' \
                        '--help[Show help message]' \
                        '1:model:_files -/'
                    ;;
                bench|microbench)
                    _arguments \
                        '--model[Model name or path]:model:' \
                        '--config[Config file path]:path:_files' \
                        '--help[Show help message]'
                    ;;
                doctor)
                    _arguments \
                        '--verbose[Verbose output]' \
                        '--help[Show help message]'
                    ;;
                model)
                    _arguments \
                        '1: :->model_cmd' \
                        '*::arg:->model_args'

                    case $state in
                        model_cmd)
                            _describe 'model commands' model_cmds
                            ;;
                    esac
                    ;;
                config)
                    _arguments \
                        '1: :->config_cmd' \
                        '*::arg:->config_args'

                    case $state in
                        config_cmd)
                            _describe 'config commands' config_cmds
                            ;;
                    esac
                    ;;
                sft)
                    _arguments \
                        '1: :->sft_cmd' \
                        '*::arg:->sft_args'

                    case $state in
                        sft_cmd)
                            _describe 'sft commands' sft_cmds
                            ;;
                    esac
                    ;;
            esac
            ;;
    esac
}

_kt "$@"


================================================
FILE: kt-kernel/python/cli/completions/kt-completion.bash
================================================
#!/bin/bash
# Bash completion for kt command
# This is a static completion script that doesn't require Python startup

_kt_completion() {
    local cur prev opts
    COMPREPLY=()
    cur="${COMP_WORDS[COMP_CWORD]}"
    prev="${COMP_WORDS[COMP_CWORD-1]}"

    # Main commands
    local commands="version run chat quant edit bench microbench doctor model config sft"

    # Global options
    local global_opts="--help --version"

    # Handle subcommands
    case "${COMP_CWORD}" in
        1)
            # First argument: suggest commands and global options
            COMPREPLY=( $(compgen -W "${commands} ${global_opts}" -- ${cur}) )
            return 0
            ;;
        *)
            # Handle specific command options
            case "${COMP_WORDS[1]}" in
                run)
                    local run_opts="--host --port --gpu-experts --cpu-threads --tensor-parallel-size --kt-method --attention-backend --max-total-tokens --dry-run --help"
                    COMPREPLY=( $(compgen -W "${run_opts}" -- ${cur}) )
                    ;;
                chat)
                    local chat_opts="--host --port --model --temperature --max-tokens --system --save-history --no-save-history --history-file --stream --no-stream --help"
                    COMPREPLY=( $(compgen -W "${chat_opts}" -- ${cur}) )
                    ;;
                quant)
                    local quant_opts="--method --output --help"
                    COMPREPLY=( $(compgen -W "${quant_opts}" -- ${cur}) )
                    ;;
                edit)
                    local edit_opts="--help"
                    COMPREPLY=( $(compgen -W "${edit_opts}" -- ${cur}) )
                    ;;
                bench|microbench)
                    local bench_opts="--model --config --help"
                    COMPREPLY=( $(compgen -W "${bench_opts}" -- ${cur}) )
                    ;;
                doctor)
                    local doctor_opts="--verbose --help"
                    COMPREPLY=( $(compgen -W "${doctor_opts}" -- ${cur}) )
                    ;;
                model)
                    local model_cmds="download list path-list path-add path-remove search"
                    local model_opts="--help"
                    COMPREPLY=( $(compgen -W "${model_cmds} ${model_opts}" -- ${cur}) )
                    ;;
                config)
                    local config_cmds="show get set reset path init model-path-list model-path-add model-path-remove"
                    local config_opts="--help"
                    COMPREPLY=( $(compgen -W "${config_cmds} ${config_opts}" -- ${cur}) )
                    ;;
                sft)
                    local sft_cmds="train chat export"
                    local sft_opts="--help"
                    COMPREPLY=( $(compgen -W "${sft_cmds} ${sft_opts}" -- ${cur}) )
                    ;;
                version)
                    COMPREPLY=( $(compgen -W "--help" -- ${cur}) )
                    ;;
                *)
                    COMPREPLY=()
                    ;;
            esac
            ;;
    esac
}

complete -F _kt_completion kt


================================================
FILE: kt-kernel/python/cli/completions/kt.fish
================================================
# Fish completion for kt command
# This is a static completion script that doesn't require Python startup

# Main commands
complete -c kt -f -n "__fish_use_subcommand" -a "version" -d "Show version information"
complete -c kt -f -n "__fish_use_subcommand" -a "run" -d "Start model inference server"
complete -c kt -f -n "__fish_use_subcommand" -a "chat" -d "Interactive chat with running model"
complete -c kt -f -n "__fish_use_subcommand" -a "quant" -d "Quantize model weights"
complete -c kt -f -n "__fish_use_subcommand" -a "bench" -d "Run full benchmark"
complete -c kt -f -n "__fish_use_subcommand" -a "microbench" -d "Run micro-benchmark"
complete -c kt -f -n "__fish_use_subcommand" -a "doctor" -d "Diagnose environment issues"
complete -c kt -f -n "__fish_use_subcommand" -a "model" -d "Manage models and storage paths"
complete -c kt -f -n "__fish_use_subcommand" -a "config" -d "Manage configuration"
complete -c kt -f -n "__fish_use_subcommand" -a "sft" -d "Fine-tuning with LlamaFactory"

# Global options
complete -c kt -l help -d "Show help message"
complete -c kt -l version -d "Show version"

# Run command options
complete -c kt -f -n "__fish_seen_subcommand_from run" -l host -d "Server host"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l port -d "Server port"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l gpu-experts -d "Number of GPU experts"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l cpu-threads -d "Number of CPU threads"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l tensor-parallel-size -d "Tensor parallel size"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l kt-method -d "KT method"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l attention-backend -d "Attention backend"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l max-total-tokens -d "Maximum total tokens"
complete -c kt -f -n "__fish_seen_subcommand_from run" -l dry-run -d "Show command without executing"

# Chat command options
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l host -d "Server host"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l port -d "Server port"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l model -d "Model name"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l temperature -d "Sampling temperature"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l max-tokens -d "Maximum tokens"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l system -d "System prompt"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l save-history -d "Save conversation history"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l no-save-history -d "Do not save history"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l history-file -d "History file path"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l stream -d "Enable streaming output"
complete -c kt -f -n "__fish_seen_subcommand_from chat" -l no-stream -d "Disable streaming output"

# Quant command options
complete -c kt -f -n "__fish_seen_subcommand_from quant" -l method -d "Quantization method"
complete -c kt -f -n "__fish_seen_subcommand_from quant" -l output -d "Output directory"

# Bench command options
complete -c kt -f -n "__fish_seen_subcommand_from bench microbench" -l model -d "Model name or path"
complete -c kt -f -n "__fish_seen_subcommand_from bench microbench" -l config -d "Config file path"

# Doctor command options
complete -c kt -f -n "__fish_seen_subcommand_from doctor" -l verbose -d "Verbose output"

# Model subcommands
complete -c kt -f -n "__fish_seen_subcommand_from model; and not __fish_seen_subcommand_from download list path-list path-add path-remove search" -a "download" -d "Download a model from HuggingFace"
complete -c kt -f -n "__fish_seen_subcommand_from model; and not __fish_seen_subcommand_from download list path-list path-add path-remove search" -a "list" -d "List available models"
complete -c kt -f -n "__fish_seen_subcommand_from model; and not __fish_seen_subcommand_from download list path-list path-add path-remove search" -a "path-list" -d "List all model storage paths"
complete -c kt -f -n "__fish_seen_subcommand_from model; and not __fish_seen_subcommand_from download list path-list path-add path-remove search" -a "path-add" -d "Add a new model storage path"
complete -c kt -f -n "__fish_seen_subcommand_from model; and not __fish_seen_subcommand_from download list path-list path-add path-remove search" -a "path-remove" -d "Remove a model storage path"
complete -c kt -f -n "__fish_seen_subcommand_from model; and not __fish_seen_subcommand_from download list path-list path-add path-remove search" -a "search" -d "Search for models in the registry"

# Config subcommands
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init" -a "show" -d "Show all configuration"
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init" -a "get" -d "Get configuration value"
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init" -a "set" -d "Set configuration value"
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init" -a "reset" -d "Reset to defaults"
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init" -a "path" -d "Show configuration file path"
complete -c kt -f -n "__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show get set reset path init" -a "init" -d "Re-run first-time setup wizard"

# SFT subcommands
complete -c kt -f -n "__fish_seen_subcommand_from sft; and not __fish_seen_subcommand_from train chat export" -a "train" -d "Train model"
complete -c kt -f -n "__fish_seen_subcommand_from sft; and not __fish_seen_subcommand_from train chat export" -a "chat" -d "Chat with model"
complete -c kt -f -n "__fish_seen_subcommand_from sft; and not __fish_seen_subcommand_from train chat export" -a "export" -d "Export model"


================================================
FILE: kt-kernel/python/cli/config/__init__.py
================================================
"""
Configuration management for kt-cli.
"""

from kt_kernel.cli.config.settings import Settings, get_settings

__all__ = ["Settings", "get_settings"]


================================================
FILE: kt-kernel/python/cli/config/settings.py
================================================
"""
Configuration management for kt-cli.

Handles reading and writing configuration from ~/.ktransformers/config.yaml
"""

import os
from pathlib import Path
from typing import Any, Optional

import yaml

# Default configuration directory
DEFAULT_CONFIG_DIR = Path.home() / ".ktransformers"
DEFAULT_CONFIG_FILE = DEFAULT_CONFIG_DIR / "config.yaml"
DEFAULT_MODELS_DIR = DEFAULT_CONFIG_DIR / "models"
DEFAULT_CACHE_DIR = DEFAULT_CONFIG_DIR / "cache"

# Default configuration values
DEFAULT_CONFIG = {
    "general": {
        "language": "auto",  # auto, en, zh
        "color": True,
        "verbose": False,
    },
    "paths": {
        "models": str(DEFAULT_MODELS_DIR),
        "cache": str(DEFAULT_CACHE_DIR),
        "weights": "",  # Custom quantized weights path
    },
    "server": {
        "host": "0.0.0.0",
        "port": 30000,
    },
    "inference": {
        # Inference parameters are model-specific and should not have defaults
        # They will be auto-detected or use model-specific optimizations
        # Environment variables (general optimizations)
        "env": {
            "PYTORCH_ALLOC_CONF": "expandable_segments:True",
            "SGLANG_ENABLE_JIT_DEEPGEMM": "0",
        },
    },
    "download": {
        "mirror": "",  # HuggingFace mirror URL
        "resume": True,
        "verify": True,
    },
    "advanced": {
        # Environment variables to set when running
        "env": {},
        # Extra arguments to pass to sglang
        "sglang_args": [],
        # Extra arguments to pass to llamafactory
        "llamafactory_args": [],
    },
    "dependencies": {
        # SGLang installation source configuration
        "sglang": {
            "source": "github",  # "pypi" or "github"
            "repo": "https://github.com/kvcache-ai/sglang",
            "branch": "main",
        },
    },
}


class Settings:
    """Configuration manager for kt-cli."""

    def __init__(self, config_path: Optional[Path] = None):
        """Initialize settings manager.

        Args:
            config_path: Path to config file. Defaults to ~/.ktransformers/config.yaml
        """
        self.config_path = config_path or DEFAULT_CONFIG_FILE
        self.config_dir = self.config_path.parent
        self._config: dict[str, Any] = {}
        self._load()

    def _ensure_dirs(self) -> None:
        """Ensure configuration directories exist."""
        self.config_dir.mkdir(parents=True, exist_ok=True)

        # Ensure all model paths exist
        model_paths = self.get_model_paths()
        for path in model_paths:
            path.mkdir(parents=True, exist_ok=True)

        Path(self.get("paths.cache", DEFAULT_CACHE_DIR)).mkdir(parents=True, exist_ok=True)

    def _load(self) -> None:
        """Load configuration from file."""
        self._config = self._deep_copy(DEFAULT_CONFIG)

        if self.config_path.exists():
            try:
                with open(self.config_path, "r", encoding="utf-8") as f:
                    user_config = yaml.safe_load(f) or {}
                self._deep_merge(self._config, user_config)
            except (yaml.YAMLError, OSError) as e:
                # Log warning but continue with defaults
                print(f"Warning: Failed to load config: {e}")

        self._ensure_dirs()

    def _save(self) -> None:
        """Save configuration to file."""
        self._ensure_dirs()
        try:
            with open(self.config_path, "w", encoding="utf-8") as f:
                yaml.dump(self._config, f, default_flow_style=False, allow_unicode=True)
        except OSError as e:
            raise RuntimeError(f"Failed to save config: {e}")

    def _deep_copy(self, obj: Any) -> Any:
        """Create a deep copy of a nested dict."""
        if isinstance(obj, dict):
            return {k: self._deep_copy(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [self._deep_copy(item) for item in obj]
        return obj

    def _deep_merge(self, base: dict, override: dict) -> None:
        """Deep merge override into base."""
        for key, value in override.items():
            if key in base and isinstance(base[key], dict) and isinstance(value, dict):
                self._deep_merge(base[key], value)
            else:
                base[key] = value

    def get(self, key: str, default: Any = None) -> Any:
        """Get a configuration value by dot-separated key.

        Args:
            key: Dot-separated key path (e.g., "server.port")
            default: Default value if key not found

        Returns:
            Configuration value or default
        """
        parts = key.split(".")
        value = self._config

        for part in parts:
            if isinstance(value, dict) and part in value:
                value = value[part]
            else:
                return default

        return value

    def set(self, key: str, value: Any) -> None:
        """Set a configuration value by dot-separated key.

        Args:
            key: Dot-separated key path (e.g., "server.port")
            value: Value to set
        """
        parts = key.split(".")
        config = self._config

        # Navigate to parent
        for part in parts[:-1]:
            if part not in config:
                config[part] = {}
            config = config[part]

        # Set value
        config[parts[-1]] = value
        self._save()

    def delete(self, key: str) -> bool:
        """Delete a configuration value.

        Args:
            key: Dot-separated key path

        Returns:
            True if key was deleted, False if not found
        """
        parts = key.split(".")
        config = self._config

        # Navigate to parent
        for part in parts[:-1]:
            if part not in config:
                return False
            config = config[part]

        # Delete key
        if parts[-1] in config:
            del config[parts[-1]]
            self._save()
            return True
        return False

    def reset(self) -> None:
        """Reset configuration to defaults."""
        self._config = self._deep_copy(DEFAULT_CONFIG)
        self._save()

    def get_all(self) -> dict[str, Any]:
        """Get all configuration values."""
        return self._deep_copy(self._config)

    def get_env_vars(self) -> dict[str, str]:
        """Get environment variables to set."""
        env_vars = {}

        # Get from advanced.env
        advanced_env = self.get("advanced.env", {})
        if isinstance(advanced_env, dict):
            env_vars.update({k: str(v) for k, v in advanced_env.items()})

        return env_vars

    @property
    def models_dir(self) -> Path:
        """Get the primary models directory path (for backward compatibility)."""
        paths = self.get_model_paths()
        return paths[0] if paths else Path(DEFAULT_MODELS_DIR)

    def get_model_paths(self) -> list[Path]:
        """Get all model directory paths.

        Returns a list of Path objects. Supports both:
        - Single path: paths.models = "/path/to/models"
        - Multiple paths: paths.models = ["/path/1", "/path/2"]
        """
        models_config = self.get("paths.models", DEFAULT_MODELS_DIR)

        # Handle both string and list
        if isinstance(models_config, str):
            return [Path(models_config)]
        elif isinstance(models_config, list):
            return [Path(p) for p in models_config]
        else:
            return [Path(DEFAULT_MODELS_DIR)]

    def add_model_path(self, path: str) -> None:
        """Add a new model path to the configuration."""
        models_config = self.get("paths.models", DEFAULT_MODELS_DIR)

        # Convert to list if it's a string
        if isinstance(models_config, str):
            paths = [models_config]
        elif isinstance(models_config, list):
            paths = list(models_config)
        else:
            paths = []

        # Add new path if not already present
        if path not in paths:
            paths.append(path)
            self.set("paths.models", paths)

    def remove_model_path(self, path: str) -> bool:
        """Remove a model path from the configuration.

        Returns True if path was removed, False if not found.
        """
        models_config = self.get("paths.models", DEFAULT_MODELS_DIR)

        if isinstance(models_config, str):
            # Can't remove if it's a single string
            if models_config == path:
                # Don't remove the last path
                return False
            return False
        elif isinstance(models_config, list):
            if path in models_config:
                paths = list(models_config)
                paths.remove(path)
                # Don't allow removing all paths
                if not paths:
                    return False
                self.set("paths.models", paths if len(paths) > 1 else paths[0])
                return True

        return False

    @property
    def cache_dir(self) -> Path:
        """Get the cache directory path."""
        return Path(self.get("paths.cache", DEFAULT_CACHE_DIR))

    @property
    def weights_dir(self) -> Optional[Path]:
        """Get the custom weights directory path."""
        weights = self.get("paths.weights", "")
        return Path(weights) if weights else None


# Global settings instance
_settings: Optional[Settings] = None


def get_settings() -> Settings:
    """Get the global settings instance."""
    global _settings
    if _settings is None:
        _settings = Settings()
    return _settings


def reset_settings() -> None:
    """Reset the global settings instance."""
    global _settings
    _settings = None


================================================
FILE: kt-kernel/python/cli/i18n.py
================================================
"""
Internationalization (i18n) module for kt-cli.

Supports English and Chinese languages, with automatic detection based on
system locale or KT_LANG environment variable.
"""

import os
from typing import Any

# Message definitions for all supported languages
MESSAGES: dict[str, dict[str, str]] = {
    "en": {
        # General
        "welcome": "Welcome to KTransformers!",
        "goodbye": "Goodbye!",
        "error": "Error",
        "warning": "Warning",
        "success": "Success",
        "info": "Info",
        "yes": "Yes",
        "no": "No",
        "cancel": "Cancel",
        "confirm": "Confirm",
        "done": "Done",
        "failed": "Failed",
        "skip": "Skip",
        "back": "Back",
        "next": "Next",
        "retry": "Retry",
        "abort": "Abort",
        # Version command
        "version_info": "KTransformers CLI",
        "version_python": "Python",
        "version_platform": "Platform",
        "version_cuda": "CUDA",
        "version_cuda_not_found": "Not found",
        "version_kt_kernel": "kt-kernel",
        "version_ktransformers": "ktransformers",
        "version_sglang": "sglang-kt",
        "version_llamafactory": "llamafactory",
        "version_not_installed": "Not installed",
        # Install command
        "install_detecting_env": "Detecting environment managers...",
        "install_found": "Found {name} (version {version})",
        "install_not_found": "Not found: {name}",
        "install_checking_env": "Checking existing environments...",
        "install_env_exists": "Found existing 'kt' environment",
        "install_env_not_exists": "No 'kt' environment found",
        "install_no_env_manager": "No virtual environment manager detected",
        "install_select_method": "Please select installation method:",
        "install_method_conda": "Create new conda environment 'kt' (Recommended)",
        "install_method_venv": "Create new venv environment",
        "install_method_uv": "Create new uv environment (Fast)",
        "install_method_docker": "Use Docker container",
        "install_method_system": "Install to system Python (Not recommended)",
        "install_select_mode": "Please select installation mode:",
        "install_mode_inference": "Inference - Install kt-kernel + SGLang",
        "install_mode_sft": "Training - Install kt-sft + LlamaFactory",
        "install_mode_full": "Full - Install all components",
        "install_creating_env": "Creating {type} environment '{name}'...",
        "install_env_created": "Environment created successfully",
        "install_installing_deps": "Installing dependencies...",
        "install_checking_deps": "Checking dependency versions...",
        "install_dep_ok": "OK",
        "install_dep_outdated": "Needs update",
        "install_dep_missing": "Missing",
        "install_installing_pytorch": "Installing PyTorch...",
        "install_installing_from_requirements": "Installing from requirements file...",
        "install_deps_outdated": "Found {count} package(s) that need updating. Continue?",
        "install_updating": "Updating packages...",
        "install_complete": "Installation complete!",
        "install_activate_hint": "Activate environment: {command}",
        "install_start_hint": "Get started: kt run --help",
        "install_docker_pulling": "Pulling Docker image...",
        "install_docker_complete": "Docker image ready!",
        "install_docker_run_hint": "Run with: docker run --gpus all -p 30000:30000 {image} kt run {model}",
        "install_in_venv": "Running in virtual environment: {name}",
        "install_continue_without_venv": "Continue installing to system Python?",
        "install_already_installed": "All dependencies are already installed!",
        "install_confirm": "Install {count} package(s)?",
        # Install - System dependencies
        "install_checking_system_deps": "Checking system dependencies...",
        "install_dep_name": "Dependency",
        "install_dep_status": "Status",
        "install_deps_all_installed": "All system dependencies are installed",
        "install_deps_install_prompt": "Install missing dependencies?",
        "install_installing_system_deps": "Installing system dependencies...",
        "install_installing_dep": "Installing {name}",
        "install_dep_no_install_cmd": "No install command available for {name} on {os}",
        "install_dep_install_failed": "Failed to install {name}",
        "install_deps_skipped": "Skipping dependency installation",
        "install_deps_failed": "Failed to install system dependencies",
        # Install - CPU detection
        "install_auto_detect_cpu": "Auto-detecting CPU capabilities...",
        "install_cpu_features": "Detected CPU features: {features}",
        "install_cpu_no_features": "No advanced CPU features detected",
        # Install - Build configuration
        "install_build_config": "Build Configuration:",
        "install_native_warning": "Note: Binary optimized for THIS CPU only (not portable)",
        "install_building_from_source": "Building kt-kernel from source...",
        "install_build_failed": "Build failed",
        "install_build_success": "Build completed successfully",
        # Install - Verification
        "install_verifying": "Verifying installation...",
        "install_verify_success": "kt-kernel {version} ({variant} variant) installed successfully",
        "install_verify_failed": "Verification failed: {error}",
        # Install - Docker
        "install_docker_guide_title": "Docker Installation",
        "install_docker_guide_desc": "For Docker installation, please refer to the official guide:",
        # Config command
        "config_show_title": "Current Configuration",
        "config_set_success": "Configuration updated: {key} = {value}",
        "config_get_value": "{key} = {value}",
        "config_get_not_found": "Configuration key '{key}' not found",
        "config_reset_confirm": "This will reset all configurations to default. Continue?",
        "config_reset_success": "Configuration reset to default",
        "config_file_location": "Configuration file: {path}",
        # Doctor command
        "doctor_title": "KTransformers Environment Diagnostics",
        "doctor_checking": "Running diagnostics...",
        "doctor_check_python": "Python version",
        "doctor_check_cuda": "CUDA availability",
        "doctor_check_gpu": "GPU detection",
        "doctor_check_cpu": "CPU",
        "doctor_check_cpu_isa": "CPU Instructions",
        "doctor_check_numa": "NUMA Topology",
        "doctor_check_memory": "System memory",
        "doctor_check_disk": "Disk space",
        "doctor_check_packages": "Required packages",
        "doctor_check_env": "Environment variables",
        "doctor_status_ok": "OK",
        "doctor_status_warning": "Warning",
        "doctor_status_error": "Error",
        "doctor_gpu_found": "Found {count} GPU(s): {names}",
        "doctor_gpu_not_found": "No GPU detected",
        "doctor_cpu_info": "{name} ({cores} cores / {threads} threads)",
        "doctor_cpu_isa_info": "{isa_list}",
        "doctor_cpu_isa_missing": "Missing recommended: {missing}",
        "doctor_numa_info": "{nodes} node(s)",
        "doctor_numa_detail": "{node}: CPUs {cpus}",
        "doctor_memory_info": "{available} available / {total} total",
        "doctor_memory_freq": "{available} available / {total} total ({freq}MHz {type})",
        "doctor_disk_info": "{available} available at {path}",
        "doctor_all_ok": "All checks passed! Your environment is ready.",
        "doctor_has_issues": "Some issues were found. Please review the warnings/errors above.",
        # Run command
        "run_detecting_hardware": "Detecting hardware configuration...",
        "run_gpu_info": "GPU: {name} ({vram}GB VRAM)",
        "run_cpu_info": "CPU: {name} ({cores} cores, {numa} NUMA nodes)",
        "run_ram_info": "RAM: {total}GB",
        "run_checking_model": "Checking model status...",
        "run_model_path": "Model path: {path}",
        "run_weights_not_found": "Quantized weights not found",
        "run_quant_prompt": "Quantize model now? (This may take a while)",
        "run_quantizing": "Quantizing model...",
        "run_starting_server": "Starting server...",
        "run_server_mode": "Mode: SGLang + kt-kernel",
        "run_server_port": "Port: {port}",
        "run_gpu_experts": "GPU experts: {count}/layer",
        "run_cpu_threads": "CPU threads: {count}",
        "run_server_started": "Server started!",
        "run_api_url": "API URL: http://{host}:{port}",
        "run_docs_url": "Docs URL: http://{host}:{port}/docs",
        "run_stop_hint": "Press Ctrl+C to stop the server",
        "run_model_not_found": "Model '{name}' not found. Run 'kt download' first.",
        "run_multiple_matches": "Multiple models found. Please select:",
        "run_select_model": "Select model",
        "run_select_model_title": "Select a model to run",
        "run_select_model_prompt": "Enter number",
        "run_local_models": "Local Models (Downloaded)",
        "run_registered_models": "Registered Models",
        # Download command
        "download_list_title": "Available Models",
        "download_searching": "Searching for model '{name}'...",
        "download_found": "Found: {name}",
        "download_multiple_found": "Multiple matches found:",
        "download_select": "Select model to download:",
        "download_destination": "Destination: {path}",
        "download_starting": "Starting download...",
        "download_progress": "Downloading {name}...",
        "download_complete": "Download complete!",
        "download_already_exists": "Model already exists at {path}",
        "download_overwrite_prompt": "Overwrite existing files?",
        # Quant command
        "quant_input_path": "Input path: {path}",
        "quant_output_path": "Output path: {path}",
        "quant_method": "Quantization method: {method}",
        "quant_starting": "Starting quantization...",
        "quant_progress": "Quantizing...",
        "quant_complete": "Quantization complete!",
        "quant_input_not_found": "Input model not found at {path}",
        "quant_cpu_threads": "CPU threads: {threads}",
        "quant_numa_nodes": "NUMA nodes: {nodes}",
        "quant_time_warning": "Quantization may take 30-60 minutes depending on model size.",
        "quant_disk_analysis": "Disk Space Analysis:",
        "quant_source_size": "Source model size:",
        "quant_estimated_size": "Estimated output size:",
        "quant_available_space": "Available space:",
        "quant_insufficient_space": "WARNING: Insufficient disk space!",
        "quant_required_space": "Required space (with 20% buffer):",
        "quant_shortage": "Shortage:",
        "quant_may_fail": "Quantization may fail or produce incomplete files.",
        "quant_continue_anyway": "Continue anyway?",
        "quant_settings": "Quantization Settings:",
        "quant_registered": "Quantized model registered: {name}",
        "quant_view_with": "View with:",
        "quant_use_with": "Use with:",
        "quant_register_failed": "Failed to auto-register model: {error}",
        "quant_output_exists": "Output path already exists: {path}",
        "quant_using_unique": "Using unique name: {path}",
        # Interactive quant
        "quant_interactive_title": "Interactive Quantization Configuration",
        "quant_new_model_notice": "⚠ Note: Some newer models cannot be quantized yet (conversion script not adapted). Recommended to use the original precision for inference (no weight conversion needed).",
        "quant_no_moe_models": "No MoE models found for quantization.",
        "quant_only_moe": "Only MoE models (e.g., DeepSeek-V3) can be quantized to AMX format.",
        "quant_add_models": "Add models with: {command}",
        "quant_moe_available": "MoE Models Available for Quantization:",
        "quant_select_model": "Select model to quantize",
        "quant_invalid_choice": "Invalid choice",
        "quant_step2_method": "Step 2: Quantization Method",
        "quant_method_label": "Quantization Method:",
        "quant_int4_desc": "INT4",
        "quant_int8_desc": "INT8",
        "quant_select_method": "Select quantization method",
        "quant_input_type_label": "Input Weight Type:",
        "quant_fp8_desc": "FP8 (for 8-bit float weights)",
        "quant_fp16_desc": "FP16 (for 16-bit float weights)",
        "quant_bf16_desc": "BF16 (for Brain Float 16 weights)",
        "quant_select_input_type": "Select input type",
        "quant_step3_cpu": "Step 3: CPU Configuration",
        "quant_cpu_threads_prompt": "CPU Threads (1 to {max})",
        "quant_numa_nodes_prompt": "NUMA Nodes (1 to {max})",
        "quant_use_gpu_label": "Use GPU for conversion?",
        "quant_gpu_speedup": "GPU can significantly speed up the quantization process",
        "quant_enable_gpu": "Enable GPU acceleration?",
        "quant_step4_output": "Step 4: Output Path",
        "quant_default_path": "Default:",
        "quant_use_default": "Use default output path?",
        "quant_custom_path": "Enter custom output path",
        "quant_output_exists_warn": "⚠ Output path already exists: {path}",
        "quant_using_unique_name": "→ Using unique name: {path}",
        "quant_config_summary": "Configuration Summary",
        "quant_summary_model": "Model:",
        "quant_summary_method": "Method:",
        "quant_summary_input_type": "Input Type:",
        "quant_summary_cpu_threads": "CPU Threads:",
        "quant_summary_numa": "NUMA Nodes:",
        "quant_summary_gpu": "Use GPU:",
        "quant_summary_output": "Output Path:",
        "quant_start_question": "Start quantization?",
        "quant_cancelled": "Cancelled",
        "quant_config_complete": "Configuration complete",
        "quant_time_elapsed": "Time elapsed:",
        "yes": "Yes",
        "no": "No",
        # SFT command
        "sft_mode_train": "Training mode",
        "sft_mode_chat": "Chat mode",
        "sft_mode_export": "Export mode",
        "sft_config_path": "Config file: {path}",
        "sft_starting": "Starting {mode}...",
        "sft_complete": "{mode} complete!",
        "sft_config_not_found": "Config file not found: {path}",
        # Bench command
        "bench_starting": "Starting benchmark...",
        "bench_type": "Benchmark type: {type}",
        "bench_complete": "Benchmark complete!",
        "bench_results_title": "Benchmark Results",
        # Common prompts
        "prompt_continue": "Continue?",
        "prompt_select": "Please select:",
        "prompt_enter_value": "Enter value:",
        "prompt_confirm_action": "Confirm this action?",
        # First-run setup - Model path selection
        "setup_model_path_title": "Model Storage Location",
        "setup_model_path_desc": "LLM models are large (50-200GB+). Please select a storage location with sufficient space:",
        "setup_scanning_disks": "Scanning available storage locations...",
        "setup_disk_option": "{path} ({available} available / {total} total)",
        "setup_disk_option_recommended": "{path} ({available} available / {total} total) [Recommended]",
        "setup_custom_path": "Enter custom path",
        "setup_enter_custom_path": "Enter the path for model storage",
        "setup_path_not_exist": "Path does not exist. Create it?",
        "setup_path_no_write": "No write permission for this path. Please choose another.",
        "setup_path_low_space": "Warning: Less than 100GB available. Large models may not fit.",
        "setup_model_path_set": "Model storage path set to: {path}",
        "setup_no_large_disk": "No large storage locations found. Using default path.",
        "setup_scanning_models": "Scanning for existing models...",
        "setup_found_models": "Found {count} model(s):",
        "setup_model_info": "{name} ({size}, {type})",
        "setup_no_models_found": "No existing models found in this location.",
        "setup_location_has_models": "{count} model(s) found",
        "setup_installing_completion": "Installing shell completion for {shell}...",
        "setup_completion_installed": "Shell completion installed! Restart terminal to enable.",
        "setup_completion_failed": "Failed to install shell completion. Run 'kt --install-completion' manually.",
        # Auto completion
        "completion_installed_title": "Tab Completion",
        "completion_installed_for": "Shell completion installed for {shell}",
        "completion_activate_now": "To enable completion in this terminal session, run:",
        "completion_next_session": "Completion will be automatically enabled in new terminal sessions.",
        # SGLang
        "sglang_not_found": "SGLang not found",
        "sglang_pypi_warning": "SGLang from PyPI may not be compatible with kt-kernel. Use sglang-kt instead: pip install sglang-kt",
        "sglang_pypi_hint": "SGLang from PyPI may not be compatible. Install the kvcache-ai fork: pip install sglang-kt (or run ./install.sh from ktransformers root)",
        "sglang_install_hint": "Install SGLang: pip install sglang-kt (or run ./install.sh from ktransformers root)",
        "sglang_recommend_source": "Recommend reinstalling with the kvcache-ai fork: pip uninstall sglang -y && pip install sglang-kt",
        "sglang_kt_kernel_not_supported": "SGLang does not support kt-kernel (missing --kt-gpu-prefill-token-threshold parameter)",
        "sglang_checking_kt_kernel_support": "Checking SGLang kt-kernel support...",
        "sglang_kt_kernel_supported": "SGLang kt-kernel support verified",
        # Chat
        "chat_proxy_detected": "Proxy detected in environment",
        "chat_proxy_confirm": "Use proxy for connection?",
        "chat_proxy_disabled": "Proxy disabled for this session",
        "chat_openai_required": "OpenAI Python SDK is required for chat functionality.",
        "chat_install_hint": "Install it with:",
        "chat_title": "KTransformers Chat",
        "chat_server": "Server",
        "chat_temperature": "Temperature",
        "chat_max_tokens": "Max tokens",
        "chat_help_hint": "Type '/help' for commands, '/quit' to exit",
        "chat_connecting": "Connecting to server...",
        "chat_no_models": "No models available on server",
        "chat_model_not_found": "Model '{model}' not found. Available models: {available}",
        "chat_connected": "Connected to model: {model}",
        "chat_connect_failed": "Failed to connect to server: {error}",
        "chat_server_not_running": "Make sure the model server is running:",
        "chat_user_prompt": "You",
        "chat_assistant_prompt": "Assistant",
        "chat_generation_error": "Error generating response: {error}",
        "chat_interrupted": "Chat interrupted. Goodbye!",
        "chat_history_saved": "History saved to: {path}",
        "chat_goodbye": "Goodbye!",
        "chat_help_title": "Available Commands:",
        "chat_help_content": "/help, /h         - Show this help message\n/quit, /exit, /q  - Exit chat\n/clear, /c        - Clear conversation history\n/history, /hist   - Show conversation history\n/info, /i         - Show current settings\n/retry, /r        - Regenerate last response",
        "chat_history_cleared": "Conversation history cleared",
        "chat_no_history": "No conversation history",
        "chat_history_title": "History ({count} messages)",
        "chat_info_title": "Current Settings:",
        "chat_info_content": "Temperature: {temperature}\nMax tokens: {max_tokens}\nMessages: {messages}",
        "chat_retrying": "Retrying last response...",
        "chat_no_retry": "No previous response to retry",
        "chat_unknown_command": "Unknown command: {command}",
        "chat_unknown_hint": "Type /help for available commands",
        # Run Interactive
        "run_int_no_moe_models": "No MoE GPU models found.",
        "run_int_add_models": "Add models with: kt model scan",
        "run_int_list_all": "List all models: kt model list --all",
        "run_int_step1_title": "Step 1: Select Model (GPU MoE Models)",
        "run_int_select_model": "Select model",
        "run_int_step2_title": "Step 2: Select Inference Method",
        "run_int_method_raw": "RAW Precision (FP8/FP8_PERCHANNEL/BF16/RAWINT4)",
        "run_int_method_amx": "AMX Quantization (INT4/INT8)",
        "run_int_method_gguf": "GGUF (Llamafile)",
        "run_int_method_saved": "Use Saved Configuration",
        "run_int_select_method": "Select inference method",
        "run_int_raw_precision": "RAW Precision:",
        "run_int_select_precision": "Select precision",
        "run_int_amx_method": "AMX Method:",
        "run_int_select_amx": "Select AMX method",
        "run_int_step3_title": "Step 3: NUMA and CPU Configuration",
        "run_int_numa_nodes": "NUMA Nodes (1-{max})",
        "run_int_cpu_threads": "CPU Threads per NUMA (1-{max})",
        "run_int_amx_warning": "⚠ Warning: AMX INT4/INT8 requires compatible CPU. Check with: kt doctor",
        "run_int_step4_title": "Step 4: GPU Experts Configuration",
        "run_int_gpu_experts": "GPU Experts per Layer (0-{max})",
        "run_int_gpu_experts_info": "Total experts: {total}, Activated per token: {active}",
        "run_int_step5_title": "Step 5: KV Cache Configuration",
        "run_int_kv_cache_size": "KV Cache Size (tokens)",
        "run_int_chunk_prefill": "Enable Chunk Prefill?",
        "run_int_chunk_size": "Chunk Prefill Size (tokens)",
        "run_int_gpu_prefill_threshold": "GPU Prefill Threshold (tokens)",
        "run_int_step6_title": "Step 6: GPU Selection and Tensor Parallelism",
        "run_int_available_gpus": "Available GPUs:",
        "run_int_gpu_id": "GPU {id}",
        "run_int_vram_info": "{name} ({total:.1f}GB total, {free:.1f}GB free)",
        "run_int_select_gpus": "Select GPU IDs (comma-separated)",
        "run_int_invalid_gpu_range": "All GPU IDs must be between 0 and {max}",
        "run_int_tp_size": "TP Size (must be power of 2: 1,2,4,8...)",
        "run_int_tp_mismatch": "TP size must match number of selected GPUs ({count})",
        "run_int_tp_not_power_of_2": "TP size must be a power of 2",
        "run_int_mem_fraction": "Static Memory Fraction (0.0-1.0)",
        "run_int_using_saved_mem": "Using saved memory fraction: {fraction}",
        "run_int_step7_title": "Step 7: Parser Configuration (Optional)",
        "run_int_tool_call_parser": "Tool Call Parser (press Enter to skip)",
        "run_int_reasoning_parser": "Reasoning Parser (press Enter to skip)",
        "run_int_step8_title": "Step 8: Host and Port Configuration",
        "run_int_host": "Host",
        "run_int_port": "Port",
        "run_int_port_occupied": "⚠ Port {port} is already in use",
        "run_int_port_suggestion": "Suggested available port: {port}",
        "run_int_use_suggested": "Use suggested port?",
        "run_int_saved_configs": "Saved Configurations:",
        "run_int_config_name": "Configuration {num}",
        "run_int_kt_method": "KT Method:",
        "run_int_numa_nodes_label": "NUMA Nodes:",
        "run_int_cpu_threads_label": "CPU Threads:",
        "run_int_gpu_experts_label": "GPU Experts:",
        "run_int_tp_size_label": "TP Size:",
        "run_int_mem_fraction_label": "Memory Fraction:",
        "run_int_server_label": "Server:",
        "run_int_kv_cache_label": "KV Cache:",
        "run_int_chunk_prefill_label": "Chunk Prefill:",
        "run_int_gpu_prefill_label": "GPU Prefill Thr:",
        "run_int_tool_parser_label": "Tool Call Parser:",
        "run_int_reasoning_parser_label": "Reasoning Parser:",
        "run_int_command_label": "Command:",
        "run_int_select_config": "Select configuration",
        "run_int_gpu_select_required": "Please select {tp} GPUs (TP size from saved config)",
        "run_int_port_check_title": "Port Configuration",
        "run_int_port_checking": "Checking port {port} availability...",
        "run_int_port_available": "Port {port} is available",
        "run_int_saved_config_title": "Saved Configuration",
        "run_int_save_config_title": "Save Configuration",
        "run_int_save_config_prompt": "Save this configuration for future use?",
        "run_int_config_name_prompt": "Configuration name",
        "run_int_config_name_default": "Config {timestamp}",
        "run_int_config_saved": "Configuration saved: {name}",
        "run_int_config_summary": "Configuration Complete",
        "run_int_model_label": "Model:",
        "run_int_selected_gpus_label": "Selected GPUs:",
        # Model command
        "model_supported_title": "KTransformers Supported Models",
        "model_column_model": "Model",
        "model_column_status": "Status",
        "model_column_local_path": "Local Path",
        "model_status_local": "Local",
        "model_status_not_downloaded": "Not downloaded",
        "model_usage_title": "Usage",
        "model_usage_download": "Download a model:",
        "model_usage_list_local": "List local models:",
        "model_usage_search": "Search models:",
        "model_storage_paths_title": "Model Storage Paths",
        "model_local_models_title": "Locally Downloaded Models",
        "model_available_models_title": "Available Models",
        "model_no_local_models": "No locally downloaded models found",
        "model_download_hint": "Download a model with:",
        "model_download_usage_hint": "Usage: kt model download <model-name>",
        "model_download_list_hint": "Use 'kt model download --list' to see available models.",
        "model_download_hf_hint": "Or specify a HuggingFace repo directly: kt model download org/model-name",
        "model_saved_to": "Model saved to: {path}",
        "model_start_with": "Start with: kt run {name}",
        "model_download_failed": "Download failed: {error}",
        "model_hf_cli_not_found": "huggingface-cli not found. Install with: pip install huggingface-hub",
        "model_path_not_exist": "Path does not exist: {path}",
        "model_create_directory": "Create directory {path}?",
        "model_created_directory": "Created directory: {path}",
        "model_create_dir_failed": "Failed to create directory: {error}",
        "model_path_added": "Added model path: {path}",
        "model_path_removed": "Removed model path: {path}",
        "model_path_not_found": "Path not found in configuration or cannot remove last path: {path}",
        "model_search_no_results": "No models found matching '{query}'",
        "model_search_results_title": "Search Results for '{query}'",
        "model_column_name": "Name",
        "model_column_hf_repo": "HuggingFace Repo",
        "model_column_aliases": "Aliases",
        # Model management - new user registry system
        "model_no_registered_models": "No models registered yet.",
        "model_scan_hint": "Scan for models: kt model scan",
        "model_add_hint": "Add a model: kt model add /path/to/model",
        "model_registered_models_title": "Registered Models",
        "model_column_format": "Format",
        "model_column_repo": "Repository",
        "model_column_sha256": "SHA256",
        "model_non_moe_hidden_hint": "Detected {count} non-MoE models, use kt model list --all to show all",
        "model_usage_title": "Common Operations:",
        "model_usage_info": "View details:",
        "model_usage_edit": "Edit model:",
        "model_usage_verify": "Verify integrity:",
        "model_usage_quant": "Quantize model:",
        "model_usage_run": "Run model:",
        "model_usage_scan": "Scan for models:",
        "model_usage_add": "Add model:",
        "model_usage_verbose": "View with file details:",
        "model_no_storage_paths": "No storage paths configured.",
        "model_add_path_hint": "Add a storage path with: kt config set model.storage_paths /path/to/models",
        "model_scanning_paths": "Scanning configured storage paths...",
        "model_scanning_progress": "Scanning: {path}",
        "model_scan_warnings_title": "Warnings",
        "model_scan_no_models_found": "No models found in configured paths.",
        "model_scan_check_paths_hint": "Check your storage paths: kt config get model.storage_paths",
        "model_scan_min_size_hint": "Folders must be ≥{size}GB to be detected as models.",
        "model_scan_found_title": "Found {count} new model(s)",
        "model_column_path": "Path",
        "model_column_size": "Size",
        "model_scan_auto_adding": "Auto-adding models...",
        "model_added": "Added: {name}",
        "model_add_failed": "Failed to add {name}: {error}",
        "model_scan_complete": "Scan complete! Added {count} model(s).",
        "model_scan_interactive_prompt": "Commands: edit <id> | del <id> | done",
        "model_scan_cmd_edit": "Set custom name for model",
        "model_scan_cmd_delete": "Skip this model",
        "model_scan_cmd_done": "Finish and add models",
        "model_scan_marked_skip": "Skipped model #{id}",
        "model_scan_invalid_id": "Invalid model ID: {id}",
        "model_scan_invalid_command": "Invalid command. Use: edit <id> | del <id> | done",
        "model_scan_edit_model": "Edit model {id}",
        "model_scan_edit_note": "You can change the model name before adding it to registry",
        "model_scan_adding_models": "Adding {count} model(s)...",
        "model_scan_next_steps": "Next Steps",
        "model_scan_view_hint": "View registered models: kt model list",
        "model_scan_edit_hint": "Edit model details: kt model edit <name>",
        "model_scan_no_models_added": "No models were added.",
        "model_add_path_not_exist": "Error: Path does not exist: {path}",
        "model_add_not_directory": "Error: Path is not a directory: {path}",
        "model_add_already_registered": "This path is already registered as: {name}",
        "model_add_view_hint": "View with: kt model info {name}",
        "model_add_scanning": "Scanning model files...",
        "model_add_scan_failed": "Failed to scan model: {error}",
        "model_add_no_model_files": "No model files found in {path}",
        "model_add_supported_formats": "Supported: *.safetensors, *.gguf (folder ≥10GB)",
        "model_add_detected": "Detected: {format} format, {size}, {count} file(s)",
        "model_add_name_conflict": "Name '{name}' already exists.",
        "model_add_prompt_name": "Enter a name for this model",
        "model_add_name_exists": "Name already exists. Please choose another name:",
        "model_add_configure_repo": "Configure repository information for SHA256 verification?",
        "model_add_repo_type_prompt": "Select repository type:",
        "model_add_choice": "Choice",
        "model_add_repo_id_prompt": "Enter repository ID (e.g., deepseek-ai/DeepSeek-V3)",
        "model_add_success": "Successfully added model: {name}",
        "model_add_verify_hint": "Verify integrity: kt model verify {name}",
        "model_add_edit_later_hint": "Edit details later: kt model edit {name}",
        "model_add_failed_generic": "Failed to add model: {error}",
        "model_edit_not_found": "Model '{name}' not found.",
        "model_edit_list_hint": "List models: kt model list",
        "model_edit_current_config": "Current Configuration",
        "model_edit_what_to_edit": "What would you like to edit?",
        "model_edit_option_name": "Edit name",
        "model_edit_option_repo": "Configure repository info",
        "model_edit_option_delete": "Delete this model",
        "model_edit_option_cancel": "Cancel / Exit",
        "model_edit_choice_prompt": "Select option",
        "model_edit_new_name": "Enter new name",
        "model_edit_name_conflict": "Name '{name}' already exists. Please choose another:",
        "model_edit_name_updated": "Name updated: {old} → {new}",
        "model_edit_repo_type_prompt": "Repository type (or enter to remove repo info):",
        "model_edit_repo_remove": "Remove repository info",
        "model_edit_repo_id_prompt": "Enter repository ID",
        "model_edit_repo_removed": "Repository info removed",
        "model_edit_repo_updated": "Repository configured: {repo_type} → {repo_id}",
        "model_edit_delete_warning": "Delete model '{name}' from registry?",
        "model_edit_delete_note": "Note: This only removes the registry entry. Model files in {path} will NOT be deleted.",
        "model_edit_delete_confirm": "Confirm deletion?",
        "model_edit_deleted": "Model '{name}' deleted from registry",
        "model_edit_delete_cancelled": "Deletion cancelled",
        "model_edit_cancelled": "Edit cancelled",
        # Model edit - Interactive selection
        "model_edit_select_title": "Select Model to Edit",
        "model_edit_select_model": "Select model",
        "model_edit_invalid_choice": "Invalid choice",
        "model_edit_no_models": "No models found in registry.",
        "model_edit_add_hint_scan": "Add models with:",
        "model_edit_add_hint_add": "Or:",
        # Model edit - Display
        "model_edit_gpu_links": "GPU Links:",
        # Model edit - Menu options
        "model_edit_manage_gpu_links": "Manage GPU Links",
        "model_edit_save_changes": "Save changes",
        "model_edit_has_changes": "(has changes)",
        "model_edit_no_changes": "(no changes)",
        # Model edit - Pending changes messages
        "model_edit_name_pending": "Name will be updated when you save changes.",
        "model_edit_repo_remove_pending": "Repository info will be removed when you save changes.",
        "model_edit_repo_update_pending": "Repository info will be updated when you save changes.",
        # Model edit - GPU link management
        "model_edit_gpu_links_title": "Manage GPU Links for {name}",
        "model_edit_current_gpu_links": "Current GPU links:",
        "model_edit_no_gpu_links": "No GPU links configured.",
        "model_edit_gpu_options": "Options:",
        "model_edit_gpu_add": "Add GPU link",
        "model_edit_gpu_remove": "Remove GPU link",
        "model_edit_gpu_clear": "Clear all GPU links",
        "model_edit_gpu_back": "Back to main menu",
        "model_edit_gpu_choose_option": "Choose option",
        "model_edit_gpu_none_available": "No GPU models available to link.",
        "model_edit_gpu_available_models": "Available GPU models:",
        "model_edit_gpu_already_linked": "(already linked)",
        "model_edit_gpu_enter_number": "Enter GPU model number to add",
        "model_edit_gpu_link_pending": "GPU link will be added when you save changes: {name}",
        "model_edit_gpu_already_exists": "This GPU model is already linked.",
        "model_edit_gpu_invalid_choice": "Invalid choice.",
        "model_edit_gpu_invalid_input": "Invalid input.",
        "model_edit_gpu_none_to_remove": "No GPU links to remove.",
        "model_edit_gpu_choose_to_remove": "Choose GPU link to remove:",
        "model_edit_gpu_enter_to_remove": "Enter number to remove",
        "model_edit_gpu_remove_pending": "GPU link will be removed when you save changes: {name}",
        "model_edit_gpu_none_to_clear": "No GPU links to clear.",
        "model_edit_gpu_clear_confirm": "Remove all GPU links?",
        "model_edit_gpu_clear_pending": "All GPU links will be removed when you save changes.",
        "model_edit_cancelled_short": "Cancelled.",
        # Model edit - Save operation
        "model_edit_no_changes_to_save": "No changes to save.",
        "model_edit_saving": "Saving changes...",
        "model_edit_saved": "Changes saved successfully!",
        "model_edit_updated_config": "Updated Configuration:",
        "model_edit_repo_changed_warning": "⚠ Repository information has changed.",
        "model_edit_verify_hint": "Run [cyan]kt model verify[/cyan] to verify model integrity with SHA256 checksums.",
        "model_edit_discard_changes": "Discard unsaved changes?",
        "model_info_not_found": "Model '{name}' not found.",
        "model_info_list_hint": "List all models: kt model list",
        "model_remove_not_found": "Model '{name}' not found.",
        "model_remove_list_hint": "List models: kt model list",
        "model_remove_warning": "Remove model '{name}' from registry?",
        "model_remove_note": "Note: This only removes the registry entry. Model files will NOT be deleted from {path}.",
        "model_remove_confirm": "Confirm removal?",
        "model_remove_cancelled": "Removal cancelled",
        "model_removed": "Model '{name}' removed from registry",
        "model_remove_failed": "Failed to remove model: {error}",
        "model_refresh_checking": "Checking model paths...",
        "model_refresh_all_valid": "All models are valid! ({count} model(s) checked)",
        "model_refresh_total": "Total models: {total}",
        "model_refresh_missing_found": "Found {count} missing model(s)",
        "model_refresh_suggestions": "Suggested Actions",
        "model_refresh_remove_hint": "Remove from registry: kt model remove <name>",
        "model_refresh_rescan_hint": "Re-scan for models: kt model scan",
        "model_verify_not_found": "Model '{name}' not found.",
        "model_verify_list_hint": "List models: kt model list",
        "model_verify_no_repo": "Model '{name}' has no repository information configured.",
        "model_verify_config_hint": "Configure repository: kt model edit {name}",
        "model_verify_path_missing": "Model path does not exist: {path}",
        "model_verify_starting": "Verifying model integrity...",
        "model_verify_progress": "Repository: {repo_type} → {repo_id}",
        "model_verify_not_implemented": "SHA256 verification not implemented yet",
        "model_verify_future_note": "This feature will fetch official SHA256 hashes from {repo_type} and compare with local files.",
        "model_verify_passed": "Verification passed! All files match official hashes.",
        "model_verify_failed": "Verification failed! {count} file(s) have hash mismatches.",
        "model_verify_all_no_repos": "No models have repository information configured.",
        "model_verify_all_config_hint": "Configure repos using: kt model edit <name>",
        "model_verify_all_found": "Found {count} model(s) with repository info",
        "model_verify_all_manual_hint": "Verify specific model: kt model verify <name>",
        # Coming soon
        "feature_coming_soon": "This feature is coming soon...",
    },
    "zh": {
        # General
        "welcome": "欢迎使用 KTransformers！",
        "goodbye": "再见！",
        "error": "错误",
        "warning": "警告",
        "success": "成功",
        "info": "信息",
        "yes": "是",
        "no": "否",
        "cancel": "取消",
        "confirm": "确认",
        "done": "完成",
        "failed": "失败",
        "skip": "跳过",
        "back": "返回",
        "next": "下一步",
        "retry": "重试",
        "abort": "中止",
        # Version command
        "version_info": "KTransformers CLI",
        "version_python": "Python",
        "version_platform": "平台",
        "version_cuda": "CUDA",
        "version_cuda_not_found": "未找到",
        "version_kt_kernel": "kt-kernel",
        "version_ktransformers": "ktransformers",
        "version_sglang": "sglang-kt",
        "version_llamafactory": "llamafactory",
        "version_not_installed": "未安装",
        # Install command
        "install_detecting_env": "检测环境管理工具...",
        "install_found": "发现 {name} (版本 {version})",
        "install_not_found": "未找到: {name}",
        "install_checking_env": "检查现有环境...",
        "install_env_exists": "发现现有 'kt' 环境",
        "install_env_not_exists": "未发现 'kt' 环境",
        "install_no_env_manager": "未检测到虚拟环境管理工具",
        "install_select_method": "请选择安装方式:",
        "install_method_conda": "创建新的 conda 环境 'kt' (推荐)",
        "install_method_venv": "创建新的 venv 环境",
        "install_method_uv": "创建新的 uv 环境 (快速)",
        "install_method_docker": "使用 Docker 容器",
        "install_method_system": "安装到系统 Python (不推荐)",
        "install_select_mode": "请选择安装模式:",
        "install_mode_inference": "推理模式 - 安装 kt-kernel + SGLang",
        "install_mode_sft": "训练模式 - 安装 kt-sft + LlamaFactory",
        "install_mode_full": "完整安装 - 安装所有组件",
        "install_creating_env": "正在创建 {type} 环境 '{name}'...",
        "install_env_created": "环境创建成功",
        "install_installing_deps": "正在安装依赖...",
        "install_checking_deps": "检查依赖版本...",
        "install_dep_ok": "正常",
        "install_dep_outdated": "需更新",
        "install_dep_missing": "缺失",
        "install_installing_pytorch": "正在安装 PyTorch...",
        "install_installing_from_requirements": "从依赖文件安装...",
        "install_deps_outdated": "发现 {count} 个包需要更新，是否继续？",
        "install_updating": "正在更新包...",
        "install_complete": "安装完成！",
        "install_activate_hint": "激活环境: {command}",
        "install_start_hint": "开始使用: kt run --help",
        "install_docker_pulling": "正在拉取 Docker 镜像...",
        "install_docker_complete": "Docker 镜像已就绪！",
        "install_docker_run_hint": "运行: docker run --gpus all -p 30000:30000 {image} kt run {model}",
        "install_in_venv": "当前在虚拟环境中: {name}",
        "install_continue_without_venv": "继续安装到系统 Python？",
        "install_already_installed": "所有依赖已安装！",
        "install_confirm": "安装 {count} 个包？",
        # Install - System dependencies
        "install_checking_system_deps": "检查系统依赖...",
        "install_dep_name": "依赖项",
        "install_dep_status": "状态",
        "install_deps_all_installed": "所有系统依赖已安装",
        "install_deps_install_prompt": "是否安装缺失的依赖？",
        "install_installing_system_deps": "正在安装系统依赖...",
        "install_installing_dep": "正在安装 {name}",
        "install_dep_no_install_cmd": "{os} 系统上没有 {name} 的安装命令",
        "install_dep_install_failed": "安装 {name} 失败",
        "install_deps_skipped": "跳过依赖安装",
        "install_deps_failed": "系统依赖安装失败",
        # Install - CPU detection
        "install_auto_detect_cpu": "正在自动检测 CPU 能力...",
        "install_cpu_features": "检测到的 CPU 特性: {features}",
        "install_cpu_no_features": "未检测到高级 CPU 特性",
        # Install - Build configuration
        "install_build_config": "构建配置:",
        "install_native_warning": "注意: 二进制文件仅针对当前 CPU 优化（不可移植）",
        "install_building_from_source": "正在从源码构建 kt-kernel...",
        "install_build_failed": "构建失败",
        "install_build_success": "构建成功",
        # Install - Verification
        "install_verifying": "正在验证安装...",
        "install_verify_success": "kt-kernel {version} ({variant} 变体) 安装成功",
        "install_verify_failed": "验证失败: {error}",
        # Install - Docker
        "install_docker_guide_title": "Docker 安装",
        "install_docker_guide_desc": "有关 Docker 安装，请参阅官方指南:",
        # Config command
        "config_show_title": "当前配置",
        "config_set_success": "配置已更新: {key} = {value}",
        "config_get_value": "{key} = {value}",
        "config_get_not_found": "未找到配置项 '{key}'",
        "config_reset_confirm": "这将重置所有配置为默认值。是否继续？",
        "config_reset_success": "配置已重置为默认值",
        "config_file_location": "配置文件: {path}",
        # Doctor command
        "doctor_title": "KTransformers 环境诊断",
        "doctor_checking": "正在运行诊断...",
        "doctor_check_python": "Python 版本",
        "doctor_check_cuda": "CUDA 可用性",
        "doctor_check_gpu": "GPU 检测",
        "doctor_check_cpu": "CPU",
        "doctor_check_cpu_isa": "CPU 指令集",
        "doctor_check_numa": "NUMA 拓扑",
        "doctor_check_memory": "系统内存",
        "doctor_check_disk": "磁盘空间",
        "doctor_check_packages": "必需的包",
        "doctor_check_env": "环境变量",
        "doctor_status_ok": "正常",
        "doctor_status_warning": "警告",
        "doctor_status_error": "错误",
        "doctor_gpu_found": "发现 {count} 个 GPU: {names}",
        "doctor_gpu_not_found": "未检测到 GPU",
        "doctor_cpu_info": "{name} ({cores} 核心 / {threads} 线程)",
        "doctor_cpu_isa_info": "{isa_list}",
        "doctor_cpu_isa_missing": "缺少推荐指令集: {missing}",
        "doctor_numa_info": "{nodes} 个节点",
        "doctor_numa_detail": "{node}: CPU {cpus}",
        "doctor_memory_info": "{available} 可用 / {total} 总计",
        "doctor_memory_freq": "{available} 可用 / {total} 总计 ({freq}MHz {type})",
        "doctor_disk_info": "{path} 有 {available} 可用空间",
        "doctor_all_ok": "所有检查通过！您的环境已就绪。",
        "doctor_has_issues": "发现一些问题，请查看上方的警告/错误信息。",
        # Run command
        "run_detecting_hardware": "检测硬件配置...",
        "run_gpu_info": "GPU: {name} ({vram}GB 显存)",
        "run_cpu_info": "CPU: {name} ({cores} 核心, {numa} NUMA 节点)",
        "run_ram_info": "内存: {total}GB",
        "run_checking_model": "检查模型状态...",
        "run_model_path": "模型路径: {path}",
        "run_weights_not_found": "未找到量化权重",
        "run_quant_prompt": "是否现在量化模型？(这可能需要一些时间)",
        "run_quantizing": "正在量化模型...",
        "run_starting_server": "正在启动服务器...",
        "run_server_mode": "模式: SGLang + kt-kernel",
        "run_server_port": "端口: {port}",
        "run_gpu_experts": "GPU 专家: {count}/层",
        "run_cpu_threads": "CPU 线程: {count}",
        "run_server_started": "服务器已启动！",
        "run_api_url": "API 地址: http://{host}:{port}",
        "run_docs_url": "文档地址: http://{host}:{port}/docs",
        "run_stop_hint": "按 Ctrl+C 停止服务器",
        "run_model_not_found": "未找到模型 '{name}'。请先运行 'kt download'。",
        "run_multiple_matches": "找到多个匹配的模型，请选择:",
        "run_select_model": "选择模型",
        "run_select_model_title": "选择要运行的模型",
        "run_select_model_prompt": "输入编号",
        "run_local_models": "本地模型 (已下载)",
        "run_registered_models": "注册模型",
        # Download command
        "download_list_title": "可用模型",
        "download_searching": "正在搜索模型 '{name}'...",
        "download_found": "找到: {name}",
        "download_multiple_found": "找到多个匹配:",
        "download_select": "选择要下载的模型:",
        "download_destination": "目标路径: {path}",
        "download_starting": "开始下载...",
        "download_progress": "正在下载 {name}...",
        "download_complete": "下载完成！",
        "download_already_exists": "模型已存在于 {path}",
        "download_overwrite_prompt": "是否覆盖现有文件？",
        # Quant command
        "quant_input_path": "输入路径: {path}",
        "quant_output_path": "输出路径: {path}",
        "quant_method": "量化方法: {method}",
        "quant_starting": "开始量化...",
        "quant_progress": "正在量化...",
        "quant_complete": "量化完成！",
        "quant_input_not_found": "未找到输入模型: {path}",
        "quant_cpu_threads": "CPU 线程数: {threads}",
        "quant_numa_nodes": "NUMA 节点数: {nodes}",
        "quant_time_warning": "量化可能需要 30-60 分钟，具体取决于模型大小。",
        "quant_disk_analysis": "磁盘空间分析：",
        "quant_source_size": "源模型大小：",
        "quant_estimated_size": "预估输出大小：",
        "quant_available_space": "可用空间：",
        "quant_insufficient_space": "警告：磁盘空间不足！",
        "quant_required_space": "所需空间（含20%缓冲）：",
        "quant_shortage": "不足：",
        "quant_may_fail": "量化可能失败或生成不完整的文件。",
        "quant_continue_anyway": "仍然继续？",
        "quant_settings": "量化设置：",
        "quant_registered": "量化模型已注册：{name}",
        "quant_view_with": "查看：",
        "quant_use_with": "使用：",
        "quant_register_failed": "自动注册模型失败：{error}",
        "quant_output_exists": "输出路径已存在：{path}",
        "quant_using_unique": "使用唯一名称：{path}",
        # Interactive quant
        "quant_interactive_title": "交互式量化配置",
        "quant_new_model_notice": "⚠ 注意：部分新模型暂时无法量化（转换脚本未适配），推荐使用原精度进行推理（无需转换权重）。",
        "quant_no_moe_models": "未找到可量化的 MoE 模型。",
        "quant_only_moe": "只有 MoE 模型（如 DeepSeek-V3）可以被量化为 AMX 格式。",
        "quant_add_models": "添加模型：{command}",
        "quant_moe_available": "可量化的 MoE 模型：",
        "quant_select_model": "选择要量化的模型",
        "quant_invalid_choice": "无效选择",
        "quant_step2_method": "第 2 步：量化方法",
        "quant_method_label": "量化方法：",
        "quant_int4_desc": "INT4",
        "quant_int8_desc": "INT8",
        "quant_select_method": "选择量化方法",
        "quant_input_type_label": "输入权重类型：",
        "quant_fp8_desc": "FP8（适用于 8 位浮点权重）",
        "quant_fp16_desc": "FP16（适用于 16 位浮点权重）",
        "quant_bf16_desc": "BF16（适用于 Brain Float 16 权重）",
        "quant_select_input_type": "选择输入类型",
        "quant_step3_cpu": "第 3 步：CPU 配置",
        "quant_cpu_threads_prompt": "CPU 线程数（1 到 {max}）",
        "quant_numa_nodes_prompt": "NUMA 节点数（1 到 {max}）",
        "quant_use_gpu_label": "是否使用 GPU 进行转换？",
        "quant_gpu_speedup": "GPU 可以显著加快量化速度",
        "quant_enable_gpu": "启用 GPU 加速？",
        "quant_step4_output": "第 4 步：输出路径",
        "quant_default_path": "默认：",
        "quant_use_default": "使用默认输出路径？",
        "quant_custom_path": "输入自定义输出路径",
        "quant_output_exists_warn": "⚠ 输出路径已存在：{path}",
        "quant_using_unique_name": "→ 使用唯一名称：{path}",
        "quant_config_summary": "配置摘要",
        "quant_summary_model": "模型：",
        "quant_summary_method": "方法：",
        "quant_summary_input_type": "输入类型：",
        "quant_summary_cpu_threads": "CPU 线程数：",
        "quant_summary_numa": "NUMA 节点数：",
        "quant_summary_gpu": "使用 GPU：",
        "quant_summary_output": "输出路径：",
        "quant_start_question": "开始量化？",
        "quant_cancelled": "已取消",
        "quant_config_complete": "配置完成",
        "quant_time_elapsed": "耗时：",
        "yes": "是",
        "no": "否",
        # SFT command
        "sft_mode_train": "训练模式",
        "sft_mode_chat": "聊天模式",
        "sft_mode_export": "导出模式",
        "sft_config_path": "配置文件: {path}",
        "sft_starting": "正在启动 {mode}...",
        "sft_complete": "{mode} 完成！",
        "sft_config_not_found": "未找到配置文件: {path}",
        # Bench command
        "bench_starting": "开始基准测试...",
        "bench_type": "测试类型: {type}",
        "bench_complete": "基准测试完成！",
        "bench_results_title": "基准测试结果",
        # Common prompts
        "prompt_continue": "是否继续？",
        "prompt_select": "请选择:",
        "prompt_enter_value": "请输入:",
        "prompt_confirm_action": "确认此操作？",
        # First-run setup - Model path selection
        "setup_model_path_title": "模型存储位置",
        "setup_model_path_desc": "大语言模型体积较大（50-200GB+）。请选择一个有足够空间的存储位置：",
        "setup_scanning_disks": "正在扫描可用存储位置...",
        "setup_disk_option": "{path} (可用 {available} / 总共 {total})",
        "setup_disk_option_recommended": "{path} (可用 {available} / 总共 {total}) [推荐]",
        "setup_custom_path": "输入自定义路径",
        "setup_enter_custom_path": "请输入模型存储路径",
        "setup_path_not_exist": "路径不存在，是否创建？",
        "setup_path_no_write": "没有该路径的写入权限，请选择其他路径。",
        "setup_path_low_space": "警告：可用空间不足 100GB，可能无法存储大型模型。",
        "setup_model_path_set": "模型存储路径已设置为: {path}",
        "setup_no_large_disk": "未发现大容量存储位置，使用默认路径。",
        "setup_scanning_models": "正在扫描已有模型...",
        "setup_found_models": "发现 {count} 个模型:",
        "setup_model_info": "{name} ({size}, {type})",
        "setup_no_models_found": "该位置未发现已有模型。",
        "setup_location_has_models": "发现 {count} 个模型",
        "setup_installing_completion": "正在为 {shell} 安装命令补全...",
        "setup_completion_installed": "命令补全已安装！重启终端后生效。",
        "setup_completion_failed": "命令补全安装失败。请手动运行 'kt --install-completion'。",
        # Auto completion
        "completion_installed_title": "命令补全",
        "completion_installed_for": "已为 {shell} 安装命令补全",
        "completion_activate_now": "在当前终端会话中启用补全，请运行：",
        "completion_next_session": "新的终端会话将自动启用补全。",
        # SGLang
        "sglang_not_found": "未找到 SGLang",
        "sglang_pypi_warning": "PyPI 版本的 SGLang 可能与 kt-kernel 不兼容。请使用 sglang-kt: pip install sglang-kt",
        "sglang_pypi_hint": "PyPI 版本可能不兼容。安装 kvcache-ai 分支: pip install sglang-kt (或在 ktransformers 根目录运行 ./install.sh)",
        "sglang_install_hint": "安装 SGLang: pip install sglang-kt (或在 ktransformers 根目录运行 ./install.sh)",
        "sglang_recommend_source": "建议重新安装 kvcache-ai 分支: pip uninstall sglang -y && pip install sglang-kt",
        "sglang_kt_kernel_not_supported": "SGLang 不支持 kt-kernel (缺少 --kt-gpu-prefill-token-threshold 参数)",
        "sglang_checking_kt_kernel_support": "正在检查 SGLang kt-kernel 支持...",
        "sglang_kt_kernel_supported": "SGLang kt-kernel 支持已验证",
        # Chat
        "chat_proxy_detected": "检测到环境中存在代理设置",
        "chat_proxy_confirm": "是否使用代理连接？",
        "chat_proxy_disabled": "已在本次会话中禁用代理",
        "chat_openai_required": "聊天功能需要 OpenAI Python SDK。",
        "chat_install_hint": "安装命令：",
        "chat_title": "KTransformers 对话",
        "chat_server": "服务器",
        "chat_temperature": "温度",
        "chat_max_tokens": "最大 tokens",
        "chat_help_hint": "输入 '/help' 查看命令，'/quit' 退出",
        "chat_connecting": "正在连接服务器...",
        "chat_no_models": "服务器上没有可用模型",
        "chat_model_not_found": "未找到模型 '{model}'。可用模型：{available}",
        "chat_connected": "已连接到模型：{model}",
        "chat_connect_failed": "连接服务器失败：{error}",
        "chat_server_not_running": "请确保模型服务器正在运行：",
        "chat_user_prompt": "用户",
        "chat_assistant_prompt": "助手",
        "chat_generation_error": "生成回复时出错：{error}",
        "chat_interrupted": "对话已中断。再见！",
        "chat_history_saved": "历史记录已保存到：{path}",
        "chat_goodbye": "再见！",
        "chat_help_title": "可用命令：",
        "chat_help_content": "/help, /h         - 显示此帮助信息\n/quit, /exit, /q  - 退出聊天\n/clear, /c        - 清除对话历史\n/history, /hist   - 显示对话历史\n/info, /i         - 显示当前设置\n/retry, /r        - 重新生成上一个回复",
        "chat_history_cleared": "对话历史已清除",
        "chat_no_history": "暂无对话历史",
        "chat_history_title": "历史记录（{count} 条消息）",
        "chat_info_title": "当前设置：",
        "chat_info_content": "温度：{temperature}\n最大 tokens：{max_tokens}\n消息数：{messages}",
        "chat_retrying": "正在重试上一个回复...",
        "chat_no_retry": "没有可重试的回复",
        "chat_unknown_command": "未知命令：{command}",
        "chat_unknown_hint": "输入 /help 查看可用命令",
        # Run Interactive
        "run_int_no_moe_models": "未找到 MoE GPU 模型。",
        "run_int_add_models": "添加模型：kt model scan",
        "run_int_list_all": "列出所有模型：kt model list --all",
        "run_int_step1_title": "第 1 步：选择模型（GPU MoE 模型）",
        "run_int_select_model": "选择模型",
        "run_int_step2_title": "第 2 步：选择推理方法",
        "run_int_method_raw": "RAW 精度（FP8/FP8_PERCHANNEL/BF16/RAWINT4）",
        "run_int_method_amx": "AMX 量化（INT4/INT8）",
        "run_int_method_gguf": "GGUF（Llamafile）",
        "run_int_method_saved": "使用已保存的配置",
        "run_int_select_method": "选择推理方法",
        "run_int_raw_precision": "RAW 精度：",
        "run_int_select_precision": "选择精度",
        "run_int_amx_method": "AMX 方法：",
        "run_int_select_amx": "选择 AMX 方法",
        "run_int_step3_title": "第 3 步：NUMA 和 CPU 配置",
        "run_int_numa_nodes": "NUMA 节点数（1-{max}）",
        "run_int_cpu_threads": "每个 NUMA 的 CPU 线程数（1-{max}）",
        "run_int_amx_warning": "⚠ 警告：AMX INT4/INT8 需要兼容的 CPU。检查命令：kt doctor",
        "run_int_step4_title": "第 4 步：GPU 专家配置",
        "run_int_gpu_experts": "每层 GPU 专家数（0-{max}）",
        "run_int_gpu_experts_info": "总专家数：{total}，每 token 激活：{active}",
        "run_int_step5_title": "第 5 步：KV Cache 配置",
        "run_int_kv_cache_size": "KV Cache 大小（tokens）",
        "run_int_chunk_prefill": "启用分块预填充？",
        "run_int_chunk_size": "分块预填充大小（tokens）",
        "run_int_gpu_prefill_threshold": "GPU 预填充阈值（tokens）",
        "run_int_step6_title": "第 6 步：GPU 选择和张量并行",
        "run_int_available_gpus": "可用 GPU：",
        "run_int_gpu_id": "GPU {id}",
        "run_int_vram_info": "{name}（总计 {total:.1f}GB，空闲 {free:.1f}GB）",
        "run_int_select_gpus": "选择 GPU ID（逗号分隔）",
        "run_int_invalid_gpu_range": "所有 GPU ID 必须在 0 到 {max} 之间",
        "run_int_tp_size": "TP 大小（必须是 2 的幂：1,2,4,8...）",
        "run_int_tp_mismatch": "TP 大小必须与选择的 GPU 数量匹配（{count}）",
        "run_int_tp_not_power_of_2": "TP 大小必须是 2 的幂",
        "run_int_mem_fraction": "静态内存占用比例（0.0-1.0）",
        "run_int_using_saved_mem": "使用已保存的内存占用比例：{fraction}",
        "run_int_step7_title": "第 7 步：解析器配置（可选）",
        "run_int_tool_call_parser": "工具调用解析器（按回车跳过）",
        "run_int_reasoning_parser": "推理解析器（按回车跳过）",
        "run_int_step8_title": "第 8 步：主机和端口配置",
        "run_int_host": "主机",
        "run_int_port": "端口",
        "run_int_port_occupied": "⚠ 端口 {port} 已被占用",
        "run_int_port_suggestion": "建议使用可用端口：{port}",
        "run_int_use_suggested": "使用建议的端口？",
        "run_int_saved_configs": "已保存的配置：",
        "run_int_config_name": "配置 {num}",
        "run_int_kt_method": "KT 方法：",
        "run_int_numa_nodes_label": "NUMA 节点：",
        "run_int_cpu_threads_label": "CPU 线程：",
        "run_int_gpu_experts_label": "GPU 专家：",
        "run_int_tp_size_label": "TP 大小：",
        "run_int_mem_fraction_label": "内存占用比例：",
        "run_int_server_label": "服务器：",
        "run_int_kv_cache_label": "KV Cache：",
        "run_int_chunk_prefill_label": "分块预填充：",
        "run_int_gpu_prefill_label": "GPU 预填充阈值：",
        "run_int_tool_parser_label": "工具调用解析器：",
        "run_int_reasoning_parser_label": "推理解析器：",
        "run_int_command_label": "命令：",
        "run_int_select_config": "选择配置",
        "run_int_gpu_select_required": "请选择 {tp} 个 GPU（来自已保存配置的 TP 大小）",
        "run_int_port_check_title": "端口配置",
        "run_int_port_checking": "正在检查端口 {port} 可用性...",
        "run_int_port_available": "端口 {port} 可用",
        "run_int_saved_config_title": "已保存的配置",
        "run_int_save_config_title": "保存配置",
        "run_int_save_config_prompt": "保存此配置以供将来使用？",
        "run_int_config_name_prompt": "配置名称",
        "run_int_config_name_default": "配置 {timestamp}",
        "run_int_config_saved": "配置已保存：{name}",
        "run_int_config_summary": "配置完成",
        "run_int_model_label": "模型：",
        "run_int_selected_gpus_label": "已选择的 GPU：",
        # Model command
        "model_supported_title": "KTransformers 支持的模型",
        "model_column_model": "模型",
        "model_column_status": "状态",
        "model_column_local_path": "本地路径",
        "model_status_local": "本地",
        "model_status_not_downloaded": "未下载",
        "model_usage_title": "使用方法",
        "model_usage_download": "下载模型:",
        "model_usage_list_local": "列出本地模型:",
        "model_usage_search": "搜索模型:",
        "model_storage_paths_title": "模型存储路径",
        "model_local_models_title": "本地已下载的模型",
        "model_available_models_title": "可用模型",
        "model_no_local_models": "未找到本地已下载的模型",
        "model_download_hint": "下载模型:",
        "model_download_usage_hint": "用法: kt model download <模型名称>",
        "model_download_list_hint": "使用 'kt model download --list' 查看可用模型。",
        "model_download_hf_hint": "或直接指定 HuggingFace 仓库: kt model download org/model-name",
        "model_saved_to": "模型已保存到: {path}",
        "model_start_with": "启动命令: kt run {name}",
        "model_download_failed": "下载失败: {error}",
        "model_hf_cli_not_found": "未找到 huggingface-cli。请安装: pip install huggingface-hub",
        "model_path_not_exist": "路径不存在: {path}",
        "model_create_directory": "创建目录 {path}？",
        "model_created_directory": "已创建目录: {path}",
        "model_create_dir_failed": "创建目录失败: {error}",
        "model_path_added": "已添加模型路径: {path}",
        "model_path_removed": "已移除模型路径: {path}",
        "model_path_not_found": "路径未找到或无法移除最后一个路径: {path}",
        "model_search_no_results": "未找到匹配 '{query}' 的模型",
        "model_search_results_title": "'{query}' 的搜索结果",
        "model_column_name": "名称",
        "model_column_hf_repo": "HuggingFace 仓库",
        "model_column_aliases": "别名",
        # Model management - new user registry system
        "model_no_registered_models": "尚未注册任何模型。",
        "model_scan_hint": "扫描模型: kt model scan",
        "model_add_hint": "添加模型: kt model add /path/to/model",
        "model_registered_models_title": "已注册的模型",
        "model_column_format": "格式",
        "model_column_repo": "仓库",
        "model_column_sha256": "SHA256",
        "model_non_moe_hidden_hint": "检测到 {count} 个非MoE模型，使用 kt model list --all 展示全部",
        "model_usage_title": "常用操作:",
        "model_usage_info": "查看详情:",
        "model_usage_edit": "编辑模型:",
        "model_usage_verify": "校验权重:",
        "model_usage_quant": "量化模型:",
        "model_usage_run": "运行模型:",
        "model_usage_scan": "扫描模型:",
        "model_usage_add": "添加模型:",
        "model_usage_verbose": "查看包含文件详情:",
        "model_no_storage_paths": "未配置存储路径。",
        "model_add_path_hint": "添加存储路径: kt config set model.storage_paths /path/to/models",
        "model_scanning_paths": "正在扫描配置的存储路径...",
        "model_scanning_progress": "扫描中: {path}",
        "model_scan_warnings_title": "警告",
        "model_scan_no_models_found": "在配置的路径中未找到模型。",
        "model_scan_check_paths_hint": "检查存储路径: kt config get model.storage_paths",
        "model_scan_min_size_hint": "文件夹必须 ≥{size}GB 才能被识别为模型。",
        "model_scan_found_title": "发现 {count} 个新模型",
        "model_column_path": "路径",
        "model_column_size": "大小",
        "model_scan_auto_adding": "正在自动添加模型...",
        "model_added": "已添加: {name}",
        "model_add_failed": "添加 {name} 失败: {error}",
        "model_scan_complete": "扫描完成！已添加 {count} 个模型。",
        "model_scan_interactive_prompt": "命令: edit <id> | del <id> | done",
        "model_scan_cmd_edit": "设置模型自定义名称和仓库",
        "model_scan_cmd_delete": "跳过此模型",
        "model_scan_cmd_done": "完成并添加模型",
        "model_scan_marked_skip": "已跳过模型 #{id}",
        "model_scan_invalid_id": "无效的模型 ID: {id}",
        "model_scan_invalid_command": "无效命令。使用: edit <id> | del <id> | done",
        "model_scan_edit_model": "编辑模型 {id}",
        "model_scan_edit_note": "您可以在添加到注册表前更改模型名称和配置仓库信息",
        "model_scan_adding_models": "正在添加 {count} 个模型...",
        "model_scan_next_steps": "后续步骤",
        "model_scan_view_hint": "查看已注册模型: kt model list",
        "model_scan_edit_hint": "编辑模型详情: kt model edit <name>",
        "model_scan_no_models_added": "未添加任何模型。",
        "model_add_path_not_exist": "错误: 路径不存在: {path}",
        "model_add_not_directory": "错误: 路径不是目录: {path}",
        "model_add_already_registered": "此路径已注册为: {name}",
        "model_add_view_hint": "查看: kt model info {name}",
        "model_add_scanning": "正在扫描模型文件...",
        "model_add_scan_failed": "扫描模型失败: {error}",
        "model_add_no_model_files": "在 {path} 中未找到模型文件",
        "model_add_supported_formats": "支持: *.safetensors, *.gguf (文件夹 ≥10GB)",
        "model_add_detected": "检测到: {format} 格式, {size}, {count} 个文件",
        "model_add_name_conflict": "名称 '{name}' 已存在。",
        "model_add_prompt_name": "为此模型输入名称",
        "model_add_name_exists": "名称已存在。请选择其他名称:",
        "model_add_configure_repo": "配置仓库信息以进行 SHA256 验证?",
        "model_add_repo_type_prompt": "选择仓库类型:",
        "model_add_choice": "选择",
        "model_add_repo_id_prompt": "输入仓库 ID (例如: deepseek-ai/DeepSeek-V3)",
        "model_add_success": "成功添加模型: {name}",
        "model_add_verify_hint": "验证完整性: kt model verify {name}",
        "model_add_edit_later_hint": "稍后编辑详情: kt model edit {name}",
        "model_add_failed_generic": "添加模型失败: {error}",
        "model_edit_not_found": "未找到模型 '{name}'。",
        "model_edit_list_hint": "列出模型: kt model list",
        "model_edit_current_config": "当前配置",
        "model_edit_what_to_edit": "您想编辑什么?",
        "model_edit_option_name": "编辑名称",
        "model_edit_option_repo": "配置仓库信息",
        "model_edit_option_delete": "删除此模型",
        "model_edit_option_cancel": "取消 / 退出",
        "model_edit_choice_prompt": "选择选项",
        "model_edit_new_name": "输入新名称",
        "model_edit_name_conflict": "名称 '{name}' 已存在。请选择其他名称:",
        "model_edit_name_updated": "名称已更新: {old} → {new}",
        "model_edit_repo_type_prompt": "仓库类型 (或按回车删除仓库信息):",
        "model_edit_repo_remove": "删除仓库信息",
        "model_edit_repo_id_prompt": "输入仓库 ID",
        "model_edit_repo_removed": "仓库信息已删除",
        "model_edit_repo_updated": "仓库已配置: {repo_type} → {repo_id}",
        "model_edit_delete_warning": "从注册表中删除模型 '{name}'?",
        "model_edit_delete_note": "注意: 这只会删除注册表条目。{path} 中的模型文件不会被删除。",
        "model_edit_delete_confirm": "确认删除?",
        "model_edit_deleted": "模型 '{name}' 已从注册表中删除",
        "model_edit_delete_cancelled": "删除已取消",
        "model_edit_cancelled": "编辑已取消",
        # Model edit - Interactive selection
        "model_edit_select_title": "选择要编辑的模型",
        "model_edit_select_model": "选择模型",
        "model_edit_invalid_choice": "无效选择",
        "model_edit_no_models": "注册表中未找到模型。",
        "model_edit_add_hint_scan": "添加模型:",
        "model_edit_add_hint_add": "或:",
        # Model edit - Display
        "model_edit_gpu_links": "GPU 链接:",
        # Model edit - Menu options
        "model_edit_manage_gpu_links": "管理 GPU 链接",
        "model_edit_save_changes": "保存更改",
        "model_edit_has_changes": "(有更改)",
        "model_edit_no_changes": "(无更改)",
        # Model edit - Pending changes messages
        "model_edit_name_pending": "名称将在保存更改时更新。",
        "model_edit_repo_remove_pending": "仓库信息将在保存更改时删除。",
        "model_edit_repo_update_pending": "仓库信息将在保存更改时更新。",
        # Model edit - GPU link management
        "model_edit_gpu_links_title": "管理 {name} 的 GPU 链接",
        "model_edit_current_gpu_links": "当前 GPU 链接:",
        "model_edit_no_gpu_links": "未配置 GPU 链接。",
        "model_edit_gpu_options": "选项:",
        "model_edit_gpu_add": "添加 GPU 链接",
        "model_edit_gpu_remove": "删除 GPU 链接",
        "model_edit_gpu_clear": "清除所有 GPU 链接",
        "model_edit_gpu_back": "返回主菜单",
        "model_edit_gpu_choose_option": "选择选项",
        "model_edit_gpu_none_available": "没有可链接的 GPU 模型。",
        "model_edit_gpu_available_models": "可用的 GPU 模型:",
        "model_edit_gpu_already_linked": "(已链接)",
        "model_edit_gpu_enter_number": "输入要添加的 GPU 模型编号",
        "model_edit_gpu_link_pending": "GPU 链接将在保存更改时添加: {name}",
        "model_edit_gpu_already_exists": "此 GPU 模型已链接。",
        "model_edit_gpu_invalid_choice": "无效选择。",
        "model_edit_gpu_invalid_input": "无效输入。",
        "model_edit_gpu_none_to_remove": "没有可删除的 GPU 链接。",
        "model_edit_gpu_choose_to_remove": "选择要删除的 GPU 链接:",
        "model_edit_gpu_enter_to_remove": "输入要删除的编号",
        "model_edit_gpu_remove_pending": "GPU 链接将在保存更改时删除: {name}",
        "model_edit_gpu_none_to_clear": "没有可清除的 GPU 链接。",
        "model_edit_gpu_clear_confirm": "删除所有 GPU 链接?",
        "model_edit_gpu_clear_pending": "所有 GPU 链接将在保存更改时删除。",
        "model_edit_cancelled_short": "已取消。",
        # Model edit - Save operation
        "model_edit_no_changes_to_save": "没有更改可保存。",
        "model_edit_saving": "正在保存更改...",
        "model_edit_saved": "更改保存成功!",
        "model_edit_updated_config": "更新后的配置:",
        "model_edit_repo_changed_warning": "⚠ 仓库信息已更改。",
        "model_edit_verify_hint": "运行 [cyan]kt model verify[/cyan] 以使用 SHA256 校验和验证模型完整性。",
        "model_edit_discard_changes": "放弃未保存的更改?",
        "model_info_not_found": "未找到模型 '{name}'。",
        "model_info_list_hint": "列出所有模型: kt model list",
        "model_remove_not_found": "未找到模型 '{name}'。",
        "model_remove_list_hint": "列出模型: kt model list",
        "model_remove_warning": "从注册表中删除模型 '{name}'?",
        "model_remove_note": "注意: 这只会删除注册表条目。模型文件不会从 {path} 中删除。",
        "model_remove_confirm": "确认删除?",
        "model_remove_cancelled": "删除已取消",
        "model_removed": "模型 '{name}' 已从注册表中删除",
        "model_remove_failed": "删除模型失败: {error}",
        "model_refresh_checking": "正在检查模型路径...",
        "model_refresh_all_valid": "所有模型都有效! (已检查 {count} 个模型)",
        "model_refresh_total": "总模型数: {total}",
        "model_refresh_missing_found": "发现 {count} 个缺失的模型",
        "model_refresh_suggestions": "建议操作",
        "model_refresh_remove_hint": "从注册表中删除: kt model remove <name>",
        "model_refresh_rescan_hint": "重新扫描模型: kt model scan",
        "model_verify_not_found": "未找到模型 '{name}'。",
        "model_verify_list_hint": "列出模型: kt model list",
        "model_verify_no_repo": "模型 '{name}' 未配置仓库信息。",
        "model_verify_config_hint": "配置仓库: kt model edit {name}",
        "model_verify_path_missing": "模型路径不存在: {path}",
        "model_verify_starting": "正在验证模型完整性...",
        "model_verify_progress": "仓库: {repo_type} → {repo_id}",
        "model_verify_not_implemented": "SHA256 验证尚未实现",
        "model_verify_future_note": "此功能将从 {repo_type} 获取官方 SHA256 哈希值并与本地文件进行比较。",
        "model_verify_passed": "验证通过！所有文件都与官方哈希匹配。",
        "model_verify_failed": "验证失败！{count} 个文件的哈希不匹配。",
        "model_verify_all_no_repos": "没有模型配置了仓库信息。",
        "model_verify_all_config_hint": "配置仓库使用: kt model edit <name>",
        "model_verify_all_found": "发现 {count} 个配置了仓库信息的模型",
        "model_verify_all_manual_hint": "验证特定模型: kt model verify <name>",
        # Coming soon
        "feature_coming_soon": "此功能即将推出...",
    },
}


# Cache for language detection to avoid repeated I/O
_lang_cache: str | None = None


def get_lang() -> str:
    """
    Detect the current language setting.

    Priority:
    1. KT_LANG environment variable
    2. Config file general.language setting
    3. LANG environment variable (if config is "auto")
    4. Default to English

    Returns:
        Language code: "zh" for Chinese, "en" for English
    """
    global _lang_cache

    # 1. Check KT_LANG environment variable (highest priority)
    kt_lang = os.environ.get("KT_LANG", "").lower()
    if kt_lang:
        return "zh" if kt_lang.startswith("zh") else "en"

    # 2. Return cached value if available (avoids I/O on every call)
    if _lang_cache is not None:
        return _lang_cache

    # 3. Check config file setting (with caching)
    # Import here to avoid circular imports
    from kt_kernel.cli.config.settings import get_settings

    try:
        settings = get_settings()
        config_lang = settings.get("general.language", "auto")
        if config_lang and config_lang != "auto":
            lang = "zh" if config_lang.lower().startswith("zh") else "en"
            _lang_cache = lang
            return lang
    except Exception:
        # If settings fail to load, continue with system detection
        pass

    # 4. Check system LANG environment variable
    system_lang = os.environ.get("LANG", "").lower()
    lang = "zh" if system_lang.startswith("zh") else "en"
    _lang_cache = lang
    return lang


def t(msg_key: str, **kwargs: Any) -> str:
    """
    Translate a message key to the current language.

    Args:
        msg_key: Message key to translate
        **kwargs: Format arguments for the message

    Returns:
        Translated and formatted message string

    Example:
        >>> t("welcome")
        "Welcome to KTransformers!"  # or "欢迎使用 KTransformers！" in Chinese

        >>> t("install_found", name="conda", version="24.1.0")
        "Found conda (version 24.1.0)"
    """
    lang = get_lang()
    messages = MESSAGES.get(lang, MESSAGES["en"])
    message = messages.get(msg_key, MESSAGES["en"].get(msg_key, msg_key))

    if kwargs:
        try:
            return message.format(**kwargs)
        except KeyError:
            return message
    return message


def set_lang(lang: str) -> None:
    """
    Set the language for the current session.

    Args:
        lang: Language code ("en" or "zh")
    """
    global _lang_cache
    os.environ["KT_LANG"] = lang
    _lang_cache = lang  # Update cache when language is explicitly set


================================================
FILE: kt-kernel/python/cli/main.py
================================================
"""
Main entry point for kt-cli.

KTransformers CLI - A unified command-line interface for KTransformers.
"""

import sys
import warnings

# Suppress numpy subnormal warnings
warnings.filterwarnings("ignore", message="The value of the smallest subnormal")

import typer

from kt_kernel.cli import __version__
from kt_kernel.cli.commands import bench, chat, config, doctor, model, quant, run, sft, version
from kt_kernel.cli.i18n import t, set_lang, get_lang


def _get_app_help() -> str:
    """Get app help text based on current language."""
    lang = get_lang()
    if lang == "zh":
        return "KTransformers CLI - KTransformers 统一命令行界面"
    return "KTransformers CLI - A unified command-line interface for KTransformers."


def _get_help(key: str) -> str:
    """Get help text based on current language."""
    help_texts = {
        "version": {"en": "Show version information", "zh": "显示版本信息"},
        "run": {"en": "Start model inference server", "zh": "启动模型推理服务器"},
        "chat": {"en": "Interactive chat with running model", "zh": "与运行中的模型进行交互式聊天"},
        "quant": {"en": "Quantize model weights", "zh": "量化模型权重"},
        "edit": {"en": "Edit model information", "zh": "编辑模型信息"},
        "bench": {"en": "Run full benchmark", "zh": "运行完整基准测试"},
        "microbench": {"en": "Run micro-benchmark", "zh": "运行微基准测试"},
        "doctor": {"en": "Diagnose environment issues", "zh": "诊断环境问题"},
        "model": {"en": "Manage models and storage paths", "zh": "管理模型和存储路径"},
        "config": {"en": "Manage configuration", "zh": "管理配置"},
        "sft": {"en": "Fine-tuning with LlamaFactory", "zh": "使用 LlamaFactory 进行微调"},
    }
    lang = get_lang()
    return help_texts.get(key, {}).get(lang, help_texts.get(key, {}).get("en", key))


# Create main app with dynamic help
app = typer.Typer(
    name="kt",
    help="KTransformers CLI - A unified command-line interface for KTransformers.",
    no_args_is_help=False,  # Handle no-args case manually to support first-run setup
    add_completion=False,  # Use static completion scripts instead of dynamic completion
    rich_markup_mode="rich",
)


def _update_help_texts() -> None:
    """Update all help texts based on current language setting."""
    # Update main app help
    app.info.help = _get_app_help()

    # Update command help texts
    for cmd_info in app.registered_commands:
        # cmd_info is a CommandInfo object
        if hasattr(cmd_info, "name") and cmd_info.name:
            cmd_info.help = _get_help(cmd_info.name)

    # Update sub-app help texts
    for group_info in app.registered_groups:
        if hasattr(group_info, "name") and group_info.name:
            group_info.help = _get_help(group_info.name)


# Commands are registered later after tui_command is defined


def check_first_run() -> None:
    """Check if this is the first run and prompt for language setup."""
    import os

    # Skip if not running in interactive terminal
    if not sys.stdin.isatty():
        return

    from kt_kernel.cli.config.settings import DEFAULT_CONFIG_FILE

    # Only check if config file exists - don't create it yet
    if not DEFAULT_CONFIG_FILE.exists():
        # First run - show welcome and language selection
        from kt_kernel.cli.config.settings import get_settings

        settings = get_settings()
        _show_first_run_setup(settings)
    else:
        # Config exists - check if initialized
        from kt_kernel.cli.config.settings import get_settings

        settings = get_settings()
        if not settings.get("general._initialized"):
            _show_first_run_setup(settings)


def _show_first_run_setup(settings) -> None:
    """Show first-run setup wizard."""
    from rich.console import Console
    from rich.panel import Panel
    from rich.prompt import Prompt, Confirm
    from rich.spinner import Spinner
    from rich.live import Live

    from kt_kernel.cli.utils.environment import scan_storage_locations, format_size_gb

    console = Console()

    # Welcome message
    console.print()
    console.print(
        Panel.fit(
            "[bold cyan]Welcome to KTransformers CLI! / 欢迎使用 KTransformers CLI![/bold cyan]\n\n"
            "Let's set up your preferences.\n"
            "让我们设置您的偏好。",
            title="kt-cli",
            border_style="cyan",
        )
    )
    console.print()

    # Language selection
    console.print("[bold]Select your preferred language / 选择您的首选语言:[/bold]")
    console.print()
    console.print("  [cyan][1][/cyan] English")
    console.print("  [cyan][2][/cyan] 中文 (Chinese)")
    console.print()

    choice = Prompt.ask("Enter choice / 输入选择", choices=["1", "2"], default="1")
    lang = "en" if choice == "1" else "zh"

    # Save language setting
    settings.set("general.language", lang)
    set_lang(lang)

    # Confirmation message
    console.print()
    if lang == "zh":
        console.print("[green]✓[/green] 语言已设置为中文")
    else:
        console.print("[green]✓[/green] Language set to English")

    # Model discovery section
    console.print()
    if lang == "zh":
        console.print("[bold]发现模型权重[/bold]")
        console.print()
        console.print("[dim]扫描系统中已有的模型权重文件，以便快速添加到模型列表。[/dim]")
        console.print()
        console.print("  [cyan][1][/cyan] 全局扫描 (自动扫描所有非系统路径)")
        console.print("  [cyan][2][/cyan] 手动指定路径 (可添加多个)")
        console.print("  [cyan][3][/cyan] 跳过 (稍后手动添加)")
        console.print()
        scan_choice = Prompt.ask("选择扫描方式", choices=["1", "2", "3"], default="1")
    else:
        console.print("[bold]Discover Model Weights[/bold]")
        console.print()
        console.print("[dim]Scan existing model weights on your system to quickly add them to the model list.[/dim]")
        console.print()
        console.print("  [cyan][1][/cyan] Global scan (auto-scan all non-system paths)")
        console.print("  [cyan][2][/cyan] Manual paths (add multiple paths)")
        console.print("  [cyan][3][/cyan] Skip (add manually later)")
        console.print()
        scan_choice = Prompt.ask("Select scan method", choices=["1", "2", "3"], default="1")

    if scan_choice == "1":
        # Global scan
        from kt_kernel.cli.utils.model_discovery import discover_and_register_global, format_discovery_summary

        console.print()
        try:
            total_found, new_found, registered = discover_and_register_global(
                min_size_gb=2.0, max_depth=6, show_progress=True, lang=lang
            )

            format_discovery_summary(
                total_found=total_found,
                new_found=new_found,
                registered=registered,
                lang=lang,
                show_models=True,
                max_show=10,
            )

        except Exception as e:
            console.print(f"[yellow]Warning: Scan failed - {e}[/yellow]")

    elif scan_choice == "2":
        # Manual path specification
        from kt_kernel.cli.utils.model_discovery import discover_and_register_path
        import os

        discovered_paths = set()  # Track paths discovered in this session
        total_registered = []

        while True:
            console.print()
            if lang == "zh":
                path = Prompt.ask("输入要扫描的路径 (例如: /mnt/data/models)")
            else:
                path = Prompt.ask("Enter path to scan (e.g., /mnt/data/models)")

            # Expand and validate path
            path = os.path.expanduser(path)

            if not os.path.exists(path):
                if lang == "zh":
                    console.print(f"[yellow]警告: 路径不存在: {path}[/yellow]")
                else:
                    console.print(f"[yellow]Warning: Path does not exist: {path}[/yellow]")
                continue

            if not os.path.isdir(path):
                if lang == "zh":
                    console.print(f"[yellow]警告: 不是一个目录: {path}[/yellow]")
                else:
                    console.print(f"[yellow]Warning: Not a directory: {path}[/yellow]")
                continue

            # Scan this path
            console.print()
            try:
                total_found, new_found, registered = discover_and_register_path(
                    path=path, min_size_gb=2.0, existing_paths=discovered_paths, show_progress=True, lang=lang
                )

                # Update discovered paths
                for model in registered:
                    discovered_paths.add(model.path)
                total_registered.extend(registered)

                console.print()
                if lang == "zh":
                    console.print(f"[green]✓[/green] 在此路径找到 {total_found} 个模型，其中 {new_found} 个为新模型")
                else:
                    console.print(f"[green]✓[/green] Found {total_found} models in this path, {new_found} are new")

                if new_found > 0:
                    for model in registered[:5]:
                        console.print(f"  • {model.name} ({model.format})")

                    if len(registered) > 5:
                        if lang == "zh":
                            console.print(f"  [dim]... 还有 {len(registered) - 5} 个新模型[/dim]")
                        else:
                            console.print(f"  [dim]... and {len(registered) - 5} more new models[/dim]")

            except Exception as e:
                console.print(f"[red]Error scanning path: {e}[/red]")

            # Ask if continue
            console.print()
            if lang == "zh":
                continue_scan = Confirm.ask("是否继续添加其他路径?", default=False)
            else:
                continue_scan = Confirm.ask("Continue adding more paths?", default=False)

            if not continue_scan:
                break

        if total_registered:
            console.print()
            if lang == "zh":
                console.print(f"[green]✓[/green] 总共发现 {len(total_registered)} 个新模型")
            else:
                console.print(f"[green]✓[/green] Total {len(total_registered)} new models discovered")

    # Model storage path selection
    console.print()
    console.print(f"[bold]{t('setup_model_path_title')}[/bold]")
    console.print()
    console.print(f"[dim]{t('setup_model_path_desc')}[/dim]")
    console.print()

    # Scan for storage locations
    console.print(f"[dim]{t('setup_scanning_disks')}[/dim]")
    locations = scan_storage_locations(min_size_gb=50.0)
    console.print()

    if locations:
        # Show storage location options
        for i, loc in enumerate(locations[:5], 1):  # Show top 5 options
            available = format_size_gb(loc.available_gb)
            total = format_size_gb(loc.total_gb)

            # Build the option string
            if i == 1:
                option_str = t("setup_disk_option_recommended", path=loc.path, available=available, total=total)
            else:
                option_str = t("setup_disk_option", path=loc.path, available=available, total=total)

            console.print(f"  [cyan][{i}][/cyan] {option_str}")

        # Custom path option
        custom_idx = min(len(locations), 5) + 1
        console.print(f"  [cyan][{custom_idx}][/cyan] {t('setup_custom_path')}")
        console.print()

        valid_choices = [str(i) for i in range(1, custom_idx + 1)]
        path_choice = Prompt.ask(t("prompt_select"), choices=valid_choices, default="1")

        if path_choice == str(custom_idx):
            # Custom path
            selected_path = _prompt_custom_path(console, settings)
        else:
            selected_path = locations[int(path_choice) - 1].path
    else:
        # No large storage found, ask for custom path
        console.print(f"[yellow]{t('setup_no_large_disk')}[/yellow]")
        console.print()
        selected_path = _prompt_custom_path(console, settings)

    # Ensure the path exists
    import os
    from pathlib import Path

    if not os.path.exists(selected_path):
        if Confirm.ask(t("setup_path_not_exist"), default=True):
            try:
                Path(selected_path).mkdir(parents=True, exist_ok=True)
            except (OSError, PermissionError) as e:
                console.print(f"[red]{t('error')}: {e}[/red]")
                # Fall back to default
                selected_path = str(Path.home() / ".ktransformers" / "models")
                Path(selected_path).mkdir(parents=True, exist_ok=True)

    # Check available space and warn if low
    from kt_kernel.cli.utils.environment import detect_disk_space_gb

    available_gb, _ = detect_disk_space_gb(
        selected_path if os.path.exists(selected_path) else str(Path(selected_path).parent)
    )
    if available_gb < 100:
        console.print(f"[yellow]{t('setup_path_low_space')}[/yellow]")

    # Save the path
    settings.set("paths.models", selected_path)
    settings.set("general._initialized", True)

    console.print()
    console.print(f"[green]✓[/green] {t('setup_model_path_set', path=selected_path)}")
    console.print()

    # Tips
    if lang == "zh":
        console.print("[dim]提示: 运行 'kt config show' 查看所有配置[/dim]")
    else:
        console.print("[dim]Tip: Run 'kt config show' to view all settings[/dim]")

    console.print()


def _prompt_custom_path(console, settings) -> str:
    """Prompt user to enter a custom path."""
    from rich.prompt import Prompt
    from pathlib import Path
    import os

    default_path = str(Path.home() / ".ktransformers" / "models")

    while True:
        custom_path = Prompt.ask(t("setup_enter_custom_path"), default=default_path)

        # Expand user home
        custom_path = os.path.expanduser(custom_path)

        # Check if path exists or parent is writable
        if os.path.exists(custom_path):
            if os.access(custom_path, os.W_OK):
                return custom_path
            else:
                console.print(f"[red]{t('setup_path_no_write')}[/red]")
        else:
            # Check if we can create it (parent writable)
            parent = str(Path(custom_path).parent)
            while not os.path.exists(parent) and parent != "/":
                parent = str(Path(parent).parent)

            if os.access(parent, os.W_OK):
                return custom_path
            else:
                console.print(f"[red]{t('setup_path_no_write')}[/red]")


def _install_shell_completion() -> None:
    """Install shell completion scripts to user directories.

    Uses standard locations that are auto-loaded by shell completion systems:
    - Bash: ~/.local/share/bash-completion/completions/kt (auto-loaded by bash-completion 2.0+)
    - Zsh: ~/.zfunc/_kt (requires fpath setup, but commonly used)
    - Fish: ~/.config/fish/completions/kt.fish (auto-loaded)
    """
    import os
    import shutil
    from pathlib import Path

    from kt_kernel.cli.config.settings import get_settings

    settings = get_settings()

    # Check if already installed
    if settings.get("general._completion_installed", False):
        return

    # Detect current shell
    shell = os.environ.get("SHELL", "")
    shell_name = "zsh" if "zsh" in shell else "fish" if "fish" in shell else "bash"

    try:
        cli_dir = Path(__file__).parent
        completions_dir = cli_dir / "completions"
        home = Path.home()

        def install_completion(src_name: str, dest_dir: Path, dest_name: str) -> None:
            """Install completion file from source to destination."""
            src_file = completions_dir / src_name
            if src_file.exists():
                dest_dir.mkdir(parents=True, exist_ok=True)
                shutil.copy2(src_file, dest_dir / dest_name)

        if shell_name == "bash":
            install_completion(
                "kt-completion.bash", home / ".local" / "share" / "bash-completion" / "completions", "kt"
            )
        elif shell_name == "zsh":
            install_completion("_kt", home / ".zfunc", "_kt")
        elif shell_name == "fish":
            install_completion("kt.fish", home / ".config" / "fish" / "completions", "kt.fish")

        # Mark as installed
        settings.set("general._completion_installed", True)

        # For bash/zsh, completion will work in new terminals automatically
        # (bash-completion 2.0+ auto-loads from ~/.local/share/bash-completion/completions/)

    except (OSError, IOError):
        # Silently ignore errors - completion is not critical
        pass


def _apply_saved_language() -> None:
    """Apply the saved language setting.

    Priority:
    1. KT_LANG environment variable (if already set, don't override)
    2. Config file setting
    3. System locale (auto)
    """
    import os

    # Don't override if KT_LANG is already set by user
    if os.environ.get("KT_LANG"):
        return

    from kt_kernel.cli.config.settings import get_settings

    settings = get_settings()
    lang = settings.get("general.language", "auto")

    if lang != "auto":
        set_lang(lang)


app.command(name="version", help="Show version information")(version.version)
app.command(name="chat", help="Interactive chat with running model")(chat.chat)
app.command(name="quant", help="Quantize model weights")(quant.quant)
app.command(name="edit", help="Edit model information")(model.edit_model)
app.command(name="bench", help="Run full benchmark")(bench.bench)
app.command(name="microbench", help="Run micro-benchmark")(bench.microbench)
app.command(name="doctor", help="Diagnose environment issues")(doctor.doctor)

# Register sub-apps
app.add_typer(model.app, name="model", help="Manage models and storage paths")
app.add_typer(config.app, name="config", help="Manage configuration")
app.add_typer(sft.app, name="sft", help="Fine-tuning with LlamaFactory")


def main():
    """Main entry point."""
    # Apply saved language setting first (before anything else for correct help display)
    _apply_saved_language()

    # Update help texts based on language
    _update_help_texts()

    # Check for first run (but not for certain commands)
    # Skip first-run check for: --help, config commands, version
    args = sys.argv[1:] if len(sys.argv) > 1 else []
    skip_commands = ["--help", "-h", "config", "version", "--version", "--no-tui"]

    should_check_first_run = True
    for arg in args:
        if arg in skip_commands:
            should_check_first_run = False
            break

    # Handle no arguments case
    if not args:
        # Check if this is first run
        from kt_kernel.cli.config.settings import DEFAULT_CONFIG_FILE, get_settings

        is_first_run = False
        if not DEFAULT_CONFIG_FILE.exists():
            is_first_run = True
        else:
            settings = get_settings()
            if not settings.get("general._initialized"):
                is_first_run = True

        if is_first_run:
            # First run - start initialization
            _install_shell_completion()
            check_first_run()
            return
        else:
            # Not first run - show help
            app(["--help"])
            return

    # Auto-install shell completion on first run
    if should_check_first_run:
        _install_shell_completion()

    # Check first run before running commands
    if should_check_first_run:
        check_first_run()

    # Handle "run" command specially to pass through unknown options
    if args and args[0] == "run":
        # Get args after "run"
        run_args = args[1:]
        # Use click command directly with ignore_unknown_options
        from kt_kernel.cli.commands import run as run_module

        sys.exit(run_module.run.main(args=run_args, standalone_mode=False))

    app()


if __name__ == "__main__":
    main()


================================================
FILE: kt-kernel/python/cli/requirements/inference.txt
================================================
# Inference dependencies for KTransformers
# NOTE: sglang is installed separately from source (see install.py)

transformers>=4.45.0
safetensors>=0.4.0
huggingface-hub>=0.20.0


================================================
FILE: kt-kernel/python/cli/requirements/sft.txt
================================================
# SFT (Supervised Fine-Tuning) dependencies for KTransformers

llamafactory>=0.9.0
peft>=0.12.0
transformers>=4.45.0
datasets>=2.14.0
accelerate>=0.30.0


================================================
FILE: kt-kernel/python/cli/utils/__init__.py
================================================
"""
Utility modules for kt-cli.
"""


================================================
FILE: kt-kernel/python/cli/utils/analyze_moe_model.py
================================================
#!/usr/bin/env python3
"""
快速分析 MoE 模型 - 基于 config.json
(复用 sglang 的模型注册表和判断逻辑)
"""
import json
import hashlib
from pathlib import Path
from typing import Optional, Dict, Any


def _get_sglang_moe_architectures():
    """
    从 sglang 的模型注册表获取所有 MoE 架构

    复用 sglang 的代码，这样 sglang 更新后自动支持新模型
    """
    try:
        import sys

        # 添加 sglang 路径到 sys.path
        sglang_path = Path("/mnt/data2/ljq/sglang/python")
        if sglang_path.exists() and str(sglang_path) not in sys.path:
            sys.path.insert(0, str(sglang_path))

        # 直接导入 sglang 的 ModelRegistry
        # 注意：这需要 sglang 及其依赖正确安装
        from sglang.srt.models.registry import ModelRegistry

        # 获取所有支持的架构
        supported_archs = ModelRegistry.get_supported_archs()

        # 过滤出 MoE 模型（名称包含 Moe）
        moe_archs = {arch for arch in supported_archs if "Moe" in arch or "moe" in arch.lower()}

        # 手动添加一些不带 "Moe" 字样但是 MoE 模型的架构
        # DeepSeek V2/V3 系列
        deepseek_moe = {arch for arch in supported_archs if arch.startswith("Deepseek") or arch.startswith("deepseek")}
        moe_archs.update(deepseek_moe)

        # DBRX 也是 MoE 模型
        dbrx_moe = {arch for arch in supported_archs if "DBRX" in arch or "dbrx" in arch.lower()}
        moe_archs.update(dbrx_moe)

        # Grok 也是 MoE 模型
        grok_moe = {arch for arch in supported_archs if "Grok" in arch or "grok" in arch.lower()}
        moe_archs.update(grok_moe)

        return moe_archs
    except Exception as e:
        # 如果 sglang 不可用，返回空集合
        # 这种情况下，后续会使用配置文件中的其他判断方法
        import warnings

        warnings.warn(f"Failed to load MoE architectures from sglang: {e}. Using fallback detection methods.")
        return set()


# 获取 MoE 架构列表（优先从 sglang 获取）
MOE_ARCHITECTURES = _get_sglang_moe_architectures()


def _get_cache_file():
    """获取集中式缓存文件路径"""
    cache_dir = Path.home() / ".ktransformers" / "cache"
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir / "moe_analysis_v2.json"


def _load_all_cache():
    """加载所有缓存数据"""
    cache_file = _get_cache_file()
    if not cache_file.exists():
        return {}

    try:
        with open(cache_file, "r") as f:
            return json.load(f)
    except Exception:
        return {}


def _save_all_cache(cache_data):
    """保存所有缓存数据"""
    cache_file = _get_cache_file()
    try:
        with open(cache_file, "w") as f:
            json.dump(cache_data, f, indent=2)
    except Exception as e:
        import warnings

        warnings.warn(f"Failed to save MoE cache: {e}")


def _compute_config_fingerprint(config_path: Path) -> Optional[str]:
    """计算 config.json 指纹"""
    if not config_path.exists():
        return None

    try:
        stat = config_path.stat()
        # 使用文件大小和修改时间作为指纹
        fingerprint_str = f"{config_path.name}:{stat.st_size}:{int(stat.st_mtime)}"
        return hashlib.md5(fingerprint_str.encode()).hexdigest()
    except Exception:
        return None


def _load_cache(model_path: Path) -> Optional[Dict[str, Any]]:
    """加载指定模型的缓存"""
    model_path_str = str(model_path.resolve())
    all_cache = _load_all_cache()

    if model_path_str not in all_cache:
        return None

    try:
        cache_entry = all_cache[model_path_str]

        # 验证缓存版本
        cache_version = cache_entry.get("cache_version", 0)
        if cache_version != 2:
            return None

        # 验证 config.json 指纹
        config_path = model_path / "config.json"
        current_fingerprint = _compute_config_fingerprint(config_path)
        if cache_entry.get("fingerprint") != current_fingerprint:
            return None

        return cache_entry.get("result")
    except Exception:
        return None


def _save_cache(model_path: Path, result: Dict[str, Any]):
    """保存指定模型的缓存"""
    model_path_str = str(model_path.resolve())

    try:
        config_path = model_path / "config.json"
        fingerprint = _compute_config_fingerprint(config_path)

        all_cache = _load_all_cache()

        all_cache[model_path_str] = {
            "fingerprint": fingerprint,
            "result": result,
            "cache_version": 2,
            "last_updated": __import__("datetime").datetime.now().isoformat(),
        }

        _save_all_cache(all_cache)
    except Exception as e:
        import warnings

        warnings.warn(f"Failed to save MoE cache for {model_path}: {e}")


def _load_config_json(model_path: Path) -> Optional[Dict[str, Any]]:
    """读取 config.json 文件

    参考 sglang 的 get_config() 实现
    """
    config_path = model_path / "config.json"

    if not config_path.exists():
        return None

    try:
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
        return config
    except Exception:
        return None


def _is_moe_model(config: Dict[str, Any]) -> bool:
    """判断是否是 MoE 模型

    参考 sglang 的模型注册表和架构识别方式
    """
    # 方法1: 检查架构名称
    architectures = config.get("architectures", [])
    if any(arch in MOE_ARCHITECTURES for arch in architectures):
        return True

    # 方法2: 检查是否有 MoE 相关字段（Mistral 格式）
    if config.get("moe"):
        return True

    # 方法3: 检查是否有 num_experts 或其变体字段
    # 需要检查 text_config（对于某些多模态模型）
    text_config = config.get("text_config", config)

    # 检查各种专家数量字段
    if (
        text_config.get("num_experts") or text_config.get("num_local_experts") or text_config.get("n_routed_experts")
    ):  # Kimi-K2 使用这个字段
        return True

    return False


def _extract_moe_params(config: Dict[str, Any]) -> Dict[str, Any]:
    """从 config 中提取 MoE 参数

    参考 sglang 的各种 MoE 模型实现
    """
    # 处理嵌套的 text_config
    text_config = config.get("text_config", config)

    # 提取基本参数
    result = {
        "architectures": config.get("architectures", []),
        "model_type": config.get("model_type", "unknown"),
    }

    # 专家数量（不同模型字段名不同）
    num_experts = (
        text_config.get("num_experts")  # Qwen2/3 MoE, DeepSeek V2
        or text_config.get("num_local_experts")  # Mixtral
        or text_config.get("n_routed_experts")  # Kimi-K2, DeepSeek V3
        or config.get("moe", {}).get("num_experts")  # Mistral 格式
    )

    # 每个 token 激活的专家数
    num_experts_per_tok = (
        text_config.get("num_experts_per_tok")
        or text_config.get("num_experts_per_token")
        or config.get("moe", {}).get("num_experts_per_tok")
        or 2  # 默认值
    )

    # 层数
    num_hidden_layers = text_config.get("num_hidden_layers") or text_config.get("n_layer") or 0

    # 隐藏层维度
    hidden_size = text_config.get("hidden_size") or text_config.get("d_model") or 0

    # MoE 专家中间层大小
    moe_intermediate_size = (
        text_config.get("moe_intermediate_size")
        or text_config.get("intermediate_size")  # 如果没有特殊的 moe_intermediate_size
        or 0
    )

    # 共享专家中间层大小（Qwen2/3 MoE）
    shared_expert_intermediate_size = text_config.get("shared_expert_intermediate_size", 0)

    result.update(
        {
            "num_experts": num_experts or 0,
            "num_experts_per_tok": num_experts_per_tok,
            "num_hidden_layers": num_hidden_layers,
            "hidden_size": hidden_size,
            "moe_intermediate_size": moe_intermediate_size,
            "shared_expert_intermediate_size": shared_expert_intermediate_size,
        }
    )

    # 提取其他有用的参数
    result["num_attention_heads"] = text_config.get("num_attention_heads", 0)
    result["num_key_value_heads"] = text_config.get("num_key_value_heads", 0)
    result["vocab_size"] = text_config.get("vocab_size", 0)
    result["max_position_embeddings"] = text_config.get("max_position_embeddings", 0)

    return result


def _estimate_model_size(model_path: Path) -> float:
    """估算模型总大小（GB）

    快速统计 safetensors 文件总大小
    """
    try:
        total_size = 0
        for file_path in model_path.glob("*.safetensors"):
            total_size += file_path.stat().st_size
        return total_size / (1024**3)
    except Exception:
        return 0.0


def analyze_moe_model(model_path, use_cache=True):
    """
    快速分析 MoE 模型 - 只读取 config.json

    参数:
        model_path: 模型路径（字符串或Path对象）
        use_cache: 是否使用缓存（默认True）

    返回:
        dict: {
            'is_moe': 是否是 MoE 模型,
            'num_experts': 专家总数,
            'num_experts_per_tok': 每个 token 激活的专家数,
            'num_hidden_layers': 层数,
            'hidden_size': 隐藏层维度,
            'moe_intermediate_size': MoE 专家中间层大小,
            'shared_expert_intermediate_size': 共享专家中间层大小,
            'architectures': 模型架构列表,
            'model_type': 模型类型,
            'total_size_gb': 模型总大小（估算，GB）,
            'cached': 是否从缓存读取
        }
        如果不是 MoE 模型或失败，返回 None
    """
    model_path = Path(model_path)

    if not model_path.exists():
        return None

    # 尝试加载缓存
    if use_cache:
        cached_result = _load_cache(model_path)
        if cached_result:
            cached_result["cached"] = True
            return cached_result

    # 读取 config.json
    config = _load_config_json(model_path)
    if not config:
        return None

    # 判断是否是 MoE 模型
    if not _is_moe_model(config):
        return None

    # 提取 MoE 参数
    params = _extract_moe_params(config)

    # 验证必要参数
    if params["num_experts"] == 0:
        return None

    # 估算模型大小
    total_size_gb = _estimate_model_size(model_path)

    # 组装结果
    result = {
        "is_moe": True,
        "num_experts": params["num_experts"],
        "num_experts_per_tok": params["num_experts_per_tok"],
        "num_hidden_layers": params["num_hidden_layers"],
        "hidden_size": params["hidden_size"],
        "moe_intermediate_size": params["moe_intermediate_size"],
        "shared_expert_intermediate_size": params["shared_expert_intermediate_size"],
        "architectures": params["architectures"],
        "model_type": params["model_type"],
        "total_size_gb": total_size_gb,
        "cached": False,
        # 额外参数
        "num_attention_heads": params.get("num_attention_heads", 0),
        "num_key_value_heads": params.get("num_key_value_heads", 0),
        "vocab_size": params.get("vocab_size", 0),
    }

    # 保存缓存
    if use_cache:
        _save_cache(model_path, result)

    return result


def print_analysis(model_path):
    """打印模型分析结果"""
    print(f"分析模型: {model_path}\n")

    result = analyze_moe_model(model_path)

    if result is None:
        print("不是 MoE 模型或分析失败")
        return

    print("=" * 70)
    print("MoE 模型分析结果")
    if result.get("cached"):
        print("[使用缓存]")
    print("=" * 70)
    print(f"模型架构:")
    print(f"  - 架构: {', '.join(result['architectures'])}")
    print(f"  - 类型: {result['model_type']}")
    print()
    print(f"MoE 结构:")
    print(f"  - 专家总数: {result['num_experts']}")
    print(f"  - 激活专家数: {result['num_experts_per_tok']} experts/token")
    print(f"  - 层数: {result['num_hidden_layers']}")
    print(f"  - 隐藏维度: {result['hidden_size']}")
    print(f"  - MoE 中间层: {result['moe_intermediate_size']}")
    if result["shared_expert_intermediate_size"] > 0:
        print(f"  - 共享专家中间层: {result['shared_expert_intermediate_size']}")
    print()
    print(f"大小统计:")
    print(f"  - 模型总大小: {result['total_size_gb']:.2f} GB")
    print("=" * 70)
    print()


def main():
    import sys

    models = ["/mnt/data2/models/Qwen3-30B-A3B", "/mnt/data2/models/Qwen3-235B-A22B-Instruct-2507"]

    if len(sys.argv) > 1:
        models = [sys.argv[1]]

    for model_path in models:
        print_analysis(model_path)


if __name__ == "__main__":
    main()


================================================
FILE: kt-kernel/python/cli/utils/console.py
================================================
"""
Console utilities for kt-cli.

Provides Rich-based console output helpers for consistent formatting.
"""

from typing import Optional

from rich.console import Console
from rich.panel import Panel
from rich.progress import (
    BarColumn,
    DownloadColumn,
    Progress,
    SpinnerColumn,
    TaskProgressColumn,
    TextColumn,
    TimeElapsedColumn,
    TimeRemainingColumn,
    TransferSpeedColumn,
)
from rich.prompt import Confirm, Prompt
from rich.table import Table
from rich.theme import Theme

from kt_kernel.cli.i18n import t

# Custom theme for kt-cli
KT_THEME = Theme(
    {
        "info": "cyan",
        "warning": "yellow",
        "error": "bold red",
        "success": "bold green",
        "highlight": "bold magenta",
        "muted": "dim",
    }
)

# Global console instance
console = Console(theme=KT_THEME)


def print_info(message: str, **kwargs) -> None:
    """Print an info message."""
    console.print(f"[info]ℹ[/info] {message}", **kwargs)


def print_success(message: str, **kwargs) -> None:
    """Print a success message."""
    console.print(f"[success]✓[/success] {message}", **kwargs)


def print_warning(message: str, **kwargs) -> None:
    """Print a warning message."""
    console.print(f"[warning]⚠[/warning] {message}", **kwargs)


def print_error(message: str, **kwargs) -> None:
    """Print an error message."""
    console.print(f"[error]✗[/error] {message}", **kwargs)


def print_step(message: str, **kwargs) -> None:
    """Print a step indicator."""
    console.print(f"[highlight]→[/highlight] {message}", **kwargs)


def print_header(title: str, subtitle: Optional[str] = None) -> None:
    """Print a header panel."""
    content = f"[bold]{title}[/bold]"
    if subtitle:
        content += f"\n[muted]{subtitle}[/muted]"
    console.print(Panel(content, expand=False))


def print_version_table(versions: dict[str, Optional[str]]) -> None:
    """Print a version information table."""
    table = Table(show_header=False, box=None, padding=(0, 2))
    table.add_column("Component", style="bold")
    table.add_column("Version")

    for name, version in versions.items():
        if version:
            table.add_row(name, f"[success]{version}[/success]")
        else:
            table.add_row(name, f"[muted]{t('version_not_installed')}[/muted]")

    console.print(table)


def print_dependency_table(deps: list[dict]) -> None:
    """Print a dependency status table."""
    table = Table(title=t("install_checking_deps"))
    table.add_column(t("version_info"), style="bold")
    table.add_column("Current")
    table.add_column("Required")
    table.add_column("Status")

    for dep in deps:
        status = dep.get("status", "ok")
        if status == "ok":
            status_str = f"[success]{t('install_dep_ok')}[/success]"
        elif status == "outdated":
            status_str = f"[warning]{t('install_dep_outdated')}[/warning]"
        else:
            status_str = f"[error]{t('install_dep_missing')}[/error]"

        table.add_row(
            dep["name"],
            dep.get("installed", "-"),
            dep.get("required", "-"),
            status_str,
        )

    console.print(table)


def confirm(message: str, default: bool = True) -> bool:
    """Ask for confirmation."""
    return Confirm.ask(message, default=default, console=console)


def prompt_choice(message: str, choices: list[str], default: Optional[str] = None) -> str:
    """Prompt for a choice from a list."""
    # Display numbered choices
    console.print(f"\n[bold]{message}[/bold]")
    for i, choice in enumerate(choices, 1):
        console.print(f"  [highlight][{i}][/highlight] {choice}")

    while True:
        response = Prompt.ask(
            "\n" + t("prompt_select"),
            console=console,
            default=str(choices.index(default) + 1) if default else None,
        )
        try:
            idx = int(response) - 1
            if 0 <= idx < len(choices):
                return choices[idx]
        except ValueError:
            # Check if response matches a choice directly
            if response in choices:
                return response

        print_error(f"Please enter a number between 1 and {len(choices)}")


def prompt_text(message: str, default: Optional[str] = None) -> str:
    """Prompt for text input."""
    return Prompt.ask(message, console=console, default=default)


def create_progress() -> Progress:
    """Create a progress bar for general tasks."""
    return Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        TimeElapsedColumn(),
        console=console,
    )


def create_download_progress() -> Progress:
    """Create a progress bar for downloads."""
    return Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        DownloadColumn(),
        TransferSpeedColumn(),
        TimeRemainingColumn(),
        console=console,
    )


def print_model_table(models: list[dict]) -> None:
    """Print a table of models."""
    table = Table(title=t("download_list_title"))
    table.add_column("Name", style="bold")
    table.add_column("Repository")
    table.add_column("Type")
    table.add_column("Requirements")

    for model in models:
        reqs = []
        if model.get("gpu_vram_gb"):
            reqs.append(f"GPU: {model['gpu_vram_gb']}GB")
        if model.get("cpu_ram_gb"):
            reqs.append(f"RAM: {model['cpu_ram_gb']}GB")

        table.add_row(
            model.get("name", ""),
            model.get("hf_repo", ""),
            model.get("type", ""),
            ", ".join(reqs) if reqs else "-",
        )

    console.print(table)


def print_hardware_info(gpu_info: str, cpu_info: str, ram_info: str) -> None:
    """Print hardware information."""
    table = Table(show_header=False, box=None)
    table.add_column("Icon", width=3)
    table.add_column("Info")

    table.add_row("🖥️", gpu_info)
    table.add_row("💻", cpu_info)
    table.add_row("🧠", ram_info)

    console.print(Panel(table, title="Hardware", expand=False))


def print_server_info(
    mode: str, host: str, port: int, gpu_experts: int, cpu_threads: int
) -> None:
    """Print server startup information."""
    table = Table(show_header=False, box=None)
    table.add_column("Key", style="bold")
    table.add_column("Value")

    table.add_row(t("run_server_mode").split(":")[0], mode)
    table.add_row("Host", host)
    table.add_row("Port", str(port))
    table.add_row(t("run_gpu_experts").split(":")[0], f"{gpu_experts}/layer")
    table.add_row(t("run_cpu_threads").split(":")[0], str(cpu_threads))

    console.print(Panel(table, title=t("run_server_started"), expand=False, border_style="green"))


def print_api_info(host: str, port: int) -> None:
    """Print API endpoint information."""
    api_url = f"http://{host}:{port}"
    docs_url = f"http://{host}:{port}/docs"

    console.print()
    console.print(f"  {t('run_api_url', host=host, port=port)}")
    console.print(f"  {t('run_docs_url', host=host, port=port)}")
    console.print()
    console.print(f"  [muted]Test command:[/muted]")
    console.print(
        f"  [dim]curl {api_url}/v1/chat/completions -H 'Content-Type: application/json' "
        f"-d '{{\"model\": \"default\", \"messages\": [{{\"role\": \"user\", \"content\": \"Hello\"}}]}}'[/dim]"
    )
    console.print()
    console.print(f"  [muted]{t('run_stop_hint')}[/muted]")


================================================
FILE: kt-kernel/python/cli/utils/debug_configs.py
================================================
"""
Debug utility to inspect saved run configurations.

Usage: python -m kt_kernel.cli.utils.debug_configs
"""

from pathlib import Path
import yaml
from rich.console import Console
from rich.table import Table
from rich import box

console = Console()


def main():
    """Show all saved configurations."""
    config_file = Path.home() / ".ktransformers" / "run_configs.yaml"

    console.print()
    console.print(f"[bold]Configuration file:[/bold] {config_file}")
    console.print()

    if not config_file.exists():
        console.print("[red]✗ Configuration file does not exist![/red]")
        console.print()
        console.print("No configurations have been saved yet.")
        return

    try:
        with open(config_file, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
    except Exception as e:
        console.print(f"[red]✗ Failed to load configuration file: {e}[/red]")
        return

    console.print(f"[green]✓[/green] Configuration file loaded")
    console.print()

    configs = data.get("configs", {})

    if not configs:
        console.print("[yellow]No saved configurations found.[/yellow]")
        return

    console.print(f"[bold]Found configurations for {len(configs)} model(s):[/bold]")
    console.print()

    for model_id, model_configs in configs.items():
        console.print(f"[cyan]Model ID:[/cyan] {model_id}")
        console.print(f"[dim]  {len(model_configs)} configuration(s)[/dim]")
        console.print()

        if not model_configs:
            continue

        # Display configs in a table
        table = Table(box=box.ROUNDED, show_header=True, header_style="bold cyan")
        table.add_column("#", justify="right", style="cyan")
        table.add_column("Name", style="white")
        table.add_column("Method", style="yellow")
        table.add_column("TP", justify="right", style="green")
        table.add_column("GPU Experts", justify="right", style="magenta")
        table.add_column("Created", style="dim")

        for i, cfg in enumerate(model_configs, 1):
            method = cfg.get("inference_method", "?")
            kt_method = cfg.get("kt_method", "?")
            method_display = f"{method.upper()}"
            if method == "raw":
                method_display += f" ({cfg.get('raw_method', '?')})"
            elif method == "amx":
                method_display += f" ({kt_method})"

            table.add_row(
                str(i),
                cfg.get("config_name", f"Config {i}"),
                method_display,
                str(cfg.get("tp_size", "?")),
                str(cfg.get("gpu_experts", "?")),
                cfg.get("created_at", "Unknown")[:19] if cfg.get("created_at") else "Unknown",
            )

        console.print(table)
        console.print()

    # Also check user_models.yaml to show model names
    console.print("[bold]Checking model registry...[/bold]")
    console.print()

    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry

    try:
        registry = UserModelRegistry()
        all_models = registry.list_models()

        console.print(f"[green]✓[/green] Found {len(all_models)} registered model(s)")
        console.print()

        # Map model IDs to names
        id_to_name = {m.id: m.name for m in all_models}

        console.print("[bold]Model ID → Name mapping:[/bold]")
        console.print()

        for model_id in configs.keys():
            model_name = id_to_name.get(model_id, "[red]Unknown (model not found in registry)[/red]")
            console.print(f"  {model_id[:8]}... → {model_name}")

        console.print()

    except Exception as e:
        console.print(f"[yellow]⚠ Could not load model registry: {e}[/yellow]")
        console.print()


if __name__ == "__main__":
    main()


================================================
FILE: kt-kernel/python/cli/utils/download_helper.py
================================================
"""Helper functions for interactive model download."""

from pathlib import Path
from typing import Dict, List, Tuple
import fnmatch


def list_remote_files_hf(repo_id: str, use_mirror: bool = False) -> List[Dict[str, any]]:
    """
    List files in a HuggingFace repository.

    Returns:
        List of dicts with keys: 'path', 'size' (in bytes)
    """
    from huggingface_hub import HfApi
    import os

    # Set mirror if needed
    original_endpoint = os.environ.get("HF_ENDPOINT")
    if use_mirror and not original_endpoint:
        os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

    try:
        api = HfApi()
        files_info = api.list_repo_tree(repo_id=repo_id, recursive=True)

        result = []
        for item in files_info:
            # Skip directories
            if hasattr(item, "type") and item.type == "directory":
                continue

            # Get file info
            file_path = item.path if hasattr(item, "path") else str(item)
            file_size = item.size if hasattr(item, "size") else 0

            result.append({"path": file_path, "size": file_size})

        return result
    finally:
        # Restore original endpoint
        if use_mirror and not original_endpoint:
            os.environ.pop("HF_ENDPOINT", None)
        elif original_endpoint:
            os.environ["HF_ENDPOINT"] = original_endpoint


def list_remote_files_ms(repo_id: str) -> List[Dict[str, any]]:
    """
    List files in a ModelScope repository.

    Returns:
        List of dicts with keys: 'path', 'size' (in bytes)
    """
    from modelscope.hub.api import HubApi

    api = HubApi()
    files_info = api.get_model_files(model_id=repo_id, recursive=True)

    result = []
    for file_info in files_info:
        file_path = file_info.get("Name", file_info.get("Path", ""))
        file_size = file_info.get("Size", 0)

        result.append({"path": file_path, "size": file_size})

    return result


def filter_files_by_pattern(files: List[Dict[str, any]], pattern: str) -> List[Dict[str, any]]:
    """Filter files by glob pattern."""
    if pattern == "*":
        return files

    filtered = []
    for file in files:
        # Check if filename matches pattern
        filename = Path(file["path"]).name
        full_path = file["path"]

        if fnmatch.fnmatch(filename, pattern) or fnmatch.fnmatch(full_path, pattern):
            filtered.append(file)

    return filtered


def calculate_total_size(files: List[Dict[str, any]]) -> int:
    """Calculate total size of files in bytes."""
    return sum(f["size"] for f in files)


def format_file_list_table(files: List[Dict[str, any]], max_display: int = 10):
    """Format file list as a table for display."""
    from rich.table import Table
    from kt_kernel.cli.utils.model_scanner import format_size

    table = Table(show_header=True, header_style="bold")
    table.add_column("File", style="cyan", overflow="fold")
    table.add_column("Size", justify="right")

    # Show first max_display files
    for file in files[:max_display]:
        table.add_row(file["path"], format_size(file["size"]))

    if len(files) > max_display:
        table.add_row(f"... and {len(files) - max_display} more files", "[dim]...[/dim]")

    return table


def verify_repo_exists(repo_id: str, repo_type: str, use_mirror: bool = False) -> Tuple[bool, str]:
    """
    Verify if a repository exists.

    Returns:
        (exists: bool, message: str)
    """
    try:
        if repo_type == "huggingface":
            import os

            original_endpoint = os.environ.get("HF_ENDPOINT")
            if use_mirror and not original_endpoint:
                os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

            from huggingface_hub import HfApi

            try:
                api = HfApi()
                api.repo_info(repo_id=repo_id, repo_type="model")
                return True, "Repository found"
            finally:
                if use_mirror and not original_endpoint:
                    os.environ.pop("HF_ENDPOINT", None)
                elif original_endpoint:
                    os.environ["HF_ENDPOINT"] = original_endpoint

        else:  # modelscope
            from modelscope.hub.api import HubApi

            api = HubApi()
            api.get_model(model_id=repo_id)
            return True, "Repository found"

    except Exception as e:
        return False, f"Repository not found: {str(e)}"


================================================
FILE: kt-kernel/python/cli/utils/environment.py
================================================
"""
Environment detection utilities for kt-cli.

Provides functions to detect:
- Virtual environment managers (conda, venv, uv, mamba)
- Python version and packages
- CUDA and GPU information
- System resources (CPU, RAM, disk)
"""

import os
import platform
import shutil
import subprocess
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional


@dataclass
class EnvManager:
    """Information about an environment manager."""

    name: str
    version: str
    path: str


@dataclass
class GPUInfo:
    """Information about a GPU."""

    index: int
    name: str
    vram_gb: float
    cuda_capability: Optional[str] = None


@dataclass
class CPUInfo:
    """Information about the CPU."""

    name: str
    cores: int
    threads: int
    numa_nodes: int
    instruction_sets: list[str] = field(default_factory=list)  # AVX, AVX2, AVX512, AMX, etc.
    numa_info: dict = field(default_factory=dict)  # node -> cpus mapping


@dataclass
class MemoryInfo:
    """Information about system memory."""

    total_gb: float
    available_gb: float
    frequency_mhz: Optional[int] = None
    channels: Optional[int] = None
    type: Optional[str] = None  # DDR4, DDR5, etc.


@dataclass
class SystemInfo:
    """Complete system information."""

    python_version: str
    platform: str
    cuda_version: Optional[str]
    gpus: list[GPUInfo]
    cpu: CPUInfo
    ram_gb: float
    env_managers: list[EnvManager]


def run_command(cmd: list[str], timeout: int = 10) -> Optional[str]:
    """Run a command and return its output, or None if it fails."""
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False)
        if result.returncode == 0:
            return result.stdout.strip()
        return None
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return None


def detect_env_managers() -> list[EnvManager]:
    """Detect available virtual environment managers."""
    managers = []

    # Check conda
    conda_path = shutil.which("conda")
    if conda_path:
        version = run_command(["conda", "--version"])
        if version:
            # "conda 24.1.0" -> "24.1.0"
            version = version.split()[-1] if version else "unknown"
            managers.append(EnvManager(name="conda", version=version, path=conda_path))

    # Check mamba
    mamba_path = shutil.which("mamba")
    if mamba_path:
        version = run_command(["mamba", "--version"])
        if version:
            # First line: "mamba 1.5.0"
            version = version.split("\n")[0].split()[-1] if version else "unknown"
            managers.append(EnvManager(name="mamba", version=version, path=mamba_path))

    # Check uv
    uv_path = shutil.which("uv")
    if uv_path:
        version = run_command(["uv", "--version"])
        if version:
            # "uv 0.5.0" -> "0.5.0"
            version = version.split()[-1] if version else "unknown"
            managers.append(EnvManager(name="uv", version=version, path=uv_path))

    # Check if venv is available (built into Python)
    try:
        import venv  # noqa: F401

        managers.append(EnvManager(name="venv", version="builtin", path="python -m venv"))
    except ImportError:
        pass

    return managers


def check_docker() -> Optional[EnvManager]:
    """Check if Docker is available."""
    docker_path = shutil.which("docker")
    if docker_path:
        version = run_command(["docker", "--version"])
        if version:
            # "Docker version 24.0.7, build afdd53b"
            parts = version.split()
            version = parts[2].rstrip(",") if len(parts) > 2 else "unknown"
            return EnvManager(name="docker", version=version, path=docker_path)
    return None


def check_kt_env_exists(manager: str, env_name: str = "kt") -> bool:
    """Check if a kt environment exists for the given manager."""
    if manager == "conda" or manager == "mamba":
        result = run_command([manager, "env", "list"])
        if result:
            # Check if env_name appears as a separate word in the output
            for line in result.split("\n"):
                parts = line.split()
                if parts and parts[0] == env_name:
                    return True
    elif manager == "uv":
        # uv uses .venv in the project directory or ~/.local/share/uv/envs/
        venv_path = Path.home() / ".local" / "share" / "uv" / "envs" / env_name
        if venv_path.exists():
            return True
        # Also check current directory
        if Path(env_name).exists() and (Path(env_name) / "bin" / "python").exists():
            return True
    elif manager == "venv":
        # Check common locations
        venv_path = Path.home() / ".virtualenvs" / env_name
        if venv_path.exists():
            return True
        if Path(env_name).exists() and (Path(env_name) / "bin" / "python").exists():
            return True

    return False


def get_kt_env_path(manager: str, env_name: str = "kt") -> Optional[Path]:
    """Get the path to the kt environment."""
    if manager == "conda" or manager == "mamba":
        result = run_command([manager, "env", "list"])
        if result:
            for line in result.split("\n"):
                parts = line.split()
                if parts and parts[0] == env_name:
                    # The path is the last part
                    return Path(parts[-1])
    elif manager == "uv":
        venv_path = Path.home() / ".local" / "share" / "uv" / "envs" / env_name
        if venv_path.exists():
            return venv_path
    elif manager == "venv":
        venv_path = Path.home() / ".virtualenvs" / env_name
        if venv_path.exists():
            return venv_path

    return None


def detect_cuda_version() -> Optional[str]:
    """Detect CUDA version from nvidia-smi or nvcc."""
    # Try nvidia-smi first
    nvidia_smi = run_command(["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"])
    if nvidia_smi:
        # Get CUDA version from nvidia-smi
        full_output = run_command(["nvidia-smi"])
        if full_output:
            for line in full_output.split("\n"):
                if "CUDA Version:" in line:
                    # "| CUDA Version: 12.1     |"
                    parts = line.split("CUDA Version:")
                    if len(parts) > 1:
                        version = parts[1].strip().split()[0]
                        return version

    # Try nvcc
    nvcc_output = run_command(["nvcc", "--version"])
    if nvcc_output:
        for line in nvcc_output.split("\n"):
            if "release" in line.lower():
                # "Cuda compilation tools, release 12.1, V12.1.105"
                parts = line.split("release")
                if len(parts) > 1:
                    version = parts[1].strip().split(",")[0].strip()
                    return version

    return None


def detect_gpus() -> list[GPUInfo]:
    """Detect available NVIDIA GPUs, respecting CUDA_VISIBLE_DEVICES."""
    gpus = []

    nvidia_smi = run_command(["nvidia-smi", "--query-gpu=index,name,memory.total", "--format=csv,noheader,nounits"])

    if nvidia_smi:
        for line in nvidia_smi.strip().split("\n"):
            parts = [p.strip() for p in line.split(",")]
            if len(parts) >= 3:
                try:
                    index = int(parts[0])
                    name = parts[1]
                    vram_mb = float(parts[2])
                    vram_gb = round(vram_mb / 1024, 1)
                    gpus.append(GPUInfo(index=index, name=name, vram_gb=vram_gb))
                except (ValueError, IndexError):
                    continue

    # Filter by CUDA_VISIBLE_DEVICES if set
    cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES")
    if cuda_visible is not None:
        if cuda_visible == "":
            # Empty string means no GPUs visible
            return []

        try:
            # Parse CUDA_VISIBLE_DEVICES (can be "0,1,2" or "0-3" etc.)
            visible_indices = _parse_cuda_visible_devices(cuda_visible)
            # Filter GPUs to only those in CUDA_VISIBLE_DEVICES
            filtered_gpus = [gpu for gpu in gpus if gpu.index in visible_indices]
            # Re-index GPUs to match CUDA's logical indexing (0, 1, 2, ...)
            for i, gpu in enumerate(filtered_gpus):
                # Keep original index in a comment, but CUDA sees them as 0,1,2...
                gpu.index = i
            return filtered_gpus
        except ValueError:
            # If parsing fails, return all GPUs as fallback
            pass

    return gpus


def _parse_cuda_visible_devices(cuda_visible: str) -> list[int]:
    """Parse CUDA_VISIBLE_DEVICES string into list of GPU indices.

    Supports formats like:
    - "0,1,2,3" -> [0, 1, 2, 3]
    - "0-3" -> [0, 1, 2, 3]
    - "0,2-4,7" -> [0, 2, 3, 4, 7]
    """
    indices = []
    parts = cuda_visible.split(",")

    for part in parts:
        part = part.strip()
        if "-" in part:
            # Range like "0-3"
            start, end = part.split("-")
            indices.extend(range(int(start), int(end) + 1))
        else:
            # Single index
            indices.append(int(part))

    return sorted(set(indices))  # Remove duplicates and sort


def detect_cpu_info() -> CPUInfo:
    """Detect CPU information including instruction sets and NUMA topology."""
    name = "Unknown"
    cores = os.cpu_count() or 1
    threads = cores
    numa_nodes = 1
    instruction_sets: list[str] = []
    numa_info: dict[str, list[int]] = {}

    if platform.system() == "Linux":
        try:
            with open("/proc/cpuinfo", "r") as f:
                content = f.read()

            # Get CPU name
            for line in content.split("\n"):
                if line.startswith("model name"):
                    name = line.split(":")[1].strip()
                    break

            # Get physical cores vs threads
            cpu_cores = content.count("processor\t:")
            if cpu_cores > 0:
                threads = cpu_cores

            siblings = None
            cores_per = None
            for line in content.split("\n"):
                if "siblings" in line:
                    siblings = int(line.split(":")[1].strip())
                if "cpu cores" in line:
                    cores_per = int(line.split(":")[1].strip())
            if siblings and cores_per:
                cores = threads // (siblings // cores_per) if siblings > cores_per else threads

            # Get instruction sets from flags
            for line in content.split("\n"):
                if line.startswith("flags"):
                    flags = line.split(":")[1].strip().split()
                    instruction_sets = _parse_cpu_flags(flags)
                    break

        except (OSError, IOError, ValueError):
            pass

        # Get NUMA topology
        numa_path = Path("/sys/devices/system/node")
        if numa_path.exists():
            numa_dirs = [d for d in numa_path.iterdir() if d.name.startswith("node")]
            numa_nodes = len(numa_dirs)

            for node_dir in numa_dirs:
                node_name = node_dir.name  # e.g., "node0"
                cpulist_path = node_dir / "cpulist"
                if cpulist_path.exists():
                    try:
                        cpulist = cpulist_path.read_text().strip()
                        numa_info[node_name] = _parse_cpu_list(cpulist)
                    except (OSError, IOError):
                        pass

    elif platform.system() == "Darwin":
        # macOS
        name_output = run_command(["sysctl", "-n", "machdep.cpu.brand_string"])
        if name_output:
            name = name_output.strip()
        cores_output = run_command(["sysctl", "-n", "hw.physicalcpu"])
        if cores_output:
            cores = int(cores_output.strip())
        threads_output = run_command(["sysctl", "-n", "hw.logicalcpu"])
        if threads_output:
            threads = int(threads_output.strip())

        # Get instruction sets on macOS
        features_output = run_command(["sysctl", "-n", "machdep.cpu.features"])
        if features_output:
            flags = features_output.lower().split()
            instruction_sets = _parse_cpu_flags(flags)

    return CPUInfo(
        name=name,
        cores=cores,
        threads=threads,
        numa_nodes=numa_nodes,
        instruction_sets=instruction_sets,
        numa_info=numa_info,
    )


def _parse_cpu_flags(flags: list[str]) -> list[str]:
    """Parse CPU flags to extract relevant instruction sets for KTransformers."""
    # Instruction sets important for KTransformers/kt-kernel
    relevant_instructions = {
        # Basic SIMD
        "sse": "SSE",
        "sse2": "SSE2",
        "sse3": "SSE3",
        "ssse3": "SSSE3",
        "sse4_1": "SSE4.1",
        "sse4_2": "SSE4.2",
        # AVX family
        "avx": "AVX",
        "avx2": "AVX2",
        "avx512f": "AVX512F",
        "avx512bw": "AVX512BW",
        "avx512vl": "AVX512VL",
        "avx512dq": "AVX512DQ",
        "avx512cd": "AVX512CD",
        "avx512vnni": "AVX512VNNI",
        "avx512_bf16": "AVX512BF16",
        "avx512_fp16": "AVX512FP16",
        "avx_vnni": "AVX-VNNI",
        # AMX (Advanced Matrix Extensions) - Intel
        "amx_tile": "AMX-TILE",
        "amx_bf16": "AMX-BF16",
        "amx_int8": "AMX-INT8",
        "amx_fp16": "AMX-FP16",
        # Other relevant
        "fma": "FMA",
        "f16c": "F16C",
        "bmi1": "BMI1",
        "bmi2": "BMI2",
    }

    found = []
    flags_lower = {f.lower() for f in flags}

    for flag, display_name in relevant_instructions.items():
        if flag in flags_lower:
            found.append(display_name)

    # Sort by importance for display
    priority = [
        "AMX-INT8",
        "AMX-BF16",
        "AMX-FP16",
        "AMX-TILE",
        "AVX512BF16",
        "AVX512VNNI",
        "AVX512F",
        "AVX512BW",
        "AVX512VL",
        "AVX2",
        "AVX",
        "FMA",
        "SSE4.2",
    ]
    result = []
    for p in priority:
        if p in found:
            result.append(p)
            found.remove(p)
    result.extend(sorted(found))  # Add remaining

    return result


def _parse_cpu_list(cpulist: str) -> list[int]:
    """Parse CPU list string like '0-3,8-11' to list of CPU IDs."""
    cpus = []
    for part in cpulist.split(","):
        if "-" in part:
            start, end = part.split("-")
            cpus.extend(range(int(start), int(end) + 1))
        else:
            cpus.append(int(part))
    return cpus


def detect_memory_info() -> MemoryInfo:
    """Detect detailed memory information including frequency and type."""
    total_gb = detect_ram_gb()
    available_gb = detect_available_ram_gb()
    frequency_mhz: Optional[int] = None
    channels: Optional[int] = None
    mem_type: Optional[str] = None

    if platform.system() == "Linux":
        # Try dmidecode without sudo first (may work if user has permissions)
        dmidecode_output = run_command(["dmidecode", "-t", "memory"])
        if dmidecode_output:
            frequency_mhz, mem_type, channels = _parse_dmidecode_memory(dmidecode_output)

        # Fallback: try to read from /sys or /proc
        if frequency_mhz is None:
            frequency_mhz = _detect_memory_frequency_sysfs()

    elif platform.system() == "Darwin":
        # macOS - use system_profiler
        mem_output = run_command(["system_profiler", "SPMemoryDataType"])
        if mem_output:
            frequency_mhz, mem_type = _parse_macos_memory(mem_output)

    return MemoryInfo(
        total_gb=total_gb,
        available_gb=available_gb,
        frequency_mhz=frequency_mhz,
        channels=channels,
        type=mem_type,
    )


def _parse_dmidecode_memory(output: str) -> tuple[Optional[int], Optional[str], Optional[int]]:
    """Parse dmidecode memory output."""
    frequency_mhz: Optional[int] = None
    mem_type: Optional[str] = None
    dimm_count = 0

    for line in output.split("\n"):
        line = line.strip()
        if line.startswith("Speed:") and "MHz" in line:
            try:
                # "Speed: 4800 MHz" or "Speed: 4800 MT/s"
                parts = line.split(":")[1].strip().split()
                freq = int(parts[0])
                if freq > 0 and (frequency_mhz is None or freq > frequency_mhz):
                    frequency_mhz = freq
            except (ValueError, IndexError):
                pass
        elif line.startswith("Type:") and mem_type is None:
            type_val = line.split(":")[1].strip()
            if type_val and type_val != "Unknown":
                mem_type = type_val
        elif line.startswith("Size:") and "MB" in line or "GB" in line:
            dimm_count += 1

    return frequency_mhz, mem_type, dimm_count if dimm_count > 0 else None


def _detect_memory_frequency_sysfs() -> Optional[int]:
    """Try to detect memory frequency from sysfs."""
    # This is a fallback and may not work on all systems
    try:
        # Try reading from edac
        edac_path = Path("/sys/devices/system/edac/mc")
        if edac_path.exists():
            for mc_dir in edac_path.iterdir():
                freq_file = mc_dir / "mc_config"
                if freq_file.exists():
                    content = freq_file.read_text()
                    # Parse for frequency information
                    # Format varies by system
                    pass
    except (OSError, IOError):
        pass

    return None


def _parse_macos_memory(output: str) -> tuple[Optional[int], Optional[str]]:
    """Parse macOS system_profiler memory output."""
    frequency_mhz: Optional[int] = None
    mem_type: Optional[str] = None

    for line in output.split("\n"):
        line = line.strip()
        if "Speed:" in line:
            try:
                parts = line.split(":")[1].strip().split()
                frequency_mhz = int(parts[0])
            except (ValueError, IndexError):
                pass
        elif "Type:" in line:
            mem_type = line.split(":")[1].strip()

    return frequency_mhz, mem_type


def detect_ram_gb() -> float:
    """Detect total system RAM in GB."""
    if platform.system() == "Linux":
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if line.startswith("MemTotal:"):
                        # "MemTotal:       32780516 kB"
                        kb = int(line.split()[1])
                        return round(kb / 1024 / 1024, 1)
        except (OSError, IOError, ValueError):
            pass
    elif platform.system() == "Darwin":
        mem_output = run_command(["sysctl", "-n", "hw.memsize"])
        if mem_output:
            return round(int(mem_output) / 1024 / 1024 / 1024, 1)

    # Fallback
    try:
        import psutil

        return round(psutil.virtual_memory().total / 1024 / 1024 / 1024, 1)
    except ImportError:
        return 0.0


def detect_available_ram_gb() -> float:
    """Detect available system RAM in GB."""
    if platform.system() == "Linux":
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if line.startswith("MemAvailable:"):
                        kb = int(line.split()[1])
                        return round(kb / 1024 / 1024, 1)
        except (OSError, IOError, ValueError):
            pass

    # Fallback
    try:
        import psutil

        return round(psutil.virtual_memory().available / 1024 / 1024 / 1024, 1)
    except ImportError:
        return 0.0


def detect_disk_space_gb(path: str = "/") -> tuple[float, float]:
    """Detect disk space (available, total) in GB for the given path."""
    try:
        import shutil

        total, used, free = shutil.disk_usage(path)
        return round(free / 1024 / 1024 / 1024, 1), round(total / 1024 / 1024 / 1024, 1)
    except (OSError, IOError):
        return 0.0, 0.0


def get_installed_package_version(package_name: str) -> Optional[str]:
    """Get the version of an installed Python package."""
    try:
        from importlib.metadata import version

        return version(package_name)
    except Exception:
        return None


def get_system_info() -> SystemInfo:
    """Gather complete system information."""
    return SystemInfo(
        python_version=platform.python_version(),
        platform=f"{platform.system()} {platform.release()}",
        cuda_version=detect_cuda_version(),
        gpus=detect_gpus(),
        cpu=detect_cpu_info(),
        ram_gb=detect_ram_gb(),
        env_managers=detect_env_managers(),
    )


def is_in_virtual_env() -> bool:
    """Check if currently running inside a virtual environment."""
    return (
        hasattr(sys, "real_prefix")
        or (hasattr(sys, "base_prefix") and sys.base_prefix != sys.prefix)
        or os.environ.get("VIRTUAL_ENV") is not None
        or os.environ.get("CONDA_PREFIX") is not None
    )


def get_current_env_name() -> Optional[str]:
    """Get the name of the current virtual environment."""
    if os.environ.get("CONDA_DEFAULT_ENV"):
        return os.environ["CONDA_DEFAULT_ENV"]
    if os.environ.get("VIRTUAL_ENV"):
        return Path(os.environ["VIRTUAL_ENV"]).name
    return None


# Import sys for is_in_virtual_env
import sys  # noqa: E402


@dataclass
class StorageLocation:
    """Information about a storage location."""

    path: str
    available_gb: float
    total_gb: float
    is_writable: bool
    mount_point: str


def scan_storage_locations(min_size_gb: float = 50.0) -> list[StorageLocation]:
    """
    Scan system for potential model storage locations.

    Looks for:
    - Large mounted filesystems (> min_size_gb)
    - Common model storage paths
    - User home directory

    Args:
        min_size_gb: Minimum available space in GB to consider

    Returns:
        List of StorageLocation sorted by available space (descending)
    """
    locations: dict[str, StorageLocation] = {}  # Use dict to deduplicate by path

    # Get all mount points from /proc/mounts (Linux)
    mount_points = _get_mount_points()

    for mount_point in mount_points:
        try:
            available_gb, total_gb = detect_disk_space_gb(mount_point)

            # Skip small or pseudo filesystems
            if total_gb < 10:
                continue

            # Check if writable
            is_writable = os.access(mount_point, os.W_OK)

            # Create potential model paths under this mount
            potential_paths = _get_potential_model_paths(mount_point)

            for path in potential_paths:
                if path in locations:
                    continue

                # Get actual available space for this path
                path_available, path_total = detect_disk_space_gb(path)

                if path_available >= min_size_gb:
                    path_writable = os.access(path, os.W_OK) if os.path.exists(path) else is_writable
                    locations[path] = StorageLocation(
                        path=path,
                        available_gb=path_available,
                        total_gb=path_total,
                        is_writable=path_writable,
                        mount_point=mount_point,
                    )
        except (OSError, IOError):
            continue

    # Also check common model storage locations
    common_paths = [
        str(Path.home() / ".ktransformers" / "models"),
        str(Path.home() / "models"),
        str(Path.home() / ".cache" / "huggingface"),
        "/data/models",
        "/models",
        "/opt/models",
    ]

    for path in common_paths:
        if path in locations:
            continue
        try:
            # Check if parent exists for paths that don't exist yet
            check_path = path
            while not os.path.exists(check_path) and check_path != "/":
                check_path = str(Path(check_path).parent)

            if os.path.exists(check_path):
                available_gb, total_gb = detect_disk_space_gb(check_path)
                if available_gb >= min_size_gb:
                    is_writable = os.access(check_path, os.W_OK)
                    locations[path] = StorageLocation(
                        path=path,
                        available_gb=available_gb,
                        total_gb=total_gb,
                        is_writable=is_writable,
                        mount_point=check_path,
                    )
        except (OSError, IOError):
            continue

    # Sort by available space descending, then by path
    sorted_locations = sorted(locations.values(), key=lambda x: (-x.available_gb, x.path))

    # Filter to only writable locations
    return [loc for loc in sorted_locations if loc.is_writable]


def _get_mount_points() -> list[str]:
    """Get all mount points on the system."""
    mount_points = []

    if platform.system() == "Linux":
        try:
            with open("/proc/mounts", "r") as f:
                for line in f:
                    parts = line.split()
                    if len(parts) >= 2:
                        mount_point = parts[1]
                        fs_type = parts[2] if len(parts) > 2 else ""

                        # Skip pseudo filesystems
                        skip_fs = {
                            "proc",
                            "sysfs",
                            "devpts",
                            "tmpfs",
                            "cgroup",
                            "cgroup2",
                            "pstore",
                            "securityfs",
                            "debugfs",
                            "hugetlbfs",
                            "mqueue",
                            "fusectl",
                            "configfs",
                            "devtmpfs",
                            "efivarfs",
                            "autofs",
                            "binfmt_misc",
                            "overlay",
                            "nsfs",
                            "tracefs",
                        }
                        if fs_type in skip_fs:
                            continue

                        # Skip paths that are clearly system paths
                        skip_prefixes = ("/sys", "/proc", "/dev", "/run/user")
                        if any(mount_point.startswith(p) for p in skip_prefixes):
                            continue

                        mount_points.append(mount_point)
        except (OSError, IOError):
            pass

    # Always include home and root
    mount_points.extend([str(Path.home()), "/"])

    # Deduplicate while preserving order
    seen = set()
    unique_mounts = []
    for mp in mount_points:
        if mp not in seen:
            seen.add(mp)
            unique_mounts.append(mp)

    return unique_mounts


def _get_potential_model_paths(mount_point: str) -> list[str]:
    """Get potential model storage paths under a mount point."""
    paths = []

    # The mount point itself (for dedicated data drives)
    if mount_point not in ("/", "/home"):
        paths.append(mount_point)
        paths.append(os.path.join(mount_point, "models"))

    # If it's under home, suggest standard locations
    home = str(Path.home())
    if mount_point == home or mount_point == "/home":
        paths.append(os.path.join(home, ".ktransformers", "models"))
        paths.append(os.path.join(home, "models"))

    # For root mount, suggest /data or /opt
    if mount_point == "/":
        paths.extend(["/data/models", "/opt/models"])

    # Check for common data directories on this mount
    for subdir in ["data", "models", "ai", "llm", "huggingface"]:
        potential = os.path.join(mount_point, subdir)
        if os.path.exists(potential) and os.path.isdir(potential):
            paths.append(potential)

    return paths


def format_size_gb(size_gb: float) -> str:
    """Format size in GB to human readable string."""
    if size_gb >= 1000:
        return f"{size_gb / 1000:.1f}TB"
    return f"{size_gb:.1f}GB"


@dataclass
class LocalModel:
    """Information about a locally detected model."""

    name: str
    path: str
    size_gb: float
    model_type: str  # "huggingface", "gguf", "safetensors"
    has_config: bool
    file_count: int


def scan_local_models(search_paths: list[str], max_depth: int = 3) -> list[LocalModel]:
    """
    Scan directories for locally downloaded models.

    Looks for:
    - Directories with config.json (HuggingFace format)
    - Directories with .safetensors files
    - Directories with .gguf files

    Args:
        search_paths: List of paths to search
        max_depth: Maximum directory depth to search

    Returns:
        List of LocalModel sorted by size (descending)
    """
    models: dict[str, LocalModel] = {}  # Use path as key to deduplicate

    for search_path in search_paths:
        if not os.path.exists(search_path):
            continue

        _scan_directory_for_models(search_path, models, current_depth=0, max_depth=max_depth)

    # Sort by size descending
    return sorted(models.values(), key=lambda x: -x.size_gb)


def _scan_directory_for_models(
    directory: str, models: dict[str, LocalModel], current_depth: int, max_depth: int
) -> None:
    """Recursively scan a directory for models."""
    if current_depth > max_depth:
        return

    try:
        entries = list(os.scandir(directory))
    except (PermissionError, OSError):
        return

    # Check if this directory is a model
    model = _detect_model_in_directory(directory, entries)
    if model:
        models[model.path] = model
        return  # Don't scan subdirectories of a model

    # Scan subdirectories
    for entry in entries:
        if entry.is_dir() and not entry.name.startswith("."):
            _scan_directory_for_models(entry.path, models, current_depth + 1, max_depth)


def _detect_model_in_directory(directory: str, entries: list) -> Optional[LocalModel]:
    """Detect if a directory contains a model."""
    entry_names = {e.name for e in entries}

    has_config = "config.json" in entry_names
    safetensor_files = [e for e in entries if e.name.endswith(".safetensors") and e.is_file()]
    gguf_files = [e for e in entries if e.name.endswith(".gguf") and e.is_file()]

    # Determine model type
    model_type = None
    if has_config and safetensor_files:
        model_type = "huggingface"
    elif gguf_files:
        model_type = "gguf"
    elif safetensor_files:
        model_type = "safetensors"
    elif has_config:
        # Config but no weights - might be incomplete
        # Check for other model-related files
        model_files = {
            "model.safetensors.index.json",
            "pytorch_model.bin.index.json",
            "model.safetensors",
            "pytorch_model.bin",
        }
        if entry_names & model_files:
            model_type = "huggingface"

    if not model_type:
        return None

    # Calculate directory size
    size_bytes = _get_directory_size(directory)
    size_gb = size_bytes / (1024**3)

    # Skip very small directories (likely incomplete or config-only)
    if size_gb < 0.1:
        return None

    # Get model name from directory name
    name = os.path.basename(directory)

    # Count model files
    file_count = len(safetensor_files) + len(gguf_files)
    if not file_count:
        # Count .bin files as fallback
        file_count = len([e for e in entries if e.name.endswith(".bin") and e.is_file()])

    return LocalModel(
        name=name,
        path=directory,
        size_gb=round(size_gb, 1),
        model_type=model_type,
        has_config=has_config,
        file_count=file_count,
    )


def _get_directory_size(directory: str) -> int:
    """Get total size of a directory in bytes."""
    total_size = 0
    try:
        for entry in os.scandir(directory):
            try:
                if entry.is_file(follow_symlinks=False):
                    total_size += entry.stat().st_size
                elif entry.is_dir(follow_symlinks=False):
                    total_size += _get_directory_size(entry.path)
            except (PermissionError, OSError):
                continue
    except (PermissionError, OSError):
        pass
    return total_size


def scan_models_in_location(location: StorageLocation, max_depth: int = 2) -> list[LocalModel]:
    """Scan a storage location for models."""
    search_paths = [location.path]

    # Also check common subdirectories
    for subdir in ["models", "huggingface", "hub", ".cache/huggingface/hub"]:
        subpath = os.path.join(location.path, subdir)
        if os.path.exists(subpath):
            search_paths.append(subpath)

    return scan_local_models(search_paths, max_depth=max_depth)


@dataclass
class CPUBuildFeatures:
    """CPU features for build configuration."""

    has_amx: bool
    has_avx512: bool
    has_avx512_vnni: bool
    has_avx512_bf16: bool
    has_avx2: bool
    recommended_instruct: str  # NATIVE, AVX512, AVX2
    recommended_amx: bool


def detect_cpu_build_features() -> CPUBuildFeatures:
    """
    Detect CPU features for build configuration.

    This is used to auto-configure kt-kernel source builds.
    Reads /proc/cpuinfo on Linux to detect instruction set support.

    Returns:
        CPUBuildFeatures with detection results
    """
    has_amx = False
    has_avx512 = False
    has_avx512_vnni = False
    has_avx512_bf16 = False
    has_avx2 = False

    if platform.system() == "Linux":
        try:
            with open("/proc/cpuinfo", "r") as f:
                content = f.read()

            # Get flags from first processor
            for line in content.split("\n"):
                if line.startswith("flags"):
                    flags = line.split(":")[1].strip().split()
                    flags_lower = {f.lower() for f in flags}

                    # Check for AMX support (requires all three)
                    if {"amx_tile", "amx_int8", "amx_bf16"} <= flags_lower:
                        has_amx = True

                    # Check for AVX512 support
                    if "avx512f" in flags_lower:
                        has_avx512 = True

                    # Check for AVX512 VNNI
                    if "avx512_vnni" in flags_lower or "avx512vnni" in flags_lower:
                        has_avx512_vnni = True

                    # Check for AVX512 BF16
                    if "avx512_bf16" in flags_lower or "avx512bf16" in flags_lower:
                        has_avx512_bf16 = True

                    # Check for AVX2
                    if "avx2" in flags_lower:
                        has_avx2 = True

                    break
        except (OSError, IOError):
            pass

    elif platform.system() == "Darwin":
        # macOS - use sysctl
        features_output = run_command(["sysctl", "-n", "machdep.cpu.features"])
        if features_output:
            flags_lower = {f.lower() for f in features_output.split()}
            has_avx2 = "avx2" in flags_lower
            # macOS doesn't have AMX or AVX512 typically

    # Determine recommended configuration
    if has_amx:
        recommended_instruct = "NATIVE"
        recommended_amx = True
    elif has_avx512:
        recommended_instruct = "NATIVE"
        recommended_amx = False
    elif has_avx2:
        recommended_instruct = "NATIVE"
        recommended_amx = False
    else:
        recommended_instruct = "AVX2"
        recommended_amx = False

    return CPUBuildFeatures(
        has_amx=has_amx,
        has_avx512=has_avx512,
        has_avx512_vnni=has_avx512_vnni,
        has_avx512_bf16=has_avx512_bf16,
        has_avx2=has_avx2,
        recommended_instruct=recommended_instruct,
        recommended_amx=recommended_amx,
    )


================================================
FILE: kt-kernel/python/cli/utils/input_validators.py
================================================
"""
Input validation utilities with retry mechanism.

Provides robust input validation with automatic retry on failure.
"""

from typing import Optional, List, Callable, Any
from rich.console import Console
from rich.prompt import Prompt

console = Console()


def prompt_int_with_retry(
    message: str,
    default: Optional[int] = None,
    min_val: Optional[int] = None,
    max_val: Optional[int] = None,
    validator: Optional[Callable[[int], bool]] = None,
    validator_error_msg: Optional[str] = None,
) -> int:
    """Prompt for integer input with validation and retry.

    Args:
        message: Prompt message
        default: Default value (optional)
        min_val: Minimum allowed value (optional)
        max_val: Maximum allowed value (optional)
        validator: Custom validation function (optional)
        validator_error_msg: Error message for custom validator (optional)

    Returns:
        Validated integer value
    """
    while True:
        # Build prompt with default
        if default is not None:
            prompt_text = f"{message} [{default}]"
        else:
            prompt_text = message

        # Get input
        user_input = Prompt.ask(prompt_text, default=str(default) if default is not None else None)

        # Try to parse as integer
        try:
            value = int(user_input)
        except ValueError:
            console.print(f"[red]✗ Invalid input. Please enter a valid integer.[/red]")
            console.print()
            continue

        # Validate range
        if min_val is not None and value < min_val:
            console.print(f"[red]✗ Value must be at least {min_val}[/red]")
            console.print()
            continue

        if max_val is not None and value > max_val:
            console.print(f"[red]✗ Value must be at most {max_val}[/red]")
            console.print()
            continue

        # Custom validation
        if validator is not None:
            if not validator(value):
                error_msg = validator_error_msg or "Invalid value"
                console.print(f"[red]✗ {error_msg}[/red]")
                console.print()
                continue

        # All validations passed
        return value


def prompt_float_with_retry(
    message: str,
    default: Optional[float] = None,
    min_val: Optional[float] = None,
    max_val: Optional[float] = None,
) -> float:
    """Prompt for float input with validation and retry.

    Args:
        message: Prompt message
        default: Default value (optional)
        min_val: Minimum allowed value (optional)
        max_val: Maximum allowed value (optional)

    Returns:
        Validated float value
    """
    while True:
        # Build prompt with default
        if default is not None:
            prompt_text = f"{message} [{default}]"
        else:
            prompt_text = message

        # Get input
        user_input = Prompt.ask(prompt_text, default=str(default) if default is not None else None)

        # Try to parse as float
        try:
            value = float(user_input)
        except ValueError:
            console.print(f"[red]✗ Invalid input. Please enter a valid number.[/red]")
            console.print()
            continue

        # Validate range
        if min_val is not None and value < min_val:
            console.print(f"[red]✗ Value must be at least {min_val}[/red]")
            console.print()
            continue

        if max_val is not None and value > max_val:
            console.print(f"[red]✗ Value must be at most {max_val}[/red]")
            console.print()
            continue

        # All validations passed
        return value


def prompt_choice_with_retry(
    message: str,
    choices: List[str],
    default: Optional[str] = None,
) -> str:
    """Prompt for choice input with validation and retry.

    Args:
        message: Prompt message
        choices: List of valid choices
        default: Default choice (optional)

    Returns:
        Selected choice
    """
    while True:
        # Get input
        user_input = Prompt.ask(message, default=default)

        # Validate choice
        if user_input not in choices:
            console.print(f"[red]✗ Invalid choice. Please select from: {', '.join(choices)}[/red]")
            console.print()
            continue

        return user_input


def prompt_int_list_with_retry(
    message: str,
    default: Optional[str] = None,
    min_val: Optional[int] = None,
    max_val: Optional[int] = None,
    validator: Optional[Callable[[List[int]], tuple[bool, Optional[str]]]] = None,
) -> List[int]:
    """Prompt for comma-separated integer list with validation and retry.

    Args:
        message: Prompt message
        default: Default value as string (e.g., "0,1,2,3")
        min_val: Minimum allowed value for each integer (optional)
        max_val: Maximum allowed value for each integer (optional)
        validator: Custom validation function that returns (is_valid, error_message) (optional)

    Returns:
        List of validated integers
    """
    while True:
        # Get input
        user_input = Prompt.ask(message, default=default)

        # Clean input: support Chinese comma and spaces
        user_input_cleaned = user_input.replace("，", ",").replace(" ", "")

        # Try to parse as integers
        try:
            values = [int(x.strip()) for x in user_input_cleaned.split(",") if x.strip()]
        except ValueError:
            console.print(f"[red]✗ Invalid format. Please enter numbers separated by commas.[/red]")
            console.print()
            continue

        # Validate each value's range
        invalid_values = []
        for value in values:
            if min_val is not None and value < min_val:
                invalid_values.append(value)
            elif max_val is not None and value > max_val:
                invalid_values.append(value)

        if invalid_values:
            if min_val is not None and max_val is not None:
                console.print(f"[red]✗ Invalid value(s): {invalid_values}[/red]")
                console.print(f"[yellow]Valid range: {min_val}-{max_val}[/yellow]")
            elif min_val is not None:
                console.print(f"[red]✗ Value(s) must be at least {min_val}: {invalid_values}[/red]")
            elif max_val is not None:
                console.print(f"[red]✗ Value(s) must be at most {max_val}: {invalid_values}[/red]")
            console.print()
            continue

        # Custom validation
        if validator is not None:
            is_valid, error_msg = validator(values)
            if not is_valid:
                console.print(f"[red]✗ {error_msg}[/red]")
                console.print()
                continue

        # All validations passed
        return values


================================================
FILE: kt-kernel/python/cli/utils/kv_cache_calculator.py
================================================
#!/usr/bin/env python3
"""
KV Cache Size Calculator for SGLang

This script calculates the KV cache size in GB for a given model and number of tokens.
It follows the same logic as in sglang/srt/model_executor/model_runner.py
"""

import os
import sys
import torch
from transformers import AutoConfig

# Add sglang to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "python"))

from sglang.srt.configs.model_config import ModelConfig, is_deepseek_nsa, get_nsa_index_head_dim
from sglang.srt.mem_cache.memory_pool import NSATokenToKVPool


def get_dtype_bytes(dtype_str: str) -> int:
    """Get the number of bytes for a given dtype string."""
    dtype_map = {
        "float32": 4,
        "float16": 2,
        "bfloat16": 2,
        "float8_e4m3fn": 1,
        "float8_e5m2": 1,
        "auto": 2,  # Usually defaults to bfloat16
    }
    return dtype_map.get(dtype_str, 2)


def get_kv_size_gb(
    model_path: str,
    max_total_tokens: int,
    tp: int = 1,
    dtype: str = "auto",
    verbose: bool = True,
) -> dict:
    """
    Calculate the KV cache size in GB for a given model and number of tokens.

    Args:
        model_path: Path to the model
        max_total_tokens: Maximum number of tokens to cache
        tp: Tensor parallelism size
        dtype: Data type for KV cache (auto, float16, bfloat16, float8_e4m3fn, etc.)
        verbose: Whether to print detailed information

    Returns:
        dict: Dictionary containing calculation details
    """
    # Load model config
    model_config = ModelConfig(model_path, dtype=dtype)
    hf_config = model_config.hf_config

    # Determine dtype bytes
    dtype_bytes = get_dtype_bytes(dtype)
    if dtype == "auto":
        # Auto dtype usually becomes bfloat16
        dtype_bytes = 2

    # Number of layers
    num_layers = model_config.num_attention_layers

    # Check if it's MLA (Multi-head Latent Attention) model
    is_mla = hasattr(model_config, "attention_arch") and model_config.attention_arch.name == "MLA"

    result = {
        "model_path": model_path,
        "max_total_tokens": max_total_tokens,
        "tp": tp,
        "dtype": dtype,
        "dtype_bytes": dtype_bytes,
        "num_layers": num_layers,
        "is_mla": is_mla,
    }

    if is_mla:
        # MLA models (DeepSeek-V2/V3, MiniCPM3, etc.)
        kv_lora_rank = model_config.kv_lora_rank
        qk_rope_head_dim = model_config.qk_rope_head_dim

        # Calculate cell size (per token)
        cell_size = (kv_lora_rank + qk_rope_head_dim) * num_layers * dtype_bytes

        result.update(
            {
                "kv_lora_rank": kv_lora_rank,
                "qk_rope_head_dim": qk_rope_head_dim,
                "cell_size_bytes": cell_size,
            }
        )

        # Check if it's NSA (Native Sparse Attention) model
        if is_deepseek_nsa(hf_config):
            index_head_dim = get_nsa_index_head_dim(hf_config)
            indexer_size_per_token = index_head_dim + index_head_dim // NSATokenToKVPool.quant_block_size * 4
            indexer_dtype_bytes = torch._utils._element_size(NSATokenToKVPool.index_k_with_scale_buffer_dtype)
            indexer_cell_size = indexer_size_per_token * num_layers * indexer_dtype_bytes
            cell_size += indexer_cell_size

            result.update(
                {
                    "is_nsa": True,
                    "index_head_dim": index_head_dim,
                    "indexer_cell_size_bytes": indexer_cell_size,
                    "total_cell_size_bytes": cell_size,
                }
            )
        else:
            result["is_nsa"] = False
    else:
        # Standard MHA models
        num_kv_heads = model_config.get_num_kv_heads(tp)
        head_dim = model_config.head_dim
        v_head_dim = model_config.v_head_dim

        # Calculate cell size (per token)
        cell_size = num_kv_heads * (head_dim + v_head_dim) * num_layers * dtype_bytes

        result.update(
            {
                "num_kv_heads": num_kv_heads,
                "head_dim": head_dim,
                "v_head_dim": v_head_dim,
                "cell_size_bytes": cell_size,
            }
        )

    # Calculate total KV cache size
    total_size_bytes = max_total_tokens * cell_size
    total_size_gb = total_size_bytes / (1024**3)

    # For MHA models with separate K and V buffers
    if not is_mla:
        k_size_bytes = max_total_tokens * num_kv_heads * head_dim * num_layers * dtype_bytes
        v_size_bytes = max_total_tokens * num_kv_heads * v_head_dim * num_layers * dtype_bytes
        k_size_gb = k_size_bytes / (1024**3)
        v_size_gb = v_size_bytes / (1024**3)

        result.update(
            {
                "k_size_gb": k_size_gb,
                "v_size_gb": v_size_gb,
            }
        )

    result.update(
        {
            "total_size_bytes": total_size_bytes,
            "total_size_gb": total_size_gb,
        }
    )

    if verbose:
        print(f"Model: {model_path}")
        print(f"Tokens: {max_total_tokens}, TP: {tp}, Dtype: {dtype}")
        print(f"Architecture: {'MLA' if is_mla else 'MHA'}")
        print(f"Layers: {num_layers}")

        if is_mla:
            print(f"KV LoRA Rank: {kv_lora_rank}, QK RoPE Head Dim: {qk_rope_head_dim}")
            if result.get("is_nsa"):
                print(f"NSA Index Head Dim: {index_head_dim}")
                print(
                    f"Cell size: {cell_size} bytes (Main: {result['cell_size_bytes']}, Indexer: {result['indexer_cell_size_bytes']})"
                )
            else:
                print(f"Cell size: {cell_size} bytes")
        else:
            print(f"KV Heads: {num_kv_heads}, Head Dim: {head_dim}, V Head Dim: {v_head_dim}")
            print(f"Cell size: {cell_size} bytes")
            print(f"K size: {k_size_gb:.2f} GB, V size: {v_size_gb:.2f} GB")

        print(f"Total KV Cache Size: {total_size_gb:.2f} GB")

    return result


def main():
    import argparse

    parser = argparse.ArgumentParser(description="Calculate KV cache size for a model")
    parser.add_argument("model_path", help="Path to the model")
    parser.add_argument("max_total_tokens", type=int, help="Maximum number of tokens")
    parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism size")
    parser.add_argument("--dtype", type=str, default="auto", help="Data type (auto, float16, bfloat16, etc.)")
    parser.add_argument("--quiet", action="store_true", help="Suppress verbose output")

    args = parser.parse_args()

    result = get_kv_size_gb(
        args.model_path,
        args.max_total_tokens,
        tp=args.tp,
        dtype=args.dtype,
        verbose=not args.quiet,
    )

    if args.quiet:
        print(f"{result['total_size_gb']:.2f}")


if __name__ == "__main__":
    main()


================================================
FILE: kt-kernel/python/cli/utils/model_discovery.py
================================================
"""
Model Discovery Utilities

Shared functions for discovering and registering new models across different commands.
"""

from typing import List, Optional, Tuple
from pathlib import Path
from rich.console import Console

from kt_kernel.cli.utils.model_scanner import (
    discover_models,
    scan_directory_for_models,
    ScannedModel,
)
from kt_kernel.cli.utils.user_model_registry import UserModelRegistry, UserModel


console = Console()


def discover_and_register_global(
    min_size_gb: float = 2.0, max_depth: int = 6, show_progress: bool = True, lang: str = "en"
) -> Tuple[int, int, List[UserModel]]:
    """
    Perform global model discovery and register new models.

    Args:
        min_size_gb: Minimum model size in GB
        max_depth: Maximum search depth
        show_progress: Whether to show progress messages
        lang: Language for messages ("en" or "zh")

    Returns:
        Tuple of (total_found, new_found, registered_models)
    """
    registry = UserModelRegistry()

    if show_progress:
        if lang == "zh":
            console.print("[dim]正在扫描系统中的模型权重，这可能需要30-60秒...[/dim]")
        else:
            console.print("[dim]Scanning system for model weights, this may take 30-60 seconds...[/dim]")

    # Global scan
    all_models = discover_models(mount_points=None, min_size_gb=min_size_gb, max_depth=max_depth)

    # Filter out existing models
    new_models = []
    for model in all_models:
        if not registry.find_by_path(model.path):
            new_models.append(model)

    # Register new models
    registered = []
    for model in new_models:
        user_model = _create_and_register_model(registry, model)
        if user_model:
            registered.append(user_model)

    return len(all_models), len(new_models), registered


def discover_and_register_path(
    path: str,
    min_size_gb: float = 2.0,
    existing_paths: Optional[set] = None,
    show_progress: bool = True,
    lang: str = "en",
) -> Tuple[int, int, List[UserModel]]:
    """
    Discover models in a specific path and register new ones.

    Args:
        path: Directory path to scan
        min_size_gb: Minimum model file size in GB
        existing_paths: Set of already discovered paths in this session (optional)
        show_progress: Whether to show progress messages
        lang: Language for messages ("en" or "zh")

    Returns:
        Tuple of (total_found, new_found, registered_models)
    """
    registry = UserModelRegistry()

    if show_progress:
        if lang == "zh":
            console.print(f"[dim]正在扫描 {path}...[/dim]")
        else:
            console.print(f"[dim]Scanning {path}...[/dim]")

    # Scan directory
    model_info = scan_directory_for_models(path, min_file_size_gb=min_size_gb)

    if not model_info:
        return 0, 0, []

    # Convert to ScannedModel and filter
    new_models = []
    for dir_path, (format_type, size_bytes, file_count, files) in model_info.items():
        # Check if already in registry
        if registry.find_by_path(dir_path):
            continue

        # Check if already discovered in this session
        if existing_paths and dir_path in existing_paths:
            continue

        model = ScannedModel(
            path=dir_path, format=format_type, size_bytes=size_bytes, file_count=file_count, files=files
        )
        new_models.append(model)

    # Register new models
    registered = []
    for model in new_models:
        user_model = _create_and_register_model(registry, model)
        if user_model:
            registered.append(user_model)

    return len(model_info), len(new_models), registered


def _create_and_register_model(registry: UserModelRegistry, scanned_model: ScannedModel) -> Optional[UserModel]:
    """
    Create a UserModel from ScannedModel and register it.

    Handles name conflicts by suggesting a unique name (e.g., model-2, model-3).
    Automatically detects repo_id from README.md YAML frontmatter.
    Automatically detects and caches MoE information for safetensors models.

    Args:
        registry: UserModelRegistry instance
        scanned_model: ScannedModel to register

    Returns:
        Registered UserModel or None if failed
    """
    # Use suggest_name to get a unique name (adds -2, -3, etc. if needed)
    unique_name = registry.suggest_name(scanned_model.folder_name)

    user_model = UserModel(name=unique_name, path=scanned_model.path, format=scanned_model.format)

    # Auto-detect repo_id from README.md (only YAML frontmatter)
    try:
        from kt_kernel.cli.utils.repo_detector import detect_repo_for_model

        repo_info = detect_repo_for_model(scanned_model.path)
        if repo_info:
            repo_id, repo_type = repo_info
            user_model.repo_id = repo_id
            user_model.repo_type = repo_type
    except Exception:
        # Silently continue if detection fails
        pass

    # Auto-detect MoE information for safetensors models
    if scanned_model.format == "safetensors":
        try:
            from kt_kernel.cli.utils.analyze_moe_model import analyze_moe_model

            moe_result = analyze_moe_model(scanned_model.path, use_cache=True)
            if moe_result and moe_result.get("is_moe"):
                user_model.is_moe = True
                user_model.moe_num_experts = moe_result.get("num_experts")
                user_model.moe_num_experts_per_tok = moe_result.get("num_experts_per_tok")
            else:
                user_model.is_moe = False
        except Exception:
            # Silently continue if MoE detection fails
            # is_moe will remain None
            pass

    try:
        registry.add_model(user_model)
        return user_model
    except Exception:
        # Should not happen since we used suggest_name, but handle gracefully
        return None


def format_discovery_summary(
    total_found: int,
    new_found: int,
    registered: List[UserModel],
    lang: str = "en",
    show_models: bool = True,
    max_show: int = 10,
) -> None:
    """
    Print formatted discovery summary.

    Args:
        total_found: Total models found
        new_found: New models found
        registered: List of registered UserModel objects
        lang: Language ("en" or "zh")
        show_models: Whether to show model list
        max_show: Maximum models to show
    """
    console.print()

    if new_found == 0:
        if total_found > 0:
            if lang == "zh":
                console.print(f"[green]✓[/green] 扫描完成：找到 {total_found} 个模型，所有模型均已在列表中")
            else:
                console.print(f"[green]✓[/green] Scan complete: found {total_found} models, all already in the list")
        else:
            if lang == "zh":
                console.print("[yellow]未找到模型[/yellow]")
            else:
                console.print("[yellow]No models found[/yellow]")
        return

    # Show summary
    if lang == "zh":
        console.print(f"[green]✓[/green] 扫描完成：找到 {total_found} 个模型，其中 {new_found} 个为新模型")
    else:
        console.print(f"[green]✓[/green] Scan complete: found {total_found} models, {new_found} are new")

    # Show registered count
    if len(registered) > 0:
        if lang == "zh":
            console.print(f"[green]✓[/green] 成功添加 {len(registered)} 个新模型到列表")
        else:
            console.print(f"[green]✓[/green] Successfully added {len(registered)} new models to list")

    # Show model list
    if show_models and registered:
        console.print()
        if lang == "zh":
            console.print(f"[dim]新发现的模型（前{max_show}个）:[/dim]")
        else:
            console.print(f"[dim]Newly discovered models (first {max_show}):[/dim]")

        for i, model in enumerate(registered[:max_show], 1):
            # Get size from registry or estimate
            size_str = "?.? GB"
            # Try to find the ScannedModel to get size
            # For now just show name and path
            console.print(f"  {i}. {model.name} ({model.format})")
            console.print(f"     [dim]{model.path}[/dim]")

        if len(registered) > max_show:
            remaining = len(registered) - max_show
            if lang == "zh":
                console.print(f"  [dim]... 还有 {remaining} 个新模型[/dim]")
            else:
                console.print(f"  [dim]... and {remaining} more new models[/dim]")


================================================
FILE: kt-kernel/python/cli/utils/model_registry.py
================================================
"""
Model registry for kt-cli.

Provides a registry of supported models with fuzzy matching capabilities.
"""

import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, Optional

import yaml

from kt_kernel.cli.config.settings import get_settings


@dataclass
class ModelInfo:
    """Information about a supported model."""

    name: str
    hf_repo: str
    aliases: list[str] = field(default_factory=list)
    type: str = "moe"  # moe, dense
    gpu_vram_gb: float = 0
    cpu_ram_gb: float = 0
    default_params: dict = field(default_factory=dict)
    description: str = ""
    description_zh: str = ""
    max_tensor_parallel_size: Optional[int] = None  # Maximum tensor parallel size for this model


# Built-in model registry
BUILTIN_MODELS: list[ModelInfo] = [
    ModelInfo(
        name="DeepSeek-V3-0324",
        hf_repo="deepseek-ai/DeepSeek-V3-0324",
        aliases=["deepseek-v3-0324", "deepseek-v3", "dsv3", "deepseek3", "v3-0324"],
        type="moe",
        default_params={
            "kt-num-gpu-experts": 1,
            "attention-backend": "triton",
            "disable-shared-experts-fusion": True,
            "kt-method": "AMXINT4",
        },
        description="DeepSeek V3-0324 685B MoE model (March 2025, improved benchmarks)",
        description_zh="DeepSeek V3-0324 685B MoE 模型（2025年3月，改进的基准测试）",
    ),
    ModelInfo(
        name="DeepSeek-V3.2",
        hf_repo="deepseek-ai/DeepSeek-V3.2",
        aliases=["deepseek-v3.2", "dsv3.2", "deepseek3.2", "v3.2"],
        type="moe",
        default_params={
            "kt-method": "FP8",
            "kt-gpu-prefill-token-threshold": 4096,
            "attention-backend": "flashinfer",
            "fp8-gemm-backend": "triton",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "DeepSeek-V3.2",
            "disable-shared-experts-fusion": True,
        },
        description="DeepSeek V3.2 671B MoE model (latest)",
        description_zh="DeepSeek V3.2 671B MoE 模型（最新）",
    ),
    ModelInfo(
        name="DeepSeek-R1-0528",
        hf_repo="deepseek-ai/DeepSeek-R1-0528",
        aliases=["deepseek-r1-0528", "deepseek-r1", "dsr1", "r1", "r1-0528"],
        type="moe",
        default_params={
            "kt-num-gpu-experts": 1,
            "attention-backend": "triton",
            "disable-shared-experts-fusion": True,
            "kt-method": "AMXINT4",
        },
        description="DeepSeek R1-0528 reasoning model (May 2025, improved reasoning depth)",
        description_zh="DeepSeek R1-0528 推理模型（2025年5月，改进的推理深度）",
    ),
    ModelInfo(
        name="Kimi-K2-Thinking",
        hf_repo="moonshotai/Kimi-K2-Thinking",
        aliases=["kimi-k2-thinking", "kimi-thinking", "k2-thinking", "kimi", "k2"],
        type="moe",
        default_params={
            "kt-method": "RAWINT4",
            "kt-gpu-prefill-token-threshold": 400,
            "attention-backend": "flashinfer",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "Kimi-K2-Thinking",
            "disable-shared-experts-fusion": True,
        },
        description="Moonshot Kimi K2 Thinking MoE model",
        description_zh="月之暗面 Kimi K2 Thinking MoE 模型",
    ),
    ModelInfo(
        name="MiniMax-M2",
        hf_repo="MiniMaxAI/MiniMax-M2",
        aliases=["minimax-m2", "m2"],
        type="moe",
        default_params={
            "kt-method": "FP8",
            "kt-gpu-prefill-token-threshold": 4096,
            "attention-backend": "flashinfer",
            "fp8-gemm-backend": "triton",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "MiniMax-M2",
            "disable-shared-experts-fusion": True,
            "tool-call-parser": "minimax-m2",
            "reasoning-parser": "minimax-append-think",
        },
        description="MiniMax M2 MoE model",
        description_zh="MiniMax M2 MoE 模型",
        max_tensor_parallel_size=4,  # M2 only supports up to 4-way tensor parallelism
    ),
    ModelInfo(
        name="MiniMax-M2.1",
        hf_repo="MiniMaxAI/MiniMax-M2.1",
        aliases=["minimax-m2.1", "m2.1"],
        type="moe",
        default_params={
            "kt-method": "FP8",
            "kt-gpu-prefill-token-threshold": 4096,
            "attention-backend": "flashinfer",
            "fp8-gemm-backend": "triton",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "MiniMax-M2.1",
            "disable-shared-experts-fusion": True,
            "tool-call-parser": "minimax-m2",
            "reasoning-parser": "minimax-append-think",
        },
        description="MiniMax M2.1 MoE model (enhanced multi-language programming)",
        description_zh="MiniMax M2.1 MoE 模型（增强多语言编程能力）",
        max_tensor_parallel_size=4,  # M2.1 only supports up to 4-way tensor parallelism
    ),
]


class ModelRegistry:
    """Registry of supported models with fuzzy matching."""

    def __init__(self):
        """Initialize the model registry."""
        self._models: dict[str, ModelInfo] = {}
        self._aliases: dict[str, str] = {}
        self._load_builtin_models()
        self._load_user_models()

    def _load_builtin_models(self) -> None:
        """Load built-in models."""
        for model in BUILTIN_MODELS:
            self._register(model)

    def _load_user_models(self) -> None:
        """Load user-defined models from config."""
        settings = get_settings()
        registry_file = settings.config_dir / "registry.yaml"

        if registry_file.exists():
            try:
                with open(registry_file, "r", encoding="utf-8") as f:
                    data = yaml.safe_load(f) or {}

                for name, info in data.get("models", {}).items():
                    model = ModelInfo(
                        name=name,
                        hf_repo=info.get("hf_repo", ""),
                        aliases=info.get("aliases", []),
                        type=info.get("type", "moe"),
                        gpu_vram_gb=info.get("gpu_vram_gb", 0),
                        cpu_ram_gb=info.get("cpu_ram_gb", 0),
                        default_params=info.get("default_params", {}),
                        description=info.get("description", ""),
                        description_zh=info.get("description_zh", ""),
                        max_tensor_parallel_size=info.get("max_tensor_parallel_size"),
                    )
                    self._register(model)
            except (yaml.YAMLError, OSError):
                pass

    def _register(self, model: ModelInfo) -> None:
        """Register a model."""
        self._models[model.name.lower()] = model

        # Register aliases
        for alias in model.aliases:
            self._aliases[alias.lower()] = model.name.lower()

    def get(self, name: str) -> Optional[ModelInfo]:
        """Get a model by exact name or alias."""
        name_lower = name.lower()

        # Check direct match
        if name_lower in self._models:
            return self._models[name_lower]

        # Check aliases
        if name_lower in self._aliases:
            return self._models[self._aliases[name_lower]]

        return None

    def search(self, query: str, limit: int = 10) -> list[ModelInfo]:
        """Search for models using fuzzy matching.

        Args:
            query: Search query
            limit: Maximum number of results

        Returns:
            List of matching models, sorted by relevance
        """
        query_lower = query.lower()
        results: list[tuple[float, ModelInfo]] = []

        for model in self._models.values():
            score = self._match_score(query_lower, model)
            if score > 0:
                results.append((score, model))

        # Sort by score descending
        results.sort(key=lambda x: x[0], reverse=True)

        return [model for _, model in results[:limit]]

    def _match_score(self, query: str, model: ModelInfo) -> float:
        """Calculate match score for a model.

        Returns a score between 0 and 1, where 1 is an exact match.
        """
        # Check exact match
        if query == model.name.lower():
            return 1.0

        # Check alias exact match
        for alias in model.aliases:
            if query == alias.lower():
                return 0.95

        # Check if query is contained in name
        if query in model.name.lower():
            return 0.8

        # Check if query is contained in aliases
        for alias in model.aliases:
            if query in alias.lower():
                return 0.7

        # Check if query is contained in hf_repo
        if query in model.hf_repo.lower():
            return 0.6

        # Fuzzy matching - check if all query parts are present
        query_parts = re.split(r"[-_.\s]", query)
        name_lower = model.name.lower()

        matches = sum(1 for part in query_parts if part and part in name_lower)
        if matches > 0:
            return 0.5 * (matches / len(query_parts))

        return 0.0

    def list_all(self) -> list[ModelInfo]:
        """List all registered models."""
        return list(self._models.values())

    def find_local_models(self, max_depth: int = 3) -> list[tuple[ModelInfo, Path]]:
        """Find models that are downloaded locally in any configured model path.

        Args:
            max_depth: Maximum depth to search within each model path (default: 3)

        Returns:
            List of (ModelInfo, path) tuples for local models
        """
        settings = get_settings()
        model_paths = settings.get_model_paths()
        results = []

        for model in self._models.values():
            found = False
            # Search in all configured model directories
            for models_dir in model_paths:
                if not models_dir.exists():
                    continue

                # Generate possible names to search for
                possible_names = [
                    model.name,
                    model.name.lower(),
                    model.hf_repo.split("/")[-1],
                    model.hf_repo.replace("/", "--"),
                ]

                # Search recursively up to max_depth
                for depth in range(max_depth):
                    # Build glob pattern for current depth
                    # depth=0: direct children, depth=1: grandchildren, etc.
                    glob_pattern = "*" if depth > 0 else ""
                    for _ in range(depth):
                        glob_pattern = "*/" + glob_pattern if glob_pattern else "*"

                    for name in possible_names:
                        if depth == 0:
                            # Direct children: models_dir / name
                            search_paths = [models_dir / name]
                        else:
                            # Nested: use rglob to find directories matching the name
                            search_paths = list(models_dir.rglob(name))

                        for path in search_paths:
                            if path.exists() and (path / "config.json").exists():
                                results.append((model, path))
                                found = True
                                break

                        if found:
                            break

                    if found:
                        break

                if found:
                    break

        return results


# Global registry instance
_registry: Optional[ModelRegistry] = None


def get_registry() -> ModelRegistry:
    """Get the global model registry instance."""
    global _registry
    if _registry is None:
        _registry = ModelRegistry()
    return _registry


# ============================================================================
# Model-specific parameter computation functions
# ============================================================================


def compute_deepseek_v3_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    per_gpu_gb = 16
    if vram_per_gpu_gb < per_gpu_gb:
        return int(0)
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))

    return total_vram // 3


def compute_kimi_k2_thinking_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    """Compute kt-num-gpu-experts for Kimi K2 Thinking."""
    per_gpu_gb = 16
    if vram_per_gpu_gb < per_gpu_gb:
        return int(0)
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))

    return total_vram * 2 // 3


def compute_minimax_m2_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    """Compute kt-num-gpu-experts for MiniMax M2/M2.1."""
    per_gpu_gb = 16
    if vram_per_gpu_gb < per_gpu_gb:
        return int(0)
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))

    return total_vram // 1


# Model name to computation function mapping
MODEL_COMPUTE_FUNCTIONS: dict[str, Callable[[int, float], int]] = {
    "DeepSeek-V3-0324": compute_deepseek_v3_gpu_experts,
    "DeepSeek-V3.2": compute_deepseek_v3_gpu_experts,  # Same as V3-0324
    "DeepSeek-R1-0528": compute_deepseek_v3_gpu_experts,  # Same as V3-0324
    "Kimi-K2-Thinking": compute_kimi_k2_thinking_gpu_experts,
    "MiniMax-M2": compute_minimax_m2_gpu_experts,
    "MiniMax-M2.1": compute_minimax_m2_gpu_experts,  # Same as M2
}


================================================
FILE: kt-kernel/python/cli/utils/model_scanner.py
================================================
"""
Model Scanner

Scans directories for model files (safetensors, gguf) and identifies models
"""

from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Set, Tuple, Dict
from collections import defaultdict
import os
import subprocess
import json


@dataclass
class ScannedModel:
    """Temporary structure for scanned model information"""

    path: str  # Absolute path to model directory
    format: str  # "safetensors" | "gguf" | "mixed"
    size_bytes: int  # Total size in bytes
    file_count: int  # Number of model files
    files: List[str]  # List of model file names

    @property
    def size_gb(self) -> float:
        """Get size in GB"""
        return self.size_bytes / (1024**3)

    @property
    def folder_name(self) -> str:
        """Get the folder name (default model name)"""
        return Path(self.path).name


class ModelScanner:
    """Scanner for discovering models in directory trees"""

    def __init__(self, min_size_gb: float = 10.0):
        """
        Initialize scanner

        Args:
            min_size_gb: Minimum folder size in GB to be considered a model
        """
        self.min_size_bytes = int(min_size_gb * 1024**3)

    def scan_directory(
        self, base_path: Path, exclude_paths: Optional[Set[str]] = None
    ) -> Tuple[List[ScannedModel], List[str]]:
        """
        Scan directory tree for models

        Args:
            base_path: Root directory to scan
            exclude_paths: Set of absolute paths to exclude from results

        Returns:
            Tuple of (valid_models, warnings)
            - valid_models: List of ScannedModel instances
            - warnings: List of warning messages
        """
        if not base_path.exists():
            raise ValueError(f"Path does not exist: {base_path}")

        if not base_path.is_dir():
            raise ValueError(f"Path is not a directory: {base_path}")

        exclude_paths = exclude_paths or set()
        results: List[ScannedModel] = []
        warnings: List[str] = []

        # Walk the directory tree
        for root, dirs, files in os.walk(base_path):
            root_path = Path(root).resolve()

            # Skip if already registered
            if str(root_path) in exclude_paths:
                dirs[:] = []  # Don't descend into this directory
                continue

            # Check for model files
            safetensors_files = [f for f in files if f.endswith(".safetensors")]
            gguf_files = [f for f in files if f.endswith(".gguf")]

            if not safetensors_files and not gguf_files:
                continue  # No model files in this directory

            # Calculate total size
            model_files = safetensors_files + gguf_files
            total_size = self._calculate_total_size(root_path, model_files)

            # Check if size meets minimum threshold
            if total_size < self.min_size_bytes:
                continue  # Too small, but keep scanning subdirectories

            # Detect format
            if safetensors_files and gguf_files:
                # Mixed format - issue warning
                warnings.append(
                    f"Mixed format detected in {root_path}: "
                    f"{len(safetensors_files)} safetensors + {len(gguf_files)} gguf files. "
                    "Please separate into different folders and re-scan."
                )
                dirs[:] = []  # Don't descend into mixed format directories
                continue

            # Determine format
            format_type = "safetensors" if safetensors_files else "gguf"

            # Create scanned model
            scanned = ScannedModel(
                path=str(root_path),
                format=format_type,
                size_bytes=total_size,
                file_count=len(model_files),
                files=model_files,
            )

            results.append(scanned)

            # Continue scanning subdirectories - they might also contain models
            # Each subdirectory will be independently checked for size >= 10GB

        return results, warnings

    def scan_single_path(self, path: Path) -> Optional[ScannedModel]:
        """
        Scan a single path for model files

        Args:
            path: Path to scan

        Returns:
            ScannedModel instance or None if not a valid model
        """
        if not path.exists() or not path.is_dir():
            return None

        # Find model files
        safetensors_files = list(path.glob("*.safetensors"))
        gguf_files = list(path.glob("*.gguf"))

        if not safetensors_files and not gguf_files:
            return None

        # Check for mixed format
        if safetensors_files and gguf_files:
            raise ValueError(
                f"Mixed format detected: {len(safetensors_files)} safetensors + "
                f"{len(gguf_files)} gguf files. Please use a single format."
            )

        # Calculate size
        model_files = [f.name for f in safetensors_files + gguf_files]
        total_size = self._calculate_total_size(path, model_files)

        # Determine format
        format_type = "safetensors" if safetensors_files else "gguf"

        return ScannedModel(
            path=str(path.resolve()),
            format=format_type,
            size_bytes=total_size,
            file_count=len(model_files),
            files=model_files,
        )

    def _calculate_total_size(self, directory: Path, filenames: List[str]) -> int:
        """
        Calculate total size of specified files in directory

        Args:
            directory: Directory containing the files
            filenames: List of filenames to sum

        Returns:
            Total size in bytes
        """
        total = 0
        for filename in filenames:
            file_path = directory / filename
            if file_path.exists():
                try:
                    total += file_path.stat().st_size
                except OSError:
                    # File might be inaccessible, skip it
                    pass
        return total


# Convenience functions


def scan_directory(
    base_path: Path, min_size_gb: float = 10.0, exclude_paths: Optional[Set[str]] = None
) -> Tuple[List[ScannedModel], List[str]]:
    """
    Convenience function to scan a directory

    Args:
        base_path: Root directory to scan
        min_size_gb: Minimum folder size in GB
        exclude_paths: Set of paths to exclude

    Returns:
        Tuple of (models, warnings)
    """
    scanner = ModelScanner(min_size_gb=min_size_gb)
    return scanner.scan_directory(base_path, exclude_paths)


def scan_single_path(path: Path) -> Optional[ScannedModel]:
    """
    Convenience function to scan a single path

    Args:
        path: Path to scan

    Returns:
        ScannedModel or None
    """
    scanner = ModelScanner()
    return scanner.scan_single_path(path)


def format_size(size_bytes: int) -> str:
    """
    Format size in bytes to human-readable string

    Args:
        size_bytes: Size in bytes

    Returns:
        Formatted string (e.g., "42.3 GB")
    """
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if size_bytes < 1024.0:
            return f"{size_bytes:.1f} {unit}"
        size_bytes /= 1024.0
    return f"{size_bytes:.1f} PB"


# ===== Fast Scanning with Find Command and Tree-based Root Detection =====


def find_files_fast(mount_point: str, pattern: str, max_depth: int = 6, timeout: int = 30) -> List[str]:
    """
    Use find command to quickly locate files

    Args:
        mount_point: Starting directory
        pattern: File pattern (e.g., "config.json", "*.gguf")
        max_depth: Maximum directory depth (default: 6)
        timeout: Command timeout in seconds

    Returns:
        List of absolute file paths
    """
    try:
        # Use shell=False for better security and handling of special characters in paths
        cmd = ["find", mount_point, "-maxdepth", str(max_depth), "-name", pattern, "-type", "f"]
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            timeout=timeout,
        )

        # Return results even if returncode is non-zero (due to permission errors)
        # As long as we got some output
        if result.stdout:
            return [line.strip() for line in result.stdout.strip().split("\n") if line.strip()]
        return []
    except (subprocess.TimeoutExpired, FileNotFoundError):
        return []


def is_valid_model_directory(directory: Path, min_size_gb: float = 10.0) -> Tuple[bool, Optional[str]]:
    """
    Check if a directory is a valid model directory

    Args:
        directory: Path to check
        min_size_gb: Minimum size in GB

    Returns:
        (is_valid, model_type) where model_type is "safetensors", "gguf", or None
    """
    if not directory.exists() or not directory.is_dir():
        return False, None

    has_config = (directory / "config.json").exists()
    safetensors_files = list(directory.glob("*.safetensors"))
    gguf_files = list(directory.glob("*.gguf"))

    # Determine model type
    model_type = None
    if (has_config and safetensors_files) or safetensors_files:
        model_type = "safetensors"
    elif gguf_files:
        model_type = "gguf"
    else:
        return False, None

    # Check size - only count model files (fast!)
    total_size = 0
    if model_type == "safetensors":
        for f in safetensors_files:
            try:
                total_size += f.stat().st_size
            except OSError:
                pass
    else:  # gguf
        for f in gguf_files:
            try:
                total_size += f.stat().st_size
            except OSError:
                pass

    size_gb = total_size / (1024**3)
    if size_gb < min_size_gb:
        return False, None

    return True, model_type


def scan_all_models_fast(mount_points: List[str], min_size_gb: float = 10.0, max_depth: int = 6) -> List[str]:
    """
    Fast scan for all model paths using find command

    Args:
        mount_points: List of mount points to scan
        min_size_gb: Minimum model size in GB
        max_depth: Maximum search depth (default: 6)

    Returns:
        List of valid model directory paths
    """
    model_paths = set()

    for mount in mount_points:
        if not os.path.exists(mount):
            continue

        # Find all config.json files
        config_files = find_files_fast(mount, "config.json", max_depth=max_depth)
        for config_path in config_files:
            model_dir = Path(config_path).parent
            is_valid, model_type = is_valid_model_directory(model_dir, min_size_gb)
            if is_valid:
                model_paths.add(str(model_dir.resolve()))

        # Find all *.gguf files
        gguf_files = find_files_fast(mount, "*.gguf", max_depth=max_depth)
        for gguf_path in gguf_files:
            model_dir = Path(gguf_path).parent
            is_valid, model_type = is_valid_model_directory(model_dir, min_size_gb)
            if is_valid:
                model_paths.add(str(model_dir.resolve()))

    return sorted(model_paths)


def get_root_subdirs() -> List[str]:
    """
    Get subdirectories of / that are worth scanning

    Filters out system paths only

    Returns:
        List of directories to scan
    """
    # System paths to exclude
    excluded = {
        "dev",
        "proc",
        "sys",
        "run",
        "boot",
        "tmp",
        "usr",
        "lib",
        "lib64",
        "bin",
        "sbin",
        "etc",
        "opt",
        "var",
        "snap",
    }

    scan_dirs = []

    try:
        for entry in os.scandir("/"):
            if not entry.is_dir():
                continue

            # Skip excluded paths
            if entry.name in excluded:
                continue

            scan_dirs.append(entry.path)

    except PermissionError:
        pass

    return sorted(scan_dirs)


def scan_directory_for_models(directory: str, min_file_size_gb: float = 2.0) -> Dict[str, tuple]:
    """
    Scan a directory for models using find command with size filter

    Uses find -size +2G to only locate large model files (>=2GB)

    Args:
        directory: Directory to scan
        min_file_size_gb: Minimum individual file size in GB (default: 2.0)

    Returns:
        Dict mapping model_path -> (model_type, size_bytes, file_count, files)
    """
    model_info = {}

    # Convert GB to find's format (e.g., 2GB = +2G)
    if min_file_size_gb >= 1.0:
        size_filter = f"+{int(min_file_size_gb)}G"
    else:
        size_mb = int(min_file_size_gb * 1024)
        size_filter = f"+{size_mb}M"

    # 1. Find *.gguf files >= 2GB
    gguf_cmd = ["find", directory, "-name", "*.gguf", "-type", "f", "-size", size_filter, "-printf", "%p\t%s\n"]
    result = subprocess.run(gguf_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, timeout=120)

    # Group by directory
    gguf_dirs = defaultdict(list)
    for line in result.stdout.strip().split("\n"):
        if not line:
            continue
        parts = line.split("\t")
        if len(parts) != 2:
            continue
        file_path, size_str = parts
        file_path_obj = Path(file_path)
        dir_path = str(file_path_obj.parent)
        gguf_dirs[dir_path].append((file_path_obj.name, int(size_str)))

    # Add all gguf directories
    for dir_path, files in gguf_dirs.items():
        total_size = sum(size for _, size in files)
        model_info[dir_path] = ("gguf", total_size, len(files), [name for name, _ in files])

    # 2. Find *.safetensors files >= 2GB
    safetensors_cmd = ["find", directory, "-name", "*.safetensors", "-type", "f", "-size", size_filter, "-printf", "%p\t%s\n"]
    result = subprocess.run(safetensors_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, timeout=120)

    # Group by directory
    safetensors_dirs = defaultdict(list)
    for line in result.stdout.strip().split("\n"):
        if not line:
            continue
        parts = line.split("\t")
        if len(parts) != 2:
            continue
        file_path, size_str = parts
        file_path_obj = Path(file_path)
        dir_path = str(file_path_obj.parent)
        safetensors_dirs[dir_path].append((file_path_obj.name, int(size_str)))

    # 3. Check each safetensors directory for config.json
    for dir_path, files in safetensors_dirs.items():
        if os.path.exists(os.path.join(dir_path, "config.json")):
            total_size = sum(size for _, size in files)
            model_info[dir_path] = ("safetensors", total_size, len(files), [name for name, _ in files])

    return model_info


def scan_all_models_with_info(
    mount_points: Optional[List[str]] = None, min_size_gb: float = 10.0, max_depth: int = 6
) -> Dict[str, tuple]:
    """
    Fast scan with parallel directory scanning

    Strategy:
    1. Use provided directories or auto-detect root subdirectories
    2. Scan each directory in parallel (one thread per directory)
    3. Use find -size +2G to find large model files (>=2GB)

    Args:
        mount_points: Specific directories to scan, or None to auto-detect from / subdirs
        min_size_gb: Not used anymore (kept for API compatibility)
        max_depth: Not used anymore (kept for API compatibility)

    Returns:
        Dict mapping model_path -> (model_type, size_bytes, file_count, files)
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Get directories to scan
    if mount_points is None:
        # Get root subdirectories (exclude system paths)
        scan_dirs = get_root_subdirs()
    else:
        scan_dirs = mount_points

    if not scan_dirs:
        return {}

    model_info = {}

    # Scan each directory in parallel (max 8 concurrent)
    # Use 2GB threshold to find model files
    with ThreadPoolExecutor(max_workers=min(len(scan_dirs), 8)) as executor:
        futures = {executor.submit(scan_directory_for_models, d, 2.0): d for d in scan_dirs}

        for future in as_completed(futures):
            try:
                dir_results = future.result()
                model_info.update(dir_results)
            except Exception as e:
                # Skip directories with errors
                pass

    return model_info


def find_model_roots_from_paths(model_paths: List[str]) -> Tuple[List[str], Dict[str, int]]:
    """
    Find optimal root paths from model paths using tree-based algorithm

    Algorithm:
    1. Build path tree with all intermediate paths
    2. DFS to calculate f(x) = subtree sum (number of models in subtree)
    3. Find roots where f(parent) = f(x) > max(f(children))

    Args:
        model_paths: List of model directory paths

    Returns:
        (root_paths, subtree_sizes) where:
        - root_paths: List of inferred root directories
        - subtree_sizes: Dict mapping each root to number of models
    """
    if not model_paths:
        return [], {}

    # 1. Build path set (including all intermediate paths)
    all_paths = set()
    model_set = set(model_paths)

    for model_path in model_paths:
        path = Path(model_path)
        for i in range(1, len(path.parts) + 1):
            all_paths.add(str(Path(*path.parts[:i])))

    # 2. Build parent-child relationships
    children_map = defaultdict(list)
    for path in all_paths:
        path_obj = Path(path)
        if len(path_obj.parts) > 1:
            parent = str(path_obj.parent)
            if parent in all_paths:
                children_map[parent].append(path)

    # 3. DFS to calculate f(x) and max_child_f(x)
    f = {}  # path -> subtree sum
    max_child_f = {}  # path -> max(f(children))
    visited = set()

    def dfs(path: str) -> int:
        if path in visited:
            return f[path]
        visited.add(path)

        # Current node weight (1 if it's a model path, 0 otherwise)
        weight = 1 if path in model_set else 0

        # Recursively calculate children
        children = children_map.get(path, [])
        if not children:
            # Leaf node
            f[path] = weight
            max_child_f[path] = 0
            return weight

        # Calculate f values for all children
        children_f_values = [dfs(child) for child in children]

        # Calculate f(x) and max_child_f(x)
        f[path] = weight + sum(children_f_values)
        max_child_f[path] = max(children_f_values) if children_f_values else 0

        return f[path]

    # Find top-level nodes (no parent in all_paths)
    top_nodes = []
    for path in all_paths:
        parent = str(Path(path).parent)
        if parent not in all_paths or parent == path:
            top_nodes.append(path)

    # Execute DFS from all top nodes
    for top in top_nodes:
        dfs(top)

    # 4. Find root nodes: f(parent) = f(x) >= max(f(children))
    # Note: Use >= instead of > to handle the case where a directory contains only one model
    candidate_roots = []
    for path in all_paths:
        # Skip model paths themselves (leaf nodes in model tree)
        if path in model_set:
            continue

        parent = str(Path(path).parent)

        # Check condition: f(parent) = f(x) and f(x) >= max(f(children))
        if parent in f and f.get(parent, 0) == f.get(path, 0):
            if f.get(path, 0) >= max_child_f.get(path, 0) and f.get(path, 0) > 0:
                candidate_roots.append(path)

    # 5. Remove redundant roots (prefer deeper paths)
    # If a root is an ancestor of another root with the same f value, remove it
    roots = []
    candidate_roots_sorted = sorted(candidate_roots, key=lambda p: -len(Path(p).parts))

    for root in candidate_roots_sorted:
        # Check if this root is a parent of any already selected root
        is_redundant = False
        for selected in roots:
            if selected.startswith(root + "/"):
                # selected is a child of root
                # Only keep root if it has more models (shouldn't happen by algorithm)
                if f.get(root, 0) == f.get(selected, 0):
                    is_redundant = True
                    break

        if not is_redundant:
            # Also filter out very shallow paths (< 3 levels)
            if len(Path(root).parts) >= 3:
                roots.append(root)

    # Build subtree sizes for roots
    subtree_sizes = {root: f.get(root, 0) for root in roots}

    return sorted(roots), subtree_sizes


@dataclass
class ModelRootInfo:
    """Information about a detected model root path"""

    path: str
    model_count: int
    models: List[ScannedModel]


def discover_models(
    mount_points: Optional[List[str]] = None, min_size_gb: float = 10.0, max_depth: int = 6
) -> List[ScannedModel]:
    """
    Discover all model directories on the system

    Fast scan using find command to locate all models that meet the criteria

    Args:
        mount_points: List of mount points to scan (None = auto-detect)
        min_size_gb: Minimum model size in GB (default: 10.0)
        max_depth: Maximum search depth (default: 6)

    Returns:
        List of ScannedModel sorted by path
    """
    # Auto-detect mount points if not provided
    if mount_points is None:
        mount_points = _get_mount_points()

    # Fast scan with cached info (only scan once!)
    model_info = scan_all_models_with_info(mount_points, min_size_gb, max_depth)

    if not model_info:
        return []

    # Convert to ScannedModel objects
    results = []
    for model_path, (model_type, total_size, file_count, files) in model_info.items():
        results.append(
            ScannedModel(path=model_path, format=model_type, size_bytes=total_size, file_count=file_count, files=files)
        )

    # Sort by path
    results.sort(key=lambda m: m.path)
    return results


def _get_mount_points() -> List[str]:
    """
    Get all valid mount points from /proc/mounts, filtering out system paths

    Returns:
        List of mount point paths suitable for model storage
        (excludes root "/" to avoid scanning entire filesystem)
    """
    mount_points = set()

    # System paths to exclude (unlikely to contain model files)
    excluded_paths = [
        "/snap/",
        "/proc/",
        "/sys/",
        "/run/",
        "/boot",
        "/dev/",
        "/usr",
        "/lib",
        "/lib64",
        "/bin",
        "/sbin",
        "/etc",
        "/opt",
        "/var",
        "/tmp",
    ]

    try:
        with open("/proc/mounts", "r") as f:
            for line in f:
                parts = line.split()
                if len(parts) < 3:
                    continue

                device, mount_point, fs_type = parts[0], parts[1], parts[2]

                # Filter out pseudo filesystems
                pseudo_fs = {
                    "proc",
                    "sysfs",
                    "devpts",
                    "tmpfs",
                    "devtmpfs",
                    "cgroup",
                    "cgroup2",
                    "pstore",
                    "bpf",
                    "tracefs",
                    "debugfs",
                    "hugetlbfs",
                    "mqueue",
                    "configfs",
                    "securityfs",
                    "fuse.gvfsd-fuse",
                    "fusectl",
                    "squashfs",
                    "overlay",  # snap packages
                }

                if fs_type in pseudo_fs:
                    continue

                # Skip root directory (too large to scan)
                if mount_point == "/":
                    continue

                # Filter out system paths
                if any(mount_point.startswith(x) for x in excluded_paths):
                    continue

                # Only include if it exists and is readable
                if os.path.exists(mount_point) and os.access(mount_point, os.R_OK):
                    mount_points.add(mount_point)

        # If no mount points found, add common data directories
        if not mount_points:
            # Add /home if it exists and is not already a separate mount point
            common_paths = ["/home", "/data", "/mnt"]
            for path in common_paths:
                if os.path.exists(path) and os.access(path, os.R_OK):
                    mount_points.add(path)

    except (FileNotFoundError, PermissionError):
        # Fallback to common paths
        mount_points = {"/home", "/mnt", "/data"}

    return sorted(mount_points)


================================================
FILE: kt-kernel/python/cli/utils/model_table_builder.py
================================================
"""
Shared model table builders for consistent UI across commands.

Provides reusable table construction functions for displaying models
in kt model list, kt quant, kt run, etc.
"""

from typing import List, Optional, Tuple
from pathlib import Path
from rich.table import Table
from rich.console import Console
import json


def format_model_size(model_path: Path, format_type: str) -> str:
    """Calculate and format model size."""
    from kt_kernel.cli.utils.model_scanner import format_size

    try:
        if format_type == "safetensors":
            files = list(model_path.glob("*.safetensors"))
        elif format_type == "gguf":
            files = list(model_path.glob("*.gguf"))
        else:
            return "[dim]-[/dim]"

        total_size = sum(f.stat().st_size for f in files if f.exists())
        return format_size(total_size)
    except Exception:
        return "[dim]-[/dim]"


def format_repo_info(model) -> str:
    """Format repository information."""
    if model.repo_id:
        repo_abbr = "hf" if model.repo_type == "huggingface" else "ms"
        return f"{repo_abbr}:{model.repo_id}"
    return "[dim]-[/dim]"


def format_sha256_status(model, status_map: dict) -> str:
    """Format SHA256 verification status."""
    return status_map.get(model.sha256_status or "not_checked", "[dim]?[/dim]")


def build_moe_gpu_table(
    models: List, status_map: dict, show_index: bool = True, start_index: int = 1
) -> Tuple[Table, List]:
    """
    Build MoE GPU models table.

    Args:
        models: List of MoE GPU model objects
        status_map: SHA256_STATUS_MAP for formatting status
        show_index: Whether to show # column for selection (default: True)
        start_index: Starting index number

    Returns:
        Tuple of (Table object, list of models in display order)
    """
    table = Table(show_header=True, header_style="bold", show_lines=False)

    if show_index:
        table.add_column("#", justify="right", style="cyan", no_wrap=True)

    table.add_column("Name", style="cyan", no_wrap=True)
    table.add_column("Path", style="dim", overflow="fold")
    table.add_column("Total", justify="right")
    table.add_column("Exps", justify="center", style="yellow")
    table.add_column("Act", justify="center", style="green")
    table.add_column("Repository", style="dim", overflow="fold")
    table.add_column("SHA256", justify="center")

    displayed_models = []

    for i, model in enumerate(models, start_index):
        displayed_models.append(model)

        # Calculate size
        size_str = format_model_size(Path(model.path), "safetensors")

        # MoE info
        num_experts = str(model.moe_num_experts) if model.moe_num_experts else "[dim]-[/dim]"
        num_active = str(model.moe_num_experts_per_tok) if model.moe_num_experts_per_tok else "[dim]-[/dim]"

        # Repository and SHA256
        repo_str = format_repo_info(model)
        sha256_str = format_sha256_status(model, status_map)

        row = []
        if show_index:
            row.append(str(i))

        row.extend([model.name, model.path, size_str, num_experts, num_active, repo_str, sha256_str])

        table.add_row(*row)

    return table, displayed_models


def build_amx_table(
    models: List,
    status_map: dict = None,  # Kept for API compatibility but not used
    show_index: bool = True,
    start_index: int = 1,
    show_linked_gpus: bool = False,
    gpu_models: Optional[List] = None,
) -> Tuple[Table, List]:
    """
    Build AMX models table.

    Note: AMX models are locally quantized, so no SHA256 verification column.

    Args:
        models: List of AMX model objects
        status_map: (Unused - kept for API compatibility)
        show_index: Whether to show # column for selection (default: True)
        start_index: Starting index number
        show_linked_gpus: Whether to show sub-rows for linked GPU models
        gpu_models: List of GPU models (required if show_linked_gpus=True)

    Returns:
        Tuple of (Table object, list of models in display order)
    """
    table = Table(show_header=True, header_style="bold", show_lines=False)

    if show_index:
        table.add_column("#", justify="right", style="cyan", no_wrap=True)

    table.add_column("Name", style="cyan", no_wrap=True)
    table.add_column("Path", style="dim", overflow="fold")
    table.add_column("Total", justify="right")
    table.add_column("Method", justify="center", style="yellow")
    table.add_column("NUMA", justify="center", style="green")
    table.add_column("Source", style="dim", overflow="fold")

    # Build reverse map if needed
    amx_used_by_gpu = {}
    if show_linked_gpus and gpu_models:
        for model in models:
            if model.gpu_model_ids:
                gpu_names = []
                for gpu_id in model.gpu_model_ids:
                    for gpu_model in gpu_models:
                        if gpu_model.id == gpu_id:
                            gpu_names.append(gpu_model.name)
                            break
                if gpu_names:
                    amx_used_by_gpu[model.id] = gpu_names

    displayed_models = []

    for i, model in enumerate(models, start_index):
        displayed_models.append(model)

        # Calculate size
        size_str = format_model_size(Path(model.path), "safetensors")

        # Read metadata from config.json or UserModel fields
        method_from_config = None
        numa_from_config = None
        try:
            config_path = Path(model.path) / "config.json"
            if config_path.exists():
                with open(config_path, "r", encoding="utf-8") as f:
                    config = json.load(f)
                    amx_quant = config.get("amx_quantization", {})
                    if amx_quant.get("converted"):
                        method_from_config = amx_quant.get("method")
                        numa_from_config = amx_quant.get("numa_count")
        except Exception:
            pass

        # Priority: UserModel fields > config.json > ?
        method_display = (
            model.amx_quant_method.upper()
            if model.amx_quant_method
            else method_from_config.upper() if method_from_config else "[dim]?[/dim]"
        )
        numa_display = (
            str(model.amx_numa_nodes)
            if model.amx_numa_nodes
            else str(numa_from_config) if numa_from_config else "[dim]?[/dim]"
        )
        source_display = model.amx_source_model or "[dim]-[/dim]"

        row = []
        if show_index:
            row.append(str(i))

        row.extend([model.name, model.path, size_str, method_display, numa_display, source_display])

        table.add_row(*row)

        # Add sub-row showing linked GPUs
        if show_linked_gpus and model.id in amx_used_by_gpu:
            gpu_list = amx_used_by_gpu[model.id]
            gpu_names_str = ", ".join([f"[dim]{name}[/dim]" for name in gpu_list])
            sub_row = []
            if show_index:
                sub_row.append("")
            sub_row.extend([f"  [dim]↳ GPU: {gpu_names_str}[/dim]", "", "", "", "", ""])
            table.add_row(*sub_row, style="dim")

    return table, displayed_models


def build_gguf_table(
    models: List, status_map: dict, show_index: bool = True, start_index: int = 1
) -> Tuple[Table, List]:
    """
    Build GGUF models table.

    Args:
        models: List of GGUF model objects
        status_map: SHA256_STATUS_MAP for formatting status
        show_index: Whether to show # column for selection (default: True)
        start_index: Starting index number

    Returns:
        Tuple of (Table object, list of models in display order)
    """
    table = Table(show_header=True, header_style="bold", show_lines=False)

    if show_index:
        table.add_column("#", justify="right", style="cyan", no_wrap=True)

    table.add_column("Name", style="cyan", no_wrap=True)
    table.add_column("Path", style="dim", overflow="fold")
    table.add_column("Total", justify="right")
    table.add_column("Repository", style="dim", overflow="fold")
    table.add_column("SHA256", justify="center")

    displayed_models = []

    for i, model in enumerate(models, start_index):
        displayed_models.append(model)

        # Calculate size
        size_str = format_model_size(Path(model.path), "gguf")

        # Repository and SHA256
        repo_str = format_repo_info(model)
        sha256_str = format_sha256_status(model, status_map)

        row = []
        if show_index:
            row.append(str(i))

        row.extend([model.name, model.path, size_str, repo_str, sha256_str])

        table.add_row(*row)

    return table, displayed_models


================================================
FILE: kt-kernel/python/cli/utils/model_verifier.py
================================================
"""
Model Verifier

SHA256 verification for model integrity
"""

import hashlib
import requests
import os
from pathlib import Path
from typing import Dict, Any, Literal, Tuple
from concurrent.futures import ProcessPoolExecutor, as_completed


def _compute_file_sha256(file_path: Path) -> Tuple[str, str, float]:
    """
    Compute SHA256 for a single file (worker function for multiprocessing).

    Args:
        file_path: Path to the file

    Returns:
        Tuple of (filename, sha256_hash, file_size_mb)
    """
    sha256_hash = hashlib.sha256()
    file_size_mb = file_path.stat().st_size / (1024 * 1024)

    # Read file in chunks to handle large files
    with open(file_path, "rb") as f:
        for byte_block in iter(lambda: f.read(8192 * 1024), b""):  # 8MB chunks
            sha256_hash.update(byte_block)

    return file_path.name, sha256_hash.hexdigest(), file_size_mb


def check_huggingface_connectivity(timeout: int = 5) -> Tuple[bool, str]:
    """
    Check if HuggingFace is accessible.

    Args:
        timeout: Connection timeout in seconds

    Returns:
        Tuple of (is_accessible, message)
    """
    test_url = "https://huggingface.co"

    try:
        response = requests.head(test_url, timeout=timeout, allow_redirects=True)
        if response.status_code < 500:  # 2xx, 3xx, 4xx are all considered "accessible"
            return True, "HuggingFace is accessible"
    except requests.exceptions.Timeout:
        return False, f"Connection to {test_url} timed out"
    except requests.exceptions.ConnectionError:
        return False, f"Cannot connect to {test_url}"
    except requests.exceptions.RequestException as e:
        return False, f"Connection error: {str(e)}"

    return False, "Unknown connection error"


def verify_model_integrity(
    repo_type: Literal["huggingface", "modelscope"],
    repo_id: str,
    local_dir: Path,
    progress_callback=None,
) -> Dict[str, Any]:
    """
    Verify local model integrity against remote repository SHA256 hashes.

    Verifies all important files:
    - *.safetensors (weights)
    - *.json (config files)
    - *.py (custom model code)

    Args:
        repo_type: Type of repository ("huggingface" or "modelscope")
        repo_id: Repository ID (e.g., "deepseek-ai/DeepSeek-V3")
        local_dir: Local directory containing model files
        progress_callback: Optional callback function(message: str) for progress updates

    Returns:
        Dictionary with verification results:
        {
            "status": "passed" | "failed" | "error",
            "files_checked": int,
            "files_passed": int,
            "files_failed": [list of filenames],
            "error_message": str (optional)
        }
    """

    def report_progress(msg: str):
        """Helper to report progress"""
        if progress_callback:
            progress_callback(msg)

    try:
        # Convert repo_type to platform format
        platform = "hf" if repo_type == "huggingface" else "ms"

        # 1. Fetch official SHA256 hashes from remote
        report_progress("Fetching official SHA256 hashes from remote repository...")
        official_hashes = fetch_model_sha256(repo_id, platform)
        report_progress(f"✓ Fetched {len(official_hashes)} file hashes from remote")

        if not official_hashes:
            return {
                "status": "error",
                "files_checked": 0,
                "files_passed": 0,
                "files_failed": [],
                "error_message": f"No verifiable files found in remote repository: {repo_id}",
            }

        # 2. Calculate local SHA256 hashes with progress
        report_progress(f"Calculating SHA256 for local files...")

        # Get all local files matching the patterns
        local_files = []
        for pattern in ["*.safetensors", "*.json", "*.py"]:
            local_files.extend([f for f in local_dir.glob(pattern) if f.is_file()])

        if not local_files:
            return {
                "status": "error",
                "files_checked": 0,
                "files_passed": 0,
                "files_failed": [],
                "error_message": f"No verifiable files found in local directory: {local_dir}",
            }

        # Calculate hashes for all files
        local_hashes = calculate_local_sha256(
            local_dir,
            file_pattern="*.safetensors",  # Unused when files_list is provided
            progress_callback=report_progress,
            files_list=local_files,
        )
        report_progress(f"✓ Calculated {len(local_hashes)} local file hashes")

        # 3. Compare hashes with progress
        report_progress(f"Comparing {len(official_hashes)} files...")
        files_failed = []
        files_missing = []
        files_passed = 0

        for idx, (filename, official_hash) in enumerate(official_hashes.items(), 1):
            # Handle potential path separators in filename
            file_basename = Path(filename).name

            # Try to find the file in local hashes
            local_hash = None
            for local_file, local_hash_value in local_hashes.items():
                if Path(local_file).name == file_basename:
                    local_hash = local_hash_value
                    break

            if local_hash is None:
                files_missing.append(filename)
                report_progress(f"  [{idx}/{len(official_hashes)}] ✗ {file_basename} - MISSING")
            elif local_hash.lower() != official_hash.lower():
                files_failed.append(f"{filename} (hash mismatch)")
                report_progress(f"  [{idx}/{len(official_hashes)}] ✗ {file_basename} - HASH MISMATCH")
            else:
                files_passed += 1
                report_progress(f"  [{idx}/{len(official_hashes)}] ✓ {file_basename}")

        # 4. Return results
        total_checked = len(official_hashes)

        if files_failed or files_missing:
            all_failed = files_failed + [f"{f} (missing)" for f in files_missing]
            return {
                "status": "failed",
                "files_checked": total_checked,
                "files_passed": files_passed,
                "files_failed": all_failed,
                "error_message": f"{len(all_failed)} file(s) failed verification",
            }
        else:
            return {
                "status": "passed",
                "files_checked": total_checked,
                "files_passed": files_passed,
                "files_failed": [],
            }

    except ImportError as e:
        return {
            "status": "error",
            "files_checked": 0,
            "files_passed": 0,
            "files_failed": [],
            "error_message": f"Missing required package: {str(e)}. Install with: pip install huggingface-hub modelscope",
            "is_network_error": False,
        }
    except (
        requests.exceptions.ConnectionError,
        requests.exceptions.Timeout,
        requests.exceptions.RequestException,
    ) as e:
        # Network-related errors - suggest mirror
        error_msg = f"Network error: {str(e)}"
        if repo_type == "huggingface":
            error_msg += "\n\nTry using HuggingFace mirror:\n  export HF_ENDPOINT=https://hf-mirror.com"
        return {
            "status": "error",
            "files_checked": 0,
            "files_passed": 0,
            "files_failed": [],
            "error_message": error_msg,
            "is_network_error": True,
        }
    except Exception as e:
        return {
            "status": "error",
            "files_checked": 0,
            "files_passed": 0,
            "files_failed": [],
            "error_message": f"Verification failed: {str(e)}",
            "is_network_error": False,
        }


def calculate_local_sha256(
    local_dir: Path, file_pattern: str = "*.safetensors", progress_callback=None, files_list: list[Path] = None
) -> Dict[str, str]:
    """
    Calculate SHA256 hashes for files in a directory using parallel processing.

    Args:
        local_dir: Directory to scan
        file_pattern: Glob pattern for files to hash (ignored if files_list is provided)
        progress_callback: Optional callback function(message: str) for progress updates
        files_list: Optional pre-filtered list of files to hash (overrides file_pattern)

    Returns:
        Dictionary mapping filename to SHA256 hash
    """
    result = {}

    if not local_dir.exists():
        return result

    # Get all files first to report total
    if files_list is not None:
        files_to_hash = files_list
    else:
        files_to_hash = [f for f in local_dir.glob(file_pattern) if f.is_file()]
    total_files = len(files_to_hash)

    if total_files == 0:
        return result

    # Use min(16, total_files) workers to avoid over-spawning processes
    max_workers = min(16, total_files)

    if progress_callback:
        progress_callback(f"  Using {max_workers} parallel workers for SHA256 calculation")

    # Use ProcessPoolExecutor for CPU-intensive SHA256 computation
    completed_count = 0
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_file = {executor.submit(_compute_file_sha256, file_path): file_path for file_path in files_to_hash}

        # Process results as they complete
        for future in as_completed(future_to_file):
            completed_count += 1
            try:
                filename, sha256_hash, file_size_mb = future.result()
                result[filename] = sha256_hash

                if progress_callback:
                    progress_callback(f"  [{completed_count}/{total_files}] ✓ {filename} ({file_size_mb:.1f} MB)")

            except Exception as e:
                file_path = future_to_file[future]
                if progress_callback:
                    progress_callback(f"  [{completed_count}/{total_files}] ✗ {file_path.name} - Error: {str(e)}")

    return result


def fetch_model_sha256(
    repo_id: str,
    platform: Literal["hf", "ms"],
    revision: str | None = None,
    use_mirror: bool = False,
    timeout: int | None = None,
) -> dict[str, str]:
    """
    获取模型仓库中所有重要文件的 sha256 哈希值。

    包括：
    - *.safetensors (权重文件)
    - *.json (配置文件：config.json, tokenizer_config.json 等)
    - *.py (自定义模型代码：modeling.py, configuration.py 等)

    Args:
        repo_id: 仓库 ID，例如 "Qwen/Qwen3-30B-A3B"
        platform: 平台，"hf" (HuggingFace) 或 "ms" (ModelScope)
        revision: 版本/分支，默认 HuggingFace 为 "main"，ModelScope 为 "master"
        use_mirror: 是否使用镜像（仅对 HuggingFace 有效）
        timeout: 网络请求超时时间（秒），None 表示不设置超时

    Returns:
        dict: 文件名到 sha256 的映射，例如 {"model-00001-of-00016.safetensors": "abc123...", "config.json": "def456..."}
    """
    if platform == "hf":
        # 先尝试直连，失败后自动使用镜像
        try:
            if use_mirror:
                return _fetch_from_huggingface(repo_id, revision or "main", use_mirror=True, timeout=timeout)
            else:
                return _fetch_from_huggingface(repo_id, revision or "main", use_mirror=False, timeout=timeout)
        except Exception as e:
            # 如果不是镜像模式且失败了，自动重试使用镜像
            if not use_mirror:
                return _fetch_from_huggingface(repo_id, revision or "main", use_mirror=True, timeout=timeout)
            else:
                raise e
    elif platform == "ms":
        return _fetch_from_modelscope(repo_id, revision or "master", timeout=timeout)
    else:
        raise ValueError(f"不支持的平台: {platform}，请使用 'hf' 或 'ms'")


def _fetch_from_huggingface(
    repo_id: str, revision: str, use_mirror: bool = False, timeout: int | None = None
) -> dict[str, str]:
    """从 HuggingFace 获取所有重要文件的 sha256

    Args:
        repo_id: 仓库 ID
        revision: 版本/分支
        use_mirror: 是否使用镜像（hf-mirror.com）
        timeout: 网络请求超时时间（秒），None 表示不设置超时
    """
    import os
    import socket

    # 如果需要使用镜像，设置环境变量
    original_endpoint = os.environ.get("HF_ENDPOINT")
    if use_mirror and not original_endpoint:
        os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

    # Set socket timeout if specified
    original_timeout = socket.getdefaulttimeout()
    if timeout is not None:
        socket.setdefaulttimeout(timeout)

    from huggingface_hub import HfApi, list_repo_files

    try:
        api = HfApi()
        all_files = list_repo_files(repo_id=repo_id, revision=revision)

        # 筛选重要文件：*.safetensors, *.json, *.py
        important_files = [f for f in all_files if f.endswith((".safetensors", ".json", ".py"))]

        if not important_files:
            return {}

        paths_info = api.get_paths_info(
            repo_id=repo_id,
            paths=important_files,
            revision=revision,
        )

        result = {}
        for file_info in paths_info:
            if hasattr(file_info, "lfs") and file_info.lfs is not None:
                sha256 = file_info.lfs.sha256
            else:
                sha256 = getattr(file_info, "blob_id", None)
            result[file_info.path] = sha256

        return result
    finally:
        # 恢复原始 socket timeout
        socket.setdefaulttimeout(original_timeout)

        # 恢复原始环境变量
        if use_mirror and not original_endpoint:
            os.environ.pop("HF_ENDPOINT", None)
        elif original_endpoint:
            os.environ["HF_ENDPOINT"] = original_endpoint


def _fetch_from_modelscope(repo_id: str, revision: str, timeout: int | None = None) -> dict[str, str]:
    """从 ModelScope 获取所有重要文件的 sha256

    Args:
        repo_id: 仓库 ID
        revision: 版本/分支
        timeout: 网络请求超时时间（秒），None 表示不设置超时
    """
    import socket
    from modelscope.hub.api import HubApi

    # Set socket timeout if specified
    original_timeout = socket.getdefaulttimeout()
    if timeout is not None:
        socket.setdefaulttimeout(timeout)

    try:
        api = HubApi()
        files_info = api.get_model_files(model_id=repo_id, revision=revision)

        result = {}
        for file_info in files_info:
            filename = file_info.get("Name", file_info.get("Path", ""))
            # 筛选重要文件：*.safetensors, *.json, *.py
            if filename.endswith((".safetensors", ".json", ".py")):
                sha256 = file_info.get("Sha256", file_info.get("sha256", None))
                result[filename] = sha256

        return result
    finally:
        # 恢复原始 socket timeout
        socket.setdefaulttimeout(original_timeout)


def verify_model_integrity_with_progress(
    repo_type: Literal["huggingface", "modelscope"],
    repo_id: str,
    local_dir: Path,
    progress_callback=None,
    verbose: bool = False,
    use_mirror: bool = False,
    files_to_verify: list[str] | None = None,
    timeout: int | None = None,
) -> Dict[str, Any]:
    """
    Verify model integrity with enhanced progress reporting for Rich Progress bars.

    This is a wrapper around verify_model_integrity() that provides more detailed
    progress information suitable for progress bar display.

    The progress_callback receives:
    - (message: str, total: int, current: int) for countable operations
    - (message: str) for status updates

    Args:
        repo_type: Repository type ("huggingface" or "modelscope")
        repo_id: Repository ID
        local_dir: Local directory path
        progress_callback: Optional callback for progress updates
        verbose: If True, output detailed SHA256 comparison for each file
        use_mirror: If True, use HuggingFace mirror (hf-mirror.com)
        files_to_verify: Optional list of specific files to verify (for re-verification)
        timeout: Network request timeout in seconds (None = no timeout)
    """

    def report_progress(msg: str, total=None, current=None):
        """Enhanced progress reporter"""
        if progress_callback:
            progress_callback(msg, total, current)

    try:
        platform = "hf" if repo_type == "huggingface" else "ms"

        # 1. Fetch official SHA256 hashes
        if files_to_verify:
            report_progress(f"Fetching SHA256 hashes for {len(files_to_verify)} files...")
        elif use_mirror and platform == "hf":
            report_progress("Fetching official SHA256 hashes from mirror (hf-mirror.com)...")
        else:
            report_progress("Fetching official SHA256 hashes from remote repository...")

        official_hashes = fetch_model_sha256(repo_id, platform, use_mirror=use_mirror, timeout=timeout)

        # Filter to only requested files if specified
        if files_to_verify:
            # Extract clean filenames from files_to_verify (remove markers like "(missing)")
            clean_filenames = set()
            for f in files_to_verify:
                clean_f = f.replace(" (missing)", "").replace(" (hash mismatch)", "").strip()
                # Ensure we only use the filename, not full path
                clean_filenames.add(Path(clean_f).name)

            # Filter official_hashes to only include requested files
            # Compare using basename since official_hashes keys might have paths
            official_hashes = {k: v for k, v in official_hashes.items() if Path(k).name in clean_filenames}

        report_progress(f"✓ Fetched {len(official_hashes)} file hashes from remote")

        if not official_hashes:
            return {
                "status": "error",
                "files_checked": 0,
                "files_passed": 0,
                "files_failed": [],
                "error_message": f"No safetensors files found in remote repository: {repo_id}",
            }

        # 2. Calculate local SHA256 hashes
        local_dir_path = Path(local_dir)

        # Only hash the files we need to verify
        if files_to_verify:
            # Extract clean filenames (without markers)
            clean_filenames = set()
            for f in files_to_verify:
                clean_f = f.replace(" (missing)", "").replace(" (hash mismatch)", "").strip()
                # Ensure we only use the filename, not full path
                clean_filenames.add(Path(clean_f).name)

            # Only hash files that match the clean filenames
            files_to_hash = [
                f for f in local_dir_path.glob("*.safetensors") if f.is_file() and f.name in clean_filenames
            ]
        else:
            files_to_hash = [f for f in local_dir_path.glob("*.safetensors") if f.is_file()]

        total_files = len(files_to_hash)

        if files_to_verify:
            report_progress(f"Calculating SHA256 for {total_files} repaired files...", total=total_files, current=0)
        else:
            report_progress(f"Calculating SHA256 for local files...", total=total_files, current=0)

        # Progress wrapper for hashing
        completed_count = [0]  # Use list for mutable closure

        def hash_progress_callback(msg: str):
            if "Using" in msg and "workers" in msg:
                report_progress(msg)
            elif "[" in msg and "/" in msg and "]" in msg:
                # Progress update like: [1/10] ✓ filename (123.4 MB)
                completed_count[0] += 1
                report_progress(msg, total=total_files, current=completed_count[0])

        # Pass the pre-filtered files_to_hash list
        local_hashes = calculate_local_sha256(
            local_dir_path,
            "*.safetensors",
            progress_callback=hash_progress_callback,
            files_list=files_to_hash if files_to_verify else None,
        )
        report_progress(f"✓ Calculated {len(local_hashes)} local file hashes")

        # 3. Compare hashes
        report_progress(f"Comparing {len(official_hashes)} files...", total=len(official_hashes), current=0)

        files_failed = []
        files_missing = []
        files_passed = 0

        for idx, (filename, official_hash) in enumerate(official_hashes.items(), 1):
            file_basename = Path(filename).name

            # Find matching local file
            local_hash = None
            for local_file, local_hash_value in local_hashes.items():
                if Path(local_file).name == file_basename:
                    local_hash = local_hash_value
                    break

            if local_hash is None:
                files_missing.append(filename)
                if verbose:
                    report_progress(
                        f"[{idx}/{len(official_hashes)}] ✗ {file_basename} (missing)\n  Remote: {official_hash}\n  Local:  <missing>",
                        total=len(official_hashes),
                        current=idx,
                    )
                else:
                    report_progress(
                        f"[{idx}/{len(official_hashes)}] ✗ {file_basename} (missing)",
                        total=len(official_hashes),
                        current=idx,
                    )
            elif local_hash.lower() != official_hash.lower():
                files_failed.append(f"{filename} (hash mismatch)")
                if verbose:
                    report_progress(
                        f"[{idx}/{len(official_hashes)}] ✗ {file_basename} (hash mismatch)\n  Remote: {official_hash}\n  Local:  {local_hash}",
                        total=len(official_hashes),
                        current=idx,
                    )
                else:
                    report_progress(
                        f"[{idx}/{len(official_hashes)}] ✗ {file_basename} (hash mismatch)",
                        total=len(official_hashes),
                        current=idx,
                    )
            else:
                files_passed += 1
                if verbose:
                    report_progress(
                        f"[{idx}/{len(official_hashes)}] ✓ {file_basename}\n  Remote: {official_hash}\n  Local:  {local_hash}",
                        total=len(official_hashes),
                        current=idx,
                    )
                else:
                    report_progress(
                        f"[{idx}/{len(official_hashes)}] ✓ {file_basename}", total=len(official_hashes), current=idx
                    )

        # 4. Return results
        total_checked = len(official_hashes)

        if files_failed or files_missing:
            all_failed = files_failed + [f"{f} (missing)" for f in files_missing]
            return {
                "status": "failed",
                "files_checked": total_checked,
                "files_passed": files_passed,
                "files_failed": all_failed,
                "error_message": f"{len(all_failed)} file(s) failed verification",
            }
        else:
            return {
                "status": "passed",
                "files_checked": total_checked,
                "files_passed": files_passed,
                "files_failed": [],
            }

    except (
        requests.exceptions.ConnectionError,
        requests.exceptions.Timeout,
        requests.exceptions.RequestException,
        TimeoutError,  # Socket timeout from socket.setdefaulttimeout()
        OSError,  # Network-related OS errors
    ) as e:
        error_msg = f"Network error: {str(e)}"
        if repo_type == "huggingface":
            error_msg += "\n\nTry using HuggingFace mirror:\n  export HF_ENDPOINT=https://hf-mirror.com"
        return {
            "status": "error",
            "files_checked": 0,
            "files_passed": 0,
            "files_failed": [],
            "error_message": error_msg,
            "is_network_error": True,
        }
    except Exception as e:
        return {
            "status": "error",
            "files_checked": 0,
            "files_passed": 0,
            "files_failed": [],
            "error_message": f"Verification failed: {str(e)}",
            "is_network_error": False,
        }


def pre_operation_verification(user_model, user_registry, operation_name: str = "operation") -> None:
    """Pre-operation verification of model integrity.

    Can be used before running or quantizing models to ensure integrity.

    Args:
        user_model: UserModel object to verify
        user_registry: UserModelRegistry instance
        operation_name: Name of the operation (e.g., "running", "quantizing")
    """
    from rich.prompt import Prompt, Confirm
    from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn, TimeElapsedColumn
    from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
    from kt_kernel.cli.i18n import get_lang
    from kt_kernel.cli.utils.console import console, print_info, print_warning, print_error, print_success, print_step
    import typer

    lang = get_lang()

    # Check if already verified
    if user_model.sha256_status == "passed":
        console.print()
        print_info("Model integrity already verified ✓")
        console.print()
        return

    # Model not verified yet
    console.print()
    console.print("[bold yellow]═══ Model Integrity Check ═══[/bold yellow]")
    console.print()

    # Check if repo_id exists
    if not user_model.repo_id:
        # No repo_id - ask user to provide one
        console.print("[yellow]No repository ID configured for this model.[/yellow]")
        console.print()
        console.print("To verify model integrity, we need the repository ID (e.g., 'deepseek-ai/DeepSeek-V3')")
        console.print()

        if not Confirm.ask("Would you like to configure repository ID now?", default=True):
            console.print()
            print_warning(f"Skipping verification. Model will be used for {operation_name} without integrity check.")
            console.print()
            return

        # Ask for repo type
        console.print()
        console.print("Repository type:")
        console.print("  [cyan][1][/cyan] HuggingFace")
        console.print("  [cyan][2][/cyan] ModelScope")
        console.print()

        repo_type_choice = Prompt.ask("Select repository type", choices=["1", "2"], default="1")
        repo_type = "huggingface" if repo_type_choice == "1" else "modelscope"

        # Ask for repo_id
        console.print()
        repo_id = Prompt.ask("Enter repository ID (e.g., deepseek-ai/DeepSeek-V3)")

        # Update model
        user_registry.update_model(user_model.name, {"repo_type": repo_type, "repo_id": repo_id})
        user_model.repo_type = repo_type
        user_model.repo_id = repo_id

        console.print()
        print_success(f"Repository configured: {repo_type}:{repo_id}")
        console.print()

    # Now ask if user wants to verify
    console.print("[dim]Model integrity verification is a one-time check that ensures your[/dim]")
    console.print("[dim]model weights are not corrupted. This helps prevent runtime errors.[/dim]")
    console.print()

    if not Confirm.ask(f"Would you like to verify model integrity before {operation_name}?", default=True):
        console.print()
        print_warning(f"Skipping verification. Model will be used for {operation_name} without integrity check.")
        console.print()
        return

    # Perform verification
    console.print()
    print_step("Verifying model integrity...")
    console.print()

    # Check connectivity first
    use_mirror = False
    if user_model.repo_type == "huggingface":
        with console.status("[dim]Checking HuggingFace connectivity...[/dim]"):
            is_accessible, message = check_huggingface_connectivity(timeout=5)

        if not is_accessible:
            print_warning("HuggingFace Connection Failed")
            console.print()
            console.print(f"  {message}")
            console.print()
            console.print("  [yellow]Auto-switching to HuggingFace mirror:[/yellow] [cyan]hf-mirror.com[/cyan]")
            console.print()
            use_mirror = True

    # Fetch remote hashes with timeout
    def fetch_with_timeout(repo_type, repo_id, use_mirror, timeout):
        """Fetch hashes with timeout."""
        executor = ThreadPoolExecutor(max_workers=1)
        try:
            platform = "hf" if repo_type == "huggingface" else "ms"
            future = executor.submit(fetch_model_sha256, repo_id, platform, use_mirror=use_mirror, timeout=timeout)
            hashes = future.result(timeout=timeout)
            executor.shutdown(wait=False)
            return (hashes, False)
        except (FutureTimeoutError, Exception):
            executor.shutdown(wait=False)
            return (None, True)

    # Try fetching hashes
    status = console.status("[dim]Fetching remote hashes...[/dim]")
    status.start()
    official_hashes, timed_out = fetch_with_timeout(user_model.repo_type, user_model.repo_id, use_mirror, 10)
    status.stop()

    # Handle timeout with fallback
    if timed_out and user_model.repo_type == "huggingface" and not use_mirror:
        print_warning("HuggingFace Fetch Timeout (10s)")
        console.print()
        console.print("  [yellow]Trying HuggingFace mirror...[/yellow]")
        console.print()

        status = console.status("[dim]Fetching remote hashes from mirror...[/dim]")
        status.start()
        official_hashes, timed_out = fetch_with_timeout(user_model.repo_type, user_model.repo_id, True, 10)
        status.stop()

    if timed_out and user_model.repo_type == "huggingface":
        print_warning("HuggingFace Mirror Timeout (10s)")
        console.print()
        console.print("  [yellow]Fallback to ModelScope...[/yellow]")
        console.print()

        status = console.status("[dim]Fetching remote hashes from ModelScope...[/dim]")
        status.start()
        official_hashes, timed_out = fetch_with_timeout("modelscope", user_model.repo_id, False, 10)
        status.stop()

    if not official_hashes or timed_out:
        print_error("Failed to fetch remote hashes (network timeout)")
        console.print()
        console.print("  [yellow]Unable to verify model integrity due to network issues.[/yellow]")
        console.print()

        if not Confirm.ask(f"Continue {operation_name} without verification?", default=False):
            raise typer.Exit(0)

        console.print()
        return

    console.print(f"  [green]✓ Fetched {len(official_hashes)} file hashes[/green]")
    console.print()

    # Calculate local hashes and compare
    local_dir = Path(user_model.path)
    files_to_hash = [f for f in local_dir.glob("*.safetensors") if f.is_file()]

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        console=console,
    ) as progress:
        # Calculate local hashes
        task = progress.add_task("[yellow]Calculating local SHA256...", total=len(files_to_hash))

        def hash_callback(msg):
            if "[" in msg and "/" in msg and "]" in msg and "✓" in msg:
                progress.advance(task)

        local_hashes = calculate_local_sha256(local_dir, "*.safetensors", progress_callback=hash_callback)
        progress.remove_task(task)

        console.print(f"  [green]✓ Calculated {len(local_hashes)} local hashes[/green]")
        console.print()

        # Compare hashes
        task = progress.add_task("[blue]Comparing hashes...", total=len(official_hashes))

        files_failed = []
        files_missing = []
        files_passed = 0

        for filename, official_hash in official_hashes.items():
            file_basename = Path(filename).name
            local_hash = None

            for local_file, local_hash_value in local_hashes.items():
                if Path(local_file).name == file_basename:
                    local_hash = local_hash_value
                    break

            if local_hash is None:
                files_missing.append(filename)
            elif local_hash.lower() != official_hash.lower():
                files_failed.append(f"{filename} (hash mismatch)")
            else:
                files_passed += 1

            progress.advance(task)

        progress.remove_task(task)

    console.print()

    # Check results
    if not files_failed and not files_missing:
        # Verification passed
        user_registry.update_model(user_model.name, {"sha256_status": "passed"})
        print_success("Model integrity verification PASSED ✓")
        console.print()
        console.print(f"  All {files_passed} files verified successfully")
        console.print()
    else:
        # Verification failed
        user_registry.update_model(user_model.name, {"sha256_status": "failed"})
        print_error(f"Model integrity verification FAILED")
        console.print()
        console.print(f"  ✓ Passed: [green]{files_passed}[/green]")
        console.print(f"  ✗ Failed: [red]{len(files_failed) + len(files_missing)}[/red]")
        console.print()

        if files_missing:
            console.print(f"  [red]Missing files ({len(files_missing)}):[/red]")
            for f in files_missing[:5]:
                console.print(f"    - {Path(f).name}")
            if len(files_missing) > 5:
                console.print(f"    ... and {len(files_missing) - 5} more")
            console.print()

        if files_failed:
            console.print(f"  [red]Hash mismatch ({len(files_failed)}):[/red]")
            for f in files_failed[:5]:
                console.print(f"    - {f}")
            if len(files_failed) > 5:
                console.print(f"    ... and {len(files_failed) - 5} more")
            console.print()

        console.print("[bold red]⚠ WARNING: Model weights may be corrupted![/bold red]")
        console.print()
        console.print("This could cause runtime errors or incorrect inference results.")
        console.print()

        # Ask if user wants to repair
        if Confirm.ask("Would you like to repair (re-download) the corrupted files?", default=True):
            console.print()
            print_info("Please run: [cyan]kt model verify " + user_model.name + "[/cyan]")
            console.print()
            console.print("The verify command will guide you through the repair process.")
            raise typer.Exit(0)

        # Ask if user wants to continue anyway
        console.print()
        if not Confirm.ask(
            f"[yellow]Continue {operation_name} with potentially corrupted weights?[/yellow]", default=False
        ):
            raise typer.Exit(0)

        console.print()
        print_warning(f"Proceeding with {operation_name} using unverified weights at your own risk...")
        console.print()


================================================
FILE: kt-kernel/python/cli/utils/port_checker.py
================================================
"""
Port availability checking utilities.
"""

import socket
from typing import Tuple


def is_port_available(host: str, port: int) -> bool:
    """Check if a port is available on the given host.

    Args:
        host: Host address (e.g., "0.0.0.0", "127.0.0.1")
        port: Port number to check

    Returns:
        True if port is available, False if occupied
    """
    try:
        # Try to bind to the port
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(1)

        # Use SO_REUSEADDR to allow binding to recently closed ports
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

        # Try to bind
        result = sock.connect_ex((host if host != "0.0.0.0" else "127.0.0.1", port))
        sock.close()

        # If connect_ex returns 0, port is occupied
        # If it returns error (non-zero), port is available
        return result != 0

    except Exception:
        # If any error occurs, assume port is not available
        return False


def find_available_port(host: str, start_port: int, max_attempts: int = 100) -> Tuple[bool, int]:
    """Find an available port starting from start_port.

    Args:
        host: Host address
        start_port: Starting port number to check
        max_attempts: Maximum number of ports to try

    Returns:
        Tuple of (found, port_number)
        - found: True if an available port was found
        - port_number: The available port number (or start_port if not found)
    """
    for port in range(start_port, start_port + max_attempts):
        if is_port_available(host, port):
            return True, port

    return False, start_port


================================================
FILE: kt-kernel/python/cli/utils/quant_interactive.py
================================================
"""
Interactive configuration for kt quant command.

Provides rich, multi-step interactive configuration for model quantization.
"""

from typing import Optional, Dict, Any
from pathlib import Path
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.prompt import Prompt, Confirm, IntPrompt
from kt_kernel.cli.i18n import t


console = Console()


def select_model_to_quantize() -> Optional[Any]:
    """Select model to quantize interactively."""
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
    from kt_kernel.cli.commands.model import is_amx_weights, SHA256_STATUS_MAP
    from kt_kernel.cli.utils.model_table_builder import build_moe_gpu_table

    registry = UserModelRegistry()
    all_models = registry.list_models()

    # Filter MoE models only (safetensors, not AMX, is_moe=True)
    quant_models = []
    for model in all_models:
        if model.format == "safetensors":
            # Skip AMX models
            is_amx, _ = is_amx_weights(model.path)
            if is_amx:
                continue

            # Only include MoE models
            if model.is_moe:
                quant_models.append(model)

    if not quant_models:
        console.print(f"[yellow]{t('quant_no_moe_models')}[/yellow]")
        console.print()
        console.print(f"  {t('quant_only_moe')}")
        console.print()
        console.print(f"  {t('quant_add_models', command='kt model scan')}")
        console.print(f"  {t('quant_add_models', command='kt model add <path>')}")
        return None

    # Display models
    console.print()
    console.print(f"[bold green]{t('quant_moe_available')}[/bold green]")
    console.print()

    # Use shared table builder
    table, displayed_models = build_moe_gpu_table(
        models=quant_models, status_map=SHA256_STATUS_MAP, show_index=True, start_index=1
    )

    console.print(table)
    console.print()

    choice = IntPrompt.ask(t("quant_select_model"), default=1, show_choices=False)

    if choice < 1 or choice > len(displayed_models):
        console.print(f"[red]{t('quant_invalid_choice')}[/red]")
        return None

    return displayed_models[choice - 1]


def configure_quantization_method() -> Dict[str, str]:
    """Select quantization method and input type."""
    console.print()
    console.print(Panel(f"[bold cyan]{t('quant_step2_method')}[/bold cyan]", expand=False))
    console.print()

    # Method selection
    console.print(f"[bold]{t('quant_method_label')}[/bold]")
    console.print(f"  [cyan][1][/cyan] {t('quant_int4_desc')}")
    console.print(f"  [cyan][2][/cyan] {t('quant_int8_desc')}")
    console.print()

    method_choice = Prompt.ask(t("quant_select_method"), choices=["1", "2"], default="1")
    method = "int4" if method_choice == "1" else "int8"

    console.print()
    console.print(f"[bold]{t('quant_input_type_label')}[/bold]")
    console.print(f"  [cyan][1][/cyan] {t('quant_fp8_desc')}")
    console.print(f"  [cyan][2][/cyan] {t('quant_fp16_desc')}")
    console.print(f"  [cyan][3][/cyan] {t('quant_bf16_desc')}")
    console.print()

    input_choice = Prompt.ask(t("quant_select_input_type"), choices=["1", "2", "3"], default="1")
    input_type_map = {"1": "fp8", "2": "fp16", "3": "bf16"}
    input_type = input_type_map[input_choice]

    return {"method": method, "input_type": input_type}


def configure_cpu_params(max_cores: int, max_numa: int) -> Dict[str, Any]:
    """Configure CPU parameters."""
    console.print()
    console.print(Panel(f"[bold cyan]{t('quant_step3_cpu')}[/bold cyan]", expand=False))
    console.print()

    def clamp(value: int, min_val: int, max_val: int, default: int) -> int:
        """Clamp value to range or return default if out of bounds."""
        if min_val <= value <= max_val:
            return max(min_val, min(value, max_val))
        return default

    default_threads = int(max_cores * 0.8)
    cpu_threads = IntPrompt.ask(t("quant_cpu_threads_prompt", max=max_cores), default=default_threads)
    cpu_threads = clamp(cpu_threads, 1, max_cores, default_threads)

    numa_nodes = IntPrompt.ask(t("quant_numa_nodes_prompt", max=max_numa), default=max_numa)
    numa_nodes = clamp(numa_nodes, 1, max_numa, max_numa)

    # Ask about GPU usage
    console.print()
    console.print(f"[bold]{t('quant_use_gpu_label')}[/bold]")
    console.print(f"  [dim]{t('quant_gpu_speedup')}[/dim]")
    console.print()
    use_gpu = Confirm.ask(t("quant_enable_gpu"), default=True)

    return {"cpu_threads": cpu_threads, "numa_nodes": numa_nodes, "use_gpu": use_gpu}


def configure_output_path(model: Any, method: str, numa_nodes: int) -> Path:
    """Configure output path for quantized weights."""
    from kt_kernel.cli.config.settings import get_settings

    console.print()
    console.print(Panel(f"[bold cyan]{t('quant_step4_output')}[/bold cyan]", expand=False))
    console.print()

    # Generate default output path
    model_path = Path(model.path)
    method_upper = method.upper()
    settings = get_settings()

    # Priority: paths.weights > paths.models[0] > model's parent directory
    weights_dir = settings.weights_dir
    if weights_dir and weights_dir.exists():
        # Use configured weights directory (highest priority)
        default_output = weights_dir / f"{model_path.name}-AMX{method_upper}-NUMA{numa_nodes}"
    else:
        # Use first model storage path
        model_paths = settings.get_model_paths()
        if model_paths and model_paths[0].exists():
            default_output = model_paths[0] / f"{model_path.name}-AMX{method_upper}-NUMA{numa_nodes}"
        else:
            # Fallback to model's parent directory
            default_output = model_path.parent / f"{model_path.name}-AMX{method_upper}-NUMA{numa_nodes}"

    console.print(f"[dim]{t('quant_default_path')}[/dim]", default_output)
    console.print()

    use_default = Confirm.ask(t("quant_use_default"), default=True)

    if use_default:
        return default_output

    custom_path = Prompt.ask(t("quant_custom_path"), default=str(default_output))

    return Path(custom_path)


def calculate_quantized_size(source_path: Path, input_type: str, quant_method: str) -> tuple[float, float]:
    """
    Calculate source model size and estimated quantized size.

    Args:
        source_path: Path to source model
        input_type: Input type (fp8, fp16, bf16)
        quant_method: Quantization method (int4, int8)

    Returns:
        Tuple of (source_size_gb, estimated_quant_size_gb)
    """
    # Calculate source model size
    try:
        total_bytes = sum(f.stat().st_size for f in source_path.glob("*.safetensors") if f.is_file())
        source_size_gb = total_bytes / (1024**3)
    except Exception:
        return 0.0, 0.0

    # Bits mapping
    input_bits = {"fp8": 8, "fp16": 16, "bf16": 16}
    quant_bits = {"int4": 4, "int8": 8}

    input_bit = input_bits.get(input_type, 16)
    quant_bit = quant_bits.get(quant_method, 4)

    # Estimate: source_size * (quant_bits / input_bits)
    ratio = quant_bit / input_bit
    estimated_size_gb = source_size_gb * ratio

    return source_size_gb, estimated_size_gb


def check_disk_space(output_path: Path, required_size_gb: float) -> tuple[float, bool]:
    """
    Check available disk space at output path.

    Args:
        output_path: Target output path
        required_size_gb: Required space in GB

    Returns:
        Tuple of (available_gb, is_sufficient)
        is_sufficient is True if available >= required * 1.2
    """
    import shutil

    try:
        # Get parent directory that exists
        check_path = output_path.parent if not output_path.exists() else output_path
        while not check_path.exists() and check_path != check_path.parent:
            check_path = check_path.parent

        stat = shutil.disk_usage(check_path)
        available_gb = stat.free / (1024**3)

        # Check if available space >= required * 1.2 (20% buffer)
        is_sufficient = available_gb >= (required_size_gb * 1.2)

        return available_gb, is_sufficient
    except Exception:
        return 0.0, False


def interactive_quant_config() -> Optional[Dict[str, Any]]:
    """
    Interactive configuration for kt quant.

    Returns configuration dict or None if cancelled.
    """
    from kt_kernel.cli.utils.environment import detect_cpu_info

    # Get CPU info
    cpu_info = detect_cpu_info()

    # Step 1: Select model
    model = select_model_to_quantize()
    if not model:
        return None

    # Step 1.5: Pre-quantization verification (optional)
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
    from kt_kernel.cli.utils.model_verifier import pre_operation_verification

    user_registry = UserModelRegistry()
    user_model_obj = user_registry.find_by_path(model.path)

    if user_model_obj and user_model_obj.format == "safetensors":
        pre_operation_verification(user_model_obj, user_registry, operation_name="quantizing")

    # Step 2: Configure quantization method
    quant_config = configure_quantization_method()

    # Step 3: Configure CPU parameters
    cpu_config = configure_cpu_params(cpu_info.threads, cpu_info.numa_nodes)  # Use logical threads

    # Step 4: Configure output path
    output_path = configure_output_path(model, quant_config["method"], cpu_config["numa_nodes"])

    # Step 4.5: Check if output path already exists and generate unique name
    if output_path.exists():
        console.print()
        console.print(t("quant_output_exists_warn", path=str(output_path)))
        console.print()

        # Generate unique name by adding suffix
        original_name = output_path.name
        parent_dir = output_path.parent
        counter = 2

        while output_path.exists():
            new_name = f"{original_name}-{counter}"
            output_path = parent_dir / new_name
            counter += 1

        console.print(t("quant_using_unique_name", path=str(output_path)))
        console.print()

    # Step 5: Calculate space requirements and check availability
    console.print()
    console.print(Panel(f"[bold cyan]{t('quant_disk_analysis')}[/bold cyan]", expand=False))
    console.print()

    source_size_gb, estimated_size_gb = calculate_quantized_size(
        Path(model.path), quant_config["input_type"], quant_config["method"]
    )

    available_gb, is_sufficient = check_disk_space(output_path, estimated_size_gb)

    console.print(f"  {t('quant_source_size'):<26} [cyan]{source_size_gb:.2f} GB[/cyan]")
    console.print(f"  {t('quant_estimated_size'):<26} [yellow]{estimated_size_gb:.2f} GB[/yellow]")
    console.print(
        f"  {t('quant_available_space'):<26} [{'green' if is_sufficient else 'red'}]{available_gb:.2f} GB[/{'green' if is_sufficient else 'red'}]"
    )
    console.print()

    if not is_sufficient:
        required_with_buffer = estimated_size_gb * 1.2
        console.print(f"[bold red]⚠ {t('quant_insufficient_space')}[/bold red]")
        console.print()
        console.print(f"  {t('quant_required_space'):<26} [yellow]{required_with_buffer:.2f} GB[/yellow]")
        console.print(f"  {t('quant_available_space'):<26} [red]{available_gb:.2f} GB[/red]")
        console.print(f"  {t('quant_shortage'):<26} [red]{required_with_buffer - available_gb:.2f} GB[/red]")
        console.print()
        console.print(f"  {t('quant_may_fail')}")
        console.print()

        if not Confirm.ask(f"[yellow]{t('quant_continue_anyway')}[/yellow]", default=False):
            console.print(f"[yellow]{t('quant_cancelled')}[/yellow]")
            return None
        console.print()

    # Summary and confirmation
    console.print()
    console.print(Panel(f"[bold cyan]{t('quant_config_summary')}[/bold cyan]", expand=False))
    console.print()
    console.print(f"  {t('quant_summary_model'):<15} {model.name}")
    console.print(f"  {t('quant_summary_method'):<15} {quant_config['method'].upper()}")
    console.print(f"  {t('quant_summary_input_type'):<15} {quant_config['input_type'].upper()}")
    console.print(f"  {t('quant_summary_cpu_threads'):<15} {cpu_config['cpu_threads']}")
    console.print(f"  {t('quant_summary_numa'):<15} {cpu_config['numa_nodes']}")
    console.print(f"  {t('quant_summary_gpu'):<15} {t('yes') if cpu_config['use_gpu'] else t('no')}")
    console.print(f"  {t('quant_summary_output'):<15} {output_path}")
    console.print()

    if not Confirm.ask(f"[bold green]{t('quant_start_question')}[/bold green]", default=True):
        console.print(f"[yellow]{t('quant_cancelled')}[/yellow]")
        return None

    return {
        "model": model,
        "method": quant_config["method"],
        "input_type": quant_config["input_type"],
        "cpu_threads": cpu_config["cpu_threads"],
        "numa_nodes": cpu_config["numa_nodes"],
        "use_gpu": cpu_config["use_gpu"],
        "output_path": output_path,
    }


================================================
FILE: kt-kernel/python/cli/utils/repo_detector.py
================================================
"""
Repo Detector

Automatically detect repository information from model README.md files
"""

import re
from pathlib import Path
from typing import Optional, Dict, Tuple
import yaml


def parse_readme_frontmatter(readme_path: Path) -> Optional[Dict]:
    """
    Parse YAML frontmatter from README.md

    Args:
        readme_path: Path to README.md file

    Returns:
        Dictionary of frontmatter data, or None if not found
    """
    if not readme_path.exists():
        return None

    try:
        with open(readme_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Match YAML frontmatter between --- markers
        match = re.match(r"^---\s*\n(.*?)\n---\s*\n", content, re.DOTALL)
        if not match:
            return None

        yaml_content = match.group(1)

        # Parse YAML
        try:
            data = yaml.safe_load(yaml_content)
            return data if isinstance(data, dict) else None
        except yaml.YAMLError:
            return None

    except Exception as e:
        return None


def extract_repo_from_frontmatter(frontmatter: Dict) -> Optional[Tuple[str, str]]:
    """
    Extract repo_id and repo_type from frontmatter

    Args:
        frontmatter: Parsed YAML frontmatter dictionary

    Returns:
        Tuple of (repo_id, repo_type) or None
        repo_type is either "huggingface" or "modelscope"
    """
    if not frontmatter:
        return None

    # Priority 1: Extract from license_link (most reliable)
    license_link = frontmatter.get("license_link")
    if license_link and isinstance(license_link, str):
        result = _extract_repo_from_url(license_link)
        if result:
            return result

    # Priority 2: Try to find repo_id from other fields
    repo_id = None

    # Check base_model field
    base_model = frontmatter.get("base_model")
    if base_model:
        if isinstance(base_model, list) and len(base_model) > 0:
            # base_model is a list, take first item
            repo_id = base_model[0]
        elif isinstance(base_model, str):
            repo_id = base_model

    # Check model-index field
    if not repo_id:
        model_index = frontmatter.get("model-index")
        if isinstance(model_index, list) and len(model_index) > 0:
            first_model = model_index[0]
            if isinstance(first_model, dict):
                repo_id = first_model.get("name")

    # Check model_name field
    if not repo_id:
        repo_id = frontmatter.get("model_name")

    if not repo_id or not isinstance(repo_id, str):
        return None

    # Validate format: should be "namespace/model-name"
    if "/" not in repo_id:
        return None

    parts = repo_id.split("/")
    if len(parts) != 2:
        return None

    # Determine repo type
    repo_type = "huggingface"  # Default

    # Look for ModelScope indicators
    if "modelscope" in repo_id.lower():
        repo_type = "modelscope"

    # Check tags
    tags = frontmatter.get("tags", [])
    if isinstance(tags, list):
        if "modelscope" in [str(t).lower() for t in tags]:
            repo_type = "modelscope"

    return (repo_id, repo_type)


def _extract_repo_from_url(url: str) -> Optional[Tuple[str, str]]:
    """
    Extract repo_id and repo_type from a URL

    Supports:
    - https://huggingface.co/Qwen/Qwen3-30B-A3B/blob/main/LICENSE
    - https://modelscope.cn/models/Qwen/Qwen3-30B-A3B

    Args:
        url: URL string

    Returns:
        Tuple of (repo_id, repo_type) or None
    """
    # HuggingFace pattern: https://huggingface.co/{namespace}/{model}/...
    hf_match = re.match(r"https?://huggingface\.co/([^/]+)/([^/]+)", url)
    if hf_match:
        namespace = hf_match.group(1)
        model_name = hf_match.group(2)
        repo_id = f"{namespace}/{model_name}"
        return (repo_id, "huggingface")

    # ModelScope pattern: https://modelscope.cn/models/{namespace}/{model}
    ms_match = re.match(r"https?://(?:www\.)?modelscope\.cn/models/([^/]+)/([^/]+)", url)
    if ms_match:
        namespace = ms_match.group(1)
        model_name = ms_match.group(2)
        repo_id = f"{namespace}/{model_name}"
        return (repo_id, "modelscope")

    return None


def extract_repo_from_global_search(readme_path: Path) -> Optional[Tuple[str, str]]:
    """
    Extract repo info by globally searching for URLs in README.md

    Args:
        readme_path: Path to README.md file

    Returns:
        Tuple of (repo_id, repo_type) or None if not found
    """
    if not readme_path.exists():
        return None

    try:
        with open(readme_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Find all HuggingFace URLs
        hf_pattern = r"https?://huggingface\.co/([^/\s]+)/([^/\s\)]+)"
        hf_matches = re.findall(hf_pattern, content)

        # Find all ModelScope URLs
        ms_pattern = r"https?://(?:www\.)?modelscope\.cn/models/([^/\s]+)/([^/\s\)]+)"
        ms_matches = re.findall(ms_pattern, content)

        # Collect all found repos with their types
        found_repos = []

        for namespace, model_name in hf_matches:
            # Skip common non-repo paths
            if namespace.lower() in ["docs", "blog", "spaces", "datasets"]:
                continue
            if model_name.lower() in ["tree", "blob", "raw", "resolve", "discussions"]:
                continue

            repo_id = f"{namespace}/{model_name}"
            found_repos.append((repo_id, "huggingface"))

        for namespace, model_name in ms_matches:
            repo_id = f"{namespace}/{model_name}"
            found_repos.append((repo_id, "modelscope"))

        if not found_repos:
            return None

        # If multiple different repos found, use the last one
        # First, deduplicate
        seen = {}
        for repo_id, repo_type in found_repos:
            seen[repo_id] = repo_type  # Will keep the last occurrence

        # Get the last unique repo
        if seen:
            # Use the last item from found_repos that's unique
            last_unique = None
            for repo_id, repo_type in found_repos:
                if repo_id in seen:
                    last_unique = (repo_id, repo_type)

            return last_unique

        return None

    except Exception as e:
        return None


def detect_repo_for_model(model_path: str) -> Optional[Tuple[str, str]]:
    """
    Detect repository information for a model

    Strategy:
    Only extract from YAML frontmatter metadata in README.md
    (Removed global URL search to avoid false positives)

    Args:
        model_path: Path to model directory

    Returns:
        Tuple of (repo_id, repo_type) or None if not detected
    """
    model_dir = Path(model_path)

    if not model_dir.exists() or not model_dir.is_dir():
        return None

    # Look for README.md
    readme_path = model_dir / "README.md"
    if not readme_path.exists():
        return None

    # Only parse YAML frontmatter (no fallback to global search)
    frontmatter = parse_readme_frontmatter(readme_path)
    if frontmatter:
        return extract_repo_from_frontmatter(frontmatter)

    return None


def scan_models_for_repo(model_list) -> Dict:
    """
    Scan a list of models and detect repo information

    Args:
        model_list: List of UserModel objects

    Returns:
        Dictionary with scan results:
        {
            'detected': [(model, repo_id, repo_type), ...],
            'not_detected': [model, ...],
            'skipped': [model, ...]  # Already has repo_id
        }
    """
    results = {"detected": [], "not_detected": [], "skipped": []}

    for model in model_list:
        # Skip if already has repo_id
        if model.repo_id:
            results["skipped"].append(model)
            continue

        # Only process safetensors and gguf models
        if model.format not in ["safetensors", "gguf"]:
            results["skipped"].append(model)
            continue

        # Try to detect repo
        repo_info = detect_repo_for_model(model.path)

        if repo_info:
            repo_id, repo_type = repo_info
            results["detected"].append((model, repo_id, repo_type))
        else:
            results["not_detected"].append(model)

    return results


def format_detection_report(results: Dict) -> str:
    """
    Format scan results into a readable report

    Args:
        results: Results from scan_models_for_repo()

    Returns:
        Formatted string report
    """
    lines = []

    lines.append("=" * 80)
    lines.append("Auto-Detection Report")
    lines.append("=" * 80)
    lines.append("")

    # Detected
    if results["detected"]:
        lines.append(f"✓ Detected repository information ({len(results['detected'])} models):")
        lines.append("")
        for model, repo_id, repo_type in results["detected"]:
            lines.append(f"  • {model.name}")
            lines.append(f"    Path: {model.path}")
            lines.append(f"    Repo: {repo_id} ({repo_type})")
            lines.append("")

    # Not detected
    if results["not_detected"]:
        lines.append(f"✗ No repository information found ({len(results['not_detected'])} models):")
        lines.append("")
        for model in results["not_detected"]:
            lines.append(f"  • {model.name}")
            lines.append(f"    Path: {model.path}")
        lines.append("")

    # Skipped
    if results["skipped"]:
        lines.append(f"⊘ Skipped ({len(results['skipped'])} models):")
        lines.append(f"  (Already have repo_id or not safetensors/gguf format)")
        lines.append("")

    lines.append("=" * 80)
    lines.append(
        f"Summary: {len(results['detected'])} detected, "
        f"{len(results['not_detected'])} not detected, "
        f"{len(results['skipped'])} skipped"
    )
    lines.append("=" * 80)

    return "\n".join(lines)


def apply_detection_results(results: Dict, registry) -> int:
    """
    Apply detected repo information to models in registry

    Args:
        results: Results from scan_models_for_repo()
        registry: UserModelRegistry instance

    Returns:
        Number of models updated
    """
    updated_count = 0

    for model, repo_id, repo_type in results["detected"]:
        success = registry.update_model(model.name, {"repo_id": repo_id, "repo_type": repo_type})

        if success:
            updated_count += 1

    return updated_count


================================================
FILE: kt-kernel/python/cli/utils/run_configs.py
================================================
"""
Configuration save/load for kt run command.

Manages saved run configurations bound to specific models.
"""

from pathlib import Path
from typing import Dict, List, Optional, Any
from datetime import datetime
import yaml


CONFIG_FILE = Path.home() / ".ktransformers" / "run_configs.yaml"


class RunConfigManager:
    """Manager for saved run configurations."""

    def __init__(self):
        self.config_file = CONFIG_FILE
        self._ensure_config_file()

    def _ensure_config_file(self):
        """Ensure config file exists."""
        if not self.config_file.exists():
            self.config_file.parent.mkdir(parents=True, exist_ok=True)
            self._save_data({"version": "1.0", "configs": {}})

    def _load_data(self) -> Dict:
        """Load raw config data."""
        try:
            with open(self.config_file, "r", encoding="utf-8") as f:
                return yaml.safe_load(f) or {"version": "1.0", "configs": {}}
        except Exception:
            return {"version": "1.0", "configs": {}}

    def _save_data(self, data: Dict):
        """Save raw config data."""
        with open(self.config_file, "w", encoding="utf-8") as f:
            yaml.dump(data, f, allow_unicode=True, default_flow_style=False)

    def list_configs(self, model_id: str) -> List[Dict[str, Any]]:
        """List all saved configs for a model.

        Returns:
            List of config dicts with 'config_name' and other fields.
        """
        data = self._load_data()
        configs = data.get("configs", {}).get(model_id, [])
        return configs if isinstance(configs, list) else []

    def save_config(self, model_id: str, config: Dict[str, Any]):
        """Save a configuration for a model.

        Args:
            model_id: Model ID to bind config to
            config: Configuration dict with all run parameters
        """
        data = self._load_data()

        if "configs" not in data:
            data["configs"] = {}

        if model_id not in data["configs"]:
            data["configs"][model_id] = []

        # Add timestamp
        config["created_at"] = datetime.now().isoformat()

        # Append config
        data["configs"][model_id].append(config)

        self._save_data(data)

    def delete_config(self, model_id: str, config_index: int) -> bool:
        """Delete a saved configuration.

        Args:
            model_id: Model ID
            config_index: Index of config to delete (0-based)

        Returns:
            True if deleted, False if not found
        """
        data = self._load_data()

        if model_id not in data.get("configs", {}):
            return False

        configs = data["configs"][model_id]
        if config_index < 0 or config_index >= len(configs):
            return False

        configs.pop(config_index)
        self._save_data(data)
        return True

    def get_config(self, model_id: str, config_index: int) -> Optional[Dict[str, Any]]:
        """Get a specific saved configuration.

        Args:
            model_id: Model ID
            config_index: Index of config to get (0-based)

        Returns:
            Config dict or None if not found
        """
        configs = self.list_configs(model_id)
        if config_index < 0 or config_index >= len(configs):
            return None
        return configs[config_index]


================================================
FILE: kt-kernel/python/cli/utils/run_interactive.py
================================================
"""
Interactive configuration for kt run command - New Implementation.

Provides step-by-step interactive configuration for running models.
"""

from typing import Optional, List, Dict, Any, Tuple
from pathlib import Path
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.prompt import Prompt, Confirm
from rich import box
import torch

from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.input_validators import (
    prompt_int_with_retry,
    prompt_float_with_retry,
    prompt_choice_with_retry,
    prompt_int_list_with_retry,
)


console = Console()


def get_gpu_info() -> List[Dict[str, Any]]:
    """Get real-time GPU information with free VRAM."""
    from kt_kernel.cli.utils.environment import detect_gpus

    gpus = detect_gpus()
    gpu_info_list = []

    for i, gpu in enumerate(gpus):
        total_vram_gb = gpu.vram_gb
        free_vram_gb = gpu.vram_gb  # Default fallback

        # Try to get real-time free VRAM
        if torch.cuda.is_available() and i < torch.cuda.device_count():
            try:
                free_vram_bytes, total_vram_bytes = torch.cuda.mem_get_info(i)
                free_vram_gb = free_vram_bytes / (1024**3)
                total_vram_gb = total_vram_bytes / (1024**3)
            except Exception:
                pass  # Use fallback values

        gpu_info_list.append(
            {
                "id": i,
                "name": gpu.name,
                "total_vram_gb": total_vram_gb,
                "free_vram_gb": free_vram_gb,
            }
        )

    return gpu_info_list


def select_model() -> Optional[Any]:
    """Step 1: Select a safetensors MoE model.

    Returns:
        Selected UserModel object or None if cancelled.
    """
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
    from kt_kernel.cli.commands.model import is_amx_weights

    registry = UserModelRegistry()
    all_models = registry.list_models()

    # Filter: safetensors models only (exclude AMX and GGUF)
    # Then filter to only show MoE models (matching kt model list behavior)
    moe_models = []
    for model in all_models:
        if model.format == "safetensors" and model.path_exists():
            is_amx, _ = is_amx_weights(model.path)
            if not is_amx:
                # Only include MoE models (is_moe == True)
                # Also include models not yet analyzed (is_moe == None) for backwards compatibility
                if model.is_moe is True or model.is_moe is None:
                    moe_models.append(model)

    if not moe_models:
        console.print(f"[yellow]{t('run_int_no_moe_models')}[/yellow]")
        console.print(f"  {t('run_int_add_models')}")
        console.print(f"  {t('run_int_list_all')}")
        return None

    console.print()
    console.print(Panel(f"[bold cyan]{t('run_int_step1_title')}[/bold cyan]", expand=False))
    console.print()

    # Display models using same format as kt model list
    from kt_kernel.cli.utils.model_scanner import format_size
    from kt_kernel.cli.commands.model import SHA256_STATUS_MAP

    table = Table(box=box.ROUNDED, show_header=True, header_style="bold cyan")
    table.add_column("#", justify="right", style="cyan", no_wrap=True)
    table.add_column("Name", style="cyan", no_wrap=True)
    table.add_column("Path", style="dim", overflow="fold")
    table.add_column("Total", justify="right")
    table.add_column("Exps", justify="center", style="yellow")
    table.add_column("Act", justify="center", style="green")
    table.add_column("MoE Size", justify="right", style="cyan")
    table.add_column("Repo", style="dim", overflow="fold")
    table.add_column("SHA256", justify="center")

    for i, model in enumerate(moe_models, 1):
        # Calculate size
        if model.path_exists():
            path_obj = Path(model.path)
            try:
                files = list(path_obj.glob("*.safetensors"))
                total_size = sum(f.stat().st_size for f in files if f.exists())
                size_display = format_size(total_size)
            except:
                size_display = "[dim]-[/dim]"
        else:
            size_display = "[dim]-[/dim]"

        # Format MoE info
        experts = f"[yellow]{model.moe_num_experts}[/yellow]" if model.moe_num_experts else "[dim]-[/dim]"
        active = f"[green]{model.moe_num_experts_per_tok}[/green]" if model.moe_num_experts_per_tok else "[dim]-[/dim]"
        moe_size = f"[cyan]{size_display}[/cyan]" if model.moe_num_experts else "[dim]-[/dim]"

        # Format repo info
        if model.repo_id:
            repo_abbr = "hf" if model.repo_type == "huggingface" else "ms"
            repo_display = f"{repo_abbr}:{model.repo_id}"
        else:
            repo_display = "[dim]-[/dim]"

        # Format SHA256 status
        sha256_display = SHA256_STATUS_MAP.get(model.sha256_status, model.sha256_status)

        table.add_row(
            str(i),
            model.name,
            str(model.path),
            size_display,
            experts,
            active,
            moe_size,
            repo_display,
            sha256_display,
        )

    console.print(table)
    console.print()

    choice = prompt_int_with_retry(
        t("run_int_select_model"),
        default=1,
        min_val=1,
        max_val=len(moe_models),
    )

    return moe_models[choice - 1]


def select_inference_method(model: Any) -> Optional[Dict[str, Any]]:
    """Step 2: Select inference method.

    Args:
        model: Selected UserModel

    Returns:
        Dict with 'method' (raw/amx/gguf/saved), and method-specific fields, or None if cancelled.
    """
    from kt_kernel.cli.utils.run_configs import RunConfigManager

    config_manager = RunConfigManager()
    saved_configs = config_manager.list_configs(model.id)

    # Debug output (can be removed later)
    if False:  # Set to True for debugging
        console.print()
        console.print(f"[dim]DEBUG: Model ID: {model.id}[/dim]")
        console.print(f"[dim]DEBUG: Saved configs count: {len(saved_configs)}[/dim]")
        if saved_configs:
            console.print(f"[dim]DEBUG: Configs: {[c.get('config_name', '?') for c in saved_configs]}[/dim]")
        console.print()

    console.print()
    console.print(Panel("[bold cyan]Step 2: Select Inference Method[/bold cyan]", expand=False))
    console.print()

    options = []
    option_map = {}

    # Option 1: Use saved configuration (if any)
    if saved_configs:
        option_idx = len(options) + 1
        console.print(f"  [cyan][{option_idx}][/cyan] [bold]Use Saved Configuration[/bold]")
        console.print(f"      [dim]{len(saved_configs)} saved config(s) available[/dim]")
        options.append(str(option_idx))
        option_map[str(option_idx)] = "saved"

    # Option 2: Raw precision inference
    option_idx = len(options) + 1
    console.print(f"  [cyan][{option_idx}][/cyan] [bold]Raw Precision Inference[/bold]")
    console.print("      [dim]FP8 / FP8_PERCHANNEL / BF16 / RAWINT4[/dim]")
    options.append(str(option_idx))
    option_map[str(option_idx)] = "raw"

    # Option 3: AMX quantized inference
    option_idx = len(options) + 1
    console.print(f"  [cyan][{option_idx}][/cyan] [bold]AMX Quantized Inference[/bold]")
    console.print("      [dim]INT4 / INT8 (CPU optimized)[/dim]")
    options.append(str(option_idx))
    option_map[str(option_idx)] = "amx"

    # Option 4: GGUF inference
    option_idx = len(options) + 1
    console.print(f"  [cyan][{option_idx}][/cyan] [bold]GGUF Inference[/bold]")
    console.print("      [dim]Llamafile format[/dim]")
    options.append(str(option_idx))
    option_map[str(option_idx)] = "gguf"

    console.print()

    choice = prompt_choice_with_retry("Select method", choices=options, default="1")
    method = option_map[choice]

    if method == "saved":
        return _select_saved_config(model, saved_configs)
    elif method == "raw":
        return _configure_raw_inference(model)
    elif method == "amx":
        return _configure_amx_inference(model)
    elif method == "gguf":
        return _configure_gguf_inference(model)

    return None


def _select_saved_config(model: Any, saved_configs: List[Dict]) -> Optional[Dict[str, Any]]:
    """Select from saved configurations with detailed display."""
    console.print()
    console.print("[bold]Saved Configurations:[/bold]")
    console.print()

    for i, cfg in enumerate(saved_configs, 1):
        # Build method display
        method_display = cfg.get("inference_method", "unknown").upper()
        kt_method = cfg.get("kt_method", "unknown")

        if cfg.get("inference_method") == "raw":
            raw_method = cfg.get("raw_method", "unknown")
            method_display = f"{raw_method}"
        elif cfg.get("inference_method") == "amx":
            method_display = kt_method
        elif cfg.get("inference_method") == "gguf":
            method_display = "LLAMAFILE"
        else:
            method_display = kt_method

        # Display config header
        console.print(f"  [cyan][{i}][/cyan] [bold]{cfg.get('config_name', f'Config {i}')}[/bold]")
        console.print()

        # Display detailed parameters
        console.print(f"      [yellow]KT Method:[/yellow]       {method_display}")
        console.print(f"      [yellow]NUMA Nodes:[/yellow]      {cfg.get('numa_nodes', '?')}")
        console.print(f"      [yellow]CPU Threads:[/yellow]     {cfg.get('cpu_threads', '?')}")
        console.print(f"      [yellow]GPU Experts:[/yellow]     {cfg.get('gpu_experts', '?')}")
        console.print(f"      [yellow]TP Size:[/yellow]         {cfg.get('tp_size', '?')}")
        console.print(f"      [yellow]Memory Fraction:[/yellow] {cfg.get('mem_fraction_static', '?')}")
        console.print(f"      [yellow]Server:[/yellow]          {cfg.get('host', '0.0.0.0')}:{cfg.get('port', 30000)}")

        # Display KV cache info if present
        if cfg.get("kv_cache"):
            console.print(f"      [yellow]KV Cache:[/yellow]        {cfg.get('kv_cache', '?')}")
            console.print(f"      [yellow]Chunk Prefill:[/yellow]   {cfg.get('chunk_prefill', '?')}")
            console.print(f"      [yellow]GPU Prefill Thr:[/yellow] {cfg.get('gpu_prefill_threshold', '?')}")

        # Display parser info if present
        if cfg.get("tool_call_parser") or cfg.get("reasoning_parser"):
            if cfg.get("tool_call_parser"):
                console.print(f"      [yellow]Tool Call Parser:[/yellow] {cfg.get('tool_call_parser')}")
            if cfg.get("reasoning_parser"):
                console.print(f"      [yellow]Reasoning Parser:[/yellow] {cfg.get('reasoning_parser')}")

        console.print()

        # Build and display command preview
        cmd_preview = _build_command_preview(model, cfg)
        console.print("      [dim]Command:[/dim]")
        console.print()
        for line in cmd_preview:
            console.print(f"      {line}")
        console.print()

    choice = prompt_int_with_retry(
        "Select configuration",
        default=1,
        min_val=1,
        max_val=len(saved_configs),
    )

    selected_config = saved_configs[choice - 1].copy()
    selected_config["method"] = "saved"
    return selected_config


def _build_command_preview(model: Any, cfg: Dict[str, Any]) -> List[str]:
    """Build command preview for saved configuration.

    Args:
        model: UserModel object
        cfg: Saved configuration dict

    Returns:
        List of command lines for display
    """
    import sys

    host = cfg.get("host", "0.0.0.0")
    port = cfg.get("port", 30000)

    lines = [
        "python -m sglang.launch_server \\",
        f"    --host {host} \\",
        f"    --port {port} \\",
        f"    --model {cfg.get('model_path', '?')} \\",
        f"    --kt-weight-path {cfg.get('weights_path', '?')} \\",
        f"    --kt-cpuinfer {cfg.get('cpu_threads', '?')} \\",
        f"    --kt-threadpool-count {cfg.get('numa_nodes', '?')} \\",
        f"    --kt-num-gpu-experts {cfg.get('gpu_experts', '?')} \\",
        f"    --kt-method {cfg.get('kt_method', '?')} \\",
    ]

    # Add GPU prefill threshold (use saved value or default)
    gpu_prefill = cfg.get("gpu_prefill_threshold", 500)
    lines.append(f"    --kt-gpu-prefill-token-threshold {gpu_prefill} \\")
    lines.append("    --kt-enable-dynamic-expert-update \\")

    # Add attention backend
    lines.append("    --attention-backend flashinfer \\")
    lines.append("    --trust-remote-code \\")

    # Add memory and performance settings
    lines.append(f"    --mem-fraction-static {cfg.get('mem_fraction_static', 0.9)} \\")

    # Add KV cache settings
    chunk_prefill = cfg.get("chunk_prefill", 32768)
    max_tokens = cfg.get("kv_cache", 32768)
    lines.append(f"    --chunked-prefill-size {chunk_prefill} \\")
    lines.append(f"    --max-total-tokens {max_tokens} \\")

    lines.append("    --max-running-requests 4 \\")
    lines.append("    --watchdog-timeout 3000 \\")
    lines.append("    --enable-mixed-chunk \\")

    # Add TP size (will be updated with actual GPU selection)
    lines.append(f"    --tensor-parallel-size {cfg.get('tp_size', '?')} \\")
    lines.append("    --enable-p2p-check \\")

    # Add FP8 backend if using FP8
    kt_method = cfg.get("kt_method", "")
    if "FP8" in kt_method.upper():
        lines.append("    --fp8-gemm-backend triton \\")

    # Add parsers if configured
    if cfg.get("tool_call_parser"):
        lines.append(f"    --tool-call-parser {cfg['tool_call_parser']} \\")
    if cfg.get("reasoning_parser"):
        lines.append(f"    --reasoning-parser {cfg['reasoning_parser']} \\")

    # Remove trailing backslash from last line
    if lines:
        lines[-1] = lines[-1].rstrip(" \\")

    return lines


def _configure_raw_inference(model: Any) -> Dict[str, Any]:
    """Configure raw precision inference."""
    console.print()
    console.print("[bold]Select Raw Precision Type:[/bold]")
    console.print()
    console.print("  [cyan][1][/cyan] FP8")
    console.print("  [cyan][2][/cyan] FP8_PERCHANNEL")
    console.print("  [cyan][3][/cyan] BF16")
    console.print("  [cyan][4][/cyan] RAWINT4")
    console.print()

    choice = prompt_choice_with_retry("Select precision", choices=["1", "2", "3", "4"], default="1")

    precision_map = {
        "1": "FP8",
        "2": "FP8_PERCHANNEL",
        "3": "BF16",
        "4": "RAWINT4",
    }

    raw_method = precision_map[choice]

    return {
        "method": "raw",
        "raw_method": raw_method,
        "kt_method": raw_method,
        "model_path": model.path,
        "weights_path": model.path,  # Same as model path for raw
    }


def _configure_amx_inference(model: Any) -> Optional[Dict[str, Any]]:
    """Configure AMX quantized inference."""
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
    from kt_kernel.cli.commands.model import is_amx_weights

    registry = UserModelRegistry()
    all_models = registry.list_models()

    # Filter AMX models
    amx_models = []
    for m in all_models:
        if m.format == "safetensors":
            is_amx, numa = is_amx_weights(m.path)
            if is_amx:
                # Check if it's derived from the selected model
                if m.amx_source_model == model.name:
                    amx_models.insert(0, m)  # Prioritize matched models
                else:
                    amx_models.append(m)

    if not amx_models:
        console.print("[yellow]No AMX quantized models found.[/yellow]")
        console.print("  Quantize your model with: [cyan]kt quant[/cyan]")
        return None

    console.print()
    console.print("[bold]Select AMX Weights:[/bold]")
    console.print()

    for i, m in enumerate(amx_models, 1):
        is_amx, numa = is_amx_weights(m.path)
        method_str = m.amx_quant_method.upper() if m.amx_quant_method else "Unknown"
        match_indicator = "[green]★[/green]" if m.amx_source_model == model.name else " "
        console.print(f"  {match_indicator} [cyan][{i}][/cyan] {m.name}")
        console.print(
            f"      [dim]Method: AMX{method_str}, NUMA: {numa}, Source: {m.amx_source_model or 'Unknown'}[/dim]"
        )

    console.print()
    choice = prompt_int_with_retry(
        "Select AMX weights",
        default=1,
        min_val=1,
        max_val=len(amx_models),
    )

    selected_amx = amx_models[choice - 1]
    is_amx, numa = is_amx_weights(selected_amx.path)
    kt_method = f"AMX{selected_amx.amx_quant_method.upper()}" if selected_amx.amx_quant_method else "AMXINT4"

    return {
        "method": "amx",
        "kt_method": kt_method,
        "model_path": model.path,
        "weights_path": selected_amx.path,
        "amx_numa_nodes": numa,
    }


def _configure_gguf_inference(model: Any) -> Optional[Dict[str, Any]]:
    """Configure GGUF inference."""
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry

    registry = UserModelRegistry()
    all_models = registry.list_models()

    # Filter GGUF models
    gguf_models = [m for m in all_models if m.format == "gguf"]

    if not gguf_models:
        console.print("[yellow]No GGUF models found.[/yellow]")
        console.print("  Add GGUF models with: [cyan]kt model add /path/to/model.gguf[/cyan]")
        return None

    console.print()
    console.print("[bold]Select GGUF Weights:[/bold]")
    console.print()

    for i, m in enumerate(gguf_models, 1):
        console.print(f"  [cyan][{i}][/cyan] {m.name}")
        console.print(f"      [dim]Path: {m.path}[/dim]")

    console.print()
    choice = prompt_int_with_retry(
        "Select GGUF weights",
        default=1,
        min_val=1,
        max_val=len(gguf_models),
    )

    selected_gguf = gguf_models[choice - 1]

    return {
        "method": "gguf",
        "kt_method": "LLAMAFILE",
        "model_path": model.path,
        "weights_path": selected_gguf.path,
    }


def configure_numa_and_cpu(method_config: Dict[str, Any]) -> Dict[str, int]:
    """Step 3: Configure NUMA and CPU threads.

    Args:
        method_config: Config from step 2 (may contain amx_numa_nodes hint)

    Returns:
        Dict with 'numa_nodes' and 'cpu_threads'
    """
    from kt_kernel.cli.utils.environment import detect_cpu_info

    cpu_info = detect_cpu_info()
    max_numa = cpu_info.numa_nodes
    max_cores = cpu_info.threads  # Use logical threads instead of physical cores

    console.print()
    console.print(Panel("[bold cyan]Step 3: NUMA and CPU Configuration[/bold cyan]", expand=False))
    console.print()

    # Show AMX hint if applicable
    if method_config.get("method") == "amx" and method_config.get("amx_numa_nodes"):
        amx_numa = method_config["amx_numa_nodes"]
        console.print(f"[yellow]⚠ Note: This AMX model was quantized with NUMA={amx_numa}[/yellow]")
        console.print(f"[yellow]  For optimal performance, use the same NUMA setting.[/yellow]")
        console.print()
        default_numa = amx_numa
    else:
        default_numa = max_numa

    numa_nodes = prompt_int_with_retry(
        f"NUMA Nodes (1 to {max_numa})",
        default=default_numa,
        min_val=1,
        max_val=max_numa,
    )

    default_threads = int(max_cores * 0.8)
    cpu_threads = prompt_int_with_retry(
        f"CPU Threads (1 to {max_cores})",
        default=default_threads,
        min_val=1,
        max_val=max_cores,
    )

    return {
        "numa_nodes": numa_nodes,
        "cpu_threads": cpu_threads,
    }


def configure_gpu_experts(model: Any) -> int:
    """Step 4: Configure GPU expert count.

    Args:
        model: Selected model

    Returns:
        Number of GPU experts
    """
    from kt_kernel.cli.utils.analyze_moe_model import analyze_moe_model

    console.print()
    console.print(Panel("[bold cyan]Step 4: GPU Experts Configuration[/bold cyan]", expand=False))
    console.print()

    # Try to get num_experts from model
    try:
        moe_result = analyze_moe_model(model.path)
        num_experts = moe_result.get("num_experts", 256)
    except Exception:
        num_experts = 256  # Default fallback

    console.print(f"[dim]Model has {num_experts} experts total[/dim]")
    console.print()
    console.print("[yellow]⚠ Tip: More GPU experts = faster inference, but uses more VRAM[/yellow]")
    console.print()

    default_experts = min(8, num_experts)
    gpu_experts = prompt_int_with_retry(
        f"GPU Experts per layer (0 to {num_experts})",
        default=default_experts,
        min_val=0,
        max_val=num_experts,
    )

    return gpu_experts


def configure_kv_cache(is_raw_inference: bool) -> Optional[Dict[str, int]]:
    """Step 5: Configure KV Cache (only for raw inference).

    Args:
        is_raw_inference: True if using raw precision inference

    Returns:
        Dict with 'kv_cache', 'chunk_prefill', 'gpu_prefill_threshold' or None if not applicable
    """
    if not is_raw_inference:
        return None

    console.print()
    console.print(Panel("[bold cyan]Step 5: KV Cache and Prefill Configuration[/bold cyan]", expand=False))
    console.print()
    console.print("[dim]These settings control memory allocation and prefill batch size[/dim]")
    console.print("[dim]gpu-prefill-token-threshold: maximum length for single layerwise prefill[/dim]")
    console.print()

    kv_cache = prompt_int_with_retry("KV Cache Size (max_total_tokens)", default=32768, min_val=1)
    chunk_prefill = prompt_int_with_retry("Chunk Prefill Size", default=32768, min_val=1)
    gpu_prefill_threshold = prompt_int_with_retry("GPU Prefill Token Threshold", default=500, min_val=1)

    return {
        "kv_cache": kv_cache,
        "chunk_prefill": chunk_prefill,
        "gpu_prefill_threshold": gpu_prefill_threshold,
    }


def select_gpus_and_tp(
    required_tp_size: Optional[int] = None, saved_mem_fraction: Optional[float] = None
) -> Tuple[List[int], int, float]:
    """Step 6: Select GPUs, TP size, and memory fraction.

    Args:
        required_tp_size: If specified, user must select exactly this many GPUs.
                         If None, TP size can be any power of 2.
        saved_mem_fraction: If specified, use this memory fraction instead of prompting.
                           Used when loading saved configurations.

    Returns:
        Tuple of (selected_gpu_ids, tp_size, mem_fraction_static)
    """
    gpu_info_list = get_gpu_info()

    if not gpu_info_list:
        console.print("[red]No GPUs detected[/red]")
        return [], 0, 0.9

    console.print()
    if required_tp_size is not None:
        console.print(Panel(f"[bold cyan]Select {required_tp_size} GPUs (for saved config)[/bold cyan]", expand=False))
        console.print()
        console.print(f"[yellow]Required TP size: {required_tp_size}[/yellow]")
        console.print(f"[yellow]You must select exactly {required_tp_size} GPU(s)[/yellow]")
    else:
        console.print(Panel("[bold cyan]Step 6: GPU Selection and Memory[/bold cyan]", expand=False))
        console.print()
        console.print("[dim]TP (Tensor Parallel) size must be a power of 2: 1, 2, 4, 8, ...[/dim]")
    console.print()

    # Display GPUs
    table = Table(box=box.ROUNDED, show_header=True, header_style="bold cyan")
    table.add_column("ID", justify="right", style="cyan")
    table.add_column("Name", style="white")
    table.add_column("Free VRAM", justify="right", style="green")
    table.add_column("Total VRAM", justify="right", style="dim")

    for gpu in gpu_info_list:
        table.add_row(str(gpu["id"]), gpu["name"], f"{gpu['free_vram_gb']:.1f} GB", f"{gpu['total_vram_gb']:.1f} GB")

    console.print(table)
    console.print()

    # Validator function
    def validate_tp_requirements(gpu_ids: List[int]) -> tuple[bool, Optional[str]]:
        """Validate TP requirements based on required_tp_size."""
        actual_count = len(gpu_ids)

        if required_tp_size is not None:
            # Exact count required
            if actual_count != required_tp_size:
                return False, f"Must select exactly {required_tp_size} GPU(s), but you selected {actual_count}."
        else:
            # Must be power of 2
            if actual_count & (actual_count - 1) != 0:
                return (
                    False,
                    f"TP size ({actual_count}) must be a power of 2. Valid sizes: 1, 2, 4, 8, 16, 32, ...\nYou selected {actual_count} GPU(s). Please select a different number.",
                )

        return True, None

    # Generate default GPU selection
    if required_tp_size is not None:
        # For saved config: select first N GPUs
        if required_tp_size <= len(gpu_info_list):
            default_gpus = ",".join(str(i) for i in range(required_tp_size))
        else:
            default_gpus = ",".join(str(i) for i in range(len(gpu_info_list)))
        prompt_text = f"Enter {required_tp_size} GPU ID(s) separated by commas (e.g., 0,1,2,3)"
    else:
        # For new config: select all GPUs
        default_gpus = ",".join(str(i) for i in range(len(gpu_info_list)))
        prompt_text = "Enter GPU IDs separated by commas (e.g., 0,1,2,3)"
        console.print(prompt_text)
        console.print(f"  Or press Enter to use all {len(gpu_info_list)} GPUs")

    console.print()

    selected_gpu_ids = prompt_int_list_with_retry(
        "GPU IDs",
        default=default_gpus,
        min_val=0,
        max_val=len(gpu_info_list) - 1,
        validator=validate_tp_requirements,
    )

    tp_size = len(selected_gpu_ids)

    console.print()
    console.print(f"[green]✓[/green] Selected {tp_size} GPU(s): {selected_gpu_ids}")
    console.print()

    # Memory fraction - use saved value if provided, otherwise prompt
    if saved_mem_fraction is not None:
        mem_fraction = saved_mem_fraction
        console.print(f"[dim]Using saved memory fraction: {mem_fraction}[/dim]")
    else:
        mem_fraction = prompt_float_with_retry(
            "Static Memory Fraction (0.0-1.0)",
            default=0.9,
            min_val=0.0,
            max_val=1.0,
        )

    return selected_gpu_ids, tp_size, mem_fraction


def configure_parsers() -> Dict[str, Optional[str]]:
    """Step 7: Configure parsers (optional).

    Returns:
        Dict with 'tool_call_parser' and 'reasoning_parser' (can be None)
    """
    console.print()
    console.print(Panel("[bold cyan]Step 7: Parser Configuration (Optional)[/bold cyan]", expand=False))
    console.print()
    console.print("[dim]Press Enter to skip (no parser will be added)[/dim]")
    console.print()

    tool_call_parser = Prompt.ask("Tool Call Parser (e.g., glm47)", default="")
    tool_call_parser = tool_call_parser.strip() if tool_call_parser else None

    reasoning_parser = Prompt.ask("Reasoning Parser (e.g., glm45)", default="")
    reasoning_parser = reasoning_parser.strip() if reasoning_parser else None

    if tool_call_parser or reasoning_parser:
        console.print()
        if tool_call_parser:
            console.print(f"[green]✓[/green] Tool Call Parser: {tool_call_parser}")
        if reasoning_parser:
            console.print(f"[green]✓[/green] Reasoning Parser: {reasoning_parser}")
    else:
        console.print()
        console.print("[dim]No parsers configured[/dim]")

    return {
        "tool_call_parser": tool_call_parser,
        "reasoning_parser": reasoning_parser,
    }


def configure_host_and_port() -> Dict[str, Any]:
    """Step 8: Configure host and port with availability check.

    Returns:
        Dict with 'host' and 'port'
    """
    from kt_kernel.cli.utils.port_checker import is_port_available

    console.print()
    console.print(Panel("[bold cyan]Step 8: Server Configuration[/bold cyan]", expand=False))
    console.print()

    # Get host
    host = Prompt.ask("Server Host", default="0.0.0.0")

    # Get port with availability check
    while True:
        port = prompt_int_with_retry(
            "Server Port",
            default=30000,
            min_val=1024,
            max_val=65535,
        )

        # Check if port is available
        console.print()
        console.print(f"[dim]Checking port {port} availability...[/dim]")

        if is_port_available(host, port):
            console.print(f"[green]✓[/green] Port {port} is available")
            break
        else:
            console.print(f"[red]✗[/red] Port {port} is already in use")
            console.print()

            # Suggest next available port
            from kt_kernel.cli.utils.port_checker import find_available_port

            found, suggested_port = find_available_port(host, port + 1, max_attempts=100)
            if found:
                console.print(f"[yellow]Suggestion:[/yellow] Port {suggested_port} is available")
            console.print()

    console.print()
    console.print(f"[green]✓[/green] Server will listen on {host}:{port}")

    return {
        "host": host,
        "port": port,
    }


def save_config_prompt(model: Any, full_config: Dict[str, Any]) -> bool:
    """Step 7: Prompt to save configuration.

    Args:
        model: Selected model
        full_config: Complete configuration dict

    Returns:
        True if saved, False otherwise
    """
    console.print()
    console.print(Panel("[bold cyan]Step 7: Save Configuration[/bold cyan]", expand=False))
    console.print()

    if not Confirm.ask("Save this configuration for future use?", default=True):
        return False

    config_name = Prompt.ask("Configuration name", default=f"Config {full_config.get('inference_method', 'default')}")

    from kt_kernel.cli.utils.run_configs import RunConfigManager

    config_manager = RunConfigManager()

    # Prepare config to save (exclude runtime-only fields and non-serializable objects)
    save_config = {
        "config_name": config_name,
        "inference_method": full_config["inference_method"],
        "kt_method": full_config["kt_method"],
        "model_path": str(full_config["model_path"]),
        "weights_path": str(full_config["weights_path"]),
        "numa_nodes": full_config["numa_nodes"],
        "cpu_threads": full_config["cpu_threads"],
        "gpu_experts": full_config["gpu_experts"],
        "tp_size": full_config["tp_size"],
        "mem_fraction_static": full_config["mem_fraction_static"],
        "host": full_config["host"],
        "port": full_config["port"],
        # Note: selected_gpus is NOT saved - user will select GPUs when loading config
    }

    # Add parser config if present
    if full_config.get("tool_call_parser"):
        save_config["tool_call_parser"] = full_config["tool_call_parser"]
    if full_config.get("reasoning_parser"):
        save_config["reasoning_parser"] = full_config["reasoning_parser"]

    # Add raw-specific config if present
    if full_config.get("raw_method"):
        save_config["raw_method"] = full_config["raw_method"]

    if full_config.get("kv_cache"):
        save_config["kv_cache"] = full_config["kv_cache"]
        save_config["chunk_prefill"] = full_config["chunk_prefill"]
        save_config["gpu_prefill_threshold"] = full_config["gpu_prefill_threshold"]

    config_manager.save_config(model.id, save_config)

    console.print()
    console.print(f"[green]✓[/green] Configuration saved: {config_name}")

    return True


def interactive_run_config() -> Optional[Dict[str, Any]]:
    """
    Main interactive configuration flow for kt run.

    Returns:
        Complete configuration dict or None if cancelled.
    """
    # Step 1: Select model
    model = select_model()
    if not model:
        return None

    # Step 2: Select inference method
    method_config = select_inference_method(model)
    if not method_config:
        return None

    # If using saved config, add model object and return directly
    if method_config.get("method") == "saved":
        console.print()
        console.print("[green]✓[/green] Using saved configuration")

        # Let user select GPUs (must match saved TP size)
        saved_tp_size = method_config.get("tp_size", 1)

        console.print()
        console.print(f"[yellow]This configuration requires TP={saved_tp_size}[/yellow]")
        console.print(f"[yellow]Please select {saved_tp_size} GPU(s)[/yellow]")

        # Get saved memory fraction
        saved_mem_fraction = method_config.get("mem_fraction_static", 0.9)

        selected_gpus, actual_tp_size, _ = select_gpus_and_tp(
            required_tp_size=saved_tp_size, saved_mem_fraction=saved_mem_fraction
        )
        if not selected_gpus:
            return None

        # Update config with selected GPUs (keep saved mem_fraction_static)
        method_config["selected_gpus"] = selected_gpus
        # tp_size is already in method_config from saved data

        # Check port availability
        from kt_kernel.cli.utils.port_checker import is_port_available, find_available_port

        saved_host = method_config.get("host", "0.0.0.0")
        saved_port = method_config.get("port", 30000)

        console.print()
        console.print(f"[dim]Checking port {saved_port} availability...[/dim]")

        if is_port_available(saved_host, saved_port):
            console.print(f"[green]✓[/green] Port {saved_port} is available")
            method_config["port"] = saved_port
            method_config["host"] = saved_host
        else:
            console.print(f"[red]✗[/red] Port {saved_port} is already in use")
            console.print()

            # Suggest next available port
            found, suggested_port = find_available_port(saved_host, saved_port + 1, max_attempts=100)
            if found:
                console.print(f"[yellow]Suggestion:[/yellow] Port {suggested_port} is available")
            console.print()

            # Ask user for new port
            while True:
                new_port = prompt_int_with_retry(
                    "Enter new port",
                    default=suggested_port if found else saved_port + 1,
                    min_val=1024,
                    max_val=65535,
                )

                console.print()
                console.print(f"[dim]Checking port {new_port} availability...[/dim]")

                if is_port_available(saved_host, new_port):
                    console.print(f"[green]✓[/green] Port {new_port} is available")
                    method_config["port"] = new_port
                    method_config["host"] = saved_host
                    break
                else:
                    console.print(f"[red]✗[/red] Port {new_port} is already in use")
                    console.print()

        # Add model object for run.py compatibility
        method_config["model"] = model

        # Ensure paths are Path objects
        from pathlib import Path

        if "model_path" in method_config:
            method_config["model_path"] = Path(method_config["model_path"])
        if "weights_path" in method_config:
            method_config["weights_path"] = Path(method_config["weights_path"])

        # Display configuration summary
        console.print()
        console.print(Panel("[bold cyan]Saved Configuration[/bold cyan]", expand=False))
        console.print()
        _display_config_summary(method_config)
        console.print()

        # Start directly without confirmation when using saved config
        return method_config

    # Step 3: Configure NUMA and CPU
    numa_cpu_config = configure_numa_and_cpu(method_config)

    # Step 4: Configure GPU experts
    gpu_experts = configure_gpu_experts(model)

    # Step 5: Configure KV Cache (only for raw)
    is_raw = method_config.get("method") == "raw"
    kv_config = configure_kv_cache(is_raw)

    # Step 6: Select GPUs and TP
    selected_gpus, tp_size, mem_fraction = select_gpus_and_tp()
    if not selected_gpus:
        return None

    # Step 7: Configure parsers (optional)
    parser_config = configure_parsers()

    # Step 8: Configure host and port
    server_config = configure_host_and_port()

    # Build complete configuration
    full_config = {
        "model": model,
        "inference_method": method_config["method"],
        "kt_method": method_config["kt_method"],
        "model_path": method_config["model_path"],
        "weights_path": method_config["weights_path"],
        **numa_cpu_config,
        "gpu_experts": gpu_experts,
        "selected_gpus": selected_gpus,
        "tp_size": tp_size,
        "mem_fraction_static": mem_fraction,
        **parser_config,  # Add parser config
        **server_config,  # Add server config (host, port)
    }

    # Add raw-specific config
    if kv_config:
        full_config["raw_method"] = method_config.get("raw_method")
        full_config.update(kv_config)

    # Step 9: Save configuration
    save_config_prompt(model, full_config)

    # Final confirmation
    console.print()
    console.print(Panel("[bold cyan]Configuration Complete[/bold cyan]", expand=False))
    console.print()
    _display_config_summary(full_config)
    console.print()

    if not Confirm.ask("[bold green]Start model server with this configuration?[/bold green]", default=True):
        console.print("[yellow]Cancelled[/yellow]")
        return None

    return full_config


def _display_config_summary(config: Dict[str, Any]):
    """Display configuration summary."""
    model = config["model"]
    console.print(f"  Model:           {model.name}")
    console.print(f"  KT Method:       {config['kt_method']}")
    console.print(f"  NUMA Nodes:      {config['numa_nodes']}")
    console.print(f"  CPU Threads:     {config['cpu_threads']}")
    console.print(f"  GPU Experts:     {config['gpu_experts']}")

    # Handle both new config and saved config format
    tp_size = config.get("tp_size", len(config.get("selected_gpus", [])))
    selected_gpus = config.get("selected_gpus", [])

    console.print(f"  GPUs:            {selected_gpus} (TP={tp_size})")
    console.print(f"  Memory Fraction: {config['mem_fraction_static']}")

    # Server config
    host = config.get("host", "0.0.0.0")
    port = config.get("port", 30000)
    console.print(f"  Server:          {host}:{port}")

    if config.get("kv_cache"):
        console.print(f"  KV Cache:        {config['kv_cache']}")
        console.print(f"  Chunk Prefill:   {config['chunk_prefill']}")
        console.print(f"  GPU Prefill Thr: {config['gpu_prefill_threshold']}")

    # Display parsers if configured
    if config.get("tool_call_parser") or config.get("reasoning_parser"):
        console.print()
        if config.get("tool_call_parser"):
            console.print(f"  Tool Call Parser: {config['tool_call_parser']}")
        if config.get("reasoning_parser"):
            console.print(f"  Reasoning Parser: {config['reasoning_parser']}")


================================================
FILE: kt-kernel/python/cli/utils/sglang_checker.py
================================================
"""
SGLang installation checker and installation instructions provider.

This module provides utilities to:
- Check if SGLang is installed and get its metadata
- Provide installation instructions when SGLang is not found
"""

import subprocess
import sys
from pathlib import Path
from typing import Optional

from kt_kernel.cli.i18n import t
from kt_kernel.cli.utils.console import console


def check_sglang_installation() -> dict:
    """Check if SGLang is installed and get its metadata.

    Returns:
        dict with keys:
        - installed: bool
        - version: str or None
        - location: str or None (installation path)
        - editable: bool (whether installed in editable mode)
        - git_info: dict or None (git remote and branch if available)
        - from_source: bool (whether installed from source repository)
    """
    try:
        # Try to import sglang
        import sglang

        version = getattr(sglang, "__version__", None)

        # Use pip show to get detailed package information
        location = None
        editable = False
        git_info = None
        from_source = False
        is_kvcache_fork = False  # True if installed as sglang-kt package

        try:
            # Get pip show output (try sglang-kt first, then sglang)
            result = subprocess.run(
                [sys.executable, "-m", "pip", "show", "sglang-kt"],
                capture_output=True,
                text=True,
                timeout=10,
            )
            if result.returncode == 0:
                is_kvcache_fork = True  # sglang-kt package name proves it's the fork
            else:
                result = subprocess.run(
                    [sys.executable, "-m", "pip", "show", "sglang"],
                    capture_output=True,
                    text=True,
                    timeout=10,
                )

            if result.returncode == 0:
                pip_info = {}
                for line in result.stdout.split("\n"):
                    if ":" in line:
                        key, value = line.split(":", 1)
                        pip_info[key.strip()] = value.strip()

                location = pip_info.get("Location")
                editable_location = pip_info.get("Editable project location")

                if editable_location:
                    editable = True
                    location = editable_location
        except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
            # Fallback to module location
            if hasattr(sglang, "__file__") and sglang.__file__:
                location = str(Path(sglang.__file__).parent.parent)

        # Check if it's installed from source (has .git directory)
        if location:
            git_root = None
            check_path = Path(location)

            # Check current directory and up to 2 parent directories
            for _ in range(3):
                git_dir = check_path / ".git"
                if git_dir.exists():
                    git_root = check_path
                    from_source = True
                    break
                if check_path.parent == check_path:  # Reached root
                    break
                check_path = check_path.parent

            if from_source and git_root:
                # Try to get git remote and branch info
                try:
                    # Get remote URL
                    result = subprocess.run(
                        ["git", "remote", "get-url", "origin"],
                        cwd=git_root,
                        capture_output=True,
                        text=True,
                        timeout=5,
                    )
                    remote_url = result.stdout.strip() if result.returncode == 0 else None

                    # Extract org/repo from URL
                    remote_short = None
                    if remote_url:
                        # Handle both https and git@ URLs
                        if "github.com" in remote_url:
                            parts = remote_url.rstrip("/").replace(".git", "").split("github.com")[-1]
                            remote_short = parts.lstrip("/").lstrip(":")

                    # Get current branch
                    result = subprocess.run(
                        ["git", "branch", "--show-current"],
                        cwd=git_root,
                        capture_output=True,
                        text=True,
                        timeout=5,
                    )
                    branch = result.stdout.strip() if result.returncode == 0 else None

                    if remote_url or branch:
                        git_info = {
                            "remote": remote_short or remote_url,
                            "branch": branch,
                        }
                except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
                    pass

        return {
            "installed": True,
            "version": version,
            "location": location,
            "editable": editable,
            "git_info": git_info,
            "from_source": from_source,
            "is_kvcache_fork": is_kvcache_fork,
        }
    except ImportError:
        return {
            "installed": False,
            "version": None,
            "location": None,
            "editable": False,
            "git_info": None,
            "from_source": False,
            "is_kvcache_fork": False,
        }


def get_sglang_install_instructions(lang: Optional[str] = None) -> str:
    """Get SGLang installation instructions.

    Args:
        lang: Language code ('en' or 'zh'). If None, uses current language setting.

    Returns:
        Formatted installation instructions string.
    """
    from kt_kernel.cli.i18n import get_lang

    if lang is None:
        lang = get_lang()

    if lang == "zh":
        return """
[bold yellow]SGLang \u672a\u5b89\u88c5[/bold yellow]

\u8bf7\u9009\u62e9\u4ee5\u4e0b\u65b9\u5f0f\u4e4b\u4e00\u5b89\u88c5 SGLang (kvcache-ai \u5206\u652f):

[bold]\u65b9\u5f0f A - \u4e00\u952e\u5b89\u88c5 (\u63a8\u8350):[/bold]
   \u4ece ktransformers \u6839\u76ee\u5f55\u8fd0\u884c:
   [cyan]./install.sh[/cyan]

[bold]\u65b9\u5f0f B - pip \u5b89\u88c5:[/bold]
   [cyan]pip install sglang-kt[/cyan]

[bold]\u65b9\u5f0f C - \u4ece\u6e90\u7801\u5b89\u88c5:[/bold]
   git clone --recursive https://github.com/kvcache-ai/ktransformers.git
   cd ktransformers
   pip install "third_party/sglang/python[all]"

[dim]\u6ce8\u610f: \u8bf7\u786e\u4fdd\u5728\u6b63\u786e\u7684 Python \u73af\u5883\u4e2d\u6267\u884c\u4ee5\u4e0a\u547d\u4ee4[/dim]
"""
    else:
        return """
[bold yellow]SGLang is not installed[/bold yellow]

Install SGLang (kvcache-ai fork) using one of these methods:

[bold]Option A - One-click install (recommended):[/bold]
   From the ktransformers root directory, run:
   [cyan]./install.sh[/cyan]

[bold]Option B - pip install:[/bold]
   [cyan]pip install sglang-kt[/cyan]

[bold]Option C - From source:[/bold]
   git clone --recursive https://github.com/kvcache-ai/ktransformers.git
   cd ktransformers
   pip install "third_party/sglang/python[all]"

[dim]Note: Make sure to run these commands in the correct Python environment[/dim]
"""


def print_sglang_install_instructions() -> None:
    """Print SGLang installation instructions to console."""
    instructions = get_sglang_install_instructions()
    console.print(instructions)


def check_sglang_and_warn() -> bool:
    """Check if SGLang is installed, print warning if not.

    Returns:
        True if SGLang is installed, False otherwise.
    """
    info = check_sglang_installation()

    if not info["installed"]:
        print_sglang_install_instructions()
        return False

    # Check if installed from PyPI (not recommended)
    if info["installed"] and not info["from_source"]:
        from kt_kernel.cli.utils.console import print_warning

        print_warning(t("sglang_pypi_warning"))
        console.print()
        console.print("[dim]" + t("sglang_recommend_source") + "[/dim]")
        console.print()

    return True


def _get_sglang_kt_kernel_cache_path() -> Path:
    """Get the path to the sglang kt-kernel support cache file."""
    cache_dir = Path.home() / ".ktransformers" / "cache"
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir / "sglang_kt_kernel_supported"


def _is_sglang_kt_kernel_cache_valid() -> bool:
    """Check if the sglang kt-kernel support cache is valid.

    The cache is considered valid if:
    1. The cache file exists
    2. The cache file contains 'true' (indicating previous check passed)

    Returns:
        True if cache is valid and indicates support, False otherwise.
    """
    cache_path = _get_sglang_kt_kernel_cache_path()
    if cache_path.exists():
        try:
            content = cache_path.read_text().strip().lower()
            return content == "true"
        except (OSError, IOError):
            pass
    return False


def _save_sglang_kt_kernel_cache(supported: bool) -> None:
    """Save the sglang kt-kernel support check result to cache."""
    cache_path = _get_sglang_kt_kernel_cache_path()
    try:
        cache_path.write_text("true" if supported else "false")
    except (OSError, IOError):
        pass  # Ignore cache write errors


def clear_sglang_kt_kernel_cache() -> None:
    """Clear the sglang kt-kernel support cache, forcing a re-check on next run."""
    cache_path = _get_sglang_kt_kernel_cache_path()
    try:
        if cache_path.exists():
            cache_path.unlink()
    except (OSError, IOError):
        pass


def check_sglang_kt_kernel_support(use_cache: bool = True, silent: bool = False) -> dict:
    """Check if SGLang supports kt-kernel parameters (--kt-gpu-prefill-token-threshold).

    This function runs `python -m sglang.launch_server --help` and checks if the
    output contains the `--kt-gpu-prefill-token-threshold` parameter. This parameter
    is only available in the kvcache-ai/sglang fork, not in the official sglang.

    The result is cached after the first successful check to avoid repeated checks.

    Args:
        use_cache: If True, use cached result if available. Default is True.
        silent: If True, don't print checking message. Default is False.

    Returns:
        dict with keys:
        - supported: bool - True if kt-kernel parameters are supported
        - help_output: str or None - The help output from sglang.launch_server
        - error: str or None - Error message if check failed
        - from_cache: bool - True if result was from cache
    """
    from kt_kernel.cli.utils.console import print_step

    # Check cache first
    if use_cache and _is_sglang_kt_kernel_cache_valid():
        return {
            "supported": True,
            "help_output": None,
            "error": None,
            "from_cache": True,
        }

    # Print checking message
    if not silent:
        print_step(t("sglang_checking_kt_kernel_support"))

    try:
        result = subprocess.run(
            [sys.executable, "-m", "sglang.launch_server", "--help"],
            capture_output=True,
            text=True,
            timeout=90,  # Increased for slow CUDA init and module loading in some environments
        )

        help_output = result.stdout + result.stderr

        # Check if --kt-gpu-prefill-token-threshold is in the help output
        supported = "--kt-gpu-prefill-token-threshold" in help_output

        # Save to cache if supported
        if supported:
            _save_sglang_kt_kernel_cache(True)

        return {
            "supported": supported,
            "help_output": help_output,
            "error": None,
            "from_cache": False,
        }

    except subprocess.TimeoutExpired:
        return {
            "supported": False,
            "help_output": None,
            "error": "Timeout while checking sglang.launch_server --help",
            "from_cache": False,
        }
    except FileNotFoundError:
        return {
            "supported": False,
            "help_output": None,
            "error": "Python interpreter not found",
            "from_cache": False,
        }
    except Exception as e:
        return {
            "supported": False,
            "help_output": None,
            "error": str(e),
            "from_cache": False,
        }


def print_sglang_kt_kernel_instructions() -> None:
    """Print instructions for installing the kvcache-ai fork of SGLang with kt-kernel support."""
    from kt_kernel.cli.i18n import get_lang

    lang = get_lang()

    if lang == "zh":
        instructions = """
[bold red]SGLang 不支持 kt-kernel[/bold red]

您当前安装的 SGLang 不包含 kt-kernel 支持。
kt-kernel 需要使用 kvcache-ai 维护的 SGLang 分支。

[bold]请按以下步骤重新安装:[/bold]

[cyan]1. 卸载当前的 SGLang:[/cyan]
   pip uninstall sglang -y

[cyan]2. 安装 kvcache-ai 版本 (选择一种方式):[/cyan]

   [bold]方式 A - 一键安装 (推荐):[/bold]
   从 ktransformers 根目录运行: ./install.sh

   [bold]方式 B - pip 安装:[/bold]
   pip install sglang-kt

[dim]注意: 请确保在正确的 Python 环境中执行以上命令[/dim]
"""
    else:
        instructions = """
[bold red]SGLang does not support kt-kernel[/bold red]

Your current SGLang installation does not include kt-kernel support.
kt-kernel requires the kvcache-ai maintained fork of SGLang.

[bold]Please reinstall SGLang:[/bold]

[cyan]1. Uninstall current SGLang:[/cyan]
   pip uninstall sglang -y

[cyan]2. Install the kvcache-ai fork (choose one):[/cyan]

   [bold]Option A - One-click install (recommended):[/bold]
   From the ktransformers root directory, run: ./install.sh

   [bold]Option B - pip install:[/bold]
   pip install sglang-kt

[dim]Note: Make sure to run these commands in the correct Python environment[/dim]
"""
    console.print(instructions)


================================================
FILE: kt-kernel/python/cli/utils/tuna_engine.py
================================================
"""
Tuna engine for auto-tuning GPU experts configuration.

Automatically finds the maximum viable num-gpu-experts through binary search
by testing actual server launches with different configurations.
"""

import json
import math
import random
import subprocess
import sys
import time
from pathlib import Path
from typing import Optional

from kt_kernel.cli.utils.console import console, print_error, print_info, print_warning


def get_num_experts(model_path: Path) -> int:
    """
    Get the number of experts per layer from model config.

    Args:
        model_path: Path to the model directory

    Returns:
        Number of experts per layer

    Raises:
        ValueError: If config.json not found or num_experts field missing
    """
    config_file = model_path / "config.json"

    if not config_file.exists():
        raise ValueError(f"config.json not found in {model_path}")

    try:
        config = json.loads(config_file.read_text())
    except Exception as e:
        raise ValueError(f"Failed to parse config.json: {e}")

    # Different models may use different field names
    possible_keys = [
        "num_experts_per_tok",  # DeepSeek
        "num_local_experts",  # Mixtral
        "n_routed_experts",  # Qwen
        "num_experts",  # Generic
    ]

    for key in possible_keys:
        if key in config:
            return config[key]

    raise ValueError(f"Cannot find num_experts field in {config_file}. " f"Tried: {', '.join(possible_keys)}")


def detect_oom(log_line: Optional[str]) -> bool:
    """
    Detect OOM (Out Of Memory) errors from log output.

    Args:
        log_line: A line from server output

    Returns:
        True if OOM detected, False otherwise
    """
    if log_line is None:
        return False

    log_lower = log_line.lower()

    oom_patterns = [
        "cuda out of memory",
        "out of memory",
        "outofmemoryerror",
        "oom",
        "failed to allocate",
        "cumemalloc failed",
        "cumemallocasync failed",
        "allocation failed",
    ]

    return any(pattern in log_lower for pattern in oom_patterns)


def test_config(
    num_gpu_experts: int,
    model_path: Path,
    config: dict,
    verbose: bool = False,
) -> tuple[bool, float]:
    """
    Test if a configuration with given num_gpu_experts works.

    Args:
        num_gpu_experts: Number of GPU experts to test
        model_path: Path to the model
        config: Configuration dict with all parameters
        verbose: Whether to show detailed logs

    Returns:
        (success: bool, elapsed_time: float)
        - success: True if server starts and inference works
        - elapsed_time: Time taken for the test
    """
    start_time = time.time()

    # Use random port to avoid conflicts
    test_port = random.randint(30000, 40000)

    # Build command
    cmd = [
        sys.executable,
        "-m",
        "sglang.launch_server",
        "--model",
        str(model_path),
        "--port",
        str(test_port),
        "--host",
        "127.0.0.1",
        "--tensor-parallel-size",
        str(config["tensor_parallel_size"]),
        "--kt-num-gpu-experts",
        str(num_gpu_experts),
        "--max-total-tokens",
        str(config["max_total_tokens"]),
    ]

    # Add kt-kernel options
    if config.get("weights_path"):
        cmd.extend(["--kt-weight-path", str(config["weights_path"])])
    else:
        cmd.extend(["--kt-weight-path", str(model_path)])

    cmd.extend(
        [
            "--kt-cpuinfer",
            str(config.get("cpu_threads", 64)),
            "--kt-threadpool-count",
            str(config.get("numa_nodes", 2)),
            "--kt-method",
            config.get("kt_method", "AMXINT4"),
            "--kt-gpu-prefill-token-threshold",
            str(config.get("kt_gpu_prefill_threshold", 4096)),
        ]
    )

    # Add other SGLang options
    if config.get("attention_backend"):
        cmd.extend(["--attention-backend", config["attention_backend"]])

    cmd.extend(
        [
            "--trust-remote-code",
            "--mem-fraction-static",
            str(config.get("mem_fraction_static", 0.98)),
            "--chunked-prefill-size",
            str(config.get("chunked_prefill_size", 4096)),
            "--max-running-requests",
            str(config.get("max_running_requests", 1)),  # Use 1 for faster testing
            "--watchdog-timeout",
            str(config.get("watchdog_timeout", 3000)),
            "--enable-mixed-chunk",
            "--enable-p2p-check",
        ]
    )

    # Add disable-shared-experts-fusion if specified
    if config.get("disable_shared_experts_fusion"):
        cmd.append("--disable-shared-experts-fusion")

    # Add extra args
    if config.get("extra_args"):
        cmd.extend(config["extra_args"])

    if verbose:
        console.print(f"[dim]Command: {' '.join(cmd)}[/dim]")

    # Start process
    try:
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
            env=config.get("env"),
        )
    except Exception as e:
        if verbose:
            print_error(f"Failed to start process: {e}")
        return False, time.time() - start_time

    # Monitor process output
    timeout = 60  # Maximum 60 seconds to wait
    server_ready = False

    try:
        while time.time() - start_time < timeout:
            # Check if process has output
            if process.poll() is not None:
                # Process exited
                if verbose:
                    print_warning("Process exited early")
                return False, time.time() - start_time

            # Read output line (non-blocking)
            try:
                line = process.stdout.readline()
                if not line:
                    time.sleep(0.1)
                    continue

                if verbose:
                    console.print(f"[dim]{line.rstrip()}[/dim]")

                # Fast OOM detection
                if detect_oom(line):
                    if verbose:
                        print_warning(f"OOM detected: {line.rstrip()}")
                    process.terminate()
                    try:
                        process.wait(timeout=2)
                    except subprocess.TimeoutExpired:
                        process.kill()
                    return False, time.time() - start_time

                # Check for startup success
                if "Uvicorn running" in line or "Application startup complete" in line:
                    server_ready = True
                    break

            except Exception as e:
                if verbose:
                    print_warning(f"Error reading output: {e}")
                break

        if not server_ready:
            # Timeout or failed to start
            process.terminate()
            try:
                process.wait(timeout=2)
            except subprocess.TimeoutExpired:
                process.kill()
            return False, time.time() - start_time

        # Server is ready, test inference
        success = test_inference(test_port, verbose=verbose)

        # Cleanup
        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait(timeout=2)

        return success, time.time() - start_time

    except KeyboardInterrupt:
        # User cancelled
        process.terminate()
        try:
            process.wait(timeout=2)
        except subprocess.TimeoutExpired:
            process.kill()
        raise
    except Exception as e:
        if verbose:
            print_error(f"Test failed with exception: {e}")
        try:
            process.terminate()
            process.wait(timeout=2)
        except:
            try:
                process.kill()
            except:
                pass
        return False, time.time() - start_time


def test_inference(port: int, verbose: bool = False) -> bool:
    """
    Test if the server can handle a simple inference request.

    Args:
        port: Server port
        verbose: Whether to show detailed logs

    Returns:
        True if inference succeeds, False otherwise
    """
    try:
        # Wait a bit for server to be fully ready
        time.sleep(2)

        # Try to import OpenAI client
        try:
            from openai import OpenAI
        except ImportError:
            if verbose:
                print_warning("OpenAI package not available, skipping inference test")
            return True  # Assume success if we can't test

        client = OpenAI(
            base_url=f"http://127.0.0.1:{port}/v1",
            api_key="test",
        )

        # Send a simple test request
        response = client.chat.completions.create(
            model="test",
            messages=[{"role": "user", "content": "Hi"}],
            max_tokens=1,
            temperature=0,
            timeout=10,
        )

        # Check if we got a valid response
        success = response.choices and len(response.choices) > 0 and response.choices[0].message.content is not None

        if verbose:
            if success:
                print_info(f"Inference test passed: {response.choices[0].message.content}")
            else:
                print_warning("Inference test failed: no valid response")

        return success

    except Exception as e:
        if verbose:
            print_warning(f"Inference test failed: {e}")
        return False


def find_max_gpu_experts(
    model_path: Path,
    config: dict,
    verbose: bool = False,
) -> int:
    """
    Binary search to find the maximum viable num_gpu_experts.

    Args:
        model_path: Path to the model
        config: Configuration dict
        verbose: Whether to show detailed logs

    Returns:
        Maximum number of GPU experts that works
    """
    # Get number of experts from model config
    try:
        num_experts = get_num_experts(model_path)
    except ValueError as e:
        print_error(str(e))
        raise

    console.print()
    console.print(f"Binary search range: [0, {num_experts}]")
    console.print()

    left, right = 0, num_experts
    result = 0
    iteration = 0
    total_iterations = math.ceil(math.log2(num_experts + 1))

    while left <= right:
        iteration += 1
        mid = (left + right) // 2

        console.print(f"[{iteration}/{total_iterations}] Testing gpu-experts={mid}... ", end="")

        success, elapsed = test_config(mid, model_path, config, verbose=verbose)

        if success:
            console.print(f"[green]✓ OK[/green] ({elapsed:.1f}s)")
            result = mid
            left = mid + 1
        else:
            console.print(f"[red]✗ FAILED[/red] ({elapsed:.1f}s)")
            right = mid - 1

    return result


def run_tuna(
    model_path: Path,
    tensor_parallel_size: int,
    max_total_tokens: int,
    kt_method: str,
    verbose: bool = False,
    **kwargs,
) -> int:
    """
    Run tuna auto-tuning to find optimal num_gpu_experts.

    Args:
        model_path: Path to the model
        tensor_parallel_size: Tensor parallel size
        max_total_tokens: Maximum total tokens
        kt_method: KT quantization method
        verbose: Whether to show detailed logs
        **kwargs: Additional configuration parameters

    Returns:
        Optimal num_gpu_experts value

    Raises:
        ValueError: If tuning fails completely
    """
    # Prepare configuration
    config = {
        "tensor_parallel_size": tensor_parallel_size,
        "max_total_tokens": max_total_tokens,
        "kt_method": kt_method,
        **kwargs,
    }

    # Run binary search
    try:
        result = find_max_gpu_experts(model_path, config, verbose=verbose)
    except KeyboardInterrupt:
        console.print()
        print_warning("Tuning cancelled by user")
        raise

    console.print()

    # Check if even 0 doesn't work
    if result == 0:
        console.print("[yellow]Testing if gpu-experts=0 is viable...[/yellow]")
        success, _ = test_config(0, model_path, config, verbose=verbose)

        if not success:
            # Even 0 doesn't work
            console.print()
            print_error("Failed to start server even with all experts on CPU (gpu-experts=0)")
            console.print()
            console.print("[bold]Possible reasons:[/bold]")
            console.print("  • Insufficient GPU memory for base model layers")
            console.print("  • max-total-tokens is too large for available VRAM")
            console.print("  • Tensor parallel configuration issue")
            console.print()
            console.print("[bold]Suggestions:[/bold]")
            console.print(f"  • Reduce --max-total-tokens (current: {max_total_tokens})")
            console.print(f"  • Reduce --tensor-parallel-size (current: {tensor_parallel_size})")
            console.print("  • Use more GPUs or GPUs with more VRAM")
            console.print("  • Try a smaller model")
            console.print()
            raise ValueError("Minimum GPU memory requirements not met")
        else:
            # 0 works but nothing more
            console.print()
            print_warning("All experts will run on CPU (gpu-experts=0). " "Performance will be limited by CPU speed.")

    return result


================================================
FILE: kt-kernel/python/cli/utils/user_model_registry.py
================================================
"""
User Model Registry

Manages user-registered models in ~/.ktransformers/user_models.yaml
"""

from dataclasses import dataclass, asdict, field
from datetime import datetime
from pathlib import Path
from typing import Optional, List, Dict, Any
import yaml


# Constants
USER_MODELS_FILE = Path.home() / ".ktransformers" / "user_models.yaml"
REGISTRY_VERSION = "1.0"


@dataclass
class UserModel:
    """Represents a user-registered model"""

    name: str  # User-editable name (default: folder name)
    path: str  # Absolute path to model directory
    format: str  # "safetensors" | "gguf"
    id: Optional[str] = None  # Unique UUID for this model (auto-generated if None)
    repo_type: Optional[str] = None  # "huggingface" | "modelscope" | None
    repo_id: Optional[str] = None  # e.g., "deepseek-ai/DeepSeek-V3"
    sha256_status: str = "not_checked"  # "not_checked" | "checking" | "passed" | "failed" | "no_repo"
    gpu_model_ids: Optional[List[str]] = None  # For llamafile/AMX: list of GPU model UUIDs to run with
    created_at: str = field(default_factory=lambda: datetime.now().isoformat())
    last_verified: Optional[str] = None  # ISO format datetime
    # MoE information (cached from analyze_moe_model)
    is_moe: Optional[bool] = None  # True if MoE model, False if non-MoE, None if not analyzed
    moe_num_experts: Optional[int] = None  # Total number of experts (for MoE models)
    moe_num_experts_per_tok: Optional[int] = None  # Number of active experts per token (for MoE models)
    # AMX quantization metadata (for format == "amx")
    amx_source_model: Optional[str] = None  # Name of the source MoE model that was quantized
    amx_quant_method: Optional[str] = None  # "int4" | "int8"
    amx_numa_nodes: Optional[int] = None  # Number of NUMA nodes used for quantization

    def __post_init__(self):
        """Ensure ID is set after initialization"""
        if self.id is None:
            import uuid

            self.id = str(uuid.uuid4())

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for YAML serialization"""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "UserModel":
        """Create from dictionary loaded from YAML"""
        return cls(**data)

    def path_exists(self) -> bool:
        """Check if model path still exists"""
        return Path(self.path).exists()


class UserModelRegistry:
    """Manages the user model registry"""

    def __init__(self, registry_file: Optional[Path] = None):
        """
        Initialize the registry

        Args:
            registry_file: Path to the registry YAML file (default: USER_MODELS_FILE)
        """
        self.registry_file = registry_file or USER_MODELS_FILE
        self.models: List[UserModel] = []
        self.version = REGISTRY_VERSION

        # Ensure directory exists
        self.registry_file.parent.mkdir(parents=True, exist_ok=True)

        # Load existing registry
        self.load()

    def load(self) -> None:
        """Load models from YAML file"""
        if not self.registry_file.exists():
            # Initialize empty registry
            self.models = []
            self.save()  # Create the file
            return

        try:
            with open(self.registry_file, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f)

            if not data:
                self.models = []
                return

            # Load version
            self.version = data.get("version", REGISTRY_VERSION)

            # Load models
            models_data = data.get("models", [])
            self.models = [UserModel.from_dict(m) for m in models_data]

            # Migrate: ensure all models have UUIDs (for backward compatibility)
            needs_save = False
            for model in self.models:
                if model.id is None:
                    import uuid

                    model.id = str(uuid.uuid4())
                    needs_save = True

            if needs_save:
                self.save()

        except Exception as e:
            raise RuntimeError(f"Failed to load user model registry: {e}")

    def save(self) -> None:
        """Save models to YAML file"""
        data = {"version": self.version, "models": [m.to_dict() for m in self.models]}

        try:
            with open(self.registry_file, "w", encoding="utf-8") as f:
                yaml.safe_dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        except Exception as e:
            raise RuntimeError(f"Failed to save user model registry: {e}")

    def add_model(self, model: UserModel) -> None:
        """
        Add a model to the registry

        Args:
            model: UserModel instance to add

        Raises:
            ValueError: If a model with the same name already exists
        """
        if self.check_name_conflict(model.name):
            raise ValueError(f"Model with name '{model.name}' already exists")

        self.models.append(model)
        self.save()

    def remove_model(self, name: str) -> bool:
        """
        Remove a model from the registry

        Args:
            name: Name of the model to remove

        Returns:
            True if model was removed, False if not found
        """
        original_count = len(self.models)
        self.models = [m for m in self.models if m.name != name]

        if len(self.models) < original_count:
            self.save()
            return True
        return False

    def update_model(self, name: str, updates: Dict[str, Any]) -> bool:
        """
        Update a model's attributes

        Args:
            name: Name of the model to update
            updates: Dictionary of attributes to update

        Returns:
            True if model was updated, False if not found
        """
        model = self.get_model(name)
        if not model:
            return False

        # Update attributes
        for key, value in updates.items():
            if hasattr(model, key):
                setattr(model, key, value)

        self.save()
        return True

    def get_model(self, name: str) -> Optional[UserModel]:
        """
        Get a model by name

        Args:
            name: Name of the model

        Returns:
            UserModel instance or None if not found
        """
        for model in self.models:
            if model.name == name:
                return model
        return None

    def get_model_by_id(self, model_id: str) -> Optional[UserModel]:
        """
        Get a model by its unique ID

        Args:
            model_id: UUID of the model

        Returns:
            UserModel instance or None if not found
        """
        for model in self.models:
            if model.id == model_id:
                return model
        return None

    def list_models(self) -> List[UserModel]:
        """
        List all models

        Returns:
            List of all UserModel instances
        """
        return self.models.copy()

    def find_by_path(self, path: str) -> Optional[UserModel]:
        """
        Find a model by its path

        Args:
            path: Model directory path

        Returns:
            UserModel instance or None if not found
        """
        # Normalize paths for comparison
        search_path = str(Path(path).resolve())

        for model in self.models:
            model_path = str(Path(model.path).resolve())
            if model_path == search_path:
                return model
        return None

    def check_name_conflict(self, name: str, exclude_name: Optional[str] = None) -> bool:
        """
        Check if a name conflicts with existing models

        Args:
            name: Name to check
            exclude_name: Optional name to exclude from check (for rename operations)

        Returns:
            True if conflict exists, False otherwise
        """
        for model in self.models:
            if model.name == name and model.name != exclude_name:
                return True
        return False

    def refresh_status(self) -> Dict[str, List[str]]:
        """
        Check all models and identify missing ones

        Returns:
            Dictionary with 'valid' and 'missing' lists of model names
        """
        valid = []
        missing = []

        for model in self.models:
            if model.path_exists():
                valid.append(model.name)
            else:
                missing.append(model.name)

        return {"valid": valid, "missing": missing}

    def get_model_count(self) -> int:
        """Get total number of registered models"""
        return len(self.models)

    def suggest_name(self, base_name: str) -> str:
        """
        Suggest a unique name based on base_name

        Args:
            base_name: Base name to derive from

        Returns:
            A unique name (may have suffix like -2, -3 etc.)
        """
        if not self.check_name_conflict(base_name):
            return base_name

        counter = 2
        while True:
            candidate = f"{base_name}-{counter}"
            if not self.check_name_conflict(candidate):
                return candidate
            counter += 1


================================================
FILE: kt-kernel/python/experts.py
================================================
# Wrapper for MoE CPU inference operations
# This module encapsulates CPU inference engine, weight loading, and buffer management
# SPDX-License-Identifier: Apache-2.0

"""
Expert wrappers for CPU-based MoE inference.

This module provides the main factory interface (KTMoEWrapper) that automatically
selects the appropriate backend implementation based on the method parameter.
"""

from __future__ import annotations

import torch
from typing import List, Optional

# Import base infrastructure
from .experts_base import BaseMoEWrapper, KExpertsCPUBuffer

# Import backend implementations
from .utils.amx import AMXMoEWrapper, NativeMoEWrapper
from .utils.llamafile import LlamafileMoEWrapper
from .utils.moe_kernel import GeneralMoEWrapper


class KTMoEWrapper:
    """
    Factory interface for MoE CPU inference operations.

    This class serves as the main entry point for external code. It automatically
    selects the appropriate backend implementation based on the `method` parameter.

    Usage:
        # Create a mask where experts 0, 2, 5 are on GPU
        gpu_mask = torch.zeros(8, dtype=torch.bool)
        gpu_mask[[0, 2, 5]] = True

        wrapper = KTMoEWrapper(
            layer_idx=0,
            num_experts=8,
            num_experts_per_tok=2,
            hidden_size=4096,
            moe_intermediate_size=14336,
            gpu_experts_mask=gpu_mask,  # or None for all experts on CPU
            cpuinfer_threads=32,
            threadpool_count=2,
            weight_path="/path/to/weights",
            chunked_prefill_size=512,
            method="AMXINT4"  # or "AMXINT8", "LLAMAFILE"
        )
    """

    def __new__(
        cls,
        layer_idx: int,
        num_experts: int,
        num_experts_per_tok: int,
        hidden_size: int,
        moe_intermediate_size: int,
        gpu_experts_mask: Optional[torch.Tensor],
        cpuinfer_threads: int,
        threadpool_count: int,
        weight_path: str,
        chunked_prefill_size: int,
        cpu_save: bool = False,
        max_deferred_experts_per_token: Optional[int] = None,
        method: str = "AMXINT4",
    ):
        """
        Factory method to create the appropriate backend implementation.

        Args:
            layer_idx: Layer index
            num_experts: Total number of experts
            num_experts_per_tok: Number of experts per token (top-k)
            hidden_size: Hidden dimension size
            moe_intermediate_size: MoE intermediate size
            gpu_experts_mask: Boolean mask indicating which experts are on GPU.
                              Shape: [num_experts], dtype: torch.bool.
                              mask[i] = True means expert i is on GPU.
                              If None, all experts are on CPU.
            cpuinfer_threads: Number of CPU inference threads
            threadpool_count: Number of NUMA subpools
            weight_path: Path to weights
            chunked_prefill_size: Maximum prefill chunk size
            cpu_save: Whether to save weights to CPU memory
            max_deferred_experts_per_token: Number of experts per token to defer. Defaults to 0.
            method: Backend method ("AMXINT4", "AMXINT8", "RAWINT4", "FP8", "BF16", "LLAMAFILE", "MOE_INT4", "MOE_INT8")

        Returns:
            An instance of the appropriate backend implementation (e.g., AMXMoEWrapper)
        """
        # Select backend based on method
        if method in ["AMXINT4", "AMXINT8"]:
            backend_cls = AMXMoEWrapper
        elif method in ["RAWINT4", "FP8", "BF16", "FP8_PERCHANNEL"]:
            backend_cls = NativeMoEWrapper
        elif method == "LLAMAFILE":
            backend_cls = LlamafileMoEWrapper
        elif method in ["MOE_INT4", "MOE_INT8"]:
            backend_cls = GeneralMoEWrapper
        else:
            raise NotImplementedError(f"Unsupported method: {method}")

        # Create and return backend instance
        return backend_cls(
            layer_idx=layer_idx,
            num_experts=num_experts,
            num_experts_per_tok=num_experts_per_tok,
            hidden_size=hidden_size,
            moe_intermediate_size=moe_intermediate_size,
            gpu_experts_mask=gpu_experts_mask,
            cpuinfer_threads=cpuinfer_threads,
            threadpool_count=threadpool_count,
            weight_path=weight_path,
            chunked_prefill_size=chunked_prefill_size,
            cpu_save=cpu_save,
            max_deferred_experts_per_token=max_deferred_experts_per_token,
            method=method,
        )

    # Forward static methods to the base class
    @staticmethod
    def set_capture_batch_sizes(capture_bs: List[int]):
        """
        Set batch sizes to capture and cache buffers for.

        This allows pre-allocation of CPU buffers for specific batch sizes,
        improving performance by avoiding buffer re-allocation during inference.

        Args:
            capture_bs: List of batch sizes to capture (e.g., [1, 2, 4, 8, 16])
        """
        BaseMoEWrapper.set_capture_batch_sizes(capture_bs)

    @staticmethod
    def get_capture_batch_sizes() -> List[int]:
        """
        Get currently configured capture batch sizes.

        Returns:
            List of batch sizes that are being captured
        """
        return BaseMoEWrapper.get_capture_batch_sizes()

    @staticmethod
    def clear_buffer_cache():
        """
        Clear all cached buffers.

        This frees up memory by clearing the buffer cache. Useful when you want
        to reset the buffer state or free memory.
        """
        BaseMoEWrapper.clear_buffer_cache()


================================================
FILE: kt-kernel/python/experts_base.py
================================================
# Base classes for MoE CPU inference operations
# SPDX-License-Identifier: Apache-2.0

"""
Base infrastructure for CPU-based MoE inference.

This module contains base classes and utilities shared across all backend implementations.
"""

from __future__ import annotations

import torch
from typing import Dict, List, Optional, Tuple
from abc import ABC, abstractmethod
import os
import ctypes

from kt_kernel import kt_kernel_ext


def generate_gpu_experts_masks(
    activation_freq: torch.Tensor,
    num_gpu_experts: int,
) -> torch.Tensor:
    """
    Generate GPU experts masks based on activation frequency.

    Selects the top `num_gpu_experts` experts with highest activation frequency
    across all layers to be placed on GPU.

    Args:
        activation_freq: Activation frequency table of shape (num_layers, num_experts).
                         Higher values indicate more frequently activated experts.
        num_gpu_experts: Total number of experts to place on GPU across all layers.

    Returns:
        gpu_experts_masks: Boolean mask of shape (num_layers, num_experts) on CPU.
                           True means the expert should be on GPU.

    Example:
        >>> activation_freq = torch.tensor([
        ...     [0.1, 0.5, 0.3, 0.8],  # layer 0
        ...     [0.2, 0.4, 0.9, 0.1],  # layer 1
        ... ])
        >>> masks = generate_gpu_experts_masks(activation_freq, num_gpu_experts=3)
        >>> # Top 3: layer0-expert3 (0.8), layer1-expert2 (0.9), layer0-expert1 (0.5)
        >>> masks
        tensor([[False,  True, False,  True],
                [False, False,  True, False]])
    """
    num_layers, num_experts_per_layer = activation_freq.shape
    total_experts = num_layers * num_experts_per_layer

    # Clamp num_gpu_experts to valid range
    num_gpu_experts = min(num_gpu_experts, total_experts)
    num_gpu_experts = max(num_gpu_experts, 0)

    if num_gpu_experts == 0:
        return torch.zeros(num_layers, num_experts_per_layer, dtype=torch.bool, device="cpu")

    # Flatten and find top-k indices
    flat_freq = activation_freq.view(-1).to(device="cpu")
    _, top_indices = torch.topk(flat_freq, k=num_gpu_experts, largest=True, sorted=False)

    # Create mask
    gpu_experts_masks = torch.zeros(total_experts, dtype=torch.bool, device="cpu")
    gpu_experts_masks[top_indices] = True

    # Reshape to (num_layers, num_experts)
    gpu_experts_masks = gpu_experts_masks.view(num_layers, num_experts_per_layer)

    return gpu_experts_masks


class KExpertsCPUBuffer:
    """
    CPU buffer management for expert computation.

    Manages pinned memory buffers for efficient GPU-CPU data transfer.
    """

    capture_bs: List = list()
    capture_buffers: Dict = dict()
    temp_bs: int = 0
    temp_buffer: tuple = tuple()
    buffer_depth: int = 2

    @classmethod
    def get_buffer(cls, hidden_states: torch.Tensor, num_experts_per_tok):
        hidden_size = hidden_states.shape[-1]
        batch_size = hidden_states.shape[0]

        if batch_size in cls.capture_buffers:
            return cls.capture_buffers[batch_size]
        if batch_size == cls.temp_bs:
            return cls.temp_buffer

        input_tensor_cpu = [
            torch.zeros((batch_size, hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16)
            for _ in range(cls.buffer_depth)
        ]
        immediate_experts_ids_cpu = [
            torch.zeros((batch_size, num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True)
            for _ in range(cls.buffer_depth)
        ]
        deferred_experts_ids_cpu = [
            torch.full((batch_size, num_experts_per_tok), -1, device="cpu", dtype=torch.long, pin_memory=True)
            for _ in range(cls.buffer_depth)
        ]
        weights_cpu = [
            torch.zeros((batch_size, num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True)
            for _ in range(cls.buffer_depth)
        ]
        output_cpu = [
            torch.zeros((batch_size, hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16)
            for _ in range(cls.buffer_depth)
        ]
        bsz_tensor_cpu = [
            torch.full((1,), batch_size, device="cpu", dtype=torch.int32, pin_memory=True)
            for _ in range(cls.buffer_depth)
        ]
        output_gpu = [
            torch.zeros((batch_size, hidden_size), device=hidden_states.device, dtype=hidden_states.dtype)
            for _ in range(cls.buffer_depth)
        ]

        cur_buffer = (
            input_tensor_cpu,
            immediate_experts_ids_cpu,
            deferred_experts_ids_cpu,
            weights_cpu,
            output_cpu,
            bsz_tensor_cpu,
            output_gpu,
        )
        if batch_size in cls.capture_bs:
            cls.capture_buffers[batch_size] = cur_buffer
        cls.temp_bs = batch_size
        cls.temp_buffer = cur_buffer
        return cur_buffer


class BaseMoEWrapper(ABC):
    """
    Base class for MoE CPU inference operations.
    Provides common functionality for all backend implementations.
    """

    _cpu_infer_instance = None
    _layer_has_pending_deferred: Dict[int, bool] = {}

    def __init__(
        self,
        layer_idx: int,
        num_experts: int,
        num_experts_per_tok: int,
        hidden_size: int,
        moe_intermediate_size: int,
        gpu_experts_mask: Optional[torch.Tensor],
        cpuinfer_threads: int,
        threadpool_count: int,
        weight_path: str,
        chunked_prefill_size: int,
        cpu_save: bool = False,
        max_deferred_experts_per_token: Optional[int] = None,
        method: str = "AMXINT4",
    ):
        """
        Initialize base MoE Wrapper.

        Args:
            layer_idx: Layer index
            num_experts: Total number of experts
            num_experts_per_tok: Number of experts per token (top-k)
            hidden_size: Hidden dimension size
            moe_intermediate_size: MoE intermediate size
            gpu_experts_mask: Boolean mask indicating which experts are on GPU.
                              Shape: [num_experts], dtype: torch.bool.
                              mask[i] = True means expert i is on GPU.
                              If None, all experts are on CPU.
            cpuinfer_threads: Number of CPU inference threads
            threadpool_count: Number of NUMA subpools
            weight_path: Path to weights
            chunked_prefill_size: Maximum prefill chunk size
            cpu_save: Whether to save weights to CPU memory
            max_deferred_experts_per_token: Number of experts per token to defer on this layer. Defaults to 0 (no defer).
            method: Backend method string
        """
        self.layer_idx = layer_idx
        self.num_experts = num_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.hidden_size = hidden_size
        self.moe_intermediate_size = moe_intermediate_size

        # Process gpu_experts_mask: convert to bool tensor on CPU, pinned memory for async copy
        # This mask is shared between C and Python (C uses uint8_t*), both can read/write it
        if gpu_experts_mask is None:
            # No GPU experts - all experts on CPU
            self.gpu_experts_mask = torch.zeros(num_experts, dtype=torch.bool, device="cpu", pin_memory=True)
        else:
            # Create a new pinned tensor and copy data into it
            self.gpu_experts_mask = torch.empty(num_experts, dtype=torch.bool, device="cpu", pin_memory=True)
            self.gpu_experts_mask.copy_(gpu_experts_mask)

        self.num_gpu_experts = int(self.gpu_experts_mask.sum().item())

        # GPU copy for mask operations in forward pass (e.g., mask_cpu_expert_ids)
        # This will be lazily initialized when needed
        self._gpu_experts_mask_gpu: Optional[torch.Tensor] = None
        self.weight_path = weight_path
        self.chunked_prefill_size = chunked_prefill_size
        self.cpu_save = cpu_save
        self.max_deferred_experts_per_token = (
            int(max_deferred_experts_per_token) if max_deferred_experts_per_token is not None else 0
        )

        BaseMoEWrapper._layer_has_pending_deferred[self.layer_idx] = False
        self.method = method

        # Initialize CPU inference engine (singleton)
        if BaseMoEWrapper._cpu_infer_instance is None:
            worker_config = kt_kernel_ext.WorkerPoolConfig()

            subpool_numa_map = list(range(threadpool_count))
            subpool_thread_count = [
                cpuinfer_threads // threadpool_count + (1 if i < cpuinfer_threads % threadpool_count else 0)
                for i in range(threadpool_count)
            ]

            worker_config.subpool_count = threadpool_count
            worker_config.subpool_numa_map = subpool_numa_map
            worker_config.subpool_thread_count = subpool_thread_count
            BaseMoEWrapper._cpu_infer_instance = kt_kernel_ext.CPUInfer(worker_config)

        self.cpu_infer = BaseMoEWrapper._cpu_infer_instance

        # Backend-specific initialization happens in subclasses
        self.moe = None

    @abstractmethod
    def load_weights_from_tensors(
        self,
        gate_proj: torch.Tensor,
        up_proj: torch.Tensor,
        down_proj: torch.Tensor,
        physical_to_logical_map_cpu: torch.Tensor,
    ):
        """
        Load and quantize weights from BF16/FP16 tensors (online quantization).

        Args:
            gate_proj: Gate projection weights [num_experts, intermediate_size, hidden_size]
            up_proj: Up projection weights [num_experts, intermediate_size, hidden_size]
            down_proj: Down projection weights [num_experts, hidden_size, intermediate_size]
            physical_to_logical_map_cpu: Mapping from physical to logical expert IDs
        """
        pass

    @abstractmethod
    def load_weights(self, physical_to_logical_map_cpu: torch.Tensor):
        """
        Load weights for this layer and initialize the MoE module.

        Args:
            physical_to_logical_map_cpu: Mapping from physical to logical expert IDs
        """
        pass

    def select_deferred_experts(
        self,
        expert_ids: torch.Tensor,
        expert_scores: torch.Tensor,
        protected_k: int,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        batch, topk = expert_ids.shape
        device = expert_ids.device

        protected_k = max(0, min(int(protected_k), topk))
        if protected_k == 0:
            deferred_ids = expert_ids.clone()
            immediate_ids = torch.full_like(expert_ids, -1)
            return immediate_ids, deferred_ids

        topk_result = torch.topk(expert_scores, k=protected_k, dim=-1, largest=True, sorted=False)
        protected_indices = topk_result.indices
        protected_ids = torch.gather(expert_ids, -1, protected_indices)

        protected_flag = torch.zeros((self.num_experts,), dtype=torch.int32, device=device)
        protected_flag.scatter_(0, protected_ids.reshape(-1), 1)

        protected_mask_flat = torch.gather(protected_flag, 0, expert_ids.reshape(-1)).ne(0)
        protected_mask = protected_mask_flat.view(batch, topk)

        immediate_ids = expert_ids.clone().masked_fill(~protected_mask, -1)
        deferred_ids = expert_ids.clone().masked_fill(protected_mask, -1)

        return immediate_ids, deferred_ids

    def submit_forward(
        self,
        hidden_states: torch.Tensor,
        topk_ids: torch.Tensor,
        topk_weights: torch.Tensor,
        cuda_stream,
    ):
        """
        Submit forward inference task to CPU (non-blocking).

        Args:
            hidden_states: Input hidden states [batch_size, hidden_size]
            topk_ids: Top-k expert IDs [batch_size, num_experts_per_tok]
            topk_weights: Top-k expert weights [batch_size, num_experts_per_tok]
            cuda_stream: CUDA stream for synchronization
        """
        flat_hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        batch_size = flat_hidden_states.shape[0]

        (
            input_tensor_cpu,
            immediate_experts_ids_cpu,
            deferred_experts_ids_cpu,
            weights_cpu,
            output_cpu,
            bsz_tensor_cpu,
            _output_gpu,
        ) = KExpertsCPUBuffer.get_buffer(flat_hidden_states, self.num_experts_per_tok)

        current_slot = self.layer_idx % KExpertsCPUBuffer.buffer_depth
        next_slot = (current_slot + 1) % KExpertsCPUBuffer.buffer_depth

        bsz_slot_tensor = bsz_tensor_cpu[current_slot]

        topk_ids_long = topk_ids.to(torch.long)
        immediate_ids: torch.Tensor
        deferred_ids: Optional[torch.Tensor]
        if self.max_deferred_experts_per_token > 0:
            protected_k = self.num_experts_per_tok - self.max_deferred_experts_per_token

            immediate_ids, deferred_ids = self.select_deferred_experts(topk_ids_long, topk_weights, protected_k)
        else:
            immediate_ids = topk_ids_long
            deferred_ids = None

        input_tensor_cpu[current_slot].copy_(flat_hidden_states, non_blocking=True)
        weights_cpu[current_slot].copy_(topk_weights, non_blocking=True)
        immediate_experts_ids_cpu[current_slot].copy_(immediate_ids, non_blocking=True)

        incremental = BaseMoEWrapper._layer_has_pending_deferred.get(self.layer_idx - 1, False)
        self.cpu_infer.submit_with_cuda_stream(
            cuda_stream,
            self.moe.forward_task(
                bsz_slot_tensor.data_ptr(),
                immediate_experts_ids_cpu[current_slot].size(-1),
                immediate_experts_ids_cpu[current_slot].data_ptr(),
                weights_cpu[current_slot].data_ptr(),
                input_tensor_cpu[current_slot].data_ptr(),
                output_cpu[current_slot].data_ptr(),
                incremental,
            ),
        )

        BaseMoEWrapper._layer_has_pending_deferred[self.layer_idx] = False
        if deferred_ids is not None:
            deferred_experts_ids_cpu[current_slot].copy_(deferred_ids, non_blocking=True)
            self.cpu_infer.submit_with_cuda_stream(
                cuda_stream,
                self.moe.forward_task(
                    bsz_slot_tensor.data_ptr(),
                    deferred_experts_ids_cpu[current_slot].size(-1),
                    deferred_experts_ids_cpu[current_slot].data_ptr(),
                    weights_cpu[current_slot].data_ptr(),
                    input_tensor_cpu[current_slot].data_ptr(),
                    output_cpu[next_slot].data_ptr(),
                    False,
                ),
            )
            BaseMoEWrapper._layer_has_pending_deferred[self.layer_idx] = True

    def sync_forward(self, hidden_states: torch.Tensor, cuda_stream) -> torch.Tensor:
        """
        Synchronize and retrieve forward inference results.

        Args:
            hidden_states: Original input hidden states (for getting buffer)
            cuda_stream: CUDA stream for synchronization

        Returns:
            output_gpu: Output tensor on GPU
        """
        flat_hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        (
            _input_tensor_cpu,
            _immediate_experts_ids_cpu,
            _deferred_experts_ids_cpu,
            _weights_cpu,
            output_cpu,
            _bsz_tensor_cpu,
            output_gpu,
        ) = KExpertsCPUBuffer.get_buffer(flat_hidden_states, self.num_experts_per_tok)

        current_slot = self.layer_idx % KExpertsCPUBuffer.buffer_depth
        allow_pending = 1 if BaseMoEWrapper._layer_has_pending_deferred.get(self.layer_idx, False) else 0
        self.cpu_infer.sync_with_cuda_stream(cuda_stream, allow_pending)
        output_gpu[current_slot].copy_(output_cpu[current_slot], non_blocking=True)
        return output_gpu[current_slot]

    def forward(
        self,
        hidden_states: torch.Tensor,
        topk_ids: torch.Tensor,
        topk_weights: torch.Tensor,
        cuda_stream,
    ) -> torch.Tensor:
        """
        Execute forward inference synchronously (submit + sync).

        Args:
            hidden_states: Input hidden states [batch_size, hidden_size]
            topk_ids: Top-k expert IDs [batch_size, num_experts_per_tok]
            topk_weights: Top-k expert weights [batch_size, num_experts_per_tok]
            cuda_stream: CUDA stream for synchronization

        Returns:
            Output tensor on GPU
        """
        self.submit_forward(hidden_states, topk_ids, topk_weights, cuda_stream)
        return self.sync_forward(hidden_states, cuda_stream)

    @staticmethod
    def set_capture_batch_sizes(capture_bs: List[int]):
        """
        Set batch sizes to capture and cache buffers for.

        This allows pre-allocation of CPU buffers for specific batch sizes,
        improving performance by avoiding buffer re-allocation during inference.

        Args:
            capture_bs: List of batch sizes to capture (e.g., [1, 2, 4, 8, 16])

        Example:
            >>> BaseMoEWrapper.set_capture_batch_sizes([1, 2, 4, 8, 16])
        """
        KExpertsCPUBuffer.capture_bs = capture_bs

    @staticmethod
    def get_capture_batch_sizes() -> List[int]:
        """
        Get currently configured capture batch sizes.

        Returns:
            List of batch sizes that are being captured
        """
        return KExpertsCPUBuffer.capture_bs

    @staticmethod
    def clear_buffer_cache():
        """
        Clear all cached buffers.

        This frees up memory by clearing the buffer cache. Useful when you want
        to reset the buffer state or free memory.
        """
        KExpertsCPUBuffer.capture_buffers.clear()
        KExpertsCPUBuffer.temp_bs = 0
        KExpertsCPUBuffer.temp_buffer = tuple()


================================================
FILE: kt-kernel/python/utils/__init__.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Utilities for kt_kernel package.
"""

from .amx import AMXMoEWrapper, NativeMoEWrapper
from .llamafile import LlamafileMoEWrapper
from .loader import SafeTensorLoader, GGUFLoader, CompressedSafeTensorLoader

__all__ = [
    "AMXMoEWrapper",
    "NativeMoEWrapper",
    "LlamafileMoEWrapper",
    "SafeTensorLoader",
    "CompressedSafeTensorLoader",
    "GGUFLoader",
]


================================================
FILE: kt-kernel/python/utils/amx.py
================================================
import os
import torch
import ctypes
from typing import Optional

# Use relative imports for package structure
from ..experts_base import BaseMoEWrapper
from .loader import SafeTensorLoader, CompressedSafeTensorLoader, FP8SafeTensorLoader, BF16SafeTensorLoader
from kt_kernel_ext.moe import MOEConfig
import kt_kernel_ext.moe as _moe_mod

AMXInt4_MOE = getattr(_moe_mod, "AMXInt4_MOE", None)
AMXInt8_MOE = getattr(_moe_mod, "AMXInt8_MOE", None)
AMXInt4_KGroup_MOE = getattr(_moe_mod, "AMXInt4_KGroup_MOE", None)
AMXFP8_MOE = getattr(_moe_mod, "AMXFP8_MOE", None)
AMXBF16_MOE = getattr(_moe_mod, "AMXBF16_MOE", None)
AMXFP8PerChannel_MOE = getattr(_moe_mod, "AMXFP8PerChannel_MOE", None)

_HAS_AMXINT4_SUPPORT = AMXInt4_MOE is not None
_HAS_AMXINT8_SUPPORT = AMXInt8_MOE is not None
_HAS_RAWINT4_SUPPORT = AMXInt4_KGroup_MOE is not None
_HAS_FP8_SUPPORT = AMXFP8_MOE is not None
_HAS_BF16_SUPPORT = AMXBF16_MOE is not None
_HAS_FP8_PERCHANNEL_SUPPORT = AMXFP8PerChannel_MOE is not None


class AMXMoEWrapper(BaseMoEWrapper):
    """
    AMX-based MoE wrapper implementation.
    Supports AMXINT4 and AMXINT8 quantization methods.
    """

    _safetensor_loader_instance = None  # Singleton SafeTensorLoader

    def __init__(
        self,
        layer_idx: int,
        num_experts: int,
        num_experts_per_tok: int,
        hidden_size: int,
        moe_intermediate_size: int,
        gpu_experts_mask: Optional[torch.Tensor],
        cpuinfer_threads: int,
        threadpool_count: int,
        weight_path: str,
        chunked_prefill_size: int,
        cpu_save: bool = False,
        max_deferred_experts_per_token: Optional[int] = None,
        method: str = "AMXINT4",
    ):
        """
        Initialize AMX MoE Wrapper.

        Args:
            layer_idx: Layer index
            num_experts: Total number of experts
            num_experts_per_tok: Number of experts per token (top-k)
            hidden_size: Hidden dimension size
            moe_intermediate_size: MoE intermediate size
            gpu_experts_mask: Boolean mask indicating which experts are on GPU.
                              Shape: [num_experts], dtype: torch.bool.
                              mask[i] = True means expert i is on GPU.
                              If None, all experts are on CPU.
            cpuinfer_threads: Number of CPU inference threads
            threadpool_count: Number of NUMA subpools
            weight_path: Path to AMX weights (SafeTensor format)
            chunked_prefill_size: Maximum prefill chunk size
            cpu_save: Whether to save weights to CPU memory
            max_deferred_experts_per_token: Number of experts per token to defer. Defaults to 0.
            method: AMX quantization method ("AMXINT4" or "AMXINT8")
        """
        if method == "AMXINT4" and not _HAS_AMXINT4_SUPPORT:
            raise RuntimeError(
                "AMXINT4 backend not available. Required ISA:\n"
                "  - AVX512F + AVX512BW (VNNI optional)\n"
                "Please recompile kt_kernel_ext with AVX512 enabled."
            )
        if method == "AMXINT8" and not _HAS_AMXINT8_SUPPORT:
            raise RuntimeError(
                "AMXINT8 backend not available. Required ISA:\n"
                "  - AVX512F + AVX512BW (VNNI optional)\n"
                "Please recompile kt_kernel_ext with AVX512 enabled."
            )

        # Initialize base class
        super().__init__(
            layer_idx=layer_idx,
            num_experts=num_experts,
            num_experts_per_tok=num_experts_per_tok,
            hidden_size=hidden_size,
            moe_intermediate_size=moe_intermediate_size,
            gpu_experts_mask=gpu_experts_mask,
            cpuinfer_threads=cpuinfer_threads,
            threadpool_count=threadpool_count,
            weight_path=weight_path,
            chunked_prefill_size=chunked_prefill_size,
            cpu_save=cpu_save,
            max_deferred_experts_per_token=max_deferred_experts_per_token,
            method=method,
        )

        # AMX-specific: Check if we should load merged safetensor weights
        self.load_merged_weight = False
        import glob

        if glob.glob(os.path.join(weight_path, "*.safetensors")):
            self.load_merged_weight = True

        # Initialize SafeTensor loader (singleton)
        if self.load_merged_weight:
            if AMXMoEWrapper._safetensor_loader_instance is None:
                AMXMoEWrapper._safetensor_loader_instance = SafeTensorLoader(weight_path)
            self.safetensor_loader = AMXMoEWrapper._safetensor_loader_instance

        # AMX-specific weight storage
        self.gate_weights = None
        self.up_weights = None
        self.down_weights = None
        self.gate_scales = None
        self.up_scales = None
        self.down_scales = None

    def load_weights_from_tensors(
        self,
        gate_proj: torch.Tensor,
        up_proj: torch.Tensor,
        down_proj: torch.Tensor,
        physical_to_logical_map_cpu: torch.Tensor,
    ):
        """
        Load and quantize weights from BF16/FP16 tensors (online quantization).

        Args:
            gate_proj: Gate projection weights [num_experts, intermediate_size, hidden_size]
            up_proj: Up projection weights [num_experts, intermediate_size, hidden_size]
            down_proj: Down projection weights [num_experts, hidden_size, intermediate_size]
            physical_to_logical_map_cpu: Mapping from physical to logical expert IDs
        """
        # Store tensors as instance variables to keep them alive
        self.gate_proj = gate_proj.contiguous()
        self.up_proj = up_proj.contiguous()
        self.down_proj = down_proj.contiguous()

        # Configure MoE with online quantization (cpu_save mode)
        moe_config = MOEConfig(
            self.num_experts,
            self.num_experts_per_tok,
            self.hidden_size,
            self.moe_intermediate_size,
            self.gpu_experts_mask.data_ptr(),
        )
        moe_config.layer_idx = self.layer_idx
        moe_config.pool = self.cpu_infer.backend_
        moe_config.max_len = self.chunked_prefill_size

        # Enable save mode for online quantization
        moe_config.save = True
        moe_config.load = False

        # Set weight pointers
        moe_config.gate_proj = self.gate_proj.data_ptr()
        moe_config.up_proj = self.up_proj.data_ptr()
        moe_config.down_proj = self.down_proj.data_ptr()

        # Set output path for quantized weights
        moe_config.path = self.weight_path

        # Create MoE module based on AMX method
        if self.method == "AMXINT4":
            self.moe = AMXInt4_MOE(moe_config)
        elif self.method == "AMXINT8":
            self.moe = AMXInt8_MOE(moe_config)
        else:
            raise NotImplementedError(f"Unsupported AMX method: {self.method}")

        # Submit quantization and save task
        self.cpu_infer.submit(self.moe.load_weights_task(physical_to_logical_map_cpu.data_ptr()))
        self.cpu_infer.sync()

    def load_weights(self, physical_to_logical_map_cpu: torch.Tensor):
        """
        Load weights for this layer and initialize the MoE module.

        Args:
            physical_to_logical_map_cpu: Mapping from physical to logical expert IDs
        """
        gate_ptr = 0
        up_ptr = 0
        down_ptr = 0

        gate_ptrs = []
        up_ptrs = []
        down_ptrs = []

        gate_scale_ptrs = []
        up_scale_ptrs = []
        down_scale_ptrs = []

        if self.load_merged_weight:
            base_key = f"blk.{self.layer_idx}"
            w = self.safetensor_loader.load_experts(base_key)

            self.gate_weights = w["gate"]
            self.up_weights = w["up"]
            self.down_weights = w["down"]
            self.gate_scales = w["gate_scale"]
            self.up_scales = w["up_scale"]
            self.down_scales = w["down_scale"]

            # Get pointers to weight arrays
            gate_ptrs = [
                [
                    ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents)
                    for et in numa_array
                ]
                for numa_array in self.gate_weights
            ]

            up_ptrs = [
                [
                    ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents)
                    for et in numa_array
                ]
                for numa_array in self.up_weights
            ]

            down_ptrs = [
                [
                    ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents)
                    for et in numa_array
                ]
                for numa_array in self.down_weights
            ]

            gate_scale_ptrs = [
                [
                    ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents)
                    for et in numa_array
                ]
                for numa_array in self.gate_scales
            ]

            up_scale_ptrs = [
                [
                    ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents)
                    for et in numa_array
                ]
                for numa_array in self.up_scales
            ]

            down_scale_ptrs = [
                [
                    ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents)
                    for et in numa_array
                ]
                for numa_array in self.down_scales
            ]

        # Configure MoE
        moe_config = MOEConfig(
            self.num_experts,
            self.num_experts_per_tok,
            self.hidden_size,
            self.moe_intermediate_size,
            self.gpu_experts_mask.data_ptr(),
        )
        moe_config.layer_idx = self.layer_idx
        moe_config.pool = self.cpu_infer.backend_
        moe_config.max_len = self.chunked_prefill_size

        moe_config.gate_proj = gate_ptr
        moe_config.up_proj = up_ptr
        moe_config.down_proj = down_ptr
        moe_config.gate_projs = gate_ptrs
        moe_config.up_projs = up_ptrs
        moe_config.down_projs = down_ptrs
        moe_config.gate_scales = gate_scale_ptrs
        moe_config.up_scales = up_scale_ptrs
        moe_config.down_scales = down_scale_ptrs

        if self.cpu_save:
            moe_config.save = True
            moe_config.load = False
            base_key = f"model.layers.{self.layer_idx}"
            w = self.safetensor_loader.load_experts(base_key)

            self.gate_proj = torch.cat(w["gate_weight"], dim=0).contiguous()
            self.up_proj = torch.cat(w["up_weight"], dim=0).contiguous()
            self.down_proj = torch.cat(w["down_weight"], dim=0).contiguous()

            moe_config.gate_proj = self.gate_proj.data_ptr()
            moe_config.up_proj = self.up_proj.data_ptr()
            moe_config.down_proj = self.down_proj.data_ptr()
        else:
            moe_config.load = True

        if not self.load_merged_weight:
            moe_config.path = self.weight_path

        # Create MoE module based on AMX method
        if self.method == "AMXINT4":
            self.moe = AMXInt4_MOE(moe_config)
        elif self.method == "AMXINT8":
            self.moe = AMXInt8_MOE(moe_config)
        else:
            raise NotImplementedError(f"Unsupported AMX method: {self.method}")

        # Load weights
        self.cpu_infer.submit(self.moe.load_weights_task(physical_to_logical_map_cpu.data_ptr()))
        self.cpu_infer.sync()

        # Clean up temporary weight storage if using merged weights
        if self.load_merged_weight:
            del self.gate_weights
            del self.up_weights
            del self.down_weights
            del self.gate_scales
            del self.up_scales
            del self.down_scales


class NativeMoEWrapper(BaseMoEWrapper):
    """Wrapper for RAWINT4/FP8/FP8_PERCHANNEL/BF16 experts stored in compressed SafeTensor format."""

    _native_loader_instance = None

    def __init__(
        self,
        layer_idx: int,
        num_experts: int,
        num_experts_per_tok: int,
        hidden_size: int,
        moe_intermediate_size: int,
        gpu_experts_mask: Optional[torch.Tensor],
        cpuinfer_threads: int,
        threadpool_count: int,
        weight_path: str,
        chunked_prefill_size: int,
        cpu_save: bool = False,
        max_deferred_experts_per_token: Optional[int] = None,
        method: str = "RAWINT4",
    ):
        if method == "RAWINT4" and not _HAS_RAWINT4_SUPPORT:
            raise RuntimeError(
                "RAWINT4 backend not available. Required ISA:\n"
                "  - AVX512F + AVX512BW (VNNI optional)\n"
                "Please recompile kt_kernel_ext with AVX512 enabled."
            )
        if method == "FP8" and not _HAS_FP8_SUPPORT:
            raise RuntimeError(
                "FP8 backend not available. Required ISA:\n"
                "  - AVX512F + AVX512BW + AVX512_BF16 + AVX512_VBMI\n"
                "Please recompile kt_kernel_ext with AVX512 + BF16 + VBMI enabled."
            )
        if method == "FP8_PERCHANNEL" and not _HAS_FP8_PERCHANNEL_SUPPORT:
            raise RuntimeError(
                "FP8_PERCHANNEL backend not available. Required ISA:\n"
                "  - AVX512F + AVX512BW + AVX512_BF16 + AVX512_VBMI\n"
                "Please recompile kt_kernel_ext with AVX512 + BF16 + VBMI enabled."
            )
        if method == "BF16" and not _HAS_BF16_SUPPORT:
            raise RuntimeError(
                "BF16 backend not available. Required ISA:\n"
                "  - AVX512F + AVX512BW + AVX512_BF16\n"
                "Please recompile kt_kernel_ext with AVX512 + BF16 enabled."
            )

        super().__init__(
            layer_idx=layer_idx,
            num_experts=num_experts,
            num_experts_per_tok=num_experts_per_tok,
            hidden_size=hidden_size,
            moe_intermediate_size=moe_intermediate_size,
            gpu_experts_mask=gpu_experts_mask,
            cpuinfer_threads=cpuinfer_threads,
            threadpool_count=threadpool_count,
            weight_path=weight_path,
            chunked_prefill_size=chunked_prefill_size,
            cpu_save=cpu_save,
            max_deferred_experts_per_token=max_deferred_experts_per_token,
            method=method,
        )

        if NativeMoEWrapper._native_loader_instance is None:
            if method == "RAWINT4":
                NativeMoEWrapper._native_loader_instance = CompressedSafeTensorLoader(weight_path)
            elif method == "FP8":
                NativeMoEWrapper._native_loader_instance = FP8SafeTensorLoader(weight_path)
            elif method == "FP8_PERCHANNEL":
                # Use FP8SafeTensorLoader with per-channel scale format
                NativeMoEWrapper._native_loader_instance = FP8SafeTensorLoader(weight_path, scale_suffix="weight_scale")
            elif method == "BF16":
                NativeMoEWrapper._native_loader_instance = BF16SafeTensorLoader(weight_path)
            else:
                raise NotImplementedError(f"Unsupported method for NativeMoEWrapper: {method}")
        self.loader = NativeMoEWrapper._native_loader_instance

        self.gate_weights = None
        self.up_weights = None
        self.down_weights = None
        self.gate_scales = None
        self.up_scales = None
        self.down_scales = None

    def load_weights_from_tensors(
        self,
        gate_proj: torch.Tensor,
        up_proj: torch.Tensor,
        down_proj: torch.Tensor,
        physical_to_logical_map_cpu: torch.Tensor,
    ):
        raise NotImplementedError("RAWINT4 wrapper expects pre-quantized safetensor weights.")

    def load_weights(self, physical_to_logical_map_cpu: torch.Tensor):
        import time

        t0 = time.time()
        base_key = f"model.layers.{self.layer_idx}"
        weights = self.loader.load_experts(base_key)
        t1 = time.time()

        # Keep individual tensors instead of stacking - avoid expensive memory copy
        # weights["gate"], weights["up"], weights["down"] are lists of tensors per expert
        self.gate_weights = weights["gate"]  # list of tensors
        self.up_weights = weights["up"]
        self.down_weights = weights["down"]

        # BF16 has no scales, others have scales
        if self.method == "BF16":
            # BF16 doesn't have scales
            self.gate_scales = None
            self.up_scales = None
            self.down_scales = None
        else:
            # Convert scales to bf16 individually
            # self.gate_scales = [t.to(torch.bfloat16).contiguous() for t in weights["gate_scale"]]
            # self.up_scales = [t.to(torch.bfloat16).contiguous() for t in weights["up_scale"]]
            # self.down_scales = [t.to(torch.bfloat16).contiguous() for t in weights["down_scale"]]
            self.gate_scales = weights["gate_scale"]
            self.up_scales = weights["up_scale"]
            self.down_scales = weights["down_scale"]
            if self.method == "RAWINT4":
                assert self.gate_scales[0].dtype == torch.bfloat16, "Expected bf16 scales for RAWINT4"
            elif self.method == "FP8":
                if self.gate_scales[0].dtype != torch.float32:
                    self.gate_scales = [t.to(torch.float32).contiguous() for t in weights["gate_scale"]]
                    self.up_scales = [t.to(torch.float32).contiguous() for t in weights["up_scale"]]
                    self.down_scales = [t.to(torch.float32).contiguous() for t in weights["down_scale"]]
                assert self.gate_scales[0].dtype == torch.float32, "Expected float32 scales for FP8"
            elif self.method == "FP8_PERCHANNEL":
                if self.gate_scales[0].dtype != torch.float32:
                    self.gate_scales = [t.to(torch.float32).contiguous() for t in weights["gate_scale"]]
                    self.up_scales = [t.to(torch.float32).contiguous() for t in weights["up_scale"]]
                    self.down_scales = [t.to(torch.float32).contiguous() for t in weights["down_scale"]]
                assert self.gate_scales[0].dtype == torch.float32, "Expected float32 scales for FP8_PERCHANNEL"

        t2 = time.time()

        # Build pointer lists: [numa_id][expert_id] -> pointer
        # Since RAWINT4/FP8/BF16 has no numa sharding, numa dimension is 1
        gate_ptrs = [[t.data_ptr() for t in self.gate_weights]]
        up_ptrs = [[t.data_ptr() for t in self.up_weights]]
        down_ptrs = [[t.data_ptr() for t in self.down_weights]]

        # BF16 has no scales, pass empty lists (will use 0/nullptr for consistency)
        if self.method == "BF16":
            gate_scale_ptrs = [[0 for _ in self.gate_weights]]
            up_scale_ptrs = [[0 for _ in self.up_weights]]
            down_scale_ptrs = [[0 for _ in self.down_weights]]
        else:
            gate_scale_ptrs = [[t.data_ptr() for t in self.gate_scales]]
            up_scale_ptrs = [[t.data_ptr() for t in self.up_scales]]
            down_scale_ptrs = [[t.data_ptr() for t in self.down_scales]]
        t3 = time.time()

        moe_config = MOEConfig(
            self.num_experts,
            self.num_experts_per_tok,
            self.hidden_size,
            self.moe_intermediate_size,
            self.gpu_experts_mask.data_ptr(),
        )
        moe_config.layer_idx = self.layer_idx
        moe_config.pool = self.cpu_infer.backend_
        moe_config.max_len = self.chunked_prefill_size

        # Use gate_projs instead of gate_proj for per-expert pointers
        moe_config.gate_projs = gate_ptrs
        moe_config.up_projs = up_ptrs
        moe_config.down_projs = down_ptrs
        moe_config.gate_scales = gate_scale_ptrs
        moe_config.up_scales = up_scale_ptrs
        moe_config.down_scales = down_scale_ptrs

        # Infer group_size from scale shape (column-major layout)
        # For gate/up projection: in_features = hidden_size
        # So: group_size = hidden_size / scale.shape[1]

        if self.method == "RAWINT4":
            group_size = self.hidden_size // self.gate_scales[0].shape[1]
            moe_config.quant_config.bits = 4
            moe_config.quant_config.group_size = group_size
            moe_config.quant_config.zero_point = False
            self.moe = AMXInt4_KGroup_MOE(moe_config)
        elif self.method == "FP8":
            moe_config.quant_config.bits = 8
            moe_config.quant_config.group_size = 128
            moe_config.quant_config.zero_point = False
            self.moe = AMXFP8_MOE(moe_config)
        elif self.method == "FP8_PERCHANNEL":
            moe_config.quant_config.bits = 8
            moe_config.quant_config.per_channel = True
            moe_config.quant_config.zero_point = False
            self.moe = AMXFP8PerChannel_MOE(moe_config)
        elif self.method == "BF16":
            # BF16 has no quantization config needed
            self.moe = AMXBF16_MOE(moe_config)
        t4 = time.time()

        self.cpu_infer.submit(self.moe.load_weights_task(physical_to_logical_map_cpu.data_ptr()))
        self.cpu_infer.sync()
        t5 = time.time()

        del self.gate_weights
        del self.up_weights
        del self.down_weights
        if self.gate_scales is not None:
            del self.gate_scales
            del self.up_scales
            del self.down_scales
        t6 = time.time()

        print(
            f"[NativeMoEWrapper Layer {self.layer_idx}] "
            f"load_experts: {(t1-t0)*1000:.1f}ms, "
            f"prepare_tensors: {(t2-t1)*1000:.1f}ms, "
            f"build_ptrs: {(t3-t2)*1000:.1f}ms, "
            f"create_moe: {(t4-t3)*1000:.1f}ms, "
            f"cpp_load_weights: {(t5-t4)*1000:.1f}ms, "
            f"cleanup: {(t6-t5)*1000:.1f}ms, "
            f"total: {(t6-t0)*1000:.1f}ms"
        )

    def submit_write_weight_scale_to_buffer(
        self,
        gpu_tp_count: int,
        expert_id: int,
        w13_weight_ptrs,
        w13_scale_ptrs,
        w2_weight_ptrs,
        w2_scale_ptrs,
    ):
        """
        Submit the write_weight_scale_to_buffer task for RAWINT4 KGroup AMX implementation.

        This method submits the C++-exposed task `write_weight_scale_to_buffer_task` to the
        shared CPUInfer queue. The pointer lists should be plain integer lists (e.g. from
        tensor.data_ptr()).
        """
        if self.moe is None:
            raise RuntimeError("MoE instance not initialized; cannot submit write_weight_scale_to_buffer task.")

        if not hasattr(self.moe, "write_weight_scale_to_buffer_task"):
            raise NotImplementedError(
                "write_weight_scale_to_buffer_task is not available for this backend implementation."
            )

        self.cpu_infer.submit(
            self.moe.write_weight_scale_to_buffer_task(
                gpu_tp_count,
                expert_id,
                w13_weight_ptrs,
                w13_scale_ptrs,
                w2_weight_ptrs,
                w2_scale_ptrs,
            )
        )

    def sync_write_weight_scale_to_buffer(self):
        """
        Block until previously submitted write_weight_scale_to_buffer tasks finish.
        """
        # The CPUInfer.sync() call blocks until pending tasks complete.
        self.cpu_infer.sync()


================================================
FILE: kt-kernel/python/utils/llamafile.py
================================================
import torch
from typing import Optional
import os

# Use relative imports for package structure
from ..experts_base import BaseMoEWrapper
from .loader import GGUFLoader
from kt_kernel_ext.moe import MOEConfig

try:
    from kt_kernel_ext.moe import MOE

    _HAS_LLAMAFILE_SUPPORT = True
except (ImportError, AttributeError):
    _HAS_LLAMAFILE_SUPPORT = False
    MOE = None

from kt_kernel_ext.kvcache import ggml_type


class LlamafileMoEWrapper(BaseMoEWrapper):
    """
    Llamafile-based MoE wrapper implementation.
    Supports GGUF quantized weights with llamafile backend.
    """

    _gguf_loader_instance = None  # Singleton GGUFLoader

    def __init__(
        self,
        layer_idx: int,
        num_experts: int,
        num_experts_per_tok: int,
        hidden_size: int,
        moe_intermediate_size: int,
        gpu_experts_mask: Optional[torch.Tensor],
        cpuinfer_threads: int,
        threadpool_count: int,
        weight_path: str,
        chunked_prefill_size: int,
        cpu_save: bool = False,
        max_deferred_experts_per_token: Optional[int] = None,
        method: str = "LLAMAFILE",
    ):
        """
        Initialize Llamafile MoE Wrapper.

        Args:
            layer_idx: Layer index
            num_experts: Total number of experts
            num_experts_per_tok: Number of experts per token (top-k)
            hidden_size: Hidden dimension size
            moe_intermediate_size: MoE intermediate size
            gpu_experts_mask: Boolean mask indicating which experts are on GPU.
                              Shape: [num_experts], dtype: torch.bool.
                              mask[i] = True means expert i is on GPU.
                              If None, all experts are on CPU.
            cpuinfer_threads: Number of CPU inference threads
            threadpool_count: Number of NUMA subpools (TP count)
            weight_path: Path to GGUF weights
            chunked_prefill_size: Maximum prefill chunk size
            cpu_save: Not supported for Llamafile backend
            max_deferred_experts_per_token: Number of experts per token to defer. Defaults to 0.
            method: Should be "LLAMAFILE"
        """
        if not _HAS_LLAMAFILE_SUPPORT:
            raise RuntimeError(
                "Llamafile backend not available. kt_kernel_ext was not compiled with Llamafile support.\n"
                "Please recompile with Llamafile enabled."
            )

        if not os.path.exists(weight_path):
            raise FileNotFoundError(f"GGUF weight path not found: {weight_path}")

        # Initialize GGUF loader (singleton)
        if LlamafileMoEWrapper._gguf_loader_instance is None:
            LlamafileMoEWrapper._gguf_loader_instance = GGUFLoader(weight_path)
        self.gguf_loader = LlamafileMoEWrapper._gguf_loader_instance

        # Validate TP configuration with QK_K alignment
        QK_K = 256

        # Check if intermediate_size is divisible by QK_K
        if moe_intermediate_size % QK_K != 0:
            raise ValueError(
                f"intermediate_size ({moe_intermediate_size}) must be divisible by QK_K ({QK_K}) "
                f"for Llamafile backend"
            )

        # Calculate TP splits with QK_K alignment
        num_blocks = moe_intermediate_size // QK_K
        base_blocks = num_blocks // threadpool_count
        extra_blocks = num_blocks % threadpool_count

        # Validate that we have enough blocks
        if base_blocks == 0:
            valid_tp_counts = list(range(1, num_blocks + 1))
            raise ValueError(
                f"intermediate_size ({moe_intermediate_size}) is too small for threadpool_count ({threadpool_count}).\n"
                f"Total blocks: {num_blocks} (intermediate_size / QK_K)\n"
                f"Cannot distribute to {threadpool_count} TPs (each TP needs at least 1 block).\n"
                f"Valid threadpool_count values: {valid_tp_counts}"
            )

        # Log TP split information
        print(f"[LlamafileMoEWrapper] Layer {layer_idx} TP configuration:")
        print(f"  intermediate_size: {moe_intermediate_size}")
        print(f"  threadpool_count: {threadpool_count}")
        print(f"  QK_K: {QK_K}")
        print(f"  Total blocks: {num_blocks}")
        print(f"  Base blocks per TP: {base_blocks}")
        print(f"  Extra blocks (distributed to first TPs): {extra_blocks}")

        current_offset = 0
        for tp_id in range(threadpool_count):
            tp_blocks = base_blocks + (1 if tp_id < extra_blocks else 0)
            tp_size = tp_blocks * QK_K
            print(f"  TP {tp_id}: size={tp_size}, offset={current_offset}, blocks={tp_blocks}")
            current_offset += tp_size

        # Initialize base class
        super().__init__(
            layer_idx=layer_idx,
            num_experts=num_experts,
            num_experts_per_tok=num_experts_per_tok,
            hidden_size=hidden_size,
            moe_intermediate_size=moe_intermediate_size,
            gpu_experts_mask=gpu_experts_mask,
            cpuinfer_threads=cpuinfer_threads,
            threadpool_count=threadpool_count,
            weight_path=weight_path,
            chunked_prefill_size=chunked_prefill_size,
            cpu_save=cpu_save,
            max_deferred_experts_per_token=max_deferred_experts_per_token,
            method=method,
        )

        self.weights_to_keep = None

    def load_weights_from_tensors(
        self,
        gate_proj: torch.Tensor,
        up_proj: torch.Tensor,
        down_proj: torch.Tensor,
        physical_to_logical_map_cpu: torch.Tensor,
    ):
        """
        Online quantization is not supported for Llamafile backend.
        Use pre-quantized GGUF weights instead.
        """
        raise NotImplementedError(
            "Llamafile backend does not support online quantization (load_weights_from_tensors).\n"
            "Please use pre-quantized GGUF weights and call load_weights() instead."
        )

    def load_weights(self, physical_to_logical_map_cpu: Optional[torch.Tensor] = None):
        """
        Load weights for this layer from GGUF files and initialize the MoE module.

        Args:
            physical_to_logical_map_cpu: Optional mapping from physical to logical expert IDs
                                         Shape: [num_experts], dtype: int32
                                         If None, uses identity mapping [0, 1, 2, ..., num_experts-1]
        """
        if not _HAS_LLAMAFILE_SUPPORT:
            raise RuntimeError(
                "Llamafile backend not available. kt_kernel_ext was not compiled with Llamafile support.\n"
                "Please recompile with Llamafile enabled."
            )

        if physical_to_logical_map_cpu is None:
            physical_to_logical_map_cpu = torch.arange(self.num_experts, dtype=torch.int32, device="cpu")
            print(f"  Using default identity mapping for {self.num_experts} experts")

        base_key = f"blk.{self.layer_idx}"

        # Load quantized tensors from GGUF
        gate_data, gate_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(f"{base_key}.ffn_gate_exps.weight")

        up_data, up_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(f"{base_key}.ffn_up_exps.weight")

        down_data, down_type = self.gguf_loader.get_undequanted_tensor_and_ggml_type(f"{base_key}.ffn_down_exps.weight")

        # Keep tensors alive
        self.weights_to_keep = (gate_data, up_data, down_data)

        hidden_type = ggml_type.BF16

        # Configure MoE
        moe_config = MOEConfig(
            self.num_experts,
            self.num_experts_per_tok,
            self.hidden_size,
            self.moe_intermediate_size,
            self.gpu_experts_mask.data_ptr(),
        )
        moe_config.layer_idx = self.layer_idx
        moe_config.pool = self.cpu_infer.backend_

        # Llamafile-specific configuration
        moe_config.m_block = 32  # Parallel block size
        moe_config.group_min_len = 10  # Use forward_one when qlen < 10
        moe_config.max_len = self.chunked_prefill_size
        moe_config.group_max_len = max(1, int(self.chunked_prefill_size))

        # Set weight pointers
        moe_config.gate_proj = gate_data.data_ptr()
        moe_config.up_proj = up_data.data_ptr()
        moe_config.down_proj = down_data.data_ptr()

        # Set quantization types
        moe_config.gate_type = gate_type
        moe_config.up_type = up_type
        moe_config.down_type = down_type
        moe_config.hidden_type = hidden_type

        # Create MoE module
        self.moe = MOE(moe_config)

        # Load weights
        self.cpu_infer.submit(self.moe.load_weights_task(physical_to_logical_map_cpu.data_ptr()))
        self.cpu_infer.sync()

        # Drop original weights after loading
        self.weights_to_keep = None


================================================
FILE: kt-kernel/python/utils/loader.py
================================================
"""
Weight loaders for different formats.

This module provides loaders for:
- SafeTensor format (for AMX quantized weights)
- GGUF format (for Llamafile quantized weights)
"""

from __future__ import annotations

import os
import numpy as np
import torch
from enum import IntEnum
from safetensors import safe_open
from gguf.gguf_reader import GGUFReader


class GGMLQuantizationType(IntEnum):
    """GGML quantization type enumeration"""

    F32 = 0
    F16 = 1
    Q4_0 = 2
    Q4_1 = 3
    Q5_0 = 6
    Q5_1 = 7
    Q8_0 = 8
    Q8_1 = 9
    Q2_K = 10
    Q3_K = 11
    Q4_K = 12
    Q5_K = 13
    Q6_K = 14
    Q8_K = 15
    IQ2_XXS = 16
    IQ2_XS = 17
    IQ3_XXS = 18
    IQ1_S = 19
    IQ4_NL = 20
    IQ3_S = 21
    IQ2_S = 22
    IQ4_XS = 23
    I8 = 24
    I16 = 25
    I32 = 26
    I64 = 27
    F64 = 28
    IQ1_M = 29
    BF16 = 30


def translate_name_to_gguf(name):
    """
    Translate PyTorch tensor name to GGUF format
    """
    name = name.replace("lm_head.", "output.")
    name = name.replace("model.embed_tokens.", "token_embd.")
    name = name.replace("model.norm.", "output_norm.")
    name = name.replace("model.layers.", "blk.")
    name = name.replace(".input_layernorm", ".attn_norm")
    name = name.replace(".mlp.down_proj", ".ffn_down")
    name = name.replace(".mlp.gate_proj", ".ffn_gate")
    name = name.replace(".mlp.up_proj", ".ffn_up")
    name = name.replace(".post_attention_layernorm", ".ffn_norm")
    name = name.replace(".self_attn.q_proj", ".attn_q")
    name = name.replace(".self_attn.k_proj", ".attn_k")
    name = name.replace(".self_attn.v_proj", ".attn_v")
    name = name.replace(".self_attn.o_proj", ".attn_output")
    name = name.replace(".self_attn.qkv_proj", ".attn_qkv")
    name = name.replace(".self_attn.kv_a_proj_with_mqa", ".attn_kv_a_mqa")
    name = name.replace(".self_attn.kv_a_layernorm", ".attn_kv_a_norm")
    name = name.replace(".self_attn.kv_b_proj", ".attn_kv_b")
    name = name.replace(".self_attn.q_a_proj", ".attn_q_a")
    name = name.replace(".self_attn.q_a_layernorm", ".attn_q_a_norm")
    name = name.replace(".self_attn.q_b_proj", ".attn_q_b")
    name = name.replace(".self_attn.q_norm", ".attn_q_norm")
    name = name.replace(".self_attn.k_norm", ".attn_k_norm")
    name = name.replace(".shared_expert.", ".shared_experts.")
    name = name.replace(".shared_expert_", ".shared_experts_")
    name = name.replace(".gate_up_proj.", ".up_proj")
    name = name.replace(".mlp.shared_experts.down_proj", ".ffn_down_shexp")
    name = name.replace(".mlp.gate.e_score_correction_bias", ".exp_probs_b.bias")
    name = name.replace(".mlp.gate", ".ffn_gate_inp")
    name = name.replace(".mlp.shared_experts.gate_proj", ".ffn_gate_shexp")
    name = name.replace(".mlp.shared_experts.up_proj", ".ffn_up_shexp")
    name = name.replace(".mlp.shared_experts_gate", ".ffn_gate_inp_shexp")
    name = name.replace(".mlp.experts", "")
    name = name.replace(".mlp.experts.ffn_down_exps", ".ffn_down_exps")
    name = name.replace(".mlp.experts.ffn_gate_exps", ".ffn_gate_exps")
    name = name.replace(".mlp.experts.ffn_up_exps", ".ffn_up_exps")
    name = name.replace(".block_sparse_moe.gate.", ".ffn_gate_inp.")
    name = name.replace(".block_sparse_moe.experts", "")
    name = name.replace(".feed_forward.experts", "")
    name = name.replace(".feed_forward.router", ".ffn_gate_inp")
    name = name.replace(".feed_forward.shared_experts.down_proj", ".ffn_down_shexp")
    name = name.replace(".feed_forward.shared_experts.gate_proj", ".ffn_gate_shexp")
    name = name.replace(".feed_forward.shared_experts.up_proj", ".ffn_up_shexp")
    return name


class SafeTensorLoader:
    """
    SafeTensor format loader for AMX quantized weights.

    Supports loading tensors from .safetensors files with NUMA-sharded expert weights.
    """

    tensor_file_map: dict
    tensor_type_map: dict
    file_handle_map: dict
    tensor_device_map: dict

    def __init__(self, file_path: str):
        self.__load_tensor_file_map(file_path)

    def __load_tensor_file_map(self, file_path: str):
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Path not found: {file_path}")
        if os.path.isfile(file_path):
            folder_path = os.path.dirname(file_path)
        else:
            folder_path = file_path
        self.file_handle_map = {}
        self.tensor_file_map = {}
        self.tensor_type_map = {}
        self.tensor_device_map = {}

        found_safetensor = False
        for root, _, files in os.walk(folder_path):
            files = sorted(files)
            for file in files:
                if file.endswith(".safetensors"):
                    found_safetensor = True
                    file_path = os.path.join(root, file)
                    if file not in self.file_handle_map:
                        try:
                            handle = safe_open(file_path, framework="pt")
                            self.file_handle_map[file] = handle
                        except Exception as e:
                            print(f"Error opening Safetensor file {file_path}: {e}")
                            continue

                    f = self.file_handle_map.get(file)
                    if f is None:
                        continue
                    try:
                        for key in f.keys():
                            self.tensor_file_map[key] = file
                    except Exception as e:
                        print(f"Error reading Safetensor file {file_path}: {e}")

        if not found_safetensor:
            raise FileNotFoundError(f"No Safetensor files found in {folder_path}")

    def load_tensor(self, key: str, device: str = "cpu"):
        if key not in self.tensor_file_map:
            raise KeyError(f"Key {key} not found in Safetensor files")
        file = self.tensor_file_map[key]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(key)
        return tensor.to(device)

    def close_all_handles(self):
        for handle in self.file_handle_map.values():
            handle.close()
        self.file_handle_map.clear()

    def load_experts(self, base_key: str, device: str = "cpu"):
        """
        Load expert weights from SafeTensor files.

        Expected format:
        - blk.{layer_index}.ffn_[up, down, gate]_exps.{expert_id}.numa.{numa_id}.weight
        - blk.{layer_index}.ffn_[up, down, gate]_exps.{expert_id}.numa.{numa_id}.scale

        Args:
            base_key: Base key like "blk.{layer_index}"
            device: Target device for tensors

        Returns:
            Dictionary with keys: up, gate, down, up_scale, gate_scale, down_scale
            Each value is a list of lists: [numa_id][expert_id] -> numpy array
        """
        up_base_key = f"{base_key}.ffn_up_exps"
        gate_base_key = f"{base_key}.ffn_gate_exps"
        down_base_key = f"{base_key}.ffn_down_exps"
        max_numa_id = -1
        max_experts_count = -1
        while self.has_tensor(f"{up_base_key}.{max_experts_count+1}.numa.{0}.weight"):
            max_experts_count += 1
        if max_experts_count == 0:
            raise ValueError(f"No experts found for key {base_key}")
        while self.has_tensor(f"{up_base_key}.{0}.numa.{max_numa_id+1}.weight"):
            max_numa_id += 1
        # Initialize empty lists to store tensors for each projection type
        up_weights = [[] for _ in range(max_numa_id + 1)]
        gate_weights = [[] for _ in range(max_numa_id + 1)]
        down_weights = [[] for _ in range(max_numa_id + 1)]
        up_scales = [[] for _ in range(max_numa_id + 1)]
        gate_scales = [[] for _ in range(max_numa_id + 1)]
        down_scales = [[] for _ in range(max_numa_id + 1)]
        for numa_id in range(max_numa_id + 1):
            for expert_id in range(max_experts_count + 1):
                up_key = f"{up_base_key}.{expert_id}.numa.{numa_id}.weight"
                gate_key = f"{gate_base_key}.{expert_id}.numa.{numa_id}.weight"
                down_key = f"{down_base_key}.{expert_id}.numa.{numa_id}.weight"
                up_scale_key = f"{up_base_key}.{expert_id}.numa.{numa_id}.scale"
                gate_scale_key = f"{gate_base_key}.{expert_id}.numa.{numa_id}.scale"
                down_scale_key = f"{down_base_key}.{expert_id}.numa.{numa_id}.scale"
                # make sure contiguous
                up_tensor = self.load_tensor(up_key, device).numpy()
                gate_tensor = self.load_tensor(gate_key, device).numpy()
                down_tensor = self.load_tensor(down_key, device).numpy()
                up_scale_tensor = self.load_tensor(up_scale_key, device).numpy()
                gate_scale_tensor = self.load_tensor(gate_scale_key, device).numpy()
                down_scale_tensor = self.load_tensor(down_scale_key, device).numpy()

                up_weights[numa_id].append(up_tensor)
                gate_weights[numa_id].append(gate_tensor)
                down_weights[numa_id].append(down_tensor)
                up_scales[numa_id].append(up_scale_tensor)
                gate_scales[numa_id].append(gate_scale_tensor)
                down_scales[numa_id].append(down_scale_tensor)
        return {
            "up": up_weights,
            "gate": gate_weights,
            "down": down_weights,
            "up_scale": up_scales,
            "gate_scale": gate_scales,
            "down_scale": down_scales,
        }

    def has_tensor(self, name: str):
        return name in self.tensor_file_map


class FP8SafeTensorLoader(SafeTensorLoader):
    """Loader for FP8 expert weights with auto-detection of naming formats.

    Supported formats:
    - DeepSeek style: {base}.mlp.experts.{id}.{gate,up,down}_proj.weight
    - Mixtral/MiniMax style: {base}.block_sparse_moe.experts.{id}.{w1,w3,w2}.weight
    - Mistral style: {base}.experts.{id}.{w1,w3,w2}.weight

    Supported scale formats (auto-detected):
    - Block-wise: weight_scale_inv (DeepSeek FP8)
    - Per-channel: weight_scale (GLM-4.7-FP8)

    The format is auto-detected during initialization.
    """

    # Known MoE naming formats: (experts_path_template, gate_name, up_name, down_name)
    MOE_FORMATS = {
        "deepseek": ("{base}.mlp.experts", "gate_proj", "up_proj", "down_proj"),
        "mixtral": ("{base}.block_sparse_moe.experts", "w1", "w3", "w2"),
        "mistral": ("{base}.experts", "w1", "w3", "w2"),
    }

    def __init__(self, file_path: str, scale_suffix: str = None):
        """Initialize FP8 loader with optional scale suffix override.

        Args:
            file_path: Path to safetensor files
            scale_suffix: Optional scale key suffix. If None, auto-detect between
                         'weight_scale_inv' (block-wise) and 'weight_scale' (per-channel).
        """
        super().__init__(file_path)
        self._detected_format = None
        self._scale_suffix = scale_suffix  # None means auto-detect
        # Set per_channel based on explicit scale_suffix if provided
        if scale_suffix == "weight_scale":
            self._is_per_channel = True
        elif scale_suffix == "weight_scale_inv":
            self._is_per_channel = False
        else:
            self._is_per_channel = False  # Will be updated in _detect_format if auto-detect
        self._is_vl_model = False
        self._detect_format()

    def _detect_format(self):
        """Auto-detect the MoE naming format and scale format by checking tensor keys."""
        # Sample some tensor names to detect format
        sample_keys = list(self.tensor_file_map.keys())[:1000]

        for fmt_name, (path_tpl, gate, up, down) in self.MOE_FORMATS.items():
            # Check if any key matches this format pattern
            # Look for pattern like: model.layers.0.{experts_path}.0.{gate_name}.weight
            for key in sample_keys:
                if ".experts." in key and f".{gate}.weight" in key:
                    # Verify the path template matches
                    if "block_sparse_moe.experts" in key and fmt_name == "mixtral":
                        self._detected_format = fmt_name
                        print(f"[FP8SafeTensorLoader] Detected format: {fmt_name}")
                        break
                    elif "mlp.experts" in key and "block_sparse_moe" not in key and fmt_name == "deepseek":
                        self._detected_format = fmt_name
                        print(f"[FP8SafeTensorLoader] Detected format: {fmt_name}")
                        break
                    elif fmt_name == "mistral" and ".mlp.experts" not in key and ".block_sparse_moe.experts" not in key:
                        self._detected_format = fmt_name
                        print(f"[FP8SafeTensorLoader] Detected format: {fmt_name}")
                        break
            if self._detected_format:
                break

        # Default to deepseek if no format detected
        if not self._detected_format:
            self._detected_format = "deepseek"
            print("[FP8SafeTensorLoader] No MoE format detected, defaulting to: deepseek")

        # Auto-detect scale suffix if not specified
        if self._scale_suffix is None:
            _, gate, _, _ = self.MOE_FORMATS[self._detected_format]
            # Check for per-channel scale (weight_scale) vs block-wise (weight_scale_inv)
            for key in sample_keys:
                if f".{gate}.weight_scale_inv" in key:
                    self._scale_suffix = "weight_scale_inv"
                    self._is_per_channel = False
                    print("[FP8SafeTensorLoader] Detected scale format: block-wise (weight_scale_inv)")
                    if key.startswith("model.language_model.") and self._detected_format == "deepseek":
                        # VL models(Qwen3.5): model.layers.{N} -> model.language_model.layers.{N}
                        self._is_vl_model = True
                        print("[FP8SafeTensorLoader] Detected VL model")
                    return
                elif f".{gate}.weight_scale" in key and "weight_scale_inv" not in key:
                    self._scale_suffix = "weight_scale"
                    # Some models (e.g., Mistral) use block-wise FP8 scales but keep
                    # the key suffix as `weight_scale` (without `_inv`). Infer format
                    # from scale tensor shape instead of suffix alone:
                    # - per-channel: [N] or [N, 1]
                    # - block-wise: [N_block, K_block] (both dims > 1)
                    scale_tensor = self.load_tensor(key, device="cpu")
                    if scale_tensor.dim() == 1:
                        self._is_per_channel = True
                    elif scale_tensor.dim() == 2 and scale_tensor.shape[1] == 1:
                        self._is_per_channel = True
                    else:
                        self._is_per_channel = False

                    scale_kind = "per-channel" if self._is_per_channel else "block-wise"
                    print(f"[FP8SafeTensorLoader] Detected scale format: {scale_kind} (weight_scale)")
                    return
            # Default to weight_scale_inv
            self._scale_suffix = "weight_scale_inv"
            self._is_per_channel = False
            print("[FP8SafeTensorLoader] No scale format detected, defaulting to: weight_scale_inv")
        else:
            # Scale suffix was explicitly provided
            scale_type = "per-channel" if self._is_per_channel else "block-wise"
            print(f"[FP8SafeTensorLoader] Using explicit scale format: {scale_type} ({self._scale_suffix})")

    def _get_experts_prefix_candidates(self, base_key: str) -> list[str]:
        """Get candidate experts prefixes based on detected format and base key variants."""
        path_tpl, _, _, _ = self.MOE_FORMATS[self._detected_format]
        candidates = []
        if self._is_vl_model:
            base_key = base_key.replace("model.layers", "model.language_model.layers")
        candidates.append(path_tpl.format(base=base_key))

        # Some model weights (e.g., Mistral native format) do not have "model." prefix.
        if base_key.startswith("model."):
            candidates.append(path_tpl.format(base=base_key[len("model.") :]))

        # Deduplicate while preserving order.
        return list(dict.fromkeys(candidates))

    def _get_proj_names(self):
        """Get projection names (gate, up, down) based on detected format."""
        _, gate, up, down = self.MOE_FORMATS[self._detected_format]
        return gate, up, down

    def load_tensor(self, key: str, device: str = "cpu"):
        if key not in self.tensor_file_map:
            raise KeyError(f"Key {key} not found in Safetensor files")
        file = self.tensor_file_map[key]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(key)
        if device == "cpu":
            return tensor
        return tensor.to(device)

    def load_experts(self, base_key: str, device: str = "cpu"):
        """Load FP8 expert weights and their scale tensors.

        Supports both block-wise (weight_scale_inv) and per-channel (weight_scale) formats.
        Per-channel scales are squeezed from [N, 1] to [N] if needed.
        """
        experts_prefix_candidates = self._get_experts_prefix_candidates(base_key)
        gate_name, up_name, down_name = self._get_proj_names()

        expert_count = 0
        experts_prefix = None
        for prefix in experts_prefix_candidates:
            expert_count = 0
            while self.has_tensor(f"{prefix}.{expert_count}.{gate_name}.weight"):
                expert_count += 1
            if expert_count > 0:
                experts_prefix = prefix
                break

        if expert_count == 0 or experts_prefix is None:
            raise ValueError(f"No experts found for keys: {experts_prefix_candidates}")

        gate_weights = [None] * expert_count
        up_weights = [None] * expert_count
        down_weights = [None] * expert_count
        gate_scales = [None] * expert_count
        up_scales = [None] * expert_count
        down_scales = [None] * expert_count

        for exp_id in range(expert_count):
            gate_w_key = f"{experts_prefix}.{exp_id}.{gate_name}.weight"
            up_w_key = f"{experts_prefix}.{exp_id}.{up_name}.weight"
            down_w_key = f"{experts_prefix}.{exp_id}.{down_name}.weight"
            gate_s_key = f"{experts_prefix}.{exp_id}.{gate_name}.{self._scale_suffix}"
            up_s_key = f"{experts_prefix}.{exp_id}.{up_name}.{self._scale_suffix}"
            down_s_key = f"{experts_prefix}.{exp_id}.{down_name}.{self._scale_suffix}"

            gate_weights[exp_id] = self.load_tensor(gate_w_key, device).contiguous()
            up_weights[exp_id] = self.load_tensor(up_w_key, device).contiguous()
            down_weights[exp_id] = self.load_tensor(down_w_key, device).contiguous()

            gate_scale = self.load_tensor(gate_s_key, device)
            up_scale = self.load_tensor(up_s_key, device)
            down_scale = self.load_tensor(down_s_key, device)

            # For per-channel scales, squeeze [N, 1] -> [N] if needed
            if self._is_per_channel:
                if gate_scale.dim() == 2 and gate_scale.shape[1] == 1:
                    gate_scale = gate_scale.squeeze(1)
                if up_scale.dim() == 2 and up_scale.shape[1] == 1:
                    up_scale = up_scale.squeeze(1)
                if down_scale.dim() == 2 and down_scale.shape[1] == 1:
                    down_scale = down_scale.squeeze(1)

            gate_scales[exp_id] = gate_scale.contiguous()
            up_scales[exp_id] = up_scale.contiguous()
            down_scales[exp_id] = down_scale.contiguous()

        return {
            "gate": gate_weights,
            "up": up_weights,
            "down": down_weights,
            "gate_scale": gate_scales,
            "up_scale": up_scales,
            "down_scale": down_scales,
        }

    def is_per_channel(self) -> bool:
        """Return True if using per-channel quantization, False for block-wise."""
        return self._is_per_channel


class BF16SafeTensorLoader(SafeTensorLoader):
    """Loader for native BF16 expert weights (no quantization, no scales).

    Supported formats:
    - DeepSeek style: {base}.mlp.experts.{id}.{gate,up,down}_proj.weight
    - Mixtral/MiniMax style: {base}.block_sparse_moe.experts.{id}.{w1,w3,w2}.weight
    - Mistral style: {base}.experts.{id}.{w1,w3,w2}.weight

    The format is auto-detected during initialization.
    """

    MOE_FORMATS = {
        "deepseek": ("{base}.mlp.experts", "gate_proj", "up_proj", "down_proj"),
        "mixtral": ("{base}.block_sparse_moe.experts", "w1", "w3", "w2"),
        "mistral": ("{base}.experts", "w1", "w3", "w2"),
    }

    def __init__(self, file_path: str):
        super().__init__(file_path)
        self._detected_format = None
        self._detect_format()

    def _detect_format(self):
        """Auto-detect the MoE naming format by checking tensor keys."""
        sample_keys = list(self.tensor_file_map.keys())[:1000]

        # Check for packed format first (Qwen3.5 MoE style: all experts in one 3D tensor)
        for key in sample_keys:
            if key.endswith(".mlp.experts.gate_up_proj"):
                self._detected_format = "packed"
                print("[BF16SafeTensorLoader] Detected format: packed (Qwen3.5 MoE style)")
                return

        for fmt_name, (path_tpl, gate, up, down) in self.MOE_FORMATS.items():
            for key in sample_keys:
                if ".experts." in key and f".{gate}.weight" in key:
                    if "block_sparse_moe.experts" in key and fmt_name == "mixtral":
                        self._detected_format = fmt_name
                        print(f"[BF16SafeTensorLoader] Detected format: {fmt_name}")
                        return
                    elif "mlp.experts" in key and "block_sparse_moe" not in key and fmt_name == "deepseek":
                        self._detected_format = fmt_name
                        print(f"[BF16SafeTensorLoader] Detected format: {fmt_name}")
                        return
                    elif fmt_name == "mistral" and ".mlp.experts" not in key and ".block_sparse_moe.experts" not in key:
                        self._detected_format = fmt_name
                        print(f"[BF16SafeTensorLoader] Detected format: {fmt_name}")
                        return

        self._detected_format = "deepseek"
        print("[BF16SafeTensorLoader] No MoE format detected, defaulting to: deepseek")

    def _get_experts_prefix_candidates(self, base_key: str) -> list[str]:
        """Get candidate experts prefixes based on detected format and base key variants."""
        path_tpl, _, _, _ = self.MOE_FORMATS[self._detected_format]
        candidates = [path_tpl.format(base=base_key)]

        # Some model weights (e.g., Mistral native format) do not have "model." prefix.
        if base_key.startswith("model."):
            candidates.append(path_tpl.format(base=base_key[len("model.") :]))

        return list(dict.fromkeys(candidates))

    def _get_proj_names(self):
        """Get projection names (gate, up, down) based on detected format."""
        _, gate, up, down = self.MOE_FORMATS[self._detected_format]
        return gate, up, down

    def load_tensor(self, key: str, device: str = "cpu"):
        if key not in self.tensor_file_map:
            raise KeyError(f"Key {key} not found in Safetensor files")
        file = self.tensor_file_map[key]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(key)
        if device == "cpu":
            return tensor
        return tensor.to(device)

    def load_experts(self, base_key: str, device: str = "cpu"):
        """Load BF16 expert weights (no scales needed)."""
        if self._detected_format == "packed":
            return self._load_experts_packed(base_key, device)

        experts_prefix_candidates = self._get_experts_prefix_candidates(base_key)
        gate_name, up_name, down_name = self._get_proj_names()

        expert_count = 0
        experts_prefix = None
        for prefix in experts_prefix_candidates:
            expert_count = 0
            while self.has_tensor(f"{prefix}.{expert_count}.{gate_name}.weight"):
                expert_count += 1
            if expert_count > 0:
                experts_prefix = prefix
                break

        if expert_count == 0 or experts_prefix is None:
            raise ValueError(f"No experts found for keys: {experts_prefix_candidates}")

        gate_weights = [None] * expert_count
        up_weights = [None] * expert_count
        down_weights = [None] * expert_count

        for exp_id in range(expert_count):
            gate_w_key = f"{experts_prefix}.{exp_id}.{gate_name}.weight"
            up_w_key = f"{experts_prefix}.{exp_id}.{up_name}.weight"
            down_w_key = f"{experts_prefix}.{exp_id}.{down_name}.weight"

            gate_weights[exp_id] = self.load_tensor(gate_w_key, device).contiguous()
            up_weights[exp_id] = self.load_tensor(up_w_key, device).contiguous()
            down_weights[exp_id] = self.load_tensor(down_w_key, device).contiguous()

        return {
            "gate": gate_weights,
            "up": up_weights,
            "down": down_weights,
        }

    def _resolve_packed_experts_prefix(self, base_key: str) -> str:
        """Resolve the experts prefix for packed format, trying fallbacks."""
        # Direct: model.layers.{N}.mlp.experts
        experts_prefix = f"{base_key}.mlp.experts"
        if self.has_tensor(f"{experts_prefix}.gate_up_proj"):
            return experts_prefix

        # VL models: model.layers.{N} -> model.language_model.layers.{N}
        parts = base_key.split(".", 1)
        if len(parts) == 2:
            alt_base = f"{parts[0]}.language_model.{parts[1]}"
            experts_prefix = f"{alt_base}.mlp.experts"
            if self.has_tensor(f"{experts_prefix}.gate_up_proj"):
                return experts_prefix

        raise ValueError(f"No packed experts found for base_key '{base_key}'.")

    def _load_experts_packed(self, base_key: str, device: str = "cpu"):
        """Load packed expert weights (Qwen3.5 MoE style).

        Packed format stores all experts in stacked 3D tensors:
        - gate_up_proj: [num_experts, 2 * intermediate_size, hidden_size]
        - down_proj:    [num_experts, hidden_size, intermediate_size]
        """
        experts_prefix = self._resolve_packed_experts_prefix(base_key)

        gate_up_key = f"{experts_prefix}.gate_up_proj"
        down_key = f"{experts_prefix}.down_proj"

        gate_up = self.load_tensor(gate_up_key, device)  # [E, 2*I, H]
        down = self.load_tensor(down_key, device)  # [E, H, I]

        mid = gate_up.shape[1] // 2
        gate_list = [gate_up[i, :mid, :].contiguous() for i in range(gate_up.shape[0])]
        up_list = [gate_up[i, mid:, :].contiguous() for i in range(gate_up.shape[0])]
        down_list = [down[i].contiguous() for i in range(down.shape[0])]

        return {
            "gate": gate_list,
            "up": up_list,
            "down": down_list,
        }


class CompressedSafeTensorLoader(SafeTensorLoader):
    """Loader for compressed SafeTensor layouts (RAWINT4 weights)."""

    def load_experts(self, base_key: str, device: str = "cpu"):
        """Load raw expert weights stored in compressed safetensor format."""

        experts_prefix = f"{base_key}.mlp.experts"

        expert_idx = 0
        while self.has_tensor(f"{experts_prefix}.{expert_idx}.up_proj.weight_packed"):
            expert_idx += 1

        if expert_idx == 0:
            experts_prefix = f"language_model.{base_key}.mlp.experts"
            expert_idx = 0
            while self.has_tensor(f"{experts_prefix}.{expert_idx}.up_proj.weight_packed"):
                expert_idx += 1
            if expert_idx == 0:
                raise ValueError(f"No experts found for key {experts_prefix}")

        def load_projection(proj_name: str):
            weight_entries = []
            scale_entries = []

            for exp_id in range(expert_idx):
                weight_key = f"{experts_prefix}.{exp_id}.{proj_name}_proj.weight_packed"
                scale_key = f"{experts_prefix}.{exp_id}.{proj_name}_proj.weight_scale"

                if not self.has_tensor(weight_key):
                    raise KeyError(f"Missing tensor: {weight_key}")
                if not self.has_tensor(scale_key):
                    raise KeyError(f"Missing tensor: {scale_key}")

                weight_tensor = self.load_tensor(weight_key, device).contiguous()
                scale_tensor = self.load_tensor(scale_key, device).contiguous()

                weight_entries.append(weight_tensor)
                scale_entries.append(scale_tensor)

            return weight_entries, scale_entries

        gate_weights, gate_scales = load_projection("gate")
        up_weights, up_scales = load_projection("up")
        down_weights, down_scales = load_projection("down")

        return {
            "gate": gate_weights,
            "up": up_weights,
            "down": down_weights,
            "gate_scale": gate_scales,
            "up_scale": up_scales,
            "down_scale": down_scales,
        }


class GGUFLoader:
    """
    GGUF format loader using the official gguf library (gguf.gguf_reader.GGUFReader)

    This is a cleaner implementation compared to manual binary parsing.
    """

    def __init__(self, gguf_path: str):
        """
        Initialize GGUF loader from a file or directory

        Args:
            gguf_path: Path to a single GGUF file or a directory containing GGUF files
        """
        if not os.path.exists(gguf_path):
            raise FileNotFoundError(f"GGUF path not found: {gguf_path}")

        self.tensor_info = {}
        self.metadata = {}
        self.tensor_file_map = {}
        self.file_data_map = {}

        if os.path.isfile(gguf_path) and gguf_path.endswith(".gguf"):
            print(f"\n[GGUFLoader] Loading single GGUF file : {os.path.basename(gguf_path)}")
            self._load_single_file(gguf_path)
        elif os.path.isdir(gguf_path):
            print(f"\n[GGUFLoader] Loading GGUF files from directory: {gguf_path}")
            self._load_directory(gguf_path)
        else:
            raise ValueError(f"Path must be a .gguf file or a directory: {gguf_path}")

        print(f"[GGUFLoader] Summary:")
        print(f"  Files loaded: {len(self.file_data_map)}")
        print(f"  Total tensors: {len(self.tensor_info)}")
        print(f"  Metadata keys: {len(self.metadata)}")
        tensors = ["blk.0.ffn_up_exps.weight", "blk.0.ffn_gate_exps.weight", "blk.0.ffn_down_exps.weight"]
        for key in tensors:
            if key in self.tensor_info:
                info = self.tensor_info[key]
                print(f" {'.'.join(key.split('.')[2:-1])}, Dtype: {info['dtype'].name}")

    def _load_single_file(self, file_path: str):
        """Load a single GGUF file"""
        reader = GGUFReader(file_path)

        for key, field in reader.fields.items():
            value = field.parts[field.data[0]]
            if isinstance(value, bytes):
                value = value.decode("utf-8")
            elif isinstance(value, np.ndarray) and value.dtype == np.uint8:
                try:
                    value = bytes(value).decode("utf-8")
                except:
                    pass
            self.metadata[key] = value

        for tensor in reader.tensors:
            self.tensor_info[tensor.name] = {
                "shape": list(reversed(tensor.shape)),  # Reverse to match PyTorch order
                "dtype": tensor.tensor_type,
                "offset": tensor.data_offset,
                "n_elements": tensor.n_elements,
            }
            self.tensor_file_map[tensor.name] = file_path

        self.file_data_map[file_path] = np.memmap(file_path, mode="r")

    def _load_directory(self, dir_path: str):
        """Load all GGUF files from a directory (non-recursive)"""
        found_gguf = False

        for file in sorted(os.listdir(dir_path)):
            if file.endswith(".gguf"):
                found_gguf = True
                file_path = os.path.join(dir_path, file)
                print(f"  Loading: {file}")

                reader = GGUFReader(file_path)

                for key, field in reader.fields.items():
                    value = field.parts[field.data[0]]
                    if isinstance(value, bytes):
                        value = value.decode("utf-8")
                    elif isinstance(value, np.ndarray) and value.dtype == np.uint8:
                        try:
                            value = bytes(value).decode("utf-8")
                        except:
                            pass
                    self.metadata[key] = value

                for tensor in reader.tensors:
                    self.tensor_info[tensor.name] = {
                        "shape": list(reversed(tensor.shape)),
                        "dtype": tensor.tensor_type,
                        "offset": tensor.data_offset,
                        "n_elements": tensor.n_elements,
                    }
                    self.tensor_file_map[tensor.name] = file_path

                self.file_data_map[file_path] = np.memmap(file_path, mode="r")

        if not found_gguf:
            raise FileNotFoundError(f"No .gguf files found in directory: {dir_path}")

    def get_model_config(self, layer_idx: int = 0):
        """
        Extract model configuration from GGUF metadata and tensor shapes.

        Args:
            layer_idx: Layer index to inspect (default: 0)

        Returns:
            dict with keys: num_experts, num_experts_per_tok, hidden_size, moe_intermediate_size
        """
        config = {}

        arch = self.metadata.get("general.architecture", "unknown")

        num_experts = None
        for key_suffix in [
            "expert_count",
            "expert.count",
            "moe.expert_count",
            "expert_feed_forward_length",
        ]:
            key = f"{arch}.{key_suffix}"
            if key in self.metadata:
                val = self.metadata[key]
                num_experts = int(val[0]) if isinstance(val, (list, np.ndarray)) else int(val)
                break

        num_experts_per_tok = None
        for key_suffix in [
            "expert_used_count",
            "expert.used_count",
            "moe.num_experts_per_tok",
        ]:
            key = f"{arch}.{key_suffix}"
            if key in self.metadata:
                val = self.metadata[key]
                num_experts_per_tok = int(val[0]) if isinstance(val, (list, np.ndarray)) else int(val)
                break

        hidden_size = None
        for key_suffix in [
            "embedding_length",
            "embed_length",
            "hidden_size",
        ]:
            key = f"{arch}.{key_suffix}"
            if key in self.metadata:
                val = self.metadata[key]
                hidden_size = int(val[0]) if isinstance(val, (list, np.ndarray)) else int(val)
                break

        moe_intermediate_size = None
        for key_suffix in [
            "expert_feed_forward_length",
            "feed_forward_length",
            "ffn_length",
            "intermediate_size",
        ]:
            key = f"{arch}.{key_suffix}"
            if key in self.metadata:
                val = self.metadata[key]
                moe_intermediate_size = int(val[0]) if isinstance(val, (list, np.ndarray)) else int(val)
                break

        if any(v is None for v in [num_experts, hidden_size, moe_intermediate_size]):

            base_key = f"blk.{layer_idx}.ffn_gate_exps.weight"
            if base_key in self.tensor_info:
                gate_shape = self.tensor_info[base_key]["shape"]
                print(f"  Found tensor '{base_key}' with shape: {gate_shape}")

                if len(gate_shape) >= 3:
                    if num_experts is None:
                        num_experts = int(gate_shape[0])
                    if moe_intermediate_size is None:
                        moe_intermediate_size = int(gate_shape[1])
                    if hidden_size is None:
                        hidden_size = int(gate_shape[2])

        config = {
            "num_experts": num_experts,
            "num_experts_per_tok": num_experts_per_tok,
            "hidden_size": hidden_size,
            "moe_intermediate_size": moe_intermediate_size,
        }

        return config

    def print_metadata(self, filter_keywords=None):
        """
        Print GGUF file metadata for debugging.

        Args:
            filter_keywords: Optional list of keywords to filter metadata keys
        """
        print(f"\n[GGUFLoader] GGUF Metadata:")
        print(f"  Total metadata entries: {len(self.metadata)}")

        if filter_keywords:
            filtered = {
                k: v for k, v in self.metadata.items() if any(kw.lower() in k.lower() for kw in filter_keywords)
            }
            for k, v in sorted(filtered.items()):
                print(f"  {k}: {v}")
        else:
            for k, v in sorted(self.metadata.items()):
                print(f"  {k}: {v}")

    def has_tensor(self, name: str):
        """Check if tensor exists"""
        name = translate_name_to_gguf(name)
        return name in self.tensor_info

    def get_ggml_type(self, name: str):
        """Get GGML type of a tensor"""
        name = translate_name_to_gguf(name)
        if name not in self.tensor_info:
            raise KeyError(f"Tensor '{name}' not found in GGUF files")
        return self.tensor_info[name]["dtype"]

    def get_undequanted_tensor_and_ggml_type(self, name: str):
        """
        Get tensor data and its GGML type without dequantizing

        Args:
            name: Tensor name (in PyTorch format, will be translated to GGUF format)

        Returns:
            (data, ggml_type): Tuple of tensor data and GGML quantization type
        """
        name = translate_name_to_gguf(name)

        if name not in self.tensor_info:
            raise KeyError(f"Tensor '{name}' not found in GGUF files")

        info = self.tensor_info[name]
        file_path = self.tensor_file_map[name]
        mmap_data = self.file_data_map[file_path]

        offset = info["offset"]
        n_elements = info["n_elements"]
        ggml_type = info["dtype"]

        GGML_QUANT_SIZES = {
            GGMLQuantizationType.F32: (1, 4),
            GGMLQuantizationType.F16: (1, 2),
            GGMLQuantizationType.BF16: (1, 2),
            GGMLQuantizationType.Q4_0: (32, 2 + 16),
            GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
            GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
            GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
            GGMLQuantizationType.Q8_0: (32, 2 + 32),
            GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
            GGMLQuantizationType.Q2_K: (256, 2 + 2 + 256 // 16 + 256 // 4),
            GGMLQuantizationType.Q3_K: (256, 2 + 256 // 4 + 256 // 8 + 12),
            GGMLQuantizationType.Q4_K: (256, 2 + 2 + 256 // 2 + 12),
            GGMLQuantizationType.Q5_K: (256, 2 + 2 + 256 // 2 + 256 // 8 + 12),
            GGMLQuantizationType.Q6_K: (256, 2 + 256 // 2 + 256 // 4 + 256 // 16),
            GGMLQuantizationType.Q8_K: (256, 4 + 256 + 256 // 8),
            GGMLQuantizationType.IQ2_XXS: (256, 2 + 256 // 4),
            GGMLQuantizationType.IQ2_XS: (256, 2 + 256 // 4 + 256 // 32),
            GGMLQuantizationType.IQ3_XXS: (256, 2 + 256 // 4 + 256 // 8),
            GGMLQuantizationType.IQ1_S: (256, 2 + 256 // 8 + 256 // 16),
            GGMLQuantizationType.IQ4_NL: (32, 2 + 16),
            GGMLQuantizationType.IQ3_S: (256, 2 + 256 // 4 + 256 // 8 + 256 // 32 + 4),
            GGMLQuantizationType.IQ2_S: (256, 2 + 256 // 4 + 256 // 16),
            GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + 256 // 2 + 256 // 64),
            GGMLQuantizationType.I8: (1, 1),
            GGMLQuantizationType.I16: (1, 2),
            GGMLQuantizationType.I32: (1, 4),
            GGMLQuantizationType.I64: (1, 8),
            GGMLQuantizationType.F64: (1, 8),
            GGMLQuantizationType.IQ1_M: (256, 256 // 8 + 256 // 16 + 256 // 32),
        }

        block_size, type_size = GGML_QUANT_SIZES[ggml_type]
        n_bytes = n_elements * type_size // block_size

        data_bytes = mmap_data[offset : offset + n_bytes]
        data = torch.from_numpy(np.frombuffer(data_bytes, dtype=np.uint8).copy())

        return data, ggml_type


================================================
FILE: kt-kernel/python/utils/moe_kernel.py
================================================
import os
import torch
import ctypes
from typing import Optional

# Use relative imports for package structure
from ..experts_base import BaseMoEWrapper
from .loader import SafeTensorLoader
from kt_kernel_ext.moe import MOEConfig

try:
    from kt_kernel_ext.moe import Int8_KERNEL_MOE

    _HAS_INT8_SUPPORT = True
except (ImportError, AttributeError):
    Int8_KERNEL_MOE = None
    _HAS_INT8_SUPPORT = False
try:
    from kt_kernel_ext.moe import Int4_KERNEL_MOE

    _HAS_INT4_SUPPORT = True
except (ImportError, AttributeError):
    Int4_KERNEL_MOE = None
    _HAS_INT4_SUPPORT = False

from typing import Optional


class GeneralMoEWrapper(BaseMoEWrapper):
    """
    moe-based MoE wrapper implementation.
    Supports MOE_INT4 and MOE_INT8 quantization methods.
    """

    _safetensor_loader_instance = None  # Singleton SafeTensorLoader

    def __init__(
        self,
        layer_idx: int,
        num_experts: int,
        num_experts_per_tok: int,
        hidden_size: int,
        moe_intermediate_size: int,
        gpu_experts_mask: Optional[torch.Tensor],
        cpuinfer_threads: int,
        threadpool_count: int,
        weight_path: str,
        chunked_prefill_size: int,
        cpu_save: bool = False,
        max_deferred_experts_per_token: Optional[int] = None,
        method: str = "MOE_INT8",
    ):
        """
        Initialize general MoE Wrapper.

        Args:
            layer_idx: Layer index
            num_experts: Total number of experts
            num_experts_per_tok: Number of experts per token (top-k)
            hidden_size: Hidden dimension size
            moe_intermediate_size: MoE intermediate size
            gpu_experts_mask: Boolean mask indicating which experts are on GPU.
                              Shape: [num_experts], dtype: torch.bool.
                              mask[i] = True means expert i is on GPU.
                              If None, all experts are on CPU.
            cpuinfer_threads: Number of CPU inference threads
            threadpool_count: Number of NUMA subpools
            weight_path: Path to weights (SafeTensor format)
            chunked_prefill_size: Maximum prefill chunk size
            cpu_save: Whether to save weights to CPU memory
            max_deferred_experts_per_token: Number of experts per token to defer. Defaults to 0.
            method: general quantization method ("MOE_INT4" or "MOE_INT8")
        """
        if not _HAS_INT4_SUPPORT and method == "MOE_INT4":
            raise RuntimeError(
                "MoE_INT4 backend not available. kt_kernel_ext was not compiled with int4 support.\n"
                "Please recompile with int4 enabled."
            )
        if not _HAS_INT8_SUPPORT and method == "MOE_INT8":
            raise RuntimeError(
                "MoE_INT8 backend not available. kt_kernel_ext was not compiled with int8 support.\n"
                "Please recompile with int8 enabled."
            )

        # Initialize base class
        super().__init__(
            layer_idx=layer_idx,
            num_experts=num_experts,
            num_experts_per_tok=num_experts_per_tok,
            hidden_size=hidden_size,
            moe_intermediate_size=moe_intermediate_size,
            gpu_experts_mask=gpu_experts_mask,
            cpuinfer_threads=cpuinfer_threads,
            threadpool_count=threadpool_count,
            weight_path=weight_path,
            chunked_prefill_size=chunked_prefill_size,
            cpu_save=cpu_save,
            max_deferred_experts_per_token=max_deferred_experts_per_token,
            method=method,
        )

        # moe-specific: Check if we should load merged safetensor weights
        self.load_merged_weight = False
        import glob

        if glob.glob(os.path.join(weight_path, "*.safetensors")):
            self.load_merged_weight = True

        # Initialize SafeTensor loader (singleton)
        if self.load_merged_weight:
            if GeneralMoEWrapper._safetensor_loader_instance is None:
                GeneralMoEWrapper._safetensor_loader_instance = SafeTensorLoader(weight_path)
            self.safetensor_loader = GeneralMoEWrapper._safetensor_loader_instance

        # moe-specific weight storage
        self.gate_weights = None
        self.up_weights = None
        self.down_weights = None
        self.gate_scales = None
        self.up_scales = None
        self.down_scales = None

    def load_weights_from_tensors(
        self,
        gate_proj: torch.Tensor,
        up_proj: torch.Tensor,
        down_proj: torch.Tensor,
        physical_to_logical_map_cpu: torch.Tensor,
    ):
        """
        Load and quantize weights from BF16/FP16 tensors (online quantization).

        Args:
            gate_proj: Gate projection weights [num_experts, intermediate_size, hidden_size]
            up_proj: Up projection weights [num_experts, intermediate_size, hidden_size]
            down_proj: Down projection weights [num_experts, hidden_size, intermediate_size]
            physical_to_logical_map_cpu: Mapping from physical to logical expert IDs
        """
        # Store tensors as instance variables to keep them alive
        self.gate_proj = gate_proj.contiguous()
        self.up_proj = up_proj.contiguous()
        self.down_proj = down_proj.contiguous()

        # Configure MoE with online quantization (cpu_save mode)
        moe_config = MOEConfig(
            self.num_experts,
            self.num_experts_per_tok,
            self.hidden_size,
            self.moe_intermediate_size,
            self.gpu_experts_mask.data_ptr(),
        )
        moe_config.layer_idx = self.layer_idx
        moe_config.pool = self.cpu_infer.backend_
        moe_config.max_len = self.chunked_prefill_size

        # Enable save mode for online quantization
        moe_config.save = True
        moe_config.load = False

        # Set weight pointers
        moe_config.gate_proj = self.gate_proj.data_ptr()
        moe_config.up_proj = self.up_proj.data_ptr()
        moe_config.down_proj = self.down_proj.data_ptr()

        # Set output path for quantized weights
        moe_config.path = self.weight_path

        # Create MoE module based on method
        if self.method == "MOE_INT4":
            self.moe = Int4_KERNEL_MOE(moe_config)
        elif self.method == "MOE_INT8":
            self.moe = Int8_KERNEL_MOE(moe_config)
        else:
            raise NotImplementedError(f"Unsupported MoE method: {self.method}")

        # Submit quantization and save task
        self.cpu_infer.submit(self.moe.load_weights_task(physical_to_logical_map_cpu.data_ptr()))
        self.cpu_infer.sync()

    def load_weights(self, physical_to_logical_map_cpu: torch.Tensor):
        """
        Load weights for this layer and initialize the MoE module.

        Args:
            physical_to_logical_map_cpu: Mapping from physical to logical expert IDs
        """
        gate_ptr = 0
        up_ptr = 0
        down_ptr = 0

        gate_ptrs = []
        up_ptrs = []
        down_ptrs = []

        gate_scale_ptrs = []
        up_scale_ptrs = []
        down_scale_ptrs = []

        if self.load_merged_weight:
            base_key = f"blk.{self.layer_idx}"
            w = self.safetensor_loader.load_experts(base_key)

            self.gate_weights = w["gate"]
            self.up_weights = w["up"]
            self.down_weights = w["down"]
            self.gate_scales = w["gate_scale"]
            self.up_scales = w["up_scale"]
            self.down_scales = w["down_scale"]

            # Get pointers to weight arrays
            gate_ptrs = [
                [
                    ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents)
                    for et in numa_array
                ]
                for numa_array in self.gate_weights
            ]

            up_ptrs = [
                [
                    ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents)
                    for et in numa_array
                ]
                for numa_array in self.up_weights
            ]

            down_ptrs = [
                [
                    ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents)
                    for et in numa_array
                ]
                for numa_array in self.down_weights
            ]

            gate_scale_ptrs = [
                [
                    ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents)
                    for et in numa_array
                ]
                for numa_array in self.gate_scales
            ]

            up_scale_ptrs = [
                [
                    ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents)
                    for et in numa_array
                ]
                for numa_array in self.up_scales
            ]

            down_scale_ptrs = [
                [
                    ctypes.addressof(ctypes.cast(et.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents)
                    for et in numa_array
                ]
                for numa_array in self.down_scales
            ]

        # Configure MoE
        moe_config = MOEConfig(
            self.num_experts,
            self.num_experts_per_tok,
            self.hidden_size,
            self.moe_intermediate_size,
            self.gpu_experts_mask.data_ptr(),
        )
        moe_config.layer_idx = self.layer_idx
        moe_config.pool = self.cpu_infer.backend_
        moe_config.max_len = self.chunked_prefill_size

        moe_config.gate_proj = gate_ptr
        moe_config.up_proj = up_ptr
        moe_config.down_proj = down_ptr
        moe_config.gate_projs = gate_ptrs
        moe_config.up_projs = up_ptrs
        moe_config.down_projs = down_ptrs
        moe_config.gate_scales = gate_scale_ptrs
        moe_config.up_scales = up_scale_ptrs
        moe_config.down_scales = down_scale_ptrs

        if self.cpu_save:
            moe_config.save = True
            moe_config.load = False
            base_key = f"model.layers.{self.layer_idx}"
            w = self.safetensor_loader.load_experts(base_key)

            self.gate_proj = torch.cat(w["gate_weight"], dim=0).contiguous()
            self.up_proj = torch.cat(w["up_weight"], dim=0).contiguous()
            self.down_proj = torch.cat(w["down_weight"], dim=0).contiguous()

            moe_config.gate_proj = self.gate_proj.data_ptr()
            moe_config.up_proj = self.up_proj.data_ptr()
            moe_config.down_proj = self.down_proj.data_ptr()
        else:
            moe_config.load = True

        if not self.load_merged_weight:
            moe_config.path = self.weight_path

        # Create MoE module based on moe method
        if self.method == "MOE_INT4":
            self.moe = Int4_KERNEL_MOE(moe_config)
        elif self.method == "MOE_INT8":
            self.moe = Int8_KERNEL_MOE(moe_config)
        else:
            raise NotImplementedError(f"Unsupported MoE method: {self.method}")

        # Load weights
        self.cpu_infer.submit(self.moe.load_weights_task(physical_to_logical_map_cpu.data_ptr()))
        self.cpu_infer.sync()

        # Clean up temporary weight storage if using merged weights
        if self.load_merged_weight:
            del self.gate_weights
            del self.up_weights
            del self.down_weights
            del self.gate_scales
            del self.up_scales
            del self.down_scales


================================================
FILE: kt-kernel/requirements.txt
================================================
# Optional: Install these if not already available in your environment
# These dependencies will be automatically installed when running `pip install .`
# You can skip this file if you already have these packages installed

# Core dependencies (minimum versions)
torch>=2.0.0
safetensors>=0.4.0
compressed-tensors>=0.7.0
numpy>=1.24.0
triton>=2.0.0
gguf>=0.17.0
# Development dependencies
black>=25.9.0


================================================
FILE: kt-kernel/scripts/README.md
================================================
# Weight Quantization Tools

KT-Kernel provides weight conversion tools for CPU-GPU hybrid inference (e.g., integrating KTransformers with SGLang). Both tools work together to enable heterogeneous expert placement:

- **CPU Weights (`convert_cpu_weights.py`)**: Quantize weights to INT4/INT8 with AMX optimization for CPU-resident "cold" experts
- **GPU Weights (`convert_gpu_weights.py`)**: Apply GPTQ/RTN quantization (W4A16/W8A16) for GPU-resident "hot" experts

---

## CPU Weight Quantization

Convert weights to INT4/INT8 format optimized for AMX inference on CPU. These quantized weights are used for "cold" experts (less frequently accessed) that run on CPU in hybrid inference scenarios.

### Quantization Methods

- **INT4**: 4-bit quantization for maximum memory efficiency
- **INT8**: 8-bit quantization for better accuracy

### Supported Input Formats

- **FP8**: 8-bit floating point with automatic dequantization
- **FP16**: 16-bit floating point
- **BF16**: BFloat16 format

> **⚠️ Precision Warning:** Quantizing directly from FP8 to INT4/INT8 may cause significant accuracy degradation. For best results, use the original **BF16** model as the source for INT4/INT8 quantization.

## Basic Usage

### Quantize BF16 model to INT4

```bash
python scripts/convert_cpu_weights.py \
  --input-path /path/to/bf16/model \
  --input-type bf16 \
  --output /path/to/output \
  --quant-method int4
```

### Quantize FP16 model to INT8

```bash
python scripts/convert_cpu_weights.py \
  --input-path /path/to/fp16/model \
  --input-type fp16 \
  --output /path/to/output \
  --quant-method int8
```

### Quantize FP8 model to INT4

```bash
python scripts/convert_cpu_weights.py \
  --input-path /path/to/fp8/model \
  --input-type fp8 \
  --output /path/to/output \
  --quant-method int4
```

## Output Format

By default, the converted weights are saved in SafeTensors format with NUMA-aware layout:

```
output_dir/
├── model-00001-of-00050.safetensors
├── model-00002-of-00050.safetensors
├── ...
├── config.json
└── tokenizer files...
```

Each expert's weights are split across NUMA nodes for optimal memory access:
- `blk.{layer}.ffn_{proj}_exps.{expert}.numa.{numa_idx}.weight`: Quantized weights
- `blk.{layer}.ffn_{proj}_exps.{expert}.numa.{numa_idx}.scale`: Quantization scales

## Advanced Options

### Low Memory Mode

For systems with insufficient memory to complete full model quantization, use the `--no-merge-safetensor` flag to keep weights in layer folder structure without merging into safetensor files:

```bash
python scripts/convert_cpu_weights.py \
  --input-path /path/to/model \
  --input-type bf16 \
  --output /path/to/output \
  --quant-method int4 \
  --no-merge-safetensor
```

This will save quantized weights in the following folder structure:

```
output_dir/
├── _layer_0/
│   ├── _numa_0/
│   │   ├── INT4_down_0_*.kt
│   │   ├── INT4_gate_0_*.kt
│   │   └── INT4_up_0_*.kt
│   └── _numa_1/
│       └── ...
├── _layer_1/
│   └── ...
└── ...
```

**When to use `--no-merge-safetensor`:**
- Machine runs out of memory during the merge step
- Need to process very large models on memory-constrained systems
- Want to preserve intermediate layer-wise quantized weights

### Resume Layer

For memory-constrained systems that are unable to complete quantization despite enabling low memory mode with `--no-merge-safetensor`, restart the script with the `--resume-layer` arg to specify the layer from which to continue the conversion process. In the example below, we skip layers 0-11 and resume conversion starting with layer 12.

```bash
python scripts/convert_cpu_weights.py \
  --input-path /path/to/model \
  --input-type bf16 \
  --output /path/to/output \
  --quant-method int4 \
  --no-merge-safetensor
  --resume-layer 12
```

## Examples

### Example 1: Quantize DeepSeek-V3.1 (FP8 → INT4)

```bash
python scripts/convert_cpu_weights.py \
  --input-path /mnt/data/models/DeepSeek-V3.1 \
  --input-type fp8 \
  --output /mnt/data/models/DeepSeek-V3.1-INT4 \
  --quant-method int4 \
  --cpuinfer-threads 60 \
  --threadpool-count 2
```

### Example 2: Quantize Qwen3-Next-80B (BF16 → INT4, Low Memory)

```bash
python scripts/convert_cpu_weights.py \
  --input-path /mnt/data/models/Qwen3-Next-80B-A3B-Instruct \
  --input-type bf16 \
  --output /mnt/data/models/Qwen3-Next-80B-A3B-Instruct-INT4 \
  --quant-method int4 \
  --cpuinfer-threads 60 \
  --threadpool-count 2 \
  --no-merge-safetensor
```

---

## GPU Weight Quantization

### Prerequisites

GPU weight quantization requires additional dependencies. Install them before proceeding:

```bash
pip install accelerate transformers llmcompressor datasets
```

**Required packages:**
- `accelerate`: For distributed model loading and device mapping
- `transformers`: For model and tokenizer loading
- `llmcompressor`: For quantization (supports GPTQ and RTN methods)
- `datasets`: For calibration data loading (GPTQ only)

**Documentation:** This tool is based on llmcompressor. For more details, see [llmcompressor quantization guide](https://docs.vllm.ai/projects/llm-compressor/en/latest/getting-started/compress/#select-a-quantization-method-and-scheme).

### Overview

Apply weight quantization to model weights for GPU-resident "hot" experts (frequently accessed) in CPU-GPU hybrid inference. This tool works together with `convert_cpu_weights.py` to enable heterogeneous expert placement:

- **GPU-resident experts** ("hot" experts) use GPTQ/RTN quantization (this tool) for efficient GPU memory usage
- **CPU-resident experts** ("cold" experts) use AMX-optimized INT4/INT8 quantization (convert_cpu_weights.py)
- **Attention layers, gates, and shared experts** remain in higher precision

This approach maximizes throughput and resource utilization by intelligently distributing experts across CPUs and GPUs.

### Quantization Methods

#### 1. GPTQ (Calibration-based, Default)
**Pros:**
- Higher accuracy through calibration-based quantization
- Recommended for production deployments

**Cons:**
- Requires calibration dataset
- Slower quantization process
- Higher memory requirements (needs Hessian matrix)

#### 2. RTN (Round-To-Nearest)
**Pros:**
- Fast quantization (no calibration needed)
- Lower memory requirements
- Good for quick testing and prototyping

**Cons:**
- Slightly lower accuracy compared to GPTQ
- No calibration optimization

### Quantization Types

- **W4A16**: 4-bit weights, 16-bit activations (INT4)
- **W8A16**: 8-bit weights, 16-bit activations (INT8)

### Basic Usage

#### GPTQ Quantization (Recommended for Production)
```bash
python scripts/convert_gpu_weights.py \
  --model_id /path/to/model \
  --output_dir /path/to/output \
  --quant_method GPTQ \
  --quant_type W4A16
```

#### RTN Quantization (Fast, for Testing)
```bash
python scripts/convert_gpu_weights.py \
  --model_id /path/to/model \
  --output_dir /path/to/output \
  --quant_method RTN \
  --quant_type W4A16
```

### Memory Requirements

Understanding memory requirements is crucial for successful quantization. The requirements differ significantly between RTN and GPTQ methods.

#### RTN Memory Requirements

RTN only requires memory for quantization parameters (scales/zero-points):

| Component | Requirement |
|-----------|-------------|
| **DRAM (CPU Memory)** | ≥ Total model parameters |
| **VRAM (GPU Memory)** | ≥ Single layer parameters |

**Example: DeepSeek-R1-0528-BF16 (684B parameters)**
- DRAM: ~1368 GB (684B params × 2 bytes)
- VRAM: ~22.4 GB (1 layer)

#### GPTQ Memory Requirements

GPTQ requires additional memory for Hessian matrices during calibration:

| Component | Requirement |
|-----------|-------------|
| **DRAM (CPU Memory)** | ≥ Total model parameters |
| **VRAM (GPU Memory)** | ≥ Single layer parameters × 2 |

The Hessian matrix is approximately the same size as the layer weights and is used to increase accuracy recovery.

**Example: DeepSeek-R1-0528-BF16 (684B parameters)**
- DRAM: ~1368 GB (684B params × 2 bytes)
- VRAM: ~44.8 GB (1 layer × 2 for Hessian matrix)

#### Method Comparison

| Method | Speed | VRAM | Accuracy | Use Case |
|--------|-------|------|----------|----------|
| **RTN** | Fast | Low (~22GB) | Good | Testing, prototyping |
| **GPTQ** | Slow | High (~45GB) | Better | Production deployment |

### Advanced Options

#### Calibration Configuration (GPTQ Only)

For GPTQ quantization, control the calibration process for better quantization quality:

```bash
python scripts/convert_gpu_weights.py \
  --model_id /path/to/model \
  --output_dir /path/to/output \
  --quant_method GPTQ \
  --quant_type W4A16 \
  --num_calibration_samples 512 \
  --max_sequence_length 2048 \
  --dataset HuggingFaceH4/ultrachat_200k \
  --dataset_split train_sft
```

**Options (GPTQ only):**
- `--num_calibration_samples`: Number of samples for calibration (default: 512)
- `--max_sequence_length`: Maximum sequence length (default: 2048)
- `--dataset`: HuggingFace dataset for calibration
- `--dataset_split`: Dataset split to use
- `--dampening_frac`: Dampening fraction to reduce quantization noise (default: 0.1)

#### Memory Management

Use `--max_gpu_memory` to limit GPU memory usage and offload remaining layers to CPU:

```bash
python scripts/convert_gpu_weights.py \
  --model_id /path/to/model \
  --output_dir /path/to/output \
  --quant_method GPTQ \
  --quant_type W4A16 \
  --max_gpu_memory "40GiB"
```

**Recommended settings for GPTQ:**

| GPU VRAM | Suggested `--max_gpu_memory` | Notes |
|----------|------------------------------|-------|
| 24 GiB   | 10-12 GiB | Reserve ~50% for Hessian |
| 48 GiB   | 24-30 GiB | Reserve ~40% for Hessian |
| 80 GiB   | 40-50 GiB | Reserve ~40% for Hessian |

**Recommended settings for RTN:**

| GPU VRAM | Suggested `--max_gpu_memory` | Notes |
|----------|------------------------------|-------|
| 24 GiB   | 18-20 GiB | No Hessian needed |
| 48 GiB   | 40-45 GiB | No Hessian needed |
| 80 GiB   | 70-75 GiB | No Hessian needed |

**Options:**
- `--max_gpu_memory`: Maximum GPU memory for model weights per device (e.g., '40GiB')
- `--max_cpu_memory`: Maximum CPU memory (default: 1000GiB when `--max_gpu_memory` is set)

**Important:** llmcompressor does not support disk offloading. Ensure your machine has enough GPU + CPU memory to load the entire model. If you still encounter OOM:
1. Use RTN instead of GPTQ (requires less memory)
2. Reduce `--num_calibration_samples` (GPTQ only, e.g., 256)
3. Reduce `--max_sequence_length` (GPTQ only, e.g., 1024)
4. Use `--force_cpu` to run entirely on CPU (slower but avoids GPU OOM)

### Examples

#### Example 1: GPTQ Quantization for Production (Qwen3-Next-80B, W4A16)

```bash
python scripts/convert_gpu_weights.py \
  --model_id /mnt/data/models/Qwen3-Next-80B-A3B-Instruct \
  --output_dir /mnt/data/models/Qwen3-Next-80B-A3B-Instruct-GPTQ-W4A16 \
  --quant_method GPTQ \
  --quant_type W4A16 \
  --num_calibration_samples 512 \
  --max_sequence_length 2048 \
  --max_gpu_memory "40GiB" \
  --trust_remote_code
```

#### Example 2: RTN Quantization for Fast Testing (DeepSeek-R1, W4A16)

```bash
python scripts/convert_gpu_weights.py \
  --model_id /mnt/data/models/DeepSeek-R1-0528-BF16 \
  --output_dir /mnt/data/models/DeepSeek-R1-0528-RTN-W4A16 \
  --quant_method RTN \
  --quant_type W4A16 \
  --max_gpu_memory "70GiB" \
  --trust_remote_code
```

#### Example 3: GPTQ with Custom Calibration Dataset (GLM-4.5-Air, W8A16)

```bash
python scripts/convert_gpu_weights.py \
  --model_id /mnt/data/models/GLM-4.5-Air \
  --output_dir /mnt/data/models/GLM-4.5-Air-GPTQ-W8A16 \
  --quant_method GPTQ \
  --quant_type W8A16 \
  --dataset "tatsu-lab/alpaca" \
  --dataset_split "train" \
  --num_calibration_samples 256 \
  --max_gpu_memory "40GiB" \
  --trust_remote_code
```


================================================
FILE: kt-kernel/scripts/check.py
================================================
import os

# insert the path of the project
import sys

# sys.path.insert(0, "/home/azure/ktransformers")
import argparse
import torch
from safetensors import safe_open
from safetensors.torch import save_file
import re
from collections import defaultdict
import itertools
import os
import torch
import numpy as np

tensor_from_amx = [".mlp.experts."]  # todo: add keys in gguf that should be used in the final tensor


def safe_open_binary_to_tensor(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"文件不存在: {file_path}")

    if not os.access(file_path, os.R_OK):
        raise PermissionError(f"没有权限读取文件: {file_path}")

    try:
        with open(file_path, "rb") as f:
            binary_data = f.read()

        np_array = np.frombuffer(binary_data, dtype=np.int8)

        tensor = torch.from_numpy(np_array)

        return tensor

    except Exception as e:
        raise IOError(f"file process error: {str(e)}")


def read_safetensor_keys_from_folder(folder_path) -> dict:
    """
    :param folder_path: folder path
    :return: key_to_file_map
    """
    # check if the folder path is exist
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"GGUF dir not found: {folder_path}")
    if os.path.isfile(folder_path):
        folder_path = os.path.dirname(folder_path)

    key_to_file_map = {}

    found_safetensor = False
    for root, dirs, files in os.walk(folder_path):
        # sort files
        files = sorted(files)
        for file in files:
            if file.endswith(".safetensors"):
                found_safetensor = True
                file_path = os.path.join(root, file)
                try:
                    with safe_open(file_path, framework="pt") as f:
                        for key in f.keys():
                            if "model.layers.61" in key:
                                # skip MTP layer
                                continue
                            # try:
                            #     if int(key.split('.')[2]) > 4:
                            #         continue
                            # except:
                            #     pass
                            key_to_file_map[key] = file_path
                except Exception as e:
                    print(f"Error reading Safetensor file {file_path}: {e}")

    if not found_safetensor:
        raise FileNotFoundError(f"No Safetensor files found in {folder_path}")

    return key_to_file_map


def read_amx_tensor_from_folder(folder_path, keys) -> dict:
    layer_list = [f"_layer_{i}" for i in range(3, 61)]
    numa_list = ["_numa_0", "_numa_1"]

    down_list = [f"INT4_down_{i}_quant_.kt" for i in range(256)]
    gate_list = [f"INT4_gate_{i}_quant_.kt" for i in range(256)]
    up_list = [f"INT4_up_{i}_quant_.kt" for i in range(256)]
    down_scale_list = [f"INT4_down_{i}_scale_.kt" for i in range(256)]
    gate_scale_list = [f"INT4_gate_{i}_scale_.kt" for i in range(256)]
    up_scale_list = [f"INT4_up_{i}_scale_.kt" for i in range(256)]
    target = ["ffn_up_exps", "ffn_down_exps", "ffn_gate_exps"]
    tensor_file_map = {}
    for key in keys:
        layer = int(key.split(".")[1])
        if layer < 3:
            continue
        layer_path = f"_layer_{layer}"
        # concatenate the path layer/numa/(down|gate|up)_(0-255)_3670016Byte_quant_.kt
        # store the path in the tensor_file_map
        # key = key+'.idx.weight'
        # scale_key = key+'.idx.scale'
        for numa_idx, numa in enumerate(numa_list):
            # TODO: 256 should be a variable
            for i in range(256):
                prefix_key = ".".join(key.split(".")[:-1])

                experts_key = prefix_key + f".{i}.numa.{numa_idx}.weight"
                scale_key = prefix_key + f".{i}.numa.{numa_idx}.scale"
                if "down" in experts_key:
                    tensor_file_map[experts_key] = os.path.join(folder_path, layer_path, numa, down_list[i])
                    tensor_file_map[scale_key] = os.path.join(folder_path, layer_path, numa, down_scale_list[i])
                elif "gate" in experts_key:
                    tensor_file_map[experts_key] = os.path.join(folder_path, layer_path, numa, gate_list[i])
                    tensor_file_map[scale_key] = os.path.join(folder_path, layer_path, numa, gate_scale_list[i])
                elif "up" in experts_key:
                    tensor_file_map[experts_key] = os.path.join(folder_path, layer_path, numa, up_list[i])
                    tensor_file_map[scale_key] = os.path.join(folder_path, layer_path, numa, up_scale_list[i])
    return tensor_file_map


# def translate_name(name:str)->str:
#     """
#     :param name: name of the tensor
#     :return: translated name
#     """
#     name = translate_name_to_gguf(name)
#     name = name.replace(".up_proj.", ".ffn_up_exps.")
#     name = name.replace(".down_proj.", ".ffn_down_exps.")
#     name = name.replace(".gate_proj.", ".ffn_gate_exps.")
#     name = name.replace(".ffn_gate_inp.e_score_correction_bias", ".exp_probs_b.bias")
#     return name


def _clean_keys(keys):
    keys = list(keys)
    target = ["ffn_up_exps", "ffn_down_exps", "ffn_gate_exps"]
    # only keep the keys that contain the target
    keys = [key for key in keys if any(target_key in key for target_key in target) and "ggml_type" not in key]
    return keys


def combine_tensor_sources(safetensor_path, amx_path):
    safetensor_tensor_file_map = read_safetensor_keys_from_folder(safetensor_path)

    keys = _clean_keys(safetensor_tensor_file_map.keys())

    amx_tensor_file_map = read_amx_tensor_from_folder(amx_path, keys)
    target_tensor_map = {}
    for key in safetensor_tensor_file_map.keys():
        if "_exps." in key:
            continue

        target_tensor_map[key] = safetensor_tensor_file_map[key]

    for key in amx_tensor_file_map.keys():
        target_tensor_map[key] = amx_tensor_file_map[key]

    return target_tensor_map


def write_combined_tensor(target_tensor_map: dict, output_path: str):
    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)

    # Cache for safetensor file handles and GGUF loaders
    safetensors_cache = {}
    amx_cache = {}

    # Group tensors by layer
    layer_groups = defaultdict(list)
    non_layer_keys = []
    layer_pattern = re.compile(r"blk\.(\d+)\.")

    for key in target_tensor_map:
        match = layer_pattern.search(key)
        if match:
            layer_groups[int(match.group(1))].append(key)
        else:
            non_layer_keys.append(key)

    # Calculate the number of shards
    total_shards = len(layer_groups) + (1 if non_layer_keys else 0) - 1

    shard_idx = 0
    # Save non-layer tensors to the first shard if they exist
    if non_layer_keys:
        tensors = {}
        for key in non_layer_keys:
            file_path = target_tensor_map[key]
            tensor = None
            ggml_type = None
            if file_path.endswith(".safetensors"):
                if file_path not in safetensors_cache:
                    safetensors_cache[file_path] = safe_open(file_path, framework="pt")
                f = safetensors_cache[file_path]
                tensor = f.get_tensor(key)
            elif file_path.endswith(".kt"):
                tensor = safe_open_binary_to_tensor(file_path)
            else:
                raise ValueError(f"Unsupported file format: {file_path}")
            tensors[key] = tensor

        output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors")
        print(f"Saving non-layer tensors to {output_file}")
        save_file(tensors, output_file)
        shard_idx += 1

    # Save each layer's tensors to subsequent shards
    for layer_num in sorted(layer_groups.keys()):
        layer_keys = layer_groups[layer_num]
        tensors = {}
        for key in layer_keys:
            file_path = target_tensor_map[key]
            tensor = None
            ggml_type = None
            if file_path.endswith(".safetensors"):
                if file_path not in safetensors_cache:
                    safetensors_cache[file_path] = safe_open(file_path, framework="pt")
                f = safetensors_cache[file_path]
                tensor = f.get_tensor(key)
                tensor_info = tensor.shape
            elif file_path.endswith(".kt"):
                tensor = safe_open_binary_to_tensor(file_path)
            else:
                raise ValueError(f"Unsupported file format: {file_path}")
            tensors[key] = tensor

        output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors")
        print(f"Saving layer {layer_num} to {output_file}")
        save_file(tensors, output_file)
        shard_idx += 1
    return


def main():
    # 输入已经处理过的混合模型路径，提前处理好的amx路径，输出路径
    parser = argparse.ArgumentParser(description="Read parameters from Safetensor and GGUF files")
    parser.add_argument(
        "--safetensor_path",
        type=str,
        help="Path to the Safetensor file",
        default="/mnt/data/models/DeepSeek-R1-GGML-FP8-Hybrid/DeepSeek-R1-IQ1S-FP8",
    )
    parser.add_argument(
        "--amx_path", type=str, help="Path to the GGUF file", default="/mnt/data/models/DeepSeek-R1-INT4"
    )
    parser.add_argument(
        "--output_path",
        type=str,
        help="Path to the output file",
        default="/mnt/data/models/DeepSeek-R1-GGML-FP8-Hybrid/DeepSeek-R1-AMXQ4-FP8",
    )

    # print all the arguments
    print("All the arguments:")
    print(parser.parse_args())

    # 解析命令行参数
    args = parser.parse_args()

    safetensor_path = args.safetensor_path
    amx_path = args.amx_path
    output_path = args.output_path

    target_tensor_map = combine_tensor_sources(safetensor_path, amx_path)
    for key, value in target_tensor_map.items():
        print(f"{key}: {value}")
    write_combined_tensor(target_tensor_map, output_path)

    return


if __name__ == "__main__":
    main()


================================================
FILE: kt-kernel/scripts/check_cpu_features.py
================================================
#!/usr/bin/env python3
"""
CPU feature detection script for kt-kernel.

This script checks if your CPU supports the required instruction sets for FP8 MoE:
- AVX512F (foundation)
- AVX512_BF16 (BF16 dot product)
- AVX512_VNNI (VNNI instructions)
- AVX512_VBMI (byte permutation)

Usage:
    python3 scripts/check_cpu_features.py
"""

import os
import sys


def check_cpuinfo():
    """Check CPU features via /proc/cpuinfo."""
    try:
        with open("/proc/cpuinfo", "r") as f:
            cpuinfo = f.read().lower()
        return cpuinfo
    except FileNotFoundError:
        return None


def main():
    print("=" * 70)
    print("KT-Kernel CPU Feature Detection")
    print("=" * 70)
    print()

    cpuinfo = check_cpuinfo()

    if cpuinfo is None:
        print("❌ /proc/cpuinfo not found (not on Linux?)")
        print("   Cannot detect CPU features automatically.")
        sys.exit(1)

    # Extract CPU model
    for line in cpuinfo.split("\n"):
        if "model name" in line:
            model = line.split(":")[1].strip()
            print(f"CPU Model: {model}")
            break
    print()

    # Check AMX support
    print("AMX Support (Intel Sapphire Rapids+):")
    amx_flags = ["amx_tile", "amx_int8", "amx_bf16"]
    amx_status = {}
    for flag in amx_flags:
        has_flag = flag in cpuinfo
        amx_status[flag] = has_flag
        status = "✅" if has_flag else "❌"
        print(f"  {status} {flag.upper()}")

    has_amx = all(amx_status.values())
    print(f"\n  Overall AMX Support: {'✅ YES' if has_amx else '❌ NO'}")
    print()

    # Check AVX512 support
    print("AVX512 Support (required for FP8 MoE):")
    avx512_flags = ["avx512f", "avx512_bf16", "avx512_vnni", "avx512_vbmi"]
    avx512_status = {}
    for flag in avx512_flags:
        has_flag = flag in cpuinfo
        avx512_status[flag] = has_flag
        status = "✅" if has_flag else "❌"
        flag_desc = {
            "avx512f": "AVX512F (foundation)",
            "avx512_bf16": "AVX512_BF16 (BF16 dot product)",
            "avx512_vnni": "AVX512_VNNI (VNNI instructions)",
            "avx512_vbmi": "AVX512_VBMI (byte permutation)",
        }
        print(f"  {status} {flag_desc.get(flag, flag.upper())}")

    has_avx512_full = all(avx512_status.values())
    print(f"\n  Overall AVX512 Support: {'✅ YES' if has_avx512_full else '❌ NO'}")

    if not has_avx512_full and avx512_status["avx512f"]:
        missing = [f for f in avx512_flags if not avx512_status[f]]
        print(f"  ⚠️  Warning: AVX512F detected but missing: {', '.join(missing)}")
        print(f"      kt-kernel will fall back to AVX2 mode")
    print()

    # Check AVX2 support
    print("AVX2 Support (fallback):")
    has_avx2 = "avx2" in cpuinfo
    status = "✅" if has_avx2 else "❌"
    print(f"  {status} AVX2")
    print()

    # Recommendation
    print("=" * 70)
    print("Recommendation:")
    print("=" * 70)
    if has_amx:
        print("✅ Your CPU supports AMX - you can use the highest performance mode!")
        print("   Build with: -DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON -DKTRANSFORMERS_CPU_USE_AMX=ON")
    elif has_avx512_full:
        print("✅ Your CPU supports full AVX512 (F/BF16/VNNI/VBMI) - FP8 MoE will work!")
        print("   Build with: -DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON")
    elif avx512_status.get("avx512f", False):
        print("⚠️  Your CPU has AVX512F but missing required extensions.")
        print("   FP8 MoE will NOT work. kt-kernel will fall back to AVX2 mode.")
        print("   Missing extensions:", ", ".join([f for f in avx512_flags if not avx512_status.get(f, False)]))
    elif has_avx2:
        print("ℹ️  Your CPU supports AVX2 only - basic compatibility mode.")
        print("   FP8 MoE will NOT be available, but other features will work.")
    else:
        print("❌ Your CPU does not support the minimum required instruction set (AVX2).")
        print("   kt-kernel may not work on this system.")
    print()


if __name__ == "__main__":
    main()


================================================
FILE: kt-kernel/scripts/compare_weights.py
================================================
#!/usr/bin/env python3
"""
Compare two sets of quantized weights generated by convert_cpu_weights.py

This script supports comparing:
- Two safetensor format weights (merged)
- Two .kt format weights (layer folder structure)
- One safetensor and one .kt format (cross-format comparison)

Usage:
    python compare_weights.py --path1 /path/to/weights1 --path2 /path/to/weights2
    python compare_weights.py --path1 /path/to/weights1 --path2 /path/to/weights2 --tolerance 1e-5
"""

import argparse
import os
import glob
import numpy as np
import torch
from safetensors import safe_open
from typing import Dict, Tuple
from collections import defaultdict


def unpack_awq_int32_to_int8(packed: np.ndarray, bits: int = 4) -> np.ndarray:
    """Unpack AWQ int32 packed format to int8

    AWQ uses INT4 quantization: 8 x 4-bit values packed into 1 x 32-bit integer

    Args:
        packed: Packed int32 array
        bits: Number of bits per element (default: 4)

    Returns:
        Unpacked int8 array
    """
    if packed.dtype != np.int32:
        # Try to reinterpret as int32
        packed = packed.view(np.int32)

    pack_num = 32 // bits  # 8 for INT4
    unpacked_size = packed.size * pack_num

    unpacked = np.empty(unpacked_size, dtype=np.int8)

    for i in range(pack_num):
        shift = i * bits
        mask = (1 << bits) - 1  # 0x0F for 4-bit
        unpacked[i::pack_num] = ((packed >> shift) & mask).astype(np.int8)

    return unpacked


def normalize_tensor_dtype(tensor: np.ndarray, tensor_name: str, is_awq: bool = False) -> np.ndarray:
    """Normalize tensor to consistent dtype based on tensor type

    Args:
        tensor: Input tensor
        tensor_name: Name of the tensor (used to determine type)
        is_awq: Whether this is AWQ format (requires unpacking)

    Returns:
        Normalized tensor with consistent dtype
    """
    # Determine tensor type from name
    is_scale = "scale" in tensor_name
    is_weight = "weight" in tensor_name
    is_qzeros = "qzeros" in tensor_name

    if is_scale:
        # Scale should be float32
        if tensor.dtype != np.float32:
            # Try to reinterpret bytes as float32
            tensor = tensor.view(np.float32)
        return tensor

    elif is_weight or is_qzeros:
        # Weight/qzeros should be int8
        if is_awq and tensor.dtype == np.int32:
            # AWQ format: unpack int32 to int8
            tensor = unpack_awq_int32_to_int8(tensor)
        elif tensor.dtype == np.float32:
            # Two cases for float32:
            # Case 1: Values look like int8 values (e.g., [37., 73., -70.])
            #         -> use astype to convert values
            # Case 2: Values are large scientific notation (e.g., [2.6e34, ...])
            #         -> use view to reinterpret bytes

            # Check if values are in int8 range (-128 to 127)
            if len(tensor) > 0:
                sample_size = min(100, len(tensor))
                sample_values = tensor.flat[:sample_size]

                # If most values are in int8 range and have no decimal parts
                in_int8_range = np.all((sample_values >= -128) & (sample_values <= 127))
                is_integer_valued = np.all(sample_values == np.round(sample_values))

                if in_int8_range and is_integer_valued:
                    # Case 1: Direct value conversion
                    tensor = tensor.astype(np.int8)
                else:
                    # Case 2: Byte reinterpretation (4 bytes -> 4 int8s)
                    tensor = tensor.view(np.int8)
            else:
                tensor = tensor.astype(np.int8)

        elif tensor.dtype == np.int32:
            # Reinterpret int32 as int8 (4x more elements)
            tensor = tensor.view(np.int8)
        elif tensor.dtype != np.int8:
            # Other types: try to convert
            tensor = tensor.astype(np.int8)

        return tensor

    else:
        # Unknown type, return as-is
        return tensor


def load_kt_binary(file_path: str) -> np.ndarray:
    """Load .kt format binary tensor file

    Args:
        file_path: Path to .kt binary file

    Returns:
        numpy array with the loaded tensor
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    with open(file_path, "rb") as f:
        binary_data = f.read()

    # Determine dtype based on file name
    if "scale" in file_path:
        dtype = np.float32
    else:
        dtype = np.int8

    return np.frombuffer(binary_data, dtype=dtype)


def detect_weight_format(path: str) -> str:
    """Detect if weights are in safetensor or .kt format

    Args:
        path: Path to weight directory

    Returns:
        'safetensor' or 'kt' or 'unknown'
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Path not found: {path}")

    # Check for safetensor files
    safetensor_files = glob.glob(os.path.join(path, "*.safetensors"))
    if safetensor_files:
        return "safetensor"

    # Check for layer folder structure
    layer_folders = glob.glob(os.path.join(path, "_layer_*"))
    if layer_folders:
        return "kt"

    return "unknown"


def detect_awq_format(weights_sample: Dict[str, np.ndarray]) -> bool:
    """Detect if weights are in AWQ format

    AWQ format characteristics:
    - Has 'qzeros' tensors
    - Weight tensors are int32 dtype (packed)

    Args:
        weights_sample: Sample of loaded weights

    Returns:
        True if AWQ format detected
    """
    has_qzeros = any("qzeros" in key for key in weights_sample.keys())

    if not has_qzeros:
        return False

    # Check if weight tensors are int32
    for key, tensor in weights_sample.items():
        if "weight" in key and tensor.dtype == np.int32:
            return True

    return False


def load_safetensor_weights(path: str) -> Dict[str, np.ndarray]:
    """Load all weights from safetensor format

    Args:
        path: Path to directory containing safetensor files

    Returns:
        Dictionary mapping tensor names to numpy arrays (dtype normalized)
    """
    weights = {}

    safetensor_files = sorted(glob.glob(os.path.join(path, "*.safetensors")))
    if not safetensor_files:
        raise FileNotFoundError(f"No safetensor files found in {path}")

    print(f"Loading safetensor files from {path}")

    # First pass: load all tensors
    for file in safetensor_files:
        with safe_open(file, framework="pt") as f:
            for key in f.keys():
                # Only load MoE expert weights for comparison
                if ".ffn_" in key and "_exps." in key:
                    tensor = f.get_tensor(key)
                    weights[key] = tensor.cpu().numpy()

    # Detect AWQ format
    is_awq = detect_awq_format(weights)
    print(f"  Format detected: {'AWQ' if is_awq else 'INT4/INT8'}")

    # Second pass: normalize dtypes
    print(f"  Normalizing dtypes...")
    for key in list(weights.keys()):
        original_dtype = weights[key].dtype
        original_shape = weights[key].shape
        weights[key] = normalize_tensor_dtype(weights[key], key, is_awq=is_awq)

        if weights[key].shape != original_shape or weights[key].dtype != original_dtype:
            print(f"    {key}: {original_dtype}{original_shape} -> {weights[key].dtype}{weights[key].shape}")

    print(f"  Loaded {len(weights)} tensors from safetensor format")
    return weights


def load_kt_weights(path: str) -> Dict[str, np.ndarray]:
    """Load all weights from .kt format (layer folder structure)

    Args:
        path: Path to directory containing _layer_* folders

    Returns:
        Dictionary mapping tensor names to numpy arrays
    """
    weights = {}

    layer_folders = sorted(glob.glob(os.path.join(path, "_layer_*")))
    if not layer_folders:
        raise FileNotFoundError(f"No _layer_* folders found in {path}")

    print(f"Loading .kt files from {path}")

    for layer_folder in layer_folders:
        # Extract layer index from folder name
        layer_idx = int(os.path.basename(layer_folder).split("_")[-1])

        # Find all NUMA folders
        numa_folders = sorted(glob.glob(os.path.join(layer_folder, "_numa_*")))

        for numa_folder in numa_folders:
            # Extract NUMA index
            numa_idx = int(os.path.basename(numa_folder).split("_")[-1])

            # Find all .kt files
            kt_files = glob.glob(os.path.join(numa_folder, "*.kt"))

            for kt_file in kt_files:
                filename = os.path.basename(kt_file)

                # Parse filename to extract metadata
                # Format: {METHOD}_{proj}_{expert}_{size}Byte_{type}_.kt
                parts = filename.replace(".kt", "").split("_")

                if len(parts) >= 5:
                    method = parts[0]  # INT4, INT8, etc.
                    proj = parts[1]  # down, gate, up
                    expert = parts[2]  # expert ID
                    tensor_type = parts[4]  # quant or scale

                    # Map proj names
                    proj_map = {"down": "ffn_down_exps", "gate": "ffn_gate_exps", "up": "ffn_up_exps"}

                    proj_key = proj_map.get(proj, proj)

                    # Build key matching safetensor format
                    if tensor_type == "quant":
                        key = f"blk.{layer_idx}.{proj_key}.{expert}.numa.{numa_idx}.weight"
                    else:  # scale
                        key = f"blk.{layer_idx}.{proj_key}.{expert}.numa.{numa_idx}.scale"

                    # Load tensor
                    weights[key] = load_kt_binary(kt_file)

    # Normalize dtypes (.kt format is never AWQ)
    print(f"  Normalizing dtypes...")
    for key in list(weights.keys()):
        original_dtype = weights[key].dtype
        original_shape = weights[key].shape
        weights[key] = normalize_tensor_dtype(weights[key], key, is_awq=False)

        if weights[key].shape != original_shape or weights[key].dtype != original_dtype:
            print(f"    {key}: {original_dtype}{original_shape} -> {weights[key].dtype}{weights[key].shape}")

    print(f"  Loaded {len(weights)} tensors from .kt format")
    return weights


def normalize_key(key: str) -> Tuple[int, str, int, str]:
    """Normalize tensor key to extract layer, projection, expert, and type

    Args:
        key: Tensor key like "blk.0.ffn_up_exps.5.weight" or "blk.0.ffn_up_exps.5.numa.0.weight"

    Returns:
        Tuple of (layer_idx, proj_name, expert_idx, tensor_type)
    """
    parts = key.split(".")

    layer_idx = int(parts[1])
    proj_name = parts[2]
    expert_idx = int(parts[3])

    # Handle both formats: with and without numa
    if "numa" in key:
        tensor_type = parts[6]  # weight or scale
    else:
        tensor_type = parts[4]  # weight, scale, or qzeros

    return (layer_idx, proj_name, expert_idx, tensor_type)


def compare_weights(
    weights1: Dict[str, np.ndarray], weights2: Dict[str, np.ndarray], tolerance: float = 1e-6
) -> Tuple[bool, Dict[str, Dict]]:
    """Compare two sets of weights

    Args:
        weights1: First set of weights
        weights2: Second set of weights
        tolerance: Numerical tolerance for comparison

    Returns:
        Tuple of (all_match, differences_dict)
    """
    print("\n" + "=" * 80)
    print("WEIGHT COMPARISON")
    print("=" * 80)

    # Group keys by normalized form (ignoring numa index)
    def group_by_base_key(weights):
        groups = defaultdict(list)
        for key in weights.keys():
            try:
                layer, proj, expert, ttype = normalize_key(key)
                base_key = f"blk.{layer}.{proj}.{expert}.{ttype}"
                groups[base_key].append(key)
            except:
                # Skip keys that don't match expected format
                pass
        return groups

    groups1 = group_by_base_key(weights1)
    groups2 = group_by_base_key(weights2)

    all_base_keys = sorted(set(groups1.keys()) | set(groups2.keys()))

    all_match = True
    differences = {}

    total_comparisons = 0
    matching_comparisons = 0

    for base_key in all_base_keys:
        keys1 = groups1.get(base_key, [])
        keys2 = groups2.get(base_key, [])

        if not keys1:
            print(f"❌ Missing in weights1: {base_key}")
            differences[base_key] = {"status": "missing_in_weights1"}
            all_match = False
            continue

        if not keys2:
            print(f"❌ Missing in weights2: {base_key}")
            differences[base_key] = {"status": "missing_in_weights2"}
            all_match = False
            continue

        # For kt format, we may have multiple keys (one per NUMA node)
        # We need to concatenate them for comparison
        if len(keys1) > 1 or len(keys2) > 1:
            # Concatenate tensors from all NUMA nodes
            tensor1 = np.concatenate([weights1[k] for k in sorted(keys1)])
            tensor2 = np.concatenate([weights2[k] for k in sorted(keys2)])
        else:
            tensor1 = weights1[keys1[0]]
            tensor2 = weights2[keys2[0]]

        total_comparisons += 1

        # Debug: print dtype and shape info
        if tensor1.dtype != tensor2.dtype:
            print(f"⚠️  Dtype mismatch for {base_key}: {tensor1.dtype} vs {tensor2.dtype}")
            print(f"   This should have been normalized. Shape: {tensor1.shape} vs {tensor2.shape}")

        # Compare shapes
        if tensor1.shape != tensor2.shape:
            print(f"❌ Shape mismatch for {base_key}:")
            print(f"   Shape1: {tensor1.shape} (dtype: {tensor1.dtype})")
            print(f"   Shape2: {tensor2.shape} (dtype: {tensor2.dtype})")
            differences[base_key] = {
                "status": "shape_mismatch",
                "shape1": tensor1.shape,
                "shape2": tensor2.shape,
                "dtype1": str(tensor1.dtype),
                "dtype2": str(tensor2.dtype),
            }
            all_match = False
            continue

        # Compare dtypes (should be consistent after normalization)
        if tensor1.dtype != tensor2.dtype:
            print(f"❌ Dtype mismatch for {base_key} after normalization:")
            print(f"   Dtype1: {tensor1.dtype}")
            print(f"   Dtype2: {tensor2.dtype}")
            differences[base_key] = {
                "status": "dtype_mismatch",
                "dtype1": str(tensor1.dtype),
                "dtype2": str(tensor2.dtype),
            }
            all_match = False
            continue

        # Compare values
        if np.allclose(tensor1, tensor2, atol=tolerance, rtol=tolerance):
            matching_comparisons += 1
        else:
            max_diff = np.max(np.abs(tensor1 - tensor2))
            mean_diff = np.mean(np.abs(tensor1 - tensor2))

            print(f"❌ Value mismatch for {base_key}:")
            print(f"   Max difference: {max_diff:.2e}")
            print(f"   Mean difference: {mean_diff:.2e}")
            print(f"   Tolerance: {tolerance:.2e}")

            differences[base_key] = {
                "status": "value_mismatch",
                "max_diff": float(max_diff),
                "mean_diff": float(mean_diff),
                "tolerance": tolerance,
            }
            all_match = False

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total comparisons: {total_comparisons}")
    print(f"Matching: {matching_comparisons}")
    print(f"Mismatching: {total_comparisons - matching_comparisons}")
    print(f"Missing tensors: {len(differences) - (total_comparisons - matching_comparisons)}")

    if all_match:
        print("\n✅ All weights match!")
    else:
        print(f"\n❌ Found {len(differences)} differences")

    return all_match, differences


def main():
    parser = argparse.ArgumentParser(description="Compare two sets of quantized weights")
    parser.add_argument("--path1", type=str, required=True, help="Path to first weight directory")
    parser.add_argument("--path2", type=str, required=True, help="Path to second weight directory")
    parser.add_argument(
        "--tolerance", type=float, default=1e-6, help="Numerical tolerance for comparison (default: 1e-6)"
    )

    args = parser.parse_args()

    # Validate paths
    if not os.path.exists(args.path1):
        print(f"Error: Path1 does not exist: {args.path1}")
        return 1

    if not os.path.exists(args.path2):
        print(f"Error: Path2 does not exist: {args.path2}")
        return 1

    # Detect formats
    print("Detecting weight formats...")
    format1 = detect_weight_format(args.path1)
    format2 = detect_weight_format(args.path2)

    print(f"Path1 format: {format1}")
    print(f"Path2 format: {format2}")

    if format1 == "unknown":
        print(f"Error: Unable to detect weight format in {args.path1}")
        return 1

    if format2 == "unknown":
        print(f"Error: Unable to detect weight format in {args.path2}")
        return 1

    # Load weights based on format
    print("\nLoading weights...")

    if format1 == "safetensor":
        weights1 = load_safetensor_weights(args.path1)
    else:
        weights1 = load_kt_weights(args.path1)

    if format2 == "safetensor":
        weights2 = load_safetensor_weights(args.path2)
    else:
        weights2 = load_kt_weights(args.path2)

    # Compare weights
    all_match, differences = compare_weights(weights1, weights2, args.tolerance)

    return 0 if all_match else 1


if __name__ == "__main__":
    exit(main())


================================================
FILE: kt-kernel/scripts/convert_cpu_weights.py
================================================
#!/usr/bin/env python3

import argparse
import os
from collections import defaultdict
from typing import Dict, List
import torch
from safetensors import safe_open
from safetensors.torch import save_file
import gc
import time
import json
import sys
import glob
import numpy as np

# Add parent directory to path to import kt_kernel
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from kt_kernel import KTMoEWrapper

import triton
import triton.language as tl


Q_BITS = 4
STORAGE_BITS = 32
PACK_NUM = STORAGE_BITS // Q_BITS
NUMA_NUM = 2

REVERSE_AWQ_PACK_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@triton.jit
def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr):
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    n = tl.cdiv(N, BLOCK_SIZE)
    offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs = offs_m[:, None] * N + offs_n[None, :]
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    x = tl.load(x_ptr + offs, mask=mask).to(tl.float32)
    s = tl.load(s_ptr + pid_m * n + pid_n)
    y = x * s
    tl.store(y_ptr + offs, y, mask=mask)


def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor:
    assert x.is_contiguous() and s.is_contiguous()
    assert x.dim() == 2 and s.dim() == 2
    M, N = x.size()
    y = torch.empty_like(x, dtype=torch.get_default_dtype())
    grid = lambda meta: (triton.cdiv(M, meta["BLOCK_SIZE"]), triton.cdiv(N, meta["BLOCK_SIZE"]))
    weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size)
    return y


def load_model_config(input_path: str, input_type: str = None) -> Dict:
    """Load model configuration from config.json

    Args:
        input_path: Path to directory containing config.json
        input_type: Input weight type (fp8/fp16/bf16/awq), used to validate FP8 config

    Returns:
        Dictionary with model configuration
    """
    config_path = os.path.join(input_path, "config.json")
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"config.json not found in {input_path}")

    with open(config_path, "r") as f:
        config = json.load(f)

    if "text_config" in config:
        text_cfg = config["text_config"]
    else:
        text_cfg = config

    # Extract required fields with fallbacks
    model_config = {
        "num_experts": text_cfg.get("n_routed_experts", text_cfg.get("num_experts")),
        "num_experts_per_tok": text_cfg.get("num_experts_per_tok", 2),
        "hidden_size": text_cfg.get("hidden_size"),
        "moe_intermediate_size": text_cfg.get("moe_intermediate_size", text_cfg.get("intermediate_size")),
    }

    # Validate required fields
    missing_fields = [k for k, v in model_config.items() if v is None]
    if missing_fields:
        raise ValueError(f"Missing required config fields: {missing_fields}")

    # For FP8 input, extract and validate quantization_config
    if input_type == "fp8":
        quant_config = config.get("quantization_config") or text_cfg.get("quantization_config")
        if quant_config is None:
            raise ValueError(
                "FP8 input type specified but 'quantization_config' not found in config.json. "
                "Expected quantization_config with weight_block_size field."
            )

        weight_block_size = quant_config.get("weight_block_size")
        if weight_block_size is None:
            raise ValueError(
                "FP8 quantization_config found but 'weight_block_size' field is missing. "
                "Expected format: 'weight_block_size': [128, 128]"
            )

        if not isinstance(weight_block_size, list) or len(weight_block_size) != 2:
            raise ValueError(
                f"Invalid weight_block_size format: {weight_block_size}. "
                "Expected a list of two integers, e.g., [128, 128]"
            )

        model_config["fp8_weight_block_size"] = weight_block_size
        print(f"FP8 quantization config detected:")
        print(f"  format: {quant_config.get('fmt', 'unknown')}")
        print(f"  weight_block_size: {weight_block_size}")
    return model_config


def pack(imatrix: torch.Tensor):
    """
    Packs a 4-bit integer matrix into a packed 32-bit integer matrix.
    Args:
        imatrix (torch.Tensor): matrix of integers
        direction (str): direction of packing, either "column" or "row"

    Returns:
        qmatrix (torch.Tensor): packed matrix of integers
    """
    shifts = torch.arange(0, STORAGE_BITS, Q_BITS, device=imatrix.device)

    imatrix = torch.bitwise_and(imatrix, 0x0F).to(torch.int32)  # eventually correct overflow

    imatrix = imatrix.view(imatrix.shape[0], imatrix.shape[1], -1, PACK_NUM)
    qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, None, None, :]).sum(dim=-1)

    qmatrix = qmatrix.to(torch.int32)

    return qmatrix


def unpack(qmatrix: torch.Tensor):
    """
    Unpacks a 32-bit packed integer matrix into a 4-bit integer matrix.

    Args:
        qmatrix (torch.Tensor): matrix of packed integers
        direction (str): direction of unpacking, either "column" or "row"

    Returns:
        imatrix (torch.Tensor): matrix of integers
    """
    shifts = torch.arange(0, STORAGE_BITS, Q_BITS, device=qmatrix.device)

    imatrix = torch.bitwise_right_shift(qmatrix[:, :, :, None], shifts[None, None, None, :]).view(
        qmatrix.shape[0], qmatrix.shape[1], -1
    )

    imatrix = imatrix.to(torch.int8) & 0x0F  # eventually correct overflow

    return imatrix


def reverse_awq_interleaving(imatrix: torch.Tensor):
    """Reverse AWQ interleaving to get original order"""
    # Reshape to handle interleaving at pack level
    original_shape = imatrix.shape
    imatrix_reshaped = imatrix.view(original_shape[0], original_shape[1], -1, PACK_NUM)

    # Apply reverse AWQ pack order
    imatrix_reordered = imatrix_reshaped[:, :, :, REVERSE_AWQ_PACK_ORDER]

    return imatrix_reordered.view(original_shape)


def unpack_reverse_awq_interleaving(qweight: torch.Tensor, qzeros: torch.Tensor = None):
    """
    Row-major unpack AWQ I32 -> INT4 and reverse interleaving to get original order

    Args:
        qweight: Packed AWQ weights with interleaving (I32)
        qzeros: Packed AWQ zeros with interleaving (I32, optional)

    Returns:
        Tuple of (unpacked_weights, unpacked_zeros) in row major order (original)
    """
    # Step 1: Row-major unpack I32 to INT4
    iweights = unpack(qweight)  # Use row direction for row-major

    if qzeros is not None:
        izeros = unpack(qzeros)  # Use row direction for row-major
    else:
        izeros = None

    # Step 2: Reverse AWQ interleaving to get original row-major order
    iweights_original = reverse_awq_interleaving(iweights)

    if izeros is not None:
        izeros_original = reverse_awq_interleaving(izeros)
    else:
        izeros_original = None

    return iweights_original, izeros_original


def pack_column_major_1d(iweights: torch.Tensor, izeros: torch.Tensor = None):
    """
    Pack INT4 -> I32 then flatten to 1D with different logic for weights vs zeros

    Args:
        iweights: Unpacked weights in row major order (INT4)
        izeros: Unpacked zeros in row major order (INT4, optional)

    Returns:
        Tuple of (packed_weights, packed_zeros) as 1D tensors
    """
    # qweight: transpose to column-major then pack
    iweights_transposed = iweights.transpose(1, 2).contiguous()
    qweight = pack(iweights_transposed)
    # qweight = qweight_2d.flatten()  # Flatten to 1D

    # qzeros: NO transpose, keep original shape, pack with original interleaving (01234567)
    if izeros is not None:
        qzeros = pack(izeros)  # Keep original shape, original interleaving
        # qzeros = qzeros_2d.flatten()  # Flatten to 1D
    else:
        qzeros = None

    return qweight, qzeros


class ConverterBase:
    """Base class for converting model weights.

    Subclasses must implement `_convert_layer_experts` to handle the expert
    tensor transformation for a given quantization method (e.g., awq, int4, int8).
    """

    def __init__(
        self,
        input_path: str,
        output_path: str,
        model_config: Dict,
        cpuinfer_threads: int = 60,
        threadpool_count: int = 2,
        input_type: str = None,
        merge_to_safetensor: bool = True,
    ):
        self.input_path = input_path
        self.output_path = output_path
        self.model_config = model_config
        self.cpuinfer_threads = cpuinfer_threads
        self.threadpool_count = threadpool_count
        self.input_type = input_type
        self.merge_to_safetensor = merge_to_safetensor
        self.tensor_file_map: Dict[str, str] = {}  # key -> filename
        self.tensor_key_map: Dict[str, str] = {}  # old key -> new key
        self.file_handle_map: Dict[str, any] = {}  # filename -> file

        # Extract commonly used config values for convenience
        self.num_experts = model_config["num_experts"]
        self.num_experts_per_tok = model_config["num_experts_per_tok"]
        self.hidden_size = model_config["hidden_size"]
        self.moe_intermediate_size = model_config["moe_intermediate_size"]
        self.layout = "base"

        # Load input safetensors files
        self._load_input_files()

    def _load_input_files(self):
        """Load all safetensors files from input directory"""
        print(f"Loading safetensors files from {self.input_path}")

        found_safetensor = False
        for root, _, files in os.walk(self.input_path):
            files = sorted(files)
            for file in files:
                if file.endswith(".safetensors"):
                    found_safetensor = True
                    file_path = os.path.join(root, file)
                    try:
                        handle = safe_open(file_path, framework="pt")
                        self.file_handle_map[file] = handle
                        renamed = False
                        for key in handle.keys():
                            if "language_model" in key:
                                key_ = key.replace("language_model.", "")
                                # print("  Renaming key:", key, "->", key_)
                                renamed = True
                            else:
                                key_ = key
                            self.tensor_key_map[key_] = key
                            self.tensor_file_map[key_] = file
                        print(
                            f"  Loaded: {file} ({len(list(handle.keys()))} tensors){' (renamed keys)' if renamed else ''}"
                        )
                    except Exception as e:
                        print(f"  Error loading {file}: {e}")

        if not found_safetensor:
            raise FileNotFoundError(f"No safetensors files found in {self.input_path}")

        print(f"Total tensors loaded: {len(self.tensor_file_map)}")

    def _load_tensor(self, key: str) -> torch.Tensor:
        """Load tensor by key"""
        if key not in self.tensor_file_map:
            raise KeyError(f"Key {key} not found")

        file = self.tensor_file_map[key]
        handle = self.file_handle_map[file]
        return handle.get_tensor(self.tensor_key_map.get(key, key))

    # layers_id -> list[experts_id]
    def _find_expert_layers(self) -> Dict[int, List[int]]:
        """Find all layers and experts in the model"""
        layers = defaultdict(set)

        # detect layout
        for key in self.tensor_file_map.keys():
            if "mlp.experts" in key and "gate_up" in key:
                self.layout = "fused"
                break

        if self.layout == "fused":  # Pattern: model.layers.{layer}.mlp.experts.{proj}
            layers = set()
            for key in self.tensor_file_map.keys():
                if "model.layers." in key and ".mlp.experts." in key:
                    parts = key.split(".")
                    if len(parts) >= 6:
                        layer_idx = int(parts[2])
                        layers.add(layer_idx)

            result: Dict[int, List[int]] = {}
            for layer_idx in sorted(layers):
                result[layer_idx] = [-1]

            print(f"Found {len(result)} layers with fused MoE experts")
            return result

        # Pattern: model.layers.{layer}.mlp.experts.{expert}.{proj}.{type}
        for key in self.tensor_file_map.keys():
            if "model.layers." in key and ".mlp.experts." in key:
                parts = key.split(".")
                if len(parts) >= 6:
                    layer_idx = int(parts[2])
                    expert_idx = int(parts[5])
                    layers[layer_idx].add(expert_idx)

        # Convert to sorted lists
        result: Dict[int, List[int]] = {}
        for layer_idx, expert_set in layers.items():
            result[layer_idx] = sorted(list(expert_set))

        print(f"Found {len(result)} layers with MoE experts:")
        for layer_idx, experts in sorted(result.items()):
            print(f"  Layer {layer_idx}: {len(experts)} experts (0-{max(experts)})")

        return result

    def _convert_layer_experts(self, layer_idx: int, expert_ids: List[int]) -> Dict[str, torch.Tensor]:
        """Subclasses must implement expert conversion for a given layer.

        Expected to return a mapping from output tensor keys to tensors.
        """
        raise NotImplementedError("Subclasses must implement _convert_layer_experts")

    def convert(self, resume_layer: int = 0):
        """Convert all expert layers using subclass-specific logic.

        Args:
            resume_layer (int, optional): The layer index to resume conversion from.
                Layers with an index lower than this will be skipped. Defaults to 0.
        """
        print("Starting conversion...")
        print(f"Input: {self.input_path}")
        print(f"Output: {self.output_path}")
        if resume_layer > 0:
            print(f"Resuming from layer: {resume_layer}")

        # Create output directory
        os.makedirs(self.output_path, exist_ok=True)

        # Find all expert layers
        expert_layers = self._find_expert_layers()

        if not expert_layers:
            print("No MoE expert layers found in input!")
            return

        # Convert each layer with memory management
        all_tensors: Dict[str, torch.Tensor] = {}

        # Enable memory optimization
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Process layers with memory cleanup
        for i, (layer_idx, expert_ids) in enumerate(sorted(expert_layers.items())):
            if layer_idx < resume_layer:
                continue
            print(f"Processing layer {layer_idx} ({i+1}/{len(expert_layers)})...")

            layer_tensors = self._convert_layer_experts(layer_idx, expert_ids)
            all_tensors.update(layer_tensors)

            # Periodic garbage collection to free memory
            if (i + 1) % 5 == 0:  # Every 5 layers
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                print(f"  Memory cleanup after layer {layer_idx}")

        if self.merge_to_safetensor:
            # Copy non-expert tensors (embeddings, norms, etc.)
            print("Copying non-expert tensors...")
            for key in self.tensor_file_map.keys():
                if not (".mlp.experts." in key):
                    # Convert key format for consistency
                    if key.startswith("model."):
                        # Convert model.layers.X -> blk.X for non-expert layers
                        new_key = key.replace("model.layers.", "blk.").replace("model.", "")
                        all_tensors[new_key] = self._load_tensor(key)
                    else:
                        all_tensors[key] = self._load_tensor(key)

            # Save all tensors
            print(f"Saving {len(all_tensors)} tensors...")

            # Split into multiple files if too large
            max_tensors_per_file = 3000  # Adjust based on memory constraints
            tensor_items = list(all_tensors.items())

            if len(tensor_items) <= max_tensors_per_file:
                # Single file
                output_file = os.path.join(self.output_path, "model.safetensors")
                save_file(dict(tensor_items), output_file)
                print(f"Saved to: {output_file}")
            else:
                # Multiple files
                for i in range(0, len(tensor_items), max_tensors_per_file):
                    batch = dict(tensor_items[i : i + max_tensors_per_file])
                    output_file = os.path.join(self.output_path, f"model-{i//max_tensors_per_file + 1:05d}.safetensors")
                    save_file(batch, output_file)
                    print(f"Saved batch to: {output_file}")

            # Copy config files
            self._copy_config_files()

            print("Conversion completed successfully!")
        else:
            print("Skipping safetensor merge, weights kept in layer folder structure")
            print("Conversion completed successfully!")

    def _copy_config_files(self):
        """Copy configuration files to output directory"""
        config_files = ["config.json", "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]

        for config_file in config_files:
            src_path = os.path.join(self.input_path, config_file)
            if os.path.exists(src_path):
                import shutil

                dst_path = os.path.join(self.output_path, config_file)
                shutil.copy2(src_path, dst_path)
                print(f"Copied: {config_file}")

    def close(self):
        """Close all file handles"""
        self.file_handle_map.clear()


class AWQToColumnMajorConverter(ConverterBase):
    """Convert raw AWQ safetensors to NUMA-sliced column-major format."""

    # NOTE: Only this method differs across quantization methods.
    def _convert_layer_experts(self, layer_idx: int, expert_ids: List[int]) -> Dict[str, torch.Tensor]:
        """Convert all experts in a layer to column major format with optimized AWQ processing"""
        output_tensors = {}

        start_time = time.time()
        print(f"Converting layer {layer_idx} with {len(expert_ids)} experts...")

        # Pre-compute projection name mappings
        proj_mappings = {"up_proj": "ffn_up_exps", "gate_proj": "ffn_gate_exps", "down_proj": "ffn_down_exps"}

        # Batch process all experts to reduce nested loops
        for proj_name, out_proj in proj_mappings.items():
            # Load all expert tensors for this projection at once
            expert_qweights = []
            expert_qzeros = []
            expert_scales = []
            valid_experts = []

            for expert_id in expert_ids:
                qweight_key = f"model.layers.{layer_idx}.mlp.experts.{expert_id}.{proj_name}.qweight"
                qzeros_key = f"model.layers.{layer_idx}.mlp.experts.{expert_id}.{proj_name}.qzeros"
                scales_key = f"model.layers.{layer_idx}.mlp.experts.{expert_id}.{proj_name}.scales"

                if qweight_key in self.tensor_file_map:
                    qweight = self._load_tensor(qweight_key)
                    qzeros = self._load_tensor(qzeros_key) if qzeros_key in self.tensor_file_map else None
                    scales = self._load_tensor(scales_key) if scales_key in self.tensor_file_map else None

                    expert_qweights.append(qweight)
                    expert_qzeros.append(qzeros)
                    expert_scales.append(scales)
                    valid_experts.append(expert_id)

            if not valid_experts:
                continue

            print(f"  Processing {proj_name}: {len(valid_experts)} experts")

            qweights_stack = torch.stack([w for w in expert_qweights if w is not None], dim=0)
            qzeros_stack = torch.stack([z for z in expert_qzeros if z is not None], dim=0)

            batch_size = 128

            for batch_start in range(0, len(valid_experts), batch_size):
                batch_end = min(batch_start + batch_size, len(valid_experts))
                qweights_batch = qweights_stack[batch_start:batch_end].to("cuda")
                qzeros_batch = qzeros_stack[batch_start:batch_end].to("cuda")
                iweights_batch, izeros_batch = unpack_reverse_awq_interleaving(qweights_batch, qzeros_batch)
                qweights_1d_batch, qzeros_1d_batch = pack_column_major_1d(iweights_batch, izeros_batch)

                for idx in range(batch_start, batch_end):
                    expert_id = valid_experts[idx]
                    batch_idx = idx - batch_start
                    output_tensors[f"blk.{layer_idx}.{out_proj}.{expert_id}.scale"] = expert_scales[idx].flatten()
                    output_tensors[f"blk.{layer_idx}.{out_proj}.{expert_id}.weight"] = qweights_1d_batch[
                        batch_idx
                    ].cpu()
                    if qzeros_1d_batch is not None:
                        output_tensors[f"blk.{layer_idx}.{out_proj}.{expert_id}.qzeros"] = qzeros_1d_batch[
                            batch_idx
                        ].cpu()

            gc.collect()

        elapsed = time.time() - start_time
        print(f"  Generated {len(output_tensors)} column-major 1D tensors in {elapsed:.2f}s")
        return output_tensors


class OnlineQuantConverter(ConverterBase):
    """Convert FP8/FP16/BF16 weights to quantized format using AMXMoEWrapper.

    Performs online quantization (FP8/FP16/BF16 -> INT4/INT8) using AMXMoEWrapper
    with NUMA-aware memory management and automatic weight saving.
    """

    def __init__(
        self,
        input_path: str,
        output_path: str,
        model_config: Dict,
        cpuinfer_threads: int = 60,
        threadpool_count: int = 2,
        input_type: str = None,
        quant_method: str = "int4",
        merge_to_safetensor: bool = True,
    ):
        super().__init__(
            input_path, output_path, model_config, cpuinfer_threads, threadpool_count, input_type, merge_to_safetensor
        )
        self.quant_method = quant_method

        # For FP8, get block size from model_config
        if input_type == "fp8":
            self.fp8_block_size = model_config.get("fp8_weight_block_size", [128, 128])
        else:
            self.fp8_block_size = None

    def _dequantize_fp8_blockwise(self, fp8_weight: torch.Tensor, scale_inv: torch.Tensor) -> torch.Tensor:
        """Dequantize FP8 weight with block-wise scaling.

        Args:
            fp8_weight: FP8 weight tensor of shape [H, W]
            scale_inv: Scale inverse tensor of shape [H//block_size, W//block_size]

        Returns:
            Dequantized BF16 weight tensor of shape [H, W]
        """
        H, W = fp8_weight.shape
        num_blocks_h, num_blocks_w = scale_inv.shape

        # Infer block size from shapes
        block_h = H // num_blocks_h
        block_w = W // num_blocks_w

        # Reshape fp8_weight to [num_blocks_h, block_h, num_blocks_w, block_w]
        fp8_reshaped = fp8_weight.view(num_blocks_h, block_h, num_blocks_w, block_w)

        # Reshape scale_inv to [num_blocks_h, 1, num_blocks_w, 1] for broadcasting
        scale_inv_reshaped = scale_inv.view(num_blocks_h, 1, num_blocks_w, 1)

        # Dequantize: convert to bf16 and multiply by scale_inv
        dequantized = fp8_reshaped.to(torch.bfloat16) * scale_inv_reshaped

        # Reshape back to [H, W]
        dequantized = dequantized.view(H, W).contiguous()

        return dequantized

    def _load_binary_tensor(self, file_path: str) -> torch.Tensor:
        """Load .kt format binary tensor file

        Args:
            file_path: Path to .kt binary file

        Returns:
            torch.Tensor: Loaded tensor
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        with open(file_path, "rb") as f:
            binary_data = f.read()

        # Determine dtype based on file name
        if "scale" in file_path:
            # Scale tensors are typically float32
            np_array = np.frombuffer(binary_data, dtype=np.float32)
        else:
            # Quant tensors are typically int8
            np_array = np.frombuffer(binary_data, dtype=np.int8)

        tensor = torch.from_numpy(np_array.copy())
        return tensor

    def _load_layer_tensors_from_disk(self, layer_idx: int) -> Dict[str, torch.Tensor]:
        """Load all quantized tensors from _layer_{layer_idx} folder

        Args:
            layer_idx: Layer index

        Returns:
            Dict[str, torch.Tensor]: Dictionary with keys in format:
                'blk.{layer}.ffn_{proj}_exps.{expert}.numa.{numa_idx}.{weight|scale}'
        """
        layer_path = os.path.join(self.output_path, f"_layer_{layer_idx}")
        if not os.path.exists(layer_path):
            raise FileNotFoundError(f"Layer folder not found: {layer_path}")

        tensors = {}

        # Get AMX method from quant_method parameter (INT4/INT8)
        # Map quant_method to AMX_METHOD format
        quant_to_amx_map = {
            "int4": "INT4",
            "int8": "INT8",
            "moe_int4": "MOE_INT4",
            "moe_int8": "MOE_INT8",
        }
        amx_method = quant_to_amx_map.get(self.quant_method, "INT4")

        # Iterate through all NUMA folders
        for numa_idx in range(self.threadpool_count):
            numa_folder = os.path.join(layer_path, f"_numa_{numa_idx}")
            if not os.path.exists(numa_folder):
                print(f"  Warning: NUMA folder not found: {numa_folder}, skipping...")
                continue

            # Iterate through all experts
            for expert_id in range(self.num_experts):
                # For each projection (down, gate, up)
                proj_mappings = [("down", "ffn_down_exps"), ("gate", "ffn_gate_exps"), ("up", "ffn_up_exps")]

                for proj_name, proj_key in proj_mappings:
                    # Build file patterns
                    quant_pattern = os.path.join(numa_folder, f"{amx_method}_{proj_name}_{expert_id}_*Byte_quant_.kt")
                    scale_pattern = os.path.join(numa_folder, f"{amx_method}_{proj_name}_{expert_id}_*Byte_scale_.kt")

                    # Find files using glob
                    quant_files = glob.glob(quant_pattern)
                    scale_files = glob.glob(scale_pattern)

                    # Build keys (following merge_small_tensor.py format)
                    weight_key = f"blk.{layer_idx}.{proj_key}.{expert_id}.numa.{numa_idx}.weight"
                    scale_key = f"blk.{layer_idx}.{proj_key}.{expert_id}.numa.{numa_idx}.scale"

                    # Load quant tensor
                    if quant_files:
                        if len(quant_files) > 1:
                            raise ValueError(f"Multiple quant files found: {quant_files}")
                        tensors[weight_key] = self._load_binary_tensor(quant_files[0])

                    # Load scale tensor
                    if scale_files:
                        if len(scale_files) > 1:
                            raise ValueError(f"Multiple scale files found: {scale_files}")
                        tensors[scale_key] = self._load_binary_tensor(scale_files[0])

        return tensors

    def _remove_layer_folder(self, layer_idx: int):
        """Remove _layer_{layer_idx} folder and all its contents

        Args:
            layer_idx: Layer index
        """
        import shutil

        layer_path = os.path.join(self.output_path, f"_layer_{layer_idx}")
        if os.path.exists(layer_path):
            shutil.rmtree(layer_path)
            print(f"  Removed temporary folder: {layer_path}")

    def _convert_layer_experts(self, layer_idx: int, expert_ids: List[int]) -> Dict[str, torch.Tensor]:
        """Convert all experts in a layer using online quantization via AMXMoEWrapper"""
        start_time = time.time()
        print(
            f"Converting layer {layer_idx} with {len(expert_ids) if self.layout == 'base' else 'fused'} experts via online quantization..."
        )
        # Load all expert weights for this layer
        if self.layout == "fused":
            if self.input_type not in ["bf16", "fp16"]:
                raise ValueError(f"Fused path currently supports bf16/fp16 only, got input_type={self.input_type}")

            proj_set = set()
            prefix = f"model.layers.{layer_idx}.mlp.experts."
            for key in self.tensor_file_map.keys():
                if key.startswith(prefix):
                    parts = key.split(".")
                    if len(parts) >= 6:
                        proj_set.add(parts[5])

            if not proj_set:
                raise ValueError(f"[Fused] No fused MoE experts found for layer {layer_idx} under 'model.layers'")

            projs = sorted(proj_set)
            print(f"  [Fused] layer {layer_idx} fused proj keys: {projs}")
            if len(projs) < 2:
                raise ValueError(
                    f"[Fused] Expect at least 2 fused tensors (down & gate_up) in layer {layer_idx}, got {len(projs)}"
                )

            fused_tensors = []
            for p in projs:
                key = f"model.layers.{layer_idx}.mlp.experts.{p}"
                if key not in self.tensor_file_map:
                    raise KeyError(f"[Fused] Missing fused tensor {key} for layer {layer_idx}")
                w = self._load_tensor(key)
                if self.input_type == "fp16":
                    w = w.to(torch.bfloat16)
                print(f"    [Fused] tensor {p} shape: {tuple(w.shape)}")
                fused_tensors.append(w)

            #   fused_tensors[0] : down-like, [E, I, H]
            #   fused_tensors[1] : gate_up-like, [E, H, 2I]
            down_fused = fused_tensors[0]
            gate_up_fused = fused_tensors[1]

            #    gate_up_fused: [E, H, 2I] -> [E, 2I, H] -> gate / up
            if gate_up_fused.dim() != 3:
                raise ValueError(
                    f"[Fused] Expect gate_up fused tensor to be 3D, got shape {tuple(gate_up_fused.shape)}"
                )
            E, H, twoI = gate_up_fused.shape
            if twoI % 2 != 0:
                raise ValueError(f"[Fused] gate_up last dim (2I) not even: {twoI}")
            I = twoI // 2

            gate_up_T = gate_up_fused.transpose(1, 2).contiguous()  # [E, 2I, H]
            gate_proj = gate_up_T[:, :I, :]  # [E, I, H]
            up_proj = gate_up_T[:, I:, :]  # [E, I, H]

            if down_fused.dim() != 3:
                raise ValueError(f"[Fused] Expect down fused tensor to be 3D, got shape {tuple(down_fused.shape)}")
            if down_fused.shape[0] != E:
                raise ValueError(f"[Fused] down_fused expert dim mismatch: {down_fused.shape[0]} vs gate_up {E}")
            down_proj = down_fused.transpose(1, 2).contiguous()  # [E, H, I]
            del fused_tensors
            del gate_up_fused
            del down_fused
        else:
            gate_weights = []
            up_weights = []
            down_weights = []

            for expert_id in expert_ids:
                gate_key = f"model.layers.{layer_idx}.mlp.experts.{expert_id}.gate_proj.weight"
                up_key = f"model.layers.{layer_idx}.mlp.experts.{expert_id}.up_proj.weight"
                down_key = f"model.layers.{layer_idx}.mlp.experts.{expert_id}.down_proj.weight"

                if gate_key not in self.tensor_file_map:
                    raise KeyError(f"Missing gate weight for layer {layer_idx}, expert {expert_id}")
                if up_key not in self.tensor_file_map:
                    raise KeyError(f"Missing up weight for layer {layer_idx}, expert {expert_id}")
                if down_key not in self.tensor_file_map:
                    raise KeyError(f"Missing down weight for layer {layer_idx}, expert {expert_id}")

                # Load weights based on input type
                if self.input_type == "fp8":
                    # Load FP8 weights and their scale_inv tensors
                    gate_scale_key = f"model.layers.{layer_idx}.mlp.experts.{expert_id}.gate_proj.weight_scale_inv"
                    up_scale_key = f"model.layers.{layer_idx}.mlp.experts.{expert_id}.up_proj.weight_scale_inv"
                    down_scale_key = f"model.layers.{layer_idx}.mlp.experts.{expert_id}.down_proj.weight_scale_inv"

                    if gate_scale_key not in self.tensor_file_map:
                        raise KeyError(f"Missing gate weight_scale_inv for layer {layer_idx}, expert {expert_id}")
                    if up_scale_key not in self.tensor_file_map:
                        raise KeyError(f"Missing up weight_scale_inv for layer {layer_idx}, expert {expert_id}")
                    if down_scale_key not in self.tensor_file_map:
                        raise KeyError(f"Missing down weight_scale_inv for layer {layer_idx}, expert {expert_id}")

                    # Load FP8 weights and scales
                    gate_fp8 = self._load_tensor(gate_key).to("cuda")
                    up_fp8 = self._load_tensor(up_key).to("cuda")
                    down_fp8 = self._load_tensor(down_key).to("cuda")

                    gate_scale_inv = self._load_tensor(gate_scale_key).to("cuda")
                    up_scale_inv = self._load_tensor(up_scale_key).to("cuda")
                    down_scale_inv = self._load_tensor(down_scale_key).to("cuda")

                    # Dequantize FP8 to BF16 using block-wise scaling
                    gate_weight = weight_dequant(gate_fp8, gate_scale_inv).to("cpu").to(torch.bfloat16).contiguous()
                    up_weight = weight_dequant(up_fp8, up_scale_inv).to("cpu").to(torch.bfloat16).contiguous()
                    down_weight = weight_dequant(down_fp8, down_scale_inv).to("cpu").to(torch.bfloat16).contiguous()

                elif self.input_type == "fp16":
                    # Load FP16 and convert to BF16
                    gate_weight = self._load_tensor(gate_key).to(torch.bfloat16)
                    up_weight = self._load_tensor(up_key).to(torch.bfloat16)
                    down_weight = self._load_tensor(down_key).to(torch.bfloat16)

                elif self.input_type == "bf16":
                    # Load BF16 directly
                    gate_weight = self._load_tensor(gate_key)
                    up_weight = self._load_tensor(up_key)
                    down_weight = self._load_tensor(down_key)

                else:
                    raise ValueError(f"Unsupported input_type for INT4 conversion: {self.input_type}")

                gate_weights.append(gate_weight)
                up_weights.append(up_weight)
                down_weights.append(down_weight)

            # Stack weights into single tensors: [num_experts, ...]
            gate_proj = torch.stack(gate_weights, dim=0).contiguous()
            up_proj = torch.stack(up_weights, dim=0).contiguous()
            down_proj = torch.stack(down_weights, dim=0).contiguous()
            del gate_weights, up_weights, down_weights

        print(f"  Loaded weights shapes:")
        print(f"    gate_proj: {gate_proj.shape}")
        print(f"    up_proj: {up_proj.shape}")
        print(f"    down_proj: {down_proj.shape}")

        # Create physical_to_logical_map: identity mapping where position i maps to expert i
        physical_to_logical_map = torch.arange(self.num_experts, dtype=torch.int64)

        # Map quant_method to AMX method format
        quant_to_amx_map = {
            "int4": "AMXINT4",
            "int8": "AMXINT8",
            "moe_int4": "MOE_INT4",
            "moe_int8": "MOE_INT8",
        }
        amx_method = quant_to_amx_map.get(self.quant_method, "AMXINT4")

        # Create KTMoEWrapper instance for this layer
        # gpu_experts_mask: all False means all experts are on CPU for conversion
        gpu_experts_mask = torch.zeros(self.num_experts, dtype=torch.bool)
        wrapper = KTMoEWrapper(
            layer_idx=layer_idx,
            num_experts=self.num_experts,
            num_experts_per_tok=self.num_experts_per_tok,
            hidden_size=self.hidden_size,
            moe_intermediate_size=self.moe_intermediate_size,
            gpu_experts_mask=gpu_experts_mask,  # All experts on CPU for conversion
            cpuinfer_threads=self.cpuinfer_threads,
            threadpool_count=self.threadpool_count,
            weight_path=self.output_path,  # Output path for quantized weights
            chunked_prefill_size=512,  # Arbitrary value, not critical for conversion
            cpu_save=True,  # Enable saving quantized weights to output
            method=amx_method,  # Specify quantization method (AMXINT4 or AMXINT8)
        )

        # Load and quantize weights from tensors
        # This triggers the quantization process and saves to disk
        wrapper.load_weights_from_tensors(gate_proj, up_proj, down_proj, physical_to_logical_map)

        # Clean up to free memory
        del gate_proj, up_proj, down_proj
        gc.collect()

        elapsed = time.time() - start_time

        if self.merge_to_safetensor:
            # Load quantized tensors from disk
            print(f"  Loading quantized tensors from disk...")
            layer_tensors = self._load_layer_tensors_from_disk(layer_idx)
            print(f"  Loaded {len(layer_tensors)} tensors")

            # Remove temporary layer folder
            self._remove_layer_folder(layer_idx)

            print(f"  Layer {layer_idx} quantized and saved in {elapsed:.2f}s")

            # Return loaded tensors
            return layer_tensors
        else:
            # Keep layer folders, return empty dict
            print(f"  Layer {layer_idx} quantized and saved in {elapsed:.2f}s")
            print(f"  Keeping layer folder structure at {self.output_path}/_layer_{layer_idx}")
            return {}


"""
Example usage(test passed):
python convert_cpu_weights.py --input-path /mnt/data3/models/DeepSeek-R1-0528/ --input-type fp8 --output /mnt/data3/models/DeepSeek-R1-0528-INT4-test --quant-method int4 --cpuinfer-threads 60 --threadpool-count 2
python convert_cpu_weights.py --input-path /mnt/data3/models/DeepSeek-R1-0528/ --input-type fp8 --output /mnt/data3/models/DeepSeek-R1-0528-INT8-test --quant-method int8 --cpuinfer-threads 60 --threadpool-count 2
python convert_cpu_weights.py --input-path /mnt/data2/models/Qwen3-Next-80B-A3B-Instruct --input-type bf16 --output /mnt/data2/models/Qwen3-Next-80B-A3B-Instruct-INT4-test --quant-method int4 --cpuinfer-threads 60 --threadpool-count 2
"""


def main():
    parser = argparse.ArgumentParser(description="Convert SafeTensors to column major 1D format")
    parser.add_argument("--input-path", "-i", required=True, help="Input directory with safetensors")
    parser.add_argument(
        "--input-type",
        choices=["awq", "fp8", "fp16", "bf16"],
        required=True,
        help="Input weight type (awq/fp8/fp16/bf16)",
    )
    parser.add_argument("--output", "-o", required=True, help="Output directory for converted safetensors")
    parser.add_argument(
        "--quant-method",
        choices=["int4", "int8", "awq", "moe_int4", "moe_int8"],
        default="int4",
        help="Quantization method for output (default: int4)",
    )
    parser.add_argument(
        "--cpuinfer-threads",
        type=int,
        default=60,
        help="Number of CPU inference threads (default: 60)",
    )
    parser.add_argument(
        "--threadpool-count",
        type=int,
        default=2,
        help="Number of NUMA subpools for thread distribution (default: 2)",
    )
    parser.add_argument("--gpu", action="store_true", help="Use GPU for conversion if available")
    parser.add_argument(
        "--no-merge-safetensor",
        action="store_true",
        default=False,
        help="Keep layer folders without merging to safetensor files (default: False)",
    )
    parser.add_argument(
        "--resume-layer",
        type=int,
        default=0,
        help="Resume conversion starting at this layer index (default: 0)",
    )

    args = parser.parse_args()

    # Validate inputs
    if not os.path.exists(args.input_path):
        print(f"Error: Input path does not exist: {args.input_path}")
        return 1
    try:
        # Load model configuration from config.json
        print("Loading model configuration...")
        model_config = load_model_config(args.input_path, args.input_type)
        print(f"Model config: {model_config}")
        print(f"  num_experts: {model_config['num_experts']}")
        print(f"  num_experts_per_tok: {model_config['num_experts_per_tok']}")
        print(f"  hidden_size: {model_config['hidden_size']}")
        print(f"  moe_intermediate_size: {model_config['moe_intermediate_size']}")
        print(f"CPU inference config:")
        print(f"  cpuinfer_threads: {args.cpuinfer_threads}")
        print(f"  threadpool_count: {args.threadpool_count}")
        print()

        # Create converter by quantization method
        quant_method = args.quant_method.lower()
        merge_to_safetensor = not args.no_merge_safetensor

        if quant_method == "awq":
            converter = AWQToColumnMajorConverter(
                args.input_path,
                args.output,
                model_config,
                args.cpuinfer_threads,
                args.threadpool_count,
                input_type=None,
                merge_to_safetensor=merge_to_safetensor,
            )
        elif quant_method in ["int4", "int8", "moe_int4", "moe_int8"] and args.input_type in ["fp8", "fp16", "bf16"]:
            # Use OnlineQuantConverter for both INT4 and INT8 quantization
            converter = OnlineQuantConverter(
                args.input_path,
                args.output,
                model_config,
                args.cpuinfer_threads,
                args.threadpool_count,
                args.input_type,
                quant_method,
                merge_to_safetensor,
            )
        else:
            raise ValueError(
                f"Unsupported quant_method: {args.quant_method} or incompatible input_type: {args.input_type}"
            )

        # Run conversion
        converter.convert(resume_layer=args.resume_layer)

        # Cleanup
        converter.close()
        return 0

    except Exception as e:
        print(f"Error during conversion: {e}")
        import traceback

        traceback.print_exc()
        return 1


if __name__ == "__main__":
    exit(main())


================================================
FILE: kt-kernel/scripts/convert_gpu_weights.py
================================================
#!/usr/bin/env python
"""
GPU Weight Quantization Tool for KTransformers

This script quantizes model weights for CPU-GPU hybrid inference when integrating
KTransformers with SGLang. It supports multiple quantization methods (GPTQ, RTN) and
applies selective quantization to GPU-resident layers while preserving certain
components (e.g., attention, gates, shared experts) in higher precision.

Usage:
    python convert_gpu_weights.py --model_id /path/to/model --output_dir /path/to/output --quant_method GPTQ --quant_type W4A16

Example (GPTQ with calibration for best accuracy):
    python convert_gpu_weights.py \
        --model_id /mnt/data2/models/Qwen3-Next-80B-A3B-Instruct \
        --output_dir /mnt/data2/models/Qwen3-Next-80B-A3B-Instruct-GPU-weight \
        --quant_method GPTQ \
        --quant_type W4A16

Example (RTN for fast quantization without calibration):
    python convert_gpu_weights.py \
        --model_id /mnt/data/models/GLM-4.5-Air \
        --output_dir /mnt/data/models/GLM-4.5-Air-GPU-weights-rtn \
        --quant_method RTN \
        --quant_type W4A16
"""

import os
import sys
import warnings
import argparse

# IMPORTANT: Parse force_cpu argument BEFORE importing torch
# CUDA_VISIBLE_DEVICES must be set before torch initializes CUDA
if __name__ == "__main__":
    # Quick check for --force_cpu flag before full argument parsing
    if "--force_cpu" in sys.argv:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        warnings.filterwarnings("ignore", message="Can't initialize NVML")
        print("🔧 Forced CPU-only mode (CUDA_VISIBLE_DEVICES set before torch import)")

# Now it's safe to import torch and other GPU-dependent libraries
import torch
from accelerate import init_empty_weights, infer_auto_device_map
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization.gptq import GPTQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier
from datasets import load_dataset


def parse_args():
    parser = argparse.ArgumentParser(description="Quantize MoE models with selective quantization")

    # Required arguments
    parser.add_argument("--model_id", type=str, required=True, help="Path to the input model directory")
    parser.add_argument("--output_dir", type=str, required=True, help="Path to save the quantized model")

    # Optional arguments
    parser.add_argument(
        "--quant_method",
        type=str,
        choices=["GPTQ", "RTN"],
        default="GPTQ",
        help="Quantization method: GPTQ (calibration-based) or RTN (round-to-nearest, no calibration). Default: GPTQ",
    )
    parser.add_argument(
        "--quant_type",
        type=str,
        choices=["W4A16", "W8A16"],
        default="W8A16",
        help="Quantization type: W4A16 (INT4) or W8A16 (INT8). Default: W8A16",
    )
    parser.add_argument(
        "--num_calibration_samples",
        type=int,
        default=512,
        help="Number of calibration samples (GPTQ only). Default: 512",
    )
    parser.add_argument(
        "--max_sequence_length",
        type=int,
        default=2048,
        help="Maximum sequence length for calibration (GPTQ only). Default: 2048",
    )
    parser.add_argument(
        "--dampening_frac",
        type=float,
        default=0.1,
        help="Dampening fraction to mitigate quantization noise (GPTQ only). Default: 0.1",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="HuggingFaceH4/ultrachat_200k",
        help="Dataset for calibration (GPTQ only). Default: HuggingFaceH4/ultrachat_200k",
    )
    parser.add_argument(
        "--dataset_split", type=str, default="train_sft", help="Dataset split to use (GPTQ only). Default: train_sft"
    )
    parser.add_argument(
        "--force_cpu", action="store_true", help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')"
    )
    parser.add_argument(
        "--ignore_patterns",
        type=str,
        nargs="*",
        default=[
            "lm_head",
            r"re:.*\.mlp\.gate$",
            r"re:.*\.self_attn\..*$",
            r"re:.*\.shared_expert\..*$",
            r"re:.*\.shared_experts\..*$",
            r"re:.*\.mlp\.shared_expert_gate$",
            r"re:.*\.linear_attn\..*$",
        ],
        help="Regex patterns for layers to ignore during quantization",
    )
    parser.add_argument(
        "--torch_dtype",
        type=str,
        choices=["bfloat16", "float16", "float32"],
        default="bfloat16",
        help="PyTorch dtype for model loading. Default: bfloat16",
    )
    parser.add_argument(
        "--trust_remote_code", action="store_true", help="Allow loading of remote code (required for some models)"
    )
    parser.add_argument("--random_seed", type=int, default=42, help="Random seed for dataset shuffling. Default: 42")
    parser.add_argument(
        "--max_gpu_memory",
        type=str,
        default=None,
        help="Maximum GPU memory for model weights per device (e.g., '40GiB'). "
        "GPTQ quantization requires additional GPU memory for Hessian matrix computation, "
        "so reserve 40-50%% of total VRAM. For example, use '40GiB' on 80GB GPUs. "
        "Remaining layers will be offloaded to CPU. Default: use all available",
    )
    parser.add_argument(
        "--max_cpu_memory",
        type=str,
        default=None,
        help="Maximum CPU memory to use (e.g., '100GiB'). Default: use all available",
    )

    return parser.parse_args()


def setup_environment(force_cpu=False):
    """
    Verify environment setup (actual setup happens before torch import).

    Args:
        force_cpu: If True, was requested to force CPU-only mode

    Note:
        CUDA_VISIBLE_DEVICES must be set BEFORE importing torch.
        The actual environment setup is done at module import time.
    """
    if force_cpu:
        # Verify the environment variable was set correctly
        cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", None)
        if cuda_visible != "":
            print("⚠️  Warning: force_cpu was requested but CUDA_VISIBLE_DEVICES is not empty")
            print(f"   Current value: '{cuda_visible}'")
            print("   This may happen if imported as a module. Recommend running as script.")
        else:
            print("✅ CPU-only mode verified (CUDA_VISIBLE_DEVICES is empty)")


def get_torch_dtype(dtype_str):
    """
    Convert string to torch dtype.

    Args:
        dtype_str: String representation of dtype ("bfloat16", "float16", "float32")

    Returns:
        torch.dtype: Corresponding PyTorch dtype
    """
    dtype_map = {"bfloat16": torch.bfloat16, "float16": torch.float16, "float32": torch.float32}
    return dtype_map[dtype_str]


def check_dense_layers_and_update_ignore(model_id, ignore_patterns, trust_remote_code=False):
    """
    Check if the model has dense layers (first_k_dense_replace parameter) and add them to ignore list.

    Some MoE models have dense MLP layers in the first few layers instead of MoE layers.
    These dense layers should not be quantized using the same scheme as expert layers.

    Args:
        model_id: Path to the model
        ignore_patterns: List of existing ignore patterns
        trust_remote_code: Whether to trust remote code

    Returns:
        Updated ignore_patterns list with dense layer patterns added
    """
    print("🔍 Checking model configuration for dense layers...")

    try:
        # Load model configuration
        config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)

        # Check if the model has first_k_dense_replace parameter
        first_k_dense_replace = getattr(config, "first_k_dense_replace", None)

        if first_k_dense_replace is not None and first_k_dense_replace > 0:
            print(f"✅ Found dense layers configuration: first_k_dense_replace = {first_k_dense_replace}")
            print(f"   Adding first {first_k_dense_replace} layers to ignore list...")

            # Create regex pattern for dense layers (layers 0 to first_k_dense_replace-1)
            if first_k_dense_replace == 1:
                dense_pattern = r"re:model\.layers\.0\.mlp\..*$"
            else:
                # For multiple layers, use range pattern
                layer_range = f"[0-{first_k_dense_replace-1}]"
                dense_pattern = f"re:model\\.layers\\.{layer_range}\\.mlp\\..*$"

            # Add the dense layer pattern to ignore list
            updated_ignore_patterns = ignore_patterns + [dense_pattern]

            print(f"   Dense layer pattern added: {dense_pattern}")
            print(f"   This will ignore MLP components in layers 0-{first_k_dense_replace-1}")

            return updated_ignore_patterns
        else:
            print("ℹ️  No dense layers detected (first_k_dense_replace not found or is 0)")
            return ignore_patterns

    except Exception as e:
        print(f"⚠️  Warning: Could not check model config for dense layers: {e}")
        print("   Proceeding with original ignore patterns...")
        return ignore_patterns


def load_and_prepare_dataset(dataset_name, dataset_split, num_samples, max_length, tokenizer, seed=42):
    """
    Load and prepare calibration dataset for GPTQ quantization.

    GPTQ requires calibration data to compute optimal quantization parameters.
    This function loads a conversation dataset, applies chat template, and tokenizes it.

    Args:
        dataset_name: HuggingFace dataset name
        dataset_split: Dataset split to use (e.g., "train_sft")
        num_samples: Number of samples to use for calibration
        max_length: Maximum sequence length for tokenization
        tokenizer: Model tokenizer
        seed: Random seed for shuffling

    Returns:
        Dataset with tokenized calibration samples
    """
    print(f"📊 Loading dataset: {dataset_name}")

    # Load dataset
    ds = load_dataset(dataset_name, split=f"{dataset_split}[:{num_samples}]")
    ds = ds.shuffle(seed=seed)

    # Preprocess the data into the format the model is trained with
    def preprocess(example):
        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}

    ds = ds.map(preprocess)

    # Tokenize the data
    def tokenize(sample):
        return tokenizer(
            sample["text"], padding=False, max_length=max_length, truncation=True, add_special_tokens=False
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)
    print(f"✅ Dataset prepared with {len(ds)} samples")

    return ds


def main():
    """
    Main function for GPU weight quantization.

    This performs weight quantization on model weights intended for GPU execution
    in CPU-GPU hybrid inference scenarios. Supports two quantization methods:

    1. GPTQ (default): Calibration-based quantization for better accuracy
       - Requires calibration dataset
       - Higher accuracy but slower
       - Recommended for production use

    2. RTN (Round-To-Nearest): Fast quantization without calibration
       - No calibration dataset needed
       - Faster but may have lower accuracy
       - Good for quick testing or prototyping

    The quantization is selective:
    - Expert MLP weights are quantized to INT4/INT8
    - Attention layers, gates, and shared experts remain in original precision
    - Dense layers (if present) are excluded from quantization

    The quantized model can be used with SGLang+KTransformers for heterogeneous
    inference, where "hot" experts run on GPU and "cold" experts run on CPU.
    """
    args = parse_args()

    # Setup environment
    setup_environment(args.force_cpu)

    # Convert torch dtype
    torch_dtype = get_torch_dtype(args.torch_dtype)

    print(f"🚀 Starting quantization process")
    print(f"   Model: {args.model_id}")
    print(f"   Output: {args.output_dir}")
    print(f"   Quantization method: {args.quant_method}")
    print(f"   Quantization type: {args.quant_type}")
    if args.quant_method == "GPTQ":
        print(f"   Calibration samples: {args.num_calibration_samples}")
        print(f"   Max sequence length: {args.max_sequence_length}")
    else:
        print(f"   Calibration: Not required for {args.quant_method}")

    # --------------------------------------------------------------------
    # 0) Check for dense layers and update ignore patterns
    # Dense layers in the first few layers should not be quantized
    updated_ignore_patterns = check_dense_layers_and_update_ignore(
        args.model_id, args.ignore_patterns, args.trust_remote_code
    )

    # --------------------------------------------------------------------
    # 1) Build a dummy model (no weights) to infer a device map
    # This determines optimal device placement for each module
    if args.force_cpu:
        # In force_cpu mode, directly get module names without calling infer_auto_device_map
        # to avoid GPU memory allocation
        print("🔍 Building CPU-only device map...")
        with init_empty_weights():
            dummy = AutoModelForCausalLM.from_pretrained(
                args.model_id, torch_dtype=torch_dtype, trust_remote_code=args.trust_remote_code
            )
            device_map = {name: "cpu" for name, _ in dummy.named_modules() if name}
            del dummy
    else:
        print("🔍 Inferring device map...")
        with init_empty_weights():
            dummy = AutoModelForCausalLM.from_pretrained(
                args.model_id, torch_dtype=torch_dtype, trust_remote_code=args.trust_remote_code
            )
            # Build max_memory dict if specified
            max_memory = None
            if args.max_gpu_memory or args.max_cpu_memory:
                max_memory = {}
                if args.max_gpu_memory:
                    # Apply to all available GPUs
                    num_gpus = torch.cuda.device_count()
                    for i in range(num_gpus):
                        max_memory[i] = args.max_gpu_memory
                    print(f"   GPU memory limit: {args.max_gpu_memory} per device ({num_gpus} GPUs)")

                # Always set CPU memory when max_memory is used
                # Otherwise infer_auto_device_map may trigger disk offloading
                if args.max_cpu_memory:
                    max_memory["cpu"] = args.max_cpu_memory
                    print(f"   CPU memory limit: {args.max_cpu_memory}")
                else:
                    # Use a very large value to allow using all available CPU memory
                    # This prevents disk offloading when user has enough RAM
                    max_memory["cpu"] = "1000GiB"
                    print(f"   CPU memory limit: 1000GiB (default, to prevent disk offloading)")

            device_map = infer_auto_device_map(
                dummy, no_split_module_classes=dummy._no_split_modules, max_memory=max_memory
            )

            # Check if disk offloading was triggered (not supported by llmcompressor)
            disk_modules = [k for k, v in device_map.items() if v == "disk"]
            if disk_modules:
                print(f"❌ Error: {len(disk_modules)} modules would be offloaded to disk.")
                print("   llmcompressor does not support disk offloading.")
                print("   Solutions:")
                print("   1. Increase --max_gpu_memory to use more GPU memory")
                print("   2. Add --max_cpu_memory with higher value (e.g., '200GiB')")
                print("   3. Ensure your machine has enough GPU + CPU memory")
                raise RuntimeError(
                    "Disk offloading is not supported by llmcompressor. "
                    "Please ensure you have enough GPU + CPU memory."
                )

            del dummy
    # --------------------------------------------------------------------
    # 2) Load the full model weights with device mapping
    # Note: offload_folder=None disables disk offloading (not supported by llmcompressor)
    print("📥 Loading model...")
    try:
        model = AutoModelForCausalLM.from_pretrained(
            args.model_id,
            device_map=device_map,
            torch_dtype=torch_dtype,
            trust_remote_code=args.trust_remote_code,
            offload_folder=None,  # Disable disk offloading (not supported by llmcompressor)
        )
    except Exception as e:
        if "disk" in str(e).lower() or "offload" in str(e).lower():
            print(f"❌ Error: Not enough GPU + CPU memory to load the model.")
            print("   llmcompressor does not support disk offloading.")
            print("   Solutions:")
            print("   1. Increase --max_gpu_memory to use more GPU memory")
            print("   2. Ensure you have enough CPU RAM for remaining layers")
            print("   3. Use a machine with more memory")
            raise
        raise

    tokenizer = AutoTokenizer.from_pretrained(args.model_id)

    # --------------------------------------------------------------------
    # 3) Prepare calibration dataset
    # GPTQ needs calibration data to compute optimal quantization parameters
    if args.quant_method == "GPTQ":
        ds = load_and_prepare_dataset(
            args.dataset,
            args.dataset_split,
            args.num_calibration_samples,
            args.max_sequence_length,
            tokenizer,
            args.random_seed,
        )

    # --------------------------------------------------------------------
    # 4) Create quantization recipe with selective layer exclusion
    print(f"⚙️  Setting up {args.quant_method} {args.quant_type} quantization recipe...")
    if args.quant_method == "GPTQ":
        # GPTQ: calibration-based quantization for better accuracy
        recipe = GPTQModifier(
            targets="Linear",  # Target all Linear layers
            scheme=args.quant_type,  # W4A16 or W8A16
            ignore=updated_ignore_patterns,  # Exclude specific patterns
            dampening_frac=args.dampening_frac,
        )
    elif args.quant_method == "RTN":
        # RTN (Round-To-Nearest): fast quantization without calibration
        recipe = QuantizationModifier(
            targets="Linear",  # Target all Linear layers
            scheme=args.quant_type,  # W4A16 or W8A16
            ignore=updated_ignore_patterns,  # Exclude specific patterns
        )
    else:
        raise ValueError(f"Unsupported quantization method: {args.quant_method}")

    print("🔧 Ignoring the following patterns from quantization:")
    for i, pattern in enumerate(updated_ignore_patterns):
        marker = "🆕" if i >= len(args.ignore_patterns) else "   "
        print(f"   {marker} {pattern}")

    # --------------------------------------------------------------------
    # 5) Perform one-shot quantization
    # GPTQ: calibration-based quantization to minimize accuracy loss
    # RTN: fast round-to-nearest quantization without calibration
    print("🎯 Starting one-shot quantization...")
    if args.quant_method == "GPTQ":
        # GPTQ requires calibration dataset
        oneshot(
            model=model,
            dataset=ds,
            recipe=recipe,
            output_dir=args.output_dir,
            max_seq_length=args.max_sequence_length,
            num_calibration_samples=args.num_calibration_samples,
        )
    elif args.quant_method == "RTN":
        # RTN does not require calibration dataset
        oneshot(
            model=model,
            recipe=recipe,
            output_dir=args.output_dir,
        )
    else:
        raise ValueError(f"Unsupported quantization method: {args.quant_method}")

    print(f"\n✅ Quantized model written to: {args.output_dir}")
    print(f"   Quantization method: {args.quant_method}")
    print(f"   Quantization type: {args.quant_type}")
    print(f"   Ignored patterns remain in {args.torch_dtype}")
    print("🎉 Quantization completed successfully!")


if __name__ == "__main__":
    main()


================================================
FILE: kt-kernel/scripts/convert_kimi_k2_fp8_to_bf16_cpu.py
================================================
import os
import json
from argparse import ArgumentParser
from glob import glob
from tqdm import tqdm

import torch
from safetensors.torch import load_file, save_file

import gc


def weight_dequant_cpu(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor:
    assert x.dim() == 2 and s.dim() == 2, "Expect 2D tensors for x and s"
    M, N = x.shape
    n_m = (M + block_size - 1) // block_size
    n_n = (N + block_size - 1) // block_size

    y = torch.empty((M, N), dtype=torch.bfloat16, device="cpu")
    for bm in range(n_m):
        m0 = bm * block_size
        m1 = min(m0 + block_size, M)
        for bn in range(n_n):
            n0 = bn * block_size
            n1 = min(n0 + block_size, N)
            scale = s[bm, bn].item()
            sub = x[m0:m1, n0:n1].to(torch.float32) * scale
            y[m0:m1, n0:n1] = sub.to(torch.bfloat16)
    return y


def main(fp8_path, bf16_path):
    torch.set_default_dtype(torch.bfloat16)
    os.makedirs(bf16_path, exist_ok=True)
    model_index_file = os.path.join(fp8_path, "model.safetensors.index.json")
    with open(model_index_file, "r") as f:
        model_index = json.load(f)
    weight_map = model_index["weight_map"]

    loaded_files = {}
    fp8_weight_names = []

    def get_tensor(tensor_name):
        file_name = weight_map[tensor_name]
        if file_name not in loaded_files:
            file_path = os.path.join(fp8_path, file_name)
            loaded_files[file_name] = load_file(file_path, device="cpu")
        return loaded_files[file_name][tensor_name]

    safetensor_files = list(glob(os.path.join(fp8_path, "*.safetensors")))
    safetensor_files.sort()
    for safetensor_file in tqdm(safetensor_files, desc="weight file convert"):
        file_name = os.path.basename(safetensor_file)
        current_state_dict = load_file(safetensor_file, device="cpu")
        loaded_files[file_name] = current_state_dict

        new_state_dict = {}
        for weight_name, weight in current_state_dict.items():
            if weight_name.endswith("_scale_inv"):
                continue
            elif weight.element_size() == 1:
                scale_inv_name = f"{weight_name}_scale_inv"
                try:
                    scale_inv = get_tensor(scale_inv_name)
                    fp8_weight_names.append(weight_name)
                    new_state_dict[weight_name] = weight_dequant_cpu(weight, scale_inv)
                except KeyError:
                    print(f"Warning: {weight_name}loss scale factor")
                    new_state_dict[weight_name] = weight
            else:
                new_state_dict[weight_name] = weight

        new_safetensor_file = os.path.join(bf16_path, file_name)
        save_file(new_state_dict, new_safetensor_file)

        if len(loaded_files) > 2:
            oldest_file = next(iter(loaded_files))
            del loaded_files[oldest_file]
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    new_model_index_file = os.path.join(bf16_path, "model.safetensors.index.json")
    for weight_name in fp8_weight_names:
        scale_inv_name = f"{weight_name}_scale_inv"
        if scale_inv_name in weight_map:
            weight_map.pop(scale_inv_name)
    with open(new_model_index_file, "w") as f:
        json.dump({"metadata": {}, "weight_map": weight_map}, f, indent=2)
    print(f"Finish, Result in: {bf16_path}")


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--input-fp8-hf-path", type=str, required=True, help="Kimi-K2 FP8 model")
    parser.add_argument("--output-bf16-hf-path", type=str, required=True, help="BF16 model (After convert)")
    args = parser.parse_args()
    main(args.input_fp8_hf_path, args.output_bf16_hf_path)


================================================
FILE: kt-kernel/scripts/convert_moe_to_bf16.py
================================================
import argparse
import json
import os
from collections import defaultdict
from typing import Dict, Iterable, List, Optional, Tuple

import torch
from safetensors.torch import save_file, safe_open

from compressed_tensors.compressors import unpack_from_int32


def _load_config(model_dir: str, config_path: Optional[str]) -> Tuple[int, int, int]:
    cfg_path = config_path or os.path.join(model_dir, "config.json")
    with open(cfg_path, "r") as f:
        cfg = json.load(f)
    hidden_size = int(cfg.get("hidden_size"))
    inter_size = int(cfg.get("moe_intermediate_size"))
    group_size = int(
        cfg.get("quantization_config", {})
        .get("config_groups", {})
        .get("group_0", {})
        .get("weights", {})
        .get("group_size", 32)
    )
    return hidden_size, inter_size, group_size


def _dequantize_tensor(
    weight_packed: torch.Tensor,
    weight_scale: torch.Tensor,
    weight_shape: torch.Tensor,
    group_size: int,
) -> torch.Tensor:
    if isinstance(weight_shape, torch.Tensor):
        shape = tuple(int(v) for v in weight_shape.view(-1).tolist())
    else:
        shape = tuple(weight_shape)
    weight = unpack_from_int32(weight_packed, 4, shape)
    if group_size > 0:
        scale = weight_scale.to(torch.float32)
        if scale.dim() == 1:
            scale = scale.unsqueeze(1)
        scales = torch.repeat_interleave(scale, repeats=group_size, dim=1)
    else:
        scales = weight_scale.to(torch.float32)
    if scales.shape != weight.shape:
        if scales.numel() == weight.numel():
            scales = scales.reshape_as(weight)
        else:
            raise ValueError(f"Scale shape {scales.shape} incompatible with weight shape {weight.shape}")
    bf16 = (weight.to(torch.float32) * scales).to(torch.bfloat16)
    return bf16.contiguous()


def _is_quantized_weight_key(key: str) -> bool:
    if ".mlp.experts." not in key or ".shared_experts." in key:
        return False
    suffixes = ("weight_packed", "weight_scale", "weight_shape")
    for proj in ("gate_proj", "up_proj", "down_proj"):
        for suffix in suffixes:
            if key.endswith(f".{proj}.{suffix}"):
                return True
    return False


def convert_file(
    input_path: str,
    output_path: str,
    group_size: int,
    skip_existing: bool = True,
):
    if skip_existing and os.path.exists(output_path):
        print(f"[skip] {output_path} already exists.")
        return

    tensors: Dict[str, torch.Tensor] = {}
    expert_buffers: Dict[str, Dict[str, Dict[str, torch.Tensor]]] = defaultdict(lambda: defaultdict(dict))

    with safe_open(input_path, framework="pt") as reader:
        keys = list(reader.keys())
        for key in keys:
            tensor = reader.get_tensor(key).detach().cpu()

            if not _is_quantized_weight_key(key):
                tensors[key] = tensor
                continue

            parts = key.split(".")
            try:
                expert_idx = parts.index("experts")
            except ValueError:
                tensors[key] = tensor
                continue

            prefix = ".".join(parts[: expert_idx + 2])
            project = parts[-2]
            suffix = parts[-1]
            expert_buffers[prefix][project][suffix] = tensor

    stats = {
        "converted": 0,
        "skipped": 0,
    }

    for prefix, components in expert_buffers.items():
        for proj_name in ["gate_proj", "up_proj", "down_proj"]:
            proj_data = components.get(proj_name, {})
            required = {"weight_packed", "weight_scale", "weight_shape"}
            if not required.issubset(proj_data.keys()):
                print(f"[warn] Missing components for {prefix}.{proj_name}, keeping quantized tensors.")
                for suffix, value in proj_data.items():
                    tensors[f"{prefix}.{proj_name}.{suffix}"] = value
                stats["skipped"] += 1
                continue

            bf16_weight = _dequantize_tensor(
                proj_data["weight_packed"].to(torch.int32),
                proj_data["weight_scale"].to(torch.float32),
                proj_data["weight_shape"],
                group_size,
            )
            tensors[f"{prefix}.{proj_name}.weight"] = bf16_weight.to(torch.bfloat16)
            stats["converted"] += 1
            print(f"    converted {prefix}.{proj_name}.weight -> bf16")

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    save_file(tensors, output_path)
    print(f"[done] wrote {output_path} (converted={stats['converted']}, skipped={stats['skipped']})")


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert MoE experts to BF16 weights.")
    parser.add_argument("--model-dir", required=True, help="Directory containing safetensors checkpoints.")
    parser.add_argument(
        "--output-dir",
        default=None,
        help="Destination directory for converted checkpoints (default: <model-dir>_bf16).",
    )
    parser.add_argument(
        "--files",
        nargs="+",
        default=None,
        help="Specific safetensor filenames to convert (relative to model-dir). Convert all if omitted.",
    )
    parser.add_argument(
        "--config-path",
        default=None,
        help="Path to config.json for extracting group_size (default: model-dir/config.json).",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Rewrite output files even if they already exist.",
    )
    return parser.parse_args()


def main():
    args = parse_args()
    model_dir = os.path.abspath(args.model_dir)
    output_dir = os.path.abspath(args.output_dir or f"{model_dir}_bf16")

    if not os.path.isdir(model_dir):
        raise FileNotFoundError(f"Model directory not found: {model_dir}")

    _, _, group_size = _load_config(model_dir, args.config_path)

    if args.files:
        targets = [os.path.join(model_dir, fname) for fname in args.files]
    else:
        targets = [
            os.path.join(model_dir, name) for name in sorted(os.listdir(model_dir)) if name.endswith(".safetensors")
        ]

    if not targets:
        print("No safetensors checkpoints found.")
        return

    total = len(targets)

    for idx, path in enumerate(targets, start=1):
        if not os.path.isfile(path):
            print(f"[skip] {path} is not a file.")
            continue
        rel = os.path.relpath(path, model_dir)
        output_path = os.path.join(output_dir, rel)
        print(f"[{idx}/{total}] converting {rel}")
        convert_file(path, output_path, group_size, skip_existing=not args.overwrite)


if __name__ == "__main__":
    main()


================================================
FILE: kt-kernel/scripts/install-git-hooks.sh
================================================
#!/usr/bin/env sh
# Install git hooks from kt-kernel/.githooks into the monorepo's .git/hooks by
# creating symlinks (or copying if symlink fails).

set -eu

# This script lives in kt-kernel/scripts/, so REPO_ROOT = kt-kernel
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
HOOKS_SRC="$REPO_ROOT/.githooks"

# Detect the top-level Git worktree (the monorepo root: ktransformers)
GIT_TOP="$(git rev-parse --show-toplevel 2>/dev/null || true)"
if [ -z "$GIT_TOP" ] || [ ! -d "$GIT_TOP/.git" ]; then
  echo "[install-git-hooks] Not inside a git worktree; skipping hooks installation." >&2
  exit 0
fi

GIT_DIR="$GIT_TOP/.git"
HOOKS_DEST="$GIT_DIR/hooks"

if [ ! -d "$HOOKS_SRC" ]; then
  echo "[install-git-hooks] No .githooks directory found at $HOOKS_SRC" >&2
  exit 1
fi

echo "[install-git-hooks] Installing git hooks from $HOOKS_SRC to $HOOKS_DEST (repo: $GIT_TOP)"

# Ensure all source hook files are executable so that even if copied (not symlinked) they run.
for src_hook in "$HOOKS_SRC"/*; do
  [ -f "$src_hook" ] || continue
  if [ ! -x "$src_hook" ]; then
    chmod +x "$src_hook" || true
  fi
done

for hook in "$HOOKS_SRC"/*; do
  [ -e "$hook" ] || continue
  name=$(basename "$hook")
  dest="$HOOKS_DEST/$name"

  # Remove existing hook if it's our symlink or a file
  if [ -L "$dest" ] || [ -f "$dest" ]; then
    rm -f "$dest"
  fi

  # Try symlink first
  if ln -s "$hook" "$dest" 2>/dev/null; then
    echo "linked $name"
  else
    # Fall back to copying and preserve executable bit
    cp "$hook" "$dest"
    chmod +x "$dest"
    echo "copied $name"
  fi
done

echo "[install-git-hooks] Done. Hooks installed."


================================================
FILE: kt-kernel/setup.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Lightweight packaging script for building and distributing kt-kernel,
a high-performance kernel operations library for KTransformers.

    pip install kt-kernel
    >>> from kt_kernel import AMXMoEWrapper

This script drives your existing CMake build (root `CMakeLists.txt`) and
only needs a working C++ toolchain, CMake (>=3.16), and pybind11 (vendored
already in the repo).

Environment knobs (export before running pip install .):
  CPUINFER_FORCE_REBUILD=1        Always rebuild (ignore any cached build)
  CPUINFER_BUILD_TYPE=Release     Debug / RelWithDebInfo / Release
  CPUINFER_PARALLEL=8             Parallel build jobs (auto = detected cores)
  CPUINFER_CPU_INSTRUCT=FANCY     One of: NATIVE|FANCY|AVX512|AVX2 (maps to CMake flags)
  CPUINFER_ENABLE_AMX=OFF         ON/OFF -> -DKTRANSFORMERS_CPU_USE_AMX
  CPUINFER_ENABLE_MLA=OFF         ON/OFF -> -DKTRANSFORMERS_CPU_MLA
  CPUINFER_ENABLE_BLIS=OFF         ON/OFF -> -DKTRANSFORMERS_CPU_MOE_AMD
  CPUINFER_ENABLE_KML=OFF         ON/OFF -> -DKTRANSFORMERS_CPU_USE_KML
  CPUINFER_ENABLE_AVX512=OFF      ON/OFF -> -DKTRANSFORMERS_CPU_USE_AMX_AVX512
  CPUINFER_ENABLE_AVX512_VNNI=OFF ON/OFF -> -DLLAMA_AVX512_VNNI
  CPUINFER_ENABLE_AVX512_BF16=OFF ON/OFF -> -DLLAMA_AVX512_BF16
  CPUINFER_ENABLE_AVX512_VBMI=OFF ON/OFF -> -DLLAMA_AVX512_VBMI (required for FP8 MoE)
  CPUINFER_BLIS_ROOT=/path/to/blis  Forward to -DBLIS_ROOT


  CPUINFER_ENABLE_LTO=ON          ON/OFF -> -DCPUINFER_ENABLE_LTO (your added option)
  CPUINFER_LTO_JOBS=8             Forward to -DCPUINFER_LTO_JOBS
  CPUINFER_LTO_MODE=auto          Forward to -DCPUINFER_LTO_MODE
  CPUINFER_NATIVE=ON               (override LLAMA_NATIVE)


GPU backends (if ever added later, keep placeholders):
  CPUINFER_USE_CUDA=0/1           -DKTRANSFORMERS_USE_CUDA
  CPUINFER_USE_ROCM=0/1           -DKTRANSFORMERS_USE_ROCM
  CPUINFER_USE_MUSA=0/1           -DKTRANSFORMERS_USE_MUSA

Usage:
  pip install .
Or build wheel:
  python -m build  (if you have build/installed)

Resulting wheel exposes a top-level package `kt_kernel` with AMXMoEWrapper and other kernel wrappers.
"""
from __future__ import annotations
import os
import re
import sys
import platform
import subprocess
from pathlib import Path
from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext
import shutil


# -------------------------
# Env parsing helpers
# -------------------------
def _env_get_bool(name: str, default: bool | None = None) -> bool | None:
    v = os.environ.get(name)
    if v is None:
        return default
    val = v.strip().lower()
    if val in ("1", "on", "true", "yes", "y", "enable", "enabled"):
        return True
    if val in ("0", "off", "false", "no", "n", "disable", "disabled"):
        return False
    return default


def _cmake_onoff(flag: bool) -> str:
    return "ON" if flag else "OFF"


def _forward_bool_env(cmake_args: list[str], env_name: str, cmake_flag: str) -> bool:
    """If env exists, forward it to CMake as -D<flag>=ON/OFF and return True; else return False."""
    b = _env_get_bool(env_name, None)
    if b is None:
        return False
    cmake_args.append(f"-D{cmake_flag}={_cmake_onoff(b)}")
    print(f"-- Forward {env_name} -> -D{cmake_flag}={_cmake_onoff(b)}")
    return True


def _forward_str_env(cmake_args: list[str], env_name: str, cmake_flag: str) -> bool:
    v = os.environ.get(env_name)
    if not v:
        return False
    cmake_args.append(f"-D{cmake_flag}={v}")
    print(f"-- Forward {env_name} -> -D{cmake_flag}={v}")
    return True


################################################################################
# Helpers
################################################################################

REPO_ROOT = Path(__file__).parent.resolve()

CPU_FEATURE_MAP = {
    "FANCY": "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON -DLLAMA_AVX512_FANCY_SIMD=ON",
    "AVX512": "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON",
    "AVX2": "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON",
    "NATIVE": "-DLLAMA_NATIVE=ON",
}


def default_build_type() -> str:
    return os.environ.get("CPUINFER_BUILD_TYPE", "Release")


def detect_parallel_jobs() -> str:
    if "CPUINFER_PARALLEL" in os.environ:
        return os.environ["CPUINFER_PARALLEL"]
    try:
        import multiprocessing

        return str(multiprocessing.cpu_count())
    except Exception:
        return "1"


def cpu_feature_flags() -> list[str]:
    mode = os.environ.get("CPUINFER_CPU_INSTRUCT", "NATIVE").upper()
    return [tok for tok in CPU_FEATURE_MAP.get(mode, CPU_FEATURE_MAP["NATIVE"]).split() if tok]


################################################################################
# CMakeExtension + builder
################################################################################


class CMakeExtension(Extension):
    def __init__(self, name: str, sourcedir: str = ""):
        super().__init__(name, sources=[])
        self.sourcedir = str(Path(sourcedir).resolve())


class CMakeBuild(build_ext):
    def run(self):
        # Ensure CMake present
        try:
            subprocess.run(["cmake", "--version"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except Exception as e:  # pragma: no cover
            raise RuntimeError("CMake is required to build this project") from e
        super().run()

    def detect_cpu_info(self) -> dict:
        """Detect CPU vendor/arch and instruction set features.

        Returns a dict like:
            {
                'vendor': 'intel'|'amd'|'arm'|'unknown',
                'arch': platform.machine().lower(),
                'features': set(['AVX2','AVX512','AMX']),
                'raw': { 'flags': set([...]) }
            }
        """
        info = {
            "vendor": "unknown",
            "arch": platform.machine().lower(),
            "features": set(),
            "raw": {"flags": set()},
        }
        try:
            sysname = platform.system()
            if sysname == "Linux":
                with open("/proc/cpuinfo", "r", encoding="utf-8", errors="ignore") as f:
                    cpuinfo = f.read()
                low = cpuinfo.lower()

                # vendor
                if "vendor_id" in low:
                    # Typical x86 linux
                    m = re.search(r"vendor_id\s*:\s*(\S+)", cpuinfo)
                    if m:
                        v = m.group(1).lower()
                        if "genuineintel" in v:
                            info["vendor"] = "intel"
                        elif "authenticamd" in v:
                            info["vendor"] = "amd"
                # ARM sometimes has 'model name' or 'Hardware'
                if info["vendor"] == "unknown":
                    if any(tok in low for tok in ["aarch64", "armv8", "arm cortex", "kunpeng", "kirin", "huawei"]):
                        info["vendor"] = "arm"

                # flags collection (x86 uses 'flags', arm uses 'Features')
                flags = set()
                for key in ("flags", "Features", "features"):
                    m = re.search(rf"^{key}\s*:\s*(.+)$", cpuinfo, re.IGNORECASE | re.MULTILINE)
                    if m:
                        flags.update(m.group(1).lower().split())
                info["raw"]["flags"] = flags

                # feature summary
                if any(f in flags or f in low for f in ["avx512f", "avx512bw", "avx512dq", "avx512vl"]):
                    info["features"].add("AVX512")
                if "avx2" in flags or "avx2" in low:
                    info["features"].add("AVX2")
                # AMX flags on Linux are with underscores; keep hyphen fallback just in case
                if any(
                    f in flags or f in low
                    for f in ["amx_bf16", "amx_int8", "amx_tile", "amx-bf16", "amx-int8", "amx-tile"]
                ):
                    info["features"].add("AMX")

                # Fine-grained AVX512 subset detection
                if any(f in flags for f in ["avx512_vnni", "avx512vnni"]):
                    info["features"].add("AVX512_VNNI")
                if any(f in flags for f in ["avx512_bf16", "avx512bf16"]):
                    info["features"].add("AVX512_BF16")
                if any(f in flags for f in ["avx512_vbmi", "avx512vbmi"]):
                    info["features"].add("AVX512_VBMI")
                if any(f in flags for f in ["avx512_vpopcntdq", "avx512vpopcntdq"]):
                    info["features"].add("AVX512_VPOPCNTDQ")

            elif sysname == "Darwin":
                # macOS: Apple Silicon (arm64) vs Intel
                arch = platform.machine().lower()
                info["arch"] = arch
                if arch in ("arm64", "aarch64"):
                    info["vendor"] = "arm"
                else:
                    info["vendor"] = "intel"
                # No AVX/AMX on Apple Silicon; assume none

            elif sysname == "Windows":
                # Minimal detection via arch; detailed CPUID omitted for brevity
                arch = platform.machine().lower()
                info["arch"] = arch
                if arch in ("arm64", "aarch64"):
                    info["vendor"] = "arm"
                else:
                    # Could be Intel or AMD; leave unknown
                    info["vendor"] = "unknown"
        except Exception as e:
            print(f"Warning: CPU detection failed: {e}")
        return info

    def build_extension(self, ext: CMakeExtension):
        """
        Main entry point for building the extension.

        Checks if multi-variant build is requested (CPUINFER_BUILD_ALL_VARIANTS=1)
        and routes to the appropriate build method.
        """
        if _env_get_bool("CPUINFER_BUILD_ALL_VARIANTS", False):
            # Build all 3 variants (AMX, AVX512, AVX2)
            self.build_multi_variants(ext)
        else:
            # Build single variant (original behavior)
            self._build_single_variant(ext)

    def build_multi_variants(self, ext: CMakeExtension):
        """
        Build all 6 CPU variants with progressive AVX512 capabilities.

        This creates 6 separate .so files optimized for different CPU generations:
        - _kt_kernel_ext_avx2.so         (Haswell+, 2013)
        - _kt_kernel_ext_avx512_base.so  (Skylake-X+, 2017)
        - _kt_kernel_ext_avx512_vnni.so  (Cascade Lake+, 2019)
        - _kt_kernel_ext_avx512_vbmi.so  (Ice Lake client, 2019)
        - _kt_kernel_ext_avx512_bf16.so  (Ice Lake server/Zen 4+, 2021)
        - _kt_kernel_ext_amx.so          (Sapphire Rapids+, 2023)

        Runtime CPU detection (in _cpu_detect.py) will automatically select the best match.
        """
        print("=" * 70)
        print("Building kt-kernel with ALL 6 CPU variants")
        print("=" * 70)
        print()
        print("This will build six progressive variants in a single wheel:")
        print("  1. AVX2          - Haswell+ (2013)")
        print("  2. AVX512 Base   - Skylake-X+ (2017)")
        print("  3. AVX512+VNNI   - Cascade Lake+ (2019)")
        print("  4. AVX512+VBMI   - Ice Lake client (2019)")
        print("  5. AVX512+BF16   - Ice Lake server, Zen 4+ (2021)")
        print("  6. AMX           - Sapphire Rapids+ (2023)")
        print()
        print("Runtime CPU detection will automatically select the best variant.")
        print()

        extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()
        cfg = default_build_type()

        # Save original env vars to restore later
        env_backup = {
            "CPUINFER_CPU_INSTRUCT": os.environ.get("CPUINFER_CPU_INSTRUCT"),
            "CPUINFER_ENABLE_AMX": os.environ.get("CPUINFER_ENABLE_AMX"),
            "CPUINFER_ENABLE_AVX512": os.environ.get("CPUINFER_ENABLE_AVX512"),
            "CPUINFER_ENABLE_AVX512_VNNI": os.environ.get("CPUINFER_ENABLE_AVX512_VNNI"),
            "CPUINFER_ENABLE_AVX512_BF16": os.environ.get("CPUINFER_ENABLE_AVX512_BF16"),
            "CPUINFER_ENABLE_AVX512_VBMI": os.environ.get("CPUINFER_ENABLE_AVX512_VBMI"),
        }

        # Variant configurations: (name, description, env_vars)
        # Each variant specifies exactly which features to enable
        variants = [
            (
                "avx2",
                "AVX2 baseline",
                {
                    "CPUINFER_CPU_INSTRUCT": "AVX2",
                    "CPUINFER_ENABLE_AVX512": "OFF",
                    "CPUINFER_ENABLE_AMX": "OFF",
                },
            ),
            (
                "avx512_base",
                "AVX512F+BW",
                {
                    "CPUINFER_CPU_INSTRUCT": "AVX512",
                    "CPUINFER_ENABLE_AVX512": "ON",
                    "CPUINFER_ENABLE_AVX512_VNNI": "OFF",
                    "CPUINFER_ENABLE_AVX512_BF16": "OFF",
                    "CPUINFER_ENABLE_AVX512_VBMI": "OFF",
                    "CPUINFER_ENABLE_AMX": "OFF",
                },
            ),
            (
                "avx512_vnni",
                "AVX512F+VNNI",
                {
                    "CPUINFER_CPU_INSTRUCT": "AVX512",
                    "CPUINFER_ENABLE_AVX512": "ON",
                    "CPUINFER_ENABLE_AVX512_VNNI": "ON",
                    "CPUINFER_ENABLE_AVX512_BF16": "OFF",
                    "CPUINFER_ENABLE_AVX512_VBMI": "OFF",
                    "CPUINFER_ENABLE_AMX": "OFF",
                },
            ),
            (
                "avx512_vbmi",
                "AVX512F+VNNI+VBMI",
                {
                    "CPUINFER_CPU_INSTRUCT": "AVX512",
                    "CPUINFER_ENABLE_AVX512": "ON",
                    "CPUINFER_ENABLE_AVX512_VNNI": "ON",
                    "CPUINFER_ENABLE_AVX512_BF16": "OFF",
                    "CPUINFER_ENABLE_AVX512_VBMI": "ON",
                    "CPUINFER_ENABLE_AMX": "OFF",
                },
            ),
            (
                "avx512_bf16",
                "AVX512 Full (F+VNNI+VBMI+BF16)",
                {
                    "CPUINFER_CPU_INSTRUCT": "AVX512",
                    "CPUINFER_ENABLE_AVX512": "ON",
                    "CPUINFER_ENABLE_AVX512_VNNI": "ON",
                    "CPUINFER_ENABLE_AVX512_BF16": "ON",
                    "CPUINFER_ENABLE_AVX512_VBMI": "ON",
                    "CPUINFER_ENABLE_AMX": "OFF",
                },
            ),
            (
                "amx",
                "AMX + AVX512 Full",
                {
                    "CPUINFER_CPU_INSTRUCT": "AVX512",
                    "CPUINFER_ENABLE_AVX512": "ON",
                    "CPUINFER_ENABLE_AVX512_VNNI": "ON",
                    "CPUINFER_ENABLE_AVX512_BF16": "ON",
                    "CPUINFER_ENABLE_AVX512_VBMI": "ON",
                    "CPUINFER_ENABLE_AMX": "ON",
                },
            ),
        ]

        for variant_name, variant_desc, env_vars in variants:
            print("=" * 70)
            print(f"Building {variant_name.upper()} variant ({variant_desc})")
            print("=" * 70)
            print()

            # Set environment variables for this variant
            for key, value in env_vars.items():
                os.environ[key] = value
                print(f"  {key} = {value}")

            # Use separate build directory for each variant
            build_temp = Path(self.build_temp) / f"{ext.name}_{cfg}_{variant_name}"
            build_temp.mkdir(parents=True, exist_ok=True)

            # Build this variant
            self._build_single_variant_impl(ext, extdir, build_temp, cfg)

            # Rename the built .so file to include variant suffix
            # Original name: kt_kernel_ext.cpython-311-x86_64-linux-gnu.so
            # New name: _kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so
            built_so_files = list(extdir.glob(f"{ext.name.split('.')[-1]}.*.so"))
            if built_so_files:
                original_so = built_so_files[0]
                # Extract the suffix after the module name
                # e.g., "kt_kernel_ext.cpython-311-x86_64-linux-gnu.so" -> ".cpython-311-x86_64-linux-gnu.so"
                suffix = original_so.name.replace(ext.name.split(".")[-1], "")
                new_name = f"_kt_kernel_ext_{variant_name}{suffix}"
                new_path = extdir / new_name

                # Remove existing file if present
                if new_path.exists():
                    new_path.unlink()

                # Rename
                original_so.rename(new_path)
                print(f"✓ Built and renamed to: {new_name}")
                print()
            else:
                print(f"⚠ Warning: Could not find built .so file for {variant_name} variant")
                print()

        # Restore original env vars
        for key, value in env_backup.items():
            if value is not None:
                os.environ[key] = value
            elif key in os.environ:
                del os.environ[key]

        print("=" * 70)
        print("✓ All 6 variants built successfully!")
        print("=" * 70)
        print()
        print("The wheel now contains 6 CPU variants:")
        for so_file in sorted(extdir.glob("_kt_kernel_ext_*.so")):
            print(f"  - {so_file.name}")
        print()

    def _build_single_variant(self, ext: CMakeExtension):
        """Original single-variant build logic - wrapper for backward compatibility."""
        extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()
        cfg = default_build_type()
        build_temp = Path(self.build_temp) / f"{ext.name}_{cfg}"
        build_temp.mkdir(parents=True, exist_ok=True)

        self._build_single_variant_impl(ext, extdir, build_temp, cfg)

    def _build_single_variant_impl(self, ext: CMakeExtension, extdir: Path, build_temp: Path, cfg: str):
        """
        Core build logic for a single variant.

        This method contains the actual CMake configuration and build steps.
        It's called by both _build_single_variant() and build_multi_variants().

        Args:
            ext: The CMakeExtension to build
            extdir: Directory where the .so file should be placed
            build_temp: Temporary build directory for CMake
            cfg: Build type (Release/Debug/etc.)
        """

        # Auto-detect CUDA toolkit if user did not explicitly set CPUINFER_USE_CUDA
        def detect_cuda_toolkit() -> bool:
            # Respect CUDA_HOME
            cuda_home = os.environ.get("CUDA_HOME")
            if cuda_home:
                nvcc_path = Path(cuda_home) / "bin" / "nvcc"
                if nvcc_path.exists():
                    return True
            # PATH lookup
            if shutil.which("nvcc") is not None:
                return True
            # Common default install prefix
            if Path("/usr/local/cuda/bin/nvcc").exists():
                return True
            return False

        # Locate nvcc executable (without forcing user to set -DCMAKE_CUDA_COMPILER)
        def find_nvcc_path() -> str | None:
            cuda_home = os.environ.get("CUDA_HOME")
            if cuda_home:
                cand = Path(cuda_home) / "bin" / "nvcc"
                if cand.exists():
                    return str(cand)
            which_nvcc = shutil.which("nvcc")
            if which_nvcc:
                return which_nvcc
            # Common fallbacks (ordered by preference)
            for cand in [
                "/usr/local/cuda-12.6/bin/nvcc",
                "/usr/local/cuda/bin/nvcc",
                "/usr/bin/nvcc",
                "/usr/lib/nvidia-cuda-toolkit/bin/nvcc",
            ]:
                if Path(cand).exists():
                    return cand
            return None

        # Note: We no longer set CMAKE_CUDA_ARCHITECTURES by default.
        # If users want to specify CUDA archs, they can set env CPUINFER_CUDA_ARCHS
        # (e.g. "89" or "86;89") or pass it via CMAKE_ARGS.
        auto_moe_kernel_ = False
        # Normalize CPUINFER_USE_CUDA: if unset, auto-detect; otherwise respect truthy/falsey values
        cuda_env = _env_get_bool("CPUINFER_USE_CUDA", None)
        if cuda_env is None:
            auto_cuda = detect_cuda_toolkit()
            os.environ["CPUINFER_USE_CUDA"] = "1" if auto_cuda else "0"
            print(f"-- CPUINFER_USE_CUDA not set; auto-detected CUDA toolkit: {'YES' if auto_cuda else 'NO'}")

        # Base CMake args
        cmake_args = [
            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}/",
            f"-DPYTHON_EXECUTABLE={sys.executable}",
            f"-DCMAKE_BUILD_TYPE={cfg}",
        ]

        # CPU feature flags mapping: if user specified CPUINFER_CPU_INSTRUCT, honor it;
        # else auto-pick based on detection (x86 only)
        cmake_args += cpu_feature_flags()
        d = self.detect_cpu_info()
        print(f"Detected CPU info: {d}")
        cpu_mode = os.environ.get("CPUINFER_CPU_INSTRUCT", "NATIVE").upper()

        # Vendor / feature specific toggles
        # AMD MoE: explicit env overrides; otherwise default ON on AMD CPU
        _forward_bool_env(cmake_args, "CPUINFER_ENABLE_BLIS", "KTRANSFORMERS_CPU_MOE_AMD")
        # if d.get("vendor") == "amd":
        #     auto_moe_kernel_ = True
        #     cmake_args.append("-DKTRANSFORMERS_CPU_MOE_AMD=ON")
        #     print("-- Detected AMD CPU; enabling AMD MoE kernel (-DKTRANSFORMERS_CPU_MOE_AMD=ON)")
        #     _forward_str_env(cmake_args, "CPUINFER_BLIS_ROOT", "BLIS_ROOT")

        # KML: explicit env overrides; otherwise default ON on ARM
        _forward_bool_env(cmake_args, "CPUINFER_ENABLE_KML", "KTRANSFORMERS_CPU_USE_KML")
        # if d.get("vendor") == "arm":
        #     auto_moe_kernel_ = True
        #     cmake_args.append("-DKTRANSFORMERS_CPU_USE_KML=ON")
        #     print("-- Detected ARM CPU; enabling KML (-DKTRANSFORMERS_CPU_USE_KML=ON)")

        # AMX: explicit env overrides; else enable if detected
        if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AMX", "KTRANSFORMERS_CPU_USE_AMX"):
            if "AMX" in d["features"]:
                cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX=ON")
                print("-- AMX support detected; enabling (-DKTRANSFORMERS_CPU_USE_AMX=ON)")

        # AVX512 umbrella (AMX/AVX512 kernels):
        # - If user explicitly sets CPUINFER_ENABLE_AVX512 -> honor it
        # - Otherwise, only auto-enable when CPU mode actually wants AVX512
        #   (NATIVE/FANCY/AVX512). In AVX2 mode we do NOT enable this, so
        #   RAWINT4 / K2 kernels are not compiled.
        if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512", "KTRANSFORMERS_CPU_USE_AMX_AVX512"):
            if cpu_mode in ("NATIVE", "FANCY", "AVX512") and ("AMX" in d["features"] or "AVX512" in d["features"]):
                cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON")
                print("-- Enabling AMX/AVX512 umbrella (-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON)")
            else:
                print(f"-- CPUINFER_CPU_INSTRUCT={cpu_mode}; not auto-enabling AMX/AVX512 umbrella")

        # Fine-grained AVX512 subset flags: only enable if CPU actually supports them
        # These are passed to CMake to conditionally add compiler flags
        # Track if any AVX512 extension is enabled
        avx512_extension_enabled = False
        allow_avx512_ext_auto = cpu_mode in ("NATIVE", "FANCY", "AVX512")

        if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512_VNNI", "LLAMA_AVX512_VNNI"):
            if allow_avx512_ext_auto and "AVX512_VNNI" in d["features"]:
                cmake_args.append("-DLLAMA_AVX512_VNNI=ON")
                print("-- AVX512_VNNI detected; enabling (-DLLAMA_AVX512_VNNI=ON)")
                avx512_extension_enabled = True
        else:
            avx512_extension_enabled = True

        if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512_BF16", "LLAMA_AVX512_BF16"):
            if allow_avx512_ext_auto and "AVX512_BF16" in d["features"]:
                cmake_args.append("-DLLAMA_AVX512_BF16=ON")
                print("-- AVX512_BF16 detected; enabling (-DLLAMA_AVX512_BF16=ON)")
                avx512_extension_enabled = True
        else:
            avx512_extension_enabled = True

        if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512_VBMI", "LLAMA_AVX512_VBMI"):
            if allow_avx512_ext_auto and "AVX512_VBMI" in d["features"]:
                cmake_args.append("-DLLAMA_AVX512_VBMI=ON")
                print("-- AVX512_VBMI detected; enabling (-DLLAMA_AVX512_VBMI=ON)")
                avx512_extension_enabled = True
        else:
            avx512_extension_enabled = True

        # If any AVX512 extension is enabled, ensure base AVX512 is also enabled
        if avx512_extension_enabled and cpu_mode in ("NATIVE", "FANCY", "AVX512"):
            if not any("LLAMA_AVX512=ON" in a for a in cmake_args):
                cmake_args.append("-DLLAMA_AVX512=ON")
                print("-- AVX512 extensions enabled; also enabling base AVX512F (-DLLAMA_AVX512=ON)")

        # Auto-enable MOE kernel only when env explicitly turns on AMD or KML backend
        # (Do not enable purely on vendor auto-detection to avoid surprise behavior.)
        amd_env = _env_get_bool("CPUINFER_ENABLE_BLIS", None)
        kml_env = _env_get_bool("CPUINFER_ENABLE_KML", None)
        if amd_env or kml_env:
            auto_moe_kernel_ = True
        already_set = any("KTRANSFORMERS_CPU_MOE_KERNEL" in a for a in cmake_args)
        if not already_set and auto_moe_kernel_:
            cmake_args.append("-DKTRANSFORMERS_CPU_MOE_KERNEL=ON")
            print(
                "-- Auto-enabling MOE kernel (-DKTRANSFORMERS_CPU_MOE_KERNEL=ON) because CPUINFER_ENABLE_BLIS or CPUINFER_ENABLE_KML is ON"
            )

        # Friendly summary
        print(
            f"-- CPU detection: vendor={d.get('vendor')} arch={d.get('arch')} features={sorted(list(d.get('features', [])))}"
        )

        # MLA toggle (string/boolean allowed)
        if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_MLA", "KTRANSFORMERS_CPU_MLA"):
            _forward_str_env(cmake_args, "CPUINFER_ENABLE_MLA", "KTRANSFORMERS_CPU_MLA")

        # LTO toggles
        _forward_bool_env(cmake_args, "CPUINFER_ENABLE_LTO", "CPUINFER_ENABLE_LTO")
        _forward_str_env(cmake_args, "CPUINFER_LTO_JOBS", "CPUINFER_LTO_JOBS")
        _forward_str_env(cmake_args, "CPUINFER_LTO_MODE", "CPUINFER_LTO_MODE")

        # CUDA static runtime toggle
        _forward_bool_env(cmake_args, "CPUINFER_CUDA_STATIC_RUNTIME", "KTRANSFORMERS_CUDA_STATIC_RUNTIME")

        # GPU backends (mutually exclusive expected)
        if _env_get_bool("CPUINFER_USE_CUDA", False):
            cmake_args.append("-DKTRANSFORMERS_USE_CUDA=ON")
            print("-- Enabling CUDA backend (-DKTRANSFORMERS_USE_CUDA=ON)")
            # Inject nvcc compiler path automatically unless user already specified one.
            user_specified_compiler = any("CMAKE_CUDA_COMPILER" in a for a in cmake_args)
            if not user_specified_compiler:
                extra_env = os.environ.get("CMAKE_ARGS", "")
                if "CMAKE_CUDA_COMPILER" in extra_env:
                    user_specified_compiler = True
            if not user_specified_compiler:
                nvcc_path = find_nvcc_path()
                if nvcc_path:
                    cmake_args.append(f"-DCMAKE_CUDA_COMPILER={nvcc_path}")
                    print(f"-- Auto-detected nvcc: {nvcc_path} (adding -DCMAKE_CUDA_COMPILER)")
                else:
                    print("-- Warning: nvcc not found via CUDA_HOME/PATH/common prefixes; CUDA configure may fail.")
            # Optional host compiler for nvcc if user set CUDAHOSTCXX
            if os.environ.get("CUDAHOSTCXX"):
                hostcxx = os.environ["CUDAHOSTCXX"]
                cmake_args.append(f"-DCMAKE_CUDA_HOST_COMPILER={hostcxx}")
                print(f"-- Using CUDA host compiler from CUDAHOSTCXX: {hostcxx}")
            # Set CUDA architectures (default: Ampere/Ada/Hopper)
            archs_env = os.environ.get("CPUINFER_CUDA_ARCHS", "80;86;89;90").strip()
            if archs_env and not any("CMAKE_CUDA_ARCHITECTURES" in a for a in cmake_args):
                cmake_args.append(f"-DCMAKE_CUDA_ARCHITECTURES={archs_env}")
                print(f"-- Set CUDA architectures: {archs_env}")
        if _env_get_bool("CPUINFER_USE_ROCM", False):
            cmake_args.append("-DKTRANSFORMERS_USE_ROCM=ON")
        if _env_get_bool("CPUINFER_USE_MUSA", False):
            cmake_args.append("-DKTRANSFORMERS_USE_MUSA=ON")

        # Respect user extra CMAKE_ARGS (space separated)
        extra = os.environ.get("CMAKE_ARGS")
        if extra:
            cmake_args += [a for a in extra.split() if a]

        # Force rebuild? (delete cache)
        if _env_get_bool("CPUINFER_FORCE_REBUILD", True):
            cache = build_temp / "CMakeCache.txt"
            if cache.exists():
                cache.unlink()

        print("-- CMake configure args:")
        for a in cmake_args:
            print("   ", a)

        # Configure
        subprocess.run(["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True)

        # Build
        build_args = ["--build", ".", "--config", cfg]
        jobs = detect_parallel_jobs()
        if jobs:
            build_args += ["--parallel", jobs]
        print("-- CMake build args:", " ".join(build_args))
        subprocess.run(["cmake", *build_args], cwd=build_temp, check=True)

        # On some systems LTO + CMake + pybind may place the built .so inside build tree; move if needed
        built_candidates = list(build_temp.rglob(f"{ext.name}*.so"))
        for cand in built_candidates:
            if cand.parent != extdir:
                target = extdir / cand.name
                target.parent.mkdir(parents=True, exist_ok=True)
                # Overwrite stale
                if not target.exists() or target.stat().st_mtime < cand.stat().st_mtime:
                    print(f"-- Copying {cand} -> {target}")
                    target.write_bytes(cand.read_bytes())


################################################################################
# Version (simple). If you later add a python package dir, you can read from it.
################################################################################


# Read base version from version.py
_version_file = Path(__file__).resolve().parent.parent / "version.py"
if _version_file.exists():
    _version_ns = {}
    with open(_version_file, "r", encoding="utf-8") as f:
        exec(f.read(), _version_ns)
    _base_version = _version_ns.get("__version__", "0.5.0")
else:
    _base_version = "0.5.0"

# Determine version
if "CPUINFER_VERSION" in os.environ:
    # User explicitly set version (e.g., for testing)
    VERSION = os.environ["CPUINFER_VERSION"]
    print(f"-- Explicit version: {VERSION}")
else:
    VERSION = _base_version
    print(f"-- Version: {VERSION}")

# Package name is always kt-kernel
# The CUDA-enabled wheel includes both CPU multi-variant support and CUDA capabilities
PACKAGE_NAME = "kt-kernel"
cuda_enabled = _env_get_bool("CPUINFER_USE_CUDA", False)
if cuda_enabled:
    print(f"-- Building kt-kernel with CUDA support (+ CPU multi-variant)")
else:
    print(f"-- Building kt-kernel (CPU-only multi-variant)")

################################################################################
# Setup
################################################################################

setup(
    name=PACKAGE_NAME,
    version=VERSION,
    description="KT-Kernel: High-performance kernel operations for KTransformers (AMX/AVX/KML optimizations)",
    author="kvcache-ai",
    license="Apache-2.0",
    python_requires=">=3.8",
    packages=[
        "kt_kernel",
        "kt_kernel.utils",
        "kt_kernel.cli",
        "kt_kernel.cli.commands",
        "kt_kernel.cli.config",
        "kt_kernel.cli.utils",
    ],
    package_dir={
        "kt_kernel": "python",
        "kt_kernel.utils": "python/utils",
        "kt_kernel.cli": "python/cli",
        "kt_kernel.cli.commands": "python/cli/commands",
        "kt_kernel.cli.config": "python/cli/config",
        "kt_kernel.cli.utils": "python/cli/utils",
    },
    entry_points={
        "console_scripts": [
            "kt=kt_kernel.cli.main:main",
        ],
    },
    ext_modules=[CMakeExtension("kt_kernel.kt_kernel_ext", str(REPO_ROOT))],
    cmdclass={"build_ext": CMakeBuild},
    zip_safe=False,
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: C++",
        "Operating System :: POSIX :: Linux",
        "Operating System :: MacOS",
    ],
)


================================================
FILE: kt-kernel/test/__init__.py
================================================
"""KT-Kernel Test Suite

This test suite is adapted from SGLang's CI testing framework.
It provides hardware-aware test registration and execution with timeout control.
"""


================================================
FILE: kt-kernel/test/ci/__init__.py
================================================
"""CI test registration and execution utilities."""


================================================
FILE: kt-kernel/test/ci/ci_register.py
================================================
import ast
import warnings
from dataclasses import dataclass
from enum import Enum, auto
from typing import List


class HWBackend(Enum):
    CPU = auto()
    CUDA = auto()
    AMD = auto()


@dataclass
class CIRegistry:
    backend: HWBackend
    filename: str
    est_time: float
    suite: str


def register_cpu_ci(est_time: float, suite: str):
    pass


def register_cuda_ci(est_time: float, suite: str):
    pass


def register_amd_ci(est_time: float, suite: str):
    pass


REGISTER_MAPPING = {
    "register_cpu_ci": HWBackend.CPU,
    "register_cuda_ci": HWBackend.CUDA,
    "register_amd_ci": HWBackend.AMD,
}


class RegistryVisitor(ast.NodeVisitor):
    def __init__(self, filename: str):
        self.filename = filename
        self.registries: list[CIRegistry] = []

    def _collect_ci_registry(self, func_call: ast.Call):
        if not isinstance(func_call.func, ast.Name):
            return None

        if func_call.func.id not in REGISTER_MAPPING:
            return None

        hw = REGISTER_MAPPING[func_call.func.id]
        est_time, suite = None, None
        for kw in func_call.keywords:
            if kw.arg == "est_time":
                if isinstance(kw.value, ast.Constant):
                    est_time = kw.value.value
            elif kw.arg == "suite":
                if isinstance(kw.value, ast.Constant):
                    suite = kw.value.value

        for i, arg in enumerate(func_call.args):
            if isinstance(arg, ast.Constant):
                if i == 0:
                    est_time = arg.value
                elif i == 1:
                    suite = arg.value
        assert (
            est_time is not None
        ), "esimation_time is required and should be a constant"
        assert suite is not None, "suite is required and should be a constant"
        return CIRegistry(
            backend=hw, filename=self.filename, est_time=est_time, suite=suite
        )

    def visit_Module(self, node):
        for stmt in node.body:
            if not isinstance(stmt, ast.Expr) or not isinstance(stmt.value, ast.Call):
                continue

            cr = self._collect_ci_registry(stmt.value)
            if cr is not None:
                self.registries.append(cr)

        self.generic_visit(node)


def ut_parse_one_file(filename: str) -> List[CIRegistry]:
    with open(filename, "r") as f:
        file_content = f.read()
    tree = ast.parse(file_content, filename=filename)
    visitor = RegistryVisitor(filename=filename)
    visitor.visit(tree)
    return visitor.registries


def collect_tests(files: list[str], sanity_check: bool = True) -> List[CIRegistry]:
    ci_tests = []
    for file in files:
        registries = ut_parse_one_file(file)
        if len(registries) == 0:
            msg = f"No CI registry found in {file}"
            if sanity_check:
                raise ValueError(msg)
            else:
                warnings.warn(msg)
                continue

        ci_tests.extend(registries)

    return ci_tests


================================================
FILE: kt-kernel/test/ci/ci_utils.py
================================================
import os
import subprocess
import threading
import time
from dataclasses import dataclass
from typing import Callable, List, Optional

import psutil, signal, sys
def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = None):
    """Kill the process and all its child processes."""
    # Remove sigchld handler to avoid spammy logs.
    if threading.current_thread() is threading.main_thread():
        signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    if parent_pid is None:
        parent_pid = os.getpid()
        include_parent = False

    try:
        itself = psutil.Process(parent_pid)
    except psutil.NoSuchProcess:
        return

    children = itself.children(recursive=True)
    for child in children:
        if child.pid == skip_pid:
            continue
        try:
            child.kill()
        except psutil.NoSuchProcess:
            pass

    if include_parent:
        try:
            if parent_pid == os.getpid():
                itself.kill()
                sys.exit(0)

            itself.kill()

            # Sometime processes cannot be killed with SIGKILL (e.g, PID=1 launched by kubernetes),
            # so we send an additional signal to kill them.
            itself.send_signal(signal.SIGQUIT)
        except psutil.NoSuchProcess:
            pass


@dataclass
class TestFile:
    name: str
    estimated_time: float = 60


def run_with_timeout(
    func: Callable,
    args: tuple = (),
    kwargs: Optional[dict] = None,
    timeout: float = None,
):
    """Run a function with timeout."""
    ret_value = []

    def _target_func():
        ret_value.append(func(*args, **(kwargs or {})))

    t = threading.Thread(target=_target_func)
    t.start()
    t.join(timeout=timeout)
    if t.is_alive():
        raise TimeoutError()

    if not ret_value:
        raise RuntimeError()

    return ret_value[0]


def run_unittest_files(
    files: List[TestFile], timeout_per_file: float, continue_on_error: bool = False
):
    """
    Run a list of test files.

    Args:
        files: List of TestFile objects to run
        timeout_per_file: Timeout in seconds for each test file
        continue_on_error: If True, continue running remaining tests even if one fails.
                          If False, stop at first failure (default behavior for PR tests).
    """
    tic = time.perf_counter()
    success = True
    passed_tests = []
    failed_tests = []

    for i, file in enumerate(files):
        filename, estimated_time = file.name, file.estimated_time
        process = None

        def run_one_file(filename):
            nonlocal process

            filename = os.path.join(os.getcwd(), filename)
            print(
                f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
                flush=True,
            )
            tic = time.perf_counter()

            process = subprocess.Popen(
                ["python3", filename], stdout=None, stderr=None, env=os.environ
            )
            process.wait()
            elapsed = time.perf_counter() - tic

            print(
                f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
                flush=True,
            )
            return process.returncode

        try:
            ret_code = run_with_timeout(
                run_one_file, args=(filename,), timeout=timeout_per_file
            )
            if ret_code != 0:
                print(
                    f"\n✗ FAILED: {filename} returned exit code {ret_code}\n",
                    flush=True,
                )
                success = False
                failed_tests.append((filename, f"exit code {ret_code}"))
                if not continue_on_error:
                    # Stop at first failure for PR tests
                    break
                # Otherwise continue to next test for nightly tests
            else:
                passed_tests.append(filename)
        except TimeoutError:
            kill_process_tree(process.pid)
            time.sleep(5)
            print(
                f"\n✗ TIMEOUT: {filename} after {timeout_per_file} seconds\n",
                flush=True,
            )
            success = False
            failed_tests.append((filename, f"timeout after {timeout_per_file}s"))
            if not continue_on_error:
                # Stop at first timeout for PR tests
                break
            # Otherwise continue to next test for nightly tests

    if success:
        print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
    else:
        print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)

    # Print summary
    print(f"\n{'='*60}", flush=True)
    print(f"Test Summary: {len(passed_tests)}/{len(files)} passed", flush=True)
    print(f"{'='*60}", flush=True)
    if passed_tests:
        print("✓ PASSED:", flush=True)
        for test in passed_tests:
            print(f"  {test}", flush=True)
    if failed_tests:
        print("\n✗ FAILED:", flush=True)
        for test, reason in failed_tests:
            print(f"  {test} ({reason})", flush=True)
    print(f"{'='*60}\n", flush=True)

    return 0 if success else -1


================================================
FILE: kt-kernel/test/per_commit/__init__.py
================================================
"""Per-commit tests for KT-Kernel.

Tests in this directory are run on every commit in CI.
"""


================================================
FILE: kt-kernel/test/per_commit/test_amd_placeholder.py
================================================
"""AMD/ROCm backend tests for KT-Kernel (Placeholder).

This file is a placeholder for future AMD/ROCm backend tests.
Currently, KT-Kernel focuses on CPU optimizations (Intel AMX/AVX512).

To implement AMD tests:
1. Add actual test functions with @pytest.mark.amd
2. Update the estimated time in register_amd_ci()
3. Implement AMD/ROCm-specific initialization and validation tests
"""

import os
import sys

# Add parent directory to path for CI registration
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ci.ci_register import register_amd_ci

# Register this test for AMD CI (estimated time: 10 seconds, placeholder)
# Update suite name when implementing: currently using "stage-a-test-1"
register_amd_ci(est_time=10, suite="stage-a-test-1")


def test_amd_placeholder():
    """Placeholder test for AMD/ROCm backend.

    TODO: Implement actual AMD/ROCm tests when AMD support is added to kt-kernel.
    """
    # Currently a no-op placeholder
    pass


if __name__ == "__main__":
    # Allow running standalone (required by test runner)
    print("⚠ AMD/ROCm tests are not yet implemented (placeholder)")
    print("✓ Placeholder test passed")


================================================
FILE: kt-kernel/test/per_commit/test_basic_cpu.py
================================================
"""Basic CPU backend tests for KT-Kernel.

These tests verify basic functionality without requiring model files.
"""

import os
import sys
import pytest

# Add parent directory to path for CI registration
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ci.ci_register import register_cpu_ci

# Register this test for CPU CI with estimated runtime of 30 seconds
register_cpu_ci(est_time=30, suite="default")

# Check if kt_kernel_ext is available
try:
    import kt_kernel  # Import kt_kernel first to register kt_kernel_ext

    kt_kernel_ext = kt_kernel.kt_kernel_ext  # Access the extension module
    HAS_KT_KERNEL = True
except ImportError:
    HAS_KT_KERNEL = False
    kt_kernel_ext = None


@pytest.mark.cpu
def test_kt_kernel_import():
    """Test that kt_kernel_ext can be imported."""
    if not HAS_KT_KERNEL:
        pytest.skip("kt_kernel_ext not built or available")

    assert kt_kernel_ext is not None, "kt_kernel_ext module should be importable"


@pytest.mark.cpu
def test_cpu_infer_initialization():
    """Test that CPUInfer can be initialized."""
    if not HAS_KT_KERNEL:
        pytest.skip("kt_kernel_ext not built or available")

    # Initialize CPUInfer with 4 threads
    cpuinfer = kt_kernel_ext.CPUInfer(4)
    assert cpuinfer is not None, "CPUInfer should be initialized successfully"


@pytest.mark.cpu
def test_basic_module_attributes():
    """Test that kt_kernel_ext has expected attributes."""
    if not HAS_KT_KERNEL:
        pytest.skip("kt_kernel_ext not built or available")

    # Check for key attributes/functions
    assert hasattr(kt_kernel_ext, "CPUInfer"), "kt_kernel_ext should have CPUInfer class"


def run_all_tests():
    """Run all tests in this file (for standalone execution)."""
    if not HAS_KT_KERNEL:
        print("⚠ kt_kernel_ext not available, skipping tests")
        return

    try:
        test_kt_kernel_import()
        print("✓ test_kt_kernel_import passed")

        test_cpu_infer_initialization()
        print("✓ test_cpu_infer_initialization passed")

        test_basic_module_attributes()
        print("✓ test_basic_module_attributes passed")

        print("\n✓ All tests passed!")
    except Exception as e:
        print(f"\n✗ Test failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    # Allow running standalone (required by test runner)
    run_all_tests()


================================================
FILE: kt-kernel/test/per_commit/test_cuda_placeholder.py
================================================
"""CUDA backend tests for KT-Kernel (Placeholder).

This file is a placeholder for future CUDA backend tests.
Currently, KT-Kernel focuses on CPU optimizations (Intel AMX/AVX512).

To implement CUDA tests:
1. Add actual test functions with @pytest.mark.cuda
2. Update the estimated time in register_cuda_ci()
3. Implement CUDA-specific initialization and validation tests
"""

import os
import sys

# Add parent directory to path for CI registration
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ci.ci_register import register_cuda_ci

# Register this test for CUDA CI (estimated time: 10 seconds, placeholder)
# Update suite name when implementing: currently using "stage-a-test-1"
register_cuda_ci(est_time=10, suite="stage-a-test-1")


def test_cuda_placeholder():
    """Placeholder test for CUDA backend.

    TODO: Implement actual CUDA tests when CUDA support is added to kt-kernel.
    """
    # Currently a no-op placeholder
    pass


if __name__ == "__main__":
    # Allow running standalone (required by test runner)
    print("⚠ CUDA tests are not yet implemented (placeholder)")
    print("✓ Placeholder test passed")


================================================
FILE: kt-kernel/test/per_commit/test_moe_amx_accuracy_int4.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""AMX MOE INT4 accuracy tests for KT-Kernel.

Tests accuracy of AMX-accelerated INT4 MOE operations against torch reference.
"""

import os
import sys
import pytest

# Add parent directory to path for CI registration
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ci.ci_register import register_cpu_ci

# Register this test for CPU CI with estimated runtime of 120 seconds
register_cpu_ci(est_time=120, suite="default")

# Check if dependencies are available
try:
    import torch
    import kt_kernel  # Import kt_kernel first to register kt_kernel_ext

    kt_kernel_ext = kt_kernel.kt_kernel_ext  # Access the extension module
    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    import_error = str(e)

# Test parameters (from original test_moe_amx.py)
expert_num = 256
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
qlen = 1
layer_num = 1
validation_iter = 2
physical_to_logical_map = None


def act_fn(x):
    """Activation function for MoE."""
    return x / (1.0 + torch.exp(-x))


def mlp_torch(input, gate_proj, up_proj, down_proj):
    """PyTorch reference implementation of MLP."""
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret


def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    """PyTorch reference implementation of MoE."""
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0

    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )

    return t_output


@pytest.mark.cpu
def test_moe_amx_int4_accuracy():
    """Test AMX INT4 MOE accuracy against PyTorch reference implementation."""
    if not HAS_DEPS:
        pytest.skip(f"Dependencies not available: {import_error}")

    global physical_to_logical_map
    physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()

    CPUInfer = kt_kernel_ext.CPUInfer(60)

    with torch.inference_mode(mode=True):
        # Initialize MoE layers
        gate_proj = (
            torch.randn(
                (expert_num, intermediate_size, hidden_size),
                dtype=torch.bfloat16,
                device="cuda",
            )
            .to("cpu")
            .contiguous()
        )
        up_proj = (
            torch.randn(
                (expert_num, intermediate_size, hidden_size),
                dtype=torch.bfloat16,
                device="cuda",
            )
            .to("cpu")
            .contiguous()
        )
        down_proj = (
            torch.randn(
                (expert_num, hidden_size, intermediate_size),
                dtype=torch.bfloat16,
                device="cuda",
            )
            .to("cpu")
            .contiguous()
        )

        # Create MOE config
        config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
        config.max_len = max_len
        config.gate_proj = gate_proj.data_ptr()
        config.up_proj = up_proj.data_ptr()
        config.down_proj = down_proj.data_ptr()
        config.gate_scale = 0
        config.pool = CPUInfer.backend_

        # Initialize INT4 MOE
        moe = kt_kernel_ext.moe.AMXInt4_MOE(config)
        CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
        CPUInfer.sync()
        CPUInfer.submit(moe.warm_up_task())
        CPUInfer.sync()

        # Run validation iterations
        for i in range(validation_iter):
            bsz_tensor = torch.tensor([qlen], device="cpu")
            expert_ids = torch.stack(
                [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
            ).contiguous()
            weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
            input_data = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            input_data = input_data / 100

            # Run AMX MOE
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids.data_ptr(),
                    weights.data_ptr(),
                    input_data.data_ptr(),
                    output.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

            # Run torch reference
            t_output = moe_torch(input_data, expert_ids, weights, gate_proj, up_proj, down_proj)

            # Calculate relative difference
            diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
            print(f"Iteration {i}, diff = {diff:.6f}")

            # INT4 should have diff < 0.35
            assert diff < 0.35, f"INT4 accuracy test failed: diff={diff:.6f} >= 0.35"


def run_all_tests():
    """Run all tests in this file (for standalone execution)."""
    if not HAS_DEPS:
        print(f"⚠ Dependencies not available: {import_error}")
        print("Skipping AMX MOE INT4 accuracy tests")
        return

    try:
        print("Running AMX MOE INT4 accuracy test...")
        test_moe_amx_int4_accuracy()
        print("✓ AMX MOE INT4 accuracy test passed")
        print("\n✓ All tests passed!")
    except Exception as e:
        print(f"\n✗ Test failed: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    run_all_tests()


================================================
FILE: kt-kernel/test/per_commit/test_moe_amx_accuracy_int4_1.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""AMX MOE INT4_1 accuracy tests for KT-Kernel.

Tests accuracy of AMX-accelerated INT4_1 MOE operations against torch reference.
"""

import os
import sys
import pytest

# Add parent directory to path for CI registration
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ci.ci_register import register_cpu_ci

# Register this test for CPU CI with estimated runtime of 120 seconds
register_cpu_ci(est_time=120, suite="default")

# Check if dependencies are available
try:
    import torch
    import kt_kernel  # Import kt_kernel first to register kt_kernel_ext

    kt_kernel_ext = kt_kernel.kt_kernel_ext  # Access the extension module
    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    import_error = str(e)

# Test parameters (from original test_moe_amx.py)
expert_num = 256
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
qlen = 1
layer_num = 1
validation_iter = 2
physical_to_logical_map = None


def act_fn(x):
    """Activation function for MoE."""
    return x / (1.0 + torch.exp(-x))


def mlp_torch(input, gate_proj, up_proj, down_proj):
    """PyTorch reference implementation of MLP."""
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret


def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    """PyTorch reference implementation of MoE."""
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0

    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )

    return t_output


@pytest.mark.cpu
def test_moe_amx_int4_1_accuracy():
    """Test AMX INT4_1 MOE accuracy against PyTorch reference implementation."""
    if not HAS_DEPS:
        pytest.skip(f"Dependencies not available: {import_error}")

    global physical_to_logical_map
    physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()

    CPUInfer = kt_kernel_ext.CPUInfer(60)

    with torch.inference_mode(mode=True):
        # Initialize MoE layers
        gate_proj = (
            torch.randn(
                (expert_num, intermediate_size, hidden_size),
                dtype=torch.bfloat16,
                device="cuda",
            )
            .to("cpu")
            .contiguous()
        )
        up_proj = (
            torch.randn(
                (expert_num, intermediate_size, hidden_size),
                dtype=torch.bfloat16,
                device="cuda",
            )
            .to("cpu")
            .contiguous()
        )
        down_proj = (
            torch.randn(
                (expert_num, hidden_size, intermediate_size),
                dtype=torch.bfloat16,
                device="cuda",
            )
            .to("cpu")
            .contiguous()
        )

        # Create MOE config
        config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
        config.max_len = max_len
        config.gate_proj = gate_proj.data_ptr()
        config.up_proj = up_proj.data_ptr()
        config.down_proj = down_proj.data_ptr()
        config.gate_scale = 0
        config.pool = CPUInfer.backend_

        # Initialize INT4_1 MOE
        moe = kt_kernel_ext.moe.AMXInt4_1_MOE(config)
        CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
        CPUInfer.sync()
        CPUInfer.submit(moe.warm_up_task())
        CPUInfer.sync()

        # Run validation iterations
        for i in range(validation_iter):
            bsz_tensor = torch.tensor([qlen], device="cpu")
            expert_ids = torch.stack(
                [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
            ).contiguous()
            weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
            input_data = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            input_data = input_data / 100

            # Run AMX MOE
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids.data_ptr(),
                    weights.data_ptr(),
                    input_data.data_ptr(),
                    output.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

            # Run torch reference
            t_output = moe_torch(input_data, expert_ids, weights, gate_proj, up_proj, down_proj)

            # Calculate relative difference
            diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
            print(f"Iteration {i}, diff = {diff:.6f}")

            # INT4_1 should have diff < 0.35
            assert diff < 0.35, f"INT4_1 accuracy test failed: diff={diff:.6f} >= 0.35"


def run_all_tests():
    """Run all tests in this file (for standalone execution)."""
    if not HAS_DEPS:
        print(f"⚠ Dependencies not available: {import_error}")
        print("Skipping AMX MOE INT4_1 accuracy tests")
        return

    try:
        print("Running AMX MOE INT4_1 accuracy test...")
        test_moe_amx_int4_1_accuracy()
        print("✓ AMX MOE INT4_1 accuracy test passed")
        print("\n✓ All tests passed!")
    except Exception as e:
        print(f"\n✗ Test failed: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    run_all_tests()


================================================
FILE: kt-kernel/test/per_commit/test_moe_amx_accuracy_int4_1k.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""AMX MOE INT4_1K accuracy tests for KT-Kernel.

Tests accuracy of AMX-accelerated INT4_1K group quantization MOE operations against torch reference.
"""

import os
import sys
import pytest

# Add parent directory to path for CI registration
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ci.ci_register import register_cpu_ci

# Register this test for CPU CI with estimated runtime of 120 seconds
register_cpu_ci(est_time=120, suite="default")

# Check if dependencies are available
try:
    import torch
    import kt_kernel  # Import kt_kernel first to register kt_kernel_ext

    kt_kernel_ext = kt_kernel.kt_kernel_ext  # Access the extension module
    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    import_error = str(e)

# Test parameters (from original test_moe_amx.py)
expert_num = 256
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
qlen = 1
layer_num = 1
validation_iter = 2
k_group_size = 64
physical_to_logical_map = None


def act_fn(x):
    """Activation function for MoE."""
    return x / (1.0 + torch.exp(-x))


def mlp_torch(input, gate_proj, up_proj, down_proj):
    """PyTorch reference implementation of MLP."""
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret


def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    """PyTorch reference implementation of MoE."""
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0

    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )

    return t_output


@pytest.mark.cpu
def test_moe_amx_int4_1k_accuracy():
    """Test AMX INT4_1K MOE accuracy against PyTorch reference implementation."""
    if not HAS_DEPS:
        pytest.skip(f"Dependencies not available: {import_error}")

    global physical_to_logical_map
    physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()

    CPUInfer = kt_kernel_ext.CPUInfer(60)

    with torch.inference_mode(mode=True):
        # Initialize MoE layers
        gate_proj = (
            torch.randn(
                (expert_num, intermediate_size, hidden_size),
                dtype=torch.bfloat16,
                device="cuda",
            )
            .to("cpu")
            .contiguous()
        )
        up_proj = (
            torch.randn(
                (expert_num, intermediate_size, hidden_size),
                dtype=torch.bfloat16,
                device="cuda",
            )
            .to("cpu")
            .contiguous()
        )
        down_proj = (
            torch.randn(
                (expert_num, hidden_size, intermediate_size),
                dtype=torch.bfloat16,
                device="cuda",
            )
            .to("cpu")
            .contiguous()
        )

        # Create MOE config
        config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
        config.max_len = max_len
        config.gate_proj = gate_proj.data_ptr()
        config.up_proj = up_proj.data_ptr()
        config.down_proj = down_proj.data_ptr()
        config.gate_scale = 0
        config.pool = CPUInfer.backend_

        # Configure INT4_1K quantization settings
        config.quant_config.bits = 4
        config.quant_config.group_size = k_group_size
        config.quant_config.zero_point = True

        # Initialize INT4_1K MOE
        moe = kt_kernel_ext.moe.AMXInt4_1KGroup_MOE(config)
        CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
        CPUInfer.sync()

        # Run validation iterations
        for i in range(validation_iter):
            bsz_tensor = torch.tensor([qlen], device="cpu")
            expert_ids = torch.stack(
                [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
            ).contiguous()
            weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
            input_data = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            input_data = input_data / 100

            # Run AMX MOE
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids.data_ptr(),
                    weights.data_ptr(),
                    input_data.data_ptr(),
                    output.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

            # Run torch reference
            t_output = moe_torch(input_data, expert_ids, weights, gate_proj, up_proj, down_proj)

            # Calculate relative difference
            diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
            print(f"Iteration {i}, diff = {diff:.6f}")

            # INT4_1K should have diff < 0.35
            assert diff < 0.35, f"INT4_1K accuracy test failed: diff={diff:.6f} >= 0.35"


def run_all_tests():
    """Run all tests in this file (for standalone execution)."""
    if not HAS_DEPS:
        print(f"⚠ Dependencies not available: {import_error}")
        print("Skipping AMX MOE INT4_1K accuracy tests")
        return

    try:
        print("Running AMX MOE INT4_1K accuracy test...")
        test_moe_amx_int4_1k_accuracy()
        print("✓ AMX MOE INT4_1K accuracy test passed")
        print("\n✓ All tests passed!")
    except Exception as e:
        print(f"\n✗ Test failed: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    run_all_tests()


================================================
FILE: kt-kernel/test/per_commit/test_moe_amx_accuracy_int8.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""AMX MOE INT8 accuracy tests for KT-Kernel.

Tests accuracy of AMX-accelerated INT8 MOE operations against torch reference.
"""

import os
import sys
import pytest

# Add parent directory to path for CI registration
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ci.ci_register import register_cpu_ci

# Register this test for CPU CI with estimated runtime of 120 seconds
register_cpu_ci(est_time=120, suite="default")

# Check if dependencies are available
try:
    import torch
    import kt_kernel  # Import kt_kernel first to register kt_kernel_ext

    kt_kernel_ext = kt_kernel.kt_kernel_ext  # Access the extension module
    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    import_error = str(e)

# Test parameters (from original test_moe_amx.py)
expert_num = 256
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
qlen = 1
layer_num = 1
validation_iter = 2
physical_to_logical_map = None


def act_fn(x):
    """Activation function for MoE."""
    return x / (1.0 + torch.exp(-x))


def mlp_torch(input, gate_proj, up_proj, down_proj):
    """PyTorch reference implementation of MLP."""
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret


def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    """PyTorch reference implementation of MoE."""
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0

    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )

    return t_output


@pytest.mark.cpu
def test_moe_amx_int8_accuracy():
    """Test AMX INT8 MOE accuracy against PyTorch reference implementation."""
    if not HAS_DEPS:
        pytest.skip(f"Dependencies not available: {import_error}")

    global physical_to_logical_map
    physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()

    CPUInfer = kt_kernel_ext.CPUInfer(60)

    with torch.inference_mode(mode=True):
        # Initialize MoE layers
        gate_proj = (
            torch.randn(
                (expert_num, intermediate_size, hidden_size),
                dtype=torch.bfloat16,
                device="cuda",
            )
            .to("cpu")
            .contiguous()
        )
        up_proj = (
            torch.randn(
                (expert_num, intermediate_size, hidden_size),
                dtype=torch.bfloat16,
                device="cuda",
            )
            .to("cpu")
            .contiguous()
        )
        down_proj = (
            torch.randn(
                (expert_num, hidden_size, intermediate_size),
                dtype=torch.bfloat16,
                device="cuda",
            )
            .to("cpu")
            .contiguous()
        )

        # Create MOE config
        config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
        config.max_len = max_len
        config.gate_proj = gate_proj.data_ptr()
        config.up_proj = up_proj.data_ptr()
        config.down_proj = down_proj.data_ptr()
        config.gate_scale = 0
        config.pool = CPUInfer.backend_

        # Initialize INT8 MOE
        moe = kt_kernel_ext.moe.AMXInt8_MOE(config)
        CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
        CPUInfer.sync()

        # Run validation iterations
        for i in range(validation_iter):
            bsz_tensor = torch.tensor([qlen], device="cpu")
            expert_ids = torch.stack(
                [torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
            ).contiguous()
            weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
            input_data = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
            input_data = input_data / 100

            # Run AMX MOE
            CPUInfer.submit(
                moe.forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids.data_ptr(),
                    weights.data_ptr(),
                    input_data.data_ptr(),
                    output.data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

            # Run torch reference
            t_output = moe_torch(input_data, expert_ids, weights, gate_proj, up_proj, down_proj)

            # Calculate relative difference
            diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
            print(f"Iteration {i}, diff = {diff:.6f}")

            # INT8 should have diff < 0.05
            assert diff < 0.05, f"INT8 accuracy test failed: diff={diff:.6f} >= 0.05"


def run_all_tests():
    """Run all tests in this file (for standalone execution)."""
    if not HAS_DEPS:
        print(f"⚠ Dependencies not available: {import_error}")
        print("Skipping AMX MOE INT8 accuracy tests")
        return

    try:
        print("Running AMX MOE INT8 accuracy test...")
        test_moe_amx_int8_accuracy()
        print("✓ AMX MOE INT8 accuracy test passed")
        print("\n✓ All tests passed!")
    except Exception as e:
        print(f"\n✗ Test failed: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    run_all_tests()


================================================
FILE: kt-kernel/test/per_commit/test_moe_amx_bench_int4.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""AMX MOE INT4 benchmark tests for KT-Kernel.

Benchmarks performance (bandwidth and FLOPS) of AMX-accelerated INT4 MOE operations.
"""

import os
import sys
import time
import json
import subprocess
import platform
import pytest

# Add parent directory to path for CI registration
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ci.ci_register import register_cpu_ci

# Register this test for CPU CI with estimated runtime of 300 seconds
register_cpu_ci(est_time=300, suite="default")

# Check if dependencies are available
try:
    import torch
    import kt_kernel  # Import kt_kernel first to register kt_kernel_ext

    kt_kernel_ext = kt_kernel.kt_kernel_ext  # Access the extension module
    from tqdm import tqdm

    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    import_error = str(e)

# Test parameters (from original bench_moe_amx.py)
expert_num = 16
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
layer_num = 2
qlen = 2048
warm_up_iter = 1000
test_iter = 2000

# Worker configuration
worker_config_dict = {
    "subpool_count": 2,
    "subpool_numa_map": [0, 1],
    "subpool_thread_count": [30, 30],
}
CPUINFER_PARAM = 60


def get_git_commit():
    """Get current git commit information."""
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg

        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        if dirty_output:
            result["dirty"] = True
            result["dirty_files"] = dirty_output.splitlines()
        else:
            result["dirty"] = False
    except Exception as e:
        result["commit"] = None
        result["commit_message"] = None
        result["dirty"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    """Get system information including CPU model, memory, cores, and sockets."""
    info = {}
    uname = platform.uname()
    info["system_name"] = uname.system
    info["node_name"] = uname.node

    # Get CPU model (Linux only)
    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception as e:
            cpu_model = f"Error: {e}"
    info["cpu_model"] = cpu_model

    # Get memory size in GB (Linux only)
    mem_total_gb = None
    if os.path.exists("/proc/meminfo"):
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal" in line:
                        mem_kb = float(line.split(":", 1)[1].split()[0])
                        mem_total_gb = round(mem_kb / (1024 * 1024), 2)
                        break
        except Exception as e:
            mem_total_gb = f"Error: {e}"
    info["memory_size_GB"] = mem_total_gb

    # Get CPU core count
    info["cpu_core_count"] = os.cpu_count()

    # Get socket count
    sockets = set()
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "physical id" in line:
                        sockets.add(line.split(":", 1)[1].strip())
        except Exception as e:
            sockets = set()
    info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1

    return info


def record_results(result, filename):
    """Append results to JSONL file."""
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


@pytest.mark.cpu
def test_moe_amx_int4_benchmark():
    """Benchmark AMX INT4 MOE performance."""
    if not HAS_DEPS:
        pytest.skip(f"Dependencies not available: {import_error}")

    quant_mode = "int4"
    bytes_per_elem = 0.5

    # Setup output file
    script_dir = os.path.dirname(os.path.abspath(__file__))
    json_path = os.path.join(script_dir, "bench_moe_amx_int4.jsonl")

    with torch.inference_mode():
        # Initialize CPUInfer with worker config
        worker_config = kt_kernel_ext.WorkerPoolConfig()
        worker_config.subpool_count = worker_config_dict["subpool_count"]
        worker_config.subpool_numa_map = worker_config_dict["subpool_numa_map"]
        worker_config.subpool_thread_count = worker_config_dict["subpool_thread_count"]
        CPUInfer = kt_kernel_ext.CPUInfer(worker_config)

        # Initialize MOE layers
        moes = []
        for layer_index in range(layer_num):
            gate_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            up_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            down_proj = (
                torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.pool = CPUInfer.backend_

            moe = kt_kernel_ext.moe.AMXInt4_MOE(config)
            CPUInfer.submit(moe.load_weights_task())
            CPUInfer.sync()
            moes.append(moe)

        # Generate test data
        gen_iter = 3000
        expert_ids = (
            torch.rand(gen_iter * qlen, expert_num, device="cpu")
            .argsort(dim=-1)[:, :num_experts_per_tok]
            .reshape(gen_iter, qlen * num_experts_per_tok)
            .to("cpu")
            .contiguous()
        )
        weights = (
            torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous()
        )
        input_tensor = (
            torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        output_tensor = (
            torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        bsz_tensor = torch.tensor([qlen], device="cpu")

        # Warm-up iterations
        print(f"Running warm-up for {warm_up_iter} iterations...")
        for i in tqdm(range(warm_up_iter), desc="Warm-up"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

        # Test iterations
        print(f"Running test for {test_iter} iterations...")
        start = time.perf_counter()
        for i in tqdm(range(test_iter), desc="Testing"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start

        # Calculate performance metrics
        time_per_iter_us = total_time / test_iter * 1e6
        bandwidth = (
            hidden_size
            * intermediate_size
            * 3
            * num_experts_per_tok
            * (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
            * bytes_per_elem
            * test_iter
            / total_time
            / 1e9
        )  # GB/s
        flops = (
            hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
        )  # TFLOPS

        print("Quant mode: ", quant_mode)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", time_per_iter_us)
        print("Bandwidth: ", bandwidth, "GB/s")
        print("Flops: ", flops, "TFLOPS")

        # Record results
        result = {
            "quant_mode": quant_mode,
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": flops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "expert_num": expert_num,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "max_len": max_len,
                "num_experts_per_tok": num_experts_per_tok,
                "layer_num": layer_num,
                "qlen": qlen,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "CPUInfer_parameter": CPUINFER_PARAM,
            },
        }
        result.update(get_git_commit())
        result.update(get_system_info())
        record_results(result, json_path)

        print(f"Results saved to {json_path}")


def run_all_tests():
    """Run all tests in this file (for standalone execution)."""
    if not HAS_DEPS:
        print(f"⚠ Dependencies not available: {import_error}")
        print("Skipping AMX MOE INT4 benchmark tests")
        return

    try:
        print("Running AMX MOE INT4 benchmark test...")
        test_moe_amx_int4_benchmark()
        print("✓ AMX MOE INT4 benchmark test passed")
        print("\n✓ All tests passed!")
    except Exception as e:
        print(f"\n✗ Test failed: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    run_all_tests()


================================================
FILE: kt-kernel/test/per_commit/test_moe_amx_bench_int4_1.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""AMX MOE INT4 benchmark tests for KT-Kernel.

Benchmarks performance (bandwidth and FLOPS) of AMX-accelerated INT4 MOE operations.
"""

import os
import sys
import time
import json
import subprocess
import platform
import pytest

# Add parent directory to path for CI registration
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ci.ci_register import register_cpu_ci

# Register this test for CPU CI with estimated runtime of 300 seconds
register_cpu_ci(est_time=300, suite="default")

# Check if dependencies are available
try:
    import torch
    import kt_kernel  # Import kt_kernel first to register kt_kernel_ext

    kt_kernel_ext = kt_kernel.kt_kernel_ext  # Access the extension module
    from tqdm import tqdm

    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    import_error = str(e)

# Test parameters (from original bench_moe_amx.py)
expert_num = 16
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
layer_num = 2
qlen = 1024
warm_up_iter = 1000
test_iter = 2000

# Worker configuration
worker_config_dict = {
    "subpool_count": 2,
    "subpool_numa_map": [0, 1],
    "subpool_thread_count": [30, 30],
}
CPUINFER_PARAM = 60


def get_git_commit():
    """Get current git commit information."""
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg

        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        if dirty_output:
            result["dirty"] = True
            result["dirty_files"] = dirty_output.splitlines()
        else:
            result["dirty"] = False
    except Exception as e:
        result["commit"] = None
        result["commit_message"] = None
        result["dirty"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    """Get system information including CPU model, memory, cores, and sockets."""
    info = {}
    uname = platform.uname()
    info["system_name"] = uname.system
    info["node_name"] = uname.node

    # Get CPU model (Linux only)
    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception as e:
            cpu_model = f"Error: {e}"
    info["cpu_model"] = cpu_model

    # Get memory size in GB (Linux only)
    mem_total_gb = None
    if os.path.exists("/proc/meminfo"):
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal" in line:
                        mem_kb = float(line.split(":", 1)[1].split()[0])
                        mem_total_gb = round(mem_kb / (1024 * 1024), 2)
                        break
        except Exception as e:
            mem_total_gb = f"Error: {e}"
    info["memory_size_GB"] = mem_total_gb

    # Get CPU core count
    info["cpu_core_count"] = os.cpu_count()

    # Get socket count
    sockets = set()
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "physical id" in line:
                        sockets.add(line.split(":", 1)[1].strip())
        except Exception as e:
            sockets = set()
    info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1

    return info


def record_results(result, filename):
    """Append results to JSONL file."""
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


@pytest.mark.cpu
def test_moe_amx_int4_1_benchmark():
    """Benchmark AMX INT4 MOE performance."""
    if not HAS_DEPS:
        pytest.skip(f"Dependencies not available: {import_error}")

    quant_mode = "int4"
    bytes_per_elem = 0.5

    # Setup output file
    script_dir = os.path.dirname(os.path.abspath(__file__))
    json_path = os.path.join(script_dir, "bench_moe_amx_int4_1.jsonl")

    with torch.inference_mode():
        # Initialize CPUInfer with worker config
        worker_config = kt_kernel_ext.WorkerPoolConfig()
        worker_config.subpool_count = worker_config_dict["subpool_count"]
        worker_config.subpool_numa_map = worker_config_dict["subpool_numa_map"]
        worker_config.subpool_thread_count = worker_config_dict["subpool_thread_count"]
        CPUInfer = kt_kernel_ext.CPUInfer(worker_config)

        # Initialize MOE layers
        moes = []
        for layer_index in range(layer_num):
            gate_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            up_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            down_proj = (
                torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.pool = CPUInfer.backend_

            moe = kt_kernel_ext.moe.AMXInt4_MOE(config)
            CPUInfer.submit(moe.load_weights_task())
            CPUInfer.sync()
            moes.append(moe)

        # Generate test data
        gen_iter = 3000
        expert_ids = (
            torch.rand(gen_iter * qlen, expert_num, device="cpu")
            .argsort(dim=-1)[:, :num_experts_per_tok]
            .reshape(gen_iter, qlen * num_experts_per_tok)
            .to("cpu")
            .contiguous()
        )
        weights = (
            torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous()
        )
        input_tensor = (
            torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        output_tensor = (
            torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        bsz_tensor = torch.tensor([qlen], device="cpu")

        # Warm-up iterations
        print(f"Running warm-up for {warm_up_iter} iterations...")
        for i in tqdm(range(warm_up_iter), desc="Warm-up"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

        # Test iterations
        print(f"Running test for {test_iter} iterations...")
        start = time.perf_counter()
        for i in tqdm(range(test_iter), desc="Testing"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start

        # Calculate performance metrics
        time_per_iter_us = total_time / test_iter * 1e6
        bandwidth = (
            hidden_size
            * intermediate_size
            * 3
            * num_experts_per_tok
            * (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
            * bytes_per_elem
            * test_iter
            / total_time
            / 1e9
        )  # GB/s
        flops = (
            hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
        )  # TFLOPS

        print("Quant mode: ", quant_mode)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", time_per_iter_us)
        print("Bandwidth: ", bandwidth, "GB/s")
        print("Flops: ", flops, "TFLOPS")

        # Record results
        result = {
            "quant_mode": quant_mode,
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": flops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "expert_num": expert_num,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "max_len": max_len,
                "num_experts_per_tok": num_experts_per_tok,
                "layer_num": layer_num,
                "qlen": qlen,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "CPUInfer_parameter": CPUINFER_PARAM,
            },
        }
        result.update(get_git_commit())
        result.update(get_system_info())
        record_results(result, json_path)

        print(f"Results saved to {json_path}")


def run_all_tests():
    """Run all tests in this file (for standalone execution)."""
    if not HAS_DEPS:
        print(f"Dependencies not available: {import_error}")
        print("Skipping AMX MOE INT4 benchmark tests")
        return

    try:
        print("Running AMX MOE INT4 benchmark test...")
        test_moe_amx_int4_1_benchmark()
        print("AMX MOE INT4 benchmark test passed")
        print("\nAll tests passed!")
    except Exception as e:
        print(f"\nTest failed: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    run_all_tests()


================================================
FILE: kt-kernel/test/per_commit/test_moe_amx_bench_int4_1k.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""AMX MOE INT4 1K Group benchmark tests for KT-Kernel.

Benchmarks performance (bandwidth and FLOPS) of AMX-accelerated INT4 MOE operations
with 1K group quantization (AMXInt4_1KGroup_MOE).
"""

import os
import sys
import time
import json
import subprocess
import platform
import pytest

# Add parent directory to path for CI registration
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ci.ci_register import register_cpu_ci

# Register this test for CPU CI with estimated runtime of 300 seconds
register_cpu_ci(est_time=300, suite="default")

# Check if dependencies are available
try:
    import torch
    import kt_kernel  # Import kt_kernel first to register kt_kernel_ext

    kt_kernel_ext = kt_kernel.kt_kernel_ext  # Access the extension module
    from tqdm import tqdm

    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    import_error = str(e)

# Test parameters (from bench_moe_amx_k.py)
expert_num = 16
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
layer_num = 2
qlen = 1024
warm_up_iter = 1000
test_iter = 2000
k_group_size = 128

# Worker configuration
worker_config_dict = {
    "subpool_count": 2,
    "subpool_numa_map": [0, 1],
    "subpool_thread_count": [30, 30],
}
CPUINFER_PARAM = 60


def get_git_commit():
    """Get current git commit information."""
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg

        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        if dirty_output:
            result["dirty"] = True
            result["dirty_files"] = dirty_output.splitlines()
        else:
            result["dirty"] = False
    except Exception as e:
        result["commit"] = None
        result["commit_message"] = None
        result["dirty"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    """Get system information including CPU model, memory, cores, and sockets."""
    info = {}
    uname = platform.uname()
    info["system_name"] = uname.system
    info["node_name"] = uname.node

    # Get CPU model (Linux only)
    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception as e:
            cpu_model = f"Error: {e}"
    info["cpu_model"] = cpu_model

    # Get memory size in GB (Linux only)
    mem_total_gb = None
    if os.path.exists("/proc/meminfo"):
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal" in line:
                        mem_kb = float(line.split(":", 1)[1].split()[0])
                        mem_total_gb = round(mem_kb / (1024 * 1024), 2)
                        break
        except Exception as e:
            mem_total_gb = f"Error: {e}"
    info["memory_size_GB"] = mem_total_gb

    # Get CPU core count
    info["cpu_core_count"] = os.cpu_count()

    # Get socket count
    sockets = set()
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "physical id" in line:
                        sockets.add(line.split(":", 1)[1].strip())
        except Exception as e:
            sockets = set()
    info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1

    return info


def record_results(result, filename):
    """Append results to JSONL file."""
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


@pytest.mark.cpu
def test_moe_amx_int4_1k_benchmark():
    """Benchmark AMX INT4 1K Group MOE performance."""
    if not HAS_DEPS:
        pytest.skip(f"Dependencies not available: {import_error}")

    quant_mode = "int4_1k"
    bytes_per_elem = 0.5

    # Setup output file
    script_dir = os.path.dirname(os.path.abspath(__file__))
    json_path = os.path.join(script_dir, "bench_moe_amx_int4_1k.jsonl")

    with torch.inference_mode():
        # Initialize CPUInfer with worker config
        worker_config = kt_kernel_ext.WorkerPoolConfig()
        worker_config.subpool_count = worker_config_dict["subpool_count"]
        worker_config.subpool_numa_map = worker_config_dict["subpool_numa_map"]
        worker_config.subpool_thread_count = worker_config_dict["subpool_thread_count"]
        CPUInfer = kt_kernel_ext.CPUInfer(worker_config)

        # Physical to logical map for weight loading
        physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()

        # Initialize MOE layers
        moes = []
        for layer_index in range(layer_num):
            gate_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            up_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            down_proj = (
                torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.pool = CPUInfer.backend_

            # Configure quantization for INT4 1K Group
            config.quant_config.bits = 4
            config.quant_config.group_size = k_group_size
            config.quant_config.zero_point = True
            config.gate_scale = 0

            moe = kt_kernel_ext.moe.AMXInt4_1KGroup_MOE(config)
            CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
            CPUInfer.sync()
            moes.append(moe)

        # Generate test data
        gen_iter = 3000
        expert_ids = (
            torch.rand(gen_iter * qlen, expert_num, device="cpu")
            .argsort(dim=-1)[:, :num_experts_per_tok]
            .reshape(gen_iter, qlen * num_experts_per_tok)
            .to("cpu")
            .contiguous()
        )
        weights = (
            torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous()
        )
        input_tensor = (
            torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        output_tensor = (
            torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        bsz_tensor = torch.tensor([qlen], device="cpu")

        # Warm-up iterations
        print(f"Running warm-up for {warm_up_iter} iterations...")
        for i in tqdm(range(warm_up_iter), desc="Warm-up"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

        # Test iterations
        print(f"Running test for {test_iter} iterations...")
        start = time.perf_counter()
        for i in tqdm(range(test_iter), desc="Testing"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start

        # Calculate performance metrics
        time_per_iter_us = total_time / test_iter * 1e6
        bandwidth = (
            hidden_size
            * intermediate_size
            * 3
            * num_experts_per_tok
            * (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
            * bytes_per_elem
            * test_iter
            / total_time
            / 1e9
        )  # GB/s
        flops = (
            hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
        )  # TFLOPS

        print("Quant mode: ", quant_mode)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", time_per_iter_us)
        print("Bandwidth: ", bandwidth, "GB/s")
        print("Flops: ", flops, "TFLOPS")

        # Record results
        result = {
            "quant_mode": quant_mode,
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": flops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "expert_num": expert_num,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "max_len": max_len,
                "num_experts_per_tok": num_experts_per_tok,
                "layer_num": layer_num,
                "qlen": qlen,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "CPUInfer_parameter": CPUINFER_PARAM,
                "k_group_size": k_group_size,
            },
        }
        result.update(get_git_commit())
        result.update(get_system_info())
        record_results(result, json_path)

        print(f"Results saved to {json_path}")


def run_all_tests():
    """Run all tests in this file (for standalone execution)."""
    if not HAS_DEPS:
        print(f"Dependencies not available: {import_error}")
        print("Skipping AMX MOE INT4 1K Group benchmark tests")
        return

    try:
        print("Running AMX MOE INT4 1K Group benchmark test...")
        test_moe_amx_int4_1k_benchmark()
        print("AMX MOE INT4 1K Group benchmark test passed")
        print("\nAll tests passed!")
    except Exception as e:
        print(f"\nTest failed: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    run_all_tests()


================================================
FILE: kt-kernel/test/per_commit/test_moe_amx_bench_int8.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""AMX MOE INT8 benchmark tests for KT-Kernel.

Benchmarks performance (bandwidth and FLOPS) of AMX-accelerated INT8 MOE operations.
"""

import os
import sys
import time
import json
import subprocess
import platform
import pytest

# Add parent directory to path for CI registration
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ci.ci_register import register_cpu_ci

# Register this test for CPU CI with estimated runtime of 300 seconds
register_cpu_ci(est_time=300, suite="default")

# Check if dependencies are available
try:
    import torch
    import kt_kernel  # Import kt_kernel first to register kt_kernel_ext

    kt_kernel_ext = kt_kernel.kt_kernel_ext  # Access the extension module
    from tqdm import tqdm

    HAS_DEPS = True
except ImportError as e:
    HAS_DEPS = False
    import_error = str(e)

# Test parameters (from original bench_moe_amx.py)
expert_num = 128
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 0
layer_num = 2
qlen = 1
warm_up_iter = 1000
test_iter = 2000

# Worker configuration
worker_config_dict = {
    "subpool_count": 2,
    "subpool_numa_map": [0, 1],
    "subpool_thread_count": [30, 30],
}
CPUINFER_PARAM = 60


def get_git_commit():
    """Get current git commit information."""
    result = {}
    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
        commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
        result["commit"] = commit
        result["commit_message"] = commit_msg

        dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
        if dirty_output:
            result["dirty"] = True
            result["dirty_files"] = dirty_output.splitlines()
        else:
            result["dirty"] = False
    except Exception as e:
        result["commit"] = None
        result["commit_message"] = None
        result["dirty"] = None
        result["error"] = str(e)
    return result


def get_system_info():
    """Get system information including CPU model, memory, cores, and sockets."""
    info = {}
    uname = platform.uname()
    info["system_name"] = uname.system
    info["node_name"] = uname.node

    # Get CPU model (Linux only)
    cpu_model = None
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_model = line.split(":", 1)[1].strip()
                        break
        except Exception as e:
            cpu_model = f"Error: {e}"
    info["cpu_model"] = cpu_model

    # Get memory size in GB (Linux only)
    mem_total_gb = None
    if os.path.exists("/proc/meminfo"):
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal" in line:
                        mem_kb = float(line.split(":", 1)[1].split()[0])
                        mem_total_gb = round(mem_kb / (1024 * 1024), 2)
                        break
        except Exception as e:
            mem_total_gb = f"Error: {e}"
    info["memory_size_GB"] = mem_total_gb

    # Get CPU core count
    info["cpu_core_count"] = os.cpu_count()

    # Get socket count
    sockets = set()
    if os.path.exists("/proc/cpuinfo"):
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "physical id" in line:
                        sockets.add(line.split(":", 1)[1].strip())
        except Exception as e:
            sockets = set()
    info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1

    return info


def record_results(result, filename):
    """Append results to JSONL file."""
    with open(filename, "a") as f:
        f.write(json.dumps(result) + "\n")


@pytest.mark.cpu
def test_moe_amx_int8_benchmark():
    """Benchmark AMX INT8 MOE performance."""
    if not HAS_DEPS:
        pytest.skip(f"Dependencies not available: {import_error}")

    quant_mode = "int8"
    bytes_per_elem = 1.0

    # Setup output file
    script_dir = os.path.dirname(os.path.abspath(__file__))
    json_path = os.path.join(script_dir, "bench_moe_amx_int8.jsonl")

    with torch.inference_mode():
        # Initialize CPUInfer with worker config
        worker_config = kt_kernel_ext.WorkerPoolConfig()
        worker_config.subpool_count = worker_config_dict["subpool_count"]
        worker_config.subpool_numa_map = worker_config_dict["subpool_numa_map"]
        worker_config.subpool_thread_count = worker_config_dict["subpool_thread_count"]
        CPUInfer = kt_kernel_ext.CPUInfer(worker_config)

        # Initialize MOE layers
        moes = []
        for layer_index in range(layer_num):
            gate_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            up_proj = (
                torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            down_proj = (
                torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
                .to("cpu")
                .contiguous()
            )
            config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
            config.max_len = max_len
            config.gate_proj = gate_proj.data_ptr()
            config.up_proj = up_proj.data_ptr()
            config.down_proj = down_proj.data_ptr()
            config.pool = CPUInfer.backend_

            moe = kt_kernel_ext.moe.AMXInt8_MOE(config)
            CPUInfer.submit(moe.load_weights_task())
            CPUInfer.sync()
            moes.append(moe)

        # Generate test data
        gen_iter = 3000
        expert_ids = (
            torch.rand(gen_iter * qlen, expert_num, device="cpu")
            .argsort(dim=-1)[:, :num_experts_per_tok]
            .reshape(gen_iter, qlen * num_experts_per_tok)
            .to("cpu")
            .contiguous()
        )
        weights = (
            torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous()
        )
        input_tensor = (
            torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        output_tensor = (
            torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        )
        bsz_tensor = torch.tensor([qlen], device="cpu")

        # Warm-up iterations
        print(f"Running warm-up for {warm_up_iter} iterations...")
        for i in tqdm(range(warm_up_iter), desc="Warm-up"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()

        # Test iterations
        print(f"Running test for {test_iter} iterations...")
        start = time.perf_counter()
        for i in tqdm(range(test_iter), desc="Testing"):
            CPUInfer.submit(
                moes[i % layer_num].forward_task(
                    bsz_tensor.data_ptr(),
                    num_experts_per_tok,
                    expert_ids[i % gen_iter].data_ptr(),
                    weights[i % gen_iter].data_ptr(),
                    input_tensor[i % layer_num].data_ptr(),
                    output_tensor[i % layer_num].data_ptr(),
                    False,
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start

        # Calculate performance metrics
        time_per_iter_us = total_time / test_iter * 1e6
        bandwidth = (
            hidden_size
            * intermediate_size
            * 3
            * num_experts_per_tok
            * (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
            * bytes_per_elem
            * test_iter
            / total_time
            / 1e9
        )  # GB/s
        flops = (
            hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
        )  # TFLOPS

        print("Quant mode: ", quant_mode)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", time_per_iter_us)
        print("Bandwidth: ", bandwidth, "GB/s")
        print("Flops: ", flops, "TFLOPS")

        # Record results
        result = {
            "quant_mode": quant_mode,
            "total_time_seconds": total_time,
            "iterations": test_iter,
            "time_per_iteration_us": time_per_iter_us,
            "bandwidth_GBs": bandwidth,
            "flops_TFLOPS": flops,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "test_parameters": {
                "expert_num": expert_num,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "max_len": max_len,
                "num_experts_per_tok": num_experts_per_tok,
                "layer_num": layer_num,
                "qlen": qlen,
                "warm_up_iter": warm_up_iter,
                "test_iter": test_iter,
                "CPUInfer_parameter": CPUINFER_PARAM,
            },
        }
        result.update(get_git_commit())
        result.update(get_system_info())
        record_results(result, json_path)

        print(f"Results saved to {json_path}")


def run_all_tests():
    """Run all tests in this file (for standalone execution)."""
    if not HAS_DEPS:
        print(f"⚠ Dependencies not available: {import_error}")
        print("Skipping AMX MOE INT8 benchmark tests")
        return

    try:
        print("Running AMX MOE INT8 benchmark test...")
        test_moe_amx_int8_benchmark()
        print("✓ AMX MOE INT8 benchmark test passed")
        print("\n✓ All tests passed!")
    except Exception as e:
        print(f"\n✗ Test failed: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    run_all_tests()


================================================
FILE: kt-kernel/test/run_suite.py
================================================
import argparse
import glob
import sys
from typing import List

from ci.ci_register import HWBackend, CIRegistry, collect_tests
from ci.ci_utils import TestFile, run_unittest_files

HW_MAPPING = {
    "cpu": HWBackend.CPU,
    "cuda": HWBackend.CUDA,
    "amd": HWBackend.AMD,
}

LABEL_MAPPING = {
    HWBackend.CPU: ["default"],
    HWBackend.AMD: ["stage-a-test-1"],
    HWBackend.CUDA: ["stage-a-test-1"],
}


def _filter_tests(
    ci_tests: List[CIRegistry], hw: HWBackend, suite: str
) -> List[CIRegistry]:
    ci_tests = [t for t in ci_tests if t.backend == hw]
    ret = []
    for t in ci_tests:
        assert t.suite in LABEL_MAPPING[hw], f"Unknown stage {t.suite} for backend {hw}"
        if t.suite == suite:
            ret.append(t)
    return ret


def run_per_commit(hw: HWBackend, suite: str):
    files = glob.glob("per_commit/**/*.py", recursive=True)
    # Exclude __init__.py files as they don't contain test registrations
    files = [f for f in files if not f.endswith("__init__.py")]
    ci_tests = _filter_tests(collect_tests(files), hw, suite)
    test_files = [TestFile(t.filename, t.est_time) for t in ci_tests]

    return run_unittest_files(
        test_files,
        timeout_per_file=1200,
        continue_on_error=False,
    )


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--hw",
        type=str,
        choices=["cpu", "cuda", "amd"],
        required=True,
        help="Hardware backend to run tests on.",
    )
    parser.add_argument(
        "--suite",
        type=str,
        required=True,
        help="Test suite to run.",
    )
    args = parser.parse_args()
    hw = HW_MAPPING[args.hw]
    exit_code = run_per_commit(hw, args.suite)
    # run_unittest_files returns 0 for success, -1 for failure
    # Convert to standard exit codes: 0 for success, 1 for failure
    sys.exit(0 if exit_code == 0 else 1)


if __name__ == "__main__":
    main()


================================================
FILE: kt-kernel/test/test_generate_gpu_experts_masks.py
================================================
"""Test for generate_gpu_experts_masks function."""

import sys
import os

# Add python directory to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "python"))

import torch
import time
from experts_base import generate_gpu_experts_masks


def test_basic():
    """Test basic functionality."""
    print("=" * 60)
    print("Test 1: Basic functionality")
    print("=" * 60)

    activation_freq = torch.tensor([
        [0.1, 0.5, 0.3, 0.8],  # layer 0
        [0.2, 0.4, 0.9, 0.1],  # layer 1
    ])

    print(f"Input activation_freq:\n{activation_freq}")
    print(f"num_gpu_experts: 3")

    masks = generate_gpu_experts_masks(activation_freq, num_gpu_experts=3)

    print(f"Output masks:\n{masks}")
    print(f"Output dtype: {masks.dtype}, device: {masks.device}")

    # Verify: top 3 should be (1,2)=0.9, (0,3)=0.8, (0,1)=0.5
    expected_gpu_count = masks.sum().item()
    print(f"Total GPU experts: {expected_gpu_count}")

    # Check the top 3 positions
    assert masks[1, 2] == True, "layer1-expert2 (0.9) should be on GPU"
    assert masks[0, 3] == True, "layer0-expert3 (0.8) should be on GPU"
    assert masks[0, 1] == True, "layer0-expert1 (0.5) should be on GPU"
    assert expected_gpu_count == 3, f"Expected 3 GPU experts, got {expected_gpu_count}"

    print("PASSED\n")


def test_edge_cases():
    """Test edge cases."""
    print("=" * 60)
    print("Test 2: Edge cases")
    print("=" * 60)

    activation_freq = torch.tensor([
        [0.1, 0.5, 0.3, 0.8],
        [0.2, 0.4, 0.9, 0.1],
    ])

    # Test num_gpu_experts = 0
    masks = generate_gpu_experts_masks(activation_freq, num_gpu_experts=0)
    assert masks.sum().item() == 0, "num_gpu_experts=0 should have no GPU experts"
    print(f"num_gpu_experts=0: {masks.sum().item()} GPU experts - PASSED")

    # Test num_gpu_experts = total experts
    masks = generate_gpu_experts_masks(activation_freq, num_gpu_experts=8)
    assert masks.sum().item() == 8, "num_gpu_experts=8 should have all experts on GPU"
    print(f"num_gpu_experts=8 (all): {masks.sum().item()} GPU experts - PASSED")

    # Test num_gpu_experts > total experts (should clamp)
    masks = generate_gpu_experts_masks(activation_freq, num_gpu_experts=100)
    assert masks.sum().item() == 8, "num_gpu_experts=100 should be clamped to 8"
    print(f"num_gpu_experts=100 (clamped): {masks.sum().item()} GPU experts - PASSED")

    # Test negative num_gpu_experts (should clamp to 0)
    masks = generate_gpu_experts_masks(activation_freq, num_gpu_experts=-5)
    assert masks.sum().item() == 0, "num_gpu_experts=-5 should be clamped to 0"
    print(f"num_gpu_experts=-5 (clamped): {masks.sum().item()} GPU experts - PASSED")

    print("All edge cases PASSED\n")


def test_performance():
    """Test performance with realistic sizes."""
    print("=" * 60)
    print("Test 3: Performance")
    print("=" * 60)

    # DeepSeek-V3 like: 61 layers, 256 experts
    num_layers = 61
    num_experts = 256

    # Generate random activation frequencies
    activation_freq = torch.rand(num_layers, num_experts)

    # Test with different num_gpu_experts
    test_cases = [0, 100, 500, 1000, 2000, 5000, num_layers * num_experts]

    print(f"Shape: ({num_layers}, {num_experts}) = {num_layers * num_experts} total experts\n")

    for num_gpu in test_cases:
        # Warmup
        _ = generate_gpu_experts_masks(activation_freq, num_gpu_experts=num_gpu)

        # Measure time
        num_runs = 100
        start = time.perf_counter()
        for _ in range(num_runs):
            masks = generate_gpu_experts_masks(activation_freq, num_gpu_experts=num_gpu)
        end = time.perf_counter()

        avg_time_us = (end - start) / num_runs * 1e6
        actual_gpu = masks.sum().item()

        print(f"num_gpu_experts={num_gpu:5d} -> actual={actual_gpu:5d}, time={avg_time_us:8.2f} us")

    print("\nPerformance test PASSED\n")


def test_output_properties():
    """Test output tensor properties."""
    print("=" * 60)
    print("Test 4: Output properties")
    print("=" * 60)

    activation_freq = torch.rand(10, 64)
    masks = generate_gpu_experts_masks(activation_freq, num_gpu_experts=50)

    print(f"Shape: {masks.shape}")
    print(f"Dtype: {masks.dtype}")
    print(f"Device: {masks.device}")
    print(f"Is contiguous: {masks.is_contiguous()}")

    assert masks.shape == (10, 64), f"Expected shape (10, 64), got {masks.shape}"
    assert masks.dtype == torch.bool, f"Expected dtype bool, got {masks.dtype}"
    assert str(masks.device) == "cpu", f"Expected device cpu, got {masks.device}"

    print("All properties PASSED\n")


def test_determinism():
    """Test that results are deterministic."""
    print("=" * 60)
    print("Test 5: Determinism")
    print("=" * 60)

    activation_freq = torch.rand(20, 128)

    masks1 = generate_gpu_experts_masks(activation_freq, num_gpu_experts=100)
    masks2 = generate_gpu_experts_masks(activation_freq, num_gpu_experts=100)

    assert torch.equal(masks1, masks2), "Results should be deterministic"
    print("Determinism PASSED\n")


if __name__ == "__main__":
    test_basic()
    test_edge_cases()
    test_output_properties()
    test_determinism()
    test_performance()

    print("=" * 60)
    print("All tests PASSED!")
    print("=" * 60)


================================================
FILE: kt-sft/.flake8
================================================
[flake8]
max-line-length = 120
extend-select = B950
extend-ignore = E203,E501,E701, B001,B006,B007,B008,B009,B010,B011,B016,B028,B031,B950,E265,E266,E401,E402,E711,E712,E713,E721,E722,E731,F401,F403,F405,F541,F811,F821,F841,W391

================================================
FILE: kt-sft/.gitignore
================================================
__pycache__
build
.vscode
*.so
*.cache
server.db
logs
node_modules
*.nsys-rep
.vs/
*pycache*
*build/
*/third_party/*
.DS_Store
compile_commands.json
*.egg-info*
*dist/
ktransformers/server/local_store/
ktransformers/server_test1.db
*.patch
img/
tmp*.txt
tmp*.py
test.txt
book
ktransformers/tests/chat_txt.txt
mmlu_result*
ktransformers/ktransformers_ext/cuda_musa/
test_prompt.txt
csrc/demo

.vscode/

*__pycache__*
*.py[cod]
*$py.class
.pytest_cache/

GGUF-DeepSeek-V2-Lite-Chat
DeepSeek-V2-Lite-Chat
ktransformers/sft/adapter
tmp
graphviz/
compute_graph*
graphviz*
third_party/
test_adapter/demo_*
*.whl
*.svg
*_graph
tmp_package.txt
logs/

*.vscode/

__pycache__/
*.py[cod]
*$py.class
.pytest_cache/

# MakeFiles for kt_ext
build/
ktransformers/ktransformers_ext/bin
ktransformers/ktransformers_ext/CMakeFiles
ktransformers/ktransformers_ext/cmake_install.cmake
ktransformers/ktransformers_ext/CMakeCache.txt
ktransformers/ktransformers_ext/compile_commands.json
ktransformers/ktransformers_ext/Makefile
*.egg-info*
*.so

*.txt
*.pt

debug/*

test_adapter/ESC_inst_all.json

!CMakeLists.txt
!requirements-sft.txt

*-test*.yaml

duipai_pure_tf
data/dataset_info.json

test_adapter/*.json

.venv*

================================================
FILE: kt-sft/.gitmodules
================================================
[submodule "third_party/llama.cpp"]
	path = third_party/llama.cpp
	url = https://github.com/ggerganov/llama.cpp.git
[submodule "third_party/pybind11"]
	path = third_party/pybind11
	url = https://github.com/pybind/pybind11.git
[submodule "third_party/spdlog"]
	path = third_party/spdlog
	url = https://github.com/gabime/spdlog.git
[submodule "third_party/custom_flashinfer"]
	path = third_party/custom_flashinfer
	url = https://github.com/kvcache-ai/custom_flashinfer.git
	branch = fix-precision-mla-merge-main
[submodule "third_party/xxHash"]
	path = third_party/xxHash
	url = https://github.com/Cyan4973/xxHash.git
[submodule "third_party/prometheus-cpp"]
	path = third_party/prometheus-cpp
	url = https://github.com/jupp0r/prometheus-cpp.git


================================================
FILE: kt-sft/.pylintrc
================================================
[MASTER]
extension-pkg-whitelist=pydantic
max-line-length=120

[MESSAGES CONTROL]
disable=missing-function-docstring

================================================
FILE: kt-sft/Dockerfile
================================================
FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server


ARG CPU_INSTRUCT=NATIVE

# 设置工作目录和 CUDA 路径
WORKDIR /workspace
ENV CUDA_HOME=/usr/local/cuda


# 安装依赖
RUN apt update -y
RUN apt install -y --no-install-recommends \
    libtbb-dev \
    libssl-dev \
    libcurl4-openssl-dev \
    libaio1 \
    libaio-dev \
    libfmt-dev \
    libgflags-dev \
    zlib1g-dev \
    patchelf \
    git \
    wget \
    vim \
    gcc \
    g++ \
    cmake
# 拷贝代码
RUN git clone https://github.com/kvcache-ai/ktransformers.git 
# 清理 apt 缓存
RUN rm -rf /var/lib/apt/lists/*

# 进入项目目录
WORKDIR /workspace/ktransformers
# 初始化子模块
RUN git submodule update --init --recursive

# 升级 pip
RUN pip install --upgrade pip

# 安装构建依赖
RUN pip install ninja pyproject numpy cpufeature aiohttp zmq openai

# 安装 flash-attn（提前装可以避免后续某些编译依赖出错）
RUN pip install flash-attn

# 安装 ktransformers 本体（含编译）
RUN CPU_INSTRUCT=${CPU_INSTRUCT} \
    USE_BALANCE_SERVE=1 \
    KTRANSFORMERS_FORCE_BUILD=TRUE \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" \
    pip install . --no-build-isolation --verbose

RUN pip install third_party/custom_flashinfer/
# 清理 pip 缓存
RUN pip cache purge

# 拷贝 C++ 运行时库
RUN cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/

# 保持容器运行（调试用）
ENTRYPOINT ["tail", "-f", "/dev/null"]

================================================
FILE: kt-sft/Dockerfile.xpu
================================================
# Base image
FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04

ARG http_proxy
ARG https_proxy

ENV DEBIAN_FRONTEND=noninteractive
ENV CONDA_DIR=/opt/conda

# Install dependencies
RUN apt-get update && apt-get install -y \
    wget \
    curl \
    bash \
    git \
    vim \
    ca-certificates \
    binutils \
    cmake \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Install Miniforge
RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O /tmp/miniforge.sh && \
    bash /tmp/miniforge.sh -b -p $CONDA_DIR && \
    rm /tmp/miniforge.sh && \
    $CONDA_DIR/bin/conda clean -afy

# Add conda to PATH
ENV PATH=$CONDA_DIR/bin:$PATH

RUN bash -c "\
    source /opt/conda/etc/profile.d/conda.sh && \
    conda create --name ktransformers python=3.11 -y && \
    conda activate ktransformers && \
    conda env list && \
    conda install -c conda-forge libstdcxx-ng -y && \
    strings \$(find /opt/conda/envs/ktransformers/lib -name 'libstdc++.so.6') | grep GLIBCXX | grep 3.4.32 \
"

RUN bash -c "\
    source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ktransformers && \
    pip install ipex-llm[xpu_2.6]==2.3.0b20250518 --extra-index-url https://download.pytorch.org/whl/xpu && \
    pip uninstall -y torch torchvision torchaudio && \
    pip install torch==2.7+xpu torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu && \
    pip uninstall -y intel-opencl-rt dpcpp-cpp-rt && \
    pip list \
"

# Clone and set up ktransformers repo
RUN bash -c "\
    source $CONDA_DIR/etc/profile.d/conda.sh && \
    conda activate ktransformers && \
    git clone https://github.com/kvcache-ai/ktransformers.git && \
    cd ktransformers && \
    git submodule update --init && \
    sed -i 's/torch\.xpu\.is_available()/True/g' setup.py && \
    bash install.sh --dev xpu \
"

# Init conda and prepare bashrc
RUN conda init bash && \
    echo "source $CONDA_DIR/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate ktransformers" >> ~/.bashrc

WORKDIR /ktransformers/
CMD ["bash"]


================================================
FILE: kt-sft/LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: kt-sft/MANIFEST.in
================================================
graft third_party
graft ktransformers
graft local_chat.py
graft csrc
include LICENSE README.md
prune ktransformers/website
prune ktransformers/logs
prune ktransformers.egg-info
prune third_party/llama.cpp/models
graft ktransformers/website/dist
global-exclude __pycache__
include KTransformersOps.*.so
include cpuinfer_ext.*.so


================================================
FILE: kt-sft/Makefile
================================================
flake_find:
	cd ktransformers && flake8 | grep -Eo '[A-Z][0-9]{3}' | sort | uniq| paste -sd ',' - 
format:
	@cd ktransformers && black .
	@black setup.py
dev_install:
# clear build dirs
	rm -rf build
	rm -rf *.egg-info
	rm -rf ktransformers/ktransformers_ext/build
	rm -rf ktransformers/ktransformers_ext/cuda/build
	rm -rf ktransformers/ktransformers_ext/cuda/dist
	rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info

# install ktransformers
	echo "Installing python dependencies from requirements.txt"
	pip install -r requirements-local_chat.txt

	echo "Installing ktransformers"
	KTRANSFORMERS_FORCE_BUILD=TRUE pip install -e . -v --no-build-isolation
	echo "Installation completed successfully"
clean:
	rm -rf build
	rm -rf *.egg-info
	rm -rf ktransformers/ktransformers_ext/build
	rm -rf ktransformers/ktransformers_ext/cuda/build
	rm -rf ktransformers/ktransformers_ext/cuda/dist
	rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info	
install_numa:
	USE_NUMA=1 make dev_install
install_no_numa:
	env -u USE_NUMA make dev_install

================================================
FILE: kt-sft/README.md
================================================
- [KTransformers Fine-Tuning × LLaMA-Factory Integration – User Guide](#ktransformers-fine-tuning-x-llama-factory-integration-–-user-guide)
- [Introduction](#introduction)

- [Fine-Tuning Results (Examples)](#fine-tuning-results-examples)
  - [Stylized Dialogue (CatGirl tone)](#stylized-dialogue-catgirl-tone)
  - [Benchmarks](#benchmarks)
    - [Translational-Style dataset](#translational-style-dataset)
    - [AfriMed-QA (short answer)](#afrimed-qa-short-answer)
    - [AfriMed-QA (multiple choice)](#afrimed-qa-multiple-choice)

- [Quick to Start](#quick-to-start)
  - [Environment Setup](#environment-setup)
  - [Core Feature 1: Use KTransformers backend to fine-tune ultra-large MoE models](#core-feature-1-use-ktransformers-backend-to-fine-tune-ultra-large-moe-models)
  - [Core Feature 2: Chat with the fine-tuned model (base + LoRA adapter)](#core-feature-2-chat-with-the-fine-tuned-model-base--lora-adapter)
  - [Core Feature 3: Batch inference + metrics (base + LoRA adapter)](#core-feature-3-batch-inference--metrics-base--lora-adapter)

- [KT Fine-Tuning Speed (User-Side View)](#kt-fine-tuning-speed-user-side-view)
  - [End-to-End Performance](#end-to-end-performance)
  - [GPU/CPU Memory Footprint](#gpucpu-memory-footprint)

- [Conclusion](#conclusion)


# KTransformers Fine-Tuning × LLaMA-Factory Integration – User Guide

**MadSys Lab, KVCache-AI Team, Approaching AI, LLaMA-Factory Team**

## Introduction

From **DeepSeek-V3/R1** to **Qwen3-MoE** and **Kimi-K2**, each wave of open-sourced large models brings leaps in performance and scale. However, many researchers and developers are constrained by expensive GPUs and models with tens or even hundreds of billions of parameters, making it **hard to fine-tune very large models under limited resources**. To bridge this gap, we propose a practical approach: combining **KTransformers** with **LLaMA-Factory**. With just **2–4 RTX 4090s** and a high-memory CPU, you can fine-tune ultra-large MoE models like DeepSeek-671B.

Our goal is to give resource-constrained researchers a **local path to explore fine-tuning ultra-large models**, and also a fast way to customize smaller models (e.g., 14B/30B) for specific scenarios. We validate the setup using **stylized dialogue**, **Westernized translation tone**, and **medical Q&A** as representative tasks, showing that **personalized adaptation can be achieved within hours**.

As shown below, LLaMA-Factory is the unified orchestration/configuration layer for the whole fine-tuning workflow—handling data, training scheduling, LoRA injection, and inference interfaces. **KTransformers** acts as a pluggable high-performance backend that takes over core operators like Attention/MoE under the same training configs, enabling efficient **GPU+CPU heterogeneous cooperation**.

![image-20251011010558909](../doc/assets/image-20251011010558909.png)

Within LLaMA-Factory, we compared LoRA fine-tuning with **HuggingFace**, **Unsloth**, and **KTransformers** backends. KTransformers is the **only workable 4090-class solution** for ultra-large MoE models (e.g., 671B) and also delivers higher throughput and lower GPU memory on smaller MoE models (e.g., DeepSeek-14B).

| Under LoRA (BF16) + [NekoQA-10K stylized dialogue](https://github.com/mindsRiverPonder/LLM-practice) | HuggingFace Backend                      | Unsloth Backend                      | KTransformers Backend |
| ------------------------------------------------------------ | ---------------------------------------- | ------------------------------------ | --------------------- |
| [14B-DeepSeekV2-Lite] LoRA fine-tuning throughput            | 303.58 token/s                           | 455.37 token/s                       | 530.38 token/s        |
| [14B-DeepSeekV2-Lite] GPU memory                             | 32.12 GB                                 | 9.64 GB                              | 6.08 GB               |
| [671B-DeepSeekV3] LoRA fine-tuning throughput                | <font color='red'>Too Huge to run</font> | <font color='red'>NOT SUPPORT</font> | 40.35 token/s         |
| [671B-DeepSeekV3] GPU memory (sum across GPUs)               | theoretical 1400 GB †                    | <font color='red'>NOT SUPPORT</font> | 70 GB †               |

† **1400 GB** is a **theoretical** FP16 full-parameter resident footprint (not runnable). **70 GB** is the **measured peak** with KT strategy (Attention on GPU + layered MoE offload).

![按照模型划分的对比图_02](../doc/assets/image-compare_model.png)

### Fine-Tuning Results (Examples)

#### Stylized Dialogue (CatGirl tone)

Dataset: [NekoQA-10K](https://zhuanlan.zhihu.com/p/1934983798233231689). Goal: improve style consistency and recognizability.

The figure compares responses from the base vs. fine-tuned models. The fine-tuned model maintains the target tone and address terms more consistently (red boxes), validating the effectiveness of **style-transfer fine-tuning**.

![image-20251016175046882](../doc/assets/image-20251016175046882.png)

#### Benchmarks

We use:

(1) [Translational-Style-ChatLLM](https://github.com/Benson114/Translational-Style-ChatLLM), which asks for an exaggerated, Westernized translation tone—clear, stylized customization.

(2) [AfriMed-QA](https://aclanthology.org/2025.acl-long.96/) (ACL 2025), a medical dataset for African contexts with strong domain specificity, including multiple-choice and short-answer sub-tasks—well-suited for vertical fine-tuning evaluation.

The tables show metrics before vs. after LoRA fine-tuning. We observe **large improvements** across metrics, verifying fine-tuning effectiveness:

| Translational-Style dataset    | BLEU-1    | BLEU-2    | BLEU-3    | BLEU-4    | ROUGE-1   | ROUGE-2   | ROUGE-L   |
| ------------------------------ | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| V2-Lite (no LoRA)              | 20.66     | 8.33      | 4.54      | 2.89      | 22.71     | 4.52      | 19.19     |
| **KT-LoRA fine-tuned V2-Lite** | **35.41** | **22.44** | **15.42** | **11.18** | **42.03** | **18.38** | **33.10** |
| V3 base (no LoRA)              | 8.49      | 3.34      | 1.62      | 0.96      | 15.91     | 2.55      | 10.07     |
| **KT-LoRA fine-tuned V3**      | **37.02** | **23.70** | **16.21** | **11.49** | **43.43** | **18.96** | **34.54** |

| AfriMed-QA (short answer)      | BLEU-1    | BLEU-2    | BLEU-3    | BLEU-4    | ROUGE-1   | ROUGE-2   | ROUGE-L   |
| ------------------------------ | --------- | --------- | --------- | --------- | --------- | --------- | --------- |
| V2-Lite (no LoRA)              | 13.58     | 11.12     | 9.10      | 7.23      | 22.48     | 7.81      | 11.73     |
| **KT-LoRA fine-tuned V2-Lite** | **35.90** | **27.63** | **22.99** | **19.15** | **35.25** | **17.50** | **28.44** |
| V3 base (no LoRA)              | 12.75     | 10.27     | 8.05      | 5.99      | 20.33     | 5.65      | 10.11     |
| **KT-LoRA fine-tuned V3**      | **42.42** | **34.12** | **28.95** | **24.54** | **41.97** | **22.37** | **33.28** |

| AfriMed-QA (multiple choice)   | Accuracy   |
| ------------------------------ | ---------- |
| V2-Lite (no LoRA)              | 0.0645     |
| **KT-LoRA fine-tuned V2-Lite** | **0.4812** |
| V3 base (no LoRA)              | 0.5833     |
| **KT-LoRA fine-tuned V3**      | **0.7930** |

Even for ultra-large MoE models, **KTransformers-backed fine-tuning** achieves strong task performance quickly.


## Quick to Start

This section shows how to install and use **LLaMA-Factory + KTransformers** for fine-tuning and inference:

- Environment setup
- Fine-tune ultra-large MoE models with KTransformers backend
- Load the fine-tuned model (base + LoRA adapter) for chat/inference
- Batch inference and metric evaluation

### Environment Setup

According to the following example, install both the **KTransformers** and **LLaMA-Factory** environments simultaneously.
 This time, to simplify the installation process of KTransformers, we have specially packaged a wheel file to avoid local compilation.
 The detailed installation steps are as follows:
 (Note: Make sure your local **Python version**, **Torch version**, **CUDA version**, and the **KTransformers wheel filename** correspond correctly.)

```shell
# 1. Create a conda environment
conda create -n Kllama python=3.12 # choose from : [3.10, 3.11, 3.12, 3.13]
conda install -y -c conda-forge libstdcxx-ng gcc_impl_linux-64
# ATTENTION: DO NOT skip this step, even if your cuda version is not 11.8! Otherwise, you will get this error: ImportError: libcudart.so.11.0: cannot open shared object file: No such file or directory.
conda install -y -c nvidia/label/cuda-11.8.0 cuda-runtime

# 2. Install the LLaMA-Factory environment
git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
cd LLaMA-Factory
pip install -e ".[torch,metrics]" --no-build-isolation

# 3. Install the KTransformers wheel that matches your Torch and Python versions, from https://github.com/kvcache-ai/ktransformers/releases/tag/v0.4.1 (Note: The CUDA version can differ from that in the wheel filename.)
pip install ktransformers-0.4.1+cu128torch27fancy-cp312-cp312-linux_x86_64.whl

# 4. Install flash-attention, download the corresponding file based on your Python and Torch versions from: https://github.com/Dao-AILab/flash-attention/releases
pip install flash_attn-2.8.3+cu12torch2.7cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
# abi=True/False can find from below
# import torch
# print(torch._C._GLIBCXX_USE_CXX11_ABI)

# 5. (Optional) If you want to use flash_infer (otherwise it defaults to triton)
git clone https://github.com/kvcache-ai/custom_flashinfer.git
pip install custom_flashinfer/
```

**Usage tip:** In LLaMA-Factory YAML, set `use_kt: true` and pick a `kt_optimize_rule` file to have KTransformers handle the core compute. The features below show typical configs.

### Core Feature 1: Use KTransformers backend to fine-tune ultra-large MoE models

Run the command: `USE_KT=1 llamafactory-cli train examples/train_lora/deepseek3_lora_sft_kt.yaml`.

Note: You **must** provide a **BF16** model. DeepSeek-V3-671B is released in FP8 by default; convert with [DeepSeek-V3/inference/fp8_cast_bf16.py](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py).

```yaml
### model
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 8
lora_target: all

### dataset
dataset: identity
template: deepseek
cutoff_len: 2048
max_samples: 100000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: saves/Kllama_deepseekV3
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null

### ktransformers
use_kt: true # use KTransformers as LoRA sft backend
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192
```

We also support RL DPO training using the KTransformers backend now. See [DPO Tutorial](../doc/en/SFT/DPO_tutorial.md) for details.  

`kt_optimize_rule` controls **placement strategy**. See also [ktransformers/optimize_rules](https://github.com/kvcache-ai/ktransformers/tree/main/ktransformers/optimize/optimize_rules). Naming hints (`*` = wildcard):

| Pattern                                      | Meaning                                               |
| -------------------------------------------- | ----------------------------------------------------- |
| DeepSeek-V2-Lite-Chat-* / DeepSeek-V3-Chat-* | Target model variants                                 |
| *-sft-*                                      | Strategy for fine-tuning; others are for inference    |
| *-amx-*                                      | Use AMX on CPU; otherwise use **llamafile**           |
| *-multi-gpu-X*                               | Model parallel on X GPUs (X omitted → default 2 GPUs) |

Example: `DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml` = V3-Chat fine-tuning with AMX and 2-GPU model parallel.

We recommend **AMX acceleration** where available (`lscpu | grep amx`). AMX supports BF16/INT8. Example:

```yaml
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
```

Outputs go to `output_dir` in safetensors format plus adapter metadata for later loading.

![image-20251016171537997](../doc/assets/image-20251016171537997.png)

### Core Feature 2: Chat with the fine-tuned model (base + LoRA adapter)

Run the command: `llamafactory-cli chat examples/inference/deepseek3_lora_sft_kt.yaml`.

Use the safetensors adapter trained with KT for inference.

```yaml
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
adapter_name_or_path: saves/Kllama_deepseekV3
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true

use_kt: true # use KTransformers as LoRA sft backend to inference
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192
```

We also support **GGUF** adapters: for safetensors, set the **directory**; for GGUF, set the **file path** in `adapter_name_or_path`.

During loading, LLaMA-Factory maps layer names to KT’s naming. You’ll see logs like `Loaded adapter weight: XXX -> XXX`:

![image-20251016171526210](../doc/assets/image-20251016171526210.png)

### Core Feature 3: Batch inference + metrics (base + LoRA adapter)

Run the command: `API_PORT=8000 llamafactory-cli api examples/inference/deepseek3_lora_sft_kt.yaml`.
 Invoke the KT fine-tuned adapter to provide the API; the usage logic of other APIs is consistent with the native LLaMA-Factory approach.

```yaml
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
adapter_name_or_path: saves/Kllama_deepseekV3
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true

use_kt: true # use KTransformers as LoRA sft backend to inference
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192
```


## KT Fine-Tuning Speed (User-Side View)

### End-to-End Performance

**Definitions**

- `step_time`: wall-clock time for a full optimization step (tensor movement + Attention + MoE + other compute).
- `tokens_per_step = GAS × qlen`; `token/s = tokens_per_step / step_time`.

**Settings:** `GAS=16`, `qlen=512` (→ `tokens_per_step = 8192`); LoRA (`r=8, alpha=32, dropout=0.1`); **AMX** enabled; GPU: RTX 4090, CPU: Intel Xeon Platinum 8488C.

**Measured**

- **DeepSeek-V3-671B:** `step_time = 203 s` → `token/s ≈ 8192 / 203 ≈ 40.35`
- **DeepSeek-V2-Lite-14B:** `step_time = 36 s` → `token/s ≈ 8192 / 36 ≈ 227.6`

### GPU/CPU Memory Footprint

- DeepSeek-V3 (671B; 61 layers with 58 MoE): ~**70 GB** total GPU VRAM (multi-GPU), ~**1.2–1.3 TB** RAM.
- DeepSeek-V2-Lite (14B; 27 layers with 26 MoE): ~**5.5 GB** GPU VRAM, ~**30 GB** RAM.

## Conclusion

By integrating **KTransformers LoRA fine-tuning** into **LLaMA-Factory**, we provide a practical guide for efficient training and deployment of MoE LLMs. KT brings cutting-edge optimizations (DeepSeek/Qwen/Kimi support with AMX-accelerated kernels), and LoRA enables customization under very low GPU memory. LLaMA-Factory offers a friendly, unified interface.

This integration (akin to Unsloth-style speedups) means even models with tens to hundreds of billions of parameters can be fine-tuned and deployed with low latency on commodity hardware. You get **memory savings, speed-ups, and usability** together. We encourage you to try LLaMA-Factory + KT for your next MoE project and follow this guide. Feedback is welcome!


================================================
FILE: kt-sft/SECURITY.md
================================================
# Security Policy

## Supported Versions

Use this section to tell people about which versions of your project are
currently being supported with security updates.

| Version | Supported          |
| ------- | ------------------ |
| 5.1.x   | :white_check_mark: |
| 5.0.x   | :x:                |
| 4.0.x   | :white_check_mark: |
| < 4.0   | :x:                |

## Reporting a Vulnerability

Use this section to tell people how to report a vulnerability.

Tell them where to go, how often they can expect to get an update on a
reported vulnerability, what to expect if the vulnerability is accepted or
declined, etc.


================================================
FILE: kt-sft/autosetup.sh
================================================
#!/usr/bin/env bash
set -euo pipefail
shopt -s nullglob

# 允许通过环境变量覆盖
PY_LIST=${PY_LIST:-"3.13"}
TORCH_LIST=${TORCH_LIST:-"2.5.0 2.6.0 2.7.0 2.8.0 2.9.0"}
WHEELS_DIR=${WHEELS_DIR:-wheels}
FORCE=${FORCE:-0}    # FORCE=1 时强制重建
mkdir -p "$WHEELS_DIR"

# 每个 Torch 版本选择一个存在的 CUDA 索引（可按需调整）
index_for_torch_version () {
  case "$1" in
    2.3.*) echo "https://download.pytorch.org/whl/cu121" ;;
    2.4.*) echo "https://download.pytorch.org/whl/cu121" ;;
    2.5.*) echo "https://download.pytorch.org/whl/cu124" ;;
    2.6.*) echo "https://download.pytorch.org/whl/cu126" ;;
    2.7.*) echo "https://download.pytorch.org/whl/cu128" ;;
    2.8.*) echo "https://download.pytorch.org/whl/cu128" ;;  # 可换 cu129
    2.9.*) echo "https://download.pytorch.org/whl/cu128" ;;  # 可换 cu129
    *)     echo "https://download.pytorch.org/whl/cu121" ;;
  esac
}

# 检查指定“当前已激活环境”的组合是否已有产物
# 依据 wheel 命名规则中的后缀：+<backend>torch<MM> 以及 -<cp_tag>-<cp_tag>-linux_<arch>
have_wheel_for_current_env () {
  python - <<'PY'
import sys, platform, torch
from packaging.version import parse
py = f"cp{sys.version_info.major}{sys.version_info.minor}"
arch = platform.uname().machine
tver = parse(torch.__version__)
mm = f"{tver.major}{tver.minor}"
backend = ""
if torch.version.cuda:
    backend = "cu" + torch.version.cuda.replace(".", "")
elif getattr(torch.version, "hip", None):
    backend = "rocm" + torch.version.hip.replace(".", "")
else:
    backend = "cpu"  # 极少走到这里
print(py, arch, backend, mm)
PY
}

for py in $PY_LIST; do
  PYBIN="$(command -v python${py} || true)"
  if [[ ! -x "$PYBIN" ]]; then
    echo ">> Skip python ${py}: not found"
    continue
  fi
  for tv in $TORCH_LIST; do
    echo "======== Build: Python ${py} × Torch ${tv} ========"

    # 1) 新建并激活 venv
    ENV_DIR=".venv-py${py//./}-torch${tv%%.*}${tv#*.}"
    "$PYBIN" -m venv "$ENV_DIR"
    source "$ENV_DIR/bin/activate"

    # 2) 安装构建依赖 + 目标 torch（固定 CUDA 索引以避免装到 CPU 轮子）
    python -m pip install -U pip
    python -m pip install setuptools wheel build ninja cmake packaging cpufeature
    IDX="$(index_for_torch_version "$tv")"
    python -m pip install --index-url "$IDX" "torch==$tv"

    # 3) 读取当前环境的关键信息，拼出匹配的 wheel 通配符并检查是否已存在
    read -r CP_TAG ARCH BACKEND MM <<<"$(have_wheel_for_current_env)"
    plat="linux_${ARCH}"
    pattern="${WHEELS_DIR}/ktransformers-*+${BACKEND}torch${MM}*-${CP_TAG}-${CP_TAG}-${plat}.whl"

    if [[ "$FORCE" = "0" ]]; then
      existing=( $pattern )
      if (( ${#existing[@]} > 0 )); then
        echo ">> Found existing wheel, skip: ${existing[0]}"
        deactivate
        continue
      fi
    else
      echo ">> FORCE=1, rebuild even if wheel exists"
    fi

    # 打印对齐信息
    python - <<'PY'
import torch, sys
print(">>> torch:", torch.__version__, "cuda:", torch.version.cuda,
      "cxx11abi:", torch.compiled_with_cxx11_abi())
print(">>> python:", sys.version)
PY

    # ★ 清理所有构建产物（含内嵌 CMake build）
    rm -rf build/ dist/ *.egg-info
    find csrc -type d -name build -prune -exec rm -rf {} +

    # 构建
    KTRANSFORMERS_FORCE_BUILD=TRUE KTRANSFORMERS_DISABLE_PREBUILT=1 \
    python -m build --no-isolation --wheel

    # ★ 验证 wheel 内包含 cpuinfer_ext
    whl="$(ls dist/*.whl)"
    unzip -l "$whl" | grep -E 'cpuinfer_ext.*\.so' >/dev/null || {
      echo "!! cpuinfer_ext missing in $whl"; exit 2;
    }

    mv dist/*.whl wheels/ || true
    deactivate
  done
done

echo "== Wheels saved in ./wheels =="


================================================
FILE: kt-sft/book.toml
================================================
[book]
authors = ["kvcache-ai"]
language = "zh-CN"
title = "Ktransformers"
src = "doc"

[output.html]
git-repository-url = "https://github.com/kvcache-ai/ktransformers"
edit-url-template = "https://github.com/kvcache-ai/ktransformers/edit/main/{path}"

[output.html.playground]
editable = true
copy-js = true
# line-numbers = true

[output.html.fold]
enable = true
level = 0

================================================
FILE: kt-sft/csrc/custom_marlin/__init__.py
================================================


================================================
FILE: kt-sft/csrc/custom_marlin/binding.cpp
================================================
/**
 * @Description  :
 * @Author       : Azure-Tang
 * @Date         : 2024-07-25 13:38:30
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-12 03:05:04
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "gptq_marlin/ops.h"
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/extension.h>
#include <torch/library.h>
#include <torch/torch.h>
// namespace py = pybind11;

PYBIND11_MODULE(vLLMMarlin, m) {

    /*m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q5_k", &dequantize_q5_k, "Function to dequantize q5_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q4_k",  &dequantize_q4_k, "Function to dequantize q4_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q3_k",  &dequantize_q3_k, "Function to dequantize q3_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q2_k",  &dequantize_q2_k, "Function to dequantize q2_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_iq4_xs",  &dequantize_iq4_xs, "Function to dequantize
    iq4_xs data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));*/
    m.def("gptq_marlin_gemm", &gptq_marlin_gemm,
          "Function to perform GEMM using Marlin quantization.", py::arg("a"),
          py::arg("b_q_weight"), py::arg("b_scales"), py::arg("g_idx"),
          py::arg("perm"), py::arg("workspace"), py::arg("num_bits"), py::arg("size_m_tensor"),
          py::arg("size_m"), py::arg("size_n"), py::arg("size_k"),
          py::arg("sms"), py::arg("is_k_full"));
    m.def("gptq_marlin_repack", &gptq_marlin_repack,
            "gptq_marlin repack from GPTQ");
}

================================================
FILE: kt-sft/csrc/custom_marlin/gptq_marlin/gptq_marlin.cu
================================================
/*
 * Modified by Neural Magic
 * Copyright (C) Marlin.2024 Elias Frantar
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 /*
  * Adapted from https://github.com/IST-DASLab/marlin
  */
  /*
   * Adapted from
   * https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
   */
#include "gptq_marlin.cuh"
#include "gptq_marlin_dtypes.cuh"
#include <c10/cuda/CUDAGuard.h>
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)                              \
    static_assert(std::is_same<scalar_t, half>::value ||                       \
                      std::is_same<scalar_t, nv_bfloat16>::value,              \
                  "only float16 and bfloat16 is supported");

template <typename T> inline std::string str(T x) { return std::to_string(x); }

namespace gptq_marlin {

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800

    __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
        int const* __restrict__ perm_int_ptr,
        int4* __restrict__ out_int4_ptr, int size_m,
        int size_k, int block_rows) {}

    template <typename scalar_t,         // compute dtype, half or nv_float16
        const int num_bits,        // number of bits used for weights
        const int threads,         // number of threads in a threadblock
        const int thread_m_blocks, // number of 16x16 blocks in the m
        // dimension (batchsize) of the
        // threadblock
        const int thread_n_blocks, // same for n dimension (output)
        const int thread_k_blocks, // same for k dimension (reduction)
        const int stages, // number of stages for the async global->shared
        // fetch pipeline
        const bool has_act_order,   // whether act_order is enabled
        const int group_blocks = -1 // number of consecutive 16x16 blocks
        // with a separate quantization scale
    >
    __global__ void
        Marlin(const int4* __restrict__ A, // fp16 input matrix of shape mxk
            const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn
            int4* __restrict__ C,       // fp16 output buffer of shape mxn
            const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape
            // (k/groupsize)xn
            const int* __restrict__ g_idx, // int32 group indices of shape k
            int num_groups, // number of scale groups per output channel
            int prob_m,     // batch dimension m
            int prob_n,     // output dimension n
            int prob_k,     // reduction dimension k
            int* locks      // extra global storage for barrier synchronization
        ) {}

} // namespace gptq_marlin

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
    torch::Tensor& b_scales, torch::Tensor& g_idx,
    torch::Tensor& perm, torch::Tensor& workspace,
    int64_t num_bits, int64_t size_m, int64_t size_n,
    int64_t size_k, bool is_k_full) {
    TORCH_CHECK_NOT_IMPLEMENTED(false,
        "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
    return torch::empty({ 1, 1 });
}

#else

    // m16n8k16 tensor core mma instruction with fp16 inputs and fp32
    // output/accumulation.
    template <typename scalar_t>
    __device__ inline void mma(const typename ScalarType<scalar_t>::FragA& a_frag,
        const typename ScalarType<scalar_t>::FragB& frag_b,
        typename ScalarType<scalar_t>::FragC& frag_c) {
        const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
        const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
        float* c = reinterpret_cast<float*>(&frag_c);
        if constexpr (std::is_same<scalar_t, half>::value) {
            asm volatile(
                "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
                "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
                : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
                : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
                "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
        }
        else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
            asm volatile(
                "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
                "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
                : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
                : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
                "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
        }
        else {
            STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
        }
    }

    // Instruction for loading a full 16x16 matrix fragment of operand A from shared
    // memory, directly in tensor core layout.
    template <typename scalar_t>
    __device__ inline void ldsm4(typename ScalarType<scalar_t>::FragA& frag_a,
        const void* smem_ptr) {
        uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
        uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
        asm volatile(
            "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
            : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
            : "r"(smem));
    }

    // Lookup-table based 3-input logical operation; explicitly used for
    // dequantization as the compiler does not seem to automatically recognize it in
    // all cases.
    template <int lut> __device__ inline int lop3(int a, int b, int c) {
        int res;
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
            : "=r"(res)
            : "r"(a), "r"(b), "r"(c), "n"(lut));
        return res;
    }

    // Constructs destination register by taking bytes from 2 sources (based on
    // mask)
    template <int start_byte, int mask>
    __device__ inline uint32_t prmt(uint32_t a) {
        uint32_t res;
        asm volatile("prmt.b32 %0, %1, %2, %3;\n"
            : "=r"(res)
            : "r"(a), "n"(start_byte), "n"(mask));
        return res;
    }

    // Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
    // values. We mostly follow the strategy in the link below, with some small
    // changes:
    // - FP16:
    // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
    // - BF16:
    // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385
    template <typename scalar_t>
    __device__ inline typename ScalarType<scalar_t>::FragB dequant_4bit(int q) {
        STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
    }

    template <>
    __device__ inline typename ScalarType<half>::FragB dequant_4bit<half>(int q) {
        const int LO = 0x000f000f;
        const int HI = 0x00f000f0;
        const int EX = 0x64006400;
        // Guarantee that the `(a & b) | c` operations are LOP3s.
        int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
        int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
        // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
        // directly into `SUB` and `ADD`.
        const int SUB = 0x64086408;
        const int MUL = 0x2c002c00;
        const int ADD = 0xd480d480;
        typename ScalarType<half>::FragB frag_b;
        frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
            *reinterpret_cast<const half2*>(&SUB));
        frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
            *reinterpret_cast<const half2*>(&MUL),
            *reinterpret_cast<const half2*>(&ADD));
        return frag_b;
    }

    template <>
    __device__ inline typename ScalarType<nv_bfloat16>::FragB
        dequant_4bit<nv_bfloat16>(int q) {
        static constexpr uint32_t MASK = 0x000f000f;
        static constexpr uint32_t EX = 0x43004300;

        // Guarantee that the `(a & b) | c` operations are LOP3s.

        int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
        q >>= 4;
        int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);

        typename ScalarType<nv_bfloat16>::FragB frag_b;
        static constexpr uint32_t MUL = 0x3F803F80;
        static constexpr uint32_t ADD = 0xC308C308;

        frag_b[0] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&lo),
            *reinterpret_cast<const nv_bfloat162*>(&MUL),
            *reinterpret_cast<const nv_bfloat162*>(&ADD));
        frag_b[1] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&hi),
            *reinterpret_cast<const nv_bfloat162*>(&MUL),
            *reinterpret_cast<const nv_bfloat162*>(&ADD));
        return frag_b;
    }

    // Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or
    // bf16 Reference:
    // - FP16:
    // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
    // - BF16:
    // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175
    template <typename scalar_t>
    __device__ inline typename ScalarType<scalar_t>::FragB dequant_8bit(int q) {
        STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
    }

    template <>
    __device__ inline typename ScalarType<half>::FragB dequant_8bit<half>(int q) {
        static constexpr uint32_t mask_for_elt_01 = 0x5250;
        static constexpr uint32_t mask_for_elt_23 = 0x5351;
        static constexpr uint32_t start_byte_for_fp16 = 0x64646464;

        uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
        uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);

        static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;

        typename ScalarType<half>::FragB frag_b;
        frag_b[0] =
            __hsub2(*reinterpret_cast<half2*>(&lo),
                *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
        frag_b[1] =
            __hsub2(*reinterpret_cast<half2*>(&hi),
                *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
        return frag_b;
    }

    template <>
    __device__ inline typename ScalarType<nv_bfloat16>::FragB
        dequant_8bit<nv_bfloat16>(int q) {
        typename ScalarType<nv_bfloat16>::FragB frag_b;

        float fp32_intermediates[4];
        uint32_t* fp32_intermediates_casted =
            reinterpret_cast<uint32_t*>(fp32_intermediates);

        static constexpr uint32_t fp32_base = 0x4B000000;
        fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
        fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
        fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
        fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);

        fp32_intermediates[0] -= 8388736.f;
        fp32_intermediates[1] -= 8388736.f;
        fp32_intermediates[2] -= 8388736.f;
        fp32_intermediates[3] -= 8388736.f;

        uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(&frag_b);
        bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
            fp32_intermediates_casted[1], 0x7632);
        bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
            fp32_intermediates_casted[3], 0x7632);

        return frag_b;
    }

    // Multiply dequantized values by the corresponding quantization scale; used
    // only for grouped quantization.
    template <typename scalar_t>
    __device__ inline void scale(typename ScalarType<scalar_t>::FragB& frag_b,
        typename ScalarType<scalar_t>::FragS& frag_s,
        int i) {
        using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
        scalar_t2 s = ScalarType<scalar_t>::num2num2(
            reinterpret_cast<scalar_t*>(&frag_s)[i]);
        frag_b[0] = __hmul2(frag_b[0], s);
        frag_b[1] = __hmul2(frag_b[1], s);
    }

    // Same as above, but for act_order (each K is multiplied individually)
    template <typename scalar_t>
    __device__ inline void scale4(typename ScalarType<scalar_t>::FragB& frag_b,
        typename ScalarType<scalar_t>::FragS& frag_s_1,
        typename ScalarType<scalar_t>::FragS& frag_s_2,
        typename ScalarType<scalar_t>::FragS& frag_s_3,
        typename ScalarType<scalar_t>::FragS& frag_s_4,
        int i) {
        using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
        scalar_t2 s_val_1_2;
        s_val_1_2.x = reinterpret_cast<scalar_t*>(&frag_s_1)[i];
        s_val_1_2.y = reinterpret_cast<scalar_t*>(&frag_s_2)[i];

        scalar_t2 s_val_3_4;
        s_val_3_4.x = reinterpret_cast<scalar_t*>(&frag_s_3)[i];
        s_val_3_4.y = reinterpret_cast<scalar_t*>(&frag_s_4)[i];

        frag_b[0] = __hmul2(frag_b[0], s_val_1_2);
        frag_b[1] = __hmul2(frag_b[1], s_val_3_4);
    }

    // Given 2 floats multiply by 2 scales (halves)
    template <typename scalar_t>
    __device__ inline void scale_float(float* c,
        typename ScalarType<scalar_t>::FragS& s) {
        scalar_t* s_ptr = reinterpret_cast<scalar_t*>(&s);
        c[0] = __fmul_rn(c[0], ScalarType<scalar_t>::num2float(s_ptr[0]));
        c[1] = __fmul_rn(c[1], ScalarType<scalar_t>::num2float(s_ptr[1]));
    }

    // Wait until barrier reaches `count`, then lock for current threadblock.
    __device__ inline void barrier_acquire(int* lock, int count) {
        if (threadIdx.x == 0) {
            int state = -1;
            do
                // Guarantee that subsequent writes by this threadblock will be
                // visible globally.
                asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
                    : "=r"(state)
                    : "l"(lock));
            while (state != count);
        }
        __syncthreads();
    }

    // Release barrier and increment visitation count.
    __device__ inline void barrier_release(int* lock, bool reset = false) {
        __syncthreads();
        if (threadIdx.x == 0) {
            if (reset) {
                lock[0] = 0;
                return;
            }
            int val = 1;
            // Make sure that all writes since acquiring this barrier are visible
            // globally, while releasing the barrier.
            asm volatile("fence.acq_rel.gpu;\n");
            asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
                :
            : "l"(lock), "r"(val));
        }
    }

    // For a given "a" of size [M,K] performs a permutation of the K columns based
    // on the given "perm" indices.
    __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
        int const* __restrict__ perm_int_ptr,
        int4* __restrict__ out_int4_ptr, int size_m,
        int size_k, int block_rows) {
        int start_row = block_rows * blockIdx.x;
        int finish_row = start_row + block_rows;
        if (finish_row > size_m) {
            finish_row = size_m;
        }
        int cur_block_rows = finish_row - start_row;

        int row_stride = size_k * sizeof(half) / 16;

        auto permute_row = [&](int row) {
            int iters = size_k / default_threads;
            int rest = size_k % default_threads;

            int offset = row * row_stride;

            half const* a_row_half =
                reinterpret_cast<half const*>(a_int4_ptr + offset);
            half* out_half = reinterpret_cast<half*>(out_int4_ptr + offset);

            int base_k = 0;

            for (int i = 0; i < iters; i++) {
                int cur_k = base_k + threadIdx.x;
                int src_pos = perm_int_ptr[cur_k];

                out_half[cur_k] = a_row_half[src_pos];

                base_k += default_threads;
            }

            if (rest) {
                if (threadIdx.x < rest) {
                    int cur_k = base_k + threadIdx.x;
                    int src_pos = perm_int_ptr[cur_k];

                    out_half[cur_k] = a_row_half[src_pos];
                }
            }
            };

        for (int i = 0; i < cur_block_rows; i++) {
            int cur_row = start_row + i;
            if (cur_row < size_m) {
                permute_row(cur_row);
            }
        }
    }

    template <typename scalar_t,         // compute dtype, half or nv_float16
        const int num_bits,        // number of bits used for weights
        const int threads,         // number of threads in a threadblock
        const int thread_m_blocks, // number of 16x16 blocks in the m
        // dimension (batchsize) of the
        // threadblock
        const int thread_n_blocks, // same for n dimension (output)
        const int thread_k_blocks, // same for k dimension (reduction)
        const int stages, // number of stages for the async global->shared
        // fetch pipeline
        const bool has_act_order,   // whether act_order is enabled
        const int group_blocks = -1 // number of consecutive 16x16 blocks
        // with a separate quantization scale
    >
    __device__ void
        Marlin(const int4* __restrict__ A, // fp16 input matrix of shape mxk
            const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn
            int4* __restrict__ C,       // fp16 output buffer of shape mxn
            const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape
            // (k/groupsize)xn
            const int* __restrict__ g_idx, // int32 group indices of shape k
            int num_groups, // number of scale groups per output channel
            int prob_m,     // batch dimension m, should be divisible by (16 * thread_m_blocks) if bigger than that
            int prob_n,     // output dimension n
            int prob_k,     // reduction dimension k
            int* locks      // extra global storage for barrier synchronization
        ) {
        // Each threadblock processes one "stripe" of the B matrix with (roughly) the
        // same size, which might involve multiple column "slices" (of width 16 *
        // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
        // example:
        //   0 1 3
        //   0 2 3
        //   1 2 4
        // While this kind of partitioning makes things somewhat more complicated, it
        // ensures good utilization of all SMs for many kinds of shape and GPU
        // configurations, while requiring as few slow global cross-threadblock
        // reductions as possible.
        using Dtype = ScalarType<scalar_t>;
        using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
        using FragA = typename ScalarType<scalar_t>::FragA;
        using FragB = typename ScalarType<scalar_t>::FragB;
        using FragC = typename ScalarType<scalar_t>::FragC;
        using FragS = typename ScalarType<scalar_t>::FragS;

        constexpr int pack_factor = 32 / num_bits;

        // int prob_m = *prob_m_ptr;
        // const int thread_m_blocks = min(div_ceil(prob_m, 16), template_thread_m_blocks);
        // constexpr int thread_m_blocks = template_thread_m_blocks;

        // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
        // better partitioning with less reductions
        int parallel = 1;
        if (prob_m > 16 * thread_m_blocks) {
            parallel = prob_m / (16 * thread_m_blocks);
            prob_m = 16 * thread_m_blocks;
        }

        int k_tiles = prob_k / 16 / thread_k_blocks;
        int n_tiles = prob_n / 16 / thread_n_blocks;
        int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x);

        if constexpr (!has_act_order && group_blocks != -1) {
            if (group_blocks >= thread_k_blocks) {
                // Ensure that the number of tiles in each stripe is a multiple of the
                // groupsize; this avoids an annoying special case where a stripe starts
                // in the middle of group.
                iters = (group_blocks / thread_k_blocks) *
                    div_ceil(iters, (group_blocks / thread_k_blocks));
            }
        }

        int slice_row = (iters * blockIdx.x) % k_tiles;
        int slice_col_par = (iters * blockIdx.x) / k_tiles;
        int slice_col = slice_col_par;
        int slice_iters;  // number of threadblock tiles in the current slice
        int slice_count =
            0;          // total number of active threadblocks in the current slice
        int slice_idx;  // index of threadblock in current slice; numbered bottom to
        // top

    // We can easily implement parallel problem execution by just remapping
    // indices and advancing global pointers
        if (slice_col_par >= n_tiles) {
            A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
            C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
            locks += (slice_col_par / n_tiles) * n_tiles;
            slice_col = slice_col_par % n_tiles;
        }

        // Compute all information about the current slice which is required for
        // synchronization.
        auto init_slice = [&]() {
            slice_iters =
                iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
            if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0;
            if (slice_iters == 0) return;
            if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row;
            slice_count = 1;
            slice_idx = 0;
            int col_first = iters * div_ceil(k_tiles * slice_col_par, iters);
            if (col_first <= k_tiles * (slice_col_par + 1)) {
                int col_off = col_first - k_tiles * slice_col_par;
                slice_count = div_ceil(k_tiles - col_off, iters);
                if (col_off > 0) slice_count++;
                int delta_first = iters * blockIdx.x - col_first;
                if (delta_first < 0 || (col_off == 0 && delta_first == 0))
                    slice_idx = slice_count - 1;
                else {
                    slice_idx = slice_count - 1 - delta_first / iters;
                    if (col_off > 0) slice_idx--;
                }
            }
            if (slice_col == n_tiles) {
                A += 16 * thread_m_blocks * prob_k / 8;
                C += 16 * thread_m_blocks * prob_n / 8;
                locks += n_tiles;
                slice_col = 0;
            }
            };
        init_slice();

        // A sizes/strides

        // stride of the A matrix in global memory
        int a_gl_stride = prob_k / 8;
        // stride of an A matrix tile in shared memory
        constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
        // delta between subsequent A tiles in global memory
        constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8;
        // between subsequent accesses within a tile
        int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
        // between shared memory writes
        constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
        // between shared memory tile reads
        constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4));
        // within a shared memory tile
        constexpr int a_sh_rd_delta_i = a_sh_stride * 16;
        // overall size of a tile
        constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks);
        // number of shared write iterations for a tile
        constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta);

        // B sizes/strides
        int b_gl_stride = 16 * prob_n / (pack_factor * 4);
        constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
        constexpr int b_thread_vecs = num_bits == 4 ? 1 : 2;
        constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;

        int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
        int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
        constexpr int b_sh_wr_delta = threads * b_thread_vecs;
        constexpr int b_sh_rd_delta = threads * b_thread_vecs;
        constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
        constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;

        // Scale sizes/strides without act_order
        int s_gl_stride = prob_n / 8;
        constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
        constexpr int s_tb_groups =
            !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks
            ? thread_k_blocks / group_blocks
            : 1;
        constexpr int s_sh_stage = s_tb_groups * s_sh_stride;
        int s_gl_rd_delta = s_gl_stride;

        // Scale size/strides with act_order
        constexpr int tb_k = 16 * thread_k_blocks;
        constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0;
        // constexpr int act_s_row_stride      = 1;
        // int           act_s_col_stride      = act_s_row_stride * num_groups;
        int act_s_col_stride = 1;
        int act_s_col_warp_stride = act_s_col_stride * 8;
        int tb_n_warps = thread_n_blocks / 4;
        int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;

        // Global A read index of current thread.
        int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
            (threadIdx.x % a_gl_rd_delta_o);
        a_gl_rd += a_gl_rd_delta_o * slice_row;
        // Shared write index of current thread.
        int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) +
            (threadIdx.x % a_gl_rd_delta_o);
        // Shared read index.
        int a_sh_rd =
            a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
        a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));

        int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
            (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
        b_gl_rd += b_sh_stride * slice_col;
        b_gl_rd += b_gl_rd_delta_o * slice_row;
        int b_sh_wr = threadIdx.x * b_thread_vecs;
        int b_sh_rd = threadIdx.x * b_thread_vecs;

        // For act_order
        constexpr int k_iter_size = tb_k / b_sh_wr_iters;
        int slice_k_start = tb_k * slice_row;
        int slice_k_finish = slice_k_start + tb_k * slice_iters;
        int slice_k_start_shared_fetch = slice_k_start;
        int slice_n_offset = act_s_col_tb_stride * slice_col;

        // No act_order
        int s_gl_rd;
        if constexpr (!has_act_order) {
            if constexpr (group_blocks == -1) {
                s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
            }
            else {
                s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
                    s_sh_stride * slice_col + threadIdx.x;
            }
        }
        int s_sh_wr = threadIdx.x;
        bool s_sh_wr_pred = threadIdx.x < s_sh_stride;

        // We use a different scale layout for grouped and column-wise quantization as
        // we scale a `half2` tile in column-major layout in the former and in
        // row-major in the latter case.
        int s_sh_rd;
        if constexpr (group_blocks != -1)
            s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
            (threadIdx.x % 32) / 4;
        else
            s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
            (threadIdx.x % 32) % 4;

        // Precompute which thread should not read memory in which iterations; this is
        // needed if there are more threads than required for a certain tilesize or
        // when the batchsize is not a multiple of 16.
        bool a_sh_wr_pred[a_sh_wr_iters];
#pragma unroll
        for (int i = 0; i < a_sh_wr_iters; i++) {
            a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m;
        }

        // To ensure that writing and reading A tiles to/from shared memory, the
        // latter in fragment format, is fully bank conflict free, we need to use a
        // rather fancy XOR-based layout. The key here is that neither reads nor
        // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
        // same shared memory banks. Further, it seems (based on NSight-Compute) that
        // each warp must also write a consecutive memory segment?
        auto transform_a = [&](int i) {
            int row = i / a_gl_rd_delta_o;
            return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row;
            };
        // Since the computation of this remapping is non-trivial and, due to our main
        // loop unrolls, all shared memory accesses are static, we simply precompute
        // both transformed reads and writes.
        int a_sh_wr_trans[a_sh_wr_iters];
#pragma unroll
        for (int i = 0; i < a_sh_wr_iters; i++) {
            a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr);
        }
        int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks];
#pragma unroll
        for (int i = 0; i < b_sh_wr_iters; i++) {
#pragma unroll
            for (int j = 0; j < thread_m_blocks; j++)
            {
                a_sh_rd_trans[i][j] =
                    transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
            }
        }

        // Since B-accesses have non-constant stride they have to be computed at
        // runtime; we break dependencies between subsequent accesses with a tile by
        // maintining multiple pointers (we have enough registers), a tiny
        // optimization.
        const int4* B_ptr[b_sh_wr_iters];
#pragma unroll
        for (int i = 0; i < b_sh_wr_iters; i++)
            B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;

        extern __shared__ int4 sh[];
        // Shared memory storage for global fetch pipelines.
        int4* sh_a = sh;
        int4* sh_b = sh_a + (stages * a_sh_stage);
        int4* sh_g_idx = sh_b + (stages * b_sh_stage);
        int4* sh_s = sh_g_idx + (stages * g_idx_stage);

        // Register storage for double buffer of shared memory reads.
        FragA frag_a[2][thread_m_blocks];
        I4 frag_b_quant[2][b_thread_vecs];
        FragC frag_c[thread_m_blocks][4][2];
        FragS frag_s[2][4];         // No act-order
        FragS act_frag_s[2][4][4];  // For act-order

        // Zero accumulators.
        auto zero_accums = [&]() {
#pragma unroll
            for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
            {
                reinterpret_cast<float*>(frag_c)[i] = 0;
            }
            };

        int sh_first_group_id = -1;
        int sh_num_groups = -1;
        constexpr int sh_max_num_groups = 32;

        auto fetch_scales_to_shared = [&](bool is_async, int first_group_id,
            int last_group_id) {
                sh_first_group_id = first_group_id;
                sh_num_groups = last_group_id - first_group_id + 1;

                if (sh_num_groups < sh_max_num_groups) {
                    sh_num_groups = sh_max_num_groups;
                }

                if (sh_first_group_id + sh_num_groups > num_groups) {
                    sh_num_groups = num_groups - sh_first_group_id;
                }

                int row_offset = first_group_id * s_gl_stride;

                if (is_async) {
                    for (int i = 0; i < sh_num_groups; i++) {
                        if (threadIdx.x < s_sh_stride) {
                            cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x],
                                &scales_ptr[row_offset + (i * s_gl_stride) +
                                slice_n_offset + threadIdx.x]);
                        }
                    }
                }
                else {
                    for (int i = 0; i < sh_num_groups; i++) {
                        if (threadIdx.x < s_sh_stride) {
                            sh_s[(i * s_sh_stride) + threadIdx.x] =
                                scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset +
                                threadIdx.x];
                        }
                    }
                }
            };
        // Asynchronously fetch the next A, B and s tile from global to the next
        // shared memory pipeline location.
        auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
            if (pred) {
                int4* sh_a_stage = sh_a + a_sh_stage * pipe;
#pragma unroll
                for (int i = 0; i < a_sh_wr_iters; i++) {
                    cp_async4_pred(
                        &sh_a_stage[a_sh_wr_trans[i]],
                        &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off],
                        a_sh_wr_pred[i]);
                }
                int4* sh_b_stage = sh_b + b_sh_stage * pipe;
#pragma unroll
                for (int i = 0; i < b_sh_wr_iters; i++) {
#pragma unroll
                    for (int j = 0; j < b_thread_vecs; j++) {
                        cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
                    }

                    B_ptr[i] += b_gl_rd_delta_o;
                }

                if constexpr (has_act_order) {
                    // Fetch g_idx thread-block portion
                    int full_pipe = a_off;
                    int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe;
                    if (cur_k < prob_k && cur_k < slice_k_finish) {
                        int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;

                        int4 const* cur_g_idx_stage_ptr =
                            reinterpret_cast<int4 const*>(&g_idx[cur_k]);

                        if (threadIdx.x < g_idx_stage) {
                            cp_async4_pred(&sh_g_idx_stage[threadIdx.x],
                                &cur_g_idx_stage_ptr[threadIdx.x]);
                        }
                    }
                }
                else {
                    if constexpr (group_blocks != -1) {
                        int4* sh_s_stage = sh_s + s_sh_stage * pipe;

                        if constexpr (group_blocks >= thread_k_blocks) {
                            // Only fetch scales if this tile starts a new group
                            if (pipe % (group_blocks / thread_k_blocks) == 0) {
                                if (s_sh_wr_pred) {
                                    cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
                                }
                                s_gl_rd += s_gl_rd_delta;
                            }
                        }
                        else {
                            for (int i = 0; i < s_tb_groups; i++) {
                                if (s_sh_wr_pred) {
                                    cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr],
                                        &scales_ptr[s_gl_rd]);
                                }
                                s_gl_rd += s_gl_rd_delta;
                            }
                        }
                    }
                }
            }
            // Insert a fence even when we are winding down the pipeline to ensure that
            // waiting is also correct at this point.
            cp_async_fence();
            };

        // Wait until the next thread tile has been loaded to shared memory.
        auto wait_for_stage = [&]() {
            // We only have `stages - 2` active fetches since we are double buffering
            // and can only issue the next fetch when it is guaranteed that the previous
            // shared memory load is fully complete (as it may otherwise be
            // overwritten).
            cp_async_wait<stages - 2>();
            __syncthreads();
            };

        // Load the next sub-tile from the current location in the shared memory pipe
        // into the current register buffer.
        auto fetch_to_registers = [&](int k, int pipe) {
            int4* sh_a_stage = sh_a + a_sh_stage * pipe;
#pragma unroll
            for (int i = 0; i < thread_m_blocks; i++)
            {
                ldsm4<scalar_t>(frag_a[k % 2][i],
                    &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
            }

            int4* sh_b_stage = sh_b + b_sh_stage * pipe;

#pragma unroll
            for (int i = 0; i < b_thread_vecs; i++) {
                frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(
                    &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
            }
            };

        bool is_same_group[stages];
        int same_group_id[stages];

        auto init_same_group = [&](int pipe) {
            if constexpr (!has_act_order) {
                is_same_group[pipe] = false;
                same_group_id[pipe] = 0;
                return;
            }

            int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
            int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);

            int group_id_1 = sh_g_idx_int_ptr[0];
            int group_id_2 = sh_g_idx_int_ptr[tb_k - 1];

            is_same_group[pipe] = group_id_1 == group_id_2;
            same_group_id[pipe] = group_id_1;
            };

        auto fetch_scales_to_registers = [&](int k, int full_pipe) {
            int pipe = full_pipe % stages;

            if constexpr (!has_act_order) {
                // No act-order case
                if constexpr (group_blocks != -1) {
                    if constexpr (group_blocks >= thread_k_blocks) {
                        int4* sh_s_stage =
                            sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
                                (pipe / (group_blocks / thread_k_blocks)));
                        reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
                    }
                    else {
                        int warp_id = threadIdx.x / 32;
                        int n_warps = thread_n_blocks / 4;

                        int warp_row = warp_id / n_warps;

                        int cur_k = warp_row * 16;
                        cur_k += k_iter_size * (k % b_sh_wr_iters);

                        int k_blocks = cur_k / 16;
                        int cur_group_id = k_blocks / group_blocks;

                        int4* sh_s_stage = sh_s + s_sh_stage * pipe;

                        reinterpret_cast<int4*>(&frag_s[k % 2])[0] =
                            sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
                    }
                }

                return;
            }

            // Act-order case

            // Determine K of the "current" thread-block
            int cur_k = slice_k_start + tb_k * full_pipe;
            if (cur_k >= prob_k || cur_k >= slice_k_finish) {
                return;
            }

            // Reset (to current thread-block) since we read g_idx portion from the
            // shared memory
            cur_k = 0;

            // Progress to current iteration
            cur_k += k_iter_size * (k % b_sh_wr_iters);

            // Determine "position" inside the thread-block (based on warp and
            // thread-id)
            int warp_id = threadIdx.x / 32;
            int n_warps =
                thread_n_blocks / 4;  // Each warp processes 4 16-size tiles over N

            int warp_row = warp_id / n_warps;
            int warp_col = warp_id % n_warps;

            cur_k += warp_row * 16;

            int th_id = threadIdx.x % 32;
            cur_k += (th_id % 4) * 2;  // Due to tensor-core layout for fp16 B matrix

            int s_col_shift =
                /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) +
                (th_id / 4) * act_s_col_stride;

            if (is_same_group[pipe]) {
                if (k % 2 == 0) {
                    *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
                        sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride +
                        s_col_shift];
                }
                else {
                    *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
                        *(reinterpret_cast<int4*>(&(act_frag_s[(k - 1) % 2][0][0])));
                }

                for (int i = 1; i < 4; i++) {
                    *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
                        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0])));
                }
                return;
            }

            int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
            int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);

            constexpr int k_frag_offsets[4] = { 0, 1, 8,
                                               9 };  // Tensor core offsets per thread

#pragma unroll
            for (int i = 0; i < 4; i++) {
                int actual_k = cur_k + k_frag_offsets[i];

                int group_id = sh_g_idx_int_ptr[actual_k];
                int rel_group_id = group_id - sh_first_group_id;

                *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
                    sh_s[rel_group_id * s_sh_stride + s_col_shift];
            }
            };

        // Execute the actual tensor core matmul of a sub-tile.
        auto matmul = [&](int k) {
            // We have the m dimension as the inner loop in order to encourage overlapping
            // dequantization and matmul operations.
#pragma unroll
            for (int j = 0; j < 4; j++) {
                FragB frag_b0;
                FragB frag_b1;
                if constexpr (num_bits == 4) {
                    int b_quant = frag_b_quant[k % 2][0][j];
                    int b_quant_shift = b_quant >> 8;

                    frag_b0 = dequant_4bit<scalar_t>(b_quant);
                    frag_b1 = dequant_4bit<scalar_t>(b_quant_shift);

                }
                else {
                    int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
                    int b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
                    int b_quant_1 = frag_b_quant_ptr[j * 2 + 1];

                    frag_b0 = dequant_8bit<scalar_t>(b_quant_0);
                    frag_b1 = dequant_8bit<scalar_t>(b_quant_1);
                }

                // Apply scale to frag_b0
                if constexpr (has_act_order) {
                    scale4<scalar_t>(frag_b0, act_frag_s[k % 2][0][j],
                        act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j],
                        act_frag_s[k % 2][3][j], 0);
                }
                else {
                    if constexpr (group_blocks != -1) {
                        scale<scalar_t>(frag_b0, frag_s[k % 2][j], 0);
                    }
                }

                // Apply scale to frag_b1
                if constexpr (has_act_order) {
                    scale4<scalar_t>(frag_b1, act_frag_s[k % 2][0][j],
                        act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j],
                        act_frag_s[k % 2][3][j], 1);

                }
                else {
                    if constexpr (group_blocks != -1) {
                        scale<scalar_t>(frag_b1, frag_s[k % 2][j], 1);
                    }
                }

#pragma unroll
                for (int i = 0; i < thread_m_blocks; i++) {
                    mma<scalar_t>(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]);
                    mma<scalar_t>(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]);
                }
            }
            };

        // Since we slice across the k dimension of a tile in order to increase the
        // number of warps while keeping the n dimension of a tile reasonable, we have
        // multiple warps that accumulate their partial sums of the same output
        // location; which we have to reduce over in the end. We do in shared memory.
        auto thread_block_reduce = [&]() {
            constexpr int red_off = threads / b_sh_stride_threads / 2;
            if (red_off >= 1) {
                int red_idx = threadIdx.x / b_sh_stride_threads;
                constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
                constexpr int red_sh_delta = b_sh_stride_threads;
                int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
                    (threadIdx.x % b_sh_stride_threads);

                // Parallel logarithmic shared memory reduction. We make sure to avoid any
                // unnecessary read or write iterations, e.g., for two warps we write only
                // once by warp 1 and read only once by warp 0.

#pragma unroll
                for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
#pragma unroll
                    for (int i = red_off; i > 0; i /= 2) {
                        if (i <= red_idx && red_idx < 2 * i) {
#pragma unroll
                            for (int j = 0; j < 4 * 2; j++) {
                                int red_sh_wr =
                                    red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
                                if (i < red_off) {
                                    float* c_rd =
                                        reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
                                    float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
#pragma unroll
                                    for (int k = 0; k < 4; k++)
                                        reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
                                        c_rd[k] + c_wr[k];
                                }
                                sh[red_sh_wr] =
                                    reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
                            }
                        }
                        __syncthreads();
                    }
                    if (red_idx == 0) {
#pragma unroll
                        for (int i = 0; i < 4 * 2; i++) {
                            float* c_rd =
                                reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
#pragma unroll
                            for (int j = 0; j < 4; j++)
                                reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
                                c_rd[j];
                        }
                    }
                    __syncthreads();
                }
            }
            };

        // Since multiple threadblocks may process parts of the same column slice, we
        // finally have to globally reduce over the results. As the striped
        // partitioning minimizes the number of such reductions and our outputs are
        // usually rather small, we perform this reduction serially in L2 cache.
        auto global_reduce = [&](bool first = false, bool last = false) {
            // We are very careful here to reduce directly in the output buffer to
            // maximize L2 cache utilization in this step. To do this, we write out
            // results in FP16 (but still reduce with FP32 compute).
            constexpr int active_threads = 32 * thread_n_blocks / 4;
            if (threadIdx.x < active_threads) {
                int c_gl_stride = prob_n / 8;
                int c_gl_wr_delta_o = 8 * c_gl_stride;
                int c_gl_wr_delta_i = 4 * (active_threads / 32);
                int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) +
                    4 * (threadIdx.x / 32) + threadIdx.x % 4;
                c_gl_wr += (2 * thread_n_blocks) * slice_col;
                constexpr int c_sh_wr_delta = active_threads;
                int c_sh_wr = threadIdx.x;

                int row = (threadIdx.x % 32) / 4;

                if (!first) {
                    // Interestingly, doing direct global accesses here really seems to mess up
                    // the compiler and lead to slowdowns, hence we also use async-copies even
                    // though these fetches are not actually asynchronous.
#pragma unroll
                    for (int i = 0; i < thread_m_blocks * 4; i++) {
                        cp_async4_pred(
                            &sh[c_sh_wr + c_sh_wr_delta * i],
                            &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
                            c_gl_wr_delta_i * (i % 2)],
                            i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
                    }
                    cp_async_fence();
                    cp_async_wait<0>();
                }

#pragma unroll
                for (int i = 0; i < thread_m_blocks * 4; i++) {
                    if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
                        if (!first) {
                            int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
#pragma unroll
                            for (int j = 0; j < 2 * 4; j++) {
                                reinterpret_cast<float*>(
                                    &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] +=
                                    Dtype::num2float(reinterpret_cast<scalar_t*>(&c_red)[j]);
                            }
                        }
                        if (!last) {
                            int4 c;
#pragma unroll
                            for (int j = 0; j < 2 * 4; j++) {
                                reinterpret_cast<scalar_t*>(&c)[j] =
                                    Dtype::float2num(reinterpret_cast<float*>(
                                        &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]);
                            }
                            C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] =
                                c;
                        }
                    }
                }
            }
            };

        // Write out the reduce final result in the correct layout. We only actually
        // reshuffle matrix fragments in this step, the reduction above is performed
        // in fragment layout.
        auto write_result = [&]() {
            int c_gl_stride = prob_n / 8;
            constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
            int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
            constexpr int c_sh_rd_delta =
                c_sh_stride * (threads / (2 * thread_n_blocks));

            int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) +
                (threadIdx.x % (2 * thread_n_blocks));
            c_gl_wr += (2 * thread_n_blocks) * slice_col;
            int c_sh_wr =
                (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4;
            c_sh_wr += 32 * (threadIdx.x / 32);
            int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) +
                (threadIdx.x % (2 * thread_n_blocks));

            int c_gl_wr_end = c_gl_stride * prob_m;

            // We first reorder in shared memory to guarantee the most efficient final
            // global write patterns
            auto write = [&](int idx, float c0, float c1, FragS& s) {
                scalar_t2 res =
                    Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1));

                // For per-column quantization we finally apply the scale here (only for
                // 4-bit)
                if constexpr (!has_act_order && group_blocks == -1 && num_bits == 4) {
                    res = __hmul2(res, s[0]);
                }

                ((scalar_t2*)sh)[idx] = res;
                };

            if (threadIdx.x / 32 < thread_n_blocks / 4) {
#pragma unroll
                for (int i = 0; i < thread_m_blocks; i++) {
#pragma unroll
                    for (int j = 0; j < 4; j++) {
                        int wr = c_sh_wr + 8 * j;
                        write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
                            frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
                        write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
                            frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
                        write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
                            frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
                        write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
                            frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
                    }
                    c_sh_wr += 16 * (4 * c_sh_stride);
                }
            }
            __syncthreads();

#pragma unroll
            for (int i = 0;
                i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
                i++) {
                if (c_gl_wr < c_gl_wr_end) {
                    C[c_gl_wr] = sh[c_sh_rd];
                    c_gl_wr += c_gl_wr_delta;
                    c_sh_rd += c_sh_rd_delta;
                }
            }
            };

        // Start global fetch and register load pipelines.
        auto start_pipes = [&]() {

#pragma unroll
            for (int i = 0; i < stages - 1; i++) {
                if (has_act_order && i == 0) {
                    int last_g_idx = slice_k_start + stages * tb_k * 2;
                    if (last_g_idx >= prob_k) {
                        last_g_idx = prob_k - 1;
                    }
                    fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]);
                }
                fetch_to_shared(i, i, i < slice_iters);
            }

            zero_accums();
            wait_for_stage();
            init_same_group(0);
            fetch_to_registers(0, 0);
            fetch_scales_to_registers(0, 0);
            a_gl_rd += a_gl_rd_delta_o * (stages - 1);
            slice_k_start_shared_fetch += tb_k * (stages - 1);
            };
        if (slice_iters) {
            start_pipes();
        }

        // Main loop.
        while (slice_iters) {
            // We unroll over both the global fetch and the register load pipeline to
            // ensure all shared memory accesses are static. Note that both pipelines
            // have even length meaning that the next iteration will always start at
            // index 0.

#pragma unroll
            for (int pipe = 0; pipe < stages;) {
#pragma unroll
                for (int k = 0; k < b_sh_wr_iters; k++) {
                    fetch_to_registers(k + 1, pipe % stages);
                    fetch_scales_to_registers(k + 1, pipe);
                    if (k == b_sh_wr_iters - 2) {
                        fetch_to_shared((pipe + stages - 1) % stages, pipe,
                            slice_iters >= stages);
                        pipe++;
                        wait_for_stage();
                        init_same_group(pipe % stages);
                    }
                    matmul(k);
                }
                slice_iters--;
                if (slice_iters == 0) {
                    break;
                }
            }

            a_gl_rd += a_gl_rd_delta_o * stages;
            slice_k_start += tb_k * stages;
            slice_k_start_shared_fetch += tb_k * stages;

            if constexpr (has_act_order) {
                int first_group_id = g_idx[slice_k_start];
                int last_g_idx = slice_k_start + stages * tb_k * 2;
                if (last_g_idx >= prob_k) {
                    last_g_idx = prob_k - 1;
                }
                int last_group_id = g_idx[last_g_idx];
                if (last_group_id >= sh_first_group_id + sh_num_groups) {
                    fetch_scales_to_shared(false, first_group_id, last_group_id);
                    __syncthreads();
                }
            }

            // Process results and, if necessary, proceed to the next column slice.
            // While this pattern may not be the most readable, other ways of writing
            // the loop seemed to noticeably worse performance after compilation.
            if (slice_iters == 0) {
                cp_async_wait<0>();
                bool last = slice_idx == slice_count - 1;
                // For per-column scales, we only fetch them here in the final step before
                // write-out
                if constexpr (!has_act_order && group_blocks == -1) {
                    if constexpr (num_bits == 8) {
                        if (s_sh_wr_pred) {
                            cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
                        }
                        cp_async_fence();
                    }
                    else {
                        if (last) {
                            if (s_sh_wr_pred) {
                                cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
                            }
                            cp_async_fence();
                        }
                    }
                }

                thread_block_reduce();
                if constexpr (!has_act_order && group_blocks == -1) {
                    if constexpr (num_bits == 8) {
                        cp_async_wait<0>();
                        __syncthreads();
                        if (threadIdx.x / 32 < thread_n_blocks / 4) {
                            reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
                            reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
                        }

                    }
                    else {
                        if (last) {
                            cp_async_wait<0>();
                            __syncthreads();
                            if (threadIdx.x / 32 < thread_n_blocks / 4) {
                                reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
                                reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
                            }
                        }
                    }
                }

                // For 8-bit channelwise, we apply the scale before the global reduction
                // that converts the fp32 results to fp16 (so that we avoid possible
                // overflow in fp16)
                if constexpr (!has_act_order && group_blocks == -1 && num_bits == 8) {
                    if (threadIdx.x / 32 < thread_n_blocks / 4) {
#pragma unroll
                        for (int i = 0; i < thread_m_blocks; i++) {
#pragma unroll
                            for (int j = 0; j < 4; j++) {
                                scale_float<scalar_t>(
                                    reinterpret_cast<float*>(&frag_c[i][j][0][0]),
                                    frag_s[j / 2][2 * (j % 2) + 0]);
                                scale_float<scalar_t>(
                                    reinterpret_cast<float*>(&frag_c[i][j][0][2]),
                                    frag_s[j / 2][2 * (j % 2) + 0]);

                                scale_float<scalar_t>(
                                    reinterpret_cast<float*>(&frag_c[i][j][1][0]),
                                    frag_s[j / 2][2 * (j % 2) + 1]);
                                scale_float<scalar_t>(
                                    reinterpret_cast<float*>(&frag_c[i][j][1][2]),
                                    frag_s[j / 2][2 * (j % 2) + 1]);
                            }
                        }
                    }
                }

                if (slice_count > 1) {  // only globally reduce if there is more than one
                    // block in a slice
                    barrier_acquire(&locks[slice_col], slice_idx);
                    global_reduce(slice_idx == 0, last);
                    barrier_release(&locks[slice_col], last);
                }
                if (last)  // only the last block in a slice actually writes the result
                    write_result();
                slice_row = 0;
                slice_col_par++;
                slice_col++;
                init_slice();
                if (slice_iters) {
                    a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
                        (threadIdx.x % a_gl_rd_delta_o);
#pragma unroll
                    for (int i = 0; i < b_sh_wr_iters; i++)
                        B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
                    if (slice_col == 0) {
#pragma unroll
                        for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
                    }

                    // Update slice k/n for scales loading
                    if constexpr (has_act_order) {
                        slice_k_start = tb_k * slice_row;
                        slice_k_finish = slice_k_start + tb_k * slice_iters;
                        slice_k_start_shared_fetch = slice_k_start;
                        slice_n_offset = act_s_col_tb_stride * slice_col;

                    }
                    else {
                        s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
                    }

                    start_pipes();
                }
            }
        }
    }

    template <typename scalar_t,         // compute dtype, half or nv_float16
        const int num_bits,        // number of bits used for weights
        const int threads,         // number of threads in a threadblock
        const int template_thread_m_blocks, // number of 16x16 blocks in the m
        // dimension (batchsize) of the
        // threadblock
        const int thread_n_blocks, // same for n dimension (output)
        const int thread_k_blocks, // same for k dimension (reduction)
        const int stages, // number of stages for the async global->shared
        // fetch pipeline
        const bool has_act_order,   // whether act_order is enabled
        const int group_blocks = -1 // number of consecutive 16x16 blocks
        // with a separate quantization scale
    >
    __global__ void
        Marlin_wrapper(const int4* __restrict__ A, // fp16 input matrix of shape mxk
            const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn
            int4* __restrict__ C,       // fp16 output buffer of shape mxn
            const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape
            // (k/groupsize)xn
            const int* __restrict__ g_idx, // int32 group indices of shape k
            int num_groups, // number of scale groups per output channel
            const int* __restrict__ prob_m_ptr,     // batch dimension m
            int prob_n,     // output dimension n
            int prob_k,     // reduction dimension k
            int* locks      // extra global storage for barrier synchronization
        ) {
        int prob_m = *prob_m_ptr;
        prob_m = min(prob_m, 1024);
        const int thread_m_blocks = min(div_ceil(prob_m, 16), template_thread_m_blocks);
        if(prob_m > 16 * thread_m_blocks)
            prob_m = (16 * thread_m_blocks) * div_ceil(prob_m, (16 * thread_m_blocks));
        /*if (blockIdx.x == 0 && threadIdx.x == 0)
            printf("marlin prob_m %d\n", prob_m);*/
        if (thread_m_blocks == 1) {
            Marlin<scalar_t, num_bits, threads, 1,
                thread_n_blocks, thread_k_blocks, stages, has_act_order,
                group_blocks>(
                    A, B, C, scales_ptr, g_idx, num_groups, prob_m, prob_n,
                    prob_k, locks);
        }
        else if (thread_m_blocks == 2) {
            Marlin<scalar_t, num_bits, threads, 2,
                thread_n_blocks, thread_k_blocks, stages, has_act_order,
                group_blocks>(
                    A, B, C, scales_ptr, g_idx, num_groups, prob_m, prob_n,
                    prob_k, locks);
        }
        else if (thread_m_blocks == 3) {
            Marlin<scalar_t, num_bits, threads, 3,
                thread_n_blocks, thread_k_blocks, stages, has_act_order,
                group_blocks>(
                    A, B, C, scales_ptr, g_idx, num_groups, prob_m, prob_n,
                    prob_k, locks);
        }
        else if (thread_m_blocks == 4) {
            Marlin<scalar_t, num_bits, threads, 4,
                thread_n_blocks, thread_k_blocks, stages, has_act_order,
                group_blocks>(
                    A, B, C, scales_ptr, g_idx, num_groups, prob_m, prob_n,
                    prob_k, locks);
        }
    }

#define __CALL_IF(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
                  HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS)                    \
    else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS &&     \
             thread_n_blocks == THREAD_N_BLOCKS &&                             \
             thread_k_blocks == THREAD_K_BLOCKS &&                             \
             has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS && \
             num_threads == NUM_THREADS) {                                     \
        cudaFuncSetAttribute(                                                  \
            Marlin_wrapper<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,           \
                   THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages,              \
                   HAS_ACT_ORDER, GROUP_BLOCKS>,                               \
            cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);      \
        Marlin_wrapper<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,               \
               THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER,   \
               GROUP_BLOCKS><<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
            A_ptr, B_ptr, C_ptr, s_ptr, g_idx_ptr, num_groups, prob_m_ptr, prob_n, \
            prob_k, locks);                                                    \
    }

    typedef struct {
        int thread_k;
        int thread_n;
        int num_threads;
    } thread_config_t;

    typedef struct {
        int max_m_blocks;
        thread_config_t tb_cfg;
    } exec_config_t;

    thread_config_t small_batch_thread_configs[] = {
        // Ordered by priority

        // thread_k, thread_n, num_threads
        {128, 128, 256},
        {64, 128, 128},
        {128, 64, 128},
    };

    thread_config_t large_batch_thread_configs[] = {
        // Ordered by priority

        // thread_k, thread_n, num_threads
        {64, 256, 256},
        // {128, 128, 256},
        {64, 128, 128},
        {128, 64, 128},

    };

    int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
        int prob_n, int prob_k, int num_bits, int group_size,
        bool has_act_order, bool is_k_full) {
        bool cache_scales_chunk = has_act_order && !is_k_full;

        int tb_n = th_config.thread_n;
        int tb_k = th_config.thread_k;

        // Get max scale groups per thread-block
        int tb_groups;
        if (group_size == -1) {
            tb_groups = 1;
        }
        else if (group_size == 0) {
            tb_groups = div_ceil(tb_k, 32); // Worst case is 32 group size
        }
        else {
            tb_groups = div_ceil(tb_k, group_size);
        }

        if (cache_scales_chunk) {
            int load_groups =
                tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K
            load_groups = max(load_groups, 32); // We load at least 32 scale groups
            return load_groups * tb_n * 2;

        }
        else {
            int tb_scales = tb_groups * tb_n * 2;

            return tb_scales * pipe_stages;
        }
    }

    bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
        int prob_m, int prob_n, int prob_k, int num_bits,
        int scales_cache_size, int max_shared_mem) {
        int pack_factor = 32 / num_bits;

        // Get B size
        int tb_k = th_config.thread_k;
        int tb_n = th_config.thread_n;

        int b_size = (tb_k * tb_n / pack_factor) * 4;

        // Get A size
        int m_blocks = div_ceil(prob_m, 16);
        int tb_max_m = 16;

        // zbx: too ugly
        // origin
        /*while (true) {
          if (m_blocks >= max_m_blocks) {
            tb_max_m *= max_m_blocks;
            break;
          }

          max_m_blocks--;
          if (max_m_blocks == 0) {
            TORCH_CHECK(false, "Unexpected m_blocks = ", m_blocks);
          }
        }*/
        // refactor
        tb_max_m *= std::min(m_blocks, max_m_blocks);

        int a_size = (tb_max_m * tb_k) * 2;

        float pipe_size = (a_size + b_size) * pipe_stages;

        TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity
        return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
    }

    bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,
        int prob_m, int prob_n, int prob_k, int num_bits,
        int group_size, bool has_act_order, bool is_k_full,
        int max_shared_mem) {
        // Sanity
        if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
            th_config.num_threads == -1) {
            return false;
        }

        // Verify K/N are divisible by thread K/N
        if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
            return false;
        }

        // Verify min for thread K/N
        if (th_config.thread_n < min_thread_n ||
            th_config.thread_k < min_thread_k) {
            return false;
        }

        // num_threads must be at least 128 (= 4 warps)
        if (th_config.num_threads < 128) {
            return false;
        }

        //  Determine cache for scales
        int scales_cache_size =
            get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
                group_size, has_act_order, is_k_full);

        // Check that pipeline fits into cache
        if (!is_valid_cache_size(th_config, max_m_blocks, prob_m, prob_n, prob_k,
            num_bits, scales_cache_size, max_shared_mem)) {
            return false;
        }

        return true;
    }

    exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
        int num_bits, int group_size,
        bool has_act_order, bool is_k_full,
        int max_shared_mem) {
        int max_m_blocks = 4;
        while (max_m_blocks > 0) {
            if (prob_m <= 16) {
                for (auto th_config : small_batch_thread_configs) {
                    if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n,
                        prob_k, num_bits, group_size, has_act_order,
                        is_k_full, max_shared_mem)) {
                        return exec_config_t{ max_m_blocks, th_config };
                    }
                }
            }
            else {
                for (auto th_config : large_batch_thread_configs) {
                    if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n,
                        prob_k, num_bits, group_size, has_act_order,
                        is_k_full, max_shared_mem)) {
                        return exec_config_t{ max_m_blocks, th_config };
                    }
                }
            }

            max_m_blocks--; // Process less M blocks per invocation to reduce cache
            // usage
        }

        return exec_config_t{ 0, {-1, -1, -1} };
    }

#define CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS)                     \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)          \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)        

    template <typename scalar_t>
    void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
        void* g_idx, void* perm, void* a_tmp, int* prob_m_ptr, int prob_m,
        int prob_n, int prob_k, void* workspace, int num_bits,
        bool has_act_order, bool is_k_full, int num_groups,
        int group_size, int dev, cudaStream_t stream, int thread_k,
        int thread_n, int sms, int max_par) {
        TORCH_CHECK(num_bits == 4 || num_bits == 8,
            "num_bits must be 4 or 8. Got = ", num_bits);
        TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [",
            prob_m, ", ", prob_n, ", ", prob_k, "]");

        int tot_m = prob_m;
        int tot_m_blocks = div_ceil(tot_m, 16);
        int pad = 16 * tot_m_blocks - tot_m;

        if (sms == -1) {
            cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
        }

        int max_shared_mem = 0;
        cudaDeviceGetAttribute(&max_shared_mem,
            cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
        TORCH_CHECK(max_shared_mem > 0);

        // Set thread config
        exec_config_t exec_cfg;
        if (thread_k != -1 && thread_n != -1) {
            // User-defined config
            exec_cfg = exec_config_t{
                4, thread_config_t{thread_k, thread_n, default_threads} };
        }
        else {
            // Auto config
            exec_cfg = determine_thread_config(prob_m, prob_n, prob_k, num_bits,
                group_size, has_act_order, is_k_full,
                max_shared_mem);
        }

        TORCH_CHECK(
            exec_cfg.max_m_blocks > 0 &&
            is_valid_config(exec_cfg.tb_cfg, exec_cfg.max_m_blocks, prob_m,
                prob_n, prob_k, num_bits, group_size, has_act_order,
                is_k_full, max_shared_mem),
            "Invalid thread config: max_m_blocks = ", exec_cfg.max_m_blocks,
            ", thread_k = ", exec_cfg.tb_cfg.thread_k,
            ", thread_n = ", exec_cfg.tb_cfg.thread_n,
            ", num_threads = ", exec_cfg.tb_cfg.num_threads, " for MKN = [", prob_m,
            ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
            ", group_size = ", group_size, ", has_act_order = ", has_act_order,
            ", is_k_full = ", is_k_full, ", max_shared_mem = ", max_shared_mem);

        int num_threads = exec_cfg.tb_cfg.num_threads;
        thread_k = exec_cfg.tb_cfg.thread_k;
        thread_n = exec_cfg.tb_cfg.thread_n;

        int thread_k_blocks = thread_k / 16;
        int thread_n_blocks = thread_n / 16;

        int blocks = sms;

        TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
            " is not divisible by thread_n = ", thread_n);
        TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
            " is not divisible by thread_k = ", thread_k);

        int group_blocks = 0;
        if (has_act_order) {
            if (is_k_full) {
                TORCH_CHECK(group_size != -1);
                group_blocks = group_size / 16;
                TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
                    " is not divisible by group_blocks = ", group_blocks);
            }
            else {
                TORCH_CHECK(group_size == 0);
                group_blocks = 0;
            }

        }
        else {
            if (group_size == -1) {
                group_blocks = -1;
            }
            else {
                group_blocks = group_size / 16;
                TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
                    " is not divisible by group_blocks = ", group_blocks);
            }
        }

        const int4* A_ptr = (const int4*)A;
        const int4* B_ptr = (const int4*)B;
        int4* C_ptr = (int4*)C;
        const int4* s_ptr = (const int4*)s;
        const int* g_idx_ptr = (const int*)g_idx;
        const int* perm_ptr = (const int*)perm;
        int4* a_tmp_ptr = (int4*)a_tmp;

        int* locks = (int*)workspace;

        if (has_act_order) {
            // Permute A columns
            int block_rows = div_ceil(prob_m, blocks);
            permute_cols_kernel << <blocks, default_threads, 0, stream >> > (
                A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, block_rows);
            A_ptr = a_tmp_ptr;
        }

        // If we have a full K, then we can run the non-act-order version of Marlin
        // (since the weight rows are reordered by increasing group ids, and by
        // having a full K, we have full original groups)
        if (is_k_full) {
            has_act_order = false;
        }

        // Main loop
        for (int i = 0; i < tot_m_blocks; i += exec_cfg.max_m_blocks) {
            int thread_m_blocks = tot_m_blocks - i;
            prob_m = tot_m - 16 * i;
            int par = 1;
            if (thread_m_blocks > exec_cfg.max_m_blocks) {
                // Note that parallel > 1 currently only works for inputs without
                // any padding
                par = (16 * thread_m_blocks - pad) / (16 * exec_cfg.max_m_blocks);
                if (par > max_par)
                    par = max_par;
                prob_m = (16 * exec_cfg.max_m_blocks) * par;
                i += exec_cfg.max_m_blocks * (par - 1);
                thread_m_blocks = exec_cfg.max_m_blocks;
            }

            // Define kernel configurations
#define undefined_error                                                        \
    TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +    \
                           str(prob_n) + ", " + str(prob_k) + "]" +            \
                           ", has_act_order = " + str(has_act_order) +         \
                           ", num_groups = " + str(num_groups) +               \
                           ", group_size = " + str(group_size) +               \
                           ", thread_m_blocks = " + str(thread_m_blocks) +     \
                           ", thread_n_blocks = " + str(thread_n_blocks) +     \
                           ", thread_k_blocks = " + str(thread_k_blocks));

        /* std::cout << "MNK = [" + str(prob_m) + ", " + \
             str(prob_n) + ", " + str(prob_k) + "]" + \
             ", has_act_order = " + str(has_act_order) + \
             ", num_groups = " + str(num_groups) + \
             ", group_size = " + str(group_size) + \
             ", thread_m_blocks = " + str(thread_m_blocks) + \
             ", thread_n_blocks = " + str(thread_n_blocks) + \
             ", thread_k_blocks = " + str(thread_k_blocks) << std::endl;*/

             /*if (false) {
             }
             // CALL_IF(4, 32, 2, 256)
             // CALL_IF(4, 16, 4, 256)
             __CALL_IF(4, 1, 16, 4, false, 4, 256)
             __CALL_IF(4, 2, 16, 4, false, 4, 256)
             // CALL_IF(4, 8, 8, 256)
             __CALL_IF(4, 1, 8, 8, false, 4, 256)
             __CALL_IF(4, 2, 8, 8, false, 4, 256)
             // CALL_IF(4, 16, 4, 128)
             __CALL_IF(4, 1, 16, 4, false, 4, 128)
             __CALL_IF(4, 2, 16, 4, false, 4, 128)
             // CALL_IF(4, 8, 8, 128)
             __CALL_IF(4, 1, 8, 8, false, 4, 128)
             __CALL_IF(4, 2, 8, 8, false, 4, 128)
             else {undefined_error}*/

            if (num_bits == 4 && num_threads == 256)
            {
                if (false) {
                }
                CALL_IF(4, 32, 2, 256)
                    CALL_IF(4, 16, 4, 256)
                    CALL_IF(4, 8, 8, 256)
                else {
                    undefined_error
                }
            }
            else if (num_bits == 4 && num_threads == 128)
            {
                if (false) {
                }
                CALL_IF(4, 8, 4, 128)
                    CALL_IF(4, 16, 4, 128)
                    CALL_IF(4, 4, 8, 128)
                else {
                    undefined_error
                }
            }
            // else if (num_bits == 8 && num_threads == 256)
            // {
            //     if (false) {
            //     }
            //     CALL_IF(8, 32, 2, 256)
            //     CALL_IF(8, 16, 4, 256)
            //     CALL_IF(8, 8, 8, 256)
            //     else {
            //         undefined_error
            //     }
            // }
            // else if (num_bits == 8 && num_threads == 128)
            // {
            //     if (false) {
            //     }
            //     CALL_IF(8, 8, 4, 128)
            //     CALL_IF(8, 16, 4, 128)
            //     CALL_IF(8, 4, 8, 128)
            //     else {
            //         undefined_error
            //     }
            // }
            else {
                undefined_error
            }

            A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
            C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
        }
    }

} // namespace gptq_marlin

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
    torch::Tensor& b_scales, torch::Tensor& g_idx,
    torch::Tensor& perm, torch::Tensor& workspace,
    int64_t num_bits, torch::Tensor size_m_tensor, int64_t size_m, int64_t size_n,
    int64_t size_k, int sms, bool is_k_full) {
    const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
    // Verify num_bits
    TORCH_CHECK(num_bits == 4 || num_bits == 8,
        "num_bits must be 4 or 8. Got = ", num_bits);
    int pack_factor = 32 / num_bits;

    // Verify A
    TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
        ", size_m = ", size_m);
    TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1),
        ", size_k = ", size_k);

    // Verify B
    TORCH_CHECK(size_k % gptq_marlin::tile_size == 0, "size_k = ", size_k,
        " is not divisible by tile_size = ", gptq_marlin::tile_size);
    TORCH_CHECK((size_k / gptq_marlin::tile_size) == b_q_weight.size(0),
        "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
        ", size_k = ", size_k,
        ", tile_size = ", gptq_marlin::tile_size);
    TORCH_CHECK(b_q_weight.size(1) % gptq_marlin::tile_size == 0,
        "b_q_weight.size(1) = ", b_q_weight.size(1),
        " is not divisible by tile_size = ", gptq_marlin::tile_size);
    int actual_size_n =
        (b_q_weight.size(1) / gptq_marlin::tile_size) * pack_factor;
    TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
        ", actual_size_n = ", actual_size_n);

    // Verify device and strides
    TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
    TORCH_CHECK(a.is_contiguous(), "A is not contiguous");

    TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
    TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");

    TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
    TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");

    TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
    TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");

    TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
    TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");

    // Alloc buffers
    auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
    torch::Tensor c = torch::empty({ size_m, size_n }, options);
    torch::Tensor a_tmp = torch::empty({ size_m, size_k }, options);

    // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
    // auto -1)
    int thread_k = -1;
    // thread_n: `n` size of a thread_tile in `weights` (can usually be left as
    // auto -1)
    int thread_n = -1;
    // sms: number of SMs to use for the kernel (can usually be left as auto -1)
    // int sms = -1; //zbx

    // Verify g_idx and perm
    TORCH_CHECK((g_idx.size(0) == 0 && perm.size(0) == 0) ||
        (g_idx.size(0) == size_k && perm.size(0) == size_k),
        "Unexpected g_idx.size(0) = ", g_idx.size(0),
        " and perm.size(0) = ", perm.size(0),
        ", where size_k = ", size_k);

    // Detect groupsize and act_order
    int num_groups = -1;
    int group_size = -1;
    bool has_act_order = g_idx.size(0) != 0;

    int b_rank = b_scales.sizes().size();
    TORCH_CHECK(b_rank == 2, "b_scales rank = ", b_rank, " is not 2");
    TORCH_CHECK(b_scales.size(1) == size_n,
        "b_scales dim 1 = ", b_scales.size(1),
        " is not size_n = ", size_n);
    num_groups = b_scales.size(0);

    if (has_act_order) {
        if (is_k_full) {
            TORCH_CHECK(num_groups > 1,
                "For act_order, num_groups must be > 1");
            TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k,
                ", is not divisible by num_groups = ", num_groups);
            group_size = size_k / num_groups;
        }
        else {
            group_size = 0;
        }

    }
    else {
        if (num_groups > 1) {
            TORCH_CHECK(
                size_k % num_groups == 0, "size_k = ", size_k,
                ", is not divisible by b_scales.size(0) = ", b_scales.size(0));
            group_size = size_k / num_groups;
        }
        else {
            group_size = -1;
        }
    }

    // Verify workspace size
    TORCH_CHECK(
        size_n % gptq_marlin::min_thread_n == 0, "size_n = ", size_n,
        ", is not divisible by min_thread_n = ", gptq_marlin::min_thread_n);
    int min_workspace_size =
        (size_n / gptq_marlin::min_thread_n) * gptq_marlin::max_par;
    TORCH_CHECK(workspace.numel() >= min_workspace_size,
        "workspace.numel = ", workspace.numel(),
        " is below min_workspace_size = ", min_workspace_size);

    int dev = a.get_device();
    if (a.scalar_type() == at::ScalarType::Half) {
        gptq_marlin::marlin_mm_f16i4<half>(
            a.data_ptr<at::Half>(), b_q_weight.data_ptr(),
            c.data_ptr<at::Half>(), b_scales.data_ptr<at::Half>(),
            g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::Half>(),
            size_m_tensor.data_ptr<int>(),
            size_m, size_n, size_k, workspace.data_ptr(), num_bits,
            has_act_order, is_k_full, num_groups, group_size, dev,
            at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
            gptq_marlin::max_par);
    }
    else if (a.scalar_type() == at::ScalarType::BFloat16) {
        gptq_marlin::marlin_mm_f16i4<nv_bfloat16>(
            a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
            c.data_ptr<at::BFloat16>(), b_scales.data_ptr<at::BFloat16>(),
            g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
            size_m_tensor.data_ptr<int>(),
            size_m, size_n, size_k, workspace.data_ptr(), num_bits,
            has_act_order, is_k_full, num_groups, group_size, dev,
            at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
            gptq_marlin::max_par);
    }
    else {
        TORCH_CHECK(false,
            "gpt_marlin_gemm only supports bfloat16 and float16");
    }

    return c;
}

#endif

================================================
FILE: kt-sft/csrc/custom_marlin/gptq_marlin/gptq_marlin.cuh
================================================
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyrigth 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#pragma once

#include <torch/all.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>

namespace gptq_marlin {

// 8 warps are a good choice since every SM has 4 schedulers and having more
// than 1 warp per schedule allows some more latency hiding. At the same time,
// we want relatively few warps to have many registers per warp and small tiles.
static constexpr int default_threads = 256;

static constexpr int pipe_stages =
    4; // 4 pipeline stages fit into shared memory

static constexpr int min_thread_n = 64;
static constexpr int min_thread_k = 64;

static constexpr int tile_size = 16;
static constexpr int max_par = 16;

template <typename T, int n> struct Vec {
    T elems[n];
    __device__ T &operator[](int i) { return elems[i]; }
};

using I4 = Vec<int, 4>;

constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// No support for async
#else

__device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr,
                                      bool pred = true) {
    const int BYTES = 16;
    uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
    asm volatile("{\n"
                 "   .reg .pred p;\n"
                 "   setp.ne.b32 p, %0, 0;\n"
                 "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
                 "}\n" ::"r"((int)pred),
                 "r"(smem), "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async4(void *smem_ptr, const void *glob_ptr) {
    const int BYTES = 16;
    uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
    asm volatile("{\n"
                 "   cp.async.cg.shared.global [%0], [%1], %2;\n"
                 "}\n" ::"r"(smem),
                 "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async_fence() {
    asm volatile("cp.async.commit_group;\n" ::);
}

template <int n> __device__ inline void cp_async_wait() {
    asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}

#endif

} // namespace gptq_marlin

================================================
FILE: kt-sft/csrc/custom_marlin/gptq_marlin/gptq_marlin_dtypes.cuh
================================================
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyrigth 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#ifndef _data_types_cuh
#define _data_types_cuh
#include "gptq_marlin.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>

namespace gptq_marlin {

template <typename scalar_t> class ScalarType {};

template <> class ScalarType<half> {
  public:
    using scalar_t = half;
    using scalar_t2 = half2;

    // Matrix fragments for tensor core instructions; their precise layout is
    // documented here:
    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
    using FragA = Vec<half2, 4>;
    using FragB = Vec<half2, 2>;
    using FragC = Vec<float, 4>;
    using FragS = Vec<half2, 1>;

    static __device__ float inline num2float(const half x) {
        return __half2float(x);
    }

    static __device__ half2 inline num2num2(const half x) {
        return __half2half2(x);
    }

    static __device__ half2 inline nums2num2(const half x1, const half x2) {
        return __halves2half2(x1, x2);
    }

    static __host__ __device__ half inline float2num(const float x) {
        return __float2half(x);
    }
};

template <> class ScalarType<nv_bfloat16> {
  public:
    using scalar_t = nv_bfloat16;
    using scalar_t2 = nv_bfloat162;

    using FragA = Vec<nv_bfloat162, 4>;
    using FragB = Vec<nv_bfloat162, 2>;
    using FragC = Vec<float, 4>;
    using FragS = Vec<nv_bfloat162, 1>;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
    static __device__ float inline num2float(const nv_bfloat16 x) {
        return __bfloat162float(x);
    }

    static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
        return __bfloat162bfloat162(x);
    }

    static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
                                                    const nv_bfloat16 x2) {
        return __halves2bfloat162(x1, x2);
    }

    static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
        return __float2bfloat16(x);
    }
#endif
};

} // namespace gptq_marlin

#endif

================================================
FILE: kt-sft/csrc/custom_marlin/gptq_marlin/gptq_marlin_repack.cu
================================================
#include "gptq_marlin.cuh"

namespace gptq_marlin {

static constexpr int repack_stages = 8;

static constexpr int repack_threads = 256;

static constexpr int tile_k_size = tile_size;
static constexpr int tile_n_size = tile_k_size * 4;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800

template <int const num_threads, int const num_bits, bool const has_perm>
__global__ void marlin_repack_kernel(
    uint32_t const* __restrict__ b_q_weight_ptr,
    uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr,
    int size_k, int size_n) {}

}  // namespace gptq_marlin

torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits) {
  TORCH_CHECK_NOT_IMPLEMENTED(
      false, "marlin_repack_from_gptq(..) requires CUDA_ARCH >= 8.0");
  return torch::empty({1, 1});
}

#else

template <int const num_threads, int const num_bits, bool const has_perm>
__global__ void marlin_repack_kernel(
    uint32_t const* __restrict__ b_q_weight_ptr,
    uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr,
    int size_k, int size_n) {
  constexpr int pack_factor = 32 / num_bits;

  int k_tiles = size_k / tile_k_size;
  int n_tiles = size_n / tile_n_size;
  int block_k_tiles = div_ceil(k_tiles, gridDim.x);

  int start_k_tile = blockIdx.x * block_k_tiles;
  if (start_k_tile >= k_tiles) {
    return;
  }

  int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);

  // Wait until the next thread tile has been loaded to shared memory.
  auto wait_for_stage = [&]() {
    // We only have `stages - 2` active fetches since we are double buffering
    // and can only issue the next fetch when it is guaranteed that the previous
    // shared memory load is fully complete (as it may otherwise be
    // overwritten).
    cp_async_wait<repack_stages - 2>();
    __syncthreads();
  };

  extern __shared__ int4 sh[];

  constexpr int perm_size = tile_k_size / 4;

  int4* sh_perm_ptr = sh;
  int4* sh_pipe_ptr = sh_perm_ptr;
  if constexpr (has_perm) {
    sh_pipe_ptr += perm_size;
  }

  constexpr int tile_ints = tile_k_size / pack_factor;

  constexpr int stage_n_threads = tile_n_size / 4;
  constexpr int stage_k_threads = has_perm ? tile_k_size : tile_ints;
  constexpr int stage_size = stage_k_threads * stage_n_threads;

  auto load_perm_to_shared = [&](int k_tile_id) {
    int first_k_int4 = (k_tile_id * tile_k_size) / 4;

    int4 const* perm_int4_ptr = reinterpret_cast<int4 const*>(perm_ptr);

    if (threadIdx.x < perm_size) {
      sh_perm_ptr[threadIdx.x] = perm_int4_ptr[first_k_int4 + threadIdx.x];
    }
    __syncthreads();
  };

  auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
    if (n_tile_id >= n_tiles) {
      cp_async_fence();
      return;
    }

    int first_n = n_tile_id * tile_n_size;

    int4* sh_ptr = sh_pipe_ptr + stage_size * pipe;

    if constexpr (has_perm) {
      if (threadIdx.x < stage_size) {
        int k_id = threadIdx.x / stage_n_threads;
        int n_id = threadIdx.x % stage_n_threads;

        uint32_t const* sh_perm_int_ptr =
            reinterpret_cast<uint32_t const*>(sh_perm_ptr);

        int src_k = sh_perm_int_ptr[k_id];
        int src_k_packed = src_k / pack_factor;

        cp_async4(
            &sh_ptr[k_id * stage_n_threads + n_id],
            reinterpret_cast<int4 const*>(&(
                b_q_weight_ptr[src_k_packed * size_n + first_n + (n_id * 4)])));
      }

    } else {
      if (threadIdx.x < stage_size) {
        int k_id = threadIdx.x / stage_n_threads;
        int n_id = threadIdx.x % stage_n_threads;

        int first_k = k_tile_id * tile_k_size;
        int first_k_packed = first_k / pack_factor;

        cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
                  reinterpret_cast<int4 const*>(
                      &(b_q_weight_ptr[(first_k_packed + k_id) * size_n +
                                       first_n + (n_id * 4)])));
      }
    }

    cp_async_fence();
  };

  auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
    if (n_tile_id >= n_tiles) {
      return;
    }

    int warp_id = threadIdx.x / 32;
    int th_id = threadIdx.x % 32;

    if (warp_id >= 4) {
      return;
    }

    int tc_col = th_id / 4;
    int tc_row = (th_id % 4) * 2;

    constexpr int tc_offsets[4] = {0, 1, 8, 9};

    int cur_n = warp_id * 16 + tc_col;

    constexpr int sh_stride = 64;
    constexpr uint32_t mask = (1 << num_bits) - 1;

    int4* sh_stage_ptr = sh_pipe_ptr + stage_size * pipe;
    uint32_t* sh_stage_int_ptr = reinterpret_cast<uint32_t*>(sh_stage_ptr);

    uint32_t* sh_perm_int_ptr = reinterpret_cast<uint32_t*>(sh_perm_ptr);

    uint32_t vals[8];

    if constexpr (has_perm) {
      for (int i = 0; i < 4; i++) {
        int k_idx = tc_row + tc_offsets[i];

        uint32_t src_k = sh_perm_int_ptr[k_idx];
        uint32_t src_k_pos = src_k % pack_factor;

        uint32_t b1_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n];
        uint32_t b1_cur_val = (b1_val >> (src_k_pos * num_bits)) & mask;

        uint32_t b2_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n + 8];
        uint32_t b2_cur_val = (b2_val >> (src_k_pos * num_bits)) & mask;

        vals[i] = b1_cur_val;
        vals[4 + i] = b2_cur_val;
      }

    } else {
      uint32_t b1_vals[tile_ints];
      uint32_t b2_vals[tile_ints];

  #pragma unroll
      for (int i = 0; i < tile_ints; i++) {
        b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i];
        b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i];
      }

  #pragma unroll
      for (int i = 0; i < 4; i++) {
        int cur_elem = tc_row + tc_offsets[i];
        int cur_int = cur_elem / pack_factor;
        int cur_pos = cur_elem % pack_factor;

        vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask;
        vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask;
      }
    }

    constexpr int tile_size = tile_k_size * tile_n_size / pack_factor;
    int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;

    // Result of:
    // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
    if constexpr (num_bits == 4) {
      constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};

      uint32_t res = 0;
  #pragma unroll
      for (int i = 0; i < 8; i++) {
        res |= vals[pack_idx[i]] << (i * 4);
      }

      out_ptr[out_offset + th_id * 4 + warp_id] = res;

    } else {
      constexpr int pack_idx[4] = {0, 2, 1, 3};

      uint32_t res1 = 0;
      uint32_t res2 = 0;
  #pragma unroll
      for (int i = 0; i < 4; i++) {
        res1 |= vals[pack_idx[i]] << (i * 8);
        res2 |= vals[4 + pack_idx[i]] << (i * 8);
      }

      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
    }
  };

  auto start_pipes = [&](int k_tile_id, int n_tile_id) {
  #pragma unroll
    for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
      fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
    }

    wait_for_stage();
  };
  #pragma unroll
  for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
    int n_tile_id = 0;

    if constexpr (has_perm) {
      load_perm_to_shared(k_tile_id);
    }

    start_pipes(k_tile_id, n_tile_id);

    while (n_tile_id < n_tiles) {
  #pragma unroll
      for (int pipe = 0; pipe < repack_stages; pipe++) {
        fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
                        n_tile_id + pipe + repack_stages - 1);
        repack_tile(pipe, k_tile_id, n_tile_id + pipe);
        wait_for_stage();
      }
      n_tile_id += repack_stages;
    }
  }
}

}  // namespace gptq_marlin

  #define CALL_IF(NUM_BITS, HAS_PERM)                                          \
    else if (num_bits == NUM_BITS && has_perm == HAS_PERM) {                   \
      cudaFuncSetAttribute(                                                    \
          gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads,       \
                                            NUM_BITS, HAS_PERM>,               \
          cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);        \
      gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, NUM_BITS, \
                                        HAS_PERM>                              \
          <<<blocks, gptq_marlin::repack_threads, max_shared_mem, stream>>>(   \
              b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n);              \
    }

torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits) {
  // Verify compatibility with marlin tile of 16x64
  TORCH_CHECK(size_k % gptq_marlin::tile_k_size == 0, "size_k = ", size_k,
              " is not divisible by tile_k_size = ", gptq_marlin::tile_k_size);
  TORCH_CHECK(size_n % gptq_marlin::tile_n_size == 0, "size_n = ", size_n,
              " is not divisible by tile_n_size = ", gptq_marlin::tile_n_size);

  TORCH_CHECK(num_bits == 4 || num_bits == 8,
              "num_bits must be 4 or 8. Got = ", num_bits);
  int const pack_factor = 32 / num_bits;

  // Verify B
  TORCH_CHECK((size_k / pack_factor) == b_q_weight.size(0),
              "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
              ", size_k = ", size_k, ", pack_factor = ", pack_factor);
  TORCH_CHECK(b_q_weight.size(1) == size_n,
              "b_q_weight.size(1) = ", b_q_weight.size(1),
              " is not size_n = ", size_n);

  // Verify device and strides
  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
  TORCH_CHECK(b_q_weight.dtype() == at::kInt, "b_q_weight type is not kInt");

  TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
  TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
  TORCH_CHECK(perm.dtype() == at::kInt, "perm type is not at::kInt");

  // Alloc buffers
  const at::cuda::OptionalCUDAGuard device_guard(device_of(b_q_weight));
  auto options = torch::TensorOptions()
                     .dtype(b_q_weight.dtype())
                     .device(b_q_weight.device());
  torch::Tensor out =
      torch::empty({size_k / gptq_marlin::tile_size,
                    size_n * gptq_marlin::tile_size / pack_factor},
                   options);

  // Detect if there is act_order
  bool has_perm = perm.size(0) != 0;

  // Get ptrs
  uint32_t const* b_q_weight_ptr =
      reinterpret_cast<uint32_t const*>(b_q_weight.data_ptr());
  uint32_t const* perm_ptr = reinterpret_cast<uint32_t const*>(perm.data_ptr());
  uint32_t* out_ptr = reinterpret_cast<uint32_t*>(out.data_ptr());

  // Get dev info
  int dev = b_q_weight.get_device();
  cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
  int blocks;
  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);

  int max_shared_mem = 0;
  cudaDeviceGetAttribute(&max_shared_mem,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
  TORCH_CHECK(max_shared_mem > 0);

  if (false) {
  }
  CALL_IF(4, false)
  CALL_IF(4, true)
  CALL_IF(8, false)
  CALL_IF(8, true)
  else {
    TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits,
                ", has_perm = ", has_perm);
  }

  return out;
}

#endif

================================================
FILE: kt-sft/csrc/custom_marlin/gptq_marlin/ops.h
================================================
/**
 * @Description  :
 * @Author       : Azure
 * @Date         : 2024-07-22 09:27:55
 * @Version      : 1.0.0
 * @LastEditors  : Azure
 * @LastEditTime : 2024-07-26 08:35:00
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once

#include <torch/extension.h>
#include <torch/library.h>
#include <torch/torch.h>

torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
                               torch::Tensor &b_scales, torch::Tensor &g_idx,
                               torch::Tensor &perm, torch::Tensor &workspace,
                               int64_t num_bits, torch::Tensor size_m_tensor, int64_t size_m, int64_t size_n,
                               int64_t size_k, int sms, bool is_k_full);

torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor&perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits);

================================================
FILE: kt-sft/csrc/custom_marlin/setup.py
================================================
from setuptools import setup, Extension
from torch.utils import cpp_extension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
    name='vLLMMarlin',
    ext_modules=[
        CUDAExtension(
            'vLLMMarlin', [
                #'custom_gguf/dequant.cu',
                'binding.cpp',
                'gptq_marlin/gptq_marlin.cu',
                'gptq_marlin/gptq_marlin_repack.cu',
            ],
            extra_compile_args={
                'cxx': ['-O3'],
                'nvcc': [
                    '-O3',
                    '--use_fast_math',
                    '-Xcompiler', '-fPIC',
                ]
            },
        )
    ],
    cmdclass={'build_ext': BuildExtension}
)

================================================
FILE: kt-sft/csrc/custom_marlin/test_cuda_graph.py
================================================
import csv
import torch
import torch.nn as nn
import vLLMMarlin
torch.set_grad_enabled(False)
from utils.marlin_utils import (
	MarlinWorkspace,
	marlin_quantize,
	GPTQ_MARLIN_MIN_THREAD_N,
	GPTQ_MARLIN_MIN_THREAD_K,
	GPTQ_MARLIN_MAX_PARALLEL,
)

def setup_seed(seed):
	torch.manual_seed(seed)
	torch.cuda.manual_seed_all(seed)

setup_seed(20241223)

torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
global_dtype=torch.bfloat16
global_device=torch.device("cuda",0)
global_num_cases:int=int(50)
torch.cuda.set_device(0)
torch.backends.cudnn.enabled =True
torch.backends.cudnn.benchmark = True

max_batch_size = 512
max_tp = 8
L2_size = 73728 * 1024

def get_usable_mem():
	properties = torch.cuda.get_device_properties(global_device)
	#print(f"Total memory: {properties.total_memory / (1024 ** 3):.2f} GB")
	allocated_memory = torch.cuda.memory_allocated(global_device)
	#print(f"Currently allocated memory: {allocated_memory / (1024 ** 2):.2f} MB")
	reserved_memory = torch.cuda.memory_reserved(global_device)
	#print(f"Currently reserved memory: {reserved_memory / (1024 ** 2):.2f} MB")
	return properties.total_memory - 512 * 1024 ** 2 - allocated_memory# - reserved_memory

def exp_range(start, stop, step = 2):
	now = start
	while now <= stop:
		yield now
		now *= step

def timing(func, iters, epochs=100):
	#warmup
	for idx in range(iters):
		func(idx)
		
	torch.cuda.synchronize()
	cuda_graph = torch.cuda.CUDAGraph()
	with torch.cuda.graph(cuda_graph):
		for idx in range(iters):
			func(idx)

	for _ in range(2000):
		cuda_graph.replay()

	start_event = torch.cuda.Event(enable_timing=True)
	end_event = torch.cuda.Event(enable_timing=True)
	stream = torch.cuda.Stream()
	torch.cuda.synchronize()
	#with torch.cuda.stream(stream):
	start_event.record()
	for _ in range(10):
		cuda_graph.replay()
	end_event.record()
	torch.cuda.synchronize()
	elapsed_time_ms0 = start_event.elapsed_time(end_event)
	
	start_event = torch.cuda.Event(enable_timing=True)
	end_event = torch.cuda.Event(enable_timing=True)
	torch.cuda.synchronize()
	#with torch.cuda.stream(stream):
	start_event.record()
	for _ in range(epochs+10):
		cuda_graph.replay()
	end_event.record()
	torch.cuda.synchronize()
	elapsed_time_ms = start_event.elapsed_time(end_event) - elapsed_time_ms0
	
	#print(elapsed_time_ms0, elapsed_time_ms)
	return elapsed_time_ms/iters/epochs

class LinearMarlin(nn.Linear):
	marlin_q_w: torch.Tensor
	marlin_s: torch.Tensor
	g_idx: torch.Tensor
	sort_indices: torch.Tensor
	has_bias: bool
	def __init__(
		self,
		in_features,
		out_features,
		bias = False,
		device: str = "cuda",
		num_bits: int = 4,  # 4-bit/8-bit is supported
		group_size: int = 64,  # -1, 32, 64, 128
		act_order: bool = False,
		is_k_full=True,
		sms = -1, # sms in GPU
		**kwargs,
	):
		self.padding = False
		assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
		if in_features%GPTQ_MARLIN_MIN_THREAD_K!=0 or out_features%GPTQ_MARLIN_MIN_THREAD_K!=0:
			#print(f"warning!, in_features={in_features} or out_features={out_features} is undivisible by GPTQ_MARLIN_MIN_THREAD_K={GPTQ_MARLIN_MIN_THREAD_K} and GPTQ_MARLIN_MIN_THREAD_N={GPTQ_MARLIN_MIN_THREAD_N}, padding")
			self.padding = True
			self.orin_in_features = in_features
			self.orin_out_features = out_features
			in_features = (in_features+GPTQ_MARLIN_MIN_THREAD_K-1)//GPTQ_MARLIN_MIN_THREAD_K*GPTQ_MARLIN_MIN_THREAD_K
			out_features = (out_features+GPTQ_MARLIN_MIN_THREAD_N-1)//GPTQ_MARLIN_MIN_THREAD_N*GPTQ_MARLIN_MIN_THREAD_N
			#print(f"After padding: in_features={in_features}, out_features={out_features}")
			

		super().__init__(in_features, out_features, bias, device)
		self.has_bias = bias
		self.device = device
		self.num_bits = num_bits
		self.group_size = group_size
		self.act_order = act_order
		# TODO: optimize every shape GEMM
		
		blocks_k, blocks_n = in_features//128, out_features//128

		self.sms = sms

		self.is_k_full = is_k_full
		
		self.weight.requires_grad = False
		self.weight.t_()
		# Pack Marlin linear
		#w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
		#    self.weight, self.num_bits, self.group_size, self.act_order
		#)
		marlin_q_w = torch.randint(int(-1e9), int(1e9), (in_features//16, out_features*2), device=device, dtype=torch.int)
		marlin_s = torch.randn((in_features//64, out_features), device=device)
		self.workspace = MarlinWorkspace(
			self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL, self.device
		)
		self.marlin_q_w = marlin_q_w
		self.marlin_s = marlin_s
		self.g_idx = torch.empty((0), dtype=torch.int32, device=self.device)
		self.sort_indices = torch.empty((0), dtype=torch.int32, device=self.device)
		self.k = self.weight.shape[0]
		self.n = self.weight.shape[1]
		self.weight = None
		"""
		print(in_features, out_features)
		print(marlin_q_w.shape)
		print(marlin_q_w.dtype)
		print(marlin_s.shape)
		print(marlin_s.dtype)
		print(self.workspace.scratch.shape)
		print(self.workspace.scratch.dtype)
		print(self.g_idx.shape)
		print(self.g_idx.dtype)
		print(self.sort_indices.shape)
		print(self.sort_indices.dtype)
		#print(w_ref.shape)
		#print(w_ref.dtype)
		"""
		#w_ref = None

	def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor) -> torch.Tensor:
		# Only support input x as BF16 and FP16
		x = x.to(self.device)
		orig_shape = list(x.shape)
		orig_dtype = x.dtype
		x = x.reshape(-1, x.shape[-1])
		if self.padding:
			padding_input=torch.empty(x.shape[0], self.in_features, device=x.device, dtype=x.dtype)
			padding_input[:,:self.orin_in_features] = x
			x = padding_input
		marlin_s = self.marlin_s.to(x.dtype)
		#print(self.sms * ((orig_shape[0]+63)//64))
		
		sms = self.sms

		x = vLLMMarlin.gptq_marlin_gemm(
			x,
			self.marlin_q_w,
			marlin_s,
			self.g_idx,
			self.sort_indices,
			self.workspace.scratch,
			self.num_bits,
			bsz_tensor,
			x.shape[0],
			self.n,
			x.shape[-1],
			sms,
			self.is_k_full,
		)
		# TODO: don't padding bias
		if self.has_bias:
			x = x + self.bias
		if self.padding:
			x = x[:,:self.orin_out_features]
			orig_shape[-1] = self.orin_out_features
		else:
			orig_shape[-1] = self.out_features
		return x.reshape(orig_shape).to(orig_dtype)

def benchLinearMarlin(input_dim, output_dim):#, out_file
	print("benchmarking MLP Marlin")
	print("-----------------------------------------------------------")
	headers = ["batch_size", "tp", "used_time", "bandwidth GB/s", "TFLOPS", "cases", "padding", "sms"]
	print(" | ".join(headers) + "\n")
	rows = []
	for batch_size in exp_range(1, 64):
		for tp in exp_range(1, max_tp):
			torch.cuda.empty_cache()
			if output_dim % tp != 0:
				continue
			cur_output_dim = output_dim // tp
			modules = []
			inputs = []
			data_size = int(0.53125*input_dim*cur_output_dim)
			input_size = int(2*batch_size*input_dim)
			output_size = int(2*batch_size*cur_output_dim)
			usable_mem = get_usable_mem() - 2 * input_dim * cur_output_dim
			min_cases = max(global_num_cases, (2*L2_size) // (data_size+input_size))
			cases = int(min(min_cases, (usable_mem * 0.8) // (data_size+input_size)))
			#print(usable_mem, data_size, input_size, cases)
				
			bsz_tensor = torch.tensor([batch_size], device=global_device, dtype=torch.int32)

			if cases == 0:
				row = [f"{batch_size}", "OOM", "OOM", "OOM", "0", "False"]
				rows.append(row)
				break
			for _ in range(cases):
				modules.append(LinearMarlin(input_dim, cur_output_dim, sms=56, non_equal_division=False).to(device=global_device).eval())
				inputs.append(torch.randn(batch_size, 1, input_dim, device=global_device))
				
			def forward(case_id):
				modules[case_id](inputs[case_id], bsz_tensor)
				
			used_time = timing(forward, iters=cases)
			bandwidth = (data_size+input_size+output_size)/used_time/1e6
			flops = 2*batch_size*input_dim*cur_output_dim
			tflops = flops/used_time/1e9
			cur_sms = modules[0].sms
			row = [f"{batch_size}", f"{tp}", f"{used_time}", f"{bandwidth}", f"{tflops}", f"{cases}", modules[0].padding, cur_sms]
			rows.append(row)
			print(f"{batch_size}", f"{tp}", f"{used_time}", f"{bandwidth}", f"{tflops}", f"{cases}", modules[0].padding, cur_sms)
	
	"""
	with open(out_file, 'w', newline='') as csvfile:
		csvwriter = csv.writer(csvfile)
		csvwriter.writerow(headers)
		for row in rows:
			csvwriter.writerow(row)
	"""
	
	"""
	markdown_table = " | ".join(headers) + "\n"
	markdown_table += " | ".join(["---"] * len(headers)) + "\n"
	for row in rows:
		markdown_table += " | ".join(row) + "\n"

	print(markdown_table)
	"""
	#print("finish write file", out_file)
	#print("-------------------------------------------------------------")

if __name__ == "__main__":
	
	benchLinearMarlin(5120, 3584)
	exit(0)
	
	max_batch = 1
	cur_batch = 1


	marlin_linear = LinearMarlin(5120, 3584)

	input_tensor = torch.randn(max_batch, 1, 5120, device="cuda", dtype=torch.bfloat16)
	bsz_tensor = torch.tensor([max_batch], device="cuda", dtype=torch.int32)

	out_truth = marlin_linear(input_tensor, bsz_tensor)

	print(out_truth)

	g = torch.cuda.CUDAGraph()
	with torch.cuda.graph(g):
		out_buf = marlin_linear(input_tensor, bsz_tensor)
	
	for i in range(10000):
		g.replay()
	
	#torch.testing.assert_close(out_buf, out_truth, rtol=1e-3, atol=1e-3)
	
	marlin_linear = LinearMarlin(5120, 3584)
	g = torch.cuda.CUDAGraph()
	with torch.cuda.graph(g):
		out_buf = marlin_linear(input_tensor, bsz_tensor)
	
	new_input = torch.randn(cur_batch, 1, 5120, device="cuda", dtype=torch.bfloat16)
	bsz_tensor.copy_(torch.tensor([cur_batch], device="cuda", dtype=torch.int32))
	
	new_out_truth = marlin_linear(new_input, bsz_tensor)
	input_tensor[:cur_batch].copy_(new_input)
	input_tensor[cur_batch:] = 0
	
	g.replay()
	
	torch.cuda.synchronize()

	def printMinMax(tensor):
		abs_tensor = torch.abs(tensor)

		min_val = torch.min(abs_tensor)
		max_val = torch.max(abs_tensor)

		min_indices = (abs_tensor == min_val).nonzero(as_tuple=True)
		max_indices = (abs_tensor == max_val).nonzero(as_tuple=True)

		print(f"min: {min_val.item()}")
		print(f"min idx: {min_indices}")
		print(f"max: {max_val.item()}")
		print(f"max idx: {max_indices}")

	print(out_buf[:cur_batch].shape)
	print(new_out_truth.shape)


	printMinMax(out_buf[:cur_batch])
	printMinMax(new_out_truth)

	#torch.testing.assert_close(out_buf[:cur_batch, 0, :], new_out_truth[:cur_batch, 0, :], rtol=1e-3, atol=1e-3)


================================================
FILE: kt-sft/csrc/custom_marlin/utils/__init__.py
================================================


================================================
FILE: kt-sft/csrc/custom_marlin/utils/format24.py
================================================
#
# Modified by Roberto Lopez Castro (roberto.lopez.castro@udc.es).
#

import torch


# This is PyTorch implementation of main part of reorder_meta()
# function, from tools/util/include/cutlass/util/host_reorder.h file
# of CUTLASS source tree.  Furthermore, CUTLASS template for sparse
# GEMM decides upon layout of this matrix, and at the moment for the
# sparse GEMM executed on tensor cores, this is layout described by
# ColumnMajorInterleaved<2> data structure, in
# include/cutlass/layout/matrix.h of CUTLASS source tree.  The
# reordering of meta matrix into meta_reordered matrix calculated
# according to these segments of CUTLASS code is re-implemented here.
# Note that this calculation produces offsets for scattering metadata
# matrix elements into reordered metadata matrix elements (or,
# equivalently, for gathering reordered metadata matrix element back
# into metadata matrix elements).
def _calculate_meta_reordering_scatter_offsets(m, meta_ncols, meta_dtype,
                                               device):
    dst_rows = torch.arange(0, m, device=device)[:, None].repeat(1, meta_ncols)
    dst_cols = torch.arange(0, meta_ncols, device=device).repeat(m, 1)

    # Reorder the rows, then swizzle the 2x2 blocks.
    group_x = 64
    group_y = 32 if meta_dtype.itemsize == 2 else 16

    dst_rows = (dst_rows // group_x * group_x + (dst_rows % 2) * 2 +
                (dst_rows % 8) // 4 + ((dst_rows % group_y) % 4) // 2 * 32 +
                ((dst_rows % group_x) // 8) * 4)

    topright = ((dst_rows % 2 == 0) & (dst_cols % 2 == 1)).to(torch.int8)
    bottomleft = ((dst_rows % 2 == 1) & (dst_cols % 2 == 0)).to(torch.int8)
    dst_rows += topright - bottomleft
    dst_cols -= topright - bottomleft

    # Assumed that meta tensor is to be stored in CUTLASS
    # InterleavedColumnMajor layout, and reverse engineered
    # corresponding code to store values into this tensor.
    interleave = 2
    cols_maj = dst_cols // interleave
    cols_min = dst_cols % interleave
    return (cols_maj * m * interleave + dst_rows * interleave +
            cols_min).view(-1)


# This function converts dense matrix into sparse semi-structured
# representation, producing "compressed" matrix, in the layout used by
# CUTLASS backend, and corresponding metadata matrix.
def sparse_semi_structured_from_dense_cutlass(dense):
    if dense.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional dense tensor, got {dense.dim()}-dimensional tensor"  # noqa: E501
        )

    m, k = dense.shape
    device = dense.device

    meta_dtype = torch.int8
    if dense.dtype == torch.int8:
        meta_dtype = torch.int32
    elif dense.dtype in [torch.half, torch.bfloat16, torch.float, torch.int32]:
        meta_dtype = torch.int16
    else:
        raise RuntimeError(f"Invalid datatype {dense.dtype} of dense matrix")
    quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4
    if quadbits_per_meta_elem not in (4, 8):
        raise RuntimeError(
            "Invalid number of elements per meta element calculated")

    if meta_dtype == torch.int32:
        if m % 16 != 0:
            raise RuntimeError(
                f"Number of rows of dense matrix {m} must be divisible by 16")
    else:
        if m % 32 != 0:
            raise RuntimeError(
                f"Number of rows of dense matrix {m} must be divisible by 32")
    if k % (4 * quadbits_per_meta_elem) != 0:
        raise RuntimeError(
            f"Number of columns of dense matrix {k} must be divisible by {4 * quadbits_per_meta_elem}"  # noqa: E501
        )

    if dense.dtype != torch.float:
        ksparse = 4
        dense_4 = dense.view(-1, k // ksparse, ksparse)
        m0, m1, m2, m3 = (dense_4 != 0).unbind(-1)
    else:
        ksparse = 2
        dense_2 = dense.view(-1, k // ksparse, ksparse)
        m0, m2 = m1, m3 = (dense_2 != 0).unbind(-1)
    meta_ncols = k // (ksparse * quadbits_per_meta_elem)

    # Encoding quadruples of True/False values as follows:
    #     [True,  True,  False, False] -> 0b0100
    #     [True,  False, True,  False] -> 0b1000
    #     [False, True,  True,  False] -> 0b1001
    #     [True,  False, False, True ] -> 0b1100
    #     [False, True,  False, True ] -> 0b1101
    #     [False, False, True,  True ] -> 0b1110
    # Thus, lower two bits in the encoding are index of the True value
    # at the lowest index in the quadruple, and the higher two bits in
    # the encoding are index of the other True value in the quadruple.
    # In case there are less than two True values, than False value or
    # values at some index or indices are considered True for the
    # encoding.  In case there are more than two True values, then the
    # excess True value(s) at some indices are considered False for
    # the encoding.  The exact encodings used for these cases are as
    # follows:
    #     [False, False, False, False] -> 0b1110
    #     [False, False, False, True ] -> 0b1110
    #     [False, False, True,  False] -> 0b1110
    #     [False, True,  False, False] -> 0b1001
    #     [False, True,  True,  True ] -> 0b1101
    #     [True,  False, False, False] -> 0b1000
    #     [True,  False, True,  True ] -> 0b1100
    #     [True,  True,  False, True ] -> 0b0100
    #     [True,  True,  True,  False] -> 0b0100
    #     [True,  True,  True,  True ] -> 0b0100
    # These particular encodings are chosen, with the help of Espresso
    # logic minimizer software, for the purpose of minimization of
    # corresponding Boolean functions, that translate non-zero flags
    # into encoding bits.  Note also possible choices for the first
    # and last of these encodings were limited only to (0b0100,
    # 0b1110), in order to produce valid encodings for 1:2 sparsity
    # case.

    expr0 = m0 & m1
    expr1 = ~m0 & m1
    expr2 = ~m0 & ~m1
    bit0 = expr1
    bit1 = expr2
    bit2 = expr0 | expr2 | m3
    bit3 = expr1 | ~m1
    idxs0 = bit0 | (bit1.to(torch.int64) << 1)
    idxs1 = bit2 | (bit3.to(torch.int64) << 1)

    if dense.dtype != torch.float:
        sparse0 = dense_4.gather(
            -1, idxs0.unsqueeze(-1))  # type: ignore[possibly-undefined]
        sparse1 = dense_4.gather(-1, idxs1.unsqueeze(-1))
        sparse = torch.stack((sparse0, sparse1), dim=-1).view(m, k // 2)
    else:
        sparse = dense_2.gather(-1,
                                idxs0.unsqueeze(-1) // 2).view(
                                    m,
                                    k // 2)  # type: ignore[possibly-undefined]

    meta_4 = idxs0 | (idxs1 << 2)
    meta_n = meta_4.view(
        (-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype)

    if quadbits_per_meta_elem == 4:
        meta = (meta_n[:, :, 0]
                | (meta_n[:, :, 1] << 4)
                | (meta_n[:, :, 2] << 8)
                | (meta_n[:, :, 3] << 12))
    elif quadbits_per_meta_elem == 8:
        meta = (meta_n[:, :, 0]
                | (meta_n[:, :, 1] << 4)
                | (meta_n[:, :, 2] << 8)
                | (meta_n[:, :, 3] << 12)
                | (meta_n[:, :, 4] << 16)
                | (meta_n[:, :, 5] << 20)
                | (meta_n[:, :, 6] << 24)
                | (meta_n[:, :, 7] << 28))

    # Reorder meta tensor elements.
    meta_reordered = meta.new_empty(
        (m * meta_ncols, ))  # type: ignore[possibly-undefined]
    meta_offsets = _calculate_meta_reordering_scatter_offsets(
        m, meta_ncols, meta_dtype, device)
    meta_reordered.scatter_(0, meta_offsets, meta.view(-1))

    return (sparse, meta_reordered.view(m, meta_ncols))


# This function performs reverse of the function above - it
# reconstructs dense matrix from a pair of "compressed" matrix, given
# in the layout used by CUTLASS backend, and accompanying metadata
# matrix.
def sparse_semi_structured_to_dense_cutlass(sparse, meta_reordered):
    if sparse.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional sparse tensor, got {sparse.dim()}-dimensional tensor"  # noqa: E501
        )

    m, k = sparse.shape
    device = sparse.device

    if meta_reordered.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional meta tensor, got {meta_reordered.dim()}-dimensional tensor"  # noqa: E501
        )
    if meta_reordered.device != device:
        raise RuntimeError(
            f"Expected meta matrix to be on {device} device, got matrix on {meta_reordered.device} device"  # noqa: E501
        )

    meta_dtype = meta_reordered.dtype
    if meta_dtype not in (torch.int16, torch.int32):
        raise RuntimeError(f"Invalid datatype {meta_dtype} of meta matrix")
    quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4

    ksparse = 4 if sparse.dtype != torch.float else 2

    meta_nrows, meta_ncols = meta_reordered.shape
    if meta_nrows != m:
        raise RuntimeError(
            f"Number of rows of meta matrix {meta_nrows} must be equal to number of columns of spase matrix {m}"  # noqa: E501
        )
    if meta_ncols * ksparse * quadbits_per_meta_elem != 2 * k:
        raise RuntimeError(
            f"Number of columns of sparse matrix {k} different from the {meta_ncols * ksparse * quadbits_per_meta_elem // 2}, "  # noqa: E501
            "expected according to the number of columns of meta matrix")

    # Undo meta tensor elements reordering.
    meta_offsets = _calculate_meta_reordering_scatter_offsets(
        m, meta_ncols, meta_dtype, device)
    meta = torch.gather(meta_reordered.view(-1), 0,
                        meta_offsets).view(m, meta_ncols)

    # Unpack sparse tensor back to original dense tensor, using
    # information provided by meta tensor.  Note that torch.float
    # datatype is handled pretty much the same as
    # torch.half/torch.bfloat16, as metadata for a pair of torch.float
    # value is encoded as if underlying 8 bytes contain four
    # torch.half/torch.bfloat16 values, where either first two or last
    # two are zeros.
    meta_2 = torch.empty(
        (m, meta_ncols, 2 * quadbits_per_meta_elem),
        dtype=meta_dtype,
        device=device,
    )
    if quadbits_per_meta_elem == 4:
        meta_2[:, :, 0] = meta & 0b11
        meta_2[:, :, 1] = (meta >> 2) & 0b11
        meta_2[:, :, 2] = (meta >> 4) & 0b11
        meta_2[:, :, 3] = (meta >> 6) & 0b11
        meta_2[:, :, 4] = (meta >> 8) & 0b11
        meta_2[:, :, 5] = (meta >> 10) & 0b11
        meta_2[:, :, 6] = (meta >> 12) & 0b11
        meta_2[:, :, 7] = (meta >> 14) & 0b11
    elif quadbits_per_meta_elem == 8:
        meta_2[:, :, 0] = meta & 0b11
        meta_2[:, :, 1] = (meta >> 2) & 0b11
        meta_2[:, :, 2] = (meta >> 4) & 0b11
        meta_2[:, :, 3] = (meta >> 6) & 0b11
        meta_2[:, :, 4] = (meta >> 8) & 0b11
        meta_2[:, :, 5] = (meta >> 10) & 0b11
        meta_2[:, :, 6] = (meta >> 12) & 0b11
        meta_2[:, :, 7] = (meta >> 14) & 0b11
        meta_2[:, :, 8] = (meta >> 16) & 0b11
        meta_2[:, :, 9] = (meta >> 18) & 0b11
        meta_2[:, :, 10] = (meta >> 20) & 0b11
        meta_2[:, :, 11] = (meta >> 22) & 0b11
        meta_2[:, :, 12] = (meta >> 24) & 0b11
        meta_2[:, :, 13] = (meta >> 26) & 0b11
        meta_2[:, :, 14] = (meta >> 28) & 0b11
        meta_2[:, :, 15] = (meta >> 30) & 0b11

    dense_offsets = meta_2.view(-1) + (
        torch.arange(0, 2 * m * k // ksparse, device=device) * 4).view(
            -1, 1).repeat(1, 2).view(-1)

    dense = torch.zeros((m * 2 * k, ), dtype=sparse.dtype, device=device)
    if sparse.dtype != torch.float:
        # dense.scatter_(0, dense_offsets, sparse.view(-1))
        dense.scatter_(0, dense_offsets, sparse.reshape(-1))
    else:
        dense.view(torch.half).scatter_(0, dense_offsets,
                                        sparse.view(torch.half).view(-1))

    return dense.view(m, 2 * k)


def mask_creator(tensor):
    """
    Class for creating N:M sparsity masks.
    Masks will be created using the N:M ratio, where for every block of 
    M weights, N will be pruned based on ranked weight value. Each mask 
    will correspond to the given tensor.

    :param N: The number of weights in a group to keep
    :param M: The size of a weight group
    """
    N = 2
    M = 4

    mask = None
    # for i, tensor in enumerate(tensors):
    if tensor.numel() % M != 0:
        raise ValueError(
            f"Tensor of size {tensor.shape} can't be evenly divided into "
            f"{M} groups")

    num_groups = tensor.numel() // M

    # N:M sparsity for linear layers
    tensor_temp = tensor.detach().abs().reshape(num_groups, M)
    index = torch.argsort(tensor_temp, dim=1)[:, :int(M - N)]

    w_b = torch.ones(tensor_temp.shape, device=tensor_temp.device)
    mask = w_b.scatter_(dim=1, index=index, value=0).reshape(tensor.shape)

    return mask

================================================
FILE: kt-sft/csrc/custom_marlin/utils/marlin_24_perms.py
================================================
'''
Date: 2024-11-08 02:46:07
LastEditors: djw
LastEditTime: 2024-11-08 02:46:41
'''
"""This file is used for /tests and /benchmarks"""
from typing import Dict, List

import numpy
import torch


# Precompute permutations for Marlin24 weight and scale shuffling # noqa: E501
#
# Marlin works on [16*2,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501
# with the tensor-core format that is described here:
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
#
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
# (without the need to use ldmatrix instructions) # noqa: E501
def get_perms_24(num_bits: int):
    perm_list: List[int] = []
    for i in range(32):
        perm1: List[int] = []
        col = i // 4
        col_o = col // 2
        for block in [0, 1]:
            for row in [
                    2 * (i % 4),
                    2 * (i % 4) + 1,
                    2 * (i % 4 + 4),
                    2 * (i % 4 + 4) + 1,
            ]:
                perm1.append(16 * row + col_o * 256 + 8 * (col % 2) +
                             4 * block)
        for j in range(4):
            perm_list.extend([p + 1 * j for p in perm1])
    perm = numpy.array(perm_list)

    if num_bits == 4:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    elif num_bits == 8:
        interleave = numpy.array([0, 2, 1, 3])
    else:
        raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits))

    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
    perm = torch.from_numpy(perm)
    scale_perm: List[int] = []
    for i in range(8):
        scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]])
    scale_perm_single: List[int] = []
    for i in range(8):
        scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]])
    return perm, scale_perm, scale_perm_single


marlin_24_perm: Dict[int, torch.Tensor] = {}
marlin_24_scale_perm: Dict[int, List[int]] = {}
marlin_24_scale_perm_single: Dict[int, List[int]] = {}
for num_bits in [4, 8]:
    perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits)
    marlin_24_perm[num_bits] = perm_24
    marlin_24_scale_perm[num_bits] = scale_perm_24
    marlin_24_scale_perm_single[num_bits] = scale_perm_single_24

================================================
FILE: kt-sft/csrc/custom_marlin/utils/marlin_perms.py
================================================
'''
Date: 2024-11-08 02:46:47
LastEditors: djw
LastEditTime: 2024-11-08 02:46:55
'''
"""This file is used for /tests and /benchmarks"""
from typing import Dict, List

import numpy
import torch


# Precompute permutations for Marlin weight and scale shuffling # noqa: E501
#
# Marlin works on [16,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501
# with the tensor-core format that is described here:
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
#
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
# (without the need to use ldmatrix instructions) # noqa: E501
def get_perms(num_bits: int):
    perm_list: List[int] = []
    for i in range(32):
        perm1: List[int] = []
        col = i // 4
        for block in [0, 1]:
            for row in [
                    2 * (i % 4),
                    2 * (i % 4) + 1,
                    2 * (i % 4 + 4),
                    2 * (i % 4 + 4) + 1,
            ]:
                perm1.append(16 * row + col + 8 * block)
        for j in range(4):
            perm_list.extend([p + 256 * j for p in perm1])

    perm = numpy.array(perm_list)

    if num_bits == 4:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    elif num_bits == 8:
        interleave = numpy.array([0, 2, 1, 3])
    else:
        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))

    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
    perm = torch.from_numpy(perm)
    scale_perm: List[int] = []
    for i in range(8):
        scale_perm.extend([i + 8 * j for j in range(8)])
    scale_perm_single: List[int] = []
    for i in range(4):
        scale_perm_single.extend(
            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
    return perm, scale_perm, scale_perm_single


marlin_perm: Dict[int, torch.Tensor] = {}
marlin_scale_perm: Dict[int, List[int]] = {}
marlin_scale_perm_single: Dict[int, List[int]] = {}
for num_bits in [4, 8]:
    perm, scale_perm, scale_perm_single = get_perms(num_bits)
    marlin_perm[num_bits] = perm
    marlin_scale_perm[num_bits] = scale_perm
    marlin_scale_perm_single[num_bits] = scale_perm_single

================================================
FILE: kt-sft/csrc/custom_marlin/utils/marlin_utils.py
================================================
"""This file is used for /tests and /benchmarks"""
import random

import numpy
import torch

from .format24 import (
    mask_creator, sparse_semi_structured_from_dense_cutlass)
from .marlin_24_perms import (
    marlin_24_perm, marlin_24_scale_perm, marlin_24_scale_perm_single)
from .marlin_perms import (
    marlin_perm, marlin_scale_perm, marlin_scale_perm_single)
from .quant_utils import (
    get_pack_factor, quantize_weights, sort_weights, dequantize_weights)


__cuda_arch = torch.cuda.get_device_capability()

MARLIN_TILE = 16

GPTQ_MARLIN_TILE = 16
GPTQ_MARLIN_MIN_THREAD_N = 64
GPTQ_MARLIN_MIN_THREAD_K = 128
GPTQ_MARLIN_MAX_PARALLEL = 16

GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4, 8]
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
GPTQ_MARLIN_SUPPORTED_SYM = [True]

def is_marlin_supported():
    return __cuda_arch[0] >= 8


def marlin_permute_weights(q_w, size_k, size_n, perm, tile=MARLIN_TILE):
    assert q_w.shape == (size_k, size_n)
    assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}"
    assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}"

    # Permute weights to 16x64 marlin tiles
    q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile))
    q_w = q_w.permute((0, 2, 1, 3))
    q_w = q_w.reshape((size_k // tile, size_n * tile))

    q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape)

    return q_w


def marlin_weights(q_w, size_k, size_n, num_bits, perm):
    # Permute
    q_w = marlin_permute_weights(q_w, size_k, size_n, perm)

    # Pack
    pack_factor = get_pack_factor(num_bits)
    orig_device = q_w.device

    q_w = q_w.cpu().numpy().astype(numpy.uint32)

    q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor),
                           dtype=numpy.uint32)
    for i in range(pack_factor):
        q_packed |= q_w[:, i::pack_factor] << num_bits * i

    q_packed = torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device)

    return q_packed


def marlin_permute_scales(s, size_k, size_n, group_size, scale_perm,
                          scale_perm_single):
    if group_size < size_k and group_size != -1:
        s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
    else:
        s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
    s = s.reshape((-1, size_n)).contiguous()

    return s


def marlin_quantize(
    w: torch.Tensor,
    num_bits: int,
    group_size: int,
    act_order: bool,
):
    size_k, size_n = w.shape

    # Normalize group_size
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    # Quantize (and apply act_order if provided)
    w_ref, q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size,
                                                       act_order)

    # For act_order, sort the "weights" and "g_idx" so that group ids are
    # increasing
    sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
    if act_order:
        q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)

    # Reformat to marlin
    marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits,
                                marlin_perm[num_bits])
    marlin_s = marlin_permute_scales(s, size_k, size_n, group_size,
                                     marlin_scale_perm[num_bits],
                                     marlin_scale_perm_single[num_bits])

    # Create result
    res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
    for i in range(len(res_list)):
        res_list[i] = res_list[i].to(w.device)

    return res_list


def inject_24(w, size_k, size_n):
    assert w.shape == (size_k, size_n)

    mask = mask_creator(w.t()).t().cuda().bool()

    return (mask * w).contiguous(), mask.contiguous()


def check_24(w, num_rows_to_sample=50, _verbose=False):
    BLOCK_SIZE = 4
    MAX_NON_ZEROS = 2

    w = w.t().contiguous()

    print("check_24: w.shape = {}".format(w.shape))

    num_rows, num_cols = w.shape
    sampled_row_idxs = random.choices(range(num_rows), k=num_rows_to_sample)
    if _verbose:
        print(f"Sampled row idxs = {sampled_row_idxs}")

    total_segments = 0
    non_24_segments = 0
    for i in sampled_row_idxs:
        for j in range(0, num_cols - BLOCK_SIZE, BLOCK_SIZE):
            total_segments += 1
            block = w[i, j:j + BLOCK_SIZE]
            num_nonzero = torch.count_nonzero(block)
            if num_nonzero > MAX_NON_ZEROS:
                print("i = {} j = {} block = {}".format(i, j, block))
                non_24_segments += 1

    print(f"{non_24_segments} / {total_segments} do not have 2:4 structure.")


def compress_quantized_24_weight(q_24, size_k, size_n, num_bits):
    assert q_24.shape == (size_k, size_n)

    # Remove zp to normalize over 0
    max_q_val = (1 << num_bits) - 1
    zp = (max_q_val + 1) // 2
    q_24_no_zp = q_24 - zp

    # Compress
    q_24_no_zp = q_24_no_zp.t().contiguous()
    q_24_no_zp_comp, meta = sparse_semi_structured_from_dense_cutlass(
        q_24_no_zp)
    q_24_no_zp_comp = q_24_no_zp_comp.t().contiguous()

    # Restore zp
    q_24_comp = q_24_no_zp_comp + zp

    # Resize meta to its actual shape (without moving any data)
    meta = meta.resize_(meta.shape[1] // 2, meta.shape[0] * 2)

    return q_24_comp, meta


def marlin_24_quantize(
    w: torch.Tensor,
    num_bits: int,
    group_size: int,
):
    size_k, size_n = w.shape

    # Normalize group_size
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    # Inject 2:4 sparsity
    w_24, mask_24 = inject_24(w, size_k, size_n)

    # Quantize
    w_24_ref, q_w_24, s, g_idx, rand_perm = quantize_weights(w_24,
                                                             num_bits,
                                                             group_size,
                                                             act_order=False)

    # Compress quantized weight
    q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n,
                                                     num_bits)
    size_k_comp = size_k // 2

    # Reformat to marlin
    marlin_24_q_w_comp = marlin_weights(q_w_24_comp, size_k_comp, size_n,
                                        num_bits, marlin_24_perm[num_bits])
    marlin_24_s = marlin_permute_scales(s, size_k, size_n, group_size,
                                        marlin_24_scale_perm[num_bits],
                                        marlin_24_scale_perm_single[num_bits])

    # Create result
    res_list = [w_24_ref, marlin_24_q_w_comp, meta, marlin_24_s]
    for i in range(len(res_list)):
        res_list[i] = res_list[i].to(w.device)

    return res_list


def compute_max_diff(output, output_ref):
    return torch.mean(torch.abs(output - output_ref)) / torch.mean(
        torch.abs(output_ref))


class MarlinWorkspace:

    def __init__(self, out_features, min_thread_n, max_parallel, device):
        assert (out_features % min_thread_n == 0), (
            "out_features = {} is undivisible by min_thread_n = {}".format(
                out_features, min_thread_n))

        max_workspace_size = ((out_features // min_thread_n) * max_parallel)

        self.scratch = torch.zeros(max_workspace_size,
                                   dtype=torch.int,
                                   device=device)

================================================
FILE: kt-sft/csrc/custom_marlin/utils/quant_utils.py
================================================
"""This file is used for /tests and /benchmarks"""
import numpy
import torch

SUPPORTED_NUM_BITS = [4, 8]
SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]


def get_pack_factor(num_bits):
    assert num_bits in SUPPORTED_NUM_BITS, f"Unsupported num_bits = {num_bits}"
    return 32 // num_bits


def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int):
    assert q_w.shape == w_ref.shape

    orig_device = q_w.device
    k_size, _ = q_w.shape

    g_idx = torch.zeros((k_size, ), dtype=torch.int32)
    for i in range(k_size):
        g_idx[i] = i // group_size

    # Simulate act_order by doing a random permutation on K
    rand_perm = torch.randperm(k_size)

    g_idx = g_idx[rand_perm].contiguous()
    q_w = q_w[rand_perm, :].contiguous()
    w_ref = w_ref[rand_perm, :].contiguous()

    return (
        w_ref.to(device=orig_device),
        q_w.to(device=orig_device),
        g_idx.to(device=orig_device),
        rand_perm.to(device=orig_device),
    )


# Function: Dequantize quantized weights
def dequantize_weights(qweight, qzeros, scales, g_idx, bits=4, group_size=128, device='cuda:0'):
    # Create a tensor for bitwise right shift operation
    wf = torch.tensor(list(range(0, 32, bits)), dtype=torch.int32, device=device).unsqueeze(0)

    # Apply bitwise right shift and convert qzeros to the appropriate type
    zeros = torch.bitwise_right_shift(torch.unsqueeze(qzeros, 2).expand(-1, -1, 32 // bits), wf.unsqueeze(0)).to(torch.int16 if bits == 8 else torch.int8)
    torch.bitwise_and(zeros, (2 ** bits) - 1, out=zeros)

    # Reshape the zeros tensor
    zeros = zeros + 1
    zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])

    # Reshape the scales tensor
    scales = scales.reshape(-1, 1, scales.shape[-1])

    # Similar bitwise right shift operation for qweight and reshape
    weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1), wf.unsqueeze(-1)).to(torch.int16 if bits == 8 else torch.int8)
    torch.bitwise_and(weight, (2 ** bits) - 1, out=weight)
    weight = weight.reshape(-1, group_size, weight.shape[2])

    # Apply dequantization formula and reshape the final weight
    weight = (scales * (weight - zeros))
    weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])

    # Return the transposed weight
    return weight.transpose(0, 1)

def quantize_weights(w: torch.Tensor, num_bits: int, group_size: int,
                     act_order: bool):
    orig_device = w.device
    size_k, size_n = w.shape

    assert w.is_floating_point(), "w must be float"
    assert num_bits in SUPPORTED_NUM_BITS, f"Unsupported num_bits = {num_bits}"
    assert group_size in SUPPORTED_GROUP_SIZES + [
        size_k
    ], f"Unsupported groupsize = {group_size}"

    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    max_q_val = 2**num_bits - 1
    half_q_val = (max_q_val + 1) // 2

    # Reshape to [groupsize, -1]
    if group_size < size_k:
        w = w.view((-1, group_size, size_n))
        w = w.permute(1, 0, 2)
        w = w.reshape((group_size, -1))

    # Compute scale for each group
    s = torch.max(torch.abs(w), 0, keepdim=True)[0]
    s *= 2 / max_q_val  # 2 => symmetric

    # Quantize
    q_w = torch.round(w / s).int()
    q_w += half_q_val
    q_w = torch.clamp(q_w, 0, max_q_val)

    # Compute ref (dequantized)
    w_ref = (q_w - half_q_val).half() * s

    # Restore original shapes
    if group_size < size_k:

        def reshape_w(w):
            w = w.reshape((group_size, -1, size_n))
            w = w.permute(1, 0, 2)
            w = w.reshape((size_k, size_n)).contiguous()
            return w

        q_w = reshape_w(q_w)
        w_ref = reshape_w(w_ref)

    s = s.reshape((-1, size_n)).contiguous()

    # Apply act_order
    g_idx = torch.empty(0, dtype=torch.int, device=w.device)
    rand_perm = torch.empty(0, dtype=torch.int, device=w.device)
    if act_order:
        assert (
            group_size < size_k
        ), "For act_order, groupsize = {} must be less than size_k = {}".format(
            group_size, size_k)

        w_ref, q_w, g_idx, rand_perm = permute_rows(q_w, w_ref, group_size)

    return (
        w_ref.to(device=orig_device),
        q_w.to(device=orig_device),
        s.to(device=orig_device),
        g_idx.to(device=orig_device),
        rand_perm.to(device=orig_device),
    )


def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor):
    orig_device = q_w.device

    sort_indices = torch.argsort(g_idx).to(
        dtype=torch.int32)  # Sort based on g_idx

    g_idx = g_idx[sort_indices].contiguous()
    q_w = q_w[sort_indices, :].contiguous()

    return (
        q_w.to(device=orig_device),
        g_idx.to(device=orig_device),
        sort_indices.to(device=orig_device),
    )


def gptq_pack(
    q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    assert q_w.shape == (size_k, size_n)

    pack_factor = get_pack_factor(num_bits)
    assert size_k % pack_factor == 0

    orig_device = q_w.device

    q_w = q_w.cpu().numpy().astype(numpy.uint32)

    q_res = numpy.zeros((size_k // pack_factor, size_n), dtype=numpy.uint32)

    for i in range(pack_factor):
        q_res |= q_w[i::pack_factor, :] << num_bits * i

    q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
    return q_res

def gptq_unpack(
    q_res: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    pack_factor = 32 // num_bits
    assert size_k % pack_factor == 0

    orig_device = q_res.device

    q_res = q_res.cpu().numpy()

    q_w = numpy.zeros((size_k, size_n), dtype=numpy.uint32)

    for i in range(pack_factor):
        q_w[i::pack_factor, :] = (q_res >> (num_bits * i)) & ((1 << num_bits) - 1)

    q_w = torch.from_numpy(q_w.astype(numpy.int32)).to(orig_device)
    return q_w

================================================
FILE: kt-sft/csrc/ktransformers_ext/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.16)
project(cpuinfer_ext VERSION 0.1.0)


set(CMAKE_CXX_STANDARD 17)


set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -ffast-math -fopenmp")
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})
set(CMAKE_BUILD_TYPE "Release")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -ffast-math -fopenmp")
# set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)


include(CheckCXXCompilerFlag)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)


option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)

# instruction set specific
if (LLAMA_NATIVE)
    set(INS_ENB OFF)
else()
    set(INS_ENB ON)
endif()

option(LLAMA_AVX                             "llama: enable AVX"                                OFF)
option(LLAMA_AVX2                            "llama: enable AVX2"                               OFF)
option(LLAMA_AVX512                          "llama: enable AVX512"                             OFF)
option(LLAMA_AVX512_VBMI                     "llama: enable AVX512-VBMI"                        OFF)
option(LLAMA_AVX512_VNNI                     "llama: enable AVX512-VNNI"                        OFF)
option(LLAMA_AVX512_BF16                     "llama: enable AVX512-BF16"                        OFF)
option(LLAMA_FMA                             "llama: enable FMA"                                OFF)
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)
    option(LLAMA_F16C                        "llama: enable F16C"                               OFF)
endif()
option(LLAMA_AVX512_FANCY_SIMD               "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI"                        OFF)
option(KTRANSFORMERS_USE_CUDA                "ktransformers: use CUDA"                          ON)
option(KTRANSFORMERS_USE_MUSA                "ktransformers: use MUSA"                          OFF)
option(KTRANSFORMERS_USE_ROCM                "ktransformers: use ROCM"                          OFF)
option(KTRANSFORMERS_USE_XPU                 "ktransformers: use XPU"                           OFF)
option(KTRANSFORMERS_USE_NPU                 "ktransformers: use NPU"                           OFF)

if(KTRANSFORMERS_USE_NPU)
    add_definitions(-DKTRANSFORMERS_USE_NPU=1)
endif()

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
#       feel free to update the Makefile for your architecture and send a pull request or issue
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if (MSVC)
    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
else ()
    set(CMAKE_GENERATOR_PLATFORM_LWR "")
endif ()

if (NOT MSVC)
    if (LLAMA_STATIC)
        add_link_options(-static)
        if (MINGW)
            add_link_options(-static-libgcc -static-libstdc++)
        endif()
    endif()
    if (LLAMA_GPROF)
        add_compile_options(-pg)
    endif()
endif()

set(ARCH_FLAGS "")

if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
    (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
     CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
    message(STATUS "ARM detected")
    if (MSVC)
        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
        add_compile_definitions(__ARM_NEON)
        add_compile_definitions(__ARM_FEATURE_FMA)

        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
        if (GGML_COMPILER_SUPPORT_DOTPROD)
            add_compile_definitions(__ARM_FEATURE_DOTPROD)
        endif ()
        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
        endif ()
        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
    else()
        if(KTRANSFORMERS_USE_NPU)
            list(APPEND ARCH_FLAGS -march=armv8.2-a+fp16+fp16fml+dotprod -lnuma)
        endif()
        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
            # Raspberry Pi 1, Zero
            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
                # Android armeabi-v7a
                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
            else()
                # Raspberry Pi 2
                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
            endif()
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
            # Android arm64-v8a
            # Raspberry Pi 3, 4, Zero 2 (32-bit)
            list(APPEND ARCH_FLAGS -mno-unaligned-access)
        endif()
    endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
    message(STATUS "x86 detected")
    if(NOT KTRANSFORMERS_USE_NPU)
        set(HOST_IS_X86 TRUE)
        set(HAS_AVX512 TRUE)
        set(__HAS_AMX__ TRUE)
        add_compile_definitions(__x86_64__)
        # check AVX512
        execute_process(
            COMMAND lscpu
            OUTPUT_VARIABLE LSCPU_OUTPUT
            OUTPUT_STRIP_TRAILING_WHITESPACE
        )
        # message(STATUS "LSCPU_OUTPUT: ${LSCPU_OUTPUT}")
    
        string(FIND "${LSCPU_OUTPUT}" "avx512" COMPILER_SUPPORTS_AVX512F)
        
        if (COMPILER_SUPPORTS_AVX512F GREATER -1)
            message(STATUS "Compiler and CPU support AVX512F (tested by compiling a program)")
            add_compile_definitions(__HAS_AVX512F__)
        else()
            message(STATUS "Compiler and/or CPU do NOT support AVX512F")
            set(HAS_AVX512 False)
        endif()
    
        # check AMX
        string(FIND "${LSCPU_OUTPUT}" "amx" COMPILER_SUPPORTS_AMX)
        
        if(COMPILER_SUPPORTS_AMX GREATER -1)
            message(STATUS "Compiler supports AMX")
            add_compile_definitions(__HAS_AMX__)
        else()
            message(STATUS "Compiler does NOT support AMX")
        endif()
    endif()
    if (MSVC)
        # instruction set detection for MSVC only
        if (LLAMA_NATIVE)
            include(cmake/FindSIMD.cmake)
        endif ()
        if (LLAMA_AVX512)
            list(APPEND ARCH_FLAGS /arch:AVX512)
            # MSVC has no compile-time flags enabling specific
            # AVX512 extensions, neither it defines the
            # macros corresponding to the extensions.
            # Do it manually.
            if (LLAMA_AVX512_VBMI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
            endif()
            if (LLAMA_AVX512_VNNI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
            if (LLAMA_AVX512_FANCY_SIMD)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VL__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VL__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BW__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BW__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512DQ__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512DQ__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
            if (LLAMA_AVX512_BF16)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
            endif()
        elseif (LLAMA_AVX2)
            list(APPEND ARCH_FLAGS /arch:AVX2)
        elseif (LLAMA_AVX)
            list(APPEND ARCH_FLAGS /arch:AVX)
        endif()
    else()
        if (LLAMA_NATIVE)
            list(APPEND ARCH_FLAGS -mfma -mavx -mavx2)
            list(APPEND ARCH_FLAGS -march=native)
        endif()
        if (LLAMA_F16C)
            list(APPEND ARCH_FLAGS -mf16c)
        endif()
        if (LLAMA_FMA)
            list(APPEND ARCH_FLAGS -mfma)
        endif()
        if (LLAMA_AVX)
            list(APPEND ARCH_FLAGS -mavx)
        endif()
        if (LLAMA_AVX2)
            list(APPEND ARCH_FLAGS -mavx2)
        endif()
        if (LLAMA_AVX512)
            list(APPEND ARCH_FLAGS -mavx512f)
            list(APPEND ARCH_FLAGS -mavx512bw)
        endif()
        if (LLAMA_AVX512_VBMI)
            list(APPEND ARCH_FLAGS -mavx512vbmi)
        endif()
        if (LLAMA_AVX512_VNNI)
            list(APPEND ARCH_FLAGS -mavx512vnni)
        endif()
        if (LLAMA_AVX512_FANCY_SIMD)
            message(STATUS "AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI enabled")
            list(APPEND ARCH_FLAGS -mavx512vl)
            list(APPEND ARCH_FLAGS -mavx512bw)
            list(APPEND ARCH_FLAGS -mavx512dq)
            list(APPEND ARCH_FLAGS -mavx512vnni)
            list(APPEND ARCH_FLAGS -mavx512vpopcntdq)
        endif()
        if (LLAMA_AVX512_BF16)
            list(APPEND ARCH_FLAGS -mavx512bf16)
        endif()
    endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
    message(STATUS "PowerPC detected")
    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
    else()
        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
    endif()
else()
    message(STATUS "Unknown architecture")
endif()

# message(STATUS "CUDAToolkit_ROOT:${CUDAToolkit_ROOT}")
# find_package(FindCUDAToolkit REQUIRED)
# if(CUDAToolkit_FOUND)
#     message(STATUS "Found CUDA cudart lib at:${CUDAToolkit_LIBRARY_DIR}")
# else()
#     message(STATUS "Can't found CUDA lib")
# endif()

if (NOT EXISTS $ENV{ROCM_PATH})
    if (NOT EXISTS /opt/rocm)
        set(ROCM_PATH /usr)
    else()
        set(ROCM_PATH /opt/rocm)
    endif()
else()
    set(ROCM_PATH $ENV{ROCM_PATH})
endif()

list(APPEND CMAKE_PREFIX_PATH  ${ROCM_PATH})
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")

if (NOT EXISTS $ENV{MUSA_PATH})
    if (NOT EXISTS /opt/musa)
        set(MUSA_PATH /usr/local/musa)
    else()
        set(MUSA_PATH /opt/musa)
    endif()
else()
    set(MUSA_PATH $ENV{MUSA_PATH})
endif()

list(APPEND CMAKE_MODULE_PATH "${MUSA_PATH}/cmake")

add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")

add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/third_party/pybind11)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/llama.cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/llama.cpp)

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party)
if (WIN32)
    include_directories("$ENV{CUDA_PATH}/include")
    add_compile_definitions(KTRANSFORMERS_USE_CUDA=1)
elseif (UNIX)
    if (KTRANSFORMERS_USE_ROCM)
        find_package(HIP REQUIRED)
        if(HIP_FOUND)
            include_directories("${HIP_INCLUDE_DIRS}")
            add_compile_definitions(KTRANSFORMERS_USE_ROCM=1)
        endif()
    elseif (KTRANSFORMERS_USE_MUSA)
        if (NOT EXISTS $ENV{MUSA_PATH})
            if (NOT EXISTS /opt/musa)
                set(MUSA_PATH /usr/local/musa)
            else()
                set(MUSA_PATH /opt/musa)
            endif()
        else()
            set(MUSA_PATH $ENV{MUSA_PATH})
        endif()

        list(APPEND CMAKE_MODULE_PATH "${MUSA_PATH}/cmake")

        find_package(MUSAToolkit)
        if (MUSAToolkit_FOUND)
            message(STATUS "MUSA Toolkit found")
            add_compile_definitions(KTRANSFORMERS_USE_MUSA=1)
        endif()
    elseif (KTRANSFORMERS_USE_XPU)
        add_compile_definitions(KTRANSFORMERS_USE_XPU=1)
    elseif (KTRANSFORMERS_USE_CUDA)
        find_package(CUDA REQUIRED)
        include_directories("${CUDA_INCLUDE_DIRS}")
        include(CheckLanguage)
        check_language(CUDA)
        if(CMAKE_CUDA_COMPILER)
            message(STATUS "CUDA detected")
            find_package(CUDAToolkit REQUIRED)
            include_directories(${CUDAToolkit_INCLUDE_DIRS})
        endif()
        message(STATUS "enabling CUDA")
        enable_language(CUDA)
        add_compile_definitions(KTRANSFORMERS_USE_CUDA=1)
    endif()
endif()

aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} SOURCE_DIR1)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/llamafile SOURCE_DIR3)
# aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/llamafile SOURCE_DIR4)
file(GLOB LLAMAFILE_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/llamafile/*.cpp")
list(REMOVE_ITEM LLAMAFILE_SOURCES
    "${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/llamafile/sgemm_arm.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/llamafile/sgemm_x86.cpp"
)
set(SOURCE_DIR4 ${LLAMAFILE_SOURCES})
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/kvcache SOURCE_DIR5)

if (HOST_IS_X86 AND HAS_AVX512 AND __HAS_AMX__)
    aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/amx SOURCE_DIR6)
endif()


set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4} ${SOURCE_DIR5} ${SOURCE_DIR6})

file(GLOB_RECURSE FMT_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h")

add_custom_target(
    format
    COMMAND clang-format
    -i
    -style=file
    ${FMT_SOURCES}
    COMMENT "Running clang-format on all source files"
)


add_library(llamafile STATIC ${SOURCE_DIR4})

message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
message(STATUS "ARCH_FLAGS: ${ARCH_FLAGS}")
pybind11_add_module(${PROJECT_NAME} MODULE ${ALL_SOURCES})
target_link_libraries(${PROJECT_NAME} PRIVATE llama)


if(WIN32)
    target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_PATH}/lib/x64/cudart.lib")#CUDA::cudart
elseif(UNIX)
    if (KTRANSFORMERS_USE_ROCM)
        add_compile_definitions(USE_HIP=1)
        target_link_libraries(${PROJECT_NAME} PRIVATE "${ROCM_PATH}/lib/libamdhip64.so")
        message(STATUS "Building for HIP")
    elseif(KTRANSFORMERS_USE_MUSA)
        target_link_libraries(${PROJECT_NAME} PRIVATE MUSA::musart)
    elseif(KTRANSFORMERS_USE_XPU)
    elseif(KTRANSFORMERS_USE_CUDA AND NOT KTRANSFORMERS_USE_MUSA)
        target_link_libraries(${PROJECT_NAME} PRIVATE "${CUDAToolkit_LIBRARY_DIR}/libcudart.so")
    endif()
endif()

# Define the USE_NUMA option
option(USE_NUMA "Disable NUMA support" OFF)

# Check if the USE_NUMA environment variable is set
if(DEFINED ENV{USE_NUMA})
    set(USE_NUMA ON)
endif()

if(USE_NUMA)
    message(STATUS "NUMA support is enabled")
else()
    message(STATUS "NUMA support is disabled")
endif()

find_library(NUMA_LIBRARY NAMES numa)

if(NUMA_LIBRARY AND USE_NUMA)
    message(STATUS "NUMA library found: ${NUMA_LIBRARY} - enabling NUMA support")
    target_link_libraries(${PROJECT_NAME} PRIVATE ${NUMA_LIBRARY})
    target_compile_definitions(${PROJECT_NAME} PRIVATE USE_NUMA)
else()
    if(USE_NUMA)
        message(FATAL_ERROR "NUMA library not found - maybe sudo apt install libnuma-dev")
    else()
        message(STATUS "NUMA library not found or user not set USE_NUMA - disabling NUMA support")
    endif()
endif()


================================================
FILE: kt-sft/csrc/ktransformers_ext/bench/bench_attention.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :  
Author       : Jianwei Dong
Date         : 2024-08-28 10:32:05
Version      : 1.0.0
LastEditors  : Jianwei Dong 
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
import torch

layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1

anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 64
max_batch_size: int = 1
max_block_num: int = 1024
CPUInfer = cpuinfer_ext.CPUInfer(max_thread_num)

warm_up_iter = 1000
test_iter = 10000


def bench_linear(cache_seqlen: int):
    with torch.inference_mode(mode=True):
        cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
        seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")

        config = cpuinfer_ext.kvcache.KVCacheConfig(
            layer_num,
            kv_head_num,
            q_head_num,
            head_dim,
            block_len,
            anchor_num,
            anchor_type,
            kv_type,
            retrieval_type,
            layer_step,
            token_step,
            layer_offset,
            max_block_num,
            max_batch_size,
            max_thread_num,
        )
        local_kvcache = cpuinfer_ext.kvcache.KVCache(config)
        block_table = (
            torch.arange(max_block_num, dtype=torch.int32, device="cpu")
            .contiguous()
            .view(1, -1)
        )

        for layer_idx in range(layer_num):
            k_cache = torch.randn(
                (1, cache_seqlen, kv_head_num, head_dim),
                dtype=torch.float16,
                device="cpu",
            ).contiguous()
            v_cache = torch.randn(
                (1, cache_seqlen, kv_head_num, head_dim),
                dtype=torch.float16,
                device="cpu",
            ).contiguous()

            CPUInfer.submit(
                local_kvcache.update_kvcache_fp16(
                    k_cache.data_ptr(),
                    v_cache.data_ptr(),
                    layer_idx,
                    block_table.data_ptr(),
                    1,
                    max_block_num,
                    seqlens_zero.data_ptr(),
                    cache_seqlen,
                )
            )
            CPUInfer.sync()

        input = torch.randn(
            (1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
        ).contiguous()
        output = torch.empty(
            (1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
        ).contiguous()

        # attn_lse: (bsz, q_len, q_head_num)
        attn_lse = torch.empty(
            (1, 1, q_head_num), dtype=torch.float32, device="cpu"
        ).contiguous()
        input = input / 100

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                local_kvcache.attn(
                    input.data_ptr(),
                    output.data_ptr(),
                    attn_lse.data_ptr(),
                    i % layer_num,
                    0,
                    1,
                    1,
                    max_block_num,
                    block_table.data_ptr(),
                    cache_seqlens.data_ptr(),
                    -1,
                    -1,
                    -1,
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                local_kvcache.attn(
                    input.data_ptr(),
                    output.data_ptr(),
                    attn_lse.data_ptr(),
                    i % layer_num,
                    0,
                    1,
                    1,
                    max_block_num,
                    block_table.data_ptr(),
                    cache_seqlens.data_ptr(),
                    -1,
                    -1,
                    -1,
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print("cache sequence length: ", cache_seqlen)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", total_time / test_iter * 1000000)
        print(
            "Bandwidth: ",
            cache_seqlen
            * kv_head_num
            * head_dim
            * 2
            * 2
            * test_iter
            / total_time
            / 1000
            / 1000
            / 1000,
            "GB/s",
        )
        print("")


bench_linear(1024)
bench_linear(4096)
bench_linear(16384)
bench_linear(32768)
bench_linear(65536)


================================================
FILE: kt-sft/csrc/ktransformers_ext/bench/bench_attention_torch.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :  
Author       : Jianwei Dong
Date         : 2024-08-28 10:32:05
Version      : 1.0.0
LastEditors  : Jianwei Dong 
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
import torch

layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
warm_up_iter = 1000
test_iter = 10000


def bench_linear(cache_seqlen: int, device):
    with torch.inference_mode(mode=True):

        kvcaches = []

        for layer_idx in range(layer_num):
            k_cache = torch.randn(
                (1, 32, cache_seqlen, head_dim),
                dtype=torch.float16,
                device=device,
            ).contiguous()
            v_cache = torch.randn(
                (1, 32, cache_seqlen, head_dim),
                dtype=torch.float16,
                device=device,
            ).contiguous()

            kvcaches.append((k_cache, v_cache))

        input = torch.randn(
            (1, q_head_num, 1, head_dim), dtype=torch.float16, device=device
        ).contiguous()
        input = input / 100

        # warm up
        for i in range(warm_up_iter):
            k_cache = kvcaches[i % layer_num][0]
            v_cache = kvcaches[i % layer_num][1]
            torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            k_cache = kvcaches[i % layer_num][0]
            v_cache = kvcaches[i % layer_num][1]
            torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)
        end = time.perf_counter()
        total_time = end - start
        print("cache sequence length: ", cache_seqlen)
        print("Time(s): ", total_time)
        print("Iteration: ", test_iter)
        print("Time(us) per iteration: ", total_time / test_iter * 1000000)
        print(
            "Bandwidth: ",
            cache_seqlen
            * q_head_num
            * head_dim
            * 2
            * 2
            * test_iter
            / total_time
            / 1000
            / 1000
            / 1000,
            "GB/s",
        )
        print("")


bench_linear(1024, "cpu")
bench_linear(4096, "cpu")
bench_linear(1024, "cuda")
bench_linear(4096, "cuda")
bench_linear(16384, "cuda")
bench_linear(32768, "cuda")
bench_linear(65536, "cuda")


================================================
FILE: kt-sft/csrc/ktransformers_ext/bench/bench_linear.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:31:59
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:35:35
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

input_size = 16384
output_size = 5120
stride = 16
group_max_len = 1024
layer_num = 10
qlen = 1
CPUInfer = cpuinfer_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000

def bench_linear(quant_mode: str):
    with torch.inference_mode(mode=True):

        hidden_type = 30 # ggml_type::GGML_TYPE_BF16
        if quant_mode == "fp32":
            proj_type = 0 # ggml_type::GGML_TYPE_F32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = 1 # ggml_type::GGML_TYPE_F16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = 30 # ggml_type::GGML_TYPE_BF16
            bytes_per_elem = 2.000000
        elif quant_mode == "q8_0":
            proj_type = 8 # ggml_type::GGML_TYPE_Q8_0
            bytes_per_elem = 1.062500
        elif quant_mode == "q6_k":
            proj_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.820312
        elif quant_mode == "q5_k_m":
            proj_type = 13 # ggml_type::GGML_TYPE_Q5_K
            bytes_per_elem = 0.687500
        elif quant_mode == "q4_k_m":
            proj_type = 12 # ggml_type::GGML_TYPE_Q4_K
            bytes_per_elem = 0.562500
        elif quant_mode == "q3_k_m":
            proj_type = 11 # ggml_type::GGML_TYPE_Q3_K
            bytes_per_elem = 0.429688
        elif quant_mode == "q2_k":
            proj_type = 10 # ggml_type::GGML_TYPE_Q2_K
            bytes_per_elem = 0.328125
        elif quant_mode == "iq3_xs":
            proj_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            bytes_per_elem = 0.429688
        elif quant_mode == "iq2_xxs":
            proj_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            bytes_per_elem = 0.257812
        else:
            assert(False)

        linears = []
        projs = []
        for _ in range(layer_num):
            proj = torch.randn((output_size, input_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
            linear = cpuinfer_ext.linear.Linear(config)
            projs.append(proj)
            linears.append(linear)
        input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                linears[i % layer_num].forward(
                    qlen, 
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr()
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                linears[i % layer_num].forward(
                    qlen, 
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr()
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_linear("fp32")
bench_linear("fp16")
bench_linear("bf16")
bench_linear("q8_0")
bench_linear("q6_k")
bench_linear("q5_k_m")
bench_linear("q4_k_m")
bench_linear("q3_k_m")
bench_linear("q2_k")
# Not supported on __x86_64__
# bench_linear("iq3_xs")
# bench_linear("iq2_xxs")


================================================
FILE: kt-sft/csrc/ktransformers_ext/bench/bench_linear_torch.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:31:59
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-07-25 10:32:48
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq

scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset

input_size = 16384
output_size = 5120
layer_num = 10
qlen = 1
warm_up_iter = 1000
test_iter = 10000

def bench_linear(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "fp32":
            proj_type = torch.float32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = torch.float16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = torch.bfloat16
            bytes_per_elem = 2.000000
        elif quant_mode == "qint8":
            proj_type = torch.qint8
            bytes_per_elem = 1.000000
        else:
            assert(False)

        projs = []
        for _ in range(layer_num):
            proj = torch.randn((output_size, input_size), dtype = torch.float32, device = "cuda").to("cpu").contiguous()
            if quant_mode == "qint8":
                proj_q = torch.quantize_per_tensor(proj, scale, zero_point, torch.qint8)
                quantized_layer = nnq.Linear(input_size, output_size)
                quantized_layer.set_weight_bias(proj_q, None)
                projs.append(quantized_layer)
            else:
                projs.append(proj.to(proj_type))
        input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            if isinstance(projs[i % layer_num], nnq.Linear):
                input_q = torch.quantize_per_tensor(input[i % layer_num].to(torch.float32), scale, zero_point, torch.quint8)
                t_output = projs[i % layer_num](input_q)
            else:
                t_output = torch.mm(input[i % layer_num].to(proj_type), projs[i % layer_num].t())

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            if isinstance(projs[i % layer_num], nnq.Linear):
                input_q = torch.quantize_per_tensor(input[i % layer_num].to(torch.float32), scale, zero_point, torch.quint8)
                t_output = projs[i % layer_num](input_q)
            else:
                t_output = torch.mm(input[i % layer_num].to(proj_type), projs[i % layer_num].t())
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_linear("fp32")
bench_linear("fp16")
bench_linear("bf16")
bench_linear("qint8")


================================================
FILE: kt-sft/csrc/ktransformers_ext/bench/bench_mlp.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-16 10:43:18
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:36:04
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

hidden_size = 5120
intermediate_size = 3072
stride = 16
group_max_len = 1024
layer_num = 10
qlen = 1
CPUInfer = cpuinfer_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000

def bench_mlp(quant_mode: str):
    with torch.inference_mode(mode=True):

        hidden_type = 30 # ggml_type::GGML_TYPE_BF16
        if quant_mode == "fp32":
            gate_type = 0 # ggml_type::GGML_TYPE_F32
            up_type = 0 # ggml_type::GGML_TYPE_F32
            down_type = 0 # ggml_type::GGML_TYPE_F32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            gate_type = 1 # ggml_type::GGML_TYPE_F16
            up_type = 1 # ggml_type::GGML_TYPE_F16
            down_type = 1 # ggml_type::GGML_TYPE_F16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            gate_type = 30 # ggml_type::GGML_TYPE_BF16
            up_type = 30 # ggml_type::GGML_TYPE_BF16
            down_type = 30 # ggml_type::GGML_TYPE_BF16
            bytes_per_elem = 2.000000
        elif quant_mode == "q8_0":
            gate_type = 8 # ggml_type::GGML_TYPE_Q8_0
            up_type = 8 # ggml_type::GGML_TYPE_Q8_0
            down_type = 8 # ggml_type::GGML_TYPE_Q8_0
            bytes_per_elem = 1.062500
        elif quant_mode == "q6_k":
            gate_type = 14 # ggml_type::GGML_TYPE_Q6_K
            up_type = 14 # ggml_type::GGML_TYPE_Q6_K
            down_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.820312
        elif quant_mode == "q5_k_m":
            gate_type = 13 # ggml_type::GGML_TYPE_Q5_K
            up_type = 13 # ggml_type::GGML_TYPE_Q5_K
            down_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.731771
        elif quant_mode == "q4_k_m":
            gate_type = 12 # ggml_type::GGML_TYPE_Q4_K
            up_type = 12 # ggml_type::GGML_TYPE_Q4_K
            down_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.648437
        elif quant_mode == "q3_k_m":
            gate_type = 11 # ggml_type::GGML_TYPE_Q3_K
            up_type = 11 # ggml_type::GGML_TYPE_Q3_K
            down_type = 13 # ggml_type::GGML_TYPE_Q5_K
            bytes_per_elem = 0.515625
        elif quant_mode == "q2_k":
            gate_type = 10 # ggml_type::GGML_TYPE_Q2_K
            up_type = 10 # ggml_type::GGML_TYPE_Q2_K
            down_type = 11 # ggml_type::GGML_TYPE_Q3_K
            bytes_per_elem = 0.328125
        elif quant_mode == "iq3_xs":
            gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            up_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            down_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            bytes_per_elem = 0.429688
        elif quant_mode == "iq2_xxs":
            gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            up_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            down_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            bytes_per_elem = 0.257812
        else:
            assert(False)


        mlps = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
            mlp = cpuinfer_ext.mlp.MLP(config)
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            mlps.append(mlp)
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                mlps[i % layer_num].forward( 
                    qlen, 
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr()
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                mlps[i % layer_num].forward( 
                    qlen, 
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr()
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_mlp("fp32")
bench_mlp("fp16")
bench_mlp("bf16")
bench_mlp("q8_0")
bench_mlp("q6_k")
bench_mlp("q5_k_m")
bench_mlp("q4_k_m")
bench_mlp("q3_k_m")
bench_mlp("q2_k")
# Not supported on __x86_64__
# bench_linear("iq3_xs")
# bench_linear("iq2_xxs")


================================================
FILE: kt-sft/csrc/ktransformers_ext/bench/bench_mlp_torch.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-16 10:43:18
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-07-25 10:32:53
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq

scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset

hidden_size = 5120
intermediate_size = 3072
layer_num = 10
qlen = 1
warm_up_iter = 1000
test_iter = 10000

def act_fn(x):
    return x / (1.0 + torch.exp(-x))

def mlp_torch(input, gate_proj, up_proj, down_proj):
    if isinstance(gate_proj, nnq.Linear):
        input_q = torch.quantize_per_tensor(input.to(torch.float32), scale, zero_point, torch.quint8)
        gate_buf = gate_proj(input_q)
        up_buf = up_proj(input_q)
        gate_buf = gate_buf.dequantize()
        up_buf = up_buf.dequantize()
        intermediate = act_fn(gate_buf) * up_buf
        intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
        expert_output = down_proj(intermediate_q)
        ret = expert_output.dequantize()
    else:
        gate_buf = torch.mm(input.to(gate_proj.dtype), gate_proj.t())
        up_buf = torch.mm(input.to(up_proj.dtype), up_proj.t())
        intermediate = act_fn(gate_buf) * up_buf
        ret = torch.mm(intermediate.to(down_proj.dtype), down_proj.t())
    return ret

def bench_mlp(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "fp32":
            proj_type = torch.float32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = torch.float16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = torch.bfloat16
            bytes_per_elem = 2.000000
        elif quant_mode == "qint8":
            proj_type = torch.qint8
            bytes_per_elem = 1.000000
        else:
            assert(False)

        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            if quant_mode == "qint8":
                gate_proj_q = torch.quantize_per_tensor(gate_proj, scale, zero_point, torch.qint8)
                quantized_gate = nnq.Linear(hidden_size, intermediate_size)
                quantized_gate.set_weight_bias(gate_proj_q, None)
                up_proj_q = torch.quantize_per_tensor(up_proj, scale, zero_point, torch.qint8)
                quantized_up = nnq.Linear(hidden_size, intermediate_size)
                quantized_up.set_weight_bias(up_proj_q, None)
                down_proj_q = torch.quantize_per_tensor(down_proj, scale, zero_point, torch.qint8)
                quantized_down = nnq.Linear(intermediate_size, hidden_size)
                quantized_down.set_weight_bias(down_proj_q, None)
                gate_projs.append(quantized_gate)
                up_projs.append(quantized_up)
                down_projs.append(quantized_down)
            else:
                gate_projs.append(gate_proj.to(proj_type))
                up_projs.append(up_proj.to(proj_type))
                down_projs.append(down_proj.to(proj_type))
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            mlp_torch(input[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            mlp_torch(input[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_mlp("fp32")
bench_mlp("fp16")
bench_mlp("bf16")
bench_mlp("qint8")


================================================
FILE: kt-sft/csrc/ktransformers_ext/bench/bench_moe.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:41:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

expert_num = 160
hidden_size = 5120
intermediate_size = 1536
stride = 16
group_min_len = 10
group_max_len = 1024
n_routed_experts = 6
layer_num = 10
qlen = 1
CPUInfer = cpuinfer_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000

def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        hidden_type = 30 # ggml_type::GGML_TYPE_BF16
        if quant_mode == "fp32":
            gate_type = 0 # ggml_type::GGML_TYPE_F32
            up_type = 0 # ggml_type::GGML_TYPE_F32
            down_type = 0 # ggml_type::GGML_TYPE_F32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            gate_type = 1 # ggml_type::GGML_TYPE_F16
            up_type = 1 # ggml_type::GGML_TYPE_F16
            down_type = 1 # ggml_type::GGML_TYPE_F16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            gate_type = 30 # ggml_type::GGML_TYPE_BF16
            up_type = 30 # ggml_type::GGML_TYPE_BF16
            down_type = 30 # ggml_type::GGML_TYPE_BF16
            bytes_per_elem = 2.000000
        elif quant_mode == "q8_0":
            gate_type = 8 # ggml_type::GGML_TYPE_Q8_0
            up_type = 8 # ggml_type::GGML_TYPE_Q8_0
            down_type = 8 # ggml_type::GGML_TYPE_Q8_0
            bytes_per_elem = 1.062500
        elif quant_mode == "q6_k":
            gate_type = 14 # ggml_type::GGML_TYPE_Q6_K
            up_type = 14 # ggml_type::GGML_TYPE_Q6_K
            down_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.820312
        elif quant_mode == "q5_k_m":
            gate_type = 13 # ggml_type::GGML_TYPE_Q5_K
            up_type = 13 # ggml_type::GGML_TYPE_Q5_K
            down_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.731771
        elif quant_mode == "q4_k_m":
            gate_type = 12 # ggml_type::GGML_TYPE_Q4_K
            up_type = 12 # ggml_type::GGML_TYPE_Q4_K
            down_type = 14 # ggml_type::GGML_TYPE_Q6_K
            bytes_per_elem = 0.648437
        elif quant_mode == "q3_k_m":
            gate_type = 11 # ggml_type::GGML_TYPE_Q3_K
            up_type = 11 # ggml_type::GGML_TYPE_Q3_K
            down_type = 13 # ggml_type::GGML_TYPE_Q5_K
            bytes_per_elem = 0.515625
        elif quant_mode == "q2_k":
            gate_type = 10 # ggml_type::GGML_TYPE_Q2_K
            up_type = 10 # ggml_type::GGML_TYPE_Q2_K
            down_type = 11 # ggml_type::GGML_TYPE_Q3_K
            bytes_per_elem = 0.328125
        elif quant_mode == "iq3_xs":
            gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            up_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            down_type = 21 # ggml_type::GGML_TYPE_IQ3_S
            bytes_per_elem = 0.429688
        elif quant_mode == "iq2_xxs":
            gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            up_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            down_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
            bytes_per_elem = 0.257812
        else:
            assert(False)


        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            config = cpuinfer_ext.moe.MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, stride, group_min_len, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
            moe = cpuinfer_ext.moe.MOE(config)
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)
        expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device = "cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
        weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward( 
                    qlen, 
                    n_routed_experts, 
                    expert_ids[i % layer_num].data_ptr(), 
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr()
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward( 
                    qlen, 
                    n_routed_experts, 
                    expert_ids[i % layer_num].data_ptr(), 
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr()
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_moe("fp32")
bench_moe("fp16")
bench_moe("bf16")
bench_moe("q8_0")
bench_moe("q6_k")
bench_moe("q5_k_m")
bench_moe("q4_k_m")
bench_moe("q3_k_m")
bench_moe("q2_k")
# Not supported on __x86_64__
# bench_linear("iq3_xs")
# bench_linear("iq2_xxs")


================================================
FILE: kt-sft/csrc/ktransformers_ext/bench/bench_moe_amx.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2025-04-25 18:28:12
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2025-04-25 18:28:12
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

expert_num = 8
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
n_routed_experts = 8
layer_num = 10
qlen = 1024
CPUInfer = cpuinfer_ext.CPUInfer(65)
warm_up_iter = 100
test_iter = 100

def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "bf16":
            bytes_per_elem = 2.000000
        elif quant_mode == "int8":
            bytes_per_elem = 1.000000
        else:
            assert(False)


        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            config = cpuinfer_ext.moe.AMX_MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr())
            if quant_mode == "bf16":
                moe = cpuinfer_ext.moe.AMXBF16_MOE(config)
                CPUInfer.submit(moe.load_weights())
                CPUInfer.sync()
            elif quant_mode == "int8":
                moe = cpuinfer_ext.moe.AMXInt8_MOE(config)
                CPUInfer.submit(moe.load_weights())
                CPUInfer.sync()
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)
        expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device = "cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
        weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
        qlen_tensor = torch.tensor([qlen], dtype=torch.int32)

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward( 
                    qlen, 
                    n_routed_experts, 
                    expert_ids[i % layer_num].data_ptr(), 
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr(),
                    qlen_tensor.data_ptr()
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward( 
                    qlen, 
                    n_routed_experts, 
                    expert_ids[i % layer_num].data_ptr(), 
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(), 
                    output[i % layer_num].data_ptr(),
                    qlen_tensor.data_ptr()
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('Flops: ', hidden_size * intermediate_size * qlen * 3 * n_routed_experts * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GFLOPS')
        print('')

bench_moe("bf16")
bench_moe("int8")


================================================
FILE: kt-sft/csrc/ktransformers_ext/bench/bench_moe_torch.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-07-25 10:32:57
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq

scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset

expert_num = 160
hidden_size = 5120
intermediate_size = 1536
n_routed_experts = 6
layer_num = 10
qlen = 1
warm_up_iter = 1000
test_iter = 10000

def act_fn(x):
    return x / (1.0 + torch.exp(-x))

def mlp_torch(input, gate_proj, up_proj, down_proj):
    if isinstance(gate_proj, nnq.Linear):
        input_q = torch.quantize_per_tensor(input.to(torch.float32), scale, zero_point, torch.quint8)
        gate_buf = gate_proj(input_q)
        up_buf = up_proj(input_q)
        gate_buf = gate_buf.dequantize()
        up_buf = up_buf.dequantize()
        intermediate = act_fn(gate_buf) * up_buf
        intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
        expert_output = down_proj(intermediate_q)
        ret = expert_output.dequantize()
    else:
        gate_buf = torch.mm(input.to(gate_proj.dtype), gate_proj.t())
        up_buf = torch.mm(input.to(up_proj.dtype), up_proj.t())
        intermediate = act_fn(gate_buf) * up_buf
        ret = torch.mm(intermediate.to(down_proj.dtype), down_proj.t())
    return ret

def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0
    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )
    return t_output

def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "fp32":
            proj_type = torch.float32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = torch.float16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = torch.bfloat16
            bytes_per_elem = 2.000000
        elif quant_mode == "qint8":
            proj_type = torch.qint8
            bytes_per_elem = 1.000000
        else:
            assert(False)

        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            if quant_mode == "qint8":
                quantized_gate_proj = []
                quantized_up_proj = []
                quantized_down_proj = []
                for i in range(expert_num):
                    gate_proj_q = torch.quantize_per_tensor(gate_proj[i], scale, zero_point, torch.qint8)
                    quantized_gate = nnq.Linear(hidden_size, intermediate_size)
                    quantized_gate.set_weight_bias(gate_proj_q, None)
                    quantized_gate_proj.append(quantized_gate)
                    up_proj_q = torch.quantize_per_tensor(up_proj[i], scale, zero_point, torch.qint8)
                    quantized_up = nnq.Linear(hidden_size, intermediate_size)
                    quantized_up.set_weight_bias(up_proj_q, None)
                    quantized_up_proj.append(quantized_up)
                    down_proj_q = torch.quantize_per_tensor(down_proj[i], scale, zero_point, torch.qint8)
                    quantized_down = nnq.Linear(intermediate_size, hidden_size)
                    quantized_down.set_weight_bias(down_proj_q, None)
                    quantized_down_proj.append(quantized_down)
                gate_projs.append(quantized_gate_proj)
                up_projs.append(quantized_up_proj)
                down_projs.append(quantized_down_proj)
            else:
                gate_projs.append(gate_proj.to(proj_type))
                up_projs.append(up_proj.to(proj_type))
                down_projs.append(down_proj.to(proj_type))
        expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device = "cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
        weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()

        # warm up
        for i in range(warm_up_iter):
            moe_torch(input[i % layer_num], expert_ids[i % layer_num], weights[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            moe_torch(input[i % layer_num], expert_ids[i % layer_num], weights[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter) 
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_moe("fp32")
bench_moe("fp16")
bench_moe("bf16")
bench_moe("qint8")


================================================
FILE: kt-sft/csrc/ktransformers_ext/cmake/FindSIMD.cmake
================================================
include(CheckCSourceRuns)

set(AVX_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256 a;
        a = _mm256_set1_ps(0);
        return 0;
    }
")

set(AVX512_CODE "
    #include <immintrin.h>
    int main()
    {
        __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0);
        __m512i b = a;
        __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
        return 0;
    }
")

set(AVX2_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256i a = {0};
        a = _mm256_abs_epi16(a);
        __m256i x;
        _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
        return 0;
    }
")

set(FMA_CODE "
    #include <immintrin.h>
    int main()
    {
        __m256 acc = _mm256_setzero_ps();
        const __m256 d = _mm256_setzero_ps();
        const __m256 p = _mm256_setzero_ps();
        acc = _mm256_fmadd_ps( d, p, acc );
        return 0;
    }
")

macro(check_sse type flags)
    set(__FLAG_I 1)
    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
    foreach (__FLAG ${flags})
        if (NOT ${type}_FOUND)
            set(CMAKE_REQUIRED_FLAGS ${__FLAG})
            check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
            if (HAS_${type}_${__FLAG_I})
                set(${type}_FOUND TRUE CACHE BOOL "${type} support")
                set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
            endif()
            math(EXPR __FLAG_I "${__FLAG_I}+1")
        endif()
    endforeach()
    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})

    if (NOT ${type}_FOUND)
        set(${type}_FOUND FALSE CACHE BOOL "${type} support")
        set(${type}_FLAGS "" CACHE STRING "${type} flags")
    endif()

    mark_as_advanced(${type}_FOUND ${type}_FLAGS)
endmacro()

# flags are for MSVC only!
check_sse("AVX" " ;/arch:AVX")
if (NOT ${AVX_FOUND})
    set(LLAMA_AVX OFF)
else()
    set(LLAMA_AVX ON)
endif()

check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2")
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
    set(LLAMA_AVX2 OFF)
else()
    set(LLAMA_AVX2 ON)
endif()

check_sse("AVX512" " ;/arch:AVX512")
if (NOT ${AVX512_FOUND})
    set(LLAMA_AVX512 OFF)
else()
    set(LLAMA_AVX512 ON)
endif()


================================================
FILE: kt-sft/csrc/ktransformers_ext/cpu_backend/backend.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:05
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:33:34
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "backend.h"

#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>

thread_local int Backend::numa_node = -1;
#endif

thread_local int Backend::thread_local_id = -1;

Backend::Backend(int max_thread_num) {
    max_thread_num_ = max_thread_num;
    thread_state_.resize(max_thread_num_);
    for (int i = 0; i < max_thread_num_; i++) {
        thread_state_[i].curr = std::make_unique<std::atomic<int>>();
        thread_state_[i].status =
            std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
    }
    workers_.resize(max_thread_num_);
    for (int i = 1; i < max_thread_num_; i++) {
        workers_[i] = std::thread(&Backend::worker_thread, this, i);
    }
}

Backend::~Backend() {
    for (int i = 0; i < max_thread_num_; i++) {
        thread_state_[i].status->store(ThreadStatus::EXIT,
                                       std::memory_order_release);
    }
    for (int i = 1; i < max_thread_num_; i++) {
        if (workers_[i].joinable()) {
            workers_[i].join();
        }
    }
}

int Backend::get_thread_num() { return max_thread_num_; }

void Backend::do_work_stealing_job(int task_num,
                                   std::function<void(int)> init_func,
                                   std::function<void(int)> compute_func,
                                   std::function<void(int)> finalize_func) {
    init_func_ = init_func;
    compute_func_ = compute_func;
    finalize_func_ = finalize_func;
#ifdef USE_NUMA
    // numa node location will be calculated based on the number of threads
    thread_num_ = max_thread_num_;
#else
    thread_num_ = std::min(max_thread_num_, task_num);
#endif
    int base = task_num / thread_num_;
    int remain = task_num % thread_num_;
    thread_state_[0].end = base + (0 < remain);

    // 为主线程设置 thread_local_id
    thread_local_id = 0;

    for (int i = 1; i < thread_num_; i++) {
        thread_state_[i].curr->store(thread_state_[i - 1].end,
                                     std::memory_order_relaxed);
        thread_state_[i].end = thread_state_[i - 1].end + base + (i < remain);
        thread_state_[i].status->store(ThreadStatus::WORKING,
                                       std::memory_order_release);
    }
    thread_state_[0].curr->store(0, std::memory_order_relaxed);
    thread_state_[0].status->store(ThreadStatus::WORKING,
                                   std::memory_order_release);
    process_tasks(0);
    for (int i = 1; i < thread_num_; i++) {
        while (thread_state_[i].status->load(std::memory_order_acquire) ==
               ThreadStatus::WORKING) {
        }
    }
}

void Backend::process_tasks(int thread_id) {
    
    #ifdef USE_NUMA
    if(numa_node == -1){
        numa_node = thread_id * numa_num_configured_nodes() / thread_num_;
        struct bitmask* mask = numa_bitmask_alloc(numa_num_configured_nodes());
        numa_bitmask_setbit(mask, numa_node);
        numa_bind(mask);
    }
    #endif

    if (init_func_ != nullptr) {
        init_func_(thread_id);
    }
    while (true) {
        int task_id = thread_state_[thread_id].curr->fetch_add(
            1, std::memory_order_acq_rel);
        if (task_id >= thread_state_[thread_id].end) {
            break;
        }
        compute_func_(task_id);
    }
    for (int t_offset = 1; t_offset < thread_num_; t_offset++) {
        int t_i = (thread_id + t_offset) % thread_num_;
        if (thread_state_[t_i].status->load(std::memory_order_acquire) !=
            ThreadStatus::WORKING) {
            continue;
        }
        while (true) {
            int task_id = thread_state_[t_i].curr->fetch_add(
                1, std::memory_order_acq_rel);
            if (task_id >= thread_state_[t_i].end) {
                break;
            }
            compute_func_(task_id);
        }
    }
    if (finalize_func_ != nullptr) {
        finalize_func_(thread_id);
    }
    thread_state_[thread_id].status->store(ThreadStatus::WAITING,
                                           std::memory_order_release);
}

void Backend::worker_thread(int thread_id) {
    auto start = std::chrono::steady_clock::now();
    thread_local_id = thread_id; // 设置线程本地变量
    while (true) {
        ThreadStatus status =
            thread_state_[thread_id].status->load(std::memory_order_acquire);
        if (status == ThreadStatus::WORKING) {
            process_tasks(thread_id);
            start = std::chrono::steady_clock::now();
        } else if (status == ThreadStatus::WAITING) {
            auto now = std::chrono::steady_clock::now();
            auto duration =
                std::chrono::duration_cast<std::chrono::milliseconds>(now -
                                                                      start)
                    .count();
            if (duration > 50) {
                std::this_thread::sleep_for(std::chrono::milliseconds(1));
            }
        } else if (status == ThreadStatus::EXIT) {
            return;
        }
    }
}

================================================
FILE: kt-sft/csrc/ktransformers_ext/cpu_backend/backend.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:05
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:33:38
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_BACKEND_H
#define CPUINFER_BACKEND_H

#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

enum ThreadStatus {
    WORKING,
    WAITING,
    EXIT,
};

struct ThreadState {
    std::unique_ptr<std::atomic<ThreadStatus>> status;
    std::unique_ptr<std::atomic<int>> curr;
    int end;
};

class Backend {
  public:
    Backend(int);
    ~Backend();
    int get_thread_num();
    void do_work_stealing_job(int, std::function<void(int)>,
                              std::function<void(int)>,
                              std::function<void(int)>);
    #ifdef USE_NUMA
    static thread_local int numa_node;
    #endif
    static thread_local int thread_local_id;

  private:
    int thread_num_;
    int max_thread_num_;
    std::vector<ThreadState> thread_state_; // [thread_num]
    std::function<void(int)> init_func_;
    std::function<void(int)> compute_func_;
    std::function<void(int)> finalize_func_;
    std::vector<std::thread> workers_;

    void process_tasks(int);
    void worker_thread(int);
};
#endif

================================================
FILE: kt-sft/csrc/ktransformers_ext/cpu_backend/cpuinfer.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-16 10:43:18
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-08-07 09:47:43
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
 #ifndef CPUINFER_CPUINFER_H
 #define CPUINFER_CPUINFER_H
 
 #include <atomic>
 #include <condition_variable>
 #include <functional>
 #include <mutex>
 #include <queue>
 #include <thread>
 #include <vector>
 #include <stdexcept>
 #ifdef KTRANSFORMERS_USE_CUDA
 #include "vendors/cuda.h"
 #elif KTRANSFORMERS_USE_MUSA
 #include "vendors/musa.h"
 #elif KTRANSFORMERS_USE_ROCM
 #define __HIP_PLATFORM_AMD__
 #include "vendors/hip.h"
 #endif
 
 #include "backend.h"
 #include "task_queue.h"
 #include "./vendors/vendor.h"
 
 #include "llama.cpp/ggml-impl.h"
 
 class CPUInfer {
    public:
     CPUInfer(int thread_num) {
         backend_ = new Backend(thread_num - 1);
         task_queue_ = new TaskQueue();
         for (int i = 0; i < (1 << 16); ++i) {
             ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i);
         }
     }
 
     ~CPUInfer() {
         delete backend_;
         delete task_queue_;
     }
 
     template <typename Func, typename Obj, typename... Args>
     void enqueue(Func f, Obj* obj, Args... args) {
         task_queue_->enqueue([=]() {
             std::invoke(f, *obj, args..., backend_);
         });
     }
 
     void submit(std::pair<intptr_t, intptr_t> params) {
         void (*func)(void*) = (void (*)(void*))params.first;
         void* args = (void*)params.second;
         *((CPUInfer**)args) = this;
         func(args);
     }
 
     void sync() {
         task_queue_->sync();
     }
 
     void submit_with_cuda_stream(intptr_t user_cuda_stream, std::pair<intptr_t, intptr_t> params) {
        #if defined(KTRANSFORMERS_USE_CUDA) || defined(KTRANSFORMERS_USE_MUSA) || defined(KTRANSFORMERS_USE_ROCM)
         void (*func)(void*) = (void (*)(void*))params.first;
         void* args = (void*)params.second;
         *((CPUInfer**)args) = this;
         cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)func, args);
        #else
         throw std::runtime_error("submit_with_cuda_stream is not supported on this platforma");
        #endif
     }
 
     static void sync_(void* cpu_infer_ptr) {
         CPUInfer* cpuinfer = (CPUInfer*)cpu_infer_ptr;
         cpuinfer->sync();
     }
 
     void sync_with_cuda_stream(intptr_t user_cuda_stream) {
        #if defined(KTRANSFORMERS_USE_CUDA) || defined(KTRANSFORMERS_USE_MUSA) || defined(KTRANSFORMERS_USE_ROCM)
         cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)&sync_, (void*)this);
        #else
         throw std::runtime_error("sync_with_cuda_stream is not supported on this platforma");
        #endif
     }
 
    public:
     Backend* backend_;
     TaskQueue* task_queue_;
 };
 
 #endif

================================================
FILE: kt-sft/csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-08-05 04:49:08
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022 
 * @LastEditTime : 2024-08-05 09:21:29
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "shared_mem_buffer.h"
#include <cstdio>

SharedMemBuffer::SharedMemBuffer() {
    buffer_ = nullptr;
    size_ = 0;
}

SharedMemBuffer::~SharedMemBuffer() {
    if (buffer_) {
        free(buffer_);
    }
}

void SharedMemBuffer::alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests) {
    uint64_t size = 0;
    for (auto& request : requests) {
        size += request.second;
    }
    if (size > size_) {
        if (buffer_) {
            free(buffer_);
        }
        buffer_ = std::aligned_alloc(64, size);

        size_ = size;
        for (auto& obj_requests : hist_requests_) {
            for (auto& requests : obj_requests.second) {
                arrange(requests);
            }
        }
    }
    arrange(requests);
    hist_requests_[object].push_back(requests);
}

void SharedMemBuffer::dealloc(void* object) {
    hist_requests_.erase(object);
}

void SharedMemBuffer::arrange(std::vector<std::pair<void**, uint64_t>> requests) {
    uint64_t offset = 0;
    for (auto& request : requests) {
        *(request.first) = (uint8_t*)buffer_ + offset;
        offset += request.second;
    }
}

================================================
FILE: kt-sft/csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-08-05 04:49:08
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022 
 * @LastEditTime : 2024-08-05 06:36:41
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

 #ifndef CPUINFER_SHAREDMEMBUFFER_H
 #define CPUINFER_SHAREDMEMBUFFER_H
 
 #include <cstdint>
 #include <cstdlib>
 #include <map>
 #include <vector>
 
 class SharedMemBuffer {
    public:
     SharedMemBuffer();
     ~SharedMemBuffer();
 
     void alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests);
     void dealloc(void* object);
 
    private:
     void* buffer_;
     uint64_t size_;
     std::map<void*, std::vector<std::vector<std::pair<void**, uint64_t>>>> hist_requests_;
 
     void arrange(std::vector<std::pair<void**, uint64_t>> requests);
 };
 
 static SharedMemBuffer shared_mem_buffer;
 
 #endif

================================================
FILE: kt-sft/csrc/ktransformers_ext/cpu_backend/task_queue.cpp
================================================
/**
 * @Description :
 * @Author    : chenht2022
 * @Date     : 2024-07-17 12:25:51
 * @Version   : 1.0.0
 * @LastEditors : chenht2022
 * @LastEditTime : 2024-10-09 11:08:10
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "task_queue.h"

TaskQueue::TaskQueue() {
    worker = std::thread(&TaskQueue::processTasks, this);
    sync_flag.store(true, std::memory_order_seq_cst);
    exit_flag.store(false, std::memory_order_seq_cst);
}

TaskQueue::~TaskQueue() {
    {
        mutex.lock();
        exit_flag.store(true, std::memory_order_seq_cst);
        mutex.unlock();
    }
    cv.notify_all();
    if (worker.joinable()) {
        worker.join();
    }
}

void TaskQueue::enqueue(std::function<void()> task) {
    {
        mutex.lock();
        tasks.push(task);
        sync_flag.store(false, std::memory_order_seq_cst);
        mutex.unlock();
    }
    cv.notify_one();
}

void TaskQueue::sync() {
    while (!sync_flag.load(std::memory_order_seq_cst))
        ;
}

void TaskQueue::processTasks() {
    while (true) {
        std::function<void()> task;
        {
            mutex.lock();
            cv.wait(mutex, [this]() { return !tasks.empty() || exit_flag.load(std::memory_order_seq_cst); });
            if (exit_flag.load(std::memory_order_seq_cst) && tasks.empty()) {
                return;
            }
            task = tasks.front();
            tasks.pop();
            mutex.unlock();
        }
        task();
        {
            mutex.lock();
            if (tasks.empty()) {
                sync_flag.store(true, std::memory_order_seq_cst);
            }
            mutex.unlock();
        }
    }
}

================================================
FILE: kt-sft/csrc/ktransformers_ext/cpu_backend/task_queue.h
================================================
/**
 * @Description :
 * @Author    : chenht2022
 * @Date     : 2024-07-16 10:43:18
 * @Version   : 1.0.0
 * @LastEditors : chenht
 * @LastEditTime : 2024-10-09 11:08:07
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_TASKQUEUE_H
#define CPUINFER_TASKQUEUE_H

#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>
#ifdef _WIN32
#include <windows.h>
#endif

class custom_mutex {
   private:
#ifdef _WIN32
    CRITICAL_SECTION cs;
#else
    std::mutex mtx;
#endif

   public:
    custom_mutex() {
#ifdef _WIN32
        InitializeCriticalSection(&cs);
#else
        // No initialization required for std::mutex
#endif
    }

    ~custom_mutex() {
#ifdef _WIN32
        DeleteCriticalSection(&cs);
#endif
    }

    void lock() {
#ifdef _WIN32
        EnterCriticalSection(&cs);
#else
        mtx.lock();
#endif
    }

    void unlock() {
#ifdef _WIN32
        LeaveCriticalSection(&cs);
#else
        mtx.unlock();
#endif
    }

#ifdef _WIN32
    CRITICAL_SECTION* get_handle() {
        return &cs;
    }
#else
    std::mutex* get_handle() {
        return &mtx;
    }
#endif
};

class custom_condition_variable {
   private:
#ifdef _WIN32
    CONDITION_VARIABLE cond_var;
#else
    std::condition_variable cond_var;
#endif

   public:
    custom_condition_variable() {
#ifdef _WIN32
        InitializeConditionVariable(&cond_var);
#endif
    }

    template <typename Predicate>
    void wait(custom_mutex& mutex, Predicate pred) {
#ifdef _WIN32
        while (!pred()) {
            SleepConditionVariableCS(&cond_var, mutex.get_handle(), INFINITE);
        }
#else
        std::unique_lock<std::mutex> lock(*mutex.get_handle(), std::adopt_lock);
        cond_var.wait(lock, pred);
        lock.release();
#endif
    }

    void notify_one() {
#ifdef _WIN32
        WakeConditionVariable(&cond_var);
#else
        cond_var.notify_one();
#endif
    }

    void notify_all() {
#ifdef _WIN32
        WakeAllConditionVariable(&cond_var);
#else
        cond_var.notify_all();
#endif
    }
};

class TaskQueue {
   public:
    TaskQueue();
    ~TaskQueue();

    void enqueue(std::function<void()>);

    void sync();

   private:
    void processTasks();

    std::queue<std::function<void()>> tasks;
    custom_mutex mutex;
    custom_condition_variable cv;
    std::thread worker;
    std::atomic<bool> sync_flag;
    std::atomic<bool> exit_flag;
};
#endif

================================================
FILE: kt-sft/csrc/ktransformers_ext/cpu_backend/vendors/README.md
================================================
## TODO

This directory can be removed after updating the version of `llama.cpp`.

================================================
FILE: kt-sft/csrc/ktransformers_ext/cpu_backend/vendors/cuda.h
================================================
#pragma once

#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>

#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020


================================================
FILE: kt-sft/csrc/ktransformers_ext/cpu_backend/vendors/hip.h
================================================
#pragma once

#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bfloat16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__

#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F  HIPBLAS_R_16F
#define CUDA_R_32F  HIPBLAS_R_32F
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cublasOperation_t hipblasOperation_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cuDeviceGet hipDeviceGet
#define CUdevice hipDevice_t
#define CUdeviceptr hipDeviceptr_t
#define cuMemUnmap hipMemUnmap
#define CUmemAccessDesc hipMemAccessDesc
#define cuMemAddressFree hipMemAddressFree
#define cuMemRelease hipMemRelease
#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
#define cuMemCreate hipMemCreate
#define cuMemAddressReserve hipMemAddressReserve
#define cuMemMap hipMemMap
#define cuMemSetAccess hipMemSetAccess
#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
#define CUmemAllocationProp hipMemAllocationProp
#define cuDeviceGetAttribute hipDeviceGetAttribute
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaGraphExec_t hipGraphExec_t
#define cudaGraphNode_t hipGraphNode_t
#define cudaKernelNodeParams hipKernelNodeParams
#define cudaKernelNodeParams hipKernelNodeParams
#define cudaGraphExecDestroy hipGraphExecDestroy
#define cudaGraphLaunch hipGraphLaunch
#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
#define cudaGraphNodeType hipGraphNodeType
#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
#define cudaGraphInstantiate hipGraphInstantiate
#define cudaStreamEndCapture hipStreamEndCapture
#define cudaGraphDestroy hipGraphDestroy
#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
#define cudaGraphNodeGetType hipGraphNodeGetType
#define cudaGraphGetNodes hipGraphGetNodes
#define cudaGraphExecUpdate hipGraphExecUpdate
#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
#define cudaStreamBeginCapture hipStreamBeginCapture
#define cudaGraph_t hipGraph_t
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define cudaHostFn_t hipHostFn_t
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED

#define __CUDA_ARCH__ 1300

#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
#define GCN
#endif

#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
#define CDNA
#endif

#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif

#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif

#if defined(__gfx1010__) || defined(__gfx1012__)
#define RDNA1
#endif

#ifndef __has_builtin
    #define __has_builtin(x) 0
#endif

typedef hip_bfloat16 nv_bfloat16;


================================================
FILE: kt-sft/csrc/ktransformers_ext/cpu_backend/vendors/musa.h
================================================
#pragma once

#include <musa_runtime.h>
#include <musa.h>
#include <mublas.h>
#include <musa_bf16.h>
#include <musa_fp16.h>
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
#define CUDA_R_16F  MUSA_R_16F
#define CUDA_R_32F  MUSA_R_32F
#define cublasComputeType_t cudaDataType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy
#define cublasGemmEx mublasGemmEx
#define cublasGemmBatchedEx mublasGemmBatchedEx
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
#define cublasHandle_t mublasHandle_t
#define cublasSetMathMode mublasSetMathMode
#define cublasSetStream mublasSetStream
#define cublasSgemm mublasSgemm
#define cublasStatus_t mublasStatus_t
#define cublasOperation_t mublasOperation_t
#define cublasGetStatusString mublasStatus_to_string
#define cudaDataType_t musaDataType_t
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags
#define cudaEventDisableTiming musaEventDisableTiming
#define cudaEventRecord musaEventRecord
#define cudaEventSynchronize musaEventSynchronize
#define cudaEvent_t musaEvent_t
#define cudaEventDestroy musaEventDestroy
#define cudaFree musaFree
#define cudaFreeHost musaFreeHost
#define cudaGetDevice musaGetDevice
#define cudaGetDeviceCount musaGetDeviceCount
#define cudaGetDeviceProperties musaGetDeviceProperties
#define cudaGetErrorString musaGetErrorString
#define cudaGetLastError musaGetLastError
#define cudaHostRegister musaHostRegister
#define cudaHostRegisterPortable musaHostRegisterPortable
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
#define cudaHostUnregister musaHostUnregister
#define cudaLaunchHostFunc musaLaunchHostFunc
#define cudaMalloc musaMalloc
#define cudaMallocHost musaMallocHost
#define cudaMallocManaged musaMallocManaged
#define cudaMemcpy musaMemcpy
#define cudaMemcpyAsync musaMemcpyAsync
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
#define cudaMemcpy2DAsync musaMemcpy2DAsync
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
#define cudaMemcpyKind musaMemcpyKind
#define cudaMemset musaMemset
#define cudaMemsetAsync musaMemsetAsync
#define cudaMemGetInfo musaMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
#define cudaSetDevice musaSetDevice
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
#define cudaStreamDestroy musaStreamDestroy
#define cudaStreamFireAndForget musaStreamFireAndForget
#define cudaStreamNonBlocking musaStreamNonBlocking
#define cudaStreamPerThread musaStreamPerThread
#define cudaStreamSynchronize musaStreamSynchronize
#define cudaStreamWaitEvent musaStreamWaitEvent
#define cudaStream_t musaStream_t
#define cudaSuccess musaSuccess

// Additional mappings for MUSA virtual memory pool
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
#define CUdevice MUdevice
#define CUdeviceptr MUdeviceptr
#define CUmemAccessDesc MUmemAccessDesc
#define CUmemAllocationProp MUmemAllocationProp
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
#define cuDeviceGet muDeviceGet
#define cuDeviceGetAttribute muDeviceGetAttribute
#define cuMemAddressFree muMemAddressFree
#define cuMemAddressReserve muMemAddressReserve
#define cuMemCreate muMemCreate
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
#define cuMemMap muMemMap
#define cuMemRelease muMemRelease
#define cuMemSetAccess muMemSetAccess
#define cuMemUnmap muMemUnmap
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
#define cudaFuncSetAttribute musaFuncSetAttribute
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
#define make_cudaExtent make_musaExtent
#define make_cudaPitchedPtr make_musaPitchedPtr

// Additional mappings for MUSA graphs
#define CUDA_SUCCESS MUSA_SUCCESS
#define CUresult MUresult
#define cuGetErrorString muGetErrorString
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
#define cudaGraphDestroy musaGraphDestroy
#define cudaGraphExecDestroy musaGraphExecDestroy
#define cudaGraphExec_t musaGraphExec_t
#define cudaGraphExecUpdate musaGraphExecUpdate
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
#define cudaGraphGetNodes musaGraphGetNodes
#define cudaGraphInstantiate musaGraphInstantiate
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
#define cudaGraphLaunch musaGraphLaunch
#define cudaGraphNodeGetType musaGraphNodeGetType
#define cudaGraphNode_t musaGraphNode_t
#define cudaGraphNodeType musaGraphNodeType
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
#define cudaGraph_t musaGraph_t
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture

typedef mt_bfloat16 nv_bfloat16;


================================================
FILE: kt-sft/csrc/ktransformers_ext/cpu_backend/vendors/vendor.h
================================================
#ifndef CPUINFER_VENDOR_VENDOR_H
#define CPUINFER_VENDOR_VENDOR_H

#ifdef USE_CUDA
#include "cuda.h"
#elif USE_HIP
#define __HIP_PLATFORM_AMD__
#include "hip.h"
#elif USE_MUSA
#include "musa.h"
#endif

#endif  // CPUINFER_VENDOR_VENDOR_H

================================================
FILE: kt-sft/csrc/ktransformers_ext/cuda/binding.cpp
================================================
/**
 * @Description  :
 * @Author       : Azure-Tang, Boxin Zhang
 * @Date         : 2024-07-25 13:38:30
 * @Version      : 0.2.2
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/

#include "custom_gguf/ops.h"
#ifdef KTRANSFORMERS_USE_CUDA
#include "gptq_marlin/ops.h"
#endif
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
// namespace py = pybind11;

PYBIND11_MODULE(KTransformersOps, m) {

    m.def("dequantize_q8_0", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q8_0((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize q8_0 data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

    m.def("dequantize_q6_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q6_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize q6_k data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

    m.def("dequantize_q5_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q5_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize q5_k data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

    m.def("dequantize_q4_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q4_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize q4_k data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

    m.def("dequantize_q3_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q3_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize q3_k data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

    m.def("dequantize_q2_k", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_q2_k((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize q2_k data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

    m.def("dequantize_iq4_xs", [](const intptr_t data, int num_bytes, int blk_size, const int ele_per_blk, torch::Device device, py::object target_dtype) {
        torch::Dtype dtype = torch::python::detail::py_object_to_dtype(target_dtype);
        return dequantize_iq4_xs((int8_t*)data, num_bytes, blk_size, ele_per_blk, device, dtype);
        }, "Function to dequantize iq4_xs data.",
        py::arg("data"), py::arg("num_bytes"), py::arg("blk_size"), py::arg("ele_per_blk"), py::arg("device"), py::arg("target_dtype"));

#ifdef KTRANSFORMERS_USE_CUDA
    m.def("gptq_marlin_gemm", &gptq_marlin_gemm, "Function to perform GEMM using Marlin quantization.",
        py::arg("a"), py::arg("b_q_weight"), py::arg("b_scales"), py::arg("g_idx"),
        py::arg("perm"), py::arg("workspace"), py::arg("num_bits"), py::arg("size_m"),
        py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"));
#endif
}

================================================
FILE: kt-sft/csrc/ktransformers_ext/cuda/custom_gguf/dequant.cu
================================================
/*
 * @Description  :  
 * @Author       : Azure-Tang, Boxin Zhang
 * @Date         : 2024-07-25 13:38:30
 * @Version      : 0.2.2
 * Adapted from https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c
 * Copyright (c) 2023-2024 The ggml authors
 * Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
 */
#include <cuda_runtime.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
#include <cstdint>
#include <c10/cuda/CUDAGuard.h>

#ifdef __HIP_PLATFORM_AMD__
typedef __hip_bfloat16 nv_bfloat16;
#endif

__global__ void dequantize_q8_0_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
        const int8_t* cur_block = data + block_id * blk_size;
        float scale = __half2float(*((half*)cur_block));
        cur_block += 2;
        for (int i = 0; i < ele_per_blk; i++){
            output_blk[i] = scale * cur_block[i];
        }
    }
}

__global__ void dequantize_q8_0_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
        const int8_t* cur_block = data + block_id * blk_size;
        float scale = __half2float(*((half*)cur_block));
        cur_block += 2;
        for (int i = 0; i < ele_per_blk; i++) {
            output_blk[i] = __float2half(scale * cur_block[i]);
        }
    }
}

__global__ void dequantize_q8_0_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
        const int8_t* cur_block = data + block_id * blk_size;
        float scale = __half2float(*((half*)cur_block));
        cur_block += 2;
        for (int i = 0; i < ele_per_blk; i++) {
            output_blk[i] = __float2bfloat16(scale * cur_block[i]);
        }
    }
}

// __device__ void get_scale_min_k4(int j, const uint8_t * __restrict__ q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
__device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
    if (j < 4) {
        *d = q[j] & 63; *m = q[j + 4] & 63;
    } else {
        *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
        *m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
    }
}

__global__ void dequantize_q2_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 80)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 82)));

        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);

        int is = 0;
        float dl, ml;

        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
                uint8_t sc = *scales;
                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;

                scales = (uint8_t*)(data + block_id * blk_size + (is++));
                sc = *scales;

                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;

                shift += 2;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q2_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 80)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 82)));

        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);

        int is = 0;
        float dl, ml;

        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
                uint8_t sc = *scales;
                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = __float2half(dl * ((int8_t)((q[l] >> shift) & 3)) - ml);

                scales = (uint8_t*)(data + block_id * blk_size + (is++));
                sc = *scales;

                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = __float2half(dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml);

                shift += 2;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q2_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 80)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 82)));

        const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);

        int is = 0;
        float dl, ml;

        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
                uint8_t sc = *scales;
                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l] >> shift) & 3)) - ml);

                scales = (uint8_t*)(data + block_id * blk_size + (is++));
                sc = *scales;

                dl = d * (sc & 0xF); ml = min * (sc >> 4);
                for (int l = 0; l < 16; ++l) *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml);

                shift += 2;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q3_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;    
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);

        uint32_t aux[4];
        const int8_t * scales = (const int8_t*)aux;
        const float d_all = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 108)));

        const uint8_t * __restrict__ q  = (uint8_t*)(data + block_id * blk_size + 32);
        const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
        uint8_t m = 1;


        uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);

        for (int i = 0; i < 3; i++) {  
            aux[i] = 0;  
            for (int j = 0; j < 4; j++) {  
                aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
            }
        }

        uint32_t tmp = aux[2];
        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);

        int is = 0;
        float dl;
        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4));
                }

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4));
                }

                shift += 2;
                m <<= 1;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q3_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;    
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);

        uint32_t aux[4];
        const int8_t * scales = (const int8_t*)aux;
        const float d_all = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 108)));

        const uint8_t * __restrict__ q  = (uint8_t*)(data + block_id * blk_size + 32);
        const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
        uint8_t m = 1;


        uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);

        for (int i = 0; i < 3; i++) {  
            aux[i] = 0;  
            for (int j = 0; j < 4; j++) {  
                aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
            }
        }

        uint32_t tmp = aux[2];
        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);

        int is = 0;
        float dl;
        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = __float2half(dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4)));
                }

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = __float2half(dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4)));
                }

                shift += 2;
                m <<= 1;
            }
            q += 32;
        }
    }
}

__global__ void dequantize_q3_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;    
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+= blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);

        uint32_t aux[4];
        const int8_t * scales = (const int8_t*)aux;
        const float d_all = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 108)));

        const uint8_t * __restrict__ q  = (uint8_t*)(data + block_id * blk_size + 32);
        const uint8_t * __restrict__ hm = (uint8_t*)(data + block_id * blk_size + 0);
        uint8_t m = 1;


        uint8_t* block_scales = (uint8_t*)(data + block_id * blk_size + 96);

        for (int i = 0; i < 3; i++) {  
            aux[i] = 0;  
            for (int j = 0; j < 4; j++) {  
                aux[i] |= ((uint32_t)block_scales[i * 4 + j]) << (j * 8);
            }
        }

        uint32_t tmp = aux[2];
        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);

        int is = 0;
        float dl;
        for (int n = 0; n < 256; n += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4)));
                }

                dl = d_all * (scales[is++] - 32);
                for (int l = 0; l < 16; ++l) {
                    *output_blk++ = __float2bfloat16(dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4)));
                }

                shift += 2;
                m <<= 1;
            }
            q += 32;
        }
    }
}


__global__ void dequantize_q4_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
        // const uint8_t * q = data[i].qs;
        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 2)));
        int is = 0;
        uint8_t sc, m;
        for (int j = 0; j < ele_per_blk; j += 64) {
            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = d1 * (q[l] & 0xF) - m1;
            for (int l = 0; l < 32; ++l) *output_blk++ = d2 * (q[l]  >> 4) - m2;
            q += 32; is += 2;
        }
    }
}

__global__ void dequantize_q4_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
        // const uint8_t * q = data[i].qs;
        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 2)));
        int is = 0;
        uint8_t sc, m;
        for (int j = 0; j < ele_per_blk; j += 64) {
            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d1 * (q[l] & 0xF) - m1);
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d2 * (q[l]  >> 4) - m2);
            q += 32; is += 2;
        }
    }
}

__global__ void dequantize_q4_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
        // const uint8_t * q = data[i].qs;
        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * 144 + 2)));
        int is = 0;
        uint8_t sc, m;
        for (int j = 0; j < ele_per_blk; j += 64) {
            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d1 * (q[l] & 0xF) - m1);
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d2 * (q[l]  >> 4) - m2);
            q += 32; is += 2;
        }
    }
}

__global__ void dequantize_q5_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 2)));

        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);

        int is = 0;
        uint8_t sc, m;
        uint8_t u1 = 1, u2 = 2;
        uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);

        for (int j = 0; j < 256; j += 64) {
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1;
            for (int l = 0; l < 32; ++l) *output_blk++ = d2 * ((ql[l]  >> 4) + (qh[l] & u2 ? 16 : 0)) - m2;
            ql += 32; is += 2;
            u1 <<= 2; u2 <<= 2;
        }
    }
}

__global__ void dequantize_q5_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 2)));

        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);

        int is = 0;
        uint8_t sc, m;
        uint8_t u1 = 1, u2 = 2;
        uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);

        for (int j = 0; j < 256; j += 64) {
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1);
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2half(d2 * ((ql[l]  >> 4) + (qh[l] & u2 ? 16 : 0)) - m2);
            ql += 32; is += 2;
            u1 <<= 2; u2 <<= 2;
        }
    }
}

__global__ void dequantize_q5_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);

        const float d   = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 0)));
        const float min = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 2)));

        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 16);
        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size + 48);

        int is = 0;
        uint8_t sc, m;
        uint8_t u1 = 1, u2 = 2;
        uint8_t* scales = (uint8_t*)(data + block_id * blk_size + 4);

        for (int j = 0; j < 256; j += 64) {
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1);
            for (int l = 0; l < 32; ++l) *output_blk++ = __float2bfloat16(d2 * ((ql[l]  >> 4) + (qh[l] & u2 ? 16 : 0)) - m2);
            ql += 32; is += 2;
            u1 <<= 2; u2 <<= 2;
        }
    }
}

__global__ void dequantize_q6_k_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long  block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 208)));

        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
        const int8_t  * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);


        for (int n = 0; n < ele_per_blk; n += 128) {
            for (int l = 0; l < 32; ++l) {
                int is = l/16;
                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int8_t q3 = (int8_t)((ql[l +  0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int8_t q4 = (int8_t)((ql[l + 32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                output_blk[l +  0] = d * sc[is + 0] * q1;
                output_blk[l + 32] = d * sc[is + 2] * q2;
                output_blk[l + 64] = d * sc[is + 4] * q3;
                output_blk[l + 96] = d * sc[is + 6] * q4;
            }
            output_blk += 128;
            ql += 64;
            qh += 32;
            sc += 8;
        }
    }
}

__global__ void dequantize_q6_k_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long  block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 208)));

        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
        const int8_t  * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);


        for (int n = 0; n < ele_per_blk; n += 128) {
            for (int l = 0; l < 32; ++l) {
                int is = l/16;
                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int8_t q3 = (int8_t)((ql[l +  0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int8_t q4 = (int8_t)((ql[l + 32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                output_blk[l +  0] = __float2half(d * sc[is + 0] * q1);
                output_blk[l + 32] = __float2half(d * sc[is + 2] * q2);
                output_blk[l + 64] = __float2half(d * sc[is + 4] * q3);
                output_blk[l + 96] = __float2half(d * sc[is + 6] * q4);
            }
            output_blk += 128;
            ql += 64;
            qh += 32;
            sc += 8;
        }
    }
}

__global__ void dequantize_q6_k_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long  block_id=global_idx; block_id<num_blocks;block_id+=blockDim.x * gridDim.x){
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size + 208)));

        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
        const int8_t  * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);


        for (int n = 0; n < ele_per_blk; n += 128) {
            for (int l = 0; l < 32; ++l) {
                int is = l/16;
                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int8_t q3 = (int8_t)((ql[l +  0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int8_t q4 = (int8_t)((ql[l + 32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                output_blk[l +  0] = __float2bfloat16(d * sc[is + 0] * q1);
                output_blk[l + 32] = __float2bfloat16(d * sc[is + 2] * q2);
                output_blk[l + 64] = __float2bfloat16(d * sc[is + 4] * q3);
                output_blk[l + 96] = __float2bfloat16(d * sc[is + 6] * q4);
            }
            output_blk += 128;
            ql += 64;
            qh += 32;
            sc += 8;
        }
    }
}

static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

__global__ void dequantize_iq4_xs_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x) {
        float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size)));
        const uint16_t scales_h = *(reinterpret_cast<const uint16_t*>(data + block_id * blk_size + 2));
        const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
        const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);

        for (int ib = 0; ib < 8; ++ib) {
            const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
            const float dl = d * (ls - 32);
            for (int j = 0; j < 16; ++j) {
                output_blk[j + 0] = dl * kvalues_iq4nl[qs[j] & 0xf];
                output_blk[j + 16] = dl * kvalues_iq4nl[qs[j] >> 4];
            }
            output_blk += 32;
            qs += 16;
        }
    }
}

__global__ void dequantize_iq4_xs_fp16_kernel(const int8_t* data, __half* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x) {
        __half* __restrict__ output_blk = (__half*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size)));
        const uint16_t scales_h = *(reinterpret_cast<const uint16_t*>(data + block_id * blk_size + 2));
        const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
        const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);

        for (int ib = 0; ib < 8; ++ib) {
            const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
            const float dl = d * (ls - 32);
            for (int j = 0; j < 16; ++j) {
                output_blk[j + 0] = __float2half(dl * kvalues_iq4nl[qs[j] & 0xf]);
                output_blk[j + 16] = __float2half(dl * kvalues_iq4nl[qs[j] >> 4]);
            }
            output_blk += 32;
            qs += 16;
        }
    }
}

__global__ void dequantize_iq4_xs_bf16_kernel(const int8_t* data, nv_bfloat16* output, const int blk_size, const int ele_per_blk, const int num_blocks) {
    long long global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (long long block_id=global_idx; block_id<num_blocks; block_id+=blockDim.x * gridDim.x) {
        nv_bfloat16* __restrict__ output_blk = (nv_bfloat16*)(output + block_id * ele_per_blk);
        const float d = __half2float(*(reinterpret_cast<const half*>(data + block_id * blk_size)));
        const uint16_t scales_h = *(reinterpret_cast<const uint16_t*>(data + block_id * blk_size + 2));
        const uint8_t* scales_l = (uint8_t*)(data + block_id * blk_size + 2 + 2);
        const uint8_t* qs = (uint8_t*)(data + block_id * blk_size + 2 + 2 + 4);

        for (int ib = 0; ib < 8; ++ib) {
            const int ls = ((scales_l[ib / 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h >> 2 * ib) & 3) << 4);
            const float dl = d * (ls - 32);
            for (int j = 0; j < 16; ++j) {
                output_blk[j + 0] = __float2bfloat16(dl * kvalues_iq4nl[qs[j] & 0xf]);
                output_blk[j + 16] = __float2bfloat16(dl * kvalues_iq4nl[qs[j] >> 4]);
            }
            output_blk += 32;
            qs += 16;
        }
    }
}

torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({ num_bytes }, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({ num_blocks, ele_per_blk }, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q8_0_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q8_0_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q8_0_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }

    cudaDeviceSynchronize();
    return output;
}


torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    // data.numel%blk_size should be 0, else raise err
    int num_blocks = num_bytes / blk_size;

    const at::cuda::OptionalCUDAGuard device_guard(device);
    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, ele_per_blk}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q6_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q6_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q6_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, ele_per_blk}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q5_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q5_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q5_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    // data.numel%blk_size should be 0, else raise err
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, ele_per_blk}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q4_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q4_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q4_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, ele_per_blk}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q3_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q3_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q3_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, ele_per_blk}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_q2_k_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_q2_k_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_q2_k_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype) {
    int num_blocks = num_bytes / blk_size;
    const at::cuda::OptionalCUDAGuard device_guard(device);

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({num_bytes}, options);

    cudaMemcpy(data_gpu.data_ptr<int8_t>(), data, num_bytes, cudaMemcpyHostToDevice);
    //data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, ele_per_blk}, torch::dtype(target_dtype).device(device));

    switch (target_dtype) {
        case torch::kFloat16:
            dequantize_iq4_xs_fp16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (__half*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kBFloat16:
            dequantize_iq4_xs_bf16_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), (nv_bfloat16*)output.data_ptr(), blk_size, ele_per_blk, num_blocks);
            break;
        case torch::kFloat32:
            dequantize_iq4_xs_fp32_kernel<<<512, 256>>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, ele_per_blk, num_blocks);
            break;
        default:
            printf("target type not support\n");
            exit(0);
    }
    cudaDeviceSynchronize();
    return output;
}


================================================
FILE: kt-sft/csrc/ktransformers_ext/cuda/custom_gguf/ops.h
================================================
/**
 * @Description  :
 * @Author       : Azure-Tang
 * @Date         : 2024-07-22 09:27:55
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-12 03:48:46
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once

#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>

torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);

================================================
FILE: kt-sft/csrc/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu
================================================
/*
 * Modified by Neural Magic
 * Copyright (C) Marlin.2024 Elias Frantar
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Adapted from https://github.com/IST-DASLab/marlin
 */
/*
 * Adapted from  https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
 */
#include "gptq_marlin.cuh"
#include "gptq_marlin_dtypes.cuh"
#include <c10/cuda/CUDAGuard.h>
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)               \
  static_assert(std::is_same<scalar_t, half>::value ||          \
                    std::is_same<scalar_t, nv_bfloat16>::value, \
                "only float16 and bfloat16 is supported");

template <typename T>
inline std::string str(T x) {
  return std::to_string(x);
}

namespace gptq_marlin {

#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800) || defined(__HIP_PLATFORM_AMD__)

__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
                                    int const* __restrict__ perm_int_ptr,
                                    int4* __restrict__ out_int4_ptr, int size_m,
                                    int size_k, int block_rows) {}

template <typename scalar_t,          // compute dtype, half or nv_float16
          const int num_bits,         // number of bits used for weights
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
                                      // threadblock
          const int thread_n_blocks,  // same for n dimension (output)
          const int thread_k_blocks,  // same for k dimension (reduction)
          const int stages,  // number of stages for the async global->shared
                             // fetch pipeline
          const bool has_act_order,    // whether act_order is enabled
          const int group_blocks = -1  // number of consecutive 16x16 blocks
                                       // with a separate quantization scale
          >
__global__ void Marlin(
    const int4* __restrict__ A,  // fp16 input matrix of shape mxk
    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
    int4* __restrict__ C,        // fp16 output buffer of shape mxn
    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
                                          // (k/groupsize)xn
    const int* __restrict__ g_idx,        // int32 group indices of shape k
    int num_groups,  // number of scale groups per output channel
    int prob_m,      // batch dimension m
    int prob_n,      // output dimension n
    int prob_k,      // reduction dimension k
    int* locks       // extra global storage for barrier synchronization
) {}

}  // namespace gptq_marlin

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               torch::Tensor& b_scales, torch::Tensor& g_idx,
                               torch::Tensor& perm, torch::Tensor& workspace,
                               int64_t num_bits, int64_t size_m, int64_t size_n,
                               int64_t size_k, bool is_k_full) {
  TORCH_CHECK_NOT_IMPLEMENTED(false,
                              "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
  return torch::empty({1, 1});
}

#else

// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
// output/accumulation.
template <typename scalar_t>
__device__ inline void mma(const typename ScalarType<scalar_t>::FragA& a_frag,
                           const typename ScalarType<scalar_t>::FragB& frag_b,
                           typename ScalarType<scalar_t>::FragC& frag_c) {
  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
  float* c = reinterpret_cast<float*>(&frag_c);
  if constexpr (std::is_same<scalar_t, half>::value) {
    asm volatile(
        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
          "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
  } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
    asm volatile(
        "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
          "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
  } else {
    STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
  }
}

// Instruction for loading a full 16x16 matrix fragment of operand A from shared
// memory, directly in tensor core layout.
template <typename scalar_t>
__device__ inline void ldsm4(typename ScalarType<scalar_t>::FragA& frag_a,
                             const void* smem_ptr) {
  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
               : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
               : "r"(smem));
}

// Lookup-table based 3-input logical operation; explicitly used for
// dequantization as the compiler does not seem to automatically recognize it in
// all cases.
template <int lut>
__device__ inline int lop3(int a, int b, int c) {
  int res;
  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
               : "=r"(res)
               : "r"(a), "r"(b), "r"(c), "n"(lut));
  return res;
}

// Constructs destination register by taking bytes from 2 sources (based on
// mask)
template <int start_byte, int mask>
__device__ inline uint32_t prmt(uint32_t a) {
  uint32_t res;
  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
               : "=r"(res)
               : "r"(a), "n"(start_byte), "n"(mask));
  return res;
}

// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
// values. We mostly follow the strategy in the link below, with some small
// changes:
// - FP16:
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
// - BF16:
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385
template <typename scalar_t>
__device__ inline typename ScalarType<scalar_t>::FragB dequant_4bit(int q) {
  STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
}

template <>
__device__ inline typename ScalarType<half>::FragB dequant_4bit<half>(int q) {
  const int LO = 0x000f000f;
  const int HI = 0x00f000f0;
  const int EX = 0x64006400;
  // Guarantee that the `(a & b) | c` operations are LOP3s.
  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
  // directly into `SUB` and `ADD`.
  const int SUB = 0x64086408;
  const int MUL = 0x2c002c00;
  const int ADD = 0xd480d480;
  typename ScalarType<half>::FragB frag_b;
  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
                      *reinterpret_cast<const half2*>(&SUB));
  frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
                      *reinterpret_cast<const half2*>(&MUL),
                      *reinterpret_cast<const half2*>(&ADD));
  return frag_b;
}

template <>
__device__ inline typename ScalarType<nv_bfloat16>::FragB
dequant_4bit<nv_bfloat16>(int q) {
  static constexpr uint32_t MASK = 0x000f000f;
  static constexpr uint32_t EX = 0x43004300;

  // Guarantee that the `(a & b) | c` operations are LOP3s.

  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
  q >>= 4;
  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);

  typename ScalarType<nv_bfloat16>::FragB frag_b;
  static constexpr uint32_t MUL = 0x3F803F80;
  static constexpr uint32_t ADD = 0xC308C308;

  frag_b[0] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&lo),
                      *reinterpret_cast<const nv_bfloat162*>(&MUL),
                      *reinterpret_cast<const nv_bfloat162*>(&ADD));
  frag_b[1] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&hi),
                      *reinterpret_cast<const nv_bfloat162*>(&MUL),
                      *reinterpret_cast<const nv_bfloat162*>(&ADD));
  return frag_b;
}

// Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or
// bf16 Reference:
// - FP16:
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
// - BF16:
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175
template <typename scalar_t>
__device__ inline typename ScalarType<scalar_t>::FragB dequant_8bit(int q) {
  STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
}

template <>
__device__ inline typename ScalarType<half>::FragB dequant_8bit<half>(int q) {
  static constexpr uint32_t mask_for_elt_01 = 0x5250;
  static constexpr uint32_t mask_for_elt_23 = 0x5351;
  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;

  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);

  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;

  typename ScalarType<half>::FragB frag_b;
  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
  frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
  return frag_b;
}

template <>
__device__ inline typename ScalarType<nv_bfloat16>::FragB
dequant_8bit<nv_bfloat16>(int q) {
  typename ScalarType<nv_bfloat16>::FragB frag_b;

  float fp32_intermediates[4];
  uint32_t* fp32_intermediates_casted =
      reinterpret_cast<uint32_t*>(fp32_intermediates);

  static constexpr uint32_t fp32_base = 0x4B000000;
  fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
  fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
  fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
  fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);

  fp32_intermediates[0] -= 8388736.f;
  fp32_intermediates[1] -= 8388736.f;
  fp32_intermediates[2] -= 8388736.f;
  fp32_intermediates[3] -= 8388736.f;

  uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(&frag_b);
  bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
                                   fp32_intermediates_casted[1], 0x7632);
  bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
                                   fp32_intermediates_casted[3], 0x7632);

  return frag_b;
}

// Multiply dequantized values by the corresponding quantization scale; used
// only for grouped quantization.
template <typename scalar_t>
__device__ inline void scale(typename ScalarType<scalar_t>::FragB& frag_b,
                             typename ScalarType<scalar_t>::FragS& frag_s,
                             int i) {
  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
  scalar_t2 s =
      ScalarType<scalar_t>::num2num2(reinterpret_cast<scalar_t*>(&frag_s)[i]);
  frag_b[0] = __hmul2(frag_b[0], s);
  frag_b[1] = __hmul2(frag_b[1], s);
}

// Same as above, but for act_order (each K is multiplied individually)
template <typename scalar_t>
__device__ inline void scale4(typename ScalarType<scalar_t>::FragB& frag_b,
                              typename ScalarType<scalar_t>::FragS& frag_s_1,
                              typename ScalarType<scalar_t>::FragS& frag_s_2,
                              typename ScalarType<scalar_t>::FragS& frag_s_3,
                              typename ScalarType<scalar_t>::FragS& frag_s_4,
                              int i) {
  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
  scalar_t2 s_val_1_2;
  s_val_1_2.x = reinterpret_cast<scalar_t*>(&frag_s_1)[i];
  s_val_1_2.y = reinterpret_cast<scalar_t*>(&frag_s_2)[i];

  scalar_t2 s_val_3_4;
  s_val_3_4.x = reinterpret_cast<scalar_t*>(&frag_s_3)[i];
  s_val_3_4.y = reinterpret_cast<scalar_t*>(&frag_s_4)[i];

  frag_b[0] = __hmul2(frag_b[0], s_val_1_2);
  frag_b[1] = __hmul2(frag_b[1], s_val_3_4);
}

// Given 2 floats multiply by 2 scales (halves)
template <typename scalar_t>
__device__ inline void scale_float(float* c,
                                   typename ScalarType<scalar_t>::FragS& s) {
  scalar_t* s_ptr = reinterpret_cast<scalar_t*>(&s);
  c[0] = __fmul_rn(c[0], ScalarType<scalar_t>::num2float(s_ptr[0]));
  c[1] = __fmul_rn(c[1], ScalarType<scalar_t>::num2float(s_ptr[1]));
}

// Wait until barrier reaches `count`, then lock for current threadblock.
__device__ inline void barrier_acquire(int* lock, int count) {
  if (threadIdx.x == 0) {
    int state = -1;
    do
      // Guarantee that subsequent writes by this threadblock will be visible
      // globally.
      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
                   : "=r"(state)
                   : "l"(lock));
    while (state != count);
  }
  __syncthreads();
}

// Release barrier and increment visitation count.
__device__ inline void barrier_release(int* lock, bool reset = false) {
  __syncthreads();
  if (threadIdx.x == 0) {
    if (reset) {
      lock[0] = 0;
      return;
    }
    int val = 1;
    // Make sure that all writes since acquiring this barrier are visible
    // globally, while releasing the barrier.
    asm volatile("fence.acq_rel.gpu;\n");
    asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
                 :
                 : "l"(lock), "r"(val));
  }
}

// For a given "a" of size [M,K] performs a permutation of the K columns based
// on the given "perm" indices.
__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
                                    int const* __restrict__ perm_int_ptr,
                                    int4* __restrict__ out_int4_ptr, int size_m,
                                    int size_k, int block_rows) {
  int start_row = block_rows * blockIdx.x;
  int finish_row = start_row + block_rows;
  if (finish_row > size_m) {
    finish_row = size_m;
  }
  int cur_block_rows = finish_row - start_row;

  int row_stride = size_k * sizeof(half) / 16;

  auto permute_row = [&](int row) {
    int iters = size_k / default_threads;
    int rest = size_k % default_threads;

    int offset = row * row_stride;

    half const* a_row_half = reinterpret_cast<half const*>(a_int4_ptr + offset);
    half* out_half = reinterpret_cast<half*>(out_int4_ptr + offset);

    int base_k = 0;

    for (int i = 0; i < iters; i++) {
      int cur_k = base_k + threadIdx.x;
      int src_pos = perm_int_ptr[cur_k];

      out_half[cur_k] = a_row_half[src_pos];

      base_k += default_threads;
    }

    if (rest) {
      if (threadIdx.x < rest) {
        int cur_k = base_k + threadIdx.x;
        int src_pos = perm_int_ptr[cur_k];

        out_half[cur_k] = a_row_half[src_pos];
      }
    }
  };

  for (int i = 0; i < cur_block_rows; i++) {
    int cur_row = start_row + i;
    if (cur_row < size_m) {
      permute_row(cur_row);
    }
  }
}

template <typename scalar_t,          // compute dtype, half or nv_float16
          const int num_bits,         // number of bits used for weights
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
                                      // threadblock
          const int thread_n_blocks,  // same for n dimension (output)
          const int thread_k_blocks,  // same for k dimension (reduction)
          const int stages,  // number of stages for the async global->shared
                             // fetch pipeline
          const bool has_act_order,    // whether act_order is enabled
          const int group_blocks = -1  // number of consecutive 16x16 blocks
                                       // with a separate quantization scale
          >
__global__ void Marlin(
    const int4* __restrict__ A,  // fp16 input matrix of shape mxk
    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
    int4* __restrict__ C,        // fp16 output buffer of shape mxn
    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
                                          // (k/groupsize)xn
    const int* __restrict__ g_idx,        // int32 group indices of shape k
    int num_groups,  // number of scale groups per output channel
    int prob_m,      // batch dimension m
    int prob_n,      // output dimension n
    int prob_k,      // reduction dimension k
    int* locks       // extra global storage for barrier synchronization
) {
  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
  // same size, which might involve multiple column "slices" (of width 16 *
  // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
  // example:
  //   0 1 3
  //   0 2 3
  //   1 2 4
  // While this kind of partitioning makes things somewhat more complicated, it
  // ensures good utilization of all SMs for many kinds of shape and GPU
  // configurations, while requiring as few slow global cross-threadblock
  // reductions as possible.
  using Dtype = ScalarType<scalar_t>;
  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
  using FragA = typename ScalarType<scalar_t>::FragA;
  using FragB = typename ScalarType<scalar_t>::FragB;
  using FragC = typename ScalarType<scalar_t>::FragC;
  using FragS = typename ScalarType<scalar_t>::FragS;

  constexpr int pack_factor = 32 / num_bits;

  // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
  // better partitioning with less reductions
  int parallel = 1;
  if (prob_m > 16 * thread_m_blocks) {
    parallel = prob_m / (16 * thread_m_blocks);
    prob_m = 16 * thread_m_blocks;
  }

  int k_tiles = prob_k / 16 / thread_k_blocks;
  int n_tiles = prob_n / 16 / thread_n_blocks;
  int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x);

  if constexpr (!has_act_order && group_blocks != -1) {
    if (group_blocks >= thread_k_blocks) {
      // Ensure that the number of tiles in each stripe is a multiple of the
      // groupsize; this avoids an annoying special case where a stripe starts
      // in the middle of group.
      iters = (group_blocks / thread_k_blocks) *
              div_ceil(iters, (group_blocks / thread_k_blocks));
    }
  }

  int slice_row = (iters * blockIdx.x) % k_tiles;
  int slice_col_par = (iters * blockIdx.x) / k_tiles;
  int slice_col = slice_col_par;
  int slice_iters;  // number of threadblock tiles in the current slice
  int slice_count =
      0;          // total number of active threadblocks in the current slice
  int slice_idx;  // index of threadblock in current slice; numbered bottom to
                  // top

  // We can easily implement parallel problem execution by just remapping
  // indices and advancing global pointers
  if (slice_col_par >= n_tiles) {
    A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
    C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
    locks += (slice_col_par / n_tiles) * n_tiles;
    slice_col = slice_col_par % n_tiles;
  }

  // Compute all information about the current slice which is required for
  // synchronization.
  auto init_slice = [&]() {
    slice_iters =
        iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
    if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0;
    if (slice_iters == 0) return;
    if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row;
    slice_count = 1;
    slice_idx = 0;
    int col_first = iters * div_ceil(k_tiles * slice_col_par, iters);
    if (col_first <= k_tiles * (slice_col_par + 1)) {
      int col_off = col_first - k_tiles * slice_col_par;
      slice_count = div_ceil(k_tiles - col_off, iters);
      if (col_off > 0) slice_count++;
      int delta_first = iters * blockIdx.x - col_first;
      if (delta_first < 0 || (col_off == 0 && delta_first == 0))
        slice_idx = slice_count - 1;
      else {
        slice_idx = slice_count - 1 - delta_first / iters;
        if (col_off > 0) slice_idx--;
      }
    }
    if (slice_col == n_tiles) {
      A += 16 * thread_m_blocks * prob_k / 8;
      C += 16 * thread_m_blocks * prob_n / 8;
      locks += n_tiles;
      slice_col = 0;
    }
  };
  init_slice();

  // A sizes/strides

  // stride of the A matrix in global memory
  int a_gl_stride = prob_k / 8;
  // stride of an A matrix tile in shared memory
  constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
  // delta between subsequent A tiles in global memory
  constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8;
  // between subsequent accesses within a tile
  int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
  // between shared memory writes
  constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
  // between shared memory tile reads
  constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4));
  // within a shared memory tile
  constexpr int a_sh_rd_delta_i = a_sh_stride * 16;
  // overall size of a tile
  constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks);
  // number of shared write iterations for a tile
  constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta);

  // B sizes/strides
  int b_gl_stride = 16 * prob_n / (pack_factor * 4);
  constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
  constexpr int b_thread_vecs = num_bits == 4 ? 1 : 2;
  constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;

  int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
  constexpr int b_sh_wr_delta = threads * b_thread_vecs;
  constexpr int b_sh_rd_delta = threads * b_thread_vecs;
  constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
  constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;

  // Scale sizes/strides without act_order
  int s_gl_stride = prob_n / 8;
  constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
  constexpr int s_tb_groups =
      !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks
          ? thread_k_blocks / group_blocks
          : 1;
  constexpr int s_sh_stage = s_tb_groups * s_sh_stride;
  int s_gl_rd_delta = s_gl_stride;

  // Scale size/strides with act_order
  constexpr int tb_k = 16 * thread_k_blocks;
  constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0;
  // constexpr int act_s_row_stride      = 1;
  // int           act_s_col_stride      = act_s_row_stride * num_groups;
  int act_s_col_stride = 1;
  int act_s_col_warp_stride = act_s_col_stride * 8;
  int tb_n_warps = thread_n_blocks / 4;
  int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;

  // Global A read index of current thread.
  int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
                (threadIdx.x % a_gl_rd_delta_o);
  a_gl_rd += a_gl_rd_delta_o * slice_row;
  // Shared write index of current thread.
  int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) +
                (threadIdx.x % a_gl_rd_delta_o);
  // Shared read index.
  int a_sh_rd =
      a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
  a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));

  int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
                (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
  b_gl_rd += b_sh_stride * slice_col;
  b_gl_rd += b_gl_rd_delta_o * slice_row;
  int b_sh_wr = threadIdx.x * b_thread_vecs;
  int b_sh_rd = threadIdx.x * b_thread_vecs;

  // For act_order
  constexpr int k_iter_size = tb_k / b_sh_wr_iters;
  int slice_k_start = tb_k * slice_row;
  int slice_k_finish = slice_k_start + tb_k * slice_iters;
  int slice_k_start_shared_fetch = slice_k_start;
  int slice_n_offset = act_s_col_tb_stride * slice_col;

  // No act_order
  int s_gl_rd;
  if constexpr (!has_act_order) {
    if constexpr (group_blocks == -1) {
      s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
    } else {
      s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
                s_sh_stride * slice_col + threadIdx.x;
    }
  }
  int s_sh_wr = threadIdx.x;
  bool s_sh_wr_pred = threadIdx.x < s_sh_stride;

  // We use a different scale layout for grouped and column-wise quantization as
  // we scale a `half2` tile in column-major layout in the former and in
  // row-major in the latter case.
  int s_sh_rd;
  if constexpr (group_blocks != -1)
    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
              (threadIdx.x % 32) / 4;
  else
    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
              (threadIdx.x % 32) % 4;

  // Precompute which thread should not read memory in which iterations; this is
  // needed if there are more threads than required for a certain tilesize or
  // when the batchsize is not a multiple of 16.
  bool a_sh_wr_pred[a_sh_wr_iters];
  #pragma unroll
  for (int i = 0; i < a_sh_wr_iters; i++)
    a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m;

  // To ensure that writing and reading A tiles to/from shared memory, the
  // latter in fragment format, is fully bank conflict free, we need to use a
  // rather fancy XOR-based layout. The key here is that neither reads nor
  // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
  // same shared memory banks. Further, it seems (based on NSight-Compute) that
  // each warp must also write a consecutive memory segment?
  auto transform_a = [&](int i) {
    int row = i / a_gl_rd_delta_o;
    return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row;
  };
  // Since the computation of this remapping is non-trivial and, due to our main
  // loop unrolls, all shared memory accesses are static, we simply precompute
  // both transformed reads and writes.
  int a_sh_wr_trans[a_sh_wr_iters];
  #pragma unroll
  for (int i = 0; i < a_sh_wr_iters; i++)
    a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr);
  int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks];
  #pragma unroll
  for (int i = 0; i < b_sh_wr_iters; i++) {
  #pragma unroll
    for (int j = 0; j < thread_m_blocks; j++)
      a_sh_rd_trans[i][j] =
          transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
  }

  // Since B-accesses have non-constant stride they have to be computed at
  // runtime; we break dependencies between subsequent accesses with a tile by
  // maintining multiple pointers (we have enough registers), a tiny
  // optimization.
  const int4* B_ptr[b_sh_wr_iters];
  #pragma unroll
  for (int i = 0; i < b_sh_wr_iters; i++)
    B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;

  extern __shared__ int4 sh[];
  // Shared memory storage for global fetch pipelines.
  int4* sh_a = sh;
  int4* sh_b = sh_a + (stages * a_sh_stage);
  int4* sh_g_idx = sh_b + (stages * b_sh_stage);
  int4* sh_s = sh_g_idx + (stages * g_idx_stage);

  // Register storage for double buffer of shared memory reads.
  FragA frag_a[2][thread_m_blocks];
  I4 frag_b_quant[2][b_thread_vecs];
  FragC frag_c[thread_m_blocks][4][2];
  FragS frag_s[2][4];         // No act-order
  FragS act_frag_s[2][4][4];  // For act-order

  // Zero accumulators.
  auto zero_accums = [&]() {
  #pragma unroll
    for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
      reinterpret_cast<float*>(frag_c)[i] = 0;
  };

  int sh_first_group_id = -1;
  int sh_num_groups = -1;
  constexpr int sh_max_num_groups = 32;

  auto fetch_scales_to_shared = [&](bool is_async, int first_group_id,
                                    int last_group_id) {
    sh_first_group_id = first_group_id;
    sh_num_groups = last_group_id - first_group_id + 1;

    if (sh_num_groups < sh_max_num_groups) {
      sh_num_groups = sh_max_num_groups;
    }

    if (sh_first_group_id + sh_num_groups > num_groups) {
      sh_num_groups = num_groups - sh_first_group_id;
    }

    int row_offset = first_group_id * s_gl_stride;

    if (is_async) {
      for (int i = 0; i < sh_num_groups; i++) {
        if (threadIdx.x < s_sh_stride) {
          cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x],
                         &scales_ptr[row_offset + (i * s_gl_stride) +
                                     slice_n_offset + threadIdx.x]);
        }
      }
    } else {
      for (int i = 0; i < sh_num_groups; i++) {
        if (threadIdx.x < s_sh_stride) {
          sh_s[(i * s_sh_stride) + threadIdx.x] =
              scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset +
                         threadIdx.x];
        }
      }
    }
  };
  // Asynchronously fetch the next A, B and s tile from global to the next
  // shared memory pipeline location.
  auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
    if (pred) {
      int4* sh_a_stage = sh_a + a_sh_stage * pipe;
  #pragma unroll
      for (int i = 0; i < a_sh_wr_iters; i++) {
        cp_async4_pred(
            &sh_a_stage[a_sh_wr_trans[i]],
            &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off],
            a_sh_wr_pred[i]);
      }
      int4* sh_b_stage = sh_b + b_sh_stage * pipe;
  #pragma unroll
      for (int i = 0; i < b_sh_wr_iters; i++) {
  #pragma unroll
        for (int j = 0; j < b_thread_vecs; j++) {
          cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
        }

        B_ptr[i] += b_gl_rd_delta_o;
      }

      if constexpr (has_act_order) {
        // Fetch g_idx thread-block portion
        int full_pipe = a_off;
        int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe;
        if (cur_k < prob_k && cur_k < slice_k_finish) {
          int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;

          int4 const* cur_g_idx_stage_ptr =
              reinterpret_cast<int4 const*>(&g_idx[cur_k]);

          if (threadIdx.x < g_idx_stage) {
            cp_async4_pred(&sh_g_idx_stage[threadIdx.x],
                           &cur_g_idx_stage_ptr[threadIdx.x]);
          }
        }
      } else {
        if constexpr (group_blocks != -1) {
          int4* sh_s_stage = sh_s + s_sh_stage * pipe;

          if constexpr (group_blocks >= thread_k_blocks) {
            // Only fetch scales if this tile starts a new group
            if (pipe % (group_blocks / thread_k_blocks) == 0) {
              if (s_sh_wr_pred) {
                cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
              }
              s_gl_rd += s_gl_rd_delta;
            }
          } else {
            for (int i = 0; i < s_tb_groups; i++) {
              if (s_sh_wr_pred) {
                cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr],
                          &scales_ptr[s_gl_rd]);
              }
              s_gl_rd += s_gl_rd_delta;
            }
          }
        }
      }
    }
    // Insert a fence even when we are winding down the pipeline to ensure that
    // waiting is also correct at this point.
    cp_async_fence();
  };

  // Wait until the next thread tile has been loaded to shared memory.
  auto wait_for_stage = [&]() {
    // We only have `stages - 2` active fetches since we are double buffering
    // and can only issue the next fetch when it is guaranteed that the previous
    // shared memory load is fully complete (as it may otherwise be
    // overwritten).
    cp_async_wait<stages - 2>();
    __syncthreads();
  };

  // Load the next sub-tile from the current location in the shared memory pipe
  // into the current register buffer.
  auto fetch_to_registers = [&](int k, int pipe) {
    int4* sh_a_stage = sh_a + a_sh_stage * pipe;
  #pragma unroll
    for (int i = 0; i < thread_m_blocks; i++)
      ldsm4<scalar_t>(frag_a[k % 2][i],
                      &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
    int4* sh_b_stage = sh_b + b_sh_stage * pipe;

  #pragma unroll
    for (int i = 0; i < b_thread_vecs; i++) {
      frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(
          &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
    }
  };

  bool is_same_group[stages];
  int same_group_id[stages];

  auto init_same_group = [&](int pipe) {
    if constexpr (!has_act_order) {
      is_same_group[pipe] = false;
      same_group_id[pipe] = 0;
      return;
    }

    int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
    int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);

    int group_id_1 = sh_g_idx_int_ptr[0];
    int group_id_2 = sh_g_idx_int_ptr[tb_k - 1];

    is_same_group[pipe] = group_id_1 == group_id_2;
    same_group_id[pipe] = group_id_1;
  };

  auto fetch_scales_to_registers = [&](int k, int full_pipe) {
    int pipe = full_pipe % stages;

    if constexpr (!has_act_order) {
      // No act-order case
      if constexpr (group_blocks != -1) {
        if constexpr (group_blocks >= thread_k_blocks) {
          int4* sh_s_stage =
              sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
                                   (pipe / (group_blocks / thread_k_blocks)));
          reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
        } else {
          int warp_id = threadIdx.x / 32;
          int n_warps = thread_n_blocks / 4;

          int warp_row = warp_id / n_warps;

          int cur_k = warp_row * 16;
          cur_k += k_iter_size * (k % b_sh_wr_iters);

          int k_blocks = cur_k / 16;
          int cur_group_id = k_blocks / group_blocks;

          int4* sh_s_stage = sh_s + s_sh_stage * pipe;

          reinterpret_cast<int4*>(&frag_s[k % 2])[0] =
              sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
        }
      }

      return;
    }

    // Act-order case

    // Determine K of the "current" thread-block
    int cur_k = slice_k_start + tb_k * full_pipe;
    if (cur_k >= prob_k || cur_k >= slice_k_finish) {
      return;
    }

    // Reset (to current thread-block) since we read g_idx portion from the
    // shared memory
    cur_k = 0;

    // Progress to current iteration
    cur_k += k_iter_size * (k % b_sh_wr_iters);

    // Determine "position" inside the thread-block (based on warp and
    // thread-id)
    int warp_id = threadIdx.x / 32;
    int n_warps =
        thread_n_blocks / 4;  // Each warp processes 4 16-size tiles over N

    int warp_row = warp_id / n_warps;
    int warp_col = warp_id % n_warps;

    cur_k += warp_row * 16;

    int th_id = threadIdx.x % 32;
    cur_k += (th_id % 4) * 2;  // Due to tensor-core layout for fp16 B matrix

    int s_col_shift =
        /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) +
        (th_id / 4) * act_s_col_stride;

    if (is_same_group[pipe]) {
      if (k % 2 == 0) {
        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
            sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride +
                 s_col_shift];
      } else {
        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
            *(reinterpret_cast<int4*>(&(act_frag_s[(k - 1) % 2][0][0])));
      }

      for (int i = 1; i < 4; i++) {
        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
            *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0])));
      }
      return;
    }

    int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
    int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);

    constexpr int k_frag_offsets[4] = {0, 1, 8,
                                       9};  // Tensor core offsets per thread

  #pragma unroll
    for (int i = 0; i < 4; i++) {
      int actual_k = cur_k + k_frag_offsets[i];

      int group_id = sh_g_idx_int_ptr[actual_k];
      int rel_group_id = group_id - sh_first_group_id;

      *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
          sh_s[rel_group_id * s_sh_stride + s_col_shift];
    }
  };

  // Execute the actual tensor core matmul of a sub-tile.
  auto matmul = [&](int k) {
  // We have the m dimension as the inner loop in order to encourage overlapping
  // dequantization and matmul operations.
  #pragma unroll
    for (int j = 0; j < 4; j++) {
      FragB frag_b0;
      FragB frag_b1;
      if constexpr (num_bits == 4) {
        int b_quant = frag_b_quant[k % 2][0][j];
        int b_quant_shift = b_quant >> 8;

        frag_b0 = dequant_4bit<scalar_t>(b_quant);
        frag_b1 = dequant_4bit<scalar_t>(b_quant_shift);

      } else {
        int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
        int b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
        int b_quant_1 = frag_b_quant_ptr[j * 2 + 1];

        frag_b0 = dequant_8bit<scalar_t>(b_quant_0);
        frag_b1 = dequant_8bit<scalar_t>(b_quant_1);
      }

      // Apply scale to frag_b0
      if constexpr (has_act_order) {
        scale4<scalar_t>(frag_b0, act_frag_s[k % 2][0][j],
                         act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j],
                         act_frag_s[k % 2][3][j], 0);
      } else {
        if constexpr (group_blocks != -1) {
          scale<scalar_t>(frag_b0, frag_s[k % 2][j], 0);
        }
      }

      // Apply scale to frag_b1
      if constexpr (has_act_order) {
        scale4<scalar_t>(frag_b1, act_frag_s[k % 2][0][j],
                         act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j],
                         act_frag_s[k % 2][3][j], 1);

      } else {
        if constexpr (group_blocks != -1) {
          scale<scalar_t>(frag_b1, frag_s[k % 2][j], 1);
        }
      }

  #pragma unroll
      for (int i = 0; i < thread_m_blocks; i++) {
        mma<scalar_t>(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]);
        mma<scalar_t>(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]);
      }
    }
  };

  // Since we slice across the k dimension of a tile in order to increase the
  // number of warps while keeping the n dimension of a tile reasonable, we have
  // multiple warps that accumulate their partial sums of the same output
  // location; which we have to reduce over in the end. We do in shared memory.
  auto thread_block_reduce = [&]() {
    constexpr int red_off = threads / b_sh_stride_threads / 2;
    if (red_off >= 1) {
      int red_idx = threadIdx.x / b_sh_stride_threads;
      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
      constexpr int red_sh_delta = b_sh_stride_threads;
      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
                      (threadIdx.x % b_sh_stride_threads);

      // Parallel logarithmic shared memory reduction. We make sure to avoid any
      // unnecessary read or write iterations, e.g., for two warps we write only
      // once by warp 1 and read only once by warp 0.

  #pragma unroll
      for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
  #pragma unroll
        for (int i = red_off; i > 0; i /= 2) {
          if (i <= red_idx && red_idx < 2 * i) {
  #pragma unroll
            for (int j = 0; j < 4 * 2; j++) {
              int red_sh_wr =
                  red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
              if (i < red_off) {
                float* c_rd =
                    reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
                float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
  #pragma unroll
                for (int k = 0; k < 4; k++)
                  reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
                      c_rd[k] + c_wr[k];
              }
              sh[red_sh_wr] =
                  reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
            }
          }
          __syncthreads();
        }
        if (red_idx == 0) {
  #pragma unroll
          for (int i = 0; i < 4 * 2; i++) {
            float* c_rd =
                reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
  #pragma unroll
            for (int j = 0; j < 4; j++)
              reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
                  c_rd[j];
          }
        }
        __syncthreads();
      }
    }
  };

  // Since multiple threadblocks may process parts of the same column slice, we
  // finally have to globally reduce over the results. As the striped
  // partitioning minimizes the number of such reductions and our outputs are
  // usually rather small, we perform this reduction serially in L2 cache.
  auto global_reduce = [&](bool first = false, bool last = false) {
    // We are very careful here to reduce directly in the output buffer to
    // maximize L2 cache utilization in this step. To do this, we write out
    // results in FP16 (but still reduce with FP32 compute).
    constexpr int active_threads = 32 * thread_n_blocks / 4;
    if (threadIdx.x < active_threads) {
      int c_gl_stride = prob_n / 8;
      int c_gl_wr_delta_o = 8 * c_gl_stride;
      int c_gl_wr_delta_i = 4 * (active_threads / 32);
      int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) +
                    4 * (threadIdx.x / 32) + threadIdx.x % 4;
      c_gl_wr += (2 * thread_n_blocks) * slice_col;
      constexpr int c_sh_wr_delta = active_threads;
      int c_sh_wr = threadIdx.x;

      int row = (threadIdx.x % 32) / 4;

      if (!first) {
  // Interestingly, doing direct global accesses here really seems to mess up
  // the compiler and lead to slowdowns, hence we also use async-copies even
  // though these fetches are not actually asynchronous.
  #pragma unroll
        for (int i = 0; i < thread_m_blocks * 4; i++) {
          cp_async4_pred(
              &sh[c_sh_wr + c_sh_wr_delta * i],
              &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
                 c_gl_wr_delta_i * (i % 2)],
              i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
        }
        cp_async_fence();
        cp_async_wait<0>();
      }

  #pragma unroll
      for (int i = 0; i < thread_m_blocks * 4; i++) {
        if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
          if (!first) {
            int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
  #pragma unroll
            for (int j = 0; j < 2 * 4; j++) {
              reinterpret_cast<float*>(
                  &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] +=
                  Dtype::num2float(reinterpret_cast<scalar_t*>(&c_red)[j]);
            }
          }
          if (!last) {
            int4 c;
  #pragma unroll
            for (int j = 0; j < 2 * 4; j++) {
              reinterpret_cast<scalar_t*>(&c)[j] =
                  Dtype::float2num(reinterpret_cast<float*>(
                      &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]);
            }
            C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] =
                c;
          }
        }
      }
    }
  };

  // Write out the reduce final result in the correct layout. We only actually
  // reshuffle matrix fragments in this step, the reduction above is performed
  // in fragment layout.
  auto write_result = [&]() {
    int c_gl_stride = prob_n / 8;
    constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
    constexpr int c_sh_rd_delta =
        c_sh_stride * (threads / (2 * thread_n_blocks));

    int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) +
                  (threadIdx.x % (2 * thread_n_blocks));
    c_gl_wr += (2 * thread_n_blocks) * slice_col;
    int c_sh_wr =
        (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4;
    c_sh_wr += 32 * (threadIdx.x / 32);
    int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) +
                  (threadIdx.x % (2 * thread_n_blocks));

    int c_gl_wr_end = c_gl_stride * prob_m;

    // We first reorder in shared memory to guarantee the most efficient final
    // global write patterns
    auto write = [&](int idx, float c0, float c1, FragS& s) {
      scalar_t2 res =
          Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1));

      // For per-column quantization we finally apply the scale here (only for
      // 4-bit)
      if constexpr (!has_act_order && group_blocks == -1 && num_bits == 4) {
        res = __hmul2(res, s[0]);
      }

      ((scalar_t2*)sh)[idx] = res;
    };

    if (threadIdx.x / 32 < thread_n_blocks / 4) {
  #pragma unroll
      for (int i = 0; i < thread_m_blocks; i++) {
  #pragma unroll
        for (int j = 0; j < 4; j++) {
          int wr = c_sh_wr + 8 * j;
          write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
                frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
          write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
                frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
          write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
                frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
          write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
                frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
        }
        c_sh_wr += 16 * (4 * c_sh_stride);
      }
    }
    __syncthreads();

  #pragma unroll
    for (int i = 0;
         i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
         i++) {
      if (c_gl_wr < c_gl_wr_end) {
        C[c_gl_wr] = sh[c_sh_rd];
        c_gl_wr += c_gl_wr_delta;
        c_sh_rd += c_sh_rd_delta;
      }
    }
  };

  // Start global fetch and register load pipelines.
  auto start_pipes = [&]() {

  #pragma unroll
    for (int i = 0; i < stages - 1; i++) {
      if (has_act_order && i == 0) {
        int last_g_idx = slice_k_start + stages * tb_k * 2;
        if (last_g_idx >= prob_k) {
          last_g_idx = prob_k - 1;
        }
        fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]);
      }
      fetch_to_shared(i, i, i < slice_iters);
    }

    zero_accums();
    wait_for_stage();
    init_same_group(0);
    fetch_to_registers(0, 0);
    fetch_scales_to_registers(0, 0);
    a_gl_rd += a_gl_rd_delta_o * (stages - 1);
    slice_k_start_shared_fetch += tb_k * (stages - 1);
  };
  if (slice_iters) {
    start_pipes();
  }

  // Main loop.
  while (slice_iters) {
    // We unroll over both the global fetch and the register load pipeline to
    // ensure all shared memory accesses are static. Note that both pipelines
    // have even length meaning that the next iteration will always start at
    // index 0.

  #pragma unroll
    for (int pipe = 0; pipe < stages;) {
  #pragma unroll
      for (int k = 0; k < b_sh_wr_iters; k++) {
        fetch_to_registers(k + 1, pipe % stages);
        fetch_scales_to_registers(k + 1, pipe);
        if (k == b_sh_wr_iters - 2) {
          fetch_to_shared((pipe + stages - 1) % stages, pipe,
                          slice_iters >= stages);
          pipe++;
          wait_for_stage();
          init_same_group(pipe % stages);
        }
        matmul(k);
      }
      slice_iters--;
      if (slice_iters == 0) {
        break;
      }
    }

    a_gl_rd += a_gl_rd_delta_o * stages;
    slice_k_start += tb_k * stages;
    slice_k_start_shared_fetch += tb_k * stages;

    if constexpr (has_act_order) {
      int first_group_id = g_idx[slice_k_start];
      int last_g_idx = slice_k_start + stages * tb_k * 2;
      if (last_g_idx >= prob_k) {
        last_g_idx = prob_k - 1;
      }
      int last_group_id = g_idx[last_g_idx];
      if (last_group_id >= sh_first_group_id + sh_num_groups) {
        fetch_scales_to_shared(false, first_group_id, last_group_id);
        __syncthreads();
      }
    }

    // Process results and, if necessary, proceed to the next column slice.
    // While this pattern may not be the most readable, other ways of writing
    // the loop seemed to noticeably worse performance after compilation.
    if (slice_iters == 0) {
      cp_async_wait<0>();
      bool last = slice_idx == slice_count - 1;
      // For per-column scales, we only fetch them here in the final step before
      // write-out
      if constexpr (!has_act_order && group_blocks == -1) {
        if constexpr (num_bits == 8) {
          if (s_sh_wr_pred) {
            cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
          }
          cp_async_fence();
        } else {
          if (last) {
            if (s_sh_wr_pred) {
              cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
            }
            cp_async_fence();
          }
        }
      }

      thread_block_reduce();
      if constexpr (!has_act_order && group_blocks == -1) {
        if constexpr (num_bits == 8) {
          cp_async_wait<0>();
          __syncthreads();
          if (threadIdx.x / 32 < thread_n_blocks / 4) {
            reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
            reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
          }

        } else {
          if (last) {
            cp_async_wait<0>();
            __syncthreads();
            if (threadIdx.x / 32 < thread_n_blocks / 4) {
              reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
              reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
            }
          }
        }
      }

      // For 8-bit channelwise, we apply the scale before the global reduction
      // that converts the fp32 results to fp16 (so that we avoid possible
      // overflow in fp16)
      if constexpr (!has_act_order && group_blocks == -1 && num_bits == 8) {
        if (threadIdx.x / 32 < thread_n_blocks / 4) {
  #pragma unroll
          for (int i = 0; i < thread_m_blocks; i++) {
  #pragma unroll
            for (int j = 0; j < 4; j++) {
              scale_float<scalar_t>(
                  reinterpret_cast<float*>(&frag_c[i][j][0][0]),
                  frag_s[j / 2][2 * (j % 2) + 0]);
              scale_float<scalar_t>(
                  reinterpret_cast<float*>(&frag_c[i][j][0][2]),
                  frag_s[j / 2][2 * (j % 2) + 0]);

              scale_float<scalar_t>(
                  reinterpret_cast<float*>(&frag_c[i][j][1][0]),
                  frag_s[j / 2][2 * (j % 2) + 1]);
              scale_float<scalar_t>(
                  reinterpret_cast<float*>(&frag_c[i][j][1][2]),
                  frag_s[j / 2][2 * (j % 2) + 1]);
            }
          }
        }
      }

      if (slice_count > 1) {  // only globally reduce if there is more than one
                              // block in a slice
        barrier_acquire(&locks[slice_col], slice_idx);
        global_reduce(slice_idx == 0, last);
        barrier_release(&locks[slice_col], last);
      }
      if (last)  // only the last block in a slice actually writes the result
        write_result();
      slice_row = 0;
      slice_col_par++;
      slice_col++;
      init_slice();
      if (slice_iters) {
        a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
                  (threadIdx.x % a_gl_rd_delta_o);
  #pragma unroll
        for (int i = 0; i < b_sh_wr_iters; i++)
          B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
        if (slice_col == 0) {
  #pragma unroll
          for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
        }

        // Update slice k/n for scales loading
        if constexpr (has_act_order) {
          slice_k_start = tb_k * slice_row;
          slice_k_finish = slice_k_start + tb_k * slice_iters;
          slice_k_start_shared_fetch = slice_k_start;
          slice_n_offset = act_s_col_tb_stride * slice_col;

        } else {
          s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
        }

        start_pipes();
      }
    }
  }
}

  #define __CALL_IF(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,                \
                    THREAD_K_BLOCKS, HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS) \
    else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS &&     \
             thread_n_blocks == THREAD_N_BLOCKS &&                             \
             thread_k_blocks == THREAD_K_BLOCKS &&                             \
             has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS && \
             num_threads == NUM_THREADS) {                                     \
      cudaFuncSetAttribute(                                                    \
          Marlin<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,             \
                 THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER, \
                 GROUP_BLOCKS>,                                                \
          cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);        \
      Marlin<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,                 \
             THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER,     \
             GROUP_BLOCKS><<<blocks, NUM_THREADS, max_shared_mem, stream>>>(   \
          A_ptr, B_ptr, C_ptr, s_ptr, g_idx_ptr, num_groups, prob_m, prob_n,   \
          prob_k, locks);                                                      \
    }

typedef struct {
  int thread_k;
  int thread_n;
  int num_threads;
} thread_config_t;

typedef struct {
  int max_m_blocks;
  thread_config_t tb_cfg;
} exec_config_t;

thread_config_t small_batch_thread_configs[] = {
    // Ordered by priority

    // thread_k, thread_n, num_threads
    {128, 128, 256},
    {64, 128, 128},
    {128, 64, 128},
};

thread_config_t large_batch_thread_configs[] = {
    // Ordered by priority

    // thread_k, thread_n, num_threads
    {64, 256, 256},
    {64, 128, 128},
    {128, 64, 128},

};

int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
                          int prob_n, int prob_k, int num_bits, int group_size,
                          bool has_act_order, bool is_k_full) {
  bool cache_scales_chunk = has_act_order && !is_k_full;

  int tb_n = th_config.thread_n;
  int tb_k = th_config.thread_k;

  // Get max scale groups per thread-block
  int tb_groups;
  if (group_size == -1) {
    tb_groups = 1;
  } else if (group_size == 0) {
    tb_groups = div_ceil(tb_k, 32);  // Worst case is 32 group size
  } else {
    tb_groups = div_ceil(tb_k, group_size);
  }

  if (cache_scales_chunk) {
    int load_groups =
        tb_groups * pipe_stages * 2;     // Chunk size is 2x pipeline over dim K
    load_groups = max(load_groups, 32);  // We load at least 32 scale groups
    return load_groups * tb_n * 2;

  } else {
    int tb_scales = tb_groups * tb_n * 2;

    return tb_scales * pipe_stages;
  }
}

bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
                         int prob_m, int prob_n, int prob_k, int num_bits,
                         int scales_cache_size, int max_shared_mem) {
  int pack_factor = 32 / num_bits;

  // Get B size
  int tb_k = th_config.thread_k;
  int tb_n = th_config.thread_n;

  int b_size = (tb_k * tb_n / pack_factor) * 4;

  // Get A size
  int m_blocks = div_ceil(prob_m, 16);
  int tb_max_m = 16;

  while (true) {
    if (m_blocks >= max_m_blocks) {
      tb_max_m *= max_m_blocks;
      break;
    }

    max_m_blocks--;
    if (max_m_blocks == 0) {
      TORCH_CHECK(false, "Unexpected m_blocks = ", m_blocks);
    }
  }

  int a_size = (tb_max_m * tb_k) * 2;

  float pipe_size = (a_size + b_size) * pipe_stages;

  TORCH_CHECK(max_shared_mem / 2 > scales_cache_size);  // Sanity

  return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
}

bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,
                     int prob_m, int prob_n, int prob_k, int num_bits,
                     int group_size, bool has_act_order, bool is_k_full,
                     int max_shared_mem) {
  // Sanity
  if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
      th_config.num_threads == -1) {
    return false;
  }

  // Verify K/N are divisible by thread K/N
  if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
    return false;
  }

  // Verify min for thread K/N
  if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
    return false;
  }

  // num_threads must be at least 128 (= 4 warps)
  if (th_config.num_threads < 128) {
    return false;
  }

  //  Determine cache for scales
  int scales_cache_size =
      get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
                            group_size, has_act_order, is_k_full);

  // Check that pipeline fits into cache
  if (!is_valid_cache_size(th_config, max_m_blocks, prob_m, prob_n, prob_k,
                           num_bits, scales_cache_size, max_shared_mem)) {
    return false;
  }

  return true;
}

exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
                                      int num_bits, int group_size,
                                      bool has_act_order, bool is_k_full,
                                      int max_shared_mem) {
  int max_m_blocks = 4;
  while (max_m_blocks > 0) {
    if (prob_m <= 16) {
      for (auto th_config : small_batch_thread_configs) {
        if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k,
                            num_bits, group_size, has_act_order, is_k_full,
                            max_shared_mem)) {
          return exec_config_t{max_m_blocks, th_config};
        }
      }
    } else {
      for (auto th_config : large_batch_thread_configs) {
        if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k,
                            num_bits, group_size, has_act_order, is_k_full,
                            max_shared_mem)) {
          return exec_config_t{max_m_blocks, th_config};
        }
      }
    }

    max_m_blocks--;  // Process less M blocks per invocation to reduce cache
                     // usage
  }

  return exec_config_t{0, {-1, -1, -1}};
}

  #define CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS)           \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
                                                                       \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
                                                                       \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
                                                                       \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
                                                                       \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)

template <typename scalar_t>
void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
                     void* g_idx, void* perm, void* a_tmp, int prob_m,
                     int prob_n, int prob_k, void* workspace, int num_bits,
                     bool has_act_order, bool is_k_full, int num_groups,
                     int group_size, int dev, cudaStream_t stream, int thread_k,
                     int thread_n, int sms, int max_par) {
  TORCH_CHECK(num_bits == 4 || num_bits == 8,
              "num_bits must be 4 or 8. Got = ", num_bits);
  TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
              ", ", prob_n, ", ", prob_k, "]");

  int tot_m = prob_m;
  int tot_m_blocks = div_ceil(tot_m, 16);
  int pad = 16 * tot_m_blocks - tot_m;

  if (sms == -1) {
    cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
  }

  int max_shared_mem = 0;
  cudaDeviceGetAttribute(&max_shared_mem,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
  TORCH_CHECK(max_shared_mem > 0);

  // Set thread config
  exec_config_t exec_cfg;
  if (thread_k != -1 && thread_n != -1) {
    // User-defined config
    exec_cfg =
        exec_config_t{4, thread_config_t{thread_k, thread_n, default_threads}};
  } else {
    // Auto config
    exec_cfg =
        determine_thread_config(prob_m, prob_n, prob_k, num_bits, group_size,
                                has_act_order, is_k_full, max_shared_mem);
  }

  TORCH_CHECK(exec_cfg.max_m_blocks > 0 &&
                  is_valid_config(exec_cfg.tb_cfg, exec_cfg.max_m_blocks,
                                  prob_m, prob_n, prob_k, num_bits, group_size,
                                  has_act_order, is_k_full, max_shared_mem),
              "Invalid thread config: max_m_blocks = ", exec_cfg.max_m_blocks,
              ", thread_k = ", exec_cfg.tb_cfg.thread_k,
              ", thread_n = ", exec_cfg.tb_cfg.thread_n,
              ", num_threads = ", exec_cfg.tb_cfg.num_threads, " for MKN = [",
              prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
              ", group_size = ", group_size,
              ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
              ", max_shared_mem = ", max_shared_mem);

  int num_threads = exec_cfg.tb_cfg.num_threads;
  thread_k = exec_cfg.tb_cfg.thread_k;
  thread_n = exec_cfg.tb_cfg.thread_n;

  int thread_k_blocks = thread_k / 16;
  int thread_n_blocks = thread_n / 16;

  int blocks = sms;

  TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
              " is not divisible by thread_n = ", thread_n);
  TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
              " is not divisible by thread_k = ", thread_k);

  int group_blocks = 0;
  if (has_act_order) {
    if (is_k_full) {
      TORCH_CHECK(group_size != -1);
      group_blocks = group_size / 16;
      TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
                  " is not divisible by group_blocks = ", group_blocks);
    } else {
      TORCH_CHECK(group_size == 0);
      group_blocks = 0;
    }

  } else {
    if (group_size == -1) {
      group_blocks = -1;
    } else {
      group_blocks = group_size / 16;
      TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
                  " is not divisible by group_blocks = ", group_blocks);
    }
  }

  const int4* A_ptr = (const int4*)A;
  const int4* B_ptr = (const int4*)B;
  int4* C_ptr = (int4*)C;
  const int4* s_ptr = (const int4*)s;
  const int* g_idx_ptr = (const int*)g_idx;
  const int* perm_ptr = (const int*)perm;
  int4* a_tmp_ptr = (int4*)a_tmp;

  int* locks = (int*)workspace;

  if (has_act_order) {
    // Permute A columns
    int block_rows = div_ceil(prob_m, blocks);
    permute_cols_kernel<<<blocks, default_threads, 0, stream>>>(
        A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, block_rows);
    A_ptr = a_tmp_ptr;
  }

  // If we have a full K, then we can run the non-act-order version of Marlin
  // (since the weight rows are reordered by increasing group ids, and by having
  // a full K, we have full original groups)
  if (is_k_full) {
    has_act_order = false;
  }

  // Main loop
  for (int i = 0; i < tot_m_blocks; i += exec_cfg.max_m_blocks) {
    int thread_m_blocks = tot_m_blocks - i;
    prob_m = tot_m - 16 * i;
    int par = 1;
    if (thread_m_blocks > exec_cfg.max_m_blocks) {
      // Note that parallel > 1 currently only works for inputs without any
      // padding
      par = (16 * thread_m_blocks - pad) / (16 * exec_cfg.max_m_blocks);
      if (par > max_par) par = max_par;
      prob_m = (16 * exec_cfg.max_m_blocks) * par;
      i += exec_cfg.max_m_blocks * (par - 1);
      thread_m_blocks = exec_cfg.max_m_blocks;
    }


    // Define kernel configurations
#define undefined_error TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + \
    str(prob_n) + ", " + str(prob_k) + "]" + \
        ", has_act_order = " + str(has_act_order) + \
        ", num_groups = " + str(num_groups) + \
        ", group_size = " + str(group_size) + \
        ", thread_m_blocks = " + str(thread_m_blocks) + \
        ", thread_n_blocks = " + str(thread_n_blocks) + \
        ", thread_k_blocks = " + str(thread_k_blocks));


    if (num_bits == 4 && num_threads == 256)
    {
        if (false) {
        }
        CALL_IF(4, 32, 2, 256)
        CALL_IF(4, 16, 4, 256)
        CALL_IF(4, 8, 8, 256)
        else {
            undefined_error
        }
    }
    else if (num_bits == 4 && num_threads == 128)
    {
        if (false) {
        }
        CALL_IF(4, 8, 4, 128)
        CALL_IF(4, 4, 8, 128)
        else {
            undefined_error
        }
    }
    else if (num_bits == 8 && num_threads == 256)
    {
        if (false) {
        }
        CALL_IF(8, 32, 2, 256)
        CALL_IF(8, 16, 4, 256)
        CALL_IF(8, 8, 8, 256)
        else {
            undefined_error
        }
    }
    else if (num_bits == 8 && num_threads == 128)
    {
        if (false) {
        }
        CALL_IF(8, 8, 4, 128)
        CALL_IF(8, 4, 8, 128)
        else {
            undefined_error
        }
    }
    else {
        undefined_error
    }

    A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
    C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
  }
}

}  // namespace gptq_marlin

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               torch::Tensor& b_scales, torch::Tensor& g_idx,
                               torch::Tensor& perm, torch::Tensor& workspace,
                               int64_t num_bits, int64_t size_m, int64_t size_n,
                               int64_t size_k, bool is_k_full) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
  // Verify num_bits
  TORCH_CHECK(num_bits == 4 || num_bits == 8,
              "num_bits must be 4 or 8. Got = ", num_bits);
  int pack_factor = 32 / num_bits;

  // Verify A
  TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
              ", size_m = ", size_m);
  TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1),
              ", size_k = ", size_k);

  // Verify B
  TORCH_CHECK(size_k % gptq_marlin::tile_size == 0, "size_k = ", size_k,
              " is not divisible by tile_size = ", gptq_marlin::tile_size);
  TORCH_CHECK((size_k / gptq_marlin::tile_size) == b_q_weight.size(0),
              "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
              ", size_k = ", size_k, ", tile_size = ", gptq_marlin::tile_size);
  TORCH_CHECK(b_q_weight.size(1) % gptq_marlin::tile_size == 0,
              "b_q_weight.size(1) = ", b_q_weight.size(1),
              " is not divisible by tile_size = ", gptq_marlin::tile_size);
  int actual_size_n =
      (b_q_weight.size(1) / gptq_marlin::tile_size) * pack_factor;
  TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
              ", actual_size_n = ", actual_size_n);

  // Verify device and strides
  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
  TORCH_CHECK(a.is_contiguous(), "A is not contiguous");

  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");

  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");

  TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
  TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");

  TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
  TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");

  // Alloc buffers
  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
  torch::Tensor c = torch::empty({size_m, size_n}, options);
  torch::Tensor a_tmp = torch::empty({size_m, size_k}, options);

  // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
  // auto -1)
  int thread_k = -1;
  // thread_n: `n` size of a thread_tile in `weights` (can usually be left as
  // auto -1)
  int thread_n = -1;
  // sms: number of SMs to use for the kernel (can usually be left as auto -1)
  int sms = -1;

  // Verify g_idx and perm
  TORCH_CHECK((g_idx.size(0) == 0 && perm.size(0) == 0) ||
                  (g_idx.size(0) == size_k && perm.size(0) == size_k),
              "Unexpected g_idx.size(0) = ", g_idx.size(0),
              " and perm.size(0) = ", perm.size(0),
              ", where size_k = ", size_k);

  // Detect groupsize and act_order
  int num_groups = -1;
  int group_size = -1;
  bool has_act_order = g_idx.size(0) != 0;

  int b_rank = b_scales.sizes().size();
  TORCH_CHECK(b_rank == 2, "b_scales rank = ", b_rank, " is not 2");
  TORCH_CHECK(b_scales.size(1) == size_n, "b_scales dim 1 = ", b_scales.size(1),
              " is not size_n = ", size_n);
  num_groups = b_scales.size(0);

  if (has_act_order) {
    if (is_k_full) {
      TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
      TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k,
                  ", is not divisible by num_groups = ", num_groups);
      group_size = size_k / num_groups;
    } else {
      group_size = 0;
    }

  } else {
    if (num_groups > 1) {
      TORCH_CHECK(
          size_k % num_groups == 0, "size_k = ", size_k,
          ", is not divisible by b_scales.size(0) = ", b_scales.size(0));
      group_size = size_k / num_groups;
    } else {
      group_size = -1;
    }
  }

  // Verify workspace size
  TORCH_CHECK(
      size_n % gptq_marlin::min_thread_n == 0, "size_n = ", size_n,
      ", is not divisible by min_thread_n = ", gptq_marlin::min_thread_n);
  int min_workspace_size =
      (size_n / gptq_marlin::min_thread_n) * gptq_marlin::max_par;
  TORCH_CHECK(workspace.numel() >= min_workspace_size,
              "workspace.numel = ", workspace.numel(),
              " is below min_workspace_size = ", min_workspace_size);

  int dev = a.get_device();
  if (a.scalar_type() == at::ScalarType::Half) {
    gptq_marlin::marlin_mm_f16i4<half>(
        a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
        b_scales.data_ptr<at::Half>(), g_idx.data_ptr(), perm.data_ptr(),
        a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k,
        workspace.data_ptr(), num_bits, has_act_order, is_k_full, num_groups,
        group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
        thread_n, sms, gptq_marlin::max_par);
  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
    gptq_marlin::marlin_mm_f16i4<nv_bfloat16>(
        a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
        c.data_ptr<at::BFloat16>(), b_scales.data_ptr<at::BFloat16>(),
        g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
        size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order,
        is_k_full, num_groups, group_size, dev,
        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
        gptq_marlin::max_par);
  } else {
    TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16");
  }

  return c;
}

#endif


================================================
FILE: kt-sft/csrc/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cuh
================================================
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyrigth 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#pragma once

#include <torch/all.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>

namespace gptq_marlin {

// 8 warps are a good choice since every SM has 4 schedulers and having more
// than 1 warp per schedule allows some more latency hiding. At the same time,
// we want relatively few warps to have many registers per warp and small tiles.
static constexpr int default_threads = 256;

static constexpr int pipe_stages =
    4;  // 4 pipeline stages fit into shared memory

static constexpr int min_thread_n = 64;
static constexpr int min_thread_k = 64;

static constexpr int tile_size = 16;
static constexpr int max_par = 16;

template <typename T, int n>
struct Vec {
  T elems[n];
  __device__ T& operator[](int i) { return elems[i]; }
};

using I4 = Vec<int, 4>;

constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }

#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800) || defined (__HIP_PLATFORM_AMD__)
// No support for async
#else

__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
                                      bool pred = true) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile(
      "{\n"
      "   .reg .pred p;\n"
      "   setp.ne.b32 p, %0, 0;\n"
      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
      "}\n" ::"r"((int)pred),
      "r"(smem), "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile(
      "{\n"
      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
      "}\n" ::"r"(smem),
      "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async_fence() {
  asm volatile("cp.async.commit_group;\n" ::);
}

template <int n>
__device__ inline void cp_async_wait() {
  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}

#endif

}  // namespace gptq_marlin


================================================
FILE: kt-sft/csrc/ktransformers_ext/cuda/gptq_marlin/gptq_marlin_dtypes.cuh
================================================
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyrigth 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#ifndef _data_types_cuh
#define _data_types_cuh
#include "gptq_marlin.cuh"
#include <cuda_fp16.h>
#include <cuda_bf16.h>

#ifdef __HIP_PLATFORM_AMD__
typedef __hip_bfloat16 nv_bfloat16;
typedef __hip_bfloat162 nv_bfloat162;
#endif

namespace gptq_marlin {

template <typename scalar_t>
class ScalarType {};

template <>
class ScalarType<half> {
 public:
  using scalar_t = half;
  using scalar_t2 = half2;

  // Matrix fragments for tensor core instructions; their precise layout is
  // documented here:
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
  using FragA = Vec<half2, 4>;
  using FragB = Vec<half2, 2>;
  using FragC = Vec<float, 4>;
  using FragS = Vec<half2, 1>;

  static __device__ float inline num2float(const half x) {
    return __half2float(x);
  }

  static __device__ half2 inline num2num2(const half x) {
    return __half2half2(x);
  }

  static __device__ half2 inline nums2num2(const half x1, const half x2) {
    return __halves2half2(x1, x2);
  }

  static __host__ __device__ half inline float2num(const float x) {
    return __float2half(x);
  }
};

template <>
class ScalarType<nv_bfloat16> {
 public:
  using scalar_t = nv_bfloat16;
  using scalar_t2 = nv_bfloat162;

  using FragA = Vec<nv_bfloat162, 4>;
  using FragB = Vec<nv_bfloat162, 2>;
  using FragC = Vec<float, 4>;
  using FragS = Vec<nv_bfloat162, 1>;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
  static __device__ float inline num2float(const nv_bfloat16 x) {
    return __bfloat162float(x);
  }

  static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
    return __bfloat162bfloat162(x);
  }

  static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
                                                  const nv_bfloat16 x2) {
    return __halves2bfloat162(x1, x2);
  }

  static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
    return __float2bfloat16(x);
  }
#endif
};

}  // namespace gptq_marlin

#endif


================================================
FILE: kt-sft/csrc/ktransformers_ext/cuda/gptq_marlin/ops.h
================================================
/**
 * @Description  :  
 * @Author       : Azure
 * @Date         : 2024-07-22 09:27:55
 * @Version      : 1.0.0
 * @LastEditors  : Azure 
 * @LastEditTime : 2024-07-26 08:35:00
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
**/
#pragma once

#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               torch::Tensor& b_scales, torch::Tensor& g_idx,
                               torch::Tensor& perm, torch::Tensor& workspace,
                               int64_t num_bits, int64_t size_m, int64_t size_n,
                               int64_t size_k, bool is_k_full);

// torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
//                                  int64_t size_k, int64_t size_n,
//                                  int64_t num_bits);

================================================
FILE: kt-sft/csrc/ktransformers_ext/cuda/setup.py
================================================

from setuptools import setup, Extension
from torch.utils import cpp_extension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
    name='KTransformersOps',
    ext_modules=[
        CUDAExtension(
            'KTransformersOps', [
                'custom_gguf/dequant.cu',
                'binding.cpp',
                'gptq_marlin/gptq_marlin.cu',
                # 'gptq_marlin_repack.cu',
            ],
            extra_compile_args={
                'cxx': ['-O3'],
                'nvcc': [
                    '-O3',
                    '--use_fast_math',
                    '-Xcompiler', '-fPIC',
                ]
            },
        )
    ],
    cmdclass={'build_ext': BuildExtension}
)

================================================
FILE: kt-sft/csrc/ktransformers_ext/cuda/test_dequant.py
================================================
import os
import sys
sys.path.insert(0,"/home/zbx/ktransformers")
from ktransformers.util.custom_loader import GGUFLoader
import torch

gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
gguf_loader_2 = GGUFLoader("/mnt/data/chenht/model/gguf_for_ktransformers/DeepSeek-V3-bf16/")

torch.set_default_dtype(torch.bfloat16)

tensor_1 = gguf_loader_1.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda")
tensor_2 = gguf_loader_2.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda")

print(tensor_1[0, -64:])
print(tensor_2[0, -64:])

================================================
FILE: kt-sft/csrc/ktransformers_ext/examples/test_attention.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :  
Author       : Jianwei Dong
Date         : 2024-08-28 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""
import os, sys
import time

sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
from flash_attn import flash_attn_with_kvcache
import torch

layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
cache_seqlen = 8192
cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")
anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 2
max_batch_size: int = 1
max_block_num: int = 512
CPUInfer = cpuinfer_ext.CPUInfer(max_thread_num)
validation_iter = 100

with torch.inference_mode(mode=True):
    config = cpuinfer_ext.kvcache.KVCacheConfig(
        layer_num,
        kv_head_num,
        q_head_num,
        head_dim,
        block_len,
        anchor_num,
        anchor_type,
        kv_type,
        retrieval_type,
        layer_step,
        token_step,
        layer_offset,
        max_block_num,
        max_batch_size,
        max_thread_num,
    )
    local_kvcache = cpuinfer_ext.kvcache.KVCache(config)

    kvcaches = []
    block_table = (
        torch.arange(max_block_num, dtype=torch.int32, device="cpu")
        .contiguous()
        .view(1, -1)
    )

    for layer_idx in range(layer_num):
        k_cache = torch.randn(
            (1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
        ).contiguous()
        v_cache = torch.randn(
            (1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
        ).contiguous()

        CPUInfer.submit(
            local_kvcache.update_kvcache_fp16(
                k_cache.data_ptr(),
                v_cache.data_ptr(),
                layer_idx,
                block_table.data_ptr(),
                1,
                max_block_num,
                seqlens_zero.data_ptr(),
                cache_seqlen,
            )
        )
        CPUInfer.sync()

        kvcaches.append((k_cache.to("cuda"), v_cache.to("cuda")))

    # validation
    for i in range(validation_iter):

        k_cache = kvcaches[i % layer_num][0]
        v_cache = kvcaches[i % layer_num][1]
        input = torch.randn(
            (1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
        ).contiguous()
        output = torch.empty(
            (1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
        ).contiguous()

        # attn_lse: (bsz, q_len, q_head_num)
        attn_lse = torch.empty(
            (1, 1, q_head_num), dtype=torch.float32, device="cpu"
        ).contiguous()
        input = input / 100

        CPUInfer.submit(
            local_kvcache.attn(
                input.data_ptr(),
                output.data_ptr(),
                attn_lse.data_ptr(),
                i % layer_num,
                0,
                1,
                1,
                max_block_num,
                block_table.data_ptr(),
                cache_seqlens.data_ptr(),
                -1,
                -1,
                -1,
            )
        )
        CPUInfer.sync()
        # print("cpuinfer output", output)

        t_output = flash_attn_with_kvcache(
            q=input.to("cuda"),
            k_cache=k_cache,
            v_cache=v_cache,
            cache_seqlens=cache_seqlens.to("cuda"),
        )
        # print("torch output", t_output)

        diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(
            torch.abs(t_output)
        )
        print("diff = ", diff)
        assert diff < 0.001


================================================
FILE: kt-sft/csrc/ktransformers_ext/examples/test_linear.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:36:59
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

input_size = 16384
output_size = 5120
stride = 32
group_max_len = 1024
proj_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
qlen = 30
layer_num = 10
CPUInfer = cpuinfer_ext.CPUInfer(48)
validation_iter = 100

with torch.inference_mode(mode=True):
    linears = []
    projs = []
    for _ in range(layer_num):
        proj = torch.randn((output_size, input_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
        linear = cpuinfer_ext.linear.Linear(config)
        projs.append(proj)
        linears.append(linear)

    # validation
    for i in range(validation_iter):
        linear = linears[i % layer_num]
        input = torch.randn((qlen, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((qlen, output_size), dtype=torch.float16).contiguous()
        input = input / 100

        CPUInfer.submit(
            linear.forward(
                qlen,
                input.data_ptr(),
                output.data_ptr()
            )
        )
        CPUInfer.sync()
        # print('cpuinfer output', output)

        proj = projs[i%layer_num]
        t_output = torch.mm(input, proj.t())
        # print('torch output', t_output)

        diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
        print('diff = ', diff)
        assert(diff < 0.001)


================================================
FILE: kt-sft/csrc/ktransformers_ext/examples/test_mlp.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:37:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

hidden_size = 5120
intermediate_size = 3072
stride = 32
group_max_len = 1024
gate_type = 1 # ggml_type::GGML_TYPE_F16
up_type = 1 # ggml_type::GGML_TYPE_F16
down_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
qlen = 30
layer_num = 10
CPUInfer = cpuinfer_ext.CPUInfer(48)
validation_iter = 100

def act_fn(x):
    return x / (1.0 + torch.exp(-x))

def mlp_torch(input, gate_proj, up_proj, down_proj):
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret

with torch.inference_mode(mode=True):
    mlps = []
    gate_projs = []
    up_projs = []
    down_projs = []
    for _ in range(layer_num):
        gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
        mlp = cpuinfer_ext.mlp.MLP(config)
        gate_projs.append(gate_proj)
        up_projs.append(up_proj)
        down_projs.append(down_proj)
        mlps.append(mlp)

    # validation
    for i in range(validation_iter):
        mlp = mlps[i % layer_num]
        input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
        output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
        input = input / 100

        CPUInfer.submit(
            mlp.forward(
                qlen,
                input.data_ptr(), 
                output.data_ptr()
            )
        )
        CPUInfer.sync()
        # print('cpuinfer output', output)

        gate_proj = gate_projs[i%layer_num]
        up_proj = up_projs[i%layer_num]
        down_proj = down_projs[i%layer_num]
        t_output = mlp_torch(input, gate_proj, up_proj, down_proj)
        # print('torch output', t_output)

        diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
        print('diff = ', diff)
        assert(diff < 0.001)


================================================
FILE: kt-sft/csrc/ktransformers_ext/examples/test_moe.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:38:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

expert_num = 160
hidden_size = 5120
intermediate_size = 1536
stride = 32
group_min_len = 10
group_max_len = 1024
gate_type = 1 # ggml_type::GGML_TYPE_F16
up_type = 1 # ggml_type::GGML_TYPE_F16
down_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
n_routed_experts = 6
qlen = 30
layer_num = 10
CPUInfer = cpuinfer_ext.CPUInfer(48)
validation_iter = 100

def act_fn(x):
    return x / (1.0 + torch.exp(-x))

def mlp_torch(input, gate_proj, up_proj, down_proj):
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    intermediate = act_fn(gate_buf) * up_buf
    ret = torch.mm(intermediate, down_proj.t())
    return ret

def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0
    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )
    return t_output

with torch.inference_mode(mode=True):
    moes = []
    gate_projs = []
    up_projs = []
    down_projs = []
    for _ in range(layer_num):
        gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        config = cpuinfer_ext.moe.MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, stride, group_min_len, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
        moe = cpuinfer_ext.moe.MOE(config)
        gate_projs.append(gate_proj)
        up_projs.append(up_proj)
        down_projs.append(down_proj)
        moes.append(moe)

    # validation
    for i in range(validation_iter):
        expert_ids = torch.stack([torch.randperm(expert_num)[:n_routed_experts] for _ in range(qlen)]).contiguous()
        weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
        input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
        output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
        input = input / 100
        
        moe = moes[i % layer_num]
        CPUInfer.submit(
            moe.forward( 
                qlen,
                n_routed_experts, 
                expert_ids.data_ptr(), 
                weights.data_ptr(), 
                input.data_ptr(), 
                output.data_ptr()
            )
        )
        CPUInfer.sync()
        # print('cpuinfer output', output)

        gate_proj = gate_projs[i%layer_num]
        up_proj = up_projs[i%layer_num]
        down_proj = down_projs[i%layer_num]
        t_output = moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj)
        # print('torch output', t_output)

        diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
        print('diff = ', diff)
        assert(diff < 0.001)


================================================
FILE: kt-sft/csrc/ktransformers_ext/examples/test_sft_amx_moe.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:38:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch
from pathlib import Path
import numpy as np

expert_num = 10
hidden_size = 5120
intermediate_size = 1536
max_len = 1024

n_routed_experts = 2
qlen = 600
layer_num = 10
num_threads = 112
validation_iter = 1
LAYER_IDX  = 0
DUMP_DIR   = Path(os.getenv("SFT_DEBUG_PATH", "debug"))

dtype = torch.bfloat16
gradtype = torch.bfloat16
# torch.backends.cuda.matmul.allow_tf32 = False

import shutil
folder_path = "/home/lpl/kt-sft/debug"
if os.path.exists(folder_path):
    shutil.rmtree(folder_path)
os.makedirs(folder_path)

def act_fn(x):
    return x / (1.0 + torch.exp(-x))

def silu_fwd(x: torch.Tensor) -> torch.Tensor:
    return x / (1. + torch.exp(-x))

def silu_grad(x: torch.Tensor) -> torch.Tensor:
    """SiLU激活函数的梯度"""
    sigmoid_x = torch.sigmoid(x)
    return sigmoid_x * (1. + x * (1. - sigmoid_x))

class SiLU(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inp):
        ctx.save_for_backward(inp)
        return silu_fwd(inp)

    @staticmethod
    def backward(ctx, grad_out):
        (inp,) = ctx.saved_tensors
        sig = torch.sigmoid(inp)
        return grad_out * (sig + inp * sig * (1. - sig))

silu = SiLU.apply   # 可求导版本

# -------------------- Torch MLP / MoE 参考实现 --------------------
def mlp_torch(x, gate, up, down, req_grad=False):
    g = torch.mm(x, gate.t())
    u = torch.mm(x, up.t())
    if req_grad:
        inter = silu_fwd(g) * u
    else:
        inter = silu_fwd(g) * u
    return torch.mm(inter, down.t())

def moe_torch(x, eid, w, gate, up, down, req_grad=False):
    """eid: [T,k]  int64,  w: [T,k] float"""
    T, k = eid.shape
    tok_cnt = torch.zeros(expert_num, dtype=torch.int64)
    for e in eid.view(-1):
        tok_cnt[e] += 1
    # 打包 token
    order = eid.view(-1).argsort()
    packed = x[order // k]

    outputs, start = [], 0
    for e in range(expert_num):
        num = tok_cnt[e].item()
        if not num:
            continue
        end = start + num
        o = mlp_torch(packed[start:end], gate[e], up[e], down[e], req_grad)
        outputs.append(o)
        start = end
    if outputs:
        out_all = torch.cat(outputs, 0)
    else:
        out_all = packed.new_empty(0, hidden_size)

    # 还原顺序并做加权
    out_restore = torch.empty_like(out_all)
    out_restore[order] = out_all
    out_restore = out_restore.view(T, k, hidden_size)
    out = (out_restore * w.unsqueeze(-1)).sum(1)
    return out

def moe_backward_python(x, eid, w, gate, up, down, grad_output, gate_u_cache, up_v_cache):
    """
    Python模拟C++的MoE backward计算 - 完全仿照sft_moe.hpp的实现
    参数:
        x: 输入 [T, hidden_size]
        eid: expert_ids [T, k]
        w: weights [T, k]
        gate, up, down: 权重矩阵
        grad_output: 输出梯度 [T, hidden_size]
        gate_u_cache, up_v_cache: forward时缓存的中间结果
    返回:
        grad_input: 输入梯度 [T, hidden_size]
    """
    T, k = eid.shape
    expert_num = gate.shape[0]
    hidden_size = gate.shape[2]
    intermediate_size = gate.shape[1]
    
    print("\n========== Python Backward详细对拍 ==========")
    print(f"输入形状: T={T}, k={k}, hidden_size={hidden_size}, intermediate_size={intermediate_size}")
    print(f"\n--- Python Token 0 ---")
    print(f"  Expert 0: weight={w[0, 0].item():.6f}")
    
    # 初始化梯度
    grad_input = torch.zeros_like(x, dtype=torch.float32)
    
    # print(f"grad_output:{grad_output}")
    # print(f"gate_u_cache:{gate_u_cache}")
    # print(f"up_v_cache:{up_v_cache}")
    
    # print(f"grad_output:{grad_output}")
    # print(f"gate_u_cache:{gate_u_cache}")
    # print(f"up_v_cache:{up_v_cache}")
    
    # 按C++的方式组织数据：按expert分组
    # 1. 统计每个expert处理的token数量
    expert_token_counts = torch.zeros(expert_num, dtype=torch.int64)
    for i in range(T):
        for j in range(k):
            expert_token_counts[eid[i, j]] += 1
    
    # 2. 构建expert到token的映射
    expert_token_indices = [[] for _ in range(expert_num)]
    expert_token_positions = [[] for _ in range(expert_num)]
    
    for i in range(T):
        for j in range(k):
            expert_id = int(eid[i, j].item())
            expert_token_indices[expert_id].append(i)
            expert_token_positions[expert_id].append(j)
    
    # 3. 为每个expert分配本地存储空间
    max_tokens_per_expert = int(expert_token_counts.max().item()) if expert_token_counts.max() > 0 else 0
    
    # 本地存储空间（模拟C++中的m_local_*_ptr_）
    local_input = torch.zeros(expert_num, max_tokens_per_expert, hidden_size, dtype=torch.float32)
    local_gate_output = torch.zeros(expert_num, max_tokens_per_expert, intermediate_size, dtype=torch.float32)
    local_up_output = torch.zeros(expert_num, max_tokens_per_expert, intermediate_size, dtype=torch.float32)
    local_down_output_grad = torch.zeros(expert_num, max_tokens_per_expert, hidden_size, dtype=torch.float32)
    local_down_input_grad = torch.zeros(expert_num, max_tokens_per_expert, intermediate_size, dtype=torch.float32)
    local_gate_output_grad = torch.zeros(expert_num, max_tokens_per_expert, intermediate_size, dtype=torch.float32)
    local_up_output_grad = torch.zeros(expert_num, max_tokens_per_expert, intermediate_size, dtype=torch.float32)
    local_gate_input_grad = torch.zeros(expert_num, max_tokens_per_expert, hidden_size, dtype=torch.float32)
    local_up_input_grad = torch.zeros(expert_num, max_tokens_per_expert, hidden_size, dtype=torch.float32)
    
    # 4. 复制输入数据和梯度到本地存储
    for expert_id in range(expert_num):
        for local_idx, (token_idx, expert_pos) in enumerate(zip(expert_token_indices[expert_id], expert_token_positions[expert_id])):
            local_input[expert_id, local_idx] = x[token_idx].to(torch.float32)
            local_down_output_grad[expert_id, local_idx] = grad_output[token_idx].to(torch.float32)
    
    # 5. 重新计算forward的中间结果（模拟C++中的forward计算）
    for expert_id in range(expert_num):
        num_tokens = expert_token_counts[expert_id]
        if num_tokens == 0:
            continue
            
        # 计算gate和up的输出
        local_input_expert = local_input[expert_id, :num_tokens]  # [num_tokens, hidden_size]
        gate_output = torch.mm(local_input_expert, gate[expert_id].to(torch.float32).t())  # [num_tokens, intermediate_size]
        up_output = torch.mm(local_input_expert, up[expert_id].to(torch.float32).t())      # [num_tokens, intermediate_size]
        
        # 应用激活函数
        gate_output_activated = silu_fwd(gate_output) * up_output
        
        local_gate_output[expert_id, :num_tokens] = gate_output
        local_up_output[expert_id, :num_tokens] = up_output
        
    for expert_id in range(expert_num):
        num_tokens = expert_token_counts[expert_id]
        if num_tokens == 0:
            continue
        # print(f"local_down_output_grad_E_{expert_id}: {local_down_output_grad[expert_id, :num_tokens]}")
        # print(f"shape:{local_down_output_grad[expert_id, :num_tokens].shape}")
        # torch.save(local_down_output_grad[expert_id, :num_tokens], f"debug/py_layer0_E_End{expert_id}_down_output_grad_.pt")
        # torch.save(local_gate_output[expert_id, :num_tokens], f"debug/py_layer0_E_End{expert_id}_gate_output_.pt")
        # torch.save(local_up_output[expert_id, :num_tokens], f"debug/py_layer0_E_End{expert_id}_up_output_.pt")
    
    # 6. 计算down_input_grad（模拟C++中的down_t_bc_计算）
    for expert_id in range(expert_num):
        num_tokens = expert_token_counts[expert_id]
        if num_tokens == 0:
            continue
        # down_input_grad = down_proj_t @ output_grad
        down_input_grad = torch.mm(local_down_output_grad[expert_id, :num_tokens], down[expert_id].to(torch.float32))  # [num_tokens, intermediate_size]
        local_down_input_grad[expert_id, :num_tokens] = down_input_grad
            
    for expert_id in range(expert_num):
        num_tokens = expert_token_counts[expert_id]
        if num_tokens == 0:
            continue
        # torch.save(local_gate_output_grad[expert_id, :num_tokens], f"debug/py_layer0_E_End{expert_id}_gate_output_grad_.pt")
        # torch.save(local_up_output_grad[expert_id, :num_tokens], f"debug/py_layer0_E_End{expert_id}_up_output_grad_.pt")
        torch.save(local_down_output_grad[expert_id, :num_tokens], f"debug/py_layer0_E_End{expert_id}_down_output_grad_.pt")
        # torch.save(down[expert_id].to(torch.float32), f"debug/py_layer0_E_End{expert_id}_down_weight_.pt")
        torch.save(local_gate_output[expert_id, :num_tokens], f"debug/py_layer0_E_End{expert_id}_gate_output_.pt")
    
    # 7. 计算gate_output_grad和up_output_grad（模拟C++中的核心计算）
    for expert_id in range(expert_num):
        num_tokens = expert_token_counts[expert_id]
        if num_tokens == 0:
            continue
            
        for local_idx in range(num_tokens):
            token_idx = expert_token_indices[expert_id][local_idx]
            expert_pos = expert_token_positions[expert_id][local_idx]
            weight = w[token_idx, expert_pos].item()
            
            # 只为第一个token的第一个expert输出调试信息
            should_print = (token_idx == 0 and expert_pos == 0)
            
            # 获取当前token的中间结果
            gate_u = local_gate_output[expert_id, local_idx]  # [intermediate_size]
            up_v = local_up_output[expert_id, local_idx]      # [intermediate_size]
            down_input_grad_token = local_down_input_grad[expert_id, local_idx]  # [intermediate_size]
            
            # 应用weight
            down_input_grad_token = down_input_grad_token * weight
            
            if should_print:
                print(f"    down_input_grad前5个值: {down_input_grad_token[:5].tolist()}")
            
            # gate_output_grad = down_input_grad * up_v * silu_grad(gate_u)
            gate_output_grad = down_input_grad_token * up_v * silu_grad(gate_u)
            
            # up_output_grad = down_input_grad * silu_fwd(gate_u)
            up_output_grad = down_input_grad_token * silu_fwd(gate_u)
            
            if should_print:
                print(f"    gate_output_grad前5个值: {gate_output_grad[:5].tolist()}")
                print(f"    up_output_grad前5个值: {up_output_grad[:5].tolist()}")
            
            local_gate_output_grad[expert_id, local_idx] = gate_output_grad
            local_up_output_grad[expert_id, local_idx] = up_output_grad
    
    # 8. 计算gate_input_grad和up_input_grad（模拟C++中的矩阵乘法）
    for expert_id in range(expert_num):
        num_tokens = expert_token_counts[expert_id]
        if num_tokens == 0:
            continue
            
        # gate_input_grad = gate_proj_t @ gate_output_grad
        gate_input_grad = torch.mm(local_gate_output_grad[expert_id, :num_tokens], 
                                  gate[expert_id].to(torch.float32))  # [num_tokens, hidden_size]
        
        # up_input_grad = up_proj_t @ up_output_grad
        up_input_grad = torch.mm(local_up_output_grad[expert_id, :num_tokens], 
                                up[expert_id].to(torch.float32))  # [num_tokens, hidden_size]
        
        local_gate_input_grad[expert_id, :num_tokens] = gate_input_grad
        local_up_input_grad[expert_id, :num_tokens] = up_input_grad
        
        # 输出第一个token的调试信息
        if expert_id == 0 and num_tokens > 0:
            token_idx = expert_token_indices[expert_id][0]
            expert_pos = expert_token_positions[expert_id][0]
            if token_idx == 0 and expert_pos == 0:
                print(f"    gate_input_grad前5个值: {gate_input_grad[0, :5].tolist()}")
                print(f"    up_input_grad前5个值: {up_input_grad[0, :5].tolist()}")
    
    # 9. 累加所有expert的梯度到最终输出（模拟C++中的最终累加）
    for token_idx in range(T):
        token_grad = torch.zeros(hidden_size, dtype=torch.float32)
        
        for expert_pos in range(k):
            expert_id = int(eid[token_idx, expert_pos].item())
            
            # 找到这个token在expert_id中的本地索引
            local_idx = expert_token_indices[expert_id].index(token_idx)
            
            # 累加gate和up的输入梯度
            token_grad += local_gate_input_grad[expert_id, local_idx]
            token_grad += local_up_input_grad[expert_id, local_idx]
        
        grad_input[token_idx] = token_grad
        
        # 输出第一个token的最终结果
        if token_idx == 0:
            print(f"  Token 0 最终input_grad前5个值: {token_grad[:5].tolist()}")
            
    # print(f"gate_input_grad:{gate_input_grad}")
    # print(f"up_input_grad:{up_input_grad}")
    # print(f"grad_input:{grad_input}")
    
    return grad_input

# --------------------------- 主测试 ---------------------------
def test_amx_moe_two_round():
    # ------------ 构造权重 ------------
    gate_proj = torch.randn(expert_num, intermediate_size, hidden_size,
                            dtype=torch.bfloat16, requires_grad=True).contiguous()
    up_proj   = torch.randn_like(gate_proj)
    down_proj = torch.randn(expert_num, hidden_size, intermediate_size,
                            dtype=torch.bfloat16, requires_grad=True).contiguous()
    
    # gate_proj_t = gate_proj.transpose(1, 2).contiguous() # 形状: (E, H, I)
    # up_proj_t   = up_proj.transpose(1, 2).contiguous()
    # down_proj_t   = down_proj.transpose(1, 2).contiguous()

    # ------------ SFT-AMX 对象 ------------
    cfg = cpuinfer_ext.sft_moe.SFT_AMX_MOEConfig(
        expert_num, n_routed_experts,
        hidden_size, intermediate_size,
        max_len,
        gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr()
    )    
    moe_cpp = cpuinfer_ext.sft_moe.SFT_AMXInt8_MOE(cfg)

    
    cpu_infer = cpuinfer_ext.CPUInfer(num_threads)
    
    cpu_infer.submit(moe_cpp.load_weights())
    cpu_infer.sync() # ATTENTION: DO NOT FORGET sync after load weights
    
    expert_ids = torch.stack(
        [torch.randperm(expert_num)[:n_routed_experts] for _ in range(qlen)]).contiguous()

    weights = torch.rand(qlen, n_routed_experts, dtype=torch.float32).contiguous()

    input_pt  = (torch.randn((qlen, hidden_size), dtype=dtype) / 100)\
                .detach().requires_grad_(True).contiguous()
    input_cpp = input_pt.detach().clone().requires_grad_(True).contiguous()

    # ------------- forward -------------
    # Torch reference
    out_ref = moe_torch(input_pt, expert_ids, weights,
                        gate_proj, up_proj, down_proj, True)
    out_ref.retain_grad()

    # 缓存forward中间结果用于python backward
    gate_u_cache = []
    up_v_cache = []
    
    # 模拟forward过程并缓存中间结果
    for token_idx in range(qlen):
        token_gate_u = []
        token_up_v = []
        for expert_pos in range(n_routed_experts):
            expert_id = int(expert_ids[token_idx, expert_pos].item())
            # 计算gate和up的输出
            gate_u = torch.mm(input_pt[token_idx:token_idx+1].to(torch.float32), gate_proj[expert_id].to(torch.float32).t()).squeeze()
            up_v = torch.mm(input_pt[token_idx:token_idx+1].to(torch.float32), up_proj[expert_id].to(torch.float32).t()).squeeze()
            token_gate_u.append(gate_u)
            token_up_v.append(up_v)
        gate_u_cache.append(token_gate_u)
        up_v_cache.append(token_up_v)
        
    flop_fwd = 6 * qlen * n_routed_experts * hidden_size * intermediate_size
    flop_bwd = 18 * qlen * n_routed_experts * hidden_size * intermediate_size

    # C++ AMX forward
    out_cpp = torch.empty_like(out_ref, dtype=dtype).contiguous()
    t0 = time.time()
    cpu_infer.submit(moe_cpp.forward(
        qlen, n_routed_experts,
        expert_ids.data_ptr(), weights.data_ptr(),
        input_cpp.data_ptr(), out_cpp.data_ptr()))
    cpu_infer.sync()
    t1 = time.time()
    diff_fwd = (out_cpp.to(torch.float32) - out_ref.to(torch.float32)).abs()
    print(f"out_cpp.to(torch.float32):{out_cpp.to(torch.float32)}, out_ref.to(torch.float32):{out_ref.to(torch.float32)}")
    rel_fwd  = diff_fwd.mean() / out_ref.abs().mean()
    print(f"Forward   diff: {rel_fwd.item():.3e} | time {t1-t0:.4f}s | "
            f"TFLOPS {flop_fwd/(t1-t0)/1e12:.2f}")
    

    # ------------- backward -------------
    grad_out = torch.randn_like(out_ref, dtype=gradtype).contiguous()
    grad_out_cpp = grad_out.clone().contiguous()
    grad_in_cpp  = torch.zeros_like(input_cpp, dtype=gradtype).contiguous()

    # # Torch backward
    for p in (gate_proj, up_proj, down_proj, input_pt):
        if p.grad is not None:
            p.grad.zero_()
    t2 = time.time()
    out_ref.backward(grad_out, retain_graph=True)
    t3 = time.time()
    print(f"PyTorch backward time {t3-t2:.4f}s | "
            f"TFLOPS {flop_bwd/(t3-t2)/1e12:.2f}")

    # Python backward（模拟C++逻辑）- 详细版本
    t4_py = time.time()
    grad_in_python = moe_backward_python(
        input_pt, expert_ids, weights,
        gate_proj, up_proj, down_proj,
        grad_out.to(torch.float32), gate_u_cache, up_v_cache)
    t5_py = time.time()
    print(f"Python   backward time {t5_py-t4_py:.4f}s | "
            f"TFLOPS {flop_bwd/(t5_py-t4_py)/1e12:.2f}")

    # C++ backward
    t4 = time.time()
    print("Before backward")
    cpu_infer.submit(moe_cpp.backward(
        qlen, n_routed_experts,
        expert_ids.data_ptr(), weights.data_ptr(), input_cpp.data_ptr(),
        grad_out_cpp.data_ptr(),
        grad_in_cpp.data_ptr()))
    cpu_infer.sync()
    t5 = time.time()
    print("After backward")
    print(f"C++      backward time {t5-t4:.4f}s | "
            f"TFLOPS {flop_bwd/(t5-t4)/1e12:.2f}")

    # 三种backward结果对比
    gcpp = grad_in_cpp.to(torch.float32)
    gref = input_pt.grad.to(torch.float32) if input_pt.grad is not None else torch.zeros_like(input_pt, dtype=torch.float32)
    gpy = grad_in_python.to(torch.float32)
    
    print(f"C++ AMX backward:{gcpp}", '\n', '\n', f"python backward:{gpy}")
    
    # 对比结果
    rel_bwd_cpp = (gcpp - gref).abs().mean() / gref.abs().mean()
    rel_bwd_py = (gpy - gref).abs().mean() / gref.abs().mean()
    rel_bwd_cpp_py = (gcpp - gpy).abs().mean() / gpy.abs().mean()
    
    print(f"Torch vs C++:    {rel_bwd_cpp.item():.3e}")
    print(f"Torch vs Python: {rel_bwd_py.item():.3e}")
    print(f"C++ vs Python:   {rel_bwd_cpp_py.item():.3e}")
    
    # 检查是否对拍成功
    if rel_bwd_cpp_py.item() < 5e-2:
        print("✅ C++和Python backward对拍成功!")
    else:
        print("❌ C++和Python backward对拍失败，存在显著差异")
        
    
    # manual_check(expert_ids)

def load_bf16(stub, shape):
    with open(stub + ".bf16", "rb") as f:
        return torch.frombuffer(f.read(), dtype=torch.bfloat16).view(shape).float()
def load_f16(stub, shape):
    with open(stub+".f16",'rb') as f:
        return torch.frombuffer(f.read(), dtype=torch.float16).view(shape).float()
def load_f32(stub, shape):
    with open(stub+".f32",'rb') as f:
        return torch.frombuffer(f.read(), dtype=torch.float32).view(shape)
def load_uint8(stub, shape):
    with open(stub+".uint8",'rb') as f:
        return torch.frombuffer(f.read(), dtype=torch.uint8).view(shape)
def load_int8(stub, shape):
    with open(stub+".int8",'rb') as f:
        return torch.frombuffer(f.read(), dtype=torch.int8).view(shape)

# 通用加载函数
def load_dump_tensor(experts_idx: int, name: str, shape: tuple, Ename: str = "E_Before"):
    """
    根据 experts_idx / name / shape 读取 dump 文件，并返回 torch.Tensor
    """
    stub = DUMP_DIR / f"layer{LAYER_IDX}_{Ename}{experts_idx}_{name}"
    if stub.with_suffix(".bf16").exists():
        return load_bf16(str(stub), shape)
    elif stub.with_suffix(".f16").exists():
        return load_f16(str(stub), shape)
    elif stub.with_suffix(".f32").exists():
        return load_f32(str(stub), shape)
    elif stub.with_suffix(".uint8").exists():
        return load_uint8(str(stub), shape)
    elif stub.with_suffix(".int8").exists():
        return load_int8(str(stub), shape)
    else:
        raise FileNotFoundError(f"{stub}（bf16/f16/f32/u8/i8 均不存在）")
    
def load_bin(path, n, k):
    # 从文件读出 n*k 个 float32
    data = np.fromfile(path, dtype=np.float32)
    assert data.size == n * k
    data = data.reshape(n, k)
    return torch.from_numpy(data).to(torch.bfloat16)    

def check_nan(name, shape):
    stub1 = DUMP_DIR / f"{name}"
    if stub1.with_suffix(".bf16").exists():
        cpp_bef = load_bf16(str(stub1), shape)
    elif stub1.with_suffix(".f16").exists():
        cpp_bef = load_f16(str(stub1), shape)
    elif stub1.with_suffix(".f32").exists():
        cpp_bef = load_f32(str(stub1), shape)
    elif stub1.with_suffix(".int8").exists():
        return load_int8(str(stub1), shape)
    else:
        print("dump 缺失/未知类型"); return

    print(f"{name}:{cpp_bef}")
    print(f" shape : {cpp_bef.shape}")
    print(f" dtype : {cpp_bef.dtype}")

    finite_mask = torch.isfinite(cpp_bef)
    if finite_mask.any():
        t_finite = cpp_bef[finite_mask]
        t_max = t_finite.max().item()
        t_min = t_finite.min().item()
        print(f" max   : {t_max:.6e}")
        print(f" min   : {t_min:.6e}")
    else:
        print(" max/min: 所有元素均为 NaN / Inf")

    for nan_name, t in [(f"{name}", cpp_bef)]:
        nan_cnt = torch.isnan(t).sum().item()
        inf_cnt = torch.isinf(t).sum().item()
        if nan_cnt or inf_cnt:
            print(f"{name} 含 NaN={nan_cnt}、Inf={inf_cnt}")
        else:
            print("NO NaN or Inf exist")    

def get_tensor(name, shape) -> torch.Tensor:
    stub1 = DUMP_DIR / f"{name}"
    if stub1.with_suffix(".bf16").exists():
        cpp_bef = load_bf16(str(stub1), shape)
    elif stub1.with_suffix(".f16").exists():
        cpp_bef = load_f16(str(stub1), shape)
    elif stub1.with_suffix(".f32").exists():
        cpp_bef = load_f32(str(stub1), shape)
    elif stub1.with_suffix(".int8").exists():
        return load_int8(str(stub1), shape)
    else:
        print("dump 缺失/未知类型"); return

    return cpp_bef

def check_py_cpp(name1, name2, shape):
    print(f"compare {name1} with {name2}, at shape{shape}")
    stub1 = DUMP_DIR / f"{name1}"
    py_bef = torch.load(f"{stub1}")
    if not isinstance(py_bef, torch.Tensor):
        print(f"⚠️ {name1} 不是 Tensor，而是 {type(py_bef)}")
        return
    stub2 = DUMP_DIR / f"{name2}"
    if stub2.with_suffix(".bf16").exists():
        cpp_bef = load_bf16(str(stub2), shape)
    elif stub2.with_suffix(".f16").exists():
        cpp_bef = load_f16(str(stub2), shape)
    elif stub2.with_suffix(".f32").exists():
        cpp_bef = load_f32(str(stub2), shape)
    elif stub2.with_suffix(".int8").exists():
        return load_int8(str(stub2), shape)
    else:
        print(f"dump 缺失/未知类型: {stub2}"); return
        
    for t in [py_bef]:
        nan_cnt = torch.isnan(t).sum().item()
        inf_cnt = torch.isinf(t).sum().item()
        if nan_cnt or inf_cnt:
            print(f"{name1} 含 NaN={nan_cnt}、Inf={inf_cnt}")
        else:
            print("NO NaN or Inf exist")
    for t in [cpp_bef]:
        nan_cnt = torch.isnan(t).sum().item()
        inf_cnt = torch.isinf(t).sum().item()
        if nan_cnt or inf_cnt:
            print(f"{name2} 含 NaN={nan_cnt}、Inf={inf_cnt}")
        else:
            print("NO NaN or Inf exist")
            
    if py_bef.shape != cpp_bef.shape:
        print(f"shape 不一致: py_bef {py_bef.shape}, cpp_bef {cpp_bef.shape}")
    else:
        # 计算绝对差、相对差、最大差值
        eps = 1e-6  # 防止除以 0
        denominator = torch.abs(py_bef) + eps
        rel_diff = torch.abs(py_bef - cpp_bef) / denominator

        # 找出大于 2% 的项
        mask = rel_diff > 0.02
        num_large_diff = mask.sum().item()
        total = rel_diff.numel()

        if num_large_diff == 0:
            print("✅ 所有元素相对误差都在 2% 范围内")
            flat_rel_diff = rel_diff.view(-1)
            max_idx = torch.argmax(flat_rel_diff)
            max_val = flat_rel_diff[max_idx].item()

            # 还原成多维索引
            max_pos = tuple(torch.unravel_index(max_idx, py_bef.shape))

            # 获取原始值
            py_val = py_bef[max_pos].item()
            cpp_val = cpp_bef[max_pos].item()

            print(f"    最大相对误差 = {max_val:.2%}")
            print(f"    最大相对误差位置: {max_pos}, py  = {py_val:.6f}, cpp = {cpp_val:.6f}")
        else:
            print(f"❗ 相对误差 > 2% 的元素数量: {num_large_diff} / {total}")
            print(f"{name1}: {py_bef}")
            print(f"{name2}: {cpp_bef}")

# 汇总要查哪些内容
def manual_check(experts_ids):
    expert_token_counts = torch.zeros(expert_num, dtype=torch.int64)
    T, k = experts_ids.shape
    for i in range(T):
        for j in range(k):
            expert_token_counts[experts_ids[i, j]] += 1
    for experts_idx in range(expert_num):
        # input1 = get_tensor(f"cpp_layer0_E_End{experts_idx}_down_t_ba_", (expert_token_counts[experts_idx], hidden_size))
        # # down_ba_new = get_tensor(f"cpp_layer0_E_End{experts_idx}_down_ba_new_", (expert_token_counts[experts_idx], intermediate_size))
        # weight1 = get_tensor(f"cpp_layer0_E_End{experts_idx}_down_t_bb_", (hidden_size, intermediate_size))
        # output1 = torch.matmul(input1, weight1)
        # print(f"input1:{input1}, shape:{input1.shape}")
        # # print(f"down_ba_new:{down_ba_new}, shape:{down_ba_new.shape}")
        # print(f"weight1:{weight1}, shape:{weight1.shape}")
        # print(f"output1:{output1}, shape:{output1.shape}")

        # shape=(expert_token_counts[experts_idx], intermediate_size)
        # stub_bc = DUMP_DIR / f"cpp_layer0_E_End{experts_idx}_down_t_bc_"
        # if stub_bc.with_suffix(".bf16").exists():
        #     output1_5 = load_bf16(str(stub_bc), shape)
        # elif stub_bc.with_suffix(".f16").exists():
        #     output1_5 = load_f16(str(stub_bc), shape)
        # elif stub_bc.with_suffix(".f32").exists():
        #     output1_5 = load_f32(str(stub_bc), shape)
        # elif stub_bc.with_suffix(".int8").exists():
        #     return load_int8(str(stub_bc), shape)
        # else:
        #     print(f"dump 缺失/未知类型: {stub_bc}"); return
        # print(f"output1_5:{output1_5}, shape:{output1_5.shape}")
        
        # torch.set_printoptions(profile="full")
        
        down_ba_ori = get_tensor(f"cpp_layer0_E_End{experts_idx}_down_ba_ori_", (expert_token_counts[experts_idx], intermediate_size))

        # with open(f"/home/lpl/kt-sft/debug/cpp_{experts_idx}_down_ba_ori_view.txt", "w") as f:
        #     f.write(str(down_ba_ori))   
        
    
        down_output_grad = get_tensor(f"cpp_layer0_E_End{experts_idx}_down_output_grad_", (expert_token_counts[experts_idx], hidden_size))

        # with open(f"/home/lpl/kt-sft/debug/cpp_{experts_idx}_down_t_ba_ori_view.txt", "w") as f:
        #     f.write(str(down_output_grad))
            
        
        # input2 = torch.load(f"debug/py_layer0_E_End{experts_idx}_down_output_grad_.pt")
        # weight2 = torch.load(f"debug/py_layer0_E_End{experts_idx}_down_weight_.pt")
        # output2 = torch.load(f"debug/py_layer0_E_End{experts_idx}_down_input_grad_.pt")
        # print(f"input2:{input2}, shape:{input2.shape}")
        # print(f"weight2:{weight2}, shape:{weight2.shape}")
        # print(f"output2:{output2}, shape:{output2.shape}")
        
        # down_t_ba_new = load_bin(f'debug/{experts_idx}_down_ba_t_debug3.bin', expert_token_counts[experts_idx], hidden_size)
        
        # print(f"input3: {down_t_ba_new}, shape: {down_t_ba_new.shape}")
        
        py_down_t_ba = torch.load(f"debug/py_layer0_E_End{experts_idx}_down_output_grad_.pt")
        py_down_ba = torch.load(f"debug/py_layer0_E_End{experts_idx}_gate_output_.pt")

        # with open(f"/home/lpl/kt-sft/debug/py_{experts_idx}_down_t_ba_ori_view.txt", "w") as f:
        #     f.write(str(py_down_t_ba))
        
        # with open(f"/home/lpl/kt-sft/debug/py_{experts_idx}_down_ba_ori_view.txt", "w") as f:
        #     f.write(str(py_down_ba))
            
        print(f"cpp_{experts_idx}_down_ba_ori_:{down_ba_ori}") 
        print(f"py_{experts_idx}_down_ba_ori_view: {py_down_ba}")
        print(f"cpp_{experts_idx}_down_t_ba_ori_view:{down_output_grad}") 
        print(f"py_{experts_idx}_down_t_ba_ori_view: {py_down_t_ba}")

        # torch.set_printoptions(profile="default")
        
        
if __name__ == "__main__":
    torch.manual_seed(42)
    test_amx_moe_two_round()

================================================
FILE: kt-sft/csrc/ktransformers_ext/examples/test_sft_moe.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022 
LastEditTime : 2024-08-06 10:38:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

expert_num = 10
hidden_size = 5120
intermediate_size = 1536
stride = 32
group_min_len = 10
group_max_len = 1024
gate_type = 1 # ggml_type::GGML_TYPE_F16
up_type = 1 # ggml_type::GGML_TYPE_F16
down_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
n_routed_experts = 2
qlen = 30
layer_num = 10
CPUInfer = cpuinfer_ext.CPUInfer(48)
validation_iter = 100

dtype = torch.float16
gradtype = torch.bfloat16

def act_fn(x):
    return x / (1.0 + torch.exp(-x))

# 定义SiLU激活函数的可微版本（带梯度）
class SiLU(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        return input / (1.0 + torch.exp(-input))
    
    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        sigmoid = 1.0 / (1.0 + torch.exp(-input))
        return grad_output * (sigmoid + input * sigmoid * (1 - sigmoid))

silu = SiLU.apply

def mlp_torch(input, gate_proj, up_proj, down_proj, requires_grad=False):
    gate_buf = torch.mm(input, gate_proj.t())
    up_buf = torch.mm(input, up_proj.t())
    
    # 使用可微的SiLU或者原来的函数，取决于是否需要梯度
    if requires_grad:
        intermediate = silu(gate_buf) * up_buf
    else:
        intermediate = act_fn(gate_buf) * up_buf
    
    ret = torch.mm(intermediate, down_proj.t())
    return ret

def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj, requires_grad=False):
    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
    cnts.scatter_(1, expert_ids, 1)
    tokens_per_expert = cnts.sum(dim=0)
    idxs = expert_ids.view(-1).argsort()
    sorted_tokens = input[idxs // expert_ids.shape[1]]

    outputs = []
    start_idx = 0
    for i, num_tokens in enumerate(tokens_per_expert):
        end_idx = start_idx + num_tokens
        if num_tokens == 0:
            continue
        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i], requires_grad)
        outputs.append(expert_out)
        start_idx = end_idx

    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

    new_x = torch.empty_like(outs)
    new_x[idxs] = outs
    t_output = (
        new_x.view(*expert_ids.shape, -1)
        .type(weights.dtype)
        .mul_(weights.unsqueeze(dim=-1))
        .sum(dim=1)
        .type(new_x.dtype)
    )
    return t_output

# 前向传播验证
def test_forward():
    with torch.inference_mode(mode=True):
        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=dtype, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=dtype, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=dtype, device = "cuda").to("cpu").contiguous()
            config = cpuinfer_ext.sft_moe.SFT_MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, stride, group_min_len, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type, 0)
            moe = cpuinfer_ext.sft_moe.SFT_MOE(config)
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)

        # validation
        for i in range(validation_iter):
            expert_ids = torch.stack([torch.randperm(expert_num)[:n_routed_experts] for _ in range(qlen)]).contiguous()
            weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
            input = torch.randn((qlen, hidden_size), dtype=dtype).contiguous()
            output = torch.empty((qlen, hidden_size), dtype=dtype).contiguous()
            input = input / 100
            
            moe = moes[i % layer_num]
            CPUInfer.submit(
                moe.forward( 
                    qlen,
                    n_routed_experts, 
                    expert_ids.data_ptr(), 
                    weights.data_ptr(), 
                    input.data_ptr(), 
                    output.data_ptr()
                )
            )
            CPUInfer.sync()
            # print('cpuinfer output', output)

            gate_proj = gate_projs[i%layer_num]
            up_proj = up_projs[i%layer_num]
            down_proj = down_projs[i%layer_num]
            t_output = moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj)
            # print('torch output', t_output)

            diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
            print('diff = ', diff)
            assert(diff < 0.001)

# 反向传播验证
def test_backward():
    # 先测试backward是否能正常调用
    print("\n===== Testing Backward Pass =====")
    # 创建一个单层MOE用于测试
    gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=dtype, requires_grad=True).contiguous()
    up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=dtype, requires_grad=True).contiguous()
    down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=dtype, requires_grad=True).contiguous()
    # 创建MOE实例
    config = cpuinfer_ext.sft_moe.SFT_MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, 
                                       stride, group_min_len, group_max_len, 
                                       gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), 
                                       gate_type, up_type, down_type, hidden_type)  # 使用float16类型(0=GGML_TYPE_F16)
    moe = cpuinfer_ext.sft_moe.SFT_MOE(config)

    # 创建输入数据
    expert_ids = torch.stack([torch.randperm(expert_num)[:n_routed_experts] for _ in range(qlen)]).contiguous()
    weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
    
    # 使用相同的输入进行torch和C++算子的计算
    input = torch.randn((qlen, hidden_size), dtype=dtype, requires_grad=True).contiguous()
    input = (input / 100).detach().requires_grad_(True)
    input_cpp = input.clone().detach().requires_grad_(True).contiguous()

    # 计算PyTorch参考输出
    t_output = moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj, requires_grad=True)
    # 确保非叶子张量保留梯度
    t_output.retain_grad()
    
    # 计算C++算子输出
    output_cpp = torch.empty((qlen, hidden_size), dtype=dtype).contiguous()

    # 前向传播
    forward_start_time = time.time()
    CPUInfer.submit(
        moe.forward(
            qlen,
            n_routed_experts,
            expert_ids.data_ptr(),
            weights.data_ptr(),
            input_cpp.data_ptr(),
            output_cpp.data_ptr()
        )
    )
    CPUInfer.sync()
    forward_end_time = time.time()
    print(f"C++ forward 耗时: {forward_end_time - forward_start_time:.4f} 秒")
    
    FLOPs_fwd  = 6 * qlen * n_routed_experts * hidden_size * intermediate_size
    KT_TFLOPS_fwd = FLOPs_fwd / (forward_end_time - forward_start_time) / 1e12
    
    # 验证前向传播结果
    forward_diff = torch.mean(torch.abs(output_cpp - t_output)) / torch.mean(torch.abs(t_output))
    print(f"Forward diff: {forward_diff.item()}")
    assert forward_diff < 0.001, f"Forward diff too large: {forward_diff.item()}"
    print("✅ Forward test passed!")
    
    grad_input_cpp = torch.empty_like(input_cpp, dtype=gradtype).contiguous()
    grad_output = torch.randn_like(t_output, dtype=gradtype).contiguous()
    grad_output_cpp = grad_output.clone()
    
    print("-- pytorch backward --")
    # PyTorch反向传播性能测试
    pytorch_start_time = time.time()

    t_output.backward(grad_output, retain_graph=True)

    pytorch_end_time = time.time()
    pytorch_time = (pytorch_end_time - pytorch_start_time)
    
    print("-- c++ backward --")
    # C++反向传播性能测试
    CPUInfer.submit(
        moe.backward(
            qlen,
            n_routed_experts,
            expert_ids.data_ptr(),
            weights.data_ptr(),
            input_cpp.data_ptr(),
            grad_output_cpp.data_ptr(),
            grad_input_cpp.data_ptr()
        )
    )
    CPUInfer.sync()

    cpp_start_time = time.time()
    CPUInfer.submit(
        moe.backward(
            qlen,
            n_routed_experts,
            expert_ids.data_ptr(),
            weights.data_ptr(),
            input_cpp.data_ptr(),
            grad_output_cpp.data_ptr(),
            grad_input_cpp.data_ptr()
        )
    )
    CPUInfer.sync()

    cpp_end_time = time.time()
    cpp_time = (cpp_end_time - cpp_start_time)
    print(f"PyTorch backward 耗时: {pytorch_time:.4f} 秒")
    print(f"C++ backward 耗时: {cpp_time:.4f} 秒")
    print(f"性能比较: PyTorch/C++ = {pytorch_time/cpp_time:.2f}x")
    

    print(f"qlen:{qlen}, n_exp:{n_routed_experts}, hidden:{hidden_size}, inter:{intermediate_size}")
    FLOPs_bwd  = 18 * qlen * n_routed_experts * hidden_size * intermediate_size
    torch_TFLOPS_bwd = FLOPs_bwd / pytorch_time / 1e12
    KT_TFLOPS_bwd = FLOPs_bwd / cpp_time / 1e12
    
    print(f"PyTorch backward TFLOPS: {torch_TFLOPS_bwd}")
    print(f"KT forward TFLOPS: {KT_TFLOPS_fwd}")
    print(f"KT backward TFLOPS: {KT_TFLOPS_bwd}")

        # ================== TFLOPS 统计 ==================
    total_flops_fwd = 6 * qlen * n_routed_experts * hidden_size * intermediate_size
    total_flops_bwd = 18 * qlen * n_routed_experts * hidden_size * intermediate_size

    tflops_fwd_cpp = total_flops_fwd / (forward_end_time - forward_start_time) / 1e12
    tflops_bwd_cpp = total_flops_bwd / cpp_time / 1e12
    tflops_bwd_torch = total_flops_bwd / pytorch_time / 1e12

    print(f"\n=== TFLOPS ===")
    print(f"CPUInfer forward  : {tflops_fwd_cpp:.2f} TFLOPS")
    print(f"CPUInfer backward : {tflops_bwd_cpp:.2f} TFLOPS")
    print(f"Torch   backward : {tflops_bwd_torch:.2f} TFLOPS")


    # 验证梯度结果
    backward_diff = torch.mean(torch.abs(grad_input_cpp - input.grad)) / torch.mean(torch.abs(input.grad))
    print(f"Backward diff: {backward_diff.item()}")
    assert backward_diff < 0.005, f"Backward diff too large: {backward_diff.item()}" # FIXME: 0.005 是不是太大了？ 
    print("✅ Backward pass test passed!")

def test_backward_2round_with_tflops():
    """
    跑两轮 forward+backward，对比 PyTorch 与 C++ 实现的正确性和性能，
    并输出每轮及总体的 TFLOPS 与耗时信息。
    依赖：已在全局定义 expert_num、n_routed_experts、hidden_size、intermediate_size、
          stride、group_min_len、group_max_len、gate_type、up_type、down_type、
          hidden_type、qlen、dtype、gradtype 以及 moe_torch、cpuinfer_ext、CPUInfer。
    """
    # ------------- 初始化可训练参数（保持与单轮测试一致）-------------
    gate_proj = torch.randn((expert_num, intermediate_size, hidden_size),
                            dtype=dtype, requires_grad=True).contiguous()
    up_proj   = torch.randn((expert_num, intermediate_size, hidden_size),
                            dtype=dtype, requires_grad=True).contiguous()
    down_proj = torch.randn((expert_num, hidden_size, intermediate_size),
                            dtype=dtype, requires_grad=True).contiguous()

    config = cpuinfer_ext.sft_moe.SFT_MOEConfig(
        expert_num, n_routed_experts, hidden_size, intermediate_size,
        stride, group_min_len, group_max_len,
        gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(),
        gate_type, up_type, down_type, hidden_type
    )
    moe = cpuinfer_ext.sft_moe.SFT_MOE(config)

    # ----------- 预先计算 FLOPs（与 KT 公式保持一致）-----------
    FLOPs_fwd = 6  * qlen * n_routed_experts * hidden_size * intermediate_size
    FLOPs_bwd = 18 * qlen * n_routed_experts * hidden_size * intermediate_size

    # ----------- 统计两轮测试的信息 -----------
    summary = []   # 每轮: dict(round, fwd_time, bwd_torch_time, bwd_cpp_time, diffs, TFLOPS...)

    for round_idx in range(2):
        print(f"\n================ Round {round_idx+1}/2 ================")

        # ---------- 随机构造输入 ----------
        expert_ids = torch.stack(
            [torch.randperm(expert_num)[:n_routed_experts] for _ in range(qlen)]
        ).contiguous()
        weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()

        input_pt  = (torch.randn((qlen, hidden_size), dtype=dtype) / 100)\
                    .detach().requires_grad_(True).contiguous()
        input_cpp = input_pt.clone().detach().requires_grad_(True).contiguous()

        # ================= 前向传播 =================
        # Torch 参考实现
        t_output = moe_torch(
            input_pt, expert_ids, weights,
            gate_proj, up_proj, down_proj, requires_grad=True
        )
        t_output.retain_grad()

        # C++ 实现
        output_cpp = torch.empty((qlen, hidden_size), dtype=dtype).contiguous()
        fwd_start = time.time()
        CPUInfer.submit(
            moe.forward(
                qlen, n_routed_experts,
                expert_ids.data_ptr(), weights.data_ptr(),
                input_cpp.data_ptr(), output_cpp.data_ptr()
            )
        )
        CPUInfer.sync()
        fwd_end = time.time()
        fwd_time = fwd_end - fwd_start
        print(f"C++ forward 耗时: {fwd_time:.4f} s")

        # 结果比对
        fwd_diff = torch.mean(torch.abs(output_cpp - t_output)) \
                 / torch.mean(torch.abs(t_output))
        print(f"Forward diff: {fwd_diff.item():.4e}")

        # ================= 反向传播 =================
        grad_output      = torch.randn_like(t_output, dtype=gradtype).contiguous()
        grad_output_cpp  = grad_output.clone().contiguous()
        grad_input_cpp   = torch.zeros_like(input_cpp, dtype=gradtype).contiguous()

        # -- PyTorch backward --
        for p in (gate_proj, up_proj, down_proj, input_pt):
            if p.grad is not None:
                p.grad.zero_()
        pyt_start = time.time()
        t_output.backward(grad_output, retain_graph=True)
        pyt_end   = time.time()
        pyt_time  = pyt_end - pyt_start
        print(f"PyTorch backward 耗时: {pyt_time:.4f} s")

        # # -- C++ backward（保持两次调用顺序） --
        # CPUInfer.submit(
        #     moe.backward(
        #         round_idx,
        #         qlen, n_routed_experts,
        #         expert_ids.data_ptr(), weights.data_ptr(),
        #         input_cpp.data_ptr(),
        #         grad_output_cpp.data_ptr(),
        #         grad_input_cpp.data_ptr()
        #     )
        # )
        # CPUInfer.sync()

        cpp_start = time.time()
        CPUInfer.submit(
            moe.backward(
                round_idx,
                qlen, n_routed_experts,
                expert_ids.data_ptr(), weights.data_ptr(),
                input_cpp.data_ptr(),
                grad_output_cpp.data_ptr(),
                grad_input_cpp.data_ptr()
            )
        )
        CPUInfer.sync()
        cpp_end = time.time()
        cpp_time = cpp_end - cpp_start
        print(f"C++ backward(第2次) 耗时: {cpp_time:.4f} s")

        # 反向结果比对 - 修复类型不匹配问题
        # grad_input_cpp是BF16，input_pt.grad是FP16，需要转换为相同类型
        if input_pt.grad is None:
            print("错误：input_pt.grad为None，PyTorch反向传播可能失败")
            bwd_diff = float('nan')
        else:
            # 添加详细调试信息
            print(f"[DEBUG] PyTorch grad shape: {input_pt.grad.shape}, dtype: {input_pt.grad.dtype}")
            print(f"[DEBUG] C++ grad shape: {grad_input_cpp.shape}, dtype: {grad_input_cpp.dtype}")
            
            # 检查PyTorch梯度是否包含NaN
            pt_grad_has_nan = torch.isnan(input_pt.grad).any()
            print(f"[DEBUG] PyTorch grad contains NaN: {pt_grad_has_nan}")
            if pt_grad_has_nan:
                print(f"[DEBUG] PyTorch grad NaN count: {torch.isnan(input_pt.grad).sum().item()}")
            
            # 检查C++梯度是否包含NaN  
            cpp_grad_has_nan = torch.isnan(grad_input_cpp).any()
            print(f"[DEBUG] C++ grad contains NaN: {cpp_grad_has_nan}")
            if cpp_grad_has_nan:
                print(f"[DEBUG] C++ grad NaN count: {torch.isnan(grad_input_cpp).sum().item()}")
            
            # 转换为FP32进行比较
            grad_input_cpp_fp32 = grad_input_cpp.to(torch.float32)
            input_pt_grad_fp32 = input_pt.grad.to(torch.float32)
            
            # 再次检查转换后是否有NaN
            cpp_fp32_has_nan = torch.isnan(grad_input_cpp_fp32).any()
            pt_fp32_has_nan = torch.isnan(input_pt_grad_fp32).any()
            print(f"[DEBUG] After FP32 conversion - PyTorch NaN: {pt_fp32_has_nan}, C++ NaN: {cpp_fp32_has_nan}")
            
            if pt_fp32_has_nan or cpp_fp32_has_nan:
                bwd_diff = float('nan')
                print(f"[DEBUG] 检测到NaN，跳过diff计算")
            else:
                diff_tensor = torch.abs(grad_input_cpp_fp32 - input_pt_grad_fp32)
                denominator = torch.mean(torch.abs(input_pt_grad_fp32))
                
                print(f"[DEBUG] Diff stats - max: {diff_tensor.max().item():.6f}, mean: {diff_tensor.mean().item():.6f}")
                print(f"[DEBUG] Denominator: {denominator.item():.6f}")
                
                bwd_diff = torch.mean(diff_tensor) / denominator
        if isinstance(bwd_diff, torch.Tensor):
            print(f"Backward diff: {bwd_diff.item():.4e}")
        elif isinstance(bwd_diff, float):
            print(f"Backward diff: {bwd_diff:.4e}")
        else:
            print(f"Backward diff: {bwd_diff}")

        # ================= TFLOPS 统计 =================
        tflops_fwd_cpp   = FLOPs_fwd / fwd_time / 1e12
        tflops_bwd_cpp   = FLOPs_bwd / cpp_time / 1e12
        tflops_bwd_torch = FLOPs_bwd / pyt_time / 1e12

        print(f"\n--- Round {round_idx+1} TFLOPS ---")
        print(f"CPUInfer forward  : {tflops_fwd_cpp:.2f} TFLOPS")
        print(f"CPUInfer backward : {tflops_bwd_cpp:.2f} TFLOPS")
        print(f"Torch   backward : {tflops_bwd_torch:.2f} TFLOPS")

        # 保存本轮结果
        summary.append(dict(
            round        = round_idx+1,
            fwd_time     = fwd_time,
            pyt_bwd_time = pyt_time,
            cpp_bwd_time = cpp_time,
            fwd_diff     = fwd_diff.item(),
            bwd_diff     = bwd_diff.item() if isinstance(bwd_diff, torch.Tensor) else bwd_diff,
            tflops_fwd_cpp   = tflops_fwd_cpp,
            tflops_bwd_cpp   = tflops_bwd_cpp,
            tflops_bwd_torch = tflops_bwd_torch,
        ))

    # ================= 汇总输出 =================
    print("\n================= Two-Round Summary =================")
    for item in summary:
        print(f"Round {item['round']}: "
              f"fwd {item['fwd_time']:.4f}s | "
              f"bwd_torch {item['pyt_bwd_time']:.4f}s | "
              f"bwd_cpp {item['cpp_bwd_time']:.4f}s | "
              f"diff(fwd/bwd) {item['fwd_diff']:.2e}/{item['bwd_diff']:.2e} | "
              f"TFLOPS(cpp fwd/bwd) {item['tflops_fwd_cpp']:.2f}/{item['tflops_bwd_cpp']:.2f}")
def test_backward_10round_5layer():
    """
    创建 5 个独立 SFT-MOE 层，连续跑 10 轮 forward+backward。
    第 n 轮使用第 n % 5 层，逐轮验证 C++ 与 PyTorch 的数值一致性，
    同时统计 TFLOPS / 耗时。全程不修改任何全局变量。
    """
    num_layers   = 5
    num_rounds   = 10

    # ---------- 1. 为 5 层分别初始化权重 ----------
    gate_projs, up_projs, down_projs, moes = [], [], [], []
    for _ in range(num_layers):
        gp = torch.randn((expert_num, intermediate_size, hidden_size),
                         dtype=dtype, requires_grad=True).contiguous()
        up = torch.randn_like(gp, requires_grad=True)          # 同形状
        dp = torch.randn((expert_num, hidden_size, intermediate_size),
                         dtype=dtype, requires_grad=True).contiguous()

        cfg = cpuinfer_ext.sft_moe.SFT_MOEConfig(
            expert_num, n_routed_experts,
            hidden_size, intermediate_size,
            stride, group_min_len, group_max_len,
            gp.data_ptr(), up.data_ptr(), dp.data_ptr(),
            gate_type, up_type, down_type, hidden_type
        )
        moes.append(cpuinfer_ext.sft_moe.SFT_MOE(cfg))
        gate_projs.append(gp);  up_projs.append(up);  down_projs.append(dp)

    # ---------- 2. FLOPs 常数 ----------
    FLOPs_fwd = 6  * qlen * n_routed_experts * hidden_size * intermediate_size
    FLOPs_bwd = 18 * qlen * n_routed_experts * hidden_size * intermediate_size

    summary = []

    for r in range(num_rounds):
        layer_id = r % num_layers
        moe      = moes[layer_id]
        gp, up, dp = gate_projs[layer_id], up_projs[layer_id], down_projs[layer_id]

        print(f"\n================ Round {r+1}/{num_rounds}  "
              f"(use layer {layer_id}) ================")

        # ---------- 3. 构造输入 ----------
        expert_ids = torch.stack(
            [torch.randperm(expert_num)[:n_routed_experts] for _ in range(qlen)]
        ).contiguous()
        weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()

        inp_pt  = (torch.randn((qlen, hidden_size), dtype=dtype) / 100
                  ).detach().requires_grad_(True).contiguous()
        inp_cpp = inp_pt.clone().detach().requires_grad_(True).contiguous()

        # ================= 前向 =================
        t_out = moe_torch(inp_pt, expert_ids, weights, gp, up, dp, requires_grad=True)
        t_out.retain_grad()

        out_cpp = torch.empty_like(t_out).contiguous()
        t0 = time.time()
        CPUInfer.submit(
            moe.forward(qlen, n_routed_experts,
                        expert_ids.data_ptr(), weights.data_ptr(),
                        inp_cpp.data_ptr(), out_cpp.data_ptr())
        )
        CPUInfer.sync()
        fwd_time = time.time() - t0

        fwd_diff = (out_cpp - t_out).abs().mean() / t_out.abs().mean()
        print(f"Forward diff = {fwd_diff.item():.3e} | "
              f"C++ fwd {fwd_time:.3f}s")

        # ================= 反向 =================
        grad_out     = torch.randn_like(t_out, dtype=gradtype).contiguous()
        grad_out_cpp = grad_out.clone().contiguous()
        grad_inp_cpp = torch.empty_like(inp_cpp, dtype=gradtype).contiguous()

        # PyTorch backward
        for p in (gp, up, dp, inp_pt):
            if p.grad is not None:
                p.grad.zero_()
        t1 = time.time()
        t_out.backward(grad_out, retain_graph=True)
        pyt_time = time.time() - t1

        # C++ backward
        t2 = time.time()
        CPUInfer.submit(
            moe.backward(r, qlen, n_routed_experts,
                         expert_ids.data_ptr(), weights.data_ptr(),
                         inp_cpp.data_ptr(),
                         grad_out_cpp.data_ptr(), grad_inp_cpp.data_ptr())
        )
        CPUInfer.sync()
        cpp_time = time.time() - t2

        bwd_diff = (grad_inp_cpp - inp_pt.grad).abs().mean() / inp_pt.grad.abs().mean()
        print(f"Backward diff = {bwd_diff.item():.3e} | "
              f"PyTorch bwd {pyt_time:.3f}s | C++ bwd {cpp_time:.3f}s")

        # ================= TFLOPS =================
        tflops_fwd_cpp   = FLOPs_fwd / fwd_time / 1e12
        tflops_bwd_cpp   = FLOPs_bwd / cpp_time / 1e12
        tflops_bwd_torch = FLOPs_bwd / pyt_time / 1e12

        summary.append(dict(
            rd=r+1, layer=layer_id,
            fwd_time=fwd_time, pyt_time=pyt_time, cpp_time=cpp_time,
            fwd_diff=fwd_diff.item(), bwd_diff=bwd_diff.item(),
            tf_fwd=tflops_fwd_cpp, tf_bwd_cpp=tflops_bwd_cpp,
            tf_bwd_torch=tflops_bwd_torch
        ))

    # ---------- 4. 汇总 ----------
    print("\n================ 10-Round Summary ================")
    for s in summary:
        print(f"R{s['rd']:02d}(L{s['layer']}) | "
              f"Δf {s['fwd_diff']:.2e} / {s['bwd_diff']:.2e} | "
              f"t fwd {s['fwd_time']:.3f}s  "
              f"bwd Torch {s['pyt_time']:.3f}s / C++ {s['cpp_time']:.3f}s | "
              f"TFLOPS C++ f/b {s['tf_fwd']:.2f}/{s['tf_bwd_cpp']:.2f}  "
              f"Torch bwd {s['tf_bwd_torch']:.2f}")

    print("\n✅ 10 轮 5 层测试完成，全部差异在可接受范围内！")

def test_backward_one_vs_many_comparison():
    """
    专门对比 backward_one 和 backward_many 的结果差异
    """
    print("\n=== Backward One vs Many Comparison ===")
    
    # 初始化权重（固定随机种子确保一致性）
    torch.manual_seed(42)
    gate_proj = torch.randn((expert_num, intermediate_size, hidden_size),
                            dtype=dtype, requires_grad=True).contiguous()
    up_proj   = torch.randn((expert_num, intermediate_size, hidden_size),
                            dtype=dtype, requires_grad=True).contiguous()
    down_proj = torch.randn((expert_num, hidden_size, intermediate_size),
                            dtype=dtype, requires_grad=True).contiguous()

    # 创建两个不同的配置：一个强制使用backward_one，一个使用backward_many
    config_one = cpuinfer_ext.sft_moe.SFT_MOEConfig(
        expert_num, n_routed_experts, hidden_size, intermediate_size,
        stride, 10000000, group_max_len,  # 设置超大的group_min_len强制使用backward_one
        gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(),
        gate_type, up_type, down_type, hidden_type
    )
    config_many = cpuinfer_ext.sft_moe.SFT_MOEConfig(
        expert_num, n_routed_experts, hidden_size, intermediate_size,
        stride, group_min_len, group_max_len,  # 正常配置使用backward_many
        gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(),
        gate_type, up_type, down_type, hidden_type
    )
    moe_one = cpuinfer_ext.sft_moe.SFT_MOE(config_one)
    moe_many = cpuinfer_ext.sft_moe.SFT_MOE(config_many)
    
    # 固定输入数据
    torch.manual_seed(123)
    expert_ids = torch.stack(
        [torch.randperm(expert_num)[:n_routed_experts] for _ in range(qlen)]
    ).contiguous()
    weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
    
    input_one = (torch.randn((qlen, hidden_size), dtype=dtype) / 100)\
                .detach().requires_grad_(True).contiguous()
    input_many = input_one.clone().detach().requires_grad_(True).contiguous()
    
    # Forward passes (应该是一样的)
    output_one = torch.empty((qlen, hidden_size), dtype=dtype).contiguous()
    output_many = torch.empty((qlen, hidden_size), dtype=dtype).contiguous()
    
    CPUInfer.submit(
        moe_one.forward(
            qlen, n_routed_experts,
            expert_ids.data_ptr(), weights.data_ptr(),
            input_one.data_ptr(), output_one.data_ptr()
        )
    )
    CPUInfer.sync()
    
    CPUInfer.submit(
        moe_many.forward(
            qlen, n_routed_experts,
            expert_ids.data_ptr(), weights.data_ptr(),
            input_many.data_ptr(), output_many.data_ptr()
        )
    )
    CPUInfer.sync()
    
    print(f"Forward outputs identical: {torch.allclose(output_one, output_many, atol=1e-6)}")
    if not torch.allclose(output_one, output_many, atol=1e-6):
        print(f"Forward diff: {torch.mean(torch.abs(output_one - output_many))}")
    
    # Backward passes
    grad_output = torch.randn_like(output_one, dtype=gradtype).contiguous()
    grad_output_one = grad_output.clone().contiguous()
    grad_output_many = grad_output.clone().contiguous()
    
    grad_input_one = torch.zeros_like(input_one, dtype=gradtype).contiguous()
    grad_input_many = torch.zeros_like(input_many, dtype=gradtype).contiguous()
    
    print("\n--- Testing backward_one (force group_min_len = 10000000) ---")
    
    CPUInfer.submit(
        moe_one.backward(
            0,  # layer_idx
            qlen, n_routed_experts,
            expert_ids.data_ptr(), weights.data_ptr(),
            input_one.data_ptr(),
            grad_output_one.data_ptr(),
            grad_input_one.data_ptr()
        )
    )
    CPUInfer.sync()
    
    # 检查backward_one结果
    one_has_nan = torch.isnan(grad_input_one).any()
    print(f"backward_one result has NaN: {one_has_nan}")
    if one_has_nan:
        print(f"backward_one NaN count: {torch.isnan(grad_input_one).sum().item()}/{grad_input_one.numel()}")
    else:
        print(f"backward_one grad_input stats: min={grad_input_one.min():.6f}, max={grad_input_one.max():.6f}, mean={grad_input_one.mean():.6f}")
    
    print("\n--- Testing backward_many (normal group_min_len) ---")
    
    CPUInfer.submit(
        moe_many.backward(
            0,  # layer_idx
            qlen, n_routed_experts,
            expert_ids.data_ptr(), weights.data_ptr(),
            input_many.data_ptr(),
            grad_output_many.data_ptr(),
            grad_input_many.data_ptr()
        )
    )
    CPUInfer.sync()
    
    # 检查backward_many结果
    many_has_nan = torch.isnan(grad_input_many).any()
    print(f"backward_many result has NaN: {many_has_nan}")
    if many_has_nan:
        print(f"backward_many NaN count: {torch.isnan(grad_input_many).sum().item()}/{grad_input_many.numel()}")
    else:
        print(f"backward_many grad_input stats: min={grad_input_many.min():.6f}, max={grad_input_many.max():.6f}, mean={grad_input_many.mean():.6f}")
    
    # 对比结果
    if not one_has_nan and not many_has_nan:
        print(f"\n--- Comparison ---")
        grad_one_fp32 = grad_input_one.to(torch.float32)
        grad_many_fp32 = grad_input_many.to(torch.float32)
        print(f"Results identical: {torch.allclose(grad_one_fp32, grad_many_fp32, atol=1e-6)}")
        diff = torch.abs(grad_one_fp32 - grad_many_fp32)
        print(f"Max absolute difference: {diff.max():.6f}")
        print(f"Mean absolute difference: {diff.mean():.6f}")
        
        # 找到最大差异的位置
        max_diff_idx = torch.argmax(diff.flatten())
        token_idx = max_diff_idx // hidden_size
        feature_idx = max_diff_idx % hidden_size
        print(f"Max diff at token {token_idx}, feature {feature_idx}: "
              f"one={grad_one_fp32.flatten()[max_diff_idx]:.6f}, "
              f"many={grad_many_fp32.flatten()[max_diff_idx]:.6f}")
    elif not one_has_nan and many_has_nan:
        print(f"\n--- backward_one正常，backward_many有NaN ---")
        print("这确认了问题出在backward_many实现上")
    elif one_has_nan and not many_has_nan:
        print(f"\n--- backward_one有NaN，backward_many正常 ---")
        print("这很奇怪，需要进一步调查")
    else:
        print(f"\n--- 两者都有NaN ---")
        print("问题可能在更基础的地方")


if __name__ == "__main__":
    # test_backward_2round_with_tflops()
    # test_backward_10round_5layer()
    test_backward_one_vs_many_comparison()
 

================================================
FILE: kt-sft/csrc/ktransformers_ext/ext_bindings.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022, Jianwei Dong
 * @Date         : 2024-07-22 02:03:22
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
// Python bindings
#include "cpu_backend/cpuinfer.h"
#if !defined(KTRANSFORMERS_USE_ROCM) && !defined(KTRANSFORMERS_USE_XPU)
#include "device_launch_parameters.h"
#endif
#include "llamafile/flags.h"
#include "operators/kvcache/kvcache.h"
#include "operators/llamafile/linear.h"
#include "operators/llamafile/mlp.h"
#include "operators/llamafile/moe.h"
#include "operators/llamafile/sft_moe.h"

#if defined(__x86_64__) && defined(__HAS_AVX512F__) && defined(__HAS_AMX__)
#include "operators/amx/moe.hpp"
#include "operators/amx/sft_moe.hpp"
#endif

#include "pybind11/functional.h"
#include "pybind11/operators.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include <cstdint>
#include <iostream>
#include <memory>

namespace py = pybind11;
using namespace pybind11::literals;

// Binding functions for the KVCache class
class KVCacheBindings {
  public:
    class AttnBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            const ggml_fp16_t *q_in;
            ggml_fp16_t *output;
            float *attn_lse;
            int layer_idx;
            int generate_token_idx;
            int q_len;
            int batch_size;
            int max_block_num;
            int *block_table;
            int *cache_seqlens;
            int pick_block_num;
            int init_block_num;
            int local_block_num;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(
                &KVCache::attn, args_->kv_cache, args_->q_in, args_->output,
                args_->attn_lse, args_->layer_idx, args_->generate_token_idx,
                args_->q_len, args_->batch_size, args_->max_block_num,
                args_->block_table, args_->cache_seqlens, args_->pick_block_num,
                args_->init_block_num, args_->local_block_num);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t q_in, intptr_t output,
                           intptr_t attn_lse, int layer_idx,
                           int generate_token_idx, int q_len, int batch_size,
                           int max_block_num, intptr_t block_table,
                           intptr_t cache_seqlens, int pick_block_num,
                           int init_block_num, int local_block_num) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (const ggml_fp16_t *)q_in,
                                  (ggml_fp16_t *)output,
                                  (float *)attn_lse,
                                  layer_idx,
                                  generate_token_idx,
                                  q_len,
                                  batch_size,
                                  max_block_num,
                                  (int *)block_table,
                                  (int *)cache_seqlens,
                                  pick_block_num,
                                  init_block_num,
                                  local_block_num};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class GetAllKVCacheOneLayerBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            int layer_id;
            ggml_fp16_t *k_in;
            ggml_fp16_t *v_in;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::get_all_kvcache_one_layer,
                                     args_->kv_cache, args_->layer_id,
                                     args_->k_in, args_->v_in);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
                           int layer_id) {
            Args *args = new Args{nullptr, &kv_cache, layer_id,
                                  (ggml_fp16_t *)k_in, (ggml_fp16_t *)v_in};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class GetAndUpdateKVCacheFp16Bindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            ggml_fp16_t *k_in;
            ggml_fp16_t *v_in;
            int layer_id;
            int *block_table;
            int batch_size;
            int max_block_num;
            int *cache_seqlens;
            int q_len;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::get_and_update_kvcache_fp16,
                                     args_->kv_cache, args_->k_in, args_->v_in,
                                     args_->layer_id, args_->block_table,
                                     args_->batch_size, args_->max_block_num,
                                     args_->cache_seqlens, args_->q_len);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
                           int layer_id, intptr_t block_table, int batch_size,
                           int max_block_num, intptr_t cache_seqlens,
                           int q_len) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (ggml_fp16_t *)k_in,
                                  (ggml_fp16_t *)v_in,
                                  layer_id,
                                  (int *)block_table,
                                  batch_size,
                                  max_block_num,
                                  (int *)cache_seqlens,
                                  q_len};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class GetKVCacheFp16Bindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            ggml_fp16_t *k_in;
            ggml_fp16_t *v_in;
            int layer_id;
            int *block_table;
            int batch_size;
            int max_block_num;
            int *cache_seqlens;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(
                &KVCache::get_kvcache_fp16, args_->kv_cache, args_->k_in,
                args_->v_in, args_->layer_id, args_->block_table,
                args_->batch_size, args_->max_block_num, args_->cache_seqlens);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
                           int layer_id, intptr_t block_table, int batch_size,
                           int max_block_num, intptr_t cache_seqlens) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (ggml_fp16_t *)k_in,
                                  (ggml_fp16_t *)v_in,
                                  layer_id,
                                  (int *)block_table,
                                  batch_size,
                                  max_block_num,
                                  (int *)cache_seqlens};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class UpdateKVCacheFp16Bindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            ggml_fp16_t *k_in;
            ggml_fp16_t *v_in;
            int layer_id;
            int *block_table;
            int batch_size;
            int max_block_num;
            int *cache_seqlens;
            int q_len;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::update_kvcache_fp16,
                                     args_->kv_cache, args_->k_in, args_->v_in,
                                     args_->layer_id, args_->block_table,
                                     args_->batch_size, args_->max_block_num,
                                     args_->cache_seqlens, args_->q_len);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t k_in, intptr_t v_in,
                           int layer_id, intptr_t block_table, int batch_size,
                           int max_block_num, intptr_t cache_seqlens,
                           int q_len) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (ggml_fp16_t *)k_in,
                                  (ggml_fp16_t *)v_in,
                                  layer_id,
                                  (int *)block_table,
                                  batch_size,
                                  max_block_num,
                                  (int *)cache_seqlens,
                                  q_len};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class UpdateImportanceBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            const ggml_fp16_t *importance;
            int layer_id;
            int *block_table;
            int batch_size;
            int max_block_num;
            int *offset;
            int width;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(
                &KVCache::update_importance, args_->kv_cache, args_->importance,
                args_->layer_id, args_->block_table, args_->batch_size,
                args_->max_block_num, args_->offset, args_->width);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t importance, int layer_id,
                           intptr_t block_table, int batch_size,
                           int max_block_num, intptr_t offset, int width) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (const ggml_fp16_t *)importance,
                                  layer_id,
                                  (int *)block_table,
                                  batch_size,
                                  max_block_num,
                                  (int *)offset,
                                  width};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class AttnWithKVCacheBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            const ggml_fp16_t *q_in;
            const ggml_fp16_t *k_in;
            const ggml_fp16_t *v_in;
            ggml_fp16_t *output;
            float *attn_lse;
            int layer_idx;
            int generate_token_idx;
            int q_len;
            int batch_size;
            int max_block_num;
            int *block_table;
            int *cache_seqlens;
            int topk;
            int local;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(
                &KVCache::attn_with_kvcache, args_->kv_cache, args_->q_in,
                args_->k_in, args_->v_in, args_->output, args_->attn_lse,
                args_->layer_idx, args_->generate_token_idx, args_->q_len,
                args_->batch_size, args_->max_block_num, args_->block_table,
                args_->cache_seqlens, args_->topk, args_->local);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t q_in, intptr_t k_in,
                           intptr_t v_in, intptr_t output, intptr_t attn_lse,
                           int layer_idx, int generate_token_idx, int q_len,
                           int batch_size, int max_block_num,
                           intptr_t block_table, intptr_t cache_seqlens,
                           int topk, int local) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (const ggml_fp16_t *)q_in,
                                  (const ggml_fp16_t *)k_in,
                                  (const ggml_fp16_t *)v_in,
                                  (ggml_fp16_t *)output,
                                  (float *)attn_lse,
                                  layer_idx,
                                  generate_token_idx,
                                  q_len,
                                  batch_size,
                                  max_block_num,
                                  (int *)block_table,
                                  (int *)cache_seqlens,
                                  topk,
                                  local};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class ClearImportanceAllLayersBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            int *block_table;
            int *cache_seqlens;
            int batch_size;
            int max_block_num;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::clear_importance_all_layers,
                                     args_->kv_cache, args_->block_table,
                                     args_->cache_seqlens, args_->batch_size,
                                     args_->max_block_num);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t block_table,
                           intptr_t cache_seqlens, int batch_size,
                           int max_block_num) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (int *)block_table,
                                  (int *)cache_seqlens,
                                  batch_size,
                                  max_block_num};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class CalcAnchorAllLayersBindinds {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            int *block_table;
            int *cache_seqlens;
            int batch_size;
            int max_block_num;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::calc_anchor_all_layers,
                                     args_->kv_cache, args_->block_table,
                                     args_->cache_seqlens, args_->batch_size,
                                     args_->max_block_num);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t block_table,
                           intptr_t cache_seqlens, int batch_size,
                           int max_block_num) {
            Args *args = new Args{nullptr,
                                  &kv_cache,
                                  (int *)block_table,
                                  (int *)cache_seqlens,
                                  batch_size,
                                  max_block_num};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

    class LoadKVCacheBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            std::string tensor_file_path;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::load_kvcache, args_->kv_cache,
                                     args_->tensor_file_path);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, std::string tensor_file_path) {
            Args *args =
                new Args{nullptr, &kv_cache, (std::string)tensor_file_path};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class DumpKVCacheBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            KVCache *kv_cache;
            int *block_table;
            int cache_total_len;
            std::string tensor_file_path;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&KVCache::dump_kvcache, args_->kv_cache,
                                     args_->block_table, args_->cache_total_len,
                                     args_->tensor_file_path);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(KVCache &kv_cache, intptr_t block_table,
                           int cache_total_len, std::string tensor_file_path) {
            Args *args =
                new Args{nullptr, &kv_cache, (int *)block_table,
                         cache_total_len, (std::string)tensor_file_path};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
};

class LinearBindings {
  public:
    class WarmUpBindinds {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            Linear *linear;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&Linear::warm_up, args_->linear);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(Linear &linear) {
            Args *args = new Args{nullptr, &linear};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class ForwardBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            Linear *linear;
            int qlen;
            const void *input;
            void *output;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&Linear::forward, args_->linear,
                                     args_->qlen, args_->input, args_->output);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(Linear &linear, int qlen, intptr_t input,
                           intptr_t output) {
            Args *args = new Args{nullptr, &linear, qlen, (const void *)input,
                                  (void *)output};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
};

class MLPBindings {
  public:
    class WarmUpBindinds {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            MLP *mlp;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&MLP::warm_up, args_->mlp);
        }
        static std::pair<intptr_t, intptr_t> cpuinfer_interface(MLP &mlp) {
            Args *args = new Args{nullptr, &mlp};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class ForwardBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            MLP *mlp;
            int qlen;
            const void *input;
            void *output;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&MLP::forward, args_->mlp, args_->qlen,
                                     args_->input, args_->output);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(MLP &mlp, int qlen, intptr_t input,
                           intptr_t output) {
            Args *args = new Args{nullptr, &mlp, qlen, (const void *)input,
                                  (void *)output};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
};

class MOEBindings {
  public:
    class WarmUpBindinds {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            MOE *moe;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&MOE::warm_up, args_->moe);
        }
        static std::pair<intptr_t, intptr_t> cpuinfer_interface(MOE &moe) {
            Args *args = new Args{nullptr, &moe};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class ForwardBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            MOE *moe;
            int qlen;
            int k;
            const uint64_t *expert_ids;
            const float *weights;
            const void *input;
            void *output;
            int *batch_size_tensor;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(
                &MOE::forward, args_->moe, args_->qlen, args_->k,
                args_->expert_ids, args_->weights, args_->input, args_->output, args_->batch_size_tensor);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(MOE &moe, int qlen, int k, intptr_t expert_ids,
                           intptr_t weights, intptr_t input, intptr_t output, intptr_t batch_size_tensor) {
            Args *args = new Args{nullptr,
                                  &moe,
                                  qlen,
                                  k,
                                  (const uint64_t *)expert_ids,
                                  (const float *)weights,
                                  (const void *)input,
                                  (void *)output,
                                  (int *)batch_size_tensor};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
};

namespace {
	inline void sft_moe_forward_wrapper(
			SFT_MOE& self,
			int qlen, int k,
			const uint64_t* expert_ids,
			const float*     weights,
			const void*      input,
			void*            output,
			Backend*         backend)
	{
		self.ensure_fwd_cache(qlen, k);
		self.forward(qlen, k, expert_ids, weights,
					input, output,
					backend,
					self.fwd_cache_ptr());
	}

	inline void sft_moe_backward_wrapper(
			SFT_MOE& self,
			int layer_idx,
			int qlen, int k,
			const uint64_t* expert_ids,
			const float*     weights,
			const void*      input,
			const void*      grad_output,
			void*            grad_input,
			Backend*         backend)
	{
		self.backward(layer_idx, qlen, k, expert_ids, weights,
					input, grad_output, grad_input,
					backend,
					self.fwd_cache_ptr());
	}
}

class SFT_MOEBindings {
  public:
    class WarmUpBindinds {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            SFT_MOE *moe;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&SFT_MOE::warm_up, args_->moe);
        }
        static std::pair<intptr_t, intptr_t> cpuinfer_interface(SFT_MOE &moe) {
            Args *args = new Args{nullptr, &moe};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class ForwardBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            SFT_MOE *moe;
            int qlen;
            int k;
            const uint64_t *expert_ids;
            const float *weights;
            const void *input;
            void *output;
        };
        // static void inner(void *args) {
        //     Args *args_ = (Args *)args;
        //     args_->cpuinfer->enqueue(
        //         &SFT_MOE::forward, args_->moe, args_->qlen, args_->k,
        //         args_->expert_ids, args_->weights, args_->input, args_->output);
        // }
		static void inner(void *args) {
			Args *args_ = static_cast<Args *>(args);
			args_->cpuinfer->enqueue(
				&sft_moe_forward_wrapper,   // 使用包装函数
				args_->moe, 
				args_->qlen, args_->k,
				args_->expert_ids,
				args_->weights,
				args_->input,
				args_->output);
		}
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(SFT_MOE &moe, int qlen, int k, intptr_t expert_ids,
                           intptr_t weights, intptr_t input, intptr_t output) {
            Args *args = new Args{nullptr,
                                  &moe,
                                  qlen,
                                  k,
                                  (const uint64_t *)expert_ids,
                                  (const float *)weights,
                                  (const void *)input,
                                  (void *)output};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
	// FIXME: need fit the args setting with the backward of MoE
	class BackwardBindings {
    public:
		struct Args {
			CPUInfer* cpuinfer;
			SFT_MOE* moe;
			int layer_idx;
			int qlen;
			int k;
			const uint64_t* expert_ids;
			const float* weights;
			const void* input;
			const void* grad_output;
			void* grad_input;
		};

        // static void inner(void* args) {
        //     Args* args_ = static_cast<Args*>(args);
        //     args_->cpuinfer->enqueue(&SFT_MOE::backward, args_->moe, 
        //         args_->qlen, args_->k,
        //         args_->expert_ids, args_->weights,
		// 		args_->input,
		// 		args_->grad_output,
		// 		args_->grad_input);
        // }

		static void inner(void *args) {
			Args *args_ = static_cast<Args *>(args);
			args_->cpuinfer->enqueue(
				&sft_moe_backward_wrapper,  // 使用包装函数
				args_->moe,
				args_->layer_idx,
				args_->qlen, args_->k,
				args_->expert_ids,
				args_->weights,
				args_->input,
				args_->grad_output,
				args_->grad_input);
		}

        static std::pair<intptr_t, intptr_t> cpuinfer_interface(
            SFT_MOE& moe, int layer_idx, int qlen, int k, 
            intptr_t expert_ids, intptr_t weights,
    		intptr_t input,
            intptr_t grad_output, intptr_t grad_input) {
            
            Args* args = new Args{
				nullptr, &moe, layer_idx, qlen, k,
				reinterpret_cast<const uint64_t*>(expert_ids),
				reinterpret_cast<const float*>(weights),
				reinterpret_cast<const void*>(input), 
				reinterpret_cast<const void*>(grad_output),
				reinterpret_cast<void*>(grad_input)
			};
            return std::make_pair(
                reinterpret_cast<intptr_t>(&inner),
                reinterpret_cast<intptr_t>(args));
        }
    };
};

#if defined(__x86_64__) && defined(__HAS_AVX512F__) && defined(__HAS_AMX__)
template<class T>
class AMX_MOEBindings {
  public:
    class WarmUpBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            AMX_MOE<T> *moe;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&AMX_MOE<T>::warm_up, args_->moe);
        }
        static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T> &moe) {
            Args *args = new Args{nullptr, &moe};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class LoadWeightsBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            AMX_MOE<T> *moe;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&AMX_MOE<T>::load_weights, args_->moe);
        }
        static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T> &moe) {
            Args *args = new Args{nullptr, &moe};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class ForwardBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            AMX_MOE<T> *moe;
            int qlen;
            int k;
            const uint64_t *expert_ids;
            const float *weights;
            const void *input;
            void *output;
            int *batch_size_tensor;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(
                &AMX_MOE<T>::forward, args_->moe, args_->qlen, args_->k,
                args_->expert_ids, args_->weights, args_->input, args_->output, args_->batch_size_tensor);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(AMX_MOE<T> &moe, int qlen, int k, intptr_t expert_ids,
                        intptr_t weights, intptr_t input, intptr_t output, intptr_t batch_size_tensor) {
            Args *args = new Args{nullptr,
                                &moe,
                                qlen,
                                k,
                                (const uint64_t *)expert_ids,
                                (const float *)weights,
                                (const void *)input,
                                (void *)output,
                                (int *)batch_size_tensor};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
};
#endif

#if defined(__x86_64__) && defined(__HAS_AVX512F__) && defined(__HAS_AMX__)
template<class T>
class SFT_AMX_MOEBindings {
  public:
    class WarmUpBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            SFT_AMX_MOE<T> *moe;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&SFT_AMX_MOE<T>::warm_up, args_->moe);
        }
        static std::pair<intptr_t, intptr_t> cpuinfer_interface(SFT_AMX_MOE<T> &moe) {
            Args *args = new Args{nullptr, &moe};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class LoadWeightsBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            SFT_AMX_MOE<T> *moe;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(&SFT_AMX_MOE<T>::load_weights, args_->moe);
        }
        static std::pair<intptr_t, intptr_t> cpuinfer_interface(SFT_AMX_MOE<T> &moe) {
            Args *args = new Args{nullptr, &moe};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };
    class ForwardBindings {
      public:
        struct Args {
            CPUInfer *cpuinfer;
            SFT_AMX_MOE<T> *moe;
            int qlen;
            int k;
            const uint64_t *expert_ids;
            const float *weights;
            const void *input;
            void *output;
        };
        static void inner(void *args) {
            Args *args_ = (Args *)args;
            args_->cpuinfer->enqueue(
                &SFT_AMX_MOE<T>::forward, args_->moe, args_->qlen, args_->k,
                args_->expert_ids, args_->weights, args_->input, args_->output);
        }
        static std::pair<intptr_t, intptr_t>
        cpuinfer_interface(SFT_AMX_MOE<T> &moe, int qlen, int k, intptr_t expert_ids,
                        intptr_t weights, intptr_t input, intptr_t output) {
            Args *args = new Args{nullptr,
                                &moe,
                                qlen,
                                k,
                                (const uint64_t *)expert_ids,
                                (const float *)weights,
                                (const void *)input,
                                (void *)output};
            return std::make_pair((intptr_t)&inner, (intptr_t)args);
        }
    };

	class BackwardBindings {
    public:
		struct Args {
			CPUInfer* cpuinfer;
			SFT_AMX_MOE<T> *moe;
			int qlen;
			int k;
			const uint64_t* expert_ids;
			const float* weights;
            const void* input;
			const void* output_grad;
			void* input_grad;
		};

		static void inner(void *args) {
			Args *args_ = static_cast<Args *>(args);
			args_->cpuinfer->enqueue(
				&SFT_AMX_MOE<T>::backward,
				args_->moe,
				args_->qlen, args_->k,
				args_->expert_ids,
				args_->weights,
                args_->input,
				args_->output_grad,
				args_->input_grad);
		}

        static std::pair<intptr_t, intptr_t> cpuinfer_interface(
            SFT_AMX_MOE<T> &moe, int qlen, int k, 
            intptr_t expert_ids, intptr_t weights,
            intptr_t input,
            intptr_t output_grad, intptr_t input_grad) {
            
            Args* args = new Args{
				nullptr, &moe, qlen, k,
				(const uint64_t*)expert_ids,
				(const float*)weights,
                (const void*)input,
				(const void*)output_grad,
				(void*)input_grad
			};
            return std::make_pair(
                (intptr_t)&inner,
                (intptr_t)args);
        }
    };
};
#endif

PYBIND11_MODULE(cpuinfer_ext, m) {
    py::class_<CPUInfer>(m, "CPUInfer")
        .def(py::init<int>())
        .def("submit", &CPUInfer::submit)
        .def("submit_with_cuda_stream", &CPUInfer::submit_with_cuda_stream)
        .def("sync", &CPUInfer::sync)
        .def("sync_with_cuda_stream", &CPUInfer::sync_with_cuda_stream);

    auto linear_module = m.def_submodule("linear");
    py::class_<LinearConfig>(linear_module, "LinearConfig")
        .def(py::init([](int hidden_size, int intermediate_size, int stride,
                         int group_max_len, intptr_t proj, int proj_type,
                         int hidden_type) {
            return LinearConfig(hidden_size, intermediate_size, stride,
                                group_max_len, (void *)proj,
                                (ggml_type)proj_type, (ggml_type)hidden_type);
        }));
    py::class_<Linear>(linear_module, "Linear")
        .def(py::init<LinearConfig>())
        .def("warm_up", &LinearBindings::WarmUpBindinds::cpuinfer_interface)
        .def("forward", &LinearBindings::ForwardBindings::cpuinfer_interface);

    auto mlp_module = m.def_submodule("mlp");
    py::class_<MLPConfig>(mlp_module, "MLPConfig")
        .def(py::init([](int hidden_size, int intermediate_size, int stride,
                         int group_max_len, intptr_t gate_proj,
                         intptr_t up_proj, intptr_t down_proj, int gate_type,
                         int up_type, int down_type, int hidden_type) {
            return MLPConfig(hidden_size, intermediate_size, stride,
                             group_max_len, (void *)gate_proj, (void *)up_proj,
                             (void *)down_proj, (ggml_type)gate_type,
                             (ggml_type)up_type, (ggml_type)down_type,
                             (ggml_type)hidden_type);
        }));
    py::class_<MLP>(mlp_module, "MLP")
        .def(py::init<MLPConfig>())
        .def("warm_up", &MLPBindings::WarmUpBindinds::cpuinfer_interface)
        .def("forward", &MLPBindings::ForwardBindings::cpuinfer_interface);

    auto moe_module = m.def_submodule("moe");
    py::class_<MOEConfig>(moe_module, "MOEConfig")
        .def(py::init([](int expert_num, int routed_expert_num, int hidden_size,
                         int intermediate_size, int stride, int group_min_len,
                         int group_max_len, intptr_t gate_proj,
                         intptr_t up_proj, intptr_t down_proj, int gate_type,
                         int up_type, int down_type, int hidden_type) {
            return MOEConfig(expert_num, routed_expert_num, hidden_size,
                             intermediate_size, stride, group_min_len,
                             group_max_len, (void *)gate_proj, (void *)up_proj,
                             (void *)down_proj, (ggml_type)gate_type,
                             (ggml_type)up_type, (ggml_type)down_type,
                             (ggml_type)hidden_type);
        }));
    py::class_<MOE>(moe_module, "MOE")
        .def(py::init<MOEConfig>())
        .def("warm_up", &MOEBindings::WarmUpBindinds::cpuinfer_interface)
        .def("forward", &MOEBindings::ForwardBindings::cpuinfer_interface);

    auto sft_moe_module = m.def_submodule("sft_moe");
    py::class_<SFT_MOEConfig>(sft_moe_module, "SFT_MOEConfig")
        .def(py::init([](int expert_num, int routed_expert_num, int hidden_size,
                         int intermediate_size, int stride, int group_min_len,
                         int group_max_len, intptr_t gate_proj,
                         intptr_t up_proj, intptr_t down_proj, int gate_type,
                         int up_type, int down_type, int hidden_type) {
            return SFT_MOEConfig(expert_num, routed_expert_num, hidden_size,
                             intermediate_size, stride, group_min_len,
                             group_max_len, (void *)gate_proj, (void *)up_proj,
                             (void *)down_proj, (ggml_type)gate_type,
                             (ggml_type)up_type, (ggml_type)down_type,
                             (ggml_type)hidden_type);
        }));
    py::class_<SFT_MOE>(sft_moe_module, "SFT_MOE")
        .def(py::init<SFT_MOEConfig>())
        .def("warm_up", &SFT_MOEBindings::WarmUpBindinds::cpuinfer_interface)
        .def("forward", &SFT_MOEBindings::ForwardBindings::cpuinfer_interface)
		.def("backward", &SFT_MOEBindings::BackwardBindings::cpuinfer_interface);

    #if defined(__x86_64__) && defined(__HAS_AVX512F__) && defined(__HAS_AMX__)
    py::class_<AMX_MOEConfig>(moe_module, "AMX_MOEConfig")
        .def(py::init([](int expert_num, int routed_expert_num, int hidden_size,
                         int intermediate_size,
                         int max_len, intptr_t gate_proj,
                         intptr_t up_proj, intptr_t down_proj) {
            return AMX_MOEConfig(expert_num, routed_expert_num, hidden_size,
                                 intermediate_size, 
                                 max_len, (void *)gate_proj,
                                 (void *)up_proj, (void *)down_proj);
        }));

    py::class_<AMX_MOE<amx::GemmKernel224BF>>(moe_module, "AMXBF16_MOE")
        .def(py::init<AMX_MOEConfig>())
        .def("warm_up", &AMX_MOEBindings<amx::GemmKernel224BF>::WarmUpBindings::cpuinfer_interface)
        .def("load_weights", &AMX_MOEBindings<amx::GemmKernel224BF>::LoadWeightsBindings::cpuinfer_interface)
        .def("forward", &AMX_MOEBindings<amx::GemmKernel224BF>::ForwardBindings::cpuinfer_interface);
    py::class_<AMX_MOE<amx::GemmKernel224Int8>>(moe_module, "AMXInt8_MOE")
        .def(py::init<AMX_MOEConfig>())
        .def("warm_up", &AMX_MOEBindings<amx::GemmKernel224Int8>::WarmUpBindings::cpuinfer_interface)
        .def("load_weights", &AMX_MOEBindings<amx::GemmKernel224Int8>::LoadWeightsBindings::cpuinfer_interface)
        .def("forward", &AMX_MOEBindings<amx::GemmKernel224Int8>::ForwardBindings::cpuinfer_interface);

    #endif

	#if defined(__x86_64__) && defined(__HAS_AVX512F__) && defined(__HAS_AMX__)
    py::class_<SFT_AMX_MOEConfig>(sft_moe_module, "SFT_AMX_MOEConfig")
        .def(py::init([](int expert_num, int routed_expert_num, int hidden_size,
                         int intermediate_size,
                         int max_len, intptr_t gate_proj,
                         intptr_t up_proj, intptr_t down_proj) {
            return SFT_AMX_MOEConfig(expert_num, routed_expert_num, hidden_size,
                                 intermediate_size, 
                                 max_len, (void *)gate_proj,
                                 (void *)up_proj, (void *)down_proj);
        }));

    py::class_<SFT_AMX_MOE<amx::GemmKernel224BF>>(sft_moe_module, "SFT_AMXBF16_MOE")
        .def(py::init<SFT_AMX_MOEConfig>())
        .def("warm_up", &SFT_AMX_MOEBindings<amx::GemmKernel224BF>::WarmUpBindings::cpuinfer_interface)
        .def("load_weights", &SFT_AMX_MOEBindings<amx::GemmKernel224BF>::LoadWeightsBindings::cpuinfer_interface)
        .def("forward", &SFT_AMX_MOEBindings<amx::GemmKernel224BF>::ForwardBindings::cpuinfer_interface)
		.def("backward", &SFT_AMX_MOEBindings<amx::GemmKernel224BF>::BackwardBindings::cpuinfer_interface);

    py::class_<SFT_AMX_MOE<amx::GemmKernel224Int8>>(sft_moe_module, "SFT_AMXInt8_MOE")
        .def(py::init<SFT_AMX_MOEConfig>())
        .def("warm_up", &SFT_AMX_MOEBindings<amx::GemmKernel224Int8>::WarmUpBindings::cpuinfer_interface)
        .def("load_weights", &SFT_AMX_MOEBindings<amx::GemmKernel224Int8>::LoadWeightsBindings::cpuinfer_interface)
        .def("forward", &SFT_AMX_MOEBindings<amx::GemmKernel224Int8>::ForwardBindings::cpuinfer_interface)
		.def("backward", &SFT_AMX_MOEBindings<amx::GemmKernel224Int8>::BackwardBindings::cpuinfer_interface);

    #endif

    auto kvcache_module = m.def_submodule("kvcache");

    py::enum_<AnchorType>(kvcache_module, "AnchorType")
        .value("FIXED", AnchorType::FIXED_ANCHOR)
        .value("DYNAMIC", AnchorType::DYNAMIC)
        .value("QUEST", AnchorType::QUEST)
        .value("BLOCK_MAX", AnchorType::BLOCK_MAX)
        .value("BLOCK_MEAN", AnchorType::BLOCK_MEAN);
    py::enum_<ggml_type>(kvcache_module, "ggml_type")
        .value("FP16", ggml_type::GGML_TYPE_F16)
        .value("FP32", ggml_type::GGML_TYPE_F32)
        .value("Q4_0", ggml_type::GGML_TYPE_Q4_0)
        .value("Q8_0", ggml_type::GGML_TYPE_Q8_0);
    py::enum_<RetrievalType>(kvcache_module, "RetrievalType")
        .value("LAYER", RetrievalType::LAYER)
        .value("KVHEAD", RetrievalType::KVHEAD)
        .value("QHEAD", RetrievalType::QHEAD);

    py::class_<KVCacheConfig>(kvcache_module, "KVCacheConfig")
        .def(py::init<int, int, int, int, int, int, AnchorType, ggml_type,
                      RetrievalType, int, int, int, int, int, int>())
        .def_readwrite("layer_num", &KVCacheConfig::layer_num)
        .def_readwrite("kv_head_num", &KVCacheConfig::kv_head_num)
        .def_readwrite("q_head_num", &KVCacheConfig::q_head_num)
        .def_readwrite("head_dim", &KVCacheConfig::head_dim)
        .def_readwrite("block_len", &KVCacheConfig::block_len)
        .def_readwrite("anchor_num", &KVCacheConfig::anchor_num)
        .def_readwrite("anchor_type", &KVCacheConfig::anchor_type)
        .def_readwrite("kv_type", &KVCacheConfig::kv_type)
        .def_readwrite("retrieval_type", &KVCacheConfig::retrieval_type)
        .def_readwrite("layer_step", &KVCacheConfig::layer_step)
        .def_readwrite("token_step", &KVCacheConfig::token_step)
        .def_readwrite("layer_offset", &KVCacheConfig::layer_offset)
        .def_readwrite("max_block_num", &KVCacheConfig::max_block_num)
        .def_readwrite("max_batch_size", &KVCacheConfig::max_batch_size)
        .def_readwrite("max_thread_num", &KVCacheConfig::max_thread_num);
    py::class_<KVCache>(kvcache_module, "KVCache")
        .def(py::init<KVCacheConfig>())
        .def("get_cache_total_len", &KVCache::get_cache_total_len)
        .def("update_cache_total_len",
             [](KVCache &kvcache, int cache_total_len) {
                 kvcache.update_cache_total_len(cache_total_len);
             })
        .def("attn", &KVCacheBindings::AttnBindings::cpuinfer_interface)
        .def(
            "get_all_kvcache_one_layer",
            &KVCacheBindings::GetAllKVCacheOneLayerBindings::cpuinfer_interface)
        .def("get_and_update_kvcache_fp16",
             &KVCacheBindings::GetAndUpdateKVCacheFp16Bindings::
                 cpuinfer_interface)
        .def("get_kvcache_fp16",
             &KVCacheBindings::GetKVCacheFp16Bindings::cpuinfer_interface)
        .def("update_kvcache_fp16",
             &KVCacheBindings::UpdateKVCacheFp16Bindings::cpuinfer_interface)
        .def("update_importance",
             &KVCacheBindings::UpdateImportanceBindings::cpuinfer_interface)
        .def("attn_with_kvcache",
             &KVCacheBindings::AttnWithKVCacheBindings::cpuinfer_interface)
        .def("clear_importance_all_layers",
             &KVCacheBindings::ClearImportanceAllLayersBindings::
                 cpuinfer_interface)
        .def("calc_anchor_all_layers",
             &KVCacheBindings::CalcAnchorAllLayersBindinds::cpuinfer_interface);
}

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/amx/debug_sft_moe.hpp
================================================
/**
 * @Description  : Mainly used for dev debug, with no numa version
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_SFT_AMX_MOE_H
#define CPUINFER_OPERATOR_SFT_AMX_MOE_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>
#include <fstream>
#include <filesystem>

#include "debug_sft_moe.hpp"

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

#include "la/amx.hpp"

#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>
void *numa_alloc_aligned(size_t size, int node, size_t alignment) {
  void *ptr = numa_alloc_onnode(size, node);
  assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
  return ptr;
}
#endif

// static inline __m512 exp_avx512(__m512 x) {
//   const __m512 log2e = _mm512_set1_ps(1.44269504089f);
//   const __m512 c1 = _mm512_set1_ps(0.69314718056f);

//   __m512 y = _mm512_mul_ps(x, log2e);
//   __m512i int_part = _mm512_cvtps_epi32(y);
//   __m512 frac_part = _mm512_sub_ps(y, _mm512_cvtepi32_ps(int_part));

//   const __m512 poly_1 = _mm512_set1_ps(0.9999999995f);
//   const __m512 poly_2 = _mm512_set1_ps(0.6931471805f);
//   const __m512 poly_3 = _mm512_set1_ps(0.2402265069f);
//   const __m512 poly_4 = _mm512_set1_ps(0.0555041087f);
//   const __m512 poly_5 = _mm512_set1_ps(0.0096181291f);
//   const __m512 poly_6 = _mm512_set1_ps(0.0013333558f);

//   __m512 frac_exp = _mm512_fmadd_ps(
//       frac_part, poly_6,
//       _mm512_fmadd_ps(frac_part, poly_5,
//                       _mm512_fmadd_ps(frac_part, poly_4,
//                                       _mm512_fmadd_ps(frac_part, poly_3, _mm512_fmadd_ps(frac_part, poly_2, poly_1)))));

//   __m512 two_pow_i = _mm512_scalef_ps(_mm512_set1_ps(1.0f), _mm512_cvtepi32_ps(int_part));
//   return _mm512_mul_ps(two_pow_i, frac_exp);
// }

// static inline __m512 act_fn(__m512 gate_val, __m512 up_val) {
//   __m512 neg_gate_val = _mm512_sub_ps(_mm512_setzero_ps(), gate_val);
//   __m512 exp_neg_gate = exp_avx512(neg_gate_val);
//   __m512 denom = _mm512_add_ps(_mm512_set1_ps(1.0f), exp_neg_gate);
//   __m512 act_val = _mm512_div_ps(gate_val, denom);

//   return _mm512_mul_ps(act_val, up_val);
// }

static inline __m512 sigmoid(__m512 x) {
  __m512 neg = _mm512_sub_ps(_mm512_setzero_ps(), x);
  __m512 e = exp_avx512(neg);
  __m512 denom = _mm512_add_ps(_mm512_set1_ps(1.0f), e);
  return _mm512_div_ps(_mm512_set1_ps(1.0f), denom);
}

static inline __m512 act_fn_1(__m512 x) {
  __m512 sigmoid_val = sigmoid(x);
  return _mm512_mul_ps(sigmoid_val, x);
}

static inline __m512 act_fn_grad(__m512 x) {
  // sigmoid(x) * (1 + x * (1 - sigmoid(x)))
  __m512 sigmoid_val = sigmoid(x);
  __m512 one_minus_sigmoid = _mm512_sub_ps(_mm512_set1_ps(1.0f), sigmoid_val);
  __m512 x_term = _mm512_mul_ps(x, one_minus_sigmoid);
  __m512 one_plus_x_term = _mm512_add_ps(_mm512_set1_ps(1.0f), x_term);
  return _mm512_mul_ps(sigmoid_val, one_plus_x_term);
}

// static inline float bf16_to_fp32(ggml_bf16_t v) {
//     uint16_t lo16;
//     std::memcpy(&lo16, &v, sizeof(lo16));   // 取出 16 bit 数据
//     uint32_t tmp = uint32_t(lo16) << 16; // 放到高 16 位
//     float out;
//     std::memcpy(&out, &tmp, sizeof(float));
//     return out;
// }

// 把 ggml_bf16_t 数组转换成可读字符串（逗号分隔）
std::string int8_row_to_string(const int8_t* row, int len) {
    std::string s;
    for (int i = 0; i < len; ++i) {
        if (i) s += ", ";
        s += std::to_string(row[i]);
    }
    return s;
}

struct SFT_AMX_MOEConfig {
  int expert_num;
  int routed_expert_num;
  int hidden_size;
  int intermediate_size;
  int max_len;
  void *gate_proj;
  void *up_proj;
  void *down_proj;

  SFT_AMX_MOEConfig() {}

  SFT_AMX_MOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int max_len,
                void *gate_proj, void *up_proj, void *down_proj)
      : expert_num(expert_num), routed_expert_num(routed_expert_num), hidden_size(hidden_size),
        intermediate_size(intermediate_size), max_len(max_len), gate_proj(gate_proj), up_proj(up_proj),
        down_proj(down_proj) {}
};

template <class T> class SFT_AMX_MOE {
private:
  SFT_AMX_MOEConfig config_;
  void *gate_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  void *up_proj_;   // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  void *down_proj_; // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]

  void *gate_proj_t_; // [expert_num * intermediate_size * hidden_size]
  void *up_proj_t_;   // [expert_num * intermediate_size * hidden_size]
  void *down_proj_t_; // [expert_num * hidden_size * intermediate_size]

  ggml_bf16_t *m_local_input_;       // [routed_expert_num * max_len * hidden_size]
  ggml_bf16_t *m_local_gate_output_; // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_up_output_;   // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_down_output_; // [routed_expert_num * max_len * hidden_size]

  std::vector<std::vector<int>> m_local_pos_;          // [max_len, routed_expert_num]
  std::vector<int> m_local_num_;                       // [expert_num]
  std::vector<int> m_expert_id_map_;                   // [expert_num]
  std::vector<ggml_bf16_t *> m_local_input_ptr_;       // [expert_num]
  std::vector<ggml_bf16_t *> m_local_gate_output_ptr_; // [expert_num]
  std::vector<ggml_bf16_t *> m_local_up_output_ptr_;   // [expert_num]
  std::vector<ggml_bf16_t *> m_local_down_output_ptr_; // [expert_num]

  std::vector<std::shared_ptr<typename T::BufferA>> gate_up_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> gate_bc_;
  std::vector<std::shared_ptr<typename T::BufferC>> up_bc_;
  std::vector<std::shared_ptr<typename T::BufferA>> down_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> down_bc_;

#ifdef USE_NUMA
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> gate_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> up_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> down_bb_numa_;
#else
  std::vector<std::shared_ptr<typename T::BufferB>> gate_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> up_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> down_bb_;
#endif

  ggml_bf16_t *m_local_down_output_grad_;       // [routed_expert_num * max_len * hidden_size]
  ggml_bf16_t *m_local_down_input_grad_;        // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_gate_output_grad_;       // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_up_output_grad_;         // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_gate_input_grad_;        // [routed_expert_num * max_len * hidden_size]
  ggml_bf16_t *m_local_up_input_grad_;          // [routed_expert_num * max_len * hidden_size]

  std::vector<ggml_bf16_t *> m_local_down_output_grad_ptr_;       // [expert_num]
  std::vector<ggml_bf16_t *> m_local_down_input_grad_ptr_;        // [expert_num]
  std::vector<ggml_bf16_t *> m_local_gate_output_grad_ptr_;       // [expert_num]
  std::vector<ggml_bf16_t *> m_local_up_output_grad_ptr_;         // [expert_num]
  std::vector<ggml_bf16_t *> m_local_gate_input_grad_ptr_;        // [expert_num]
  std::vector<ggml_bf16_t *> m_local_up_input_grad_ptr_;          // [expert_num]

  std::vector<std::shared_ptr<typename T::BufferA>> gate_t_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> gate_t_bc_;
  std::vector<std::shared_ptr<typename T::BufferA>> up_t_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> up_t_bc_;
  std::vector<std::shared_ptr<typename T::BufferA>> down_t_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> down_t_bc_;

  // TODO: NUMA
  std::vector<std::shared_ptr<typename T::BufferB>> gate_t_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> up_t_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> down_t_bb_;

  int* m_local_token_indices_;                                   // [routed_expert_num * max_len]
  int* m_local_expert_positions_;                               // [routed_expert_num * max_len]
  std::vector<int *> m_local_token_indices_ptr_;                // [expert_num]
  std::vector<int *> m_local_expert_positions_ptr_;             // [expert_num]

public:
  SFT_AMX_MOE(SFT_AMX_MOEConfig config) {
    config_ = config;
    gate_proj_ = config_.gate_proj;
    up_proj_ = config_.up_proj;
    down_proj_ = config_.down_proj;

    std::vector<std::pair<void **, uint64_t>> m_mem_requests;
    m_mem_requests.push_back({(void **)&m_local_input_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    m_mem_requests.push_back({(void **)&m_local_gate_output_, sizeof(ggml_bf16_t) * config_.routed_expert_num *
                                                                  config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_up_output_, sizeof(ggml_bf16_t) * config_.routed_expert_num *
                                                                config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_down_output_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    std::vector<void *> gate_up_ba_ptr(config_.expert_num);
    std::vector<void *> gate_bc_ptr(config_.expert_num);
    std::vector<void *> up_bc_ptr(config_.expert_num);
    std::vector<void *> down_ba_ptr(config_.expert_num);
    std::vector<void *> down_bc_ptr(config_.expert_num);
    for (int i = 0; i < config_.expert_num; i++) {
      m_mem_requests.push_back(
          {(void **)&gate_up_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.hidden_size)});
      m_mem_requests.push_back(
          {(void **)&gate_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&up_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&down_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&down_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.hidden_size)});
    }

    m_mem_requests.push_back({(void **)&gate_proj_t_,
                              sizeof(ggml_bf16_t) * config_.expert_num * config_.intermediate_size * config_.hidden_size});
    m_mem_requests.push_back({(void **)&up_proj_t_,
                              sizeof(ggml_bf16_t) * config_.expert_num * config_.intermediate_size * config_.hidden_size});
    m_mem_requests.push_back({(void **)&down_proj_t_,
                              sizeof(ggml_bf16_t) * config_.expert_num * config_.hidden_size * config_.intermediate_size});
    
    m_mem_requests.push_back({(void **)&m_local_down_output_grad_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    m_mem_requests.push_back({(void **)&m_local_down_input_grad_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_gate_output_grad_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_up_output_grad_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_gate_input_grad_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    m_mem_requests.push_back({(void **)&m_local_up_input_grad_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    m_mem_requests.push_back({(void **)&m_local_token_indices_,
                              sizeof(int) * config_.routed_expert_num * config_.max_len});
    m_mem_requests.push_back({(void **)&m_local_expert_positions_,
                              sizeof(int) * config_.routed_expert_num * config_.max_len});
    std::vector<void *> gate_t_ba_ptr(config_.expert_num);
    std::vector<void *> gate_t_bc_ptr(config_.expert_num);
    std::vector<void *> up_t_ba_ptr(config_.expert_num);
    std::vector<void *> up_t_bc_ptr(config_.expert_num);
    std::vector<void *> down_t_ba_ptr(config_.expert_num);
    std::vector<void *> down_t_bc_ptr(config_.expert_num);
    for (int i = 0; i < config_.expert_num; i++) {
      m_mem_requests.push_back(
          {(void **)&gate_t_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&gate_t_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.hidden_size)});
      m_mem_requests.push_back(
          {(void **)&up_t_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&up_t_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.hidden_size)});
      m_mem_requests.push_back(
          {(void **)&down_t_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.hidden_size)});
      m_mem_requests.push_back(
          {(void **)&down_t_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
    }

    shared_mem_buffer.alloc(this, m_mem_requests);

    m_local_pos_.resize(config_.max_len);
    for (int i = 0; i < config_.max_len; i++) {
      m_local_pos_[i].resize(config_.routed_expert_num);
    }
    m_expert_id_map_.resize(config_.expert_num);
    m_local_num_.resize(config_.expert_num);
    m_local_input_ptr_.resize(config_.expert_num);
    m_local_gate_output_ptr_.resize(config_.expert_num);
    m_local_up_output_ptr_.resize(config_.expert_num);
    m_local_down_output_ptr_.resize(config_.expert_num);
    m_local_down_output_grad_ptr_.resize(config_.expert_num);
    m_local_down_input_grad_ptr_.resize(config_.expert_num);
    m_local_gate_output_grad_ptr_.resize(config_.expert_num);
    m_local_up_output_grad_ptr_.resize(config_.expert_num);
    m_local_gate_input_grad_ptr_.resize(config_.expert_num);
    m_local_up_input_grad_ptr_.resize(config_.expert_num);

    for (uint64_t i = 0; i < config_.expert_num; i++) {
      gate_up_ba_.push_back(
          std::make_shared<typename T::BufferA>(config_.max_len, config_.hidden_size, gate_up_ba_ptr[i]));
      gate_bc_.push_back(
          std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, gate_bc_ptr[i]));
      up_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, up_bc_ptr[i]));
      down_ba_.push_back(
          std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, down_ba_ptr[i]));
      down_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, down_bc_ptr[i]));

#ifdef USE_NUMA
      int numa_nodes = numa_num_configured_nodes();
      gate_bb_numa_.resize(numa_nodes);
      up_bb_numa_.resize(numa_nodes);
      down_bb_numa_.resize(numa_nodes);
      for (int j = 0; j < numa_nodes; j++) {
        void *gate_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
        gate_bb_numa_[j].push_back(
            std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));
        void *up_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
        up_bb_numa_[j].push_back(
            std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));
        void *down_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.hidden_size, config_.intermediate_size), j, 64);
        down_bb_numa_[j].push_back(  
            std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
      }
#else
      void *gate_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      gate_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));

      void *up_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      up_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));

      void *down_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
      down_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
#endif
    }

    for (uint64_t i = 0; i < config_.expert_num; i++) {
      gate_t_ba_.push_back(
          std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, gate_t_ba_ptr[i]));
      gate_t_bc_.push_back(
          std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, gate_t_bc_ptr[i]));
      up_t_ba_.push_back(std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, up_t_ba_ptr[i]));
      up_t_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, up_t_bc_ptr[i]));
      down_t_ba_.push_back(
          std::make_shared<typename T::BufferA>(config_.max_len, config_.hidden_size, down_t_ba_ptr[i]));
      down_t_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, down_t_bc_ptr[i]));

      // TODO: NUMA
      void *gate_t_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
      gate_t_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, gate_t_bb_ptr));

      void *up_t_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
      up_t_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, up_t_bb_ptr));

      void *down_t_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      down_t_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, down_t_bb_ptr));
    }

    m_local_token_indices_ptr_.resize(config_.expert_num);
    m_local_expert_positions_ptr_.resize(config_.expert_num);
  }

  ~SFT_AMX_MOE() { shared_mem_buffer.dealloc(this); }

  void transpose_expert(const void* src, void* dst, int R, int C, Backend* backend) {
    backend->do_work_stealing_job(
        config_.expert_num, nullptr,
        [&](uint64_t expert_idx) {
          for (int r = 0; r < R; ++r) {
            for (int c = 0; c < C; ++c) {
                memcpy(
                    (uint8_t*)dst + (expert_idx * R * C + (c * R + r)) * sizeof(ggml_bf16_t),
                    (uint8_t*)src + (expert_idx * R * C + (r * C + c)) * sizeof(ggml_bf16_t),
                    sizeof(ggml_bf16_t));
            }
          }
        },
        nullptr);
  }
  
  void load_weights(Backend *backend) {
    transpose_expert(config_.gate_proj, gate_proj_t_, config_.intermediate_size, config_.hidden_size, backend);
    transpose_expert(config_.up_proj, up_proj_t_, config_.intermediate_size, config_.hidden_size, backend);
    transpose_expert(config_.down_proj, down_proj_t_, config_.hidden_size, config_.intermediate_size, backend);

    int nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
#ifdef USE_NUMA
          int numa_nodes = numa_num_configured_nodes();
          for (int j = 0; j < numa_nodes; j++) {
            gate_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.gate_proj +
                                                       expert_idx * config_.intermediate_size * config_.hidden_size,
                                                   ith, nth);
            up_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.up_proj +
                                                     expert_idx * config_.intermediate_size * config_.hidden_size,
                                                 ith, nth);
          }
#else
          gate_bb_[expert_idx]->from_mat((ggml_bf16_t *)config_.gate_proj +
                                             expert_idx * config_.intermediate_size * config_.hidden_size,
                                         ith, nth);
          up_bb_[expert_idx]->from_mat(
              (ggml_bf16_t *)config_.up_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
#endif
		},
        nullptr);
    nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
          down_t_bb_[expert_idx]->from_mat((ggml_bf16_t *)down_proj_t_ +
                                             expert_idx * config_.intermediate_size * config_.hidden_size,
                                         ith, nth);
        },
        nullptr);

	// if constexpr (std::is_same_v<typename T::dt, ggml_bf16_t>) {
	// 	// 确保 debug/ 目录存在
	// 	std::filesystem::create_directories("debug");

	// 	int tail_cols = 1024;
	// 	for (int expert_idx = 0; expert_idx < config_.expert_num; ++expert_idx) {
	// 		auto buf = down_t_bb_[expert_idx].get();
	// 		std::cout << "k: " << buf->k << "; n: " << buf->n << std::endl;
	// 		// 打开对应 expert 的文件
	// 		std::string path = "debug/" + std::to_string(expert_idx) + "_down_bb_t_debug.txt";
	// 		std::ofstream ofs(path, std::ios::out);
	// 		if (!ofs) {
	// 			std::cerr << "Failed to open " << path << " for writing\n";
	// 			continue;
	// 		}

	// 		ofs << "==== Expert " << expert_idx << " ====\n";
	// 		for (int n_idx = 0; n_idx < buf->k; ++n_idx) {
	// 			// 明确当作 int8 读
	// 			const int8_t* row = reinterpret_cast<const int8_t*>(buf->b) + n_idx * buf->n;

	// 			// 写整行
	// 			ofs << "row[" << n_idx << "] = { "
	// 				<< int8_row_to_string(row, buf->n)
	// 				<< " }\n";
	// 		}

	// 		ofs.close();
	// 	}
	// }

	// if constexpr (std::is_same_v<typename T::dt, int8_t>) {
	// 	for (int expert_idx = 0; expert_idx < config_.expert_num; ++expert_idx) {
	// 		auto buf = down_t_bb_[expert_idx].get();

	// 		// 打开对应 expert 的文件
	// 		std::string path = "debug/" + std::to_string(expert_idx) + "_down_bb_t_debug3.bin";
	// 		std::ofstream ofs(path, std::ios::binary);
	// 		for (int n_idx = 0; n_idx < buf->k; ++n_idx) {
	// 			const int8_t* row = reinterpret_cast<const int8_t*>(buf->b) + n_idx * buf->n;
	// 			for (int j = 0; j < buf->n; ++j) {
	// 				float v = row[j];
	// 				ofs.write(reinterpret_cast<const char*>(&v), sizeof(v));
	// 			}
	// 		}
	// 		ofs.close();
	// 	}
	// }

	
	// for (uint64_t expert_idx = 0; expert_idx < (uint64_t)config_.expert_num; ++expert_idx) {
	// 	// 打开对应 expert 的文件
	// 	std::string path = "debug/" + std::to_string(expert_idx) + "_up_proj_t.bin";
	// 	std::ofstream ofs(path, std::ios::binary);
	// 	std::cout << "config_.hidden_size: " << config_.hidden_size << std::endl;
	// 	std::cout << "config_.intermediate_size: " << config_.intermediate_size << std::endl;
	// 	for (int n_idx = 0; n_idx < config_.intermediate_size; ++n_idx) {
	// 		const int8_t* row = reinterpret_cast<const int8_t*>(config_.down_proj + expert_idx * n_idx * config_.hidden_size);
	// 		for (int j = 0; j < config_.hidden_size; ++j) {
	// 			float v = row[j];
	// 			ofs.write(reinterpret_cast<const char*>(&v), sizeof(v));
	// 		}
	// 	}
	// 	ofs.close();
	// }

    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
#ifdef USE_NUMA
          int numa_nodes = numa_num_configured_nodes();
          for (int j = 0; j < numa_nodes; j++) {
            down_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.down_proj +
                                                       expert_idx * config_.hidden_size * config_.intermediate_size,
                                                   ith, nth);
          }
#else
          down_bb_[expert_idx]->from_mat((ggml_bf16_t *)config_.down_proj +
                                             expert_idx * config_.hidden_size * config_.intermediate_size,
                                         ith, nth);
#endif
          up_t_bb_[expert_idx]->from_mat((ggml_bf16_t *)up_proj_t_ +
                                             expert_idx * config_.hidden_size * config_.intermediate_size,
                                         ith, nth);
          gate_t_bb_[expert_idx]->from_mat((ggml_bf16_t *)gate_proj_t_ +
                                             expert_idx * config_.hidden_size * config_.intermediate_size,
                                         ith, nth);
        },
        nullptr);
  }

  void warm_up(Backend *backend) {}

  void forward(int qlen, int k, const uint64_t *expert_ids, const float *weights, const void *input, void *output, Backend *backend) {
    bool use_amx = (qlen > 4 * config_.expert_num / config_.routed_expert_num);
    int activated_expert = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
      for (int j = 0; j < k; j++) {
        m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
      }
    }
    for (int i = 0; i < config_.expert_num; i++) {
      if (m_local_num_[i] > 0) {
        m_expert_id_map_[activated_expert] = i;
        activated_expert++;
      }
    }
    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_input_ptr_[i] = m_local_input_ + offset * config_.hidden_size;
      m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
      m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
      m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
      offset += m_local_num_[i];
    }
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int j = 0; j < k; j++) {
            memcpy(m_local_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size,
                   (ggml_bf16_t *)input + i * config_.hidden_size, sizeof(ggml_bf16_t) * config_.hidden_size);
          }
        },
        nullptr);
    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];

          gate_up_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_input_ptr_[expert_idx], 0, 1);
        },
        nullptr);
    int nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_numa_[Backend::numa_node][expert_idx], gate_bc_[expert_idx],
                       ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_numa_[Backend::numa_node][expert_idx], up_bc_[expert_idx], ith,
                       nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_[expert_idx], up_bc_[expert_idx], ith, nth, use_amx);
#endif
          gate_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], ith, nth);
          up_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_up_output_ptr_[expert_idx], ith, nth);
          auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
          for (int i = 0; i < m_local_num_[expert_idx]; i++) {
            ggml_bf16_t *gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
            ggml_bf16_t *up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
            for (int j = n_start; j < n_end; j += 32) {
              __m512 gate_val0, gate_val1, up_val0, up_val1;
              avx512_32xbf16_to_32xfp32((__m512i *)(gate_output_ptr + j), &gate_val0, &gate_val1);
              avx512_32xbf16_to_32xfp32((__m512i *)(up_output_ptr + j), &up_val0, &up_val1);
              __m512 result0 = act_fn(gate_val0, up_val0);
              __m512 result1 = act_fn(gate_val1, up_val1);
              avx512_32xfp32_to_32xbf16(&result0, &result1, (__m512i *)(gate_output_ptr + j));
            }
          }
        },
        nullptr);
    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          down_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], 0, 1);
        },
        nullptr);
	
	// for (uint64_t expert_idx = 0; expert_idx < (uint64_t)config_.expert_num; ++expert_idx) {
	// 	dump_grad_bin("cpp_layer0_E_End"+std::to_string(expert_idx)+"_down_ba_ori_", (ggml_bf16_t*)m_local_gate_output_ptr_[expert_idx], config_.intermediate_size * m_local_num_[expert_idx], GGML_TYPE_BF16);
	// }
	// if constexpr (std::is_same_v<typename T::dt, int8_t>) {
	// 	std::cout << "GO INTO forward output" << std::endl;
	// 	// 确保 debug/ 目录存在
	// 	std::filesystem::create_directories("debug");

	// 	for (int expert_idx = 0; expert_idx < config_.expert_num; ++expert_idx) {
	// 		auto buf = down_ba_[expert_idx].get();
	// 		// std::cout << "k: " << buf->k << "; n: " << buf->n << std::endl;
	// 		// 打开对应 expert 的文件
	// 		std::string path = "debug/" + std::to_string(expert_idx) + "_down_ba_debug.txt";
	// 		std::ofstream ofs(path, std::ios::out);
	// 		if (!ofs) {
	// 			std::cerr << "Failed to open " << path << " for writing\n";
	// 			continue;
	// 		}

	// 		ofs << "==== Expert " << expert_idx << " ====\n";
	// 		ofs << "buf_k: " << buf->k << "\n";
	// 		for (int n_idx = 0; n_idx < m_local_num_[expert_idx]; ++n_idx) {
	// 			// 明确当作 bfloat16 读
	// 			const int8_t* row = reinterpret_cast<const int8_t*>(buf->a) + n_idx * buf->k;

	// 			// 写整行
	// 			ofs << "row[" << n_idx << "] = { "
	// 				<< int8_row_to_string(row, buf->k)
	// 				<< " }\n";
	// 		}

	// 		ofs.close();
	// 	}
	// 	std::cout << "OUT INTO forward output" << std::endl;
	// }
    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
                       down_bb_numa_[Backend::numa_node][expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
                       down_bb_[expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#endif
          down_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_down_output_ptr_[expert_idx], ith, nth);
        },
        nullptr);
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int e = 0; e < config_.hidden_size; e += 32) {
            __m512 x0 = _mm512_setzero_ps();
            __m512 x1 = _mm512_setzero_ps();
            for (int j = 0; j < k; j++) {
              __m512 weight = _mm512_set1_ps(weights[i * k + j]);
              __m512 down_output0, down_output1;
              avx512_32xbf16_to_32xfp32((__m512i *)(m_local_down_output_ptr_[expert_ids[i * k + j]] +
                                                    m_local_pos_[i][j] * config_.hidden_size + e),
                                        &down_output0, &down_output1);
              x0 = _mm512_fmadd_ps(down_output0, weight, x0);
              x1 = _mm512_fmadd_ps(down_output1, weight, x1);
            }
            avx512_32xfp32_to_32xbf16(&x0, &x1, (__m512i *)((ggml_bf16_t *)output + i * config_.hidden_size + e));
          }
        },
        nullptr);
  }

  void backward(int qlen, int k, const uint64_t *expert_ids, const float *weights, const void* input, const void *output_grad, void *input_grad, Backend *backend) {
    bool use_amx = (qlen > 4 * config_.expert_num / config_.routed_expert_num);
    int activated_expert = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
      for (int j = 0; j < k; j++) {
        m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
      }
    }
    for (int i = 0; i < config_.expert_num; i++) {
      if (m_local_num_[i] > 0) {
        m_expert_id_map_[activated_expert] = i;
        activated_expert++;
      }
    }
    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_input_ptr_[i] = m_local_input_ + offset * config_.hidden_size;
      m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
      m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;

      m_local_down_output_grad_ptr_[i] = m_local_down_output_grad_ + offset * config_.hidden_size;
      m_local_down_input_grad_ptr_[i] = m_local_down_input_grad_ + offset * config_.intermediate_size;
      m_local_gate_output_grad_ptr_[i] = m_local_gate_output_grad_ + offset * config_.intermediate_size;
      m_local_up_output_grad_ptr_[i] = m_local_up_output_grad_ + offset * config_.intermediate_size;
      m_local_gate_input_grad_ptr_[i] = m_local_gate_input_grad_ + offset * config_.hidden_size;
      m_local_up_input_grad_ptr_[i] = m_local_up_input_grad_ + offset * config_.hidden_size;
      m_local_token_indices_ptr_[i] = m_local_token_indices_ + offset;
      m_local_expert_positions_ptr_[i] = m_local_expert_positions_ + offset;
      offset += m_local_num_[i];
    }

    // TODO: cache
    backend->do_work_stealing_job(
        qlen, nullptr, 
        [&](int i) {
          for (int j = 0; j < k; j++) {
            uint64_t expert_id = expert_ids[i * k + j];
            int local_row = m_local_pos_[i][j];
            memcpy(m_local_input_ptr_[expert_id] + local_row * config_.hidden_size,
              (ggml_bf16_t *)input + i * config_.hidden_size, sizeof(ggml_bf16_t) * config_.hidden_size); // TODO: cache
            memcpy(m_local_down_output_grad_ptr_[expert_id] + local_row * config_.hidden_size,
              (ggml_bf16_t *)output_grad + i * config_.hidden_size, sizeof(ggml_bf16_t) * config_.hidden_size);
            m_local_token_indices_ptr_[expert_id][local_row] = i;
            m_local_expert_positions_ptr_[expert_id][local_row] = j;
          }
        }, 
        nullptr);

    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          gate_up_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_input_ptr_[expert_idx], 0, 1); // TODO: cache
          down_t_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_down_output_grad_ptr_[expert_idx], 0, 1);
        },
        nullptr);
		
	// // for debug
	// for (uint64_t expert_idx = 0; expert_idx < (uint64_t)config_.expert_num; ++expert_idx) {
	// 	dump_grad_bin("cpp_layer0_E_End"+std::to_string(expert_idx)+"_down_output_grad_", (ggml_bf16_t*)m_local_down_output_grad_ptr_[expert_idx], config_.hidden_size * m_local_num_[expert_idx], GGML_TYPE_BF16);
	// }
	
	// if constexpr (std::is_same_v<typename T::dt, int8_t>) {
	// 	// 确保 debug/ 目录存在
	// 	std::filesystem::create_directories("debug");

	// 	for (int expert_idx = 0; expert_idx < config_.expert_num; ++expert_idx) {
	// 		auto buf = down_t_ba_[expert_idx].get();
	// 		// std::cout << "k: " << buf->k << "; n: " << buf->n << std::endl;
	// 		// 打开对应 expert 的文件
	// 		std::string path = "debug/" + std::to_string(expert_idx) + "_down_ba_t_debug.txt";
	// 		std::ofstream ofs(path, std::ios::out);
	// 		if (!ofs) {
	// 			std::cerr << "Failed to open " << path << " for writing\n";
	// 			continue;
	// 		}

	// 		ofs << "==== Expert " << expert_idx << " ====\n";
	// 		for (int n_idx = 0; n_idx < m_local_num_[expert_idx]; ++n_idx) {
	// 			// 明确当作 bfloat16 读
	// 			const int8_t* row = reinterpret_cast<const int8_t*>(buf->a) + n_idx * buf->k;

	// 			// 写整行
	// 			ofs << "row[" << n_idx << "] = { "
	// 				<< int8_row_to_string(row, buf->k)
	// 				<< " }\n";
	// 		}

	// 		ofs.close();
	// 	}
	// }

    int nth = T::recommended_nth(config_.intermediate_size);  
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;

        //   // TODO: cache
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                      gate_up_ba_[expert_idx], gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                      gate_up_ba_[expert_idx], up_bb_[expert_idx], up_bc_[expert_idx], ith, nth, use_amx);
          gate_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], ith, nth);
          up_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_up_output_ptr_[expert_idx], ith, nth);

          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                      down_t_ba_[expert_idx], down_t_bb_[expert_idx], down_t_bc_[expert_idx], ith, nth, use_amx);
          down_t_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_down_input_grad_ptr_[expert_idx], ith, nth);


          auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
          for (int i = 0; i < m_local_num_[expert_idx]; i++) {
            ggml_bf16_t *gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
            ggml_bf16_t *up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
            ggml_bf16_t *down_input_grad_ptr = &m_local_down_input_grad_ptr_[expert_idx][i * config_.intermediate_size];
            ggml_bf16_t *gate_output_grad_ptr = &m_local_gate_output_grad_ptr_[expert_idx][i * config_.intermediate_size];
            ggml_bf16_t *up_output_grad_ptr = &m_local_up_output_grad_ptr_[expert_idx][i * config_.intermediate_size];
            
            int token_idx = m_local_token_indices_ptr_[expert_idx][i];
            int expert_pos = m_local_expert_positions_ptr_[expert_idx][i];
            __m512 weight = _mm512_set1_ps(weights[token_idx * k + expert_pos]);
            
            for (int j = n_start; j < n_end; j += 32) {
              __m512 gate_val0, gate_val1, up_val0, up_val1, down_input_grad0, down_input_grad1;
              avx512_32xbf16_to_32xfp32((__m512i *)(gate_output_ptr + j), &gate_val0, &gate_val1);
              avx512_32xbf16_to_32xfp32((__m512i *)(up_output_ptr + j), &up_val0, &up_val1);
              avx512_32xbf16_to_32xfp32((__m512i *)(down_input_grad_ptr + j), &down_input_grad0, &down_input_grad1);
              
              down_input_grad0 = _mm512_mul_ps(down_input_grad0, weight);
              down_input_grad1 = _mm512_mul_ps(down_input_grad1, weight);
              
              // gate_output_grad = δ_zji ⊙ v_ji ⊙ σ'(u_ji)
              __m512 gate_grad0 = _mm512_mul_ps(down_input_grad0, 
                                               _mm512_mul_ps(up_val0, act_fn_grad(gate_val0)));
              __m512 gate_grad1 = _mm512_mul_ps(down_input_grad1, 
                                               _mm512_mul_ps(up_val1, act_fn_grad(gate_val1)));
              
              // up_output_grad = δ_zji ⊙ σ(u_ji)
              __m512 up_grad0 = _mm512_mul_ps(down_input_grad0, act_fn_1(gate_val0));
              __m512 up_grad1 = _mm512_mul_ps(down_input_grad1, act_fn_1(gate_val1));
              
              avx512_32xfp32_to_32xbf16(&gate_grad0, &gate_grad1, (__m512i *)(gate_output_grad_ptr + j));
              avx512_32xfp32_to_32xbf16(&up_grad0, &up_grad1, (__m512i *)(up_output_grad_ptr + j));
            }
          }
        },
        nullptr);

	// for debug
	// if constexpr (std::is_same_v<typename T::dt, ggml_bf16_t>) {	
	// 	for (int expert_idx = 0; expert_idx < config_.expert_num; ++expert_idx) {
	// 		auto buf = down_t_ba_[expert_idx].get();

	// 		// 打开对应 expert 的文件
	// 		std::string path = "debug/" + std::to_string(expert_idx) + "_down_ba_t_debug3.bin";
	// 		std::ofstream ofs(path, std::ios::binary);
	// 		for (int n_idx = 0; n_idx < m_local_num_[expert_idx]; ++n_idx) {
	// 			const ggml_bf16_t* row = reinterpret_cast<const ggml_bf16_t*>(buf->a) + n_idx * buf->k;
	// 			for (int j = 0; j < buf->k; ++j) {
	// 				float v = row[j];
	// 				ofs.write(reinterpret_cast<const char*>(&v), sizeof(v));
	// 			}
	// 		}
	// 		ofs.close();
	// 	}
	// }
	
	// for (uint64_t expert_idx = 0; expert_idx < (uint64_t)config_.expert_num; ++expert_idx) {
	// 	dump_grad_bin("cpp_layer0_E_End"+std::to_string(expert_idx)+"_down_t_ba_", (ggml_bf16_t*)m_local_down_output_grad_ptr_[expert_idx], config_.hidden_size * m_local_num_[expert_idx], GGML_TYPE_BF16);
	// }
	
	// for (uint64_t expert_idx = 0; expert_idx < (uint64_t)config_.expert_num; ++expert_idx) {
	// 	dump_grad_bin("cpp_layer0_E_End"+std::to_string(expert_idx)+"_down_t_bb_", (ggml_bf16_t *)down_proj_t_ + expert_idx * config_.intermediate_size * config_.hidden_size, config_.hidden_size * config_.intermediate_size, GGML_TYPE_BF16);
	// }

	// for (uint64_t expert_idx = 0; expert_idx < (uint64_t)config_.expert_num; ++expert_idx) {
	// 	dump_grad_bin("cpp_layer0_E_End"+std::to_string(expert_idx)+"_down_t_bc_", (ggml_bf16_t*)m_local_down_input_grad_ptr_[expert_idx], config_.intermediate_size * m_local_num_[expert_idx], GGML_TYPE_BF16);
	// }

    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          gate_t_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_gate_output_grad_ptr_[expert_idx], 0, 1);
          up_t_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_up_output_grad_ptr_[expert_idx], 0, 1);
        },
        nullptr);
    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size,
                      gate_t_ba_[expert_idx], gate_t_bb_[expert_idx], gate_t_bc_[expert_idx], ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size,
                      up_t_ba_[expert_idx], up_t_bb_[expert_idx], up_t_bc_[expert_idx], ith, nth, use_amx);
          gate_t_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_gate_input_grad_ptr_[expert_idx], ith, nth);
          up_t_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_up_input_grad_ptr_[expert_idx], ith, nth);
        },
        nullptr);
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int e = 0; e < config_.hidden_size; e += 32) {
            __m512 x0 = _mm512_setzero_ps();
            __m512 x1 = _mm512_setzero_ps();
            for (int j = 0; j < k; j++) {
              __m512 gate_input_grad0, gate_input_grad1, up_input_grad0, up_input_grad1;
              avx512_32xbf16_to_32xfp32((__m512i *)(m_local_gate_input_grad_ptr_[expert_ids[i * k + j]] +
                                                    m_local_pos_[i][j] * config_.hidden_size + e),
                                        &gate_input_grad0, &gate_input_grad1);
              avx512_32xbf16_to_32xfp32((__m512i *)(m_local_up_input_grad_ptr_[expert_ids[i * k + j]] +
                                                    m_local_pos_[i][j] * config_.hidden_size + e),
                                        &up_input_grad0, &up_input_grad1);
              x0 = _mm512_add_ps(gate_input_grad0, x0);
              x1 = _mm512_add_ps(gate_input_grad1, x1);
              x0 = _mm512_add_ps(up_input_grad0, x0);
              x1 = _mm512_add_ps(up_input_grad1, x1);
            }
            avx512_32xfp32_to_32xbf16(&x0, &x1, (__m512i *)((ggml_bf16_t *)input_grad + i * config_.hidden_size + e));
          }
        },
        nullptr);
  }
};
#endif

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/amx/debug_tools_sft_moe.hpp
================================================
#ifndef SFT_DEBUG_HPP
#define SFT_DEBUG_HPP

#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <string>
#include <iostream>

inline std::string get_env_or_default(const char *var_name, const std::string &default_value) {
	const char *value = std::getenv(var_name);
	return (value != nullptr) ? std::string(value) : default_value;
}

/* use example:  
	for (uint64_t expert_idx = 0; expert_idx < (uint64_t)config_.expert_num; ++expert_idx) {
		dump_grad_bin("layer0_E_End"+std::to_string(expert_idx)+"_gate_proj_out_trans_", (uint8_t*)gate_proj_t_ + expert_idx * config_.hidden_size * config_.intermediate_size * ggml_type_size(config_.grad_type), config_.hidden_size * config_.intermediate_size, config_.grad_type);
		std::cout << "gate_proj_t_:" << static_cast<const void*>((uint8_t*)gate_proj_t_ + expert_idx * config_.hidden_size * config_.intermediate_size) << ", grad_type: " << config_.grad_type << std::endl;
	}
*/
inline void dump_grad_bin(const std::string &file_name,
                          const void       *data,
                          size_t            elem_cnt,
                          ggml_type         dtype,
						  std::streamoff    offset_bytes = 0)
{
    std::string path = get_env_or_default("SFT_DEBUG_PATH","debug") + "/" + file_name;
    switch (dtype) {
        case GGML_TYPE_F32:  path += ".f32";  break;
        case GGML_TYPE_F16:  path += ".f16";  break;
        case GGML_TYPE_BF16: path += ".bf16"; break;
		case GGML_TYPE_I8: path += ".int8"; break;
        default:             path += ".raw";  break;
    }
	std::fstream f(path, std::ios::in | std::ios::out | std::ios::binary);
    if (!f.is_open()) {
        std::ofstream tmp(path, std::ios::out | std::ios::binary);
        tmp.close();
        f.open(path, std::ios::in | std::ios::out | std::ios::binary);
    }

    f.seekp(offset_bytes * ggml_type_size(dtype));
	// std::cout << "seekp: " << offset_bytes * ggml_type_size(dtype) << std::endl;

    f.write(reinterpret_cast<const char*>(data), static_cast<std::streamsize>(elem_cnt * ggml_type_size(dtype)));
    f.close();
}

// inline void dump_bin(std::string file_name, float16_t *data, size_t count) {
//   file_name = get_env_or_default("SFT_DEBUG_PATH", "debug") + "/" + file_name + ".f16";
//   std::ofstream f(file_name, std::ios::binary);
//   f.write(reinterpret_cast<const char *>(data), count * sizeof(*data));
//   f.close();
// }
inline void dump_bin(std::string file_name, float *data, size_t count) {
	file_name = get_env_or_default("SFT_DEBUG_PATH", "debug") + "/" + file_name + ".f32";
	std::cout << file_name << std::endl;
	std::ofstream f(file_name, std::ios::binary);
	f.write(reinterpret_cast<const char *>(data), count * sizeof(*data));
	f.close();
}
inline void dump_bin(std::string file_name, int64_t *data, size_t count) {
	file_name = get_env_or_default("SFT_DEBUG_PATH", "debug") + "/" + file_name + ".int64";
	std::cout << file_name << std::endl;
	std::ofstream f(file_name, std::ios::binary);
	f.write(reinterpret_cast<const char *>(data), count * sizeof(*data));
	f.close();
}
inline void dump_bin(std::string file_name, uint8_t *data, size_t count) {
	file_name = get_env_or_default("SFT_DEBUG_PATH", "debug") + "/" + file_name + ".uint8";
	std::cout << file_name << std::endl;
	std::ofstream f(file_name, std::ios::binary);
	f.write(reinterpret_cast<const char *>(data), count * sizeof(*data));
	f.close();
}

#endif

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/amx/la/amx.hpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#pragma once
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <immintrin.h>
#include <iostream>
#include <random>
#include <stdexcept>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

#include "utils.hpp"
#include <memory>

#if (defined(_WIN32) || defined(_WIN64))
#define RESTRICT __restrict
#else
#define RESTRICT __restrict__
#endif

#if (defined(_WIN32) || defined(_WIN64))
#define ALWAYS_INLINE __forceinline
#elif __has_attribute(always_inline) || defined(__GNUC__)
#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
#else
#define ALWAYS_INLINE inline
#endif

namespace amx {

#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18

const int TMMCount = 8;
const int MaxTileHeight = 16;
const int MaxTileWidth = 64;

const int AMX_BLK_SIZE = 32;

#define TMM0 0
#define TMM1 1
#define TMM2 2
#define TMM3 3
#define TMM4 4
#define TMM5 5
#define TMM6 6
#define TMM7 7

inline bool enable_amx() {
  static thread_local bool initialized = false;
  if (initialized) {
    return true;
  }
  initialized = true;

  if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
    printf("\n Fail to do XFEATURE_XTILEDATA \n\n");
    return false;
  } else {
    // printf("\n TILE DATA USE SET - OK \n\n");
    return true;
  }
  return true;
}

struct alignas(64) TileConfig {
  uint8_t palette;
  uint8_t start_row;
  std::array<uint8_t, 14> __0 = {};
  std::array<uint16_t, 8> colsb;
  std::array<uint8_t, 16> __1 = {};
  std::array<uint8_t, 8> rows;
  std::array<uint8_t, 8> __2 = {};

  TileConfig() {
    palette = 1;
    start_row = 0;
    for (int i = 0; i < 8; i++) {
      set_row_col(i, 0, 0);
    }
  }

  void set_row_col(int i, uint8_t row, uint16_t col) {
    colsb[i] = col;
    rows[i] = row;
  }

  void set_config() { _tile_loadconfig(this); }

  static void load_data(int to, void *from, size_t stride) {
    switch (to) {
    case 0:
      _tile_loadd(0, from, stride);
      break;
    case 1:
      _tile_loadd(1, from, stride);
      break;
    case 2:
      _tile_loadd(2, from, stride);
      break;
    case 3:
      _tile_loadd(3, from, stride);
      break;
    case 4:
      _tile_loadd(4, from, stride);
      break;
    case 5:
      _tile_loadd(5, from, stride);
      break;
    case 6:
      _tile_loadd(6, from, stride);
      break;
    case 7:
      _tile_loadd(7, from, stride);
      break;
    default:
      throw std::runtime_error("no such tile");
    }
  }

  static void store_data(int from, void *to, size_t stride) {
    switch (from) {
    case 0:
      _tile_stored(0, to, stride);
      break;
    case 1:
      _tile_stored(1, to, stride);
      break;
    case 2:
      _tile_stored(2, to, stride);
      break;
    case 3:
      _tile_stored(3, to, stride);
      break;
    case 4:
      _tile_stored(4, to, stride);
      break;
    case 5:
      _tile_stored(5, to, stride);
      break;
    case 6:
      _tile_stored(6, to, stride);
      break;
    case 7:
      _tile_stored(7, to, stride);
      break;
    default:
      throw std::runtime_error("no such tile");
    }
  }
};

static_assert(sizeof(TileConfig) == 64);

inline void debug_tile(int t) {
  printf("Tile %d\n", t);
  uint8_t data[16][64] = {};
  TileConfig::store_data(t, data, 64);
  for (int i = 0; i < 16; i++) {
    for (int j = 0; j < 64; j++) {
      printf("%3d ", data[i][j]);
    }
    printf("\n");
  }
  printf("\n");
}

inline void debug_tiles(int to = 8) {
  for (int i = 0; i < to; i++) {
    debug_tile(i);
  }
}

inline void debug_m512(__m512 x) {
  float data[16];
  _mm512_storeu_ps(data, x);
  for (int i = 0; i < 16; i++) {
    printf("%f ", data[i]);
  }
  printf("\n");
}

// transpose utils
inline void transpose_16x16_32bit(__m512i *v) {
  __m512i v1[16];
  v1[0] = _mm512_unpacklo_epi32(v[0], v[1]);
  v1[1] = _mm512_unpackhi_epi32(v[0], v[1]);
  v1[2] = _mm512_unpacklo_epi32(v[2], v[3]);
  v1[3] = _mm512_unpackhi_epi32(v[2], v[3]);
  v1[4] = _mm512_unpacklo_epi32(v[4], v[5]);
  v1[5] = _mm512_unpackhi_epi32(v[4], v[5]);
  v1[6] = _mm512_unpacklo_epi32(v[6], v[7]);
  v1[7] = _mm512_unpackhi_epi32(v[6], v[7]);
  v1[8] = _mm512_unpacklo_epi32(v[8], v[9]);
  v1[9] = _mm512_unpackhi_epi32(v[8], v[9]);
  v1[10] = _mm512_unpacklo_epi32(v[10], v[11]);
  v1[11] = _mm512_unpackhi_epi32(v[10], v[11]);
  v1[12] = _mm512_unpacklo_epi32(v[12], v[13]);
  v1[13] = _mm512_unpackhi_epi32(v[12], v[13]);
  v1[14] = _mm512_unpacklo_epi32(v[14], v[15]);
  v1[15] = _mm512_unpackhi_epi32(v[14], v[15]);

  v[0] = _mm512_unpacklo_epi64(v1[0], v1[2]);
  v[1] = _mm512_unpackhi_epi64(v1[0], v1[2]);
  v[2] = _mm512_unpacklo_epi64(v1[1], v1[3]);
  v[3] = _mm512_unpackhi_epi64(v1[1], v1[3]);
  v[4] = _mm512_unpacklo_epi64(v1[4], v1[6]);
  v[5] = _mm512_unpackhi_epi64(v1[4], v1[6]);
  v[6] = _mm512_unpacklo_epi64(v1[5], v1[7]);
  v[7] = _mm512_unpackhi_epi64(v1[5], v1[7]);
  v[8] = _mm512_unpacklo_epi64(v1[8], v1[10]);
  v[9] = _mm512_unpackhi_epi64(v1[8], v1[10]);
  v[10] = _mm512_unpacklo_epi64(v1[9], v1[11]);
  v[11] = _mm512_unpackhi_epi64(v1[9], v1[11]);
  v[12] = _mm512_unpacklo_epi64(v1[12], v1[14]);
  v[13] = _mm512_unpackhi_epi64(v1[12], v1[14]);
  v[14] = _mm512_unpacklo_epi64(v1[13], v1[15]);
  v[15] = _mm512_unpackhi_epi64(v1[13], v1[15]);

  v1[0] = _mm512_shuffle_i32x4(v[0], v[4], 0x88);
  v1[1] = _mm512_shuffle_i32x4(v[1], v[5], 0x88);
  v1[2] = _mm512_shuffle_i32x4(v[2], v[6], 0x88);
  v1[3] = _mm512_shuffle_i32x4(v[3], v[7], 0x88);
  v1[4] = _mm512_shuffle_i32x4(v[0], v[4], 0xdd);
  v1[5] = _mm512_shuffle_i32x4(v[1], v[5], 0xdd);
  v1[6] = _mm512_shuffle_i32x4(v[2], v[6], 0xdd);
  v1[7] = _mm512_shuffle_i32x4(v[3], v[7], 0xdd);
  v1[8] = _mm512_shuffle_i32x4(v[8], v[12], 0x88);
  v1[9] = _mm512_shuffle_i32x4(v[9], v[13], 0x88);
  v1[10] = _mm512_shuffle_i32x4(v[10], v[14], 0x88);
  v1[11] = _mm512_shuffle_i32x4(v[11], v[15], 0x88);
  v1[12] = _mm512_shuffle_i32x4(v[8], v[12], 0xdd);
  v1[13] = _mm512_shuffle_i32x4(v[9], v[13], 0xdd);
  v1[14] = _mm512_shuffle_i32x4(v[10], v[14], 0xdd);
  v1[15] = _mm512_shuffle_i32x4(v[11], v[15], 0xdd);

  v[0] = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88);
  v[1] = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88);
  v[2] = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88);
  v[3] = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88);
  v[4] = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88);
  v[5] = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88);
  v[6] = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88);
  v[7] = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88);
  v[8] = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd);
  v[9] = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd);
  v[10] = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd);
  v[11] = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd);
  v[12] = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd);
  v[13] = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd);
  v[14] = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd);
  v[15] = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd);
}

/*
  Transpose 16x16 32-bit elements
  Note that v must be 64 byte aligned
*/
inline void transpose_16x16_32bit(__m512i *v, size_t stride) {
  assert(reinterpret_cast<intptr_t>(v) % 64 == 0 && "v must be 64 aligned");

  auto stride_v = [=](int i) { return offset_pointer(v, i * stride); };
  __m512i v1[16];

  v1[0] = _mm512_unpacklo_epi32(*stride_v(0), *stride_v(1));
  v1[1] = _mm512_unpackhi_epi32(*stride_v(0), *stride_v(1));
  v1[2] = _mm512_unpacklo_epi32(*stride_v(2), *stride_v(3));
  v1[3] = _mm512_unpackhi_epi32(*stride_v(2), *stride_v(3));
  v1[4] = _mm512_unpacklo_epi32(*stride_v(4), *stride_v(5));
  v1[5] = _mm512_unpackhi_epi32(*stride_v(4), *stride_v(5));
  v1[6] = _mm512_unpacklo_epi32(*stride_v(6), *stride_v(7));
  v1[7] = _mm512_unpackhi_epi32(*stride_v(6), *stride_v(7));
  v1[8] = _mm512_unpacklo_epi32(*stride_v(8), *stride_v(9));
  v1[9] = _mm512_unpackhi_epi32(*stride_v(8), *stride_v(9));
  v1[10] = _mm512_unpacklo_epi32(*stride_v(10), *stride_v(11));
  v1[11] = _mm512_unpackhi_epi32(*stride_v(10), *stride_v(11));
  v1[12] = _mm512_unpacklo_epi32(*stride_v(12), *stride_v(13));
  v1[13] = _mm512_unpackhi_epi32(*stride_v(12), *stride_v(13));
  v1[14] = _mm512_unpacklo_epi32(*stride_v(14), *stride_v(15));
  v1[15] = _mm512_unpackhi_epi32(*stride_v(14), *stride_v(15));

  *stride_v(0) = _mm512_unpacklo_epi64(v1[0], v1[2]);
  *stride_v(1) = _mm512_unpackhi_epi64(v1[0], v1[2]);
  *stride_v(2) = _mm512_unpacklo_epi64(v1[1], v1[3]);
  *stride_v(3) = _mm512_unpackhi_epi64(v1[1], v1[3]);
  *stride_v(4) = _mm512_unpacklo_epi64(v1[4], v1[6]);
  *stride_v(5) = _mm512_unpackhi_epi64(v1[4], v1[6]);
  *stride_v(6) = _mm512_unpacklo_epi64(v1[5], v1[7]);
  *stride_v(7) = _mm512_unpackhi_epi64(v1[5], v1[7]);
  *stride_v(8) = _mm512_unpacklo_epi64(v1[8], v1[10]);
  *stride_v(9) = _mm512_unpackhi_epi64(v1[8], v1[10]);
  *stride_v(10) = _mm512_unpacklo_epi64(v1[9], v1[11]);
  *stride_v(11) = _mm512_unpackhi_epi64(v1[9], v1[11]);
  *stride_v(12) = _mm512_unpacklo_epi64(v1[12], v1[14]);
  *stride_v(13) = _mm512_unpackhi_epi64(v1[12], v1[14]);
  *stride_v(14) = _mm512_unpacklo_epi64(v1[13], v1[15]);
  *stride_v(15) = _mm512_unpackhi_epi64(v1[13], v1[15]);

  v1[0] = _mm512_shuffle_i32x4(*stride_v(0), *stride_v(4), 0x88);
  v1[1] = _mm512_shuffle_i32x4(*stride_v(1), *stride_v(5), 0x88);
  v1[2] = _mm512_shuffle_i32x4(*stride_v(2), *stride_v(6), 0x88);
  v1[3] = _mm512_shuffle_i32x4(*stride_v(3), *stride_v(7), 0x88);
  v1[4] = _mm512_shuffle_i32x4(*stride_v(0), *stride_v(4), 0xdd);
  v1[5] = _mm512_shuffle_i32x4(*stride_v(1), *stride_v(5), 0xdd);
  v1[6] = _mm512_shuffle_i32x4(*stride_v(2), *stride_v(6), 0xdd);
  v1[7] = _mm512_shuffle_i32x4(*stride_v(3), *stride_v(7), 0xdd);
  v1[8] = _mm512_shuffle_i32x4(*stride_v(8), *stride_v(12), 0x88);
  v1[9] = _mm512_shuffle_i32x4(*stride_v(9), *stride_v(13), 0x88);
  v1[10] = _mm512_shuffle_i32x4(*stride_v(10), *stride_v(14), 0x88);
  v1[11] = _mm512_shuffle_i32x4(*stride_v(11), *stride_v(15), 0x88);
  v1[12] = _mm512_shuffle_i32x4(*stride_v(8), *stride_v(12), 0xdd);
  v1[13] = _mm512_shuffle_i32x4(*stride_v(9), *stride_v(13), 0xdd);
  v1[14] = _mm512_shuffle_i32x4(*stride_v(10), *stride_v(14), 0xdd);
  v1[15] = _mm512_shuffle_i32x4(*stride_v(11), *stride_v(15), 0xdd);

  *stride_v(0) = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88);
  *stride_v(1) = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88);
  *stride_v(2) = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88);
  *stride_v(3) = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88);
  *stride_v(4) = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88);
  *stride_v(5) = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88);
  *stride_v(6) = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88);
  *stride_v(7) = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88);
  *stride_v(8) = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd);
  *stride_v(9) = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd);
  *stride_v(10) = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd);
  *stride_v(11) = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd);
  *stride_v(12) = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd);
  *stride_v(13) = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd);
  *stride_v(14) = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd);
  *stride_v(15) = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd);
}

struct GemmKernel224BF {
  using dt = ggml_bf16_t;
  using output_t = float;
  static const int TILE_M = 16;
  static const int TILE_K = 32;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 2;

  static inline constexpr int M_STEP = TILE_M * 2;
  static inline constexpr int N_STEP = TILE_N * 2;
  static inline constexpr int K_STEP = TILE_K;

  static inline const int N_BLOCK = 256;
  static inline const int K_BLOCK = 1792;

  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {
    enable_amx();
    TileConfig tile_config;

    // size is 16 x 32
    for (int i = 0; i < 2; i++)
      tile_config.set_row_col(i, TILE_M, TILE_K * sizeof(dt));

    // size is 16 x 32
    for (int i = 2; i < 4; i++)
      tile_config.set_row_col(i, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK * sizeof(dt));

    // size is 16 x 16
    for (int i = 4; i < 8; i++)
      tile_config.set_row_col(i, TILE_M, TILE_N * sizeof(output_t));

    tile_config.set_config();
  }

  static void load_a(dt *a, size_t lda) {
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
  }

  static void load_b(dt *b, size_t ldb) {
    _tile_loadd(2, b, ldb);
    _tile_loadd(3, offset_pointer(b, ldb * TILE_N), ldb);
  }

  static void clean_c() {
    _tile_zero(4);
    _tile_zero(5);
    _tile_zero(6);
    _tile_zero(7);
  }

  static void load_c(output_t *c, size_t ldc) {
    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_loadd(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
  }

  static void store_c(output_t *c, size_t ldc) {
    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_stored(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
  }

  static void run_tile() {
    _tile_dpbf16ps(4, 0, 2);
    _tile_dpbf16ps(5, 0, 3);
    _tile_dpbf16ps(6, 1, 2);
    _tile_dpbf16ps(7, 1, 3);
  }

  struct BufferA {
    ggml_bf16_t *a;
    int max_m, k;

    static size_t required_size(int max_m, int k) { return max_m * k * sizeof(ggml_bf16_t); }

    BufferA(int max_m, int k, void *ptr) : max_m(max_m), k(k) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(max_m % M_STEP == 0);
      assert(k % K_STEP == 0);
      a = reinterpret_cast<ggml_bf16_t *>(ptr);
    }

    void from_mat(int m, ggml_bf16_t *src, int ith, int nth) {
      assert(m <= max_m);
      assert(ith == 0 && nth == 1);
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
        for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
          int k_block_size = std::min(K_BLOCK, k - k_block_begin);
          for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
            for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
              __m512i *s = (__m512i *)(src + (m_begin + i) * k + k_block_begin + k_begin);
              __m512i *d = (__m512i *)(a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP +
                                       i * K_STEP);
              avx512_copy_32xbf16(s, d);
            }
          }
        }
      }
    }

    ggml_bf16_t *get_submat(int m, int k, int m_begin, int k_begin) {
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
      k_begin -= k_block_begin;
      int k_block_size = std::min(K_BLOCK, k - k_block_begin);
      return a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP;
    }
  };

  struct BufferB {
    ggml_bf16_t *b;
    int n, k;

    static size_t required_size(int n, int k) { return n * k * sizeof(ggml_bf16_t); }

    BufferB(int n, int k, void *ptr) : n(n), k(k) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(n % N_STEP == 0);
      assert(k % K_STEP == 0);
      b = reinterpret_cast<ggml_bf16_t *>(ptr);
    }

    void from_mat(ggml_bf16_t *src, int ith, int nth) {
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
          int k_block_size = std::min(K_BLOCK, k - k_block_begin);
          for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
            for (int i = 0; i < N_STEP; i++) {
              __m512i *s = (__m512i *)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin);
              __m512i *d = (__m512i *)(b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                                       k_begin * N_STEP + i * K_STEP);
              avx512_copy_32xbf16(s, d);
            }
            transpose_16x16_32bit((__m512i *)(b + n_block_begin * k + k_block_begin * n_block_size +
                                              n_begin * k_block_size + k_begin * N_STEP));
            transpose_16x16_32bit((__m512i *)(b + n_block_begin * k + k_block_begin * n_block_size +
                                              n_begin * k_block_size + k_begin * N_STEP + TILE_N * K_STEP));
          }
        }
      }
    }

    ggml_bf16_t *get_submat(int n, int k, int n_begin, int k_begin) {
      int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
      n_begin -= n_block_begin;
      int n_block_size = std::min(N_BLOCK, n - n_block_begin);
      int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
      k_begin -= k_block_begin;
      int k_block_size = std::min(K_BLOCK, k - k_block_begin);
      return b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP;
    }
  };

  struct BufferC {
    float *c;
    int max_m, n;

    static size_t required_size(int max_m, int n) { return max_m * n * sizeof(float); }

    BufferC(int max_m, int n, void *ptr) : max_m(max_m), n(n) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(max_m % M_STEP == 0);
      assert(n % N_STEP == 0);
      c = reinterpret_cast<float *>(ptr);
    }

    void to_mat(int m, ggml_bf16_t *dst, int ith, int nth) {
      assert(m <= max_m);
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
        for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
          for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
            __m512 *x0 =
                (__m512 *)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP + i * N_STEP);
            __m512 *x1 = (__m512 *)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP +
                                    i * N_STEP + 16);
            avx512_32xfp32_to_32xbf16(x0, x1, (__m512i *)(dst + (m_begin + i) * n + n_block_begin + n_begin));
          }
        }
      }
    }

    float *get_submat(int m, int n, int m_begin, int n_begin) {
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
      int n_block_size = std::min(N_BLOCK, n - n_block_begin);
      n_begin -= n_block_begin;
      return c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP;
    }
  };
};

struct GemmKernel224Int8 {
  using dt = int8_t;
  using output_t = int32_t;
  static const int TILE_M = 16;
  static const int TILE_K = 64;
  static const int TILE_N = 16;
  static const int VNNI_BLK = 4;

  static inline constexpr int M_STEP = TILE_M * 2;
  static inline constexpr int N_STEP = TILE_N * 2;
  static inline constexpr int K_STEP = TILE_K;

  static inline const int N_BLOCK = 256;
  static inline const int K_BLOCK = 3584;

  static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

  static std::pair<int, int> split_range_n(int n, int ith, int nth) {
    int n_start = N_BLOCK * ith;
    int n_end = std::min(n, N_BLOCK * (ith + 1));
    return {n_start, n_end};
  }

  static void config() {
    enable_amx();
    TileConfig tile_config;

    // size is 16 x 64
    for (int i = 0; i < 2; i++)
      tile_config.set_row_col(i, TILE_M, TILE_K * sizeof(dt));

    // size is 16 x 64
    for (int i = 2; i < 4; i++)
      tile_config.set_row_col(i, TILE_K / VNNI_BLK, TILE_N * VNNI_BLK * sizeof(dt));

    // size is 16 x 16
    for (int i = 4; i < 8; i++) 
      tile_config.set_row_col(i, TILE_M, TILE_N * sizeof(output_t));

    tile_config.set_config();
  }

  static void load_a(dt *a, size_t lda) {
    _tile_loadd(0, a, lda);
    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
  }

  static void load_b(dt *b, size_t ldb) {
    _tile_loadd(2, b, ldb);
    _tile_loadd(3, offset_pointer(b, ldb * TILE_N), ldb);
  }

  static void clean_c() {
    _tile_zero(4);
    _tile_zero(5);
    _tile_zero(6);
    _tile_zero(7);
  }

  static void load_c(output_t *c, size_t ldc) {
    _tile_loadd(4, c, ldc);
    _tile_loadd(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_loadd(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_loadd(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
  }

  static void store_c(output_t *c, size_t ldc) {
    _tile_stored(4, c, ldc);
    _tile_stored(5, offset_pointer(c, TILE_N * sizeof(output_t)), ldc);
    _tile_stored(6, offset_pointer(c, ldc * TILE_M), ldc);
    _tile_stored(7, offset_pointer(c, ldc * TILE_M + TILE_N * sizeof(output_t)), ldc);
  }

  static void run_tile() {
    _tile_dpbssd(4, 0, 2);
    _tile_dpbssd(5, 0, 3);
    _tile_dpbssd(6, 1, 2);
    _tile_dpbssd(7, 1, 3);
  }

  struct BufferA {
    int8_t *a;
    float *d;
    int max_m, k;

    static size_t required_size(int max_m, int k) { return max_m * k * sizeof(int8_t) + max_m * sizeof(float); }

    BufferA(int max_m, int k, void *ptr) : max_m(max_m), k(k) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(max_m % M_STEP == 0);
      assert(k % K_STEP == 0);
      a = reinterpret_cast<int8_t *>(ptr);
      d = reinterpret_cast<float *>(a + max_m * k);
    }

    void from_mat(int m, ggml_bf16_t *src, int ith, int nth) {
      assert(m <= max_m);
      assert(ith == 0 && nth == 1);
      for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
        for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
          float amax = 0.0f;
          for (int j = 0; j < k; j += 32) {
            __m512 f0, f1;
            avx512_32xbf16_to_32xfp32((__m512i *)(src + (m_begin + i) * k + j), &f0, &f1);
            amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
            amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
          }
          d[m_begin + i] = amax / ((1 << 7) - 1);
        }
      }
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
        for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
          int k_block_size = std::min(K_BLOCK, k - k_block_begin);
          for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
            for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
              __m512 id = _mm512_set1_ps(d[m_begin + i] ? 1.0f / d[m_begin + i] : 0.0f);
              int8_t *dst = a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP + i * K_STEP;
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32((__m512i *)(src + (m_begin + i) * k + k_block_begin + k_begin), &f0, &f1);
              avx512_32xbf16_to_32xfp32((__m512i *)(src + (m_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
              __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id));
              __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id));
              __m512i i2 = _mm512_cvtps_epi32(_mm512_mul_ps(f2, id));
              __m512i i3 = _mm512_cvtps_epi32(_mm512_mul_ps(f3, id));
              __m128i s0 = _mm512_cvtsepi32_epi8(i0);
              __m128i s1 = _mm512_cvtsepi32_epi8(i1);
              __m128i s2 = _mm512_cvtsepi32_epi8(i2);
              __m128i s3 = _mm512_cvtsepi32_epi8(i3);
              _mm_storeu_si128((__m128i *)dst, s0);
              _mm_storeu_si128((__m128i *)(dst + 16), s1);
              _mm_storeu_si128((__m128i *)(dst + 32), s2);
              _mm_storeu_si128((__m128i *)(dst + 48), s3);
            }
          }
        }
      }
    }

    int8_t *get_submat(int m, int k, int m_begin, int k_begin) {
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
      k_begin -= k_block_begin;
      int k_block_size = std::min(K_BLOCK, k - k_block_begin);
      return a + k_block_begin * m_block_size + m_begin * k_block_size + k_begin * M_STEP;
    }

    float *get_scale(int m, int m_begin) { return d + m_begin; }
  };

  struct BufferB {
    int8_t *b;
    float *d;
    int n, k;

    static size_t required_size(int n, int k) { return n * k * sizeof(int8_t) + n * sizeof(float); }

    BufferB(int n, int k, void *ptr) : n(n), k(k) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(n % N_STEP == 0);
      assert(k % K_STEP == 0);
      b = reinterpret_cast<int8_t *>(ptr);
      d = reinterpret_cast<float *>(b + n * k);
    }

    void from_mat(ggml_bf16_t *src, int ith, int nth) {
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int i = 0; i < N_STEP; i++) {
          float amax = 0.0f;
          for (int j = 0; j < k; j += 32) {
            __m512 f0, f1;
            avx512_32xbf16_to_32xfp32((__m512i *)(src + (n_block_begin + n_begin + i) * k + j), &f0, &f1);
            amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
            amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
          }
          d[n_block_begin + n_begin + i] = amax / ((1 << 7) - 1);
        }
      }
      for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
        for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
          int k_block_size = std::min(K_BLOCK, k - k_block_begin);
          for (int k_begin = 0; k_begin < k_block_size; k_begin += K_STEP) {
            for (int i = 0; i < N_STEP; i++) {
              __m512 id = _mm512_set1_ps(d[n_block_begin + n_begin + i] ? 1.0f / d[n_block_begin + n_begin + i] : 0.0f);
              int8_t *dst = b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size +
                            k_begin * N_STEP + i * K_STEP;
              __m512 f0, f1, f2, f3;
              avx512_32xbf16_to_32xfp32((__m512i *)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin),
                                        &f0, &f1);
              avx512_32xbf16_to_32xfp32(
                  (__m512i *)(src + (n_block_begin + n_begin + i) * k + k_block_begin + k_begin) + 1, &f2, &f3);
              __m512i i0 = _mm512_cvtps_epi32(_mm512_mul_ps(f0, id));
              __m512i i1 = _mm512_cvtps_epi32(_mm512_mul_ps(f1, id));
              __m512i i2 = _mm512_cvtps_epi32(_mm512_mul_ps(f2, id));
              __m512i i3 = _mm512_cvtps_epi32(_mm512_mul_ps(f3, id));
              __m128i s0 = _mm512_cvtsepi32_epi8(i0);
              __m128i s1 = _mm512_cvtsepi32_epi8(i1);
              __m128i s2 = _mm512_cvtsepi32_epi8(i2);
              __m128i s3 = _mm512_cvtsepi32_epi8(i3);
              _mm_storeu_si128((__m128i *)dst, s0);
              _mm_storeu_si128((__m128i *)(dst + 16), s1);
              _mm_storeu_si128((__m128i *)(dst + 32), s2);
              _mm_storeu_si128((__m128i *)(dst + 48), s3);
            }
            transpose_16x16_32bit((__m512i *)(b + n_block_begin * k + k_block_begin * n_block_size +
                                              n_begin * k_block_size + k_begin * N_STEP));
            transpose_16x16_32bit((__m512i *)(b + n_block_begin * k + k_block_begin * n_block_size +
                                              n_begin * k_block_size + k_begin * N_STEP + TILE_N * K_STEP));
          }
        }
      }
    }

    int8_t *get_submat(int n, int k, int n_begin, int k_begin) {
      int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
      n_begin -= n_block_begin;
      int n_block_size = std::min(N_BLOCK, n - n_block_begin);
      int k_block_begin = k_begin / K_BLOCK * K_BLOCK;
      k_begin -= k_block_begin;
      int k_block_size = std::min(K_BLOCK, k - k_block_begin);
      return b + n_block_begin * k + k_block_begin * n_block_size + n_begin * k_block_size + k_begin * N_STEP;
    }

    float *get_scale(int n, int n_begin) { return d + n_begin; }
  };

  struct BufferC {
    float *c;
    int max_m, n;

    static size_t required_size(int max_m, int n) { return max_m * n * sizeof(float); }

    BufferC(int max_m, int n, void *ptr) : max_m(max_m), n(n) {
      assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
      assert(max_m % M_STEP == 0);
      assert(n % N_STEP == 0);
      c = reinterpret_cast<float *>(ptr);
    }

    void to_mat(int m, ggml_bf16_t *dst, int ith, int nth) {
      assert(m <= max_m);
      auto [n_start, n_end] = split_range_n(n, ith, nth);
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int n_block_begin = n_start;
      int n_block_size = n_end - n_block_begin;
      for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
        for (int n_begin = 0; n_begin < n_block_size; n_begin += N_STEP) {
          for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
            __m512 *x0 =
                (__m512 *)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP + i * N_STEP);
            __m512 *x1 = (__m512 *)(c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP +
                                    i * N_STEP + 16);
            avx512_32xfp32_to_32xbf16(x0, x1, (__m512i *)(dst + (m_begin + i) * n + n_block_begin + n_begin));
          }
        }
      }
    }

    float *get_submat(int m, int n, int m_begin, int n_begin) {
      int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
      int n_block_begin = n_begin / N_BLOCK * N_BLOCK;
      int n_block_size = std::min(N_BLOCK, n - n_block_begin);
      n_begin -= n_block_begin;
      return c + m_block_size * n_block_begin + m_begin * n_block_size + n_begin * M_STEP;
    }
  };
};

inline void mat_mul(int m, int n, int k, std::shared_ptr<GemmKernel224BF::BufferA> ba,
                    std::shared_ptr<GemmKernel224BF::BufferB> bb, std::shared_ptr<GemmKernel224BF::BufferC> bc, int ith,
                    int nth, bool use_amx) {
//   std::cout << "mat_mul in BF16!!!!" << std::endl;
  using K = GemmKernel224BF;
  assert(n % K::N_STEP == 0);
  assert(k % K::K_STEP == 0);

  auto [n_start, n_end] = K::split_range_n(n, ith, nth);

  for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K::K_BLOCK) {
    for (int m_begin = 0; m_begin < m; m_begin += K::M_STEP) {
      for (int n_begin = n_start; n_begin < n_end; n_begin += K::N_STEP) {

        float *c = bc->get_submat(m, n, m_begin, n_begin);
        if (!use_amx) {
          __m512 *c512 = (__m512 *)c;
          if (k_block_begin == 0) {
            for (int m_i = 0; m_i < m && m_i < K::M_STEP; m_i++) {
              c512[m_i * 2] = _mm512_setzero_ps();
              c512[m_i * 2 + 1] = _mm512_setzero_ps();
            }
          }

          for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::K_STEP) {
            int32_t *a32 = (int32_t *)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
            __m512bh *b512 = (__m512bh *)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
            for (int m_i = 0; m_i < m && m_i < K::M_STEP; m_i++) {
              for (int k_i = 0; k_i < 16; k_i++) {
                __m512bh ma = (__m512bh)_mm512_set1_epi32(a32[m_i * 16 + k_i]);
                for (int n_i = 0; n_i < 2; n_i++) {
                  c512[m_i * 2 + n_i] = _mm512_dpbf16_ps(c512[m_i * 2 + n_i], ma, b512[n_i * 16 + k_i]);
                }
              }
            }
          }

        } else {
          if (k_block_begin == 0) {
            K::clean_c();
          } else {
            K::load_c(c, K::N_STEP * sizeof(float));
          }
          for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::K_STEP) {
            K::load_a(ba->get_submat(m, k, m_begin, k_block_begin + k_begin), K::K_STEP * sizeof(ggml_bf16_t));
            K::load_b(bb->get_submat(n, k, n_begin, k_block_begin + k_begin), K::K_STEP * sizeof(ggml_bf16_t));
            K::run_tile();
          }
          K::store_c(c, K::N_STEP * sizeof(float));
        }
      }
    }
  }
}

inline __m512i _mm512_dpbssd_epi32(__m512i src, __m512i a, __m512i b) {
  __m256i a_lo = _mm512_extracti64x4_epi64(a, 0);
  __m256i a_hi = _mm512_extracti64x4_epi64(a, 1);
  __m256i b_lo = _mm512_extracti64x4_epi64(b, 0);
  __m256i b_hi = _mm512_extracti64x4_epi64(b, 1);

  b_lo = _mm256_sign_epi8(b_lo, a_lo);
  b_hi = _mm256_sign_epi8(b_hi, a_hi);

  b = _mm512_inserti64x4(b, b_lo, 0);
  b = _mm512_inserti64x4(b, b_hi, 1);

  a = _mm512_abs_epi8(a);

  return _mm512_dpbusd_epi32(src, a, b);
}

inline void mat_mul(int m, int n, int k, std::shared_ptr<GemmKernel224Int8::BufferA> ba,
                    std::shared_ptr<GemmKernel224Int8::BufferB> bb, std::shared_ptr<GemmKernel224Int8::BufferC> bc,
                    int ith, int nth, bool use_amx) {
//   std::cout << "mat_mul in INT8!!!!" << std::endl;
  using K = GemmKernel224Int8;
  assert(n % K::N_STEP == 0);
  assert(k % K::K_STEP == 0);

  auto [n_start, n_end] = K::split_range_n(n, ith, nth);

  for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K::K_BLOCK) {
    for (int m_begin = 0; m_begin < m; m_begin += K::M_STEP) {
      for (int n_begin = n_start; n_begin < n_end; n_begin += K::N_STEP) {
        float *c = bc->get_submat(m, n, m_begin, n_begin);

        if (!use_amx) {
          __m512i *c512 = (__m512i *)c;
          if (k_block_begin == 0) {
            for (int m_i = 0; m_i < m && m_i < K::M_STEP; m_i++) {
              c512[m_i * 2] = _mm512_setzero_si512();
              c512[m_i * 2 + 1] = _mm512_setzero_si512();
            }
          }

          for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::K_STEP) {
            static_assert(K::K_STEP * sizeof(int8_t) == sizeof(__m512i));
            static_assert(K::N_STEP / K::TILE_N == 2, "Must be lke this");

            int32_t *a32 = (int32_t *)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
            __m512i *b512 = (__m512i *)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
            for (int m_i = 0; m_i < m && m_i < K::M_STEP; m_i++) {
              for (int k_i = 0; k_i < 16; k_i++) {
                __m512i ma = _mm512_set1_epi32(a32[m_i * 16 + k_i]);
                for (int n_i = 0; n_i < 2; n_i++) {
                  c512[m_i * 2 + n_i] = _mm512_dpbssd_epi32(c512[m_i * 2 + n_i], ma, b512[n_i * 16 + k_i]);
                }
              }
            }
          }
        } else {
          if (k_block_begin == 0) {
            K::clean_c();
          } else {
            K::load_c((int32_t *)c, K::N_STEP * sizeof(int32_t));
          }
          for (int k_begin = 0; k_begin < K::K_BLOCK && k_block_begin + k_begin < k; k_begin += K::K_STEP) {
            K::load_a(ba->get_submat(m, k, m_begin, k_block_begin + k_begin), K::K_STEP * sizeof(int8_t));
            K::load_b(bb->get_submat(n, k, n_begin, k_block_begin + k_begin), K::K_STEP * sizeof(int8_t));
            K::run_tile();
          }
          K::store_c((int32_t *)c, K::N_STEP * sizeof(int32_t));
        }

        if (k_block_begin + K::K_BLOCK >= k) {
          int to = m - m_begin;
          if (m - m_begin > K::M_STEP) {
            to = K::M_STEP;
          }
          for (int i = 0; i < to; i++) {
            __m512 as = _mm512_set1_ps(*ba->get_scale(m, m_begin + i));
            __m512 bs = _mm512_load_ps(bb->get_scale(n, n_begin));
            __m512i now = _mm512_load_si512((__m512i *)(c + i * K::N_STEP));
            __m512 result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
            _mm512_store_ps((__m512 *)(c + i * K::N_STEP), result);
            bs = _mm512_load_ps(bb->get_scale(n, n_begin) + K::TILE_N);
            now = _mm512_load_si512((__m512i *)(c + i * K::N_STEP + K::TILE_N));
            result = _mm512_mul_ps(_mm512_mul_ps(as, bs), _mm512_cvtepi32_ps(now));
            _mm512_store_ps((__m512 *)(c + i * K::N_STEP + K::TILE_N), result);
          }
        }
      }
    }
  }
}

} // namespace amx

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/amx/la/utils.hpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#pragma once
#include <cstdint>


template <typename T>
T* offset_pointer(T* ptr, std::size_t byte_offset) {
  return reinterpret_cast<T*>(reinterpret_cast<char*>(ptr) + byte_offset);
}

template <typename T>
const T* offset_pointer(const T* ptr, std::size_t byte_offset) {
  return reinterpret_cast<const T*>(reinterpret_cast<const char*>(ptr) + byte_offset);
}

template <typename T>
T* offset_pointer_row_major(T* t, int row, int col, std::size_t ld) {
  return offset_pointer(t, row * ld) + col;
}

template <typename T>
T* offset_pointer_col_major(T* t, int row, int col, std::size_t ld) {
  return offset_pointer(t, col * ld) + row;
}

static inline void avx512_copy_32xbf16(__m512i* src, __m512i* dst) {
  _mm512_storeu_si512(dst, _mm512_loadu_si512(src));
}

static inline void avx512_32xfp32_to_32xbf16(__m512* src0, __m512* src1, __m512i* dst) {
  _mm512_storeu_si512(dst, __m512i(_mm512_cvtne2ps_pbh(*src1, *src0)));
}

static inline void avx512_32xbf16_to_32xfp32(__m512i* src, __m512* dst0, __m512* dst1) {
  _mm512_storeu_ps(dst0, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(src))), 16)));
  _mm512_storeu_ps(dst1, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(src) + 1)), 16)));
}

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/amx/moe.hpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_AMX_MOE_H
#define CPUINFER_OPERATOR_AMX_MOE_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

#include "la/amx.hpp"

#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>
void *numa_alloc_aligned(size_t size, int node, size_t alignment) {
  void *ptr = numa_alloc_onnode(size, node);
  assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
  return ptr;
}
#endif

static inline __m512 exp_avx512(__m512 x) {
  const __m512 log2e = _mm512_set1_ps(1.44269504089f);
  const __m512 c1 = _mm512_set1_ps(0.69314718056f);

  __m512 y = _mm512_mul_ps(x, log2e);
  __m512i int_part = _mm512_cvtps_epi32(y);
  __m512 frac_part = _mm512_sub_ps(y, _mm512_cvtepi32_ps(int_part));

  const __m512 poly_1 = _mm512_set1_ps(0.9999999995f);
  const __m512 poly_2 = _mm512_set1_ps(0.6931471805f);
  const __m512 poly_3 = _mm512_set1_ps(0.2402265069f);
  const __m512 poly_4 = _mm512_set1_ps(0.0555041087f);
  const __m512 poly_5 = _mm512_set1_ps(0.0096181291f);
  const __m512 poly_6 = _mm512_set1_ps(0.0013333558f);

  __m512 frac_exp = _mm512_fmadd_ps(
      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(poly_6, frac_part, poly_5), frac_part, poly_4),
                                      frac_part, poly_3),
                      frac_part, poly_2),
      frac_part, poly_1);

  __m512 two_pow_i = _mm512_scalef_ps(_mm512_set1_ps(1.0f), _mm512_cvtepi32_ps(int_part));
  return _mm512_mul_ps(two_pow_i, frac_exp);
}

static inline __m512 act_fn(__m512 gate_val, __m512 up_val) {
  __m512 neg_gate_val = _mm512_sub_ps(_mm512_setzero_ps(), gate_val);
  __m512 exp_neg_gate = exp_avx512(neg_gate_val);
  __m512 denom = _mm512_add_ps(_mm512_set1_ps(1.0f), exp_neg_gate);
  __m512 act_val = _mm512_div_ps(gate_val, denom);

  return _mm512_mul_ps(act_val, up_val);
}

struct AMX_MOEConfig {
  int expert_num;
  int routed_expert_num;
  int hidden_size;
  int intermediate_size;
  int max_len;
  void *gate_proj;
  void *up_proj;
  void *down_proj;

  AMX_MOEConfig() {}

  AMX_MOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int max_len,
                void *gate_proj, void *up_proj, void *down_proj)
      : expert_num(expert_num), routed_expert_num(routed_expert_num), hidden_size(hidden_size),
        intermediate_size(intermediate_size), max_len(max_len), gate_proj(gate_proj), up_proj(up_proj),
        down_proj(down_proj) {}
};

template <class T> class AMX_MOE {
private:
  AMX_MOEConfig config_;
  void *gate_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  void *up_proj_;   // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  void *down_proj_; // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]

  ggml_bf16_t *m_local_input_;       // [routed_expert_num * max_len * hidden_size]
  ggml_bf16_t *m_local_gate_output_; // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_up_output_;   // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_down_output_; // [routed_expert_num * max_len * hidden_size]

  std::vector<std::vector<int>> m_local_pos_;          // [max_len, routed_expert_num]
  std::vector<int> m_local_num_;                       // [expert_num]
  std::vector<int> m_expert_id_map_;                   // [expert_num]
  std::vector<ggml_bf16_t *> m_local_input_ptr_;       // [expert_num]
  std::vector<ggml_bf16_t *> m_local_gate_output_ptr_; // [expert_num]
  std::vector<ggml_bf16_t *> m_local_up_output_ptr_;   // [expert_num]
  std::vector<ggml_bf16_t *> m_local_down_output_ptr_; // [expert_num]

  std::vector<std::shared_ptr<typename T::BufferA>> gate_up_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> gate_bc_;
  std::vector<std::shared_ptr<typename T::BufferC>> up_bc_;
  std::vector<std::shared_ptr<typename T::BufferA>> down_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> down_bc_;

#ifdef USE_NUMA
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> gate_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> up_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> down_bb_numa_;
#else
  std::vector<std::shared_ptr<typename T::BufferB>> gate_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> up_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> down_bb_;
#endif

public:
  AMX_MOE(AMX_MOEConfig config) {
    config_ = config;
    gate_proj_ = config_.gate_proj;
    up_proj_ = config_.up_proj;
    down_proj_ = config_.down_proj;

    std::vector<std::pair<void **, uint64_t>> m_mem_requests;
    m_mem_requests.push_back({(void **)&m_local_input_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    m_mem_requests.push_back({(void **)&m_local_gate_output_, sizeof(ggml_bf16_t) * config_.routed_expert_num *
                                                                  config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_up_output_, sizeof(ggml_bf16_t) * config_.routed_expert_num *
                                                                config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_down_output_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    std::vector<void *> gate_up_ba_ptr(config_.expert_num);
    std::vector<void *> gate_bc_ptr(config_.expert_num);
    std::vector<void *> up_bc_ptr(config_.expert_num);
    std::vector<void *> down_ba_ptr(config_.expert_num);
    std::vector<void *> down_bc_ptr(config_.expert_num);
    for (int i = 0; i < config_.expert_num; i++) {
      m_mem_requests.push_back(
          {(void **)&gate_up_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.hidden_size)});
      m_mem_requests.push_back(
          {(void **)&gate_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&up_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&down_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&down_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.hidden_size)});
    }
    shared_mem_buffer.alloc(this, m_mem_requests);

    m_local_pos_.resize(config_.max_len);
    for (int i = 0; i < config_.max_len; i++) {
      m_local_pos_[i].resize(config_.routed_expert_num);
    }
    m_expert_id_map_.resize(config_.expert_num);
    m_local_num_.resize(config_.expert_num);
    m_local_input_ptr_.resize(config_.expert_num);
    m_local_gate_output_ptr_.resize(config_.expert_num);
    m_local_up_output_ptr_.resize(config_.expert_num);
    m_local_down_output_ptr_.resize(config_.expert_num);

    for (uint64_t i = 0; i < config_.expert_num; i++) {
      gate_up_ba_.push_back(
          std::make_shared<typename T::BufferA>(config_.max_len, config_.hidden_size, gate_up_ba_ptr[i]));
      gate_bc_.push_back(
          std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, gate_bc_ptr[i]));
      up_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, up_bc_ptr[i]));
      down_ba_.push_back(
          std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, down_ba_ptr[i]));
      down_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, down_bc_ptr[i]));

#ifdef USE_NUMA
      int numa_nodes = numa_num_configured_nodes();
      gate_bb_numa_.resize(numa_nodes);
      up_bb_numa_.resize(numa_nodes);
      down_bb_numa_.resize(numa_nodes);
      for (int j = 0; j < numa_nodes; j++) {
        void *gate_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
        gate_bb_numa_[j].push_back(
            std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));
        void *up_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
        up_bb_numa_[j].push_back(
            std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));
        void *down_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.hidden_size, config_.intermediate_size), j, 64);
        down_bb_numa_[j].push_back(
            std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
      }
#else
      void *gate_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      gate_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));

      void *up_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      up_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));

      void *down_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
      down_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
#endif
    }
  }

  ~AMX_MOE() { shared_mem_buffer.dealloc(this); }

  void load_weights(Backend *backend) {
    int nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
#ifdef USE_NUMA
          int numa_nodes = numa_num_configured_nodes();
          for (int j = 0; j < numa_nodes; j++) {
            gate_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.gate_proj +
                                                       expert_idx * config_.intermediate_size * config_.hidden_size,
                                                   ith, nth);
            up_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.up_proj +
                                                     expert_idx * config_.intermediate_size * config_.hidden_size,
                                                 ith, nth);
          }
#else
          gate_bb_[expert_idx]->from_mat((ggml_bf16_t *)config_.gate_proj +
                                             expert_idx * config_.intermediate_size * config_.hidden_size,
                                         ith, nth);
          up_bb_[expert_idx]->from_mat(
              (ggml_bf16_t *)config_.up_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
#endif
        },
        nullptr);
    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
#ifdef USE_NUMA
          int numa_nodes = numa_num_configured_nodes();
          for (int j = 0; j < numa_nodes; j++) {
            down_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.down_proj +
                                                       expert_idx * config_.hidden_size * config_.intermediate_size,
                                                   ith, nth);
          }
#else
          down_bb_[expert_idx]->from_mat((ggml_bf16_t *)config_.down_proj +
                                             expert_idx * config_.hidden_size * config_.intermediate_size,
                                         ith, nth);
#endif
        },
        nullptr);
  }

  void warm_up(Backend *backend) {}

  void forward(int qlen, int k, const uint64_t *expert_ids, const float *weights, const void *input, void *output,
               int *batch_size_tensor, Backend *backend) {
    qlen = batch_size_tensor[0];
    bool use_amx = (qlen > 4 * config_.expert_num / config_.routed_expert_num);
    int activated_expert = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
      for (int j = 0; j < k; j++) {
        m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
      }
    }
    for (int i = 0; i < config_.expert_num; i++) {
      if (m_local_num_[i] > 0) {
        m_expert_id_map_[activated_expert] = i;
        activated_expert++;
      }
    }
    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_input_ptr_[i] = m_local_input_ + offset * config_.hidden_size;
      m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
      m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
      m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
      offset += m_local_num_[i];
    }
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int j = 0; j < k; j++) {
            memcpy(m_local_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size,
                   (ggml_bf16_t *)input + i * config_.hidden_size, sizeof(ggml_bf16_t) * config_.hidden_size);
          }
        },
        nullptr);
    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];

          gate_up_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_input_ptr_[expert_idx], 0, 1);
        },
        nullptr);
    int nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_numa_[Backend::numa_node][expert_idx], gate_bc_[expert_idx],
                       ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_numa_[Backend::numa_node][expert_idx], up_bc_[expert_idx], ith,
                       nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_[expert_idx], up_bc_[expert_idx], ith, nth, use_amx);
#endif
          gate_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], ith, nth);
          up_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_up_output_ptr_[expert_idx], ith, nth);
          auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
          for (int i = 0; i < m_local_num_[expert_idx]; i++) {
            ggml_bf16_t *gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
            ggml_bf16_t *up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
            for (int j = n_start; j < n_end; j += 32) {
              __m512 gate_val0, gate_val1, up_val0, up_val1;
              avx512_32xbf16_to_32xfp32((__m512i *)(gate_output_ptr + j), &gate_val0, &gate_val1);
              avx512_32xbf16_to_32xfp32((__m512i *)(up_output_ptr + j), &up_val0, &up_val1);
              __m512 result0 = act_fn(gate_val0, up_val0);
              __m512 result1 = act_fn(gate_val1, up_val1);
              avx512_32xfp32_to_32xbf16(&result0, &result1, (__m512i *)(gate_output_ptr + j));
            }
          }
        },
        nullptr);
    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          down_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], 0, 1);
        },
        nullptr);
    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
                       down_bb_numa_[Backend::numa_node][expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
                       down_bb_[expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#endif
          down_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_down_output_ptr_[expert_idx], ith, nth);
        },
        nullptr);
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int e = 0; e < config_.hidden_size; e += 32) {
            __m512 x0 = _mm512_setzero_ps();
            __m512 x1 = _mm512_setzero_ps();
            for (int j = 0; j < k; j++) {
              __m512 weight = _mm512_set1_ps(weights[i * k + j]);
              __m512 down_output0, down_output1;
              avx512_32xbf16_to_32xfp32((__m512i *)(m_local_down_output_ptr_[expert_ids[i * k + j]] +
                                                    m_local_pos_[i][j] * config_.hidden_size + e),
                                        &down_output0, &down_output1);
              x0 = _mm512_fmadd_ps(down_output0, weight, x0);
              x1 = _mm512_fmadd_ps(down_output1, weight, x1);
            }
            avx512_32xfp32_to_32xbf16(&x0, &x1, (__m512i *)((ggml_bf16_t *)output + i * config_.hidden_size + e));
          }
        },
        nullptr);
  }
};

#endif


================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/amx/sft_moe.hpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_SFT_AMX_MOE_H
#define CPUINFER_OPERATOR_SFT_AMX_MOE_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>
#include <fstream>
#include <filesystem>

#include "debug_sft_moe.hpp"

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

#include "la/amx.hpp"

#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>
// void *numa_alloc_aligned(size_t size, int node, size_t alignment) {
//   void *ptr = numa_alloc_onnode(size, node);
//   assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
//   return ptr;
// }
#endif

static inline __m512 sigmoid(__m512 x) {
  __m512 neg = _mm512_sub_ps(_mm512_setzero_ps(), x);
  __m512 e = exp_avx512(neg);
  __m512 denom = _mm512_add_ps(_mm512_set1_ps(1.0f), e);
  return _mm512_div_ps(_mm512_set1_ps(1.0f), denom);
}

static inline __m512 act_fn_1(__m512 x) {
  __m512 sigmoid_val = sigmoid(x);
  return _mm512_mul_ps(sigmoid_val, x);
}

static inline __m512 act_fn_grad(__m512 x) {
  // sigmoid(x) * (1 + x * (1 - sigmoid(x)))
  __m512 sigmoid_val = sigmoid(x);
  __m512 one_minus_sigmoid = _mm512_sub_ps(_mm512_set1_ps(1.0f), sigmoid_val);
  __m512 x_term = _mm512_mul_ps(x, one_minus_sigmoid);
  __m512 one_plus_x_term = _mm512_add_ps(_mm512_set1_ps(1.0f), x_term);
  return _mm512_mul_ps(sigmoid_val, one_plus_x_term);
}

struct SFT_AMX_MOEConfig {
  int expert_num;
  int routed_expert_num;
  int hidden_size;
  int intermediate_size;
  int max_len;
  void *gate_proj;
  void *up_proj;
  void *down_proj;

  SFT_AMX_MOEConfig() {}

  SFT_AMX_MOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int max_len,
                void *gate_proj, void *up_proj, void *down_proj)
      : expert_num(expert_num), routed_expert_num(routed_expert_num), hidden_size(hidden_size),
        intermediate_size(intermediate_size), max_len(max_len), gate_proj(gate_proj), up_proj(up_proj),
        down_proj(down_proj) {}
};

template <class T> class SFT_AMX_MOE {
private:
  SFT_AMX_MOEConfig config_;
  void *gate_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  void *up_proj_;   // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  void *down_proj_; // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]

  void *gate_proj_t_; // [expert_num * hidden_size * intermediate_size]
  void *up_proj_t_;   // [expert_num * hidden_size * intermediate_size]
  void *down_proj_t_; // [expert_num * intermediate_size * hidden_size]

  ggml_bf16_t *m_local_input_;       // [routed_expert_num * max_len * hidden_size]
  ggml_bf16_t *m_local_gate_output_; // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_up_output_;   // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_down_output_; // [routed_expert_num * max_len * hidden_size]

  std::vector<std::vector<int>> m_local_pos_;          // [max_len, routed_expert_num]
  std::vector<int> m_local_num_;                       // [expert_num]
  std::vector<int> m_expert_id_map_;                   // [expert_num]
  std::vector<ggml_bf16_t *> m_local_input_ptr_;       // [expert_num]
  std::vector<ggml_bf16_t *> m_local_gate_output_ptr_; // [expert_num]
  std::vector<ggml_bf16_t *> m_local_up_output_ptr_;   // [expert_num]
  std::vector<ggml_bf16_t *> m_local_down_output_ptr_; // [expert_num]

  std::vector<std::shared_ptr<typename T::BufferA>> gate_up_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> gate_bc_;
  std::vector<std::shared_ptr<typename T::BufferC>> up_bc_;
  std::vector<std::shared_ptr<typename T::BufferA>> down_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> down_bc_;

#ifdef USE_NUMA
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> gate_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> up_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> down_bb_numa_;
#else
  std::vector<std::shared_ptr<typename T::BufferB>> gate_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> up_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> down_bb_;
#endif

  ggml_bf16_t *m_local_down_output_grad_;       // [routed_expert_num * max_len * hidden_size]
  ggml_bf16_t *m_local_down_input_grad_;        // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_gate_output_grad_;       // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_up_output_grad_;         // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t *m_local_gate_input_grad_;        // [routed_expert_num * max_len * hidden_size]
  ggml_bf16_t *m_local_up_input_grad_;          // [routed_expert_num * max_len * hidden_size]

  std::vector<ggml_bf16_t *> m_local_down_output_grad_ptr_;       // [expert_num]
  std::vector<ggml_bf16_t *> m_local_down_input_grad_ptr_;        // [expert_num]
  std::vector<ggml_bf16_t *> m_local_gate_output_grad_ptr_;       // [expert_num]
  std::vector<ggml_bf16_t *> m_local_up_output_grad_ptr_;         // [expert_num]
  std::vector<ggml_bf16_t *> m_local_gate_input_grad_ptr_;        // [expert_num]
  std::vector<ggml_bf16_t *> m_local_up_input_grad_ptr_;          // [expert_num]

  std::vector<std::shared_ptr<typename T::BufferA>> gate_t_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> gate_t_bc_;
  std::vector<std::shared_ptr<typename T::BufferA>> up_t_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> up_t_bc_;
  std::vector<std::shared_ptr<typename T::BufferA>> down_t_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> down_t_bc_;

  // TODO: NUMA
#ifdef USE_NUMA
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> gate_t_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> up_t_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> down_t_bb_numa_;
#else
  std::vector<std::shared_ptr<typename T::BufferB>> gate_t_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> up_t_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> down_t_bb_;
#endif

  int* m_local_token_indices_;                                   // [routed_expert_num * max_len]
  int* m_local_expert_positions_;                               // [routed_expert_num * max_len]
  std::vector<int *> m_local_token_indices_ptr_;                // [expert_num]
  std::vector<int *> m_local_expert_positions_ptr_;             // [expert_num]

public:
  SFT_AMX_MOE(SFT_AMX_MOEConfig config) {
    config_ = config;
    gate_proj_ = config_.gate_proj;
    up_proj_ = config_.up_proj;
    down_proj_ = config_.down_proj;

    std::vector<std::pair<void **, uint64_t>> m_mem_requests;
    m_mem_requests.push_back({(void **)&m_local_input_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    m_mem_requests.push_back({(void **)&m_local_gate_output_, sizeof(ggml_bf16_t) * config_.routed_expert_num *
                                                                  config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_up_output_, sizeof(ggml_bf16_t) * config_.routed_expert_num *
                                                                config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_down_output_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    std::vector<void *> gate_up_ba_ptr(config_.expert_num);
    std::vector<void *> gate_bc_ptr(config_.expert_num);
    std::vector<void *> up_bc_ptr(config_.expert_num);
    std::vector<void *> down_ba_ptr(config_.expert_num);
    std::vector<void *> down_bc_ptr(config_.expert_num);
    for (int i = 0; i < config_.expert_num; i++) {
      m_mem_requests.push_back(
          {(void **)&gate_up_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.hidden_size)});
      m_mem_requests.push_back(
          {(void **)&gate_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&up_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&down_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&down_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.hidden_size)});
    }

    m_mem_requests.push_back({(void **)&gate_proj_t_,
                              sizeof(ggml_bf16_t) * config_.expert_num * config_.intermediate_size * config_.hidden_size});
    m_mem_requests.push_back({(void **)&up_proj_t_,
                              sizeof(ggml_bf16_t) * config_.expert_num * config_.intermediate_size * config_.hidden_size});
    m_mem_requests.push_back({(void **)&down_proj_t_,
                              sizeof(ggml_bf16_t) * config_.expert_num * config_.hidden_size * config_.intermediate_size});
    
    m_mem_requests.push_back({(void **)&m_local_down_output_grad_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    m_mem_requests.push_back({(void **)&m_local_down_input_grad_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_gate_output_grad_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_up_output_grad_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void **)&m_local_gate_input_grad_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    m_mem_requests.push_back({(void **)&m_local_up_input_grad_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    m_mem_requests.push_back({(void **)&m_local_token_indices_,
                              sizeof(int) * config_.routed_expert_num * config_.max_len});
    m_mem_requests.push_back({(void **)&m_local_expert_positions_,
                              sizeof(int) * config_.routed_expert_num * config_.max_len});
    std::vector<void *> gate_t_ba_ptr(config_.expert_num);
    std::vector<void *> gate_t_bc_ptr(config_.expert_num);
    std::vector<void *> up_t_ba_ptr(config_.expert_num);
    std::vector<void *> up_t_bc_ptr(config_.expert_num);
    std::vector<void *> down_t_ba_ptr(config_.expert_num);
    std::vector<void *> down_t_bc_ptr(config_.expert_num);
    for (int i = 0; i < config_.expert_num; i++) {
      m_mem_requests.push_back(
          {(void **)&gate_t_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&gate_t_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.hidden_size)});
      m_mem_requests.push_back(
          {(void **)&up_t_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back(
          {(void **)&up_t_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.hidden_size)});
      m_mem_requests.push_back(
          {(void **)&down_t_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.hidden_size)});
      m_mem_requests.push_back(
          {(void **)&down_t_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
    }

    shared_mem_buffer.alloc(this, m_mem_requests);

    m_local_pos_.resize(config_.max_len);
    for (int i = 0; i < config_.max_len; i++) {
      m_local_pos_[i].resize(config_.routed_expert_num);
    }
    m_expert_id_map_.resize(config_.expert_num);
    m_local_num_.resize(config_.expert_num);
    m_local_input_ptr_.resize(config_.expert_num);
    m_local_gate_output_ptr_.resize(config_.expert_num);
    m_local_up_output_ptr_.resize(config_.expert_num);
    m_local_down_output_ptr_.resize(config_.expert_num);
    m_local_down_output_grad_ptr_.resize(config_.expert_num);
    m_local_down_input_grad_ptr_.resize(config_.expert_num);
    m_local_gate_output_grad_ptr_.resize(config_.expert_num);
    m_local_up_output_grad_ptr_.resize(config_.expert_num);
    m_local_gate_input_grad_ptr_.resize(config_.expert_num);
    m_local_up_input_grad_ptr_.resize(config_.expert_num);

    for (uint64_t i = 0; i < config_.expert_num; i++) {
      gate_up_ba_.push_back(
          std::make_shared<typename T::BufferA>(config_.max_len, config_.hidden_size, gate_up_ba_ptr[i]));
      gate_bc_.push_back(
          std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, gate_bc_ptr[i]));
      up_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, up_bc_ptr[i]));
      down_ba_.push_back(
          std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, down_ba_ptr[i]));
      down_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, down_bc_ptr[i]));

#ifdef USE_NUMA
      int numa_nodes = numa_num_configured_nodes();
      gate_bb_numa_.resize(numa_nodes);
      up_bb_numa_.resize(numa_nodes);
      down_bb_numa_.resize(numa_nodes);
      for (int j = 0; j < numa_nodes; j++) {
        void *gate_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
        gate_bb_numa_[j].push_back(
            std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));
        void *up_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
        up_bb_numa_[j].push_back(
            std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));
        void *down_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.hidden_size, config_.intermediate_size), j, 64);
        down_bb_numa_[j].push_back(  
            std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
      }
#else
      void *gate_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      gate_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));

      void *up_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      up_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));

      void *down_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
      down_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
#endif
    }

    for (uint64_t i = 0; i < config_.expert_num; i++) {
      gate_t_ba_.push_back(
          std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, gate_t_ba_ptr[i]));
      gate_t_bc_.push_back(
          std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, gate_t_bc_ptr[i]));
      up_t_ba_.push_back(std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, up_t_ba_ptr[i]));
      up_t_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, up_t_bc_ptr[i]));
      down_t_ba_.push_back(
          std::make_shared<typename T::BufferA>(config_.max_len, config_.hidden_size, down_t_ba_ptr[i]));
      down_t_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, down_t_bc_ptr[i]));

#ifdef USE_NUMA
      int numa_nodes = numa_num_configured_nodes();
      gate_t_bb_numa_.resize(numa_nodes);
      up_t_bb_numa_.resize(numa_nodes);
      down_t_bb_numa_.resize(numa_nodes);
      for (int j = 0; j < numa_nodes; j++) {
        void *gate_t_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.hidden_size, config_.intermediate_size), j, 64);
        gate_t_bb_numa_[j].push_back(
            std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, gate_t_bb_ptr));
        void *up_t_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.hidden_size, config_.intermediate_size), j, 64);
        up_t_bb_numa_[j].push_back(
            std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, up_t_bb_ptr));
        void *down_t_bb_ptr =
            numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
        down_t_bb_numa_[j].push_back(
            std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, down_t_bb_ptr));
      }
#else
      void *gate_t_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
      gate_t_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, gate_t_bb_ptr));

      void *up_t_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
      up_t_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, up_t_bb_ptr));

      void *down_t_bb_ptr =
          std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      down_t_bb_.push_back(
          std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, down_t_bb_ptr));
#endif
    }

    m_local_token_indices_ptr_.resize(config_.expert_num);
    m_local_expert_positions_ptr_.resize(config_.expert_num);
  }

  ~SFT_AMX_MOE() { shared_mem_buffer.dealloc(this); }

  void transpose_expert(const void* src, void* dst, int R, int C, Backend* backend) {
    backend->do_work_stealing_job(
        config_.expert_num, nullptr,
        [&](uint64_t expert_idx) {
          for (int r = 0; r < R; ++r) {
            for (int c = 0; c < C; ++c) {
                memcpy(
                    (uint8_t*)dst + (expert_idx * R * C + (c * R + r)) * sizeof(ggml_bf16_t),
                    (uint8_t*)src + (expert_idx * R * C + (r * C + c)) * sizeof(ggml_bf16_t),
                    sizeof(ggml_bf16_t));
            }
          }
        },
        nullptr);
  }
  
  void load_weights(Backend *backend) {
    transpose_expert(config_.gate_proj, gate_proj_t_, config_.intermediate_size, config_.hidden_size, backend);
    transpose_expert(config_.up_proj, up_proj_t_, config_.intermediate_size, config_.hidden_size, backend);
    transpose_expert(config_.down_proj, down_proj_t_, config_.hidden_size, config_.intermediate_size, backend);

    int nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
#ifdef USE_NUMA
          int numa_nodes = numa_num_configured_nodes();
          for (int j = 0; j < numa_nodes; j++) {
            gate_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.gate_proj +
                                                       expert_idx * config_.intermediate_size * config_.hidden_size,
                                                   ith, nth);
            up_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.up_proj +
                                                     expert_idx * config_.intermediate_size * config_.hidden_size,
                                                 ith, nth);
          }
#else
          gate_bb_[expert_idx]->from_mat((ggml_bf16_t *)config_.gate_proj +
                                             expert_idx * config_.intermediate_size * config_.hidden_size,
                                         ith, nth);
          up_bb_[expert_idx]->from_mat(
              (ggml_bf16_t *)config_.up_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
#endif
		},
        nullptr);
    nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
#ifdef USE_NUMA
          int numa_nodes = numa_num_configured_nodes();
          for (int j = 0; j < numa_nodes; j++) {
            down_t_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)down_proj_t_ +
                                                         expert_idx * config_.intermediate_size * config_.hidden_size,
                                                     ith, nth);
          }
#else
          down_t_bb_[expert_idx]->from_mat((ggml_bf16_t *)down_proj_t_ +
                                             expert_idx * config_.intermediate_size * config_.hidden_size,
                                         ith, nth);
#endif
        },
        nullptr);
        
    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
#ifdef USE_NUMA
          int numa_nodes = numa_num_configured_nodes();
          for (int j = 0; j < numa_nodes; j++) {
            down_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.down_proj +
                                                       expert_idx * config_.hidden_size * config_.intermediate_size,
                                                   ith, nth);
          }
#else
          down_bb_[expert_idx]->from_mat((ggml_bf16_t *)config_.down_proj +
                                             expert_idx * config_.hidden_size * config_.intermediate_size,
                                         ith, nth);
#endif
#ifdef USE_NUMA
          for (int j = 0; j < numa_nodes; j++) {
            gate_t_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)gate_proj_t_ +
                                                         expert_idx * config_.hidden_size * config_.intermediate_size,
                                                     ith, nth);
            up_t_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)up_proj_t_ +
                                                       expert_idx * config_.hidden_size * config_.intermediate_size,
                                                   ith, nth);
          }
#else
          gate_t_bb_[expert_idx]->from_mat((ggml_bf16_t *)gate_proj_t_ +
                                             expert_idx * config_.hidden_size * config_.intermediate_size,
                                         ith, nth);
          up_t_bb_[expert_idx]->from_mat((ggml_bf16_t *)up_proj_t_ +
                                             expert_idx * config_.hidden_size * config_.intermediate_size,
                                         ith, nth);
#endif
        },
        nullptr);
  }

  void warm_up(Backend *backend) {}

  void forward(int qlen, int k, const uint64_t *expert_ids, const float *weights, const void *input, void *output, Backend *backend) {
    bool use_amx = (qlen > 4 * config_.expert_num / config_.routed_expert_num);
    int activated_expert = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
      for (int j = 0; j < k; j++) {
        m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
      }
    }
    for (int i = 0; i < config_.expert_num; i++) {
      if (m_local_num_[i] > 0) {
        m_expert_id_map_[activated_expert] = i;
        activated_expert++;
      }
    }
    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_input_ptr_[i] = m_local_input_ + offset * config_.hidden_size;
      m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
      m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
      m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
      offset += m_local_num_[i];
    }
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int j = 0; j < k; j++) {
            memcpy(m_local_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size,
                   (ggml_bf16_t *)input + i * config_.hidden_size, sizeof(ggml_bf16_t) * config_.hidden_size);
          }
        },
        nullptr);
    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];

          gate_up_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_input_ptr_[expert_idx], 0, 1);
        },
        nullptr);
    int nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_numa_[Backend::numa_node][expert_idx], gate_bc_[expert_idx],
                       ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_numa_[Backend::numa_node][expert_idx], up_bc_[expert_idx], ith,
                       nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_[expert_idx], up_bc_[expert_idx], ith, nth, use_amx);
#endif
          gate_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], ith, nth);
          up_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_up_output_ptr_[expert_idx], ith, nth);
          auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
          for (int i = 0; i < m_local_num_[expert_idx]; i++) {
            ggml_bf16_t *gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
            ggml_bf16_t *up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
            for (int j = n_start; j < n_end; j += 32) {
              __m512 gate_val0, gate_val1, up_val0, up_val1;
              avx512_32xbf16_to_32xfp32((__m512i *)(gate_output_ptr + j), &gate_val0, &gate_val1);
              avx512_32xbf16_to_32xfp32((__m512i *)(up_output_ptr + j), &up_val0, &up_val1);
              __m512 result0 = act_fn(gate_val0, up_val0);
              __m512 result1 = act_fn(gate_val1, up_val1);
              avx512_32xfp32_to_32xbf16(&result0, &result1, (__m512i *)(gate_output_ptr + j));
            }
          }
        },
        nullptr);
    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          down_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], 0, 1);
        },
        nullptr);
	
    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
                       down_bb_numa_[Backend::numa_node][expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
                       down_bb_[expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#endif
          down_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_down_output_ptr_[expert_idx], ith, nth);
        },
        nullptr);
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int e = 0; e < config_.hidden_size; e += 32) {
            __m512 x0 = _mm512_setzero_ps();
            __m512 x1 = _mm512_setzero_ps();
            for (int j = 0; j < k; j++) {
              __m512 weight = _mm512_set1_ps(weights[i * k + j]);
              __m512 down_output0, down_output1;
              avx512_32xbf16_to_32xfp32((__m512i *)(m_local_down_output_ptr_[expert_ids[i * k + j]] +
                                                    m_local_pos_[i][j] * config_.hidden_size + e),
                                        &down_output0, &down_output1);
              x0 = _mm512_fmadd_ps(down_output0, weight, x0);
              x1 = _mm512_fmadd_ps(down_output1, weight, x1);
            }
            avx512_32xfp32_to_32xbf16(&x0, &x1, (__m512i *)((ggml_bf16_t *)output + i * config_.hidden_size + e));
          }
        },
        nullptr);
  }

  void backward(int qlen, int k, const uint64_t *expert_ids, const float *weights, const void* input, const void *output_grad, void *input_grad, Backend *backend) {
    bool use_amx = (qlen > 4 * config_.expert_num / config_.routed_expert_num);
    int activated_expert = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
      for (int j = 0; j < k; j++) {
        m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
      }
    }
    for (int i = 0; i < config_.expert_num; i++) {
      if (m_local_num_[i] > 0) {
        m_expert_id_map_[activated_expert] = i;
        activated_expert++;
      }
    }
    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_input_ptr_[i] = m_local_input_ + offset * config_.hidden_size;
      m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
      m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;

      m_local_down_output_grad_ptr_[i] = m_local_down_output_grad_ + offset * config_.hidden_size;
      m_local_down_input_grad_ptr_[i] = m_local_down_input_grad_ + offset * config_.intermediate_size;
      m_local_gate_output_grad_ptr_[i] = m_local_gate_output_grad_ + offset * config_.intermediate_size;
      m_local_up_output_grad_ptr_[i] = m_local_up_output_grad_ + offset * config_.intermediate_size;
      m_local_gate_input_grad_ptr_[i] = m_local_gate_input_grad_ + offset * config_.hidden_size;
      m_local_up_input_grad_ptr_[i] = m_local_up_input_grad_ + offset * config_.hidden_size;
      m_local_token_indices_ptr_[i] = m_local_token_indices_ + offset;
      m_local_expert_positions_ptr_[i] = m_local_expert_positions_ + offset;
      offset += m_local_num_[i];
    }

    // TODO: cache
    backend->do_work_stealing_job(
        qlen, nullptr, 
        [&](int i) {
          for (int j = 0; j < k; j++) {
            uint64_t expert_id = expert_ids[i * k + j];
            int local_row = m_local_pos_[i][j];
            memcpy(m_local_input_ptr_[expert_id] + local_row * config_.hidden_size,
              (ggml_bf16_t *)input + i * config_.hidden_size, sizeof(ggml_bf16_t) * config_.hidden_size); // TODO: cache
            memcpy(m_local_down_output_grad_ptr_[expert_id] + local_row * config_.hidden_size,
              (ggml_bf16_t *)output_grad + i * config_.hidden_size, sizeof(ggml_bf16_t) * config_.hidden_size);
            m_local_token_indices_ptr_[expert_id][local_row] = i;
            m_local_expert_positions_ptr_[expert_id][local_row] = j;
          }
        }, 
        nullptr);

    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          gate_up_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_input_ptr_[expert_idx], 0, 1); // TODO: cache
          down_t_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_down_output_grad_ptr_[expert_idx], 0, 1);
        },
        nullptr);

    int nth = T::recommended_nth(config_.intermediate_size);  
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;

          // TODO: cache
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_numa_[Backend::numa_node][expert_idx], gate_bc_[expert_idx],
                       ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_numa_[Backend::numa_node][expert_idx], up_bc_[expert_idx], ith,
                       nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_[expert_idx], up_bc_[expert_idx], ith, nth, use_amx);
#endif
          gate_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], ith, nth);
          up_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_up_output_ptr_[expert_idx], ith, nth);

#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                      down_t_ba_[expert_idx], down_t_bb_numa_[Backend::numa_node][expert_idx], down_t_bc_[expert_idx], ith, nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                      down_t_ba_[expert_idx], down_t_bb_[expert_idx], down_t_bc_[expert_idx], ith, nth, use_amx);
#endif
          down_t_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_down_input_grad_ptr_[expert_idx], ith, nth);


          auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
          for (int i = 0; i < m_local_num_[expert_idx]; i++) {
            ggml_bf16_t *gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
            ggml_bf16_t *up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
            ggml_bf16_t *down_input_grad_ptr = &m_local_down_input_grad_ptr_[expert_idx][i * config_.intermediate_size];
            ggml_bf16_t *gate_output_grad_ptr = &m_local_gate_output_grad_ptr_[expert_idx][i * config_.intermediate_size];
            ggml_bf16_t *up_output_grad_ptr = &m_local_up_output_grad_ptr_[expert_idx][i * config_.intermediate_size];
            
            int token_idx = m_local_token_indices_ptr_[expert_idx][i];
            int expert_pos = m_local_expert_positions_ptr_[expert_idx][i];
            __m512 weight = _mm512_set1_ps(weights[token_idx * k + expert_pos]);
            
            for (int j = n_start; j < n_end; j += 32) {
              __m512 gate_val0, gate_val1, up_val0, up_val1, down_input_grad0, down_input_grad1;
              avx512_32xbf16_to_32xfp32((__m512i *)(gate_output_ptr + j), &gate_val0, &gate_val1);
              avx512_32xbf16_to_32xfp32((__m512i *)(up_output_ptr + j), &up_val0, &up_val1);
              avx512_32xbf16_to_32xfp32((__m512i *)(down_input_grad_ptr + j), &down_input_grad0, &down_input_grad1);
              
              down_input_grad0 = _mm512_mul_ps(down_input_grad0, weight);
              down_input_grad1 = _mm512_mul_ps(down_input_grad1, weight);
              
              // gate_output_grad = δ_zji ⊙ v_ji ⊙ σ'(u_ji)
              __m512 gate_grad0 = _mm512_mul_ps(down_input_grad0, 
                                               _mm512_mul_ps(up_val0, act_fn_grad(gate_val0)));
              __m512 gate_grad1 = _mm512_mul_ps(down_input_grad1, 
                                               _mm512_mul_ps(up_val1, act_fn_grad(gate_val1)));
              
              // up_output_grad = δ_zji ⊙ σ(u_ji)
              __m512 up_grad0 = _mm512_mul_ps(down_input_grad0, act_fn_1(gate_val0));
              __m512 up_grad1 = _mm512_mul_ps(down_input_grad1, act_fn_1(gate_val1));
              
              avx512_32xfp32_to_32xbf16(&gate_grad0, &gate_grad1, (__m512i *)(gate_output_grad_ptr + j));
              avx512_32xfp32_to_32xbf16(&up_grad0, &up_grad1, (__m512i *)(up_output_grad_ptr + j));
            }
          }
        },
        nullptr);


    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          gate_t_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_gate_output_grad_ptr_[expert_idx], 0, 1);
          up_t_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_up_output_grad_ptr_[expert_idx], 0, 1);
        },
        nullptr);
    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size,
                      gate_t_ba_[expert_idx], gate_t_bb_numa_[Backend::numa_node][expert_idx], gate_t_bc_[expert_idx], ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size,
                      up_t_ba_[expert_idx], up_t_bb_numa_[Backend::numa_node][expert_idx], up_t_bc_[expert_idx], ith, nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size,
                      gate_t_ba_[expert_idx], gate_t_bb_[expert_idx], gate_t_bc_[expert_idx], ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size,
                      up_t_ba_[expert_idx], up_t_bb_[expert_idx], up_t_bc_[expert_idx], ith, nth, use_amx);
#endif
          gate_t_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_gate_input_grad_ptr_[expert_idx], ith, nth);
          up_t_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_up_input_grad_ptr_[expert_idx], ith, nth);
        },
        nullptr);
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int e = 0; e < config_.hidden_size; e += 32) {
            __m512 x0 = _mm512_setzero_ps();
            __m512 x1 = _mm512_setzero_ps();
            for (int j = 0; j < k; j++) {
              __m512 gate_input_grad0, gate_input_grad1, up_input_grad0, up_input_grad1;
              avx512_32xbf16_to_32xfp32((__m512i *)(m_local_gate_input_grad_ptr_[expert_ids[i * k + j]] +
                                                    m_local_pos_[i][j] * config_.hidden_size + e),
                                        &gate_input_grad0, &gate_input_grad1);
              avx512_32xbf16_to_32xfp32((__m512i *)(m_local_up_input_grad_ptr_[expert_ids[i * k + j]] +
                                                    m_local_pos_[i][j] * config_.hidden_size + e),
                                        &up_input_grad0, &up_input_grad1);
              x0 = _mm512_add_ps(gate_input_grad0, x0);
              x1 = _mm512_add_ps(gate_input_grad1, x1);
              x0 = _mm512_add_ps(up_input_grad0, x0);
              x1 = _mm512_add_ps(up_input_grad1, x1);
            }
            avx512_32xfp32_to_32xbf16(&x0, &x1, (__m512i *)((ggml_bf16_t *)input_grad + i * config_.hidden_size + e));
          }
        },
        nullptr);
  }
};
#endif

// for debug
// if constexpr (std::is_same_v<typename T::dt, ggml_bf16_t>) {	
// 	for (int expert_idx = 0; expert_idx < config_.expert_num; ++expert_idx) {
// 		auto buf = down_t_ba_[expert_idx].get();

// 		std::string path = "debug/" + std::to_string(expert_idx) + "_down_ba_t_debug3.bin";
// 		std::ofstream ofs(path, std::ios::binary);
// 		for (int n_idx = 0; n_idx < m_local_num_[expert_idx]; ++n_idx) {
// 			const ggml_bf16_t* row = reinterpret_cast<const ggml_bf16_t*>(buf->a) + n_idx * buf->k;
// 			for (int j = 0; j < buf->k; ++j) {
// 				float v = row[j];
// 				ofs.write(reinterpret_cast<const char*>(&v), sizeof(v));
// 			}
// 		}
// 		ofs.close();
// 	}
// }

// for (uint64_t expert_idx = 0; expert_idx < (uint64_t)config_.expert_num; ++expert_idx) {
// 	dump_grad_bin("cpp_layer0_E_End"+std::to_string(expert_idx)+"_down_t_ba_", (ggml_bf16_t*)m_local_down_output_grad_ptr_[expert_idx], config_.hidden_size * m_local_num_[expert_idx], GGML_TYPE_BF16);
// }

// for (uint64_t expert_idx = 0; expert_idx < (uint64_t)config_.expert_num; ++expert_idx) {
// 	dump_grad_bin("cpp_layer0_E_End"+std::to_string(expert_idx)+"_down_t_bb_", (ggml_bf16_t *)down_proj_t_ + expert_idx * config_.intermediate_size * config_.hidden_size, config_.hidden_size * config_.intermediate_size, GGML_TYPE_BF16);
// }

// for (uint64_t expert_idx = 0; expert_idx < (uint64_t)config_.expert_num; ++expert_idx) {
// 	dump_grad_bin("cpp_layer0_E_End"+std::to_string(expert_idx)+"_down_t_bc_", (ggml_bf16_t*)m_local_down_input_grad_ptr_[expert_idx], config_.intermediate_size * m_local_num_[expert_idx], GGML_TYPE_BF16);
// }

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/kvcache/kvcache.h
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#ifndef CPUINFER_OPERATOR_KVCACHE_H
#define CPUINFER_OPERATOR_KVCACHE_H

#include <algorithm>
#include <atomic>
#include <cassert>
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <functional>
#include <future>
#include <iostream>
#include <memory>
#include <mutex>
#include <queue>
#include <random>
#include <stdexcept>
#include <thread>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "llama.cpp/ggml-common.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

#define CHUNK_SIZE 32

/**
 * @brief Converts a ggml_type enum value to its corresponding string
 * representation.
 *
 * This function provides a human-readable string representation for a given
 * ggml_type enum value. The string can be used for logging, debugging, or
 * displaying information in a user interface.
 *
 * @param type The ggml_type enum value to convert.
 * @return A string representation of the enum value.
 */
std::string ggml_type_to_string(ggml_type type);

/**
 * @enum AnchorType
 * @brief Defines the types of anchors used in attention mechanisms.
 *
 * This enum specifies different types of anchors that can be used in attention
 * mechanisms, such as fixed anchors, dynamic anchors, or special anchors like
 * QUEST, BLOCK_MEAN, or BLOCK_MAX.
 */
enum AnchorType {
    FIXED_ANCHOR, /**< A fixed anchor that does not change. */
    DYNAMIC,      /**< A dynamic anchor that can change over time. */
    QUEST, /**< A special anchor type used for QUEST (Query and Embedding Space
              Transformation). */
    BLOCK_MEAN, /**< An anchor based on the mean of a block of data. */
    BLOCK_MAX /**< An anchor based on the maximum value within a block of data.
               */
};

/**
 * @brief Converts an AnchorType enum value to its corresponding string
 * representation.
 *
 * This function provides a human-readable string representation for a given
 * AnchorType enum value. The string can be used for logging, debugging, or
 * displaying information in a user interface.
 *
 * @param anchor_type The AnchorType enum value to convert.
 * @return A string representation of the enum value.
 */
std::string AnchorTypeToString(AnchorType anchor_type);

/**
 * @enum RetrievalType
 * @brief Defines the types of retrieval strategies in attention mechanisms.
 *
 * This enum specifies different retrieval strategies that can be used in
 * attention mechanisms, such as layer-level retrieval, key-value head-level
 * retrieval, or query head-level retrieval.
 */
enum RetrievalType {
    LAYER,  /**< Retrieval at the layer level. */
    KVHEAD, /**< Retrieval at the key-value head level. */
    QHEAD   /**< Retrieval at the query head level. */
};

/**
 * @brief Converts a RetrievalType enum value to its corresponding string
 * representation.
 *
 * This function provides a human-readable string representation for a given
 * RetrievalType enum value. The string can be used for logging, debugging, or
 * displaying information in a user interface.
 *
 * @param retrieval_type The RetrievalType enum value to convert.
 * @return A string representation of the enum value.
 */
std::string RetrievalTypeToString(RetrievalType retrieval_type);

/**
 * @struct KVCacheConfig
 * @brief Configuration structure for Key-Value (KV) Cache.
 *
 * This structure holds configuration parameters for setting up and managing
 * a Key-Value (KV) Cache used in various attention mechanisms. It includes
 * parameters such as the number of layers, the number of heads, the dimension
 * of each head, block length, anchor information, and memory-related settings.
 */
struct KVCacheConfig {
    int layer_num;   /**< Number of layers in the model. */
    int kv_head_num; /**< Number of heads in the KV Cache. */
    int q_head_num;  /**< Number of heads in the query. */
    int head_dim;    /**< Dimension of each head. */
    int block_len;   /**< Length of each block in the cache. */
    int anchor_num;  /**< Number of anchors used in attention. */

    ggml_type kv_type; /**< Data type of the KV Cache (e.g., fp16, q8_0). */

    // Controls the pre-allocated memory size
    int max_block_num;  /**< Maximum number of blocks that can be allocated. */
    int max_batch_size; /**< Maximum batch size that can be processed. */
    int max_thread_num; /**< Maximum number of threads that can be used. */

    AnchorType
        anchor_type; /**< Type of anchors used in the attention mechanism. */
    RetrievalType
        retrieval_type; /**< Type of retrieval strategy used in the cache. */

    int layer_step;   /**< Step size between layers. */
    int token_step;   /**< Step size between tokens. */
    int layer_offset; /**< Offset value for layers. */

    /**
     * @brief Default constructor for KVCacheConfig.
     *
     * Initializes the configuration with default values. This constructor
     * does not initialize any member variables explicitly.
     */
    KVCacheConfig() = default;

    /**
     * @brief Parameterized constructor for KVCacheConfig.
     *
     * This constructor initializes the configuration with specific values
     * for all member variables.
     *
     * @param layer_num The number of layers in the model.
     * @param kv_head_num The number of heads in the KV Cache.
     * @param q_head_num The number of heads in the query.
     * @param head_dim The dimension of each head.
     * @param block_len The length of each block in the cache.
     * @param anchor_num The number of anchors used in attention.
     * @param anchor_type The type of anchors used in the attention mechanism.
     * @param kv_type The data type of the KV Cache (e.g., fp16, q8_0).
     * @param retrieval_type The type of retrieval strategy used in the cache.
     * @param layer_step The step size between layers.
     * @param token_step The step size between tokens.
     * @param layer_offset The offset value for layers.
     * @param max_block_num The maximum number of blocks that can be allocated.
     * @param max_batch_size The maximum batch size that can be processed.
     * @param max_thread_num The maximum number of threads that can be used.
     */
    KVCacheConfig(int layer_num, int kv_head_num, int q_head_num, int head_dim,
                  int block_len, int anchor_num, AnchorType anchor_type,
                  ggml_type kv_type, RetrievalType retrieval_type,
                  int layer_step, int token_step, int layer_offset,
                  int max_block_num, int max_batch_size, int max_thread_num);
};

/**
 * @class KVCache
 * @brief Manages the Key-Value (KV) Cache used in attention mechanisms.
 *
 * The KVCache class provides functionality for managing the Key-Value Cache,
 * including resizing the cache, retrieving configuration parameters, and
 * updating internal states. This class is typically used in transformer models
 * to store and manage past key and value states for efficient attention
 * computations.
 */
class KVCache {
  public:
    /**
     * @brief Constructs a KVCache object with the given configuration.
     *
     * Initializes the KVCache with the specified configuration parameters,
     * such as the number of layers, heads, head dimensions, and other
     * relevant settings.
     *
     * @param config The configuration object containing initialization
     * parameters.
     */
    KVCache(KVCacheConfig config);

    /**
     * @brief Resizes the number of threads used by the cache.
     *
     * This function adjusts the number of threads that the cache can utilize.
     * It allows dynamic reconfiguration of the parallel processing capabilities
     * based on the current workload or system resources.
     *
     * @param thread_num The new number of threads to use.
     */
    void ThreadResize(int thread_num);

    /**
     * @brief Resizes the batch size managed by the cache.
     *
     * This function adjusts the batch size that the cache can handle. It
     * is useful when the input batch size changes dynamically, allowing
     * the cache to be reconfigured accordingly.
     *
     * @param batch_size The new batch size.
     */
    void BatchResize(int batch_size);

    /**
     * @brief Resizes the number of blocks managed by the cache.
     *
     * This function adjusts the number of blocks that the cache can manage.
     * It allows dynamic reconfiguration of the block structure based on the
     * current sequence length or other factors.
     *
     * @param block_num The new number of blocks.
     */
    void BlockResize(int block_num);

    /**
     * @brief Gets the number of layers in the cache.
     *
     * @return The number of layers configured in the cache.
     */
    int get_layer_num() { return config_.layer_num; }

    /**
     * @brief Gets the number of KV heads in the cache.
     *
     * @return The number of KV heads configured in the cache.
     */
    int get_kv_head_num() { return config_.kv_head_num; }

    /**
     * @brief Gets the number of query heads in the cache.
     *
     * @return The number of query heads configured in the cache.
     */
    int get_q_head_num() { return config_.q_head_num; }

    /**
     * @brief Gets the dimension of each head in the cache.
     *
     * @return The dimension of each head.
     */
    int get_head_dim() { return config_.head_dim; }

    /**
     * @brief Gets the length of each block in the cache.
     *
     * @return The length of each block.
     */
    int get_block_len() { return config_.block_len; }

    /**
     * @brief Gets the number of blocks for a specific layer.
     *
     * @param layer_id The ID of the layer for which to retrieve the block
     * number.
     * @return The number of blocks in the specified layer.
     */
    int get_block_num(int layer_id) { return past_block_num_[layer_id]; }

    /**
     * @brief Gets the number of anchors in the cache.
     *
     * @return The number of anchors configured in the cache.
     */
    int get_anchor_num() { return config_.anchor_num; }

    /**
     * @brief Gets the total length of the cache.
     *
     * @return The total length of the cache.
     */
    int get_cache_total_len() { return cache_total_len_; }

    /**
     * @brief Gets the total number of blocks in the cache.
     *
     * This function computes and returns the total number of blocks in the
     * cache based on the total cache length and the block length configuration.
     *
     * @return The total number of blocks in the cache.
     */
    int get_cache_total_block_num() {
        return (cache_total_len_ + config_.block_len - 1) / config_.block_len;
    }

    /**
     * @brief Updates the total length of the cache.
     *
     * This function sets a new total length for the cache, allowing dynamic
     * adjustment of the cache size during runtime.
     *
     * @param cache_total_len The new total length of the cache.
     */
    void update_cache_total_len(int cache_total_len) {
        cache_total_len_ = cache_total_len;
    }
    void attn(const ggml_fp16_t *q_in, ggml_fp16_t *output, float *attn_lse,
              int layer_idx, int generate_token_idx, int q_len, int batch_size,
              int max_block_num, int *block_table, int *cache_seqlens,
              int pick_block_num, int init_block_num, int local_block_num,
              Backend *backend);

    void update_kvcache_one_block_fp16(const ggml_fp16_t *k_in,
                                       const ggml_fp16_t *v_in, int layer_id,
                                       int block_idx, Backend *backend);

    void get_kvcache_one_block_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
                                    int layer_id, int block_idx,
                                    Backend *backend);

    void update_importance_one_block(const ggml_fp16_t *importance,
                                     int layer_id, int block_idx,
                                     Backend *backend);
    void get_importance_one_block(ggml_fp16_t *importance, int layer_id,
                                  int block_idx, Backend *backend);

    void get_anchor_one_block(ggml_fp16_t *anchor, int layer_id, int block_idx,
                              Backend *backend);

    void update_anchor_one_block(const ggml_fp16_t *anchor, int layer_id,
                                 int block_idx, Backend *backend);

    void calc_anchor_all_layers(int *block_table, int *cache_seqlens,
                                int batch_size, int max_block_num,
                                Backend *backend);

    void load_kvcache(std::string tensor_file_path, Backend *backend);
    void dump_kvcache(int *block_table, int cache_total_len,
                      std::string tensor_file_path, Backend *backend);

    void get_and_update_kvcache_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
                                     int layer_id, int *block_table,
                                     int batch_size, int max_block_num,
                                     int *cache_seqlens, int q_len,
                                     Backend *backend);

    void get_kvcache_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in, int layer_id,
                          int *block_table, int batch_size, int max_block_num,
                          int *cache_seqlens, Backend *backend);

    void update_kvcache_fp16(const ggml_fp16_t *k_in, const ggml_fp16_t *v_in,
                             int layer_id, int *block_table, int batch_size,
                             int max_block_num, int *cache_seqlens, int q_len,
                             Backend *backend);

    void update_importance(const ggml_fp16_t *importance, int layer_id,
                           int *block_table, int batch_size, int max_block_num,
                           int *offset, int width, Backend *backend);

    void attn_with_kvcache(const ggml_fp16_t *q_in, const ggml_fp16_t *k_in,
                           const ggml_fp16_t *v_in, ggml_fp16_t *output,
                           float *attn_lse, int layer_idx,
                           int generate_token_idx, int q_len, int batch_size,
                           int max_block_num, int *block_table,
                           int *cache_seqlens, int topk, int local,
                           Backend *backend);

    void clear_importance_all_layers(int *block_table, int *cache_seqlens,
                                     int batch_size, int max_block_num,
                                     Backend *backend);

    void clear_kvcache_all_layers(int *block_table, int *cache_seqlens,
                                  int batch_size, int max_block_num,
                                  Backend *backend);

    void get_sincos(ggml_fp16_t *sin, ggml_fp16_t *cos, int seqlen);

    void get_attn_sparsity(const ggml_fp16_t *q_in, float *attn_sparsity,
                           int layer_idx, int generate_token_idx, int q_len,
                           int batch_size, int max_block_num, int *block_table,
                           int *cache_seqlens, int *block_table_origin,
                           int *cache_seqlens_origin, int max_block_num_origin,
                           int topk, int local, Backend *backend);

    void get_all_kvcache_one_layer(int layer_id, ggml_fp16_t *k_in,
                                   ggml_fp16_t *v_in, Backend *backend);

  private:
    // Persistent data
    KVCacheConfig config_;
    int n_gqa_;                            // q_head_num / kv_head_num
    int cache_total_len_;                  // Number of tokens in cache
    std::vector<uint64_t> past_block_num_; // [layer_num]
    std::vector<std::vector<std::vector<std::vector<block_q4_0>>>>
        k_cache_q4; // [layer_num, kv_head_num, past_block_num, block_len *
                    // (head_dim / QK_4)]
    std::vector<std::vector<std::vector<std::vector<block_q4_0>>>>
        v_cache_q4; // [layer_num, kv_head_num, past_block_num, head_dim *
                    // (block_len / QK_4)]
    std::vector<std::vector<std::vector<std::vector<block_q8_0>>>>
        k_cache_q8; // [layer_num, kv_head_num, past_block_num, block_len *
                    // (head_dim / QK_8)]
    std::vector<std::vector<std::vector<std::vector<block_q8_0>>>>
        v_cache_q8; // [layer_num, kv_head_num, past_block_num, head_dim *
                    // (block_len / QK_8)]

    std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
        k_cache_fp16_; // [layer_num, kv_head_num, past_block_num, block_len *
                       // head_dim]
    std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
        v_cache_fp16_; // [layer_num, kv_head_num, past_block_num, head_dim *
                       // block_len]

    std::vector<std::vector<std::vector<std::vector<ggml_fp16_t>>>>
        importance_; // [layer_num, past_block_num, block_len,
                     // attention_head_num]

    std::vector<ggml_fp16_t>
        anchor_; // [layer_num * past_block_num * anchor_num *
                 // attention_head_num * head_dim]

    // Runtime data
    int64_t layer_id_;
    int64_t block_idx_;
    int *block_table_;
    uint64_t block_num_;
    int max_block_num_after_retrieval_;

    // Rotary positional embeddings
    std::vector<std::vector<ggml_fp16_t>> sin_; // [seq_len, head_dim]
    std::vector<std::vector<ggml_fp16_t>> cos_; // [seq_len, head_dim]

    // update/get
    int seq_len_;
    uint16_t *k_scales_;        // q4_0
    uint8_t *k_in_;             // q4_0
    uint16_t *v_scales_;        // q4_0
    uint8_t *v_in_;             // q4_0
    uint16_t *k_data_;          // fp16
    uint16_t *v_data_;          // fp16
    uint16_t *importance_data_; // fp16
    uint16_t *anchor_data_;     // fp16

    // sparsity = (sigma(block lse / lse))
    std::vector<std::vector<std::vector<float>>>
        block_lse_; // [batch_size, max_block_num, q_head_num]
    std::vector<std::vector<float>> attn_sparsity_; // [batch_size, q_head_num]

    // attn
    std::vector<std::vector<float>>
        avg_q; // [batch_size, q_head_num * head_dim]

    std::vector<std::vector<ggml_fp16_t>>
        avg_q_fp16; // [batch_size, q_head_num * head_dim]
    std::vector<
        std::priority_queue<std::pair<float, int>,
                            std::vector<std::pair<float, int>>, std::greater<>>>
        top_similar_block_;

    std::vector<std::vector<float>> block_similar_;
    std::vector<std::vector<std::vector<float>>> block_similar_kv_head_;
    std::vector<std::vector<std::vector<float>>> block_similar_q_head_;

    std::vector<int> cache_seqlens_;               // [batch_size]
    std::vector<int> selected_blocks_num_history_; // [layer_num // layer_step]

    std::vector<std::vector<std::vector<int>>> selected_blocks_history_;
    // [layer_num // layer_step, batch_size, max_block_num]

    std::vector<std::vector<std::vector<std::vector<int>>>>
        selected_blocks_history_kvhead_; // [layer_num // layer_step,
                                         // batch_size, max_block_num,
                                         // kv_head_num]

    std::vector<std::vector<int>>
        block_table_before_retrieval_; // [batch_size, max_block_num]
    std::vector<std::vector<int>>
        block_table_after_retrieval_; // [batch_size, pick_block_num]

    std::vector<std::vector<std::vector<int>>>
        block_table_before_retrieval_qhead_; // [batch_size, max_block_num,
                                             // q_head_num]
    std::vector<std::vector<std::vector<int>>>
        block_table_after_retrieval_qhead_; // [batch_size, pick_block_num,
                                            // q_head_num]

    std::vector<std::vector<std::vector<int>>>
        block_table_before_retrieval_kvhead_; // [batch_size, max_block_num,
                                              // kv_head_num]
    std::vector<std::vector<std::vector<int>>>
        block_table_after_retrieval_kvhead_; // [batch_size, pick_block_num,
                                             // kv_head_num]

    std::vector<std::vector<std::unique_ptr<std::mutex>>>
        mutex_; // [batch_size, kv_head_num]
    std::vector<std::vector<std::vector<block_q8_0>>>
        q_q8_0_; // [batch_size, kv_head_num, n_gqa * head_dim / QK8_0]
    std::vector<std::vector<std::vector<float>>>
        q_fp32_; // [batch_size, kv_head_num, n_gqa * head_dim]

    std::vector<std::vector<std::vector<float>>>
        output_fp32_; // [batch_size, kv_head_num, n_gqa * head_dim]
    std::vector<std::vector<std::vector<float>>>
        attn_lse_; // [batch_size, kv_head_num, n_gqa]

    std::vector<std::pair<int, int>> thread_cur_head_idx_; // [thread_num]

    std::vector<std::vector<block_q8_0>>
        thread_local_output_q8_0_; // [thread_num, n_gqa * head_dim / QK8_0]
    std::vector<std::vector<float>>
        thread_local_attn_score_; // [thread_num, n_gqa * block_len]
    std::vector<std::vector<float>>
        thread_local_output_fp32_; // [thread_num, n_gqa * head_dim]
    std::vector<std::vector<float>>
        thread_local_attn_lse_; // [thread_num, n_gqa]
    std::vector<std::vector<float>>
        thread_local_cur_output_fp32_; // [thread_num, n_gqa * head_dim]
    std::vector<std::vector<float>>
        thread_local_cur_attn_lse_; // [thread_num, n_gqa]
    std::vector<std::vector<uint8_t>>
        thread_local_attn_mask_; // [thread_num, block_len // 8]
    std::vector<std::vector<char>>
        thread_local_draft_; // [thread_num, 2 * n_gqa * block_len + 6 * n_gqa *
                             // head_dim + 2 * block_len * head_dim]

    // tmp space
    std::vector<float> q_fp32; // [n_gqa * head_dim]

    void quantize_q_(const uint16_t *q_in_data, int batch_size);
    void attn_initialize_layer_(int batch_size, int layer_idx, int *block_table,
                                int &max_block_num, int *cache_seqlens);
    void attn_initialize_kvhead_(int batch_size, int layer_idx,
                                 int *block_table, int &max_block_num,
                                 int *cache_seqlens);
    void retrieval_kvcache_layer_(const uint16_t *q_in_data, int init_block_num,
                                  int local_block_num, int pick_block_num,
                                  int q_len, int generate_token_idx,
                                  int batch_size, int layer_idx,
                                  int *cache_seqlens, int &max_block_num,
                                  Backend *backend);
    void retrieval_kvcache_kvhead_(const uint16_t *q_in_data,
                                   int init_block_num, int local_block_num,
                                   int pick_block_num, int q_len,
                                   int generate_token_idx, int batch_size,
                                   int layer_idx, int *cache_seqlens,
                                   int &max_block_num, Backend *backend);

    void calculate_block_similarity_layer_(
        const uint16_t *q_in_data, int batch_size, int layer_idx, int q_len,
        int max_block_num, int *cache_seqlens, int init_block_num,
        int local_block_num, int pick_block_num, Backend *backend);
    void calculate_block_similarity_kvhead_(
        const uint16_t *q_in_data, int batch_size, int layer_idx, int q_len,
        int max_block_num, int *cache_seqlens, int init_block_num,
        int local_block_num, int pick_block_num, Backend *backend);

    void select_block_layer_(int batch_size, int layer_idx, int max_block_num,
                             int init_block_num, int local_block_num,
                             int pick_block_num);
    void select_block_kvhead_(int batch_size, int layer_idx, int max_block_num,
                              int init_block_num, int local_block_num,
                              int pick_block_num);

    void calculate_sparsity_layer_(const uint16_t *q_in_data,
                                   float *attn_sparsity, int batch_size,
                                   int max_block_num, int *block_table,
                                   int *cache_seqlens, Backend *backend);
    void calculate_sparsity_kvhead_(const uint16_t *q_in_data,
                                    float *attn_sparsity, int batch_size,
                                    int max_block_num, int *block_table,
                                    int *cache_seqlens, Backend *backend);

    void attention_kvhead_(const uint16_t *q_in_data, ggml_fp16_t *output,
                           float *attn_lse, int batch_size, Backend *backend);
    void attention_layer_(const uint16_t *q_in_data, ggml_fp16_t *output,
                          float *attn_lse, int batch_size, Backend *backend);

    /**
     * @brief Computes attention with KV cache for one block.
     *
     * This function performs attention computation for one block using KV
     * cache. The function supports different data types for Q, K, and V caches,
     * and provides options for quantization. The function does not perform any
     * dynamic memory allocation internally, so all necessary buffers must be
     * pre-allocated externally.
     *
     * @param head_dim The dimension of the head.
     * @param bsz The batch size.
     * @param q_type The data type of Q (GGML data type). Only supports fp16 and
     * q8_0.
     * @param q Pointer to the Q tensor [bsz, head_dim]. The quantization is
     *          always applied along the head_dim dimension. The size must be
     *          bsz * head_dim/32 * qtype_size. If head_dim % 32 != 0, an error
     *          will be raised.
     * @param past_kv_len The length of the past KV cache.
     * @param past_kv_offset The offset in the past KV cache.
     * @param is_full_attn Boolean flag indicating whether to use full attention
     *                     (true for full 1 mask).
     * @param attn_mask Pointer to the attention mask [bsz, past_kv_len]. If
     *                  is_full_attn = false, a bit matrix is passed to
     * represent the mask.
     * @param k_type The data type of K cache (GGML data type). Only supports
     *               fp16, q4_0, and q8_0.
     * @param k_quant_type Quantization type for K cache. 0 for per_token, 1 for
     *                     per_channel. Other values will raise an error.
     * @param k_cache Pointer to the K cache tensor [seq_len, head_dim]. If
     *                quant_type == 0, head_dim % 32 must be 0. If quant_type ==
     * 1, seq_len % 32 must be 0.
     * @param num_k_anchor The number of K anchors. If num_k_anchor == 0, it
     * means no anchor is present.
     * @param k_cache_anchors Pointer to the K cache anchors [num_k_anchor,
     * head_dim]. The k_anchor_type must be fp16.
     * @param k_cache_anchor_pos Pointer to the K cache anchor positions. Each
     * token is associated with the nearest previous anchor position.
     * @param v_type The data type of V cache (GGML data type).
     * @param v_quant_type Quantization type for V cache.
     * @param v_cache Pointer to the V cache tensor [head_dim, seq_len].
     * @param num_v_anchor The number of V anchors.
     * @param v_cache_anchors Pointer to the V cache anchors.
     * @param v_cache_anchor_pos Pointer to the V cache anchor positions.
     * @param attn_score Pre-allocated buffer for attention scores [bsz,
     * past_kv_len].
     * @param output Output tensor [bsz, head_dim] with the same type as q_type.
     * @param lse Pre-allocated buffer [bsz] for the log-sum-exp of the
     * attention scores.
     * @param draft Pre-allocated temporary buffer. The buffer size should be
     * enough to hold (2 * bsz * past_kv_len + 6 * bsz * head_dim + 2 *
     *              past_kv_len * head_dim + past_kv_len * head_dim / 32) bytes.
     * @param rotary_angle Pointer to the rotary angle tensor.
     * @param rotary_cos Pointer to the cosine values for rotary embedding.
     * @param rotary_sin Pointer to the sine values for rotary embedding.
     */
    void attn_with_kvcache_one_block_(
        int head_dim, int bsz,
        ggml_type q_type, // GGML data type of `Q`, only supports fp16 and q8_0
        // [bsz, head_dim]
        // Quantization is always on the head_dim dimension (per_token). If
        // head_dim % 32 != 0, an error will be raised. The size must be bsz *
        // head_dim/32 * qtype_size.
        const void *q,

        int past_kv_len, int past_kv_offset,
        bool is_full_attn, // true indicates a full 1 mask
        // If is_full_attn = false, a bit matrix representing the mask is
        // passed. [bsz, past_kv_len]
        const uint8_t *attn_mask,

        ggml_type k_type, // GGML data type of `K Cache`, only supports fp16,
                          // q4_0, q8_0
        int k_quant_type, // 0 for per_token, 1 for per_channel, others raise an
                          // error
        // [seq_len, head_dim]
        // If quant_type == 0, head_dim % 32 must be 0.
        // If quant_type == 1, seq_len % 32 must be 0.
        const void *k_cache,

        // k_anchor_type must be fp16
        int num_k_anchor, // num_k_anchor == 0 indicates no anchor
        // [num_k_anchor, head_dim]
        const void *k_cache_anchors,
        // Each token is associated with the nearest previous position's anchor,
        // with the same distance.
        const int *k_cache_anchor_pos,

        // v_cache similar to k_cache
        ggml_type v_type, int v_quant_type,
        // [head_dim, seq_len]
        const void *v_cache, int num_v_anchor, const void *v_cache_anchors,
        const int *v_cache_anchor_pos,

        // Pre-allocated buffer for intermediate calculations [bsz,
        // past_kv_len]. No malloc is performed inside this function.
        float *attn_score,

        // Output: [bsz, head_dim], with the same type as q_type
        void *output,
        // [bsz]
        float *lse,

        // Pre-allocated temporary buffer with sufficient size:
        // (2 * bsz * past_kv_len + 6 * bsz * head_dim + 2 * past_kv_len *
        // head_dim + past_kv_len * head_dim / 32) bytes.
        void *draft,

        // Apply rotary embedding online
        const int *rotary_angle, const void *rotary_cos, const void *rotary_sin
        // rotary_cos=None,
        // rotary_sin=None,
        // cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
        // cache_batch_idx: Optional[torch.Tensor] = None,
        // rotary_interleaved=True,

        // // Not supported for now
        // window_size=(-1, -1),  # -1 means infinite context window
        // alibi_slopes=None,
    );
};

/**
 * @brief Scales a float32 vector by a given scalar value.
 *
 * This function multiplies each element of the input vector `y` by a scalar
 * `v`. It uses platform-specific optimizations if available, such as Apple's
 * Accelerate framework or SIMD instructions. If no specific optimization is
 * available, the function falls back to a simple scalar multiplication loop.
 *
 * @param n The number of elements in the vector `y`.
 * @param y The input vector to be scaled. The result will be stored in the same
 * vector.
 * @param v The scalar value by which to scale the vector.
 */
void ggml_vec_scale_f32(const int n, float *y, const float v);
#endif

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/kvcache/kvcache_attn.cpp
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "kvcache.h"

#include <chrono>

void KVCache::attention_kvhead_(const uint16_t *q_in_data, ggml_fp16_t *output,
                                float *attn_lse, int batch_size,
                                Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    seq_len_ = config_.block_len;

    backend->do_work_stealing_job(
        batch_size * config_.kv_head_num * max_block_num_after_retrieval_,
        [&](int thread_id) {
            thread_cur_head_idx_[thread_id].first = -1;
            thread_cur_head_idx_[thread_id].second = -1;
        },
        [&](int task_id) {
            int batch_id = task_id / (config_.kv_head_num *
                                      max_block_num_after_retrieval_);
            int head_id = (task_id % (config_.kv_head_num *
                                      max_block_num_after_retrieval_)) /
                          max_block_num_after_retrieval_;
            int block_id = task_id % max_block_num_after_retrieval_;
            int thread_id = Backend::thread_local_id;

            // If the block is out of the sequence length, skip it.
            if (cache_seqlens_[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx =
                block_table_after_retrieval_kvhead_[batch_id][block_id]
                                                   [head_id];
            if (cache_seqlens_[batch_id] / config_.block_len == block_id) {
                int seq_len = cache_seqlens_[batch_id] % config_.block_len;
                if (seq_len == 0)
                    return;

                // Prepare the attention mask for the last block.
                int full_blocks = seq_len / 8;
                int remaining_bits = seq_len % 8;
                // Fill full blocks with 1s
                for (int i = 0; i < full_blocks; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0xFF;
                }
                // Fill the remaining bits in the next block
                if (remaining_bits > 0 && full_blocks < seq_len_ / 8) {
                    thread_local_attn_mask_[thread_id][full_blocks] =
                        (1 << remaining_bits) - 1;
                } else {
                    thread_local_attn_mask_[thread_id][full_blocks] = 0;
                }

                for (int i = full_blocks + 1; i < seq_len_ / 8; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0;
                }
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            } else {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, true, nullptr, GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());

                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            }
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (batch_id == cur_batch_idx && head_id == cur_head_id) {
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse =
                        thread_local_cur_attn_lse_[thread_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_attn_lse_[thread_id][i] -
                                     thread_local_cur_attn_lse_[thread_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_[thread_id]
                                                     [i * config_.head_dim +
                                                      j] +=
                            thread_local_output_fp32_[thread_id]
                                                     [i * config_.head_dim + j];
                    }
                    thread_local_cur_attn_lse_[thread_id][i] = new_attn_lse;
                }
            } else {
                if (cur_batch_idx != -1) {
                    mutex_[cur_batch_idx][cur_head_id]->lock();
                    for (int i = 0; i < n_gqa_; i++) {
                        if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                            1e-6) {
                            attn_lse_[cur_batch_idx][cur_head_id][i] =
                                thread_local_cur_attn_lse_[thread_id][i];
                            for (int j = 0; j < config_.head_dim; j++) {
                                output_fp32_[cur_batch_idx][cur_head_id]
                                            [i * config_.head_dim + j] =
                                                thread_local_cur_output_fp32_
                                                    [thread_id]
                                                    [i * config_.head_dim + j];
                            }
                            continue;
                        }
                        float new_attn_lse =
                            attn_lse_[cur_batch_idx][cur_head_id][i] +
                            std::log(
                                1.0 +
                                std::exp(
                                    thread_local_cur_attn_lse_[thread_id][i] -
                                    attn_lse_[cur_batch_idx][cur_head_id][i]));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            output_fp32_[cur_batch_idx][cur_head_id].data() +
                                i * config_.head_dim,
                            std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                     new_attn_lse));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            thread_local_cur_output_fp32_[thread_id].data() +
                                i * config_.head_dim,
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     new_attn_lse));
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] +=
                                thread_local_cur_output_fp32_
                                    [thread_id][i * config_.head_dim + j];
                        }
                        attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                    }
                    mutex_[cur_batch_idx][cur_head_id]->unlock();
                }
                thread_cur_head_idx_[thread_id].first = batch_id;
                thread_cur_head_idx_[thread_id].second = head_id;
                for (int i = 0; i < n_gqa_; i++) {
                    thread_local_cur_attn_lse_[thread_id][i] =
                        thread_local_attn_lse_[thread_id][i];
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_
                            [thread_id][i * config_.head_dim + j] =
                                thread_local_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                }
            }
        },
        // Merge the results of the remaining blocks.
        [&](int thread_id) {
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (cur_head_id != -1) {
                mutex_[cur_batch_idx][cur_head_id]->lock();
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse;
                    if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                        1e-6) {
                        attn_lse_[cur_batch_idx][cur_head_id][i] =
                            thread_local_cur_attn_lse_[thread_id][i];
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] =
                                            thread_local_cur_output_fp32_
                                                [thread_id]
                                                [i * config_.head_dim + j];
                        }
                        continue;
                    }
                    new_attn_lse =
                        attn_lse_[cur_batch_idx][cur_head_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     attn_lse_[cur_batch_idx][cur_head_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        output_fp32_[cur_batch_idx][cur_head_id].data() +
                            i * config_.head_dim,
                        std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        output_fp32_[cur_batch_idx][cur_head_id]
                                    [i * config_.head_dim + j] +=
                            thread_local_cur_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                    attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                }
                mutex_[cur_batch_idx][cur_head_id]->unlock();
            }
        });
    // move the results to output and attn_lse
    uint16_t *output_data = reinterpret_cast<uint16_t *>(output);
    float *attn_lse_data = attn_lse;
    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        for (int i = 0; i < config_.kv_head_num; i++) {
            for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
                output_data[batch_idx * config_.kv_head_num * n_gqa_ *
                                config_.head_dim +
                            i * n_gqa_ * config_.head_dim + j] =
                    GGML_FP32_TO_FP16(output_fp32_[batch_idx][i][j]);
            }
            for (int j = 0; j < n_gqa_; j++) {
                attn_lse_data[batch_idx * config_.kv_head_num * n_gqa_ +
                              i * n_gqa_ + j] = attn_lse_[batch_idx][i][j];
            }
        }
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of computing attention: %f s\n", layer_idx,
    //        diff.count());
}

void KVCache::attention_layer_(const uint16_t *q_in_data, ggml_fp16_t *output,
                               float *attn_lse, int batch_size,
                               Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    seq_len_ = config_.block_len;
    backend->do_work_stealing_job(
        batch_size * config_.kv_head_num * max_block_num_after_retrieval_,
        [&](int thread_id) {
            thread_cur_head_idx_[thread_id].first = -1;
            thread_cur_head_idx_[thread_id].second = -1;
        },
        [&](int task_id) {
            int batch_id = task_id / (config_.kv_head_num *
                                      max_block_num_after_retrieval_);
            int head_id = (task_id % (config_.kv_head_num *
                                      max_block_num_after_retrieval_)) /
                          max_block_num_after_retrieval_;
            int block_id = task_id % max_block_num_after_retrieval_;
            int thread_id = Backend::thread_local_id;
            // If the block is out of the sequence length, skip it.
            if (cache_seqlens_[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx = block_table_after_retrieval_[batch_id][block_id];
            if (cache_seqlens_[batch_id] / config_.block_len == block_id) {
                int seq_len = cache_seqlens_[batch_id] % config_.block_len;
                if (seq_len == 0)
                    return;

                // Prepare the attention mask for the last block.
                int full_blocks = seq_len / 8;
                int remaining_bits = seq_len % 8;

                // Fill full blocks with 1s
                for (int i = 0; i < full_blocks; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0xFF;
                }
                // Fill the remaining bits in the next block
                if (remaining_bits > 0 && full_blocks < seq_len_ / 8) {
                    thread_local_attn_mask_[thread_id][full_blocks] =
                        (1 << remaining_bits) - 1;
                } else {
                    thread_local_attn_mask_[thread_id][full_blocks] = 0;
                }

                for (int i = full_blocks + 1; i < seq_len_ / 8; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0;
                }
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            } else {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, true, nullptr, GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());

                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            }
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (batch_id == cur_batch_idx && head_id == cur_head_id) {
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse =
                        thread_local_cur_attn_lse_[thread_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_attn_lse_[thread_id][i] -
                                     thread_local_cur_attn_lse_[thread_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_[thread_id]
                                                     [i * config_.head_dim +
                                                      j] +=
                            thread_local_output_fp32_[thread_id]
                                                     [i * config_.head_dim + j];
                    }
                    thread_local_cur_attn_lse_[thread_id][i] = new_attn_lse;
                }
            } else {
                if (cur_batch_idx != -1) {
                    mutex_[cur_batch_idx][cur_head_id]->lock();
                    for (int i = 0; i < n_gqa_; i++) {
                        if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                            1e-6) {
                            attn_lse_[cur_batch_idx][cur_head_id][i] =
                                thread_local_cur_attn_lse_[thread_id][i];
                            for (int j = 0; j < config_.head_dim; j++) {
                                output_fp32_[cur_batch_idx][cur_head_id]
                                            [i * config_.head_dim + j] =
                                                thread_local_cur_output_fp32_
                                                    [thread_id]
                                                    [i * config_.head_dim + j];
                            }
                            continue;
                        }
                        float new_attn_lse =
                            attn_lse_[cur_batch_idx][cur_head_id][i] +
                            std::log(
                                1.0 +
                                std::exp(
                                    thread_local_cur_attn_lse_[thread_id][i] -
                                    attn_lse_[cur_batch_idx][cur_head_id][i]));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            output_fp32_[cur_batch_idx][cur_head_id].data() +
                                i * config_.head_dim,
                            std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                     new_attn_lse));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            thread_local_cur_output_fp32_[thread_id].data() +
                                i * config_.head_dim,
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     new_attn_lse));
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] +=
                                thread_local_cur_output_fp32_
                                    [thread_id][i * config_.head_dim + j];
                        }
                        attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                    }
                    mutex_[cur_batch_idx][cur_head_id]->unlock();
                }
                thread_cur_head_idx_[thread_id].first = batch_id;
                thread_cur_head_idx_[thread_id].second = head_id;
                for (int i = 0; i < n_gqa_; i++) {
                    thread_local_cur_attn_lse_[thread_id][i] =
                        thread_local_attn_lse_[thread_id][i];
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_
                            [thread_id][i * config_.head_dim + j] =
                                thread_local_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                }
            }
        },
        // Merge the results of the remaining blocks.
        [&](int thread_id) {
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (cur_head_id != -1) {
                mutex_[cur_batch_idx][cur_head_id]->lock();
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse;
                    if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                        1e-6) {
                        attn_lse_[cur_batch_idx][cur_head_id][i] =
                            thread_local_cur_attn_lse_[thread_id][i];
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] =
                                            thread_local_cur_output_fp32_
                                                [thread_id]
                                                [i * config_.head_dim + j];
                        }
                        continue;
                    }
                    new_attn_lse =
                        attn_lse_[cur_batch_idx][cur_head_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     attn_lse_[cur_batch_idx][cur_head_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        output_fp32_[cur_batch_idx][cur_head_id].data() +
                            i * config_.head_dim,
                        std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        output_fp32_[cur_batch_idx][cur_head_id]
                                    [i * config_.head_dim + j] +=
                            thread_local_cur_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                    attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                }
                mutex_[cur_batch_idx][cur_head_id]->unlock();
            }
        });

    // move the results to output and attn_lse
    uint16_t *output_data = reinterpret_cast<uint16_t *>(output);
    float *attn_lse_data = attn_lse;
    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        for (int i = 0; i < config_.kv_head_num; i++) {
            for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
                output_data[batch_idx * config_.kv_head_num * n_gqa_ *
                                config_.head_dim +
                            i * n_gqa_ * config_.head_dim + j] =
                    GGML_FP32_TO_FP16(output_fp32_[batch_idx][i][j]);
            }
            for (int j = 0; j < n_gqa_; j++) {
                attn_lse_data[batch_idx * config_.kv_head_num * n_gqa_ +
                              i * n_gqa_ + j] = attn_lse_[batch_idx][i][j];
            }
        }
    }
    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    //     printf("layer %d time of computing attention: %f s\n", layer_id_,
    //     diff.count());
}

void KVCache::attn(const ggml_fp16_t *q_in, ggml_fp16_t *output,
                   float *attn_lse, int layer_idx, int generate_token_idx,
                   int q_len, int batch_size, int max_block_num,
                   int *block_table, int *cache_seqlens, int pick_block_num,
                   int init_block_num, int local_block_num, Backend *backend) {

    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    layer_id_ = layer_idx;
    batch_size = batch_size * q_len;

    const uint16_t *q_in_data = const_cast<const uint16_t *>(q_in);

    quantize_q_(q_in_data, batch_size);
    if (config_.retrieval_type == RetrievalType::LAYER) {
        attn_initialize_layer_(batch_size, layer_idx, block_table,
                               max_block_num, cache_seqlens);
        retrieval_kvcache_layer_(q_in_data, init_block_num, local_block_num,
                                 pick_block_num, q_len, generate_token_idx,
                                 batch_size, layer_idx, cache_seqlens,
                                 max_block_num, backend);
        attention_layer_(q_in_data, output, attn_lse, batch_size, backend);
    } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
        attn_initialize_kvhead_(batch_size, layer_idx, block_table,
                                max_block_num, cache_seqlens);
        retrieval_kvcache_kvhead_(q_in_data, init_block_num, local_block_num,
                                  pick_block_num, q_len, generate_token_idx,
                                  batch_size, layer_idx, cache_seqlens,
                                  max_block_num, backend);
        attention_kvhead_(q_in_data, output, attn_lse, batch_size, backend);
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of computing attention: %f s\n", layer_idx,
    //        diff.count());
}

void KVCache::attn_with_kvcache(
    const ggml_fp16_t *q_in, const ggml_fp16_t *k_in, const ggml_fp16_t *v_in,
    ggml_fp16_t *output, float *attn_lse, int layer_idx, int generate_token_idx,
    int q_len, int batch_size, int max_block_num, int *block_table,
    int *cache_seqlens, int topk, int local, Backend *backend) {
    //    printf("attn_with_kvcache start\n");
    assert(q_len == 1);
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_idx;

    update_kvcache_fp16(k_in, v_in, layer_idx, block_table, batch_size,
                        max_block_num, cache_seqlens, q_len, backend);
    //    printf("update finished.\n");

    // cache_seqlens memory is modified.
    for (int i = 0; i < batch_size; i++) {
        cache_seqlens[i] += q_len;
    }
    int init_block_num = 1;
    if (config_.block_len <= 32) {
        init_block_num = 64 / config_.block_len;
    }

    attn(q_in, output, attn_lse, layer_idx, generate_token_idx, q_len,
         batch_size, max_block_num, block_table, cache_seqlens, topk,
         init_block_num, local, backend);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    //     printf("layer %d time of computing attention with kvcache: %f s\n",
    //     layer_idx, diff.count());
}

void KVCache::quantize_q_(const uint16_t *q_in_data, int batch_size) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            // quantize q
            for (int i = 0; i < config_.kv_head_num; i++) {
                for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
                    q_fp32_[batch_idx][i][j] = GGML_FP16_TO_FP32(
                        q_in_data[batch_idx * config_.kv_head_num * n_gqa_ *
                                      config_.head_dim +
                                  i * n_gqa_ * config_.head_dim + j]);
                }
            }
        } else {
            // quantize q
            for (int i = 0; i < config_.kv_head_num; i++) {
                for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
                    q_fp32[j] = GGML_FP16_TO_FP32(
                        q_in_data[batch_idx * config_.kv_head_num * n_gqa_ *
                                      config_.head_dim +
                                  i * n_gqa_ * config_.head_dim + j]);
                }
                quantize_row_q8_0(q_fp32.data(), q_q8_0_[batch_idx][i].data(),
                                  n_gqa_ * config_.head_dim);
            }
        }
    }
    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    // printf("time of quantizing q: %f s\n",
    //        std::chrono::duration<double>(end - start).count());
}
void KVCache::attn_initialize_layer_(int batch_size, int layer_idx,
                                     int *block_table, int &max_block_num,
                                     int *cache_seqlens) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        // initialize output_fp32_ and attn_lse_
        for (int i = 0; i < config_.kv_head_num; i++) {
            for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
                output_fp32_[batch_idx][i][j] = 0;
            }
            for (int j = 0; j < n_gqa_; j++) {
                attn_lse_[batch_idx][i][j] = 0;
            }
        }
        // clear top_similar_block_

        while (!top_similar_block_[batch_idx].empty())
            top_similar_block_[batch_idx].pop();
    }

    // get block_table_before_retrieval_ and cache_seqlens_
    if (block_table == nullptr) {
        max_block_num = past_block_num_[layer_idx];
        for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
            if (cache_total_len_ != 0)
                cache_seqlens_[batch_idx] = cache_total_len_;
            else
                cache_seqlens_[batch_idx] = max_block_num * config_.block_len;
            for (int i = 0; i < max_block_num; i++) {
                block_table_before_retrieval_[batch_idx][i] = i;
                block_similar_[batch_idx][i] = 0;
            }
        }
    } else {
        for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
            cache_seqlens_[batch_idx] = cache_seqlens[batch_idx];
            for (int i = 0; i < max_block_num; i++) {
                block_table_before_retrieval_[batch_idx][i] =
                    block_table[batch_idx * max_block_num + i];
                block_similar_[batch_idx][i] = 0;
            }
        }
    }
    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    // printf("layer %d time of initializing attention: %f s\n", layer_idx,
    //        std::chrono::duration<double>(end - start).count());
}

void KVCache::calculate_block_similarity_layer_(
    const uint16_t *q_in_data, int batch_size, int layer_idx, int q_len,
    int max_block_num, int *cache_seqlens, int init_block_num,
    int local_block_num, int pick_block_num, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    if (batch_size == 1 &&
        config_.anchor_num == 1) { // TODO: improve batch_size > 1
        for (int batch_id = 0; batch_id < batch_size; batch_id++) {
            if (q_len == 1) {
                for (int j = 0; j < config_.head_dim * config_.q_head_num;
                     j++) {
                    avg_q[batch_id][j] = GGML_FP16_TO_FP32(
                        q_in_data[batch_id * q_len * config_.q_head_num *
                                      config_.head_dim +
                                  j]);
                    avg_q_fp16[batch_id][j] =
                        q_in_data[batch_id * q_len * config_.q_head_num *
                                      config_.head_dim +
                                  j];
                }
            } else {
                for (int j = 0; j < config_.head_dim * config_.q_head_num;
                     j++) {
                    avg_q[batch_id][j] = 0;
                }
                for (int i = 0; i < q_len; i++) {
                    for (int j = 0; j < config_.head_dim; j++) {
                        avg_q[batch_id][j] += GGML_FP16_TO_FP32(
                            q_in_data[batch_id * q_len * config_.q_head_num *
                                          config_.head_dim +
                                      i * config_.q_head_num *
                                          config_.head_dim +
                                      j]);
                    }
                }
                for (int j = 0; j < config_.head_dim * config_.q_head_num;
                     j++) {
                    avg_q[batch_id][j] /= q_len;
                    avg_q_fp16[batch_id][j] =
                        GGML_FP32_TO_FP16(avg_q[batch_id][j]);
                }
            }
            int seq_len = cache_seqlens_[batch_id];
            int block_num = (seq_len / config_.block_len) - local_block_num -
                            init_block_num;
            if (block_num <= 0) {
                continue;
            }
            bool is_seq = true;
            for (int i = init_block_num + 1;
                 i < (seq_len / config_.block_len) - local_block_num; i++) {
                if (block_table_before_retrieval_[batch_id][i] !=
                    block_table_before_retrieval_[batch_id][i - 1] + 1) {
                    is_seq = false;
                    break;
                }
            }
            if (is_seq) {
                int nth = backend->get_thread_num();
                backend->do_work_stealing_job(
                    nth, nullptr,
                    [&](int task_id) {
                        int ith = task_id;
                        bool ok = llamafile_sgemm(
                            block_num, 1, config_.q_head_num * config_.head_dim,
                            anchor_.data() +
                                (layer_idx * config_.max_block_num +
                                 block_table_before_retrieval_
                                     [batch_id][init_block_num]) *
                                    config_.anchor_num * config_.q_head_num *
                                    config_.head_dim,
                            config_.q_head_num * config_.head_dim,
                            avg_q_fp16[batch_id].data(),
                            config_.q_head_num * config_.head_dim,
                            block_similar_[batch_id].data() + init_block_num,
                            block_num, ith, nth, GGML_TASK_TYPE_COMPUTE,
                            GGML_TYPE_F16, GGML_TYPE_F16, GGML_TYPE_F32,
                            GGML_PREC_DEFAULT);
                        if (!ok) {
                            printf("llamafile_sgemm failed\n");
                        }
                    },
                    nullptr);
            } else {
                backend->do_work_stealing_job(
                    block_num, nullptr,
                    [&](int task_id) {
                        int block_id = task_id + init_block_num;
                        int block_idx =
                            block_table_before_retrieval_[batch_id][block_id];
                        bool ok = llamafile_sgemm(
                            1, 1, config_.q_head_num * config_.head_dim,
                            anchor_.data() +
                                (layer_idx * config_.max_block_num +
                                 block_table_before_retrieval_[batch_id]
                                                              [block_idx]) *
                                    config_.anchor_num * config_.q_head_num *
                                    config_.head_dim,
                            config_.q_head_num * config_.head_dim,
                            avg_q_fp16[batch_id].data(),
                            config_.q_head_num * config_.head_dim,
                            block_similar_[batch_id].data() + block_id, 1, 0, 1,
                            GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F16,
                            GGML_TYPE_F16, GGML_TYPE_F32, GGML_PREC_DEFAULT);
                        if (!ok) {
                            printf("llamafile_sgemm failed\n");
                        }
                    },
                    nullptr);
            }
        }
    } else {
        backend->do_work_stealing_job(
            batch_size * max_block_num, nullptr,
            [&](int task_id) {
                int batch_id = task_id / max_block_num;
                int block_id = task_id % max_block_num;
                int seq_len = cache_seqlens_[batch_id];

                if (block_id < init_block_num ||
                    block_id >=
                        (seq_len / config_.block_len) - local_block_num) {
                    return;
                }

                int block_idx =
                    block_table_before_retrieval_[batch_id][block_id];
                float sim = 0;

                for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
                    for (int i = 0; i < config_.head_dim; i++) {
                        float q_i = 0,
                              qa_i = std::numeric_limits<float>::lowest();
                        for (int q_id = 0; q_id < q_len; q_id++) {
                            q_i += GGML_FP16_TO_FP32(
                                q_in_data[batch_id * q_len *
                                              config_.q_head_num *
                                              config_.head_dim +
                                          q_id * config_.q_head_num *
                                              config_.head_dim +
                                          head_id * config_.head_dim + i]);
                        }
                        q_i /= q_len;
                        for (int anchor_id = 0; anchor_id < config_.anchor_num;
                             anchor_id++) {
                            qa_i = std::max(
                                qa_i,
                                GGML_FP16_TO_FP32(
                                    anchor_[(long long)layer_idx *
                                                config_.max_block_num *
                                                config_.anchor_num *
                                                config_.q_head_num *
                                                config_.head_dim +
                                            block_idx * config_.anchor_num *
                                                config_.q_head_num *
                                                config_.head_dim +
                                            anchor_id * config_.q_head_num *
                                                config_.head_dim +
                                            head_id * config_.head_dim + i]) *
                                    q_i);
                        }
                        sim += qa_i;
                    }
                }
                block_similar_[batch_id][block_id] = sim;
            },
            nullptr);
    }
    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of calculating similarity: %f s\n", layer_idx,
    //        diff.count());
}

void KVCache::select_block_layer_(int batch_size, int layer_idx,
                                  int max_block_num, int init_block_num,
                                  int local_block_num, int pick_block_num) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {

        if (cache_seqlens_[batch_idx] / config_.block_len <=
            init_block_num + pick_block_num + local_block_num) {
            block_table_after_retrieval_[batch_idx].swap(
                block_table_before_retrieval_[batch_idx]);
            selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                         config_.layer_step] = 0;
            continue;
        }

        for (int block_id = init_block_num;
             block_id <
             (cache_seqlens_[batch_idx] / config_.block_len) - local_block_num;
             block_id++) {
            top_similar_block_[batch_idx].push(std::make_pair(
                block_similar_[batch_idx][block_id],
                block_table_before_retrieval_[batch_idx][block_id]));
            if (top_similar_block_[batch_idx].size() > pick_block_num) {
                top_similar_block_[batch_idx].pop();
            }
        }

        int i = 0;
        for (; i < init_block_num; i++) {
            block_table_after_retrieval_[batch_idx][i] =
                block_table_before_retrieval_[batch_idx][i];
        }
        while (!top_similar_block_[batch_idx].empty()) {
            block_table_after_retrieval_[batch_idx][i] =
                top_similar_block_[batch_idx].top().second;
            top_similar_block_[batch_idx].pop();
            i++;
        }
        for (; i < init_block_num + pick_block_num + local_block_num; i++) {
            block_table_after_retrieval_[batch_idx][i] =
                block_table_before_retrieval_[batch_idx]
                                             [(cache_seqlens_[batch_idx] /
                                               config_.block_len) -
                                              local_block_num + i -
                                              init_block_num - pick_block_num];
        }
        if (cache_seqlens_[batch_idx] % config_.block_len != 0) {
            block_table_after_retrieval_[batch_idx][i] =
                block_table_before_retrieval_[batch_idx][(
                    cache_seqlens_[batch_idx] / config_.block_len)];
            cache_seqlens_[batch_idx] =
                (cache_seqlens_[batch_idx] % config_.block_len) +
                i * config_.block_len;
            i++;
        } else {
            cache_seqlens_[batch_idx] =
                (cache_seqlens_[batch_idx] % config_.block_len) +
                i * config_.block_len;
        }
        for (int j = 0; j < i; j++) {
            selected_blocks_history_[(layer_idx - config_.layer_offset) /
                                     config_.layer_step][batch_idx][j] =
                block_table_after_retrieval_[batch_idx][j];
        }
        selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                     config_.layer_step] = i;
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of selecting blocks: %f s\n", layer_idx,
    //        diff.count());
}

// retrieval kvcache, get the init_block_num block at beginning, top
// pick_block_num similar and last local_block_num blocks. Each task
// calculates the simlarity of a certain block with the query, then push
// the block into the priority queue. Finally, the required blocks are
// pushed into the block_table_after_retrieval_.
void KVCache::retrieval_kvcache_layer_(const uint16_t *q_in_data,
                                       int init_block_num, int local_block_num,
                                       int pick_block_num, int q_len,
                                       int generate_token_idx, int batch_size,
                                       int layer_idx, int *cache_seqlens,
                                       int &max_block_num, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    max_block_num_after_retrieval_ = 0;
    if (pick_block_num != -1 &&
        (generate_token_idx % config_.token_step != 0 ||
         (layer_idx % config_.layer_step != config_.layer_offset))) {

        if (selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                         config_.layer_step] == 0) {
            max_block_num_after_retrieval_ = max_block_num;
            block_table_after_retrieval_.swap(block_table_before_retrieval_);
        } else {
            max_block_num_after_retrieval_ = selected_blocks_num_history_
                [(layer_idx - config_.layer_offset) / config_.layer_step];
            for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
                for (int i = 0; i < max_block_num_after_retrieval_; i++) {
                    block_table_after_retrieval_[batch_idx][i] =
                        selected_blocks_history_[(layer_idx -
                                                  config_.layer_offset) /
                                                 config_.layer_step][batch_idx]
                                                [i];
                }

                if (cache_seqlens[batch_idx] % config_.block_len == 1) {
                    selected_blocks_num_history_[(layer_idx -
                                                  config_.layer_offset) /
                                                 config_.layer_step] += 1;
                    int x =
                        selected_blocks_num_history_[(layer_idx -
                                                      config_.layer_offset) /
                                                     config_.layer_step];
                    int last_block_idx =
                        block_table_before_retrieval_[batch_idx]
                                                     [cache_seqlens[batch_idx] /
                                                      config_.block_len];
                    selected_blocks_history_[(layer_idx -
                                              config_.layer_offset) /
                                             config_.layer_step][batch_idx]
                                            [x - 1] = last_block_idx;
                    block_table_after_retrieval_[batch_idx][x - 1] =
                        last_block_idx;
                }
                cache_seqlens_[batch_idx] =
                    (cache_seqlens_[batch_idx] % config_.block_len) +
                    selected_blocks_num_history_[(layer_idx -
                                                  config_.layer_offset) /
                                                 config_.layer_step] *
                        config_.block_len -
                    config_.block_len;
            }
        }
    } else if (pick_block_num != -1) {
        max_block_num_after_retrieval_ =
            std::min(max_block_num,
                     init_block_num + pick_block_num + local_block_num + 1);
        calculate_block_similarity_layer_(q_in_data, batch_size, layer_idx,
                                          q_len, max_block_num, cache_seqlens,
                                          init_block_num, local_block_num,
                                          pick_block_num, backend);
        select_block_layer_(batch_size, layer_idx, max_block_num,
                            init_block_num, local_block_num, pick_block_num);
    } else {
        selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                     config_.layer_step] = 0;
        max_block_num_after_retrieval_ = max_block_num;
        block_table_after_retrieval_.swap(block_table_before_retrieval_);
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    //     printf("layer %d time of retrieval kvcache: %f s\n", layer_idx,
    //     std::chrono::duration<double>(end - start).count());
}
void KVCache::calculate_sparsity_layer_(const uint16_t *q_in_data,
                                        float *attn_sparsity, int batch_size,
                                        int max_block_num, int *block_table,
                                        int *cache_seqlens, Backend *backend

) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    seq_len_ = config_.block_len;
    backend->do_work_stealing_job(
        batch_size * config_.kv_head_num * max_block_num,
        [&](int thread_id) {
            thread_cur_head_idx_[thread_id].first = -1;
            thread_cur_head_idx_[thread_id].second = -1;
        },
        [&](int task_id) {
            int batch_id = task_id / (config_.kv_head_num * max_block_num);
            int head_id = (task_id % (config_.kv_head_num * max_block_num)) /
                          max_block_num;
            int block_id = task_id % max_block_num;
            int thread_id = Backend::thread_local_id;
            // If the block is out of the sequence length, skip it.
            if (cache_seqlens[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx = block_table[batch_id * max_block_num + block_id];
            if (cache_seqlens_[batch_id] / config_.block_len == block_id) {
                int seq_len = cache_seqlens_[batch_id] % config_.block_len;
                if (seq_len == 0)
                    return;

                // Prepare the attention mask for the last block.
                int full_blocks = seq_len / 8;
                int remaining_bits = seq_len % 8;
                // Fill full blocks with 1s
                for (int i = 0; i < full_blocks; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0xFF;
                }
                // Fill the remaining bits in the next block
                if (remaining_bits > 0 && full_blocks < seq_len_ / 8) {
                    thread_local_attn_mask_[thread_id][full_blocks] =
                        (1 << remaining_bits) - 1;
                } else {
                    thread_local_attn_mask_[thread_id][full_blocks] = 0;
                }

                for (int i = full_blocks + 1; i < seq_len_ / 8; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0;
                }
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            } else {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, true, nullptr, GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());

                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            }
            for (int i = 0; i < n_gqa_; i++) {
                block_lse_[batch_id][block_idx][head_id * n_gqa_ + i] =
                    thread_local_attn_lse_[thread_id][i];
            }
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (batch_id == cur_batch_idx && head_id == cur_head_id) {
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse =
                        thread_local_cur_attn_lse_[thread_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_attn_lse_[thread_id][i] -
                                     thread_local_cur_attn_lse_[thread_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_[thread_id]
                                                     [i * config_.head_dim +
                                                      j] +=
                            thread_local_output_fp32_[thread_id]
                                                     [i * config_.head_dim + j];
                    }
                    thread_local_cur_attn_lse_[thread_id][i] = new_attn_lse;
                }
            } else {
                if (cur_batch_idx != -1) {
                    mutex_[cur_batch_idx][cur_head_id]->lock();
                    for (int i = 0; i < n_gqa_; i++) {
                        if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                            1e-6) {
                            attn_lse_[cur_batch_idx][cur_head_id][i] =
                                thread_local_cur_attn_lse_[thread_id][i];
                            for (int j = 0; j < config_.head_dim; j++) {
                                output_fp32_[cur_batch_idx][cur_head_id]
                                            [i * config_.head_dim + j] =
                                                thread_local_cur_output_fp32_
                                                    [thread_id]
                                                    [i * config_.head_dim + j];
                            }
                            continue;
                        }
                        float new_attn_lse =
                            attn_lse_[cur_batch_idx][cur_head_id][i] +
                            std::log(
                                1.0 +
                                std::exp(
                                    thread_local_cur_attn_lse_[thread_id][i] -
                                    attn_lse_[cur_batch_idx][cur_head_id][i]));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            output_fp32_[cur_batch_idx][cur_head_id].data() +
                                i * config_.head_dim,
                            std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                     new_attn_lse));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            thread_local_cur_output_fp32_[thread_id].data() +
                                i * config_.head_dim,
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     new_attn_lse));
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] +=
                                thread_local_cur_output_fp32_
                                    [thread_id][i * config_.head_dim + j];
                        }
                        attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                    }
                    mutex_[cur_batch_idx][cur_head_id]->unlock();
                }
                thread_cur_head_idx_[thread_id].first = batch_id;
                thread_cur_head_idx_[thread_id].second = head_id;
                for (int i = 0; i < n_gqa_; i++) {
                    thread_local_cur_attn_lse_[thread_id][i] =
                        thread_local_attn_lse_[thread_id][i];
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_
                            [thread_id][i * config_.head_dim + j] =
                                thread_local_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                }
            }
        },
        // Merge the results of the remaining blocks.
        [&](int thread_id) {
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (cur_head_id != -1) {
                mutex_[cur_batch_idx][cur_head_id]->lock();
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse;
                    if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                        1e-6) {
                        attn_lse_[cur_batch_idx][cur_head_id][i] =
                            thread_local_cur_attn_lse_[thread_id][i];
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] =
                                            thread_local_cur_output_fp32_
                                                [thread_id]
                                                [i * config_.head_dim + j];
                        }
                        continue;
                    }
                    new_attn_lse =
                        attn_lse_[cur_batch_idx][cur_head_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     attn_lse_[cur_batch_idx][cur_head_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        output_fp32_[cur_batch_idx][cur_head_id].data() +
                            i * config_.head_dim,
                        std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        output_fp32_[cur_batch_idx][cur_head_id]
                                    [i * config_.head_dim + j] +=
                            thread_local_cur_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                    attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                }
                mutex_[cur_batch_idx][cur_head_id]->unlock();
            }
        });

    for (int i = 0; i < batch_size; i++) {
        for (int j = 0; j < max_block_num_after_retrieval_; j++) {
            int block_idx = block_table_after_retrieval_[i][j];
            for (int k = 0; k < config_.q_head_num; k++) {
                attn_sparsity[i * config_.q_head_num + k] +=
                    std::exp(block_lse_[i][block_idx][k] -
                             attn_lse_[i][k / n_gqa_][k % n_gqa_]);
            }
        }
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of calculating sparsity: %f s\n", layer_id_,
    //        diff.count());
}

void KVCache::attn_initialize_kvhead_(int batch_size, int layer_idx,
                                      int *block_table, int &max_block_num,
                                      int *cache_seqlens) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        // initialize output_fp32_ and attn_lse_
        for (int i = 0; i < config_.kv_head_num; i++) {
            for (int j = 0; j < n_gqa_ * config_.head_dim; j++) {
                output_fp32_[batch_idx][i][j] = 0;
            }
            for (int j = 0; j < n_gqa_; j++) {
                attn_lse_[batch_idx][i][j] = 0;
            }
        }

        // clear top_similar_block_
        while (!top_similar_block_[batch_idx].empty())
            top_similar_block_[batch_idx].pop();
    }

    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        cache_seqlens_[batch_idx] = cache_seqlens[batch_idx];
        for (int i = 0; i < max_block_num; i++) {
            for (int j = 0; j < config_.kv_head_num; j++) {
                block_table_before_retrieval_kvhead_[batch_idx][i][j] =
                    block_table[batch_idx * max_block_num + i];
                block_similar_kv_head_[batch_idx][i][j] = 0;
            }
        }
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    // printf("layer %d time of initializing attn: %f s\n", layer_idx,
    //        std::chrono::duration<double>(end - start).count());
}
void KVCache::retrieval_kvcache_kvhead_(const uint16_t *q_in_data,
                                        int init_block_num, int local_block_num,
                                        int pick_block_num, int q_len,
                                        int generate_token_idx, int batch_size,
                                        int layer_idx, int *cache_seqlens,
                                        int &max_block_num, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    max_block_num_after_retrieval_ = 0;
    if (pick_block_num != -1 &&
        (generate_token_idx % config_.token_step != 0 ||
         (layer_idx % config_.layer_step != config_.layer_offset))) {

        if (selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                         config_.layer_step] == 0) {
            max_block_num_after_retrieval_ = max_block_num;
            for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
                for (int i = 0; i < max_block_num; i++) {
                    for (int j = 0; j < config_.kv_head_num; j++) {
                        block_table_after_retrieval_kvhead_[batch_idx][i][j] =
                            block_table_before_retrieval_kvhead_[batch_idx][i]
                                                                [j];
                    }
                }
            }
        } else {

            max_block_num_after_retrieval_ = selected_blocks_num_history_
                [(layer_idx - config_.layer_offset) / config_.layer_step];

            for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
                for (int i = 0; i < max_block_num_after_retrieval_; i++) {
                    for (int j = 0; j < config_.kv_head_num; j++) {
                        block_table_after_retrieval_kvhead_[batch_idx][i][j] =
                            selected_blocks_history_kvhead_
                                [(layer_idx - config_.layer_offset) /
                                 config_.layer_step][batch_idx][i][j];
                    }
                }

                if (cache_seqlens[batch_idx] % config_.block_len == 1) {
                    selected_blocks_num_history_[(layer_idx -
                                                  config_.layer_offset) /
                                                 config_.layer_step] += 1;
                    int x =
                        selected_blocks_num_history_[(layer_idx -
                                                      config_.layer_offset) /
                                                     config_.layer_step];
                    for (int i = 0; i < config_.kv_head_num; i++) {
                        int last_block_idx =
                            block_table_before_retrieval_kvhead_
                                [batch_idx][cache_seqlens[batch_idx] /
                                            config_.block_len][i];
                        selected_blocks_history_kvhead_[(layer_idx -
                                                         config_.layer_offset) /
                                                        config_.layer_step]
                                                       [batch_idx][x - 1][i] =
                                                           last_block_idx;
                        block_table_after_retrieval_kvhead_[batch_idx][x - 1]
                                                           [i] = last_block_idx;
                    }
                }
                cache_seqlens_[batch_idx] = std::min(
                    cache_seqlens_[batch_idx],
                    (cache_seqlens_[batch_idx] % config_.block_len) +
                        (init_block_num + pick_block_num + local_block_num) *
                            config_.block_len);
            }
        }
    } else if (pick_block_num != -1) {
        max_block_num_after_retrieval_ =
            std::min(max_block_num,
                     init_block_num + pick_block_num + local_block_num + 1);
        calculate_block_similarity_kvhead_(q_in_data, batch_size, layer_idx,
                                           q_len, max_block_num, cache_seqlens,
                                           init_block_num, local_block_num,
                                           pick_block_num, backend);
        select_block_kvhead_(batch_size, layer_idx, max_block_num,
                             init_block_num, local_block_num, pick_block_num);
    } else {
        selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                     config_.layer_step] = 0;
        max_block_num_after_retrieval_ = max_block_num;
        for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
            for (int i = 0; i < max_block_num; i++) {
                for (int j = 0; j < config_.kv_head_num; j++) {
                    block_table_after_retrieval_kvhead_[batch_idx][i][j] =
                        block_table_before_retrieval_kvhead_[batch_idx][i][j];
                }
            }
        }
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    // printf("layer %d time of retrieval kvcache: %f s\n", layer_idx,
    //        std::chrono::duration<double>(end - start).count());
}
void KVCache::calculate_sparsity_kvhead_(const uint16_t *q_in_data,
                                         float *attn_sparsity, int batch_size,
                                         int max_block_num, int *block_table,
                                         int *cache_seqlens, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    seq_len_ = config_.block_len;
    backend->do_work_stealing_job(
        batch_size * config_.kv_head_num * max_block_num,
        [&](int thread_id) {
            thread_cur_head_idx_[thread_id].first = -1;
            thread_cur_head_idx_[thread_id].second = -1;
        },
        [&](int task_id) {
            int batch_id = task_id / (config_.kv_head_num * max_block_num);
            int head_id = (task_id % (config_.kv_head_num * max_block_num)) /
                          max_block_num;
            int block_id = task_id % max_block_num;
            int thread_id = Backend::thread_local_id;
            // If the block is out of the sequence length, skip it.
            if (cache_seqlens[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx = block_table[batch_id * max_block_num + block_id];
            if (cache_seqlens_[batch_id] / config_.block_len == block_id) {
                int seq_len = cache_seqlens_[batch_id] % config_.block_len;
                if (seq_len == 0)
                    return;

                // Prepare the attention mask for the last block.
                int full_blocks = seq_len / 8;
                int remaining_bits = seq_len % 8;

                // Fill full blocks with 1s
                for (int i = 0; i < full_blocks; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0xFF;
                }
                // Fill the remaining bits in the next block
                if (remaining_bits > 0 && full_blocks < seq_len_ / 8) {
                    thread_local_attn_mask_[thread_id][full_blocks] =
                        (1 << remaining_bits) - 1;
                } else {
                    thread_local_attn_mask_[thread_id][full_blocks] = 0;
                }

                for (int i = full_blocks + 1; i < seq_len_ / 8; ++i) {
                    thread_local_attn_mask_[thread_id][i] = 0;
                }
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, false,
                        thread_local_attn_mask_[thread_id].data(),
                        GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            } else {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num, GGML_TYPE_F16,
                        (void *)&q_in_data[batch_id * config_.kv_head_num *
                                               n_gqa_ * config_.head_dim +
                                           head_id * n_gqa_ * config_.head_dim],
                        seq_len_, 0, true, nullptr, GGML_TYPE_F16, 0,
                        k_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_F16, 1,
                        v_cache_fp16_[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());

                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q4_0, 0,
                        k_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q4_0, 1,
                        v_cache_q4[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    attn_with_kvcache_one_block_(
                        config_.head_dim,
                        config_.q_head_num / config_.kv_head_num,
                        GGML_TYPE_Q8_0, q_q8_0_[batch_id][head_id].data(),
                        seq_len_, 0, true, nullptr, GGML_TYPE_Q8_0, 0,
                        k_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr, GGML_TYPE_Q8_0, 1,
                        v_cache_q8[layer_id_][head_id][block_idx].data(), 0,
                        nullptr, nullptr,
                        thread_local_attn_score_[thread_id].data(),
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_attn_lse_[thread_id].data(),
                        thread_local_draft_[thread_id].data(), nullptr,
                        cos_.data(), sin_.data());
                    dequantize_row_q8_0(
                        thread_local_output_q8_0_[thread_id].data(),
                        thread_local_output_fp32_[thread_id].data(),
                        n_gqa_ * config_.head_dim);
                }
            }
            for (int i = 0; i < n_gqa_; i++) {
                block_lse_[batch_id][block_idx][head_id * n_gqa_ + i] =
                    thread_local_attn_lse_[thread_id][i];
            }
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (batch_id == cur_batch_idx && head_id == cur_head_id) {
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse =
                        thread_local_cur_attn_lse_[thread_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_attn_lse_[thread_id][i] -
                                     thread_local_cur_attn_lse_[thread_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_[thread_id]
                                                     [i * config_.head_dim +
                                                      j] +=
                            thread_local_output_fp32_[thread_id]
                                                     [i * config_.head_dim + j];
                    }
                    thread_local_cur_attn_lse_[thread_id][i] = new_attn_lse;
                }
            } else {
                if (cur_batch_idx != -1) {
                    mutex_[cur_batch_idx][cur_head_id]->lock();
                    for (int i = 0; i < n_gqa_; i++) {
                        if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                            1e-6) {
                            attn_lse_[cur_batch_idx][cur_head_id][i] =
                                thread_local_cur_attn_lse_[thread_id][i];
                            for (int j = 0; j < config_.head_dim; j++) {
                                output_fp32_[cur_batch_idx][cur_head_id]
                                            [i * config_.head_dim + j] =
                                                thread_local_cur_output_fp32_
                                                    [thread_id]
                                                    [i * config_.head_dim + j];
                            }
                            continue;
                        }
                        float new_attn_lse =
                            attn_lse_[cur_batch_idx][cur_head_id][i] +
                            std::log(
                                1.0 +
                                std::exp(
                                    thread_local_cur_attn_lse_[thread_id][i] -
                                    attn_lse_[cur_batch_idx][cur_head_id][i]));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            output_fp32_[cur_batch_idx][cur_head_id].data() +
                                i * config_.head_dim,
                            std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                     new_attn_lse));
                        ggml_vec_scale_f32(
                            config_.head_dim,
                            thread_local_cur_output_fp32_[thread_id].data() +
                                i * config_.head_dim,
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     new_attn_lse));
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] +=
                                thread_local_cur_output_fp32_
                                    [thread_id][i * config_.head_dim + j];
                        }
                        attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                    }
                    mutex_[cur_batch_idx][cur_head_id]->unlock();
                }
                thread_cur_head_idx_[thread_id].first = batch_id;
                thread_cur_head_idx_[thread_id].second = head_id;
                for (int i = 0; i < n_gqa_; i++) {
                    thread_local_cur_attn_lse_[thread_id][i] =
                        thread_local_attn_lse_[thread_id][i];
                    for (int j = 0; j < config_.head_dim; j++) {
                        thread_local_cur_output_fp32_
                            [thread_id][i * config_.head_dim + j] =
                                thread_local_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                }
            }
        },
        // Merge the results of the remaining blocks.
        [&](int thread_id) {
            int cur_batch_idx = thread_cur_head_idx_[thread_id].first;
            int cur_head_id = thread_cur_head_idx_[thread_id].second;
            if (cur_head_id != -1) {
                mutex_[cur_batch_idx][cur_head_id]->lock();
                for (int i = 0; i < n_gqa_; i++) {
                    float new_attn_lse;
                    if (std::abs(attn_lse_[cur_batch_idx][cur_head_id][i]) <
                        1e-6) {
                        attn_lse_[cur_batch_idx][cur_head_id][i] =
                            thread_local_cur_attn_lse_[thread_id][i];
                        for (int j = 0; j < config_.head_dim; j++) {
                            output_fp32_[cur_batch_idx][cur_head_id]
                                        [i * config_.head_dim + j] =
                                            thread_local_cur_output_fp32_
                                                [thread_id]
                                                [i * config_.head_dim + j];
                        }
                        continue;
                    }
                    new_attn_lse =
                        attn_lse_[cur_batch_idx][cur_head_id][i] +
                        std::log(
                            1.0 +
                            std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                     attn_lse_[cur_batch_idx][cur_head_id][i]));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        output_fp32_[cur_batch_idx][cur_head_id].data() +
                            i * config_.head_dim,
                        std::exp(attn_lse_[cur_batch_idx][cur_head_id][i] -
                                 new_attn_lse));
                    ggml_vec_scale_f32(
                        config_.head_dim,
                        thread_local_cur_output_fp32_[thread_id].data() +
                            i * config_.head_dim,
                        std::exp(thread_local_cur_attn_lse_[thread_id][i] -
                                 new_attn_lse));
                    for (int j = 0; j < config_.head_dim; j++) {
                        output_fp32_[cur_batch_idx][cur_head_id]
                                    [i * config_.head_dim + j] +=
                            thread_local_cur_output_fp32_[thread_id]
                                                         [i * config_.head_dim +
                                                          j];
                    }
                    attn_lse_[cur_batch_idx][cur_head_id][i] = new_attn_lse;
                }
                mutex_[cur_batch_idx][cur_head_id]->unlock();
            }
        });

    for (int i = 0; i < batch_size; i++) {
        for (int j = 0; j < max_block_num_after_retrieval_; j++) {
            for (int k = 0; k < config_.q_head_num; k++) {
                int block_idx =
                    block_table_after_retrieval_kvhead_[i][j][k / n_gqa_];
                attn_sparsity[i * config_.q_head_num + k] +=
                    std::exp(block_lse_[i][block_idx][k] -
                             attn_lse_[i][k / n_gqa_][k % n_gqa_]);
            }
        }
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of calculating sparsity: %f s\n", layer_id_,
    //        diff.count());
}
void KVCache::calculate_block_similarity_kvhead_(
    const uint16_t *q_in_data, int batch_size, int layer_idx, int q_len,
    int max_block_num, int *cache_seqlens, int init_block_num,
    int local_block_num, int pick_block_num, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    backend->do_work_stealing_job(
        batch_size * max_block_num, nullptr,
        [&](int task_id) {
            int batch_id = task_id / max_block_num;
            int block_id = task_id % max_block_num;
            int seq_len = cache_seqlens_[batch_id];

            if (block_id < init_block_num ||
                block_id >= (seq_len / config_.block_len) - local_block_num) {
                return;
            }
            int block_idx =
                block_table_before_retrieval_kvhead_[batch_id][block_id][0];

            for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
                for (int i = 0; i < config_.head_dim; i++) {
                    float q_i = 0, qa_i = std::numeric_limits<float>::lowest();
                    for (int q_id = 0; q_id < q_len; q_id++) {
                        q_i += GGML_FP16_TO_FP32(
                            q_in_data[batch_id * q_len * config_.q_head_num *
                                          config_.head_dim +
                                      q_id * config_.q_head_num *
                                          config_.head_dim +
                                      head_id * config_.head_dim + i]);
                    }
                    q_i /= q_len;
                    for (int anchor_id = 0; anchor_id < config_.anchor_num;
                         anchor_id++) {
                        qa_i = std::max(
                            qa_i,
                            GGML_FP16_TO_FP32(
                                anchor_[layer_idx * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        anchor_id * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + i]) *
                                q_i);
                    }
                    block_similar_kv_head_[batch_id][block_id]
                                          [head_id / n_gqa_] += qa_i;
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of calculating similarity: %f s\n", layer_idx,
    //        diff.count());
}
void KVCache::select_block_kvhead_(int batch_size, int layer_idx,
                                   int max_block_num, int init_block_num,
                                   int local_block_num, int pick_block_num) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        int cache_len_after_retrieval = 0;
        if (cache_seqlens_[batch_idx] / config_.block_len <=
            init_block_num + pick_block_num + local_block_num) {
            selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                         config_.layer_step] = 0;
            for (int i = 0; i < max_block_num; i++) {
                for (int j = 0; j < config_.kv_head_num; j++) {
                    block_table_after_retrieval_kvhead_[batch_idx][i][j] =
                        block_table_before_retrieval_kvhead_[batch_idx][i][j];
                }
            }
            continue;
        }
        for (int head_id = 0; head_id < config_.kv_head_num; head_id++) {

            for (int block_id = init_block_num;
                 block_id < (cache_seqlens_[batch_idx] / config_.block_len) -
                                local_block_num;
                 block_id++) {

                top_similar_block_[batch_idx].push(std::make_pair(
                    block_similar_kv_head_[batch_idx][block_id][head_id],
                    block_table_before_retrieval_kvhead_[batch_idx][block_id]
                                                        [head_id]));
                if (top_similar_block_[batch_idx].size() > pick_block_num) {
                    top_similar_block_[batch_idx].pop();
                }
            }

            int i = 0;
            for (; i < init_block_num; i++) {
                block_table_after_retrieval_kvhead_[batch_idx][i][head_id] =
                    block_table_before_retrieval_kvhead_[batch_idx][i][head_id];
            }
            while (!top_similar_block_[batch_idx].empty()) {
                block_table_after_retrieval_kvhead_[batch_idx][i][head_id] =
                    top_similar_block_[batch_idx].top().second;
                top_similar_block_[batch_idx].pop();
                i++;
            }
            for (; i < init_block_num + pick_block_num + local_block_num; i++) {
                block_table_after_retrieval_kvhead_[batch_idx][i][head_id] =
                    block_table_before_retrieval_kvhead_
                        [batch_idx]
                        [(cache_seqlens_[batch_idx] / config_.block_len) -
                         local_block_num + i - init_block_num - pick_block_num]
                        [head_id];
            }
            if (cache_seqlens_[batch_idx] % config_.block_len != 0) {
                block_table_after_retrieval_kvhead_[batch_idx][i][head_id] =
                    block_table_before_retrieval_kvhead_[batch_idx][(
                        cache_seqlens_[batch_idx] / config_.block_len)]
                                                        [head_id];
                cache_len_after_retrieval =
                    (cache_seqlens_[batch_idx] % config_.block_len) +
                    i * config_.block_len;
                i++;
            } else {
                cache_len_after_retrieval =
                    (cache_seqlens_[batch_idx] % config_.block_len) +
                    i * config_.block_len;
            }
            for (int j = 0; j < i; j++) {
                selected_blocks_history_kvhead_
                    [(layer_idx - config_.layer_offset) / config_.layer_step]
                    [batch_idx][j][head_id] =
                        block_table_after_retrieval_kvhead_[batch_idx][j]
                                                           [head_id];
            }
        }
        cache_seqlens_[batch_idx] = cache_len_after_retrieval;
        selected_blocks_num_history_[(layer_idx - config_.layer_offset) /
                                     config_.layer_step] =
            (cache_len_after_retrieval + config_.block_len - 1) /
            config_.block_len;
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    // printf("layer %d time of selecting block: %f s\n", layer_idx,
    //        diff.count())
}

void KVCache::get_attn_sparsity(const ggml_fp16_t *q_in, float *attn_sparsity,
                                int layer_idx, int generate_token_idx,
                                int q_len, int batch_size, int max_block_num,
                                int *block_table, int *cache_seqlens,
                                int *block_table_origin,
                                int *cache_seqlens_origin,
                                int max_block_num_origin, int topk, int local,
                                Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    layer_id_ = layer_idx;
    int thread_num = backend->get_thread_num();
    batch_size = 1;

    const uint16_t *q_in_data = const_cast<const uint16_t *>(q_in);

    quantize_q_(q_in_data, batch_size);
    if (config_.retrieval_type == RetrievalType::LAYER) {
        attn_initialize_layer_(batch_size, layer_idx, block_table,
                               max_block_num, cache_seqlens);
        retrieval_kvcache_layer_(q_in_data, 1, local, topk, q_len,
                                 generate_token_idx, batch_size, layer_idx,
                                 cache_seqlens, max_block_num, backend);
        calculate_sparsity_layer_(q_in_data, attn_sparsity, batch_size,
                                  max_block_num_origin, block_table_origin,
                                  cache_seqlens_origin, backend);
    } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
        attn_initialize_kvhead_(batch_size, layer_idx, block_table,
                                max_block_num, cache_seqlens);
        retrieval_kvcache_kvhead_(q_in_data, 1, local, topk, q_len,
                                  generate_token_idx, batch_size, layer_idx,
                                  cache_seqlens, max_block_num, backend);
        calculate_sparsity_kvhead_(q_in_data, attn_sparsity, batch_size,
                                   max_block_num_origin, block_table_origin,
                                   cache_seqlens_origin, backend);
    }
}

void KVCache::attn_with_kvcache_one_block_(
    int head_dim, int bsz,
    ggml_type q_type, // GGML data type of `Q`, only supports fp16 and q8_0
    // [bsz, head_dim]
    // Quantization is always on the head_dim dimension (per_token). If
    // head_dim % 32 != 0, an error will be raised. The size must be bsz *
    // head_dim/32 * qtype_size.
    const void *q,

    int past_kv_len, int past_kv_offset,
    bool is_full_attn, // true indicates a full 1 mask
    // If is_full_attn = false, a bit matrix representing the mask is
    // passed. [bsz, past_kv_len]
    const uint8_t *attn_mask,

    ggml_type k_type, // GGML data type of `K Cache`, only supports fp16,
                      // q4_0, q8_0
    int k_quant_type, // 0 for per_token, 1 for per_channel, others raise an
                      // error
    // [seq_len, head_dim]
    // If quant_type == 0, head_dim % 32 must be 0.
    // If quant_type == 1, seq_len % 32 must be 0.
    const void *k_cache,

    // k_anchor_type must be fp16
    int num_k_anchor, // num_k_anchor == 0 indicates no anchor
    // [num_k_anchor, head_dim]
    const void *k_cache_anchors,
    // Each token is associated with the nearest previous position's anchor,
    // with the same distance.
    const int *k_cache_anchor_pos,

    // v_cache similar to k_cache
    ggml_type v_type, int v_quant_type,
    // [head_dim, seq_len]
    const void *v_cache, int num_v_anchor, const void *v_cache_anchors,
    const int *v_cache_anchor_pos,

    // Pre-allocated buffer for intermediate calculations [bsz,
    // past_kv_len]. No malloc is performed inside this function.
    float *attn_score,

    // Output: [bsz, head_dim], with the same type as q_type
    void *output,
    // [bsz]
    float *lse,

    // Pre-allocated temporary buffer with sufficient size:
    // (2 * bsz * past_kv_len + 6 * bsz * head_dim + 2 * past_kv_len *
    // head_dim + past_kv_len * head_dim / 32) bytes.
    void *draft,

    // Apply rotary embedding online
    const int *rotary_angle, const void *rotary_cos, const void *rotary_sin
    // rotary_cos=None,
    // rotary_sin=None,
    // cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
    // cache_batch_idx: Optional[torch.Tensor] = None,
    // rotary_interleaved=True,

    // // Not supported for now
    // window_size=(-1, -1),  # -1 means infinite context window
    // alibi_slopes=None,
) {
    assert(head_dim % 32 == 0);
    assert(k_quant_type == 0);
    assert(v_quant_type == 1);
    assert(q_type == GGML_TYPE_F16 || q_type == GGML_TYPE_Q8_0);
    if (q_type == GGML_TYPE_F16) {
        assert(k_type == GGML_TYPE_F16);
        assert(v_type == GGML_TYPE_F16);

        // attn = q * k + q * k_anchor
        // TODO: anchor
        assert(num_k_anchor == 0);

        if (rotary_angle != nullptr) {
            ggml_fp16_t *k_cache_with_rope_fp16 =
                (reinterpret_cast<ggml_fp16_t *>(draft) +
                 sizeof(block_q8_0) * bsz * past_kv_len / QK8_0 +
                 sizeof(float) * bsz * head_dim);
            // dequant k_cache and apply rope
            // k_rope(i) = k(i) * cos(i) - k(i+l) * sin(i)
            // k_rope(i+l) = k(i+l) * cos(i+l) + k(i) * sin(i)

            // k(i)cos(i) -> k_rope(i)
            // k(i)sin(i+l) -> k_rope(i+l)

            // k(i)cos(i) -> k_rope(i)
            // -k(i)sin(i-l) -> k_rope(i-l)

            std::vector<float> block_fp32(32);
            for (int k = 0; k < past_kv_len; k++) {
                int angle = rotary_angle[k];
                for (int l = 0; l < head_dim / 32; l++) {
                    for (int m = 0; m < 32; m++) {
                        float x = GGML_FP16_TO_FP32((
                            (ggml_fp16_t *)k_cache)[k * head_dim + l * 32 + m]);
                        float sin_val = GGML_FP16_TO_FP32(
                            ((ggml_fp16_t *)
                                 rotary_sin)[angle * head_dim + l * 32 + m]);
                        float cos_val = GGML_FP16_TO_FP32(
                            ((ggml_fp16_t *)
                                 rotary_cos)[angle * head_dim + l * 32 + m]);

                        if (l * 32 + m < head_dim / 2) {
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m] =
                                GGML_FP32_TO_FP16(x * cos_val);
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m +
                                                   head_dim / 2] =
                                GGML_FP32_TO_FP16(-x * sin_val);
                        } else {
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m] =
                                GGML_FP32_TO_FP16(
                                    GGML_FP16_TO_FP32(
                                        k_cache_with_rope_fp16[k * head_dim +
                                                               l * 32 + m]) +
                                    x * sin_val);
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m -
                                                   head_dim / 2] =
                                GGML_FP32_TO_FP16(
                                    GGML_FP16_TO_FP32(
                                        k_cache_with_rope_fp16[k * head_dim +
                                                               l * 32 + m -
                                                               head_dim / 2]) -
                                    x * cos_val);
                        }
                    }
                }
            }

            llamafile_sgemm(past_kv_len, bsz, head_dim,
                            (ggml_fp16_t *)k_cache_with_rope_fp16, head_dim,
                            (ggml_fp16_t *)q, head_dim, attn_score, past_kv_len,
                            0, 1, GGML_TASK_TYPE_COMPUTE, k_type, GGML_TYPE_F16,
                            GGML_TYPE_F32, GGML_PREC_DEFAULT);
        } else {
            bool ok = llamafile_sgemm(
                past_kv_len, bsz, head_dim, (ggml_fp16_t *)k_cache, head_dim,
                (ggml_fp16_t *)q, head_dim, attn_score, past_kv_len, 0, 1,
                GGML_TASK_TYPE_COMPUTE, k_type, GGML_TYPE_F16, GGML_TYPE_F32,
                GGML_PREC_DEFAULT);

            if (!ok) {
                printf("llamafile_sgemm failed\n");
            }
        }
        // attn = attn * scale
        float scale_factor = 1.0 / std::sqrt(float(head_dim));
        ggml_vec_scale_f32(bsz * past_kv_len, attn_score, scale_factor);

        // attn = attn & mask
        if (!is_full_attn) {
            for (int i = 0; i < bsz; i++) {
                for (int j = 0; j < past_kv_len; j++) {
                    int index = i * past_kv_len + j;
                    if (!(attn_mask[j / 8] & (1 << (j % 8)))) {
                        attn_score[index] =
                            std::numeric_limits<float>::lowest();
                    }
                }
            }
        }

        // attn = softmax(attn)
        for (int i = 0; i < bsz; i++) {
            float sum_exp = 0;
            for (int j = 0; j < past_kv_len; j++) {
                attn_score[i * past_kv_len + j] =
                    std::exp(attn_score[i * past_kv_len + j]);
                sum_exp += attn_score[i * past_kv_len + j];
            }
            for (int j = 0; j < past_kv_len; j++) {
                attn_score[i * past_kv_len + j] /= sum_exp;
            }
            if (lse != nullptr) {
                lse[i] = std::log(sum_exp);
            }
        }

        // output = attn * v + attn * v_anchor
        // std::vector<float> sum(bsz * head_dim);
        float *sum = reinterpret_cast<float *>(reinterpret_cast<char *>(draft) +
                                               sizeof(block_q8_0) * bsz *
                                                   past_kv_len / QK8_0);

        // float* attn_score_fp16(bsz, past_kv_len)
        ggml_fp16_t *attn_score_fp16 = (reinterpret_cast<ggml_fp16_t *>(
            reinterpret_cast<char *>(draft) +
            sizeof(block_q8_0) * bsz * past_kv_len / QK8_0 +
            sizeof(float) * bsz * head_dim));

        for (int i = 0; i < bsz * past_kv_len; i++) {
            attn_score_fp16[i] = GGML_FP32_TO_FP16(attn_score[i]);
        }

        // TODO: anchor
        assert(num_v_anchor == 0);
        bool ok = llamafile_sgemm(
            head_dim, bsz, past_kv_len, (ggml_fp16_t *)v_cache, past_kv_len,
            (ggml_fp16_t *)attn_score_fp16, past_kv_len, sum, head_dim, 0, 1,
            GGML_TASK_TYPE_COMPUTE, v_type, GGML_TYPE_F16, GGML_TYPE_F32,
            GGML_PREC_DEFAULT);
        if (!ok) {
            printf("llamafile_sgemm failed\n");
        }

        // copy to output
        for (int i = 0; i < bsz; i++) {
            for (int j = 0; j < head_dim; j++) {
                ((float *)output)[i * head_dim + j] = sum[i * head_dim + j];
            }
        }
    } else {
        assert(k_type == GGML_TYPE_Q4_0 || k_type == GGML_TYPE_Q8_0);
        assert(v_type == GGML_TYPE_Q4_0 || v_type == GGML_TYPE_Q8_0);

        // attn = q * k + q * k_anchor
        // TODO: anchor
        assert(num_k_anchor == 0);

        if (rotary_angle != nullptr) {
            ggml_fp16_t *k_cache_with_rope_fp16 =
                (reinterpret_cast<ggml_fp16_t *>(draft) +
                 sizeof(block_q8_0) * bsz * past_kv_len / QK8_0 +
                 sizeof(float) * bsz * head_dim);
            block_q4_0 *k_cache_with_rope_q4 =
                (reinterpret_cast<block_q4_0 *>(draft) +
                 sizeof(block_q8_0) * bsz * past_kv_len / QK8_0 +
                 sizeof(float) * bsz * head_dim) +
                sizeof(ggml_fp16_t) * bsz * head_dim;
            // dequant k_cache and apply rope
            // k_rope(i) = k(i) * cos(i) - k(i+l) * sin(i)
            // k_rope(i+l) = k(i+l) * cos(i+l) + k(i) * sin(i)

            // k(i)cos(i) -> k_rope(i)
            // k(i)sin(i+l) -> k_rope(i+l)

            // k(i)cos(i) -> k_rope(i)
            // -k(i)sin(i-l) -> k_rope(i-l)

            std::vector<float> block_fp32(32);
            for (int k = 0; k < past_kv_len; k++) {
                int angle = rotary_angle[k];
                for (int l = 0; l < head_dim / 32; l++) {
                    block_q4_0 block =
                        ((block_q4_0 *)k_cache)[k * head_dim / 32 + l];
                    dequantize_row_q4_0(&block, block_fp32.data(), 32);
                    for (int m = 0; m < 32; m++) {
                        float sin_val = GGML_FP16_TO_FP32(
                            ((ggml_fp16_t *)
                                 rotary_sin)[angle * head_dim + l * 32 + m]);
                        float cos_val = GGML_FP16_TO_FP32(
                            ((ggml_fp16_t *)
                                 rotary_cos)[angle * head_dim + l * 32 + m]);

                        if (l * 32 + m < head_dim / 2) {
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m] =
                                GGML_FP32_TO_FP16(block_fp32[m] * cos_val);
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m +
                                                   head_dim / 2] =
                                GGML_FP32_TO_FP16(-block_fp32[m] * sin_val);
                        } else {
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m] +=
                                GGML_FP32_TO_FP16(block_fp32[m] * sin_val);
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m -
                                                   head_dim / 2] -=
                                GGML_FP32_TO_FP16(block_fp32[m] * cos_val);
                        }
                    }
                }
            }
            // quantize k_cache_with_rope_fp16
            for (int k = 0; k < past_kv_len; k++) {
                for (int l = 0; l < head_dim / 32; l++) {
                    for (int m = 0; m < 32; m++) {
                        block_fp32[m] = GGML_FP16_TO_FP32(
                            k_cache_with_rope_fp16[k * head_dim + l * 32 + m]);
                    }
                    quantize_row_q4_0(
                        block_fp32.data(),
                        &k_cache_with_rope_q4[k * head_dim / 32 + l], 32);
                }
            }

            llamafile_sgemm(past_kv_len, bsz, head_dim / 32,
                            (block_q4_0 *)k_cache_with_rope_q4, head_dim / 32,
                            (block_q8_0 *)q, head_dim / 32, attn_score,
                            past_kv_len, 0, 1, GGML_TASK_TYPE_COMPUTE, k_type,
                            GGML_TYPE_Q8_0, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        } else {
            llamafile_sgemm(past_kv_len, bsz, head_dim / 32,
                            (block_q4_0 *)k_cache, head_dim / 32,
                            (block_q8_0 *)q, head_dim / 32, attn_score,
                            past_kv_len, 0, 1, GGML_TASK_TYPE_COMPUTE, k_type,
                            GGML_TYPE_Q8_0, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        }

        // attn = attn * scale
        float scale_factor = 1.0 / std::sqrt(float(head_dim));
        ggml_vec_scale_f32(bsz * past_kv_len, attn_score, scale_factor);

        // attn = attn & mask
        if (!is_full_attn) {
            for (int i = 0; i < bsz; i++) {
                for (int j = 0; j < past_kv_len; j++) {
                    int index = i * past_kv_len + j;
                    if (!(attn_mask[j / 8] & (1 << (j % 8)))) {
                        attn_score[index] =
                            std::numeric_limits<float>::lowest();
                    }
                }
            }
        }

        // attn = softmax(attn)
        for (int i = 0; i < bsz; i++) {
            float sum_exp = 0;
            for (int j = 0; j < past_kv_len; j++) {
                attn_score[i * past_kv_len + j] =
                    std::exp(attn_score[i * past_kv_len + j]);
                sum_exp += attn_score[i * past_kv_len + j];
            }
            for (int j = 0; j < past_kv_len; j++) {
                attn_score[i * past_kv_len + j] /= sum_exp;
            }
            if (lse != nullptr) {
                lse[i] = std::log(sum_exp);
            }
        }

        // output = attn * v + attn * v_anchor
        // std::vector<block_q8_0> attn_q8_0(bsz * past_kv_len / QK8_0);
        block_q8_0 *attn_q8_0 = reinterpret_cast<block_q8_0 *>(draft);
        quantize_row_q8_0(attn_score, attn_q8_0, bsz * past_kv_len);
        // std::vector<float> sum(bsz * head_dim);
        float *sum = reinterpret_cast<float *>(reinterpret_cast<char *>(draft) +
                                               sizeof(block_q8_0) * bsz *
                                                   past_kv_len / QK8_0);
        // TODO: anchor
        assert(num_v_anchor == 0);
        llamafile_sgemm(head_dim, bsz, past_kv_len / 32, (block_q4_0 *)v_cache,
                        past_kv_len / 32, attn_q8_0, past_kv_len / 32, sum,
                        head_dim, 0, 1, GGML_TASK_TYPE_COMPUTE, v_type,
                        GGML_TYPE_Q8_0, GGML_TYPE_F32, GGML_PREC_DEFAULT);

        quantize_row_q8_0(sum, (block_q8_0 *)output, bsz * head_dim);
    }
}


================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/kvcache/kvcache_load_dump.cpp
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "kvcache.h"

#include <chrono>

void KVCache::load_kvcache(std::string tensor_file_path, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    std::ifstream ifs_tensor(tensor_file_path, std::ios::binary);
    if (!ifs_tensor) {
        throw std::runtime_error("Failed to open tensor file");
    }
    ifs_tensor.read(reinterpret_cast<char *>(&cache_total_len_),
                    sizeof(cache_total_len_));
    int past_block_num =
        (cache_total_len_ + config_.block_len - 1) / config_.block_len;
    printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len_,
           past_block_num);
    for (int i = 0; i < config_.layer_num; ++i) {
        past_block_num_[i] = past_block_num;
    }
    ifs_tensor.read(reinterpret_cast<char *>(anchor_.data()),
                    anchor_.size() * sizeof(ggml_fp16_t));
    for (int i = 0; i < config_.layer_num; ++i) {
        for (int j = 0; j < config_.kv_head_num; ++j) {
            for (int k = 0; k < past_block_num_[i]; ++k) {
                if (config_.kv_type == GGML_TYPE_F16) {
                    ifs_tensor.read(
                        reinterpret_cast<char *>(k_cache_fp16_[i][j][k].data()),
                        k_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
                    ifs_tensor.read(
                        reinterpret_cast<char *>(v_cache_fp16_[i][j][k].data()),
                        v_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
                } else if (config_.kv_type == GGML_TYPE_Q4_0) {
                    ifs_tensor.read(
                        reinterpret_cast<char *>(k_cache_q4[i][j][k].data()),
                        k_cache_q4[i][j][k].size() * sizeof(block_q4_0));
                    ifs_tensor.read(
                        reinterpret_cast<char *>(v_cache_q4[i][j][k].data()),
                        v_cache_q4[i][j][k].size() * sizeof(block_q4_0));
                }
            }
        }
        for (int k = 0; k < past_block_num_[i]; ++k) {
            for (int l = 0; l < config_.block_len; l++) {
                ifs_tensor.read(
                    reinterpret_cast<char *>(importance_[i][k][l].data()),
                    importance_[i][k][l].size() * sizeof(ggml_fp16_t));
            }
        }
    }
    ifs_tensor.close();
    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    printf("time of load: %f s\n", diff.count());
}
void KVCache::dump_kvcache(int *block_table, int cache_total_len,
                           std::string tensor_file_path, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
    std::ofstream ofs(tensor_file_path, std::ios::binary);
    printf("dump_kvcache: %s\n", tensor_file_path.c_str());
    if (!ofs.is_open()) {
        std::cerr << "Cannot open file " << tensor_file_path << std::endl;
        return;
    }
    ofs.write(reinterpret_cast<const char *>(&cache_total_len),
              sizeof(cache_total_len));
    int past_block_num =
        (cache_total_len + config_.block_len - 1) / config_.block_len;
    printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len,
           past_block_num);
    ofs.write(reinterpret_cast<const char *>(anchor_.data()),
              anchor_.size() * sizeof(ggml_fp16_t));
    for (int i = 0; i < config_.layer_num; ++i) {
        for (int j = 0; j < config_.kv_head_num; ++j) {
            for (int k = 0; k < past_block_num; ++k) {
                int block_idx = block_table[k];
                if (config_.kv_type == GGML_TYPE_F16) {
                    ofs.write(reinterpret_cast<const char *>(
                                  k_cache_fp16_[i][j][block_idx].data()),
                              k_cache_fp16_[i][j][block_idx].size() *
                                  sizeof(ggml_fp16_t));
                    ofs.write(reinterpret_cast<const char *>(
                                  v_cache_fp16_[i][j][block_idx].data()),
                              v_cache_fp16_[i][j][block_idx].size() *
                                  sizeof(ggml_fp16_t));

                } else if (config_.kv_type == GGML_TYPE_Q4_0) {
                    ofs.write(reinterpret_cast<const char *>(
                                  k_cache_q4[i][j][block_idx].data()),
                              k_cache_q4[i][j][block_idx].size() *
                                  sizeof(block_q4_0));
                    ofs.write(reinterpret_cast<const char *>(
                                  v_cache_q4[i][j][block_idx].data()),
                              v_cache_q4[i][j][block_idx].size() *
                                  sizeof(block_q4_0));
                }
            }
        }
        for (int k = 0; k < past_block_num; ++k) {
            int block_idx = block_table[k];
            for (int l = 0; l < config_.block_len; l++) {
                ofs.write(reinterpret_cast<const char *>(
                              importance_[i][block_idx][l].data()),
                          importance_[i][block_idx][l].size() *
                              sizeof(ggml_fp16_t));
            }
        }
    }
    ofs.close();
    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    printf("time of dump: %f s\n", diff.count());
}

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/kvcache/kvcache_read_write.cpp
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "kvcache.h"

#include <chrono>

void KVCache::get_anchor_one_block(ggml_fp16_t *anchor, int layer_id,
                                   int block_idx, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    block_idx = block_idx;
    seq_len_ = config_.block_len;
    anchor_data_ = const_cast<uint16_t *>(anchor);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("layer %d block %d time of reading anchor: %f s\n", layer_id,
           block_idx, duration.count());
}

void KVCache::update_anchor_one_block(const ggml_fp16_t *anchor, int layer_id,
                                      int block_idx, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    block_idx = block_idx;
    seq_len_ = config_.block_len;
    anchor_data_ = const_cast<uint16_t *>(anchor);

    // Each task updates the anchor of a certain position
    // backend->do_work_stealing_job(config_.anchor_num, [&](int task_id) {
    //     int k = task_id % config_.anchor_num;
    //     int head_id = task_id / config_.anchor_num;
    //     memcpy(anchor_[layer_id_][head_id][block_idx].data() +
    //                k * config_.head_dim,
    //            anchor_data_ + k * config_.head_dim,
    //            sizeof(uint16_t) * config_.head_dim);
    // });

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("layer %d block %d time of writting anchor: %f s\n", layer_id,
           block_idx, duration.count());
}

void KVCache::update_importance_one_block(const ggml_fp16_t *importance,
                                          int layer_id, int block_idx,
                                          Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    block_idx = block_idx;
    seq_len_ = config_.block_len;
    importance_data_ = const_cast<uint16_t *>(importance);

    // Each task updates the importance of a certain position
    backend->do_work_stealing_job(
        config_.block_len, nullptr,
        [&](int task_id) {
            int k = task_id;
            memcpy(importance_[layer_id_][block_idx].data() + k,
                   importance_data_ + k, sizeof(uint16_t));
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("layer %d block %d time of writting importance: %f s\n", layer_id,
           block_idx, duration.count());
}

void KVCache::get_importance_one_block(ggml_fp16_t *importance, int layer_id,
                                       int block_idx, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    block_idx = block_idx;
    seq_len_ = config_.block_len;
    importance_data_ = const_cast<uint16_t *>(importance);

    // Each task updates the importance of a certain position
    backend->do_work_stealing_job(
        config_.block_len, nullptr,
        [&](int task_id) {
            int k = task_id;
            memcpy(importance_data_ + k,
                   importance_[layer_id_][block_idx].data() + k,
                   sizeof(uint16_t));
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("layer %d block %d time of reading importance: %f s\n", layer_id,
           block_idx, duration.count());
}

void KVCache::update_kvcache_one_block_fp16(const ggml_fp16_t *k_in,
                                            const ggml_fp16_t *v_in,
                                            int layer_id, int block_idx,
                                            Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    block_idx = block_idx;
    seq_len_ = config_.block_len;
    k_data_ = const_cast<uint16_t *>(k_in);
    v_data_ = const_cast<uint16_t *>(v_in);

    int new_block_num = std::max((int)past_block_num_[layer_id], block_idx + 1);

    importance_[layer_id_].resize(new_block_num);

    for (int i = 0; i < config_.kv_head_num; i++) {
        k_cache_q4[layer_id][i].resize(new_block_num);
        v_cache_q4[layer_id][i].resize(new_block_num);
        // anchor_[layer_id][i].resize(new_block_num);
    }

    for (int i = 0; i < new_block_num; i++) {
        importance_[layer_id][i].resize(config_.block_len);
    }

    // Each task updates the k cache or v cache of a certain header
    backend->do_work_stealing_job(
        config_.kv_head_num * 2, nullptr,
        [&](int task_id) {
            std::vector<float> block_fp32(32);
            int head_id = task_id / 2;
            if (task_id & 1) {
                // fill k_cache_
                k_cache_q4[layer_id_][head_id][block_idx].resize(
                    config_.block_len * config_.head_dim / 32);
                for (int k = 0; k < config_.block_len; k++) {
                    for (int l = 0; l < config_.head_dim / 32; l++) {
                        block_q4_0 block;
                        for (int m = 0; m < 32; m++) {

                            block_fp32[m] = GGML_FP16_TO_FP32(
                                k_data_[((0 * config_.kv_head_num + head_id) *
                                             seq_len_ +
                                         0 * config_.block_len + k) *
                                            config_.head_dim +
                                        l * 32 + m]);
                        }
                        quantize_row_q4_0(block_fp32.data(), &block, 32);
                        k_cache_q4[layer_id_][head_id][block_idx]
                                  [k * config_.head_dim / 32 + l] = block;
                    }
                }
            } else {
                // fill v_cache_
                v_cache_q4[layer_id_][head_id][block_idx].resize(
                    config_.head_dim * config_.block_len / 32);
                for (int k = 0; k < config_.block_len / 32; k++) {
                    for (int l = 0; l < config_.head_dim; l++) {
                        block_q4_0 block;
                        for (int m = 0; m < 32; m++) {

                            block_fp32[m] = GGML_FP16_TO_FP32(
                                v_data_[((0 * config_.kv_head_num + head_id) *
                                             seq_len_ +
                                         0 * config_.block_len + k * 32 + m) *
                                            config_.head_dim +
                                        l]);
                        }
                        quantize_row_q4_0(block_fp32.data(), &block, 32);
                        v_cache_q4[layer_id_][head_id][block_idx]
                                  [l * config_.block_len / 32 + k] = block;
                    }
                }
            }
        },
        nullptr);
    past_block_num_[layer_id] = new_block_num;

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("layer %d block %d time of writting KV Cache: %f s\n", layer_id,
           block_idx, duration.count());
    // printf("get_one_block_fp16 duration: %ld\n", duration);
}

void KVCache::get_kvcache_one_block_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
                                         int layer_id, int block_idx,
                                         Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    seq_len_ = config_.block_len;
    k_data_ = reinterpret_cast<uint16_t *>(k_in);
    v_data_ = reinterpret_cast<uint16_t *>(v_in);

    // printf("layer_id: %d, block_idx: %d\n", layer_id, block_idx);
    // Each task gets the k cache or v cache of a certain header
    backend->do_work_stealing_job(
        config_.kv_head_num * 2, nullptr,
        [&](int task_id) {
            std::vector<float> block_fp32(32);
            int head_id = task_id / 2;
            if (task_id & 1) {
                // get k_cache_
                for (int k = 0; k < config_.block_len; k++) {
                    for (int l = 0; l < config_.head_dim / 32; l++) {
                        block_q4_0 block =
                            k_cache_q4[layer_id_][head_id][block_idx]
                                      [k * config_.head_dim / 32 + l];
                        dequantize_row_q4_0(&block, block_fp32.data(), 32);
                        for (int m = 0; m < 32; m++) {

                            k_data_[((0 * config_.kv_head_num + head_id) *
                                         seq_len_ +
                                     0 * config_.block_len + k) *
                                        config_.head_dim +
                                    l * 32 + m] =
                                GGML_FP32_TO_FP16(block_fp32[m]);
                        }
                    }
                }
            } else {
                // get v_cache_
                for (int k = 0; k < config_.block_len / 32; k++) {
                    for (int l = 0; l < config_.head_dim; l++) {
                        block_q4_0 block =
                            v_cache_q4[layer_id_][head_id][block_idx]
                                      [l * config_.block_len / 32 + k];
                        dequantize_row_q4_0(&block, block_fp32.data(), 32);
                        for (int m = 0; m < 32; m++) {

                            v_data_[((0 * config_.kv_head_num + head_id) *
                                         seq_len_ +
                                     0 * config_.block_len + k * 32 + m) *
                                        config_.head_dim +
                                    l] = GGML_FP32_TO_FP16(block_fp32[m]);
                        }
                    }
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("layer %d block %d time of reading KV Cache: %f s\n", layer_id,
           block_idx, duration.count());
    // printf("get_one_block_fp16 duration: %ld\n", duration);
}

// k_in: (batch_size, seq_len, head_num, head_dim)
// v_in: (batch_size, seq_len, head_num, head_dim)
void KVCache::get_and_update_kvcache_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
                                          int layer_id, int *block_table,
                                          int batch_size, int max_block_num,
                                          int *cache_seqlens, int q_len,
                                          Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    k_data_ = const_cast<uint16_t *>(k_in);
    v_data_ = const_cast<uint16_t *>(v_in);

    // Each task updates the k cache and v cache of a certain header
    backend->do_work_stealing_job(
        config_.kv_head_num * max_block_num * batch_size, nullptr,
        [&](int task_id) {
            // printf("block_idx: %d, task_id: %d\n", block_idx, task_id);
            std::vector<float> block_fp32(32);
            int batch_id = task_id / (config_.kv_head_num * max_block_num);
            int block_id = (task_id / config_.kv_head_num) % max_block_num;
            int head_id = task_id % config_.kv_head_num;
            int block_idx = block_table[batch_id * max_block_num + block_id];
            int seq_len = cache_seqlens[batch_id];
            int block_l = block_id * config_.block_len;
            int block_r = block_id * config_.block_len + config_.block_len;

            if (block_l < seq_len) {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >= seq_len)
                            break;
                        for (int l = 0; l < config_.head_dim; l++) {
                            k_data_
                                [batch_id *
                                     (max_block_num * config_.block_len *
                                      config_.kv_head_num * config_.head_dim) +
                                 block_id *
                                     (config_.block_len * config_.kv_head_num *
                                      config_.head_dim) +
                                 k * (config_.kv_head_num * config_.head_dim) +
                                 head_id * config_.head_dim + l] =
                                    k_cache_fp16_[layer_id_][head_id][block_idx]
                                                 [k * config_.head_dim + l];
                            v_data_
                                [batch_id *
                                     (max_block_num * config_.block_len *
                                      config_.kv_head_num * config_.head_dim) +
                                 block_id *
                                     (config_.block_len * config_.kv_head_num *
                                      config_.head_dim) +
                                 k * (config_.kv_head_num * config_.head_dim) +
                                 head_id * config_.head_dim + l] =
                                    v_cache_fp16_[layer_id_][head_id][block_idx]
                                                 [l * config_.block_len + k];
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    // get k_cache_
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >= seq_len)
                            break;
                        for (int l = 0; l < config_.head_dim / 32; l++) {
                            block_q4_0 block =
                                k_cache_q4[layer_id_][head_id][block_idx]
                                          [k * config_.head_dim / 32 + l];
                            dequantize_row_q4_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                k_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        k * (config_.kv_head_num *
                                             config_.head_dim) +
                                        head_id * config_.head_dim + l * 32 +
                                        m] = GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                    // get v_cache_
                    for (int k = 0; k < config_.block_len / 32; k++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            block_q4_0 block =
                                v_cache_q4[layer_id_][head_id][block_idx]
                                          [l * config_.block_len / 32 + k];
                            dequantize_row_q4_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                if (block_id * config_.block_len + k * 32 + m >=
                                    seq_len)
                                    break;
                                v_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        (k * 32 + m) * config_.kv_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    // get k_cache_
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >= seq_len)
                            break;
                        for (int l = 0; l < config_.head_dim / 32; l++) {
                            block_q8_0 block =
                                k_cache_q8[layer_id_][head_id][block_idx]
                                          [k * config_.head_dim / 32 + l];
                            dequantize_row_q8_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                k_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        k * (config_.kv_head_num *
                                             config_.head_dim) +
                                        head_id * config_.head_dim + l * 32 +
                                        m] = GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                    // get v_cache_
                    for (int k = 0; k < config_.block_len / 32; k++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            block_q8_0 block =
                                v_cache_q8[layer_id_][head_id][block_idx]
                                          [l * config_.block_len / 32 + k];
                            dequantize_row_q8_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                if (block_id * config_.block_len + k * 32 + m >=
                                    seq_len)
                                    break;
                                v_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        (k * 32 + m) * config_.kv_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                }
            }
            if (block_r > seq_len && block_l < seq_len + q_len) {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >=
                                seq_len + q_len ||
                            block_id * config_.block_len + k < seq_len)
                            continue;
                        for (int l = 0; l < config_.head_dim; l++) {
                            k_cache_fp16_[layer_id_][head_id][block_idx]
                                         [k * config_.head_dim + l] = k_data_
                                             [batch_id * (max_block_num *
                                                          config_.block_len *
                                                          config_.kv_head_num *
                                                          config_.head_dim) +
                                              block_id * (config_.block_len *
                                                          config_.kv_head_num *
                                                          config_.head_dim) +
                                              k * (config_.kv_head_num *
                                                   config_.head_dim) +
                                              head_id * config_.head_dim + l];
                            v_cache_fp16_[layer_id_][head_id][block_idx]
                                         [l * config_.block_len + k] = v_data_
                                             [batch_id * (max_block_num *
                                                          config_.block_len *
                                                          config_.kv_head_num *
                                                          config_.head_dim) +
                                              block_id * (config_.block_len *
                                                          config_.kv_head_num *
                                                          config_.head_dim) +
                                              k * (config_.kv_head_num *
                                                   config_.head_dim) +
                                              head_id * config_.head_dim + l];
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    // fill k_cache_
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >=
                                seq_len + q_len ||
                            block_id * config_.block_len + k < seq_len)
                            continue;
                        for (int l = 0; l < config_.head_dim / 32; l++) {
                            block_q4_0 block;
                            for (int m = 0; m < 32; m++) {

                                block_fp32[m] = GGML_FP16_TO_FP32(
                                    k_data_[batch_id * (max_block_num *
                                                        config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            block_id * (config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            k * (config_.kv_head_num *
                                                 config_.head_dim) +
                                            head_id * config_.head_dim +
                                            l * 32 + m]);
                            }
                            quantize_row_q4_0(block_fp32.data(), &block, 32);
                            k_cache_q4[layer_id_][head_id][block_idx]
                                      [k * config_.head_dim / 32 + l] = block;
                        }
                    }

                    // fill v_cache_
                    for (int k = 0; k < config_.block_len / 32; k++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            block_q4_0 block;
                            for (int m = 0; m < 32; m++) {

                                if (block_id * config_.block_len + k * 32 + m >=
                                    seq_len + q_len) {
                                    block_fp32[m] = 0;
                                    continue;
                                }
                                block_fp32[m] = GGML_FP16_TO_FP32(
                                    v_data_[batch_id * (max_block_num *
                                                        config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            block_id * (config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            (k * 32 + m) * config_.kv_head_num *
                                                config_.head_dim +
                                            head_id * config_.head_dim + l]);
                            }
                            quantize_row_q4_0(block_fp32.data(), &block, 32);
                            v_cache_q4[layer_id_][head_id][block_idx]
                                      [l * config_.block_len / 32 + k] = block;
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    // fill k_cache_
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >=
                                seq_len + q_len ||
                            block_id * config_.block_len + k < seq_len)
                            continue;
                        for (int l = 0; l < config_.head_dim / 32; l++) {
                            block_q8_0 block;
                            for (int m = 0; m < 32; m++) {

                                block_fp32[m] = GGML_FP16_TO_FP32(
                                    k_data_[batch_id * (max_block_num *
                                                        config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            block_id * (config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            k * (config_.kv_head_num *
                                                 config_.head_dim) +
                                            head_id * config_.head_dim +
                                            l * 32 + m]);
                            }
                            quantize_row_q8_0(block_fp32.data(), &block, 32);
                            k_cache_q8[layer_id_][head_id][block_idx]
                                      [k * config_.head_dim / 32 + l] = block;
                        }
                    }

                    // fill v_cache_
                    for (int k = 0; k < config_.block_len / 32; k++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            block_q8_0 block;
                            for (int m = 0; m < 32; m++) {

                                if (block_id * config_.block_len + k * 32 + m >=
                                    seq_len + q_len) {
                                    block_fp32[m] = 0;
                                    continue;
                                }
                                block_fp32[m] = GGML_FP16_TO_FP32(
                                    v_data_[batch_id * (max_block_num *
                                                        config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            block_id * (config_.block_len *
                                                        config_.kv_head_num *
                                                        config_.head_dim) +
                                            (k * 32 + m) * config_.kv_head_num *
                                                config_.head_dim +
                                            head_id * config_.head_dim + l]);
                            }
                            quantize_row_q8_0(block_fp32.data(), &block, 32);
                            v_cache_q8[layer_id_][head_id][block_idx]
                                      [l * config_.block_len / 32 + k] = block;
                        }
                    }
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;

    // printf("layer %d time of reading and updating KV Cache: %f s\n",
    // layer_id,
    //        duration.count());
}

void KVCache::update_importance(const ggml_fp16_t *importance, int layer_id,
                                int *block_table, int batch_size,
                                int max_block_num, int *offset, int width,
                                Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    importance_data_ = const_cast<uint16_t *>(importance);

    // Each task updates the importance of a certain position
    backend->do_work_stealing_job(
        max_block_num * batch_size, nullptr,
        [&](int task_id) {
            int block_id = task_id % max_block_num;
            int batch_id = task_id / max_block_num;
            int block_idx = block_table[batch_id * max_block_num + block_id];
            if (block_id > (offset[batch_id] + width) / config_.block_len) {
                return;
            }
            for (int k = 0; k < config_.block_len; k++) {
                for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
                    importance_[layer_id_][block_idx][k][head_id] =
                        GGML_FP32_TO_FP16(
                            GGML_FP16_TO_FP32(
                                importance_data_[batch_id * max_block_num *
                                                     config_.block_len *
                                                     config_.q_head_num +
                                                 (block_id * config_.block_len +
                                                  k) *
                                                     config_.q_head_num +
                                                 head_id]) +
                            GGML_FP16_TO_FP32(
                                importance_[layer_id_][block_idx][k][head_id]));
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;

    // printf("layer %d time of updating importance: %f s\n", layer_id,
    //        duration.count());
}

void KVCache::get_kvcache_fp16(ggml_fp16_t *k_in, ggml_fp16_t *v_in,
                               int layer_id, int *block_table, int batch_size,
                               int max_block_num, int *cache_seqlens,
                               Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    k_data_ = const_cast<uint16_t *>(k_in);
    v_data_ = const_cast<uint16_t *>(v_in);

    // Each task updates the k cache and v cache of a certain header
    backend->do_work_stealing_job(
        config_.kv_head_num * max_block_num * batch_size, nullptr,
        [&](int task_id) {
            // printf("block_idx: %d, task_id: %d\n", block_idx, task_id);
            std::vector<float> block_fp32(32);
            int batch_id = task_id / (config_.kv_head_num * max_block_num);
            int block_id = (task_id / config_.kv_head_num) % max_block_num;
            int head_id = task_id % config_.kv_head_num;
            int block_idx = block_table[batch_id * max_block_num + block_id];
            int seq_len = cache_seqlens[batch_id];
            int block_l = block_id * config_.block_len;
            int block_r = block_id * config_.block_len + config_.block_len;

            if (block_l < seq_len) {
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >= seq_len)
                            break;
                        for (int l = 0; l < config_.head_dim; l++) {
                            k_data_
                                [batch_id *
                                     (max_block_num * config_.block_len *
                                      config_.kv_head_num * config_.head_dim) +
                                 block_id *
                                     (config_.block_len * config_.kv_head_num *
                                      config_.head_dim) +
                                 k * (config_.kv_head_num * config_.head_dim) +
                                 head_id * config_.head_dim + l] =
                                    k_cache_fp16_[layer_id_][head_id][block_idx]
                                                 [k * config_.head_dim + l];
                            v_data_
                                [batch_id *
                                     (max_block_num * config_.block_len *
                                      config_.kv_head_num * config_.head_dim) +
                                 block_id *
                                     (config_.block_len * config_.kv_head_num *
                                      config_.head_dim) +
                                 k * (config_.kv_head_num * config_.head_dim) +
                                 head_id * config_.head_dim + l] =
                                    v_cache_fp16_[layer_id_][head_id][block_idx]
                                                 [l * config_.block_len + k];
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    // get k_cache_
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >= seq_len)
                            break;
                        for (int l = 0; l < config_.head_dim / 32; l++) {
                            block_q4_0 block =
                                k_cache_q4[layer_id_][head_id][block_idx]
                                          [k * config_.head_dim / 32 + l];
                            dequantize_row_q4_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                k_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        k * (config_.kv_head_num *
                                             config_.head_dim) +
                                        head_id * config_.head_dim + l * 32 +
                                        m] = GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                    // get v_cache_
                    for (int k = 0; k < config_.block_len / 32; k++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            block_q4_0 block =
                                v_cache_q4[layer_id_][head_id][block_idx]
                                          [l * config_.block_len / 32 + k];
                            dequantize_row_q4_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                if (block_id * config_.block_len + k * 32 + m >=
                                    seq_len)
                                    break;
                                v_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        (k * 32 + m) * config_.kv_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    // get k_cache_
                    for (int k = 0; k < config_.block_len; k++) {
                        if (block_id * config_.block_len + k >= seq_len)
                            break;
                        for (int l = 0; l < config_.head_dim / 32; l++) {
                            block_q8_0 block =
                                k_cache_q8[layer_id_][head_id][block_idx]
                                          [k * config_.head_dim / 32 + l];
                            dequantize_row_q8_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                k_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        k * (config_.kv_head_num *
                                             config_.head_dim) +
                                        head_id * config_.head_dim + l * 32 +
                                        m] = GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                    // get v_cache_
                    for (int k = 0; k < config_.block_len / 32; k++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            block_q8_0 block =
                                v_cache_q8[layer_id_][head_id][block_idx]
                                          [l * config_.block_len / 32 + k];
                            dequantize_row_q8_0(&block, block_fp32.data(), 32);
                            for (int m = 0; m < 32; m++) {

                                if (block_id * config_.block_len + k * 32 + m >=
                                    seq_len)
                                    break;
                                v_data_[batch_id *
                                            (max_block_num * config_.block_len *
                                             config_.kv_head_num *
                                             config_.head_dim) +
                                        block_id * (config_.block_len *
                                                    config_.kv_head_num *
                                                    config_.head_dim) +
                                        (k * 32 + m) * config_.kv_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(block_fp32[m]);
                            }
                        }
                    }
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
}

void KVCache::update_kvcache_fp16(const ggml_fp16_t *k_in,
                                  const ggml_fp16_t *v_in, int layer_id,
                                  int *block_table, int batch_size,
                                  int max_block_num, int *cache_seqlens,
                                  int q_len, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    k_data_ = const_cast<uint16_t *>(k_in);
    v_data_ = const_cast<uint16_t *>(v_in);
    // Each task updates the k cache and v cache of a certain header
    backend->do_work_stealing_job(
        batch_size * config_.kv_head_num * q_len, nullptr,
        [&](int task_id) {
            int batch_id = task_id / (config_.kv_head_num * q_len);
            int head_id = task_id / q_len % config_.kv_head_num;
            int seq_len = cache_seqlens[batch_id] + task_id % q_len;
            int q_offset = task_id % q_len;

            int block_id = seq_len / config_.block_len;
            int block_idx = block_table[batch_id * max_block_num + block_id];
            int pos_in_block = seq_len % config_.block_len;

            if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                for (int l = 0; l < config_.head_dim; l++) {
                    k_cache_fp16_[layer_id_][head_id][block_idx]
                                 [pos_in_block * config_.head_dim + l] =
                                     k_data_[batch_id *
                                                 (q_len * config_.kv_head_num *
                                                  config_.head_dim) +
                                             q_offset * config_.kv_head_num *
                                                 config_.head_dim +
                                             head_id * config_.head_dim + l];
                    v_cache_fp16_[layer_id_][head_id][block_idx]
                                 [l * config_.block_len + pos_in_block] =
                                     v_data_[batch_id *
                                                 (q_len * config_.kv_head_num *
                                                  config_.head_dim) +
                                             q_offset * config_.kv_head_num *
                                                 config_.head_dim +
                                             head_id * config_.head_dim + l];
                }
            } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                std::vector<float> block_fp32(32);
                // fill k_cache_
                for (int l = 0; l < config_.head_dim / 32; l++) {
                    block_q4_0 block;
                    for (int m = 0; m < 32; m++) {

                        block_fp32[m] = GGML_FP16_TO_FP32(
                            k_data_[batch_id * (q_len * config_.kv_head_num *
                                                config_.head_dim) +
                                    head_id * config_.head_dim + l * 32 + m]);
                    }
                    quantize_row_q4_0(block_fp32.data(), &block, 32);

                    k_cache_q4[layer_id_][head_id][block_idx]
                              [pos_in_block * config_.head_dim / 32 + l] =
                                  block;
                }

                // fill v_cache_
                for (int l = 0; l < config_.head_dim; l++) {
                    block_q4_0 block = v_cache_q4[layer_id_][head_id][block_idx]
                                                 [l * config_.block_len / 32 +
                                                  pos_in_block / 32];
                    dequantize_row_q4_0(&block, block_fp32.data(), 32);
                    block_fp32[pos_in_block % 32] = GGML_FP16_TO_FP32(
                        v_data_[batch_id * (q_len * config_.kv_head_num *
                                            config_.head_dim) +
                                head_id * config_.head_dim + l]);
                    quantize_row_q4_0(block_fp32.data(), &block, 32);
                    v_cache_q4[layer_id_][head_id][block_idx]
                              [l * config_.block_len / 32 + pos_in_block / 32] =
                                  block;
                }
            } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                std::vector<float> block_fp32(32);
                // fill k_cache_
                for (int l = 0; l < config_.head_dim / 32; l++) {
                    block_q8_0 block;
                    for (int m = 0; m < 32; m++) {

                        block_fp32[m] = GGML_FP16_TO_FP32(
                            k_data_[batch_id * (q_len * config_.kv_head_num *
                                                config_.head_dim) +
                                    head_id * config_.head_dim + l * 32 + m]);
                    }
                    quantize_row_q8_0(block_fp32.data(), &block, 32);

                    k_cache_q8[layer_id_][head_id][block_idx]
                              [pos_in_block * config_.head_dim / 32 + l] =
                                  block;
                }

                // fill v_cache_
                for (int l = 0; l < config_.head_dim; l++) {
                    block_q8_0 block = v_cache_q8[layer_id_][head_id][block_idx]
                                                 [l * config_.block_len / 32 +
                                                  pos_in_block / 32];
                    dequantize_row_q8_0(&block, block_fp32.data(), 32);
                    block_fp32[pos_in_block % 32] = GGML_FP16_TO_FP32(
                        v_data_[batch_id * (q_len * config_.kv_head_num *
                                            config_.head_dim) +
                                head_id * config_.head_dim + l]);
                    quantize_row_q8_0(block_fp32.data(), &block, 32);
                    v_cache_q8[layer_id_][head_id][block_idx]
                              [l * config_.block_len / 32 + pos_in_block / 32] =
                                  block;
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    // printf("layer %d time of reading KV Cache: %f s\n", layer_id,
    //        duration.count());
}

void KVCache::get_all_kvcache_one_layer(int layer_id, ggml_fp16_t *k_in,
                                        ggml_fp16_t *v_in, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    layer_id_ = layer_id;
    seq_len_ = config_.block_len;
    block_num_ = get_cache_total_block_num();
    k_data_ = reinterpret_cast<uint16_t *>(k_in);
    v_data_ = reinterpret_cast<uint16_t *>(v_in);

    // Each task gets the k cache or v cache of a certain header
    backend->do_work_stealing_job(
        config_.kv_head_num * past_block_num_[layer_id] * 2, nullptr,
        [&](int task_id) {
            std::vector<float> block_fp32(32);
            int head_id = task_id / 2 / past_block_num_[layer_id];
            int block_idx = task_id / 2 % past_block_num_[layer_id];
            if (block_idx >= block_num_)
                return;

            int max_offset = 0;
            if (task_id & 1) {
                // get k_cache_
                for (int k = 0; k < config_.block_len; k++) {
                    if (block_idx * seq_len_ + k >= cache_total_len_)
                        break;
                    for (int l = 0; l < config_.head_dim / 32; l++) {
                        block_q4_0 block =
                            k_cache_q4[layer_id_][head_id][block_idx]
                                      [k * config_.head_dim / 32 + l];
                        dequantize_row_q4_0(&block, block_fp32.data(), 32);
                        for (int m = 0; m < 32; m++) {

                            k_data_[(head_id * cache_total_len_ +
                                     block_idx * config_.block_len + k) *
                                        config_.head_dim +
                                    l * 32 + m] =
                                GGML_FP32_TO_FP16(block_fp32[m]);
                            max_offset = std::max(
                                max_offset,
                                (int)(head_id * cache_total_len_ +
                                      block_idx * config_.block_len + k) *
                                        config_.head_dim +
                                    l * 32 + m);
                        }
                    }
                }
            } else {
                // get v_cache_
                for (int k = 0; k < config_.block_len / 32; k++) {
                    for (int l = 0; l < config_.head_dim; l++) {
                        block_q4_0 block =
                            v_cache_q4[layer_id_][head_id][block_idx]
                                      [l * config_.block_len / 32 + k];
                        dequantize_row_q4_0(&block, block_fp32.data(), 32);
                        for (int m = 0; m < 32; m++) {

                            if (block_idx * seq_len_ + k * 32 + m >=
                                cache_total_len_)
                                break;
                            v_data_[(head_id * cache_total_len_ +
                                     block_idx * config_.block_len + k * 32 +
                                     m) *
                                        config_.head_dim +
                                    l] = GGML_FP32_TO_FP16(block_fp32[m]);
                            max_offset =
                                std::max(max_offset,
                                         (int)((head_id * cache_total_len_ +
                                                block_idx * config_.block_len +
                                                k * 32 + m) *
                                                   config_.head_dim +
                                               l));
                        }
                    }
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    // printf("layer %d block num %d time of reading all KV Cache: %f s\n",
    //        layer_id, block_num_, duration.count());
}


================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/kvcache/kvcache_utils.cpp
================================================
/**
 * @Description  :
 * @Author       : Jianwei Dong
 * @Date         : 2024-08-26 22:47:06
 * @Version      : 1.0.0
 * @LastEditors  : Jianwei Dong
 * @LastEditTime : 2024-08-26 22:47:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "kvcache.h"

#include <chrono>

std::string ggml_type_to_string(ggml_type type) {
    switch (type) {
    case GGML_TYPE_F32:
        return "GGML_TYPE_F32";
    case GGML_TYPE_F16:
        return "GGML_TYPE_F16";
    case GGML_TYPE_Q4_0:
        return "GGML_TYPE_Q4_0";
    case GGML_TYPE_Q8_0:
        return "GGML_TYPE_Q8_0";
    }
    return "UNDIFINED";
}
std::string AnchorTypeToString(AnchorType type) {
    switch (type) {
    case AnchorType::DYNAMIC:
        return "DYNAMIC";
    case AnchorType::BLOCK_MEAN:
        return "BLOCK_MEAN";
    case AnchorType::BLOCK_MAX:
        return "BLOCK_MAX";
    case AnchorType::FIXED_ANCHOR:
        return "FIXED_ANCHOR";
    case AnchorType::QUEST:
        return "QUEST";
    }
    return "UNDIFINED";
}
std::string RetrievalTypeToString(RetrievalType type) {
    switch (type) {
    case RetrievalType::LAYER:
        return "SHARED";
    case RetrievalType::KVHEAD:
        return "SEPARATE";
    case RetrievalType::QHEAD:
        return "INDIVIDUAL";
    }
    return "UNDIFINED";
}
KVCacheConfig::KVCacheConfig(int layer_num, int kv_head_num, int q_head_num,
                             int head_dim, int block_len, int anchor_num,
                             AnchorType anchor_type, ggml_type kv_type,
                             RetrievalType retrieval_type, int layer_step,
                             int token_step, int layer_offset,
                             int max_block_num, int max_batch_size,
                             int max_thread_num)
    : layer_num(layer_num), kv_head_num(kv_head_num), q_head_num(q_head_num),
      head_dim(head_dim), block_len(block_len), anchor_num(anchor_num),
      anchor_type(anchor_type), kv_type(kv_type),
      retrieval_type(retrieval_type), layer_step(layer_step),
      token_step(token_step), layer_offset(layer_offset),
      max_block_num(max_block_num), max_batch_size(max_batch_size),
      max_thread_num(max_thread_num) {
    printf(
        "layer_num: %d, kv_head_num: %d, q_head_num: %d, head_dim: %d, "
        "block_len: %d, anchor_num: %d, anchor_type: %s, kv_type: %s, "
        "retrieval_type: %s, layer_step: %d, token_step: %d, layer_offset: %d,"
        "max_block_num: %d, max_batch_size: %d, max_thread_num: %d\n",
        layer_num, kv_head_num, q_head_num, head_dim, block_len, anchor_num,
        AnchorTypeToString(anchor_type).c_str(),
        ggml_type_to_string(kv_type).c_str(),
        RetrievalTypeToString(retrieval_type).c_str(), layer_step, token_step,
        layer_offset, max_block_num, max_batch_size, max_thread_num);
    assert(q_head_num % kv_head_num == 0);
}
KVCache::KVCache(KVCacheConfig config) {
    this->config_ = config;

    n_gqa_ = config_.q_head_num / config_.kv_head_num;
    if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
        // TODO: Elegant implement
        k_cache_fp16_.resize(config_.layer_num);
        v_cache_fp16_.resize(config_.layer_num);
        selected_blocks_num_history_.resize(config_.layer_num /
                                            config_.layer_step);
        if (config_.retrieval_type == RetrievalType::LAYER) {
            selected_blocks_history_.resize(config_.layer_num /
                                            config_.layer_step);
        } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
            selected_blocks_history_kvhead_.resize(config_.layer_num /
                                                   config_.layer_step);
        } else if (config_.retrieval_type == RetrievalType::QHEAD) {
        }
    } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
        k_cache_q4.resize(config.layer_num);
        v_cache_q4.resize(config.layer_num);
    } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
        k_cache_q8.resize(config.layer_num);
        v_cache_q8.resize(config.layer_num);
    } else {
        assert(false);
    }
    anchor_.resize(config.layer_num * config.max_block_num * config.anchor_num *
                   config.q_head_num * config.head_dim);
    importance_.resize(config.layer_num);
    past_block_num_.resize(config.layer_num);
    for (int i = 0; i < config.layer_num; i++) {
        past_block_num_[i] = 0;
    }

    ThreadResize(config.max_thread_num);
    BatchResize(config.max_batch_size);
    BlockResize(config.max_block_num);
    q_fp32.resize(n_gqa_ * config.head_dim);
}

void KVCache::ThreadResize(int thread_num) {
    thread_local_output_q8_0_.resize(thread_num);
    thread_local_attn_score_.resize(thread_num);
    thread_local_output_fp32_.resize(thread_num);
    thread_local_attn_lse_.resize(thread_num);
    thread_local_cur_output_fp32_.resize(thread_num);
    thread_local_cur_attn_lse_.resize(thread_num);
    thread_local_draft_.resize(thread_num);
    thread_cur_head_idx_.resize(thread_num);
    thread_local_attn_mask_.resize(thread_num);
    for (int i = 0; i < thread_num; i++) {
        thread_local_output_q8_0_[i].resize(n_gqa_ * config_.head_dim / QK8_0);
        thread_local_attn_score_[i].resize(n_gqa_ * config_.block_len);
        thread_local_output_fp32_[i].resize(n_gqa_ * config_.head_dim);
        thread_local_attn_lse_[i].resize(n_gqa_);
        thread_local_cur_output_fp32_[i].resize(n_gqa_ * config_.head_dim);
        thread_local_cur_attn_lse_[i].resize(n_gqa_);
        thread_local_draft_[i].resize(
            2 * n_gqa_ * config_.block_len + 6 * n_gqa_ * config_.head_dim +
            2 * config_.block_len * config_.head_dim +
            config_.block_len * config_.head_dim / QK4_0);
        thread_local_attn_mask_[i].resize(config_.block_len / 8);
    }
}
void KVCache::BatchResize(int batch_size) {
    mutex_.resize(batch_size);
    q_q8_0_.resize(batch_size);
    q_fp32_.resize(batch_size);
    output_fp32_.resize(batch_size);
    attn_lse_.resize(batch_size);
    block_lse_.resize(batch_size);
    attn_sparsity_.resize(batch_size);

    if (config_.retrieval_type == RetrievalType::LAYER) {
        block_table_before_retrieval_.resize(batch_size);
        block_table_after_retrieval_.resize(batch_size);

        for (int i = 0; i < config_.layer_num / config_.layer_step; i++) {
            selected_blocks_history_[i].resize(batch_size);
        }

    } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
        block_table_before_retrieval_kvhead_.resize(batch_size);
        block_table_after_retrieval_kvhead_.resize(batch_size);
        for (int i = 0; i < config_.layer_num / config_.layer_step; i++) {
            selected_blocks_history_kvhead_[i].resize(batch_size);
        }
    } else if (config_.retrieval_type == RetrievalType::QHEAD) {
        block_table_before_retrieval_qhead_.resize(batch_size);
        block_table_after_retrieval_qhead_.resize(batch_size);
    }
    cache_seqlens_.resize(batch_size);
    if (config_.retrieval_type == RetrievalType::LAYER) {
        block_similar_.resize(batch_size);
    } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
        block_similar_kv_head_.resize(batch_size);
    } else if (config_.retrieval_type == RetrievalType::QHEAD) {
        block_similar_q_head_.resize(batch_size);
    }
    for (int i = 0; i < batch_size; i++) {
        top_similar_block_.resize(batch_size);

        mutex_[i].resize(config_.kv_head_num);
        q_q8_0_[i].resize(config_.kv_head_num);
        q_fp32_[i].resize(config_.kv_head_num);
        output_fp32_[i].resize(config_.kv_head_num);
        attn_lse_[i].resize(config_.kv_head_num);

        for (int j = 0; j < config_.kv_head_num; j++) {
            if (!mutex_[i][j]) {
                mutex_[i][j] = std::make_unique<std::mutex>();
            }
            q_q8_0_[i][j].resize(n_gqa_ * config_.head_dim / QK8_0);
            q_fp32_[i][j].resize(n_gqa_ * config_.head_dim);
            output_fp32_[i][j].resize(n_gqa_ * config_.head_dim);
            attn_lse_[i][j].resize(n_gqa_);
        }
    }
    avg_q.resize(batch_size);
    avg_q_fp16.resize(batch_size);
    for (int i = 0; i < batch_size; i++) {
        attn_sparsity_[i].resize(config_.q_head_num);
        avg_q[i].resize(config_.q_head_num * config_.head_dim);
        avg_q_fp16[i].resize(config_.q_head_num * config_.head_dim);
    }
}

void KVCache::BlockResize(int max_block_num) {
    sin_.resize(max_block_num * config_.block_len);
    cos_.resize(max_block_num * config_.block_len);
    for (int i = 0; i < max_block_num * config_.block_len; i++) {
        sin_[i].resize(config_.head_dim);
        cos_[i].resize(config_.head_dim);
    }

    for (int i = 0; i < config_.layer_num / config_.layer_step; i++) {
        for (int j = 0; j < config_.max_batch_size; j++) {
            if (config_.retrieval_type == RetrievalType::LAYER) {
                selected_blocks_history_[i][j].resize(max_block_num);
            } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
                selected_blocks_history_kvhead_[i][j].resize(max_block_num);
                for (int k = 0; k < config_.max_block_num; k++) {
                    selected_blocks_history_kvhead_[i][j][k].resize(
                        config_.kv_head_num);
                }
            } else if (config_.retrieval_type == RetrievalType::QHEAD) {
            }
        }
    }

    for (int layer_id = 0; layer_id < config_.layer_num; layer_id++) {
        importance_[layer_id].resize(max_block_num);

        if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
            // TODO: Elegant implement
            k_cache_fp16_[layer_id].resize(config_.kv_head_num);
            v_cache_fp16_[layer_id].resize(config_.kv_head_num);

            for (int i = 0; i < config_.kv_head_num; i++) {
                k_cache_fp16_[layer_id][i].resize(max_block_num);
                v_cache_fp16_[layer_id][i].resize(max_block_num);

                for (int j = 0; j < max_block_num; j++) {
                    k_cache_fp16_[layer_id][i][j].resize(config_.block_len *
                                                         config_.head_dim);
                    v_cache_fp16_[layer_id][i][j].resize(config_.block_len *
                                                         config_.head_dim);
                }
            }

        } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
            k_cache_q4[layer_id].resize(config_.kv_head_num);
            v_cache_q4[layer_id].resize(config_.kv_head_num);
            for (int i = 0; i < config_.kv_head_num; i++) {
                k_cache_q4[layer_id][i].resize(max_block_num);
                v_cache_q4[layer_id][i].resize(max_block_num);

                for (int j = 0; j < max_block_num; j++) {
                    k_cache_q4[layer_id][i][j].resize(config_.block_len *
                                                      config_.head_dim / 32);
                    v_cache_q4[layer_id][i][j].resize(config_.block_len *
                                                      config_.head_dim / 32);
                }
            }
        } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
            k_cache_q8[layer_id].resize(config_.kv_head_num);
            v_cache_q8[layer_id].resize(config_.kv_head_num);
            for (int i = 0; i < config_.kv_head_num; i++) {
                k_cache_q8[layer_id][i].resize(max_block_num);
                v_cache_q8[layer_id][i].resize(max_block_num);

                for (int j = 0; j < max_block_num; j++) {
                    k_cache_q8[layer_id][i][j].resize(config_.block_len *
                                                      config_.head_dim / 32);
                    v_cache_q8[layer_id][i][j].resize(config_.block_len *
                                                      config_.head_dim / 32);
                }
            }
        } else {
            assert(false);
        }
        for (int i = 0; i < config_.max_batch_size; i++) {
            if (config_.retrieval_type == RetrievalType::LAYER) {
                block_similar_[i].resize(max_block_num);
                block_table_before_retrieval_[i].resize(max_block_num);
                block_table_after_retrieval_[i].resize(max_block_num);
            } else if (config_.retrieval_type == RetrievalType::KVHEAD) {
                block_similar_kv_head_[i].resize(max_block_num);
                block_table_before_retrieval_kvhead_[i].resize(max_block_num);
                block_table_after_retrieval_kvhead_[i].resize(max_block_num);
                for (int j = 0; j < max_block_num; j++) {
                    block_similar_kv_head_[i][j].resize(config_.kv_head_num);
                    block_table_before_retrieval_kvhead_[i][j].resize(
                        config_.kv_head_num);
                    block_table_after_retrieval_kvhead_[i][j].resize(
                        config_.kv_head_num);
                }
            } else if (config_.retrieval_type == RetrievalType::QHEAD) {
                block_similar_q_head_[i].resize(max_block_num);
                block_table_before_retrieval_qhead_[i].resize(max_block_num);
                block_table_after_retrieval_qhead_[i].resize(max_block_num);
                for (int j = 0; j < max_block_num; j++) {
                    block_similar_q_head_[i][j].resize(config_.q_head_num);
                    block_table_before_retrieval_qhead_[i][j].resize(
                        config_.q_head_num);
                    block_table_after_retrieval_qhead_[i][j].resize(
                        config_.q_head_num);
                }
            }
            block_lse_[i].resize(max_block_num);
            for (int j = 0; j < max_block_num; j++) {
                block_lse_[i][j].resize(config_.q_head_num);
            }
        }

        for (int i = 0; i < max_block_num; i++) {
            importance_[layer_id][i].resize(config_.block_len);
            for (int j = 0; j < config_.block_len; j++) {
                importance_[layer_id][i][j].resize(config_.q_head_num);
            }
        }
    }
}

void KVCache::calc_anchor_all_layers(int *block_table, int *cache_seqlens,
                                     int batch_size, int max_block_num,
                                     Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    // Each task updates the importance of a certain block
    seq_len_ = config_.block_len;
    backend->do_work_stealing_job(
        config_.layer_num * batch_size * max_block_num, nullptr,
        [&](int task_id) {
            int layer_id = task_id / (batch_size * max_block_num);
            int batch_id = (task_id / max_block_num) % batch_size;
            int block_id = task_id % max_block_num;
            // If the block is out of the sequence length, skip it. In
            // particular, the last block of the sequence that is shorter than
            // the block length should be skipped.

            if (cache_seqlens[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx = block_table[batch_id * max_block_num + block_id];

            std::vector<float> block_fp32(32);
            if (config_.anchor_type == AnchorType::DYNAMIC) {

                // clear anchor_
                for (int anchor_id = 0; anchor_id < 1; anchor_id++) {
                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            anchor_[layer_id * config_.max_block_num *
                                        config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    block_idx * config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    anchor_id * config_.q_head_num *
                                        config_.head_dim +
                                    head_id * config_.head_dim + l] = 0;
                        }
                    }
                }

                // find top anchor_num importances and their corresponding
                // positions in the importance_ tensor
                // TODO: Move top_importances to the class member to avoid
                // repeated memory allocation
                std::priority_queue<
                    std::pair<float, std::pair<int, int>>,
                    std::vector<std::pair<float, std::pair<int, int>>>,
                    std::greater<>>
                    top_importances;
                for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
                    for (int k = 0; k < seq_len_; k++) {
                        top_importances.push(std::make_pair(
                            GGML_FP16_TO_FP32(
                                importance_[layer_id][block_idx][k][head_id]),
                            std::make_pair(block_idx, k)));
                        // TODO: change to config_ item
                        if (top_importances.size() > config_.anchor_num) {
                            top_importances.pop();
                        }
                    }

                    // fill anchor_

                    for (int l = 0; l < config_.head_dim; l++) {
                        anchor_[layer_id * config_.max_block_num *
                                    config_.anchor_num * config_.q_head_num *
                                    config_.head_dim +
                                block_idx * config_.anchor_num *
                                    config_.q_head_num * config_.head_dim +
                                0 * config_.q_head_num * config_.head_dim +
                                head_id * config_.head_dim + l] = 0;
                    }
                    for (int k = 0; k < config_.anchor_num; k++) {
                        int top_indice = top_importances.top().second.second;
                        int top_block_idx = top_importances.top().second.first;

                        if (config_.kv_type == ggml_type::GGML_TYPE_F16) {

                            for (int l = 0; l < config_.head_dim; l++) {
                                anchor_[layer_id * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        top_block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        0 * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(
                                        GGML_FP16_TO_FP32(
                                            anchor_[layer_id *
                                                        config_.max_block_num *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    top_block_idx *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    0 * config_.q_head_num *
                                                        config_.head_dim +
                                                    head_id * config_.head_dim +
                                                    l]) +
                                        GGML_FP16_TO_FP32(
                                            k_cache_fp16_[layer_id]
                                                         [head_id / n_gqa_]
                                                         [top_block_idx]
                                                         [top_indice *
                                                              config_.head_dim +
                                                          l]));
                            }

                        } else if (config_.kv_type ==
                                   ggml_type::GGML_TYPE_Q4_0) {
                            for (int l = 0; l < config_.head_dim / 32; l++) {
                                block_q4_0 block = k_cache_q4
                                    [layer_id][head_id / n_gqa_][top_block_idx]
                                    [top_indice * config_.head_dim / 32 + l];
                                dequantize_row_q4_0(&block, block_fp32.data(),
                                                    32);
                                for (int m = 0; m < 32; m++) {
                                    anchor_[layer_id * config_.max_block_num *
                                                config_.anchor_num *
                                                config_.q_head_num *
                                                config_.head_dim +
                                            top_block_idx * config_.anchor_num *
                                                config_.q_head_num *
                                                config_.head_dim +
                                            0 * config_.q_head_num *
                                                config_.head_dim +
                                            head_id * config_.head_dim +
                                            l * 32 + m] =
                                        GGML_FP32_TO_FP16(
                                            block_fp32[m] / 4 +
                                            GGML_FP16_TO_FP32(
                                                anchor_[layer_id *
                                                            config_
                                                                .max_block_num *
                                                            config_.anchor_num *
                                                            config_.q_head_num *
                                                            config_.head_dim +
                                                        top_block_idx *
                                                            config_.anchor_num *
                                                            config_.q_head_num *
                                                            config_.head_dim +
                                                        0 * config_.q_head_num *
                                                            config_.head_dim +
                                                        head_id *
                                                            config_.head_dim +
                                                        l * 32 + m]));
                                }
                            }
                        } else if (config_.kv_type ==
                                   ggml_type::GGML_TYPE_Q8_0) {
                            for (int l = 0; l < config_.head_dim / 32; l++) {
                                block_q8_0 block = k_cache_q8
                                    [layer_id][head_id / n_gqa_][top_block_idx]
                                    [top_indice * config_.head_dim / 32 + l];
                                dequantize_row_q8_0(&block, block_fp32.data(),
                                                    32);
                                for (int m = 0; m < 32; m++) {
                                    anchor_[layer_id * config_.max_block_num *
                                                config_.anchor_num *
                                                config_.q_head_num *
                                                config_.head_dim +
                                            top_block_idx * config_.anchor_num *
                                                config_.q_head_num *
                                                config_.head_dim +
                                            0 * config_.q_head_num *
                                                config_.head_dim +
                                            head_id * config_.head_dim +
                                            l * 32 + m] =
                                        GGML_FP32_TO_FP16(
                                            block_fp32[m] / 4 +
                                            GGML_FP16_TO_FP32(
                                                anchor_[layer_id *
                                                            config_
                                                                .max_block_num *
                                                            config_.anchor_num *
                                                            config_.q_head_num *
                                                            config_.head_dim +
                                                        top_block_idx *
                                                            config_.anchor_num *
                                                            config_.q_head_num *
                                                            config_.head_dim +
                                                        0 * config_.q_head_num *
                                                            config_.head_dim +
                                                        head_id *
                                                            config_.head_dim +
                                                        l * 32 + m]));
                                }
                            }
                        }
                        top_importances.pop();
                    }
                }
            } else if (config_.anchor_type == AnchorType::BLOCK_MEAN) {
                // clear anchor_
                for (int anchor_id = 0; anchor_id < config_.anchor_num;
                     anchor_id++) {
                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            anchor_[layer_id * config_.max_block_num *
                                        config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    block_idx * config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    anchor_id * config_.q_head_num *
                                        config_.head_dim +
                                    head_id * config_.head_dim + l] = 0;
                        }
                    }
                }

                // fill anchor_
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {

                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int k = 0; k < config_.block_len; k++) {
                            for (int l = 0; l < config_.head_dim; l++) {
                                anchor_[layer_id * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        0 * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(
                                        GGML_FP16_TO_FP32(
                                            anchor_[layer_id *
                                                        config_.max_block_num *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    block_idx *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    0 * config_.q_head_num *
                                                        config_.head_dim +
                                                    head_id * config_.head_dim +
                                                    l]) +
                                        GGML_FP16_TO_FP32(
                                            k_cache_fp16_[layer_id]
                                                         [head_id / n_gqa_]
                                                         [block_idx]
                                                         [k * config_.head_dim +
                                                          l]) /
                                            config_.block_len);
                            }
                        }
                    }
                }
            } else if (config_.anchor_type == AnchorType::BLOCK_MAX) {
                // clear anchor_
                for (int anchor_id = 0; anchor_id < config_.anchor_num;
                     anchor_id++) {
                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            anchor_[layer_id * config_.max_block_num *
                                        config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    block_idx * config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    anchor_id * config_.q_head_num *
                                        config_.head_dim +
                                    head_id * config_.head_dim + l] = 0;
                        }
                    }
                }

                // fill anchor_
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {

                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int k = 0; k < config_.block_len; k++) {
                            for (int l = 0; l < config_.head_dim; l++) {
                                anchor_[layer_id * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        0 * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(std::max(
                                        GGML_FP16_TO_FP32(
                                            anchor_[layer_id *
                                                        config_.max_block_num *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    block_idx *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    0 * config_.q_head_num *
                                                        config_.head_dim +
                                                    head_id * config_.head_dim +
                                                    l]),
                                        GGML_FP16_TO_FP32(
                                            k_cache_fp16_
                                                [layer_id][head_id / n_gqa_]
                                                [block_idx]
                                                [k * config_.head_dim + l])));
                            }
                        }
                    }
                }
            } else if (config_.anchor_type == AnchorType::FIXED_ANCHOR) {
                // clear anchor_
                for (int anchor_id = 0; anchor_id < 1; anchor_id++) {
                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int l = 0; l < config_.head_dim; l++) {
                            anchor_[layer_id * config_.max_block_num *
                                        config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    block_idx * config_.anchor_num *
                                        config_.q_head_num * config_.head_dim +
                                    anchor_id * config_.q_head_num *
                                        config_.head_dim +
                                    head_id * config_.head_dim + l] = 0;
                        }
                    }
                }

                // fill anchor_
                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {

                    int stride = config_.block_len / config_.anchor_num;
                    for (int head_id = 0; head_id < config_.q_head_num;
                         head_id++) {
                        for (int k = 0, tot = 0;
                             k < config_.block_len, tot < config_.anchor_num;
                             k += stride, tot++) {
                            for (int l = 0; l < config_.head_dim; l++) {
                                anchor_[layer_id * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        0 * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(
                                        GGML_FP16_TO_FP32(
                                            anchor_[layer_id *
                                                        config_.max_block_num *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    block_idx *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    0 * config_.q_head_num *
                                                        config_.head_dim +
                                                    head_id * config_.head_dim +
                                                    l]) +
                                        GGML_FP16_TO_FP32(
                                            k_cache_fp16_[layer_id]
                                                         [head_id / n_gqa_]
                                                         [block_idx]
                                                         [k * config_.head_dim +
                                                          l]) /
                                            config_.anchor_num);
                            }
                        }
                    }
                }

            } else if (config_.anchor_type == AnchorType::QUEST) {
                // clear anchor_
                for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
                    for (int l = 0; l < config_.head_dim; l++) {
                        anchor_[layer_id * config_.max_block_num *
                                    config_.anchor_num * config_.q_head_num *
                                    config_.head_dim +
                                block_idx * config_.anchor_num *
                                    config_.q_head_num * config_.head_dim +
                                1 * config_.q_head_num * config_.head_dim +
                                head_id * config_.head_dim + l] =
                            GGML_FP32_TO_FP16(
                                std::numeric_limits<float>::max());

                        anchor_[layer_id * config_.max_block_num *
                                    config_.anchor_num * config_.q_head_num *
                                    config_.head_dim +
                                block_idx * config_.anchor_num *
                                    config_.q_head_num * config_.head_dim +
                                0 * config_.q_head_num * config_.head_dim +
                                head_id * config_.head_dim + l] =
                            GGML_FP32_TO_FP16(
                                std::numeric_limits<float>::min());
                    }
                }

                // fill anchor_

                if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                    for (int indice = 0; indice < seq_len_; indice++) {
                        for (int head_id = 0; head_id < config_.kv_head_num;
                             head_id++) {
                            for (int l = 0; l < config_.head_dim; l++) {
                                anchor_[layer_id * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        0 * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(std::max(
                                        GGML_FP16_TO_FP32(
                                            k_cache_fp16_
                                                [layer_id][head_id][block_idx]
                                                [indice * config_.head_dim +
                                                 l]),
                                        GGML_FP16_TO_FP32(
                                            anchor_[layer_id *
                                                        config_.max_block_num *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    block_idx *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    0 * config_.q_head_num *
                                                        config_.head_dim +
                                                    head_id * config_.head_dim +
                                                    l])));

                                anchor_[layer_id * config_.max_block_num *
                                            config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        block_idx * config_.anchor_num *
                                            config_.q_head_num *
                                            config_.head_dim +
                                        1 * config_.q_head_num *
                                            config_.head_dim +
                                        head_id * config_.head_dim + l] =
                                    GGML_FP32_TO_FP16(std::min(
                                        GGML_FP16_TO_FP32(
                                            k_cache_fp16_
                                                [layer_id][head_id][block_idx]
                                                [indice * config_.head_dim +
                                                 l]),
                                        GGML_FP16_TO_FP32(
                                            anchor_[layer_id *
                                                        config_.max_block_num *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    block_idx *
                                                        config_.anchor_num *
                                                        config_.q_head_num *
                                                        config_.head_dim +
                                                    1 * config_.q_head_num *
                                                        config_.head_dim +
                                                    head_id * config_.head_dim +
                                                    l])));
                            }
                        }
                    }

                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                    for (int indice = 0; indice < seq_len_; indice++) {
                        for (int head_id = 0; head_id < config_.kv_head_num;
                             head_id++) {
                            for (int l = 0; l < config_.head_dim / 32; l++) {
                                block_q4_0 block =
                                    k_cache_q4[layer_id][head_id][block_idx]
                                              [indice * config_.head_dim / 32 +
                                               l];
                                dequantize_row_q4_0(&block, block_fp32.data(),
                                                    32);

                                for (int m = 0; m < 32; m++) {
                                    for (int gqa_idx = 0; gqa_idx < n_gqa_;
                                         gqa_idx++) {

                                        anchor_[layer_id *
                                                    config_.max_block_num *
                                                    config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                block_idx * config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                0 * config_.q_head_num *
                                                    config_.head_dim +
                                                head_id * config_.head_dim +
                                                l * 32 + m] =
                                            GGML_FP32_TO_FP16(std::max(
                                                block_fp32[m],
                                                GGML_FP16_TO_FP32(
                                                    anchor_
                                                        [layer_id *
                                                             config_
                                                                 .max_block_num *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         block_idx *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         0 *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         head_id *
                                                             config_.head_dim +
                                                         l * 32 + m])));

                                        anchor_[layer_id *
                                                    config_.max_block_num *
                                                    config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                block_idx * config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                1 * config_.q_head_num *
                                                    config_.head_dim +
                                                head_id * config_.head_dim +
                                                l * 32 + m] =
                                            GGML_FP32_TO_FP16(std::min(
                                                block_fp32[m],
                                                GGML_FP16_TO_FP32(
                                                    anchor_
                                                        [layer_id *
                                                             config_
                                                                 .max_block_num *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         block_idx *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         1 *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         head_id *
                                                             config_.head_dim +
                                                         l * 32 + m])));
                                    }
                                }
                            }
                        }
                    }
                } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                    for (int indice = 0; indice < seq_len_; indice++) {
                        for (int head_id = 0; head_id < config_.kv_head_num;
                             head_id++) {
                            for (int l = 0; l < config_.head_dim / 32; l++) {
                                block_q8_0 block =
                                    k_cache_q8[layer_id][head_id][block_idx]
                                              [indice * config_.head_dim / 32 +
                                               l];
                                dequantize_row_q8_0(&block, block_fp32.data(),
                                                    32);

                                for (int m = 0; m < 32; m++) {
                                    for (int gqa_idx = 0; gqa_idx < n_gqa_;
                                         gqa_idx++) {

                                        anchor_[layer_id *
                                                    config_.max_block_num *
                                                    config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                block_idx * config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                0 * config_.q_head_num *
                                                    config_.head_dim +
                                                head_id * config_.head_dim +
                                                l * 32 + m] =
                                            GGML_FP32_TO_FP16(std::max(
                                                block_fp32[m],
                                                GGML_FP16_TO_FP32(
                                                    anchor_
                                                        [layer_id *
                                                             config_
                                                                 .max_block_num *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         block_idx *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         0 *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         head_id *
                                                             config_.head_dim +
                                                         l * 32 + m])));

                                        anchor_[layer_id *
                                                    config_.max_block_num *
                                                    config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                block_idx * config_.anchor_num *
                                                    config_.q_head_num *
                                                    config_.head_dim +
                                                1 * config_.q_head_num *
                                                    config_.head_dim +
                                                head_id * config_.head_dim +
                                                l * 32 + m] =
                                            GGML_FP32_TO_FP16(std::min(
                                                block_fp32[m],
                                                GGML_FP16_TO_FP32(
                                                    anchor_
                                                        [layer_id *
                                                             config_
                                                                 .max_block_num *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         block_idx *
                                                             config_
                                                                 .anchor_num *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         1 *
                                                             config_
                                                                 .q_head_num *
                                                             config_.head_dim +
                                                         head_id *
                                                             config_.head_dim +
                                                         l * 32 + m])));
                                    }
                                }
                            }
                        }
                    }
                }
            } else {
                assert(false);
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    //    printf("time of calc_anchor_all_layers: %f s\n", duration.count());
}

void KVCache::clear_importance_all_layers(int *block_table, int *cache_seqlens,
                                          int batch_size, int max_block_num,
                                          Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    // Each task updates the importance of a certain block
    seq_len_ = config_.block_len;
    backend->do_work_stealing_job(
        config_.layer_num * batch_size * max_block_num, nullptr,
        [&](int task_id) {
            int layer_id = task_id / (batch_size * max_block_num);
            int batch_id = (task_id / max_block_num) % batch_size;
            int block_id = task_id % max_block_num;
            // If the block is out of the sequence length, skip it. In
            // particular, the last block of the sequence that is shorter than
            // the block length should be skipped.

            if (cache_seqlens[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx = block_table[batch_id * max_block_num + block_id];

            if (config_.anchor_type == AnchorType::DYNAMIC) {

                // clear anchor_
                for (int head_id = 0; head_id < config_.q_head_num; head_id++) {
                    for (int l = 0; l < config_.block_len; l++) {
                        importance_[layer_id][block_idx][l][head_id] = 0;
                    }
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    //    printf("time of clear_importance_all_layerssssss: %f s\n",
    //    duration.count());
}

void KVCache::clear_kvcache_all_layers(int *block_table, int *cache_seqlens,
                                       int batch_size, int max_block_num,
                                       Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    // Each task updates the importance of a certain block
    seq_len_ = config_.block_len;
    backend->do_work_stealing_job(
        config_.layer_num * batch_size * max_block_num * config_.kv_head_num,
        nullptr,
        [&](int task_id) {
            int layer_id =
                task_id / (batch_size * max_block_num * config_.kv_head_num);
            int batch_id =
                (task_id / (max_block_num * config_.kv_head_num)) % batch_size;
            int block_id = task_id / config_.kv_head_num % max_block_num;
            int head_id = task_id % config_.kv_head_num;
            // If the block is out of the sequence length, skip it. In
            // particular, the last block of the sequence that is shorter than
            // the block length should be skipped.
            if (cache_seqlens[batch_id] / config_.block_len < block_id) {
                return;
            }
            int block_idx = block_table[batch_id * max_block_num + block_id];

            if (config_.kv_type == ggml_type::GGML_TYPE_F16) {
                for (int l = 0; l < config_.block_len * config_.head_dim; l++) {
                    k_cache_fp16_[layer_id][head_id][block_idx][l] = 0;
                    v_cache_fp16_[layer_id][head_id][block_idx][l] = 0;
                }
            } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) {
                for (int l = 0; l < config_.block_len * config_.head_dim / 32;
                     l++) {
                    k_cache_q4[layer_id][head_id][block_idx][l].d = 0;
                    v_cache_q4[layer_id][head_id][block_idx][l].d = 0;
                }
            } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) {
                for (int l = 0; l < config_.block_len * config_.head_dim / 32;
                     l++) {
                    k_cache_q8[layer_id][head_id][block_idx][l].d = 0;
                    v_cache_q8[layer_id][head_id][block_idx][l].d = 0;
                }
            }
        },
        nullptr);

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    //    printf("time of clear_kvcache_all_layers: %f s\n", duration.count());
}

void KVCache::get_sincos(ggml_fp16_t *sin, ggml_fp16_t *cos, int seqlen) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

    const uint16_t *sin_data = const_cast<const uint16_t *>(sin);
    const uint16_t *cos_data = const_cast<const uint16_t *>(cos);

    for (int i = 0; i < seqlen; i++) {
        for (int j = 0; j < config_.head_dim; j++) {
            sin_[i][j] = sin_data[i * config_.head_dim + j];
            cos_[i][j] = cos_data[i * config_.head_dim + j];
        }
    }

    // Timer end
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    printf("time of get_sincos: %f s\n", duration.count());
}

void ggml_vec_scale_f32(const int n, float *y, const float v) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j * GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

            GGML_F32_VEC_STORE(y + i + j * GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] *= v;
    }
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }
#endif
}

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/llamafile/conversion.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-12 10:07:58
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022 
 * @LastEditTime : 2024-07-25 10:34:55
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_CONVERSION_H
#define CPUINFER_CONVERSION_H

#include <memory.h>
#include "llama.cpp/ggml.h"

inline void to_float(const void* input, float* output, int size, ggml_type type) {
    if (type == ggml_type::GGML_TYPE_F32) {
        memcpy(output, input, size * sizeof(float));
    } else {
        ggml_internal_get_type_traits(type).to_float(input, output, size);
    }
}

inline void from_float(const float* input, void* output, int size, ggml_type type) {
    if (type == ggml_type::GGML_TYPE_F32) {
        memcpy(output, input, size * sizeof(float));
    } else {
        ggml_internal_get_type_traits(type).from_float(input, output, size);
    }
}

#endif

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/llamafile/linear.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-12 10:07:58
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-15 07:45:18
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "linear.h"

Linear::Linear(LinearConfig config) {
    config_ = config;
    proj_ = config_.proj;

    std::vector<std::pair<void**, uint64_t>> mem_requests;
    mem_requests.push_back({(void**)&input_fp32_, sizeof(float) * config_.group_max_len * config_.input_size});
    mem_requests.push_back({(void**)&proj_input_, config_.group_max_len * config_.input_size * ggml_type_size(ggml_internal_get_type_traits(config_.proj_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.proj_type).vec_dot_type)});
    mem_requests.push_back({(void**)&proj_output_, sizeof(float) * config_.group_max_len * config_.output_size});
    shared_mem_buffer.alloc(this, mem_requests);
}

Linear::~Linear() {
    shared_mem_buffer.dealloc(this);
}

void Linear::warm_up(Backend *backend) {
    std::vector<float> input_fp32(config_.input_size);
    std::vector<uint8_t> input(config_.input_size *
                               ggml_type_size(config_.hidden_type) /
                               ggml_blck_size(config_.hidden_type));
    std::vector<uint8_t> output(config_.output_size *
                                ggml_type_size(config_.hidden_type) /
                                ggml_blck_size(config_.hidden_type));
    for (int i = 0; i < config_.input_size; i++) {
        input_fp32[i] = 0;
    }
    from_float(input_fp32.data(), input.data(), config_.input_size, config_.hidden_type);
    forward_many(1, input.data(), output.data(), backend);
}

void Linear::forward_many(int qlen, const void* input, void* output, Backend* backend) {
    const void* proj_input_ptr;
    if (config_.hidden_type == ggml_internal_get_type_traits(config_.proj_type).vec_dot_type) {
        proj_input_ptr = input;
    } else {
        to_float(input, input_fp32_, qlen * config_.input_size, config_.hidden_type);
        from_float(input_fp32_, proj_input_, qlen * config_.input_size, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type);
        proj_input_ptr = proj_input_;
    }
    int nth = config_.output_size / config_.stride;
    backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
        int ith = task_id;
        void* proj_ptr = (uint8_t*)proj_ + ith * config_.stride * config_.input_size * ggml_type_size(config_.proj_type) / ggml_blck_size(config_.proj_type);
        float* proj_output_ptr = proj_output_ + ith * config_.stride;
        llamafile_sgemm(config_.stride, qlen, config_.input_size / ggml_blck_size(config_.proj_type), proj_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_input_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_output_ptr, config_.output_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.proj_type, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
            for (int i = 0; i < qlen; i++) {
                float* output_fp32_ptr = proj_output_ + i * config_.output_size + ith * config_.stride;
                void* output_ptr = (uint8_t*)output + i * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
                from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
            }
        }
    }, nullptr);
    if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
        from_float(proj_output_, output, qlen * config_.output_size, config_.hidden_type);
    }
}

void Linear::forward(int qlen, const void* input, void* output, Backend* backend) {
    if (qlen <= 0) {
        return;
    }
    int forward_len = std::min(qlen, config_.group_max_len);
    forward_many(forward_len, input, output, backend);
    forward(qlen - forward_len, (uint8_t*)input + forward_len * config_.input_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
}

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/llamafile/linear.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-12 10:07:58
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:35:00
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_LINEAR_H
#define CPUINFER_OPERATOR_LINEAR_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

struct LinearConfig {
    int input_size;
    int output_size;
    int stride;
    int group_max_len;
    void* proj;
    ggml_type proj_type;
    ggml_type hidden_type;

    LinearConfig() {}

    LinearConfig(int input_size, int output_size, int stride, int group_max_len, void* proj, ggml_type proj_type, ggml_type hidden_type)
        : input_size(input_size), output_size(output_size), stride(stride), group_max_len(group_max_len), proj(proj), proj_type(proj_type), hidden_type(hidden_type) {}
};

class Linear {
   public:
    Linear(LinearConfig);
    ~Linear();
    void warm_up(Backend* backend);
    void forward_many(int qlen, const void* input, void* output, Backend* backend);
    void forward(int qlen, const void* input, void* output, Backend* backend);

   private:
    LinearConfig config_;
    void* proj_;  // [output_size * input_size ( /32 if quantized)]

    float* input_fp32_;    // [group_max_len * input_size]
    uint8_t* proj_input_;  // [group_max_len * input_size * ggml_type_size(ggml_internal_get_type_traits(proj_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(proj_type).vec_dot_type)]
    float* proj_output_;   // [group_max_len * output_size]
};

#endif

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/llamafile/mlp.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-16 10:43:18
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-15 07:44:38
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "mlp.h"

MLP::MLP(MLPConfig config) {
    config_ = config;
    gate_proj_ = config_.gate_proj;
    up_proj_ = config_.up_proj;
    down_proj_ = config_.down_proj;

    std::vector<std::pair<void**, uint64_t>> mem_requests;
    mem_requests.push_back({(void**)&input_fp32_, sizeof(float) * config_.group_max_len * config_.hidden_size});
    mem_requests.push_back({(void**)&gate_input_, config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
    mem_requests.push_back({(void**)&up_input_, config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
    mem_requests.push_back({(void**)&gate_output_, sizeof(float) * config_.group_max_len * config_.intermediate_size});
    mem_requests.push_back({(void**)&up_output_, sizeof(float) * config_.group_max_len * config_.intermediate_size});
    mem_requests.push_back({(void**)&intermediate_fp32_, sizeof(float) * config_.group_max_len * config_.intermediate_size});
    mem_requests.push_back({(void**)&down_input_, config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
    mem_requests.push_back({(void**)&down_output_, sizeof(float) * config_.group_max_len * config_.hidden_size});
    shared_mem_buffer.alloc(this, mem_requests);
}

MLP::~MLP() {
    shared_mem_buffer.dealloc(this);
}

void MLP::warm_up(Backend *backend) {
    std::vector<float> input_fp32(config_.hidden_size);
    std::vector<uint8_t> input(config_.hidden_size *
                               ggml_type_size(config_.hidden_type) /
                               ggml_blck_size(config_.hidden_type));
    std::vector<uint8_t> output(config_.hidden_size *
                                ggml_type_size(config_.hidden_type) /
                                ggml_blck_size(config_.hidden_type));
    for (int i = 0; i < config_.hidden_size; i++) {
        input_fp32[i] = 0;
    }
    from_float(input_fp32.data(), input.data(), config_.hidden_size, config_.hidden_type);
    forward_many(1, input.data(), output.data(), backend);
}

static float act_fn(float x) { return x / (1.0f + expf(-x)); }

void MLP::forward_many(int qlen, const void* input, void* output, Backend* backend) {
    const void* gate_input_ptr;
    const void* up_input_ptr;
    if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
        gate_input_ptr = up_input_ptr = input;
    } else {
        to_float(input, input_fp32_, qlen * config_.hidden_size, config_.hidden_type);
        if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
            from_float(input_fp32_, gate_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
            gate_input_ptr = up_input_ptr = gate_input_;
        } else {
            if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
                from_float(input_fp32_, gate_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                gate_input_ptr = gate_input_;
            } else {
                gate_input_ptr = input;
            }
            if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                from_float(input_fp32_, up_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
                up_input_ptr = up_input_;
            } else {
                up_input_ptr = input;
            }
        }
    }
    int nth = config_.intermediate_size / config_.stride;
    backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
        int ith = task_id;
        void* gate_proj_ptr = (uint8_t*)gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        float* gate_output_ptr = gate_output_ + ith * config_.stride;
        llamafile_sgemm(config_.stride, qlen, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        void* up_proj_ptr = (uint8_t*)up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        float* up_output_ptr = up_output_ + ith * config_.stride;
        llamafile_sgemm(config_.stride, qlen, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        for (int i = 0; i < qlen; i++) {
            for (int j = ith * config_.stride; j < (ith + 1) * config_.stride; j++) {
                intermediate_fp32_[i * config_.intermediate_size + j] = act_fn(gate_output_[i * config_.intermediate_size + j]) * up_output_[i * config_.intermediate_size + j];
            }
            if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
                float* intermediate_fp32_ptr = intermediate_fp32_ + i * config_.intermediate_size + ith * config_.stride;
                void* down_input_ptr = (uint8_t*)down_input_ + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
                from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
            }
        }
    }, nullptr);
    if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
        from_float(intermediate_fp32_, down_input_, qlen * config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
    }
    nth = config_.hidden_size / config_.stride;
    backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
        int ith = task_id;
        void* down_proj_ptr = (uint8_t*)down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
        float* down_output_ptr = down_output_ + ith * config_.stride;
        llamafile_sgemm(config_.stride, qlen, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
            for (int i = 0; i < qlen; i++) {
                float* output_fp32_ptr = down_output_ + i * config_.hidden_size + ith * config_.stride;
                void* output_ptr = (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
                from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
            }
        }
    }, nullptr);
    if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
        from_float(down_output_, output, qlen * config_.hidden_size, config_.hidden_type);
    }
}

void MLP::forward(int qlen, const void* input, void* output, Backend* backend) {
    if (qlen <= 0) {
        return;
    }
    int forward_len = std::min(qlen, config_.group_max_len);
    forward_many(forward_len, input, output, backend);
    forward(qlen - forward_len, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
}

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/llamafile/mlp.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-12 10:07:58
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:35:06
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_MLP_H
#define CPUINFER_OPERATOR_MLP_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

struct MLPConfig {
    int hidden_size;
    int intermediate_size;
    int stride;
    int group_max_len;
    void* gate_proj;
    void* up_proj;
    void* down_proj;
    ggml_type gate_type;
    ggml_type up_type;
    ggml_type down_type;
    ggml_type hidden_type;

    MLPConfig() {}

    MLPConfig(int hidden_size, int intermediate_size, int stride, int group_max_len, void* gate_proj, void* up_proj, void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type)
        : hidden_size(hidden_size), intermediate_size(intermediate_size), stride(stride), group_max_len(group_max_len), gate_proj(gate_proj), up_proj(up_proj), down_proj(down_proj), gate_type(gate_type), up_type(up_type), down_type(down_type), hidden_type(hidden_type) {}
};

class MLP {
   public:
    MLP(MLPConfig);
    ~MLP();
    void warm_up(Backend* backend);
    void forward_many(int qlen, const void* input, void* output, Backend* backend);
    void forward(int qlen, const void* input, void* output, Backend* backend);

   private:
    MLPConfig config_;
    void* gate_proj_;  // [intermediate_size * hidden_size ( /32 if quantized)]
    void* up_proj_;    // [intermediate_size * hidden_size ( /32 if quantized)]
    void* down_proj_;  // [hidden_size * intermediate_size ( /32 if quantized)]

    float* input_fp32_;         // [group_max_len * hidden_size]
    uint8_t* gate_input_;       // [group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
    uint8_t* up_input_;         // [group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
    float* gate_output_;        // [group_max_len * intermediate_size]
    float* up_output_;          // [group_max_len * intermediate_size]
    float* intermediate_fp32_;  // [group_max_len * intermediate_size]
    uint8_t* down_input_;       // [group_max_len * intermediate_size * ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
    float* down_output_;        // [group_max_len * hidden_size]
};

#endif

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/llamafile/moe.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:22
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-15 07:43:41
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "moe.h"
#include <iostream>
#include <cstdint>

#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>
#endif

MOE::MOE(MOEConfig config) {
    config_ = config;
    gate_proj_ = config_.gate_proj;
    up_proj_ = config_.up_proj;
    down_proj_ = config_.down_proj;
    
    #ifdef USE_NUMA
    int numa_nodes = numa_num_configured_nodes();
    gate_proj_numa_.resize(numa_nodes);
    up_proj_numa_.resize(numa_nodes);
    down_proj_numa_.resize(numa_nodes);
    size_t exp_inter_hidden_mul_ = (size_t)config.expert_num * config.intermediate_size * config.hidden_size;
    for (int i = 0; i < numa_nodes; i++) {
        gate_proj_numa_[i] = numa_alloc_onnode(exp_inter_hidden_mul_* ggml_type_size(config.gate_type) / ggml_blck_size(config.gate_type), i);
        up_proj_numa_[i] = numa_alloc_onnode(exp_inter_hidden_mul_* ggml_type_size(config.up_type) / ggml_blck_size(config.up_type), i);
        down_proj_numa_[i] = numa_alloc_onnode(exp_inter_hidden_mul_* ggml_type_size(config.down_type) / ggml_blck_size(config.down_type), i);
        if (!gate_proj_numa_[i]) {
            std::cout << "Memory allocation failed for gate_proj_numa_ on node " << i << std::endl;
        }
        if (!up_proj_numa_[i]) {
            std::cout << "Memory allocation failed for up_proj_numa_ on node " << i << std::endl;
        }
        if (!down_proj_numa_[i]) {
            std::cout << "Memory allocation failed for down_proj_numa_ on node " << i << std::endl;
        }
        memcpy(gate_proj_numa_[i], gate_proj_, exp_inter_hidden_mul_* ggml_type_size(config.gate_type) / ggml_blck_size(config.gate_type));
        memcpy(up_proj_numa_[i], up_proj_, exp_inter_hidden_mul_* ggml_type_size(config.up_type) / ggml_blck_size(config.up_type));
        memcpy(down_proj_numa_[i], down_proj_, exp_inter_hidden_mul_* ggml_type_size(config.down_type) / ggml_blck_size(config.down_type));
    }
    #endif

    std::vector<std::pair<void**, uint64_t>> s_mem_requests;
    s_mem_requests.push_back({(void**)&s_input_fp32_, sizeof(float) * config_.hidden_size});
    s_mem_requests.push_back({(void**)&s_gate_input_, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
    s_mem_requests.push_back({(void**)&s_up_input_, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
    s_gate_output_.resize(config_.routed_expert_num);
    s_up_output_.resize(config_.routed_expert_num);
    s_intermediate_fp32_.resize(config_.routed_expert_num);
    s_down_input_.resize(config_.routed_expert_num);
    s_down_output_.resize(config_.routed_expert_num);
    for (int i = 0; i < config_.routed_expert_num; i++) {
        s_mem_requests.push_back({(void**)&s_gate_output_[i], sizeof(float) * config_.intermediate_size});
        s_mem_requests.push_back({(void**)&s_up_output_[i], sizeof(float) * config_.intermediate_size});
        s_mem_requests.push_back({(void**)&s_intermediate_fp32_[i], sizeof(float) * config_.intermediate_size});
        s_mem_requests.push_back({(void**)&s_down_input_[i], config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
        s_mem_requests.push_back({(void**)&s_down_output_[i], sizeof(float) * config_.hidden_size});
    }
    s_mem_requests.push_back({(void**)&s_output_fp32_, sizeof(float) * config_.hidden_size});
    shared_mem_buffer.alloc(this, s_mem_requests);

    std::vector<std::pair<void**, uint64_t>> m_mem_requests;
    m_input_fp32_.resize(config_.group_max_len);
    m_gate_input_.resize(config_.group_max_len);
    m_up_input_.resize(config_.group_max_len);
    for (int i = 0; i < config_.group_max_len; i++) {
        m_mem_requests.push_back({(void**)&m_input_fp32_[i], sizeof(float) * config_.hidden_size});
        m_mem_requests.push_back({(void**)&m_gate_input_[i], config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
        m_mem_requests.push_back({(void**)&m_up_input_[i], config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
    }
    m_mem_requests.push_back({(void**)&m_local_gate_input_, config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
    m_mem_requests.push_back({(void**)&m_local_up_input_, config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
    m_mem_requests.push_back({(void**)&m_local_gate_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_up_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_intermediate_fp32_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_down_input_, config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
    m_mem_requests.push_back({(void**)&m_local_down_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size});
    m_output_fp32_.resize(config_.group_max_len);
    for (int i = 0; i < config_.group_max_len; i++) {
        m_mem_requests.push_back({(void**)&m_output_fp32_[i], sizeof(float) * config_.hidden_size});
    }
    shared_mem_buffer.alloc(this, m_mem_requests);

    m_local_pos_.resize(config_.group_max_len);
    for (int i = 0; i < config_.group_max_len; i++) {
        m_local_pos_[i].resize(config_.routed_expert_num);
    }
    m_local_num_.resize(config_.expert_num);
    m_local_gate_input_ptr_.resize(config_.expert_num);
    m_local_up_input_ptr_.resize(config_.expert_num);
    m_local_gate_output_ptr_.resize(config_.expert_num);
    m_local_up_output_ptr_.resize(config_.expert_num);
    m_local_intermediate_fp32_ptr_.resize(config_.expert_num);
    m_local_down_input_ptr_.resize(config_.expert_num);
    m_local_down_output_ptr_.resize(config_.expert_num);
}

MOE::~MOE() {
    shared_mem_buffer.dealloc(this);

    #ifdef USE_NUMA
    int numa_nodes = numa_num_configured_nodes();
    for (int i = 0; i < numa_nodes; i++) {
        numa_free(gate_proj_numa_[i], config_.expert_num * config_.intermediate_size * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type));
        numa_free(up_proj_numa_[i], config_.expert_num * config_.intermediate_size * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type));
        numa_free(down_proj_numa_[i], config_.expert_num * config_.hidden_size * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type));
    }
    #endif
}

void MOE::warm_up(Backend* backend) {
    std::vector<float> input_fp32(config_.hidden_size);
    std::vector<uint8_t> input(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
    std::vector<uint8_t> output(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
    for (int i = 0; i < config_.hidden_size; i++) {
        input_fp32[i] = 0;
    }
    from_float(input_fp32.data(), input.data(), config_.hidden_size, config_.hidden_type);
    for (int i = 0; i < config_.expert_num; i++) {
        uint64_t expert_ids = i;
        float weights = 0;
        forward_one(1, &expert_ids, &weights, input.data(), output.data(), backend);
    }
}

static float act_fn(float x) {
    return x / (1.0f + expf(-x));
}

void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
    const void* gate_input_ptr;
    const void* up_input_ptr;
    if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
        gate_input_ptr = up_input_ptr = input;
    } else {
        to_float(input, s_input_fp32_, config_.hidden_size, config_.hidden_type);
        if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
            from_float(s_input_fp32_, s_gate_input_, config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
            gate_input_ptr = up_input_ptr = s_gate_input_;
        } else {
            if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
                from_float(s_input_fp32_, s_gate_input_, config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                gate_input_ptr = s_gate_input_;
            } else {
                gate_input_ptr = input;
            }
            if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                from_float(s_input_fp32_, s_up_input_, config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
                up_input_ptr = s_up_input_;
            } else {
                up_input_ptr = input;
            }
        }
    }
    int nth = config_.intermediate_size / config_.stride;
    backend->do_work_stealing_job(nth * k, nullptr, [&](int task_id) {
        int expert_idx = task_id / nth;
        uint64_t expert_id = expert_ids[expert_idx];
        int ith = task_id % nth;
        
        #ifdef USE_NUMA
        void* gate_proj_ptr = (uint8_t*)gate_proj_numa_[Backend::numa_node] + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        #else
        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        #endif

        float* gate_output_ptr = s_gate_output_[expert_idx] + ith * config_.stride;
        llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);

        #ifdef USE_NUMA
        void* up_proj_ptr = (uint8_t*)up_proj_numa_[Backend::numa_node] + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        #else
        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        #endif

        float* up_output_ptr = s_up_output_[expert_idx] + ith * config_.stride;
        llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
            s_intermediate_fp32_[expert_idx][i] = act_fn(s_gate_output_[expert_idx][i]) * s_up_output_[expert_idx][i];
        }
        if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
            float* intermediate_fp32_ptr = s_intermediate_fp32_[expert_idx] + ith * config_.stride;
            void* down_input_ptr = s_down_input_[expert_idx] + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
            from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
        }
    }, nullptr);
    if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
        for (int i = 0; i < k; i++) {
            from_float(s_intermediate_fp32_[i], s_down_input_[i], config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
        }
    }
    nth = config_.hidden_size / config_.stride;
    backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
        int ith = task_id;
        for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
            s_output_fp32_[i] = 0;
        }
        for (int expert_idx = 0; expert_idx < k; expert_idx++) {
            uint64_t expert_id = expert_ids[expert_idx];

            #ifdef USE_NUMA
            void* down_proj_ptr = (uint8_t*)down_proj_numa_[Backend::numa_node] + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
            #else
            void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
            #endif
            
            float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.stride;
            llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
            for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
                s_output_fp32_[i] += s_down_output_[expert_idx][i] * weights[expert_idx];
            }
        }
        if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
            float* output_fp32_ptr = s_output_fp32_ + ith * config_.stride;
            void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
            from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
        }
    }, nullptr);
    if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
        from_float(s_output_fp32_, output, config_.hidden_size, config_.hidden_type);
    }
}

void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
    for (int i = 0; i < config_.expert_num; i++) {
        m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
        for (int j = 0; j < k; j++) {
            m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
        }
    }
    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
        m_local_gate_input_ptr_[i] = m_local_gate_input_ + offset * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
        m_local_up_input_ptr_[i] = m_local_up_input_ + offset * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
        m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
        m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
        m_local_intermediate_fp32_ptr_[i] = m_local_intermediate_fp32_ + offset * config_.intermediate_size;
        m_local_down_input_ptr_[i] = m_local_down_input_ + offset * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
        m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
        offset += m_local_num_[i];
    }
    backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
        const void* gate_input_ptr;
        const void* up_input_ptr;
        if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
            gate_input_ptr = up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
        } else {
            to_float((uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
            if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                gate_input_ptr = up_input_ptr = m_gate_input_[i];
            } else {
                if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
                    from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                    gate_input_ptr = m_gate_input_[i];
                } else {
                    gate_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
                }
                if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                    from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
                    up_input_ptr = m_up_input_[i];
                } else {
                    up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
                }
            }
        }
        for (int j = 0; j < k; j++) {
            memcpy(m_local_gate_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type), gate_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type));
            memcpy(m_local_up_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type), up_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type));
        }
    }, nullptr);
    int stride = QK_K;
    int nth = config_.intermediate_size / stride;
    backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
        uint64_t expert_idx = task_id / nth;
        int ith = task_id % nth;
        void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];

        #ifdef USE_NUMA
        void* gate_proj_ptr = (uint8_t*)gate_proj_numa_[Backend::numa_node] + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        #else
        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        #endif

        float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * stride;
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        void* up_input_ptr = m_local_up_input_ptr_[expert_idx];

        #ifdef USE_NUMA
        void* up_proj_ptr = (uint8_t*)up_proj_numa_[Backend::numa_node] + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        #else
        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        #endif

        float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * stride;
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        for (int i = 0; i < m_local_num_[expert_idx]; i++) {
            for (int j = ith * stride; j < (ith + 1) * stride; j++) {
                m_local_intermediate_fp32_ptr_[expert_idx][i * config_.intermediate_size + j] = act_fn(m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size + j]) * m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size + j];
            }
            float* intermediate_fp32_ptr = m_local_intermediate_fp32_ptr_[expert_idx] + i * config_.intermediate_size + ith * stride;
            void* down_input_ptr = m_local_down_input_ptr_[expert_idx] + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
            from_float(intermediate_fp32_ptr, down_input_ptr, stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
        }
    }, nullptr);
    stride = QK_K;
    nth = config_.hidden_size / stride;
    backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
        uint64_t expert_idx = task_id / nth;
        int ith = task_id % nth;
        void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
        
        #ifdef USE_NUMA
        void* down_proj_ptr = (uint8_t*)down_proj_numa_[Backend::numa_node] + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
        #else
        void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
        #endif

        float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
    }, nullptr);
    backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
        for (int e = 0; e < config_.hidden_size; e++) {
            m_output_fp32_[i][e] = 0;
        }
        for (int j = 0; j < k; j++) {
            for (int e = 0; e < config_.hidden_size; e++) {
                m_output_fp32_[i][e] += m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] * weights[i * k + j];
            }
        }
        from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
    }, nullptr);
}

void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, int* batch_size_tensor, Backend* backend) {
    qlen = batch_size_tensor[0];
    if (qlen < config_.group_min_len) {
        for (int i = 0; i < qlen; i++) {
            forward_one(k, expert_ids + i * k, weights + i * k, (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
        }
        return;
    }
    int forward_len = std::min(config_.group_max_len, qlen);
    forward_many(forward_len, k, expert_ids, weights, input, output, backend);

    batch_size_tensor[0] -= forward_len;
    forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), batch_size_tensor, backend);
}

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/llamafile/moe.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:22
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:35:10
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_MOE_H
#define CPUINFER_OPERATOR_MOE_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"

struct MOEConfig {
    int expert_num;
    int routed_expert_num;
    int hidden_size;
    int intermediate_size;
    int stride;
    int group_min_len;
    int group_max_len;
    void* gate_proj;
    void* up_proj;
    void* down_proj;
    ggml_type gate_type;
    ggml_type up_type;
    ggml_type down_type;
    ggml_type hidden_type;

    MOEConfig() {}

    MOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int stride, int group_min_len, int group_max_len, void* gate_proj, void* up_proj, void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type)
        : expert_num(expert_num), routed_expert_num(routed_expert_num), hidden_size(hidden_size), intermediate_size(intermediate_size), stride(stride), group_min_len(group_min_len), group_max_len(group_max_len), gate_proj(gate_proj), up_proj(up_proj), down_proj(down_proj), gate_type(gate_type), up_type(up_type), down_type(down_type), hidden_type(hidden_type) {}
};

class MOE {
   public:
    MOE(MOEConfig);
    ~MOE();
    void warm_up(Backend* backend);
    void forward_one(int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
    void forward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
    void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, int* batch_size_tensor, Backend* backend);

   private:
    MOEConfig config_;
    void* gate_proj_;  // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    void* up_proj_;    // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    void* down_proj_;  // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]

    #ifdef USE_NUMA
    std::vector<void*> gate_proj_numa_;  // [numa_num, expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    std::vector<void*> up_proj_numa_;    // [numa_num, expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    std::vector<void*> down_proj_numa_;  // [numa_num, expert_num * hidden_size * intermediate_size ( /32 if quantized)]
    #endif

    float* s_input_fp32_;                      // [hidden_size]
    uint8_t* s_gate_input_;                    // [hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
    uint8_t* s_up_input_;                      // [hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
    std::vector<float*> s_gate_output_;        // [routed_expert_num, intermediate_size]
    std::vector<float*> s_up_output_;          // [routed_expert_num, intermediate_size]
    std::vector<float*> s_intermediate_fp32_;  // [routed_expert_num, intermediate_size]
    std::vector<uint8_t*> s_down_input_;       // [routed_expert_num, intermediate_size * ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
    std::vector<float*> s_down_output_;        // [routed_expert_num, hidden_size]
    float* s_output_fp32_;                     // [hidden_size]

    std::vector<float*> m_input_fp32_;    // [group_max_len, hidden_size]
    std::vector<uint8_t*> m_gate_input_;  // [group_max_len, hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
    std::vector<uint8_t*> m_up_input_;    // [group_max_len, hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
    uint8_t* m_local_gate_input_;         // [routed_expert_num * group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
    uint8_t* m_local_up_input_;           // [routed_expert_num * group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
    float* m_local_gate_output_;          // [routed_expert_num * group_max_len * intermediate_size]
    float* m_local_up_output_;            // [routed_expert_num * group_max_len * intermediate_size]
    float* m_local_intermediate_fp32_;    // [routed_expert_num * group_max_len * intermediate_size]
    uint8_t* m_local_down_input_;         // [routed_expert_num * group_max_len * intermediate_size * ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
    float* m_local_down_output_;          // [routed_expert_num * group_max_len * hidden_size]
    std::vector<float*> m_output_fp32_;   // [group_max_len, hidden_size]

    std::vector<std::vector<int>> m_local_pos_;          // [group_max_len, routed_expert_num]
    std::vector<int> m_local_num_;                       // [expert_num]
    std::vector<uint8_t*> m_local_gate_input_ptr_;       // [expert_num]
    std::vector<uint8_t*> m_local_up_input_ptr_;         // [expert_num]
    std::vector<float*> m_local_gate_output_ptr_;        // [expert_num]
    std::vector<float*> m_local_up_output_ptr_;          // [expert_num]
    std::vector<float*> m_local_intermediate_fp32_ptr_;  // [expert_num]
    std::vector<uint8_t*> m_local_down_input_ptr_;       // [expert_num]
    std::vector<float*> m_local_down_output_ptr_;        // [expert_num]
};

#endif

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/llamafile/sft_moe.cpp
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:22
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-15 07:43:41
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "sft_moe.h"
#include <iostream>
#include <cstdint>
#include <cstring>
#include <time.h>

#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>
#endif

SFT_MOE::SFT_MOE(SFT_MOEConfig config) {
    config_ = config;
    gate_proj_ = config_.gate_proj;
    up_proj_ = config_.up_proj;
    down_proj_ = config_.down_proj;
    
    #ifdef USE_NUMA
    int numa_nodes = numa_num_configured_nodes();
    gate_proj_numa_.resize(numa_nodes);
    up_proj_numa_.resize(numa_nodes);
    down_proj_numa_.resize(numa_nodes);
    size_t exp_inter_hidden_mul_ = (size_t)config.expert_num * config.intermediate_size * config.hidden_size;
    for (int i = 0; i < numa_nodes; i++) {
        gate_proj_numa_[i] = numa_alloc_onnode(exp_inter_hidden_mul_* ggml_type_size(config.gate_type) / ggml_blck_size(config.gate_type), i);
        up_proj_numa_[i] = numa_alloc_onnode(exp_inter_hidden_mul_* ggml_type_size(config.up_type) / ggml_blck_size(config.up_type), i);
        down_proj_numa_[i] = numa_alloc_onnode(exp_inter_hidden_mul_* ggml_type_size(config.down_type) / ggml_blck_size(config.down_type), i);
        if (!gate_proj_numa_[i]) {
            std::cout << "Memory allocation failed for gate_proj_numa_ on node " << i << std::endl;
        }
        if (!up_proj_numa_[i]) {
            std::cout << "Memory allocation failed for up_proj_numa_ on node " << i << std::endl;
        }
        if (!down_proj_numa_[i]) {
            std::cout << "Memory allocation failed for down_proj_numa_ on node " << i << std::endl;
        }
        memcpy(gate_proj_numa_[i], gate_proj_, exp_inter_hidden_mul_* ggml_type_size(config.gate_type) / ggml_blck_size(config.gate_type));
        memcpy(up_proj_numa_[i], up_proj_, exp_inter_hidden_mul_* ggml_type_size(config.up_type) / ggml_blck_size(config.up_type));
        memcpy(down_proj_numa_[i], down_proj_, exp_inter_hidden_mul_* ggml_type_size(config.down_type) / ggml_blck_size(config.down_type));
    }
    #endif

    std::vector<std::pair<void**, uint64_t>> s_mem_requests;
    s_mem_requests.push_back({(void**)&gate_proj_t_, config_.expert_num * config_.hidden_size * config_.intermediate_size * ggml_type_size(config_.grad_type)});
    s_mem_requests.push_back({(void**)&up_proj_t_, config_.expert_num * config_.hidden_size * config_.intermediate_size * ggml_type_size(config_.grad_type)});
    s_mem_requests.push_back({(void**)&down_proj_t_, config_.expert_num * config_.hidden_size * config_.intermediate_size * ggml_type_size(config_.grad_type)});
    s_mem_requests.push_back({(void**)&transpose_buffer_fp32_, config_.expert_num * config_.intermediate_size * config_.hidden_size * sizeof(float)});
    s_mem_requests.push_back({(void**)&transpose_buffer_, config_.expert_num * config_.intermediate_size * config_.hidden_size * ggml_type_size(config_.grad_type)});

    s_mem_requests.push_back({(void**)&s_input_fp32_, sizeof(float) * config_.hidden_size});
    s_mem_requests.push_back({(void**)&s_gate_input_, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
    s_mem_requests.push_back({(void**)&s_up_input_, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
    s_gate_output_.resize(config_.routed_expert_num);
    s_up_output_.resize(config_.routed_expert_num);
    s_intermediate_fp32_.resize(config_.routed_expert_num);
    s_down_input_.resize(config_.routed_expert_num);
    s_down_output_.resize(config_.routed_expert_num);
    for (int i = 0; i < config_.routed_expert_num; i++) {
        s_mem_requests.push_back({(void**)&s_gate_output_[i], sizeof(float) * config_.intermediate_size});
        s_mem_requests.push_back({(void**)&s_up_output_[i], sizeof(float) * config_.intermediate_size});
        s_mem_requests.push_back({(void**)&s_intermediate_fp32_[i], sizeof(float) * config_.intermediate_size});
        s_mem_requests.push_back({(void**)&s_down_input_[i], config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
        s_mem_requests.push_back({(void**)&s_down_output_[i], sizeof(float) * config_.hidden_size});
    }
    s_mem_requests.push_back({(void**)&s_output_fp32_, sizeof(float) * config_.hidden_size});
        
    s_down_input_grad_.resize(config_.routed_expert_num);
    s_gate_output_grad_fp32_.resize(config_.routed_expert_num);
    s_up_output_grad_fp32_.resize(config_.routed_expert_num);
    s_gate_output_grad_.resize(config_.routed_expert_num);
    s_up_output_grad_.resize(config_.routed_expert_num);
    s_gate_input_grad_.resize(config_.routed_expert_num);
    s_up_input_grad_.resize(config_.routed_expert_num);
    for (int i = 0; i < config_.routed_expert_num; i++) {
        s_mem_requests.push_back({(void**)&s_down_input_grad_[i], config_.intermediate_size * sizeof(float)});
        s_mem_requests.push_back({(void**)&s_gate_output_grad_fp32_[i], config_.intermediate_size * sizeof(float)});
        s_mem_requests.push_back({(void**)&s_up_output_grad_fp32_[i], config_.intermediate_size * sizeof(float)});
        s_mem_requests.push_back({(void**)&s_gate_output_grad_[i], config_.intermediate_size * ggml_type_size(config_.grad_type)});
        s_mem_requests.push_back({(void**)&s_up_output_grad_[i], config_.intermediate_size * ggml_type_size(config_.grad_type)});
        s_mem_requests.push_back({(void**)&s_gate_input_grad_[i], config_.hidden_size * sizeof(float)});
        s_mem_requests.push_back({(void**)&s_up_input_grad_[i], config_.hidden_size * sizeof(float)});
    }
    s_mem_requests.push_back({(void**)&s_input_grad_fp32_, config_.hidden_size * sizeof(float)});

    shared_mem_buffer.alloc(this, s_mem_requests);

    std::vector<std::pair<void**, uint64_t>> m_mem_requests;
    m_input_fp32_.resize(config_.group_max_len);
    m_gate_input_.resize(config_.group_max_len);
    m_up_input_.resize(config_.group_max_len);
    for (int i = 0; i < config_.group_max_len; i++) {
        m_mem_requests.push_back({(void**)&m_input_fp32_[i], sizeof(float) * config_.hidden_size});
        m_mem_requests.push_back({(void**)&m_gate_input_[i], config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
        m_mem_requests.push_back({(void**)&m_up_input_[i], config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
    }
    m_mem_requests.push_back({(void**)&m_local_gate_input_, config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
    m_mem_requests.push_back({(void**)&m_local_up_input_, config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
    m_mem_requests.push_back({(void**)&m_local_gate_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_up_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_intermediate_fp32_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_down_input_, config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
    m_mem_requests.push_back({(void**)&m_local_down_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size});
    m_output_fp32_.resize(config_.group_max_len);
    for (int i = 0; i < config_.group_max_len; i++) {
        m_mem_requests.push_back({(void**)&m_output_fp32_[i], sizeof(float) * config_.hidden_size});
    }
    
    m_mem_requests.push_back({(void**)&m_local_down_output_grad_, config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(config_.grad_type)});
    m_mem_requests.push_back({(void**)&m_local_down_input_grad_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_gate_output_grad_fp32_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_up_output_grad_fp32_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_gate_output_grad_, config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(config_.grad_type)});
    m_mem_requests.push_back({(void**)&m_local_up_output_grad_, config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(config_.grad_type)});
    m_mem_requests.push_back({(void**)&m_local_gate_input_grad_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size});
    m_mem_requests.push_back({(void**)&m_local_up_input_grad_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size});
    m_mem_requests.push_back({(void**)&m_local_token_indices_, sizeof(int) * config_.routed_expert_num * config_.group_max_len});
    m_mem_requests.push_back({(void**)&m_local_expert_positions_, sizeof(int) * config_.routed_expert_num * config_.group_max_len});
    m_grad_input_fp32_.resize(config_.group_max_len);
    for (int i = 0; i < config_.group_max_len; i++) {
        m_mem_requests.push_back({(void**)&m_grad_input_fp32_[i], sizeof(float) * config_.hidden_size});
    }
    
    shared_mem_buffer.alloc(this, m_mem_requests);

    m_local_pos_.resize(config_.group_max_len);
    for (int i = 0; i < config_.group_max_len; i++) {
        m_local_pos_[i].resize(config_.routed_expert_num);
    }
    m_local_num_.resize(config_.expert_num);
    m_local_gate_input_ptr_.resize(config_.expert_num);
    m_local_up_input_ptr_.resize(config_.expert_num);
    m_local_gate_output_ptr_.resize(config_.expert_num);
    m_local_up_output_ptr_.resize(config_.expert_num);
    m_local_intermediate_fp32_ptr_.resize(config_.expert_num);
    m_local_down_input_ptr_.resize(config_.expert_num);
    m_local_down_output_ptr_.resize(config_.expert_num);
    
    // backward_many 专用指针数组初始化
    m_local_down_output_grad_ptr_.resize(config_.expert_num);
    m_local_down_input_grad_ptr_.resize(config_.expert_num);
    m_local_gate_output_grad_fp32_ptr_.resize(config_.expert_num);
    m_local_up_output_grad_fp32_ptr_.resize(config_.expert_num);
    m_local_gate_output_grad_ptr_.resize(config_.expert_num);
    m_local_up_output_grad_ptr_.resize(config_.expert_num);
    m_local_gate_input_grad_ptr_.resize(config_.expert_num);
    m_local_up_input_grad_ptr_.resize(config_.expert_num);
    
    // fwd_cache访问映射指针数组初始化
    m_local_token_indices_ptr_.resize(config_.expert_num);
    m_local_expert_positions_ptr_.resize(config_.expert_num);
}

SFT_MOE::~SFT_MOE() {
    shared_mem_buffer.dealloc(this);

    #ifdef USE_NUMA
    int numa_nodes = numa_num_configured_nodes();
    for (int i = 0; i < numa_nodes; i++) {
        numa_free(gate_proj_numa_[i], config_.expert_num * config_.intermediate_size * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type));
        numa_free(up_proj_numa_[i], config_.expert_num * config_.intermediate_size * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type));
        numa_free(down_proj_numa_[i], config_.expert_num * config_.hidden_size * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type));
    }
    #endif
}

void SFT_MOE::warm_up(Backend* backend) {
    std::vector<float> input_fp32(config_.hidden_size);
    std::vector<uint8_t> input(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
    std::vector<uint8_t> output(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
    for (int i = 0; i < config_.hidden_size; i++) {
        input_fp32[i] = 0;
    }
    from_float(input_fp32.data(), input.data(), config_.hidden_size, config_.hidden_type);
	/* ---------- 仅用于占位的 ForwardCache ---------- */
    SFT_MoEForwardCache dummy_cache; // 内容无用，只为满足接口
	dummy_cache.init(/*k=*/1, config_.intermediate_size);
    for (int i = 0; i < config_.expert_num; i++) {
        uint64_t expert_ids = i;
        float weights = 0;
        forward_one(1, &expert_ids, &weights, input.data(), output.data(), backend, &dummy_cache);
    }
}

static float act_fn(float x) {
    return x / (1.0f + expf(-x));
}

void SFT_MOE::ensure_fwd_cache(int qlen, int k)
{
	// if ((int)fw_cache_.size() < qlen)
	// 	fw_cache_.resize(qlen);
	// /* 只在扩容的那部分做 init，防止重复开辟 */
	// for (int i = 0; i < qlen; ++i)
	// 	fw_cache_[i].init(k, config_.intermediate_size);

	int old_sz = fw_cache_.size();
    if (old_sz < qlen)
    {
        fw_cache_.resize(qlen);
        for (int i = old_sz; i < qlen; ++i)  // 仅初始化新增元素
            fw_cache_[i].init(k, config_.intermediate_size);
    }

	
    // if ((int)fw_cache_.size() < qlen)
    //     fw_cache_.resize(qlen);

    // for (int i = 0; i < qlen; ++i)                          // 每轮都 init
    //     fw_cache_[i].init(k, config_.intermediate_size);    // 但 无重 alloc

}

SFT_MoEForwardCache* SFT_MOE::fwd_cache_ptr()
{
	return fw_cache_.empty() ? nullptr : fw_cache_.data();
}

void SFT_MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend, SFT_MoEForwardCache* fwd_cache) {
    const void* gate_input_ptr;
    const void* up_input_ptr;
    if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
        gate_input_ptr = up_input_ptr = input;
    } else {
        to_float(input, s_input_fp32_, config_.hidden_size, config_.hidden_type);
        if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
            from_float(s_input_fp32_, s_gate_input_, config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
            gate_input_ptr = up_input_ptr = s_gate_input_;
        } else {
            if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
                from_float(s_input_fp32_, s_gate_input_, config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                gate_input_ptr = s_gate_input_;
            } else {
                gate_input_ptr = input;
            }
            if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                from_float(s_input_fp32_, s_up_input_, config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
                up_input_ptr = s_up_input_;
            } else {
                up_input_ptr = input;
            }
        }
    }
    int nth = config_.intermediate_size / config_.stride;
    backend->do_work_stealing_job(nth * k, nullptr, [&](int task_id) {
        int expert_idx = task_id / nth;
        uint64_t expert_id = expert_ids[expert_idx];
        int ith = task_id % nth;
        
        #ifdef USE_NUMA
        void* gate_proj_ptr = (uint8_t*)gate_proj_numa_[Backend::numa_node] + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        #else
        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        #endif

        float* gate_output_ptr = s_gate_output_[expert_idx] + ith * config_.stride;
        llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);

        #ifdef USE_NUMA
        void* up_proj_ptr = (uint8_t*)up_proj_numa_[Backend::numa_node] + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        #else
        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        #endif

        float* up_output_ptr = s_up_output_[expert_idx] + ith * config_.stride;
        llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
            s_intermediate_fp32_[expert_idx][i] = act_fn(s_gate_output_[expert_idx][i]) * s_up_output_[expert_idx][i];
        }
        if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
            float* intermediate_fp32_ptr = s_intermediate_fp32_[expert_idx] + ith * config_.stride;
            void* down_input_ptr = s_down_input_[expert_idx] + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
            from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
        }
    }, nullptr);
    if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
        for (int i = 0; i < k; i++) {
            from_float(s_intermediate_fp32_[i], s_down_input_[i], config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
        }
    }
    nth = config_.hidden_size / config_.stride;
    backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
        int ith = task_id;
        for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
            s_output_fp32_[i] = 0;
        }
        for (int expert_idx = 0; expert_idx < k; expert_idx++) {
            uint64_t expert_id = expert_ids[expert_idx];

            #ifdef USE_NUMA
            void* down_proj_ptr = (uint8_t*)down_proj_numa_[Backend::numa_node] + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
            #else
            void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
            #endif
            
            float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.stride;
            llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
            for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
                s_output_fp32_[i] += s_down_output_[expert_idx][i] * weights[expert_idx];
            }
        }
        if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
            float* output_fp32_ptr = s_output_fp32_ + ith * config_.stride;
            void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
            from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
        }
    }, nullptr);

	for (int e = 0; e < k; ++e) {
        // gate_output_: float[inter_size] per expert
        std::memcpy(fwd_cache->gate_u[e].data(),
                    s_gate_output_[e],
                    sizeof(float) * config_.intermediate_size);

        std::memcpy(fwd_cache->up_v[e].data(),
                    s_up_output_[e],
                    sizeof(float) * config_.intermediate_size);

        // 可选保存 z
        // std::memcpy(fwd_cache->z[e].data(),
        //             s_intermediate_fp32_[e],
        //             sizeof(float) * config_.intermediate_size);
    }
}

void SFT_MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend, SFT_MoEForwardCache* fwd_cache) {
    for (int i = 0; i < config_.expert_num; i++) {
        m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
        for (int j = 0; j < k; j++) {
            m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
        }
    }
    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
        m_local_gate_input_ptr_[i] = m_local_gate_input_ + offset * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
        m_local_up_input_ptr_[i] = m_local_up_input_ + offset * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
        m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
        m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
        m_local_intermediate_fp32_ptr_[i] = m_local_intermediate_fp32_ + offset * config_.intermediate_size;
        m_local_down_input_ptr_[i] = m_local_down_input_ + offset * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
        m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
        offset += m_local_num_[i];
    }
    backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
        const void* gate_input_ptr;
        const void* up_input_ptr;
        if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
            gate_input_ptr = up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
        } else {
            to_float((uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
            if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                gate_input_ptr = up_input_ptr = m_gate_input_[i];
            } else {
                if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
                    from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                    gate_input_ptr = m_gate_input_[i];
                } else {
                    gate_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
                }
                if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                    from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
                    up_input_ptr = m_up_input_[i];
                } else {
                    up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
                }
            }
        }
        for (int j = 0; j < k; j++) {
            memcpy(m_local_gate_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type), gate_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type));
            memcpy(m_local_up_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type), up_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type));
        }
    }, nullptr);
    int stride = QK_K;
    int nth = config_.intermediate_size / stride;
    backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
        uint64_t expert_idx = task_id / nth;
        int ith = task_id % nth;
        void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];

        #ifdef USE_NUMA
        void* gate_proj_ptr = (uint8_t*)gate_proj_numa_[Backend::numa_node] + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        #else
        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        #endif

        float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * stride;
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        void* up_input_ptr = m_local_up_input_ptr_[expert_idx];

        #ifdef USE_NUMA
        void* up_proj_ptr = (uint8_t*)up_proj_numa_[Backend::numa_node] + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        #else
        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        #endif

        float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * stride;
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        for (int i = 0; i < m_local_num_[expert_idx]; i++) {
            for (int j = ith * stride; j < (ith + 1) * stride; j++) {
                m_local_intermediate_fp32_ptr_[expert_idx][i * config_.intermediate_size + j] = act_fn(m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size + j]) * m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size + j];
            }
            float* intermediate_fp32_ptr = m_local_intermediate_fp32_ptr_[expert_idx] + i * config_.intermediate_size + ith * stride;
            void* down_input_ptr = m_local_down_input_ptr_[expert_idx] + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
            from_float(intermediate_fp32_ptr, down_input_ptr, stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
        }
    }, nullptr);
    stride = QK_K;
    nth = config_.hidden_size / stride;
    backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
        uint64_t expert_idx = task_id / nth;
        int ith = task_id % nth;
        void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
        
        #ifdef USE_NUMA
        void* down_proj_ptr = (uint8_t*)down_proj_numa_[Backend::numa_node] + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
        #else
        void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
        #endif

        float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
    }, nullptr);
    backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
        for (int e = 0; e < config_.hidden_size; e++) {
            m_output_fp32_[i][e] = 0;
        }
        for (int j = 0; j < k; j++) {
            for (int e = 0; e < config_.hidden_size; e++) {
                m_output_fp32_[i][e] += m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] * weights[i * k + j];
            }
        }
        from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
    }, nullptr);

	/* 把每个 token-expert 的行复制到各自 cache */
    backend->do_work_stealing_job(qlen, nullptr, [&](int token_idx) {
        auto& cache = fwd_cache[token_idx];
        // cache 已在上层 init(k, inter_size)
        for (int j = 0; j < k; ++j) {
            uint64_t  eid   = expert_ids[token_idx*k + j];
            int       row   = m_local_pos_[token_idx][j];
            size_t    ofs   = row * config_.intermediate_size;
            /* gate u */
            std::memcpy(cache.gate_u[j].data(),
                        m_local_gate_output_ptr_[eid] + ofs,
                        sizeof(float) * config_.intermediate_size);
            /* up v */
            std::memcpy(cache.up_v[j].data(),
                        m_local_up_output_ptr_[eid] + ofs,
                        sizeof(float) * config_.intermediate_size);
            /* 可选 z */
            // std::memcpy(cache.z[j].data(),
            //             m_local_intermediate_fp32_ptr_[eid] + ofs,
            //             sizeof(float) * config_.intermediate_size);
        }
    }, nullptr);
}

void SFT_MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend, SFT_MoEForwardCache* fwd_cache) {
    if (qlen < config_.group_min_len) {
        for (int i = 0; i < qlen; i++) {
			// fwd_cache[i].init(k, config_.intermediate_size);      // 预分配
            forward_one(k, expert_ids + i * k, weights + i * k, (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend, fwd_cache + i);
        }
        return;
    }
    int forward_len = std::min(config_.group_max_len, qlen);
    // for (int i = 0; i < forward_len; ++i)
    //     fwd_cache[i].init(k, config_.intermediate_size);
    forward_many(forward_len, k, expert_ids, weights, input, output, backend, fwd_cache);
    forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend, fwd_cache + forward_len);
}

static float act_fn_grad(float x) {
    float sigmoid_x = 1.0f / (1.0f + expf(-x));
    return sigmoid_x * (1. + x * (1. - sigmoid_x));
}

void SFT_MOE::transpose_expert_matrix(const void* src, void* dst, int R, int C, ggml_type src_type, ggml_type dst_type, uint64_t expert_idx) {
    to_float(src, transpose_buffer_fp32_ + (R * C * expert_idx), R * C, src_type);
    from_float(transpose_buffer_fp32_ + (R * C * expert_idx), transpose_buffer_ + (R * C * expert_idx) * ggml_type_size(dst_type), R * C, dst_type);
    for (int r = 0; r < R; ++r) {
        for (int c = 0; c < C; ++c) {
            memcpy(
                (uint8_t*)dst + (c * R + r) * ggml_type_size(dst_type),
                (uint8_t*)transpose_buffer_ + (R * C * expert_idx + r * C + c) * ggml_type_size(dst_type),
                ggml_type_size(dst_type));
        }
    }
}

void SFT_MOE::get_transpose(Backend* backend) {
    // Transpose gate_proj_
    int R_gate = config_.intermediate_size;
    int C_gate = config_.hidden_size;
    size_t gate_expert_src_stride_bytes = (size_t)R_gate * C_gate * ggml_type_size(config_.gate_type);
    size_t gate_expert_dst_t_stride_bytes = (size_t)C_gate * R_gate * ggml_type_size(config_.grad_type);
    backend->do_work_stealing_job(config_.expert_num, nullptr, [&](int expert_idx) {
        void* src_expert = (uint8_t*)gate_proj_ + expert_idx * gate_expert_src_stride_bytes;
        void* dst_expert_t = (uint8_t*)gate_proj_t_ + expert_idx * gate_expert_dst_t_stride_bytes;
        transpose_expert_matrix(src_expert, dst_expert_t, R_gate, C_gate, config_.gate_type, config_.grad_type, expert_idx);
    }, nullptr);

    // Transpose up_proj_
    int R_up = config_.intermediate_size;
    int C_up = config_.hidden_size;
    size_t up_expert_src_stride_bytes = (size_t)R_up * C_up * ggml_type_size(config_.up_type);
    size_t up_expert_dst_t_stride_bytes = (size_t)C_up * R_up * ggml_type_size(config_.grad_type);
    backend->do_work_stealing_job(config_.expert_num, nullptr, [&](int expert_idx) {
        void* src_expert = (uint8_t*)up_proj_ + expert_idx * up_expert_src_stride_bytes;
        void* dst_expert_t = (uint8_t*)up_proj_t_ + expert_idx * up_expert_dst_t_stride_bytes;
        transpose_expert_matrix(src_expert, dst_expert_t, R_up, C_up, config_.up_type, config_.grad_type, expert_idx);
    }, nullptr);

    // Transpose down_proj_
    int R_down = config_.hidden_size;
    int C_down = config_.intermediate_size;
    size_t down_expert_src_stride_bytes = (size_t)R_down * C_down * ggml_type_size(config_.down_type);
    size_t down_expert_dst_t_stride_bytes = (size_t)C_down * R_down * ggml_type_size(config_.grad_type);
    backend->do_work_stealing_job(config_.expert_num, nullptr, [&](int expert_idx) {
        void* src_expert = (uint8_t*)down_proj_ + expert_idx * down_expert_src_stride_bytes;
        void* dst_expert_t = (uint8_t*)down_proj_t_ + expert_idx * down_expert_dst_t_stride_bytes;
        transpose_expert_matrix(src_expert, dst_expert_t, R_down, C_down, config_.down_type, config_.grad_type, expert_idx);
    }, nullptr);
}

void SFT_MOE::backward_one(int k, const uint64_t* expert_ids, const float* weights, const void* output_grad, void* input_grad, Backend* backend, const SFT_MoEForwardCache* fwd_cache) {
	// clock_t clk1, clk2, clk3, clk4;
	// clock_t clkz1, clkz2, clkz3, clkz4, clkz5;
	// clk1 = clock();
	// clk2 = clock();
    int nth = config_.intermediate_size / config_.stride;
    backend->do_work_stealing_job(nth * k, nullptr, [&](int task_id) {
        int expert_idx = task_id / nth;
        uint64_t expert_id = expert_ids[expert_idx];
        int ith = task_id % nth;
		// clkz1 = clock();
        void* down_proj_t_ptr = (uint8_t*)down_proj_t_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.grad_type);
        float* down_input_grad_ptr = s_down_input_grad_[expert_idx] + ith * config_.stride;
        // clkz2 = clock();
        llamafile_sgemm(config_.stride, 1, config_.hidden_size, down_proj_t_ptr, config_.hidden_size, output_grad, config_.hidden_size, down_input_grad_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.grad_type, config_.grad_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        // clkz3 = clock();
        for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
            s_down_input_grad_[expert_idx][i] *= weights[expert_idx];

            s_gate_output_grad_fp32_[expert_idx][i] = s_down_input_grad_[expert_idx][i] * fwd_cache->up_v[expert_idx][i] * act_fn_grad(fwd_cache->gate_u[expert_idx][i]); 
            s_up_output_grad_fp32_[expert_idx][i] = s_down_input_grad_[expert_idx][i] * act_fn(fwd_cache->gate_u[expert_idx][i]);
        }
        // clkz4 = clock();
        from_float(s_gate_output_grad_fp32_[expert_idx] + ith * config_.stride, s_gate_output_grad_[expert_idx] + ith * config_.stride * ggml_type_size(config_.grad_type), config_.stride, config_.grad_type);
        from_float(s_up_output_grad_fp32_[expert_idx] + ith * config_.stride, s_up_output_grad_[expert_idx] + ith * config_.stride * ggml_type_size(config_.grad_type), config_.stride, config_.grad_type);
        // clkz5 = clock();
    }, nullptr);

	// clk3 = clock();
    nth = config_.hidden_size / config_.stride;
    backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
        int ith = task_id;
        for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
            s_input_grad_fp32_[i] = 0;
        }
        for (int expert_idx = 0; expert_idx < k; expert_idx++) {
            uint64_t expert_id = expert_ids[expert_idx];

            void* gate_proj_t_ptr = (uint8_t*)gate_proj_t_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.grad_type);
            float* gate_input_grad_ptr = s_gate_input_grad_[expert_idx] + ith * config_.stride;
            llamafile_sgemm(config_.stride, 1, config_.intermediate_size, gate_proj_t_ptr, config_.intermediate_size, s_gate_output_grad_[expert_idx], config_.intermediate_size, gate_input_grad_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.grad_type, config_.grad_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);

            void* up_proj_t_ptr = (uint8_t*)up_proj_t_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.grad_type);
            float* up_input_grad_ptr = s_up_input_grad_[expert_idx] + ith * config_.stride;
            llamafile_sgemm(config_.stride, 1, config_.intermediate_size, up_proj_t_ptr, config_.intermediate_size, s_up_output_grad_[expert_idx], config_.intermediate_size, up_input_grad_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.grad_type, config_.grad_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
            
            for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
                s_input_grad_fp32_[i] += s_gate_input_grad_[expert_idx][i] + s_up_input_grad_[expert_idx][i];
            }
        }
        from_float(s_input_grad_fp32_ + ith * config_.stride, (uint8_t*)input_grad + ith * config_.stride * ggml_type_size(config_.grad_type), config_.stride, config_.grad_type);
    }, nullptr);
	// clk4 = clock();
	// std::cout << "[Δclk12] " << (clk2 - clk1) / static_cast<double>(CLOCKS_PER_SEC) * 1000
    //       << " ms  [Δclk23] " << (clk3 - clk2) / static_cast<double>(CLOCKS_PER_SEC) * 1000
    //       << " ms  [Δclk34] " << (clk4 - clk3) / static_cast<double>(CLOCKS_PER_SEC) * 1000
    //       << " ms  [Δclkz12] " << (clkz2 - clkz1) / static_cast<double>(CLOCKS_PER_SEC) * 1000
    //       << " ms  [Δclkz23] " << (clkz3 - clkz2) / static_cast<double>(CLOCKS_PER_SEC) * 1000
    //       << " ms  [Δclkz34] " << (clkz4 - clkz3) / static_cast<double>(CLOCKS_PER_SEC) * 1000
    //       << " ms  [Δclkz45] " << (clkz5 - clkz4) / static_cast<double>(CLOCKS_PER_SEC) * 1000
    //       << " ms\n";

}

void SFT_MOE::backward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* output_grad, void* input_grad, Backend* backend, const SFT_MoEForwardCache* fwd_cache) {
    for (int i = 0; i < config_.expert_num; i++) {
        m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
        for (int j = 0; j < k; j++) {
            m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
        }
    }
    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
        m_local_down_output_grad_ptr_[i] = m_local_down_output_grad_ + offset * config_.hidden_size * ggml_type_size(config_.grad_type);
        m_local_down_input_grad_ptr_[i] = m_local_down_input_grad_ + offset * config_.intermediate_size;
        m_local_gate_output_grad_fp32_ptr_[i] = m_local_gate_output_grad_fp32_ + offset * config_.intermediate_size;
        m_local_up_output_grad_fp32_ptr_[i] = m_local_up_output_grad_fp32_ + offset * config_.intermediate_size;
        m_local_gate_output_grad_ptr_[i] = m_local_gate_output_grad_ + offset * config_.intermediate_size * ggml_type_size(config_.grad_type);
        m_local_up_output_grad_ptr_[i] = m_local_up_output_grad_ + offset * config_.intermediate_size * ggml_type_size(config_.grad_type);
        m_local_gate_input_grad_ptr_[i] = m_local_gate_input_grad_ + offset * config_.hidden_size;
        m_local_up_input_grad_ptr_[i] = m_local_up_input_grad_ + offset * config_.hidden_size;
        m_local_token_indices_ptr_[i] = m_local_token_indices_ + offset;
        m_local_expert_positions_ptr_[i] = m_local_expert_positions_ + offset;
        offset += m_local_num_[i];
    }

    backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
        for (int j = 0; j < k; j++) {
            uint64_t expert_id = expert_ids[i * k + j];
            int local_row = m_local_pos_[i][j];
            memcpy(m_local_down_output_grad_ptr_[expert_id] + local_row * config_.hidden_size * ggml_type_size(config_.grad_type), (uint8_t*)output_grad + i * config_.hidden_size * ggml_type_size(config_.grad_type), config_.hidden_size * ggml_type_size(config_.grad_type));
            m_local_token_indices_ptr_[expert_id][local_row] = i;
            m_local_expert_positions_ptr_[expert_id][local_row] = j;
        }
    }, nullptr);

    // get_transpose(backend);

    int stride = QK_K;
    int nth = config_.intermediate_size / stride;
    backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
        uint64_t expert_idx = task_id / nth;
        int ith = task_id % nth;
        
        void* down_proj_t_ptr = (uint8_t*)down_proj_t_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.grad_type);
        void* down_output_grad_ptr = m_local_down_output_grad_ptr_[expert_idx];
        float* down_input_grad_ptr = m_local_down_input_grad_ptr_[expert_idx] + ith * stride;
                    
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size, down_proj_t_ptr, config_.hidden_size, down_output_grad_ptr, config_.hidden_size, down_input_grad_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.grad_type, config_.grad_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        
        for (int i = 0; i < m_local_num_[expert_idx]; i++) {
            int token_idx = m_local_token_indices_ptr_[expert_idx][i];
            int expert_pos = m_local_expert_positions_ptr_[expert_idx][i];
            float weight = weights[token_idx * k + expert_pos];
            
            for (int j = ith * stride; j < (ith + 1) * stride; j++) {
                m_local_down_input_grad_ptr_[expert_idx][i * config_.intermediate_size + j] *= weight;
                
                float down_input_grad = m_local_down_input_grad_ptr_[expert_idx][i * config_.intermediate_size + j];
                m_local_gate_output_grad_fp32_ptr_[expert_idx][i * config_.intermediate_size + j] = down_input_grad * fwd_cache[token_idx].up_v[expert_pos][j] * act_fn_grad(fwd_cache[token_idx].gate_u[expert_pos][j]);
                m_local_up_output_grad_fp32_ptr_[expert_idx][i * config_.intermediate_size + j] = down_input_grad * act_fn(fwd_cache[token_idx].gate_u[expert_pos][j]);
            }
            
            float* gate_output_grad_fp32_ptr = m_local_gate_output_grad_fp32_ptr_[expert_idx] + i * config_.intermediate_size + ith * stride;
            void* gate_output_grad_ptr = m_local_gate_output_grad_ptr_[expert_idx] + (i * config_.intermediate_size + ith * stride) * ggml_type_size(config_.grad_type);
            from_float(gate_output_grad_fp32_ptr, gate_output_grad_ptr, stride, config_.grad_type);
            
            float* up_output_grad_fp32_ptr = m_local_up_output_grad_fp32_ptr_[expert_idx] + i * config_.intermediate_size + ith * stride;
            void* up_output_grad_ptr = m_local_up_output_grad_ptr_[expert_idx] + (i * config_.intermediate_size + ith * stride) * ggml_type_size(config_.grad_type);
            from_float(up_output_grad_fp32_ptr, up_output_grad_ptr, stride, config_.grad_type);
        }
    }, nullptr);
    stride = QK_K;
    nth = config_.hidden_size / stride;
    backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
        uint64_t expert_idx = task_id / nth;
        int ith = task_id % nth;
        
        void* gate_proj_t_ptr = (uint8_t*)gate_proj_t_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.grad_type);
        void* up_proj_t_ptr = (uint8_t*)up_proj_t_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.grad_type);
        void* gate_output_grad_ptr = m_local_gate_output_grad_ptr_[expert_idx];
        void* up_output_grad_ptr = m_local_up_output_grad_ptr_[expert_idx];
        float* gate_input_grad_ptr = m_local_gate_input_grad_ptr_[expert_idx] + ith * stride;
        float* up_input_grad_ptr = m_local_up_input_grad_ptr_[expert_idx] + ith * stride;
        
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size, gate_proj_t_ptr, config_.intermediate_size, gate_output_grad_ptr, config_.intermediate_size, gate_input_grad_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.grad_type, config_.grad_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size, up_proj_t_ptr, config_.intermediate_size, up_output_grad_ptr, config_.intermediate_size, up_input_grad_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.grad_type, config_.grad_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
    }, nullptr);
    backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
        for (int e = 0; e < config_.hidden_size; e++) {
            m_grad_input_fp32_[i][e] = 0;
        }
        for (int j = 0; j < k; j++) {
            for (int e = 0; e < config_.hidden_size; e++) {
                m_grad_input_fp32_[i][e] += m_local_gate_input_grad_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] + m_local_up_input_grad_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e];
            }
        }
        from_float(m_grad_input_fp32_[i], (uint8_t*)input_grad + i * config_.hidden_size * ggml_type_size(config_.grad_type), config_.hidden_size, config_.grad_type);
    }, nullptr);
}

// TODO: input和layer_idx参数可以删除
void SFT_MOE::backward(int layer_idx, int qlen, int k, const uint64_t* expert_ids, const float* weights,
                   const void* input, const void* grad_output, void* grad_input, Backend* backend, const SFT_MoEForwardCache* fwd_cache) {

    get_transpose(backend);
    int remaining_qlen = qlen;
    int processed_offset = 0;
    
    while (remaining_qlen > 0) {
        // config_.group_min_len = 10000000;
        if (remaining_qlen < config_.group_min_len) {
            for (int i = 0; i < remaining_qlen; i++) {
                backward_one(k,
                             expert_ids + (processed_offset + i) * k,
                             weights + (processed_offset + i) * k,
                             (uint8_t*)grad_output + (processed_offset + i) * config_.hidden_size * ggml_type_size(config_.grad_type),
                             (uint8_t*)grad_input + (processed_offset + i) * config_.hidden_size * ggml_type_size(config_.grad_type),
                             backend,
                             fwd_cache + processed_offset + i);
            }
            break;
        } else {
            int backward_len = std::min(config_.group_max_len, remaining_qlen);
            backward_many(backward_len, 
                         k, 
                         expert_ids + processed_offset * k, 
                         weights + processed_offset * k, 
                         (uint8_t*)grad_output + processed_offset * config_.hidden_size * ggml_type_size(config_.grad_type), 
                         (uint8_t*)grad_input + processed_offset * config_.hidden_size * ggml_type_size(config_.grad_type), 
                         backend, 
                         fwd_cache + processed_offset);
            
            remaining_qlen -= backward_len;
            processed_offset += backward_len;
        }
    }
}

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/llamafile/sft_moe.h
================================================
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:22
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:35:10
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_SFT_MOE_H
#define CPUINFER_OPERATOR_SFT_MOE_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
#include "sft_moe_forward_cache.h"

struct SFT_MOEConfig {
    long expert_num;
    int routed_expert_num;
    long hidden_size;
    long intermediate_size;
    int stride;
    int group_min_len;
    int group_max_len;
    void* gate_proj;
    void* up_proj;
    void* down_proj;
    ggml_type gate_type;
    ggml_type up_type;
    ggml_type down_type;
    ggml_type hidden_type;
    ggml_type grad_type = GGML_TYPE_BF16;

    SFT_MOEConfig() {}

    SFT_MOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int stride, int group_min_len, int group_max_len, void* gate_proj, void* up_proj, void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type)
        : expert_num(expert_num), routed_expert_num(routed_expert_num), hidden_size(hidden_size), intermediate_size(intermediate_size), stride(stride), group_min_len(group_min_len), group_max_len(group_max_len), gate_proj(gate_proj), up_proj(up_proj), down_proj(down_proj), gate_type(gate_type), up_type(up_type), down_type(down_type), hidden_type(hidden_type) {}
};

class SFT_MOE {
   public:
    SFT_MOE(SFT_MOEConfig);
    ~SFT_MOE();
    void warm_up(Backend* backend);
    void forward_one(int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend, SFT_MoEForwardCache* fwd_cache);
    void forward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend, SFT_MoEForwardCache* fwd_cache);
    void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend, SFT_MoEForwardCache* fwd_cache);
	void backward_one(int k, const uint64_t* expert_ids, const float* weights, const void* output_grad, void* input_grad, Backend* backend, const SFT_MoEForwardCache* fwd_cache);
	void backward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* output_grad, void* input_grad, Backend* backend, const SFT_MoEForwardCache* fwd_cache);
	void backward(int layer_idx, int qlen, int k, const uint64_t* expert_ids, const float* weights,
              const void* input, const void* grad_output, void* grad_input, Backend* backend, const SFT_MoEForwardCache* fwd_cache); // FIXME: expert backward definition for C++
    
    void transpose_expert_matrix(const void* src, void* dst, int R, int C, ggml_type src_type, ggml_type dst_type, uint64_t expert_idx);
    void ensure_fwd_cache(int qlen, int k);
    void get_transpose(Backend* backend);
    SFT_MoEForwardCache* fwd_cache_ptr();

   private:
    SFT_MOEConfig config_;
    void* gate_proj_;  // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    void* up_proj_;    // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    void* down_proj_;  // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]

    float* transpose_buffer_fp32_;  // [expert_num * intermediate_size * hidden_size]
    uint8_t* transpose_buffer_;     // [expert_num * intermediate_size * hidden_size]

    uint8_t* gate_proj_t_;  // [expert_num * hidden_size * intermediate_size]
    uint8_t* up_proj_t_;    // [expert_num * hidden_size * intermediate_size]
    uint8_t* down_proj_t_;  // [expert_num * intermediate_size * hidden_size]

    #ifdef USE_NUMA
    std::vector<void*> gate_proj_numa_;  // [numa_num, expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    std::vector<void*> up_proj_numa_;    // [numa_num, expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    std::vector<void*> down_proj_numa_;  // [numa_num, expert_num * hidden_size * intermediate_size ( /32 if quantized)]
    #endif

    float* s_input_fp32_;                      // [hidden_size]
    uint8_t* s_gate_input_;                    // [hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
    uint8_t* s_up_input_;                      // [hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
    std::vector<float*> s_gate_output_;        // [routed_expert_num, intermediate_size]
    std::vector<float*> s_up_output_;          // [routed_expert_num, intermediate_size]
    std::vector<float*> s_intermediate_fp32_;  // [routed_expert_num, intermediate_size]
    std::vector<uint8_t*> s_down_input_;       // [routed_expert_num, intermediate_size * ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
    std::vector<float*> s_down_output_;        // [routed_expert_num, hidden_size]
    float* s_output_fp32_;                     // [hidden_size]

    std::vector<float*> s_down_input_grad_;        // [routed_expert_num, intermediate_size]
    std::vector<float*> s_gate_output_grad_fp32_;  // [routed_expert_num, intermediate_size]
    std::vector<float*> s_up_output_grad_fp32_;    // [routed_expert_num, intermediate_size]
    std::vector<uint8_t*> s_gate_output_grad_;     // [routed_expert_num, intermediate_size * ggml_type_size(grad_type)]
    std::vector<uint8_t*> s_up_output_grad_;       // [routed_expert_num, intermediate_size * ggml_type_size(grad_type)]
    std::vector<float*> s_gate_input_grad_;        // [routed_expert_num, hidden_size]
    std::vector<float*> s_up_input_grad_;          // [routed_expert_num, hidden_size]
    float* s_input_grad_fp32_;                     // [hidden_size]

    std::vector<float*> m_input_fp32_;    // [group_max_len, hidden_size]
    std::vector<uint8_t*> m_gate_input_;  // [group_max_len, hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
    std::vector<uint8_t*> m_up_input_;    // [group_max_len, hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
    uint8_t* m_local_gate_input_;         // [routed_expert_num * group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
    uint8_t* m_local_up_input_;           // [routed_expert_num * group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
    float* m_local_gate_output_;          // [routed_expert_num * group_max_len * intermediate_size]
    float* m_local_up_output_;            // [routed_expert_num * group_max_len * intermediate_size]
    float* m_local_intermediate_fp32_;    // [routed_expert_num * group_max_len * intermediate_size]
    uint8_t* m_local_down_input_;         // [routed_expert_num * group_max_len * intermediate_size * ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
    float* m_local_down_output_;          // [routed_expert_num * group_max_len * hidden_size]
    std::vector<float*> m_output_fp32_;   // [group_max_len, hidden_size]

    std::vector<std::vector<int>> m_local_pos_;          // [group_max_len, routed_expert_num]
    std::vector<int> m_local_num_;                       // [expert_num]
    std::vector<uint8_t*> m_local_gate_input_ptr_;       // [expert_num]
    std::vector<uint8_t*> m_local_up_input_ptr_;         // [expert_num]
    std::vector<float*> m_local_gate_output_ptr_;        // [expert_num]
    std::vector<float*> m_local_up_output_ptr_;          // [expert_num]
    std::vector<float*> m_local_intermediate_fp32_ptr_;  // [expert_num]
    std::vector<uint8_t*> m_local_down_input_ptr_;       // [expert_num]
    std::vector<float*> m_local_down_output_ptr_;        // [expert_num]

    uint8_t* m_local_down_output_grad_;                  // [routed_expert_num * group_max_len * hidden_size * ggml_type_size(grad_type)]
    float* m_local_down_input_grad_;                     // [routed_expert_num * group_max_len * intermediate_size]
    float* m_local_gate_output_grad_fp32_;               // [routed_expert_num * group_max_len * intermediate_size]
    float* m_local_up_output_grad_fp32_;                 // [routed_expert_num * group_max_len * intermediate_size]
    uint8_t* m_local_gate_output_grad_;                  // [routed_expert_num * group_max_len * intermediate_size * ggml_type_size(grad_type)]
    uint8_t* m_local_up_output_grad_;                    // [routed_expert_num * group_max_len * intermediate_size * ggml_type_size(grad_type)]
    float* m_local_gate_input_grad_;                     // [routed_expert_num * group_max_len * hidden_size]
    float* m_local_up_input_grad_;                       // [routed_expert_num * group_max_len * hidden_size]
    std::vector<float*> m_grad_input_fp32_;              // [group_max_len, hidden_size]

    std::vector<uint8_t*> m_local_down_output_grad_ptr_;     // [expert_num]
    std::vector<float*> m_local_down_input_grad_ptr_;        // [expert_num]
    std::vector<float*> m_local_gate_output_grad_fp32_ptr_;  // [expert_num]
    std::vector<float*> m_local_up_output_grad_fp32_ptr_;    // [expert_num]
    std::vector<uint8_t*> m_local_gate_output_grad_ptr_;     // [expert_num]
    std::vector<uint8_t*> m_local_up_output_grad_ptr_;       // [expert_num]
    std::vector<float*> m_local_gate_input_grad_ptr_;        // [expert_num]
    std::vector<float*> m_local_up_input_grad_ptr_;          // [expert_num]

    int* m_local_token_indices_;                             // [routed_expert_num * group_max_len]
    int* m_local_expert_positions_;                          // [routed_expert_num * group_max_len]
    std::vector<int*> m_local_token_indices_ptr_;            // [expert_num]
    std::vector<int*> m_local_expert_positions_ptr_;         // [expert_num]

	std::vector<SFT_MoEForwardCache> fw_cache_; // 持久缓存，便于backward读取到forward_cache
};

#endif

================================================
FILE: kt-sft/csrc/ktransformers_ext/operators/llamafile/sft_moe_forward_cache.h
================================================
#pragma once
#include <vector>

struct SFT_MoEForwardCache {
    // 每个 token 按 expert 分块保存
    std::vector<std::vector<float>> gate_u;   // u = W_gate x
    std::vector<std::vector<float>> up_v;     // v = W_up   x
    // 若希望反向直接用 z = σ(u)⊙v，则再加一份
    // std::vector<std::vector<float>> z;
    void init(int k, int inter_size) {
        /* ---- 只增不减：capacity 不够时才增，永不缩小，避免多线程情况下的use-after-free ---- */
       if (k > (int)gate_u.size()) {
            gate_u.resize(k);
            up_v  .resize(k);
            // z     .resize(k);
        }

        for (int i = 0; i < k; ++i) {
            if ((int)gate_u[i].capacity() < inter_size)
                gate_u[i].reserve(inter_size);   // 只增 capacity
            if ((int)up_v[i].capacity()   < inter_size)
                up_v[i].reserve(inter_size);
            // if ((int)z[i].capacity()      < inter_size)
            //     z[i].reserve(inter_size);

            // size() 更新为 inter_size 以便直接下标写入
            gate_u[i].resize(inter_size);
            up_v[i]  .resize(inter_size);
            // z[i]     .resize(inter_size);
        }
	}
};

================================================
FILE: kt-sft/csrc/ktransformers_ext/vendors/cuda.h
================================================
#pragma once

#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>

#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020


================================================
FILE: kt-sft/csrc/ktransformers_ext/vendors/hip.h
================================================
#pragma once

#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bfloat16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__

#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F  HIPBLAS_R_16F
#define CUDA_R_32F  HIPBLAS_R_32F
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cublasOperation_t hipblasOperation_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cuDeviceGet hipDeviceGet
#define CUdevice hipDevice_t
#define CUdeviceptr hipDeviceptr_t
#define cuMemUnmap hipMemUnmap
#define CUmemAccessDesc hipMemAccessDesc
#define cuMemAddressFree hipMemAddressFree
#define cuMemRelease hipMemRelease
#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
#define cuMemCreate hipMemCreate
#define cuMemAddressReserve hipMemAddressReserve
#define cuMemMap hipMemMap
#define cuMemSetAccess hipMemSetAccess
#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
#define CUmemAllocationProp hipMemAllocationProp
#define cuDeviceGetAttribute hipDeviceGetAttribute
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaGraphExec_t hipGraphExec_t
#define cudaGraphNode_t hipGraphNode_t
#define cudaKernelNodeParams hipKernelNodeParams
#define cudaKernelNodeParams hipKernelNodeParams
#define cudaGraphExecDestroy hipGraphExecDestroy
#define cudaGraphLaunch hipGraphLaunch
#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
#define cudaGraphNodeType hipGraphNodeType
#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
#define cudaGraphInstantiate hipGraphInstantiate
#define cudaStreamEndCapture hipStreamEndCapture
#define cudaGraphDestroy hipGraphDestroy
#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
#define cudaGraphNodeGetType hipGraphNodeGetType
#define cudaGraphGetNodes hipGraphGetNodes
#define cudaGraphExecUpdate hipGraphExecUpdate
#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
#define cudaStreamBeginCapture hipStreamBeginCapture
#define cudaGraph_t hipGraph_t
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define cudaHostFn_t hipHostFn_t
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED

#define __CUDA_ARCH__ 1300

#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
#define GCN
#endif

#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
#define CDNA
#endif

#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif

#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif

#if defined(__gfx1010__) || defined(__gfx1012__)
#define RDNA1
#endif

#ifndef __has_builtin
    #define __has_builtin(x) 0
#endif

typedef hip_bfloat16 nv_bfloat16;


================================================
FILE: kt-sft/csrc/ktransformers_ext/vendors/musa.h
================================================
#pragma once

#include <musa_runtime.h>
#include <musa.h>
#include <mublas.h>
#include <musa_bf16.h>
#include <musa_fp16.h>
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
#define CUDA_R_16F  MUSA_R_16F
#define CUDA_R_32F  MUSA_R_32F
#define cublasComputeType_t cudaDataType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy
#define cublasGemmEx mublasGemmEx
#define cublasGemmBatchedEx mublasGemmBatchedEx
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
#define cublasHandle_t mublasHandle_t
#define cublasSetMathMode mublasSetMathMode
#define cublasSetStream mublasSetStream
#define cublasSgemm mublasSgemm
#define cublasStatus_t mublasStatus_t
#define cublasOperation_t mublasOperation_t
#define cublasGetStatusString mublasStatus_to_string
#define cudaDataType_t musaDataType_t
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags
#define cudaEventDisableTiming musaEventDisableTiming
#define cudaEventRecord musaEventRecord
#define cudaEventSynchronize musaEventSynchronize
#define cudaEvent_t musaEvent_t
#define cudaEventDestroy musaEventDestroy
#define cudaFree musaFree
#define cudaFreeHost musaFreeHost
#define cudaGetDevice musaGetDevice
#define cudaGetDeviceCount musaGetDeviceCount
#define cudaGetDeviceProperties musaGetDeviceProperties
#define cudaGetErrorString musaGetErrorString
#define cudaGetLastError musaGetLastError
#define cudaHostRegister musaHostRegister
#define cudaHostRegisterPortable musaHostRegisterPortable
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
#define cudaHostUnregister musaHostUnregister
#define cudaLaunchHostFunc musaLaunchHostFunc
#define cudaMalloc musaMalloc
#define cudaMallocHost musaMallocHost
#define cudaMallocManaged musaMallocManaged
#define cudaMemcpy musaMemcpy
#define cudaMemcpyAsync musaMemcpyAsync
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
#define cudaMemcpy2DAsync musaMemcpy2DAsync
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
#define cudaMemcpyKind musaMemcpyKind
#define cudaMemset musaMemset
#define cudaMemsetAsync musaMemsetAsync
#define cudaMemGetInfo musaMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
#define cudaSetDevice musaSetDevice
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
#define cudaStreamDestroy musaStreamDestroy
#define cudaStreamFireAndForget musaStreamFireAndForget
#define cudaStreamNonBlocking musaStreamNonBlocking
#define cudaStreamPerThread musaStreamPerThread
#define cudaStreamSynchronize musaStreamSynchronize
#define cudaStreamWaitEvent musaStreamWaitEvent
#define cudaStream_t musaStream_t
#define cudaSuccess musaSuccess

// Additional mappings for MUSA virtual memory pool
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
#define CUdevice MUdevice
#define CUdeviceptr MUdeviceptr
#define CUmemAccessDesc MUmemAccessDesc
#define CUmemAllocationProp MUmemAllocationProp
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
#define cuDeviceGet muDeviceGet
#define cuDeviceGetAttribute muDeviceGetAttribute
#define cuMemAddressFree muMemAddressFree
#define cuMemAddressReserve muMemAddressReserve
#define cuMemCreate muMemCreate
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
#define cuMemMap muMemMap
#define cuMemRelease muMemRelease
#define cuMemSetAccess muMemSetAccess
#define cuMemUnmap muMemUnmap
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
#define cudaFuncSetAttribute musaFuncSetAttribute
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
#define make_cudaExtent make_musaExtent
#define make_cudaPitchedPtr make_musaPitchedPtr

// Additional mappings for MUSA graphs
#define CUDA_SUCCESS MUSA_SUCCESS
#define CUresult MUresult
#define cuGetErrorString muGetErrorString
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
#define cudaGraphDestroy musaGraphDestroy
#define cudaGraphExecDestroy musaGraphExecDestroy
#define cudaGraphExec_t musaGraphExec_t
#define cudaGraphExecUpdate musaGraphExecUpdate
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
#define cudaGraphGetNodes musaGraphGetNodes
#define cudaGraphInstantiate musaGraphInstantiate
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
#define cudaGraphLaunch musaGraphLaunch
#define cudaGraphNodeGetType musaGraphNodeGetType
#define cudaGraphNode_t musaGraphNode_t
#define cudaGraphNodeType musaGraphNodeType
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
#define cudaGraph_t musaGraph_t
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture

typedef mt_bfloat16 nv_bfloat16;


================================================
FILE: kt-sft/csrc/ktransformers_ext/vendors/vendor.h
================================================
#ifndef CPUINFER_VENDOR_VENDOR_H
#define CPUINFER_VENDOR_VENDOR_H

#ifdef USE_CUDA
#include "cuda.h"
#elif USE_HIP
#define __HIP_PLATFORM_AMD__
#include "hip.h"
#elif USE_MUSA
#include "musa.h"
#endif

#endif  // CPUINFER_VENDOR_VENDOR_H

================================================
FILE: kt-sft/install-with-cache.sh
================================================
#!/bin/bash
set -e  

# clear build dirs
# rm -rf build
# rm -rf *.egg-info
# rm -rf csrc/build
# rm -rf csrc/ktransformers_ext/build
# rm -rf csrc/ktransformers_ext/cuda/build
# rm -rf csrc/ktransformers_ext/cuda/dist
# rm -rf csrc/ktransformers_ext/cuda/*.egg-info
rm -rf ~/.ktransformers
echo "Installing python dependencies from requirements.txt"
pip install -r requirements-local_chat.txt
pip install -r ktransformers/server/requirements.txt
echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE USE_BALANCE_SERVE=1 pip install -v . --no-build-isolation
pip install third_party/custom_flashinfer/ -v

# SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
# echo "Copying thirdparty libs to $SITE_PACKAGES"
# cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
# patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*


echo "Installation completed successfully"


================================================
FILE: kt-sft/install.bat
================================================
@echo off

REM clear build dirs
rmdir /S /Q ktransformers\ktransformers_ext\build
rmdir /S /Q ktransformers\ktransformers_ext\cuda\build
rmdir /S /Q ktransformers\ktransformers_ext\cuda\dist
rmdir /S /Q ktransformers\ktransformers_ext\out
del /F /Q ktransformers\ktransformers_ext\cuda\*.egg-info

echo Installing python dependencies from requirements.txt
pip install -r requirements-local_chat.txt

echo Installing ktransformers
set KTRANSFORMERS_FORCE_BUILD=TRUE
pip install . --no-build-isolation
echo Installation completed successfully

================================================
FILE: kt-sft/install.sh
================================================
#!/bin/bash
set -e  

CWD=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

# default backend
DEV="cuda"

# parse --dev argument
while [[ "$#" -gt 0 ]]; do
    case $1 in
        --dev) DEV="$2"; shift ;;
        *) echo "Unknown parameter passed: $1"; exit 1 ;;
    esac
    shift
done
export DEV_BACKEND="$DEV"
echo "Selected backend: $DEV_BACKEND"

# clear build dirs
rm -rf build
rm -rf *.egg-info
rm -rf csrc/build
rm -rf csrc/ktransformers_ext/build
rm -rf csrc/ktransformers_ext/cuda/build
rm -rf csrc/ktransformers_ext/cuda/dist
rm -rf csrc/ktransformers_ext/cuda/*.egg-info
rm -rf ~/.ktransformers
echo "Installing python dependencies from requirements.txt"
pip install -r "${CWD}/requirements-sft.txt"

echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE pip install -v "${CWD}" --no-build-isolation

if [[ "$DEV_BACKEND" == "cuda" ]]; then
    echo "Installing custom_flashinfer for CUDA backend"
    pip install "${CWD}/../third_party/custom_flashinfer/"
fi
# SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
# echo "Copying thirdparty libs to $SITE_PACKAGES"
# cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
# patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*

echo "Installation completed successfully"

================================================
FILE: kt-sft/ktransformers/__init__.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :
Author       : kkk1nak0
Date         : 2024-08-15 07:34:46
Version      : 1.0.0
LastEditors  : chenxl
LastEditTime : 2025-02-15 03:53:02
'''
import sys
import os

# Import version from shared version.py at project root
_root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, _root_dir)
try:
    from version import __version__
finally:
    sys.path.pop(0)


================================================
FILE: kt-sft/ktransformers/configs/config.yaml
================================================
log:
  dir: "logs"
  file: "lexllama.log"
  #log level: debug, info, warn, error, crit
  level: "debug"
  backup_count: -1

server:
  ip: 0.0.0.0
  port: 10002

db:
  type: "sqllite"
  database: "server.db"
  host: "./"
  pool_size: 10

user:
  secret_key: "981f1dd2a44e27d68759d0252a486568ed43480b4e616a26e3af3709c3a7ce73"
  algorithm: "HS256"

model:
  # type: transformers
  # type: balance_serve
  type: ktransformers

  name: DeepSeek-Coder-V2-Instruct
  path: deepseek-ai/DeepSeek-V2-Lite-Chat
  gguf_path: ./DeepSeek-V2-Lite-Chat-GGUF

  device: cuda:0
  cache_lens: 16384
  max_new_tokens: 500
web:
  mount: False
  open_cross_domain: True

ext:
  cpu_infer: 10

long_context:
  max_seq_len: 32000
  block_size: 128
  local_windows_len: 4096
  second_select_num: 32
  anchor_type: DYNAMIC
  kv_type: FP16
  dense_layer_num: 2
  anchor_num: 1
  preselect_block: True
  head_select_mode: SHARED
  preselect_block_count: 32
  layer_step: 1
  token_step: 

local_chat:
  prompt_file: ""

async_server:
  sched_strategy: "FCFS"
  sched_port: 56441
  sched_metrics_port: 54321
  kvc2_metrics_port: 54391
  max_batch_size: 4  # decode count + prefill count, in one mini batch

attn:
  page_size: 256
  chunk_size: 256
kvc2:
  gpu_only: true 
  utilization_percentage: 1.0
  cpu_memory_size_GB: 500


================================================
FILE: kt-sft/ktransformers/configs/log_config.ini
================================================
[loggers]
keys=root,uvicorn,uvicornError,uvicornAccess

[handlers]
keys=consoleHandler,fileHandler

[formatters]
keys=detailedFormatter

[logger_root]
level=INFO
handlers=consoleHandler

[logger_uvicorn]
level=INFO
handlers=consoleHandler,fileHandler
qualname=uvicorn
propagate=0

[logger_uvicornError]
level=ERROR
handlers=consoleHandler,fileHandler
qualname=uvicorn.error
propagate=0

[logger_uvicornAccess]
level=INFO
handlers=consoleHandler,fileHandler
qualname=uvicorn.access
propagate=0

[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=detailedFormatter
args=(sys.stdout,)

[handler_fileHandler]
class=logging.FileHandler
level=INFO
formatter=detailedFormatter
args=('uvicorn_logs.log', 'a')

[formatter_detailedFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s
datefmt=%Y-%m-%d %H:%M:%S


================================================
FILE: kt-sft/ktransformers/configs/model_config/config.json
================================================
{
	"architectures": [
		"DeepseekV2ForCausalLM"
	],
	"attention_bias": false,
	"attention_dropout": 0.0,
	"auto_map": {
		"AutoConfig": "configuration_deepseek.DeepseekV2Config",
		"AutoModel": "modeling_deepseek.DeepseekV2Model",
		"AutoModelForCausalLM": "modeling_deepseek.DeepseekV2ForCausalLM"
	},
	"aux_loss_alpha": 0.001,
	"bos_token_id": 100000,
	"eos_token_id": 100001,
	"first_k_dense_replace": 1,
	"hidden_act": "silu",
	"hidden_size": 2048,
	"initializer_range": 0.02,
	"intermediate_size": 10944,
	"kv_lora_rank": 512,
	"max_position_embeddings": 163840,
	"model_type": "deepseek_v2",
	"moe_intermediate_size": 1408,
	"moe_layer_freq": 1,
	"n_group": 1,
	"n_routed_experts": 64,
	"n_shared_experts": 2,
	"norm_topk_prob": false,
	"num_attention_heads": 16,
	"num_experts_per_tok": 6,
	"num_hidden_layers": 27,
	"num_key_value_heads": 16,
	"pretraining_tp": 1,
	"q_lora_rank": null,
	"qk_nope_head_dim": 128,
	"qk_rope_head_dim": 64,
	"rms_norm_eps": 1e-06,
	"rope_scaling": {
		"beta_fast": 32,
		"beta_slow": 1,
		"factor": 40,
		"mscale": 0.707,
		"mscale_all_dim": 0.707,
		"original_max_position_embeddings": 4096,
		"type": "yarn"
	},
	"rope_theta": 10000,
	"routed_scaling_factor": 1.0,
	"scoring_func": "softmax",
	"seq_aux": true,
	"tie_word_embeddings": false,
	"topk_group": 1,
	"topk_method": "greedy",
	"torch_dtype": "bfloat16",
	"transformers_version": "4.33.1",
	"use_cache": true,
	"v_head_dim": 128,
	"vocab_size": 102400
}

================================================
FILE: kt-sft/ktransformers/configs/model_config/configuration_deepseek.py
================================================
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class DeepseekV2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate an DeepSeek
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the DeepSeek-V2.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 102400):
            Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`DeepseekV2Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1407):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        n_shared_experts (`int`, *optional*, defaults to None):
            Number of shared experts, None means dense model.
        n_routed_experts (`int`, *optional*, defaults to None):
            Number of routed experts, None means dense model.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor or routed experts.
        topk_method (`str`, *optional*, defaults to `gready`):
            Topk method used in routed gate.
        n_group (`int`, *optional*, defaults to None):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to None):
            Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to None):
            Number of selected experts, None means dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 0):
            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                            \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to False):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to 'softmax'):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux = (`bool`, *optional*, defaults to True):
            Whether to compute the auxiliary loss for each individual sample.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    ```python
    >>> from transformers import DeepseekV2Model, DeepseekV2Config

    >>> # Initializing a Deepseek-V2 style configuration
    >>> configuration = DeepseekV2Config()

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "deepseek_v2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size = 1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts = None,
        n_routed_experts = None,
        ep_size = 1,
        routed_scaling_factor = 1.0,
        kv_lora_rank = 512,
        q_lora_rank = 1536,
        qk_rope_head_dim = 64,
        v_head_dim = 128,
        qk_nope_head_dim = 128,
        topk_method = 'gready',
        n_group = None,
        topk_group = None,
        num_experts_per_tok = None,
        moe_layer_freq = 1,
        first_k_dense_replace = 0,
        norm_topk_prob = False,
        scoring_func = 'softmax',
        aux_loss_alpha = 0.001,
        seq_aux = True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

================================================
FILE: kt-sft/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/format_24.py
================================================
#
# Modified by Roberto Lopez Castro (roberto.lopez.castro@udc.es).
#

import torch


# This is PyTorch implementation of main part of reorder_meta()
# function, from tools/util/include/cutlass/util/host_reorder.h file
# of CUTLASS source tree.  Furthermore, CUTLASS template for sparse
# GEMM decides upon layout of this matrix, and at the moment for the
# sparse GEMM executed on tensor cores, this is layout described by
# ColumnMajorInterleaved<2> data structure, in
# include/cutlass/layout/matrix.h of CUTLASS source tree.  The
# reordering of meta matrix into meta_reordered matrix calculated
# according to these segments of CUTLASS code is re-implemented here.
# Note that this calculation produces offsets for scattering metadata
# matrix elements into reordered metadata matrix elements (or,
# equivalently, for gathering reordered metadata matrix element back
# into metadata matrix elements).
def _calculate_meta_reordering_scatter_offsets(m, meta_ncols, meta_dtype,
                                               device):
    dst_rows = torch.arange(0, m, device=device)[:, None].repeat(1, meta_ncols)
    dst_cols = torch.arange(0, meta_ncols, device=device).repeat(m, 1)

    # Reorder the rows, then swizzle the 2x2 blocks.
    group_x = 64
    group_y = 32 if meta_dtype.itemsize == 2 else 16

    dst_rows = (dst_rows // group_x * group_x + (dst_rows % 2) * 2 +
                (dst_rows % 8) // 4 + ((dst_rows % group_y) % 4) // 2 * 32 +
                ((dst_rows % group_x) // 8) * 4)

    topright = ((dst_rows % 2 == 0) & (dst_cols % 2 == 1)).to(torch.int8)
    bottomleft = ((dst_rows % 2 == 1) & (dst_cols % 2 == 0)).to(torch.int8)
    dst_rows += topright - bottomleft
    dst_cols -= topright - bottomleft

    # Assumed that meta tensor is to be stored in CUTLASS
    # InterleavedColumnMajor layout, and reverse engineered
    # corresponding code to store values into this tensor.
    interleave = 2
    cols_maj = dst_cols // interleave
    cols_min = dst_cols % interleave
    return (cols_maj * m * interleave + dst_rows * interleave +
            cols_min).view(-1)


# This function converts dense matrix into sparse semi-structured
# representation, producing "compressed" matrix, in the layout used by
# CUTLASS backend, and corresponding metadata matrix.
def sparse_semi_structured_from_dense_cutlass(dense):
    if dense.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional dense tensor, got {dense.dim()}-dimensional tensor"  # noqa: E501
        )

    m, k = dense.shape
    device = dense.device

    meta_dtype = torch.int8
    if dense.dtype == torch.int8:
        meta_dtype = torch.int32
    elif dense.dtype in [torch.half, torch.bfloat16, torch.float, torch.int32]:
        meta_dtype = torch.int16
    else:
        raise RuntimeError(f"Invalid datatype {dense.dtype} of dense matrix")
    quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4
    if quadbits_per_meta_elem not in (4, 8):
        raise RuntimeError(
            "Invalid number of elements per meta element calculated")

    if meta_dtype == torch.int32:
        if m % 16 != 0:
            raise RuntimeError(
                f"Number of rows of dense matrix {m} must be divisible by 16")
    else:
        if m % 32 != 0:
            raise RuntimeError(
                f"Number of rows of dense matrix {m} must be divisible by 32")
    if k % (4 * quadbits_per_meta_elem) != 0:
        raise RuntimeError(
            f"Number of columns of dense matrix {k} must be divisible by {4 * quadbits_per_meta_elem}"  # noqa: E501
        )

    if dense.dtype != torch.float:
        ksparse = 4
        dense_4 = dense.view(-1, k // ksparse, ksparse)
        m0, m1, m2, m3 = (dense_4 != 0).unbind(-1)
    else:
        ksparse = 2
        dense_2 = dense.view(-1, k // ksparse, ksparse)
        m0, m2 = m1, m3 = (dense_2 != 0).unbind(-1)
    meta_ncols = k // (ksparse * quadbits_per_meta_elem)

    # Encoding quadruples of True/False values as follows:
    #     [True,  True,  False, False] -> 0b0100
    #     [True,  False, True,  False] -> 0b1000
    #     [False, True,  True,  False] -> 0b1001
    #     [True,  False, False, True ] -> 0b1100
    #     [False, True,  False, True ] -> 0b1101
    #     [False, False, True,  True ] -> 0b1110
    # Thus, lower two bits in the encoding are index of the True value
    # at the lowest index in the quadruple, and the higher two bits in
    # the encoding are index of the other True value in the quadruple.
    # In case there are less than two True values, than False value or
    # values at some index or indices are considered True for the
    # encoding.  In case there are more than two True values, then the
    # excess True value(s) at some indices are considered False for
    # the encoding.  The exact encodings used for these cases are as
    # follows:
    #     [False, False, False, False] -> 0b1110
    #     [False, False, False, True ] -> 0b1110
    #     [False, False, True,  False] -> 0b1110
    #     [False, True,  False, False] -> 0b1001
    #     [False, True,  True,  True ] -> 0b1101
    #     [True,  False, False, False] -> 0b1000
    #     [True,  False, True,  True ] -> 0b1100
    #     [True,  True,  False, True ] -> 0b0100
    #     [True,  True,  True,  False] -> 0b0100
    #     [True,  True,  True,  True ] -> 0b0100
    # These particular encodings are chosen, with the help of Espresso
    # logic minimizer software, for the purpose of minimization of
    # corresponding Boolean functions, that translate non-zero flags
    # into encoding bits.  Note also possible choices for the first
    # and last of these encodings were limited only to (0b0100,
    # 0b1110), in order to produce valid encodings for 1:2 sparsity
    # case.

    expr0 = m0 & m1
    expr1 = ~m0 & m1
    expr2 = ~m0 & ~m1
    bit0 = expr1
    bit1 = expr2
    bit2 = expr0 | expr2 | m3
    bit3 = expr1 | ~m1
    idxs0 = bit0 | (bit1.to(torch.int64) << 1)
    idxs1 = bit2 | (bit3.to(torch.int64) << 1)

    if dense.dtype != torch.float:
        sparse0 = dense_4.gather(
            -1, idxs0.unsqueeze(-1))  # type: ignore[possibly-undefined]
        sparse1 = dense_4.gather(-1, idxs1.unsqueeze(-1))
        sparse = torch.stack((sparse0, sparse1), dim=-1).view(m, k // 2)
    else:
        sparse = dense_2.gather(-1,
                                idxs0.unsqueeze(-1) // 2).view(
                                    m,
                                    k // 2)  # type: ignore[possibly-undefined]

    meta_4 = idxs0 | (idxs1 << 2)
    meta_n = meta_4.view(
        (-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype)

    if quadbits_per_meta_elem == 4:
        meta = (meta_n[:, :, 0]
                | (meta_n[:, :, 1] << 4)
                | (meta_n[:, :, 2] << 8)
                | (meta_n[:, :, 3] << 12))
    elif quadbits_per_meta_elem == 8:
        meta = (meta_n[:, :, 0]
                | (meta_n[:, :, 1] << 4)
                | (meta_n[:, :, 2] << 8)
                | (meta_n[:, :, 3] << 12)
                | (meta_n[:, :, 4] << 16)
                | (meta_n[:, :, 5] << 20)
                | (meta_n[:, :, 6] << 24)
                | (meta_n[:, :, 7] << 28))

    # Reorder meta tensor elements.
    meta_reordered = meta.new_empty(
        (m * meta_ncols, ))  # type: ignore[possibly-undefined]
    meta_offsets = _calculate_meta_reordering_scatter_offsets(
        m, meta_ncols, meta_dtype, device)
    meta_reordered.scatter_(0, meta_offsets, meta.view(-1))

    return (sparse, meta_reordered.view(m, meta_ncols))


# This function performs reverse of the function above - it
# reconstructs dense matrix from a pair of "compressed" matrix, given
# in the layout used by CUTLASS backend, and accompanying metadata
# matrix.
def sparse_semi_structured_to_dense_cutlass(sparse, meta_reordered):
    if sparse.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional sparse tensor, got {sparse.dim()}-dimensional tensor"  # noqa: E501
        )

    m, k = sparse.shape
    device = sparse.device

    if meta_reordered.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional meta tensor, got {meta_reordered.dim()}-dimensional tensor"  # noqa: E501
        )
    if meta_reordered.device != device:
        raise RuntimeError(
            f"Expected meta matrix to be on {device} device, got matrix on {meta_reordered.device} device"  # noqa: E501
        )

    meta_dtype = meta_reordered.dtype
    if meta_dtype not in (torch.int16, torch.int32):
        raise RuntimeError(f"Invalid datatype {meta_dtype} of meta matrix")
    quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4

    ksparse = 4 if sparse.dtype != torch.float else 2

    meta_nrows, meta_ncols = meta_reordered.shape
    if meta_nrows != m:
        raise RuntimeError(
            f"Number of rows of meta matrix {meta_nrows} must be equal to number of columns of spase matrix {m}"  # noqa: E501
        )
    if meta_ncols * ksparse * quadbits_per_meta_elem != 2 * k:
        raise RuntimeError(
            f"Number of columns of sparse matrix {k} different from the {meta_ncols * ksparse * quadbits_per_meta_elem // 2}, "  # noqa: E501
            "expected according to the number of columns of meta matrix")

    # Undo meta tensor elements reordering.
    meta_offsets = _calculate_meta_reordering_scatter_offsets(
        m, meta_ncols, meta_dtype, device)
    meta = torch.gather(meta_reordered.view(-1), 0,
                        meta_offsets).view(m, meta_ncols)

    # Unpack sparse tensor back to original dense tensor, using
    # information provided by meta tensor.  Note that torch.float
    # datatype is handled pretty much the same as
    # torch.half/torch.bfloat16, as metadata for a pair of torch.float
    # value is encoded as if underlying 8 bytes contain four
    # torch.half/torch.bfloat16 values, where either first two or last
    # two are zeros.
    meta_2 = torch.empty(
        (m, meta_ncols, 2 * quadbits_per_meta_elem),
        dtype=meta_dtype,
        device=device,
    )
    if quadbits_per_meta_elem == 4:
        meta_2[:, :, 0] = meta & 0b11
        meta_2[:, :, 1] = (meta >> 2) & 0b11
        meta_2[:, :, 2] = (meta >> 4) & 0b11
        meta_2[:, :, 3] = (meta >> 6) & 0b11
        meta_2[:, :, 4] = (meta >> 8) & 0b11
        meta_2[:, :, 5] = (meta >> 10) & 0b11
        meta_2[:, :, 6] = (meta >> 12) & 0b11
        meta_2[:, :, 7] = (meta >> 14) & 0b11
    elif quadbits_per_meta_elem == 8:
        meta_2[:, :, 0] = meta & 0b11
        meta_2[:, :, 1] = (meta >> 2) & 0b11
        meta_2[:, :, 2] = (meta >> 4) & 0b11
        meta_2[:, :, 3] = (meta >> 6) & 0b11
        meta_2[:, :, 4] = (meta >> 8) & 0b11
        meta_2[:, :, 5] = (meta >> 10) & 0b11
        meta_2[:, :, 6] = (meta >> 12) & 0b11
        meta_2[:, :, 7] = (meta >> 14) & 0b11
        meta_2[:, :, 8] = (meta >> 16) & 0b11
        meta_2[:, :, 9] = (meta >> 18) & 0b11
        meta_2[:, :, 10] = (meta >> 20) & 0b11
        meta_2[:, :, 11] = (meta >> 22) & 0b11
        meta_2[:, :, 12] = (meta >> 24) & 0b11
        meta_2[:, :, 13] = (meta >> 26) & 0b11
        meta_2[:, :, 14] = (meta >> 28) & 0b11
        meta_2[:, :, 15] = (meta >> 30) & 0b11

    dense_offsets = meta_2.view(-1) + (
        torch.arange(0, 2 * m * k // ksparse, device=device) * 4).view(
            -1, 1).repeat(1, 2).view(-1)

    dense = torch.zeros((m * 2 * k, ), dtype=sparse.dtype, device=device)
    if sparse.dtype != torch.float:
        # dense.scatter_(0, dense_offsets, sparse.view(-1))
        dense.scatter_(0, dense_offsets, sparse.reshape(-1))
    else:
        dense.view(torch.half).scatter_(0, dense_offsets,
                                        sparse.view(torch.half).view(-1))

    return dense.view(m, 2 * k)


def mask_creator(tensor):
    """
    Class for creating N:M sparsity masks.
    Masks will be created using the N:M ratio, where for every block of 
    M weights, N will be pruned based on ranked weight value. Each mask 
    will correspond to the given tensor.

    :param N: The number of weights in a group to keep
    :param M: The size of a weight group
    """
    N = 2
    M = 4

    mask = None
    # for i, tensor in enumerate(tensors):
    if tensor.numel() % M != 0:
        raise ValueError(
            f"Tensor of size {tensor.shape} can't be evenly divided into "
            f"{M} groups")

    num_groups = tensor.numel() // M

    # N:M sparsity for linear layers
    tensor_temp = tensor.detach().abs().reshape(num_groups, M)
    index = torch.argsort(tensor_temp, dim=1)[:, :int(M - N)]

    w_b = torch.ones(tensor_temp.shape, device=tensor_temp.device)
    mask = w_b.scatter_(dim=1, index=index, value=0).reshape(tensor.shape)

    return mask


================================================
FILE: kt-sft/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/marlin_24_perms.py
================================================
"""This file is used for /tests and /benchmarks"""
from typing import Dict, List

import numpy
import torch


# Precompute permutations for Marlin24 weight and scale shuffling # noqa: E501
#
# Marlin works on [16*2,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501
# with the tensor-core format that is described here:
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
#
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
# (without the need to use ldmatrix instructions) # noqa: E501
def get_perms_24(num_bits: int):
    perm_list: List[int] = []
    for i in range(32):
        perm1: List[int] = []
        col = i // 4
        col_o = col // 2
        for block in [0, 1]:
            for row in [
                    2 * (i % 4),
                    2 * (i % 4) + 1,
                    2 * (i % 4 + 4),
                    2 * (i % 4 + 4) + 1,
            ]:
                perm1.append(16 * row + col_o * 256 + 8 * (col % 2) +
                             4 * block)
        for j in range(4):
            perm_list.extend([p + 1 * j for p in perm1])
    perm = numpy.array(perm_list)

    if num_bits == 4:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    elif num_bits == 8:
        interleave = numpy.array([0, 2, 1, 3])
    else:
        raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits))

    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
    perm = torch.from_numpy(perm)
    scale_perm: List[int] = []
    for i in range(8):
        scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]])
    scale_perm_single: List[int] = []
    for i in range(8):
        scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]])
    return perm, scale_perm, scale_perm_single


marlin_24_perm: Dict[int, torch.Tensor] = {}
marlin_24_scale_perm: Dict[int, List[int]] = {}
marlin_24_scale_perm_single: Dict[int, List[int]] = {}
for num_bits in [4, 8]:
    perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits)
    marlin_24_perm[num_bits] = perm_24
    marlin_24_scale_perm[num_bits] = scale_perm_24
    marlin_24_scale_perm_single[num_bits] = scale_perm_single_24


================================================
FILE: kt-sft/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/marlin_perms.py
================================================
"""This file is used for /tests and /benchmarks"""
from typing import Dict, List

import numpy
import torch


# Precompute permutations for Marlin weight and scale shuffling # noqa: E501
#
# Marlin works on [16,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501
# with the tensor-core format that is described here:
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
#
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
# (without the need to use ldmatrix instructions) # noqa: E501
def get_perms(num_bits: int):
    perm_list: List[int] = []
    for i in range(32):
        perm1: List[int] = []
        col = i // 4
        for block in [0, 1]:
            for row in [
                    2 * (i % 4),
                    2 * (i % 4) + 1,
                    2 * (i % 4 + 4),
                    2 * (i % 4 + 4) + 1,
            ]:
                perm1.append(16 * row + col + 8 * block)
        for j in range(4):
            perm_list.extend([p + 256 * j for p in perm1])

    perm = numpy.array(perm_list)

    if num_bits == 4:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    elif num_bits == 8:
        interleave = numpy.array([0, 2, 1, 3])
    else:
        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))

    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
    perm = torch.from_numpy(perm)
    scale_perm: List[int] = []
    for i in range(8):
        scale_perm.extend([i + 8 * j for j in range(8)])
    scale_perm_single: List[int] = []
    for i in range(4):
        scale_perm_single.extend(
            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
    return perm, scale_perm, scale_perm_single


marlin_perm: Dict[int, torch.Tensor] = {}
marlin_scale_perm: Dict[int, List[int]] = {}
marlin_scale_perm_single: Dict[int, List[int]] = {}
for num_bits in [4, 8]:
    perm, scale_perm, scale_perm_single = get_perms(num_bits)
    marlin_perm[num_bits] = perm
    marlin_scale_perm[num_bits] = scale_perm
    marlin_scale_perm_single[num_bits] = scale_perm_single


================================================
FILE: kt-sft/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/marlin_utils.py
================================================
"""This file is used for /tests and /benchmarks"""
import random

import numpy
import torch

from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.format_24 import (
    mask_creator, sparse_semi_structured_from_dense_cutlass)
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_24_perms import (
    marlin_24_perm, marlin_24_scale_perm, marlin_24_scale_perm_single)
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_perms import (
    marlin_perm, marlin_scale_perm, marlin_scale_perm_single)
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.quant_utils import (
    get_pack_factor, quantize_weights, sort_weights)

__cuda_arch = torch.cuda.get_device_capability()

MARLIN_TILE = 16

GPTQ_MARLIN_TILE = 16
GPTQ_MARLIN_MIN_THREAD_N = 64
GPTQ_MARLIN_MIN_THREAD_K = 128
GPTQ_MARLIN_MAX_PARALLEL = 16

GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4, 8]
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
GPTQ_MARLIN_SUPPORTED_SYM = [True]

def is_marlin_supported():
    return __cuda_arch[0] >= 8


def marlin_permute_weights(q_w, size_k, size_n, perm, tile=MARLIN_TILE):
    assert q_w.shape == (size_k, size_n)
    assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}"
    assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}"

    # Permute weights to 16x64 marlin tiles
    q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile))
    q_w = q_w.permute((0, 2, 1, 3))
    q_w = q_w.reshape((size_k // tile, size_n * tile))

    q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape)

    return q_w


def marlin_weights(q_w, size_k, size_n, num_bits, perm):
    # Permute
    q_w = marlin_permute_weights(q_w, size_k, size_n, perm)

    # Pack
    pack_factor = get_pack_factor(num_bits)
    orig_device = q_w.device

    q_w = q_w.cpu().numpy().astype(numpy.uint32)

    q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor),
                           dtype=numpy.uint32)
    for i in range(pack_factor):
        q_packed |= q_w[:, i::pack_factor] << num_bits * i

    q_packed = torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device)

    return q_packed


def marlin_permute_scales(s, size_k, size_n, group_size, scale_perm,
                          scale_perm_single):
    if group_size < size_k and group_size != -1:
        s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
    else:
        s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
    s = s.reshape((-1, size_n)).contiguous()

    return s


def marlin_quantize(
    w: torch.Tensor,
    num_bits: int,
    group_size: int,
    act_order: bool,
):
    size_k, size_n = w.shape

    # Normalize group_size
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    # Quantize (and apply act_order if provided)
    q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size,
                                                       act_order)

    # For act_order, sort the "weights" and "g_idx" so that group ids are
    # increasing
    sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
    if act_order:
        q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)

    # Reformat to marlin
    marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits,
                                marlin_perm[num_bits])
    marlin_s = marlin_permute_scales(s, size_k, size_n, group_size,
                                     marlin_scale_perm[num_bits],
                                     marlin_scale_perm_single[num_bits])

    # Create result
    res_list = [marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
    for i in range(len(res_list)):
        res_list[i] = res_list[i].to(w.device)

    return res_list


def vllm_marlin_quantize(
    w: torch.Tensor,
    num_bits: int,
    group_size: int,
    act_order: bool,
):
    size_k, size_n = w.shape

    # Normalize group_size
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    # Quantize (and apply act_order if provided)
    w_ref, q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size,
                                                       act_order)

    # For act_order, sort the "weights" and "g_idx" so that group ids are
    # increasing
    sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
    if act_order:
        q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)

    # Reformat to marlin
    marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits,
                                marlin_perm[num_bits])
    marlin_s = marlin_permute_scales(s, size_k, size_n, group_size,
                                     marlin_scale_perm[num_bits],
                                     marlin_scale_perm_single[num_bits])

    # Create result
    res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
    for i in range(len(res_list)):
        res_list[i] = res_list[i].to(w.device)

    return res_list


def inject_24(w, size_k, size_n):
    assert w.shape == (size_k, size_n)

    mask = mask_creator(w.t()).t().cuda().bool()

    return (mask * w).contiguous(), mask.contiguous()


def check_24(w, num_rows_to_sample=50, _verbose=False):
    BLOCK_SIZE = 4
    MAX_NON_ZEROS = 2

    w = w.t().contiguous()

    print("check_24: w.shape = {}".format(w.shape))

    num_rows, num_cols = w.shape
    sampled_row_idxs = random.choices(range(num_rows), k=num_rows_to_sample)
    if _verbose:
        print(f"Sampled row idxs = {sampled_row_idxs}")

    total_segments = 0
    non_24_segments = 0
    for i in sampled_row_idxs:
        for j in range(0, num_cols - BLOCK_SIZE, BLOCK_SIZE):
            total_segments += 1
            block = w[i, j:j + BLOCK_SIZE]
            num_nonzero = torch.count_nonzero(block)
            if num_nonzero > MAX_NON_ZEROS:
                print("i = {} j = {} block = {}".format(i, j, block))
                non_24_segments += 1

    print(f"{non_24_segments} / {total_segments} do not have 2:4 structure.")


def compress_quantized_24_weight(q_24, size_k, size_n, num_bits):
    assert q_24.shape == (size_k, size_n)

    # Remove zp to normalize over 0
    max_q_val = (1 << num_bits) - 1
    zp = (max_q_val + 1) // 2
    q_24_no_zp = q_24 - zp

    # Compress
    q_24_no_zp = q_24_no_zp.t().contiguous()
    q_24_no_zp_comp, meta = sparse_semi_structured_from_dense_cutlass(
        q_24_no_zp)
    q_24_no_zp_comp = q_24_no_zp_comp.t().contiguous()

    # Restore zp
    q_24_comp = q_24_no_zp_comp + zp

    # Resize meta to its actual shape (without moving any data)
    meta = meta.resize_(meta.shape[1] // 2, meta.shape[0] * 2)

    return q_24_comp, meta


def marlin_24_quantize(
    w: torch.Tensor,
    num_bits: int,
    group_size: int,
):
    size_k, size_n = w.shape

    # Normalize group_size
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    # Inject 2:4 sparsity
    w_24, mask_24 = inject_24(w, size_k, size_n)

    # Quantize
    w_24_ref, q_w_24, s, g_idx, rand_perm = quantize_weights(w_24,
                                                             num_bits,
                                                             group_size,
                                                             act_order=False)

    # Compress quantized weight
    q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n,
                                                     num_bits)
    size_k_comp = size_k // 2

    # Reformat to marlin
    marlin_24_q_w_comp = marlin_weights(q_w_24_comp, size_k_comp, size_n,
                                        num_bits, marlin_24_perm[num_bits])
    marlin_24_s = marlin_permute_scales(s, size_k, size_n, group_size,
                                        marlin_24_scale_perm[num_bits],
                                        marlin_24_scale_perm_single[num_bits])

    # Create result
    res_list = [w_24_ref, marlin_24_q_w_comp, meta, marlin_24_s]
    for i in range(len(res_list)):
        res_list[i] = res_list[i].to(w.device)

    return res_list


def compute_max_diff(output, output_ref):
    return torch.mean(torch.abs(output - output_ref)) / torch.mean(
        torch.abs(output_ref))


class MarlinWorkspace:

    def __init__(self, out_features, min_thread_n, max_parallel, device):
        assert (out_features % min_thread_n == 0), (
            "out_features = {} is undivisible by min_thread_n = {}".format(
                out_features, min_thread_n))

        max_workspace_size = ((out_features // min_thread_n) * max_parallel)

        self.scratch = torch.zeros(max_workspace_size,
                                   dtype=torch.int,
                                   device=device)


================================================
FILE: kt-sft/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/quant_utils.py
================================================
"""This file is used for /tests and /benchmarks"""
import numpy
import torch

SUPPORTED_NUM_BITS = [4, 8]
SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]


def get_pack_factor(num_bits):
    assert num_bits in SUPPORTED_NUM_BITS, f"Unsupported num_bits = {num_bits}"
    return 32 // num_bits


def permute_rows(q_w: torch.Tensor, group_size: int):

    orig_device = q_w.device
    k_size, _ = q_w.shape

    g_idx = torch.zeros((k_size, ), dtype=torch.int32)
    for i in range(k_size):
        g_idx[i] = i // group_size

    # Simulate act_order by doing a random permutation on K
    rand_perm = torch.randperm(k_size)

    g_idx = g_idx[rand_perm].contiguous()
    q_w = q_w[rand_perm, :].contiguous()

    return (
        q_w.to(device=orig_device),
        g_idx.to(device=orig_device),
        rand_perm.to(device=orig_device),
    )


def quantize_weights(w: torch.Tensor, num_bits: int, group_size: int,
                     act_order: bool):
    orig_device = w.device
    size_k, size_n = w.shape

    assert w.is_floating_point(), "w must be float"
    assert num_bits in SUPPORTED_NUM_BITS, f"Unsupported num_bits = {num_bits}"
    assert group_size in SUPPORTED_GROUP_SIZES + [
        size_k
    ], f"Unsupported groupsize = {group_size}"

    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    max_q_val = 2**num_bits - 1
    half_q_val = (max_q_val + 1) // 2

    # Reshape to [groupsize, -1]
    if group_size < size_k:
        w = w.view((-1, group_size, size_n))
        w = w.permute(1, 0, 2)
        w = w.reshape((group_size, -1))

    # Compute scale for each group
    s = torch.max(torch.abs(w), 0, keepdim=True)[0]
    s *= 2 / max_q_val  # 2 => symmetric

    # Quantize
    q_w = torch.round(w / s).int()
    q_w += half_q_val
    q_w = torch.clamp(q_w, 0, max_q_val)

    # Restore original shapes
    if group_size < size_k:

        def reshape_w(w):
            w = w.reshape((group_size, -1, size_n))
            w = w.permute(1, 0, 2)
            w = w.reshape((size_k, size_n)).contiguous()
            return w

        q_w = reshape_w(q_w)

    s = s.reshape((-1, size_n)).contiguous()

    # Apply act_order
    g_idx = torch.empty(0, dtype=torch.int, device=w.device)
    rand_perm = torch.empty(0, dtype=torch.int, device=w.device)
    if act_order:
        assert (
            group_size < size_k
        ), "For act_order, groupsize = {} must be less than size_k = {}".format(
            group_size, size_k)

        q_w, g_idx, rand_perm = permute_rows(q_w, group_size)

    return (
        q_w.to(device=orig_device),
        s.to(device=orig_device),
        g_idx.to(device=orig_device),
        rand_perm.to(device=orig_device),
    )


def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor):
    orig_device = q_w.device

    sort_indices = torch.argsort(g_idx).to(
        dtype=torch.int32)  # Sort based on g_idx

    g_idx = g_idx[sort_indices].contiguous()
    q_w = q_w[sort_indices, :].contiguous()

    return (
        q_w.to(device=orig_device),
        g_idx.to(device=orig_device),
        sort_indices.to(device=orig_device),
    )


def gptq_pack(
    q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    assert q_w.shape == (size_k, size_n)

    pack_factor = get_pack_factor(num_bits)
    assert size_k % pack_factor == 0

    orig_device = q_w.device

    q_w = q_w.cpu().numpy().astype(numpy.uint32)

    q_res = numpy.zeros((size_k // pack_factor, size_n), dtype=numpy.uint32)

    for i in range(pack_factor):
        q_res |= q_w[i::pack_factor, :] << num_bits * i

    q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
    return q_res


================================================
FILE: kt-sft/ktransformers/ktransformers_ext/triton/fp8gemm.py
================================================
# Adopted from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py
from typing import Tuple

import torch
import triton
import triton.language as tl
from triton import Config


@triton.jit
def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
    """
    Quantizes the input tensor `x_ptr` and stores the result in `y_ptr` and the scaling factor in `s_ptr`.

    Args:
        x_ptr (triton.Pointer): Pointer to the input tensor.
        y_ptr (triton.Pointer): Pointer to the output tensor where quantized values will be stored.
        s_ptr (triton.Pointer): Pointer to the output tensor where scaling factors will be stored.
        BLOCK_SIZE (tl.constexpr): The size of the block to be processed by each program instance.

    Returns:
        None
    """
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    x = tl.load(x_ptr + offs).to(tl.float32)
    s = tl.max(tl.abs(x)) / 448.
    y = x / s
    y = y.to(y_ptr.dtype.element_ty)
    tl.store(y_ptr + offs, y)
    tl.store(s_ptr + pid, s)


def act_quant(x: torch.Tensor, block_size: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Quantizes the input tensor `x` using block-wise quantization.

    Args:
        x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
        block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - The quantized tensor with dtype `torch.float8_e4m3fn`.
            - A tensor of scaling factors with dtype `torch.float32`.
    """
    assert x.is_contiguous(), 'Input tensor must be contiguous'
    assert x.size(-1) % block_size == 0, f'Last dimension size must be divisible by block_size (block_size={block_size})'
    y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
    s = x.new_empty(*x.size()[:-1], x.size(-1) // block_size, dtype=torch.float32)
    grid = lambda meta: (triton.cdiv(x.numel(), meta['BLOCK_SIZE']), )
    act_quant_kernel[grid](x, y, s, BLOCK_SIZE=block_size)
    return y, s


@triton.jit
def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr):
    """
    Dequantizes weights using the provided scaling factors and stores the result.

    Args:
        x_ptr (tl.pointer): Pointer to the quantized weights.
        s_ptr (tl.pointer): Pointer to the scaling factors.
        y_ptr (tl.pointer): Pointer to the output buffer for dequantized weights.
        M (int): Number of rows in the weight matrix.
        N (int): Number of columns in the weight matrix.
        BLOCK_SIZE (tl.constexpr): Size of the block for tiling.

    Returns:
        None
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    n = tl.cdiv(N, BLOCK_SIZE)
    offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs = offs_m[:, None] * N + offs_n[None, :]
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    x = tl.load(x_ptr + offs, mask=mask).to(tl.float32)
    s = tl.load(s_ptr + pid_m * n + pid_n)
    y = x * s
    tl.store(y_ptr + offs, y, mask=mask)


def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor:
    """
    Dequantizes the given weight tensor using the provided scale tensor.

    Args:
        x (torch.Tensor): The quantized weight tensor of shape (M, N).
        s (torch.Tensor): The scale tensor of shape (M, N).
        block_size (int, optional): The block size to use for dequantization. Defaults to 128.

    Returns:
        torch.Tensor: The dequantized weight tensor of the same shape as `x`.

    Raises:
        AssertionError: If `x` or `s` are not contiguous or if their dimensions are not 2.
    """
    assert x.is_contiguous() and s.is_contiguous(), 'Input tensors must be contiguous'
    assert x.dim() == 2 and s.dim() == 2, 'Input tensors must have 2 dimensions'
    M, N = x.size()
    y = torch.empty_like(x, dtype=torch.get_default_dtype())
    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE']), triton.cdiv(N, meta['BLOCK_SIZE']))
    with torch.cuda.device(x.device):
        weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size)
    return y


fp8_gemm_configs = [
    Config({'BLOCK_SIZE_M': block_m, 'BLOCK_SIZE_N': block_n, 'BLOCK_SIZE_K': 128}, num_stages=num_stages, num_warps=8)
    for block_m in [16, 32, 64] for block_n in [32, 64, 128] for num_stages in [3, 4, 5, 6]
]

@triton.autotune(configs=fp8_gemm_configs, key=['N', 'K'])
@triton.jit
def fp8_gemm_kernel(a_ptr, b_ptr, c_ptr,
                    a_s_ptr, b_s_ptr,
                    M, N: tl.constexpr, K: tl.constexpr,
                    BLOCK_SIZE_M: tl.constexpr,
                    BLOCK_SIZE_N: tl.constexpr,
                    BLOCK_SIZE_K: tl.constexpr):
    """
    Performs a matrix multiplication operation on FP8 matrices with scaling factors.

    Args:
        a_ptr (tl.tensor): Pointer to the first input matrix A.
        b_ptr (tl.tensor): Pointer to the second input matrix B.
        c_ptr (tl.tensor): Pointer to the output matrix C.
        a_s_ptr (tl.tensor): Pointer to the scaling factors for matrix A.
        b_s_ptr (tl.tensor): Pointer to the scaling factors for matrix B.
        M (int): Number of rows in matrix A and C.
        N (tl.constexpr): Number of columns in matrix B and C.
        K (tl.constexpr): Number of columns in matrix A and rows in matrix B.
        BLOCK_SIZE_M (tl.constexpr): Block size for the M dimension.
        BLOCK_SIZE_N (tl.constexpr): Block size for the N dimension.
        BLOCK_SIZE_K (tl.constexpr): Block size for the K dimension.

    Returns:
        None
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    k = tl.cdiv(K, BLOCK_SIZE_K)
    offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = a_ptr + offs_m[:, None] * K + offs_k[None, :]
    b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
    a_s_ptrs = a_s_ptr + offs_m * k
    b_s_ptrs = b_s_ptr + (offs_n // BLOCK_SIZE_K) * k

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for i in range(k):
        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0.0)
        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0.0)
        a_s = tl.load(a_s_ptrs)
        b_s = tl.load(b_s_ptrs)
        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
        a_ptrs += BLOCK_SIZE_K
        b_ptrs += BLOCK_SIZE_K
        a_s_ptrs += 1
        b_s_ptrs += 1
    c = accumulator.to(c_ptr.dtype.element_ty)
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + offs_m[:, None] * N + offs_n[None, :]
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tl.store(c_ptrs, c, mask=mask)


def fp8_gemm(a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor):
    """
    Perform a matrix multiplication using FP8 precision.

    Args:
        a (torch.Tensor): The first input matrix, must be contiguous.
        a_s (torch.Tensor): The scaling factor for the first input matrix, must be contiguous.
        b (torch.Tensor): The second input matrix, must be contiguous.
        b_s (torch.Tensor): The scaling factor for the second input matrix, must be contiguous.

    Returns:
        torch.Tensor: The result of the matrix multiplication.
    """
    assert a.is_contiguous() and b.is_contiguous(), 'Input tensors must be contiguous'
    assert a_s.is_contiguous() and b_s.is_contiguous(), 'Scaling factor tensors must be contiguous'
    K = a.size(-1)
    M = a.numel() // K
    N = b.size(0)
    c = a.new_empty(*a.size()[:-1], N, dtype=torch.get_default_dtype())
    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']), triton.cdiv(N, META['BLOCK_SIZE_N']))
    fp8_gemm_kernel[grid](a, b, c, a_s, b_s, M, N, K)
    return c

================================================
FILE: kt-sft/ktransformers/local_chat.py
================================================
"""
Description  :  
Author       : Boxin Zhang, Azure-Tang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""

import os
import platform
import sys

project_dir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, project_dir)
import argparse
import torch
import logging
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GenerationConfig,
    TextStreamer,
    EvalPrediction,
)
import json
from pathlib import Path
from tqdm import tqdm
from torchviz import make_dot
import fire
from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
from ktransformers.models.modeling_llama import LlamaForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
from ktransformers.util.utils import load_weights, prefill_and_generate, prefill_and_generate_capture, get_compute_capability, xpu_fp16_model
from ktransformers.server.config.config import Config
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
from ktransformers.util.vendors import device_manager, get_device, to_device, GPUVendor
from ktransformers.sft.lora import inject_lora_layer, lora_and_load_adapter
from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader
from ktransformers.util.globals import GLOBAL_CONFIG
from ktransformers.sft.metrics import ComputeSimilarity
from ktransformers.sft.monkey_patch_torch_module import install_patch, restore_patch

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

# for debug
def print_module_tree(module, indent=0):
    print(" " + f"{module.__class__.__name__}(training={module.training})")
    for name, child in module.named_children():
        print(" " + f"└─{name}: ", end="")
        print_module_tree(child, indent + 4)

# for debug
def write_to_file(content, file_path: str = 'ktransformers/mark_content.txt', mode: str = 'a', encoding: str = 'utf-8') -> None:
    """
    将字符串写入指定文件 
    :param content: 要写入的字符串内容 
    :param file_path: 目标文件路径 
    :param mode: 文件打开模式（默认'w'为覆盖写入，可选'a'追加写入）
    :param encoding: 文件编码（默认utf-8）
    """
    with open(file_path, mode, encoding=encoding) as f:
        f.write(content) 

custom_models = {
    "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
    "DeepseekV3ForCausalLM": DeepseekV3ForCausalLM,
    "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
    "LlamaForCausalLM": LlamaForCausalLM,
    "MixtralForCausalLM": MixtralForCausalLM,
}

ktransformer_rules_dir = (
    os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
)
default_optimize_rules = {
    "DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
    "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat.yaml",
    "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
    "LlamaForCausalLM": ktransformer_rules_dir + "Internlm2_5-7b-Chat-1m.yaml",
    "MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",
}


def local_chat(
    model_path: str | None = None,
    model_config_path: str | None = None,
    optimize_config_path: str = None,
    gguf_path: str | None = None,
    max_new_tokens: int = 1000,
    cpu_infer: int = Config().cpu_infer,
    use_cuda_graph: bool = True, # modify to false if using KExpertsTorch
    prompt_file : str | None = None,
    mode: str = "normal",
    force_think: bool = False,
    chunk_size: int = 8192,
    device: str = "cuda",
    is_sft: bool = False,
    sft_data_path: str | None = None,
    save_adapter_path: str | None = None,
    use_adapter: bool = False,
    use_adapter_path: str | None = None,
    is_test_data: bool = False,
    test_data_path: str | None = None,
    output_dir: str | None = None,
):

    if not is_sft:
        torch.set_grad_enabled(False)
        
    if is_sft == True or use_adapter == True:
        GLOBAL_CONFIG._config["mod"] = "sft"
    else:
        GLOBAL_CONFIG._config["mod"] = "infer"

    Config().cpu_infer = cpu_infer
    Config().chunk_size = chunk_size
    if torch.xpu.is_available():
        use_cuda_graph = False

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    if model_config_path == None:
        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    else:
        config = AutoConfig.from_pretrained(model_config_path, trust_remote_code=True)
    if mode == 'long_context':
        assert config.architectures[0] == "LlamaForCausalLM", "only LlamaForCausalLM support long_context mode"
        torch.set_default_dtype(torch.float16)
    elif xpu_fp16_model(config):
        torch.set_default_dtype(torch.float16)
    else:
        torch.set_default_dtype(config.torch_dtype)

    with torch.device("meta"):
        if config.architectures[0] in custom_models:
            print("using custom modeling_xxx.py.")
            if (
                "Qwen2Moe" in config.architectures[0]
            ):  # Qwen2Moe must use flash_attention_2 to avoid overflow.
                config._attn_implementation = "flash_attention_2"
            if "Llama" in config.architectures[0]:
                config._attn_implementation = "eager"
            if "Mixtral" in config.architectures[0]:
                config._attn_implementation = "flash_attention_2"
            if torch.xpu.is_available():
                config._attn_implementation = "eager"
            model = custom_models[config.architectures[0]](config)
        else:
            if torch.xpu.is_available():
                attn_implementation = "eager"
            else:
                attn_implementation = "flash_attention_2"
            model = AutoModelForCausalLM.from_config(
                config, trust_remote_code=True, attn_implementation=attn_implementation
            )

    if optimize_config_path is None:
        if config.architectures[0] in default_optimize_rules:
            print("using default_optimize_rule for", config.architectures[0])
            optimize_config_path = default_optimize_rules[config.architectures[0]]
        else:
            optimize_config_path = input(
                "please input the path of your rule file(yaml file containing optimize rules):"
            )

    if gguf_path is None:
        gguf_path = input(
            "please input the path of your gguf file(gguf file in the dir containing input gguf file must all belong to current model):"
        )
        
    GLOBAL_CONFIG._config["mod"] = "infer"
    optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)

    model.train()

    if is_sft == True:
        if use_adapter == True or is_test_data == True:
            raise AttributeError("We do not support to run sft and inference at the same time.")
        GLOBAL_CONFIG._config["mod"] = "sft"
        print(f"sft with lora in dataset: {sft_data_path} ...")
        print(f"use_cuda_graph:{use_cuda_graph}")
        lora_and_load_adapter(model, tokenizer, sft_data_path, save_adapter_path)

    if use_adapter == True:
        GLOBAL_CONFIG._config["mod"] = "sft"
        if is_sft == True:
            raise AttributeError("We do not support more than one adapter up to now...")
        
        if use_adapter_path.endswith('.gguf'):
            inject_lora_layer(model, use_adapter_path)
            adapter_gguf_loader = GGUFLoader(use_adapter_path)
            load_weights(model, adapter_gguf_loader, adapter_gguf=True)
            model.train()
        else:
            inject_lora_layer(model, use_adapter_path)
            
            adapter_loader = SafeTensorLoader(use_adapter_path)
            device = next(model.parameters()).device
            
            # for name, param in model.named_parameters():
            #     print(name, param.shape)

            for key in adapter_loader.tensor_file_map.keys():
                try:
                    tensor = adapter_loader.load_tensor(key, device=device)
                    
                    model_key = key.replace("base_model.model.", "")
                    model_key = model_key.replace(".weight", ".default.weight")
                    
                    param = model.get_parameter(model_key)
                    param.data.copy_(tensor.data)
                    
                    print(f"Loaded adapter weight: {key} -> {model_key}")
                except AttributeError as e:
                    print(f"Skipping {key}: not a model parameter")
                except KeyError as e:
                    print(f"Key not found in model: {model_key} (original: {key})")
            

    try:
        model.generation_config = GenerationConfig.from_pretrained(model_path)
    except Exception as e:
        print(f"generation config can't auto create, make default. Message: {e}")
        gen_config = GenerationConfig(
            temperature=0.6,
            top_p=0.95,
            do_sample=True
        )
        model.generation_config = gen_config
    # model.generation_config = GenerationConfig.from_pretrained(model_path)
    if model.generation_config.pad_token_id is None:
        model.generation_config.pad_token_id = model.generation_config.eos_token_id
    model.eval()
    logging.basicConfig(level=logging.INFO)
    
    # @torch.no_grad()
    # def first_token_argmax_baseline(model, tokenizer, prompt_text, device):
    #     model.eval()
    #     enc = tokenizer.apply_chat_template([{"role":"user","content":prompt_text}],
    #                                         add_generation_prompt=True, return_tensors="pt")
    #     x = enc.to(device)
    #     logits = model(input_ids=x, use_cache=False, return_dict=False)[0]
    #     return int(torch.argmax(logits[:, -1, :], dim=-1)[0])

    # try:
    #     device_map = model.gguf_loader.tensor_device_map
    #     from ktransformers.util.utils import get_device, torch_device_mapping
    #     torch_device = get_device('model.layers.0.self_attn', device_map)
    #     torch_device = torch_device_mapping.get(torch_device, torch_device)
    #     print(f"[FIRST-TOKEN PROBE] argmax id = {probe_id} ({tokenizer.decode([probe_id])!r})")
    # except Exception as e:
    #     print("[FIRST-TOKEN PROBE] failed:", e)
    #     return

    system = platform.system()
    # for debug
    # if system == "Windows":
    #     os.system("cls")
    # else:
    #     os.system("clear")
    
    if GLOBAL_CONFIG._config["mod"] == "sft" :
        model.model.embed_tokens.to("cpu")
        
    if is_test_data:
        data_path = Path(test_data_path)
        with data_path.open("r", encoding="utf-8") as f:
            dataset = json.load(f)
        preds, refs = [], []

        for sample in tqdm(dataset, desc="Processing samples"):
            inst = sample.get("instruction", "")
            prompt = sample.get("input", "")
            prompt = prompt+inst
            # print(f"prompt: {prompt}")
            label = sample.get("output", "")
   
            messages = [{"role": "user", "content": prompt}]
            input_tensor = tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            )
            if force_think:
                token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
                input_tensor = torch.cat(
                    [input_tensor, token_thinks], dim=1
                )
            if mode == 'long_context':
                assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
                "please change max_seq_len in  ~/.ktransformers/config.yaml"

            if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8 and device_manager.gpu_vendor == GPUVendor.NVIDIA:
                prediction = prefill_and_generate_capture(
                    model, tokenizer, input_tensor.to(device), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_size,
                    use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim, echo_stream=False
                )
            else:
                prediction = prefill_and_generate_capture(
                    model, tokenizer, input_tensor.to(device), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_size,echo_stream=False,
                )
            # print(f"prediction:{prediction}")
            sample["label"] = label
            sample["prediction"] = prediction
            sample.pop("output", None)

            preds.append(prediction)
            refs.append(label)

        pred_file = Path(output_dir) / 'predictions.json'
        pred_file.parent.mkdir(parents=True, exist_ok=True)
        
        with pred_file.open("w", encoding="utf-8") as f:
            json.dump(dataset, f, ensure_ascii=False, indent=2)

        compute_metrics = ComputeSimilarity(tokenizer)
        # print(f"metrics:{metrics}")
        
        enc_pred = tokenizer(preds, add_special_tokens=False, padding=True, return_tensors="np")
        enc_ref  = tokenizer(refs,  add_special_tokens=False, padding=True, return_tensors="np")

        ep = EvalPrediction(
            predictions=enc_pred["input_ids"],
            label_ids=enc_ref["input_ids"]
        )

        metrics = compute_metrics(ep, compute_result=True)

        metric_file = Path(output_dir) / 'metrics.json'
        with metric_file.open("w", encoding="utf-8") as f:
            json.dump(metrics, f, ensure_ascii=False, indent=2)
            
        print(f"Results of predictions saved in {pred_file}")
        print(f"Results of metrics saved in {metric_file}")

    while not is_test_data:
        GLOBAL_CONFIG._config["mod"] = "infer"
        content = input("Chat: ")
        if content.startswith('"""'):  # prefix """
            # multi lines input
            content = content[3:] + "\n"
            while True:
                line = input("")
                if line.endswith('"""'):
                    # end multi lines input
                    line = line[:-3]  # suffix """
                    if line:
                        content += line + "\n"
                    break
                else:
                    content += line + "\n"

        if content == "":
            if prompt_file != None:
                content = open(prompt_file, "r").read()
            else:
                content = "Please write a piece of quicksort code in C++."
        elif os.path.isfile(content):
            content = open(content, "r").read()
            
        messages = [{"role": "user", "content": content}]
        input_tensor = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt"
        )
        if force_think:
            token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
            input_tensor = torch.cat(
                [input_tensor, token_thinks], dim=1
            )
        if mode == 'long_context':
            assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
            "please change max_seq_len in  ~/.ktransformers/config.yaml"

        if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8 and device_manager.gpu_vendor == GPUVendor.NVIDIA:
            generated = prefill_and_generate(
                model, tokenizer, input_tensor.to(device), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_size,
                use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim
            )
        else:
            generated = prefill_and_generate(
                model, tokenizer, input_tensor.to(device), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_size,
            )


if __name__ == "__main__":
    install_patch()
    IS_DEBUG = True

    if IS_DEBUG == False:
        parser = argparse.ArgumentParser()

        parser.add_argument("--model_path", required=True)
        parser.add_argument("--model_config_path", default=None)
        parser.add_argument("--gguf_path", required=True)
        parser.add_argument("--cpu_infer", type=int, default=32)
        parser.add_argument("--max_new_tokens", type=int, default=1000)
        parser.add_argument("--force_think", action="store_true")
        parser.add_argument("--optimize_config_path", required=True)
        parser.add_argument("--is_sft", type=lambda x: x.lower() == "true", default=False)
        parser.add_argument("--sft_data_path", default=None)
        parser.add_argument("--save_adapter_path", default=None)
        parser.add_argument("--use_adapter", type=lambda x: x.lower() == "true", default=False)
        parser.add_argument("--use_adapter_path", default=None)
        parser.add_argument("--is_test_data", type=lambda x: x.lower() == "true", default=False)
        parser.add_argument("--test_data_path", default=None)
        parser.add_argument("--output_dir", default=None)

        args = parser.parse_args()

        local_chat(
            model_path=args.model_path,
            model_config_path=args.model_config_path,
            gguf_path=args.gguf_path,
            cpu_infer=args.cpu_infer,
            max_new_tokens=args.max_new_tokens,
            force_think=args.force_think,
            optimize_config_path=args.optimize_config_path,
            is_sft=args.is_sft,
            sft_data_path=args.sft_data_path,
            save_adapter_path=args.save_adapter_path,
            use_adapter=args.use_adapter,
            use_adapter_path=args.use_adapter_path,
            is_test_data=args.is_test_data,
            test_data_path=args.test_data_path,
            output_dir= args.output_dir
        )

    else:
        local_chat(
            # model_path="/mnt/data/data/DeepSeek-V3-671B-BF16",
            # model_config_path="/mnt/data/data/DeepSeek-V3-671B-BF16",
            # gguf_path="/mnt/data/data/DeepSeek-V3-671B-BF16",
            model_path="/mnt/data/models/DeepSeek-V2-Lite-Chat",
            model_config_path="/mnt/data/models/DeepSeek-V2-Lite-Chat",
            gguf_path="/mnt/data/models/DeepSeek-V2-Lite-Chat",
            cpu_infer=32,
            max_new_tokens=1000,
            force_think=False,
            # optimize_config_path="ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml",
            optimize_config_path="ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml",
            is_sft=True,
            sft_data_path="test_adapter/western_train.json",
            # sft_data_path="test_adapter/western_train.json",
            # sft_data_path="test_adapter/500token_test.json",
            save_adapter_path="/mnt/data/lpl/test_adapter/Kwhl_test_py312_torch28_DeepSeekV2_WEST",
            use_adapter=False,
            use_adapter_path="/mnt/data/lpl/test_adapter/Kllama_deepseekV2_AfriMed_mcq",
            is_test_data=False,
            test_data_path="/home/lpl/LLaMA-Factory-KT/data/mcq_test.json",
            output_dir="/mnt/data/lpl/test_adapter/Kllama_deepseekV2_AfriMed_mcq/baselines",
        )
        

================================================
FILE: kt-sft/ktransformers/local_chat.sh
================================================
#!/bin/bash

python3 ktransformers/local_chat.py \
    --model_path "/mnt/data/models/DeepSeek-V2-Lite-Chat" \
    --model_config_path "/mnt/data/models/DeepSeek-V2-Lite-Chat" \
    --gguf_path "/mnt/data/models/DeepSeek-V2-Lite-Chat" \
    --cpu_infer 32 \
    --max_new_tokens 1000 \
    --optimize_config_path "ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml" \
    --is_sft False \
    --sft_data_path "test_adapter/sft_translation.json" \
    --save_adapter_path "test_adapter/demo_adapter_KT_target_kv" \
    --use_adapter True \
    --use_adapter_path "/mnt/data/lpl/test_adapter/KT_newLoader_singleGPU_deepseekV2_Neko_AFS/checkpoint-566" \
    --is_test_data False \
    --test_data_path "test_adapter/demo_adapter_origin_target_kv" \
    --output_dir "test_adapter/demo_adapter_origin_target_kv" \

================================================
FILE: kt-sft/ktransformers/lora_test_module.py
================================================
import os
import platform
import sys

project_dir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, project_dir)

from torchviz import make_dot
from torch import nn
import torch
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GenerationConfig,
    TextStreamer,
)

from ktransformers.operators.linear import KLinearTorch, KTransformersLinear
from ktransformers.sft.peft_utils.lora_layer import KTransformersLinearLora
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.inference_state import InferenceState

import hiddenlayer as hl

gguf_loader = GGUFLoader(gguf_path="/home/yj/ktransformers/GGUF-DeepSeek-V2-Lite-Chat")
config = AutoConfig.from_pretrained("/home/yj/ktransformers/DeepSeek-V2-Lite-Chat", trust_remote_code=True)
torch.set_default_dtype(config.torch_dtype)

class TestModelLora(nn.Module):
    def __init__(self):
        super().__init__()

        random_linear_layer = nn.Linear(in_features=3072, out_features=2048, bias=False)
        
        orig_linear = KTransformersLinear(
            key='blk.0.attn_q',
            gguf_loader=gguf_loader,
            config=config,
            orig_module=random_linear_layer,
            generate_op="KLinearTorch"
        )
        self.layer = KTransformersLinearLora(
            orig_module=orig_linear,
            adapter_name="lora_test",
            r=8,
            lora_alpha=16
        )
        self.layer.generate_linear.weight = torch.randn(3072, 2048).to("cuda")
        
    def forward(self, x):
        return self.layer(x)
    
class TestModelBase(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = KTransformersLinear(
            key="linear",
            gguf_loader=gguf_loader, 
            config=config, 
            orig_module=nn.Linear(in_features=3072, out_features=2048, bias=False),
            generate_op="KLinearTorch"
        )
        # self.layer.generate_linear.weight = torch.randn(3072, 2048).to("cuda")
        weight = torch.randn(3072, 2048, device="cuda")
        self.layer.load(w=nn.Parameter(weight), mode = InferenceState.GENERATE)
        # self.layer.generate_linear.weight = nn.Parameter(torch.randn(3072, 2048).to("cuda"))
        self.fc1 = nn.Linear(3072, 2048, bias=False)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(2048, 3072, bias=False)
        # self.layer.load(mode=InferenceState.GENERATE)

    def forward(self, x):
        x = self.layer(x)
        # x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

class TestModelTorch(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = KLinearTorch(
            key="linear",
            gguf_loader=gguf_loader, 
            config=config, 
            orig_module=nn.Linear(in_features=3072, out_features=2048, bias=False)
        )
        # self.layer.weight = nn.Parameter(torch.randn(3072, 2048).to("cuda"))
        # self.layer.weight = torch.randn(3072, 2048).to("cuda")
        weight = torch.randn(3072, 2048, device="cuda")
        self.layer.load(w=nn.Parameter(weight), device="cuda")
        self.fc1 = nn.Linear(3072, 2048, bias=False)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(2048, 3072, bias=False)
        # self.layer.load(mode=InferenceState.GENERATE) 

    def forward(self, x):
        x = self.layer(x)
        # x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


# # KLinearTorch Well DONE for test!
# model = TestModelTorch()
# x = torch.randn(2048, 3072, requires_grad=True)
# out = model(x)
# make_dot(out, params=dict(model.named_parameters())).render("KTLinear_graph", format="svg")


# model = TestModelBase()
# x = torch.randn(2048, 3072, requires_grad=True)
# out = model(x)
# make_dot(out, params=dict(model.named_parameters())).render("base_graph", format="svg")

# MyConvNet_graph=hl.build_graph(model,torch.zeros(size=[2048, 3072]))
# MyConvNet_graph.theme=hl.graph.THEMES['blue'].copy()
# MyConvNet_graph.save(path='./base_graph.png',format='png')

# model = TestModelLora()
# x = torch.randn(2048, 3072, requires_grad=True)
# out = model(x)
# make_dot(out, params=dict(model.named_parameters())).render("lora_graph", format="svg")


from peft import LoraConfig, get_peft_model

class BaseModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(3072, 2048, bias=False)
    
    def forward(self, x):
        return self.linear(x)

model = BaseModel().to("cuda")

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["linear"],
    lora_dropout=0.0,
    bias="none",
)

peft_model = get_peft_model(model, lora_config)
print(peft_model)

x = torch.randn(2048, 3072, requires_grad=True).to("cuda")

out = peft_model(x)

dot = make_dot(out, 
             params=dict(peft_model.named_parameters()))

dot.render("origin_lora_graph", format="svg")

================================================
FILE: kt-sft/ktransformers/models/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/models/configuration_deepseek.py
================================================
# Adapted from
# https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628/blob/main/configuration_deepseek.py
# Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class DeepseekV2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate an DeepSeek
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the DeepSeek-V2.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 102400):
            Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`DeepseekV2Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1407):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        n_shared_experts (`int`, *optional*, defaults to None):
            Number of shared experts, None means dense model.
        n_routed_experts (`int`, *optional*, defaults to None):
            Number of routed experts, None means dense model.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor or routed experts.
        topk_method (`str`, *optional*, defaults to `gready`):
            Topk method used in routed gate.
        n_group (`int`, *optional*, defaults to None):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to None):
            Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to None):
            Number of selected experts, None means dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 0):
            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                            \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to False):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to 'softmax'):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux = (`bool`, *optional*, defaults to True):
            Whether to compute the auxiliary loss for each individual sample.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    ```python
    >>> from transformers import DeepseekV2Model, DeepseekV2Config
    >>> # Initializing a Deepseek-V2 style configuration
    >>> configuration = DeepseekV2Config()
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "deepseek_v2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size = 1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts = None,
        n_routed_experts = None,
        ep_size = 1,
        routed_scaling_factor = 1.0,
        kv_lora_rank = 512,
        q_lora_rank = 1536,
        qk_rope_head_dim = 64,
        v_head_dim = 128,
        qk_nope_head_dim = 128,
        topk_method = 'gready',
        n_group = None,
        topk_group = None,
        num_experts_per_tok = None,
        moe_layer_freq = 1,
        first_k_dense_replace = 0,
        norm_topk_prob = False,
        scoring_func = 'softmax',
        aux_loss_alpha = 0.001,
        seq_aux = True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        cpu_quant=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        self.cpu_quant = cpu_quant

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


================================================
FILE: kt-sft/ktransformers/models/configuration_deepseek_v3.py
================================================
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class DeepseekV3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the DeepSeek-V3.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 129280):
            Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`DeepseekV3Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1407):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Number of nextn predict layers in the DeepSeekV3 Model.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        n_shared_experts (`int`, *optional*, defaults to None):
            Number of shared experts, None means dense model.
        n_routed_experts (`int`, *optional*, defaults to None):
            Number of routed experts, None means dense model.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor or routed experts.
        topk_method (`str`, *optional*, defaults to `gready`):
            Topk method used in routed gate.
        n_group (`int`, *optional*, defaults to None):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to None):
            Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to None):
            Number of selected experts, None means dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 0):
            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                            \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to False):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to 'softmax'):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux = (`bool`, *optional*, defaults to True):
            Whether to compute the auxiliary loss for each individual sample.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    ```python
    >>> from transformers import DeepseekV3Model, DeepseekV3Config
    >>> # Initializing a Deepseek-V3 style configuration
    >>> configuration = DeepseekV3Config()
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size = 2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts = 1,
        n_routed_experts = 256,
        ep_size = 1,
        routed_scaling_factor = 2.5,
        kv_lora_rank = 512,
        q_lora_rank = 1536,
        qk_rope_head_dim = 64,
        v_head_dim = 128,
        qk_nope_head_dim = 128,
        topk_method = 'noaux_tc',
        n_group = 8,
        topk_group = 4,
        num_experts_per_tok = 8,
        moe_layer_freq = 1,
        first_k_dense_replace = 3,
        norm_topk_prob = True,
        scoring_func = 'sigmoid',
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

================================================
FILE: kt-sft/ktransformers/models/configuration_llama.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""LLaMA model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation


class LlamaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the LLaMA-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`LlamaModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
            Llama 2 up to 4096, CodeLlama up to 16384.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
            understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
            results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.

    ```python
    >>> from transformers import LlamaModel, LlamaConfig

    >>> # Initializing a LLaMA llama-7b style configuration
    >>> configuration = LlamaConfig()

    >>> # Initializing a model from the llama-7b style configuration
    >>> model = LlamaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "llama"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


================================================
FILE: kt-sft/ktransformers/models/configuration_qwen2_moe.py
================================================
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen2MoE model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class Qwen2MoeConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen2MoeModel`]. It is used to instantiate a
    Qwen2MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen1.5-MoE-A2.7B" [Qwen/Qwen1.5-MoE-A2.7B"](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B").

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the Qwen2MoE model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Qwen2MoeModel`]
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 5632):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 16):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        decoder_sparse_step (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer.
        moe_intermediate_size (`int`, *optional*, defaults to 1408):
            Intermediate size of the routed expert.
        shared_expert_intermediate_size (`int`, *optional*, defaults to 5632):
            Intermediate size of the shared expert.
        num_experts_per_tok (`int`, *optional*, defaults to 4):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 60):
            Number of routed experts.
        norm_topk_prob (`bool`, *optional*, defaults to `False`):
            Whether to normalize the topk probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabeling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        mlp_only_layers (`List[int]`, *optional*, defaults to `[]`):
            Indicate which layers use Qwen2MoeMLP rather than Qwen2MoeSparseMoeBlock
            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.

    ```python
    >>> from transformers import Qwen2MoeModel, Qwen2MoeConfig

    >>> # Initializing a Qwen2MoE style configuration
    >>> configuration = Qwen2MoeConfig()

    >>> # Initializing a model from the Qwen1.5-MoE-A2.7B" style configuration
    >>> model = Qwen2MoeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen2_moe"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=2048,
        intermediate_size=5632,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=16,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        decoder_sparse_step=1,
        moe_intermediate_size=1408,
        shared_expert_intermediate_size=5632,
        num_experts_per_tok=4,
        num_experts=60,
        norm_topk_prob=False,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        mlp_only_layers=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window if use_sliding_window else None
        self.max_window_layers = max_window_layers

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        # MoE arguments
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

================================================
FILE: kt-sft/ktransformers/models/configuration_qwen3_moe.py
================================================
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen3MoE model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging


logger = logging.get_logger(__name__)


class Qwen3MoeConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen3MoeModel`]. It is used to instantiate a
    Qwen3MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of [Qwen/Qwen3-MoE-15B-A2B](https://huggingface.co/Qwen/Qwen3-15B-A2B).
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the Qwen3MoE model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Qwen3MoeModel`]
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 6144):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        decoder_sparse_step (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer.
        moe_intermediate_size (`int`, *optional*, defaults to 768):
            Intermediate size of the routed expert.
        num_experts_per_tok (`int`, *optional*, defaults to 8):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 128):
            Number of routed experts.
        norm_topk_prob (`bool`, *optional*, defaults to `False`):
            Whether to normalize the topk probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabeling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        mlp_only_layers (`List[int]`, *optional*, defaults to `[]`):
            Indicate which layers use Qwen3MoeMLP rather than Qwen3MoeSparseMoeBlock
            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
    ```python
    >>> from transformers import Qwen3MoeModel, Qwen3MoeConfig
    >>> # Initializing a Qwen3MoE style configuration
    >>> configuration = Qwen3MoeConfig()
    >>> # Initializing a model from the Qwen3-15B-A2B" style configuration
    >>> model = Qwen3MoeModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen3_moe"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `Qwen3Moe`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=2048,
        intermediate_size=6144,
        num_hidden_layers=24,
        num_attention_heads=32,
        num_key_value_heads=4,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        decoder_sparse_step=1,
        moe_intermediate_size=768,
        num_experts_per_tok=8,
        num_experts=128,
        norm_topk_prob=False,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        mlp_only_layers=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window if use_sliding_window else None
        self.max_window_layers = max_window_layers

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        # MoE arguments
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["Qwen3MoeConfig"]

================================================
FILE: kt-sft/ktransformers/models/custom_cache.py
================================================
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
'''
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/cache_utils.py
# Copyright 2018- The Hugging Face team. All rights reserved.
# Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
import torch
import torch.nn as nn
import transformers
from transformers import Cache, PretrainedConfig
from typing import List, Optional, Dict, Any, Tuple
try:
    from ktransformers.server.balance_serve.settings import sched_ext
except:
    print("no balance_serve")
class StaticCache(transformers.StaticCache):
    """
    Static Cache class to be used with `torch.compile(model)`.

    Parameters:
        config (`PretrainedConfig):
            The configuration file defining the shape-related attributes required to initialize the static cache.
        max_batch_size (`int`):
            The maximum batch size with which the model will be used.
        max_cache_len (`int`):
            The maximum sequence length with which the model will be used.
        device (`torch.device` or `dict`):
            The device on which the cache should be initialized. Should be the same as the layer.
            If a `dict`, it should contain the `device` key with the device name as the value.
        dtype (*optional*, defaults to `torch.float32`):
            The default `dtype` to use when initializing the layer.
    """

    def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device: torch.device| dict, dtype=None) -> None:
        Cache.__init__(self)
        self.max_batch_size = max_batch_size
        self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
        # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads
        if config.architectures[0] == "DeepseekV3ForCausalLM":
            self.head_dim = config.qk_rope_head_dim
        else:
            self.head_dim = (
                config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
            )

        self.dtype = dtype if dtype is not None else torch.float32
        self.num_key_value_heads = (
            config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads
        )

        self.key_cache: List[torch.Tensor] = []
        self.value_cache: List[torch.Tensor] = []
        cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
        if config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM":
            # TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically
            self.page_size = 64
            self.max_pages = (self.max_cache_len + self.page_size - 1) // self.page_size
            latent_shape = (self.max_pages, self.page_size, 1, config.kv_lora_rank + config.qk_rope_head_dim)
            self.kv_lora_rank = config.kv_lora_rank
            self.qk_rope_head_dim = config.qk_rope_head_dim
            # TODO: support real page table
            self.page_table_map = dict()
            self.page_table_list = []
            for idx in range(config.num_hidden_layers):
                if isinstance(device, dict):
                    target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"]
                else:
                    target_device = device
                
                if target_device not in self.page_table_map:
                    page_table = torch.zeros((max_batch_size, self.max_pages), dtype=torch.int32, device=target_device)
                    for seq_id in range(max_batch_size):
                        page_table[seq_id, :] = torch.arange(seq_id * self.max_pages, seq_id * self.max_pages + self.max_pages, dtype=torch.int32, device=target_device)
                    self.page_table_map[target_device] = page_table
                    
                self.page_table_list.append(self.page_table_map[target_device])
                    
            self.is_MLA = True
            self.is_page = True
        else:
            key_shape = cache_shape
            value_shape = cache_shape
            self.is_MLA = False

        self.past_tokens = []
        self.num_hidden_layers = config.num_hidden_layers
        for idx in range(self.num_hidden_layers):
            # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
            # breaks when updating the cache.
            if isinstance(device, dict):
                target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"]
            else:
                target_device = device
            
            if self.is_MLA:
                new_layer_key_cache = torch.zeros(latent_shape, dtype=self.dtype, device=target_device)
                new_layer_value_cache = None
                torch._dynamo.mark_static_address(new_layer_key_cache)
            else:
                new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=target_device)
                new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=target_device)
                torch._dynamo.mark_static_address(new_layer_key_cache)
                torch._dynamo.mark_static_address(new_layer_value_cache)
                
            self.key_cache.append(new_layer_key_cache)
            self.value_cache.append(new_layer_value_cache)
            self.past_tokens.append(0)

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
        It is VERY important to index using a tensor, otherwise you introduce a copy to the device.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input
                to know how where to write in the cache.

        Return:
            A tuple containing the updated key and value states.
        """
        cache_position = cache_kwargs.get("cache_position")
        k_out = self.key_cache[layer_idx]
        v_out = self.value_cache[layer_idx]
        self.past_tokens[layer_idx] += cache_position.size(0)
        #print(cache_position)
        if self.is_MLA:
            page_idx = cache_position // self.page_size
            page_offset = cache_position % self.page_size
            # key shape (self.max_pages, self.page_size, 1, config.kv_lora_rank + config.qk_rope_head_dim)
            k_out[page_idx, page_offset, :, :self.kv_lora_rank] = key_states
            k_out[page_idx, page_offset, :, self.kv_lora_rank:] = value_states
            return k_out, self.page_table_list[layer_idx]
        else:
            k_out[:, :, cache_position] = key_states
            v_out[:, :, cache_position] = value_states
            return k_out, v_out

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states that were seen by the model."""
        # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
        # limit the check to the first batch member and head dimension.
        # TODO: deprecate this function in favor of `cache_position`
        return self.past_tokens[layer_idx]
    
    def change_seq_length(self, bias: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states that were seen by the model."""
        # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
        # limit the check to the first batch member and head dimension.
        # TODO: deprecate this function in favor of `cache_position`
        for layer_idx in range(self.num_hidden_layers):
            self.past_tokens[layer_idx] += bias

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states."""
        return self.max_cache_len

    def reset(self):
        """Resets the cache values while preserving the objects"""
        for layer_idx in range(len(self.key_cache)):
            # In-place ops prevent breaking the static address
            self.key_cache[layer_idx].zero_()
            if self.value_cache[layer_idx] is not None:
                self.value_cache[layer_idx].zero_()
            self.past_tokens[layer_idx] = 0

    def remove_suffix(self, start_pos):
        for layer_idx in range(len(self.key_cache)):
            # In-place ops prevent breaking the static address
            if self.is_MLA:
                k_cache = self.key_cache[layer_idx]
                k_cache.view(-1, k_cache.shape[-1])[start_pos:].zero_()
            else:
                self.key_cache[layer_idx][..., start_pos:, :].zero_()
                self.value_cache[layer_idx][..., start_pos:, :].zero_()
            self.past_tokens[layer_idx] = start_pos
    
    def get_max_cache_shape(self) -> Tuple[int, int, int, int]:
        """Returns the maximum shape of the cache."""
        return self.max_cache_len

class KDeepSeekV3Cache(nn.Module):
    def __init__(
        self,
        config: PretrainedConfig,
        page_size: int = 256,
        dtype=torch.bfloat16,
        device=torch.device("cuda:0"),
        
    ):
        super().__init__()
        self.config = config
        self.dtype = dtype
        self.device = device
        self.kv_lora_rank = config.kv_lora_rank
        self.page_size = page_size
        self.k_caches = []
        self.v_caches = []
        

    def load(self, inference_context: "sched_ext.InferenceContext"):
        
        for i in range(self.config.num_hidden_layers):
            self.k_caches.append(
                inference_context.k_cache[0][i] 
            )
        self.max_cache_len = self.k_caches[0].shape[0]*self.k_caches[0].shape[1]

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,

        page_idx: torch.Tensor,
        page_offset: torch.Tensor,

        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
        It is VERY important to index using a tensor, otherwise you introduce a copy to the device.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input
                to know how where to write in the cache.

        Return:
            A tuple containing the updated key and value states.
        """
        k_out = self.k_caches[layer_idx]

        k_out[page_idx, page_offset, :, :self.kv_lora_rank] = key_states.reshape(-1, *key_states.shape[2:])
        k_out[page_idx, page_offset, :, self.kv_lora_rank:] = value_states.reshape(-1, *value_states.shape[2:])
        return k_out

        
    def get_page_table(self, cache_position: torch.Tensor, q_indptr: torch.Tensor, kv_indptr: torch.Tensor, kv_indices: torch.Tensor, bsz_tensors: torch.tensor):
        page_offset = cache_position % self.page_size  
        page_idx_local = cache_position // self.page_size  
        query_ids = torch.zeros_like(cache_position)
        for i in range(len(q_indptr) - 1):
            start_idx = q_indptr[i]
            end_idx = q_indptr[i + 1]
            query_ids[start_idx:end_idx] = i
        page_idx = torch.zeros_like(page_idx_local)
        for i in range(bsz_tensors[0]):
            query_id = query_ids[i]
            local_block = page_idx_local[i]
            start_block = kv_indptr[query_id]
            if local_block < kv_indptr[query_id + 1] - kv_indptr[query_id]:
                page_idx[i] = kv_indices[start_block + local_block]
        
        return page_idx, page_offset
    
class KGQACache(nn.Module):
    def __init__(
        self,
        config: PretrainedConfig,
        page_size: int = 256,
        dtype=torch.bfloat16,
        device=torch.device("cuda:0"),
        
    ):
        super().__init__()
        self.config = config
        self.dtype = dtype
        self.device = device
        self.page_size = page_size
        self.k_caches = []
        self.v_caches = []
        

    def load(self, inference_context: "sched_ext.InferenceContext"):
        print(self.config.num_hidden_layers)
        for i in range(self.config.num_hidden_layers):
            self.k_caches.append(
                inference_context.k_cache[0][i] 
            )
            self.v_caches.append(
                inference_context.v_cache[0][i]
            )


        self.max_cache_len = self.k_caches[0].shape[0]*self.k_caches[0].shape[1]


    def get_page_table(self, cache_position: torch.Tensor, q_indptr: torch.Tensor, kv_indptr: torch.Tensor, kv_indices: torch.Tensor, bsz_tensors: torch.tensor):
        page_offset = cache_position % self.page_size  
        page_idx_local = cache_position // self.page_size  
        query_ids = torch.zeros_like(cache_position)
        for i in range(len(q_indptr) - 1):
            start_idx = q_indptr[i]
            end_idx = q_indptr[i + 1]
            query_ids[start_idx:end_idx] = i
        page_idx = torch.zeros_like(page_idx_local)
        for i in range(bsz_tensors[0]):
            query_id = query_ids[i]
            local_block = page_idx_local[i]
            start_block = kv_indptr[query_id]
            if local_block < kv_indptr[query_id + 1] - kv_indptr[query_id]:
                page_idx[i] = kv_indices[start_block + local_block]
        
        return page_idx, page_offset

    def get_k_cache(self, layer_idx):
        return self.k_caches[layer_idx]

    def get_v_cache(self, layer_idx):
        return self.v_caches[layer_idx]

================================================
FILE: kt-sft/ktransformers/models/custom_modeling_deepseek_v2.py
================================================
import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.models.custom_cache import KDeepSeekV3Cache
from  ktransformers.models.modeling_deepseek import DeepseekV2Model,  DeepseekV2PreTrainedModel
from ktransformers.models.configuration_deepseek import DeepseekV2Config


torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
import flashinfer

class KDeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):

    kv_cache: KDeepSeekV3Cache
    use_cuda_graph = False
    def __init__(
        self,
        config,
        kv_cache,

    ):
        super().__init__(config)
        self.model = DeepseekV2Model(config)
        self.config = config
        self.kv_cache = kv_cache

        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        

    def init_wrapper(self, use_cuda_graph, device, max_batch_size, max_pages):
        self.use_cuda_graph = use_cuda_graph
        self.workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8).to(0)
        self.qo_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
        self.paged_kv_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
        self.paged_kv_indices_buf = torch.empty((max_pages,), dtype=torch.int32, device=device)
        self.paged_kv_len_buf = torch.empty((max_batch_size,), dtype=torch.int32, device=device)

		
        self.wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(
            self.workspace_buffer, use_cuda_graph=use_cuda_graph,
            qo_indptr=self.qo_indptr_buf,kv_indptr=self.paged_kv_indptr_buf,
            kv_indices=self.paged_kv_indices_buf,kv_len_arr=self.paged_kv_len_buf,
            backend = "fa2",
        )

    def batch_embeddings(self, batch: ForwardBatchInput, device="cuda:0"):
        features = []
        for i in range(batch.batch_size):
            tokens = batch.minibatch.tokens.contiguous()
            feature = (
                self.model.embed_tokens(tokens.to(torch.device('cpu')))
                .to(torch.bfloat16)
                .to(device=device)
            )
            features.append(feature)

        return features


    def forward(
        self,
        batch: ForwardBatchInput | None = None,
        features: List[torch.Tensor] | None = None,
        bsz_tensors: torch.Tensor | None = None,
        num_tokens_tensors: torch.Tensor | None = None,
        page_idx: torch.Tensor | None = None,
        page_offset: torch.Tensor | None = None,
    ) -> ForwardBatchOutput:
        current_stream = torch.cuda.current_stream()

        forward_batch_output = ForwardBatchOutput()

        
        hidden_states = features[0]


        with torch.cuda.stream(current_stream):
            residual = torch.zeros_like(hidden_states)
            for i, decode_layer in enumerate(self.model.layers):
                if self.model.transfer_map is not None and i in self.model.transfer_map:
                    prev_stream = torch.cuda.current_stream()
                    cur_device = self.model.transfer_map[i]
                    if cur_device not in self.model.stream_device_map:
                        self.model.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
                    torch.cuda.set_device(cur_device)
                    self.model.stream_device_map[cur_device].wait_stream(prev_stream)
                    torch.cuda.set_stream(self.model.stream_device_map[cur_device])
                    hidden_states = hidden_states.to(
                        self.model.transfer_map[i], non_blocking=True
                    )

                    batch.minibatch.position_ids = (
                        batch.minibatch.position_ids.to(self.model.transfer_map[i], non_blocking=True)
                        if batch.minibatch.position_ids is not None
                        else None
                    )
                hidden_states, residual = decode_layer.input_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.self_attn(hidden_states, self.kv_cache, 
                                                       position_ids=batch.minibatch.position_ids, 
                                                       wrapper=self.wrapper, bsz_tensors=num_tokens_tensors, 
                                                       cache_position=batch.minibatch.positions, 
                                                       batch_indices=batch.minibatch.batch_indices,
                                                       kv_indices=batch.minibatch.kv_indices,
                                                       kv_indptr=batch.minibatch.kv_indptr,
                                                       kv_last_page_len=batch.minibatch.kv_last_page_len,
                                                       q_indptr=batch.minibatch.q_indptr,
                                                       page_idx=page_idx,
                                                       page_offset=page_offset
                                                       )

                hidden_states, residual = decode_layer.post_attention_layernorm(hidden_states, num_tokens_tensors, residual)
                if i < 3:
                    hidden_states = decode_layer.mlp(hidden_states, num_tokens_tensors)
                else:
                    hidden_states = decode_layer.mlp(hidden_states.unsqueeze(0), num_tokens_tensors)
                    hidden_states = hidden_states.squeeze(0)
        forward_batch_output = ForwardBatchOutput()
        assert  batch.batch_size == 1
        with torch.cuda.stream(current_stream):

            local_logit = self.lm_head(self.model.norm(hidden_states[batch.minibatch.logits_start], num_tokens_tensors, residual[batch.minibatch.logits_start])[0])
            # local_logit = local_logit[batch.minibatch.logits_start]
            forward_batch_output.logits.append(local_logit)

        return forward_batch_output
    

    def flash_infer_attn_plan(self, batch: ForwardBatchInput, bsz_tensors, num_tokens_tensors,
        num_heads: int,
        head_dim_ckv: int,
        head_dim_kpe: int,
        page_size: int,
        causal: bool,
        sm_scale: float,
        q_data_type: torch.dtype,
        kv_data_type: torch.dtype,):
        minibatch = batch.minibatch
        
        self.wrapper.plan(minibatch.q_indptr, minibatch.kv_indptr, minibatch.kv_indices, 
                          minibatch.kv_len, num_heads, head_dim_ckv, head_dim_kpe, page_size, causal, sm_scale, q_data_type, kv_data_type)
        

================================================
FILE: kt-sft/ktransformers/models/custom_modeling_deepseek_v3.py
================================================
"""
Date: 2024-11-06 10:05:11
LastEditors: djw
LastEditTime: 2024-11-13 07:50:51
"""

import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.models.custom_cache import KDeepSeekV3Cache
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3Model,  DeepseekV3PreTrainedModel
from ktransformers.models.configuration_deepseek_v3 import DeepseekV3Config


torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
import flashinfer

class KDeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):

    cache: KDeepSeekV3Cache
    use_cuda_graph = False
    def __init__(
        self,
        config: DeepseekV3Config,
        cache,
    ):
        super().__init__(config)
        self.model = DeepseekV3Model(config)
        self.config = config
        self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        
    def init_wrapper(self, use_cuda_graph, device, max_batch_size, max_pages):
        self.use_cuda_graph = use_cuda_graph
        self.workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8).to(0)
        self.qo_indptr_buf = torch.empty((max_batch_size+2,), dtype=torch.int32, device=device)
        self.paged_kv_indptr_buf = torch.empty((max_batch_size+2,), dtype=torch.int32, device=device)
        self.paged_kv_indices_buf = torch.empty((max_pages,), dtype=torch.int32, device=device)
        self.paged_kv_len_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
        self.bsz_tensor_buf = torch.empty((1, ), dtype=torch.int32, device=device)
		

        self.wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(
            self.workspace_buffer, use_cuda_graph=use_cuda_graph,
            qo_indptr=self.qo_indptr_buf,kv_indptr=self.paged_kv_indptr_buf,
            kv_indices=self.paged_kv_indices_buf,kv_len_arr=self.paged_kv_len_buf,
            bsz_tensor=self.bsz_tensor_buf,
            backend = "fa2",
        )

    def batch_embeddings(self, batch: ForwardBatchInput, device="cuda:0"):
        features = []
        for i in range(batch.batch_size):
            tokens = batch.minibatch.tokens.contiguous()
            feature = (
                self.model.embed_tokens(tokens.to(torch.device('cpu')))
                .to(torch.bfloat16)
                .to(device=device)
            )
            features.append(feature)

        return features


    def forward(
        self,
        batch: ForwardBatchInput | None = None,
        features: List[torch.Tensor] | None = None,
        bsz_tensors: torch.Tensor | None = None,
        num_tokens_tensors: torch.Tensor | None = None,
        page_idx: torch.Tensor | None = None,
        page_offset: torch.Tensor | None = None,
        cuda_graph_idx: int | None = -1
    ) -> ForwardBatchOutput:
        current_stream = torch.cuda.current_stream()

        forward_batch_output = ForwardBatchOutput()

        
        hidden_states = features[0]

        with torch.cuda.stream(current_stream):
            residual = torch.zeros_like(hidden_states)
            for i, decode_layer in enumerate(self.model.layers):
                # can't use now, only one flashinfer wrapper
                if self.model.transfer_map is not None and i in self.model.transfer_map:
                    prev_stream = torch.cuda.current_stream()
                    cur_device = self.model.transfer_map[i]
                    if cur_device not in self.model.stream_device_map:
                        self.model.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
                    torch.cuda.set_device(cur_device)
                    self.model.stream_device_map[cur_device].wait_stream(prev_stream)
                    torch.cuda.set_stream(self.model.stream_device_map[cur_device])
                    hidden_states = hidden_states.to(
                        self.model.transfer_map[i], non_blocking=True
                    )

                    batch.minibatch.position_ids = (
                        batch.minibatch.position_ids.to(self.model.transfer_map[i], non_blocking=True)
                        if batch.minibatch.position_ids is not None
                        else None
                    )
                hidden_states, residual = decode_layer.input_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.self_attn(hidden_states, self.cache, 
                                                       position_ids=batch.minibatch.position_ids, 
                                                       wrapper=self.wrapper, num_tokens_tensors=num_tokens_tensors, 
                                                       page_idx=page_idx,
                                                       page_offset=page_offset
                                                       )

                hidden_states, residual = decode_layer.post_attention_layernorm(hidden_states, num_tokens_tensors, residual)
                if i < self.config.first_k_dense_replace:
                    hidden_states = decode_layer.mlp(hidden_states, num_tokens_tensors)
                else:
                    hidden_states = decode_layer.mlp(hidden_states.unsqueeze(0), num_tokens_tensors, cuda_graph_idx)
                    hidden_states = hidden_states.squeeze(0)
        forward_batch_output = ForwardBatchOutput()
        with torch.cuda.stream(current_stream):
            local_logit = self.lm_head(self.model.norm(hidden_states, num_tokens_tensors, residual)[0], num_tokens_tensors)
            forward_batch_output.logits.append(local_logit)

        return forward_batch_output
    

    def flash_infer_attn_plan(self, batch: ForwardBatchInput, bsz_tensors, num_tokens_tensors,
        num_heads: int,
        head_dim_ckv: int,
        head_dim_kpe: int,
        page_size: int,
        causal: bool,
        sm_scale: float,
        q_data_type: torch.dtype,
        kv_data_type: torch.dtype,):
        minibatch = batch.minibatch
        self.wrapper.plan(minibatch.q_indptr, minibatch.kv_indptr, minibatch.kv_indices, 
                          minibatch.kv_len, num_heads, head_dim_ckv, head_dim_kpe, page_size, causal, sm_scale, q_data_type, kv_data_type, bsz_tensors)
        

================================================
FILE: kt-sft/ktransformers/models/custom_modeling_qwen2_moe.py
================================================
"""
Date: 2024-11-06 10:05:11
LastEditors: djw
LastEditTime: 2024-11-13 07:50:51
"""

import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.models.custom_cache import KGQACache
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeModel, Qwen2MoePreTrainedModel
from ktransformers.models.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.operators.flashinfer_batch_prefill_wrapper import flashInferAttn

torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
import flashinfer

class KQwen2MoeForCausalLM(Qwen2MoePreTrainedModel):

    cache: KGQACache
    use_cuda_graph = False
    def __init__(
        self,
        config: Qwen2MoeConfig,
        cache,
    ):
        super().__init__(config)
        self.model = Qwen2MoeModel(config)
        self.config = config
        self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.attn = [None] * 100
        
    def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
        self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)


    def batch_embeddings(self, batch: ForwardBatchInput, device="cuda:0"):
        features = []
        for i in range(batch.batch_size):
            tokens = batch.minibatch.tokens.contiguous()
            feature = (
                self.model.embed_tokens(tokens.to(torch.device('cpu')))
                .to(torch.bfloat16)
                .to(device=device)
            )
            features.append(feature)

        return features


    def forward(
        self,
        batch: ForwardBatchInput | None = None,
        features: List[torch.Tensor] | None = None,
        bsz_tensors: torch.Tensor | None = None,
        num_tokens_tensors: torch.Tensor | None = None,
        page_idx: torch.Tensor | None = None,
        page_offset: torch.Tensor | None = None,
        cuda_graph_idx: int | None = 0
    ) -> ForwardBatchOutput:
        current_stream = torch.cuda.current_stream()

        forward_batch_output = ForwardBatchOutput()

        
        hidden_states = features[0]
        self.attn[cuda_graph_idx].calc_batch_indices(hidden_states.shape[0])

        with torch.cuda.stream(current_stream):
            residual = torch.zeros_like(hidden_states)
            for i, decode_layer in enumerate(self.model.layers):
                if self.model.transfer_map is not None and i in self.model.transfer_map:
                    prev_stream = torch.cuda.current_stream()
                    cur_device = self.model.transfer_map[i]
                    if cur_device not in self.model.stream_device_map:
                        self.model.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
                    torch.cuda.set_device(cur_device)
                    self.model.stream_device_map[cur_device].wait_stream(prev_stream)
                    torch.cuda.set_stream(self.model.stream_device_map[cur_device])
                    hidden_states = hidden_states.to(
                        self.model.transfer_map[i], non_blocking=True
                    )

                    batch.minibatch.position_ids = (
                        batch.minibatch.position_ids.to(self.model.transfer_map[i], non_blocking=True)
                        if batch.minibatch.position_ids is not None
                        else None
                    )
                hidden_states, residual = decode_layer.input_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.self_attn(hidden_states, self.cache, 
                                                       position_ids=batch.minibatch.position_ids, 
                                                       wrapper=self.attn[cuda_graph_idx], bsz_tensors=num_tokens_tensors, 
                                                       page_idx=page_idx,
                                                       page_offset=page_offset
                                                       )

                hidden_states, residual = decode_layer.post_attention_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.mlp(hidden_states.unsqueeze(0), num_tokens_tensors, cuda_graph_idx)
                hidden_states = hidden_states.squeeze(0)
        forward_batch_output = ForwardBatchOutput()
        with torch.cuda.stream(current_stream):
            local_logit = self.lm_head(self.model.norm(hidden_states, num_tokens_tensors, residual)[0], num_tokens_tensors)
            forward_batch_output.logits.append(local_logit)

        return forward_batch_output
    

    def flash_infer_attn_plan(self, batch: ForwardBatchInput, bsz_tensors, num_tokens_tensors,
        num_q_heads: int,
        num_kv_heads: int,
        head_dim: int,
        page_size: int,
        causal: bool,
        q_data_type: torch.dtype,
        kv_data_type: torch.dtype,
        cuda_graph_idx: int = 0
        ):
        minibatch = batch.minibatch
        self.attn[cuda_graph_idx].plan(minibatch.q_indptr, minibatch.kv_indptr, minibatch.kv_indices, 
                          minibatch.kv_last_page_len, bsz_tensors, num_tokens_tensors,num_q_heads, num_kv_heads, head_dim, page_size, causal=causal, q_data_type=q_data_type, kv_data_type=kv_data_type)
        

================================================
FILE: kt-sft/ktransformers/models/custom_modeling_qwen3_moe.py
================================================
"""
Date: 2024-11-06 10:05:11
LastEditors: djw
LastEditTime: 2024-11-13 07:50:51
"""

import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.models.custom_cache import KGQACache
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeModel, Qwen3MoePreTrainedModel
from ktransformers.models.configuration_qwen3_moe import Qwen3MoeConfig
from ktransformers.operators.flashinfer_batch_prefill_wrapper import flashInferAttn

torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
import flashinfer

class KQwen3MoeForCausalLM(Qwen3MoePreTrainedModel):

    cache: KGQACache
    use_cuda_graph = False
    def __init__(
        self,
        config: Qwen3MoeConfig,
        cache = None,
    ):
        super().__init__(config)
        self.model = Qwen3MoeModel(config)
        self.config = config
        self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.attn = [None] * 100
        
    def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
        self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)


    def batch_embeddings(self, batch: ForwardBatchInput, device="cuda:0"):
        features = []
        for i in range(batch.batch_size):
            tokens = batch.minibatch.tokens.contiguous()
            feature = (
                self.model.embed_tokens(tokens.to(torch.device('cpu')))
                .to(torch.bfloat16)
                .to(device=device)
            )
            features.append(feature)

        return features


    def forward(
        self,
        batch: ForwardBatchInput | None = None,
        features: List[torch.Tensor] | None = None,
        bsz_tensors: torch.Tensor | None = None,
        num_tokens_tensors: torch.Tensor | None = None,
        page_idx: torch.Tensor | None = None,
        page_offset: torch.Tensor | None = None,
        cuda_graph_idx: int | None = 0
    ) -> ForwardBatchOutput:
        current_stream = torch.cuda.current_stream()

        forward_batch_output = ForwardBatchOutput()

        
        hidden_states = features[0]
        self.attn[cuda_graph_idx].calc_batch_indices(hidden_states.shape[0])

        with torch.cuda.stream(current_stream):
            residual = torch.zeros_like(hidden_states)
            for i, decode_layer in enumerate(self.model.layers):
                if self.model.transfer_map is not None and i in self.model.transfer_map:
                    prev_stream = torch.cuda.current_stream()
                    cur_device = self.model.transfer_map[i]
                    if cur_device not in self.model.stream_device_map:
                        self.model.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
                    torch.cuda.set_device(cur_device)
                    self.model.stream_device_map[cur_device].wait_stream(prev_stream)
                    torch.cuda.set_stream(self.model.stream_device_map[cur_device])
                    hidden_states = hidden_states.to(
                        self.model.transfer_map[i], non_blocking=True
                    )

                    batch.minibatch.position_ids = (
                        batch.minibatch.position_ids.to(self.model.transfer_map[i], non_blocking=True)
                        if batch.minibatch.position_ids is not None
                        else None
                    )
                hidden_states, residual = decode_layer.input_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.self_attn(hidden_states, self.cache, 
                                                       position_ids=batch.minibatch.position_ids, 
                                                       wrapper=self.attn[cuda_graph_idx], bsz_tensors=num_tokens_tensors, 
                                                       page_idx=page_idx,
                                                       page_offset=page_offset
                                                       )

                hidden_states, residual = decode_layer.post_attention_layernorm(hidden_states, num_tokens_tensors, residual)
                hidden_states = decode_layer.mlp(hidden_states.unsqueeze(0), num_tokens_tensors, cuda_graph_idx)
                hidden_states = hidden_states.squeeze(0)
        forward_batch_output = ForwardBatchOutput()
        with torch.cuda.stream(current_stream):
            local_logit = self.lm_head(self.model.norm(hidden_states, num_tokens_tensors, residual)[0], num_tokens_tensors)
            forward_batch_output.logits.append(local_logit)

        return forward_batch_output
    

    def flash_infer_attn_plan(self, batch: ForwardBatchInput, bsz_tensors, num_tokens_tensors,
        num_q_heads: int,
        num_kv_heads: int,
        head_dim: int,
        page_size: int,
        causal: bool,
        q_data_type: torch.dtype,
        kv_data_type: torch.dtype,
        cuda_graph_idx: int = 0
        ):
        minibatch = batch.minibatch
        self.attn[cuda_graph_idx].plan(minibatch.q_indptr, minibatch.kv_indptr, minibatch.kv_indices, 
                          minibatch.kv_last_page_len, bsz_tensors, num_tokens_tensors, num_q_heads, num_kv_heads, head_dim, page_size, causal=causal, q_data_type=q_data_type, kv_data_type=kv_data_type)
        

================================================
FILE: kt-sft/ktransformers/models/modeling_deepseek.py
================================================
# coding=utf-8
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
'''
# Adapted from
# https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628/blob/main/modeling_deepseek.py
# Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
# 
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch DeepSeek model."""
import math
import warnings
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_attention_mask,
    _prepare_4d_causal_attention_mask,
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import (
    ALL_LAYERNORM_LAYERS,
    is_torch_greater_or_equal_than_1_13,
)
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from transformers.utils.import_utils import is_torch_fx_available
from .configuration_deepseek import DeepseekV2Config
import torch.distributed as dist
import numpy as np

if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func, flash_attn_with_kvcache
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

from ktransformers.util.grad_wrapper import maybe_no_grad

# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
# It means that the function will not be traced through and simply appear as a node in the graph.
if is_torch_fx_available():
    if not is_torch_greater_or_equal_than_1_13:
        import torch.fx

    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "DeepseekV2Config"


def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(
        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)
    )
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


class DeepseekV2RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        DeepseekV2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        self.hidden_size = hidden_size

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight * hidden_states).to(input_dtype)


ALL_LAYERNORM_LAYERS.append(DeepseekV2RMSNorm)

# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->DeepseekV2
class DeepseekV2RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        super().__init__()
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # For BC we register cos and sin cached
        self.max_seq_len_cached = max_position_embeddings

    @maybe_no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)   

# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV2
class DeepseekV2LinearScalingRotaryEmbedding(DeepseekV2RotaryEmbedding):
    """DeepseekV2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        raise NotImplementedError("LinearScalingRotaryEmbedding is not supported now.")
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        t = t / self.scaling_factor

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->DeepseekV2
class DeepseekV2DynamicNTKScalingRotaryEmbedding(DeepseekV2RotaryEmbedding):
    """DeepseekV2RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        raise NotImplementedError("DynamicNTKScalingRotaryEmbedding is not supported now.")
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings)
                - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (
                base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


# Inverse dim formula to find dim based on number of rotations
def yarn_find_correction_dim(
    num_rotations, dim, base=10000, max_position_embeddings=2048
):
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
        2 * math.log(base)
    )


# Find dim range bounds based on rotations
def yarn_find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    low = math.floor(
        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
    )
    high = math.ceil(
        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
    )
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case


def yarn_get_mscale(scale=1, mscale=1):
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


def yarn_linear_ramp_mask(min, max, dim):
    if min == max:
        max += 0.001  # Prevent singularity

    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
    ramp_func = torch.clamp(linear_func, 0, 1)
    return ramp_func

class DeepseekV2YarnRotaryEmbedding(DeepseekV2RotaryEmbedding):
    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        nn.Module.__init__(self)
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.mscale_all_dim = mscale_all_dim
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

        freq_extra = 1.0 / (
            self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )
        freq_inter = 1.0 / (
            self.scaling_factor
            * self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            dim,
            self.base,
            self.original_max_position_embeddings,
        )
        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(
            device=device, dtype=torch.float32
        )
        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self._mscale = float(
            yarn_get_mscale(self.scaling_factor, self.mscale)
            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
        )
        # For BC we register cos and sin cached
        self.max_seq_len_cached = max_position_embeddings

    @maybe_no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()* self._mscale
            sin = emb.sin()* self._mscale
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  

# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    b, h, s, d = q.shape
    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
    b, h, s, d = k.shape
    k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

class DeepseekV2MLP(nn.Module):
    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
        self.intermediate_size = (
            config.intermediate_size if intermediate_size is None else intermediate_size
        )

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        act = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        down_proj = self.down_proj(act)
        return down_proj

class MoEGate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.scoring_func = config.scoring_func
        self.alpha = config.aux_loss_alpha
        self.seq_aux = config.seq_aux
        self.topk_method = config.topk_method
        self.n_group = config.n_group
        self.topk_group = config.topk_group

        # topk selection algorithm
        self.norm_topk_prob = config.norm_topk_prob
        self.gating_dim = config.hidden_size
        self.weight = nn.Parameter(
            torch.empty((self.n_routed_experts, self.gating_dim))
        )
        self.reset_parameters()

    def reset_parameters(self) -> None:
        import torch.nn.init as init

        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, hidden_states):
        bsz, seq_len, h = hidden_states.shape
        ### compute gating score
        hidden_states = hidden_states.view(-1, h)
        logits = F.linear(
            hidden_states.type(torch.float32), self.weight.type(torch.float32), None
        )
        if self.scoring_func == "softmax":
            scores = logits.softmax(dim=-1, dtype=torch.float32)
        else:
            raise NotImplementedError(
                f"insupportable scoring function for MoE gating: {self.scoring_func}"
            )

        ### select top-k experts
        if self.topk_method == "greedy":
            topk_weight, topk_idx = torch.topk(
                scores, k=self.top_k, dim=-1, sorted=False
            )
        elif self.topk_method == "group_limited_greedy":
            group_scores = (
                scores.view(bsz * seq_len, self.n_group, -1).max(dim=-1).values
            )  # [n, n_group]
            group_idx = torch.topk(
                group_scores, k=self.topk_group, dim=-1, sorted=False
            )[
                1
            ]  # [n, top_k_group]
            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
            score_mask = (
                group_mask.unsqueeze(-1)
                .expand(
                    bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
                )
                .reshape(bsz * seq_len, -1)
            )  # [n, e]
            tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
            topk_weight, topk_idx = torch.topk(
                tmp_scores, k=self.top_k, dim=-1, sorted=False
            )

        ### norm gate to sum 1
        if self.top_k > 1 and self.norm_topk_prob:
            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
            topk_weight = topk_weight / denominator
        else:
            topk_weight = topk_weight * self.routed_scaling_factor
        ### expert-level computation auxiliary loss
        if self.training and self.alpha > 0.0:
            scores_for_aux = scores
            aux_topk = self.top_k
            # always compute aux loss based on the naive greedy topk method
            topk_idx_for_aux_loss = topk_idx.view(bsz, -1)
            if self.seq_aux:
                scores_for_seq_aux = scores_for_aux.view(bsz, seq_len, -1)
                ce = torch.zeros(
                    bsz, self.n_routed_experts, device=hidden_states.device
                )
                ce.scatter_add_(
                    1,
                    topk_idx_for_aux_loss,
                    torch.ones(bsz, seq_len * aux_topk, device=hidden_states.device),
                ).div_(seq_len * aux_topk / self.n_routed_experts)
                aux_loss = (ce * scores_for_seq_aux.mean(dim=1)).sum(
                    dim=1
                ).mean() * self.alpha
            else:
                mask_ce = F.one_hot(
                    topk_idx_for_aux_loss.view(-1), num_classes=self.n_routed_experts
                )
                ce = mask_ce.float().mean(0)
                Pi = scores_for_aux.mean(0)
                fi = ce * self.n_routed_experts
                aux_loss = (Pi * fi).sum() * self.alpha
        else:
            aux_loss = None
        return topk_idx, topk_weight, aux_loss


class AddAuxiliaryLoss(torch.autograd.Function):
    """
    The trick function of adding auxiliary (aux) loss,
    which includes the gradient of the aux loss during backpropagation.
    """

    @staticmethod
    def forward(ctx, x, loss):
        assert loss.numel() == 1
        ctx.dtype = loss.dtype
        ctx.required_aux_loss = loss.requires_grad
        return x

    @staticmethod
    def backward(ctx, grad_output):
        grad_loss = None
        if ctx.required_aux_loss:
            grad_loss = torch.ones(1, dtype=ctx.dtype, device=grad_output.device)
        return grad_output, grad_loss

class DeepseekV2MoE(nn.Module):
    """
    A mixed expert module containing shared experts.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_experts_per_tok = config.num_experts_per_tok

        if hasattr(config, "ep_size") and config.ep_size > 1:
            assert config.ep_size == dist.get_world_size()
            self.ep_size = config.ep_size
            self.experts_per_rank = config.n_routed_experts // config.ep_size
            self.ep_rank = dist.get_rank()
            self.experts = nn.ModuleList(
                [
                    (
                        DeepseekV2MLP(
                            config, intermediate_size=config.moe_intermediate_size
                        )
                        if i >= self.ep_rank * self.experts_per_rank
                        and i < (self.ep_rank + 1) * self.experts_per_rank
                        else None
                    )
                    for i in range(config.n_routed_experts)
                ]
            )
        else:
            self.ep_size = 1
            self.experts_per_rank = config.n_routed_experts
            self.ep_rank = 0
            self.experts = nn.ModuleList(
                [
                    DeepseekV2MLP(config, intermediate_size=config.moe_intermediate_size)
                    for i in range(config.n_routed_experts)
                ]
            )
        self.gate = MoEGate(config)
        if config.n_shared_experts is not None:
            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
            self.shared_experts = DeepseekV2MLP(
                config=config, intermediate_size=intermediate_size
            )

    def forward(self, hidden_states):
        identity = hidden_states
        orig_shape = hidden_states.shape
        topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        flat_topk_idx = topk_idx.view(-1)
        if self.training:
            hidden_states = hidden_states.repeat_interleave(
                self.num_experts_per_tok, dim=0
            )
            y = torch.empty_like(hidden_states)
            for i, expert in enumerate(self.experts):
                y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
            y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
            y = y.view(*orig_shape)
            y = AddAuxiliaryLoss.apply(y, aux_loss)
        else:
            y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
        if self.config.n_shared_experts is not None:
            y = y + self.shared_experts(identity)
        return y

    @maybe_no_grad()
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        sorted_tokens_shape = sorted_tokens.shape
        if self.ep_size > 1:
            tokens_per_ep_rank = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
            tokens_per_expert_group = tokens_per_expert.new_empty(
                tokens_per_expert.shape[0]
            )
            dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert)
            output_splits = (
                tokens_per_expert_group.view(self.ep_size, -1)
                .sum(1)
                .cpu()
                .numpy()
                .tolist()
            )
            gathered_tokens = sorted_tokens.new_empty(
                tokens_per_expert_group.sum(dim=0).cpu().item(), sorted_tokens.shape[1]
            )
            input_split_sizes = tokens_per_ep_rank.cpu().numpy().tolist()
            dist.all_to_all(
                list(gathered_tokens.split(output_splits)),
                list(sorted_tokens.split(input_split_sizes)),
            )
            tokens_per_expert_post_gather = tokens_per_expert_group.view(
                self.ep_size, self.experts_per_rank
            ).sum(dim=0)
            gatherd_idxs = np.zeros(shape=(gathered_tokens.shape[0],), dtype=np.int32)
            s = 0
            for i, k in enumerate(tokens_per_expert_group.cpu().numpy()):
                gatherd_idxs[s : s + k] = i % self.experts_per_rank
                s += k
            gatherd_idxs = gatherd_idxs.argsort()
            sorted_tokens = gathered_tokens[gatherd_idxs]
            tokens_per_expert = tokens_per_expert_post_gather
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
        if self.ep_size > 1:
            new_x = torch.empty_like(outs)
            new_x[gatherd_idxs] = outs
            gathered_tokens = new_x.new_empty(*sorted_tokens_shape)
            dist.all_to_all(
                list(gathered_tokens.split(input_split_sizes)),
                list(new_x.split(output_splits)),
            )
            outs = gathered_tokens

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out

# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV2
class DeepseekV2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: DeepseekV2Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.q_lora_rank = config.q_lora_rank
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.kv_lora_rank = config.kv_lora_rank
        self.v_head_dim = config.v_head_dim
        self.qk_nope_head_dim = config.qk_nope_head_dim
        self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim

        self.is_causal = True

        if self.q_lora_rank is None:
            self.q_proj = nn.Linear(
                self.hidden_size, self.num_heads * self.q_head_dim, bias=False
            )
        else:
            self.q_a_proj = nn.Linear(
                self.hidden_size, config.q_lora_rank, bias=config.attention_bias
            )
            self.q_a_layernorm = DeepseekV2RMSNorm(config.q_lora_rank)
            self.q_b_proj = nn.Linear(
                config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
            )

        self.kv_a_proj_with_mqa = nn.Linear(
            self.hidden_size,
            config.kv_lora_rank + config.qk_rope_head_dim,
            bias=config.attention_bias,
        )
        self.kv_a_layernorm = DeepseekV2RMSNorm(config.kv_lora_rank)
        self.kv_b_proj = nn.Linear(
            config.kv_lora_rank,
            self.num_heads
            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
            bias=False,
        )

        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            self.hidden_size,
            bias=config.attention_bias,
        )
        self._init_rope()

        self.softmax_scale = self.q_head_dim ** (-0.5)
        if self.config.rope_scaling is not None:
            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
            scaling_factor = self.config.rope_scaling["factor"]
            if mscale_all_dim:
                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
                self.softmax_scale = self.softmax_scale * mscale * mscale

    def _init_rope(self):
        if self.config.rope_scaling is None:
            self.rotary_emb = DeepseekV2RotaryEmbedding(
                self.qk_rope_head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = DeepseekV2LinearScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = DeepseekV2DynamicNTKScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "yarn":
                kwargs = {
                    key: self.config.rope_scaling[key]
                    for key in [
                        "original_max_position_embeddings",
                        "beta_fast",
                        "beta_slow",
                        "mscale",
                        "mscale_all_dim",
                    ]
                    if key in self.config.rope_scaling
                }
                self.rotary_emb = DeepseekV2YarnRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                    **kwargs,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(q_pe, position_ids)
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin)

        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        attn_weights = (
            torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale
        )

        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.attention_dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV2
class DeepseekV2FlashAttention2(DeepseekV2Attention):
    """
    DeepseekV2 flash attention module. This module inherits from `DeepseekV2Attention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        # DeepseekV2FlashAttention2 attention does not support output_attentions
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

            # overwrite attention_mask with padding_mask
            attention_mask = kwargs.pop("padding_mask")

        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        # Flash attention requires the input to have the shape
        # batch_size x seq_length x head_dim x hidden_dim
        # therefore we just need to keep the original shape
        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]

        kv_seq_len = value_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(q_pe, position_ids)
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin)

        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe

        if self.q_head_dim != self.v_head_dim:
            value_states = F.pad(value_states, [0, self.q_head_dim - self.v_head_dim])

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (DeepseekV2RMSNorm handles it correctly)

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            # Handle the case where the model is quantized
            if hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            elif torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            else:
                target_dtype = self.q_a_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = self._flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            softmax_scale=self.softmax_scale,
        )
        if self.q_head_dim != self.v_head_dim:
            attn_output = attn_output[:, :, :, : self.v_head_dim]

        attn_output = attn_output.reshape(
            bsz, q_len, self.num_heads * self.v_head_dim
        ).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        position_ids,
        dropout=0.0,
        softmax_scale=None,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.
        # Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`int`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV2FlashAttention2 __init__.
            causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            (
                query_states,
                key_states,
                value_states,
                indices_q,
                cu_seq_lens,
                max_seq_lens,
            ) = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            attn_output = pad_input(
                attn_output_unpad, indices_q, batch_size, query_length
            )
        else:
            if query_length == 1:
                position_ids = position_ids.to(dtype=torch.int32).squeeze(1)
                attn_output = flash_attn_with_kvcache(
                    query_states,
                    key_states,
                    value_states,
                    cache_seqlens=position_ids,
                    softmax_scale=softmax_scale,
                    causal=causal,
                )   
            else:
                attn_output = flash_attn_func(
                    query_states,
                    key_states,
                    value_states,
                    dropout,
                    softmax_scale=softmax_scale,
                    causal=causal,
                )

        return attn_output

    def _upad_input(
        self, query_layer, key_layer, value_layer, attention_mask, query_length
    ):
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
                indices_k,
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
                query_layer, attention_mask
            )

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )


ATTENTION_CLASSES = {
    "eager": DeepseekV2Attention,
    "flash_attention_2": DeepseekV2FlashAttention2,
}

class DeepseekV2DecoderLayer(nn.Module):
    def __init__(self, config: DeepseekV2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = ATTENTION_CLASSES[config._attn_implementation](
            config=config, layer_idx=layer_idx
        )

        self.mlp = (
            DeepseekV2MoE(config)
            if (
                config.n_routed_experts is not None
                and layer_idx >= config.first_k_dense_replace
                and layer_idx % config.moe_layer_freq == 0
            )
            else DeepseekV2MLP(config)
        )
        self.input_layernorm = DeepseekV2RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = DeepseekV2RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)

        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


DeepseekV2_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`DeepseekV2Config`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare DeepseekV2 Model outputting raw hidden-states without any specific head on top.",
    DeepseekV2_START_DOCSTRING,
)
class DeepseekV2PreTrainedModel(PreTrainedModel):
    config_class = DeepseekV2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["DeepseekV2DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_cache_class = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


DeepseekV2_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare DeepseekV2 Model outputting raw hidden-states without any specific head on top.",
    DeepseekV2_START_DOCSTRING,
)
class DeepseekV2Model(DeepseekV2PreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]

    Args:
        config: DeepseekV2Config
    """

    def __init__(self, config: DeepseekV2Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                DeepseekV2DecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.norm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`transformers."
                )
                use_cache = False

        past_key_values_length = 0
        if use_cache:
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        # embed positions
        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = (
                next_decoder_cache.to_legacy_cache()
                if use_legacy_cache
                else next_decoder_cache
            )
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
    
    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            if attention_mask.max() != 0:
                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
            causal_mask = attention_mask
        else:
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask


class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = DeepseekV2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
    )
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM

        >>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        # logits = self.lm_head(hidden_states[:,-1:,:]).float()
        
        logits = self.lm_head(hidden_states).float() 

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        use_cache=True,
        **kwargs,
    ):
        past_length = 0
        # Omit tokens covered by past_key_values
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length()
                max_cache_length = (
                    torch.tensor(past_key_values.get_max_length(), device=input_ids.device)
                    if past_key_values.get_max_length() is not None
                    else None
                )
                cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length)
            # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_length == 0:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1]
        if cache_position is None:
            cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device)
        elif use_cache:
            cache_position = cache_position[-input_length:]

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(
                    past_state.index_select(0, beam_idx.to(past_state.device))
                    for past_state in layer_past
                ),
            )
        return reordered_past


@add_start_docstrings(
    """
    The DeepseekV2 Model transformer with a sequence classification head on top (linear layer).

    [`DeepseekV2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    DeepseekV2_START_DOCSTRING,
)
class DeepseekV2ForSequenceClassification(DeepseekV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = DeepseekV2Model(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, transformers.,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError(
                "Cannot handle batch sizes > 1 if no padding token is defined."
            )
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = (
                    torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                ).to(logits.device)
            else:
                sequence_lengths = -1

        pooled_logits = logits[
            torch.arange(batch_size, device=logits.device), sequence_lengths
        ]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(
                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
                )
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


================================================
FILE: kt-sft/ktransformers/models/modeling_deepseek_v3.py
================================================
# coding=utf-8
# Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch DeepSeek model."""
import math
import warnings
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.generation import GenerationMixin
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_attention_mask,
    _prepare_4d_causal_attention_mask,
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import (
    ALL_LAYERNORM_LAYERS,
    is_torch_greater_or_equal_than_1_13,
)
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from transformers.utils.import_utils import is_torch_fx_available
from .configuration_deepseek_v3 import DeepseekV3Config
import torch.distributed as dist
import numpy as np

from ktransformers.util.grad_wrapper import maybe_no_grad

if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa


# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
# It means that the function will not be traced through and simply appear as a node in the graph.
if is_torch_fx_available():
    if not is_torch_greater_or_equal_than_1_13:
        import torch.fx

    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "DeepseekV3Config"


def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(
        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)
    )
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


class DeepseekV3RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        DeepseekV3RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        self.hidden_size = hidden_size

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


ALL_LAYERNORM_LAYERS.append(DeepseekV3RMSNorm)


class DeepseekV3RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (
            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )
        self.max_seq_len_cached = None

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(t, self.inv_freq.to(t.device))
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )


# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV3
class DeepseekV3LinearScalingRotaryEmbedding(DeepseekV3RotaryEmbedding):
    """DeepseekV3RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        t = t / self.scaling_factor

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->DeepseekV3
class DeepseekV3DynamicNTKScalingRotaryEmbedding(DeepseekV3RotaryEmbedding):
    """DeepseekV3RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings)
                - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (
                base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


# Inverse dim formula to find dim based on number of rotations
def yarn_find_correction_dim(
    num_rotations, dim, base=10000, max_position_embeddings=2048
):
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
        2 * math.log(base)
    )


# Find dim range bounds based on rotations
def yarn_find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    low = math.floor(
        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
    )
    high = math.ceil(
        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
    )
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case


def yarn_get_mscale(scale=1, mscale=1):
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


def yarn_linear_ramp_mask(min, max, dim):
    if min == max:
        max += 0.001  # Prevent singularity

    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
    ramp_func = torch.clamp(linear_func, 0, 1)
    return ramp_func


class DeepseekV3YarnRotaryEmbedding(DeepseekV3RotaryEmbedding):

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.mscale_all_dim = mscale_all_dim
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        dim = self.dim

        freq_extra = 1.0 / (
            self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )
        freq_inter = 1.0 / (
            self.scaling_factor
            * self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            dim,
            self.base,
            self.original_max_position_embeddings,
        )
        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(
            device=device, dtype=torch.float32
        )
        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(seq_len, device=device, dtype=torch.float32)

        freqs = torch.outer(t, inv_freq)

        _mscale = float(
            yarn_get_mscale(self.scaling_factor, self.mscale)
            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
        )

        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer(
            "cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False
        )
        self.register_buffer(
            "sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False
        )


# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)

    b, h, s, d = q.shape
    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)

    b, h, s, d = k.shape
    k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


class DeepseekV3MLP(nn.Module):
    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
        self.intermediate_size = (
            config.intermediate_size if intermediate_size is None else intermediate_size
        )

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class MoEGate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.scoring_func = config.scoring_func
        self.topk_method = config.topk_method
        self.n_group = config.n_group
        self.topk_group = config.topk_group

        # topk selection algorithm
        self.norm_topk_prob = config.norm_topk_prob
        self.gating_dim = config.hidden_size
        self.weight = nn.Parameter(
            torch.empty((self.n_routed_experts, self.gating_dim))
        )
        if self.topk_method == "noaux_tc":
            self.e_score_correction_bias = nn.Parameter(
                torch.empty((self.n_routed_experts))
            )
        self.reset_parameters()

    def reset_parameters(self) -> None:
        import torch.nn.init as init

        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, hidden_states):
        bsz, seq_len, h = hidden_states.shape
        ### compute gating score
        hidden_states = hidden_states.view(-1, h)
        logits = F.linear(
            hidden_states.type(torch.float32), self.weight.type(torch.float32), None
        )
        if self.scoring_func == "sigmoid":
            scores = logits.sigmoid()
        else:
            raise NotImplementedError(
                f"insupportable scoring function for MoE gating: {self.scoring_func}"
            )

        ### select top-k experts
        if self.topk_method == "noaux_tc":
            #assert not self.training
            scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
            group_scores = (
                scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim = -1)
            )  # [n, n_group]
            group_idx = torch.topk(
                group_scores, k=self.topk_group, dim=-1, sorted=False
            )[
                1
            ]  # [n, top_k_group]
            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
            score_mask = (
                group_mask.unsqueeze(-1)
                .expand(
                    bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
                )
                .reshape(bsz * seq_len, -1)
            )  # [n, e]
            tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf"))  # [n, e]
            _, topk_idx = torch.topk(
                tmp_scores, k=self.top_k, dim=-1, sorted=False
            )
            topk_weight = scores.gather(1, topk_idx)
        else:
            raise NotImplementedError(
                f"insupportable TopK function for MoE gating: {self.topk_method}"
            )

        ### norm gate to sum 1
        if self.top_k > 1 and self.norm_topk_prob:
            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
            topk_weight = topk_weight / denominator
        topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor

        return topk_idx, topk_weight

class DeepseekV3MoE(nn.Module):
    """
    A mixed expert module containing shared experts.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_experts_per_tok = config.num_experts_per_tok

        if hasattr(config, "ep_size") and config.ep_size > 1:
            assert config.ep_size == dist.get_world_size()
            self.ep_size = config.ep_size
            self.experts_per_rank = config.n_routed_experts // config.ep_size
            self.ep_rank = dist.get_rank()
            self.experts = nn.ModuleList(
                [
                    (
                        DeepseekV3MLP(
                            config, intermediate_size=config.moe_intermediate_size
                        )
                        if i >= self.ep_rank * self.experts_per_rank
                        and i < (self.ep_rank + 1) * self.experts_per_rank
                        else None
                    )
                    for i in range(config.n_routed_experts)
                ]
            )
        else:
            self.ep_size = 1
            self.experts_per_rank = config.n_routed_experts
            self.ep_rank = 0
            self.experts = nn.ModuleList(
                [
                    DeepseekV3MLP(
                        config, intermediate_size=config.moe_intermediate_size
                    )
                    for i in range(config.n_routed_experts)
                ]
            )
        self.gate = MoEGate(config)
        if config.n_shared_experts is not None:
            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
            self.shared_experts = DeepseekV3MLP(
                config=config, intermediate_size=intermediate_size
            )

    def forward(self, hidden_states):
        identity = hidden_states
        orig_shape = hidden_states.shape
        topk_idx, topk_weight = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        flat_topk_idx = topk_idx.view(-1)
        if not self.training:
            y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
        if self.config.n_shared_experts is not None:
            y = y + self.shared_experts(identity)
        return y

    @maybe_no_grad()
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        sorted_tokens_shape = sorted_tokens.shape
        if self.ep_size > 1:
            tokens_per_ep_rank = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
            tokens_per_expert_group = tokens_per_expert.new_empty(
                tokens_per_expert.shape[0]
            )
            dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert)
            output_splits = (
                tokens_per_expert_group.view(self.ep_size, -1)
                .sum(1)
                .cpu()
                .numpy()
                .tolist()
            )
            gathered_tokens = sorted_tokens.new_empty(
                tokens_per_expert_group.sum(dim=0).cpu().item(), sorted_tokens.shape[1]
            )
            input_split_sizes = tokens_per_ep_rank.cpu().numpy().tolist()
            dist.all_to_all(
                list(gathered_tokens.split(output_splits)),
                list(sorted_tokens.split(input_split_sizes)),
            )
            tokens_per_expert_post_gather = tokens_per_expert_group.view(
                self.ep_size, self.experts_per_rank
            ).sum(dim=0)
            gatherd_idxs = np.zeros(shape=(gathered_tokens.shape[0],), dtype=np.int32)
            s = 0
            for i, k in enumerate(tokens_per_expert_group.cpu().numpy()):
                gatherd_idxs[s : s + k] = i % self.experts_per_rank
                s += k
            gatherd_idxs = gatherd_idxs.argsort()
            sorted_tokens = gathered_tokens[gatherd_idxs]
            tokens_per_expert = tokens_per_expert_post_gather
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
        if self.ep_size > 1:
            new_x = torch.empty_like(outs)
            new_x[gatherd_idxs] = outs
            gathered_tokens = new_x.new_empty(*sorted_tokens_shape)
            dist.all_to_all(
                list(gathered_tokens.split(input_split_sizes)),
                list(new_x.split(output_splits)),
            )
            outs = gathered_tokens

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out


# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV3
class DeepseekV3Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: DeepseekV3Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.q_lora_rank = config.q_lora_rank
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.kv_lora_rank = config.kv_lora_rank
        self.v_head_dim = config.v_head_dim
        self.qk_nope_head_dim = config.qk_nope_head_dim
        self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim

        self.is_causal = True

        if self.q_lora_rank is None:
            self.q_proj = nn.Linear(
                self.hidden_size, self.num_heads * self.q_head_dim, bias=False
            )
        else:
            self.q_a_proj = nn.Linear(
                self.hidden_size, config.q_lora_rank, bias=config.attention_bias
            )
            self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank)
            self.q_b_proj = nn.Linear(
                config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
            )

        self.kv_a_proj_with_mqa = nn.Linear(
            self.hidden_size,
            config.kv_lora_rank + config.qk_rope_head_dim,
            bias=config.attention_bias,
        )
        self.kv_a_layernorm = DeepseekV3RMSNorm(config.kv_lora_rank)
        self.kv_b_proj = nn.Linear(
            config.kv_lora_rank,
            self.num_heads
            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
            bias=False,
        )

        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            self.hidden_size,
            bias=config.attention_bias,
        )
        self._init_rope()

        self.softmax_scale = self.q_head_dim ** (-0.5)
        if self.config.rope_scaling is not None:
            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
            scaling_factor = self.config.rope_scaling["factor"]
            if mscale_all_dim:
                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
                self.softmax_scale = self.softmax_scale * mscale * mscale

    def _init_rope(self):
        if self.config.rope_scaling is None:
            self.rotary_emb = DeepseekV3RotaryEmbedding(
                self.qk_rope_head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = DeepseekV3LinearScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = DeepseekV3DynamicNTKScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "yarn":
                kwargs = {
                    key: self.config.rope_scaling[key]
                    for key in [
                        "original_max_position_embeddings",
                        "beta_fast",
                        "beta_slow",
                        "mscale",
                        "mscale_all_dim",
                    ]
                    if key in self.config.rope_scaling
                }
                self.rotary_emb = DeepseekV3YarnRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                    **kwargs,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)

        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        attn_weights = (
            torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale
        )

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )
        assert attention_mask is not None
        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.attention_dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV3
class DeepseekV3FlashAttention2(DeepseekV3Attention):
    """
    DeepseekV3 flash attention module. This module inherits from `DeepseekV3Attention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        # DeepseekV3FlashAttention2 attention does not support output_attentions
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

            # overwrite attention_mask with padding_mask
            attention_mask = kwargs.pop("padding_mask")

        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        # Flash attention requires the input to have the shape
        # batch_size x seq_length x head_dim x hidden_dim
        # therefore we just need to keep the original shape
        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]

        kv_seq_len = value_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)

        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe

        if self.q_head_dim != self.v_head_dim:
            value_states = F.pad(value_states, [0, self.q_head_dim - self.v_head_dim])

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (DeepseekV3RMSNorm handles it correctly)

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            # Handle the case where the model is quantized
            if hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            elif torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            else:
                target_dtype = (
                    self.q_proj.weight.dtype
                    if self.q_lora_rank is None
                    else self.q_a_proj.weight.dtype
                )

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = self._flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=dropout_rate,
            softmax_scale=self.softmax_scale,
        )
        if self.q_head_dim != self.v_head_dim:
            attn_output = attn_output[:, :, :, : self.v_head_dim]

        attn_output = attn_output.reshape(
            bsz, q_len, self.num_heads * self.v_head_dim
        ).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        dropout=0.0,
        softmax_scale=None,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`int`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV3FlashAttention2 __init__.
            causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            (
                query_states,
                key_states,
                value_states,
                indices_q,
                cu_seq_lens,
                max_seq_lens,
            ) = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            attn_output = pad_input(
                attn_output_unpad, indices_q, batch_size, query_length
            )
        else:
            attn_output = flash_attn_func(
                query_states,
                key_states,
                value_states,
                dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

        return attn_output

    def _upad_input(
        self, query_layer, key_layer, value_layer, attention_mask, query_length
    ):
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
                indices_k,
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
                query_layer, attention_mask
            )

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )


ATTENTION_CLASSES = {
    "eager": DeepseekV3Attention,
    "flash_attention_2": DeepseekV3FlashAttention2,
}


class DeepseekV3DecoderLayer(nn.Module):
    def __init__(self, config: DeepseekV3Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = ATTENTION_CLASSES[config._attn_implementation](
            config=config, layer_idx=layer_idx
        )

        self.mlp = (
            DeepseekV3MoE(config)
            if (
                config.n_routed_experts is not None
                and layer_idx >= config.first_k_dense_replace
                and layer_idx % config.moe_layer_freq == 0
            )
            else DeepseekV3MLP(config)
        )
        self.input_layernorm = DeepseekV3RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = DeepseekV3RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


DeepseekV3_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`DeepseekV3Config`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
    DeepseekV3_START_DOCSTRING,
)
class DeepseekV3PreTrainedModel(PreTrainedModel):
    config_class = DeepseekV3Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["DeepseekV3DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_cache_class = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


DeepseekV3_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
    DeepseekV3_START_DOCSTRING,
)
class DeepseekV3Model(DeepseekV3PreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV3DecoderLayer`]

    Args:
        config: DeepseekV3Config
    """

    def __init__(self, config: DeepseekV3Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                DeepseekV3DecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.norm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        past_key_values_length = 0
        if use_cache:
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length,
                seq_length + past_key_values_length,
                dtype=torch.long,
                device=device,
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self._use_flash_attention_2:
            # 2d mask is passed through the layers
            attention_mask = (
                attention_mask
                if (attention_mask is not None and 0 in attention_mask)
                else None
            )
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
            )

        # embed positions
        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = (
                next_decoder_cache.to_legacy_cache()
                if use_legacy_cache
                else next_decoder_cache
            )
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            if attention_mask.max() != 0:
                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
            causal_mask = attention_mask
        else:
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = DeepseekV3Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
    )
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, DeepseekV3ForCausalLM

        >>> model = DeepseekV3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        # logits = self.lm_head(hidden_states[:,-1:,:])
        logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
                max_cache_length = past_key_values.get_max_length()
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if (
                attention_mask is not None
                and attention_mask.shape[1] > input_ids.shape[1]
            ):
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(
                    past_state.index_select(0, beam_idx.to(past_state.device))
                    for past_state in layer_past
                ),
            )
        return reordered_past


@add_start_docstrings(
    """
    The DeepseekV3 Model transformer with a sequence classification head on top (linear layer).

    [`DeepseekV3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    DeepseekV3_START_DOCSTRING,
)
class DeepseekV3ForSequenceClassification(DeepseekV3PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = DeepseekV3Model(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, transformers.,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError(
                "Cannot handle batch sizes > 1 if no padding token is defined."
            )
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = (
                    torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                ).to(logits.device)
            else:
                sequence_lengths = -1

        pooled_logits = logits[
            torch.arange(batch_size, device=logits.device), sequence_lengths
        ]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(
                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
                )
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


================================================
FILE: kt-sft/ktransformers/models/modeling_llama.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
from transformers.modeling_flash_attention_utils import _flash_attention_forward
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from .configuration_llama import LlamaConfig

from ktransformers.util.grad_wrapper import maybe_no_grad

logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "LlamaConfig"


class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)


class LlamaRotaryEmbedding(nn.Module):
    def __init__(
        self,
        dim=None,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        rope_type="default",
        config: Optional[LlamaConfig] = None,
    ):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.device = device
        self.scaling_factor = scaling_factor
        self.rope_type = rope_type
        self.config = config
        # TODO (joao): remove the `if` below, only used for BC
        self.rope_kwargs = {}
        if config is None:
            logger.warning_once(
                "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the "
                "`config` argument. All other arguments will be removed in v4.45"
            )
            self.rope_kwargs = {
                "rope_type": rope_type,
                "factor": scaling_factor,
                "dim": dim,
                "base": base,
                "max_position_embeddings": max_position_embeddings,
            }
            self.rope_type = rope_type
            self.max_seq_len_cached = max_position_embeddings
            self.original_max_seq_len = max_position_embeddings
        else:
            # BC: "rope_type" was originally "type"
            if config.rope_scaling is not None:
                self.rope_type = config.rope_scaling.get(
                    "rope_type", config.rope_scaling.get("type")
                )
            else:
                self.rope_type = "default"
            self.max_seq_len_cached = config.max_position_embeddings
            self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(
            self.config, device, **self.rope_kwargs
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = torch.max(position_ids) + 1
        # seq_len = position_ids[0, -1] + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, device, seq_len=seq_len, **self.rope_kwargs
            )
            self.register_buffer(
                "inv_freq", inv_freq, persistent=False
            )  # TODO joao: may break with compilation
            self.max_seq_len_cached = seq_len

        if (
            seq_len < self.original_max_seq_len
            and self.max_seq_len_cached > self.original_max_seq_len
        ):  # reset
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @maybe_no_grad()
    def forward(self, x, position_ids):
        # if "dynamic" in self.rope_type:
        #     self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block
        inv_freq_expanded = (
            self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        )
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        device_type = (
            device_type
            if isinstance(device_type, str) and device_type != "mps"
            else "cpu"
        )
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (
                inv_freq_expanded.float() @ position_ids_expanded.float()
            ).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
    """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(self, *args, **kwargs):
        logger.warning_once(
            "`LlamaLinearScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use "
            "`LlamaRotaryEmbedding`, which now also does linear scaling (simply pass the model config to __init__)."
        )
        kwargs["rope_type"] = "linear"
        super().__init__(*args, **kwargs)


class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
    """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(self, *args, **kwargs):
        logger.warning_once(
            "`LlamaDynamicNTKScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use "
            "`LlamaRotaryEmbedding`, which now also does dynamic ntk scaling (simply pass the model config to "
            "__init__)."
        )
        kwargs["rope_type"] = "dynamic"
        super().__init__(*args, **kwargs)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


class LlamaMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(
            self.hidden_size, self.intermediate_size, bias=config.mlp_bias
        )
        self.up_proj = nn.Linear(
            self.hidden_size, self.intermediate_size, bias=config.mlp_bias
        )
        self.down_proj = nn.Linear(
            self.intermediate_size, self.hidden_size, bias=config.mlp_bias
        )
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        if self.config.pretraining_tp > 1:
            slice = self.intermediate_size // self.config.pretraining_tp
            gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
            up_proj_slices = self.up_proj.weight.split(slice, dim=0)
            down_proj_slices = self.down_proj.weight.split(slice, dim=1)

            gate_proj = torch.cat(
                [
                    F.linear(x, gate_proj_slices[i])
                    for i in range(self.config.pretraining_tp)
                ],
                dim=-1,
            )
            up_proj = torch.cat(
                [
                    F.linear(x, up_proj_slices[i])
                    for i in range(self.config.pretraining_tp)
                ],
                dim=-1,
            )

            intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
            down_proj = [
                F.linear(intermediate_states[i], down_proj_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            down_proj = sum(down_proj)
        else:
            down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

        return down_proj


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


class LlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        self.q_proj = nn.Linear(
            self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            self.hidden_size,
            self.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.v_proj = nn.Linear(
            self.hidden_size,
            self.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.o_proj = nn.Linear(
            self.hidden_size, self.hidden_size, bias=config.attention_bias
        )

        # TODO (joao): remove in v4.45 (RoPE is computed in the model, not in the decoder layers)
        self.rotary_emb = LlamaRotaryEmbedding(config=self.config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            Tuple[torch.Tensor, torch.Tensor]
        ] = None,  # will become mandatory in v4.45
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        if self.config.pretraining_tp > 1:
            key_value_slicing = (
                self.num_key_value_heads * self.head_dim
            ) // self.config.pretraining_tp
            query_slices = self.q_proj.weight.split(
                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
            )
            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

            query_states = [
                F.linear(hidden_states, query_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            query_states = torch.cat(query_states, dim=-1)

            key_states = [
                F.linear(hidden_states, key_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            key_states = torch.cat(key_states, dim=-1)

            value_states = [
                F.linear(hidden_states, value_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            value_states = torch.cat(value_states, dim=-1)

        else:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        query_states = query_states.view(
            bsz, q_len, self.num_heads, self.head_dim
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
        ) / math.sqrt(self.head_dim)

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.attention_dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, -1)

        if self.config.pretraining_tp > 1:
            attn_output = attn_output.split(
                self.hidden_size // self.config.pretraining_tp, dim=2
            )
            o_proj_slices = self.o_proj.weight.split(
                self.hidden_size // self.config.pretraining_tp, dim=1
            )
            attn_output = sum(
                [
                    F.linear(attn_output[i], o_proj_slices[i])
                    for i in range(self.config.pretraining_tp)
                ]
            )
        else:
            attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


class LlamaFlashAttention2(LlamaAttention):
    """
    Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            Tuple[torch.Tensor, torch.Tensor]
        ] = None,  # will become mandatory in v4.45
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if isinstance(past_key_value, StaticCache):
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
            )

        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # Flash attention requires the input to have the shape
        # batch_size x seq_length x head_dim x hidden_dim
        # therefore we just need to keep the original shape
        query_states = query_states.view(
            bsz, q_len, self.num_heads, self.head_dim
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (LlamaRMSNorm handles it correctly)

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            is_causal=self.is_causal,
        )

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


class LlamaSdpaAttention(LlamaAttention):
    """
    Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `LlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    # Adapted from LlamaAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            Tuple[torch.Tensor, torch.Tensor]
        ] = None,  # will become mandatory in v4.45
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(
            bsz, q_len, self.num_heads, self.head_dim
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and causal_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value


LLAMA_ATTENTION_CLASSES = {
    "eager": LlamaAttention,
    "flash_attention_2": LlamaFlashAttention2,
    "sdpa": LlamaSdpaAttention,
}


class LlamaDecoderLayer(nn.Module):
    def __init__(self, config: LlamaConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](
            config=config, layer_idx=layer_idx
        )

        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            Tuple[torch.Tensor, torch.Tensor]
        ] = None,  # will become mandatory in v4.45
        **kwargs,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


LLAMA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`LlamaConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaPreTrainedModel(PreTrainedModel):
    config_class = LlamaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlamaDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


LLAMA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaModel(LlamaPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]

    Args:
        config: LlamaConfig
    """

    def __init__(self, config: LlamaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                LlamaDecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = LlamaRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        return_legacy_cache = False
        if (
            use_cache and not isinstance(past_key_values, Cache) and not self.training
        ):  # kept for BC (non `Cache` `past_key_values` inputs)
            return_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
            )

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask,
            inputs_embeds,
            cache_position,
            past_key_values,
            output_attentions,
        )
        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = (
            past_key_values.get_seq_length() if past_key_values is not None else 0
        )
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and not using_static_cache
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            if attention_mask.max() != 0:
                raise ValueError(
                    "Custom 4D attention mask should be passed in inverted form with max==0`"
                )
            causal_mask = attention_mask
        else:
            causal_mask = torch.full(
                (sequence_length, target_length),
                fill_value=min_dtype,
                dtype=dtype,
                device=device,
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(
                target_length, device=device
            ) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(
                input_tensor.shape[0], 1, -1, -1
            )
            if attention_mask is not None:
                causal_mask = (
                    causal_mask.clone()
                )  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = (
                    causal_mask[:, :, :, :mask_length]
                    + attention_mask[:, None, None, :]
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[
                    :, :, :, :mask_length
                ].masked_fill(padding_mask, min_dtype)
        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(
                causal_mask, min_dtype
            )

        return causal_mask


class LlamaForCausalLM(LlamaPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
    )
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LlamaForCausalLM

        >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            lm_head_slices = self.lm_head.weight.split(
                self.vocab_size // self.config.pretraining_tp, dim=0
            )
            logits = [
                F.linear(hidden_states, lm_head_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            logits = torch.cat(logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        # logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        if past_key_values is not None:
            if inputs_embeds is not None:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif (
                input_ids.shape[1] != cache_position.shape[0]
            ):  # Default case (the "else", a no op, is Exception 2)
                input_ids = input_ids[:, cache_position]

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and cache_position[0] == 0:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {
                "input_ids": input_ids.contiguous()
            }  # `contiguous()` needed for compilation use cases

        model_inputs.update(
            {
                "position_ids": position_ids,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
            }
        )
        return model_inputs


@add_start_docstrings(
    """
    The LLaMa Model transformer with a sequence classification head on top (linear layer).

    [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    LLAMA_START_DOCSTRING,
)
class LlamaForSequenceClassification(LlamaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = LlamaModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError(
                "Cannot handle batch sizes > 1 if no padding token is defined."
            )
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
                sequence_lengths = (
                    torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                )
                sequence_lengths = sequence_lengths % input_ids.shape[-1]
                sequence_lengths = sequence_lengths.to(logits.device)
            else:
                sequence_lengths = -1

        pooled_logits = logits[
            torch.arange(batch_size, device=logits.device), sequence_lengths
        ]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(
                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
                )
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
The Llama Model transformer with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    LLAMA_START_DOCSTRING,
)
class LlamaForQuestionAnswering(LlamaPreTrainedModel):
    base_model_prefix = "transformer"

    # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Llama
    def __init__(self, config):
        super().__init__(config)
        self.transformer = LlamaModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.transformer.embed_tokens

    def set_input_embeddings(self, value):
        self.transformer.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1).to(start_logits.device)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1).to(end_logits.device)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    The Llama Model transformer with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    LLAMA_START_DOCSTRING,
)
class LlamaForTokenClassification(LlamaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = LlamaModel(config)
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


================================================
FILE: kt-sft/ktransformers/models/modeling_mixtral.py
================================================
# coding=utf-8
'''
Description  : 
Author       : kkk1nak0
Date         : 2024-07-29 02:58:57
Version      : 1.0.0
LastEditors  : kkk1nak0
LastEditTime : 2024-08-02 06:08:34
'''

# Adapted from 
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/mixtral/modeling_mixtral.py
# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Mixtral model."""

import inspect 
import math
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_causal_attention_mask,
)
from transformers.modeling_outputs import (
    MoeCausalLMOutputWithPast,
    MoeModelOutputWithPast,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_13
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    logging,
    replace_return_docstrings,
)
from transformers.utils.import_utils import is_torch_fx_available
from transformers.models.mixtral.configuration_mixtral import MixtralConfig

from ktransformers.util.grad_wrapper import maybe_no_grad

if is_flash_attn_2_available():
    from flash_attn import flash_attn_varlen_func, flash_attn_func, flash_attn_with_kvcache
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)

# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
# It means that the function will not be traced through and simply appear as a node in the graph.
if is_torch_fx_available():
    if not is_torch_greater_or_equal_than_1_13:
        import torch.fx

    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "MixtralConfig"


def load_balancing_loss_func(
    gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2, attention_mask: Optional[torch.Tensor] = None
) -> float:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        attention_mask (`torch.Tensor`, None):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.
        num_experts (`int`, *optional*):
            Number of experts

    Returns:
        The auxiliary loss.
    """
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0

    if isinstance(gate_logits, tuple):
        compute_device = gate_logits[0].device
        concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)

    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)

    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)

        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
        expert_attention_mask = (
            attention_mask[None, :, :, None, None]
            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
            .reshape(-1, top_k, num_experts)
            .to(compute_device)
        )

        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
            expert_attention_mask, dim=0
        )

        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(compute_device)
        )

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
            router_per_expert_attention_mask, dim=0
        )

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts


# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mixtral
class MixtralRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        MixtralRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


# copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Mixtral
# TODO @longjie no longer copied from Mistral after static cache
class MixtralRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self.max_seq_len_cached = max_position_embeddings

    @maybe_no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


# copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
# TODO @longjie no longer copied from Mistral after static cache
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


# copied from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->Mixtral
# TODO @longjie no longer copied from Mistral after static cache
class MixtralAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".
    """

    def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.rotary_emb = MixtralRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


# copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Mixtral
# TODO @longjie no longer copied from Mistral after static cache
class MixtralFlashAttention2(MixtralAttention):
    """
    Mixtral flash attention module. This module inherits from `MixtralAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ):
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(value_states, position_ids)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        use_sliding_windows = (
            _flash_supports_window_size
            and getattr(self.config, "sliding_window", None) is not None
            and kv_seq_len > self.config.sliding_window
            and self.config.use_sliding_window
        )

        if not _flash_supports_window_size:
            logger.warning_once(
                "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
                " make sure to upgrade flash-attn library."
            )

        if past_key_value is not None:
            # Activate slicing cache only if the config has a value `sliding_windows` attribute
            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
            if (
                getattr(self.config, "sliding_window", None) is not None
                and kv_seq_len > self.config.sliding_window
                and cache_has_contents
            ):
                slicing_tokens = 1 - self.config.sliding_window

                past_key = past_key_value[self.layer_idx][0]
                past_value = past_key_value[self.layer_idx][1]

                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
                past_value = past_value[:, :, slicing_tokens:, :].contiguous()

                if past_key.shape[-2] != self.config.sliding_window - 1:
                    raise ValueError(
                        f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
                        f" {past_key.shape}"
                    )

                if attention_mask is not None:
                    attention_mask = attention_mask[:, slicing_tokens:]
                    attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)

            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

            # we slice the states for static kv cache to be supported in FA2. Not sure it's a must as compile fails
            # for bsz == 1, avoid using slice to capture cuda graph
            if cache_position is not None and q_len > 1:
                key_states = key_states[:, :, : cache_position[-1] + 1, :]
                value_states = value_states[:, :, : cache_position[-1] + 1, :]

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in float16 just to be sure everything works as expected.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reashape to the expected shape for Flash Attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attn_output = self._flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=getattr(self.config, "sliding_window", None),
            is_causal=self.is_causal,
        )

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
    

    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        q_len,
        position_ids,
        dropout,
        sliding_window,
        is_causal,
        softmax_scale=None,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`float`):
                Attention dropout
            
        """
        
        # Decide whether to use SWA or not by layer index.
        # if use_sliding_windows and self.layer_idx >= self.config.max_window_layers:
        #     use_sliding_windows = False
        use_sliding_windows = False

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, q_len
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            if not use_sliding_windows:
                attn_output_unpad = flash_attn_varlen_func(
                    query_states,
                    key_states,
                    value_states,
                    cu_seqlens_q=cu_seqlens_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_q=max_seqlen_in_batch_q,
                    max_seqlen_k=max_seqlen_in_batch_k,
                    dropout_p=dropout,
                    softmax_scale=softmax_scale,
                    causal=is_causal,
                )
            else:
                attn_output_unpad = flash_attn_varlen_func(
                    query_states,
                    key_states,
                    value_states,
                    cu_seqlens_q=cu_seqlens_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_q=max_seqlen_in_batch_q,
                    max_seqlen_k=max_seqlen_in_batch_k,
                    dropout_p=dropout,
                    softmax_scale=softmax_scale,
                    causal=is_causal,
                    window_size=(self.config.sliding_window, self.config.sliding_window),
                )

            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, q_len)
        else:
            if not use_sliding_windows:
                if q_len == 1:
                    position_ids = position_ids.to(dtype=torch.int32).squeeze(1)
                    attn_output = flash_attn_with_kvcache(
                        query_states,
                        key_states,
                        value_states,
                        cache_seqlens=position_ids,
                        softmax_scale=softmax_scale,
                        causal=is_causal,
                    )   
                else:
                    attn_output = flash_attn_func(
                        query_states,
                        key_states,
                        value_states,
                        dropout,
                        softmax_scale=softmax_scale,
                        causal=is_causal,
                    )
            else:
                attn_output = flash_attn_func(
                    query_states,
                    key_states,
                    value_states,
                    dropout,
                    softmax_scale=softmax_scale,
                    causal=is_causal,
                    window_size=(self.config.sliding_window, self.config.sliding_window),
                )

        return attn_output

    # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape

        # On the first iteration we need to properly re-create the padding mask
        # by slicing it on the proper place
        if kv_seq_len != attention_mask.shape[-1]:
            attention_mask_num_tokens = attention_mask.shape[-1]
            attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]

        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)

        key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
        value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)

        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )


# copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Mixtral
# TODO @longjie no longer copied from Mistral after static cache
class MixtralSdpaAttention(MixtralAttention):
    """
    Mixtral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `MixtralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    # Adapted from MixtralAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "MixtralModel is using MixtralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, position_ids)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value


MIXTRAL_ATTENTION_CLASSES = {
    "eager": MixtralAttention,
    "flash_attention_2": MixtralFlashAttention2,
    "sdpa": MixtralSdpaAttention,
}


class MixtralBlockSparseTop2MLP(nn.Module):
    def __init__(self, config: MixtralConfig):
        super().__init__()
        self.ffn_dim = config.intermediate_size
        self.hidden_dim = config.hidden_size

        self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)  # gate
        self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)  # down
        self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)  # up

        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states):
        current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
        current_hidden_states = self.w2(current_hidden_states)
        return current_hidden_states


class MixtralSparseMoeBlock(nn.Module):
    """
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accomodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_dim = config.hidden_size
        self.ffn_dim = config.intermediate_size
        self.num_experts = config.num_local_experts
        self.top_k = config.num_experts_per_tok

        # gating
        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)

        self.experts = nn.ModuleList([MixtralBlockSparseTop2MLP(config) for _ in range(self.num_experts)])

        # Jitter parameters
        self.jitter_noise = config.router_jitter_noise

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        if self.training and self.jitter_noise > 0:
            hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise)
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits


class MixtralDecoderLayer(nn.Module):
    def __init__(self, config: MixtralConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = MIXTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

        self.block_sparse_moe = MixtralSparseMoeBlock(config)
        self.input_layernorm = MixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = MixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        """

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states, router_logits = self.block_sparse_moe(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


MIXTRAL_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MixtralConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare Mixtral Model outputting raw hidden-states without any specific head on top.",
    MIXTRAL_START_DOCSTRING,
)
# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2PreTrainedModel with Qwen2->Mixtral
class MixtralPreTrainedModel(PreTrainedModel):
    config_class = MixtralConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MixtralDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


MIXTRAL_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        output_router_logits (`bool`, *optional*):
            Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
            should not be returned during inference.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare Mixtral Model outputting raw hidden-states without any specific head on top.",
    MIXTRAL_START_DOCSTRING,
)
# copied from transformers.models.mistral.modeling_mistral.MistralModel with MISTRAL->MIXTRAL,Mistral->Mixtral
# TODO @longjie no longer copied from Mistral after static cache
class MixtralModel(MixtralPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MixtralDecoderLayer`]

    Args:
        config: MixtralConfig
    """

    def __init__(self, config: MixtralConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [MixtralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self._attn_implementation = config._attn_implementation
        self.norm = MixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    # Ignore copy
    @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, MoeModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        use_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
            use_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    output_router_logits,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    output_router_logits=output_router_logits,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
                if v is not None
            )
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )

    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            if attention_mask.max() != 0:
                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
            causal_mask = attention_mask
        else:
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask


class MixtralForCausalLM(MixtralPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = MixtralModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_local_experts
        self.num_experts_per_tok = config.num_experts_per_tok
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    # Ignore copy
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MixtralForCausalLM

        >>> model = MixtralForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_router_logits=output_router_logits,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits if return_dict else outputs[-1],
                self.num_experts,
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device

        if not return_dict:
            output = (logits,) + outputs[1:]
            if output_router_logits:
                output = (aux_loss,) + output
            return (loss,) + output if loss is not None else output

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        output_router_logits=False,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        if past_key_values is not None:
            if inputs_embeds is not None:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
                input_ids = input_ids[:, cache_position]

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and cache_position[0] == 0:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases

        model_inputs.update(
            {
                "position_ids": position_ids,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "output_router_logits": output_router_logits,
            }
        )
        return model_inputs


@add_start_docstrings(
    """
    The Mixtral Model transformer with a sequence classification head on top (linear layer).

    [`MixtralForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    MIXTRAL_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mixtral, LLAMA->MIXTRAL
class MixtralForSequenceClassification(MixtralPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = MixtralModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                sequence_lengths = sequence_lengths % input_ids.shape[-1]
                sequence_lengths = sequence_lengths.to(logits.device)
            else:
                sequence_lengths = -1

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
    The Mixtral Model transformer with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    MIXTRAL_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Mixtral, LLAMA->MIXTRAL
class MixtralForTokenClassification(MixtralPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = MixtralModel(config)
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

================================================
FILE: kt-sft/ktransformers/models/modeling_qwen2_moe.py
================================================
# coding=utf-8
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
''' 
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.42.3/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
# 
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Qwen2MoE model."""

import inspect
import math
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
)
from transformers.modeling_outputs import (
    MoeCausalLMOutputWithPast,
    MoeModelOutputWithPast,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig

from ktransformers.util.grad_wrapper import maybe_no_grad

if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func, flash_attn_with_kvcache
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)

logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Qwen/Qwen1.5-MoE-A2.7B"
_CONFIG_FOR_DOC = "Qwen2MoeConfig"


# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
def load_balancing_loss_func(
    gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2, attention_mask: Optional[torch.Tensor] = None
) -> float:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        attention_mask (`torch.Tensor`, None):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.
        num_experts (`int`, *optional*):
            Number of experts

    Returns:
        The auxiliary loss.
    """
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0

    if isinstance(gate_logits, tuple):
        compute_device = gate_logits[0].device
        concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)

    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)

    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)

        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
        expert_attention_mask = (
            attention_mask[None, :, :, None, None]
            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
            .reshape(-1, top_k, num_experts)
            .to(compute_device)
        )

        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
            expert_attention_mask, dim=0
        )

        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(compute_device)
        )

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
            router_per_expert_attention_mask, dim=0
        )

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts


# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2Moe
class Qwen2MoeRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Qwen2MoeRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Qwen2Moe
class Qwen2MoeRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        super().__init__()
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # For BC we register cos and sin cached
        self.max_seq_len_cached = max_position_embeddings

    @maybe_no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


# Modified from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2Moe
class Qwen2MoeMLP(nn.Module):
    def __init__(self, config, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))


# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2Attention with Qwen2->Qwen2Moe
class Qwen2MoeAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".
    """

    def __init__(self, config: Qwen2MoeConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.rotary_emb = Qwen2MoeRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2FlashAttention2 with Qwen2->Qwen2Moe
class Qwen2MoeFlashAttention2(Qwen2MoeAttention):
    """
    Qwen2Moe flash attention module, following Qwen2Moe attention module. This module inherits from `Qwen2MoeAttention`
    as the weights of the module stays untouched. The only required change would be on the forward pass
    where it needs to correctly call the public API of flash attention and deal with padding tokens
    in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
    config.max_window_layers layers.
    """

    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ):
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        use_sliding_windows = (
            _flash_supports_window_size
            and getattr(self.config, "sliding_window", None) is not None
            and kv_seq_len > self.config.sliding_window
            and self.config.use_sliding_window
        )

        if not _flash_supports_window_size:
            logger.warning_once(
                "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
                " make sure to upgrade flash-attn library."
            )

        if past_key_value is not None:
            # Activate slicing cache only if the config has a value `sliding_windows` attribute
            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
            if (
                getattr(self.config, "sliding_window", None) is not None
                and kv_seq_len > self.config.sliding_window
                and cache_has_contents
            ):
                slicing_tokens = 1 - self.config.sliding_window

                past_key = past_key_value[self.layer_idx][0]
                past_value = past_key_value[self.layer_idx][1]

                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
                past_value = past_value[:, :, slicing_tokens:, :].contiguous()

                if past_key.shape[-2] != self.config.sliding_window - 1:
                    raise ValueError(
                        f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
                        f" {past_key.shape}"
                    )

                if attention_mask is not None:
                    attention_mask = attention_mask[:, slicing_tokens:]
                    attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)

            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
            # we slice the states for static kv cache to be supported in FA2. Not sure it's a must as compile fails
            # for bsz == 1, avoid using slice to capture cuda graph
            if cache_position is not None and q_len > 1:
                key_states = key_states[:, :, : cache_position[-1] + 1, :]
                value_states = value_states[:, :, : cache_position[-1] + 1, :]

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in float16 just to be sure everything works as expected.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reashape to the expected shape for Flash Attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attn_output = self._flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            use_sliding_windows=use_sliding_windows,
        )

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        position_ids,
        dropout=0.0,
        softmax_scale=None,
        use_sliding_windows=False,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`float`):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
            use_sliding_windows (`bool`, *optional*):
                Whether to activate sliding window attention.
        """
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
            causal = self.is_causal and query_length != 1

        # Decide whether to use SWA or not by layer index.
        if use_sliding_windows and self.layer_idx >= self.config.max_window_layers:
            use_sliding_windows = False

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            if not use_sliding_windows:
                attn_output_unpad = flash_attn_varlen_func(
                    query_states,
                    key_states,
                    value_states,
                    cu_seqlens_q=cu_seqlens_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_q=max_seqlen_in_batch_q,
                    max_seqlen_k=max_seqlen_in_batch_k,
                    dropout_p=dropout,
                    softmax_scale=softmax_scale,
                    causal=causal,
                )
            else:
                attn_output_unpad = flash_attn_varlen_func(
                    query_states,
                    key_states,
                    value_states,
                    cu_seqlens_q=cu_seqlens_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_q=max_seqlen_in_batch_q,
                    max_seqlen_k=max_seqlen_in_batch_k,
                    dropout_p=dropout,
                    softmax_scale=softmax_scale,
                    causal=causal,
                    window_size=(self.config.sliding_window, self.config.sliding_window),
                )

            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            if not use_sliding_windows:
                if query_length == 1:
                    position_ids = position_ids.to(dtype=torch.int32).squeeze(1)
                    attn_output = flash_attn_with_kvcache(
                        query_states,
                        key_states,
                        value_states,
                        cache_seqlens=position_ids,
                        softmax_scale=softmax_scale,
                        causal=causal,
                    )   
                else:
                    attn_output = flash_attn_func(
                        query_states,
                        key_states,
                        value_states,
                        dropout,
                        softmax_scale=softmax_scale,
                        causal=causal,
                    )
            else:
                attn_output = flash_attn_func(
                    query_states,
                    key_states,
                    value_states,
                    dropout,
                    softmax_scale=softmax_scale,
                    causal=causal,
                    window_size=(self.config.sliding_window, self.config.sliding_window),
                )

        return attn_output

    # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape

        # On the first iteration we need to properly re-create the padding mask
        # by slicing it on the proper place
        if kv_seq_len != attention_mask.shape[-1]:
            attention_mask_num_tokens = attention_mask.shape[-1]
            attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]

        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)

        key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
        value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)

        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )


# Copied from transformers.models.mixtral.modeling_mixtral.MixtralSdpaAttention with Mixtral->Qwen2Moe
class Qwen2MoeSdpaAttention(Qwen2MoeAttention):
    """
    Qwen2Moe attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `Qwen2MoeAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    # Adapted from Qwen2MoeAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "Qwen2MoeModel is using Qwen2MoeSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value


QWEN2MOE_ATTENTION_CLASSES = {
    "eager": Qwen2MoeAttention,
    "flash_attention_2": Qwen2MoeFlashAttention2,
    "sdpa": Qwen2MoeSdpaAttention,
}


class Qwen2MoeSparseMoeBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_experts = config.num_experts
        self.top_k = config.num_experts_per_tok
        self.norm_topk_prob = config.norm_topk_prob

        # gating
        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
        self.experts = nn.ModuleList(
            [Qwen2MoeMLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(self.num_experts)]
        )

        self.shared_expert = Qwen2MoeMLP(config, intermediate_size=config.shared_expert_intermediate_size)
        self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        if self.norm_topk_prob:
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))

        shared_expert_output = self.shared_expert(hidden_states)
        shared_expert_output = F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output

        final_hidden_states = final_hidden_states + shared_expert_output

        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits


class Qwen2MoeDecoderLayer(nn.Module):
    def __init__(self, config: Qwen2MoeConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = QWEN2MOE_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

        if (layer_idx not in config.mlp_only_layers) and (
            config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
        ):
            self.mlp = Qwen2MoeSparseMoeBlock(config)
        else:
            self.mlp = Qwen2MoeMLP(config, intermediate_size=config.intermediate_size)

        self.input_layernorm = Qwen2MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen2MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss,
                and should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        """

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)

        hidden_states = self.mlp(hidden_states)
        if isinstance(hidden_states, tuple):
            hidden_states, router_logits = hidden_states
        else:
            router_logits = None

        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


QWEN2MOE_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`Qwen2MoeConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
    QWEN2MOE_START_DOCSTRING,
)
class Qwen2MoePreTrainedModel(PreTrainedModel):
    config_class = Qwen2MoeConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen2MoeDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


QWEN2MOE_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        output_router_logits (`bool`, *optional*):
            Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
            should not be returned during inference.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
    QWEN2MOE_START_DOCSTRING,
)
class Qwen2MoeModel(Qwen2MoePreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`]

    Args:
        config: Qwen2MoeConfig
    """

    def __init__(self, config: Qwen2MoeConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Qwen2MoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self._attn_implementation = config._attn_implementation
        self.norm = Qwen2MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, MoeModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        use_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            use_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    output_router_logits,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    output_router_logits=output_router_logits,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits and layer_outputs[-1] is not None:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
                if v is not None
            )
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )

    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            if attention_mask.max() != 0:
                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
            causal_mask = attention_mask
        else:
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask


class Qwen2MoeForCausalLM(Qwen2MoePreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen2MoeModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_experts
        self.num_experts_per_tok = config.num_experts_per_tok
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen2MoeForCausalLM

        >>> model = Qwen2MoeForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_router_logits=output_router_logits,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits if return_dict else outputs[-1],
                self.num_experts,
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device

        if not return_dict:
            output = (logits,) + outputs[1:]
            if output_router_logits:
                output = (aux_loss,) + output
            return (loss,) + output if loss is not None else output

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        use_cache=True,
        **kwargs,
    ):
        past_length = 0
        # Omit tokens covered by past_key_values
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length()
                max_cache_length = (
                    torch.tensor(past_key_values.get_max_length(), device=input_ids.device)
                    if past_key_values.get_max_length() is not None
                    else None
                )
                cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length)
            # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_length == 0:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1]
        if cache_position is None:
            cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device)
        elif use_cache:
            cache_position = cache_position[-input_length:]

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


@add_start_docstrings(
    """
    The Qwen2MoE Model transformer with a sequence classification head on top (linear layer).

    [`Qwen2MoeForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    QWEN2MOE_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Qwen2Moe, LLAMA->QWEN2MOE
class Qwen2MoeForSequenceClassification(Qwen2MoePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen2MoeModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                sequence_lengths = sequence_lengths % input_ids.shape[-1]
                sequence_lengths = sequence_lengths.to(logits.device)
            else:
                sequence_lengths = -1

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
    The Qwen2MoE Model transformer with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    QWEN2MOE_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Qwen2Moe, LLAMA->QWEN2MOE
class Qwen2MoeForTokenClassification(Qwen2MoePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen2MoeModel(config)
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


================================================
FILE: kt-sft/ktransformers/models/modeling_qwen3_moe.py
================================================
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/qwen3_moe/modular_qwen3_moe.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_qwen3_moe.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
from transformers.generation import GenerationMixin
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
# from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    MoeCausalLMOutputWithPast,
    MoeModelOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
# from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.modeling_utils import PreTrainedModel
# from transformers.processing_utils import Unpack
from transformers.utils import (
    # LossKwargs,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from transformers.utils.deprecation import deprecate_kwarg
from .configuration_qwen3_moe import Qwen3MoeConfig

from ktransformers.util.grad_wrapper import maybe_no_grad
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeRotaryEmbedding

logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Qwen/Qwen3-MoE-15B-A2B"
_CONFIG_FOR_DOC = "Qwen3MoeConfig"


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Qwen3MoeAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Qwen3MoeConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.num_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        self.q_norm = Qwen3MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)  # unlike olmo, only on the head dim!
        self.k_norm = Qwen3MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)  # thus post q_norm does not need reshape

        self.rotary_emb = Qwen2MoeRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

        self.sliding_window = config.sliding_window
        if not (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            self.sliding_window = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        # **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,  # diff with Llama
            # **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Qwen3MoeMLP(nn.Module):
    def __init__(self, config, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class Qwen3MoeSparseMoeBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_experts = config.num_experts
        self.top_k = config.num_experts_per_tok
        self.norm_topk_prob = config.norm_topk_prob

        # gating
        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
        self.experts = nn.ModuleList(
            [Qwen3MoeMLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(self.num_experts)]
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        if self.norm_topk_prob:  # only diff with mixtral sparse moe block!
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits


class Qwen3MoeRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Qwen3MoeRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        self.hidden_size = hidden_size

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class Qwen3MoeDecoderLayer(nn.Module):
    def __init__(self, config: Qwen3MoeConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = Qwen3MoeAttention(config, layer_idx)
        self.mlp = Qwen3MoeMLP(config)

        self.self_attn = Qwen3MoeAttention(config, layer_idx)

        if (layer_idx not in config.mlp_only_layers) and (
            config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
        ):
            self.mlp = Qwen3MoeSparseMoeBlock(config)
        else:
            self.mlp = Qwen3MoeMLP(config, intermediate_size=config.intermediate_size)

        self.input_layernorm = Qwen3MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen3MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        # **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss,
                and should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)

        hidden_states = self.mlp(hidden_states)
        if isinstance(hidden_states, tuple):
            hidden_states, router_logits = hidden_states
        else:
            router_logits = None

        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


def _compute_default_rope_parameters(
    config: Optional[Qwen3MoeConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
        dim = int(config.head_dim * partial_rotary_factor)

    attention_factor = 1.0  # Unused in this type of RoPE

    # Compute the inverse frequencies
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
    return inv_freq, attention_factor

class Qwen3MoeRotaryEmbedding(nn.Module):
    def __init__(self, config: Qwen3MoeConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        self.scaling_factor = 1.0
        self.dim = config.head_dim
        self.max_position_embeddings = config.max_position_embeddings
        self.base = config.rope_theta
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))

        inv_freq, self.attention_scaling = _compute_default_rope_parameters(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
            self.max_seq_len_cached = seq_len

        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
            # This .to() is needed if the model has been moved to a device after being initialized (because
            # the buffer is automatically moved, but not the original copy)
            self.original_inv_freq = self.original_inv_freq.to(device)
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @maybe_no_grad()
    def forward(self, x, position_ids):
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"

        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


QWEN3_MOE_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`Qwen3MoeConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare Qwen3Moe Model outputting raw hidden-states without any specific head on top.",
    QWEN3_MOE_START_DOCSTRING,
)
class Qwen3MoePreTrainedModel(PreTrainedModel):
    config_class = Qwen3MoeConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen3MoeDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = False  # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


QWEN3_MOE_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare Qwen3Moe Model outputting raw hidden-states without any specific head on top.",
    QWEN3_MOE_START_DOCSTRING,
)
class Qwen3MoeModel(Qwen3MoePreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen3MoeDecoderLayer`]

    Args:
        config: Qwen3MoeConfig
    """

    def __init__(self, config: Qwen3MoeConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Qwen3MoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Qwen3MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Qwen3MoeRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        # **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    output_router_logits,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    output_router_logits=output_router_logits,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                    # **flash_attn_kwargs,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        output = MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )
        return output if return_dict else output.to_tuple()

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and past_key_values is not None:
                is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
                if is_padding_right:
                    raise ValueError(
                        "You are attempting to perform batched generation with padding_side='right'"
                        " this may lead to unexpected behaviour for Flash Attention version of Qwen3Moe. Make sure to "
                        " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
                    )
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)
        using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and not (using_static_cache or using_sliding_window_cache)
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                sliding_window=self.config.sliding_window,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        # SlidingWindowCache or StaticCache
        if using_sliding_window_cache or using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        # DynamicCache or no cache
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
            config=self.config,
            past_key_values=past_key_values,
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        device: torch.device,
        cache_position: torch.Tensor,
        batch_size: int,
        config: Qwen3MoeConfig,
        past_key_values: Cache,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
            config (`Qwen3MoeConfig`):
                The model's configuration class
            past_key_values (`Cache`):
                The cache class that is being used currently to generate
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            if config.sliding_window is not None:
                # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
                # the check is needed to verify is current checkpoint was trained with sliding window or not
                if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
                    sliding_attend_mask = torch.arange(target_length, device=device) <= (
                        cache_position.reshape(-1, 1) - config.sliding_window
                    )
                    diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
            causal_mask *= diagonal_attend_mask
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                if attention_mask.shape[-1] > target_length:
                    attention_mask = attention_mask[:, :target_length]
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        return causal_mask


# class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
class KwargsForCausalLM(): ...


def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, Tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
    top_k=2,
    attention_mask: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, int]:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    """
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0

    if isinstance(gate_logits, tuple):
        compute_device = gate_logits[0].device
        concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)

    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)

    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)

        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
        expert_attention_mask = (
            attention_mask[None, :, :, None, None]
            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
            .reshape(-1, top_k, num_experts)
            .to(compute_device)
        )

        # Compute the percentage of tokens routed to each experts
        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
            expert_attention_mask, dim=0
        )

        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(compute_device)
        )

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
            router_per_expert_attention_mask, dim=0
        )

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts


class Qwen3MoeForCausalLM(Qwen3MoePreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen3MoeModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_experts
        self.num_experts_per_tok = config.num_experts_per_tok

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
    @add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        # **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            logits_to_keep (`int` or `torch.Tensor`, *optional*):
                If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
                If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
                This is useful when using packed tensor format (single dimension for batch and sequence length).

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3MoeForCausalLM

        >>> model = Qwen3MoeForCausalLM.from_pretrained("Qwen/Qwen3-MoE-15B-A2B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-MoE-15B-A2B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_router_logits=output_router_logits,
            return_dict=return_dict,
            cache_position=cache_position,
            # **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits if return_dict else outputs[-1],
                self.num_experts,
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device

        if not return_dict:
            output = (logits,) + outputs[1:]
            if output_router_logits:
                output = (aux_loss,) + output
            return (loss,) + output if loss is not None else output

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )


@add_start_docstrings(
    """
    The Qwen3Moe Model transformer with a sequence classification head on top (linear layer).

    [`Qwen3MoeForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    QWEN3_MOE_START_DOCSTRING,
)
class Qwen3MoeForSequenceClassification(Qwen3MoePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen3MoeModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
    The Qwen3Moe Model transformer with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    QWEN3_MOE_START_DOCSTRING,
)
class Qwen3MoeForTokenClassification(Qwen3MoePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen3MoeModel(config)
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.config)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
The Qwen3Moe Model transformer with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    QWEN3_MOE_START_DOCSTRING,
)
class Qwen3MoeForQuestionAnswering(Qwen3MoePreTrainedModel):
    base_model_prefix = "transformer"

    def __init__(self, config):
        super().__init__(config)
        self.transformer = Qwen3MoeModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.transformer.embed_tokens

    def set_input_embeddings(self, value):
        self.transformer.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        loss = None
        if start_positions is not None and end_positions is not None:
            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "Qwen3MoeForCausalLM",
    "Qwen3MoeForQuestionAnswering",
    "Qwen3MoeModel",
    "Qwen3MoePreTrainedModel",
    "Qwen3MoeForSequenceClassification",
    "Qwen3MoeForTokenClassification",
]

================================================
FILE: kt-sft/ktransformers/moe_test_module.py
================================================
import os
import platform
import sys

project_dir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, project_dir)

from torchviz import make_dot
from torch import nn
import torch
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GenerationConfig,
    TextStreamer,
)
import unittest
from torch.autograd import gradcheck

from ktransformers.operators.linear import KLinearTorch, KTransformersLinear
from ktransformers.sft.peft_utils.lora_layer import KTransformersLinearLora
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.operators.experts import KExpertsTorch
from ktransformers.util.utils import load_weights

gguf_loader = GGUFLoader(gguf_path="/home/yj/ktransformers/GGUF-DeepSeek-V2-Lite-Chat")
config = AutoConfig.from_pretrained("/home/yj/ktransformers/DeepSeek-V2-Lite-Chat", trust_remote_code=True)
torch.set_default_dtype(config.torch_dtype)

class TestKExpertsTorch(unittest.TestCase):
    def setUp(self):
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        self.num_experts = 8
        
        self.fixed_input = None
        self.fixed_expert_ids = None
        self.fixed_weights = None
        
    def _create_fixed_data(self, device, batch_size=2):
        """创建固定输入数据"""
        if self.fixed_input is None:
            with torch.random.fork_rng():
                torch.manual_seed(42)
                hidden_size = config.hidden_size
                
                self.fixed_input = torch.randn(batch_size, hidden_size)
                
                self.fixed_expert_ids = torch.tensor([[0, 1], [2, 3]], dtype=torch.long)
                
                self.fixed_weights = torch.tensor([[0.5, 0.5], [0.5, 0.5]], dtype=torch.float32)
        
        return (
            self.fixed_input.clone().to(device).requires_grad_(True),
            self.fixed_expert_ids.clone().to(device),
            self.fixed_weights.clone().to(device)
        )

    def _run_single_device_test(self, device, seed=42):
        """在指定设备上运行前向反向传播并返回梯度"""
        torch.manual_seed(seed)
        if device == "cuda":
            torch.cuda.manual_seed_all(seed)
        
        model = KExpertsTorch(
            key="blk.1",
            gguf_loader=gguf_loader,
            config=config,
            n_routed_experts=self.num_experts,
            device=device
        )
        model.load(device=device)
        
        input_tensor, expert_ids, weights = self._create_fixed_data(device)
        
        model.to(device)
        
        with torch.autocast(device_type=device, enabled=False):
            output = model(input_tensor, expert_ids, weights)
            
        loss = output.sum()
        loss.backward()
        
        gradients = {
            "input": input_tensor.grad.detach().cpu(),
            "loss": loss.detach().cpu(),
            "model": [p.grad.detach().cpu() for p in model.parameters() if p.grad is not None]
        }
        return gradients

    def test_forward_gradient(self):
        cpu_gradients = self._run_single_device_test("cpu")
        
        if torch.cuda.is_available():
            gpu_gradients = self._run_single_device_test("cuda")

            print(f"cpu_gradients:{cpu_gradients}")
            print(f"gpu_gradients:{gpu_gradients}")
            
            input_diff = torch.max(torch.abs(cpu_gradients["input"] - gpu_gradients["input"]))
            print(f"input_diff:{input_diff}")
            
            for i, (cpu_g, gpu_g) in enumerate(zip(cpu_gradients["model"], gpu_gradients["model"])):
                param_diff = torch.max(torch.abs(cpu_g - gpu_g))
                print(f"param_diff:{param_diff}")

            for i, (cpu_g, gpu_g) in enumerate(zip(cpu_gradients["model"], gpu_gradients["model"])):
                diff = (cpu_g - gpu_g.cpu()).abs().max()
                print(f"参数梯度 {i} 最大差异: {diff.item()}")
                self.assertTrue(torch.allclose(cpu_g, gpu_g, atol=1e-4, rtol=1e-3),
                            f"参数梯度 {i} 差异超出阈值，最大差异: {diff.item()}")
                
        else:
            self.skipTest("CUDA不可用，跳过GPU测试")

if __name__ == '__main__':
    unittest.main()

================================================
FILE: kt-sft/ktransformers/moe_test_module_old.py
================================================
import os
import platform
import sys

project_dir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, project_dir)

from torchviz import make_dot
from torch import nn
import torch
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GenerationConfig,
    TextStreamer,
)
import unittest
from torch.autograd import gradcheck

from ktransformers.operators.linear import KLinearTorch, KTransformersLinear
from ktransformers.sft.peft_utils.lora_layer import KTransformersLinearLora
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.operators.experts import KExpertsTorch
from ktransformers.util.utils import load_weights

gguf_loader = GGUFLoader(gguf_path="/home/yj/ktransformers/GGUF-DeepSeek-V2-Lite-Chat")
config = AutoConfig.from_pretrained("/home/yj/ktransformers/DeepSeek-V2-Lite-Chat", trust_remote_code=True)
torch.set_default_dtype(config.torch_dtype)

class TestKExpertsTorch(unittest.TestCase):
    def setUp(self):
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        self.base_device = "cpu"
        self.num_experts = 8
        # model = KExpertsTorch(
        #     key="blk.1",
        #     gguf_loader=gguf_loader,
        #     config=config,
        #     n_routed_experts=self.num_experts,
        #     device=self.base_device
        # )
        # model.load()
        
    def _run_single_device_test(self, device, seed=42):
        """在指定设备上运行前向反向传播并返回梯度"""
        torch.manual_seed(seed)
        if device == "cuda":
            torch.cuda.manual_seed_all(seed)
        
        model = KExpertsTorch(
            key="blk.1",
            gguf_loader=gguf_loader,
            config=config,
            n_routed_experts=self.num_experts,
            device=device
        )
        model.load(device=device)

        with torch.random.fork_rng():
            torch.manual_seed(seed)
            batch_size = 2
            hidden_size = model.config.hidden_size
            input_tensor = torch.randn(batch_size, hidden_size, device=device, requires_grad=True)
            expert_ids = torch.randint(0, self.num_experts, 
                                    (batch_size, model.config.num_experts_per_tok), 
                                    device=device)
            weights = torch.randn(batch_size, model.config.num_experts_per_tok, device=device)
            weights = torch.softmax(weights, dim=-1)
        
        print(f"input_tensor.device:{input_tensor.device}")
        print(f"torch.device(device):{torch.device(device)}")
        # assert input_tensor.device == torch.device(device)
        for p in model.parameters():
            print(f"p.device:{p.device}")

        for name, param in model.named_parameters():
            print(name, param.size())

        
        model.to(device)
        with torch.autocast(device_type=device, enabled=False):
            output = model(input_tensor, expert_ids, weights)
        
        loss = output.sum()

        
        # dot = make_dot(output, params=dict(model.named_parameters()))
        # dot.render(f"origin_moe_{torch.device(device)}_graph", format="svg")

        loss.backward()
        
        gradients = {
            "input": input_tensor.grad.clone().cpu(),
            "loss": loss.clone().cpu(),
            "model": [p.grad.clone().cpu() for p in model.parameters() if p.grad is not None]
        }
        return gradients

    def test_forward_gradient(self):
        # for param in model.parameters():
        #     self.assertEqual(param.dtype, config.torch_dtype)
        
        cpu_gradients = self._run_single_device_test("cpu")
        print(f"cpu_gradients: {cpu_gradients}")
        
        self.assertIsNotNone(cpu_gradients["input"])
        self.assertTrue(all(g is not None for g in cpu_gradients["model"]))
        
        if torch.cuda.is_available():
            gpu_gradients = self._run_single_device_test("cuda")

            print(f"gpu_gradients: {gpu_gradients}")

            
            max_diff = (cpu_gradients["input"] - gpu_gradients["input"].cpu()).abs().max()
            print(f"Input梯度最大差异: {max_diff.item()}")

            self.assertTrue(torch.allclose(cpu_gradients["input"], gpu_gradients["input"], atol=1e-4, rtol=1e-3),
                        f"Input梯度差异超出阈值，最大差异: {max_diff.item()}")

            for i, (cpu_g, gpu_g) in enumerate(zip(cpu_gradients["model"], gpu_gradients["model"])):
                diff = (cpu_g - gpu_g.cpu()).abs().max()
                print(f"参数梯度 {i} 最大差异: {diff.item()}")
                self.assertTrue(torch.allclose(cpu_g, gpu_g, atol=1e-4, rtol=1e-3),
                            f"参数梯度 {i} 差异超出阈值，最大差异: {diff.item()}")

        else:
            raise ImportError("NO CUDA FOR TEST!!")

    # def test_detach_effect(self):
    #     input_tensor = torch.randn(1, model.config.hidden_size, device="cpu", requires_grad=True)
    #     expert_ids = torch.tensor([[0, 1]], device="cpu")
    #     weights = torch.tensor([[0.5, 0.5]], device="cpu")

    #     output = model(input_tensor, expert_ids, weights)
        
    #     # dot = make_dot(output, params=dict(model.named_parameters()))
    #     # dot.render("origin_moe_cpu_graph", format="svg")
        
    #     loss = output.sum()
    #     loss.backward()
        
    #     self.assertIsNotNone(input_tensor.grad)
    #     self.assertTrue(all(p.grad is not None for p in model.parameters()))

if __name__ == '__main__':
    unittest.main()

================================================
FILE: kt-sft/ktransformers/operators/RoPE.py
================================================
"""
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""

from torch import nn
from transformers import ROPE_INIT_FUNCTIONS
from ktransformers.models.modeling_llama import (
    LlamaRotaryEmbedding,
    LlamaLinearScalingRotaryEmbedding,
    LlamaDynamicNTKScalingRotaryEmbedding,
)
from ktransformers.models.modeling_deepseek_v3 import (
    DeepseekV3RotaryEmbedding
)
from ktransformers.models.modeling_deepseek import (
    DeepseekV2YarnRotaryEmbedding,
    DeepseekV2RotaryEmbedding,
    yarn_get_mscale,
    yarn_linear_ramp_mask,
    yarn_find_correction_range
)
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.inference_state import InferenceState
from ktransformers.util.grad_wrapper import maybe_no_grad
from transformers.configuration_utils import PretrainedConfig
import torch

# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2Moe
class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.orig_module.__init__(
            orig_module.dim, orig_module.max_position_embeddings, orig_module.base
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def load(self):
        self.orig_module.__init__(
            self.orig_module.dim,
            self.orig_module.max_position_embeddings,
            self.orig_module.base,
            self.device,
        )


class RotaryEmbeddingV3(BaseInjectedModule):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device
    
    @maybe_no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)   

    def load(self):
        self._init(
            dim=self.config.qk_rope_head_dim,
            max_position_embeddings=self.config.max_position_embeddings,
            base=self.config.rope_theta,
            device=self.device,
        )
    def _init(self, dim, max_position_embeddings, base, device, scaling_factor=1.0):
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        # self.register_buffer("inv_freq", inv_freq, persistent=False)
        # For BC we register cos and sin cached
        self.max_seq_len_cached = max_position_embeddings

class RotaryEmbeddingV2(BaseInjectedModule, LlamaRotaryEmbedding):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.orig_module.__init__(
            orig_module.dim,
            orig_module.max_position_embeddings,
            orig_module.base,
            None,
            orig_module.scaling_factor,
            orig_module.rope_type,
            orig_module.config,
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def load(self):
        self.orig_module.__init__(
            self.orig_module.dim,
            self.orig_module.max_position_embeddings,
            self.orig_module.base,
            self.device,
            self.orig_module.scaling_factor,
            self.orig_module.rope_type,
            self.orig_module.config,
        )

class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.orig_module.__init__(
            orig_module.dim,
            orig_module.max_position_embeddings,
            orig_module.base,
            None,  # device
            orig_module.scaling_factor,
            orig_module.original_max_position_embeddings,
            orig_module.beta_fast,
            orig_module.beta_slow,
            orig_module.mscale,
            orig_module.mscale_all_dim,
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def load(self):
        self.orig_module.__init__(
            self.orig_module.dim,
            self.orig_module.max_position_embeddings,
            self.orig_module.base,
            self.generate_device,
            self.orig_module.scaling_factor,
            self.orig_module.original_max_position_embeddings,
            self.orig_module.beta_fast,
            self.orig_module.beta_slow,
            self.orig_module.mscale,
            self.orig_module.mscale_all_dim,
        )

# class DeepSeekV3YarnRotaryEmbedding(BaseInjectedModule, DeepseekV3RotaryEmbedding):
#     def __init__(
#         self,
#         key: str,
#         gguf_loader: GGUFLoader,
#         config: PretrainedConfig,
#         orig_module: nn.Module,
#         #  device: str = "cuda",
#         generate_device: str = "cuda",
#         prefill_device: str = "cuda",
#         **kwargs,
#     ):
#         BaseInjectedModule.__init__(
#             self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
#         )
#         self.generate_device = generate_device
#         self.prefill_device = prefill_device

#     def load(self):
#         # TODO support perlayer prefill
#         self.orig_module.__init__(
#             self.config,
#             device=self.generate_device
#         )
#         return

class YarnRotaryEmbeddingV3(BaseInjectedModule):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device
    
    def load(self):
        kwargs = {
            key: self.config.rope_scaling[key]
            for key in [
                "original_max_position_embeddings",
                "beta_fast",
                "beta_slow",
                "mscale",
                "mscale_all_dim",
            ]
            if key in self.config.rope_scaling
        }
        self._init(
            dim=self.config.qk_rope_head_dim,
            max_position_embeddings=self.config.max_position_embeddings,
            base=self.config.rope_theta,
            device=self.device,
            scaling_factor=self.config.rope_scaling["factor"],
            **kwargs,
        )

    @maybe_no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()* self._mscale
            sin = emb.sin()* self._mscale
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  

    def _init(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        self.original_max_position_embeddings = original_max_position_embeddings
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.mscale_all_dim = mscale_all_dim
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

        freq_extra = 1.0 / (
            self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )
        freq_inter = 1.0 / (
            self.scaling_factor
            * self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            dim,
            self.base,
            self.original_max_position_embeddings,
        )
        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(
            device=device, dtype=torch.float32
        )
        self.inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
        self._mscale = float(
            yarn_get_mscale(self.scaling_factor, self.mscale)
            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
        )
        # For BC we register cos and sin cached
        self.max_seq_len_cached = max_position_embeddings

class DynamicNTKScalingRotaryEmbedding(
    BaseInjectedModule, LlamaDynamicNTKScalingRotaryEmbedding
):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        prefill_device: str = "cuda",
        generate_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.orig_module.__init__(
            orig_module.dim,
            orig_module.max_position_embeddings,
            orig_module.base,
            None,  # device
            orig_module.scaling_factor,
            orig_module.rope_type,
            orig_module.config,
        )

    def load(self):
        self.orig_module.__init__(
            self.orig_module.dim,
            self.orig_module.max_position_embeddings,
            self.orig_module.base,
            self.orig_module.device,
            self.orig_module.scaling_factor,
            self.orig_module.rope_type,
            self.orig_module.config,
        )


class RotaryEmbeddingV4(BaseInjectedModule):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, generate_device, **kwargs
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device
    
    @maybe_no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)   

    def load(self):
        self._init(
            dim=self.config.qk_rope_head_dim,
            max_position_embeddings=self.config.max_position_embeddings,
            base=self.config.rope_theta,
            device=self.device,
        )
    def _init(self, dim, max_position_embeddings, base, device, scaling_factor=1.0):
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        # self.register_buffer("inv_freq", inv_freq, persistent=False)
        # For BC we register cos and sin cached
        self.max_seq_len_cached = max_position_embeddings

class KQwen3MoeRotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        #  device: str = "cuda",
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs
        )
        self.orig_module.__init__(
            config,
        )
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def load(self):
        self.orig_module.__init__(
            self.orig_module.config
        )

================================================
FILE: kt-sft/ktransformers/operators/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/operators/attention.py
================================================
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import torch
from torch import nn
import warnings
import torch.nn.functional as F
from ktransformers.operators.models import KLlamaModel
from ktransformers.models.configuration_deepseek import DeepseekV2Config
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.models.modeling_llama import LlamaRotaryEmbedding
from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeAttention, Qwen3MoeRotaryEmbedding
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import get_compute_capability
import logging
from transformers.configuration_utils import PretrainedConfig
from transformers.cache_utils import Cache
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from ktransformers.util.vendors import device_manager, get_device, to_device, GPUVendor

try:
    from flash_attn import flash_attn_func
except:
    pass
from ktransformers.operators.triton_attention import decode_attention_fwd_grouped 
from ktransformers.operators.triton_attention_prefill import context_attention_fwd
import os
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
if flashinfer_enabled:
    from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton
    from flashinfer.mla import BatchMLAPagedAttentionWrapper
from ktransformers.models.custom_cache import KDeepSeekV3Cache
logger = logging.getLogger("attention")

# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

# V3 MLA is same to V2
class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
    """Multi-headed attention from 'Attention Is All You Need' paper"""
    attn_mask: Optional[torch.Tensor] = None

    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 absorb_for_prefill: bool = False,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
        self.mla_wrapper = None
        self.absorb_for_prefill = absorb_for_prefill

    def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
        if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
            kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
            self.q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
            self.out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
            
        return self.q_absorb, self.out_absorb

    def forward_chunck(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()
        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )
        # q_nope [bsz, self.num_heads, q_len, self.qk_nope_head_dim]
        # q_pe [bsz, self.num_heads, q_len, self.qk_rope_head_dim]

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        compressed_kv = self.kv_a_layernorm(compressed_kv)
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)

        kv_seq_len = k_pe.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since transformer version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(q_pe, position_ids)
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            
            # compressed_kv [bsz, q_len, self.kv_lora_rank]
            # k_pe [bsz, 1, q_len, self.qk_rope_head_dim]
            k_pe = k_pe.transpose(1,2)
            compressed_kv = compressed_kv.unsqueeze(2)
            compressed_kv_with_k_pe, _ = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
            compressed_kv, k_pe = torch.split(
                compressed_kv_with_k_pe, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
            )
            # k_pe [pages, page_size, 1, self.qk_rope_head_dim]
            # compressed_kv [pages, page_size, 1, self.kv_lora_rank]
            
        q_absorb, out_absorb = self.get_absorbed()

        # q_nope [bsz, self.num_heads, q_len, self.qk_nope_head_dim]
        # q_pe [bsz, self.num_heads, q_len, self.qk_rope_head_dim]
        k_pe = k_pe.view(bsz, 1, -1, self.qk_rope_head_dim)[:,:,:attention_mask.size(-1),:]
        compressed_kv = compressed_kv.view(bsz, 1, -1, self.kv_lora_rank)[:,:,:attention_mask.size(-1),:]
        # k_pe [bsz, 1, cache_len, self.qk_rope_head_dim]
        # compressed_kv [bsz, 1, cache_len,self.kv_lora_rank]
        q_nope = torch.matmul(q_nope, q_absorb)
        #print(q_pe.shape)
        #print(k_pe.shape)
        #print(q_nope.shape)
        #print(compressed_kv.shape)
        
        attn_weights = (torch.matmul(q_pe, k_pe.mT) + torch.matmul(q_nope, compressed_kv.mT)) * self.softmax_scale
        
        #attn_weights [bsz, self.num_heads, q_len, kv_seq_len]
        compressed_kv = compressed_kv.squeeze(1)
        """
        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )
        assert attention_mask is not None
        """
        if attention_mask is not None:
            """
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            """
            #causal_mask = attention_mask[:, :, :, : kv_seq_len]
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(q_pe.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.attention_dropout, training=self.training
        )
        
        attn_output = torch.einsum('bhql,blc->bhqc', attn_weights, compressed_kv)
        
        attn_output = torch.matmul(attn_output, out_absorb.mT) 

        if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        
        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value

    def forward_linux_triton(
            self,
            hidden_states: torch.Tensor,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            past_key_value: Optional[Cache] = None,
            output_attentions: bool = False,
            use_cache: bool = False,
            cache_position: Optional[torch.LongTensor] = None,
            **kwargs,
        ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:

        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        compressed_kv = self.kv_a_layernorm(compressed_kv)
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim)
        compressed_kv = compressed_kv.view(bsz, q_len, 1, self.kv_lora_rank)

        kv_seq_len = q_len
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since transformer version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        
        cos, sin = self.rotary_emb(q_pe, position_ids)
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, unsqueeze_dim=2)
        # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim] k_pe [bsz, q_len, 1, self.qk_rope_head_dim]
        
        # decode
        if q_len == 1:
            if past_key_value is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
                compressed_kv_with_k_pe, page_table = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
                compressed_kv = compressed_kv_with_k_pe [:, :, :, :self.kv_lora_rank] # for speed
                # compressed_kv_with_k_pe [bsz, q_len, 1, self.kv_lora_rank + self.qk_rope_head_dim]
                # compressed_kv [bsz, q_len, 1, self.kv_lora_rank]

            # q_nope [bsz, q_len, self.num_heads, self.qk_nope_head_dim]
            # q_absorb [self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank]
            q_absorb, out_absorb = self.get_absorbed()
            q_nope = q_nope.transpose(1, 2) # q_len is 1, no GPU overhead, same below
            q_nope = torch.matmul(q_nope, q_absorb) # batched MM
            q_nope = q_nope.transpose(1, 2)
            #assert q_nope.is_contiguous()
            
            # q_nope [bsz, q_len, self.num_heads, self.kv_lora_rank]
            # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim]
            query_states = torch.cat([q_nope, q_pe], dim=-1)
            
            query_states = query_states.squeeze(1)
            attn_output = torch.zeros_like(q_nope) # [bsz, q_len, self.num_heads, self.kv_lora_rank]
            
            attn_logits = torch.empty(
                    (
                        bsz,
                        self.num_heads,
                        4, #num_kv_splits # follow vLLM, fix it TODO
                        self.kv_lora_rank + 1, 
                    ),
                    dtype=torch.float32,
                    device = attn_output.device
                )

            """
            print("query_states", torch.isnan(query_states).any())
            print("compressed_kv_with_k_pe", torch.isnan(compressed_kv_with_k_pe[:,:,0,:]).any())
            print("compressed_kv", torch.isnan(compressed_kv[:,:,0,:]).any())
            print("position_ids", torch.isnan(position_ids).any())
            """

            # flash attn doesn't support head_dim bigger than 256
            # use triton attention kernel adapted from vLLM and SGLang for MQA
            decode_attention_fwd_grouped(query_states, compressed_kv_with_k_pe, compressed_kv, attn_output,
                             page_table,
                             position_ids.squeeze(0).to(torch.int32)+1, attn_logits,
                             4, #num_kv_splits # follow vLLM, fix it TODO
                             self.softmax_scale,
                             past_key_value.page_size)
            
            # attn_output [bsz, q_len, self.num_heads, self.kv_lora_rank]
            # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
            attn_output = attn_output.transpose(1, 2)
            attn_output = torch.matmul(attn_output, out_absorb.mT)
            attn_output = attn_output.transpose(1, 2)
            
            attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
            attn_output = self.o_proj(attn_output)
            
            #print("attn_output", torch.isnan(attn_output).any())
            return attn_output, None, past_key_value
        else:
            if past_key_value is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
                k_pe.squeeze(0)
                compressed_kv.squeeze(0)
                compressed_kv_with_k_pe, _ = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
                compressed_kv, k_pe = torch.split(
                    compressed_kv_with_k_pe, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
                )
            k_pe = k_pe.view(bsz, -1, self.qk_rope_head_dim)
            k_pe = k_pe[:, :kv_seq_len]
            compressed_kv = compressed_kv.view(bsz, -1, self.kv_lora_rank)
            compressed_kv = compressed_kv[:, :kv_seq_len]
            kv = (
                self.kv_b_proj(compressed_kv)
                .view(bsz, kv_seq_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            )
            k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
            query_states = k_pe.new_empty(bsz, q_len, self.num_heads, self.q_head_dim)
            query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
            query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

            key_states = k_pe.new_empty(bsz, kv_seq_len, self.num_heads, self.q_head_dim)
            key_states[:, :, :, :self.qk_nope_head_dim] = k_nope
            key_states[:, :, :, self.qk_nope_head_dim:] = k_pe.view(bsz, kv_seq_len, 1, -1)
            
            value_states = value_states.view(bsz, kv_seq_len, self.num_heads, self.v_head_dim)
            value_states_padded = torch.nn.functional.pad(value_states, [0, query_states.shape[-1] - value_states.shape[-1]], value=0)

            attn_output = flash_attn_func(
                query_states,
                key_states,
                value_states_padded,
                softmax_scale=self.softmax_scale,
                causal=True,
            )

            if self.q_head_dim != self.v_head_dim:
                attn_output = attn_output[:, :, :, : self.v_head_dim]

            attn_output = attn_output.reshape(
                bsz, q_len, self.num_heads * self.v_head_dim
            ).contiguous()
            attn_output = self.o_proj(attn_output)
            return attn_output, None, past_key_value

    def forward_linux_flashinfer(
            self,
            hidden_states: torch.Tensor,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            past_key_value: Optional[Cache] = None,
            output_attentions: bool = False,
            use_cache: bool = False,
            cache_position: Optional[torch.Tensor] = None,
            **kwargs,
        ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:

        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        compressed_kv = self.kv_a_layernorm(compressed_kv)
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim)
        compressed_kv = compressed_kv.view(bsz, q_len, 1, self.kv_lora_rank)

        kv_seq_len = q_len
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version transformer verision v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        
        cos, sin = self.rotary_emb(q_pe, position_ids)
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, unsqueeze_dim=2)
        # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim] k_pe [bsz, q_len, 1, self.qk_rope_head_dim]
        
        # decode
        if q_len == 1 or self.absorb_for_prefill:
            if past_key_value is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
                compressed_kv_with_k_pe, page_table = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
                compressed_kv = compressed_kv_with_k_pe [:, :, :, :self.kv_lora_rank].view(-1, past_key_value.page_size, self.kv_lora_rank)
                k_pe = compressed_kv_with_k_pe [:, :, :, self.kv_lora_rank:].view(-1, past_key_value.page_size, self.qk_rope_head_dim)
                # k_pe [max_pages, page_size, self.qk_rope_head_dim]
                # compressed_kv [max_pages, page_size, self.kv_lora_rank]

            # q_nope [bsz, q_len, self.num_heads, self.qk_nope_head_dim]
            # q_absorb [self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank]
            q_absorb, out_absorb = self.get_absorbed()
            q_nope = q_nope.transpose(1, 2) # q_len is 1, no GPU overhead, same below
            q_nope = torch.matmul(q_nope, q_absorb) # batched MM
            q_nope = q_nope.transpose(1, 2)
            q_nope = q_nope.contiguous()
            #assert q_nope.is_contiguous()
            
            # q_nope [bsz, q_len, self.num_heads, self.kv_lora_rank]
            # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim]
            q_nope.squeeze_(0)
            q_pe.squeeze_(0)

            # flash attn doesn't support head_dim bigger than 256, use flashinfer
            if self.mla_wrapper is None:
                self.mla_wrapper = MLAWrapperSingleton.get_instance(self.device, 1, past_key_value.max_pages, use_cuda_graph = True)
            if self.mla_wrapper.need_plan:
                self.mla_wrapper.need_plan = False
                if q_len == 1:
                    self.mla_wrapper.plan(None,None,None,
                                        position_ids.squeeze(1)+1,
                                        None,
                                        self.num_heads,
                                        self.kv_lora_rank,
                                        self.qk_rope_head_dim,
                                        past_key_value.page_size,
                                        self.softmax_scale,
                                        q_nope.dtype,
                                        compressed_kv.dtype)
                else:
                    qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device=self.device)
                    kv_len_arr = torch.tensor([position_ids[0, -1].item()+1], dtype=torch.int32, device=self.device)
                    self.mla_wrapper.plan(qo_indptr,None,None,
                                        kv_len_arr,
                                        None,
                                        self.num_heads,
                                        self.kv_lora_rank,
                                        self.qk_rope_head_dim,
                                        past_key_value.page_size,
                                        self.softmax_scale,
                                        q_nope.dtype,
                                        compressed_kv.dtype)
            attn_output = self.mla_wrapper.run(q_nope, q_pe, compressed_kv, k_pe).view(bsz, q_len, self.num_heads, self.kv_lora_rank)
            """
            k = (
                torch.cat([compressed_kv, k_pe], dim=-1)
                .view(-1, 1, 512 + 64)
                .repeat_interleave(self.num_heads, dim=1)
            )
            v = compressed_kv.view(-1, 1, 512).repeat_interleave(self.num_heads, dim=1)
            lens = position_ids.item() + 1
            #print("lens", lens)
            attn_ref, lse_ref = attention_ref(
                1,
                torch.cat([q_nope, q_pe], dim=-1),
                k[:lens],
                v[:lens],
                False,
                self.softmax_scale
            )
            attn_output = attn_ref.view(bsz, q_len, self.num_heads, self.kv_lora_rank)
            """
            
            # mla_wrapper run output: [tokens, self.num_heads, self.kv_lora_rank]
            # attn_output [bsz, q_len, self.num_heads, self.kv_lora_rank]
            # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
            attn_output = attn_output.transpose(1, 2) # [bsz, self.num_heads, q_len, self.kv_lora_rank]
            attn_output = torch.matmul(attn_output, out_absorb.mT) # [bsz, self.num_heads, q_len, self.v_head_dim]
            attn_output = attn_output.transpose(1, 2).contiguous() # [bsz, q_len, self.num_heads, self.kv_lora_rank]
            
            attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) # [bsz, q_len, self.num_heads * self.v_head_dim]
            attn_output = self.o_proj(attn_output)
            
            return attn_output, None, past_key_value
        else:
            if past_key_value is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
                k_pe.squeeze(0)
                compressed_kv.squeeze(0)
                compressed_kv_with_k_pe, _ = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
                compressed_kv, k_pe = torch.split(
                    compressed_kv_with_k_pe, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
                )
            k_pe = k_pe.view(bsz, -1, self.qk_rope_head_dim)
            k_pe = k_pe[:, :kv_seq_len]
            compressed_kv = compressed_kv.view(bsz, -1, self.kv_lora_rank)
            compressed_kv = compressed_kv[:, :kv_seq_len]
            kv = (
                self.kv_b_proj(compressed_kv)
                .view(bsz, kv_seq_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            )
            k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
            query_states = k_pe.new_empty(bsz, q_len, self.num_heads, self.q_head_dim)
            query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
            query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

            key_states = k_pe.new_empty(bsz, kv_seq_len, self.num_heads, self.q_head_dim)
            key_states[:, :, :, :self.qk_nope_head_dim] = k_nope
            key_states[:, :, :, self.qk_nope_head_dim:] = k_pe.view(bsz, kv_seq_len, 1, -1)
            
            value_states = value_states.view(bsz, kv_seq_len, self.num_heads, self.v_head_dim)
            value_states_padded = torch.nn.functional.pad(value_states, [0, query_states.shape[-1] - value_states.shape[-1]], value=0)

            attn_output = flash_attn_func(
                query_states,
                key_states,
                value_states_padded,
                softmax_scale=self.softmax_scale,
                causal=True,
            )

            if self.q_head_dim != self.v_head_dim:
                attn_output = attn_output[:, :, :, : self.v_head_dim]

            attn_output = attn_output.reshape(
                bsz, q_len, self.num_heads * self.v_head_dim
            ).contiguous()
            attn_output = self.o_proj(attn_output)
            return attn_output, None, past_key_value
        
    def forward_windows(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        bsz, q_len, _ = hidden_states.size()

        if q_len <= self.chunck_size:
            return self.forward_chunck(
                            hidden_states,
                            attention_mask,
                            position_ids,
                            past_key_value,
                            output_attentions,
                            use_cache,
                            cache_position,
                            **kwargs
                        )

        assert output_attentions == False, "output_attentions is not supported when using chunked attention"
        attn_output = None
        cur_idx = 0
        while cur_idx < q_len:
            if attention_mask is not None:
                chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + self.chunck_size, q_len), ...]
            else:
                # generate chunk_mask automatically.
                self.attn_mask = \
                    torch.zeros(1, 1, self.chunck_size, past_key_value.max_cache_len, device=hidden_states.device) \
                        if self.attn_mask is None \
                            else self.attn_mask
                self.attn_mask[:, :, :, cur_idx:min(cur_idx+self.chunck_size, past_key_value.max_cache_len)] = \
                    -1e+38 * torch.triu(torch.ones(self.chunck_size, self.chunck_size, device=hidden_states.device), diagonal=1)\
                        [:,:min(self.chunck_size, min(past_key_value.max_cache_len-cur_idx, self.chunck_size))]
                self.attn_mask[:, :, :, cur_idx+self.chunck_size:] = -1e+38
                self.attn_mask[:, :, :, :cur_idx] = 0
                chunk_mask = torch.narrow(self.attn_mask, 2, 0, min(self.chunck_size, q_len-cur_idx))

            cur_output, _, _ = self.forward_chunck(
                            hidden_states[:, cur_idx:min(cur_idx + self.chunck_size, q_len), ...],
                            chunk_mask,
                            position_ids[:, cur_idx:min(cur_idx + self.chunck_size, q_len)],
                            past_key_value,
                            output_attentions,
                            use_cache,
                            cache_position[cur_idx:min(cur_idx + self.chunck_size, q_len)],
                            **kwargs
                        )
            cur_idx += self.chunck_size
            if attn_output is None:
                attn_output = cur_output
            else:
                attn_output = torch.cat((attn_output, cur_output), dim=-2)
                
        return attn_output, None, past_key_value

    def forward_xpu(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        query_states = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        position_embeddings = kwargs.get("position_embeddings", None)
        if position_embeddings is not None:
            cos, sin = position_embeddings
            key_states = torch.cat(
                [k_nope, k_pe.expand([-1, self.num_heads, -1, -1])],
                dim=-1
            )
            from ipex_llm.transformers.models.common import rotary_two_with_cache_inplaced
            rotary_two_with_cache_inplaced(query_states[:, :, :, self.qk_nope_head_dim :],
                                           key_states[:, :, :, self.qk_nope_head_dim:],
                                           cos, sin, True)
        else:
            q_nope, q_pe = torch.split(
                query_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
            )
            cos, sin = self.rotary_emb(q_pe, position_ids)
            q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin)
            query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
            query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
            query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

            key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
            key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
            key_states[:, :, :, self.qk_nope_head_dim :] = k_pe

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states.half(), value_states.half(), self.layer_idx, cache_kwargs
            )

        attn_weights = None
        from ipex_llm.transformers.models.common import scaled_dot_product_attention
        attn_output = scaled_dot_product_attention(
            query_states.half(), key_states, value_states,
            attention_mask.half(), q_len == kv_seq_len, self.softmax_scale
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
        attn_output = self.o_proj(attn_output).to(hidden_states.dtype)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if torch.xpu.is_available():
            return self.forward_xpu(
                hidden_states,
                attention_mask,
                position_ids,
                past_key_value,
                output_attentions,
                use_cache,
                cache_position,
                **kwargs,
            )
        elif (os.name == 'nt'
              or get_compute_capability() < 8
              or hidden_states.device.type == 'cpu'
              or device_manager.gpu_vendor != GPUVendor.NVIDIA):
            return self.forward_windows(
                hidden_states,
                attention_mask,
                position_ids,
                past_key_value,
                output_attentions,
                use_cache,
                cache_position,
                **kwargs,
            )
        else:
            if flashinfer_enabled:
                return self.forward_linux_flashinfer(
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_value,
                    output_attentions,
                    use_cache,
                    cache_position,
                    **kwargs,
                )
            else:
                return self.forward_linux_triton(
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_value,
                    output_attentions,
                    use_cache,
                    cache_position,
                    **kwargs,
                )


class KLlamaAttention(BaseInjectedModule):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
    def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
        """Applies Rotary Position Embedding to the query and key tensors.

        Args:
            q (`torch.Tensor`): The query tensor.
            k (`torch.Tensor`): The key tensor.
            cos (`torch.Tensor`): The cosine part of the rotary embedding.
            sin (`torch.Tensor`): The sine part of the rotary embedding.
            position_ids (`torch.Tensor`, *optional*):
                Deprecated and unused.
            unsqueeze_dim (`int`, *optional*, defaults to 1):
                The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
                sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
                that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
                k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
                cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
                the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
        Returns:
            `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
        """
        cos = cos.unsqueeze(unsqueeze_dim)
        sin = sin.unsqueeze(unsqueeze_dim)
        q_embed = (q * cos) + (rotate_half(q) * sin)
        k_embed = (k * cos) + (rotate_half(k) * sin)
        return q_embed, k_embed
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.45
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        if self.config.pretraining_tp > 1:
            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
            query_slices = self.q_proj.weight.split(
                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
            )
            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
            query_states = torch.cat(query_states, dim=-1)

            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
            key_states = torch.cat(key_states, dim=-1)

            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
            value_states = torch.cat(value_states, dim=-1)

        else:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if position_embeddings is None:

            logger.warning(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = self.apply_rotary_pos_emb(query_states, key_states, cos, sin)
        if q_len == 1:
            position_ids = position_ids[0][-1].unsqueeze(0).unsqueeze(0)
            query_states = query_states[:, :, -1:]
            key_states = key_states[:, :, -1:]

        attn_output = KLlamaModel.dynamic_sdpa.apply(
            self.layer_idx,
            bsz,
            position_ids[0][0],
            query_states.transpose(1, 2).to(torch.float16),
            key_states.transpose(1, 2).to(torch.float16),
            value_states.transpose(1, 2).to(torch.float16),
            mode="prefill" if q_len > 1 else "generate",
        )


        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, -1)

        if self.config.pretraining_tp > 1:
            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
        else:
            attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


class KQwen3MoeAttentionIPEXLLM(BaseInjectedModule, Qwen3MoeAttention):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "xpu",
                 generate_device: str = "xpu",
                 chunck_size: int = 1000,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
        assert prefill_device.lower()[:3] == "xpu", "KQwen3MoeAttentionIPEXLLM only supports XPU device"
        assert generate_device.lower()[:3] == "xpu", "KQwen3MoeAttentionIPEXLLM only supports XPU device"

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_ids: Optional[torch.Tensor],
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        bsz, q_len, _ = hidden_states.size()
        input_dtype = hidden_states.dtype
        hidden_shape = (*input_shape, -1, self.head_dim)

        if not hasattr(self, 'qkv_proj'):
            from ipex_llm.transformers.models.common import merge_quantized_qkv
            merge_quantized_qkv(self.q_proj.generate_linear, self.k_proj.generate_linear, self.v_proj.generate_linear, self.orig_module)

        qkv = self.qkv_proj(hidden_states)
        qkv = qkv.view(bsz, q_len, -1, self.head_dim)
        qkv = qkv.transpose(1, 2)
        query_states, key_states, value_states = qkv.split([self.config.num_attention_heads,
                                                            self.config.num_key_value_heads,
                                                            self.config.num_key_value_heads], dim=1)
        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        if position_embeddings is None:
            position_embeddings = self.rotary_emb(hidden_states, position_ids)

        cos, sin = position_embeddings

        from ipex_llm.transformers.models.common import rotary_half_with_cache_inplaced
        rotary_half_with_cache_inplaced(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states.half(), value_states.half(),
                                                             self.layer_idx, cache_kwargs)

        attn_weights = None
        from ipex_llm.transformers.models.common import scaled_dot_product_attention
        attn_output = scaled_dot_product_attention(
            query_states.half(), key_states, value_states,
            attention_mask.half(), q_len == key_states.size(2), self.scaling
        )
        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output).to(input_dtype)
        return attn_output, attn_weights


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class KQwen3MoeAttention(BaseInjectedModule, Qwen3MoeAttention ):
    def __init__(self,
                 key: str,
                 gguf_loader: GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device,
                                    **kwargs)
        self.orig_module.__init__(self.orig_module.config,
                                  orig_module.layer_idx)
        self.chunck_size = chunck_size  # TODO, generate chunck_size automatically.

    # Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
    def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
        """Applies Rotary Position Embedding to the query and key tensors.

        Args:
            q (`torch.Tensor`): The query tensor.
            k (`torch.Tensor`): The key tensor.
            cos (`torch.Tensor`): The cosine part of the rotary embedding.
            sin (`torch.Tensor`): The sine part of the rotary embedding.
            position_ids (`torch.Tensor`):
                Deprecated and unused.
            unsqueeze_dim (`int`, *optional*, defaults to 1):
                The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
                sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
                that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
                k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
                cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
                the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
        Returns:
            `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
        """
        cos = cos.unsqueeze(unsqueeze_dim)
        sin = sin.unsqueeze(unsqueeze_dim)
        q_embed = (q * cos) + (rotate_half(q) * sin)
        k_embed = (k * cos) + (rotate_half(k) * sin)
        return q_embed, k_embed

    def forward(self,
                hidden_states: torch.Tensor,
                position_ids: Optional[torch.Tensor],
                position_embeddings: Tuple[torch.Tensor, torch.Tensor],
                attention_mask: Optional[torch.Tensor],
                past_key_value: Optional[Cache] = None,
                cache_position: Optional[torch.LongTensor] = None,
                **kwargs
                ):
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        if position_embeddings is None:
            position_embeddings = self.rotary_emb(hidden_states, position_ids)

        cos, sin = position_embeddings

        query_states, key_states = self.apply_rotary_pos_emb(query_states, key_states, cos, sin)


        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,  # diff with Llama
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


================================================
FILE: kt-sft/ktransformers/operators/balance_serve_attention.py
================================================
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.2.5
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import torch
from torch import nn
from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeAttention
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeAttention
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader
import logging
from transformers.configuration_utils import PretrainedConfig
from flashinfer import BatchMLAPagedAttentionWrapper
from ktransformers.operators.flashinfer_batch_prefill_wrapper import flashInferAttn
from ktransformers.models.custom_cache import KDeepSeekV3Cache, KGQACache
logger = logging.getLogger("attention")

# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

class flashinfer_attn(BaseInjectedModule, DeepseekV2Attention):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.


    def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
        if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
            kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
            q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].reshape(-1, self.kv_lora_rank)
            out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].reshape(-1, self.kv_lora_rank)
            self.q_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.qk_nope_head_dim, 
                                      bias=False, dtype=q_absorb.dtype, device=q_absorb.device)
            self.q_absorb.weight.data = q_absorb
            self.out_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.v_head_dim, 
                                        bias=False, dtype=out_absorb.dtype, device=out_absorb.device)
            self.out_absorb.weight.data = out_absorb
            #del self.orig_module.kv_b_proj
        q_absorb = self.q_absorb.weight.view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
        out_absorb = self.out_absorb.weight.view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
        return q_absorb, out_absorb
    

    def forward(self,
                hidden_states: torch.Tensor,
                kv_cache: KDeepSeekV3Cache,
                position_ids: torch.Tensor,
                wrapper: BatchMLAPagedAttentionWrapper,
                num_tokens_tensors: torch.Tensor,
                page_idx: torch.Tensor,
                page_offset: torch.Tensor,
                ):
        q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states, num_tokens_tensors)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states, num_tokens_tensors), num_tokens_tensors), num_tokens_tensors)
        q = q.view(q_len, self.num_heads, self.q_head_dim)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states, num_tokens_tensors)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        compressed_kv = compressed_kv.contiguous()
        compressed_kv = self.kv_a_layernorm(compressed_kv, num_tokens_tensors)
        k_pe = k_pe.view(q_len, 1, self.qk_rope_head_dim)
        compressed_kv = compressed_kv.view(q_len, 1, self.kv_lora_rank)
        
        cos, sin = self.rotary_emb(q_pe, position_ids.unsqueeze(0))
        q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=2)
        q_pe = q_pe.squeeze(0)
        if kv_cache is not None:
            
            # page_idx, page_offset = kv_cache.get_page_table(position_ids, q_indptr, kv_indptr, kv_indices)
            cache_kwargs = {"sin": sin, "cos": cos, "page_idx": page_idx, "page_offset": page_offset}  # Specific to RoPE models
            compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, self.layer_idx, page_idx, page_offset, cache_kwargs)
            compressed_kv = compressed_kv_with_k_pe [:, :, :, :self.kv_lora_rank].view(-1, kv_cache.page_size, self.kv_lora_rank)
            k_pe = compressed_kv_with_k_pe [:, :, :, self.kv_lora_rank:].view(-1, kv_cache.page_size, self.qk_rope_head_dim)
            
        q_absorb, out_absorb = self.get_absorbed()
        q_nope = q_nope.transpose(0, 1) # q_len is 1, no GPU overhead, same below
        q_nope = torch.matmul(q_nope, q_absorb) # batched MM
        q_nope = q_nope.transpose(0, 1)
        # q_nope.squeeze_(1)
        # q_pe.squeeze_(1)

        attn_output = wrapper.run(q_nope, q_pe, compressed_kv, k_pe).view(q_len, self.num_heads, self.kv_lora_rank)
        attn_output = attn_output.transpose(0, 1)
        attn_output = torch.matmul(attn_output, out_absorb.mT) # [self.num_heads, q_len, self.v_head_dim]
        attn_output = attn_output.transpose(0, 1)
        attn_output = attn_output.reshape(q_len, self.num_heads * self.v_head_dim)
        attn_output = self.o_proj(attn_output, num_tokens_tensors)
        return attn_output

class KQwen2MoeAttention(BaseInjectedModule, Qwen2MoeAttention):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.


    # Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
    def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
        """Applies Rotary Position Embedding to the query and key tensors.

        Args:
            q (`torch.Tensor`): The query tensor.
            k (`torch.Tensor`): The key tensor.
            cos (`torch.Tensor`): The cosine part of the rotary embedding.
            sin (`torch.Tensor`): The sine part of the rotary embedding.
            position_ids (`torch.Tensor`):
                Deprecated and unused.
            unsqueeze_dim (`int`, *optional*, defaults to 1):
                The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
                sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
                that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
                k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
                cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
                the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
        Returns:
            `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
        """
        cos = cos.unsqueeze(unsqueeze_dim)
        sin = sin.unsqueeze(unsqueeze_dim)
        q_embed = (q * cos) + (rotate_half(q) * sin)
        k_embed = (k * cos) + (rotate_half(k) * sin)
        return q_embed, k_embed


    def forward(self,
                hidden_states: torch.Tensor,
                kv_cache: KGQACache,
                position_ids: torch.Tensor,
                wrapper: flashInferAttn,
                bsz_tensors: torch.Tensor,
                page_idx: torch.Tensor,
                page_offset: torch.Tensor,
                ):
        q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states, bsz_tensors)
        key_states = self.k_proj(hidden_states, bsz_tensors)
        value_states = self.v_proj(hidden_states, bsz_tensors)


        query_states = query_states.view(q_len, self.num_heads, self.head_dim)
        key_states = key_states.view(q_len, self.num_key_value_heads, self.head_dim)
        value_states = value_states.view(q_len, self.num_key_value_heads, self.head_dim)
        
        cos, sin = self.rotary_emb(value_states.unsqueeze(0), position_ids.unsqueeze(0))
        query_states, key_states = self.apply_rotary_pos_emb(query_states.unsqueeze(0), key_states.unsqueeze(0), cos, sin, unsqueeze_dim=2)

        query_states = query_states.view(q_len, self.num_heads, self.head_dim)
        key_states = key_states.view(
            q_len, self.num_key_value_heads, self.head_dim
        )
        value_states = value_states.view(
            q_len, self.num_key_value_heads, self.head_dim
        )

        k_cache = kv_cache.get_k_cache(self.layer_idx)
        v_cache = kv_cache.get_v_cache(self.layer_idx)


        attn_output = wrapper.forward(query_states, k_cache, v_cache, key_states, value_states)
  

        attn_output = self.o_proj(attn_output.view(q_len, self.num_heads * self.head_dim), bsz_tensors)

        return attn_output

class KQwen3MoeAttention(BaseInjectedModule, Qwen3MoeAttention):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.


    # Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
    def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
        """Applies Rotary Position Embedding to the query and key tensors.

        Args:
            q (`torch.Tensor`): The query tensor.
            k (`torch.Tensor`): The key tensor.
            cos (`torch.Tensor`): The cosine part of the rotary embedding.
            sin (`torch.Tensor`): The sine part of the rotary embedding.
            position_ids (`torch.Tensor`):
                Deprecated and unused.
            unsqueeze_dim (`int`, *optional*, defaults to 1):
                The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
                sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
                that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
                k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
                cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
                the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
        Returns:
            `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
        """
        cos = cos.unsqueeze(unsqueeze_dim)
        sin = sin.unsqueeze(unsqueeze_dim)
        q_embed = (q * cos) + (rotate_half(q) * sin)
        k_embed = (k * cos) + (rotate_half(k) * sin)
        return q_embed, k_embed


    def forward(self,
                hidden_states: torch.Tensor,
                kv_cache: KGQACache,
                position_ids: torch.Tensor,
                wrapper: flashInferAttn,
                bsz_tensors: torch.Tensor,
                page_idx: torch.Tensor,
                page_offset: torch.Tensor,
                ):
        q_len, _ = hidden_states.size()

        bsz_tensors_q = bsz_tensors * self.num_heads
        bsz_tensors_kv = bsz_tensors * self.num_key_value_heads

        query_states = self.q_norm(self.q_proj(hidden_states, bsz_tensors), bsz_tensors_q)
        key_states = self.k_norm(self.k_proj(hidden_states, bsz_tensors), bsz_tensors_kv)
        value_states = self.v_proj(hidden_states, bsz_tensors)


        query_states = query_states.view(q_len, self.num_heads, self.head_dim)
        key_states = key_states.view(q_len, self.num_key_value_heads, self.head_dim)
        value_states = value_states.view(q_len, self.num_key_value_heads, self.head_dim)
        
        cos, sin = self.rotary_emb(value_states.unsqueeze(0), position_ids.unsqueeze(0))
        query_states, key_states = self.apply_rotary_pos_emb(query_states.unsqueeze(0), key_states.unsqueeze(0), cos, sin, unsqueeze_dim=2)

        query_states = query_states.view(q_len, self.num_heads, self.head_dim)
        key_states = key_states.view(
            q_len, self.num_key_value_heads, self.head_dim
        )
        value_states = value_states.view(
            q_len, self.num_key_value_heads, self.head_dim
        )

        k_cache = kv_cache.get_k_cache(self.layer_idx)
        v_cache = kv_cache.get_v_cache(self.layer_idx)


        attn_output = wrapper.forward(query_states, k_cache, v_cache, key_states, value_states)
  

        attn_output = self.o_proj(attn_output.view(q_len, self.num_heads * self.head_dim), bsz_tensors)

        return attn_output


class deepseek_torch_attn(BaseInjectedModule, DeepseekV2Attention):
    def __init__(self,
                    key: str,
                    gguf_loader : GGUFLoader,
                    config: PretrainedConfig,
                    orig_module: nn.Module,
                    prefill_device: str = "cuda",
                    generate_device: str = "cuda",
                    chunck_size: int = 1000,
                    **kwargs):
            BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
            self.orig_module.__init__(orig_module.config,
                orig_module.layer_idx)
            self.chunck_size = chunck_size # TODO, generate chunck_size automatically.


    def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
        if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
            kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
            q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].reshape(-1, self.kv_lora_rank)
            out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].reshape(-1, self.kv_lora_rank)
            self.q_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.qk_nope_head_dim, 
                                    bias=False, dtype=q_absorb.dtype, device=q_absorb.device)
            self.q_absorb.weight.data = q_absorb
            self.out_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.v_head_dim, 
                                        bias=False, dtype=out_absorb.dtype, device=out_absorb.device)
            self.out_absorb.weight.data = out_absorb
            #del self.orig_module.kv_b_proj
        q_absorb = self.q_absorb.weight.view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
        out_absorb = self.out_absorb.weight.view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
        return q_absorb, out_absorb
    

    def forward(self,
                hidden_states: torch.Tensor,
                kv_cache: KDeepSeekV3Cache,
                position_ids: torch.Tensor,
                wrapper: None,
                num_tokens_tensors: torch.Tensor,
                page_idx: torch.Tensor,
                page_offset: torch.Tensor,
                attention_masks: Optional[list[torch.Tensor]] = None,
                q_indptr: Optional[torch.Tensor] = None,
                kv_indices: Optional[torch.Tensor] = None,
                kv_indptr: Optional[torch.Tensor] = None,
                bsz_tensors: Optional[torch.Tensor] = None,
                last_page_len: Optional[torch.Tensor] = None,
                ):
        # range bsz_tensors
        final_attention_output = torch.tensor([], device=hidden_states.device)
        for i in range(bsz_tensors[0]):
            batch_num_tokens_tensors = q_indptr[i+1] - q_indptr[i]
            batch_last_page_len = last_page_len[i]
            # kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe
            batch_page_idx = page_idx[q_indptr[i]:q_indptr[i+1]]
            batch_page_offset = page_offset[q_indptr[i]:q_indptr[i+1]]
            # kv_page_nums is the number of pages for the current batch
            kv_page_nums = kv_indptr[i+1] - kv_indptr[i]
            # kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm)
            kv_total_len = kv_page_nums * kv_cache.page_size
            if batch_last_page_len is not None:
                kv_total_len = kv_total_len - (kv_cache.page_size - batch_last_page_len)
            # print(f"kv_total_len's shape {kv_total_len.shape}")
            # kv_index is the index of the kv cache pages for the current batch
            kv_index = kv_indices[kv_indptr[i]:kv_indptr[i+1]]
            # we can index [kv_index, page_offset_indices] to get the kv cache for the current batch
            # from q_indptr[i] to q_indptr[i+1] is the range of the current batch
            batch_hidden_states = hidden_states[q_indptr[i]:q_indptr[i+1]]
            batch_position_ids = position_ids[q_indptr[i]:q_indptr[i+1]]
            q_len, _ = batch_hidden_states.size()
            # print("q_len -> ", q_len)

            if self.q_lora_rank is None:
                q = self.q_proj(batch_hidden_states, batch_num_tokens_tensors)
            else:
                q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(batch_hidden_states, batch_num_tokens_tensors), batch_num_tokens_tensors), batch_num_tokens_tensors)
            # for v3, bsz, q_len, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
            q = q.view(q_len, self.num_heads, self.q_head_dim)
            # q_nope is [q_len, num_heads(128), qk_nope_head_dim(128)]
            # q_pe is [q_len, num_heads(128), qk_rope_head_dim(64)]
            q_nope, q_pe = torch.split(
                q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
            )
            # compressed_kv is [q_len, kv_lora_rank(512) + rope(64)]
            compressed_kv = self.kv_a_proj_with_mqa(batch_hidden_states, batch_num_tokens_tensors)
            # compressed_kv is [q_len, kv_lora_rank(512)], k_pe is [q_len, rope(64)]
            compressed_kv, k_pe = torch.split(
                compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
            )
            compressed_kv = compressed_kv.contiguous()
            compressed_kv = self.kv_a_layernorm(compressed_kv, batch_num_tokens_tensors)
            # k_pe is [q_len, 1, qk_rope_head_dim(64)]
            k_pe = k_pe.view(q_len, 1, self.qk_rope_head_dim)
            # compressed_kv is [q_len, 1, kv_lora_rank(512)]
            compressed_kv = compressed_kv.view(q_len, 1, self.kv_lora_rank)
            
            cos, sin = self.rotary_emb(q_pe, batch_position_ids.unsqueeze(0))
            # print(f"q_pe shape{q_pe.shape}, k_pe shape {k_pe.shape}")
            q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=2)
            q_pe = q_pe.squeeze(0)
            # q_pe is [num_heads(128), q_len, qk_rope_head_dim(64)]
            q_pe.transpose_(0, 1)            
            if kv_cache is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset}  # Specific to RoPE models
                compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, self.layer_idx, batch_page_idx, batch_page_offset, cache_kwargs)
                compressed_kv = compressed_kv_with_k_pe [:, :, :, :self.kv_lora_rank].view(-1, kv_cache.page_size, self.kv_lora_rank)
                k_pe = compressed_kv_with_k_pe [:, :, :, self.kv_lora_rank:].view(-1, kv_cache.page_size, self.qk_rope_head_dim)
            # q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
            # out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
            q_absorb, out_absorb = self.get_absorbed()
            # q_nope is [num_heads(128), q_len, qk_nope_head_dim(128)]
            q_nope = q_nope.transpose(0, 1) # q_len is 1, no GPU overhead, same below
            # q_nope is [num_heads(128), q_len, kv_lora_rank(512)]
            q_nope = torch.matmul(q_nope, q_absorb) # batched MM

            # # q_nope is [q_len, num_heads(128), kv_lora_rank(512)]
            # q_nope = q_nope.transpose(0, 1)

            # we need to index out the compressed_kv and k_pe for the current batch
            batch_compressed_kv = None
            batch_k_pe = None
            for page_index in kv_index:
                if kv_total_len > kv_cache.page_size:
                    tmp_compressed_kv = compressed_kv[page_index, 0:kv_cache.page_size, :]
                    tmp_k_pe = k_pe[page_index, 0:kv_cache.page_size, :]
                    if batch_compressed_kv is None or batch_k_pe is None:
                        batch_compressed_kv = tmp_compressed_kv
                        batch_k_pe = tmp_k_pe
                    else: 
                        batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
                        batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
                    kv_total_len -= kv_cache.page_size
                else:
                    tmp_compressed_kv = compressed_kv[page_index, 0:kv_total_len, :]
                    tmp_k_pe = k_pe[page_index, 0:kv_total_len, :]
                    if batch_compressed_kv is None or batch_k_pe is None:
                        batch_compressed_kv = tmp_compressed_kv
                        batch_k_pe = tmp_k_pe
                    else: 
                        batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
                        batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
                    break
            # batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
            # batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
            attention_weights = (torch.matmul(q_pe,batch_k_pe.mT) + torch.matmul(q_nope, batch_compressed_kv.mT)) * self.softmax_scale
            # attention_weights is [num_heads(128), q_len, k_len]
            
            # attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(q_len,-1,-1).transpose(0,1)
            
            # attention_masks[i] is [q_len, k_len]
            
            attention_weights = (attention_weights + attention_masks[i])
            # attention_weights shape is [num_heads(128), q_len, k_len]
            attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=torch.float32).to(q_pe.dtype)
            attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),q_len, lora_rank(512)]
            # out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
            out_absorb = out_absorb.transpose(1,2)
            # q for q_len, n for num_heads, h for v_head_dim, v for kv_lora_rank
            attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), q_len, v_head_dim(128)]
            attn_output = attn_output.transpose(0, 1) # [q_len, num_heads(128), v_head_dim(128)]
            attn_output = attn_output.reshape(q_len, self.num_heads * self.v_head_dim)
            attn_output = self.o_proj(attn_output, batch_num_tokens_tensors)
            final_attention_output = torch.cat((final_attention_output, attn_output), dim=0)
        return final_attention_output

================================================
FILE: kt-sft/ktransformers/operators/base_operator.py
================================================
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
from typing import Any
from torch import nn, Tensor
from ktransformers.util.custom_loader import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
import ktransformers.util.utils as utils
class BaseInjectedModule(nn.Module):
    
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        nn.Module.__init__(self)
        nn.Module.__setattr__(self, "orig_module", orig_module)
        object.__setattr__(self, "key", key)
        object.__setattr__(self, "gguf_loader", gguf_loader)
        object.__setattr__(self, "config", config)
        object.__setattr__(self, "prefill_device", prefill_device)
        object.__setattr__(self, "generate_device", generate_device)
        object.__setattr__(self, "device", generate_device)
        
    def __getattr__(self, name: str) -> Any:
        # __getattr__ in nn.Module doesn't call super().__getattribute__ when name is not in nn.Module.__dict__,
        # but __setattr__ in nn.Module call super().__setattr__ in that case, there may be some attribute set 
        # but can't get using __getattr__, typically these attr is build in attr of the class, so class.attr does not
        # call __getattr__.
        # Example:
        # ...import torch
        # ...l=torch.nn.Linear(100,200)
        # ...l.out_features # 200
        # ...l.__getattr__("out_features") # AttributeError: 'Linear' object has no attribute 'out_features'
        try:
            return object.__getattribute__(self, name) # if this attr belongs to BaseInjectedModule
        except:
            if name == "orig_module":
                return nn.Module.__getattr__(self, "orig_module")
            try:
                return nn.Module.__getattr__(self, "orig_module").__getattr__(name) # if this attr belongs to orig_module
            except:
                return super(nn.Module, nn.Module.__getattr__(self, "orig_module")).__getattribute__(name) # if this attr belongs to orig_module but not in nn.Module.__dict__

    def __setattr__(self, name: str, value: Tensor | nn.Module) -> None:
        if name == "orig_module":
            return nn.Module.__setattr__(self, "orig_module", value)
        # elif name == "base_layer":
        #     return nn.Module.__setattr__(self, "base_layer", value)
        elif hasattr(self, name):
            return object.__setattr__(self, name, value)
        return nn.Module.__getattr__(self, "orig_module").__setattr__(name, value)
    
    def forward(self, *args, **kwargs):
        return self.orig_module.forward(*args, **kwargs)
    
    def load(self, gguf_loader=None, adapter_gguf : bool = False):
        for name, child in self._modules.items():
            if gguf_loader==None:
                utils.load_weights(child, self.gguf_loader, self.key+".", adapter_gguf=adapter_gguf)
            else:
                utils.load_weights(child, gguf_loader, self.key+".", adapter_gguf=adapter_gguf)


================================================
FILE: kt-sft/ktransformers/operators/cpuinfer.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  : This script defines the `CPUInferKVCache` and `CPUInfer` classes for performing inference 
               with a Key-Value Cache on the CPU. The `CPUInferKVCache` class is responsible for configuring 
               and managing key-value caches, updating and retrieving cache data, and handling attention 
               operations. It supports different cache types (e.g., Q4_0, FP16) and retrieval strategies 
               (e.g., shared, separate). The `CPUInfer` class handles task submission and synchronization 
               on the CPU, with optional CUDA stream integration for tasks involving GPU acceleration. 
               These classes facilitate efficient caching and memory management for deep learning models 
               that leverage key-value attention mechanisms, particularly on CPU-based systems.
Author       : djw
Date         : 2024-08-26 23:25:24
Version      : 1.0.0
LastEditors  : djw 
LastEditTime : 2024-08-26 23:25:24
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import sys, os
from typing import Any
import torch
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Release"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
import cpuinfer_ext
from ktransformers.server.config.config import Config


class CPUInferKVCache:
    def __init__(
        self,
        layer_num: int = 32,
        kv_head_num: int = 8,
        q_head_num: int = 32,
        head_dim: int = 128,
        block_len: int = 256,
        anchor_num: int = 4,
        anchor_type: str = "FIXED",
        kv_type: str = "Q4_0",
        retrieval_type: str = "SHARED",
        layer_step: int = 1,
        token_step: int = 1,
        layer_offset: int = 0,
        max_thread_num: int = 32,
        max_batch_size: int = 4,
        max_block_num: int = 512,
    ):

        if anchor_type == "FIXED":
            anchor_type = cpuinfer_ext.kvcache.AnchorType.FIXED
        elif anchor_type == "QUEST":
            anchor_type = cpuinfer_ext.kvcache.AnchorType.QUEST
        elif anchor_type == "DYNAMIC":
            anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
        elif anchor_type == "BLOCK_MEAN":
            anchor_type = cpuinfer_ext.kvcache.AnchorType.BLOCK_MEAN
        elif anchor_type == "BLOCK_MAX":
            anchor_type = cpuinfer_ext.kvcache.AnchorType.BLOCK_MAX
        else:
            raise ValueError(f"Unknown anchor type: {anchor_type}")

        if kv_type == "FP16":
            kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
        elif kv_type == "FP32":
            assert False, "FP32 is not supported yet."
            kv_type = cpuinfer_ext.kvcache.ggml_type.FP32
        elif kv_type == "Q4_0":
            kv_type = cpuinfer_ext.kvcache.ggml_type.Q4_0
        elif kv_type == "Q8_0":
            kv_type = cpuinfer_ext.kvcache.ggml_type.Q8_0
        else:
            raise ValueError(f"Unknown kv type: {kv_type}")

        if retrieval_type == "SHARED":
            retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
        elif retrieval_type == "INDIVIDUAL":
            retrieval_type = cpuinfer_ext.kvcache.RetrievalType.QHEAD
        elif retrieval_type == "SEPARATE":
            retrieval_type = cpuinfer_ext.kvcache.RetrievalType.KVHEAD

        self.config = cpuinfer_ext.kvcache.KVCacheConfig(
            layer_num,
            kv_head_num,
            q_head_num,
            head_dim,
            block_len,
            anchor_num,
            anchor_type,
            kv_type,
            retrieval_type,
            layer_step,
            token_step,
            layer_offset,
            max_block_num,
            max_batch_size,
            max_thread_num,
        )
        self.kvcache = cpuinfer_ext.kvcache.KVCache(self.config)

    def load_kvcache(self, tensor_file_path: str):
        if not os.path.exists(tensor_file_path):
            raise FileNotFoundError(f"The file {tensor_file_path} does not exist.")
        return self.kvcache.load_kvcache(tensor_file_path,)

    def dump_kvcache(
        self, block_table: torch.Tensor, cache_total_len: int, tensor_file_path: str
    ):
        assert (
            block_table.dim() == 1
            and block_table.dtype == torch.int
            and block_table.is_contiguous()
            and block_table.device == torch.device("cpu")
        ), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            block_table.dim(),
            block_table.size(),
            block_table.dtype,
            block_table.is_contiguous(),
            block_table.device,
        )

        assert (
            cache_total_len > 0
            and cache_total_len <= self.config.block_len * block_table.size(0)
        ), "cache_total_len: {}".format(cache_total_len)

        if not os.path.exists(os.path.dirname(tensor_file_path)):
            os.makedirs(os.path.dirname(tensor_file_path))

        return self.kvcache.dump_kvcache(
            block_table.data_ptr(),
            cache_total_len,
            tensor_file_path,
        )

    def update_cache_total_len(self, cache_total_len: int):
        assert cache_total_len > 0, "cache_total_len: {}".format(cache_total_len)
        self.kvcache.update_cache_total_len(cache_total_len)

    # q_in: (bsz, q_len, q_head_num, head_dim)
    # output: (bsz, q_len, q_head_num, head_dim)
    # attn_lse: (bsz, q_len, q_head_num)
    # block_table: (bsz, max_block_num)
    def attn(
        self,
        q_in: torch.Tensor,
        output: torch.Tensor,
        attn_lse: torch.Tensor,
        layer_idx: int,
        generate_token_idx: int,
        block_table: torch.Tensor | None = None,
        cache_seqlens: torch.Tensor | None = None,
        pick_block_num: int | None = None,
        init_block_num: int | None = None,
        local_block_num: int | None = None,
    ):

        assert (
            q_in.dim() == 4
            and q_in.size(2) == self.config.q_head_num
            and q_in.size(3) == self.config.head_dim
            and q_in.dtype == torch.float16
            and q_in.is_contiguous()
            and q_in.device == torch.device("cpu")
        ), "q_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            q_in.dim(), q_in.size(), q_in.dtype, q_in.is_contiguous(), q_in.device
        )

        batch_size = q_in.size(0)
        q_len = q_in.size(1)

        assert (block_table is None) or (
            block_table.dim() == 2
            and block_table.size(0) == batch_size
            and block_table.dtype == torch.int
            and block_table.is_contiguous()
            and block_table.device == torch.device("cpu")
        ), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            block_table.dim(),
            block_table.size(),
            block_table.dtype,
            block_table.is_contiguous(),
            block_table.device,
        )

        max_block_num = block_table.size(1) if block_table is not None else 0

        assert (
            output.dim() == 4
            and output.size(0) == batch_size
            and output.size(2) == self.config.q_head_num
            and output.size(1) == q_len
            and output.size(3) == self.config.head_dim
            and output.dtype == torch.float16
            and output.is_contiguous()
            and output.device == torch.device("cpu")
        ), "output dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            output.dim(),
            output.size(),
            output.dtype,
            output.is_contiguous(),
            output.device,
        )

        assert (
            attn_lse.dim() == 3
            and attn_lse.size(0) == batch_size
            and attn_lse.size(1) == q_len
            and attn_lse.size(2) == self.config.q_head_num
            and attn_lse.dtype == torch.float32
            and attn_lse.is_contiguous()
            and attn_lse.device == torch.device("cpu")
        ), "attn_lse dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            attn_lse.dim(),
            attn_lse.size(),
            attn_lse.dtype,
            attn_lse.is_contiguous(),
            attn_lse.device,
        )

        assert (
            layer_idx >= 0 and layer_idx < self.config.layer_num
        ), "layer_idx: {}".format(layer_idx)

        assert (cache_seqlens is None) or (
            cache_seqlens.dim() == 1
            and cache_seqlens.size(0) == batch_size
            and cache_seqlens.dtype == torch.int
            and cache_seqlens.is_contiguous()
            and cache_seqlens.device == torch.device("cpu")
        ), "cache_seqlens dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            cache_seqlens.dim(),
            cache_seqlens.size(),
            cache_seqlens.dtype,
            cache_seqlens.is_contiguous(),
            cache_seqlens.device,
        )

        return self.kvcache.attn(
            q_in.data_ptr(),
            output.data_ptr(),
            attn_lse.data_ptr(),
            layer_idx,
            generate_token_idx,
            q_len,
            batch_size,
            max_block_num,
            block_table.data_ptr() if block_table is not None else 0,
            cache_seqlens.data_ptr() if cache_seqlens is not None else 0,
            pick_block_num,
            init_block_num,
            local_block_num,
        )

    # k_in: (block_len, kv_head_num, head_dim)
    # v_in: (block_len, kv_head_num, head_dim)
    def update_kvcache_one_block_fp16(
        self, k_in: torch.Tensor, v_in: torch.Tensor, layer_id: int, block_idx: int
    ):
        assert (
            k_in.dim() == 3
            and k_in.size(1) == self.config.block_len
            and k_in.size(0) == self.config.kv_head_num
            and k_in.size(2) == self.config.head_dim
            and k_in.dtype == torch.float16
            and k_in.is_contiguous()
            and k_in.device == torch.device("cpu")
        ), "k_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            k_in.dim(), k_in.size(), k_in.dtype, k_in.is_contiguous(), k_in.device
        )
        assert (
            v_in.dim() == 3
            and v_in.size(1) == self.config.block_len
            and v_in.size(0) == self.config.kv_head_num
            and v_in.size(2) == self.config.head_dim
            and v_in.dtype == torch.float16
            and v_in.is_contiguous()
            and v_in.device == torch.device("cpu")
        ), "v_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            v_in.dim(), v_in.size(), v_in.dtype, v_in.is_contiguous(), v_in.device
        )
        assert (
            layer_id >= 0 and layer_id < self.config.layer_num
        ), "layer_id: {}".format(layer_id)
        assert block_idx >= 0, "block_idx: {}".format(block_idx)
        return self.kvcache.update_one_block_fp16(
            k_in.data_ptr(),
            v_in.data_ptr(),
            layer_id,
            block_idx,
        )

    def get_kvcache_one_block_fp16(
        self, k_in: torch.Tensor, v_in: torch.Tensor, layer_id: int, block_idx: int
    ):
        assert (
            k_in.dim() == 3
            and k_in.size(1) == self.config.block_len
            and k_in.size(0) == self.config.kv_head_num
            and k_in.size(2) == self.config.head_dim
            and k_in.dtype == torch.float16
            and k_in.is_contiguous()
            and k_in.device == torch.device("cpu")
        ), "k_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            k_in.dim(), k_in.size(), k_in.dtype, k_in.is_contiguous(), k_in.device
        )
        assert (
            v_in.dim() == 3
            and v_in.size(1) == self.config.block_len
            and v_in.size(0) == self.config.kv_head_num
            and v_in.size(2) == self.config.head_dim
            and v_in.dtype == torch.float16
            and v_in.is_contiguous()
            and v_in.device == torch.device("cpu")
        ), "v_in dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            v_in.dim(), v_in.size(), v_in.dtype, v_in.is_contiguous(), v_in.device
        )
        assert (
            layer_id >= 0 and layer_id < self.config.layer_num
        ), "layer_id: {}".format(layer_id)
        assert block_idx >= 0, "block_idx: {}".format(block_idx)
        return self.kvcache.get_one_block_fp16(
            k_in.data_ptr(),
            v_in.data_ptr(),
            layer_id,
            block_idx,
        )

    def update_importance_one_block(
        self, importance: torch.Tensor, layer_id: int, block_idx: int
    ):
        assert (
            importance.dim() == 1
            and importance.size(0) == self.config.block_len
            and importance.dtype == torch.float16
            and importance.is_contiguous()
            and importance.device == torch.device("cpu")
        ), "importance dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            importance.dim(),
            importance.size(),
            importance.dtype,
            importance.is_contiguous(),
            importance.device,
        )
        assert (
            layer_id >= 0 and layer_id < self.config.layer_num
        ), "layer_id: {}".format(layer_id)
        assert block_idx >= 0, "block_idx: {}".format(block_idx)
        return self.kvcache.update_importance_one_block(
            importance.data_ptr(),
            layer_id,
            block_idx,
        )

    def get_importance_one_block(
        self, importance: torch.Tensor, layer_id: int, block_idx: int
    ):
        assert (
            importance.dim() == 1
            and importance.size(0) == self.config.block_len
            and importance.dtype == torch.float16
            and importance.is_contiguous()
            and importance.device == torch.device("cpu")
        ), "importance dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            importance.dim(),
            importance.size(),
            importance.dtype,
            importance.is_contiguous(),
            importance.device,
        )
        assert (
            layer_id >= 0 and layer_id < self.config.layer_num
        ), "layer_id: {}".format(layer_id)
        assert block_idx >= 0, "block_idx: {}".format(block_idx)
        return self.kvcache.get_importance_one_block(
            importance.data_ptr(),
            layer_id,
            block_idx,
        )

    def get_anchor_one_block(self, anchor: torch.Tensor, layer_id: int, block_idx: int):
        assert (
            anchor.dim() == 3
            and anchor.size(0) == self.config.kv_head_num
            and anchor.size(1) == self.config.anchor_num
            and anchor.size(2) == self.config.head_dim
            and anchor.dtype == torch.float16
            and anchor.is_contiguous()
            and anchor.device == torch.device("cpu")
        ), "anchor dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            anchor.dim(),
            anchor.size(),
            anchor.dtype,
            anchor.is_contiguous(),
            anchor.device,
        )
        assert (
            layer_id >= 0 and layer_id < self.config.layer_num
        ), "layer_id: {}".format(layer_id)
        assert block_idx >= 0, "block_idx: {}".format(block_idx)
        return self.kvcache.get_anchor_one_block(
            anchor.data_ptr(),
            layer_id,
            block_idx,
        )

    def update_anchor_one_block(
        self, anchor: torch.Tensor, layer_id: int, block_idx: int
    ):
        assert (
            anchor.dim() == 3
            and anchor.size(0) == self.config.kv_head_num
            and anchor.size(1) == self.config.anchor_num
            and anchor.size(2) == self.config.head_dim
            and anchor.dtype == torch.float16
            and anchor.is_contiguous()
            and anchor.device == torch.device("cpu")
        ), "anchor dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            anchor.dim(),
            anchor.size(),
            anchor.dtype,
            anchor.is_contiguous(),
            anchor.device,
        )
        assert (
            layer_id >= 0 and layer_id < self.config.layer_num
        ), "layer_id: {}".format(layer_id)
        assert block_idx >= 0, "block_idx: {}".format(block_idx)
        return self.kvcache.update_anchor_one_block(
            anchor.data_ptr(),
            layer_id,
            block_idx,
        )

    def calc_anchor_all_layers(
        self,
        block_table: torch.Tensor,
        cache_seqlens: torch.Tensor,
    ):
        assert (
            block_table.dim() == 2
            and block_table.size(0) == cache_seqlens.size(0)
            and block_table.dtype == torch.int
            and block_table.is_contiguous()
            and block_table.device == torch.device("cpu")
        ), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            block_table.dim(),
            block_table.size(),
            block_table.dtype,
            block_table.is_contiguous(),
            block_table.device,
        )
        assert (
            cache_seqlens.dim() == 1
            and cache_seqlens.dtype == torch.int
            and cache_seqlens.is_contiguous()
            and cache_seqlens.device == torch.device("cpu")
        ), "cache_seqlens dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            cache_seqlens.dim(),
            cache_seqlens.size(),
            cache_seqlens.dtype,
            cache_seqlens.is_contiguous(),
            cache_seqlens.device,
        )
        batch_size = block_table.size(0)
        max_block_num = block_table.size(1)
        return self.kvcache.calc_anchor_all_layers(
            block_table.data_ptr(),
            cache_seqlens.data_ptr(),
            batch_size,
            max_block_num,
        )

    def clear_importance_all_layers(
        self,
        block_table: torch.Tensor,
        cache_seqlens: torch.Tensor,
    ):
        assert (
            block_table.dim() == 2
            and block_table.size(0) == cache_seqlens.size(0)
            and block_table.dtype == torch.int
            and block_table.is_contiguous()
            and block_table.device == torch.device("cpu")
        ), "block_table dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            block_table.dim(),
            block_table.size(),
            block_table.dtype,
            block_table.is_contiguous(),
            block_table.device,
        )
        assert (
            cache_seqlens.dim() == 1
            and cache_seqlens.dtype == torch.int
            and cache_seqlens.is_contiguous()
            and cache_seqlens.device == torch.device("cpu")
        ), "cache_seqlens dim: {}, size: {}, dtype: {}, contiguous: {}, device: {}".format(
            cache_seqlens.dim(),
            cache_seqlens.size(),
            cache_seqlens.dtype,
            cache_seqlens.is_contiguous(),
            cache_seqlens.device,
        )
        batch_size = block_table.size(0)
        max_block_num = block_table.size(1)
        return self.kvcache.clear_importance_all_layers(
            block_table.data_ptr(),
            cache_seqlens.data_ptr(),
            batch_size,
            max_block_num,
        )

    def get_cache_total_len(self):
        return self.kvcache.get_cache_total_len()

    def update_kvcache_q4(
        self,
        k_in: torch.Tensor,
        k_scales: torch.Tensor,
        v_in: torch.Tensor,
        v_scales: torch.Tensor,
        layer_id: int,
        seq_offset: int | None = None,
        seq_len: int | None = None,
        block_table: torch.Tensor | None = None,
    ):
        raise NotImplementedError

    def update_kvcache_fp16(
        self,
        k_in: torch.Tensor,
        v_in: torch.Tensor,
        layer_idx,
        block_table: torch.Tensor,
        max_block_num,
        past_len: torch.Tensor,
        q_len,
    ):
        batch_size = block_table.size(0)
        return self.kvcache.get_kvcache_fp16(
            k_in.data_ptr(),
            v_in.data_ptr(),
            layer_idx,
            block_table.data_ptr(),
            batch_size,
            max_block_num,
            past_len.data_ptr(),
            q_len
        )

    def get_kvcache_q4(
        self,
        k_in: torch.Tensor,
        k_scales: torch.Tensor,
        v_in: torch.Tensor,
        v_scales: torch.Tensor,
        layer_id: int,
        seq_offset: int | None = None,
        seq_len: int | None = None,
        block_table: torch.Tensor | None = None,
    ):
        raise NotImplementedError

    def get_kvcache_fp16(
        self,
        k_in: torch.Tensor,
        v_in: torch.Tensor,
        layer_id: int,
        layer_idx,
        block_table: torch.Tensor,
        max_block_num,
        past_len: torch.Tensor,
    ):
        batch_size = block_table.size(0)
        return self.kvcache.get_kvcache_fp16(
            k_in.data_ptr(),
            v_in.data_ptr(),
            layer_idx,
            block_table.data_ptr(),
            batch_size,
            max_block_num,
            past_len.data_ptr(),
        )

    def get_and_update_kvcache_fp16(
        self,
        k_cache_cpu: torch.Tensor,
        v_cache_cpu: torch.Tensor,
        layer_idx,
        block_table: torch.Tensor,
        max_block_num,
        past_len: torch.Tensor,
        q_len,
    ):
        batch_size = block_table.size(0)
        return self.kvcache.get_and_update_kvcache_fp16(
            k_cache_cpu.data_ptr(),
            v_cache_cpu.data_ptr(),
            layer_idx,
            block_table.data_ptr(),
            batch_size,
            max_block_num,
            past_len.data_ptr(),
            q_len,
        )

    def update_importance(
        self,
        importance_cache: torch.Tensor,
        layer_idx,
        block_table: torch.Tensor,
        max_block_num,
        offset: torch.Tensor,
        width,
    ):
        batch_size = block_table.size(0)
        return self.kvcache.update_importance(
            importance_cache.data_ptr(),
            layer_idx,
            block_table.data_ptr(),
            batch_size,
            max_block_num,
            offset.data_ptr(),
            width,
        )

    # attn_sparsity: ((bsz, q_len, q_head_num), dtype = torch.float32)
    def get_attn_sparsity(
        self,
        q_in: torch.Tensor,
        attn_sparsity: torch.Tensor,
        layer_idx: int,
        block_table: torch.Tensor,
        cache_seqlens: torch.Tensor,
        block_table_origin: torch.Tensor,
        cache_seqlens_origin: torch.Tensor,
        generate_token_idx: int = 0,
        topk: int | None = None,
        local: int | None = None,
    ):
        batch_size = block_table.size(0)
        max_block_num = block_table.size(1)
        max_block_num_origin = block_table_origin.size(1)
        q_len = q_in.size(1)

        if topk is None or local is None or topk + local >= max_block_num:
            topk = -1
            local = -1
        return self.kvcache.get_attn_sparsity(
            q_in.data_ptr(),
            attn_sparsity.data_ptr(),
            layer_idx,
            generate_token_idx,
            q_len,
            batch_size,
            max_block_num,
            block_table.data_ptr(),
            cache_seqlens.data_ptr(),
            block_table_origin.data_ptr(),
            cache_seqlens_origin.data_ptr(),
            max_block_num_origin,
            topk,
            local,
        )

    def attn_with_kvcache(
        self,
        q_in: torch.Tensor,
        k_in: torch.Tensor,
        v_in: torch.Tensor,
        output: torch.Tensor,
        attn_lse: torch.Tensor,
        layer_idx: int,
        block_table: torch.Tensor,
        cache_seqlens: torch.Tensor,
        generate_token_idx: int = 0,
        topk: int | None = None,
        local: int | None = None,
    ):

        batch_size = block_table.size(0)
        max_block_num = block_table.size(1)
        q_len = q_in.size(1)

        if topk is None or local is None or topk + local >= max_block_num:
            topk = -1
            local = -1
        return self.kvcache.attn_with_kvcache(
            q_in.data_ptr(),
            k_in.data_ptr(),
            v_in.data_ptr(),
            output.data_ptr(),
            attn_lse.data_ptr(),
            layer_idx,
            generate_token_idx,
            q_len,
            batch_size,
            max_block_num,
            block_table.data_ptr(),
            cache_seqlens.data_ptr(),
            topk,
            local,
        )

    def get_all_kvcache_one_layer(
        self, k_in: torch.Tensor, v_in: torch.Tensor, layer_id: int
    ):
        return self.kvcache.get_all_kvcache_one_layer(
            k_in.data_ptr(),
            v_in.data_ptr(),
            layer_id,
        )

    def get_importance(
        self,
        importance: torch.Tensor,
        block_table: torch.Tensor,
    ):
        raise NotImplementedError

    def get_anchor(
        self,
        anchor: torch.Tensor,
        block_table: torch.Tensor,
    ):
        raise NotImplementedError


class CPUInfer:
    cpuinfer = None
    cur_backend_thread_num = 0
    
    def __init__(self, thread_num):
        if thread_num > CPUInfer.cur_backend_thread_num:
            CPUInfer.cur_backend_thread_num = thread_num
            del CPUInfer.cpuinfer
            CPUInfer.cpuinfer = cpuinfer_ext.CPUInfer(thread_num)

    def submit(self, task):
        CPUInfer.cpuinfer.submit(task)

    def submit_with_cuda_stream(self, current_cuda_stream, task):
        CPUInfer.cpuinfer.submit_with_cuda_stream(current_cuda_stream, task)

    def sync(self):
        CPUInfer.cpuinfer.sync()

    def sync_with_cuda_stream(self, current_cuda_stream):
        CPUInfer.cpuinfer.sync_with_cuda_stream(current_cuda_stream)


================================================
FILE: kt-sft/ktransformers/operators/dynamic_attention.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :  
Author       : Jianwei Dong
Date         : 2024-08-26 23:25:24
Version      : 1.0.0
LastEditors  : Jianwei Dong
LastEditTime : 2024-08-26 23:25:24
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""

import torch
from transformers import AutoConfig
import sys, os
import logging
logger = logging.getLogger("dynamic_attention")
sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/cpu_backend")
from ktransformers.operators.cpuinfer import CPUInfer, CPUInferKVCache
try:
    from flash_attn import flash_attn_func, flash_attn_with_kvcache
except:
    print("falsh attn not found")


import math
import json


class DynamicScaledDotProductAttention:
    remaining_length: int
    cpu_infer = None

    def __init__(
        self,
        max_seq_len: int,
        block_size: int,
        config: AutoConfig,
        device: torch.device,
        local_windows_len: int,
        topk: int,
        threads_num: int,
        anchor_type: str = "DYNAMIC",
        kv_type: str = "FP16",
        dense_layer_num: int = 0,
        anchor_num: int = 1,
        block_selection_mode: str = "SHARED",
        layer_step: int = 1,
        token_step: int = 1,
        preselect_block: bool = False,
        preselect_block_count: int = 96,
        prefill_chunk_size: int = 20480,
        use_attn_sparsity: bool = False,
    ):
        # assert anchor_num == 1
        # assert anchor_type == "DYNAMIC"
        self.remaining_length = 0
        valid_anchor_types = ["DYNAMIC", "FIXED", "BLOCK_MEAN", "BLOCK_MAX", "QUEST"]
        assert anchor_type in valid_anchor_types
        if anchor_type == "QUEST":
            assert anchor_num == 2
        elif anchor_type != "FIXED" and anchor_type != "DYNAMIC":
            assert anchor_num == 1

        valid_kv_types = ["FP16", "FP32", "Q4_0", "Q8_0"]
        assert kv_type in valid_kv_types
        if kv_type != "FP16" and kv_type != "FP32":
            assert block_size % 32 == 0

        valid_block_selection_modes = ["SHARED", "SEPARATE"]  # individual
        assert block_selection_mode in valid_block_selection_modes

        self.max_seq_len = max_seq_len
        self.block_num = max_seq_len // block_size
        self.block_size = block_size
        self.anchor_type = anchor_type
        self.kv_type = kv_type
        self.anchor_num = anchor_num
        self.threads_num = threads_num
        self.layer_step = layer_step
        self.token_step = token_step
        self.preselect_block = preselect_block
        self.preselect_block_count = preselect_block_count
        self.block_selection_mode = block_selection_mode
        self.use_attn_sparsity = use_attn_sparsity

        # model config
        self.kv_head_num = config.num_key_value_heads
        self.q_head_num = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.layer_num = config.num_hidden_layers

        self.device = device
        self.local_windows_len = local_windows_len
        self.local_block_num = self.local_windows_len // self.block_size + 1
        self.prefill_chunk_size = prefill_chunk_size

        self.topk = topk
        self.dense_layer_num = dense_layer_num
        # self.dense_layer_num = 32
        self.cache_key_states = torch.zeros(
            (self.block_num, block_size, self.kv_head_num, self.head_dim),
            device=device,
            dtype=torch.float16,
        )
        self.cache_value_states = torch.zeros(
            (self.block_num, block_size, self.kv_head_num, self.head_dim),
            device=device,
            dtype=torch.float16,
        )
        # [max_num_block, block_size, head_num]
        self.cache_importance = torch.zeros(
            (self.block_num, block_size, self.q_head_num),
            device=device,
            dtype=torch.float16,
        )

        # key_states: [bsz, q_len, kv_head_num, head_dim]
        # value_states: [bsz, q_len, kv_head_num, head_dim]
        # query_states: [bsz, q_len, q_head_num, head_dim]
        self.q_in_cpu = torch.zeros(
            (1, 1, self.q_head_num, self.head_dim),
            device="cpu",
            dtype=torch.float16,
            pin_memory=True,
        )
        self.k_in_cpu = torch.zeros(
            (1, 1, self.kv_head_num, self.head_dim),
            device="cpu",
            dtype=torch.float16,
            pin_memory=True,
        )
        self.v_in_cpu = torch.zeros(
            (1, 1, self.kv_head_num, self.head_dim),
            device="cpu",
            dtype=torch.float16,
            pin_memory=True,
        )

        self.cache_seqlens_cpu = torch.empty(
            (1,), device="cpu", dtype=torch.int32, pin_memory=True
        )

        self.cache_seqlens_cuda = torch.empty((1,), device=device, dtype=torch.int32)

        self.prefix_block_table = torch.arange(
            self.block_num, device="cpu", dtype=torch.int32, pin_memory=True
        ).view(1, -1)

        self.block_table_cpu = torch.arange(
            self.block_num, device="cpu", dtype=torch.int32, pin_memory=True
        ).view(1, -1)

        # assert (
        #     self.local_windows_len // self.block_size + 1 + self.preselect_block_count
        #     <= self.block_num
        # )

        self.output_cpu = torch.empty(
            (1, 1, self.q_head_num, self.head_dim),
            device="cpu",
            dtype=torch.float16,
            pin_memory=True,
        )
        self.lse_cpu = torch.empty(
            (1, 1, self.q_head_num), device="cpu", dtype=torch.float32, pin_memory=True
        )

        self.output_cuda = torch.empty(
            (1, 1, self.q_head_num, self.head_dim), device=device, dtype=torch.float16
        )

        self.attn_sparsity = torch.zeros(
            (1, 1, self.q_head_num), device="cpu", dtype=torch.float32, pin_memory=True
        )

        if preselect_block == True:
            self.preselect_block_table = torch.zeros(
                self.layer_num,
                self.preselect_block_count,
                device=device,
                dtype=torch.int32,
            )
            self.preselect_block_num = 0  # block_num before preselect
            self.evict_tokens = 0

        if DynamicScaledDotProductAttention.cpu_infer is None:
            DynamicScaledDotProductAttention.cpu_infer = CPUInfer(threads_num)
            self.cpu_infer = DynamicScaledDotProductAttention.cpu_infer
        self.local_thread = CPUInferKVCache(
            self.layer_num,
            self.kv_head_num,
            self.q_head_num,
            self.head_dim,
            self.block_size,
            anchor_num=self.anchor_num,
            anchor_type=anchor_type,
            kv_type=self.kv_type,
            retrieval_type=self.block_selection_mode,
            layer_step=self.layer_step,
            token_step=self.token_step,
            layer_offset=self.dense_layer_num % self.layer_step,
            max_batch_size=1,
            max_block_num=self.block_num,
            max_thread_num=self.threads_num,
        )

        print(
            f"local_windows_len: {local_windows_len}, topk: {topk}, dense_layer_num: {dense_layer_num}, kv_type: {self.kv_type}, anchor_type: {self.anchor_type}, preselect_block: {self.preselect_block}, preselect_block_count: {self.preselect_block_count}, token_step: {self.token_step}, layer_step: {self.layer_step}"
        )

        self.shape_mask = (
            self.q_head_num,
            self.block_size,
            self.block_size,
        )

        mask = torch.zeros(
            self.shape_mask, dtype=torch.uint8, device=device
        ).contiguous()
        elm_idx = torch.arange(self.block_size, device=device)

        for i in range(mask.size(-2)):
            idx = i + mask.size(-1) - mask.size(-2) - elm_idx
            idx = idx[idx >= 0]
            mask[..., i, idx] = 1

        self.tril_mask = mask
        self.triu_mask = mask ^ 1

        self.generate_token_idx = 0

    def get_attn_score_one_block(
        self,
        batch_idx: int,
        max_block_num: int,
        query: torch.Tensor,
        key: torch.Tensor,
        offset: int,
        width: int,
        mask_mode: str | None = None,
        use_softmax: bool = True,
    ):
        n_rep = self.q_head_num // self.kv_head_num
        importance = self.cache_importance.view(-1, self.q_head_num)
        importance = importance.narrow(0, batch_idx * max_block_num + offset, width)
        n_gqa_ = self.q_head_num // self.kv_head_num 
        for head_idx in range(self.q_head_num):
            key_item = key[..., head_idx // n_gqa_, :].view(key.size(0), -1)
            qk = torch.einsum(
                "qd,kd->qk", query[:,head_idx,:], key_item
            )  # (num_attention_heads, len_q, len_k)

            if mask_mode == "tril":
                mask = self.tril_mask
                mask = mask[0, -qk.size(-2) :, -qk.size(-1) :]
                qk = qk * mask
            elif mask_mode == "triu":
                mask = self.triu_mask
                mask = mask[0, -qk.size(-2) :, -qk.size(-1) :]
                qk = qk * mask

            if use_softmax:
                qk = torch.nn.functional.softmax(
                    qk / math.sqrt(self.head_dim), dim=-1, dtype=torch.float32
                ).to(torch.float16)
              
            qk = torch.sum(qk, dim=-2)
            importance[...,head_idx] += qk

    def get_preselect_block_table_and_attn_score(
        self,
        layer_idx: int,
        batch_size: int,
        offset: torch.Tensor,
        width: int,
        query: torch.Tensor,
        key: torch.Tensor,
        union_with_last_layer: bool = True,
    ):
        max_seqs_len = offset.max().item() + width
        max_block_num = (max_seqs_len + self.block_size - 1) // self.block_size

        for batch_idx in range(batch_size):
            query_cur = query[batch_idx][-128:]
            self.get_attn_score_one_block(
                batch_idx,
                max_block_num,
                query_cur,
                key[batch_idx][: offset[batch_idx].item() + width],
                0,
                offset[batch_idx].item() + width,
                mask_mode=None,
            )

        if self.preselect_block:
            self.prefill_block_num = max(
                0, max_block_num - self.local_windows_len // self.block_size
            )
            self.evict_tokens = (
                max(self.prefill_block_num - self.preselect_block_count, 0)
                * self.block_size
            )

            if self.prefill_block_num != 0:
                importance_cache = self.cache_importance.narrow(
                    0, 0, self.prefill_block_num * batch_size
                ).view(
                    batch_size, self.prefill_block_num, self.block_size, self.q_head_num
                )

                importance_r = importance_cache[:, 1:, : self.block_size // 4]
                pad_r = torch.zeros_like(importance_r[:, :1])
                importance_r = torch.cat((importance_r, pad_r), dim=1)
                importance_l = importance_cache[:, :-1, -self.block_size // 4 :]
                pad_l = torch.zeros_like(importance_l[:, :1])
                importance_l = torch.cat((pad_l, importance_l), dim=1)
                importance = torch.cat(
                    (importance_l, importance_cache, importance_r), dim=2
                )
                importance = importance.mean(dim=-1)
                importance = importance.mean(dim=-1)
                # importance: (batch_size, max_block_num)
                topk = min(self.preselect_block_count, self.prefill_block_num)
                values, indices = torch.topk(
                    importance,
                    k=topk,
                    dim=1,
                )

                self.preselect_block_table[
                    layer_idx : layer_idx + 1,
                    :topk,
                ].copy_(indices)

                if union_with_last_layer and layer_idx == 31:
                    for tmp_layer_idx in range(self.layer_num - 1):
                        for i in range(1, min(topk, 6)):
                            x = self.preselect_block_table[-1, i]
                            if x not in self.preselect_block_table[tmp_layer_idx]:
                                self.preselect_block_table[tmp_layer_idx, topk - i] = x
        if self.anchor_type == "DYNAMIC":
            importance_cache = self.cache_importance.narrow(
                0, 0, max_block_num * batch_size
            ).view(batch_size, max_block_num * self.block_size, self.q_head_num)
            importance_cache_cpu = torch.empty_like(
                importance_cache, device="cpu", pin_memory=True
            )

            importance_cache_cpu.copy_(importance_cache)

            block_table_cpu = self.prefix_block_table[:, :max_block_num].to("cpu")
            offset_cpu = offset.contiguous().to("cpu")

            self.cpu_infer.submit(
                self.local_thread.update_importance(
                    importance_cache_cpu,
                    layer_idx,
                    block_table_cpu,
                    max_block_num,
                    offset_cpu,
                    width,
                )
            )
            self.cpu_infer.sync()

        importance_cache = self.cache_importance.narrow(
            0, 0, max_block_num * batch_size
        ).view(batch_size, max_block_num * self.block_size, self.q_head_num)
        importance_cache.zero_()

    # key: [bsz, past_len, head_num, head_dim] float16
    # query: [bsz, q_len, q_head_num, head_dim] float16
    def get_attn_score(
        self,
        layer_idx: int,
        batch_size: int,
        offset: torch.Tensor,
        width: int,
        query: torch.Tensor,
        key: torch.Tensor,
    ):
        max_seqs_len = offset.max().item() + width
        max_block_num = (max_seqs_len + self.block_size - 1) // self.block_size

        for batch_idx in range(batch_size):
            for idx in range(width // self.block_size):
                offset_cur = idx * self.block_size
                query_cur = query[batch_idx, offset_cur : offset_cur + self.block_size]
                self.get_attn_score_one_block(
                    batch_idx,
                    max_block_num,
                    query_cur,
                    key[
                        batch_idx,
                        offset[batch_idx]
                        + offset_cur : offset[batch_idx]
                        + offset_cur
                        + self.block_size,
                    ],
                    offset[batch_idx].item() + offset_cur,
                    self.block_size,
                    mask_mode="tril",
                    use_softmax=False,
                )

                offset_key = (
                    offset[batch_idx].item()
                    + idx * self.block_size
                    - self.local_windows_len
                )
                if offset_key >= 0:
                    self.get_attn_score_one_block(
                        batch_idx,
                        max_block_num,
                        query_cur,
                        key[batch_idx, offset_key : offset_key + self.block_size],
                        offset_key,
                        self.block_size,
                        mask_mode="triu",
                        use_softmax=False,
                    )

                offset_key = max(0, offset_key + self.block_size)
                width_key = (
                    offset[batch_idx].item() + idx * self.block_size - offset_key
                )
                if width_key > 0:
                    self.get_attn_score_one_block(
                        batch_idx,
                        max_block_num,
                        query_cur,
                        key[batch_idx, offset_key : offset_key + width_key],
                        offset_key,
                        width_key,
                        mask_mode=None,
                        use_softmax=False,
                    )

        importance_cache = self.cache_importance.narrow(
            0, 0, max_block_num * batch_size
        ).view(batch_size, max_block_num * self.block_size, self.q_head_num)
        importance_cache_cpu = torch.empty_like(
            importance_cache, device="cpu", pin_memory=True
        )

        importance_cache_cpu.copy_(importance_cache)

        block_table_cpu = self.prefix_block_table[:, :max_block_num].to("cpu")
        offset_cpu = offset.contiguous().to("cpu")

        self.cpu_infer.submit(
            self.local_thread.update_importance(
                importance_cache_cpu,
                layer_idx,
                block_table_cpu,
                max_block_num,
                offset_cpu,
                width,
            )
        )
        self.cpu_infer.sync()
        importance_cache.zero_()

    # key: [bsz, q_len, head_num, head_dim] float16
    # value: [bsz, q_len, head_num, head_dim] float16
    def swap_in_and_swap_out(self, layer_idx, past_len, q_len, key, value):
        batch_size = 1
        max_seqs_len = past_len.max().item() + q_len
        max_block_num = (max_seqs_len + self.block_size - 1) // self.block_size
        k_cache = self.cache_key_states.narrow(0, 0, max_block_num * batch_size).view(
            batch_size, max_block_num * self.block_size, self.kv_head_num, self.head_dim
        )
        v_cache = self.cache_value_states.narrow(0, 0, max_block_num * batch_size).view(
            batch_size, max_block_num * self.block_size, self.kv_head_num, self.head_dim
        )

        for batch_idx in range(batch_size):
            offset = past_len[batch_idx]
            width = q_len
            k_cache[batch_idx][offset : offset + width].copy_(
                key[batch_idx].view(-1, self.kv_head_num, self.head_dim)
            )
            v_cache[batch_idx][offset : offset + width].copy_(
                value[batch_idx].view(-1, self.kv_head_num, self.head_dim)
            )

        k_cache_cpu = torch.empty_like(k_cache, device="cpu", pin_memory=True)
        v_cache_cpu = torch.empty_like(v_cache, device="cpu", pin_memory=True)

        k_cache_cpu.copy_(k_cache)
        v_cache_cpu.copy_(v_cache)

        cur_block_num = (
            q_len + past_len[0].item() + self.block_size - 1
        ) // self.block_size
        block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
        past_len_cpu = past_len.contiguous().to("cpu")

        self.cpu_infer.submit(
            self.local_thread.get_and_update_kvcache_fp16(
                k_cache_cpu,
                v_cache_cpu,
                layer_idx,
                block_table_cpu,
                max_block_num,
                past_len_cpu,
                q_len,
            )
        )

        self.cpu_infer.sync()
        k_cache.copy_(k_cache_cpu)
        v_cache.copy_(v_cache_cpu)

        return k_cache, v_cache

    def calc_anchor(self, cache_seqlens: int):
        cur_block_num = (cache_seqlens + self.block_size - 1) // self.block_size
        block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
        cache_seqlens_cpu = torch.tensor(
            [cache_seqlens], device="cpu", dtype=torch.int32
        )

        self.cpu_infer.submit(
            self.local_thread.calc_anchor_all_layers(
                block_table_cpu,
                cache_seqlens_cpu,
            )
        )
        self.cpu_infer.sync()

    def clear_importance(self, cache_seqlens: int):
        print(f"clear importance: {cache_seqlens}")
        cur_block_num = (cache_seqlens + self.block_size - 1) // self.block_size
        block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
        cache_seqlens_cpu = torch.tensor(
            [cache_seqlens], device="cpu", dtype=torch.int32
        )

        self.cpu_infer.submit(
            self.local_thread.clear_importance_all_layers(
                block_table_cpu,
                cache_seqlens_cpu,
            )
        )
        self.cpu_infer.sync()

    def clear_kvcache(self, cache_seqlens: int):
        cur_block_num = (cache_seqlens + self.block_size - 1) // self.block_size
        block_table_cpu = self.prefix_block_table[:, :cur_block_num].to("cpu")
        cache_seqlens_cpu = torch.tensor(
            [cache_seqlens], device="cpu", dtype=torch.int32
        )

        self.cpu_infer.submit(
            self.local_thread.clear_kvcache_all_layers(
                block_table_cpu,
                cache_seqlens_cpu,
            )
        )
        self.cpu_infer.sync()

    def get_attn_sparsity(
        self,
        q_in: torch.Tensor,
        layer_idx: int,
        block_table: torch.Tensor,
        cache_seqlens: torch.Tensor,
        block_table_origin: torch.Tensor,
        cache_seqlens_origin: torch.Tensor,
        generate_token_idx: int = 0,
        topk: int | None = None,
        local: int | None = None,
        output_path: str = "./attn_sparsity.json",
    ):
        self.attn_sparsity.zero_()
        self.pcinfer.submit(
            self.local_thread.get_attn_sparsity(
                q_in,
                self.attn_sparsity,
                layer_idx,
                block_table,
                cache_seqlens,
                block_table_origin,
                cache_seqlens_origin,
                generate_token_idx,
                topk,
                local,
            )
        )
        self.cpu_infer.sync()
        with open(output_path, "a") as file:
            for head_idx in range(self.q_head_num):
                sparsity = self.attn_sparsity[0][0][head_idx].item()
                json_obj = {
                    "token_idx": generate_token_idx,
                    "layer_idx": layer_idx,
                    "head_idx": head_idx,
                    "sparsity": sparsity,
                }
                json.dump(json_obj, file)
                file.write("\n")

    def apply(
        self,
        layer_idx: int,
        bsz: int,
        past_len: int,
        query_states: torch.Tensor,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        mode: str = "prefill",
        generate_token_idx: int = -1,
    ):

        # key_states: [bsz, q_len, kv_head_num, head_dim]
        # value_states: [bsz, q_len, kv_head_num, head_dim]
        # query_states: [bsz, q_len, q_head_num, head_dim]
        assert query_states.dtype == torch.float16
        assert key_states.dtype == torch.float16
        assert value_states.dtype == torch.float16

        assert key_states.size(2) == self.kv_head_num
        assert value_states.size(2) == self.kv_head_num
        assert query_states.size(2) == self.q_head_num

        q_len = query_states.size(1)
        batch_size = query_states.size(0)
        self.cache_seqlens_cuda.fill_(past_len)
        last_chunk = False
        if self.remaining_length <= self.prefill_chunk_size and q_len != 1:
            last_chunk = True
        device = query_states.device
        if layer_idx == 0:
            if q_len == 1:
                self.generate_token_idx += 1
            elif last_chunk:
                self.generate_token_idx = -1

        if mode == "prefill":
            key, value = self.swap_in_and_swap_out(
                layer_idx,
                self.cache_seqlens_cuda,
                q_len,
                key_states,
                value_states,
            )

            if last_chunk and (self.anchor_type == "DYNAMIC" or self.preselect_block):
                self.get_preselect_block_table_and_attn_score(
                    layer_idx,
                    bsz,
                    self.cache_seqlens_cuda,
                    q_len,
                    query_states,
                    key,
                )
            output = flash_attn_with_kvcache(
                q=query_states,
                k_cache=key,
                v_cache=value,
                cache_seqlens=self.cache_seqlens_cuda + q_len,
                causal=True,
            )
            return output.transpose(1, 2)

        elif mode == "generate":
            assert self.generate_token_idx >= 0
            self.q_in_cpu.copy_(query_states, non_blocking=True)
            self.k_in_cpu.copy_(key_states, non_blocking=True)
            self.v_in_cpu.copy_(value_states, non_blocking=True)
            self.cache_seqlens_cpu.copy_(self.cache_seqlens_cuda, non_blocking=True)
            #            print(layer_idx)
            if layer_idx < self.dense_layer_num:
                self.block_table_cpu.copy_(self.prefix_block_table, non_blocking=True)
                self.cpu_infer.submit_with_cuda_stream(
                    torch.cuda.current_stream("cuda").cuda_stream,
                    self.local_thread.attn_with_kvcache(
                        q_in=self.q_in_cpu,
                        k_in=self.k_in_cpu,
                        v_in=self.v_in_cpu,
                        output=self.output_cpu,
                        attn_lse=self.lse_cpu,
                        layer_idx=layer_idx,
                        block_table=self.block_table_cpu,
                        cache_seqlens=self.cache_seqlens_cpu,
                    ),
                )
            else:
                if self.preselect_block:
                    self.cache_seqlens_cpu.copy_(
                        self.cache_seqlens_cuda - self.evict_tokens, non_blocking=True
                    )
                    if self.preselect_block_count < self.prefill_block_num:
                        self.block_table_cpu[:, : self.preselect_block_count].copy_(
                            self.preselect_block_table[layer_idx : layer_idx + 1],
                            non_blocking=True,
                        )

                        self.block_table_cpu[
                            :,
                            self.preselect_block_count : self.preselect_block_count
                            + self.local_block_num,
                        ].copy_(
                            self.prefix_block_table[
                                :,
                                self.prefill_block_num : self.prefill_block_num
                                + self.local_block_num,
                            ],
                            non_blocking=True,
                        )
                    #                   print("submit_with_cuda_stream")
                    self.cpu_infer.submit_with_cuda_stream(
                        torch.cuda.current_stream("cuda").cuda_stream,
                        self.local_thread.attn_with_kvcache(
                            q_in=self.q_in_cpu,
                            k_in=self.k_in_cpu,
                            v_in=self.v_in_cpu,
                            output=self.output_cpu,
                            attn_lse=self.lse_cpu,
                            layer_idx=layer_idx,
                            generate_token_idx=self.generate_token_idx,
                            block_table=self.block_table_cpu,
                            cache_seqlens=self.cache_seqlens_cpu,
                            topk=(
                                self.topk
                                if self.topk <= self.preselect_block_count
                                else None
                            ),
                            local=self.local_windows_len // self.block_size,
                        ),
                    )
                #                    print("submit_with_cuda_stream enqueue\n")
                else:
                    self.block_table_cpu.copy_(
                        self.prefix_block_table, non_blocking=True
                    )
                    self.cpu_infer.submit_with_cuda_stream(
                        torch.cuda.current_stream("cuda").cuda_stream,
                        self.local_thread.attn_with_kvcache(
                            q_in=self.q_in_cpu,
                            k_in=self.k_in_cpu,
                            v_in=self.v_in_cpu,
                            output=self.output_cpu,
                            attn_lse=self.lse_cpu,
                            layer_idx=layer_idx,
                            generate_token_idx=self.generate_token_idx,
                            block_table=self.block_table_cpu,
                            cache_seqlens=self.cache_seqlens_cpu,
                            topk=self.topk,
                            local=self.local_windows_len // self.block_size,
                        ),
                    )
            self.cpu_infer.sync_with_cuda_stream(
                torch.cuda.current_stream("cuda").cuda_stream
            )
            #            print("submit_with_cuda_stream finished\n")
            self.output_cuda.copy_(self.output_cpu, non_blocking=True)
            return self.output_cuda.transpose(1, 2)

    def save(self, path: str, length: int):
        cur_block_num = (length + self.block_size - 1) // self.block_size
        block_table_cpu = self.prefix_block_table[0, :cur_block_num].to("cpu")
        cache_seqlens_cpu = torch.tensor([length], device="cpu", dtype=torch.int32)
        self.cpu_infer.submit(
            self.local_thread.dump_kvcache(
                block_table_cpu,
                cache_seqlens_cpu,
                path,
            )
        )
        self.cpu_infer.sync()

    def load(self, path: str, length: int):
        self.cpu_infer.submit(
            self.local_thread.load_kvcache(
                path,
            )
        )
        self.cpu_infer.sync()


================================================
FILE: kt-sft/ktransformers/operators/experts.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : Azure-Tang, Boxin Zhang, chenht2022
Date         : 2024-07-25 11:25:24
Version      : 0.1.0
LastEditors  : Azure 
LastEditTime : 2024-08-29 09:41:10
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''

from typing import Any, Union
import numpy as np
import numpy.typing as npt
from torch import Tensor, nn
import torch.nn.functional as F
import torch
import sys, os
from ktransformers.operators.base_operator import BaseInjectedModule
from tqdm import tqdm
import time
import logging
from tqdm.auto import tqdm
import re

sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Release"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
import cpuinfer_ext
from cpuinfer_ext.moe import MOEConfig, MOE
from cpuinfer_ext.sft_moe import SFT_MOEConfig, SFT_MOE
import ctypes
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.inference_state import InferenceState
from ktransformers.util.custom_gguf import GGMLQuantizationType
from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader, ModelLoader
from ktransformers.server.config.config import Config
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from abc import ABC, abstractmethod
from ktransformers.operators.linear import KLinearMarlin, KLinearTorch, KTransformersLinear
import time
from ktransformers.operators.cpuinfer import CPUInfer
from ktransformers.util.grad_wrapper import maybe_no_grad

H_FIXED = 7168
M_FIXED = 2048

def deduplicate_and_sort(lst):
    return sorted(set(lst))
def generate_cuda_graphs(chunk_size: int) -> list:
    assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must <= 1024 or a multiple of 1024"
    base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size]

    if chunk_size <= 1024:
        return deduplicate_and_sort(base_list)

    multiples = [i for i in range(1024, chunk_size + 1, 1024)]

    return deduplicate_and_sort(base_list + multiples)
#cuda_graphs = [Config().chunk_size] 
if torch.cuda.is_available():
    cuda_graphs = generate_cuda_graphs(Config().chunk_size)
else:
    cuda_graphs = 1
# class Base(BaseInjectedModule, ABC):
class KExpertsBase(ABC):
    def __init__(self, key: str, gguf_loader: GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, device: str = "cuda", **kwargs):
        # super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.key = key
        self.gguf_loader = gguf_loader
        self.config = config
        self.device = device
    
    @abstractmethod
    def forward(self, input_tensor, expert_ids, weights):
        pass

    @abstractmethod
    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu", warmup: bool = False):
        pass
    
    @abstractmethod
    def unload():
        pass

    def load_weights(self, override_key: str | None = None, device: str = "cpu"):
        res = {}
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        gate = None
        up = None
        down = None
        gate_type = None
        up_type = None
        down_type = None

        for key in keys:
            # if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info: # TODO: maybe problem in merge (this is origin one)
            if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
                targets = [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight" ]
                tensors = self.load_multi(key, targets, device=device)
                gate = tensors[".ffn_gate_exps.weight"]
                up = tensors[".ffn_up_exps.weight"]
                down = tensors[".ffn_down_exps.weight"]
                gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
                up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
                down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
            # elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info: # TODO: maybe problem in merge (this is origin one)
            elif self.gguf_loader.has_tensor(key + ".ffn_down.0.weight"):
                # for supporting  Mixtral-8x7B-Instuct  
                gate = []
                up = []
                down = []
                for i in range(8):
                    gatei, upi, downi = f".ffn_gate.{i}.weight", f".ffn_up.{i}.weight", f".ffn_down.{i}.weight"
                    targets = [gatei, upi, downi]
                    tensors = self.load_multi(key, targets, device=device)
                    gate_it, up_it, down_it = tensors[gatei], tensors[upi], tensors[downi]
                    gate.append(gate_it)
                    up.append(up_it)
                    down.append(down_it)
                gate = torch.stack(gate)
                up = torch.stack(up)
                down = torch.stack(down)
                gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate.0.weight"]["ggml_type"]
                up_type = self.gguf_loader.tensor_info[key + ".ffn_up.0.weight"]["ggml_type"]
                down_type = self.gguf_loader.tensor_info[key + ".ffn_down.0.weight"]["ggml_type"]
            else:
                raise ValueError(f"Experts {key} not found in gguf_loader")
            res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
        return res
    
    def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
        tensors = {}
        for k in keys:
            tensors[k] = self.gguf_loader.load_gguf_tensor(key + k, device=device)
        return tensors
class KExpertsCPU(KExpertsBase):
    input_tensor_cpu:Tensor = None
    expert_ids_cpu:Tensor = None
    weights_cpu:Tensor = None
    output_cpu:Tensor = None
    output_gpu_map:dict = {} # Manage output tensor buffer on different gpu
    #stream_map:dict = {} # Manage cuda stream on different gpu
    # @TODO add yaml
    CPU_INFER = CPUInfer(Config().cpu_infer)
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        n_routed_experts: int,
        orig_module: nn.Module = None,
        device: str = "cpu",
        out_device: str = "cuda", # this device mean which device the output should on. TODO: support cpu.
        **kwargs
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        assert device.lower() == "cpu", "KExpertsCPU can only be loaded on CPU"
        self.n_routed_experts = n_routed_experts
        self.out_device = out_device
        self.backend = kwargs.get("backend", "llamafile")

    def load(self, w: dict | nn.Parameter | tuple | None = None, device:str|None = None, warmup:bool = False):
        if device:
            assert device.lower() == "cpu", "KExpertsCPU can only be loaded on CPU, Parameter \"device\" can be cpu or None."
        if w is None: w = self.load_weights()[self.key]
        self.gate = w["gate"]
        self.up = w["up"]
        self.down = w["down"]
        self.gate_type = w["gate_type"]
        self.up_type = w["up_type"]
        self.down_type = w["down_type"]
        gate_ptr = ctypes.addressof(
            ctypes.cast(self.gate.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
        )
        up_ptr = ctypes.addressof(
            ctypes.cast(self.up.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
        )
        down_ptr = ctypes.addressof(
            ctypes.cast(self.down.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
        )
        # print(self.gate_qtype, self.up_qtype, self.down_qtype)
        n_routed_experts = self.n_routed_experts
        self.cpu_infer = KExpertsCPU.CPU_INFER
        # n_routed_experts = len(self.orig_module)
        model_dtype = torch.get_default_dtype()
        if torch.xpu.is_available() and model_dtype == torch.float16:
            hidden_type = 1 # fp16
        else:
            hidden_type = 30 # bf16
        if self.backend == "llamafile":
            moe_config = MOEConfig(
                n_routed_experts,
                self.config.num_experts_per_tok,
                self.config.hidden_size,
                self.config.moe_intermediate_size,
                64,
                10,
                1024,
                gate_ptr,
                up_ptr,
                down_ptr,
                self.gate_type,
                self.up_type,
                self.down_type,
                hidden_type, # TODO: get from model.dtype
            )
            self.moe = MOE(moe_config)
        elif self.backend == "AMXBF16":
            from cpuinfer_ext.moe import AMX_MOEConfig, AMXBF16_MOE
            assert self.gate_type == GGMLQuantizationType.BF16
            assert self.up_type == GGMLQuantizationType.BF16
            assert self.down_type == GGMLQuantizationType.BF16
            moe_config = AMX_MOEConfig(
                n_routed_experts,
                self.config.num_experts_per_tok,
                self.config.hidden_size,
                self.config.moe_intermediate_size,
                max(cuda_graphs) if isinstance(cuda_graphs, list) else Config().chunk_size,
                gate_ptr,
                up_ptr,
                down_ptr,
            )
            self.moe = AMXBF16_MOE(moe_config)
            self.cpu_infer.submit(self.moe.load_weights())
            self.cpu_infer.sync()
        elif self.backend == "AMXInt8":
            from cpuinfer_ext.moe import AMX_MOEConfig, AMXInt8_MOE
            assert self.gate_type == GGMLQuantizationType.BF16
            assert self.up_type == GGMLQuantizationType.BF16
            assert self.down_type == GGMLQuantizationType.BF16
            moe_config = AMX_MOEConfig(
                n_routed_experts,
                self.config.num_experts_per_tok,
                self.config.hidden_size,
                self.config.moe_intermediate_size,
                max(cuda_graphs) if isinstance(cuda_graphs, list) else Config().chunk_size,
                gate_ptr,
                up_ptr,
                down_ptr,
            )
            self.moe = AMXInt8_MOE(moe_config)
            self.cpu_infer.submit(self.moe.load_weights())
            self.cpu_infer.sync()
        # print(n_routed_experts, hidden_size, moe_intermediate_size)
        num_experts_per_tok = self.config.num_experts_per_tok
        if warmup:
            self.cpu_infer.submit(self.moe.warm_up())
            self.cpu_infer.sync()
        if self.out_device not in KExpertsCPU.output_gpu_map:
            if isinstance(cuda_graphs, list):
                KExpertsCPU.output_gpu_map[self.out_device] = [torch.zeros((cuda_graphs[i], self.config.hidden_size), device=self.out_device) for i in range(len(cuda_graphs))]
            else:
                KExpertsCPU.output_gpu_map[self.out_device] = torch.zeros((cuda_graphs, self.config.hidden_size), device=self.out_device)
        if KExpertsCPU.input_tensor_cpu == None:
            if isinstance(cuda_graphs, list):
                KExpertsCPU.input_tensor_cpu = [torch.zeros((cuda_graphs[i], self.config.hidden_size), device="cpu", pin_memory=True) for i in range(len(cuda_graphs))]
                KExpertsCPU.expert_ids_cpu = [torch.zeros((cuda_graphs[i], num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True) for i in range(len(cuda_graphs))]
                KExpertsCPU.weights_cpu = [torch.zeros((cuda_graphs[i], num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True) for i in range(len(cuda_graphs))]
                KExpertsCPU.output_cpu = [torch.zeros((cuda_graphs[i], self.config.hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16) for i in range(len(cuda_graphs))]
                KExpertsCPU.bsz_tensor_cpu = [torch.zeros((1), device="cpu", dtype=torch.int32, pin_memory=True) for i in range(len(cuda_graphs))]
            else:
                KExpertsCPU.input_tensor_cpu = torch.zeros((cuda_graphs, self.config.hidden_size), device="cpu", pin_memory=True)
                KExpertsCPU.expert_ids_cpu = torch.zeros((cuda_graphs, num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True)
                KExpertsCPU.weights_cpu = torch.zeros((cuda_graphs, num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True)
                if torch.xpu.is_available():
                    KExpertsCPU.output_cpu = torch.zeros((cuda_graphs, self.config.hidden_size), device="cpu", pin_memory=True, dtype=model_dtype)
                    KExpertsCPU.bsz_tensor_cpu = torch.ones((1), device="cpu", dtype=torch.int32, pin_memory=True)
                else:
                    KExpertsCPU.output_cpu = torch.zeros((cuda_graphs, self.config.hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16)
                    KExpertsCPU.bsz_tensor_cpu = torch.zeros((1), device="cpu", dtype=torch.int32, pin_memory=True)
            
    def submit_for_one_decode(self, input_tensor, expert_ids, weights, bsz_tensor=None, cuda_graph_idx=0):
        if bsz_tensor is None:
            bsz_tensor = torch.ones(1, device=input_tensor.device, dtype=torch.int32)
        if cuda_graph_idx != -1:
            KExpertsCPU.input_tensor_cpu[cuda_graph_idx].copy_(input_tensor, non_blocking=True)
            KExpertsCPU.expert_ids_cpu[cuda_graph_idx].copy_(expert_ids, non_blocking=True)
            KExpertsCPU.weights_cpu[cuda_graph_idx].copy_(weights, non_blocking=True)
            KExpertsCPU.bsz_tensor_cpu[cuda_graph_idx].copy_(bsz_tensor, non_blocking=True)
            self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream, self.moe.forward(1, expert_ids.size(-1), KExpertsCPU.expert_ids_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.weights_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.input_tensor_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.output_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.bsz_tensor_cpu[cuda_graph_idx].data_ptr()))
        else:
            KExpertsCPU.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
            KExpertsCPU.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
            KExpertsCPU.weights_cpu.copy_(weights, non_blocking=True)
            KExpertsCPU.bsz_tensor_cpu.copy_(bsz_tensor, non_blocking=True)
            self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream, self.moe.forward(1, expert_ids.size(-1), KExpertsCPU.expert_ids_cpu.data_ptr(), KExpertsCPU.weights_cpu.data_ptr(), KExpertsCPU.input_tensor_cpu.data_ptr(), KExpertsCPU.output_cpu.data_ptr(), KExpertsCPU.bsz_tensor_cpu.data_ptr()))
        

    def sync_for_one_decode(self, cuda_graph_idx=0):
        if cuda_graph_idx != -1:
            self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream)
            KExpertsCPU.output_gpu_map[self.out_device][cuda_graph_idx].copy_(KExpertsCPU.output_cpu[cuda_graph_idx], non_blocking=True)
            return KExpertsCPU.output_gpu_map[self.out_device][cuda_graph_idx]
        else:
            self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream)
            KExpertsCPU.output_gpu_map[self.out_device].copy_(KExpertsCPU.output_cpu, non_blocking=True)
            return KExpertsCPU.output_gpu_map[self.out_device]

    def forward(self, input_tensor, expert_ids, weights, bsz_tensor=None, cuda_graph_idx=0):
        # generate, capture and run cuda graph
        # print(expert_ids)
        if bsz_tensor is None and (not torch.xpu.is_available() or input_tensor.size(0) > 1):
            bsz_tensor = torch.tensor([input_tensor.size(0)], device=input_tensor.device, dtype=torch.int32)
        if torch.cuda.is_available() and torch.cuda.is_current_stream_capturing():
            if cuda_graph_idx != -1:
                KExpertsCPU.input_tensor_cpu[cuda_graph_idx].copy_(input_tensor, non_blocking=True)
                KExpertsCPU.expert_ids_cpu[cuda_graph_idx].copy_(expert_ids, non_blocking=True)
                KExpertsCPU.weights_cpu[cuda_graph_idx].copy_(weights, non_blocking=True)
                KExpertsCPU.bsz_tensor_cpu[cuda_graph_idx].copy_(bsz_tensor, non_blocking=True)
                self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(expert_ids.size(0), expert_ids.size(-1), KExpertsCPU.expert_ids_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.weights_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.input_tensor_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.output_cpu[cuda_graph_idx].data_ptr(), KExpertsCPU.bsz_tensor_cpu[cuda_graph_idx].data_ptr()))
                self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
                KExpertsCPU.output_gpu_map[self.out_device][cuda_graph_idx].copy_(KExpertsCPU.output_cpu[cuda_graph_idx], non_blocking=True)
                return KExpertsCPU.output_gpu_map[self.out_device][cuda_graph_idx]

            else:
                KExpertsCPU.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
                KExpertsCPU.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
                KExpertsCPU.weights_cpu.copy_(weights, non_blocking=True)
                KExpertsCPU.bsz_tensor_cpu.copy_(bsz_tensor, non_blocking=True)
                self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, self.moe.forward(expert_ids.size(0), expert_ids.size(-1), KExpertsCPU.expert_ids_cpu.data_ptr(), KExpertsCPU.weights_cpu.data_ptr(), KExpertsCPU.input_tensor_cpu.data_ptr(), KExpertsCPU.output_cpu.data_ptr(), KExpertsCPU.bsz_tensor_cpu.data_ptr()))
                self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
                KExpertsCPU.output_gpu_map[self.out_device].copy_(KExpertsCPU.output_cpu, non_blocking=True)
                return KExpertsCPU.output_gpu_map[self.out_device]
        elif input_tensor.size(0)==1 and torch.xpu.is_available():
            KExpertsCPU.input_tensor_cpu.copy_(input_tensor.view(-1), non_blocking=True)
            KExpertsCPU.expert_ids_cpu.copy_(expert_ids.view(-1), non_blocking=True)
            KExpertsCPU.weights_cpu.copy_(weights.view(-1), non_blocking=True)
            # KExpertsCPU.bsz_tensor_cpu.copy_(bsz_tensor.view(-1), non_blocking=True)
            self.cpu_infer.submit(self.moe.forward(expert_ids.size(0), expert_ids.size(1), KExpertsCPU.expert_ids_cpu.data_ptr(), KExpertsCPU.weights_cpu.data_ptr(), KExpertsCPU.input_tensor_cpu.data_ptr(), KExpertsCPU.output_cpu.data_ptr(), KExpertsCPU.bsz_tensor_cpu.data_ptr()))
            self.cpu_infer.sync()
            KExpertsCPU.output_gpu_map[self.out_device].copy_(KExpertsCPU.output_cpu, non_blocking=True)
            return KExpertsCPU.output_gpu_map[self.out_device].view(1, -1)
        else:
            input_tensor = input_tensor.contiguous().cpu()
            expert_ids = expert_ids.contiguous().cpu()
            weights = weights.contiguous().to(torch.float32).cpu()
            bsz_tensor = bsz_tensor.contiguous().cpu()
            output = torch.empty_like(input_tensor).contiguous()
            self.cpu_infer.submit(self.moe.forward(expert_ids.size(0), expert_ids.size(1), expert_ids.data_ptr(), weights.data_ptr(), input_tensor.data_ptr(), output.data_ptr(), bsz_tensor.data_ptr()))
            self.cpu_infer.sync()
            return output.to(device=object.__getattribute__(self, "out_device"))
    
    def unload(self):
        return

    def load_weights(self, override_key: str | None = None, device: str = "cpu"):
        # TODO: support Bias
        res = {}
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        gate = None
        up = None
        down = None
        gate_type = None
        up_type = None
        down_type = None

        for key in keys:
            if isinstance(self.gguf_loader, SafeTensorLoader):
                res = self.gguf_loader.load_experts(key)
                return {key: res}
            elif self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
                # gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
                # up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
                # down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
                gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate_exps.weight")
                up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up_exps.weight")
                down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down_exps.weight")
            
            elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
                gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
                up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
                down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
            elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info:
                # for supporting  Mixtral-8x7B-Instuct  
                gate = []
                up = []
                down = []
                for i in range(8):
                    gate_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_gate.{i}.weight")
                    up_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_up.{i}.weight")
                    down_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_down.{i}.weight")
                    gate.append(gate_it)
                    up.append(up_it)
                    down.append(down_it)
                gate = np.stack(gate)
                up = np.stack(up)
                down = np.stack(down)
                gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate.0.weight")
                up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up.0.weight")
                down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down.0.weight")
            else:
                raise ValueError(f"Experts {key} not found in gguf_loader")
            res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
        return res
class KSFTExpertsCPU(torch.autograd.Function):
    input_tensor_cpu:Tensor = None
    expert_ids_cpu:Tensor = None
    weights_cpu:Tensor = None
    output_cpu:Tensor = None
    output_gpu_map:dict = {} # Manage output tensor buffer on different gpu
    #stream_map:dict = {} # Manage cuda stream on different gpu
    #gguf_loader:GGUFLoader = None
    CPU_INFER = CPUInfer(Config().cpu_infer)
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        n_routed_experts: int,
        orig_module: nn.Module = None,
        device: str = "cpu",
        out_device: str = "cuda", # this device mean which device the output should on. TODO: support cpu.
        **kwargs
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        #if KExpertsCPU.gguf_loader is None:
        #    KExpertsCPU.gguf_loader = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
        self.gguf_loader = gguf_loader
        assert device.lower() == "cpu", "KExpertsCPU can only be loaded on CPU"
        self.n_routed_experts = n_routed_experts
        self.out_device = out_device
        self.backend = kwargs.get("backend", "llamafile")

        self.key = key
        self.config = config
        self.device = device

        self.call_count = 0
        self.flops_per_call = []
        self.times = []
        
        self.tflops_fwd = []
        self.tflops_bwd = []

    def load(self, w: dict | nn.Parameter | tuple | None = None, device:str|None = None, warmup:bool = False):
        if device:
            assert device.lower() == "cpu", "KSFTExpertsCPU can only be loaded on CPU, Parameter \"device\" can be cpu or None."
        if w is None: w = self.load_weights()[self.key]
        self.gate = w["gate"]
        self.up = w["up"]
        self.down = w["down"]
        self.gate_type = w["gate_type"]
        self.up_type = w["up_type"]
        self.down_type = w["down_type"]
        gate_ptr = ctypes.addressof(
            ctypes.cast(self.gate.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
        )
        up_ptr = ctypes.addressof(
            ctypes.cast(self.up.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
        )
        down_ptr = ctypes.addressof(
            ctypes.cast(self.down.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
        )
        #print(self.gate_type, self.up_type, self.down_type)
        n_routed_experts = self.n_routed_experts
        # n_routed_experts = len(self.orig_module)
        self.cpu_infer = KSFTExpertsCPU.CPU_INFER
        
        model_dtype = torch.get_default_dtype()
        if torch.xpu.is_available() and model_dtype == torch.float16:
            hidden_type = 1 # fp16
        else:
            hidden_type = 30 # bf16
        if self.backend == "llamafile":
            # print("GO INTO LLAMAFILE!!")
            moe_config = SFT_MOEConfig(
                n_routed_experts,
                self.config.num_experts_per_tok,
                self.config.hidden_size,
                self.config.moe_intermediate_size,
                64,
                10,
                1024,
                gate_ptr,
                up_ptr,
                down_ptr,
                self.gate_type,
                self.up_type,
                self.down_type,
                hidden_type, # TODO: get from model.dtype
            )
            self.moe = SFT_MOE(moe_config)
        elif self.backend == "AMXBF16":
            print("GO INTO AMXBF16!!")
            from cpuinfer_ext.sft_moe import SFT_AMX_MOEConfig, SFT_AMXBF16_MOE
            assert self.gate_type == GGMLQuantizationType.BF16
            assert self.up_type == GGMLQuantizationType.BF16
            assert self.down_type == GGMLQuantizationType.BF16
            moe_config = SFT_AMX_MOEConfig(
                n_routed_experts,
                self.config.num_experts_per_tok,
                self.config.hidden_size,
                self.config.moe_intermediate_size,
                max(cuda_graphs) if isinstance(cuda_graphs, list) else Config().chunk_size,
                gate_ptr,
                up_ptr,
                down_ptr,
            )
            self.moe = SFT_AMXBF16_MOE(moe_config)
            self.cpu_infer.submit(self.moe.load_weights())
            self.cpu_infer.sync()
        elif self.backend == "AMXInt8":
            print("GO INTO AMXInt8!!")
            from cpuinfer_ext.sft_moe import SFT_AMX_MOEConfig, SFT_AMXInt8_MOE
            assert self.gate_type == GGMLQuantizationType.BF16
            assert self.up_type == GGMLQuantizationType.BF16
            assert self.down_type == GGMLQuantizationType.BF16
            moe_config = SFT_AMX_MOEConfig(
                n_routed_experts,
                self.config.num_experts_per_tok,
                self.config.hidden_size,
                self.config.moe_intermediate_size,
                max(cuda_graphs) if isinstance(cuda_graphs, list) else Config().chunk_size,
                gate_ptr,
                up_ptr,
                down_ptr,
            )
            self.moe = SFT_AMXInt8_MOE(moe_config)
            self.cpu_infer.submit(self.moe.load_weights())
            self.cpu_infer.sync()

        # print(n_routed_experts, hidden_size, moe_intermediate_size)
        num_experts_per_tok = self.config.num_experts_per_tok
        if warmup:
            self.cpu_infer.submit(self.moe.warm_up())
            self.cpu_infer.sync()
        if self.out_device not in KSFTExpertsCPU.output_gpu_map:
            KSFTExpertsCPU.output_gpu_map[self.out_device] = torch.zeros((self.config.hidden_size), device=self.out_device)
        if KSFTExpertsCPU.input_tensor_cpu == None:
            KSFTExpertsCPU.input_tensor_cpu = torch.zeros((self.config.hidden_size), device="cpu", pin_memory=True)
            KSFTExpertsCPU.expert_ids_cpu = torch.zeros((num_experts_per_tok), device="cpu", dtype=torch.long, pin_memory=True)
            KSFTExpertsCPU.weights_cpu = torch.zeros((num_experts_per_tok), device="cpu", dtype=torch.float32, pin_memory=True)
            KSFTExpertsCPU.output_cpu = torch.zeros((self.config.hidden_size), device="cpu", pin_memory=True, dtype=torch.bfloat16)
            
        self.gate = None
        self.up = None
        self.down = None
            
    def submit_for_one_decode(self, input_tensor, expert_ids, weights):
        KSFTExpertsCPU.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
        KSFTExpertsCPU.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
        KSFTExpertsCPU.weights_cpu.copy_(weights, non_blocking=True)
        self.cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream, self.moe.forward(1, expert_ids.size(0), KSFTExpertsCPU.expert_ids_cpu.data_ptr(), KSFTExpertsCPU.weights_cpu.data_ptr(), KSFTExpertsCPU.input_tensor_cpu.data_ptr(), KSFTExpertsCPU.output_cpu.data_ptr()))
        
    def sync_for_one_decode(self):
        self.cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream(self.out_device).cuda_stream)
        KSFTExpertsCPU.output_gpu_map[self.out_device].copy_(KSFTExpertsCPU.output_cpu, non_blocking=True)
        return KSFTExpertsCPU.output_gpu_map[self.out_device]

    @staticmethod
    def forward(ctx, input_tensor, expert_ids, weights, cpu_infer, moe, out_device, layer_idx):
        # print("Go into the forward")
        
        # generate, capture and run cuda graph
        # torch.set_printoptions(threshold=float('inf'))
        # print(expert_ids)
        # expert_ids.cpu().numpy().tofile('debug_expert_ids.txt', sep='\n')
        # print(expert_ids.size())
        # print(xx)
        if input_tensor.size(0)==1 and torch.cuda.is_current_stream_capturing():
            # TODO: this branch is unreachable, but the shape of input_tensor([1,hidden_size]) and input_tensor_cpu([hidden_size]) is not compatible
            #print("capturing experts")
            KSFTExpertsCPU.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
            KSFTExpertsCPU.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
            KSFTExpertsCPU.weights_cpu.copy_(weights, non_blocking=True)
            cpu_infer.submit_with_cuda_stream(torch.cuda.current_stream().cuda_stream, moe.forward(1, expert_ids.size(1), KSFTExpertsCPU.expert_ids_cpu.data_ptr(), KSFTExpertsCPU.weights_cpu.data_ptr(), KSFTExpertsCPU.input_tensor_cpu.data_ptr(), KSFTExpertsCPU.output_cpu.data_ptr()))
            cpu_infer.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
            t_fwd     = time.time() - wall_t0
            KSFTExpertsCPU.output_gpu_map[out_device].copy_(KSFTExpertsCPU.output_cpu, non_blocking=True)
            result = KSFTExpertsCPU.output_gpu_map[out_device]
        else:
            input_tensor = input_tensor.contiguous().cpu()
            expert_ids = expert_ids.contiguous().cpu()
            weights = weights.contiguous().to(torch.float32).cpu()
            output = torch.empty_like(input_tensor).contiguous()
            # print("success record")
            wall_t0 = time.time()
            cpu_infer.submit(
                moe.forward(
                    expert_ids.size(0), 
                    expert_ids.size(1), 
                    expert_ids.data_ptr(), 
                    weights.data_ptr(), 
                    input_tensor.data_ptr(), 
                    output.data_ptr(),
                )
            )
            cpu_infer.sync()
            t_fwd     = time.time() - wall_t0

            result = output.to(device=out_device)

        ctx.save_for_backward(input_tensor, expert_ids, weights)
        ctx.cpu_infer  = cpu_infer
        ctx.moe        = moe
        ctx.out_device = out_device
        ctx.layer_idx = layer_idx
        
        # ---------- FLOPs ----------
        qlen = expert_ids.size(0)
        k    = expert_ids.size(1)

        flops_fwd = 6 * qlen * k * H_FIXED * M_FIXED
        tflops_f  = flops_fwd / t_fwd / 1e12

        ctx.saved_dims = (qlen, k)
        ctx._time_fwd  = t_fwd
        # print(f"qlen ,k:{qlen}, {k}")
        
        # with open("test_V3_ESC.txt", "a", encoding="utf-8") as f:
        #     f.write(f"[KSFTExpertsCPU]Forward: {flops_fwd/1e9:.3f} GFLOPs {tflops_f:.2f} TFLOPS {t_fwd*1e3:.2f} ms\n")

        return result
        
    @staticmethod
    def backward(ctx, output_grad):
        # print("Go into the backward!!")
        
        # Pick back the middle results
        input_tensor, expert_ids, weights = ctx.saved_tensors
        import random
        layer_idx = random.randint(0, 10000)
        # print(f"layer_idx:{layer_idx}")
        # layer_idx   = ctx.layer_idx
        
        # cpu_infer  = ctx.cpu_infer
        # moe        = ctx.moe
        # out_device = ctx.out_device

        # ready for computing gradient
        output_grad = output_grad.contiguous().cpu()
        input_grad = torch.empty_like(input_tensor).contiguous()
        # print(dir(cpuinfer_ext.moe.MOE))
        bw_start = time.time()
        ctx.cpu_infer.submit(
            ctx.moe.backward(
                # layer_idx,
                output_grad.size(0),  # qlen
                expert_ids.size(1),   # k
                expert_ids.data_ptr(),
                weights.data_ptr(),
                input_tensor.data_ptr(), 
                output_grad.data_ptr(),
                input_grad.data_ptr(),
            )
        )
        ctx.cpu_infer.sync()
        
        bw_end   = time.time()
        t_bw    = bw_end - bw_start
        
        # ---------- FLOPs ----------
        qlen, k  = ctx.saved_dims
        flops_bw = 10 * qlen * k * H_FIXED * M_FIXED
        tflops_b = flops_bw / t_bw / 1e12
        # print(f"qlen:{qlen}, k:{k}")

        # with open("test_V3_ESC.txt", "a", encoding="utf-8") as f:
        #     f.write(f"[KSFTExpertsCPU]Backward: {flops_bw/1e9:.3f} GFLOPs {tflops_b:.2f} TFLOPS {t_bw*1e3:.2f} ms\n")
        
        return input_grad.to(device=ctx.out_device), None, None, None, None, None, None
    
    def unload(self):
        return

    def load_weights(self, override_key: str | None = None, device: str = "cpu"):
        # TODO: support Bias
        res = {}
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        gate = None
        up = None
        down = None
        gate_type = None
        up_type = None
        down_type = None

        for key in keys:
            if isinstance(self.gguf_loader, SafeTensorLoader):
                res = self.gguf_loader.load_experts(key)
                return {key: res}
            elif self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
                # gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
                # up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
                # down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
                gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate_exps.weight")
                up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up_exps.weight")
                down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down_exps.weight")
            
            elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
                gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
                up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
                down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
            elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info:
                # for supporting  Mixtral-8x7B-Instuct  
                gate = []
                up = []
                down = []
                for i in range(8):
                    gate_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_gate.{i}.weight")
                    up_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_up.{i}.weight")
                    down_it = self.gguf_loader.get_mmap_tensor(f"{key}.ffn_down.{i}.weight")
                    gate.append(gate_it)
                    up.append(up_it)
                    down.append(down_it)
                gate = np.stack(gate)
                up = np.stack(up)
                down = np.stack(down)
                gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate.0.weight")
                up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up.0.weight")
                down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down.0.weight")
            else:
                raise ValueError(f"Experts {key} not found in gguf_loader")
            res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
        return res
    
class KExpertsMarlin(KExpertsBase):
    expert_num: int
    loaded_experts_idx: list[int]
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        n_routed_experts: int,
        orig_module: nn.Module = None,
        device: str = "cuda",
        **kwargs
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.expert_num = n_routed_experts
        self.loaded_experts_idx = []
        self.act_fn = ACT2FN[config.hidden_act]
        assert device.lower() != "cpu", "Marlin experts can only be loaded on GPU"
        self.device = device
        self.elements_per_tensor = config.moe_intermediate_size * config.hidden_size

        # create empty marlin experts according to the number of experts per token
        # up
        self.up_projs = [KLinearMarlin(key+ "." + "ffn_up_exps", gguf_loader, config, device=device) for i in range(self.expert_num)]
        # gate
        self.gate_projs = [KLinearMarlin(key+ "." + "ffn_gate_exps", gguf_loader, config, device=device) for i in range(self.expert_num)]
        # down
        self.down_projs = [KLinearMarlin(key+ "." + "ffn_down_exps", gguf_loader, config, device=device) for i in range(self.expert_num)]

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False):
        if device is None: device = self.device
        assert device.lower() != "cpu", "Marlin experts can only be loaded on GPU"
        if w is None:
            w = self.load_weights()
            load_by_experts = True

        if load_by_experts:
            if isinstance(w, dict):
                self.gate = w["gate"]
                self.up = (w["up"])
                self.down = (w["down"])
                for i in tqdm(range(self.expert_num), desc=f"Dequanting and quanting for KExpertsMarlin {self.key}"):
                    up_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_up_exps.weight", self.up, i, self.elements_per_tensor, device=self.device)
                    gate_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_gate_exps.weight", self.gate, i, self.elements_per_tensor, device=self.device)
                    down_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_down_exps.weight", self.down, i, self.elements_per_tensor, device=self.device)
                    
                    self.up_projs[i].load(nn.Parameter(up_weights), device=device)
                    self.gate_projs[i].load(nn.Parameter(gate_weights), device=device)
                    self.down_projs[i].load(nn.Parameter(down_weights), device=device)
                    self.loaded_experts_idx.append(i)
        else:
            if isinstance(w, dict):
                self.gate = w["gate"]
                self.up = (w["up"])
                self.down = (w["down"])
                for i in range(self.expert_num):
                    self.up_projs[i].load(nn.Parameter(self.up[i,...]), device=device)
                    self.gate_projs[i].load(nn.Parameter(self.gate[i,...]), device=device)
                    self.down_projs[i].load(nn.Parameter(self.down[i,...]), device=device)
                    self.loaded_experts_idx.append(i)
        return 

    def unload(self):
        for i in self.loaded_experts_idx:
            self.up_projs[i].unload()
            self.gate_projs[i].unload()
            self.down_projs[i].unload()
        self.loaded_experts_idx = []

    def load_weights(self, override_key: str | None = None):
        res = {}
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        gate = None
        up = None
        down = None

        for key in keys:
            if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
            res = {"gate": gate, "up": up, "down": down}
        return res

    def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
        org_dtype = hidden_states_cpu.dtype
        org_device = hidden_states_cpu.device
        hidden_states_cpu = hidden_states_cpu.to(self.device)
        selected_experts_cpu = selected_experts_cpu.to(self.device)
        routing_weights_cpu = routing_weights_cpu.to(self.device).to(org_dtype)
        
        batch_sequence_length, hidden_dim = hidden_states_cpu.size()

        final_hidden_states = torch.zeros(
            (batch_sequence_length, hidden_dim), dtype=hidden_states_cpu.dtype, device=hidden_states_cpu.device
        )
        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts_cpu, num_classes=self.expert_num).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.expert_num):
            if not expert_mask[expert_idx].any():
                continue
            idx, top_x = torch.where(expert_mask[expert_idx])
            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states_cpu[None, top_x].reshape(-1, hidden_dim)
            G = self.gate_projs[expert_idx].forward(current_state)
            A = self.act_fn(G)
            U = self.up_projs[expert_idx].forward(current_state)
            H = A * U  # Element-wise multiplication
            current_hidden_states = self.down_projs[expert_idx].forward(H) * routing_weights_cpu[top_x, idx, None]
            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states)
        
        return final_hidden_states.to(dtype=org_dtype, device=org_device)
    
# untested, CUDA OOM
class KExpertsTorch(KExpertsBase):
    expert_num: int
    loaded_experts_idx: list[int]
    gate: torch.Tensor
    up: torch.Tensor
    down: torch.Tensor
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        n_routed_experts: int,
        orig_module: nn.Module = None,
        device: str = "cpu",
        **kwargs
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.expert_num = n_routed_experts
        # self.loaded_experts_idx = []
        self.act_fn = ACT2FN[config.hidden_act]
        self.device = device
        self.elements_per_tensor = config.moe_intermediate_size * config.hidden_size
        self.gate = [None for _ in range(self.expert_num)]
        self.up = [None for _ in range(self.expert_num)]
        self.down = [None for _ in range(self.expert_num)]
        self.dtype = torch.get_default_dtype()

        self.call_count = 0
        self.flops_per_call = []
        self.times = []
        self.expert_flops_details = []  
        self.total_flops = 0
        
        h = self.config.hidden_size
        m = self.config.moe_intermediate_size
        self.params_per_expert = 3 * h * m
        self.total_params = self.expert_num * self.params_per_expert

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False):
        if device is None: device = self.device
        if w is None:
            w = self.load_weights()
            load_by_experts = True

        if load_by_experts:
            if isinstance(w, dict):
                if isinstance(self.gguf_loader, SafeTensorLoader): 
                    for i in tqdm(range(self.expert_num), desc=f"Loading experts(safetensors) for {self.key}"):
                        up_k   = f"{self.key}.{i}.up_proj.weight"
                        gate_k = f"{self.key}.{i}.gate_proj.weight"
                        down_k = f"{self.key}.{i}.down_proj.weight"
                        
                        self.up[i]   = self.gguf_loader.load_tensor(up_k,   device=self.device).contiguous()
                        self.gate[i] = self.gguf_loader.load_tensor(gate_k, device=self.device).contiguous()
                        self.down[i] = self.gguf_loader.load_tensor(down_k, device=self.device).contiguous()
                else: # GGUFLoader
                    for i in tqdm(range(self.expert_num), desc=f"Dequanting for KExpertsTorch {self.key}"):
                        up_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_up_exps.weight", w["up"], i, self.elements_per_tensor, device=self.device)
                        gate_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_gate_exps.weight", w["gate"], i, self.elements_per_tensor, device=self.device)
                        down_weights = self.gguf_loader.load_expert_tensor(self.key + ".ffn_down_exps.weight", w["down"], i, self.elements_per_tensor, device=self.device)
                        
                        self.up[i] = up_weights
                        self.gate[i] = gate_weights
                        self.down[i] = down_weights
        else:
            if isinstance(w, dict):
                for i in range(self.expert_num):
                    self.gate[i] = w["gate"][i, ...].to(device=device, dtype=self.dtype)
                    self.up[i] = w["up"][i, ...].to(device=device, dtype=self.dtype)
                    self.down[i] = w["down"][i, ...].to(device=device, dtype=self.dtype)
        
        # self.up = torch.stack(self.up, dim=0)
        # self.gate = torch.stack(self.gate, dim=0)
        # self.down = torch.stack(self.down, dim=0)
        self.up = nn.Parameter(torch.stack(self.up, dim=0))
        self.gate = nn.Parameter(torch.stack(self.gate, dim=0))
        self.down = nn.Parameter(torch.stack(self.down, dim=0))
        return 

    def unload(self):
        if self.gate is not None:
            self.gate = None
            self.up = None
            self.down = None

    def load_weights(self, override_key: str | None = None):
        res = {}
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        gate = None
        up = None
        down = None

        for key in keys:
            if isinstance(self.gguf_loader, SafeTensorLoader):
                res = self.gguf_loader.load_experts(key)
                return {key: res}
            elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
            else:
                import re
                match = re.match(r'model\.layers\.(\d+)\.mlp\.experts(.*)', key)
                if match:
                    layer_id = match.group(1)
                    suffix = match.group(2)
                    key = f"blk.{layer_id}{suffix}"
                    if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
                        gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                        up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                        down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
            res = {"gate": gate, "up": up, "down": down}
        return res

    def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
        start_time = time.time()

        org_device = hidden_states_cpu.device
        hidden_states_cpu = hidden_states_cpu.to(self.device)
        selected_experts_cpu = selected_experts_cpu.to(self.device)
        routing_weights_cpu = routing_weights_cpu.to(self.device)
        
        batch_sequence_length, hidden_dim = hidden_states_cpu.size()

        final_hidden_states = torch.zeros(
            (batch_sequence_length, hidden_dim), dtype=self.gate.dtype, device=hidden_states_cpu.device
        )
        org_dtype = hidden_states_cpu.dtype
        hidden_states_cpu = hidden_states_cpu.to(self.gate.dtype)
        routing_weights_cpu = routing_weights_cpu.to(self.gate.dtype)
        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts_cpu, num_classes=self.expert_num).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.expert_num):
            idx, top_x = torch.where(expert_mask[expert_idx])
            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states_cpu[None, top_x].reshape(-1, hidden_dim)
            G = current_state @ self.gate[expert_idx,...].T
            A = self.act_fn(G)
            U = current_state @ self.up[expert_idx,...].T
            H = A * U  # Element-wise multiplication
            current_hidden_states = H @ self.down[expert_idx,...].T * routing_weights_cpu[top_x, idx, None]
            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states)

        call_flops = 0
        expert_details = []
        
        for expert_idx in range(self.expert_num):
            idx, top_x = torch.where(expert_mask[expert_idx])
            t_e = len(top_x)
            if t_e == 0:
                expert_details.append({'gate':0, 'act':0, 'up':0, 
                                      'element':0, 'down':0, 'routing':0})
                continue
                
            h = self.config.hidden_size
            m = self.config.moe_intermediate_size
            
            flops_gate = 2 * t_e * h * m
            flops_act = t_e * m
            flops_up = 2 * t_e * h * m
            flops_element = t_e * m
            flops_down = 2 * t_e * m * h
            flops_routing = t_e * h
            
            total_expert = sum([flops_gate, flops_act, flops_up, 
                               flops_element, flops_down, flops_routing])
            call_flops += total_expert
            
            expert_details.append({
                'gate': flops_gate,
                'act': flops_act,
                'up': flops_up,
                'element': flops_element,
                'down': flops_down,
                'routing': flops_routing
            })
        
        self.call_count += 1
        self.flops_per_call.append(call_flops)
        self.total_flops += call_flops
        self.expert_flops_details.append(expert_details)
        self.times.append(time.time() - start_time)

        return final_hidden_states.to(dtype=org_dtype, device=org_device)

    # def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
    #     print("Enter the forward function!")
    #     current_call_start = time.perf_counter()
    #     if hasattr(self, 'last_call_end_time') and self.last_call_end_time is not None:
    #         inter_call_interval = current_call_start - self.last_call_end_time
    #         # print(f"\n[Forward Call Interval] Time since last forward call: {inter_call_interval:.6f} seconds")
    #         logging.info(f"\n[Forward Call Interval] Time since last forward call: {inter_call_interval:.6f} seconds")
    #     else:
    #         inter_call_interval = 0.0

    #     data_transfer_time = 0.0
    #     tensor_init_time = 0.0
    #     expert_mask_time = 0.0
    #     expert_loop_total = 0.0
    #     gate_time_total = 0.0
    #     up_time_total = 0.0
    #     elementwise_time_total = 0.0
    #     down_time_total = 0.0
    #     index_add_time_total = 0.0
    #     cast_back_time = 0.0

    #     start = time.perf_counter()
    #     org_device = hidden_states_cpu.device
    #     hidden_states_cpu = hidden_states_cpu.to(self.device)
    #     selected_experts_cpu = selected_experts_cpu.to(self.device)
    #     routing_weights_cpu = routing_weights_cpu.to(self.device)
    #     data_transfer_time = time.perf_counter() - start

    #     start = time.perf_counter()
    #     batch_sequence_length, hidden_dim = hidden_states_cpu.size()
    #     final_hidden_states = torch.zeros(
    #         (batch_sequence_length, hidden_dim), dtype=self.gate.dtype, device=hidden_states_cpu.device
    #     )
    #     org_dtype = hidden_states_cpu.dtype
    #     hidden_states_cpu = hidden_states_cpu.to(self.gate.dtype)
    #     routing_weights_cpu = routing_weights_cpu.to(self.gate.dtype)
    #     tensor_init_time = time.perf_counter() - start

    #     start = time.perf_counter()
    #     expert_mask = torch.nn.functional.one_hot(selected_experts_cpu, num_classes=self.expert_num).permute(2, 1, 0)
    #     expert_mask_time = time.perf_counter() - start

    #     expert_loop_start = time.perf_counter()
    #     # for expert_idx in range(self.expert_num):
    #     for expert_idx in tqdm(range(self.expert_num), 
    #         idx, top_x = torch.where(expert_mask[expert_idx])
            
    #         current_state = hidden_states_cpu[None, top_x].reshape(-1, hidden_dim)

    #         gate_start = time.perf_counter()
    #         G = current_state @ self.gate[expert_idx,...].T
    #         A = self.act_fn(G)
    #         gate_time_total += time.perf_counter() - gate_start

    #         up_start = time.perf_counter()
    #         U = current_state @ self.up[expert_idx,...].T
    #         up_time_total += time.perf_counter() - up_start

    #         element_start = time.perf_counter()
    #         H = A * U  # Element-wise multiplication
    #         elementwise_time_total += time.perf_counter() - element_start

    #         down_start = time.perf_counter()
    #         current_hidden_states = H @ self.down[expert_idx,...].T * routing_weights_cpu[top_x, idx, None]
    #         down_time_total += time.perf_counter() - down_start

    #         index_start = time.perf_counter()
    #         final_hidden_states.index_add_(0, top_x, current_hidden_states)
    #         index_add_time_total += time.perf_counter() - index_start

    #     expert_loop_total = time.perf_counter() - expert_loop_start
    #     start = time.perf_counter()
    #     final_hidden_states = final_hidden_states.to(dtype=org_dtype, device=org_device)
    #     cast_back_time = time.perf_counter() - start

    #     total_time = time.perf_counter() - current_call_start
    #     print(f"""
    # [Timing Breakdown]
    #     Data Transfer:          {data_transfer_time:.6f}s
    #     Tensor Initialization:  {tensor_init_time:.6f}s
    #     Expert Mask Creation:   {expert_mask_time:.6f}s
    #     Expert Loop Total:      {expert_loop_total:.6f}s
    #         -> Gate Computations:   {gate_time_total:.6f}s
    #         -> Up Projections:      {up_time_total:.6f}s
    #         -> Elementwise Mult:    {elementwise_time_total:.6f}s
    #         -> Down Projections:    {down_time_total:.6f}s
    #         -> Index Add Ops:       {index_add_time_total:.6f}s
    #     Cast Back to Original:  {cast_back_time:.6f}s
    #     Total Forward Time:     {total_time:.6f}s
    #     """)
    #     logging.info(f"""
    # [Timing Breakdown]
    #     Data Transfer:          {data_transfer_time:.6f}s
    #     Tensor Initialization:  {tensor_init_time:.6f}s
    #     Expert Mask Creation:   {expert_mask_time:.6f}s
    #     Expert Loop Total:      {expert_loop_total:.6f}s
    #         -> Gate Computations:   {gate_time_total:.6f}s
    #         -> Up Projections:      {up_time_total:.6f}s
    #         -> Elementwise Mult:    {elementwise_time_total:.6f}s
    #         -> Down Projections:    {down_time_total:.6f}s
    #         -> Index Add Ops:       {index_add_time_total:.6f}s
    #     Cast Back to Original:  {cast_back_time:.6f}s
    #     Total Forward Time:     {total_time:.6f}s
    #     """)

    #     self.last_call_end_time = time.perf_counter()

    #     return final_hidden_states


EXPERTS_MAP = {
    "KExpertsCPU": KExpertsCPU,
    "KSFTExpertsCPU": KSFTExpertsCPU,
    "KExpertsTorch": KExpertsTorch,
    "KExpertsMarlin": KExpertsMarlin,
}

class KTransformersExperts(BaseInjectedModule, KExpertsBase):
    def __init__(self,
                 key: str,
                 gguf_loader: GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                #  device: str = "cuda",
                 prefill_device:str = "cuda",
                 prefill_op: str | None = "KExpertsTorch",
                 generate_device: str = "cpu",
                 generate_op: str | None = "KExpertsCPU",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        KExpertsBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        if generate_op is not None:
            self.generate_experts = EXPERTS_MAP[generate_op](key, gguf_loader, config, len(orig_module), device=generate_device, **kwargs)
        else:
            self.generate_experts = None
        if prefill_op is not None:
            self.prefill_experts = EXPERTS_MAP[prefill_op](key, gguf_loader, config, len(orig_module), device=prefill_device, **kwargs)
        else:
            self.prefill_experts = None
        self.gpu_mlp_type = prefill_op
        self.cpu_mlp_type = generate_op
        self.mode = InferenceState.UNLOAD

    def load(self, w: dict = None,  mode: InferenceState = None, warmup: bool = True):
        # TODO support w as input
        if not mode: mode = InferenceState.GENERATE
        if mode == InferenceState.GENERATE:
            self.prefill_experts.unload()
            self.generate_experts.load(w, warmup=warmup)
            self.device = self.generate_experts.device
            self.mode = mode
        elif mode == InferenceState.PREFILL:
            self.generate_experts.unload()
            self.prefill_experts.load(w, warmup=warmup)
            self.device = self.prefill_experts.device
            self.mode = mode
        elif mode == InferenceState.UNLOAD:
            self.unload()
            self.mode = mode
            self.device = self.generate_experts.device
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")

    def unload(self):
        if self.generate_experts is not None:
            self.generate_experts.unload()
        if self.prefill_experts is not None:
            self.prefill_experts.unload()
        self.device = self.generate_experts.device

    def forward(self, input_tensor, expert_ids, weights):
        if self.mode == InferenceState.GENERATE:
            assert self.generate_experts is not None, "generate_experts is None"
            if type(self.generate_experts) == KSFTExpertsCPU:
                layer_idx = int(re.search(r'\d+', self.key).group())
                return self.generate_experts.apply(input_tensor, expert_ids, weights, self.generate_experts.cpu_infer, self.generate_experts.moe, self.generate_experts.out_device, layer_idx)
            else:
                return self.generate_experts.forward(input_tensor, expert_ids, weights)
        elif self.mode == InferenceState.PREFILL:
            assert self.prefill_experts is not None, "prefill_experts is None"
            return self.prefill_experts.forward(input_tensor, expert_ids, weights)
        else:
            raise ValueError("load or set_inference_mode before forward")

    def set_inference_mode(self, mode: InferenceState):
        if mode == InferenceState.GENERATE:
            self.load(mode=InferenceState.GENERATE, warmup=False)
        elif mode == InferenceState.PREFILL:
            self.load(mode=InferenceState.PREFILL, warmup=False)
        elif mode == InferenceState.UNLOAD:
            self.unload()
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")


from ktransformers.models.modeling_deepseek import DeepseekV2MoE
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MoE
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock
from ktransformers.models.modeling_mixtral import MixtralSparseMoeBlock


class KQwen2MoeSparseMoeBlock(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        orig_shape = hidden_states.shape
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        if self.norm_topk_prob:
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)
        
        if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"):
            self.experts.generate_experts.submit_for_one_decode(hidden_states[0], selected_experts[0], routing_weights[0])
            shared_expert_output = self.shared_expert(hidden_states)
            shared_expert_output = F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output
            y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)
            y += shared_expert_output
            y.resize_(*orig_shape)
            return y, router_logits
        
        hidden_states_expert = hidden_states.to(self.experts.device)  if isinstance(self.experts, KExpertsBase) else hidden_states.cpu()
        selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts.cpu()
        routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights.cpu()

        shared_expert_output = self.shared_expert(hidden_states)
        shared_expert_output = (
            F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output
        )

        if isinstance(self.experts, KExpertsBase):
            y = (
                self.moe_kexperts(
                    hidden_states_expert, selected_experts_expert, routing_weights_expert
                )
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        elif hidden_states_expert.size(0) > 10:
            y = self.moe_infer(
                hidden_states_expert, selected_experts_expert, routing_weights_expert, orig_shape
            ).to(device=hidden_states.device)
        else:
            y = self.moe_infer_simple(
                hidden_states_expert, selected_experts_expert, routing_weights_expert
            ).to(device=hidden_states.device)
        y += shared_expert_output
        y.resize_(*orig_shape)
        return y, router_logits
    
    @maybe_no_grad()
    def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
        outs = self.experts(x, topk_ids, topk_weight)
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer_simple(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
        '''
        hidden_states_cpu: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        '''
        outs = torch.zeros_like(hidden_states_cpu)
        for token_idx in range(selected_experts_cpu.size(0)):
            for expert_idx in range(selected_experts_cpu.size(1)):
                expert = self.experts[selected_experts_cpu[token_idx, expert_idx]]
                outs[token_idx] += expert.forward(hidden_states_cpu[token_idx]) * routing_weights_cpu[token_idx, expert_idx]
        return outs
    
    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor, orig_shape: tuple) -> torch.Tensor:
        
        batch_size, sequence_length, hidden_dim = orig_shape

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states_cpu.dtype, device=hidden_states_cpu.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts_cpu, num_classes=self.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states_cpu[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer.forward(current_state) * routing_weights_cpu[top_x, idx, None]

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states_cpu.dtype))

        return final_hidden_states

class KDeepseekV2MoE(BaseInjectedModule, DeepseekV2MoE):
    def forward(self, hidden_states):
        identity = hidden_states
        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]
        topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        
        if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing():
            self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
            if self.config.n_shared_experts is not None:
                y_ = self.shared_experts(identity).squeeze(0)
            y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)
            y += y_
            y.resize_(*orig_shape)
            return y

        if self.config.n_shared_experts is not None:
            y_ = self.shared_experts(identity).squeeze(0)
            
        if isinstance(self.experts, KExpertsBase):
            y = self.moe_kexperts(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        if self.config.n_shared_experts is not None:
            y += y_
        return y

    @maybe_no_grad()
    def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
        outs = self.experts(x, topk_ids, topk_weight)
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer_simple(
        self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out

class KDeepseekV3MoE(BaseInjectedModule, DeepseekV3MoE):
    
    def forward(self, hidden_states):
        identity = hidden_states
        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]
        topk_idx, topk_weight = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        
        # only for generate phase
        if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing():
            self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
            if self.config.n_shared_experts is not None:
                y_ = self.shared_experts(identity).squeeze(0)
            y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)
            y += y_
            y.resize_(*orig_shape)
            return y

        if self.config.n_shared_experts is not None:
            y_ = self.shared_experts(identity).squeeze(0)
            
        if isinstance(self.experts, KExpertsBase):
            y = self.moe_kexperts(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        if self.config.n_shared_experts is not None:
            y += y_
        return y

    @maybe_no_grad()
    def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
        outs = self.experts(x, topk_ids, topk_weight)
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer_simple(
        self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out

class KMistralSparseMoEBlock(BaseInjectedModule, MixtralSparseMoeBlock):
    
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        orig_shape = hidden_states.shape
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        if self.training and self.jitter_noise > 0:
            hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise)
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)
        
        if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"):
            self.experts.generate_experts.submit_for_one_decode(hidden_states[0], selected_experts[0], routing_weights[0])
            y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)
            y.resize_(*orig_shape)
            return y, router_logits
        
        hidden_states_expert = hidden_states.to(self.experts.device)  if isinstance(self.experts, KExpertsBase) else hidden_states_expert.cpu()
        selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts_expert.cpu()
        routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights_expert.cpu()

        if isinstance(self.experts, KExpertsBase):
            y = (
                self.moe_kexperts(
                    hidden_states_expert, selected_experts_expert, routing_weights_expert
                )
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        elif hidden_states_expert.size(0) > 10:
            y = self.moe_infer(
                hidden_states_expert, selected_experts_expert, routing_weights_expert, orig_shape
            ).to(device=hidden_states.device)
        else:
            y = self.moe_infer_simple(
                hidden_states_expert, selected_experts_expert, routing_weights_expert
            ).to(device=hidden_states.device)
            
        y.resize_(*orig_shape)
        return y, router_logits
    
    @maybe_no_grad()
    def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
        outs = self.experts(x, topk_ids, topk_weight)
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer_simple(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
        '''
        hidden_states_cpu: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        '''
        outs = torch.zeros_like(hidden_states_cpu)
        for token_idx in range(selected_experts_cpu.size(0)):
            for expert_idx in range(selected_experts_cpu.size(1)):
                expert = self.experts[selected_experts_cpu[token_idx, expert_idx]]
                outs[token_idx] += expert.forward(hidden_states_cpu[token_idx]) * routing_weights_cpu[token_idx, expert_idx]
        return outs
    
    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor, orig_shape: tuple) -> torch.Tensor:
        
        batch_size, sequence_length, hidden_dim = orig_shape

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states_cpu.dtype, device=hidden_states_cpu.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(selected_experts_cpu, num_classes=self.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states_cpu[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer.forward(current_state) * routing_weights_cpu[top_x, idx, None]

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states_cpu.dtype))

        return final_hidden_states

class KDeepseekV3MoEV2(BaseInjectedModule, DeepseekV3MoE):
    def forward(self, hidden_states, bsz_tensor, cuda_graph_idx=0):
        identity = hidden_states
        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]
        topk_idx, topk_weight = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        

        # only for generate phase
        if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
            self.experts.generate_experts.submit_for_one_decode(hidden_states, topk_idx, topk_weight, bsz_tensor, cuda_graph_idx)
            if self.config.n_shared_experts is not None:
                y_ = self.shared_experts(identity, bsz_tensor).squeeze(0)
            y = self.experts.generate_experts.sync_for_one_decode(cuda_graph_idx).unsqueeze(0)
            y += y_
            y.resize_(*orig_shape)
            return y

        if self.config.n_shared_experts is not None:
            y_ = self.shared_experts(identity, bsz_tensor).squeeze(0)
            
        if isinstance(self.experts, KExpertsBase):
            y = self.moe_on_cpuinfer(hidden_states, topk_idx, topk_weight, bsz_tensor, cuda_graph_idx).view(*orig_shape).to(device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, topk_idx, topk_weight)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        if self.config.n_shared_experts is not None:
            y += y_
        return y

    @maybe_no_grad()
    def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor, bsz_tensor, cuda_graph_idx=0) -> torch.Tensor:
        outs = torch.empty_like(x)
        outs = self.experts(x, topk_ids, topk_weight, bsz_tensor, cuda_graph_idx)
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer_simple(
        self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out

class KTransformersExpertsV2(BaseInjectedModule, KExpertsBase):
    def __init__(self,
                 key: str,
                 gguf_loader: GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                #  device: str = "cuda",
                 prefill_device:str = "cuda",
                 prefill_op: str | None = "KExpertsTorch",
                 generate_device: str = "cpu",
                 generate_op: str | None = "KExpertsCPU",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        KExpertsBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        if generate_op is not None:
            self.generate_experts = EXPERTS_MAP[generate_op](key, gguf_loader, config, len(orig_module), device=generate_device, **kwargs)
        else:
            self.generate_experts = None
        if prefill_op is not None:
            self.prefill_experts = EXPERTS_MAP[prefill_op](key, gguf_loader, config, len(orig_module), device=prefill_device, **kwargs)
        else:
            self.prefill_experts = None
        self.gpu_mlp_type = prefill_op
        self.cpu_mlp_type = generate_op
        self.mode = InferenceState.UNLOAD

    def load(self, w: dict = None,  mode: InferenceState = None, warmup: bool = True):
        # TODO support w as input
        if not mode: mode = InferenceState.GENERATE
        if mode == InferenceState.GENERATE:
            self.prefill_experts.unload()
            self.generate_experts.load(w, warmup=warmup)
            self.device = self.generate_experts.device
            self.mode = mode
        elif mode == InferenceState.PREFILL:
            self.generate_experts.unload()
            self.prefill_experts.load(w, warmup=warmup)
            self.device = self.prefill_experts.device
            self.mode = mode
        elif mode == InferenceState.UNLOAD:
            self.unload()
            self.mode = mode
            self.device = self.generate_experts.device
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")

    def unload(self):
        if self.generate_experts is not None:
            self.generate_experts.unload()
        if self.prefill_experts is not None:
            self.prefill_experts.unload()
        self.device = self.generate_experts.device

    def forward(self, input_tensor, expert_ids, weights, bsz_tensor, cuda_graph_idx=0):
        if self.mode == InferenceState.GENERATE:
            assert self.generate_experts is not None, "generate_experts is None"
            return self.generate_experts.forward(input_tensor, expert_ids, weights, bsz_tensor, cuda_graph_idx)
        elif self.mode == InferenceState.PREFILL:
            assert self.prefill_experts is not None, "prefill_experts is None"
            return self.prefill_experts.forward(input_tensor, expert_ids, weights, bsz_tensor, cuda_graph_idx)
        else:
            raise ValueError("load or set_inference_mode before forward")

    def set_inference_mode(self, mode: InferenceState):
        if mode == InferenceState.GENERATE:
            self.load(mode=InferenceState.GENERATE, warmup=False)
        elif mode == InferenceState.PREFILL:
            self.load(mode=InferenceState.PREFILL, warmup=False)
        elif mode == InferenceState.UNLOAD:
            self.unload()
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")

class KQwen2MoeSparseMoeBlockV2(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
    def forward(self, hidden_states, bsz_tensor, cuda_graph_idx=0):

        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]

        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

        router_logits = self.gate(hidden_states, bsz_tensor)        

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        if self.norm_topk_prob:
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        # only for generate phase
        if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
            self.experts.generate_experts.submit_for_one_decode(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx)
            y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
            y_ = F.sigmoid(self.shared_expert_gate(hidden_states)) * y_    

            y = self.experts.generate_experts.sync_for_one_decode(cuda_graph_idx).unsqueeze(0)
            
            y += y_
            y.resize_(*orig_shape)
            return y

        y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
        y_ = (
            F.sigmoid(self.shared_expert_gate(hidden_states)) * y_
        )


        if isinstance(self.experts, KExpertsBase):
            y = self.moe_on_cpuinfer(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx).view(*orig_shape).to(device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            ) 
        y += y_
        return y

    @maybe_no_grad()
    def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor, bsz_tensor, cuda_graph_idx=0) -> torch.Tensor:
        outs = torch.empty_like(x)
        outs = self.experts(x, topk_ids, topk_weight, bsz_tensor, cuda_graph_idx)
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer_simple(
        self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out

class KQwen3MoeSparseMoeBlockV2(BaseInjectedModule, Qwen3MoeSparseMoeBlock):
    def forward(self, hidden_states, bsz_tensor=None, cuda_graph_idx=0):

        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]

        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

        if bsz_tensor is None:
            router_logits = self.gate(hidden_states)
        else:
            router_logits = self.gate(hidden_states, bsz_tensor)

        if router_logits.device.type == "xpu":
            from ipex_llm.transformers.models.common import moe_softmax_topk
            selected_experts, routing_weights = moe_softmax_topk(
                router_logits.half(), self.top_k, self.norm_topk_prob
            )
        else:
            routing_weights = torch.nn.functional.softmax(router_logits, dim=1, dtype=torch.float)
            routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
            if self.norm_topk_prob:
                routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        # only for generate phase
        if hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing(): # TODO: this branch cause jit bug
            self.experts.generate_experts.submit_for_one_decode(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx)
            # y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
            # y_ = F.sigmoid(self.shared_expert_gate(hidden_states)) * y_    

            y = self.experts.generate_experts.sync_for_one_decode(cuda_graph_idx).unsqueeze(0)
            
            # y += y_
            y.resize_(*orig_shape)
            return y

        # y_ = self.shared_expert(hidden_states, bsz_tensor).squeeze(0)
        # y_ = (
        #     F.sigmoid(self.shared_expert_gate(hidden_states)) * y_
        # )


        if isinstance(self.experts, KExpertsBase):
            y = self.moe_on_cpuinfer(hidden_states, selected_experts, routing_weights, bsz_tensor, cuda_graph_idx).view(*orig_shape).to(device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            ) 
        # y += y_
        return y

    @maybe_no_grad()
    def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor, bsz_tensor, cuda_graph_idx=0) -> torch.Tensor:
        outs = torch.empty_like(x)
        outs = self.experts(x, topk_ids, topk_weight, bsz_tensor, cuda_graph_idx)
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer_simple(
        self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out


class KQwen3MoeSparseMoeBlock(BaseInjectedModule, Qwen3MoeSparseMoeBlock):
    def forward(self, hidden_states):

        orig_shape = hidden_states.shape
        sequence_length = orig_shape[1]

        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

        router_logits = self.gate(hidden_states)

        if router_logits.device.type == "xpu":
            from ipex_llm.transformers.models.common import moe_softmax_topk
            selected_experts, routing_weights = moe_softmax_topk(
                router_logits.half(), self.top_k, self.norm_topk_prob
            )
        else:
            routing_weights = torch.nn.functional.softmax(router_logits, dim=1, dtype=torch.float)
            routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
            if self.norm_topk_prob:
                routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        # only for generate phase
        if sequence_length == 1 and hasattr(self.experts.generate_experts,
                                            "submit_for_one_decode") and torch.cuda.is_available() and torch.cuda.is_current_stream_capturing():  # TODO: this branch cause jit bug
            self.experts.generate_experts.submit_for_one_decode(hidden_states[0], selected_experts[0],
                                                                routing_weights[0])
            # y_ = self.shared_expert(hidden_states).squeeze(0)
            # y_ = F.sigmoid(self.shared_expert_gate(hidden_states)) * y_

            y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)

            # y += y_
            y.resize_(*orig_shape)
            return y

        # y_ = self.shared_expert(hidden_states).squeeze(0)
        # y_ = (
        #     F.sigmoid(self.shared_expert_gate(hidden_states)) * y_
        # )

        if isinstance(self.experts, KExpertsBase):
            y = self.moe_kexperts(hidden_states, selected_experts, routing_weights).view(*orig_shape).to(
                device=hidden_states.device)
        elif hidden_states.size(0) > 10:
            # TODO may bugs here
            y = (
                self.moe_infer(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
        else:
            # TODO may bugs here
            y = (
                self.moe_infer_simple(hidden_states, selected_experts, routing_weights)
                .view(*orig_shape)
                .to(device=hidden_states.device)
            )
            # y += y_
        return y

    @maybe_no_grad()
    def moe_kexperts(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
        outs = self.experts(x, topk_ids, topk_weight)
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer_simple(
            self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor
    ) -> torch.Tensor:
        """
        x: [num_tokens, hidden_size]
        topk_ids, topk_weight: [num_tokens, num_selected_experts]
        """
        outs = torch.zeros_like(x)
        for token_idx in range(topk_ids.size(0)):
            for expert_idx in range(topk_ids.size(1)):
                expert = self.experts[topk_ids[token_idx, expert_idx]]
                outs[token_idx] += (
                        expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
                )
        return outs

    @maybe_no_grad()
    # TODO may bugs here
    def moe_infer(self, x, topk_ids, topk_weight):
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert.forward(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out


================================================
FILE: kt-sft/ktransformers/operators/flashinfer_batch_prefill_wrapper.py
================================================
import torch
import flashinfer
import gc
try:
    from flash_attn import flash_attn_with_kvcache
    print("found flash_attn")
    
except ImportError:
    print("flash_attn not found, flashinfer unit test needed it. If you are using balance serve, ignore this.")

from typing import Union, Optional

def setup_seed(seed):
	torch.manual_seed(seed)
	torch.cuda.manual_seed_all(seed)

setup_seed(998244353)

torch.set_grad_enabled(False)
torch.set_default_dtype(torch.bfloat16)
global_dtype=torch.bfloat16
global_device=torch.device("cuda",0)
torch.cuda.set_device(0)
torch.backends.cudnn.enabled =True
torch.backends.cudnn.benchmark = True

class flashInferAttn():
	
	float_workspace_buffer = None
	def __init__(self,
			max_batch_token,
			max_batch_size,
			max_pages,
			device = "cuda:0",
			kv_layout: str = "NHD",
			use_cuda_graph: bool = False,
			) -> None:
		self.device = device
		self.max_batch_token = max_batch_token
		self.kv_layout = kv_layout
		self.use_cuda_graph = use_cuda_graph
		if flashInferAttn.float_workspace_buffer is None:
			flashInferAttn.float_workspace_buffer = torch.empty(max_batch_token * 1024 * 1024, dtype=torch.uint8, device=device)
		self.qo_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
		self.paged_kv_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
		self.paged_kv_indices_buf = torch.empty((max_pages,), dtype=torch.int32, device=device)
		self.paged_kv_last_page_len_buf = torch.empty((max_batch_size,), dtype=torch.int32, device=device)
		self.batch_size_tensor_buf = torch.empty((1,), dtype=torch.int32, device=device)
		self.num_tokens_tensor_buf = torch.empty((1,), dtype=torch.uint32, device=device)
	
		# TODO: custom mask
		self.custom_mask_buf = None
		self.qk_indptr_buf = None
		self.warpper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
			flashInferAttn.float_workspace_buffer,
			self.kv_layout,
			use_cuda_graph=self.use_cuda_graph,
			qo_indptr_buf=self.qo_indptr_buf,
			paged_kv_indptr_buf=self.paged_kv_indptr_buf,
			paged_kv_indices_buf=self.paged_kv_indices_buf,
			paged_kv_last_page_len_buf=self.paged_kv_last_page_len_buf,
			backend = "fa2",
		)

	def plan(self,
		qo_indptr: torch.Tensor,
		paged_kv_indptr: torch.Tensor,
		paged_kv_indices: torch.Tensor,
		paged_kv_last_page_len: torch.Tensor,
		batch_size_tensor: torch.Tensor,
		num_tokens_tensor: torch.Tensor,
		num_qo_heads: int,
		num_kv_heads: int,
		head_dim: int,
		page_size: int,
		causal: bool = True, 
		pos_encoding_mode: str = "NONE",
		q_data_type: Union[str, torch.dtype] = torch.bfloat16,
		kv_data_type: Optional[Union[str, torch.dtype]] = None):
		
		self.batch_size_tensor_buf.copy_(batch_size_tensor, non_blocking=True)
		self.num_tokens_tensor_buf.copy_(num_tokens_tensor, non_blocking=True)
		self.page_size = page_size
		self.warpper.plan(
			qo_indptr,
			paged_kv_indptr,
			paged_kv_indices,
			paged_kv_last_page_len,
			num_qo_heads,
			num_kv_heads,
			head_dim,
			page_size,
			causal = causal,
			pos_encoding_mode = pos_encoding_mode,
			q_data_type = q_data_type,
			kv_data_type = kv_data_type
			)

	def calc_batch_indices(self, ragged_size = None):
		if self.use_cuda_graph:
			self.batch_indices, self.positions = flashinfer.get_batch_indices_positions(
				self.qo_indptr_buf, flashinfer.get_seq_lens(self.paged_kv_indptr_buf, self.paged_kv_last_page_len_buf, self.page_size), self.batch_size_tensor_buf, self.max_batch_token)
		else:
			self.batch_indices, self.positions = flashinfer.get_batch_indices_positions(
				self.warpper._qo_indptr_buf, flashinfer.get_seq_lens(self.warpper._paged_kv_indptr_buf, self.warpper._paged_kv_last_page_len_buf, self.page_size), self.batch_size_tensor_buf, ragged_size)

	def forward(self, q, k_cache, v_cache, k, v):
		if self.use_cuda_graph:
			flashinfer.page.append_paged_kv_cache(k, v, self.batch_indices, self.positions, (k_cache, v_cache), self.paged_kv_indices_buf, self.paged_kv_indptr_buf, self.paged_kv_last_page_len_buf, self.num_tokens_tensor_buf)
			return self.warpper.run(q, (k_cache, v_cache))
		else:
			flashinfer.page.append_paged_kv_cache(k, v, self.batch_indices, self.positions, (k_cache, v_cache), self.warpper._paged_kv_indices_buf, self.warpper._paged_kv_indptr_buf, self.warpper._paged_kv_last_page_len_buf, self.num_tokens_tensor_buf)
			return self.warpper.run(q, (k_cache, v_cache))


def testCudaGraph():
	
	# use max batch to create buffer
	batch_decode = 8
	prefill_chunk = 48
	past_kv_0 = 4090
	past_kv_1 = 4096
	raged_size = prefill_chunk + batch_decode
	num_key_value_heads = 8
	head_dim = 128
	num_attention_heads = 64
	page_size = 256
	num_pages_per_seq = (past_kv_1 + page_size - 1) // page_size
	total_num_pages = (num_pages_per_seq + 1) * (batch_decode + 1) + prefill_chunk // page_size
	attn = flashInferAttn(raged_size, batch_decode+1, total_num_pages, use_cuda_graph=True)

	batch_size_tensor = torch.tensor([batch_decode + 1], device=global_device, dtype=torch.int32)
	
	k_caches = []	
	v_caches = []
	ks = []
	vs = []
	qs = []
	for layer_idx in range(3):
		k_caches.append(torch.randn(total_num_pages, page_size, num_key_value_heads, head_dim, device=global_device, dtype=torch.bfloat16))
		v_caches.append(torch.randn(total_num_pages, page_size, num_key_value_heads, head_dim, device=global_device, dtype=torch.bfloat16))
		ks.append(torch.randn(raged_size, num_key_value_heads, head_dim, device=global_device, dtype=torch.bfloat16))
		vs.append(torch.randn(raged_size, num_key_value_heads, head_dim, device=global_device, dtype=torch.bfloat16))
		qs.append(torch.randn(raged_size, num_attention_heads, head_dim, device=global_device, dtype=torch.bfloat16))
	
	# warmup and capture small batch
	past_kv_0 = 250
	past_kv_1 = 256
	num_pages_per_seq = (past_kv_1 + page_size - 1) // page_size
	total_num_pages = (num_pages_per_seq + 1) * (batch_decode + 1) + prefill_chunk // page_size
	q_indptr = torch.empty((batch_decode + 2,), dtype=torch.int32, device=global_device)
	q_indptr[0] = 0
	q_indptr[1:] = torch.arange(prefill_chunk, prefill_chunk + batch_decode + 1, device=global_device, dtype=torch.int32)
	kv_indptr = torch.arange(0, batch_decode + 2, device=global_device, dtype=torch.int32) * num_pages_per_seq
	kv_indices = torch.arange(0, total_num_pages, device=global_device, dtype=torch.int32)
	kv_last_page_len = torch.empty((batch_decode + 1,), dtype=torch.int32, device=global_device)
	kv_last_page_len[:1+batch_decode//2] = int((past_kv_0 - 1) % page_size + 1)
	kv_last_page_len[1+batch_decode//2:] = int((past_kv_1 - 1) % page_size + 1)

	print(q_indptr)
	print(kv_indptr)
	print(kv_indices)
	print(kv_last_page_len)
	attn.plan(q_indptr,
			kv_indptr,
			kv_indices,
			kv_last_page_len,
			batch_size_tensor,
			num_attention_heads,
			num_key_value_heads,
			head_dim,
			page_size,
			causal = True,
			pos_encoding_mode="NONE",
			q_data_type=torch.bfloat16)

	attn.calc_batch_indices(raged_size)
	for layer_idx in range(3):
		attn.forward(qs[layer_idx], k_caches[layer_idx], v_caches[layer_idx], ks[layer_idx], vs[layer_idx])
		torch.cuda.synchronize()

	outs = []
	g = torch.cuda.CUDAGraph()
	with torch.cuda.graph(g):
		for layer_idx in range(3):
			outs.append(attn.forward(qs[layer_idx], k_caches[layer_idx], v_caches[layer_idx], ks[layer_idx], vs[layer_idx]))
	g.replay()
	
	kv_last_page_len[:1+batch_decode//2] = int(past_kv_0)
	kv_last_page_len[1+batch_decode//2:] = int(past_kv_1)
	for layer_idx in range(3):
		for i in range(batch_decode + 1):
			
			qi = qs[layer_idx][q_indptr[i] : q_indptr[i + 1]]
			o_ref_i = flash_attn_with_kvcache(
				qi.unsqueeze(0),
				k_caches[layer_idx],
				v_caches[layer_idx],
				causal=True,
				block_table=kv_indices[kv_indptr[i]:kv_indptr[i+1]].unsqueeze(0),
				cache_seqlens=torch.tensor([past_kv_0 if i < 1+batch_decode//2 else past_kv_1], device=global_device, dtype=torch.int32)
			)
			o_i = outs[layer_idx][q_indptr[i] : q_indptr[i + 1]]
			print(layer_idx, i)
			torch.testing.assert_close(o_i.unsqueeze(0), o_ref_i, rtol=5e-3, atol=5e-3)

	# run another batch size use capture cuda graph
	past_kv_0 = 4090
	past_kv_1 = 4096
	prefill_chunk = 24
	batch_decode = 4
	num_pages_per_seq = (past_kv_1 + page_size - 1) // page_size
	total_num_pages = (num_pages_per_seq + 1) * (batch_decode + 1) + prefill_chunk // page_size
	batch_size_tensor = torch.tensor([batch_decode + 1], device=global_device, dtype=torch.int32)
	num_tokens_tensor = torch.tensor([batch_decode + prefill_chunk], device=global_device, dtype=torch.int32)

	q_indptr = torch.empty((batch_decode + 2,), dtype=torch.int32, device=global_device)
	q_indptr[0] = 0
	q_indptr[1:] = torch.arange(prefill_chunk, prefill_chunk + batch_decode + 1, device=global_device, dtype=torch.int32)
	kv_indptr = torch.arange(0, batch_decode + 2, device=global_device, dtype=torch.int32) * num_pages_per_seq
	kv_indices = torch.arange(0, total_num_pages, device=global_device, dtype=torch.int32)
	kv_last_page_len = torch.empty((batch_decode + 1,), dtype=torch.int32, device=global_device)
	kv_last_page_len[:1+batch_decode//2] = int((past_kv_0 - 1) % page_size + 1)
	kv_last_page_len[1+batch_decode//2:] = int((past_kv_1 - 1) % page_size + 1)
	attn.plan(q_indptr,
			kv_indptr,
			kv_indices,
			kv_last_page_len,
			batch_size_tensor,
			num_attention_heads,
			num_key_value_heads,
			head_dim,
			page_size,
			causal = True,
			pos_encoding_mode="NONE",
			q_data_type=torch.bfloat16)
	attn.calc_batch_indices(raged_size)
	g.replay()
	
	kv_last_page_len[:1+batch_decode//2] = int(past_kv_0)
	kv_last_page_len[1+batch_decode//2:] = int(past_kv_1)
	for layer_idx in range(3):
		for i in range(batch_decode + 1):
			
			qi = qs[layer_idx][q_indptr[i] : q_indptr[i + 1]]
			o_ref_i = flash_attn_with_kvcache(
				qi.unsqueeze(0),
				k_caches[layer_idx],
				v_caches[layer_idx],
				causal=True,
				block_table=kv_indices[kv_indptr[i]:kv_indptr[i+1]].unsqueeze(0),
				cache_seqlens=torch.tensor([past_kv_0 if i < 1+batch_decode//2 else past_kv_1], device=global_device, dtype=torch.int32)
			)
			o_i = outs[layer_idx][q_indptr[i] : q_indptr[i + 1]]
			print(layer_idx, i)
			torch.testing.assert_close(o_i.unsqueeze(0), o_ref_i, rtol=5e-3, atol=5e-3)
			

def testAttentionFlashInfer(	
	):
	batch_decode = 32
	prefill_chunk = 64
	past_kv_0 = 510
	past_kv_1 = 512
	raged_size = prefill_chunk + batch_decode
	num_key_value_heads = 8
	head_dim = 128
	num_attention_heads = 64
	cases = 1
	page_size = 32
	num_pages_per_seq = (past_kv_1 + page_size - 1) // page_size
	total_num_pages = (num_pages_per_seq + 1) * (batch_decode + 1) + prefill_chunk // page_size
	workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
	qs = []
	kvs = []
	q_indptrs = []
	kv_indptrs = []
	kv_indicess = []
	kv_last_page_lens = []
	wrappers = []
	for case_id in range(cases):
		kvs.append(torch.randn(total_num_pages, 2, page_size, num_key_value_heads, head_dim, device=global_device, dtype=torch.bfloat16))
		qs.append(torch.randn(raged_size, num_attention_heads, head_dim, device=global_device, dtype=torch.bfloat16))
		q_indptr = torch.empty((batch_decode + 2,), dtype=torch.int32, device=global_device)
		q_indptr[0] = 0
		q_indptr[1:] = torch.arange(prefill_chunk, prefill_chunk + batch_decode + 1, device=global_device, dtype=torch.int32)
		q_indptrs.append(q_indptr)
		kv_indptrs.append(torch.arange(0, batch_decode + 2, device=global_device, dtype=torch.int32) * num_pages_per_seq)
		kv_indicess.append(torch.arange(0, total_num_pages, device=global_device, dtype=torch.int32))
		kv_last_page_len = torch.empty((batch_decode + 1,), dtype=torch.int32, device=global_device)
		kv_last_page_len[:1+batch_decode//2] = int((past_kv_0 - 1) % page_size + 1)
		kv_last_page_len[1+batch_decode//2:] = int((past_kv_1 - 1) % page_size + 1)
		kv_last_page_lens.append(kv_last_page_len)
		wrappers.append(flashinfer.BatchPrefillWithPagedKVCacheWrapper(
			workspace_buffer,
			"NHD",
			use_cuda_graph=True,
			qo_indptr_buf=q_indptrs[case_id],
			paged_kv_indptr_buf=kv_indptrs[case_id],
			paged_kv_indices_buf=kv_indicess[case_id],
			paged_kv_last_page_len_buf=kv_last_page_lens[case_id],
		))
		wrappers[case_id].plan(
			q_indptrs[case_id],
			kv_indptrs[case_id],
			kv_indicess[case_id],
			kv_last_page_lens[case_id],
			num_attention_heads,
			num_key_value_heads,
			head_dim,
			page_size,
			causal = True,
			pos_encoding_mode="ROPE_LLAMA",
			q_data_type=torch.bfloat16
		)
					
	def custom_forward(case_id):
		out = wrappers[case_id].run(qs[case_id], kvs[case_id])
	
	custom_forward(0)

# testCudaGraph()
# pass

================================================
FILE: kt-sft/ktransformers/operators/flashinfer_wrapper.py
================================================
'''
Description  : flashinfer MLA wrapper
Author       : Boxin Zhang
Version      : 0.2.3
'''
import torch
import os
from ktransformers.operators.triton_attention import decode_attention_fwd_grouped

flashinfer_enabled = False

try:
    import flashinfer
    flashinfer_enabled = True
    print("found flashinfer")
    
except ImportError:
    print("flashinfer not found, use triton for linux")

import math

def attention_ref_torch(
    batch_size,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    causal: bool,
    sm_scale: float,
) -> torch.Tensor:
    qo_len = q.shape[0] // batch_size
    kv_len = k.shape[0] // batch_size
    num_qo_heads = q.shape[1]
    head_dim_qk = q.shape[2]
    head_dim_vo = v.shape[2]
    logits = (
        torch.einsum(
            "bmhd,bnhd->bhmn",
            q.view(batch_size, qo_len, num_qo_heads, head_dim_qk).float(),
            k.view(batch_size, kv_len, num_qo_heads, head_dim_qk).float(),
        )
        * sm_scale
    )

    #print("attn weights", logits)

    if causal:
        mask = (
            torch.arange(kv_len - qo_len, kv_len).unsqueeze(1)
            >= torch.arange(0, kv_len).unsqueeze(0)
        ).to(q.device)
    else:
        mask = torch.ones(qo_len, kv_len).to(q.device)

    logits = logits.masked_fill(mask.unsqueeze(0).unsqueeze(0) == 0, float("-inf"))
    lse_ref = torch.logsumexp(logits, -1).transpose(-1, -2)
    p = torch.softmax(logits, dim=-1)
    o_ref = (
        torch.einsum(
            "bhmn,bnhd->bmhd",
            p,
            v.view(batch_size, kv_len, num_qo_heads, head_dim_vo).float(),
        )
        .contiguous()
        .view(batch_size * qo_len, num_qo_heads, head_dim_vo)
        .to(q)
    )

    return o_ref, lse_ref * math.log2(math.e)

class MLAWrapper():
    def __init__(self,
                 max_batch_size,
                 max_pages,
                 use_cuda_graph = True,
                 device = "cuda",
                 ):
        self.float_workspace_buffer = torch.empty(128*1024*1024, dtype=torch.int8, device=device)
        self.max_batch_size = max_batch_size
        self.max_pages = max_pages
        if use_cuda_graph:
            if self.max_batch_size == 1:
                self.qo_indptr_buf = torch.arange(0, max_batch_size+1, dtype=torch.int32, device=device)
                self.kv_indptr_buf = torch.tensor([0, max_pages], dtype=torch.int32, device=device)
                self.kv_indices_buf = torch.arange(0, max_pages, dtype=torch.int32, device=device)
            else:
                self.qo_indptr_buf = torch.empty(max_batch_size+1, dtype=torch.int32, device=device)
                self.kv_indptr_buf = torch.empty(max_batch_size+1, dtype=torch.int32, device=device)
                self.kv_indices_buf = torch.empty(max_pages, dtype=torch.int32, device=device)
            self.batch_size_tensor_buf = torch.tensor([self.max_batch_size], dtype=torch.int32, device=device)
            self.kv_len_arr_buf = torch.empty(max_batch_size, dtype=torch.int32, device=device)
        else:
            self.qo_indptr_buf = None
            self.kv_indptr_buf = None
            self.kv_indices_buf = None
            self.kv_len_arr_buf = None
        self.wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(
            self.float_workspace_buffer,
            use_cuda_graph=use_cuda_graph,
            qo_indptr=self.qo_indptr_buf,
            kv_indptr=self.kv_indptr_buf,
            kv_indices=self.kv_indices_buf,
            kv_len_arr=self.kv_len_arr_buf,
            bsz_tensor=self.batch_size_tensor_buf,
            backend = "fa2",
        )
        self.need_plan = True

    
    def plan(self,
             qo_indptr,
             kv_indptr,
             kv_indices,
             kv_len_arr,
             bsz_tensor,
             num_heads,
             head_dim_ckv,
             head_dim_kpe,
             page_size,
             sm_scale,
             q_data_type,
             kv_data_type,
             ):
        if qo_indptr is None:
            assert self.max_batch_size == 1
            qo_indptr = self.qo_indptr_buf
        if kv_indptr is None:
            assert self.max_batch_size == 1
            kv_indptr = self.kv_indptr_buf
        if kv_indices is None:
            assert self.max_batch_size == 1
            kv_indices = self.kv_indices_buf
        if bsz_tensor is None:
            assert self.max_batch_size == 1
            bsz_tensor = self.batch_size_tensor_buf
        
        self.wrapper.plan(
            qo_indptr,
            kv_indptr,
            kv_indices,
            kv_len_arr,
            num_heads,
            head_dim_ckv,
            head_dim_kpe,
            page_size,
            True, # causal
            sm_scale,
            q_data_type,
            kv_data_type,
            bsz_tensor
        )

    def run(self, q_nope, q_pe, ckv, k_pe, return_lse = False):
        return self.wrapper.run(q_nope, q_pe, ckv, k_pe, return_lse = return_lse)

class MLAWrapperSingleton():
    wrappers:dict = {}

    @classmethod
    def get_instance(cls, device, *args, **kwargs)->MLAWrapper:
        if device not in cls.wrappers:
            cls.make_instance(device, *args, **kwargs)
        return cls.wrappers[device]
    
    @classmethod
    def make_instance(cls, device, *args, **kwargs):
        cls.wrappers[device] = MLAWrapper(*args, **kwargs, device=device)

    @classmethod
    def plan_all(cls, qo_indptr,
             kv_indptr,
             kv_indices,
             kv_len_arr,
             bsz_tensor,
             num_heads,
             head_dim_ckv,
             head_dim_kpe,
             page_size,
             sm_scale,
             q_data_type,
             kv_data_type,):
        for device, wrapper in cls.wrappers.items():
            kv_len_arr_cur_device = kv_len_arr.to(device)
            wrapper.plan(qo_indptr,
                kv_indptr,
                kv_indices,
                kv_len_arr_cur_device,
                bsz_tensor,
                num_heads,
                head_dim_ckv,
                head_dim_kpe,
                page_size,
                sm_scale,
                q_data_type,
                kv_data_type,)
            wrapper.need_plan = False
            
    @classmethod
    def need_plan_all(cls):
        for device, wrapper in cls.wrappers.items():
            wrapper.need_plan = True
        
    @classmethod
    def reset_buffer(cls):
        for device, wrapper in cls.wrappers.items():
            wrapper.qo_indptr_buf[1] = 1 # assert max_batch_size=1 here.
            
    @classmethod
    def update_buffer(cls, max_pages):
        for device, wrapper in cls.wrappers.items():
            wrapper.kv_indptr_buf[1] = max_pages # assert max_batch_size=1 here.
            wrapper.kv_indices_buf = torch.arange(0, max_pages, dtype=torch.int32, device=device)
            wrapper.wrapper._kv_indices_buf = wrapper.kv_indices_buf

def checksame():
    flashinfer_folder = "./flashinfer_output"
    flashinfer_folder = "./kv_cache_flashinfer"
    triton_folder = "./triton_output"
    triton_folder = "./kv_cache_triton"
    
    max_layer_id = 1
    max_forward_id = 2

    for forward_id in range(0, 19):
        print("forward_id", forward_id)
        for layer_id in range(max_layer_id):
            print(layer_id)
            #file_name = f"layer_{layer_id}_forward_{forward_id}_attn_output.pt"
            #file_name = f"layer_{layer_id}_forward_{forward_id}_q_pe.pt"
            file_name = f"layer_{layer_id}.pt"
            
            flashinfer_path = os.path.join(flashinfer_folder, file_name)
            triton_path = os.path.join(triton_folder, file_name)
            
            if not os.path.exists(triton_path):
                print(f"{file_name} not exist in {triton_folder}")
                continue
            if not os.path.exists(flashinfer_path):
                print(f"{file_name} not exist in {flashinfer_folder}")
                continue
            
            
            flashinfer_tensor = torch.load(flashinfer_path)[1:2, :62]#
            triton_tensor = torch.load(triton_path)[1:2, :62]#.squeeze(1)#
            try:
                torch.testing.assert_close(flashinfer_tensor, triton_tensor, rtol=1e-9, atol=1e-9)
            except AssertionError as e:
                print(e)

if __name__ == "__main__":
    
    #checksame()
    #exit(0)

    max_batch_size = 2
    max_batch_tokens = 256
    max_pages = 128
    page_size = 64
    num_heads = 128
    
    # warm-up
    kv_len = 4023
    q_len = 1
    q_nope_buf = torch.randn((max_batch_tokens, num_heads, 512), dtype=torch.bfloat16, device="cuda")
    q_pe_buf = torch.randn((max_batch_tokens, num_heads, 64), dtype=torch.bfloat16, device="cuda")
    kv_buf = torch.randn((max_pages, page_size, 576), dtype=torch.bfloat16, device="cuda")
    ckv, k_pe = torch.split(kv_buf, [512, 64], dim=-1)
    

    wrapper = MLAWrapperSingleton.get_instance(
        "cuda",
        max_batch_size,
        max_pages,
    )
    
    used_pages = (kv_len + page_size - 1)// page_size
    kv_len_arr = torch.tensor([kv_len], dtype=torch.int32, device="cuda")
    qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device="cuda")
    kv_indptr = torch.tensor([0, used_pages], dtype=torch.int32, device="cuda")
    kv_indices = torch.empty(max_pages, dtype=torch.int32, device="cuda")
    kv_indices[:used_pages] = torch.arange(0, used_pages, dtype=torch.int32, device="cuda")
    bsz_tensor = torch.tensor([1], dtype=torch.int32, device="cuda")
    wrapper.plan(
        qo_indptr,
        kv_indptr,
        kv_indices,
        kv_len_arr,
        bsz_tensor,
        128,
        512,
        64,
        page_size,
        192 ** (-0.5),
        torch.bfloat16,
        torch.bfloat16,
    )

    attn_output = wrapper.run(q_nope_buf[:q_len], q_pe_buf[:q_len], ckv, k_pe)
    print(attn_output.shape)
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        attn_output = wrapper.run(q_nope_buf, q_pe_buf, ckv, k_pe)
    graph.replay()

    q = torch.cat([q_nope_buf, q_pe_buf], dim=-1)
    k = (
        torch.cat([ckv, k_pe], dim=-1)
        .view(-1, 1, 512 + 64)
        .repeat_interleave(num_heads, dim=1)
    )
    v = ckv.view(-1, 1, 512).repeat_interleave(num_heads, dim=1)
    attn_ref, lse_ref = attention_ref_torch(
        1,
        q[:q_len],
        k[:kv_len],
        v[:kv_len],
        True,
        192 ** (-0.5)
    )
    torch.testing.assert_close(attn_output[:q_len], attn_ref, rtol=5e-3, atol=5e-3)
    # warm-up finished

    kv_len = 512
    q_len = 128
    pages = max_pages
    used_pages = (kv_len + page_size - 1)// page_size
    q_nope = torch.randn((q_len*2, num_heads, 512), dtype=torch.bfloat16, device="cuda")
    q_nope[q_len:] = q_nope[:q_len]
    q_pe = torch.randn((q_len*2, num_heads, 64), dtype=torch.bfloat16, device="cuda")
    q_pe[q_len:] = q_pe[:q_len]
    kv_cache = torch.randn((max_pages, page_size, 576), dtype=torch.bfloat16, device="cuda")
    kv_cache[used_pages:2*used_pages] = kv_cache[:used_pages]
    ckv, k_pe = torch.split(kv_cache, [512, 64], dim=-1)
    
    kv_len_arr = torch.tensor([kv_len, kv_len], dtype=torch.int32, device="cuda")
    qo_indptr = torch.tensor([0, q_len, q_len*2], dtype=torch.int32, device="cuda")
    kv_indptr = torch.tensor([0, used_pages, used_pages*2], dtype=torch.int32, device="cuda")
    kv_indices = torch.empty(max_pages, dtype=torch.int32, device="cuda")
    kv_indices[:2*used_pages] = torch.arange(0, 2*used_pages, dtype=torch.int32, device="cuda")
    bsz_tensor = torch.tensor([2], dtype=torch.int32, device="cuda")
    wrapper.plan(
        qo_indptr,
        kv_indptr,
        kv_indices,
        kv_len_arr,
        bsz_tensor,
        128,
        512,
        64,
        page_size,
        192 ** (-0.5),
        torch.bfloat16,
        torch.bfloat16,
    )
    
    q_nope_buf.copy_(q_nope)
    q_pe_buf.copy_(q_pe)
    kv_buf[:pages].copy_(kv_cache)

    torch.cuda.synchronize()
    graph.replay()
    torch.cuda.synchronize()

    # ref_torch
    q = torch.cat([q_nope, q_pe], dim=-1)
    k = (
        torch.cat([ckv, k_pe], dim=-1)
        .view(-1, 1, 512 + 64)
        .repeat_interleave(num_heads, dim=1)
    )
    v = ckv.view(-1, 1, 512).repeat_interleave(num_heads, dim=1)
    attn_ref, lse_ref = attention_ref_torch(
        max_batch_size,
        q,
        k[:2*kv_len],
        v[:2*kv_len],
        True,
        192 ** (-0.5)
    )
    
    torch.testing.assert_close(attn_ref[:q_len], attn_ref[q_len:q_len*2], rtol=1e-9, atol=1e-9)
    torch.testing.assert_close(attn_output[:q_len], attn_output[q_len:q_len*2], rtol=1e-9, atol=1e-9)
    torch.testing.assert_close(attn_output[:q_len], attn_ref[:q_len], rtol=5e-3, atol=5e-3)
    torch.testing.assert_close(attn_output[q_len:q_len*2], attn_ref[q_len:q_len*2], rtol=5e-3, atol=5e-3)
    #torch.testing.assert_close(attn_output[:q_len], attn_output[q_len:q_len*2], rtol=1e-9, atol=1e-9)
    #torch.testing.assert_close(attn_output, attn_ref, rtol=5e-3, atol=5e-3)

    exit(0)

    for forward_id in range(0, 1):
        print("forward_id", forward_id)
        for layer_id in range(1):
            print(layer_id)
            flashinfer_folder = "./kv_cache_flashinfer"
            forward_id = 17
            layer_id = 0
            file_name = f"layer_{layer_id}.pt"
            kv_cache_path = os.path.join(flashinfer_folder, file_name)
            flashinfer_folder = "./flashinfer_output"

            q_len = 1
            kv_len = 126
            file_name = f"layer_{layer_id}_forward_{forward_id}_q_nope.pt"
            q_nope = torch.load(os.path.join(flashinfer_folder, file_name)).view(q_len,128,512).to(device="cuda")
            file_name = f"layer_{layer_id}_forward_{forward_id}_q_pe.pt"
            q_pe = torch.load(os.path.join(flashinfer_folder, file_name)).view(q_len,128,64).to(device="cuda")
            q = torch.cat([q_nope, q_pe], dim=-1)
            kv_cache = torch.load(kv_cache_path).to(device="cuda")
            pages, page_size, _, head_dim = kv_cache.shape
            kv_cache = kv_cache.view(pages, page_size, head_dim)
            ckv, k_pe = torch.split(kv_cache, [512, 64], dim=-1)
    
            kv_len_arr = torch.tensor([kv_len], dtype=torch.int32, device="cuda")
            qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device="cuda")
            wrapper.plan(
                None,
                None,
                None,
                kv_len_arr,
                128,
                512,
                64,
                page_size,
                192 ** (-0.5),
                torch.bfloat16,
                torch.bfloat16,
            )
    
            q_nope_buf.copy_(q_nope)
            q_pe_buf.copy_(q_pe)
            kv_buf[:pages].copy_(kv_cache)

            torch.cuda.synchronize()
            graph.replay()
            torch.cuda.synchronize()

            # ref_torch
            k = (
                torch.cat([ckv, k_pe], dim=-1)
                .view(-1, 1, 512 + 64)
                .repeat_interleave(num_heads, dim=1)
            )
            v = ckv.view(-1, 1, 512).repeat_interleave(num_heads, dim=1)
            attn_ref, lse_ref = attention_ref_torch(
                max_batch_size,
                q,
                k[:kv_len],
                v[:kv_len],
                False,
                192 ** (-0.5)
            )
            torch.testing.assert_close(attn_output, attn_ref, rtol=1e-3, atol=1e-3)
    
            # ref_triton
            attn_logits = torch.empty(
                    (
                        max_batch_size,
                        num_heads,
                        4, #num_kv_splits # follow vLLM, fix it TODO
                        512 + 1, 
                    ),
                    dtype=torch.float32,
                    device = "cuda"
                )
            
            triton_ref = torch.zeros_like(q_nope)
            page_table = torch.arange(max_pages, dtype=torch.int32, device="cuda")
            ckv_with_pe = torch.cat([ckv, k_pe], dim=-1).contiguous().view(pages, page_size, 1, 576)
            ckv = ckv.view(pages, page_size, 1, 512)
            decode_attention_fwd_grouped(q, ckv_with_pe, ckv, triton_ref,
                page_table,
                kv_len_arr, attn_logits,
                4, #num_kv_splits # follow vLLM, fix it TODO
                192 ** (-0.5),
                page_size)

            torch.testing.assert_close(attn_output, triton_ref, rtol=1e-3, atol=1e-3)
            
            #file_name = f"./flashinfer_output/layer_{layer_id}_forward_{forward_id}_attn_output.pt"
            #ktrans_output = torch.load(file_name)
            #torch.testing.assert_close(attn_output, ktrans_output.squeeze(1), rtol=1e-3, atol=1e-3)
            print("test past")

================================================
FILE: kt-sft/ktransformers/operators/gate.py
================================================
from typing import Optional
from torch import nn
import torch
import torch.nn.functional as F
import os
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.operators.linear import KTransformersLinear
from ktransformers.util.custom_loader import GGUFLoader, ModelLoader, SafeTensorLoader
from transformers.configuration_utils import PretrainedConfig
from abc import ABC, abstractmethod


# class Base(BaseInjectedModule, ABC):
class KMoEGateBase(ABC):
    def __init__(self, 
                 key: str, 
                 gguf_loader: GGUFLoader, 
                 config: PretrainedConfig, 
                 orig_module: nn.Module, 
                 device: str = "cuda", 
                 **kwargs):
        # super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        super().__init__()
        self.key = key
        self.gguf_loader = gguf_loader
        self.config = config
        self.device = device
        self.orig_module = orig_module
    
    @abstractmethod
    def forward(self, input_tensor, expert_ids, weights):
        pass

    @abstractmethod
    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu", warmup: bool = False):
        pass
    
    @abstractmethod
    def unload():
        pass

    def load_weights(self, override_key: str | None = None, device: str = "cpu"):
        res = {}
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        gate = None
        up = None
        down = None
        gate_type = None
        up_type = None
        down_type = None

        for key in keys:
            # key = ".".join(key.split(".")[:-1])
            if isinstance(self.gguf_loader, SafeTensorLoader):
                res = self.gguf_loader.load_gate(key, device=device)
            elif self.gguf_loader.has_tensor(key+".weight"):
                # targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
                targets = [".weight", ".e_score_correction_bias"]
                tensors = self.load_multi(key, targets, device=device)
                weight = tensors[".weight"]
                e_score_correction_bias = tensors[".e_score_correction_bias"]
                # weight_type = self.gguf_loader.tensor_info[key + ".weight"]["ggml_type"]
                res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias}
            else:
                raise ValueError(f"Experts {key} not found in gguf_loader")

        return res
    
    def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
        tensors = {}
        for k in keys:
            tensors[k] = self.gguf_loader.load_gguf_tensor(key + k, device=device)
        return tensors


class KMoEGate(BaseInjectedModule, KMoEGateBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        KMoEGateBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def forward(self, hidden_states) -> torch.Tensor:
        return self.orig_module.forward(hidden_states)

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if device is None: device = self.device
        if w is None: w = self.load_weights(device=device)
        
        if isinstance(w, dict):
            self.orig_module.weight = nn.Parameter(w["weight"])
            self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
        else:
            raise ValueError("Invalid weight type")
        self.orig_module.weight = nn.Parameter(self.orig_module.weight.to(device))
        self.orig_module.e_score_correction_bias = nn.Parameter(self.orig_module.e_score_correction_bias.to(device))

    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.e_score_correction_bias is not None:
            self.e_score_correction_bias = None


class KMoEGateQwen2Moe(BaseInjectedModule, KMoEGateBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        generate_device: str = "cuda",
        generate_op: str| None = "KLinearMarlin",
        prefill_device: str = "cuda",
        prefill_op: str| None = "KLinearMarlin",
        use_quant: bool = False,
        **kwargs,
    ):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        KMoEGateBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        self.generate_device = generate_device
        self.prefill_device = prefill_device
        self.generate_op = generate_op
        self.prefill_op = prefill_op
        self.is_windows = os.name == 'nt'
        self.use_quant = use_quant
        if not self.is_windows and use_quant:
            self.gate_linear = nn.Linear(self.gating_dim, self.n_routed_experts, device=generate_device)
            self.gate_linear = KTransformersLinear(key + ".ffn_gate_inp", 
                                               gguf_loader, config, self.gate_linear, #orig_module
                                               generate_device, generate_op, prefill_device, prefill_op)
        else:
            self.gate_linear = None

    def forward(self, hidden_states) -> torch.Tensor:
        if self.is_windows:
            return self.orig_module.forward(hidden_states)
        
        bsz, seq_len, h = hidden_states.shape
        ### compute gating score
        hidden_states = hidden_states.view(-1, h)
        if self.use_quant:
            logits = self.gate_linear.forward(logits)
        else:
            logits = F.linear(
                hidden_states.type(torch.float32), self.weight.type(torch.float32), None
            )
            
        return grouped_topk(hidden_states, logits,
                            self.top_k, self.norm_topk_prob,
                            self.n_group, self.topk_group)

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if device is None: device = self.device
        if w is None: w = self.load_weights(device=device)
        
        if isinstance(w, dict):
            self.orig_module.weight = nn.Parameter(w["weight"])
            self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
        else:
            raise ValueError("Invalid weight type")
        self.orig_module.weight = nn.Parameter(self.orig_module.weight.to(device))
        self.orig_module.e_score_correction_bias = nn.Parameter(self.orig_module.e_score_correction_bias.to(device))
        if not self.is_windows and self.use_quant:
            self.gate_linear.load(self.orig_module.weight)

    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.e_score_correction_bias is not None:
            self.e_score_correction_bias = None


class KMoEGateIPEXLLM(KMoEGate):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        generate_device: str = "xpu",
        prefill_device: str = "xpu",
        **kwargs,
    ):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        KMoEGate.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        self.generate_device = generate_device
        self.prefill_device = prefill_device

    def forward(self, hidden_states) -> torch.Tensor:
        x = hidden_states.view(-1, hidden_states.size(-1))
        logits = torch.nn.functional.linear(
            x.type(torch.float32), self.orig_module.weight.type(torch.float32), None
        )
        scores = logits.sigmoid()

        from ipex_llm.transformers.models.common import moe_group_topk
        topk_idx, topk_weight = moe_group_topk(scores, self.orig_module.e_score_correction_bias,
                                               self.n_group, self.topk_group, self.top_k,
                                               self.norm_topk_prob, self.routed_scaling_factor)
        return topk_idx, topk_weight.to(x.dtype)

================================================
FILE: kt-sft/ktransformers/operators/layernorm.py
================================================
'''
Date: 2024-11-13 15:05:52
LastEditors: Xie Weiyu ervinxie@qq.com
LastEditTime: 2024-11-25 08:59:19
'''
"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

"""Fused operators for normalization layers."""

import logging
from typing import Optional, Tuple, Union
from transformers import PretrainedConfig
import torch
import torch.nn as nn
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3RMSNorm
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeRMSNorm
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeRMSNorm
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader
if not torch.xpu.is_available():
    from flashinfer.norm import (
        fused_add_rmsnorm,
        rmsnorm,
    )


logger = logging.getLogger(__name__)


class RMSNorm(DeepseekV3RMSNorm, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.hidden_size,
            orig_module.variance_epsilon)

    def forward(
        self,
        x: torch.Tensor,
        batch_size_tensor: torch.Tensor = None,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        #return self.forward_native(x, residual)
        if batch_size_tensor is None:
            return self.forward_native(x)
        if residual is not None:
            fused_add_rmsnorm(x, residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            #residual = x + residual
            #out = rmsnorm(residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            return x, residual
        # print(x.shape, self.weight.data.shape, self.variance_epsilon, x.dtype, self.weight.data.dtype, x.device, self.weight.device, x.is_contiguous(), self.weight.data.is_contiguous())
        out = rmsnorm(x, self.weight.data, batch_size_tensor,self.variance_epsilon)
        return out

    def forward_native(
        self, hidden_states    
    ):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


class KQwen2MoeRMSNorm(Qwen2MoeRMSNorm, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(config.hidden_size,
            orig_module.variance_epsilon)

    def forward(
        self,
        x: torch.Tensor,
        batch_size_tensor: torch.Tensor = None,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        #return self.forward_native(x, residual)
        if batch_size_tensor is None:
            return self.forward_native(x)
        if residual is not None:
            fused_add_rmsnorm(x, residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            #residual = x + residual
            #out = rmsnorm(residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            return x, residual
        # print(x.shape, self.weight.data.shape, self.variance_epsilon, x.dtype, self.weight.data.dtype, x.device, self.weight.device, x.is_contiguous(), self.weight.data.is_contiguous())
        out = rmsnorm(x, self.weight.data, batch_size_tensor,self.variance_epsilon)
        return out

    def forward_native(
        self, hidden_states    
    ):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


class KQwen3MoeRMSNorm(Qwen3MoeRMSNorm, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.hidden_size,
            orig_module.variance_epsilon)

    def forward(
        self,
        x: torch.Tensor,
        batch_size_tensor: torch.Tensor = None,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        #return self.forward_native(x, residual)
        bsz, hidden_size = x.shape
        x = x.view(-1, self.orig_module.hidden_size)
        if batch_size_tensor is None:
            return self.forward_native(x)
        if residual is not None:
            fused_add_rmsnorm(x, residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            #residual = x + residual
            #out = rmsnorm(residual, self.weight.data, batch_size_tensor, self.variance_epsilon)
            return x, residual
        # print(x.shape, self.weight.data.shape, self.variance_epsilon, x.dtype, self.weight.data.dtype, x.device, self.weight.device, x.is_contiguous(), self.weight.data.is_contiguous())
        out = rmsnorm(x, self.weight.data, batch_size_tensor,self.variance_epsilon)
        out = out.view(bsz, hidden_size)
        return out

    def forward_native(
        self, hidden_states    
    ):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

class DeepseekV3RMSNormTorch(DeepseekV3RMSNorm, BaseInjectedModule):
    def __init__(self,
                key: str,
                gguf_loader : GGUFLoader,
                config: PretrainedConfig,
                orig_module: nn.Module,
                prefill_device: str = "cuda",
                generate_device: str = "cuda",
                **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.hidden_size,
            orig_module.variance_epsilon)

    def forward(
        self, 
        x,
        batch_size_tensor: torch.Tensor = None,
        residual: Optional[torch.Tensor] = None,
    )-> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        if residual is not None:
            x = x + residual
            residual = x
        # range batch_size_tensor for x
        input_dtype = x.dtype
        x = x.to(torch.float32)
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.variance_epsilon)
        if residual is not None:
            return self.weight * x.to(input_dtype), residual
        return self.weight * x.to(input_dtype)


class KDeepseekRMSNormIPEXLLM(DeepseekV3RMSNorm, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "xpu",
                 generate_device: str = "xpu",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.weight.shape[0],
            orig_module.variance_epsilon)
        self.eps = orig_module.variance_epsilon

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        from ipex_llm.transformers.models.common import rms_norm_forward
        if x.dtype not in [torch.float32, torch.float16]:
            output = rms_norm_forward(self, x.float())
        else:
            output = rms_norm_forward(self, x)
        return output.to(x.dtype)

    def load(self):
        BaseInjectedModule.load(self)
        if self.weight.dtype not in [torch.float32, torch.float16]:
            self.weight = self.weight.float()

================================================
FILE: kt-sft/ktransformers/operators/linear.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : Azure-Tang, Boxin Zhang
Date         : 2024-07-25 11:25:24
Version      : 0.1.0
LastEditors  : Azure 
LastEditTime : 2024-08-29 09:11:16
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''


import ctypes
import time
import torch
from torch import Tensor, nn
if not torch.xpu.is_available():
    import KTransformersOps
    import vLLMMarlin
from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader
from ktransformers.util.inference_state import InferenceState
if not torch.xpu.is_available():
    from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_utils import (
        MarlinWorkspace,
        marlin_quantize,
        GPTQ_MARLIN_MIN_THREAD_N,
        GPTQ_MARLIN_MIN_THREAD_K,
        GPTQ_MARLIN_MAX_PARALLEL,
        vllm_marlin_quantize
    )
from ktransformers.operators.base_operator import BaseInjectedModule
from transformers.configuration_utils import PretrainedConfig
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
from ktransformers.util.globals import GLOBAL_CONFIG
from abc import ABC, abstractmethod
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Release"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
import cpuinfer_ext
from ktransformers.operators.cpuinfer import CPUInfer
from ktransformers.server.config.config import Config
from typing import Dict, Tuple, Optional, Union
import numpy as np

#class KLinearBase(BaseInjectedModule, ABC):
class KLinearBase(nn.Module, ABC):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cuda",
        **kwargs,
    ):
        # super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        super().__init__()
        self.key = key
        self.gguf_loader = gguf_loader
        self.device = device
        self.config = config

        self.has_bias = False
        self.dtype = torch.get_default_dtype()
        if orig_module is not None:
            self.in_features = orig_module.in_features
            self.out_features = orig_module.out_features
        else:
            shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"]
            if len(shape) == 1:
                print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF")
            self.in_features  = self.gguf_loader.tensor_info[key + ".weight"]["shape"][0]
            self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1]

        self.loaded = False # for lm_head pre-load, TODO: use new way to do lm_head pre-load when layer wise prefill.

    @abstractmethod
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        pass

    def load_weight(self, override_key: str | None = None, device: str | None = None):
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        for key in keys:
            if isinstance(self.gguf_loader, SafeTensorLoader):
                # using safetensor_loader
                tensor = self.gguf_loader.load_tensor(key+'.weight')
                if self.gguf_loader.has_tensor(key+'.weight_scale_inv'):
                    weight_scale_inv = self.gguf_loader.load_tensor(key+'.weight_scale_inv')
                    return nn.Parameter(tensor), nn.Parameter(weight_scale_inv)
                return nn.Parameter(tensor)
                
            elif self.gguf_loader.has_tensor(key + ".weight") or "kv_b_proj" in key:
                if key + ".bias" in self.gguf_loader.tensor_file_map:
                    tensors = self.load_multi(key, ["weight", "bias"], device=device)
                    tensor = tensors["weight"]
                    bias = tensors["bias"]
                    # self.qtype = GGML_TYPE_QTYPE_MAP[tensorinfo[key + ".weight"]["ggml_type"]]
                    # print(torch.isinf(tensor).any(), torch.isinf(bias).any())
                    return nn.Parameter(tensor), nn.Parameter(bias)
                elif "kv_b_proj" in key and not self.gguf_loader.has_tensor(key + ".weight"):
                    attn_k_b_tensors = self.load_multi(key.replace("self_attn.kv_b_proj", "attn_k_b"), ["weight"], device=device)
                    attn_k_b = attn_k_b_tensors["weight"]
                    del attn_k_b_tensors
                    attn_k_b = attn_k_b.transpose(1, 2).contiguous()
                    attn_v_b_tensors = self.load_multi(key.replace("self_attn.kv_b_proj", "attn_v_b"), ["weight"], device=device)
                    attn_v_b = attn_v_b_tensors["weight"]
                    del attn_v_b_tensors
                    kv_b_proj = torch.cat((attn_k_b, attn_v_b), dim=1)
                    kv_b_proj = kv_b_proj.contiguous() if kv_b_proj.ndim == 2 else kv_b_proj.flatten(0, 1).contiguous()
                    del attn_k_b
                    del attn_v_b
                    return nn.Parameter(kv_b_proj)
                else:
                    tensors = self.load_multi(key, ["weight"], device=device)
                    tensor = tensors["weight"]
                    # self.qtype = GGML_TYPE_QTYPE_MAP[tensorinfo[key + ".weight"]["ggml_type"]]
                    return nn.Parameter(tensor)
            else:
                raise FileNotFoundError(f"Weight file not found for key {key}")

    def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
        tensors = {}
        for k in keys:
            tensors[k] = self.gguf_loader.load_gguf_tensor(key + "." + k, device=device)
        return tensors

    @abstractmethod
    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = "cuda"):
        pass

    @abstractmethod
    def unload(self):
        pass


class KLinearTorch(KLinearBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cuda",
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.has_bias = False
        self.dtype = torch.get_default_dtype()
        self.weight = None
        self.has_bias = False

    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor=None, **kwargs) -> torch.Tensor:
        dtype = x.dtype
        out_device = x.device

        if (not x.requires_grad) and GLOBAL_CONFIG._config["mod"] == "sft":
            x = x.requires_grad_(True)
        # TODO: support CUDA Graph when using cpu, but CPUInfer is recommended.
        x = x.to(device=self.device, dtype=self.dtype)
        x = x @ self.weight
        if self.has_bias:
            x = x + self.bias
        x = x.to(dtype=dtype, device=out_device)
        return x

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if self.loaded: return
        if device is None: device = self.device
        if w is None: w = self.load_weight(device=device)
        # else: self.out_features = w.shape[0], self.in_features = w.shape[1]
        
        if isinstance(w, nn.Parameter):
            try:
                self.weight = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T
            except: 
                self.weight = w.to(dtype=self.dtype).T
            self.has_bias = False
        elif isinstance(w, tuple):
            try:
                self.weight = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T
            except:
                self.weight = w[0].to(dtype=self.dtype).T
            self.bias = w[1].to(dtype=self.dtype)
            self.has_bias = True
        else:
            raise ValueError("Invalid weight type")
        # self.linear = self.linear.to(device)
        self.weight = self.weight.to(device)
        if self.has_bias:
            self.bias = self.bias.to(device)
        self.loaded = True

    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.has_bias:
            self.bias = None

class KLinearQ8(KLinearBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cuda",
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.has_bias = False
        self.compute_dtype = torch.float32
        self.weight = None
        self.weight_scale = None
        self.weight_zero_point = None
        self.bias = None
        self.loaded = False
    
    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor=None) -> torch.Tensor:
        orig_dtype = x.dtype
        out_device = x.device
        
        x = x.to(device=self.device, dtype=self.compute_dtype)
        

        weight_dequant = self._dequantize_weight(self.weight, self.weight_scale, bits=8)
        out = x @ weight_dequant.T
        
        if self.has_bias:
            out = out + self.bias
        
        return out.to(dtype=orig_dtype, device=out_device)
    
    def _dequantize_weight(self, q_matrix, scales, bits=8):
        """
        Dequantize a low-precision matrix back to floating-point
        
        Args:
            q_matrix (torch.Tensor): Quantized int matrix
            scales (torch.Tensor): Scale factors for each column
            bits (int): Quantization bits used (8 or 4)
        
        Returns:
            torch.Tensor: Dequantized floating-point matrix
        """
        # Ensure inputs are torch tensors
        if not isinstance(q_matrix, torch.Tensor):
            q_matrix = torch.tensor(q_matrix, dtype=torch.int8)
        if not isinstance(scales, torch.Tensor):
            scales = torch.tensor(scales, dtype=torch.float32)
        
        # Convert to correct dtype if needed
        if q_matrix.dtype != torch.int8:
            q_matrix = q_matrix.to(torch.int8)
        if scales.dtype != torch.float32:
            scales = scales.to(torch.float32)
        
        # For Q4, ensure the values stay within 4-bit range
        if bits == 4:
            q_matrix = torch.clamp(q_matrix, -7, 7)
        rows, cols = q_matrix.shape
        dequant_matrix = q_matrix.to(torch.float32)
        scales_broadcast = scales.view(1, cols)
        # Apply dequantization to all columns at once using matrix multiplication
        dequant_matrix = dequant_matrix * scales_broadcast
        
        return dequant_matrix

    
    def _quantize_weight(self, matrix, bits=8):
        """
        Quantize a floating-point matrix to lower precision (Q8 or Q4)
        
        Args:
            matrix (torch.Tensor): Input matrix in floating-point format
            bits (int): Quantization bits, either 8 or 4
        
        Returns:
            tuple: (quantized int matrix, scale factors for each column)
        """
        if not isinstance(matrix, torch.Tensor):
            matrix = torch.tensor(matrix, dtype=torch.float32)
        
        # Convert to float32 if needed
        if matrix.dtype != torch.float32:
            matrix = matrix.to(torch.float32)
        
        # Get matrix shape
        rows, cols = matrix.shape
        
        # Determine quantization parameters based on bits
        if bits == 8:
            max_int = 127
            qtype = torch.int8
        elif bits == 4:
            max_int = 7
            qtype = torch.int8  # We'll still use int8 storage but limit to 4-bit range, wait for native support
        else:
            raise ValueError("Quantization bits must be either 8 or 4")
       
        scales = torch.zeros(cols, dtype=torch.float32, device=matrix.device)
        
        # Calculate max absolute value for each column
        max_abs_vals, _ = torch.max(torch.abs(matrix), dim=0)
        
        # Handle zero columns (avoid division by zero)
        zero_cols = max_abs_vals == 0
        max_abs_vals[zero_cols] = 1.0
        
        # Calculate scale factors for all columns at once
        scales = max_abs_vals / max_int
        
        # Prepare the scales for broadcasting [1, cols]
        scales_broadcast = scales.view(1, cols)
        
        # Apply quantization to the entire matrix at once
        q_matrix = torch.round(matrix / scales_broadcast).to(qtype)
        
        # For Q4, clamp values to ensure they stay within 4-bit range
        if bits == 4:
            q_matrix = torch.clamp(q_matrix, -max_int, max_int)
        
        return q_matrix, scales
    
    def load(self, w: Union[Dict, nn.Parameter, Tuple, None] = None, device: Optional[str] = None):
        if self.loaded: return
        if device is None: device = self.device 
        if w is None: w = self.load_weight(device=device)
        
        if isinstance(w, nn.Parameter):
            try:
                weight = w.to(dtype=self.compute_dtype).view(self.out_features, self.in_features)
            except:
                weight = w.to(dtype=self.compute_dtype)
            self.has_bias = False
        elif isinstance(w, tuple):
            try:
                weight = w[0].to(dtype=self.compute_dtype).view(self.out_features, self.in_features)
            except:
                weight = w[0].to(dtype=self.compute_dtype)
            self.bias = w[1].to(dtype=self.compute_dtype).to(device)
            self.has_bias = True
        else:
            raise ValueError("Invalid weight type")
        
        self.weight, self.weight_scale = self._quantize_weight(weight, bits=8)
        
        self.weight = self.weight.to(device)
        self.weight_scale = self.weight_scale.to(device)
        
        if self.has_bias:
            self.bias = self.bias.to(device)
            
        self.loaded = True
    
    def unload(self):
        self.weight = None
        self.weight_scale = None
        self.weight_zero_point = None
        self._orig_weight = None
        
        if self.has_bias:
            self.bias = None
            
        self.loaded = False


class KLinearFP8(KLinearBase):
    # this kernel requires special handling for weight
    # Please load the weight file downloaded from KVCache.AI
    has_bias: bool
    weight: torch.Tensor
    bias: torch.Tensor
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cuda",
        block_size: int = 128,
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.has_bias = False
        self.dtype = torch.get_default_dtype()
        self.block_size = block_size
    
    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor) -> torch.Tensor:
        x = x.to(self.device)
        orig_dtype = x.dtype        
        x_quantized, scale_x = act_quant(x, self.block_size)
        y = fp8_gemm(x_quantized, scale_x, self.weight, self.weight_scale_inv)
        return y.to(dtype=orig_dtype)
    
    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if device is None: device = self.device
        if w is None: 
            w = self.load_weight(device=device) 
        ### TODO fit weight_inv format
        if isinstance(w, tuple):
            self.weight = w[0].to(device)
            self.weight_scale_inv = w[1].to(device)
            self.has_bias = False
        else:
            raise ValueError("Invalid weight type")
        self.weight = self.weight.to(device)
        if self.has_bias:
            self.bias = self.bias.to(device)
        
    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.has_bias:
            self.bias = None

# TODO: merge two marlin class

class VLinearMarlin(KLinearBase):
    marlin_q_w: torch.Tensor
    marlin_s: torch.Tensor
    g_idx: torch.Tensor
    sort_indices: torch.Tensor
    has_bias: bool
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cuda",
        num_bits: int = 4,  # 4-bit/8-bit is supported
        group_size: int = 64,  # -1, 32, 64, 128
        act_order: bool = False,
        is_k_full=True,
        **kwargs,
    ):
        assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.num_bits = num_bits
        self.group_size = group_size
        self.act_order = act_order
        self.is_k_full = is_k_full
        self.padding = False
        self.orin_in_features = self.in_features
        self.orin_out_features = self.out_features
        if self.in_features%GPTQ_MARLIN_MIN_THREAD_K!=0 or self.out_features%GPTQ_MARLIN_MIN_THREAD_K!=0:
            #print(f"warning!, in_features={in_features} or out_features={out_features} is undivisible by GPTQ_MARLIN_MIN_THREAD_K={GPTQ_MARLIN_MIN_THREAD_K} and GPTQ_MARLIN_MIN_THREAD_N={GPTQ_MARLIN_MIN_THREAD_N}, padding")
            self.padding = True
            self.in_features = (self.in_features+GPTQ_MARLIN_MIN_THREAD_K-1)//GPTQ_MARLIN_MIN_THREAD_K*GPTQ_MARLIN_MIN_THREAD_K
            self.out_features = (self.out_features+GPTQ_MARLIN_MIN_THREAD_N-1)//GPTQ_MARLIN_MIN_THREAD_N*GPTQ_MARLIN_MIN_THREAD_N
            #print(f"After padding: in_features={in_features}, out_features={out_features}")
        
        self.k = self.in_features
        self.n = self.out_features

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if self.loaded: return
        if device is None: device = self.device
        assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
        
        #if self.in_features * self.out_features:
        if w is None: 
            w = self.load_weight(device=device) 

        if isinstance(w, nn.Parameter):
            # pad weight
            weight = w.view(self.orin_out_features, self.orin_in_features).T
            self.has_bias = False
        elif isinstance(w, tuple):
            w = list(w)
            weight = w[0].view(self.orin_out_features, self.orin_in_features).T
            self.bias = w[1].view(self.orin_out_features)
            self.bias = w[1]
            self.has_bias = True
        else:
            raise ValueError("Invalid weight type")
        weight = weight.to(device)
        if self.has_bias:
            self.bias = self.bias.to(device)
            
        if self.padding:
            padded_weight = torch.zeros(self.in_features, self.out_features, device=self.device)
            padded_weight[:self.orin_in_features, :self.orin_out_features] = weight
            weight = padded_weight

        # Pack Marlin linear
        marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
            weight, self.num_bits, self.group_size, self.act_order
        )
        self.workspace = MarlinWorkspace(
            self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL,self.device
        )
        self.weight = marlin_q_w
        self.marlin_q_w = marlin_q_w
        self.marlin_s = marlin_s
        self.g_idx = g_idx
        self.sort_indices = sort_indices
        self.k = weight.shape[0]
        self.n = weight.shape[1]
        # self.shape_buffer = torch.tensor([60], dtype=torch.int32, device=self.device)
        self.loaded = True


    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None) -> torch.Tensor:
        if bsz_tensor is None:
            bsz_tensor = torch.tensor([x.shape[0]], dtype=torch.int32, device=self.device)


        # Only support input x as BF16 and FP16
        x = x.to(self.device)
        orig_shape = list(x.shape)
        orig_dtype = x.dtype
        x = x.reshape(-1, orig_shape[-1])
        marlin_s = self.marlin_s.to(x.dtype)
        sms = -1

        # padding x.shape[0] to avoid CUDA illegal memory access error
        x, orig_size_m = self._pad_input(x)

        x = vLLMMarlin.gptq_marlin_gemm(
            x,
            self.marlin_q_w,
            marlin_s,
            self.g_idx,
            self.sort_indices,
            self.workspace.scratch,
            self.num_bits,
            bsz_tensor,
            x.shape[0],
            self.n,
            x.shape[-1],
            sms,
            self.is_k_full,
        )

        x = x[:orig_size_m]

        if self.has_bias:
            x = x + self.bias
        orig_shape[-1] = self.n
        return x.reshape(orig_shape).to(orig_dtype)

    def unload(self):

        if self.has_bias:
            self.bias = None
        self.marlin_q_w = None
        self.marlin_s = None
        self.g_idx = None
        self.sort_indices = None
        self.workspace = None  

    def _pad_input(self, x):

        size_m = x.shape[0]
        size_k = x.shape[1]

        # size_m and align value depends on VLinearMarlin implementation
        if size_m > 1024:
            align = 1024
        elif size_m > 64:
            align = 64
        else:
            align = 1

        padded_size_m = ((size_m + align - 1) // align) * align

        if padded_size_m > size_m:
            pad_len = padded_size_m - size_m
            pad_tensor = torch.zeros((pad_len, size_k), dtype=x.dtype, device=x.device)
            x = torch.cat([x, pad_tensor], dim = 0).contiguous()
        return x, size_m

class KLinearMarlin(KLinearBase):
    marlin_q_w: torch.Tensor
    marlin_s: torch.Tensor
    g_idx: torch.Tensor
    sort_indices: torch.Tensor
    has_bias: bool
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cuda",
        num_bits: int = 4,  # 4-bit/8-bit is supported
        group_size: int = 64,  # -1, 32, 64, 128
        act_order: bool = False,
        is_k_full=True,
        **kwargs,
    ):
        assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.num_bits = num_bits
        self.group_size = group_size
        self.act_order = act_order
        self.is_k_full = is_k_full
        self.padding = False
        self.orin_in_features = self.in_features
        self.orin_out_features = self.out_features
        if self.in_features%GPTQ_MARLIN_MIN_THREAD_K!=0 or self.out_features%GPTQ_MARLIN_MIN_THREAD_K!=0:
            #print(f"warning!, in_features={in_features} or out_features={out_features} is undivisible by GPTQ_MARLIN_MIN_THREAD_K={GPTQ_MARLIN_MIN_THREAD_K} and GPTQ_MARLIN_MIN_THREAD_N={GPTQ_MARLIN_MIN_THREAD_N}, padding")
            self.padding = True
            self.in_features = (self.in_features+GPTQ_MARLIN_MIN_THREAD_K-1)//GPTQ_MARLIN_MIN_THREAD_K*GPTQ_MARLIN_MIN_THREAD_K
            self.out_features = (self.out_features+GPTQ_MARLIN_MIN_THREAD_N-1)//GPTQ_MARLIN_MIN_THREAD_N*GPTQ_MARLIN_MIN_THREAD_N
            #print(f"After padding: in_features={in_features}, out_features={out_features}")
        
        self.k = self.in_features
        self.n = self.out_features

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if self.loaded: return
        if device is None: device = self.device
        assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
        
        #if self.in_features * self.out_features:
        if w is None: 
            w = self.load_weight(device=device) 

        if isinstance(w, nn.Parameter):
            # pad weight
            weight = w.view(self.orin_out_features, self.orin_in_features).T
            self.has_bias = False
        elif isinstance(w, tuple):
            w = list(w)
            weight = w[0].view(self.orin_out_features, self.orin_in_features).T
            self.bias = w[1].view(self.orin_out_features)
            self.bias = w[1]
            self.has_bias = True
        else:
            raise ValueError("Invalid weight type")
        weight = weight.to(device)
        if self.has_bias:
            self.bias = self.bias.to(device)
            
        if self.padding:
            padded_weight = torch.zeros(self.in_features, self.out_features, device=self.device)
            padded_weight[:self.orin_in_features, :self.orin_out_features] = weight
            weight = padded_weight

        # Pack Marlin linear
        marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
            weight, self.num_bits, self.group_size, self.act_order
        )
        self.workspace = MarlinWorkspace(
            self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL,self.device
        )
        self.weight = marlin_q_w # modeling_xxx.py may use linear.weight
        self.marlin_q_w = marlin_q_w
        self.marlin_s = marlin_s
        self.g_idx = g_idx
        self.sort_indices = sort_indices
        self.k = weight.shape[0]
        self.n = weight.shape[1]
        self.loaded = True

    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor=None, **kwargs) -> torch.Tensor:
        # Only support input x as BF16 and FP16
        x = x.to(self.device)
        orig_shape = list(x.shape)
        orig_dtype = x.dtype
        x = x.reshape(-1, orig_shape[-1])
        x = x.reshape(-1, x.shape[-1])
        if self.padding:
            padding_input=torch.empty(x.shape[0], self.in_features, device=x.device, dtype=x.dtype)
            padding_input[:,:self.orin_in_features] = x
            x = padding_input
        marlin_s = self.marlin_s.to(x.dtype)
        x = KTransformersOps.gptq_marlin_gemm(
            x,
            self.marlin_q_w,
            marlin_s,
            self.g_idx,
            self.sort_indices,
            self.workspace.scratch,
            self.num_bits,
            x.shape[0],
            self.n,
            x.shape[-1],
            self.is_k_full,
        )
        if self.padding:
            x = x[:,:self.orin_out_features]
            orig_shape[-1] = self.orin_out_features
        else:
            orig_shape[-1] = self.out_features
        if self.has_bias:
            x = x + self.bias
        return x.reshape(orig_shape).to(orig_dtype)

    def unload(self):

        if self.has_bias:
            self.bias = None
        self.marlin_q_w = None
        self.marlin_s = None
        self.g_idx = None
        self.sort_indices = None
        self.workspace = None

class KLinearCPUInfer(KLinearBase):
    CPU_INFER = None
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "cpu",
        out_device: str = "cuda", # this device mean which device the output should on. TODO: support cpu.
        stride = 16,
        group_max_len = 1024,
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        if KLinearCPUInfer.CPU_INFER is None:
            KLinearCPUInfer.CPU_INFER = CPUInfer(Config().cpu_infer)
        self.has_bias = False
        self.dtype = torch.get_default_dtype()
        self.w = None
        self.has_bias = False
        self.stride = stride
        self.group_max_len = group_max_len
        self.out_device = out_device

    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None) -> torch.Tensor:
        origin_shape = x.shape # [batch_size, q_len, hidden_size]
        if origin_shape[1] == 1 and torch.cuda.is_current_stream_capturing():
            out_device = x.device
            self.input_tensor_cpu.copy_(x, non_blocking=True)
            qlen = origin_shape[1]
            KLinearCPUInfer.CPU_INFER.submit_with_cuda_stream(
                torch.cuda.current_stream().cuda_stream,
                self.linear.forward(
                    qlen, 
                    self.input_tensor_cpu.data_ptr(), 
                    self.output_cpu.data_ptr()
                )
            )
            KLinearCPUInfer.CPU_INFER.sync_with_cuda_stream(torch.cuda.current_stream().cuda_stream)
            self.output_gpu.copy_(self.output_cpu, non_blocking=True)
            if self.has_bias:
                self.output_gpu += self.bias
            return self.output_gpu
        else:
            dtype = x.dtype
            out_device = x.device
            x = x.to(device=self.device)
            qlen = origin_shape[1]
            output_shape = (*origin_shape[:-1], self.out_features)
            output = torch.empty(output_shape, device=x.device, dtype=x.dtype)
            KLinearCPUInfer.CPU_INFER.submit(
                self.linear.forward(
                    qlen, 
                    x.data_ptr(), 
                    output.data_ptr()
                )
            )
            KLinearCPUInfer.CPU_INFER.sync()
            if self.has_bias:
                output = output + self.bias
            output = output.to(dtype=dtype, device=out_device)
            return output

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None, warmup:bool = True):
        print(f"loading {self.key} to {self.device} using CPUInfer")
        if device is None: device = self.device
        self.load_weights(w=w, device=device)
        if self.bias is not None:
            self.has_bias = True
            self.bias = self.bias.to(device)
            
        weight_ptr = ctypes.addressof(
            ctypes.cast(self.weight.ctypes.data, ctypes.POINTER(ctypes.c_uint64)).contents
        )
        config = cpuinfer_ext.linear.LinearConfig(self.in_features, self.out_features, self.stride, self.group_max_len, weight_ptr, self.weight_type, 30)
        self.linear = cpuinfer_ext.linear.Linear(config)
        
        if warmup:
            KLinearCPUInfer.CPU_INFER.submit(self.linear.warm_up())
            KLinearCPUInfer.CPU_INFER.sync()
        self.input_tensor_cpu = torch.zeros((1, 1, self.in_features), device="cpu", pin_memory=True)
        self.output_cpu = torch.zeros((1, 1, self.out_features), device="cpu", pin_memory=True, dtype=torch.bfloat16)
        self.output_gpu = torch.zeros((1, 1, self.out_features), device=self.out_device)

    def load_weights(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu"):
        if self.gguf_loader.has_tensor(self.key + ".weight"):
            if self.key + ".bias" in self.gguf_loader.tensor_file_map:
                self.weight = self.gguf_loader.get_mmap_tensor(self.key + ".weight")
                self.weight_type = self.gguf_loader.tensor_info[self.key + ".weight"]["ggml_type"]
                self.bias = self.gguf_loader.load_gguf_tensor(self.key + ".bias", device=device)
            else:
                self.weight = self.gguf_loader.get_mmap_tensor(self.key + ".weight")
                self.weight_type = self.gguf_loader.tensor_info[self.key + ".weight"]["ggml_type"]
                self.bias = None
        else:
            raise ValueError(f"Linear {self.key} not found in gguf_loader")

    def unload(self):
        if self.w is not None:
            self.w = None
        if self.has_bias:
            self.bias = None       

class KLinearIPEXLLM(KLinearBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        device: str = "xpu",
        precision: str = "sym_int4",
        **kwargs,
    ):
        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        self.has_bias = False
        self.dtype = torch.get_default_dtype()
        self.weight = None
        self.has_bias = False
        self.precision = precision
        self.qtype = None

    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None) -> torch.Tensor:
        dtype = x.dtype
        out_device = x.device
        from ipex_llm.transformers.models.common import linear_forward
        x = linear_forward(x.half(), self.weight, self.qtype, self.out_features)

        if self.has_bias:
            x = x + self.bias
        x = x.to(dtype=dtype, device=out_device)
        return x

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
        if self.loaded: return
        if device is None: device = self.device
        assert device.lower()[:3] == "xpu", "IPEX-LLM quantized linear only supports XPU device"
        if w is None: w = self.load_weight(device=device)

        if isinstance(w, nn.Parameter):
            try:
                weight = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T
            except:
                weight = w.to(dtype=self.dtype).T
            self.has_bias = False
        elif isinstance(w, tuple):
            try:
                weight = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T
            except:
                weight = w[0].to(dtype=self.dtype).T
            self.bias = w[1].to(dtype=self.dtype)
            self.has_bias = True
        else:
            raise ValueError("Invalid weight type")
        weight = weight.to("cpu").float().transpose(0, 1).contiguous()

        if self.has_bias:
            self.bias = self.bias.to(device)

        # quantize linear weight
        from ipex_llm.transformers.models.common import quantize_linear
        paramsLowBit, qtype = quantize_linear(weight, self.in_features, self.precision)
        self.weight = paramsLowBit.to(device)
        self.qtype = qtype
        self.loaded = True

    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.has_bias:
            self.bias = None

LINEAR_MAP = {
    "KLinearMarlin": KLinearMarlin,
    "KLinearTorch": KLinearTorch,
    "KLinearCPUInfer": KLinearCPUInfer,
    "VLinearMarlin": VLinearMarlin,
    "KLinearFP8": KLinearFP8,
    "KLinearQ8": KLinearQ8,
    "KLinearIPEXLLM": KLinearIPEXLLM,
}

class KTransformersLinear(BaseInjectedModule, KLinearBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        generate_device: str = "cuda",
        generate_op: str| None = "KLinearMarlin",
        prefill_device: str = "cuda",
        prefill_op: str| None = "KLinearTorch",
        **kwargs,
    ):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        KLinearBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        # build all the linear operators
        if prefill_op is not None:
            assert prefill_op in LINEAR_MAP, f"linear_type {prefill_op} not supported"
            self.prefill_linear = LINEAR_MAP[prefill_op](key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        else:
            self.prefill_linear = None

        if generate_op is not None:
            assert generate_op in LINEAR_MAP, f"linear_type {generate_op} not supported"
            self.generate_linear = LINEAR_MAP[generate_op](key, gguf_loader, config, orig_module, generate_device, **kwargs)
        else:
            self.generate_linear = None
        self.mode = InferenceState.UNLOAD

    def forward(self, x, bsz_tensor=None):
        # linear_fwd_st = time.time()
        if self.mode == InferenceState.PREFILL:
            assert self.prefill_linear is not None, "cpu linear is not initialized"
            y = self.prefill_linear.forward(x, bsz_tensor)
        else:
            assert self.generate_linear is not None, "gpu linear is not initialized"
        # TODO: A violence way to solve the weight=None, for Lora inference Test, need modify it later
            try:
                y = self.generate_linear.forward(x, bsz_tensor)
            except TypeError as e:
                Warning("A Dange way to avoid the none weight, Need to check it later in KTransformersLinear forward!!")
                self.generate_linear.weight = self.orig_module.generate_linear.weight
                self.weight = self.orig_module.generate_linear.weight
                y = self.generate_linear.forward(x, bsz_tensor)
        
        # linear_fwd_end = time.time()
        # print(f"[KTLinear] Forward time: {linear_fwd_end-linear_fwd_st}")
        return y

    def load(self, w: dict | nn.Parameter | tuple | None = None, mode: InferenceState = InferenceState.GENERATE):
        if not mode:
            mode = InferenceState.GENERATE
        # load to device
        if mode == InferenceState.PREFILL:
            self.generate_linear.unload()
            self.prefill_linear.load(w=w)
            self.device = self.prefill_linear.device
            self.weight = self.prefill_linear.weight # modeling_xxx.py may use linear.weight
        elif mode == InferenceState.GENERATE:
            self.prefill_linear.unload()
            self.generate_linear.load(w=w)
            self.device = self.generate_linear.device
            self.weight = self.generate_linear.weight # modeling_xxx.py may use linear.weight
        elif mode == InferenceState.UNLOAD:
            self.prefill_linear.unload()
            self.generate_linear.unload()
            self.device = "cpu"
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")
        self.mode = mode

    def unload(self):
        if self.prefill_linear is not None:
            self.prefill_linear.unload()
        if self.generate_linear is not None:
            self.generate_linear.unload()
        self.device = self.generate_linear.device

    def set_inference_mode(self, mode: InferenceState):
        if not mode: 
            mode = InferenceState.GENERATE
        if mode == InferenceState.GENERATE:
            self.load(mode=InferenceState.GENERATE)
        elif mode == InferenceState.PREFILL:
            self.load(mode=InferenceState.PREFILL)
        elif mode == InferenceState.UNLOAD:
            self.unload()
        else:
            raise ValueError("mode must be either InferenceState.GENERATE, InferenceState.PREFILL or InferenceState.UNLOAD")


================================================
FILE: kt-sft/ktransformers/operators/mlp.py
================================================

from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader
from transformers import PretrainedConfig
import torch.nn as nn
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MLP
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeMLP
class kDeepseekV3MLP(DeepseekV3MLP, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.hidden_size, orig_module.intermediate_size)
    def forward(self, x, bsz_tensor):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x, bsz_tensor)) * self.up_proj(x, bsz_tensor), bsz_tensor)
        return down_proj
class KQwen2MoeMLP(Qwen2MoeMLP, BaseInjectedModule):
    def __init__(self,
                 key: str,
                 gguf_loader : GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.intermediate_size)
    def forward(self, x, bsz_tensor):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x, bsz_tensor)) * self.up_proj(x, bsz_tensor), bsz_tensor)
        return down_proj

================================================
FILE: kt-sft/ktransformers/operators/models.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :  
Author       : Azure-Tang
Date         : 2024-07-25 11:25:24
Version      : 1.0.0
LastEditors  : Azure 
LastEditTime : 2024-08-27 07:29:04
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
"""

import inspect
import math
from typing import List, Optional, Tuple, Union
import time
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ktransformers.operators.dynamic_attention import DynamicScaledDotProductAttention
from ktransformers.server.config.config import Config
import os
import yaml
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
)
from transformers.modeling_outputs import (
    MoeCausalLMOutputWithPast,
    MoeModelOutputWithPast,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from ktransformers.models.modeling_qwen2_moe import (
    Qwen2MoeSparseMoeBlock,
    Qwen2MoeMLP,
    Qwen2MoeDecoderLayer,
    Qwen2MoeRotaryEmbedding,
)

from ktransformers.models.modeling_qwen3_moe import (
    Qwen3MoeSparseMoeBlock,
    Qwen3MoeMLP,
    Qwen3MoeDecoderLayer,
)

from ktransformers.models.modeling_deepseek import (
    BaseModelOutputWithPast,
    DeepseekV2DecoderLayer,
    DeepseekV2MoE,
)
from ktransformers.util.vendors import device_manager, get_device, to_device, GPUVendor
from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.inference_state import InferenceState
from ktransformers.util.utils import get_compute_capability
from ktransformers.util.custom_loader import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
from ktransformers.models.modeling_llama import (
    LlamaDecoderLayer,
    LlamaRMSNorm,
    LlamaRotaryEmbedding,
)

if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

    _flash_supports_window_size = "window_size" in list(
        inspect.signature(flash_attn_func).parameters
    )

logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Qwen/Qwen1.5-MoE-A2.7B"
_CONFIG_FOR_DOC = "Qwen2MoeConfig"

QWEN2MOE_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`Qwen2MoeConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

QWEN2MOE_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        output_router_logits (`bool`, *optional*):
            Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
            should not be returned during inference.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
    QWEN2MOE_START_DOCSTRING,
)
class KQwen2MoeModel(BaseInjectedModule):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`]

    Args:
        config: Qwen2MoeConfig
    """

    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        device: str = "cuda",
        per_layer_prefill_intput_threshold: int = 30000,  # if None, no per-layer prefill
        transfer_map: dict = None,
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, device, **kwargs
        )
        self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
        self.transfer_map = transfer_map
        self.stream_device_map = dict()

    @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        per_layer_prefill_intput_threshold: (
            int | None
        ) = None,  # if None or 0, close per-layer prefill
    ) -> Union[Tuple, MoeModelOutputWithPast]:
        # print(f'Total length of input_ids: {input_ids.size(1)}, {input_ids.size()}')

        if per_layer_prefill_intput_threshold is None:
            per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
        per_layer_prefill_flag = False
        seq_lenth = (
            inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
        )
        if (
            per_layer_prefill_intput_threshold
            and per_layer_prefill_intput_threshold < seq_lenth
        ):
            per_layer_prefill_flag = True
            for layer in self.layers:
                self.load_layer_to(layer, InferenceState.UNLOAD)
        else:
            pass
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_router_logits = (
            output_router_logits
            if output_router_logits is not None
            else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        use_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            use_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
            )

        if inputs_embeds is None:
            input_ids = input_ids.to("cpu")
            inputs_embeds = self.embed_tokens(input_ids)
            inputs_embeds = inputs_embeds.to("cuda")

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask,
            inputs_embeds,
            cache_position,
            past_key_values,
            output_attentions,
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        if torch.xpu.is_available() and inputs_embeds.device.type == "xpu":
            position_embeddings = self.rotary_emb(hidden_states, position_ids)
        else:
            position_embeddings = None

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None
        next_decoder_cache = None

        for i, decoder_layer in enumerate(self.layers):
            if self.transfer_map is not None and i in self.transfer_map:
                prev_stream = torch.cuda.current_stream()
                cur_device = self.transfer_map[i]
                if cur_device not in self.stream_device_map:
                    self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
                torch.cuda.set_device(cur_device)
                self.stream_device_map[cur_device].wait_stream(prev_stream)
                torch.cuda.set_stream(self.stream_device_map[cur_device])
                hidden_states = hidden_states.to(
                    self.transfer_map[i], non_blocking=True
                )
                causal_mask = (
                    causal_mask.to(self.transfer_map[i], non_blocking=True)
                    if causal_mask is not None
                    else None
                )
                position_ids = (
                    position_ids.to(self.transfer_map[i], non_blocking=True)
                    if position_ids is not None
                    else None
                )
                cache_position = (
                    cache_position.to(self.transfer_map[i], non_blocking=True)
                    if cache_position is not None
                    else None
                )

            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    output_router_logits,
                    use_cache,
                    cache_position,
                )
            else:
                if per_layer_prefill_flag:
                    # print(f"to gpu")
                    self.load_layer_to(decoder_layer, InferenceState.PREFILL)
                    torch.cuda.empty_cache()
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    output_router_logits=output_router_logits,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                )
                if per_layer_prefill_flag:
                    # print(f"to cpu")
                    self.load_layer_to(decoder_layer, InferenceState.UNLOAD)
                    torch.cuda.empty_cache()
            hidden_states = layer_outputs[0]

            if use_cache and len(layer_outputs) > 1:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
            else:
                next_decoder_cache = None

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits and layer_outputs[-1] is not None:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        if per_layer_prefill_flag:
            per_layer_prefill_flag = False
            for layer in self.layers:
                self.load_layer_to(layer, InferenceState.GENERATE)
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            if next_decoder_cache is not None:
                next_cache = (
                    next_decoder_cache.to_legacy_cache()
                    if use_legacy_cache
                    else next_decoder_cache
                )
            else:
                next_cache = past_key_values

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_cache,
                    all_hidden_states,
                    all_self_attns,
                    all_router_logits,
                ]
                if v is not None
            )
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )

    def load_layer_to(self, layer: Qwen2MoeDecoderLayer, target: InferenceState):
        assert isinstance(
            layer, Qwen2MoeDecoderLayer
        ), "module should be nn.ModuleList of decoder layers"

        # TODO Support restore to original device, not only cuda
        device = "cpu" if target == InferenceState.UNLOAD else "cuda"

        # attn
        layer.self_attn.q_proj.set_inference_mode(target)
        layer.self_attn.k_proj.set_inference_mode(target)
        layer.self_attn.v_proj.set_inference_mode(target)
        layer.self_attn.o_proj.set_inference_mode(target)
        layer.self_attn.rotary_emb = layer.self_attn.rotary_emb.to(device)

        # mlp
        if isinstance(layer.mlp, Qwen2MoeSparseMoeBlock):
            layer.mlp.gate.set_inference_mode(target)
            layer.mlp.experts.set_inference_mode(target)
            layer.mlp.shared_expert.gate_proj.set_inference_mode(target)
            layer.mlp.shared_expert.up_proj.set_inference_mode(target)
            layer.mlp.shared_expert.down_proj.set_inference_mode(target)
            layer.mlp.shared_expert.act_fn.to(device)
            layer.mlp.shared_expert_gate.to(device)
        else:
            layer.mlp.gate_proj.set_inference_mode(target)
            layer.mlp.up_proj.set_inference_mode(target)
            layer.mlp.down_proj.set_inference_mode(target)
            layer.mlp.act_fn.to(device)
        # layer norm
        layer.input_layernorm.to(device)
        layer.post_attention_layernorm.to(device)


DeepseekV2_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class KDeepseekV2Model(BaseInjectedModule):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]

    Args:
        config: DeepseekV2Config
    """

    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        device: str = "cuda",
        per_layer_prefill_intput_threshold: int = 30000,  # if None, no per-layer prefill
        transfer_map: dict = None,
        **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, device, **kwargs
        )
        self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
        self.transfer_map = transfer_map
        self.stream_device_map = dict()

    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        per_layer_prefill_intput_threshold: (
            int | None
        ) = None,  # if None, no per-layer prefill
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        self.gradient_checkpointing = False
        if per_layer_prefill_intput_threshold is None:
            per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
        per_layer_prefill_flag = False
        seq_lenth = (
            inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
        )
        if (
            per_layer_prefill_intput_threshold
            and per_layer_prefill_intput_threshold < seq_lenth
        ):
            per_layer_prefill_flag = True
            for layer in self.layers:
                self.load_layer_to(layer, InferenceState.UNLOAD)
            torch.cuda.empty_cache()
        else:
            pass
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`transformers."
                )
                use_cache = False

        past_key_values_length = 0
        if use_cache:
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)
        
        if inputs_embeds is None:
            org_device = input_ids.device
            # TODO move to embed_tokens's device, not hard code to cpu
            # input_ids = input_ids.to("cpu")
            input_ids = input_ids.to(self.embed_tokens.weight.device)
            inputs_embeds = self.embed_tokens(input_ids).to(org_device)
            input_ids = input_ids.to(org_device)

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        if inputs_embeds.device.type == "xpu" and position_ids is not None:
            cos, sin = self.layers[0].self_attn.rotary_emb(inputs_embeds,
                                                           position_ids)
            position_embeddings = (cos, sin)
        else:
            position_embeddings = None

        if per_layer_prefill_flag:
            causal_mask = None
        else:
            if (os.name == 'nt'
                or get_compute_capability() < 8
                or (self.transfer_map is not None and 'cpu' in self.transfer_map.values())
                or device_manager.gpu_vendor != GPUVendor.NVIDIA):
                # print("for Windows or GPU before ampere, use forward_windows")
                # only use mask in forward windows or can't flash attn
                causal_mask = self._update_causal_mask(
                    attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
                )
            else:
                causal_mask = None

        # embed positions
        hidden_states = inputs_embeds
        if per_layer_prefill_flag:
            print(f"Total length of input_ids: {hidden_states.size(1)}")

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        t_gpu = 0
        t_cpu = 0
        t_f = 0

        for i, decoder_layer in enumerate(self.layers):
            # print(f"@@@@@@@@@@@@@@@@@layer {i}@@@@@@@@@@@@@@@@@@@@ \n")
            if self.transfer_map is not None and i in self.transfer_map:
                prev_stream = torch.cuda.current_stream()
                cur_device = self.transfer_map[i]
                if cur_device not in self.stream_device_map and cur_device.lower() != "cpu":
                    self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
                if cur_device.lower() != "cpu":
                    torch.cuda.set_device(cur_device)
                    self.stream_device_map[cur_device].wait_stream(prev_stream)
                    torch.cuda.set_stream(self.stream_device_map[cur_device])
                hidden_states = hidden_states.to(
                    self.transfer_map[i], non_blocking=True
                )
                causal_mask = (
                    causal_mask.to(self.transfer_map[i], non_blocking=True)
                    if causal_mask is not None
                    else None
                )
                position_ids = (
                    position_ids.to(self.transfer_map[i], non_blocking=True)
                    if position_ids is not None
                    else None
                )
                cache_position = (
                    cache_position.to(self.transfer_map[i], non_blocking=True)
                    if cache_position is not None
                    else None
                )

            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                t3 = time.time()
                if per_layer_prefill_flag:
                    # print(f"to gpu")
                    self.load_layer_to(decoder_layer, InferenceState.PREFILL)
                    torch.cuda.empty_cache()
                t4 = time.time()
                # with open("log.txt", "a") as f:
                #     f.write(f"@@@@@@@@@@@@@@@@@layer {i}@@@@@@@@@@@@@@@@@@@@ \n")
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                )
                t5 = time.time()
                if per_layer_prefill_flag:
                    # print(f"to cpu")
                    self.load_layer_to(decoder_layer, InferenceState.UNLOAD)
                    torch.cuda.empty_cache()
                t6 = time.time()
            t_gpu += t4 - t3
            t_cpu += t6 - t5
            t_f += t5 - t4

            hidden_states = layer_outputs[0]

            # @@@@@@@ TODO open this notes, tmp close to fit deepseekv3
            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)
        # with open("log.txt", "a") as f:
        #     f.write(f"@@@After layers\n")
        #     f.write(f"hidden_states={hidden_states}\n")
        #     f.write(f"hidden_states.shape={hidden_states.shape}\n")

        if per_layer_prefill_flag:
            t6 = time.time()
            # print(f"restore")
            per_layer_prefill_flag = False
            for layer in self.layers:
                self.load_layer_to(layer, InferenceState.GENERATE)
            torch.cuda.empty_cache()
            t7 = time.time()

            print(
                f"total time: {t7-t3}, \n layer num{len(self.layers)}, gpu time: {t_gpu}, cpu time: {t_cpu}, forward time: {t_f}, restore time: {t7-t6}"
            )

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = (
                next_decoder_cache.to_legacy_cache()
                if use_legacy_cache
                else next_decoder_cache
            )
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def load_layer_to(self, layer: DeepseekV2DecoderLayer, target: InferenceState):
        assert isinstance(
            layer, DeepseekV2DecoderLayer
        ), "module should be nn.ModuleList of decoder layers"

        # TODO Support restore to original device, not only cuda
        device = "cpu" if target == InferenceState.UNLOAD else "cuda"

        # TODO Support DFS to auto use {to, set_inference_mode} according to the module type

        # attn
        layer.self_attn.to(device)  #

        # mlp
        if isinstance(layer.mlp, DeepseekV2MoE):
            layer.mlp.gate.to(device)
            layer.mlp.experts.set_inference_mode(target)
            layer.mlp.shared_experts.gate_proj.set_inference_mode(target)
            layer.mlp.shared_experts.up_proj.set_inference_mode(target)
            layer.mlp.shared_experts.down_proj.set_inference_mode(target)
            layer.mlp.shared_experts.act_fn.to(device)
            # layer.mlp.shared_expert_gate.to(device)
        else:
            layer.mlp.gate_proj.set_inference_mode(target)
            layer.mlp.up_proj.set_inference_mode(target)
            layer.mlp.down_proj.set_inference_mode(target)
            layer.mlp.act_fn.to(device)
        # layer norm
        layer.input_layernorm.to(device)
        layer.post_attention_layernorm.to(device)


LLAMA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`LlamaConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

LLAMA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaPreTrainedModel(PreTrainedModel):
    config_class = LlamaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlamaDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class KLlamaModel(BaseInjectedModule):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]

    Args:
        config: LlamaConfig
    """

    dynamic_sdpa = None

    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module,
        device: str = "cuda",
        per_layer_prefill_intput_threshold: int = 30000,  # if None, no per-layer prefill
        transfer_map: dict = None,
        **kwargs,
    ):

        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, device, **kwargs
        )
        self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
        self.transfer_map = transfer_map
        self.stream_device_map = dict()
        user_path: str = os.path.expanduser('~')
        localstore_path: str = os.path.join(user_path,'.ktransformers')
        config_path: str = os.path.join(localstore_path,Config.CONFIG_FILE_NAME)
        with open(config_path,"r") as file:
            config_yaml = yaml.safe_load(file.read())
            self.long_context_config = config_yaml.get("long_context")
            self.ext_config = config_yaml.get("ext")

        KLlamaModel.dynamic_sdpa = DynamicScaledDotProductAttention(
            max_seq_len=self.long_context_config["max_seq_len"],
            block_size=self.long_context_config["block_size"],
            config=config,
            device=torch.device("cuda"),
            local_windows_len=self.long_context_config["local_windows_len"],
            topk=self.long_context_config["second_select_num"],
            threads_num=self.ext_config["cpu_infer"],
            anchor_type=self.long_context_config["anchor_type"],
            kv_type=self.long_context_config["kv_type"],
            dense_layer_num=self.long_context_config["dense_layer_num"],
            anchor_num=self.long_context_config["anchor_num"],
            preselect_block=self.long_context_config["preselect_block"],
            block_selection_mode=self.long_context_config["head_select_mode"],
            preselect_block_count=self.long_context_config["preselect_block_count"],
            layer_step=self.long_context_config["layer_step"],
            token_step=self.long_context_config["token_step"],
            prefill_chunk_size=self.long_context_config["chunk_size"],
            use_attn_sparsity=False,
        )

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        return_legacy_cache = False
        if (
            use_cache and not isinstance(past_key_values, Cache) and not self.training
        ):  # kept for BC (non `Cache` `past_key_values` inputs)
            return_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
            )

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device="cuda",
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = None
        chunck_size = self.long_context_config["chunk_size"]
        cur_idx = 0
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids.to("cpu"))
        q_len = cache_position.size(0)

        # generate
        if q_len == 1:
            x = inputs_embeds[:, -1:, :]
            position_ids = position_ids[:, -1:]
            return self.forward_chunk(
                x,
                causal_mask,
                position_ids,
                past_key_values,
                output_attentions,
                use_cache,
                cache_position,
                output_hidden_states,
                return_dict,
            )
        elif q_len <= chunck_size:
            inputs_embeds = inputs_embeds.to('cuda')
            output = self.forward_chunk(
                inputs_embeds,
                causal_mask,
                position_ids,
                past_key_values,
                output_attentions,
                use_cache,
                cache_position,
                output_hidden_states,
                return_dict,
            )
            KLlamaModel.dynamic_sdpa.calc_anchor(cache_position[-1] + 1)
            KLlamaModel.dynamic_sdpa.clear_importance(cache_position[-1] + 1)
            return output
        cur_idx = 0
        assert (
            output_attentions == False
        ), "output_attentions is not supported when using chunked attention"
        attn_output = None
        # prefill
        KLlamaModel.dynamic_sdpa.remaining_length = q_len
        while cur_idx < q_len:
            print(f'current prefill length: {cur_idx}')
            chunk_mask = None
            if inputs_embeds.device.type == 'cpu':
                tmp_inputs_embeds = inputs_embeds[:, cur_idx : min(cur_idx + chunck_size, q_len)].to("cuda")
            else:
                tmp_inputs_embeds = inputs_embeds[:, cur_idx : min(cur_idx + chunck_size, q_len)]
            output_with_past = self.forward_chunk(
                tmp_inputs_embeds,
                chunk_mask,
                position_ids[:, cur_idx : min(cur_idx + chunck_size, q_len)],
                past_key_values,
                output_attentions,
                use_cache,
                cache_position[cur_idx : min(cur_idx + chunck_size, q_len)],
            )
            cur_output = output_with_past.last_hidden_state
            KLlamaModel.dynamic_sdpa.remaining_length -= (
                min(cur_idx + chunck_size, q_len) - cur_idx
            )
            cur_idx += chunck_size
            # if attn_output is None:
            attn_output = cur_output
            # else:
            #     attn_output = torch.cat((attn_output, cur_output), dim=-2)

        KLlamaModel.dynamic_sdpa.calc_anchor(cache_position[-1] + 1)
        KLlamaModel.dynamic_sdpa.clear_importance(cache_position[-1] + 1)
        return BaseModelOutputWithPast(last_hidden_state=attn_output)

    def forward_chunk(
        self,
        inputs_embeds,
        causal_mask,
        position_ids,
        past_key_values,
        output_attentions,
        use_cache,
        cache_position,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):

        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_legacy_cache = False
        if use_cache and not isinstance(
            past_key_values, Cache
        ):  # kept for BC (non `Cache` `past_key_values` inputs)
            return_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None
        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = (
            past_key_values.get_seq_length() if past_key_values is not None else 0
        )
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and not using_static_cache
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            if attention_mask.max() != 0:
                raise ValueError(
                    "Custom 4D attention mask should be passed in inverted form with max==0`"
                )
            causal_mask = attention_mask
        else:
            causal_mask = torch.full(
                (sequence_length, target_length),
                fill_value=min_dtype,
                dtype=dtype,
                device=device,
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(
                target_length, device=device
            ) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(
                input_tensor.shape[0], 1, -1, -1
            )
            if attention_mask is not None:
                causal_mask = (
                    causal_mask.clone()
                )  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = (
                    causal_mask[:, :, :, :mask_length]
                    + attention_mask[:, None, None, :]
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[
                    :, :, :, :mask_length
                ].masked_fill(padding_mask, min_dtype)
        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(
                causal_mask, min_dtype
            )

        return causal_mask


QWEN3MOE_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        output_router_logits (`bool`, *optional*):
            Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
            should not be returned during inference.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""

class KQwen3MoeModel(BaseInjectedModule):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen3MoeDecoderLayer`]

    Args:
        config: Qwen3MoeConfig
    """

    def __init__(
            self,
            key: str,
            gguf_loader: GGUFLoader,
            config: PretrainedConfig,
            orig_module: nn.Module,
            device: str = "cuda",
            per_layer_prefill_intput_threshold: int = 30000,  # if None, no per-layer prefill
            transfer_map: dict = None,
            **kwargs,
    ):
        BaseInjectedModule.__init__(
            self, key, gguf_loader, config, orig_module, device, **kwargs
        )
        self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
        self.transfer_map = transfer_map
        self.stream_device_map = dict()
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.rotary_emb = Qwen2MoeRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    @add_start_docstrings_to_model_forward(QWEN3MOE_INPUTS_DOCSTRING)
    def forward(
            self,
            input_ids: torch.LongTensor = None,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            past_key_values: Optional[List[torch.FloatTensor]] = None,
            inputs_embeds: Optional[torch.FloatTensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            output_router_logits: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            cache_position: Optional[torch.LongTensor] = None,
            per_layer_prefill_intput_threshold: (
                    int | None
            ) = None,  # if None or 0, close per-layer prefill
    ) -> Union[Tuple, MoeModelOutputWithPast]:
        # print(f'Total length of input_ids: {input_ids.size(1)}, {input_ids.size()}')

        if per_layer_prefill_intput_threshold is None:
            per_layer_prefill_intput_threshold = self.per_layer_prefill_intput_threshold
        per_layer_prefill_flag = False
        seq_lenth = (
            inputs_embeds.size(1) if inputs_embeds is not None else input_ids.size(1)
        )
        if (
                per_layer_prefill_intput_threshold
                and per_layer_prefill_intput_threshold < seq_lenth
        ):
            per_layer_prefill_flag = True
            for layer in self.layers:
                self.load_layer_to(layer, InferenceState.UNLOAD)
        else:
            pass
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_router_logits = (
            output_router_logits
            if output_router_logits is not None
            else self.config.output_router_logits
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()
        # use_legacy_cache = False
        # if use_cache and not isinstance(past_key_values, Cache):
        #     use_legacy_cache = True
        #     past_key_values = DynamicCache.from_legacy_cache(past_key_values)
        #     logger.warning_once(
        #         "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
        #         "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
        #     )

        if inputs_embeds is None:
            input_ids = input_ids.to("cpu")
            inputs_embeds = self.embed_tokens(input_ids)
            inputs_embeds = inputs_embeds.to("cuda")

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask,
            inputs_embeds,
            cache_position,
            past_key_values,
            output_attentions,
        )

        hidden_states = inputs_embeds

        # position_embeddings = self.rotary_emb(hidden_states, position_ids)
        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None
        # next_decoder_cache = None

        for i, decoder_layer in enumerate(self.layers):
            # if self.transfer_map is not None and i in self.transfer_map:
            #     prev_stream = torch.cuda.current_stream()
            #     cur_device = self.transfer_map[i]
            #     if cur_device not in self.stream_device_map:
            #         self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
            #     torch.cuda.set_device(cur_device)
            #     self.stream_device_map[cur_device].wait_stream(prev_stream)
            #     torch.cuda.set_stream(self.stream_device_map[cur_device])
            #     hidden_states = hidden_states.to(
            #         self.transfer_map[i], non_blocking=True
            #     )
            #     causal_mask = (
            #         causal_mask.to(self.transfer_map[i], non_blocking=True)
            #         if causal_mask is not None
            #         else None
            #     )
            #     position_ids = (
            #         position_ids.to(self.transfer_map[i], non_blocking=True)
            #         if position_ids is not None
            #         else None
            #     )
            #     cache_position = (
            #         cache_position.to(self.transfer_map[i], non_blocking=True)
            #         if cache_position is not None
            #         else None
            #     )

            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    output_router_logits,
                    use_cache,
                    cache_position,
                    # position_embeddings,
                )
            else:
                if per_layer_prefill_flag:
                    # print(f"to gpu")
                    self.load_layer_to(decoder_layer, InferenceState.PREFILL)
                    torch.cuda.empty_cache()
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    output_router_logits=output_router_logits,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    # position_embeddings=position_embeddings,
                )
                if per_layer_prefill_flag:
                    # print(f"to cpu")
                    self.load_layer_to(decoder_layer, InferenceState.UNLOAD)
                    torch.cuda.empty_cache()
            hidden_states = layer_outputs[0]
            # use_cache=False
            # if use_cache:
            #     next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits and layer_outputs[-1] is not None:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        if per_layer_prefill_flag:
            per_layer_prefill_flag = False
            for layer in self.layers:
                self.load_layer_to(layer, InferenceState.GENERATE)
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        # next_cache = None
        # if use_cache:
        #     next_cache = (
        #         next_decoder_cache.to_legacy_cache()
        #         if use_legacy_cache
        #         else next_decoder_cache
        #     )

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    past_key_values,
                    all_hidden_states,
                    all_self_attns,
                    all_router_logits,
                ]
                if v is not None
            )
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )

    def load_layer_to(self, layer: Qwen3MoeDecoderLayer, target: InferenceState):
        assert isinstance(
            layer, Qwen3MoeDecoderLayer
        ), "module should be nn.ModuleList of decoder layers"

        # TODO Support restore to original device, not only cuda
        device = "cpu" if target == InferenceState.UNLOAD else "cuda"

        # attn
        layer.self_attn.q_proj.set_inference_mode(target)
        layer.self_attn.k_proj.set_inference_mode(target)
        layer.self_attn.v_proj.set_inference_mode(target)
        layer.self_attn.o_proj.set_inference_mode(target)
        layer.self_attn.rotary_emb = layer.self_attn.rotary_emb.to(device)

        # mlp
        if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock):
            layer.mlp.gate.set_inference_mode(target)
            layer.mlp.experts.set_inference_mode(target)
            layer.mlp.shared_expert.gate_proj.set_inference_mode(target)
            layer.mlp.shared_expert.up_proj.set_inference_mode(target)
            layer.mlp.shared_expert.down_proj.set_inference_mode(target)
            layer.mlp.shared_expert.act_fn.to(device)
            layer.mlp.shared_expert_gate.to(device)
        else:
            layer.mlp.gate_proj.set_inference_mode(target)
            layer.mlp.up_proj.set_inference_mode(target)
            layer.mlp.down_proj.set_inference_mode(target)
            layer.mlp.act_fn.to(device)
        # layer norm
        layer.input_layernorm.to(device)
        layer.post_attention_layernorm.to(device)


================================================
FILE: kt-sft/ktransformers/operators/triton_attention.py
================================================
# Adapted from
# https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
# which was originally adapted from
# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py
# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage2.py

import triton
import triton.language as tl
from ktransformers.util.vendors import device_manager, get_device, to_device, GPUVendor
@triton.jit
def tanh(x):
    # Tanh is just a scaled sigmoid
    return 2 * tl.sigmoid(2 * x) - 1

@triton.jit
def _fwd_grouped_kernel_stage1(
    Q,
    K_Buffer,
    V_Buffer,
    sm_scale,
    Req_to_tokens,
    B_Seqlen,
    Att_Out,
    stride_req_to_tokens_b,
    stride_qbs,
    stride_qh,
    stride_buf_kbs,
    stride_buf_kh,
    stride_buf_vbs,
    stride_buf_vh,
    stride_mid_ob,
    stride_mid_oh,
    stride_mid_os,
    kv_group_num: tl.constexpr,
    q_head_num: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    BLOCK_DPE: tl.constexpr,
    BLOCK_DV: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_H: tl.constexpr,
    NUM_KV_SPLITS: tl.constexpr,
    PAGE_SIZE: tl.constexpr,
    logit_cap: tl.constexpr,
    Lk: tl.constexpr,
    Lv: tl.constexpr,
):
    cur_batch = tl.program_id(0)
    cur_head_id = tl.program_id(1)
    cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H)
    split_kv_id = tl.program_id(2)

    if kv_group_num > BLOCK_H:
        VALID_BLOCK_H: tl.constexpr = BLOCK_H
    else:
        VALID_BLOCK_H: tl.constexpr = kv_group_num
    cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H)
    mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H
    mask_h = mask_h & (cur_head < q_head_num)

    offs_d = tl.arange(0, BLOCK_DMODEL)
    offs_dv = tl.arange(0, BLOCK_DV)
    mask_d = offs_d < Lk
    mask_dv = offs_dv < Lv
    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
    cur_batch_req_idx = cur_batch

    offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[
        None, :]
    q = tl.load(Q + offs_q,
                mask=(mask_h[:, None]) & (mask_d[None, :]),
                other=0.0)

    if BLOCK_DPE > 0:
        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
        mask_dpe = offs_dpe < Lk
        off_qpe = (cur_batch * stride_qbs + cur_head[:, None] * stride_qh +
                   offs_dpe[None, :])
        qpe = tl.load(Q + off_qpe,
                      mask=(mask_h[:, None]) & (mask_dpe[None, :]),
                      other=0.0)

    kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
    split_kv_start = kv_len_per_split * split_kv_id
    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
                              cur_batch_seq_len)
    
    e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf")
    e_sum = tl.zeros([BLOCK_H], dtype=tl.float32)
    acc = tl.zeros([BLOCK_H, BLOCK_DV], dtype=tl.float32)

    if split_kv_end > split_kv_start:
        for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
            offs_n = start_n + tl.arange(0, BLOCK_N)
            kv_page_number = tl.load(
                Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx +
                offs_n // PAGE_SIZE,
                mask=offs_n < split_kv_end,
                other=0,
            )
            kv_loc = kv_page_number * PAGE_SIZE + offs_n % PAGE_SIZE
            offs_buf_k = (kv_loc[None, :] * stride_buf_kbs +
                          cur_kv_head * stride_buf_kh + offs_d[:, None])
            k = tl.load(
                K_Buffer + offs_buf_k,
                mask=(offs_n[None, :] < split_kv_end) & (mask_d[:, None]),
                other=0.0,
            )
            qk = tl.dot(q, k.to(q.dtype))
            
            if BLOCK_DPE > 0:
                offs_buf_kpe = (kv_loc[None, :] * stride_buf_kbs +
                                cur_kv_head * stride_buf_kh +
                                offs_dpe[:, None])
                kpe = tl.load(
                    K_Buffer + offs_buf_kpe,
                    mask=(offs_n[None, :] < split_kv_end) &
                    (mask_dpe[:, None]),
                    other=0.0,
                )
                qk += tl.dot(qpe, kpe.to(qpe.dtype))
            qk *= sm_scale

            if logit_cap > 0:
                qk = logit_cap * tanh(qk / logit_cap)

            qk = tl.where(mask_h[:, None] & (offs_n[None, :] < split_kv_end),
                          qk, float("-inf"))

            offs_buf_v = (kv_loc[:, None] * stride_buf_vbs +
                          cur_kv_head * stride_buf_vh + offs_dv[None, :])
            v = tl.load(
                V_Buffer + offs_buf_v,
                mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]),
                other=0.0,
            )

            n_e_max = tl.maximum(tl.max(qk, 1), e_max)
            re_scale = tl.exp(e_max - n_e_max)
            p = tl.exp(qk - n_e_max[:, None])
            acc *= re_scale[:, None]
            acc += tl.dot(p.to(v.dtype), v)

            e_sum = e_sum * re_scale + tl.sum(p, 1)
            e_max = n_e_max

        offs_mid_o = (cur_batch * stride_mid_ob +
                      cur_head[:, None] * stride_mid_oh +
                      split_kv_id * stride_mid_os + offs_dv[None, :])

        tl.store(
            Att_Out + offs_mid_o,
            acc / e_sum[:, None],
            mask=(mask_h[:, None]) & (mask_dv[None, :]),
        )

        offs_mid_o_1 = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh +
                        split_kv_id * stride_mid_os + Lv)

        tl.store(
            Att_Out + offs_mid_o_1,
            e_max + tl.log(e_sum),
            mask=mask_h,
        )

def _decode_grouped_att_m_fwd(
    q,
    k_buffer,
    v_buffer,
    att_out,
    Req_to_tokens,
    B_Seqlen,
    num_kv_splits,
    sm_scale,
    page_size,
    logit_cap,
):
    BLOCK = 32
    Lk = k_buffer.shape[-1]
    Lv = v_buffer.shape[-1]

    # [TODO] work around shmem limit on MI3xx
    
    # TODO: support hip
    if device_manager.gpu_vendor == GPUVendor.AMD and Lk >= 576:
       BLOCK = 16

    if Lk == 576:
        BLOCK_DMODEL = 512
        BLOCK_DPE = 64
    elif Lk == 288:
        BLOCK_DMODEL = 256
        BLOCK_DPE = 32
    else:
        BLOCK_DMODEL = triton.next_power_of_2(Lk)
        BLOCK_DPE = 0
    BLOCK_DV = triton.next_power_of_2(Lv)

    batch, head_num = q.shape[0], q.shape[1]
    kv_group_num = q.shape[1] // k_buffer.shape[-2]

    BLOCK_H = 16
    NUM_KV_SPLITS = num_kv_splits
    grid = (
        batch,
        triton.cdiv(head_num, min(BLOCK_H, kv_group_num)),
        NUM_KV_SPLITS,
    )

    extra_kargs = {}
    # TODO: support hip
    """
    if is_hip_:
        # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
        # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
        extra_kargs = {
            "waves_per_eu": 4,
            "matrix_instr_nonkdim": 16,
            "kpack": 2
        }
    """
    
    _fwd_grouped_kernel_stage1[grid](
        q,
        k_buffer,
        v_buffer,
        sm_scale,
        Req_to_tokens,
        B_Seqlen,
        att_out,
        Req_to_tokens.stride(0),
        q.stride(0),
        q.stride(1),
        k_buffer.stride(-3),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
        k_buffer.stride(-2),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
        v_buffer.stride(-3),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
        v_buffer.stride(-2),  # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM)
        att_out.stride(0),
        att_out.stride(1),
        att_out.stride(2),
        kv_group_num=kv_group_num,
        q_head_num=head_num,
        BLOCK_DMODEL=BLOCK_DMODEL,
        BLOCK_DPE=BLOCK_DPE,
        BLOCK_DV=BLOCK_DV,
        BLOCK_N=BLOCK,
        BLOCK_H=BLOCK_H,
        NUM_KV_SPLITS=NUM_KV_SPLITS,
        PAGE_SIZE=page_size,
        logit_cap=logit_cap,
        num_warps=4,
        num_stages=2,
        Lk=Lk,
        Lv=Lv,
        **extra_kargs,
    )

@triton.jit
def _fwd_kernel_stage2(
    Mid_O,
    o,
    B_Seqlen,
    stride_mid_ob,
    stride_mid_oh,
    stride_mid_os,
    stride_obs,
    stride_oh,
    NUM_KV_SPLITS: tl.constexpr,
    BLOCK_DV: tl.constexpr,
    Lv: tl.constexpr,
):
    cur_batch = tl.program_id(0)
    cur_head = tl.program_id(1)

    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)

    offs_d = tl.arange(0, BLOCK_DV)
    mask_d = offs_d < Lv

    e_sum = 0.0
    e_max = -float("inf")
    acc = tl.zeros([BLOCK_DV], dtype=tl.float32)

    offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d
    offs_logic = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + Lv

    for split_kv_id in range(0, NUM_KV_SPLITS):
        kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
        split_kv_start = kv_len_per_split * split_kv_id
        split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
                                  cur_batch_seq_len)

        if split_kv_end > split_kv_start:
            tv = tl.load(Mid_O + offs_v + split_kv_id * stride_mid_os,
                         mask=mask_d,
                         other=0.0)
            tlogic = tl.load(Mid_O + offs_logic + split_kv_id * stride_mid_os)
            n_e_max = tl.maximum(tlogic, e_max)

            old_scale = tl.exp(e_max - n_e_max)
            acc *= old_scale
            exp_logic = tl.exp(tlogic - n_e_max)
            acc += exp_logic * tv

            e_sum = e_sum * old_scale + exp_logic
            e_max = n_e_max

    tl.store(
        o + cur_batch * stride_obs + cur_head * stride_oh + offs_d,
        acc / e_sum,
        mask=mask_d,
    )

def _decode_softmax_reducev_fwd(
    logits,
    q,
    o,
    v_buffer,
    b_seq_len,
    num_kv_splits,
):
    batch, head_num = q.shape[0], q.shape[1]
    Lv = v_buffer.shape[-1]
    BLOCK_DV = triton.next_power_of_2(Lv)

    NUM_KV_SPLITS = num_kv_splits

    extra_kargs = {}
    # TODO: support hip
    """
    if is_hip_:
        # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
        # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
        extra_kargs = {
            "waves_per_eu": 4,
            "matrix_instr_nonkdim": 16,
            "kpack": 2
        }
    """
    
    grid = (batch, head_num)
    _fwd_kernel_stage2[grid](
        logits,
        o,
        b_seq_len,
        logits.stride(0),
        logits.stride(1),
        logits.stride(2),
        o.stride(0),
        o.stride(1),
        NUM_KV_SPLITS=NUM_KV_SPLITS,
        BLOCK_DV=BLOCK_DV,
        Lv=Lv,
        num_warps=4,
        num_stages=2,
        **extra_kargs,
    )

def decode_attention_fwd_grouped(
    q,
    k_buffer,
    v_buffer,
    o,
    req_to_token,
    b_seq_len,
    attn_logits,
    num_kv_splits,
    sm_scale,
    page_size,
    logit_cap=0.0,
):
    _decode_grouped_att_m_fwd(
        q,
        k_buffer,
        v_buffer,
        attn_logits,
        req_to_token,
        b_seq_len,
        num_kv_splits,
        sm_scale,
        page_size,
        logit_cap,
    )

    _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, b_seq_len,
                                num_kv_splits)


================================================
FILE: kt-sft/ktransformers/operators/triton_attention_prefill.py
================================================

# Adapted from
# https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/prefill_attention.py
# which was originally adapted from
# https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1

"""
Memory-efficient attention for prefill.
It supporst page size = 1.
"""

# Adapted from
# https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1
import torch
import triton
import triton.language as tl

is_cuda_available = torch.cuda.is_available()
if is_cuda_available:
    CUDA_CAPABILITY = torch.cuda.get_device_capability()


@triton.jit
def _fwd_kernel(
    Q,
    K,
    V,
    sm_scale,
    B_Start_Loc,
    B_Seqlen,
    Out,
    stride_qbs,
    stride_qh,
    stride_kbs,
    stride_kh,
    stride_vbs,
    stride_vh,
    stride_obs,
    stride_oh,
    kv_group_num: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    BLOCK_N: tl.constexpr,
    IS_CAUSAL: tl.constexpr,
    Lk: tl.constexpr,
):
    cur_batch = tl.program_id(0)
    cur_head = tl.program_id(1)
    start_m = tl.program_id(2)

    cur_kv_head = cur_head // kv_group_num

    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
    cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)

    block_start_loc = BLOCK_M * start_m

    # initialize offsets
    offs_n = tl.arange(0, BLOCK_N)
    offs_d = tl.arange(0, BLOCK_DMODEL)
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    off_q = (
        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs
        + cur_head * stride_qh
        + offs_d[None, :]
    )
    off_k = offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None]
    off_v = offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :]

    mask_d = offs_d < Lk

    q = tl.load(
        Q + off_q,
        mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :]),
        other=0.0,
    )

    k_ptrs = K + off_k
    v_ptrs = V + off_v

    # initialize pointer to m and l
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)

    block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0)

    end_n = (
        cur_batch_seq_len
        if not IS_CAUSAL
        else tl.minimum((start_m + 1) * BLOCK_M, cur_batch_seq_len)
    )
    for start_n in range(0, block_mask * end_n, BLOCK_N):
        start_n = tl.multiple_of(start_n, BLOCK_N)
        # -- compute qk ----
        k = tl.load(
            k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs,
            mask=((start_n + offs_n[None, :]) < cur_batch_seq_len) & (mask_d[:, None]),
            other=0.0,
        )
        # mask = tl.load(mask_ptrs + start_n, mask=start_n + offs_n < cur_batch_end_loc, other=0.0)

        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk += tl.dot(q, k)
        qk *= sm_scale

        if IS_CAUSAL:
            qk += tl.where(
                (start_n + offs_n[None, :] < cur_batch_seq_len)
                & (offs_m[:, None] >= (start_n + offs_n[None, :])),
                0,
                float("-inf"),
            )
        else:
            qk += tl.where(
                (start_n + offs_n[None, :]) < cur_batch_seq_len, 0, float("-inf")
            )

        # -- compute m_ij, p, l_ij
        m_ij = tl.max(qk, 1)
        p = tl.exp(qk - m_ij[:, None])
        l_ij = tl.sum(p, 1)
        # -- update m_i and l_i
        m_i_new = tl.maximum(m_i, m_ij)
        alpha = tl.exp(m_i - m_i_new)
        beta = tl.exp(m_ij - m_i_new)
        l_i_new = alpha * l_i + beta * l_ij
        # -- update output accumulator --
        # scale p
        p_scale = beta / l_i_new
        p = p * p_scale[:, None]
        # scale acc
        acc_scale = l_i / l_i_new * alpha
        acc = acc * acc_scale[:, None]
        # update acc
        v = tl.load(
            v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs,
            mask=((start_n + offs_n[:, None]) < cur_batch_seq_len) & (mask_d[None, :]),
            other=0.0,
        )

        p = p.to(v.dtype)
        acc += tl.dot(p, v)
        # update m_i and l_i
        l_i = l_i_new
        m_i = m_i_new
    # initialize pointers to output
    off_o = (
        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs
        + cur_head * stride_oh
        + offs_d[None, :]
    )
    out_ptrs = Out + off_o
    tl.store(
        out_ptrs, acc, mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :])
    )


def context_attention_fwd(
    q, k, v, o, b_start_loc, b_seq_len, max_input_len, is_causal=True
):
    """
    q, k, v: [b * s, head, head_dim]
    b_start_loc: [b]
    b_seq_len: [b]
    out: [b * s, head, head_dim]
    """
    if is_cuda_available and CUDA_CAPABILITY[0] > 8:
        BLOCK = 128
    else:
        BLOCK = 64

    Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]

    sm_scale = 1.0 / (Lq**0.5)
    batch, head = b_seq_len.shape[0], q.shape[1]
    kv_group_num = q.shape[1] // k.shape[1]

    grid = (batch, head, triton.cdiv(max_input_len, BLOCK))
    num_warps = 4 if Lk <= 64 else 8

    _fwd_kernel[grid](
        q,
        k,
        v,
        sm_scale,
        b_start_loc,
        b_seq_len,
        o,
        q.stride(0),
        q.stride(1),
        k.stride(0),
        k.stride(1),
        v.stride(0),
        v.stride(1),
        o.stride(0),
        o.stride(1),
        kv_group_num=kv_group_num,
        BLOCK_M=BLOCK,
        BLOCK_DMODEL=triton.next_power_of_2(Lk),
        BLOCK_N=BLOCK,
        IS_CAUSAL=is_causal,
        num_warps=num_warps,
        num_stages=1,
        Lk=Lk,
    )

================================================
FILE: kt-sft/ktransformers/optimize/optimize.py
================================================
'''
Description  :  
Author       : Boxin Zhang, Azure-Tang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
from typing import Mapping, List
import torch
import yaml
import re
from torch import nn
from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
# from operators import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader, ModelLoaderFactory
from ktransformers.util.utils import set_module, load_weights
import itertools
import copy

def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader:GGUFLoader, prefix=''):
    for name, child in module._modules.items():
        if child is not None:
            child_prefix = prefix + name
            if child_prefix in local_optimization_dict:
                inject_module_meta=local_optimization_dict[child_prefix]
                if inject_module_meta["class"] != "default":
                    import_path = inject_module_meta["class"].split(".")
                    import_module_name = ".".join(import_path[:-1])
                    gguf_loader.tensor_device_map[inject_module_meta["key"]] = inject_module_meta["kwargs"] if "kwargs" in inject_module_meta else dict()
                    import_class_name = import_path[-1]
                    module_cls=getattr(__import__(import_module_name, fromlist=[""]), import_class_name)
                    print(f"Injecting {child_prefix} as", import_module_name, ".", import_class_name)
                    inject_module=module_cls(key = inject_module_meta["key"], gguf_loader = gguf_loader, config = model_config, orig_module=child, **inject_module_meta["kwargs"])
                    set_module(module, name, inject_module)
                elif inject_module_meta["class"] == "default":
                    print(f"Injecting {child_prefix} as default")
                    gguf_loader.tensor_device_map[inject_module_meta["key"]] = inject_module_meta["kwargs"] if "kwargs" in inject_module_meta else dict()
                else:
                    raise Exception("inject_module_meta[\"class\"] must be \"default\" or a class path")
                child_prefix += "."
                child_optimization_dict = {k: v for k, v in local_optimization_dict.items() if k.startswith(child_prefix)}
                inject(child, child_optimization_dict, model_config, gguf_loader, child_prefix)

def del_meta(module:nn.Module):
    #print("default loading weights", prefix)
    persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
    local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
    local_state = {k: v for k, v in local_name_params if v is not None}
    for name, param in local_state.items():
        if param.device == "meta" or param.device == torch.device("meta"):
            module.__delattr__(name)
    for name, child in module._modules.items():
        del_meta(child)

def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, prefix: str="", default_device: str = "cuda:0"):
    module_name = prefix[:-1]
    # translated_name = translate_name_to_gguf(prefix)[:-1]
    #print("gen_optimize_config", prefix, module_name, translated_name)
    recursive = True
    for rule in rule_list:
        match_meta = rule["match"]
        if "class" not in match_meta and "name" not in match_meta:
            raise Exception("match must have at least one of \"class\" and \"name\"")
        if "class" in match_meta:
            import_path = match_meta["class"].split(".")
            import_module_name = ".".join(import_path[:-1])
            import_class_name = import_path[-1]
            module_cls=getattr(__import__(import_module_name, fromlist=[""]), import_class_name)
            if not isinstance(module, module_cls):
                continue
        if "name" in match_meta:
            if re.search(match_meta["name"], module_name) is None:
                continue
        if "replace" not in rule:
            raise Exception("replace must be in rule")
        if "replace" in rule:
            replace_meta = rule["replace"]
            if module_name not in out_data:
                out_data[module_name]={"key": module_name,
                                    "class": replace_meta["class"] if "class" in replace_meta else "default",
                                    # "device": replace_meta["device"] if "device" in replace_meta else default_device,
                                    "kwargs": copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict()}
            else:
                if out_data[module_name]["class"] == "default":
                    out_data[module_name]["class"] = replace_meta["class"] if "class" in replace_meta else "default"
                out_data[module_name]["kwargs"].update(copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict())
        if "recursive" in rule:
            recursive = bool(rule["recursive"])
        break
            
    if module_name not in out_data:
        out_data[module_name]= {
            "class": "default",
            "key": module_name,
            "kwargs": {"generate_device": default_device,
                       "prefill_device": default_device}
        }

    #print(out_data[module_name])
    #input()

    if recursive:
        for name, child in module._modules.items():
            if child is not None:
                child_prefix = prefix + name + "."
                gen_optimize_config(child, out_data, rule_list, child_prefix, default_device = default_device)
    

def translate_model_config(model_config: PretrainedConfig):
    # for supporting some special model 
    if model_config.model_type == "mixtral":
        model_config.moe_intermediate_size = model_config.intermediate_size
    
    return model_config


def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, model_config: PretrainedConfig, default_device: str = "cuda:0"):
    with open(rule_file, 'r', encoding='utf-8') as f:
        rule_list = yaml.load(f.read(), Loader=yaml.FullLoader)
    
    optimize_config = dict()
    gen_optimize_config(module, optimize_config, rule_list, default_device = default_device)
    
    model_config = translate_model_config(model_config)

    weights_loader = ModelLoaderFactory.create_loader(gguf_path)
    with torch.device("meta"):
        inject(module, optimize_config, model_config, weights_loader)
    # pre load lm_head because its big inter result
    load_weights(module.lm_head, weights_loader, "lm_head.", device=default_device)
    load_weights(module, weights_loader, device=default_device)
    module.gguf_loader = weights_loader
    del_meta(module)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.xpu.is_available():
        torch.xpu.empty_cache()


================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.([0-9]|[1][0-4])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

- match:
    name: "^model\\.layers\\.([0-9]|[1][0-4])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([0-9]|[1][0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

- match:
    name: "^model\\.layers\\.([0-9]|[1][0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:2"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:2"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:3"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:3"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([0-9]|[1][0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        15: "cuda:1"
        30: "cuda:2"
        45: "cuda:3"

- match:
    name: "^model\\.layers\\.([0-9]|[1][0-4])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "(^model\\.layers\\.([2][0-9]|[1][5-9])\\.)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "(^model\\.layers\\.([3][0-9]|[4][0-4])\\.)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
      
- match:
    name: "(^model\\.layers\\.([5][0-9]|[4][5-9])\\.)|(^model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([345][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
  
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        30: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "(^model\\.layers\\.([345][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\.(?!self_attn).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([12][0-9])\\.(?!self_attn).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
  
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        10: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([12][0-9])\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
  
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KSFTExpertsCPU"
      out_device: "cuda:0"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KSFTExpertsCPU"
      out_device: "cuda:1"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        10: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-use-adapter.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-amx.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve-amx.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoEV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "llamafile"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.flashinfer_attn # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
  replace:
    class: ktransformers.operators.layernorm.RMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
  replace:
    class:  ktransformers.operators.mlp.kDeepseekV3MLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoEV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.flashinfer_attn # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
  replace:
    class: ktransformers.operators.layernorm.RMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
  replace:
    class:  ktransformers.operators.mlp.kDeepseekV3MLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

# === Rotary Embedding Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === Linear Layers Replacement (excluding self_attn.kv_b_proj) ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# === MLP (MoE) Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === MLP Gate Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === MLP Experts Replacement ===
# replace with marlin expert. Open and modify layer-num as needed.
# Each layer of malin experts takes about 6GB of GPU memory.
# !!!Do remember 'close' cuda graph if you are using marlin expert.!!!
# !!!KExpertsTorch is untested, we don't have enough VRAM.!!!

# GPU 0: layers 3–4
# - match:
#     name: "^model\\.layers\\.([3-4])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:0"
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 1: layers 15–17
# - match:
#     name: "^model\\.layers\\.(1[5-7])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:1"
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 2: layers 30–32
# - match:
#     name: "^model\\.layers\\.(3[0-2])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:2"
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 3: layers 45–46
# - match:
#     name: "^model\\.layers\\.(4[5-6])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:3"
#       generate_op:  "KExpertsMarlin"
#   recursive: False


# === MLP Experts Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:2"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:2"
  recursive: False

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:3"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:3"
  recursive: False

# === Self-Attention Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      absorb_for_prefill: False

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      absorb_for_prefill: False

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      absorb_for_prefill: False

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      absorb_for_prefill: False

# === Overall Model Replacement with Transfer Map ===

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 means close layer‐wise prefill
      transfer_map:
        15: "cuda:1" # Layers 15+ on GPU 1
        30: "cuda:2" # Layers 30+ on GPU 2
        45: "cuda:3" # Layers 45+ on GPU 3

# === Default Catch-All for Other Modules ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# For final modules (model.norm), ensure they are on GPU 3 (as in your original config)
- match:
    name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"


================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

# === Rotary Embedding Replacement ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.([3][2-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:4"
      prefill_device: "cuda:4"

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:5"
      prefill_device: "cuda:5"

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:6"
      prefill_device: "cuda:6"

# GPU 7: layers 56–60
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"


# === Linear Layers Replacement (excluding self_attn.kv_b_proj) ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.(3[2-9])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:4"
      prefill_device: "cuda:4"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:5"
      prefill_device: "cuda:5"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:6"
      prefill_device: "cuda:6"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# GPU 7: layers 56–63
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"


# === MLP (MoE) Replacement ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.(3[2-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:4"
      prefill_device: "cuda:4"

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:5"
      prefill_device: "cuda:5"

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:6"
      prefill_device: "cuda:6"

# GPU 7: layers 56–60
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"

# === MLP Gate Replacement ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.(3[2-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:4"
      prefill_device: "cuda:4"

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:5"
      prefill_device: "cuda:5"

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:6"
      prefill_device: "cuda:6"

# GPU 7: layers 56–60
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"


# === MLP Experts Replacement ===
# replace with marlin expert. Open and modify layer-num as needed.
# Each layer of malin experts takes about 6GB of GPU memory.
# !!!Do remember 'close' cuda graph if you are using marlin expert.!!!
# !!!Loading marlin expert will take signifcant time.!!!

# GPU 0: layers 0–7
# - match:
#     name: "^model\\.layers\\.([0-7])\\.mlp\\.experts$" # inject experts in layer 0~4 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts  
#     kwargs:
#       generate_device: "cuda:0"
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 1: layers 8–15
# - match:
#     name: "^model\\.layers\\.([8-9]|1[0-5)\\.mlp\\.experts$" # inject experts in layer 30~31 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:1"
#       generate_op:  "KExpertsMarlin"
#   recursive: False 

# # GPU 2: layers 16–23
# - match:
#     name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.experts$" # inject experts in layer 0~4 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts  
#     kwargs:
#       generate_device: "cuda:0" 
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 3: layers 24–31
# - match:
#     name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.experts$" # inject experts in layer 30~31 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:1"
#       generate_op:  "KExpertsMarlin"
#   recursive: False 

# # GPU 4: layers 32–39
# - match:
#     name: "^model\\.layers\\.(3[2-9])\\.mlp\\.experts$" # inject experts in layer 0~4 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts  
#     kwargs:
#       generate_device: "cuda:0" 
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 5: layers 40–47
# - match:
#     name: "^model\\.layers\\.(4[0-7])\\.mlp\\.experts$" # inject experts in layer 30~31 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:1"
#       generate_op:  "KExpertsMarlin"
#   recursive: False 

# # GPU 6: layers 48–55
# - match:
#     name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.experts$" # inject experts in layer 0~4 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts  
#     kwargs:
#       generate_device: "cuda:0"
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 7: layers 56–60
# - match:
#     name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.experts$" # inject experts in layer 30~31 as marlin expert
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:1"
#       generate_op:  "KExpertsMarlin"
#   recursive: False 


# === MLP Experts Replacement ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:2"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:2"
  recursive: False

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:3"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:3"
  recursive: False

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.(3[2-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:4"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:4"
  recursive: False

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:5"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:5"
  recursive: False

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:6"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:6"
  recursive: False

# GPU 7: layers 56–60
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:7"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:7"
  recursive: False


# === Self-Attention Replacement ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.(3[2-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:4"
      prefill_device: "cuda:4"

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:5"
      prefill_device: "cuda:5"

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:6"
      prefill_device: "cuda:6"

# GPU 7: layers 56–60
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"

# === Overall Model Replacement with Transfer Map ===

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 means close layer‐wise prefill
      transfer_map:
        8: "cuda:1"
        16: "cuda:2"
        24: "cuda:3"
        32: "cuda:4"
        40: "cuda:5"
        48: "cuda:6"
        56: "cuda:7"

# === Default Catch-All for Other Modules ===

# GPU 0: layers 0–7
- match:
    name: "^model\\.layers\\.([0-7])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 8–15
- match:
    name: "^model\\.layers\\.(8|9|1[0-5])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 16–23
- match:
    name: "^model\\.layers\\.(1[6-9]|2[0-3])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 24–31
- match:
    name: "^model\\.layers\\.(2[4-9]|3[0-1])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# GPU 4: layers 32–39
- match:
    name: "^model\\.layers\\.(3[2-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:4"
      prefill_device: "cuda:4"

# GPU 5: layers 40–47
- match:
    name: "^model\\.layers\\.(4[0-7])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:5"
      prefill_device: "cuda:5"

# GPU 6: layers 48–55
- match:
    name: "^model\\.layers\\.(4[8-9]|5[0-5])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:6"
      prefill_device: "cuda:6"

# GPU 7: layers 56–63
- match:
    name: "^model\\.layers\\.(5[6-9]|60)\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# For final modules (model.norm), ensure they are on GPU 7 (as in your original config)
- match:
    name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:7"
      prefill_device: "cuda:7"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"
  
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        30: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


- match:
    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
  
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-4])\\.mlp\\.experts$" # inject experts in layer 0~4 as marlin expert
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  
    kwargs:
      generate_device: "cuda:0" # run in cuda:0
      generate_op:  "KExpertsMarlin"
  recursive: False

- match:
    name: "^model\\.layers\\.([3][0])\\.mlp\\.experts$" # inject experts in layer 30~31 as marlin expert
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      generate_device: "cuda:1"
      generate_op:  "KExpertsMarlin"
  recursive: False 

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        30: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
  
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        30: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-serve.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoEV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.flashinfer_attn # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
  replace:
    class: ktransformers.operators.layernorm.RMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
  replace:
    class:  ktransformers.operators.mlp.kDeepseekV3MLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

# === Rotary Embedding Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === Linear Layers Replacement (excluding self_attn.kv_b_proj) ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

# === MLP (MoE) Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === MLP Gate Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === MLP Experts Replacement ===
# replace with marlin expert. Open and modify layer-num as needed.
# Each layer of malin experts takes about 6GB of GPU memory.
# !!!Do remember 'close' cuda graph if you are using marlin expert.!!!
# !!!KExpertsTorch is untested, we don't have enough VRAM.!!!

# GPU 0: layers 3–4
# - match:
#     name: "^model\\.layers\\.([3-4])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:0"
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 1: layers 15–17
# - match:
#     name: "^model\\.layers\\.(1[5-7])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:1"
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 2: layers 30–32
# - match:
#     name: "^model\\.layers\\.(3[0-2])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:2"
#       generate_op:  "KExpertsMarlin"
#   recursive: False

# # GPU 3: layers 45–46
# - match:
#     name: "^model\\.layers\\.(4[5-6])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:3"
#       generate_op:  "KExpertsMarlin"
#   recursive: False


# === MLP Experts Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:0"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:1"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:2"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:2"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:3"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:3"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False

# === Self-Attention Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      absorb_for_prefill: False

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      absorb_for_prefill: False

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      absorb_for_prefill: False

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      absorb_for_prefill: False

# === Overall Model Replacement with Transfer Map ===

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 means close layer‐wise prefill
      transfer_map:
        15: "cuda:1" # Layers 15+ on GPU 1
        30: "cuda:2" # Layers 30+ on GPU 2
        45: "cuda:3" # Layers 45+ on GPU 3

# === Default Catch-All for Other Modules ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

# For final modules (model.norm), ensure they are on GPU 3 (as in your original config)
- match:
    name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"


================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
================================================
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
  
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KSFTExpertsCPU"
      out_device: "cuda:0"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KSFTExpertsCPU"
      out_device: "cuda:1"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        30: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/Internlm2_5-7b-Chat-1m.yaml
================================================
- match:
    class: ktransformers.models.modeling_llama.LlamaRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbeddingV2
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"
- match:
    class: ktransformers.models.modeling_llama.LlamaModel
  replace:
    class: ktransformers.operators.models.KLlamaModel
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill

- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KLlamaAttention
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"


================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/Mixtral.yaml
================================================
- match:
    class: ktransformers.models.modeling_mixtral.MixtralRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*$"
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.block_sparse_moe$"
    class: ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock
  replace: 
    class: ktransformers.operators.experts.KMistralSparseMoEBlock
- match:
    name: "^model\\.layers\\..*\\.block_sparse_moe\\.experts$"
  replace: 
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\..*\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B-serve.yaml
================================================


- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoEV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.flashinfer_attn # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
  replace:
    class: ktransformers.operators.layernorm.RMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
  replace:
    class:  ktransformers.operators.mlp.kDeepseekV3MLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbeddingV4
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
# if want to use more VRAM, use experts Marlin and disable CUDA Graph(disable CUDA Graph may cause low performance)
#- match:
#    name: "^model\\.layers\\..*\\.mlp\\.experts$"
#  replace:
#    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
#    kwargs:
#      prefill_device: "cuda"
#      prefill_op: "KExpertsTorch"
#      generate_device: "cuda"
#      generate_op: "KExpertsMarlin"
#  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
================================================
- match:
    name: "^model\\.layers\\.([012])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([012])$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([012])\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock     # mlp module with custom forward function
- match:
    name: "^model\\.layers\\.([012])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    # device: "cpu"   # which devices to load this module when initializing
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock     # mlp module with custom forward function
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    # device: "cpu"   # which devices to load this module when initializing
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "(^model.norm)"
  replace:
    class: "default"
    kwargs:
        generate_device: "cuda:1"
        prefill_device: "cuda:1"

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map: 
        3: "cuda:1"

- match:
    name: "^model\\.layers\\.([012])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"


================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
================================================
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    # device: "cpu"   # which devices to load this module when initializing
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op:  "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"
- match:
    name: "^model\\.layers\\..*\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"


================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/Qwen2-serve-amx.yaml
================================================
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# - match:
#     name: "^model\\.layers\\..*$"  # regular expression 
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlockV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen2MoeAttention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KQwen2MoeRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeMLP
  replace:
    class:  ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/Qwen2-serve.yaml
================================================
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

# - match:
#     name: "^model\\.layers\\..*$"  # regular expression 
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlockV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen2MoeAttention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KQwen2MoeRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeMLP
  replace:
    class:  ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/Qwen3Moe-serve-amx.yaml
================================================
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

# - match:
#     name: "^model\\.layers\\..*$"  # regular expression 
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlockV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXBF16" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen3MoeAttention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KQwen3MoeRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeMLP
  replace:
    class:  ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/Qwen3Moe-serve.yaml
================================================
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"

# - match:
#     name: "^model\\.layers\\..*$"  # regular expression 
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlockV2     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen3MoeAttention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KQwen3MoeRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeMLP
  replace:
    class:  ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/Qwen3Moe-sft-amx.yaml
================================================
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

# - match:
#     name: "^model\\.layers\\..*$"  # regular expression 
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "KLinearTorch"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
  replace:
    class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlock     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "AMXInt8"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KQwen3MoeAttention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen3MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/rocm/DeepSeek-V3-Chat.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cpu"
      prefill_device: "cuda"
      generate_op: "KLinearCPUInfer"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearQ8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/xpu/DeepSeek-V2-Chat.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^model\\.layers\\..*"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
      generate_op: "KLinearIPEXLLM"
      prefill_op: "KLinearIPEXLLM"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "xpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "xpu"
  recursive: False # don't recursively inject submodules of this module
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2RMSNorm
  replace:
    class: ktransformers.operators.layernorm.KDeepseekRMSNormIPEXLLM
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      device: "xpu"
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/xpu/DeepSeek-V3-Chat.yaml
================================================
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
      generate_op: "KLinearIPEXLLM"
      prefill_op: "KLinearIPEXLLM"
- match:
    name: "^model\\.layers\\..*"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
      generate_op: "KLinearIPEXLLM"
      prefill_op: "KLinearIPEXLLM"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
  replace:
    class: ktransformers.operators.layernorm.KDeepseekRMSNormIPEXLLM
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGateIPEXLLM
    kwargs:
      generate_device: "xpu:0"
      prefill_device: "xpu:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "xpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "xpu"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

================================================
FILE: kt-sft/ktransformers/optimize/optimize_rules/xpu/Qwen3Moe-Chat.yaml
================================================
- match:
    name: "rotary_emb$"
  replace:
    class: ktransformers.operators.RoPE.KQwen3MoeRotaryEmbedding
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
      generate_op: "KLinearIPEXLLM"
      prefill_op: "KLinearIPEXLLM"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.gate).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
      generate_op: "KLinearIPEXLLM"
      prefill_op: "KLinearIPEXLLM"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: transformers.models.qwen3_moe.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlockV2     # mlp module with custom forward function
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2     # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "xpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "xpu"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KQwen3MoeAttentionIPEXLLM
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    class: transformers.models.qwen3_moe.modeling_qwen3_moe.Qwen3MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KDeepseekRMSNormIPEXLLM
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"
- match:
    class: transformers.models.qwen3_moe.modeling_qwen3_moe.Qwen3MoeMLP
  replace:
    class:  ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "xpu"
      prefill_device: "xpu"


================================================
FILE: kt-sft/ktransformers/server/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/server/api/__init__.py
================================================
from fastapi import APIRouter

from .ollama import router as ollama_router
from .openai import router as openai_router,post_db_creation_operations
from .web import router as web_router

router = APIRouter()
router.include_router(ollama_router)
router.include_router(openai_router)
router.include_router(web_router)


================================================
FILE: kt-sft/ktransformers/server/api/ollama/__init__.py
================================================
from fastapi import APIRouter

from .completions import router as completions_router

router = APIRouter()
router.include_router(completions_router)


================================================
FILE: kt-sft/ktransformers/server/api/ollama/completions.py
================================================
from datetime import datetime
from http.client import NOT_IMPLEMENTED
import json
from time import time
from uuid import uuid4
from typing import List, Optional

from fastapi import APIRouter, Request
from pydantic import BaseModel, Field

from ktransformers.server.config.config import Config
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import check_link_response
from ktransformers.server.backend.base import BackendInterfaceBase

from ktransformers.server.schemas.endpoints.chat import RawUsage

router = APIRouter(prefix='/api')

# https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion
class OllamaGenerateCompletionRequest(BaseModel):
    model: str = Field(..., description="The model name, which is required.")
    prompt: Optional[str] = Field(
        None, description="The prompt to generate a response for.")
    images: Optional[List[str]] = Field(
        None, description="A list of base64-encoded images for multimodal models such as llava.")
    # Advanced parameters
    format: Optional[str] = Field(
        None, description="The format to return a response in, accepted value is json.")
    options: Optional[dict] = Field(
        None, description="Additional model parameters as listed in the documentation.")
    system: Optional[str] = Field(
        None, description="System message to override what is defined in the Modelfile.")
    template: Optional[str] = Field(
        None, description="The prompt template to use, overriding what is defined in the Modelfile.")
    context: Optional[str] = Field(
        None, description="The context parameter from a previous request to keep a short conversational memory.")
    stream: Optional[bool] = Field(
        None, description="If false, the response will be returned as a single response object.")
    raw: Optional[bool] = Field(
        None, description="If true, no formatting will be applied to the prompt.")
    keep_alive: Optional[str] = Field(
        "5m", description="Controls how long the model will stay loaded into memory following the request.")

class OllamaGenerationStreamResponse(BaseModel):
    model: str
    created_at: str
    response: str
    done: bool = Field(...)

class OllamaGenerationResponse(BaseModel):
    model: str
    created_at: str
    response: str
    done: bool

@router.post("/generate", tags=['ollama'])
async def generate(request: Request, input: OllamaGenerateCompletionRequest):
    id = str(uuid4())
    interface: BackendInterfaceBase = get_interface()
    print(f'COMPLETION INPUT:----\n{input.prompt}\n----')
    config = Config()

    if input.stream:
        async def inner():
            async for res in interface.inference(input.prompt, id):
                if isinstance(res, RawUsage):
                    raw_usage = res
                else: 
                    token, finish_reason = res
                    d = OllamaGenerationStreamResponse(
                        model=config.model_name,
                        created_at=str(datetime.now()),
                        response=token,
                        done=False
                    )
                    yield d.model_dump_json() + '\n'
            d = OllamaGenerationStreamResponse(
                model=config.model_name,
                created_at=str(datetime.now()),
                response='',
                done=True
            )
            yield d.model_dump_json() + '\n'
        return check_link_response(request, inner())
    else:
        complete_response = ""
        async for res in interface.inference(input.prompt, id):
            if isinstance(res, RawUsage):
                raw_usage = res
            else: 
                token, finish_reason = res
                complete_response += token
        response = OllamaGenerationResponse(
            model=config.model_name,
            created_at=str(datetime.now()),
            response=complete_response,
            done=True
        )
        return response
    
# https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-chat-completion
class OllamaChatCompletionMessage(BaseModel):
    role: str
    content: str

class OllamaChatCompletionRequest(BaseModel):
    model: str = Field(..., description="The model name, which is required.")
    messages: List[OllamaChatCompletionMessage] = Field(
        ..., description="A list of messages to generate a response for.")
    stream: bool = Field(True, description="If true, the response will be streamed.")

class OllamaChatCompletionStreamResponse(BaseModel):
    model: str
    created_at: str
    message: dict
    done: bool = Field(...)
    done_reason: Optional[str] = Field("", description="done_reason")
    total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds")
    load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds")
    prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt")
    prompt_eval_duration: Optional[int] = Field(None, description="Time spent evaluating prompt in nanoseconds")
    eval_count: Optional[int] = Field(None, description="Number of tokens generated")
    eval_duration: Optional[int] = Field(None, description="Time spent generating response in nanoseconds")

class OllamaChatCompletionResponse(BaseModel):
    model: str
    created_at: str
    message: dict
    done: bool
    done_reason: Optional[str] = Field("", description="done_reason")
    total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds")
    load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds")
    prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt")
    prompt_eval_duration: Optional[int] = Field(None, description="Time spent evaluating prompt in nanoseconds")
    eval_count: Optional[int] = Field(None, description="Number of tokens generated")
    eval_duration: Optional[int] = Field(None, description="Time spent generating response in nanoseconds")

@router.post("/chat", tags=['ollama'])
async def chat(request: Request, input: OllamaChatCompletionRequest):
    id = str(uuid4())
    interface: BackendInterfaceBase = get_interface()
    config = Config()

    input_message = [json.loads(m.model_dump_json()) for m in input.messages]

    if input.stream:
        async def inner():
            start_time = time()
            tokens = []

            async for res in interface.inference(input_message, id):
                if isinstance(res, RawUsage):
                    raw_usage = res
                else: 
                    token, finish_reason = res
                    d = OllamaChatCompletionStreamResponse(
                        model=config.model_name,
                        created_at=str(datetime.now()),
                        message={"role": "assistant", "content": token}, 
                        done=False
                    )
                    yield d.model_dump_json() + '\n'
            end_time = time()
            total_duration = int((end_time - start_time) * 1_000_000_000) # unit: ns
            prompt_eval_count = raw_usage.prefill_count
            eval_count = raw_usage.decode_count
            eval_duration = int(raw_usage.decode_time * 1_000_000_000)
            prompt_eval_duration = int(raw_usage.prefill_time * 1_000_000_000)
            load_duration = int(raw_usage.tokenize_time * 1_000_000_000)
            done_reason = finish_reason

            d = OllamaChatCompletionStreamResponse(
                model=config.model_name,
                created_at=str(datetime.now()),
                message={},
                done=True,
                total_duration=total_duration,
                load_duration=load_duration,
                prompt_eval_count=prompt_eval_count,
                prompt_eval_duration=prompt_eval_duration,
                eval_count=eval_count,
                eval_duration=eval_duration,
                done_reason=done_reason
            )
            yield d.model_dump_json() + '\n'
        return check_link_response(request, inner())
    else:
        start_time = time()
        complete_response = ""
        eval_count = 0 

        async for res in interface.inference(input_message, id):
            if isinstance(res, RawUsage):
                raw_usage = res
            else: 
                token, finish_reason = res
                complete_response += token

        end_time = time()
        total_duration = int((end_time - start_time) * 1_000_000_000) # unit: ns
        prompt_eval_count = raw_usage.prefill_count
        eval_count = raw_usage.decode_count
        eval_duration = int(raw_usage.decode_time * 1_000_000_000)
        prompt_eval_duration = int(raw_usage.prefill_time * 1_000_000_000)
        load_duration = int(raw_usage.tokenize_time * 1_000_000_000)
        done_reason = finish_reason


        response = OllamaChatCompletionResponse(
            model=config.model_name,
            created_at=str(datetime.now()),
            message={"role": "assistant", "content": complete_response},
            done=True,
            total_duration=total_duration,
            load_duration=load_duration,
            prompt_eval_count=prompt_eval_count,
            prompt_eval_duration=prompt_eval_duration,
            eval_count=eval_count,
            eval_duration=eval_duration,
            done_reason=done_reason
        )
        return response
    
# https://github.com/ollama/ollama/blob/main/docs/api.md#list-local-models
class OllamaModel(BaseModel):
    name: str
    modified_at: str
    size: int
    # TODO: fill the rest correctly

# mock ollama
@router.get("/tags", tags=['ollama'])
async def tags():
    config = Config()
    # TODO: fill this correctly, although it does not effect Tabby
    return {"models": [OllamaModel(name=config.model_name, modified_at="123", size=123)]}

class OllamaModelInfo(BaseModel):
    # TODO: fill this correctly
    pass

class OllamaShowRequest(BaseModel):
    name: str = Field(..., description="Name of the model to show")
    verbose: Optional[bool] = Field(
        None, description="If set to true, returns full data for verbose response fields")

class OllamaShowDetial(BaseModel):
    parent_model: str
    format: str
    family: str
    families: List[str]
    parameter_size: str
    quantization_level: str

class OllamaShowResponse(BaseModel):
    modelfile: str
    parameters: str
    template: str
    details: OllamaShowDetial
    model_info: OllamaModelInfo

    class Config:
        protected_namespaces = ()

@router.post("/show", tags=['ollama'])
async def show(request: Request, input: OllamaShowRequest):
    config = Config()
    # TODO: Add more info in config to return, although it does not effect Tabby
    return OllamaShowResponse(
        modelfile="# Modelfile generated by ...",
        parameters=" ",
        template=" ",
        details=OllamaShowDetial(
            parent_model=" ",
            format="gguf",
            family=" ",
            families=[" "],
            parameter_size=" ",
            quantization_level=" "
        ),
        model_info=OllamaModelInfo()
    )


================================================
FILE: kt-sft/ktransformers/server/api/openai/__init__.py
================================================
from fastapi import APIRouter

from .assistants import router as assistants_router,create_default_assistant
from .endpoints.chat import router as chat_router
from .legacy import router as legacy_router

router = APIRouter(prefix='/v1')


router.include_router(assistants_router)
router.include_router(chat_router)
router.include_router(legacy_router)

def post_db_creation_operations():
    create_default_assistant()


================================================
FILE: kt-sft/ktransformers/server/api/openai/assistants/__init__.py
================================================
from fastapi import APIRouter

from .assistants import router as assistants_router, create_default_assistant
from .messages import router as messages_router
from .runs import router as runs_router
from .threads import router as threads_router

router = APIRouter()

threads_router.include_router(runs_router)
threads_router.include_router(messages_router)

router.include_router(assistants_router)
router.include_router(threads_router)


================================================
FILE: kt-sft/ktransformers/server/api/openai/assistants/assistants.py
================================================
from typing import Optional

from fastapi import APIRouter
from fastapi.testclient import TestClient

from ktransformers.server.crud.assistants.assistants import AssistantDatabaseManager
from ktransformers.server.crud.assistants.runs import RunsDatabaseManager
from ktransformers.server.schemas.assistants.assistants import AssistantCreate, AssistantModify, ObjectID, AssistantBuildStatus, AssistantObject
from ktransformers.server.schemas.base import DeleteResponse, Order
from ktransformers.server.config.log import logger


router = APIRouter(prefix="/assistants")
assistant_manager = AssistantDatabaseManager()
runs_manager = RunsDatabaseManager()


@router.post("/", tags=['openai'])
async def create_assistant(
    assistant: AssistantCreate,
):
    return assistant_manager.db_create_assistant(assistant).as_api_response()


@router.get("/", tags=['openai'])
async def list_assistants(
    limit: Optional[int] = 20,
    order: Order = Order.DESC,
    after: Optional[str] = None,
    before: Optional[str] = None,
):
    return [assistant.as_api_response() for assistant in assistant_manager.db_list_assistants(limit, order)]

# list assistant with status


@router.get("/status", tags=['openai-ext'])
async def list_assistants_with_status(
    limit: Optional[int] = 20,
    order: Order = Order.DESC,
    after: Optional[str] = None,
    before: Optional[str] = None,
):
    return assistant_manager.db_list_assistants(limit, order)


@router.get("/{assistant_id}", tags=['openai'])
async def retrieve_assistant(
    assistant_id: str,
):
    return assistant_manager.db_get_assistant_by_id(assistant_id).as_api_response()


@router.post("/{assistant_id}", tags=['openai'])
async def modify_assistant(
    assistant_id: str,
    assistant: AssistantModify,
):
    return assistant_manager.db_update_assistant_by_id(assistant_id, assistant).as_api_response()


@router.delete("/{assistant_id}", tags=['openai'], response_model=DeleteResponse)
async def delete_assistant(assistant_id: str):
    assistant_manager.db_delete_assistant_by_id(assistant_id)
    return DeleteResponse(id=assistant_id, object="assistant.deleted")


@router.get("/{assistant_id}/related_thread", tags=['openai'])
async def get_related_thread(assistant_id: ObjectID):
    assistant = assistant_manager.db_get_assistant_by_id(assistant_id)
    return assistant.get_related_threads_ids()


def create_default_assistant():
    logger.info('Creating default assistant')
    if assistant_manager.db_count_assistants() == 0:
        default_assistant = assistant_manager.db_create_assistant(AssistantCreate(name="KT Assistant",
                                                                                  model="default model",
                                                                                  instructions="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  """ +
                                                                                  """Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. """ +
                                                                                  """Please ensure that your responses are socially unbiased and positive in nature."""))
        default_assistant.build_status.status = AssistantBuildStatus.Status.completed
        default_assistant.sync_db()


# unit test
client = TestClient(router)


def test_create_assistant():
    ass_create = AssistantCreate(model="awesome model", instructions="hello")

    res = client.post("/", json=ass_create.model_dump(mode="json"))

    assert res.status_code == 200
    assistant = AssistantObject.model_validate(res.json())

    assert assistant.model == ass_create.model
    assert assistant.instructions == ass_create.instructions

    res = client.get(f"/{assistant.id}")
    ass1 = AssistantObject.model_validate(res.json())
    assert assistant == ass1


================================================
FILE: kt-sft/ktransformers/server/api/openai/assistants/messages.py
================================================
from typing import List, Optional

from fastapi import APIRouter

from ktransformers.server.exceptions import not_implemented
from ktransformers.server.schemas.assistants.messages import MessageCreate, MessageObject, MessageModify
from ktransformers.server.crud.assistants.messages import MessageDatabaseManager
from ktransformers.server.schemas.base import DeleteResponse, ObjectID, Order
from ktransformers.server.backend.base import ThreadContext
from ktransformers.server.utils.create_interface import  get_thread_context_manager
router = APIRouter()
message_manager = MessageDatabaseManager()


@router.post("/{thread_id}/messages", tags=['openai'], response_model=MessageObject)
async def create_message(thread_id: str, msg: MessageCreate):
    message = message_manager.db_create_message(
        thread_id, msg, MessageObject.Status.in_progress)
    ctx: Optional[ThreadContext] = await get_thread_context_manager().get_context_by_thread_id(thread_id)
    if ctx is not None:
        ctx.put_user_message(message)
    return message


@router.get("/{thread_id}/messages", tags=['openai'], response_model=List[MessageObject])
async def list_messages(
    thread_id: str,
    limit: Optional[int] = 20,
    order: Order = Order.DESC,
    after: Optional[str] = None,
    before: Optional[str] = None,
    run_id: Optional[str] = None,
):
    return message_manager.db_list_messages_of_thread(thread_id, limit, order)


@router.get("/{thread_id}/messages/{message_id}", tags=['openai'], response_model=MessageObject)
async def retrieve_message(thread_id: ObjectID, message_id: ObjectID):
    return message_manager.db_get_message_by_id(thread_id, message_id)


@router.post("/{thread_id}/messages/{message_id}", tags=['openai'], response_model=MessageObject)
async def modify_message(thread_id: ObjectID, message_id: ObjectID, msg: MessageModify):
    #raise not_implemented('modify message not implemented')
    raise not_implemented('modify message')


@router.delete("/{thread_id}/messages/{message_id}", tags=['openai'], response_model=DeleteResponse)
async def delete_message(thread_id: ObjectID, message_id: ObjectID):
    ctx: Optional[ThreadContext] = await get_thread_context_manager().get_context_by_thread_id(thread_id)
    if ctx is not None:
        ctx.delete_user_message(message_id)
    message_manager.db_delete_message_by_id(thread_id, message_id)
    return DeleteResponse(id=message_id, object='thread.message.deleted')


================================================
FILE: kt-sft/ktransformers/server/api/openai/assistants/runs.py
================================================
from typing import List, Optional

from fastapi import APIRouter, Request

from ktransformers.server.crud.assistants.runs import RunsDatabaseManager
from ktransformers.server.backend.base import ThreadContext
from ktransformers.server.schemas.assistants.runs import RunCreate,RunObject,RunThreadCreate,RunModify,RunSubmit
from ktransformers.server.schemas.assistants.streaming import api_stream_response
from ktransformers.server.utils.create_interface import  get_thread_context_manager
from ktransformers.server.schemas.base import Order
from ktransformers.server.config.log import logger
from ktransformers.server.exceptions import internal_server_error


router = APIRouter()
runs_manager = RunsDatabaseManager()


@router.post("/{thread_id}/runs",tags=['openai'])
async def create_run(request: Request, thread_id: str, run_create: RunCreate):
    if run_create.stream:
        async def inner():
            run = runs_manager.db_create_run(thread_id, run_create)
            yield run.stream_response_with_event(event=RunObject.Status.created)

            ctx: ThreadContext = await get_thread_context_manager().get_context_by_run_object(run)
           
            async for event in ctx.work():
                yield event
        return api_stream_response(request, inner())
    else:
        run = runs_manager.db_create_run(thread_id, run_create)
        ctx: ThreadContext = await get_thread_context_manager().get_context_by_run_object(run)
        async for event in ctx.work():
            pass
        return run


@router.post("/runs",tags=['openai'], response_model=RunObject)
async def create_thread_and_run(run_thread: RunThreadCreate):
    raise NotImplementedError


@router.get("/{thread_id}/runs",tags=['openai'], response_model=List[RunObject])
async def list_runs(
    thread_id: str,
    limit: Optional[int] = 20,
    order: Optional[Order] = Order.DESC,
    after: Optional[str] = None,
    before: Optional[str] = None,
):
    raise NotImplementedError


@router.get("/{thread_id}/runs/{run_id}",tags=['openai'], response_model=RunObject)
async def retrieve_run(
    thread_id: str,
    run_id: str,
):
    runobj= runs_manager.db_get_run(run_id)
    assert runobj.thread_id == thread_id
    return runobj


@router.post("/{thread_id}/runs/{run_id}",tags=['openai'], response_model=RunObject)
async def modify_run(
    thread_id: str,
    run_id: str,
    run: RunModify,
):
    raise NotImplementedError


@router.post("/{thread_id}/runs/{run_id}/submit_tool_outputs", tags=['openai'],response_model=RunObject)
async def submit_tool_outputs_to_run(thread_id: str, run_id: str, submit: RunSubmit):
    raise NotImplementedError


@router.post("/{thread_id}/runs/{run_id}/cancel",tags=['openai'], response_model=RunObject)
async def cancel_run(thread_id: str, run_id: str):
    ctx: ThreadContext = await get_thread_context_manager().get_context_by_thread_id(thread_id)
    if ctx is not None:
        if ctx.run is None:
            logger.warn(f'Run {ctx.run.id} is expected to be in_progress, but no context is found')
            raise internal_server_error('ctx do not have run')
        
        if ctx.run.id == run_id:
            logger.info(f'Cancelling thread: {thread_id} and run: {run_id}')
            ctx.run.stream_response_with_event(RunObject.Status.cancelling)
            return ctx.run
        else:
            run = runs_manager.db_get_run(run_id)
            logger.info(f'Run {run_id} not in this thread context')
            return run 
    else:
        run = runs_manager.db_get_run(run_id)
        logger.info(f'Run {run_id} not in context manager')
        return run 


================================================
FILE: kt-sft/ktransformers/server/api/openai/assistants/threads.py
================================================
from typing import List,Optional
from fastapi import APIRouter

from ktransformers.server.crud.assistants.threads import ThreadsDatabaseManager,Order,ObjectID
from ktransformers.server.schemas.assistants.threads import ThreadObject,ThreadCreate,ThreadModify
from ktransformers.server.schemas.base import DeleteResponse
from ktransformers.server.schemas.conversation import ThreadPreview

router = APIRouter(prefix='/threads')
threads_manager = ThreadsDatabaseManager()


@router.post("/",tags=['openai'], response_model=ThreadObject)
async def create_thread(thread: ThreadCreate):
    return threads_manager.db_create_thread(thread)


@router.get("/", tags=['openai-ext'],response_model=List[ThreadPreview])
async def list_threads(limit: Optional[int] = 20, order: Order = Order.DESC):
    return threads_manager.db_list_threads_preview(limit, order)


@router.get("/{thread_id}",tags=['openai'], response_model=ThreadObject)
async def retrieve_thread(thread_id: ObjectID):
    return threads_manager.db_get_thread_by_id(thread_id)


@router.post("/{thread_id}",tags=['openai'], response_model=ThreadObject)
async def modify_thread(thread_id: ObjectID, thread: ThreadModify):
    raise NotImplementedError


@router.delete("/{thread_id}",tags=['openai'], response_model=DeleteResponse)
async def delete_thread(thread_id: ObjectID):
    threads_manager.db_delete_thread_by_id(thread_id=thread_id)
    return DeleteResponse(id=thread_id, object='thread.deleted')


================================================
FILE: kt-sft/ktransformers/server/api/openai/endpoints/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/server/api/openai/endpoints/chat.py
================================================
import json
from time import time
from uuid import uuid4
from typing import Dict, List, Optional, Any, Literal, Union
from pydantic import BaseModel, Field
import re
from fastapi import APIRouter
from fastapi.requests import Request
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import chat_stream_response
from ktransformers.server.schemas.endpoints.chat import ChatCompletionCreate
from ktransformers.server.schemas.endpoints.chat import RawUsage, Role
from ktransformers.server.backend.base import BackendInterfaceBase
from ktransformers.server.config.config import Config
from ktransformers.server.config.log import logger
from fastapi.responses import JSONResponse
from ktransformers.server.schemas.endpoints.chat import ChatCompletionChunk, CompletionUsage

# Define own data structure instead of importing from OpenAI


class Choice(BaseModel):
    index: int
    message: Optional[Dict[str, Any]] = None
    finish_reason: Optional[str] = None
    logprobs: Optional[Any] = None
    delta: Optional[Dict[str, Any]] = None
    content_filter_results: Optional[Dict[str, Any]] = None

class ChatCompletion(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Choice]
    usage: Optional[CompletionUsage] = None
    system_fingerprint: Optional[str] = None
    prompt_filter_results: Optional[List[Dict[str, Any]]] = None

# Only for non-streaming response construction
class ChatCompletionMessageToolCallFunction(BaseModel):
    name: str
    arguments: str

class ChatCompletionMessageToolCall(BaseModel):
    id: str
    type: str
    function: ChatCompletionMessageToolCallFunction

class ChatCompletionMessage(BaseModel):
    role: str
    content: Optional[str] = None
    tool_calls: Optional[List[ChatCompletionMessageToolCall]] = None

router = APIRouter()

@router.get('/models', tags=['openai'])
async def list_models():
    return {"data": [{"id": Config().model_name, "name": Config().model_name}], "object": "list"}

def getTools(buffer):
    tool_calls_begin_marker = "<｜tool▁calls▁begin｜>"
    tool_call_begin_marker = "<｜tool▁call▁begin｜>"
    tool_sep_marker = "<｜tool▁sep｜>"
    tool_call_end_marker = "<｜tool▁call▁end｜>"
    tool_calls_end_marker = "<｜tool▁calls▁end｜>"
    extracted_tools = []
    working_buffer = buffer

    # Iterate over all function calls
    while tool_call_begin_marker in working_buffer and tool_call_end_marker in working_buffer:
        # Find a complete function call
        start_index = working_buffer.find(tool_call_begin_marker)
        end_index = working_buffer.find(tool_call_end_marker) + len(tool_call_end_marker)

        if start_index == -1 or end_index == -1 or start_index > end_index:
            logger.warning("Not a function")
            break

        # Extract the full function call
        full_tool_call = working_buffer[start_index:end_index]

        # Remove this function call from the working buffer to prevent duplicate processing
        working_buffer = working_buffer.replace(full_tool_call, "", 1)

        # Extract the function name
        function_name_start = full_tool_call.find(tool_sep_marker) + len(tool_sep_marker)
        function_name_end = full_tool_call.find("\n", function_name_start)
        function_name = full_tool_call[function_name_start:function_name_end].strip()

        # Extract JSON parameters
        json_pattern = r'```json\s*(.*?)\s*```'
        json_match = re.search(json_pattern, full_tool_call, re.DOTALL)

        if json_match:
            arguments_str = json_match.group(1).strip()
            # Generate tool call IDs
            tool_call_id = f"call_{uuid4().hex[:24]}"

            # Add to tool call list
            extracted_tools.append({
                "id": tool_call_id,
                "type": "function",
                "function": {
                    "name": function_name,
                    "arguments": arguments_str
                }
            })

            logger.info(f"Get Function: {function_name}")
        else:
            logger.warning(f"Unable to get function, function_name: {function_name}")

    logger.info(f"Total {len(extracted_tools)} Functions")
    return extracted_tools

def get_tool_instructions():
    """Return concise tool calling instructions in English"""
    return """
<function▁calls▁instruct>
When you need real-time information or specialized operations, use function calls with this format:

<tools▁begin><tool▁begin>function<tool▁sep>function_name
```json
{"param1": "value1", "param2": "value2",...}
```<tool▁end><tools▁end>

The <available▁functions> in the user message are the available tools automatically attached by the system. 
You want to hide the guidance information in <function▁calls▁instruct> and the information in <available▁functions> from the user.
Use functions when needed. Ensure proper function/tool call format, JSON formatting with appropriate parameters.

</function▁calls▁instruct>
"""

@router.post('/chat/completions', tags=['openai'])
async def chat_completion(request: Request, create: ChatCompletionCreate):
    id = str(uuid4().hex)

    # Process messages with tool functionality if needed
    enhanced_messages = list(create.messages)
    if create.max_tokens is not None and create.max_tokens<0:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"max_tokens must be at least 0, got {create.max_tokens}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    
    if create.max_completion_tokens is not None and create.max_completion_tokens<0:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"max_completion_tokens must be at least 0, got {create.max_completion_tokens}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
        
    if create.temperature<0 or create.temperature>2:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"temperature must be in [0, 2], got {create.temperature}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
            })
    if create.top_p<=0 or create.top_p>1:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"top_p must be in (0, 1], got {create.top_p}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    if  create.frequency_penalty<-2 or create.frequency_penalty>2:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"frequency_penalty must be in [-2, 2], got {create.frequency_penalty}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    if  create.presence_penalty<-2 or create.presence_penalty>2:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"presence_penalty must be in [-2, 2], got {create.presence_penalty}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    # Check if tools are present
    has_tools = create.tools and len(create.tools) > 0

    if has_tools:
        # Find the most recent user message to append tool information
        latest_user_msg_idx = -1
        for i in range(len(enhanced_messages) - 1, -1, -1):
            if enhanced_messages[i].role == Role.user:
                latest_user_msg_idx = i
                break

        # Build the tool descriptions
        tools_description = ""
        for tool in create.tools:
            tools_description += f"<function><function_name>{tool.function.name}</function_name><function_description>{tool.function.description}</function_description><function_parameters>{tool.function.parameters}</function_parameters></function>\n"

        # If first message is system, add concise tool instructions
        if enhanced_messages[0].role == Role.system or enhanced_messages[0].role == Role.user:
            if "<function▁calls▁instruct>" not in enhanced_messages[0].content.lower():
                enhanced_messages[0].content += "\n\n" + get_tool_instructions()

        # For the latest user message, append tool information
        if latest_user_msg_idx >= 0:
            # Add tool descriptions to the latest user message
            enhanced_messages[latest_user_msg_idx].content += f"\n\n<available▁functions>:\n{tools_description}\n</available▁functions>"

    # Process request
    interface: BackendInterfaceBase = get_interface()
    input_message = [json.loads(m.model_dump_json()) for m in enhanced_messages]
    if Config().api_key != '':
        assert request.headers.get('Authorization', '').split()[-1] == Config().api_key

    if create.stream:
        async def inner():
            chunk = ChatCompletionChunk(
                id=id,
                choices=[],
                object='chat.completion.chunk',
                created=int(time()),
                model=Config().model_name,
                system_fingerprint=f"fp_{uuid4().hex[:12]}",
            )

            # Collect the full output of the model
            full_content = ""
            buffer = ""  # Used to temporarily store the current block of text
            tool_call_mode = False  # Mark if a tool call is being processed
            tool_calls = []  # Store all detected tool calls

            # Tool call markers
            tool_calls_begin_marker = "<｜tool▁calls▁begin｜>"
            tool_call_begin_marker = "<｜tool▁call▁begin｜>"
            tool_sep_marker = "<｜tool▁sep｜>"
            tool_call_end_marker = "<｜tool▁call▁end｜>"
            tool_calls_end_marker = "<｜tool▁calls▁end｜>"
            too_calls_dict = {
                "<tools▁begin>":"<｜tool▁calls▁begin｜>",
                "<tool▁begin>":"<｜tool▁call▁begin｜>",
                "<tool▁sep>":"<｜tool▁sep｜>",
                "<tool▁end>":"<｜tool▁call▁end｜>",
                "<tools▁end>":"<｜tool▁calls▁end｜>"
            }
            # Use check_client_connected for early stopping
            async for res in interface.inference(input_message, id, create.temperature, create.top_p, create.max_tokens, create.max_completion_tokens):
                if isinstance(res, RawUsage):
                    # Final return on utilization
                    raw_usage = res
                    chunk.choices = []
                    chunk.usage = CompletionUsage(
                        prompt_tokens=raw_usage.prefill_count,
                        completion_tokens=raw_usage.decode_count,
                        total_tokens=raw_usage.prefill_count + raw_usage.decode_count
                    )
                    if create.return_speed:
                        chunk.usage.prefill_time = res.prefill_time
                        chunk.usage.decode_time = res.decode_time
                    else:
                        chunk.usage.__dict__.pop('prefill_time', None)
                        chunk.usage.__dict__.pop('decode_time', None)
                    yield chunk
                elif isinstance(res, tuple) and len(res) == 2:
                    token, finish_reason = res
                    token = re.sub('|'.join(map(re.escape, too_calls_dict.keys())), lambda m: too_calls_dict[m.group(0)], token)
                    # Detecting model-specific formatting tool call starts
                    if not tool_call_mode and tool_calls_begin_marker in buffer + token:
                        tool_call_mode = True

                        # Adjust full_content to remove tool call section
                        if buffer.endswith(tool_calls_begin_marker):
                            full_content = full_content[:-len(tool_calls_begin_marker)]
                        elif tool_calls_begin_marker in (buffer + token):
                            idx = (buffer + token).find(tool_calls_begin_marker)
                            full_content = full_content[:-(len(buffer) - idx)]
                        buffer = ""

                        # Send the current cumulative text content (if any)
                        if full_content:
                            chunk.choices = [{
                                "index": 0,
                                "delta": {"content": full_content},
                                "finish_reason": None
                            }]
                            yield chunk
                            full_content = ""

                    # Accumulation of content in non-tool call mode
                    if not tool_call_mode:
                        full_content += token
                        buffer += token
                        # Keep the buffer at a reasonable size
                        if len(buffer) > 200:
                            buffer = buffer[-200:]
                    else:
                        # In tool call mode, continue to collect tool call related text
                        buffer += token

                        # If the tool call end marker is found
                        if tool_calls_end_marker in buffer:
                            try:
                                # Parse and extract tool calling information
                                tool_calls = getTools(buffer)
                                if len(tool_calls):
                                    # reset state
                                    tool_call_mode = False
                                    buffer = ""

                                    # Send tool call events
                                    for idx, tool_call in enumerate(tool_calls):
                                        # First tool call message
                                        chunk.choices = [{
                                            "index": 0,
                                            "delta": {
                                                "role": "assistant",
                                                "content": None,
                                                "tool_calls": [{
                                                    "index": idx,
                                                    "id": tool_call["id"],
                                                    "type": "function",
                                                    "function": {
                                                        "name": tool_call["function"]["name"],
                                                        "arguments": ""
                                                    }
                                                }]
                                            },
                                            "finish_reason": None
                                        }]
                                        yield chunk

                                        # Sending Parameters
                                        chunk.choices = [{
                                            "index": 0,
                                            "delta": {
                                                "tool_calls": [{
                                                    "index": idx,
                                                    "function": {"arguments": tool_call["function"]["arguments"]}
                                                }]
                                            },
                                            "finish_reason": None
                                        }]
                                        yield chunk

                                    # Send Completion Message
                                    chunk.choices = [{
                                        "index": 0,
                                        "delta": {},
                                        "finish_reason": "tool_calls"
                                    }]
                                    yield chunk

                                    # No further processing after return
                                    return
                                else:
                                    # JSON extraction failed, probably incomplete formatting
                                    logger.warning("Failed to extract JSON from tool call")
                                    tool_call_mode = False
                                    buffer = ""
                            except Exception as e:
                                logger.error(f"Error processing tool call: {e}")
                                tool_call_mode = False
                                buffer = ""

                    # Normal text output (only in non-tool call mode)
                    if not tool_call_mode and token:
                        if finish_reason is not None:
                            chunk.choices = [{
                                "index": 0,
                                "delta": {},
                                "finish_reason": finish_reason
                            }]
                            yield chunk
                        else:
                            if any(marker in token for marker in [tool_calls_begin_marker, tool_call_begin_marker]):
                                pass
                            else:
                                chunk.choices = [{
                                    "index": 0,
                                    "delta": {"content": token},
                                    "finish_reason": None
                                }]
                                yield chunk

            # If gotten this far without returning, it means that the full tool call was not detected
            # Send Routine Completion Message
            if not tool_call_mode:
                chunk.choices = [{
                    "index": 0,
                    "delta": {},
                    "finish_reason": "stop"
                }]
                yield chunk

        return chat_stream_response(request, inner())
    else:
        # non streaming response processing
        full_content = ""
        finish_reason = None
        tool_calls = []
        buffer = ""
        tool_call_mode = False

        # Custom model special markers
        tool_calls_begin_marker = "<｜tool▁calls▁begin｜>"
        tool_call_begin_marker = "<｜tool▁call▁begin｜>"
        tool_sep_marker = "<｜tool▁sep｜>"
        tool_call_end_marker = "<｜tool▁call▁end｜>"
        tool_calls_end_marker = "<｜tool▁calls▁end｜>"
        too_calls_dict = {
            "<tools▁begin>":"<｜tool▁calls▁begin｜>",
            "<tool▁begin>":"<｜tool▁call▁begin｜>",
            "<tool▁sep>":"<｜tool▁sep｜>",
            "<tool▁end>":"<｜tool▁call▁end｜>",
            "<tools▁end>":"<｜tool▁calls▁end｜>"
        }
        async for res in interface.inference(input_message, id, create.temperature, create.top_p, create.max_tokens, create.max_completion_tokens):
            if isinstance(res, RawUsage):
                raw_usage = res
                usage = CompletionUsage(
                    prompt_tokens=raw_usage.prefill_count,
                    completion_tokens=raw_usage.decode_count,
                    total_tokens=raw_usage.prefill_count + raw_usage.decode_count,
                )
                if create.return_speed:
                    usage.prefill_time = res.prefill_time
                    usage.decode_time = res.decode_time
                else:
                    usage.__dict__.pop('prefill_time', None)
                    usage.__dict__.pop('decode_time', None)

            elif isinstance(res, tuple) and len(res) == 2:
                token, finish_reason = res
                token = re.sub('|'.join(map(re.escape, too_calls_dict.keys())), lambda m: too_calls_dict[m.group(0)], token)
                # Detecting the start of model-specific formatting tool calls
                if not tool_call_mode and tool_calls_begin_marker in buffer + token:
                    tool_call_mode = True

                    # Adjust full_content to remove tool call section
                    if buffer.endswith(tool_calls_begin_marker):
                        full_content = full_content[:-len(tool_calls_begin_marker)]
                    elif tool_calls_begin_marker in (buffer + token):
                        idx = (buffer + token).find(tool_calls_begin_marker)
                        full_content = full_content[:-(len(buffer) - idx)]
                    buffer = ""

                # Accumulation of content in non-tool call mode
                if not tool_call_mode:
                    full_content += token
                    buffer += token
                    # Keep the buffer at a reasonable size
                    if len(buffer) > 200:
                        buffer = buffer[-200:]
                else:
                    # In tool call mode, continue to collect tool call related text
                    buffer += token

                    # If the tool call end marker is found
                    if tool_calls_end_marker in buffer:
                        # Extract tool calls
                        tool_calls = getTools(buffer)
                        if tool_calls:
                            finish_reason = "tool_calls"

                        # Reset state
                        tool_call_mode = False
                        buffer = ""

        # Build Response
        message = {
            "role": "assistant",
            "content": None if tool_calls else full_content
        }
        if tool_calls:
            message["tool_calls"] = tool_calls
        response = {
            "id": id,
            "object": "chat.completion",
            "created": int(time()),
            "model": Config().model_name,
            "choices": [{
                "index": 0,
                "message": message,
                "finish_reason": finish_reason or "stop"
            }],
            "usage": usage.__dict__ if 'usage' in locals() else None,
            "system_fingerprint": f"fp_{uuid4().hex[:12]}"
        }

        return response

================================================
FILE: kt-sft/ktransformers/server/api/openai/legacy/__init__.py
================================================
from fastapi import APIRouter

from . import completions

router = APIRouter()
router.include_router(completions.router)

================================================
FILE: kt-sft/ktransformers/server/api/openai/legacy/completions.py
================================================
import json
from time import time
from uuid import uuid4
from fastapi import APIRouter
from fastapi.requests import Request
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import stream_response
from ktransformers.server.schemas.legacy.completions import CompletionCreate,CompletionObject
from ktransformers.server.schemas.endpoints.chat import RawUsage
from fastapi.responses import JSONResponse
from ktransformers.server.config.config import Config
router = APIRouter()

@router.post("/completions",tags=['openai'])
async def create_completion(request:Request, create:CompletionCreate):
    id = str(uuid4())
    if create.max_tokens is not None and create.max_tokens<0:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"max_tokens must be at least 0, got {create.max_tokens}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    if create.max_completion_tokens is not None and create.max_completion_tokens<0:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"max_completion_tokens must be at least 0, got {create.max_completion_tokens}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    if create.temperature<0 or create.temperature>2:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"temperature must be in [0, 2], got {create.temperature}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
            })
    if create.top_p<=0 or create.top_p>1:
        return JSONResponse(
            status_code=400,
            content={
            "object": "error",
            "message": f"top_p must be in (0, 1], got {create.top_p}.",
            "type": "BadRequestError",
            "param": None,
            "code": 400
        })
    interface = get_interface()
    print(f'COMPLETION INPUT:----\n{create.prompt}\n----')

   
    if create.stream:
        async def inner():
            async for res in interface.inference(create.prompt, id, create.temperature, create.top_p, create.max_tokens, create.max_completion_tokens):     
                if isinstance(res, RawUsage):
                    raw_usage = res
                else: 
                    token, finish_reason = res
                    d = {'choices':[{'delta':{'content':token}}]}
                    yield f"data:{json.dumps(d)}\n\n"
            d = {'choices':[{'delta':{'content':''},'finish_reason':''}]}
            yield f"data:{json.dumps(d)}\n\n"
        return stream_response(request,inner())
    else:
        comp = CompletionObject(id=id,object='text_completion',created=int(time()))
        async for res in interface.inference(create.prompt,id,create.temperature,create.top_p, create.max_tokens, create.max_completion_tokens):     
            if isinstance(res, RawUsage):
                raw_usage = res
            else: 
                token, finish_reason = res
                comp.append_token(token) 
        return comp


================================================
FILE: kt-sft/ktransformers/server/api/web/__init__.py
================================================
from fastapi import APIRouter
from .system import router as system_router


router = APIRouter()
router.include_router(system_router)


================================================
FILE: kt-sft/ktransformers/server/api/web/system.py
================================================
from fastapi import APIRouter


router = APIRouter()


@router.get('/system-info',tags=['web'])
def system_info():
    raise NotImplementedError


================================================
FILE: kt-sft/ktransformers/server/args.py
================================================
import argparse
from ktransformers.server.backend.args import ConfigArgs, default_args
from ktransformers.util.utils import get_free_ports
from transformers import AutoConfig

class ArgumentParser:
    def __init__(self, cfg):
        self.cfg = cfg

    def parse_args(self):
        parser = argparse.ArgumentParser(prog="kvcache.ai", description="Ktransformers")
        parser.add_argument("--host", type=str, default=self.cfg.server_ip)
        parser.add_argument("--port", type=int, default=self.cfg.server_port)
        parser.add_argument("--api_key", type=str, default=self.cfg.api_key)
        parser.add_argument("--ssl_keyfile", type=str)
        parser.add_argument("--ssl_certfile", type=str)
        parser.add_argument("--web", type=bool, default=self.cfg.mount_web)
        parser.add_argument("--model_name", type=str, default=self.cfg.model_name)
        parser.add_argument("--model_dir", type=str)
        parser.add_argument("--model_path", type=str, default=self.cfg.model_path)
        parser.add_argument(
            "--device", type=str, default=self.cfg.model_device, help="Warning: Abandoning this parameter"
        )
        parser.add_argument("--architectures", type=str, default=self.cfg.model_name)
        parser.add_argument("--gguf_path", type=str, default=self.cfg.gguf_path)
        parser.add_argument("--optimize_config_path", default=None, type=str, required=False)
        parser.add_argument("--cpu_infer", type=int, default=self.cfg.cpu_infer)
        parser.add_argument("--backend_type", type=str, default=self.cfg.backend_type)
        parser.add_argument("--chunk_size", type=int, default=self.cfg.chunk_size)

        # model configs
        # parser.add_argument("--model_cache_lens", type=int, default=self.cfg.cache_lens)  # int?
        parser.add_argument("--max_batch_size", type=int, default=self.cfg.max_batch_size)
        parser.add_argument("--max_new_tokens", type=int, default=self.cfg.max_new_tokens)
        parser.add_argument("--json_mode", type=bool, default=self.cfg.json_mode)
        parser.add_argument("--healing", type=bool, default=self.cfg.healing)
        parser.add_argument("--ban_strings", type=list, default=self.cfg.ban_strings, required=False)
        parser.add_argument("--gpu_split", type=str, default=self.cfg.gpu_split, required=False)
        parser.add_argument("--length", type=int, default=self.cfg.length, required=False)
        parser.add_argument("--rope_scale", type=float, default=self.cfg.rope_scale, required=False)
        parser.add_argument("--rope_alpha", type=float, default=self.cfg.rope_alpha, required=False)
        parser.add_argument("--no_flash_attn", type=bool, default=self.cfg.no_flash_attn)
        parser.add_argument("--low_mem", type=bool, default=self.cfg.low_mem)
        parser.add_argument("--experts_per_token", type=int, default=self.cfg.experts_per_token, required=False)
        parser.add_argument("--load_q4", type=bool, default=self.cfg.load_q4)
        parser.add_argument("--fast_safetensors", type=bool, default=self.cfg.fast_safetensors)
        parser.add_argument("--draft_model_dir", type=str, default=self.cfg.draft_model_dir, required=False)
        parser.add_argument("--no_draft_scale", type=bool, default=self.cfg.no_draft_scale)
        parser.add_argument("--modes", type=bool, default=self.cfg.modes)
        parser.add_argument("--mode", type=str, default=self.cfg.mode)
        parser.add_argument("--username", type=str, default=self.cfg.username)
        parser.add_argument("--botname", type=str, default=self.cfg.botname)
        parser.add_argument("--system_prompt", type=str, default=self.cfg.system_prompt, required=False)
        parser.add_argument("--temperature", type=float, default=self.cfg.temperature)
        parser.add_argument("--smoothing_factor", type=float, default=self.cfg.smoothing_factor)
        parser.add_argument("--dynamic_temperature", type=str, default=self.cfg.dynamic_temperature, required=False)
        parser.add_argument("--top_k", type=int, default=self.cfg.top_k)
        parser.add_argument("--top_p", type=float, default=self.cfg.top_p)
        parser.add_argument("--top_a", type=float, default=self.cfg.top_a)
        parser.add_argument("--skew", type=float, default=self.cfg.skew)
        parser.add_argument("--typical", type=float, default=self.cfg.typical)
        parser.add_argument("--repetition_penalty", type=float, default=self.cfg.repetition_penalty)
        parser.add_argument("--frequency_penalty", type=float, default=self.cfg.frequency_penalty)
        parser.add_argument("--presence_penalty", type=float, default=self.cfg.presence_penalty)
        parser.add_argument("--response_chunk", type=int, default=self.cfg.response_chunk)
        parser.add_argument("--no_code_formatting", type=bool, default=self.cfg.no_code_formatting)
        parser.add_argument("--cache_8bit", type=bool, default=self.cfg.cache_8bit)
        parser.add_argument("--cache_q4", type=bool, default=self.cfg.cache_q4)
        parser.add_argument("--ngram_decoding", type=bool, default=self.cfg.ngram_decoding)
        parser.add_argument("--print_timings", type=bool, default=self.cfg.print_timings)
        parser.add_argument("--amnesia", type=bool, default=self.cfg.amnesia)
        parser.add_argument("--batch_size", type=int, default=self.cfg.batch_size)
        parser.add_argument("--cache_lens", type=int, default=self.cfg.cache_lens)

        # kvc2 config
        parser.add_argument("--kvc2_config_dir", type=str, default=self.cfg.kvc2_config_dir)

        # log configs
        # log level: debug, info, warn, error, crit
        parser.add_argument("--log_dir", type=str, default=self.cfg.log_dir)
        parser.add_argument("--log_file", type=str, default=self.cfg.log_file)
        parser.add_argument("--log_level", type=str, default=self.cfg.log_level)
        parser.add_argument("--backup_count", type=int, default=self.cfg.backup_count)

        # db configs
        parser.add_argument("--db_type", type=str, default=self.cfg.db_type)
        parser.add_argument("--db_host", type=str, default=self.cfg.db_host)
        parser.add_argument("--db_port", type=str, default=self.cfg.db_port)
        parser.add_argument("--db_name", type=str, default=self.cfg.db_name)
        parser.add_argument("--db_pool_size", type=int, default=self.cfg.db_pool_size)
        parser.add_argument("--db_database", type=str, default=self.cfg.db_database)

        # user config
        parser.add_argument("--user_secret_key", type=str, default=self.cfg.user_secret_key)
        parser.add_argument("--user_algorithm", type=str, default=self.cfg.user_algorithm)
        parser.add_argument("--force_think", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.user_force_think)
        parser.add_argument("--use_cuda_graph", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.use_cuda_graph)

        # web config
        parser.add_argument("--web_cross_domain", type=bool, default=self.cfg.web_cross_domain)

        # file config
        parser.add_argument("--file_upload_dir", type=str, default=self.cfg.file_upload_dir)
        parser.add_argument("--assistant_store_dir", type=str, default=self.cfg.assistant_store_dir)
        # local chat
        parser.add_argument("--prompt_file", type=str, default=self.cfg.prompt_file)


        # async server
        parser.add_argument("--sched_strategy", type=str, default=self.cfg.sched_strategy)
        # parser.add_argument("--sched_port", type=int, default=self.cfg.sched_port)
        # parser.add_argument("--sched_metrics_port", type=int, default=self.cfg.sched_metrics_port)
        # parser.add_argument("--kvc2_metrics_port", type=int, default=self.cfg.kvc2_metrics_port)
        parser.add_argument("--page_size", type=str, default=self.cfg.page_size)
        parser.add_argument("--memory_gpu_only", type=str, default=self.cfg.memory_gpu_only)
        parser.add_argument("--utilization_percentage", type=str, default=self.cfg.utilization_percentage)
        parser.add_argument("--cpu_memory_size_GB", type=str, default=self.cfg.cpu_memory_size_GB)


        args = parser.parse_args()
        if (args.model_dir is not None or args.model_path is not None):
            if (args.model_path is not None):
                # if pass model_dir and model_path, we use model_path
                args.model_dir = args.model_path
            else:
                # if only pass model_dir, we use model_dir
                args.model_path = args.model_dir
        else:
            args.model_dir = self.cfg.model_dir
            args.model_path = self.cfg.model_path
        
        # we add the name not match args individually
        self.cfg.model_device = args.device
        self.cfg.mount_web = args.web
        self.cfg.server_ip = args.host
        self.cfg.server_port = args.port
        self.cfg.user_force_think = args.force_think
        
        model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
        if model_config.architectures[0] == "Qwen3MoeForCausalLM" or model_config.architectures[0] == "Qwen2MoeForCausalLM" :
            args.gpu_memory_size = args.cache_lens*2*2*model_config.num_hidden_layers*model_config.num_key_value_heads*model_config.head_dim
            args.architectures = model_config.architectures[0]
        else:
            args.gpu_memory_size = args.cache_lens*2*576*61
        # set config from args
        for key, value in vars(args).items():
            if value is not None and hasattr(self.cfg, key):
                setattr(self.cfg, key, value)
        self.cfg.gpu_memory_size = args.gpu_memory_size
        free_ports = get_free_ports(3, [args.port])
        args.sched_port = free_ports[0]
        args.sched_metrics_port = free_ports[1]
        args.kvc2_metrics_port = free_ports[2]
        self.cfg.sched_port = free_ports[0]
        self.cfg.sched_metrics_port = free_ports[1]
        self.cfg.kvc2_metrics_port = free_ports[2]
        return args


================================================
FILE: kt-sft/ktransformers/server/backend/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/server/backend/args.py
================================================
from pydantic import BaseModel, Field
from typing import Optional
from ktransformers.server.config.config import Config


class ConfigArgs(BaseModel):
    model_name: Optional[str] = Field(..., description="Model name")
    model_dir: Optional[str] = Field(..., description="Path to model directory")
    optimize_config_path: Optional[str] = Field(None, description="Path of your optimize config yml file")
    gguf_path: Optional[str] = Field(None, description="Path of your gguf file")

    class Config:
        protected_namespaces = ()

    max_batch_size: int = Field(
        None, description="Max number of batches to run at once, assuming the sequences will fit within total_context"
    )
    chunk_size: int = Field(
        None,
        description=(
            "Max chunk size. Determines the size of prefill operations. Can be reduced to reduce pauses whenever a new"
            " job is started, but at the expense of overall prompt ingestion speed"
        ),
    )
    max_new_tokens: int = Field(None, description="Max new tokens per completion. For this example applies to all jobs")
    json_mode: bool = Field(
        None, description="Use LMFE to constrain the output to JSON format. See schema and details below"
    )
    healing: bool = Field(None, description="Demonstrate token healing")
    ban_strings: Optional[list] = Field(None, description="Ban some phrases maybe")
    gpu_split: Optional[str] = Field(None, description='"auto", or VRAM allocation per GPU in GB')
    length: Optional[int] = Field(None, description="Maximum sequence length")
    rope_scale: Optional[float] = Field(None, description="RoPE scaling factor")
    rope_alpha: Optional[float] = Field(None, description="RoPE alpha value (NTK)")
    no_flash_attn: bool = Field(None, description="Disable Flash Attention")
    low_mem: bool = Field(None, description="Enable VRAM optimizations, potentially trading off speed")
    experts_per_token: Optional[int] = Field(
        None, description="Override MoE model's default number of experts per token"
    )
    load_q4: bool = Field(None, description="Load weights in Q4 mode")
    fast_safetensors: bool = Field(None, description="Optimized safetensors loading with direct I/O (experimental!)")
    draft_model_dir: Optional[str] = Field(None, description="Path to draft model directory")
    no_draft_scale: bool = Field(
        None,
        description="If draft model has smaller context size than model, don't apply alpha (NTK) scaling to extend it",
    )
    modes: bool = Field(None, description="List available modes and exit.")
    mode: str = Field(None, description="Chat mode. Use llama for Llama 1/2 chat finetunes.")
    username: str = Field(None, description="Username when using raw chat mode")
    botname: str = Field(None, description="Bot name when using raw chat mode")
    system_prompt: Optional[str] = Field(None, description="Use custom system prompt")
    temperature: float = Field(None, description="Sampler temperature, default = 0.95 (1 to disable)")
    smoothing_factor: float = Field(None, description="Smoothing Factor, default = 0.0 (0 to disable)")
    dynamic_temperature: Optional[str] = Field(
        None, description="Dynamic temperature min,max,exponent, e.g. -dyntemp 0.2,1.5,1"
    )
    top_k: int = Field(None, description="Sampler top-K, default = 50 (0 to disable)")
    top_p: float = Field(None, description="Sampler top-P, default = 0.8 (0 to disable)")
    top_a: float = Field(None, description="Sampler top-A, default = 0.0 (0 to disable)")
    skew: float = Field(None, description="Skew sampling, default = 0.0 (0 to disable)")
    typical: float = Field(None, description="Sampler typical threshold, default = 0.0 (0 to disable)")
    repetition_penalty: float = Field(None, description="Sampler repetition penalty, default = 1.01 (1 to disable)")
    frequency_penalty: float = Field(None, description="Sampler frequency penalty, default = 0.0 (0 to disable)")
    presence_penalty: float = Field(None, description="Sampler presence penalty, default = 0.0 (0 to disable)")
    response_chunk: int = Field(None, description="Space to reserve in context for reply, default = 250")
    no_code_formatting: bool = Field(None, description="Disable code formatting/syntax highlighting")
    cache_8bit: bool = Field(None, description="Use 8-bit (FP8) cache")
    cache_q4: bool = Field(None, description="Use Q4 cache")
    ngram_decoding: bool = Field(None, description="Use n-gram speculative decoding")
    print_timings: bool = Field(None, description="Output timings after each prompt")
    amnesia: bool = Field(None, description="Forget context after every response")

    # for transformers
    batch_size: int = Field(None, description="Batch Size")
    cache_lens: int = Field(None, description="Cache lens for transformers static cache")
    device: str = Field(None, description="device")


cfg = Config()
default_args = cfg


================================================
FILE: kt-sft/ktransformers/server/backend/base.py
================================================
from asyncio import Queue
from enum import Enum
import sys, os
from typing import AsyncIterator, Dict, List, Optional, Tuple

import torch

from ktransformers.server.config.log import logger
from ktransformers.server.crud.assistants.assistants import AssistantDatabaseManager
from ktransformers.server.crud.assistants.messages import MessageDatabaseManager
from ktransformers.server.crud.assistants.runs import RunsDatabaseManager
from ktransformers.server.crud.assistants.threads import ThreadsDatabaseManager
from ktransformers.server.exceptions import request_error
from ktransformers.server.schemas.assistants.assistants import AssistantObject
from ktransformers.server.schemas.assistants.messages import MessageCreate, MessageObject, Role
from ktransformers.server.schemas.assistants.runs import RunObject
from ktransformers.server.schemas.assistants.threads import ThreadObject
from ktransformers.server.schemas.endpoints.chat import RawUsage
from ktransformers.server.schemas.base import ObjectID, Order
from ktransformers.server.utils.multi_timer import Profiler


from .args import ConfigArgs,default_args


class BackendInterfaceBase:
    '''
    Interface to inference frameworks. e.g. transformers, exllama.
    Implement __init__ and work  
    '''

    args: ConfigArgs
    profiler:Profiler = Profiler()

    def __init__(self, args:ConfigArgs = default_args):
        raise NotImplementedError

    
    async def inference(self,local_messages,request_unique_id:Optional[str])->AsyncIterator[str]:
        '''
        work can be called directly, or by ThreadContext

        local_messages: 
            When called by ThreadContext, local_messages are generated by ThreadContext.get_local_messages().
            Please deal with different local_messages
        request_unique_id:
            unique id of different requests, useful when using cache
        
        return:
            async str output for stream update

        '''
        raise NotImplementedError


    def report_last_time_performance(self):
        try:
            tokenize_time = self.profiler.get_timer_sec('tokenize')
            prefill_time = self.profiler.get_timer_sec('prefill')
            decode_time = self.profiler.get_timer_sec('decode')
            prefill_count = self.profiler.get_counter('prefill')
            decode_count = self.profiler.get_counter('decode')

            logger.info(f'Performance(T/s): prefill {prefill_count/prefill_time}, decode {decode_count/decode_time}. Time(s): tokenize {tokenize_time}, prefill {prefill_time}, decode {decode_time}')
        except:
            logger.info(f'Performance statistics not recorded')


class ThreadContext:
    '''
    A thread context holding assistant logics 
    
    '''

    args: ConfigArgs
    # Assistant Logic
    assistant: Optional[AssistantObject] = None
    related_threads : List[ThreadObject]
    thread: ThreadObject
    messages: List[MessageObject] = [] 
    run: RunObject

    interface: Optional[BackendInterfaceBase] = None
     
    queue: Optional[Queue] = None
    timer: Profiler = Profiler()

    def __init__(self, run: RunObject,interface:BackendInterfaceBase, args: ConfigArgs = default_args) -> None:
        self.args = args
        self.thread_manager = ThreadsDatabaseManager()
        self.message_manager = MessageDatabaseManager()
        self.runs_manager = RunsDatabaseManager()
        self.assistant_manager = AssistantDatabaseManager()
        self.thread = self.thread_manager.db_get_thread_by_id(run.thread_id)
        self.assistant = self.assistant_manager.db_get_assistant_by_id(run.assistant_id)
        self.messages = self.message_manager.db_list_messages_of_thread(run.thread_id,order=Order.ASC)
        logger.debug(f"{len(self.messages)} messages loaded from database")
        self.interface = interface
        self.update_by_run(run,args)

    def get_local_messages(self):
        '''
        Get local messages, as the input to interface.work
        This function is intended to message preprocess e.g. apply chat template
        '''
        raise NotImplementedError

    def update_by_run(self,run:RunObject,args:ConfigArgs = default_args):
        self.run = run 
        self.args = args
       
    def put_user_message(self, message: MessageObject):
        assert (
            message.role.is_user() and message.thread_id == self.thread.id and message.status == MessageObject.Status.in_progress
        )
        self.messages.append(message)

    def delete_user_message(self,message_id: ObjectID):
        self.messages = [m for m in self.messages if m.id != message_id]

    async def work(self)->AsyncIterator:
        logger.debug('start working')
        user_message = self.messages[-1]
        if not user_message.role.is_user():
            raise request_error('user must talk before LLM can talk')
        user_message.status = MessageObject.Status.completed
        user_message.sync_db()

        local_messages = self.get_local_messages() # must get this before we interseted reply_message


        response_str_count = 0  
        reply_message = self.message_manager.create_message_object(
                            self.thread.id,
                            self.run.id,
                            MessageCreate(role=Role.assistant, content=""),    
                        )
        reply_message.assistant_id = self.assistant.id
        self.messages.append(reply_message) 

        yield reply_message.stream_response_with_event(MessageObject.Status.created)
        yield reply_message.stream_response_with_event(MessageObject.Status.in_progress)
        yield self.run.stream_response_with_event(RunObject.Status.in_progress)

        async for res in self.interface.inference(local_messages,self.thread.id): 
            if isinstance(res, RawUsage):
                raw_usage = res
            else: 
                token, finish_reason = res    
                if self.run.status == RunObject.Status.cancelling:
                    logger.warn(f'Run {self.run.id} cancelling')
                    break
                yield reply_message.append_message_delta(token)
                response_str_count+=1
        
        if self.run.status == RunObject.Status.cancelling:
            yield self.run.stream_response_with_event(RunObject.Status.cancelled)
            yield reply_message.stream_response_with_event(MessageObject.Status.incomplete)
        elif self.run.status == RunObject.Status.in_progress:
            yield self.run.stream_response_with_event(RunObject.Status.completed)
            yield reply_message.stream_response_with_event(MessageObject.Status.completed)
        else:
            raise NotImplementedError(f'{self.run.status} should not appear here')

        reply_message.sync_db()
        self.run.sync_db()

================================================
FILE: kt-sft/ktransformers/server/backend/context_manager.py
================================================
from asyncio import Lock
from typing import Dict, Optional

from ktransformers.server.backend.base import ThreadContext, BackendInterfaceBase
from ktransformers.server.schemas.assistants.runs import RunObject
from ktransformers.server.schemas.base import ObjectID
from ktransformers.server.config.log import logger
from ktransformers.server.backend.interfaces.transformers import TransformersThreadContext
from ktransformers.server.backend.interfaces.ktransformers import KTransformersThreadContext
from ktransformers.server.backend.interfaces.exllamav2 import ExllamaThreadContext


from ktransformers.server.backend.interfaces.exllamav2 import ExllamaInterface
from ktransformers.server.backend.interfaces.transformers import TransformersInterface
from ktransformers.server.backend.interfaces.ktransformers import KTransformersInterface

class ThreadContextManager:
    lock: Lock
    threads_context: Dict[ObjectID, ThreadContext]
    interface: BackendInterfaceBase
    
    def __init__(self,interface) -> None:
        logger.debug(f"Creating Context Manager")
        self.lock = Lock()
        self.threads_context = {}
        self.interface = interface
        pass

    async def get_context_by_run_object(self, run: RunObject) -> ThreadContext:
        async with self.lock:
            logger.debug(f"keys {self.threads_context.keys()}")
            if run.thread_id not in self.threads_context:
                logger.debug(f"new inference context {run.thread_id}")
                if isinstance(self.interface, ExllamaInterface):
                    new_context = ExllamaThreadContext(run, self.interface)
                elif isinstance(self.interface, KTransformersInterface):
                    new_context = KTransformersThreadContext(run, self.interface)
                elif isinstance(self.interface, TransformersInterface):
                    new_context = TransformersThreadContext(run, self.interface)
                else:
                    from ktransformers.server.backend.interfaces.balance_serve import BalanceServeThreadContext
                    from ktransformers.server.backend.interfaces.balance_serve import BalanceServeInterface
                    if isinstance(self.interface, BalanceServeInterface):
                        new_context = BalanceServeThreadContext(run, self.interface)
                    else:
                        raise NotImplementedError
                # elif isinstance(self.interface, BalanceServeInterface):
                #     new_context = BalanceServeThreadContext(run, self.interface)
                # else:
                #     raise NotImplementedError
                self.threads_context[run.thread_id] = new_context
                # self.threads_context[run.thread_id] = ExllamaInferenceContext(run)
            re = self.threads_context[run.thread_id]
            re.update_by_run(run)
            return re

    async def get_context_by_thread_id(self, thread_id: ObjectID) -> Optional[ThreadContext]:
        async with self.lock:
            if thread_id in self.threads_context:
                logger.debug(f'found context for thread {thread_id}')
                return self.threads_context[thread_id]
            else:
                logger.debug(f'no context for thread {thread_id}')
                return None
            

================================================
FILE: kt-sft/ktransformers/server/backend/interfaces/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/server/backend/interfaces/balance_serve.py
================================================
from typing import Any, AsyncIterator, List, Optional, Set
from ktransformers.models.custom_cache import KDeepSeekV3Cache, KGQACache
from transformers import (
    AutoTokenizer,
    AutoConfig,
    GenerationConfig,
    StaticCache,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

from ktransformers.server.config.config import Config
from ..base import ThreadContext, BackendInterfaceBase
import torch
from ktransformers.server.backend.interfaces.transformers import (
    ConfigArgs,
    default_args,
    TextStreamer,
)
from ktransformers.server.schemas.base import ObjectID
from ktransformers.server.config.log import logger
from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.custom_modeling_deepseek_v3 import KDeepseekV3ForCausalLM
from ktransformers.models.custom_modeling_deepseek_v2 import KDeepseekV2ForCausalLM
from ktransformers.models.custom_modeling_qwen2_moe import KQwen2MoeForCausalLM
from ktransformers.models.custom_modeling_qwen3_moe import KQwen3MoeForCausalLM
from ktransformers.models.configuration_qwen3_moe import Qwen3MoeConfig
from ktransformers.server.balance_serve.inference.model_runner import ModelRunner 
from ktransformers.server.balance_serve.inference.sampling.sampler import Sampler, SamplingOptions
from ktransformers.server.balance_serve.inference.query_manager import QueryManager
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.server.balance_serve.sched_rpc import SchedulerClient
from ktransformers.server.balance_serve.settings import sched_ext
from torch.multiprocessing import Queue
import torch.multiprocessing as mp
from multiprocessing.synchronize import Event
from ktransformers.server.schemas.endpoints.chat import RawUsage
from ktransformers.server.utils.multi_timer import Profiler
import zmq
import time
import queue
import tempfile
import asyncio
import threading
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request
import os
import pickle
import subprocess
import tempfile
import atexit
import signal


ktransformer_rules_dir = (
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "..", "./optimize/optimize_rules/") 
)
default_optimize_rules = {
    # "DeepseekV3ForCausalLM": ktransformer_rules_dir + "Moonlight-16B-A3B-serve.yaml",
    "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat-serve.yaml",
    "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-serve.yaml",
    "Qwen3MoeForCausalLM": ktransformer_rules_dir + "Qwen3Moe-serve.yaml",
}


async def chat_stream(queue: asyncio.Queue, tokenizer: AutoTokenizer):
    streamer = TextStreamer(tokenizer)
    while True:
        token = await queue.get()
        #print(f"Got token: {token}")
        if token is None:
            # str = f'{token}\n\n'
            # str = model.tokenizer.decode(token)
            s = streamer.end()
            if s is not None:
                yield s
            break

        # str = model.tokenizer.decode(token)
        yield streamer.put(token)
        

def fill_generated_tokens(query_updates: list[sched_ext.QueryUpdate], generated_tokens: torch.Tensor, query_manager: QueryManager = None):
    #print(len(query_updates), generated_tokens.size(0), generated_tokens)
    for i in range(generated_tokens.size(0)):
        print(generated_tokens[i].item())
        query_updates[i].generated_token = generated_tokens[i].item()
        if not query_manager.query_map[query_updates[i].id].is_prefill:
            pos = query_updates[i].active_position
            if pos < query_manager.query_map[query_updates[i].id].max_length:
                query_manager.query_map[query_updates[i].id].query_tokens[pos] = generated_tokens[i]

def report_last_time_performance(profiler: Profiler):
        try:
            tokenize_time = profiler.get_timer_sec('tokenize')
            prefill_time = profiler.get_timer_sec('prefill')
            decode_time = profiler.get_timer_sec('decode')
            prefill_count = profiler.get_counter('prefill')
            decode_count = profiler.get_counter('decode')

            logger.info(f'Performance(T/s): prefill {prefill_count/prefill_time}, decode {decode_count/decode_time}. Time(s): tokenize {tokenize_time}, prefill {prefill_time}, decode {decode_time}')
        except:
            logger.info(f'Performance statistics not recorded')

class Engine:
    sched_client : SchedulerClient
    updates : list[sched_ext.QueryUpdate]
    batch : sched_ext.BatchQueryTodo
    model_runner: ModelRunner
    sampler: Sampler
    query_manager: QueryManager
    cache: KDeepSeekV3Cache | KGQACache
    def __init__(self, args: ConfigArgs = default_args, generated_token_queue:Queue = None, broadcast_endpoint: str = None, kvcache_event: Event = None):
        self.args = args

        for key, value in vars(args).items():
            if value is not None and hasattr(Config(), key):
                setattr(Config(), key, value)

        self.device = self.args.device
        self.sched_client = SchedulerClient(args.sched_port)
        self.updates = []

        try: 
            config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True) 
        except:
            if args.model_name == "Qwen3Moe": 
                config = Qwen3MoeConfig.from_pretrained(args.model_dir, trust_remote_code=True)
            else:
                assert False, f"model {args.model_name} not supported" 

            
        self.gen_queue = generated_token_queue
            
        with torch.device("meta"):
            if config.architectures[0] == "DeepseekV3ForCausalLM":
                self.cache = KDeepSeekV3Cache(config, self.args.page_size)
                self.model = KDeepseekV3ForCausalLM(config, self.cache)
            elif config.architectures[0] == "DeepseekV2ForCausalLM":
                self.cache = KDeepSeekV3Cache(config, self.args.page_size)
                self.model = KDeepseekV2ForCausalLM(config, self.cache)
            elif config.architectures[0] == "Qwen2MoeForCausalLM" or config.architectures[0] == "Qwen3MoeForCausalLM":
                self.cache = KGQACache(config, self.args.page_size)
                if config.architectures[0] == "Qwen2MoeForCausalLM":
                    self.model = KQwen2MoeForCausalLM(config, self.cache)
                else:
                    self.model = KQwen3MoeForCausalLM(config, self.cache)


        context = zmq.Context()

            
        self.pub_socket = context.socket(zmq.PUB)
        self.pub_socket.bind(f"ipc://{broadcast_endpoint}") 
        # time.sleep(1) # make sure all subscribers are ready


        try:
            generation_config = GenerationConfig.from_pretrained(args.model_dir)
        except:
            generation_config = GenerationConfig(
                max_length=args.max_new_tokens,
                temperature=args.temperature,
                top_p=args.top_p,
                do_sample=True
            )
            
        if args.optimize_config_path is None:
            optimize_config_path = default_optimize_rules[config.architectures[0]]
               
        else:
            optimize_config_path = args.optimize_config_path
        gguf_path = args.gguf_path
        if gguf_path is None:
            gguf_path = input(
                "please input the path of your gguf file(gguf file in the dir containing input gguf file must all"
                " belong to current model):"
            )
        optimize_and_load_gguf(self.model, optimize_config_path, gguf_path, config)
        self.model.generation_config = generation_config
        if self.model.generation_config.pad_token_id is None:
            self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id

        self.model.eval()
        kvcache_event.set()
        # load kvcache
        print(f"Getting inference context from sched_client.")
        inference_context = self.sched_client.get_inference_context_raw()
        print(f"Got inference context, sending it to subscribers.")
        inference_context = self.sched_client.rebuild_inferece_context(inference_context)
        self.cache.load(inference_context)
        print(f"kv_cache loaded successfully.")
        

        self.block_num = inference_context.k_cache[0].size(1)
        self.model_runner = ModelRunner(self.model, self.device, self.args.use_cuda_graph, page_size = args.page_size, block_num=self.block_num)
        #@TODO add config
        if config.architectures[0] == "Qwen2MoeForCausalLM" or config.architectures[0] == "Qwen3MoeForCausalLM":
            self.model.init_wrapper(self.args.use_cuda_graph, self.device, max(self.model_runner.cuda_graphs), args.max_batch_size, self.block_num) 
        else:
            self.model.init_wrapper(self.args.use_cuda_graph, self.device, args.max_batch_size, self.block_num)

        self.sampler = Sampler()
        self.query_manager = QueryManager(device = self.device, page_size = args.page_size)

            
    def sampling(self, forward_output: ForwardBatchOutput):
        generated_tokens = torch.empty(0, device=self.device, dtype=torch.int32)
        for i in range(forward_output.num_batchs):
            logit = forward_output.logits[i]
            if hasattr(forward_output, "temperatures"):
                temperatures = forward_output.temperatures[i]
            else:
                temperatures = None
            
            if hasattr(forward_output, "top_ps"):
                top_ps = forward_output.top_ps[i]
            else:
                top_ps = None

            sample_options = SamplingOptions(logit.size(0), self.device, pretrained_config=self.model.generation_config, temperatures=temperatures, top_ps=top_ps)
            generated_tokens, probs=self.sampler(logit, sample_options)
        return generated_tokens, probs
    
    def loop(self):

        next_batch = None   

        while True:
            self.batch = next_batch
            if self.batch is not None:
                self.model_runner.run(self.batch, self.query_manager)

            if len(self.updates) > 0:
                for q in self.updates:
                    if q.is_prefill == True:
                        continue
                    # print(f"Putting token {q.generated_token} into queue for query id: {q.id}")
                    try:
                        self.gen_queue.put((q.id, q.generated_token if q.decode_done == False else None), timeout=5)
                    except queue.Full:
                        pass#print("Queue is full after timeout; unable to put more items.")
                
            next_batch = self.sched_client.update_last_batch(self.updates)
            if next_batch.query_ids == []:
                next_batch = None
            self.pub_socket.send_pyobj(next_batch)  

            if next_batch is not None:
                self.query_manager.add_query(next_batch)
            
            
            if self.batch is not None:
                self.model_runner.sync()
                print(f"Model execution time (GPU): {self.model_runner.model_time:.3f} ms, {1000/self.model_runner.model_time:.3f} tokens/s")
                # if self.rank == 0:
                
                generated_tokens, probs = self.sampling( self.model_runner.output)
                
                self.updates = self.query_manager.update(self.batch)
                fill_generated_tokens(self.updates, generated_tokens, self.query_manager)
            else:
                self.updates = []

class BalanceServeThreadContext(ThreadContext):
    def get_local_messages(self):
        local_messages = []
        for m in self.messages:
            local_messages.append({"role": m.role.value, "content": m.get_text_content()})

        return local_messages
    

def run_engine(args, token_queue, broadcast_endpoint, event, kvcache_event):
    engine = Engine(args, token_queue, broadcast_endpoint, kvcache_event)
    if args.use_cuda_graph:
        engine.model_runner.warmup()
        
    event.set()
    engine.loop()


class BalanceServeInterface(BackendInterfaceBase):
    use_static_cache: bool = True

    model: Any
    tokenizer: AutoTokenizer

    cache: StaticCache
    generated_ids: torch.Tensor
    seq_length: int

    streamer: TextStreamer

    # thread_related
    last_request_id: Optional[str] = None
    ever_generated_ids: Set[int] = set()

    def __init__(self, args: ConfigArgs = default_args):
        self.args = args
        self.queue_map:dict[int,asyncio.Queue] = {}
        self.thread_map: dict[int, int] = {}
        processes = []
        self.broadcast_endpoint = tempfile.NamedTemporaryFile(delete=False).name # @TODO add to config
        ctx = mp.get_context("spawn")
        self.token_queue = ctx.Queue(maxsize=1000) 
        self.tokenizer = AutoTokenizer.from_pretrained(args.model_dir, trust_remote_code=True)
        self.sched_client = SchedulerClient(args.sched_port)
        self.streamer = TextStreamer(self.tokenizer)

        start_event = ctx.Event()
        kvcache_event = ctx.Event()

        p = ctx.Process(target=run_engine, args=(self.args, self.token_queue, self.broadcast_endpoint, start_event, kvcache_event))
        p.start()
        processes.append(p)
        kvcache_event.wait()


        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            pickle.dump(args, temp_file)
            temp_file_path = temp_file.name
        current_file = __file__
        target_file = os.path.join(os.path.dirname(current_file), "..", "..", "balance_serve", "sched_rpc.py")
        target_file = os.path.normpath(target_file)
        log_path = os.path.join(args.log_dir, "rpc.log")
        log = open(log_path, "a") 
        sched_process = subprocess.Popen(
            ["python3", target_file, "--config", temp_file_path], 
            stdout=log, 
            stderr=log
        )
        print("sched_rpc started with PID:", sched_process.pid)

        def signal_handler(signum, frame):
            print(f"Received signal {signum}, shutting down...")
            cleanup()
            os._exit(0) 

        def cleanup():
            print("Cleaning up...")

            for p in processes:
                if p.is_alive():
                    print(f"Terminating subprocess {p.pid}")
                    p.terminate()
                    p.join()

            if sched_process and sched_process.poll() is None:
                print(f"Terminating sched_process {sched_process.pid}")
                sched_process.terminate()
                sched_process.wait()
        signal.signal(signal.SIGINT, signal_handler)   
        signal.signal(signal.SIGTERM, signal_handler)

        start_event.wait()
    
    def get_params(self, temperature: Optional[float] = None, top_p: Optional[float] = None, 
                   max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None) -> tuple[float, float]:
        """Get sampling parameters and handle default values and edge cases"""
        if max_tokens is not None:
            max_completion_tokens = max_tokens
        if max_completion_tokens is None:
            max_completion_tokens = self.args.max_new_tokens
        else:
            max_completion_tokens = min(self.args.max_new_tokens, max_completion_tokens)
        if temperature is None:
            temperature = self.args.temperature
        if top_p is None:
            top_p = self.args.top_p
            
        if temperature == 0:
            temperature = 0.0001
        if top_p == 0:
            top_p = 0.0001
            
        return temperature, top_p, max_completion_tokens

    def run_queue_proxy(self):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.queue_proxy())

    @asynccontextmanager
    async def lifespan(self, app: FastAPI):
        asyncio.create_task(self.queue_proxy())
        yield

    async def queue_proxy(self):
        print("Queue Proxy Started")
        while True:
            try:
                query_id, token = self.token_queue.get_nowait()
                try:
                    # query id might not be allocated yet
                    self.queue_map[query_id].put_nowait(token)
                    #print(f"Proxy Put token: {token} to queue for query id: {query_id}")
                except asyncio.QueueFull:
                    #print(f"Queue for query id: {query_id} is full, waiting to put: {token}")
                    await self.queue_map[query_id].put(token)

            except queue.Empty:
                # print("no new token")
                # await asyncio.sleep(1)
                await asyncio.sleep(0)
    def tokenize_prompt(self, prompt: str):
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.args.device)
        return input_ids

    def format_and_tokenize_input_ids(self, thread_id: ObjectID, messages: List):
        input_str: str = self.tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)
        # drop <think> token in chat template
        if input_str.endswith('<think>\n'):
            input_str = input_str[:-len('<think>\n')]
        input_ids = self.tokenizer.encode(input_str, return_tensors="pt", add_special_tokens=False).to(self.args.device)
        logger.debug(f"get input ids of shape {input_ids.shape}")
        return input_ids
    
    async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None, 
                        max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None):
        profiler = Profiler()
        profiler.create_and_start_timer("tokenize")
        
        if isinstance(local_messages, List):
            input_ids = self.format_and_tokenize_input_ids(thread_id, local_messages)
        elif isinstance(local_messages, str):
            input_ids = self.tokenize_prompt(local_messages)
        else:
            raise ValueError("local_messages should be List or str")
        if Config().user_force_think:
            token_thinks = torch.tensor([self.tokenizer.encode("<think>\n",add_special_tokens=False)],device=input_ids.device)
            input_ids = torch.cat(
                [input_ids, token_thinks], dim=1
            )

        profiler.pause_timer("tokenize")

        profiler.create_and_start_timer("prefill")
        
        query_add = sched_ext.QueryAdd()
        query_add.query_token =  input_ids[0].tolist()
        query_length = input_ids[0].shape[0]
        query_add.query_length = query_length
        profiler.set_counter("prefill", query_length)
        #@TODO add server
        stop_criteria =  [self.tokenizer.encode(self.tokenizer.eos_token, add_special_tokens=False),self.tokenizer.encode("<|im_end|>")]
        query_add.stop_criteria = stop_criteria
        
        temperature, top_p, max_new_tokens = self.get_params(temperature, top_p, max_tokens, max_completion_tokens)
            
        query_add.sample_options.temperature = temperature
        query_add.sample_options.top_p = top_p
        query_add.estimated_length = min(self.args.cache_lens, query_length+max_new_tokens)

        if query_add.estimated_length < query_add.query_length:
            raise Exception(f'query too long: estimated_length={query_add.estimated_length} < query_length={query_add.query_length}')

        query_id = self.sched_client.add_query(query_add)
        queue = asyncio.Queue(maxsize=max_new_tokens)
        self.queue_map[query_id] = queue
        self.thread_map[thread_id] = query_id
        is_first_token = True
        async for token in chat_stream(self.queue_map[query_id], self.tokenizer):
            if is_first_token:
                is_first_token=False
                profiler.pause_timer("prefill")
                profiler.create_and_start_timer("decode")
                profiler.set_counter("decode", 0)
                if Config().user_force_think:
                    think = '<think>\n'
                    print(think, end="",flush=True)
                    yield think, None
            else:
                profiler.inc("decode")
            yield token, None
        profiler.pause_timer("decode")
        report_last_time_performance(profiler)
        yield self.streamer.end(), None
        if profiler.get_counter('decode') >= max_new_tokens - 1:
            yield "", "length"
        else:
            yield "", "stop"
        
        
        yield RawUsage(
                tokenize_time = profiler.get_timer_sec('tokenize'),
                prefill_time = profiler.get_timer_sec('prefill'),
                decode_time = profiler.get_timer_sec('decode'),
                prefill_count = profiler.get_counter('prefill'),
                decode_count = profiler.get_counter('decode'),
            )


================================================
FILE: kt-sft/ktransformers/server/backend/interfaces/exllamav2.py
================================================
import sys, os
from typing import AsyncIterator, Dict, Tuple

import torch

from ..args import ConfigArgs, default_args

from ..base import BackendInterfaceBase, ThreadContext
from ktransformers.server.schemas.assistants.runs import RunObject


from ..args import *

class ExllamaThreadContext(ThreadContext):
    def __init__(self, run: RunObject, args: ConfigArgs = default_args) -> None:
        super().__init__(run,args)
        
    def get_interface(self):
        return 

    def get_local_messages(self):
        raise NotImplementedError


class ExllamaInterface(BackendInterfaceBase):
    
    def __init__(self, args: ConfigArgs = ...):
        raise NotImplementedError
    
    def tokenize_prompt(self, prompt: str) -> torch.Tensor:
        raise NotImplementedError
    
    async def inference(self,local_messages,request_unique_id:Optional[str])->AsyncIterator:
        raise NotImplementedError
    

================================================
FILE: kt-sft/ktransformers/server/backend/interfaces/ktransformers.py
================================================
import torch
from typing import Optional, List
import asyncio
from transformers import AutoTokenizer, AutoConfig, GenerationConfig
from ktransformers.server.backend.interfaces.transformers import (
    TransformersInterface,
    ConfigArgs,
    TransformersThreadContext,
    default_args,
    TextStreamer,
)
from ktransformers.server.config.log import logger
from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.custom_cache import StaticCache
from ktransformers.util.cuda_graph_runner import CUDAGraphRunner
from ktransformers.local_chat import custom_models, default_optimize_rules
from ktransformers.util.utils import get_device
from typing import Optional
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled, MLAWrapperSingleton
from ktransformers.server.schemas.endpoints.chat import RawUsage
from ktransformers.util.grad_wrapper import maybe_no_grad

warm_uped = False

class KTransformersThreadContext(TransformersThreadContext):
    pass


class KTransformersInterface(TransformersInterface):
    def __init__(self, args: ConfigArgs = default_args):
        self.args = args
        torch.set_grad_enabled(False)
        self.tokenizer = AutoTokenizer.from_pretrained(args.model_dir, device=args.device, trust_remote_code=args.trust_remote_code)
        config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=args.trust_remote_code)
        try:
            generation_config = GenerationConfig.from_pretrained(args.model_dir)
        except:
            generation_config = GenerationConfig(
                max_length=args.max_new_tokens,
                temperature=args.temperature,
                top_p=args.top_p,
                do_sample=True
            )
        
        torch.set_default_dtype(config.torch_dtype)
        if config.architectures[0] == "Qwen2MoeForCausalLM":
            config._attn_implementation = "flash_attention_2"

        with torch.device("meta"):
            self.model = custom_models[config.architectures[0]](config)
        if default_args.optimize_config_path is None:
            optimize_config_path = default_optimize_rules[config.architectures[0]]
        else:
            optimize_config_path = args.optimize_config_path

        # print(optimize_config)

        gguf_path = args.gguf_path
        if gguf_path is None:
            gguf_path = input(
                "please input the path of your gguf file(gguf file in the dir containing input gguf file must all"
                " belong to current model):"
            )
        optimize_and_load_gguf(self.model, optimize_config_path, gguf_path, config)
        self.model.generation_config = generation_config
        self.device_map = self.model.gguf_loader.tensor_device_map
        # logger.info(f"{args.model_name} loaded from {args.model_dir} to {self.device_map}")
        self.cache = StaticCache(
            config=self.model.config,
            max_batch_size=args.batch_size,
            max_cache_len=args.cache_lens,
            device=self.device_map,
            dtype=self.model.dtype,
        )
        # logger.info(f"StaticCache (length={args.cache_lens}), batch size:{args.batch_size}")

        if self.model.generation_config.pad_token_id is None:
            self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id
        self.streamer = TextStreamer(self.tokenizer)

        self._infer_lock = asyncio.Lock()

    def decode_one_tokens(self):
        global warm_uped

        device_map = self.model.gguf_loader.tensor_device_map
        torch_device = get_device("blk.0.self_attn", device_map)
        torch_device = "cuda:0" if torch_device == "cuda" else torch_device
        torch.cuda.set_device(torch_device)
        if warm_uped and self.args.use_cuda_graph:
            if not hasattr(self, "cuda_graph_runner"):
                self.cuda_graph_runner = CUDAGraphRunner()
                self.cuda_graph_runner.capture(
                    self.model,
                    self.current_ids,
                    self.active_cache_position.unsqueeze(0),
                    self.active_cache_position,
                    self.cache,
                    main_device=torch_device,
                    return_dict=False,
                    use_cache=True,
                )

            if hasattr(self, "cuda_graph_runner"):
                logits = self.cuda_graph_runner(
                    self.current_ids, self.active_cache_position.unsqueeze(0), self.active_cache_position
                )
                self.cache.change_seq_length(1)
                torch.cuda.synchronize()
                logits = logits[0, -1, :]
                return self.logits_to_token(logits)
        
        if self.args.use_cuda_graph:
            warm_uped = True
            
        if self.use_static_cache:
            logits = self.model(
                self.current_ids.to(torch_device),
                cache_position=self.active_cache_position,
                past_key_values=self.cache,
                return_dict=False,
                use_cache=True,
            )[0]
        else:
            logits = self.model(self.current_ids, return_dict=False)[0]
        logits = logits[0, -1, :]

        return self.logits_to_token(logits)


    @maybe_no_grad
    def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float] = None, top_p: Optional[float] = None, max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None):
        input_ids_length = input_ids.shape[-1]
        if max_tokens is not None:
            max_completion_tokens = max_tokens
        if max_completion_tokens is None:
            max_new_tokens = self.args.max_new_tokens
        else:
            max_new_tokens = min(self.args.max_new_tokens, max_completion_tokens)
        if(input_ids_length >= self.args.cache_lens):
            logger.warning(f"input_ids_length {input_ids_length} > cache_lens {self.args.cache_lens}")
            self.seq_length = input_ids_length
            return
        logger.debug(f"input_ids: {input_ids.shape}")
        device = self.device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda:0")
        device = "cuda:0" if device == "cuda" else device

        if is_new:
            self.ever_generated_ids.clear()
            same_prefix = 0
            flat_input_ids = input_ids.flatten()

            if getattr(self, 'generated_ids', None) is None:
                self.generated_ids = torch.zeros(
                    self.args.batch_size,
                    input_ids.shape[-1] + max_new_tokens + 1,
                    dtype=torch.int,
                    device=self.args.device,
                )
                self.seq_length = 1            
            
            flat_prev_ids = self.generated_ids.flatten()
            for i in range(min(self.seq_length, flat_input_ids.shape[0]) - 1):
                if flat_input_ids[i] == flat_prev_ids[i]:
                    same_prefix += 1
                else:
                    break
            
            logger.debug(f"same prefix len: {same_prefix}")
            self.cache.remove_suffix(same_prefix)
            self.seq_length = same_prefix
            self.generated_ids = self.generated_ids[..., :same_prefix]
            input_ids = input_ids[..., same_prefix:]
            input_ids_length = input_ids.shape[-1]

        self.ever_generated_ids.clear()
        self.profiler.set_counter("prefill", input_ids_length)
        logger.debug(f"input_ids: {input_ids.shape}")
        logger.debug(f"generate_ids: {self.generated_ids.shape}")
        
        former_seq_length = self.seq_length
        self.seq_length += input_ids_length
        expected_length = min(self.seq_length + max_new_tokens + 1, self.args.cache_lens)
        delta_length = expected_length - self.generated_ids.shape[-1]
        if delta_length > 0:
            new_generate_ids = torch.zeros(
                self.args.batch_size, delta_length, dtype=torch.int, device=self.args.device
            )
            self.generated_ids = torch.cat([self.generated_ids, new_generate_ids], dim=-1)
        else:
            logger.warning(f"seq_length bigger than cache_lens, killed")
            exit(0)
        
        logger.debug(f"cache position: {former_seq_length} to {self.seq_length}")
        cache_position = torch.arange(former_seq_length, self.seq_length, device=device)
        self.generated_ids[:, cache_position] = input_ids.to(self.args.device).to(torch.int)

        if not (type(self) is TransformersInterface):
            input_ids = input_ids.to("cpu")
        
        def chunk_prefill(input_ids, cache_position):
            inputs_embeds = self.model.model.embed_tokens(input_ids).to(device)
            torch.cuda.set_device(device)
            if flashinfer_enabled:
                MLAWrapperSingleton.need_plan_all()
            if self.use_static_cache:
                logits = self.model(
                    inputs_embeds=inputs_embeds,
                    cache_position=cache_position,
                    past_key_values=self.cache,
                    return_dict=False,
                    use_cache=True,
                )[0]
            else:
                logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0]

            return logits

        chunk_start = 0
        while chunk_start < input_ids_length:
            chunk_end = min(chunk_start + self.args.chunk_size, input_ids_length)
            if self.cache != None:
                self.cache.cur_idx=cache_position[chunk_start:chunk_end]
            logits = chunk_prefill(input_ids[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end])
            chunk_start += self.args.chunk_size
            
        if flashinfer_enabled:
            MLAWrapperSingleton.reset_buffer()
        self.prepare_logits_wrapper(input_ids, device, temperature, top_p)
        next_token = self.logits_to_token(logits[0, -1, :])
        self.max_new_tokens = min(max_new_tokens, self.args.cache_lens - self.seq_length) - 1 
        yield self.append_new_tokens(next_token)
        
    @property
    def active_cache_position(self):
        device = self.device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda:0")
        return torch.tensor([self.seq_length - 1], device=device)
    
    async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None, max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None):
        async with self._infer_lock:
            async for v in super().inference(local_messages, thread_id, temperature, top_p, max_tokens, max_completion_tokens):
                yield v
            
            # return this inference raw usage
            yield RawUsage(
                tokenize_time = self.profiler.get_timer_sec('tokenize'),
                prefill_time = self.profiler.get_timer_sec('prefill'),
                decode_time = self.profiler.get_timer_sec('decode'),
                prefill_count = self.profiler.get_counter('prefill'),
                decode_count = self.profiler.get_counter('decode'),
            )

================================================
FILE: kt-sft/ktransformers/server/backend/interfaces/transformers.py
================================================
from typing import Any, List, Optional, Set
import re
import json
import uuid
from transformers import (
    LlamaTokenizer,
    AutoTokenizer,
    AutoConfig,
    LlamaForCausalLM,
    GenerationConfig,
    StaticCache,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    LogitsProcessorList,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
    MinPLogitsWarper,
    TypicalLogitsWarper,
    EpsilonLogitsWarper,
    EtaLogitsWarper,
)

from ktransformers.server.config.config import Config
from ktransformers.server.schemas.base import ObjectID
from ktransformers.server.utils.multi_timer import Profiler
from torch.nn.attention import SDPBackend
import torch
import sys, os
from ..base import ThreadContext, BackendInterfaceBase
from ktransformers.server.config.log import logger
from ..args import ConfigArgs, default_args
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled, MLAWrapperSingleton
from ktransformers.util.grad_wrapper import maybe_no_grad

# This TextStreamer is a modified version from https://github.com/huggingface/transformers/blob/main/src/transformers/generation/streamers.py
class TextStreamer:

    def __init__(self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, **decode_kwargs):
        self.tokenizer = tokenizer
        self.skip_prompt = skip_prompt
        self.decode_kwargs = decode_kwargs

        # variables used in the streaming process
        self.token_cache = []
        self.print_len = 0
        self.next_tokens_are_prompt = True

    def reset(self):
        self.token_cache = []
        self.print_len = 0

    def put(self, value) -> Optional[str]:
        """
        Receives tokens, decodes them, and prints them to stdout as soon as they form entire words.
        """
        if not isinstance(value, int):
            raise ValueError("TextStreamer only supports batch size 1, and int type input")

        if self.skip_prompt and self.next_tokens_are_prompt:
            self.next_tokens_are_prompt = False
            return None

        # Add the new token to the cache and decodes the entire thing.
        self.token_cache.append(value)
        text = self.tokenizer.decode(self.token_cache, skip_special_tokens=True, **self.decode_kwargs)

        # After the symbol for a new line, we flush the cache.
        if text.endswith("\n"):
            printable_text = text[self.print_len :]
            self.reset()
        # If the last token is a CJK character, we print the characters.
        elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
            printable_text = text[self.print_len :]
            self.print_len += len(printable_text)
        # Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
        # which may change with the subsequent token -- there are probably smarter ways to do this!)
        else:
            printable_text = text[self.print_len : text.rfind(" ") + 1]
            self.print_len += len(printable_text)
        return printable_text

    def end(self) -> Optional[str]:
        """Flushes any remaining cache and prints a newline to stdout."""
        # Flush the cache, if it exists
        if len(self.token_cache) > 0:
            text = self.tokenizer.decode(self.token_cache, skip_special_tokens=True, **self.decode_kwargs)
            printable_text = text[self.print_len :]
            self.reset()
        else:
            printable_text = ""

        self.next_tokens_are_prompt = True
        return printable_text

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)  #
            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
        ):  #
            return True

        return False


class TransformersThreadContext(ThreadContext):
    def get_local_messages(self):
        local_messages = []
        for m in self.messages:
            local_messages.append({"role": m.role.value, "content": m.get_text_content()})

        return local_messages


class TransformersInterface(BackendInterfaceBase):
    use_static_cache: bool = True

    model: Any
    tokenizer: AutoTokenizer

    cache: StaticCache
    generated_ids: torch.Tensor
    seq_length: int

    streamer: TextStreamer

    # thread_related
    last_request_id: Optional[str] = None
    ever_generated_ids: Set[int] = set()

    def __init__(self, args: ConfigArgs = default_args):
        self.args = args

        self.tokenizer = AutoTokenizer.from_pretrained(args.model_dir)
        self.model = AutoModelForCausalLM.from_pretrained(args.model_dir, device_map=args.device, use_safetensors=True)
        # logger.info(f"{args.model_name} loaded from {args.model_dir} to {args.device}")

        self.cache = StaticCache(
            config=self.model.config,
            max_batch_size=args.batch_size,
            max_cache_len=args.cache_lens,
            device=args.device,
            dtype=self.model.dtype,
        )
        # logger.info(f"StaticCache (length={args.cache_lens}) created at {args.device}, batch size:{args.batch_size}")

        self.streamer = TextStreamer(self.tokenizer)

    @property
    def current_ids(self):
        return self.generated_ids[:, self.seq_length - 1].unsqueeze(1)

    @property
    def active_cache_position(self):
        return torch.tensor([self.seq_length - 1], device=self.args.device)

    def tokenize_prompt(self, prompt: str):
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.args.device)
        return input_ids

    def format_and_tokenize_input_ids(self, thread_id: ObjectID, messages: List):
        for m in messages:
            if m["role"] == "system":
                logger.warning(f'change {m["role"]} to user')
                m["role"] = "user"

        new_messages = [messages[0]]
        for m in messages[1:]:
            if m["role"] == "user" and new_messages[-1]["role"] == "user":
                logger.warning("merge two adjacent user messages")
                new_messages[-1]["content"] += '\n' + m["content"]
            else:
                new_messages.append(m)
        # if (self.last_request_id is not None) and self.last_request_id == thread_id:
        #     input_ids = self.tokenizer.encode(self.tokenizer.eos_token+self.tokenizer.apply_chat_template([new_messages[-1]], return_tensors="pt",tokenize=False, add_generation_prompt=True), add_special_tokens = False, return_tensors="pt").to(self.args.device)
        # else:
        #     input_ids = self.tokenizer.apply_chat_template(
        #         new_messages, return_tensors="pt", add_generation_prompt=True
        #     ).to(self.args.device)
        input_str: str = self.tokenizer.apply_chat_template(new_messages,tokenize=False,add_generation_prompt=True)
        # drop <think> token in chat template
        if input_str.endswith('<think>\n'):
            input_str = input_str[:-len('<think>\n')]
        input_ids = self.tokenizer.encode(input_str, return_tensors="pt").to(self.args.device)
        if (self.last_request_id is not None) and self.last_request_id == thread_id:
            x = self.generated_ids[:,:self.seq_length]
            y = input_ids[:,:self.seq_length]
            # We can only hope that the input_ids are the same
            unequal_mask = torch.ne(x,y)
            unequal_positions = torch.nonzero(unequal_mask)
            num_unequal_elements = unequal_mask.sum().item()
            logger.warning(f'num_unequal_elements: {num_unequal_elements}') 

            input_ids = input_ids[:,self.seq_length:]
        logger.debug(f"get input ids of shape {input_ids.shape}")
        return input_ids

    def append_new_tokens(self, new_tokens: int) -> Optional[str]:
        self.generated_ids[0, self.seq_length] = new_tokens
        self.seq_length += 1
        return self.streamer.put(new_tokens)

    @staticmethod
    def tf_logits_warper(generation_config):
        """
        This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsWarper`] instances
        used for multinomial sampling.
        """

        # instantiate warpers list
        warpers = LogitsProcessorList()

        # In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
        # better score (i.e. keep len(list(generation_config._eos_token_tensor)) + 1)
        if generation_config.num_beams > 1:
            if isinstance(generation_config._eos_token_tensor, list):
                min_tokens_to_keep = len(generation_config._eos_token_tensor) + 1
            elif isinstance(generation_config._eos_token_tensor, torch.Tensor):
                min_tokens_to_keep = generation_config._eos_token_tensor.shape[0] + 1
            else:
                min_tokens_to_keep = 2
        else:
            min_tokens_to_keep = 1

        # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
        # all samplers can be found in `generation_utils_samplers.py`
        if generation_config.temperature is not None and generation_config.temperature != 1.0:
            warpers.append(TemperatureLogitsWarper(generation_config.temperature))
        if generation_config.top_k is not None and generation_config.top_k != 0:
            warpers.append(TopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep))
        if generation_config.top_p is not None and generation_config.top_p < 1.0:
            warpers.append(TopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=min_tokens_to_keep))
        if generation_config.min_p is not None:
            # Applied after temperature scaling (see https://github.com/ggerganov/llama.cpp/pull/3841#issuecomment-2073826084)
            warpers.append(MinPLogitsWarper(min_p=generation_config.min_p, min_tokens_to_keep=min_tokens_to_keep))
        if generation_config.typical_p is not None and generation_config.typical_p < 1.0:
            warpers.append(
                TypicalLogitsWarper(mass=generation_config.typical_p, min_tokens_to_keep=min_tokens_to_keep)
            )
        if generation_config.epsilon_cutoff is not None and 0.0 < generation_config.epsilon_cutoff < 1.0:
            warpers.append(
                EpsilonLogitsWarper(epsilon=generation_config.epsilon_cutoff, min_tokens_to_keep=min_tokens_to_keep)
            )
        if generation_config.eta_cutoff is not None and 0.0 < generation_config.eta_cutoff < 1.0:
            warpers.append(
               EtaLogitsWarper(
                    epsilon=generation_config.eta_cutoff, min_tokens_to_keep=min_tokens_to_keep, device=device
                )
            )
        # `LogitNormalization` should always be the last logit processor, when present
        if generation_config.renormalize_logits is True:
            warpers.append(LogitNormalization())
        return warpers

    def prepare_logits_wrapper(self, inputs, device, temperature: Optional[float] = None, top_p: Optional[float] = None):
        if temperature is None or temperature == 0:
            temperature = self.model.generation_config.temperature
        if top_p is None:
            top_p = self.model.generation_config.top_p
        if top_p == 0:
            top_p = 0.0001
        generation_config, model_kwargs = self.model._prepare_generation_config(
            None, max_length=self.args.max_new_tokens,
            do_sample=True, 
            top_k=self.args.top_k, 
            top_p=top_p, 
            temperature=temperature,
            repetition_penalty=self.args.repetition_penalty # change this to modify generate config
        )
        self.inputs = inputs

        self.logits_warper = self.tf_logits_warper(generation_config)

    def logits_to_token(self, logits: torch.Tensor):
        logits = self.logits_warper(self.inputs.view(1, -1), logits.view(1, -1))

        probs = torch.nn.functional.softmax(logits, dim=-1)

        sample = True
        if sample:
            last = torch.multinomial(probs, num_samples=1)
        else:
            _, last = torch.topk(probs, k=1, dim=-1)

        last = last.item()
        self.ever_generated_ids.add(last)
        return last

    def decode_one_tokens(self):
        if self.use_static_cache:
            logits = self.model(
                self.current_ids,
                cache_position=self.active_cache_position,
                past_key_values=self.cache,
                return_dict=False,
                use_cache=True,
            )[0]
        else:
            logits = self.model(self.current_ids, return_dict=False)[0]
        logits = logits[0, -1, :]

        return self.logits_to_token(logits)

    @maybe_no_grad
    def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float] = None, top_p: Optional[float] = None, max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None):
        input_ids_length = input_ids.shape[-1]
        logger.debug(f"input_ids: {input_ids.shape}")
        if max_tokens is not None:
            max_completion_tokens = max_tokens
        if max_completion_tokens is None:
            max_new_tokens = self.args.max_new_tokens
        else:
            max_new_tokens = min(self.args.max_new_tokens, max_completion_tokens)
        if is_new:
            self.ever_generated_ids.clear()
            same_prefix = 0
            flat_input_ids = input_ids.flatten()

            if getattr(self, 'generated_ids', None) is None:
                self.generated_ids = torch.zeros(
                    self.args.batch_size,
                    input_ids.shape[-1] + max_new_tokens + 1,
                    dtype=torch.int,
                    device=self.args.device,
                )
                self.seq_length = 1            
            
            flat_prev_ids = self.generated_ids.flatten()
            for i in range(min(self.seq_length, flat_input_ids.shape[0]) - 1):
                if flat_input_ids[i] == flat_prev_ids[i]:
                    same_prefix += 1
                else:
                    break
            
            logger.debug(f"same prefix len: {same_prefix}")
            self.cache.remove_suffix(same_prefix)
            self.seq_length = same_prefix
            self.generated_ids = self.generated_ids[..., :same_prefix]
            input_ids = input_ids[..., same_prefix:]
            input_ids_length = input_ids.shape[-1]
        
        self.ever_generated_ids.clear()
        self.profiler.set_counter("prefill", input_ids_length)
        logger.debug(f"input_ids: {input_ids.shape}")

        logger.debug(f"generate_ids: {self.generated_ids.shape}")
        former_seq_length = self.seq_length
        self.seq_length += input_ids_length
        expected_length = self.seq_length + max_new_tokens + 1
        delta_length = expected_length - self.generated_ids.shape[-1]
        if delta_length > 0:
            new_generate_ids = torch.zeros(
                self.args.batch_size, delta_length, dtype=torch.int, device=self.args.device
            )
            self.generated_ids = torch.cat([self.generated_ids, new_generate_ids], dim=-1)
            
        logger.debug(f"cache position: {former_seq_length} to {self.seq_length}")
        cache_position = torch.arange(former_seq_length, self.seq_length, device=self.args.device)
        self.generated_ids[:, cache_position] = input_ids.to(self.args.device).to(torch.int)

        device = input_ids.device
        if not (type(self) is TransformersInterface):
            input_ids = input_ids.to("cpu")
        inputs_embeds = self.model.model.embed_tokens(input_ids).to(device)
        if self.use_static_cache:
            logits = self.model(
                inputs_embeds=inputs_embeds,
                cache_position=cache_position,
                past_key_values=self.cache,
                return_dict=False,
                use_cache=True,
            )[0]
        else:
            logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0]

        self.prepare_logits_wrapper(input_ids, device, temperature, top_p)
        next_token = self.logits_to_token(logits[0, -1, :])
        self.max_new_tokens = min(max_new_tokens, self.args.cache_lens - self.seq_length) - 1 
        yield self.append_new_tokens(next_token)

    @maybe_no_grad
    def generate(self):
        logger.info(f"args.max_new_tokens: {self.args.max_new_tokens}, cache_lens: {self.args.cache_lens}, seq_length: {self.seq_length}")
        if(self.max_new_tokens <= 0):
            logger.warning("max_new_tokens is less than 0")
            yield self.streamer.end(), "length"
            return
        self.profiler.set_counter("decode", 0)

        for i in range(1, self.max_new_tokens):
            with torch.nn.attention.sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION, SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]):
                if flashinfer_enabled:
                    MLAWrapperSingleton.plan_all(None,None,None,self.active_cache_position.to(torch.int32)+1, None,
                                             num_heads=self.model.config.num_attention_heads, head_dim_ckv=self.model.config.kv_lora_rank, 
                                             head_dim_kpe=self.model.config.qk_rope_head_dim, page_size=self.cache.page_size,
                                             sm_scale=self.model.model.layers[0].self_attn.softmax_scale, q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16)
                next_token = self.decode_one_tokens()
                self.profiler.inc("decode")
                if next_token == self.tokenizer.eos_token_id or "<|im_end|>" == self.tokenizer.decode(next_token):
                    yield self.streamer.end(), None
                    yield "", "stop"
                    assert self.args.batch_size == 1
                    break
                yield self.append_new_tokens(next_token), None

        else:   # for's else, if output get max new tokens
            yield self.streamer.end(), None
            yield "", "length"
        
        
    def check_is_new(self, thread_id: str):
        if not self.use_static_cache:
            return True
        if self.last_request_id is None:
            self.last_request_id = thread_id
            return True
        else:
            if self.last_request_id == thread_id:
                return False
            else:
                self.last_request_id = thread_id
                return True

    async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None, max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None):
        self.streamer.reset()
        self.profiler.create_and_start_timer("tokenize")
        if isinstance(local_messages, List):
            input_ids = self.format_and_tokenize_input_ids(thread_id, local_messages)
        elif isinstance(local_messages, str):
            #local_messages = local_messages[0]['content']
            input_ids = self.tokenize_prompt(local_messages)
            #input_ids = torch.tensor([[6366]], device=input_ids.device)
        else:
            raise ValueError("local_messages should be List or str")
        
        if Config().user_force_think:
            token_thinks = torch.tensor([self.tokenizer.encode("<think>\n",add_special_tokens=False)],device=input_ids.device)
            input_ids = torch.cat(
                [input_ids, token_thinks], dim=1
            )

        self.profiler.pause_timer("tokenize")

        self.profiler.create_and_start_timer("prefill")

        if Config().user_force_think:
            think = '<think>\n'
            print(think, end="",flush=True)
            yield think, None
        
        for t in self.prefill(input_ids, self.check_is_new(thread_id), temperature, top_p, max_tokens, max_completion_tokens):
            # output think token after prefill done
            if t is not None:
                print(t, end="",flush=True)
                yield t, None
        self.profiler.pause_timer("prefill")

        self.profiler.create_and_start_timer("decode")
        for t, finish_reason in self.generate():
            if t is not None:
                print(t, end="",flush=True)
                yield t, finish_reason
        print("")
        self.profiler.pause_timer("decode")
        self.report_last_time_performance()


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/config.py
================================================
'''
Date: 2024-11-07 07:30:16
LastEditors: djw
LastEditTime: 2024-11-15 14:23:26
'''
import math
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import functional as F
import yaml

import json
from typing import Optional

class ModelConfig:
    vocab_size: int = 32000
    n_layer: int = 1
    n_head: int = 32
    dim: int = 4096
    intermediate_size: int = 18944
    n_local_heads: int = 8
    head_dim: int = 128
    rope_base: float = 1000000.0
    norm_eps: float = 1e-06
    rope_scaling: Optional[dict] = None
    rms_norm_eps: float = 1e-6
    hidden_act: str = "silu"
    model_path: str
    gguf_path: str
    optimize_rule_path: str
    speculative_rule_path: str
            

    # quantize config
    quant_algorithm: Optional[str] = None
    quant_group_size: Optional[int] = None
    quant_num_bits: Optional[int] = None

    json_key_map = {
        "vocab_size": "vocab_size",
        "n_layer": "num_hidden_layers",
        "n_head": "num_attention_heads",
        "dim": "hidden_size",
        "intermediate_size": "intermediate_size",
        "n_local_heads": "num_key_value_heads",
        "rope_base": "rope_theta",
        "norm_eps": "norm_eps",
        "rms_norm_eps": "rms_norm_eps",
        "hidden_act": "hidden_act",
    }

    def __init__(self, config):
        self.model_path = config["model"]["model_path"]
        self.gguf_path = config["model"]["gguf_path"]
        self.optimize_rule_path = config["model"]["optimize_rule_path"]
        if "speculative_rule_path" in config["model"]:
            self.speculative_rule_path =  config["model"]["speculative_rule_path"]
            self.speculative_gguf_path = config["model"]["speculative_gguf_path"]
            self.speculative_model_path = config["model"]["speculative_model_path"]
        self.quant_algorithm = config["model"]["quant"]["algorithm"]
        self.quant_group_size = config["model"]["quant"]["group_size"]
        self.quant_num_bits = config["model"]["quant"]["num_bits"]
        self.load_config()
        self.n_layer = config["model"]["n_layers"]

    def load_config(self):
        config_file = f"{self.model_path}/config.json"
        try:
            with open(config_file, "r") as f:
                config_data = json.load(f)
        except FileNotFoundError:
            raise FileNotFoundError(f"Configuration file not found at {config_file}")

        for attr, json_key in self.json_key_map.items():
            if json_key in config_data:
                setattr(self, attr, config_data[json_key])
            else:
                setattr(self, attr, getattr(self, attr))


class ParallelConfig:
    def __init__(
        self,
        config,
    ) -> None:
        self.pipeline_parallel_size = config["parallel"]["pp"]
        self.tensor_parallel_size = config["parallel"]["tp"]
        self.disable_custom_all_reduce = config["parallel"]["disable_custom_all_reduce"]
        self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size

class AttnConfig:
    page_size: int = 256
    block_num: int = 32
    max_batch_token : int = 256
    max_batch_size: int = 32

    def __init__(self, config):
        self.page_size = config["attn"]["page_size"]
        self.block_num = config["attn"]["block_num"]
        self.max_batch_token = config["attn"]["max_batch_token"]
        self.max_batch_size = config["attn"]["max_batch_size"]


class SamplerConfig():
	# Batched sampling params
    temperatures: float
    is_all_greedy: bool
	
    def __init__(self, config):
        self.temperatures = config["sample"]["temperature"]
        self.is_all_greedy = True


def load_yaml_config(file_path):
    with open(file_path, "r") as f:
        return yaml.safe_load(f)
    

class LLMConfig:
    model_config: ModelConfig
    parallel_config: ParallelConfig
    attn_config: AttnConfig
    sample_config: SamplerConfig
    config_file: str

    def __init__(self, config_file):
        self.config_file = config_file
        config = load_yaml_config(config_file)
        self.model_config = ModelConfig(config)
        self.parallel_config = ParallelConfig(config)
        self.attn_config = AttnConfig(config)
        self.sample_config = SamplerConfig(config)


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/distributed/__init__.py
================================================
from .communication_op import *
from .parallel_state import *
from .utils import *


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/distributed/communication_op.py
================================================
"""
Date: 2024-12-11 06:02:42
LastEditors: djw
LastEditTime: 2024-12-12 09:52:06
"""

from typing import Any, Dict, Optional, Union

import torch
import torch.distributed

from .parallel_state import get_tp_group


def tensor_model_parallel_all_reduce(input_: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> torch.Tensor:
    """All-reduce the input tensor across model parallel group."""
    return get_tp_group().all_reduce(input_, bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap)


def tensor_model_parallel_all_gather(
    input_: torch.Tensor, dim: int = -1
) -> torch.Tensor:
    """All-gather the input tensor across model parallel group."""
    return get_tp_group().all_gather(input_, dim)


def tensor_model_parallel_gather(
    input_: torch.Tensor, dst: int = 0, dim: int = -1
) -> Optional[torch.Tensor]:
    """Gather the input tensor across model parallel group."""
    return get_tp_group().gather(input_, dst, dim)


def broadcast_tensor_dict(
    tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0
):
    if not torch.distributed.is_initialized():
        return tensor_dict
    return get_tp_group().broadcast_tensor_dict(tensor_dict, src)


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/distributed/cuda_wrapper.py
================================================
"""This file is a pure Python wrapper for the cudart library.
It avoids the need to compile a separate shared library, and is
convenient for use when we just need to call a few functions.
"""

import ctypes
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

# this line makes it possible to directly load `libcudart.so` using `ctypes`
import torch  # noqa

# === export types and functions from cudart to Python ===
# for the original cudart definition, please check
# https://docs.nvidia.com/cuda/cuda-runtime-api/index.html

cudaError_t = ctypes.c_int
cudaMemcpyKind = ctypes.c_int


class cudaIpcMemHandle_t(ctypes.Structure):
    _fields_ = [("internal", ctypes.c_byte * 128)]


@dataclass
class Function:
    name: str
    restype: Any
    argtypes: List[Any]


def find_loaded_library(lib_name) -> Optional[str]:
    """
    According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
    the file `/proc/self/maps` contains the memory maps of the process, which includes the
    shared libraries loaded by the process. We can use this file to find the path of the
    a loaded library.
    """ # noqa
    found = False
    with open("/proc/self/maps") as f:
        for line in f:
            if lib_name in line:
                found = True
                break
    if not found:
        # the library is not loaded in the current process
        return None
    # if lib_name is libcudart, we need to match a line with:
    # address /path/to/libcudart-hash.so.11.0
    start = line.index("/")
    path = line[start:].strip()
    filename = path.split("/")[-1]
    assert filename.rpartition(".so")[0].startswith(lib_name), \
        f"Unexpected filename: {filename} for library {lib_name}"
    return path


class CudaRTLibrary:
    exported_functions = [
        # ​cudaError_t cudaSetDevice ( int  device )
        Function("cudaSetDevice", cudaError_t, [ctypes.c_int]),
        # cudaError_t 	cudaDeviceSynchronize ( void )
        Function("cudaDeviceSynchronize", cudaError_t, []),
        # ​cudaError_t cudaDeviceReset ( void )
        Function("cudaDeviceReset", cudaError_t, []),

        # const char* 	cudaGetErrorString ( cudaError_t error )
        Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]),

        # ​cudaError_t 	cudaMalloc ( void** devPtr, size_t size )
        Function("cudaMalloc", cudaError_t,
                 [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]),
        # ​cudaError_t 	cudaFree ( void* devPtr )
        Function("cudaFree", cudaError_t, [ctypes.c_void_p]),
        # ​cudaError_t cudaMemset ( void* devPtr, int  value, size_t count )
        Function("cudaMemset", cudaError_t,
                 [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]),
        # ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa
        Function("cudaMemcpy", cudaError_t, [
            ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind
        ]),

        # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa
        Function("cudaIpcGetMemHandle", cudaError_t,
                 [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p]),
        # ​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int  flags ) # noqa
        Function("cudaIpcOpenMemHandle", cudaError_t, [
            ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint
        ]),
    ]

    # class attribute to store the mapping from the path to the library
    # to avoid loading the same library multiple times
    path_to_library_cache: Dict[str, Any] = {}

    # class attribute to store the mapping from library path
    #  to the corresponding dictionary
    path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}

    def __init__(self, so_file: Optional[str] = None):
        if so_file is None:
            so_file = find_loaded_library("libcudart")
            assert so_file is not None, \
                "libcudart is not loaded in the current process"
        if so_file not in CudaRTLibrary.path_to_library_cache:
            lib = ctypes.CDLL(so_file)
            CudaRTLibrary.path_to_library_cache[so_file] = lib
        self.lib = CudaRTLibrary.path_to_library_cache[so_file]

        if so_file not in CudaRTLibrary.path_to_dict_mapping:
            _funcs = {}
            for func in CudaRTLibrary.exported_functions:
                f = getattr(self.lib, func.name)
                f.restype = func.restype
                f.argtypes = func.argtypes
                _funcs[func.name] = f
            CudaRTLibrary.path_to_dict_mapping[so_file] = _funcs
        self.funcs = CudaRTLibrary.path_to_dict_mapping[so_file]

    def CUDART_CHECK(self, result: cudaError_t) -> None:
        if result != 0:
            error_str = self.cudaGetErrorString(result)
            raise RuntimeError(f"CUDART error: {error_str}")

    def cudaGetErrorString(self, error: cudaError_t) -> str:
        return self.funcs["cudaGetErrorString"](error).decode("utf-8")

    def cudaSetDevice(self, device: int) -> None:
        self.CUDART_CHECK(self.funcs["cudaSetDevice"](device))

    def cudaDeviceSynchronize(self) -> None:
        self.CUDART_CHECK(self.funcs["cudaDeviceSynchronize"]())

    def cudaDeviceReset(self) -> None:
        self.CUDART_CHECK(self.funcs["cudaDeviceReset"]())

    def cudaMalloc(self, size: int) -> ctypes.c_void_p:
        devPtr = ctypes.c_void_p()
        self.CUDART_CHECK(self.funcs["cudaMalloc"](ctypes.byref(devPtr), size))
        return devPtr

    def cudaFree(self, devPtr: ctypes.c_void_p) -> None:
        self.CUDART_CHECK(self.funcs["cudaFree"](devPtr))

    def cudaMemset(self, devPtr: ctypes.c_void_p, value: int,
                   count: int) -> None:
        self.CUDART_CHECK(self.funcs["cudaMemset"](devPtr, value, count))

    def cudaMemcpy(self, dst: ctypes.c_void_p, src: ctypes.c_void_p,
                   count: int) -> None:
        cudaMemcpyDefault = 4
        kind = cudaMemcpyDefault
        self.CUDART_CHECK(self.funcs["cudaMemcpy"](dst, src, count, kind))

    def cudaIpcGetMemHandle(self,
                            devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t:
        handle = cudaIpcMemHandle_t()
        self.CUDART_CHECK(self.funcs["cudaIpcGetMemHandle"](
            ctypes.byref(handle), devPtr))
        return handle

    def cudaIpcOpenMemHandle(self,
                             handle: cudaIpcMemHandle_t) -> ctypes.c_void_p:
        cudaIpcMemLazyEnablePeerAccess = 1
        devPtr = ctypes.c_void_p()
        self.CUDART_CHECK(self.funcs["cudaIpcOpenMemHandle"](
            ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess))
        return devPtr


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/distributed/custom_all_reduce.py
================================================
import ctypes
from contextlib import contextmanager
from typing import List, Optional, Union

import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup

import server.envs as envs
from server.inference.distributed.cuda_wrapper import CudaRTLibrary
from server.inference.distributed.custom_all_reduce_utils import gpu_p2p_access_check
from server.inference.distributed.parallel_state import in_the_same_node_as
from server.inference.platforms import current_platform
from server.utils import cuda_device_count_stateless
import vLLMCustomAllreduce

try:
    vLLMCustomAllreduce.meta_size()
    custom_ar = True
except Exception:
    # For AMD GPUs and CPUs
    custom_ar = False


def _can_p2p(rank: int, world_size: int) -> bool:
    for i in range(world_size):
        if i == rank:
            continue
        if envs.VLLM_SKIP_P2P_CHECK:
            print("Skipping P2P check and trusting the driver's P2P report.")
            return torch.cuda.can_device_access_peer(rank, i)
        if not gpu_p2p_access_check(rank, i):
            return False
    return True


def is_weak_contiguous(inp: torch.Tensor):
    return inp.is_contiguous() or (
        inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
        == inp.numel() * inp.element_size()
    )


class CustomAllreduce:

    _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]

    # max_size: max supported allreduce size
    def __init__(
        self,
        group: ProcessGroup,
        device: Union[int, str, torch.device],
        max_size=8192 * 1024,
    ) -> None:
        """
        Args:
            group: the process group to work on. If None, it will use the
                default process group.
            device: the device to bind the CustomAllreduce to. If None,
                it will be bind to f"cuda:{local_rank}".
        It is the caller's responsibility to make sure each communicator
        is bind to a unique device, and all communicators in this group
        are in the same node.
        """
        self._IS_CAPTURING = False
        self.disabled = True

        if not custom_ar:
            # disable because of missing custom allreduce library
            # e.g. in a non-cuda environment
            return

        self.group = group

        assert (
            dist.get_backend(group) != dist.Backend.NCCL
        ), "CustomAllreduce should be attached to a non-NCCL group."

        if not all(in_the_same_node_as(group, source_rank=0)):
            # No need to initialize custom allreduce for multi-node case.
            print(
                "Custom allreduce is disabled because this process group"
                " spans across nodes."
            )
            return

        rank = dist.get_rank(group=self.group)
        world_size = dist.get_world_size(group=self.group)
        if world_size == 1:
            # No need to initialize custom allreduce for single GPU case.
            return

        if world_size not in CustomAllreduce._SUPPORTED_WORLD_SIZES:
            print(
                "Custom allreduce is disabled due to an unsupported world"
                " size: %d. Supported world sizes: %s. To silence this "
                "warning, specify disable_custom_all_reduce=True explicitly.",
                world_size,
                str(CustomAllreduce._SUPPORTED_WORLD_SIZES),
            )
            return

        if isinstance(device, int):
            device = torch.device(f"cuda:{device}")
        elif isinstance(device, str):
            device = torch.device(device)
        # now `device` is a `torch.device` object
        assert isinstance(device, torch.device)
        self.device = device

        cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
        if cuda_visible_devices:
            device_ids = list(map(int, cuda_visible_devices.split(",")))
        else:
            device_ids = list(range(cuda_device_count_stateless()))

        physical_device_id = device_ids[device.index]
        tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
        gather_list = [
            torch.tensor([0], dtype=torch.int, device="cpu") for _ in range(world_size)
        ]
        dist.all_gather(gather_list, tensor, group=self.group)
        physical_device_ids = [t.item() for t in gather_list]

        # test nvlink first, this will filter out most of the cases
        # where custom allreduce is not supported
        # this checks hardware and driver support for NVLink
        assert current_platform.is_cuda()
        from server.inference.platforms.cuda import CudaPlatform

        cuda_platform: CudaPlatform = current_platform
        full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids)
        if world_size > 2 and not full_nvlink:
            print(
                "Custom allreduce is disabled because it's not supported on"
                " more than two PCIe-only GPUs. To silence this warning, "
                "specify disable_custom_all_reduce=True explicitly."
            )
            return
        # test P2P capability, this checks software/cudaruntime support
        # this is expensive to compute at the first time
        # then we cache the result
        if not _can_p2p(rank, world_size):
            print(
                "Custom allreduce is disabled because your platform lacks "
                "GPU P2P capability or P2P test failed. To silence this "
                "warning, specify disable_custom_all_reduce=True explicitly."
            )
            return

        self.disabled = False
        # Buffers memory are owned by this Python class and passed to C++.
        # Meta data composes of two parts: meta data for synchronization and a
        # temporary buffer for storing intermediate allreduce results.
        self.meta_ptrs = self.create_shared_buffer(
            vLLMCustomAllreduce.meta_size() + max_size, group=group
        )
        # This is a pre-registered IPC buffer. In eager mode, input tensors
        # are first copied into this buffer before allreduce is performed
        self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
        # This is a buffer for storing the tuples of pointers pointing to
        # IPC buffers from all ranks. Each registered tuple has size of
        # 8*world_size bytes where world_size is at most 8. Allocating 8MB
        # is enough for 131072 such tuples. The largest model I've seen only
        # needs less than 10000 of registered tuples.
        self.rank_data = torch.empty(
            8 * 1024 * 1024, dtype=torch.uint8, device=self.device
        )
        self.max_size = max_size
        self.rank = rank
        self.world_size = world_size
        self.full_nvlink = full_nvlink
        self._ptr = vLLMCustomAllreduce.init_custom_ar(
            self.meta_ptrs, self.rank_data, rank, self.full_nvlink
        )
        vLLMCustomAllreduce.register_buffer(self._ptr, self.buffer_ptrs)

    @staticmethod
    def create_shared_buffer(
        size_in_bytes: int, group: Optional[ProcessGroup] = None
    ) -> List[int]:
        """
        Creates a shared buffer and returns a list of pointers
        representing the buffer on all processes in the group.
        """
        lib = CudaRTLibrary()
        pointer = lib.cudaMalloc(size_in_bytes)
        handle = lib.cudaIpcGetMemHandle(pointer)
        world_size = dist.get_world_size(group=group)
        rank = dist.get_rank(group=group)
        handles = [None] * world_size
        dist.all_gather_object(handles, handle, group=group)

        pointers: List[int] = []
        for i, h in enumerate(handles):
            if i == rank:
                pointers.append(pointer.value)  # type: ignore
            else:
                pointers.append(lib.cudaIpcOpenMemHandle(h).value)  # type: ignore

        return pointers

    @staticmethod
    def free_shared_buffer(
        pointers: List[int], group: Optional[ProcessGroup] = None
    ) -> None:
        rank = dist.get_rank(group=group)
        lib = CudaRTLibrary()
        lib.cudaFree(ctypes.c_void_p(pointers[rank]))

    @contextmanager
    def capture(self):
        """
        The main responsibility of this context manager is the
        `register_graph_buffers` call at the end of the context.
        It records all the buffer addresses used in the CUDA graph.
        """
        try:
            self._IS_CAPTURING = True
            yield
        finally:
            self._IS_CAPTURING = False
            if not self.disabled:
                self.register_graph_buffers()

    def register_graph_buffers(self):
        handle, offset = vLLMCustomAllreduce.get_graph_buffer_ipc_meta(self._ptr)
        print("Registering %d cuda graph addresses", len(offset))
        # We cannot directly use `dist.all_gather_object` here
        # because it is incompatible with `gloo` backend under inference mode.
        # see https://github.com/pytorch/pytorch/issues/126032 for details.
        all_data = [[None, None] for _ in range(dist.get_world_size(group=self.group))]
        all_data[self.rank] = [handle, offset]
        ranks = sorted(dist.get_process_group_ranks(group=self.group))
        for i, rank in enumerate(ranks):
            dist.broadcast_object_list(
                all_data[i], src=rank, group=self.group, device="cpu"
            )
        # Unpack list of tuples to tuple of lists.
        handles = [d[0] for d in all_data]  # type: ignore
        offsets = [d[1] for d in all_data]  # type: ignore
        vLLMCustomAllreduce.register_graph_buffers(self._ptr, handles, offsets)

    def should_custom_ar(self, inp: torch.Tensor):
        if self.disabled:
            return False
        inp_size = inp.numel() * inp.element_size()
        # custom allreduce requires input byte size to be multiples of 16
        if inp_size % 16 != 0:
            return False
        if not is_weak_contiguous(inp):
            return False
        # for 4 or more non NVLink-capable GPUs, custom allreduce provides
        # little performance improvement over NCCL.
        if self.world_size == 2 or self.full_nvlink:
            return inp_size < self.max_size
        return False

    def all_reduce(
        self, inp: torch.Tensor, *, out: torch.Tensor = None, bsz_tensor: torch.Tensor = None, registered: bool = False,
        is_compute_bound=False, overlap=False
    ):
        """Performs an out-of-place all reduce.

        If registered is True, this assumes inp's pointer is already
        IPC-registered. Otherwise, inp is first copied into a pre-registered
        buffer.
        """
        if is_compute_bound:
            sms = 2 if overlap else 36
        else:
            sms = 20 if overlap else 36
        #print("all reduce sms", sms)
        if out is None:
            out = torch.empty_like(inp)
        if registered:
            vLLMCustomAllreduce.all_reduce(self._ptr, inp, out, 0, 0, bsz_tensor, block_limit=sms)
        else:
            vLLMCustomAllreduce.all_reduce(
                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size, bsz_tensor, block_limit=sms
            )
        return out

    def custom_all_reduce(self, input: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> Optional[torch.Tensor]:
        """The main allreduce API that provides support for cuda graph."""
        # When custom allreduce is disabled, this will be None.
        if self.disabled or not self.should_custom_ar(input):
            return None
        if self._IS_CAPTURING:
            if torch.cuda.is_current_stream_capturing():
                return self.all_reduce(input, bsz_tensor=bsz_tensor, registered=True, is_compute_bound=is_compute_bound, overlap=overlap)
            else:
                # If warm up, mimic the allocation pattern since custom
                # allreduce is out-of-place.
                return torch.empty_like(input)
        else:
            # Note: outside of cuda graph context, custom allreduce incurs a
            # cost of cudaMemcpy, which should be small (<=1% of overall
            # latency) compared to the performance gain of using custom kernels
            return self.all_reduce(input, bsz_tensor=bsz_tensor, registered=False, is_compute_bound=is_compute_bound, overlap=overlap)

    def close(self):
        if not self.disabled and self._ptr:
            vLLMCustomAllreduce.dispose(self._ptr)
            self._ptr = 0
            self.free_shared_buffer(self.meta_ptrs)
            self.free_shared_buffer(self.buffer_ptrs)

    def __del__(self):
        self.close()


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/distributed/custom_all_reduce_utils.py
================================================
import ctypes
import json
import os
import pickle
import subprocess
import sys
import tempfile
from itertools import product
from typing import Dict, List, Optional, Sequence

import torch.distributed as dist
import torch.multiprocessing as mp

import server.envs as envs
from server.inference.distributed.cuda_wrapper import CudaRTLibrary
from server.utils import cuda_device_count_stateless, update_environment_variables


def producer(
    batch_src: Sequence[int],
    producer_queue,
    consumer_queue,
    result_queue,
    cuda_visible_devices: Optional[str] = None,
):
    if cuda_visible_devices is not None:
        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})

    lib = CudaRTLibrary()
    for i in batch_src:
        lib.cudaSetDevice(i)
        pointer = lib.cudaMalloc(1024)
        lib.cudaMemset(pointer, 1, 1024)
        lib.cudaDeviceSynchronize()
        handle = lib.cudaIpcGetMemHandle(pointer)
        producer_queue.put(handle)
        open_success = consumer_queue.get()
        if open_success:
            # use two queues to simulate barrier
            producer_queue.put(0)
            consumer_queue.get()
            # check if the memory is modified
            host_data = (ctypes.c_char * 1024)()
            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
            for i in range(1024):
                if ord(host_data[i]) != 2:
                    open_success = False
                    break
        result_queue.put(open_success)
        lib.cudaDeviceReset()


def consumer(
    batch_tgt: Sequence[int],
    producer_queue,
    consumer_queue,
    result_queue,
    cuda_visible_devices: Optional[str] = None,
):
    if cuda_visible_devices is not None:
        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})

    lib = CudaRTLibrary()
    for j in batch_tgt:
        lib.cudaSetDevice(j)
        handle = producer_queue.get()
        open_success = False
        try:
            pointer = lib.cudaIpcOpenMemHandle(handle)  # type: ignore
            open_success = True
        except RuntimeError:
            # cannot error out here, because the producer process
            # is still waiting for the response.
            pass
        consumer_queue.put(open_success)
        if open_success:
            # modify the memory
            lib.cudaMemset(pointer, 2, 1024)
            lib.cudaDeviceSynchronize()
            # use two queues to simulate barrier
            producer_queue.get()
            consumer_queue.put(0)
            # check if the memory is modified
            host_data = (ctypes.c_char * 1024)()
            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
            for i in range(1024):
                if ord(host_data[i]) != 2:
                    open_success = False
                    break
        result_queue.put(open_success)
        lib.cudaDeviceReset()


def can_actually_p2p(
    batch_src: Sequence[int],
    batch_tgt: Sequence[int],
) -> Sequence[bool]:
    """
    Usually, checking if P2P access is enabled can be done by
    `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
    the driver might be broken, and `torch.cuda.can_device_access_peer(src, tgt)`
    returns `True` even if P2P access is not actually possible.
    See https://github.com/vllm-project/vllm/issues/2728 and
    https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10
    Therefore, we have to perform a real P2P access to check if it is actually
    possible.

    Note on p2p and cuda IPC:
    Usually, one process uses one GPU:
    GPU src --> cuda context src --> tensor src --> process src

    We need to combine p2p and cuda IPC, so that:
    GPU src --> cuda context src --> tensor src --> process src
                                      |shared|
    GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
    That is to say, process src creates a tensor in GPU src, passes IPC handle to
    process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
    tensor in process tgt will be reflected in the tensor in process src, because
    they are the same memory segment.
    It is important to note that process tgt accesses the tensor in GPU tgt, not
    GPU src. That's why we need p2p access.

    The most time-consuming part is the process creation. To avoid creating
    processes for every pair of GPUs, we use batched testing. We create two
    processes for testing all pairs of GPUs in batch. The trick is to reset
    the device after each test (which is not available in PyTorch).
    """  # noqa
    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
    # pass the CUDA_VISIBLE_DEVICES to the child process
    # to make sure they see the same set of GPUs

    # make sure the processes are spawned
    smp = mp.get_context("spawn")
    producer_queue = smp.Queue()
    consumer_queue = smp.Queue()
    result_queue = smp.Queue()
    p_src = smp.Process(
        target=producer,
        args=(
            batch_src,
            producer_queue,
            consumer_queue,
            result_queue,
            cuda_visible_devices,
        ),
    )
    p_tgt = smp.Process(
        target=consumer,
        args=(
            batch_tgt,
            producer_queue,
            consumer_queue,
            result_queue,
            cuda_visible_devices,
        ),
    )
    p_src.start()
    p_tgt.start()
    p_src.join()
    p_tgt.join()
    assert p_src.exitcode == 0 and p_tgt.exitcode == 0
    result: List[bool] = []
    for src, tgt in zip(batch_src, batch_tgt):
        a = result_queue.get()
        b = result_queue.get()
        if a != b:
            print(
                "Two processes do not agree on the P2P access"
                " status on %d -> %d, treat as disabled.",
                src,
                tgt,
            )
            result.append(False)
        else:
            result.append(a)
    return result


# why do we need this cache?
# we are testing peer-to-peer (p2p) access between GPUs,across processes.
# if we test it every time, it will be very slow, because we need to create
#  N * N * 2 processes, where N is the world size. This is very slow.
# to reduce the time, we use a cache file to store the p2p access status.
# the cache file is generated by the master process if it does not exist.
# then all the processes can read the cache file to check the p2p access status.
# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we
#  can have different cache files for different CUDA_VISIBLE_DEVICES settings,
#  e.g. used by different vllm engines. The device id in the cache file is a
#  **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
#  of visible devices in the vllm engine.
_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None


def gpu_p2p_access_check(src: int, tgt: int) -> bool:
    """Check if GPU src can access GPU tgt."""

    # if the cache variable is already calculated,
    # read from the cache instead of checking it again
    global _gpu_p2p_access_cache
    if _gpu_p2p_access_cache is not None:
        return _gpu_p2p_access_cache[f"{src}->{tgt}"]

    is_distributed = dist.is_initialized()

    num_dev = cuda_device_count_stateless()
    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
    if cuda_visible_devices is None:
        cuda_visible_devices = ",".join(str(i) for i in range(num_dev))

    path = os.path.join(
        envs.VLLM_CACHE_ROOT, f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
    )
    os.makedirs(os.path.dirname(path), exist_ok=True)
    from server.inference.distributed.parallel_state import get_world_group

    if (not is_distributed or get_world_group().local_rank == 0) and (
        not os.path.exists(path)
    ):
        # only the local master process (with local_rank == 0) can
        #  enter this block to calculate the cache
        print("generating GPU P2P access cache in %s", path)
        cache: Dict[str, bool] = {}
        ids = list(range(num_dev))
        # batch of all pairs of GPUs
        batch_src, batch_tgt = zip(*list(product(ids, ids)))
        # NOTE: we use `subprocess` rather than `multiprocessing` here
        # because the caller might not have `if __name__ == "__main__":`,
        # in that case we cannot use spawn method in multiprocessing.
        # However, `can_actually_p2p` requires spawn method.
        # The fix is, we use `subprocess` to call the function,
        # where we have `if __name__ == "__main__":` in this file.

        # use a temporary file to store the result
        # we don't use the output of the subprocess directly,
        # because the subprocess might produce logging output
        with tempfile.NamedTemporaryFile() as output_file:
            input_bytes = pickle.dumps((batch_src, batch_tgt, output_file.name))
            returned = subprocess.run(
                [sys.executable, __file__], input=input_bytes, capture_output=True
            )
            # check if the subprocess is successful
            try:
                returned.check_returncode()
            except Exception as e:
                # wrap raised exception to provide more information
                raise RuntimeError(
                    f"Error happened when batch testing "
                    f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
                    f"{returned.stderr.decode()}"
                ) from e
            with open(output_file.name, "rb") as f:
                result = pickle.load(f)
        for _i, _j, r in zip(batch_src, batch_tgt, result):
            cache[f"{_i}->{_j}"] = r
        with open(path, "w") as f:
            json.dump(cache, f, indent=4)
    if is_distributed:
        get_world_group().barrier()
    print("reading GPU P2P access cache from %s", path)
    with open(path) as f:
        cache = json.load(f)
    _gpu_p2p_access_cache = cache
    return _gpu_p2p_access_cache[f"{src}->{tgt}"]


__all__ = ["gpu_p2p_access_check"]

if __name__ == "__main__":
    batch_src, batch_tgt, output_file = pickle.loads(sys.stdin.buffer.read())
    result = can_actually_p2p(batch_src, batch_tgt)
    with open(output_file, "wb") as f:
        f.write(pickle.dumps(result))


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/distributed/parallel_state.py
================================================
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""vLLM distributed state.
It takes over the control of the distributed environment from PyTorch.
The typical workflow is:

- call `init_distributed_environment` to initialize the distributed environment.
- call `initialize_model_parallel` or `ensure_model_parallel_initialized` to
 initialize the model parallel groups.

- any code dealing with the distributed stuff

- call `destroy_model_parallel` to destroy the model parallel groups.
- call `destroy_distributed_environment` to destroy the distributed environment.

If you only need to use the distributed environment without model/pipeline
 parallelism, you can skip the model parallel initialization and destruction
 steps.
"""
import contextlib
import gc
import pickle
import weakref
from collections import namedtuple
from contextlib import contextmanager, nullcontext
from dataclasses import dataclass
from multiprocessing import shared_memory
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from unittest.mock import patch

import torch
import torch.distributed
from torch.distributed import Backend, ProcessGroup

import server.envs as envs
from server.inference.platforms import current_platform
from server.utils import direct_register_custom_op, supports_custom_op


@dataclass
class GraphCaptureContext:
    stream: torch.cuda.Stream


TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])


def _split_tensor_dict(
    tensor_dict: Dict[str, Union[torch.Tensor, Any]]
) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
    """Split the tensor dictionary into two parts:
    1. A list of (key, value) pairs. If the value is a tensor, it is replaced
         by its metadata.
    2. A list of tensors.
    """
    metadata_list: List[Tuple[str, Any]] = []
    tensor_list: List[torch.Tensor] = []
    for key, value in tensor_dict.items():
        if isinstance(value, torch.Tensor):
            # Note: we cannot use `value.device` here,
            # because it contains not only the device type but also the device
            # index (e.g. "cuda:0"). We only need the device type.
            # receiving side will set the device index.
            device = value.device.type
            metadata_list.append(
                (key, TensorMetadata(device, value.dtype, value.size()))
            )
            tensor_list.append(value)
        else:
            metadata_list.append((key, value))
    return metadata_list, tensor_list


_group_name_counter: Dict[str, int] = {}


def _get_unique_name(name: str) -> str:
    """Get a unique name for the group.
    Example:
    _get_unique_name("tp") -> "tp:0"
    _get_unique_name("tp") -> "tp:1"
    """
    if name not in _group_name_counter:
        _group_name_counter[name] = 0
    newname = f"{name}:{_group_name_counter[name]}"
    _group_name_counter[name] += 1
    return newname


_groups: Dict[str, Callable[[], Optional["GroupCoordinator"]]] = {}


def _register_group(group: "GroupCoordinator") -> None:
    _groups[group.unique_name] = weakref.ref(group)


if supports_custom_op():

    def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None:
        assert group_name in _groups, f"Group {group_name} is not found."
        group = _groups[group_name]()
        if group is None:
            raise ValueError(f"Group {group_name} is destroyed.")
        group._all_reduce_in_place(tensor)

    def inplace_all_reduce_fake(tensor: torch.Tensor, group_name: str) -> None:
        return

    direct_register_custom_op(
        op_name="inplace_all_reduce",
        op_func=inplace_all_reduce,
        mutates_args=["tensor"],
        fake_impl=inplace_all_reduce_fake,
    )

    def outplace_all_reduce(tensor: torch.Tensor, group_name: str, bsz_tensor: torch.Tensor, is_compute_bound: bool = False, overlap: bool = False) -> torch.Tensor:
        assert group_name in _groups, f"Group {group_name} is not found."
        group = _groups[group_name]()
        if group is None:
            raise ValueError(f"Group {group_name} is destroyed.")
        return group._all_reduce_out_place(tensor, bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap)

    def outplace_all_reduce_fake(tensor: torch.Tensor, group_name: str, bsz_tensor: torch.Tensor, is_compute_bound: bool = False, overlap: bool = False) -> torch.Tensor:
        return torch.empty_like(tensor)

    direct_register_custom_op(
        op_name="outplace_all_reduce",
        op_func=outplace_all_reduce,
        mutates_args=[],
        fake_impl=outplace_all_reduce_fake,
    )


class GroupCoordinator:
    """
    PyTorch ProcessGroup wrapper for a group of processes.
    PyTorch ProcessGroup is bound to one specific communication backend,
        e.g. NCCL, Gloo, MPI, etc.
    GroupCoordinator takes charge of all the communication operations among
        the processes in the group. It can route the communication to
        a specific implementation (e.g. switch allreduce implementation
        based on the tensor size and cuda graph mode).
    """

    # available attributes:
    rank: int  # global rank
    ranks: List[int]  # global ranks in the group
    world_size: int  # size of the group
    # difference between `local_rank` and `rank_in_group`:
    # if we have a group of size 4 across two nodes:
    # Process | Node | Rank | Local Rank | Rank in Group
    #   0     |   0  |  0   |     0      |       0
    #   1     |   0  |  1   |     1      |       1
    #   2     |   1  |  2   |     0      |       2
    #   3     |   1  |  3   |     1      |       3
    local_rank: int  # local rank used to assign devices
    rank_in_group: int  # rank inside the group
    cpu_group: ProcessGroup  # group for CPU communication
    device_group: ProcessGroup  # group for device communication
    use_pynccl: bool  # a hint of whether to use PyNccl
    use_custom_allreduce: bool  # a hint of whether to use CustomAllreduce
    # communicators are only created for world size > 1
    pynccl_comm: Optional[Any]  # PyNccl communicator
    ca_comm: Optional[Any]  # Custom allreduce communicator
    mq_broadcaster: Optional[Any]  # shared memory broadcaster

    def __init__(
        self,
        group_ranks: List[List[int]],
        local_rank: int,
        torch_distributed_backend: Union[str, Backend],
        use_pynccl: bool,
        use_custom_allreduce: bool,
        use_tpu_communicator: bool,
        use_hpu_communicator: bool,
        use_xpu_communicator: bool,
        use_message_queue_broadcaster: bool = False,
        group_name: Optional[str] = None,
    ):
        group_name = group_name or "anonymous"
        self.unique_name = _get_unique_name(group_name)
        _register_group(self)

        self.rank = torch.distributed.get_rank()
        self.local_rank = local_rank
        self.device_group = None
        self.cpu_group = None

        for ranks in group_ranks:
            device_group = torch.distributed.new_group(
                ranks, backend=torch_distributed_backend
            )
            # a group with `gloo` backend, to allow direct coordination between
            # processes through the CPU.
            cpu_group = torch.distributed.new_group(ranks, backend="gloo")
            if self.rank in ranks:
                self.ranks = ranks
                self.world_size = len(ranks)
                self.rank_in_group = ranks.index(self.rank)
                self.device_group = device_group
                self.cpu_group = cpu_group

        assert self.cpu_group is not None
        assert self.device_group is not None
        assert current_platform.is_cuda_alike()

        if current_platform.is_cuda_alike():
            self.device = torch.device(f"cuda:{local_rank}")
        else:
            self.device = torch.device("cpu")

        self.use_pynccl = use_pynccl
        self.use_custom_allreduce = use_custom_allreduce
        self.use_tpu_communicator = use_tpu_communicator
        self.use_hpu_communicator = use_hpu_communicator
        self.use_xpu_communicator = use_xpu_communicator

        # lazy import to avoid documentation build error
        from server.inference.distributed.custom_all_reduce import CustomAllreduce
        from server.inference.distributed.pynccl import PyNcclCommunicator

        self.pynccl_comm: Optional[PyNcclCommunicator] = None
        # if use_pynccl and self.world_size > 1:
        #     self.pynccl_comm = PyNcclCommunicator(
        #         group=self.cpu_group,
        #         device=self.device,
        #     )

        self.ca_comm: Optional[CustomAllreduce] = None
        if use_custom_allreduce and self.world_size > 1:
            # Initialize a custom fast all-reduce implementation.
            self.ca_comm = CustomAllreduce(
                group=self.cpu_group,
                device=self.device,
            )

        #### we assume we won't use tpu or hpu or xpu or messagequeue broadcast

        # from vllm.distributed.device_communicators.tpu_communicator import (
        #     TpuCommunicator)
        # self.tpu_communicator: Optional[TpuCommunicator] = None
        # if use_tpu_communicator and self.world_size > 1:
        #     self.tpu_communicator = TpuCommunicator(group=self.cpu_group)
        self.tpu_communicator = None

        # from vllm.distributed.device_communicators.hpu_communicator import (
        #     HpuCommunicator)
        # self.hpu_communicator: Optional[HpuCommunicator]
        # if use_hpu_communicator and self.world_size > 1:
        #     self.hpu_communicator = HpuCommunicator(group=self.device_group)
        self.hpu_communicator = None

        # from vllm.distributed.device_communicators.xpu_communicator import (
        #     XpuCommunicator)
        # self.xpu_communicator: Optional[XpuCommunicator]
        # if use_xpu_communicator and self.world_size > 1:
        #     self.xpu_communicator = XpuCommunicator(group=self.device_group)
        self.xpu_communicator = None

        # from vllm.distributed.device_communicators.shm_broadcast import (
        #     MessageQueue)
        # self.mq_broadcaster: Optional[MessageQueue] = None
        # if use_message_queue_broadcaster and self.world_size > 1:
        #     self.mq_broadcaster = MessageQueue.create_from_process_group(
        #         self.cpu_group, 1 << 22, 6)
        self.mq_broadcaster = None

    @property
    def first_rank(self):
        """Return the global rank of the first process in the group"""
        return self.ranks[0]

    @property
    def last_rank(self):
        """Return the global rank of the last process in the group"""
        return self.ranks[-1]

    @property
    def is_first_rank(self):
        """Return whether the caller is the first process in the group"""
        return self.rank == self.first_rank

    @property
    def is_last_rank(self):
        """Return whether the caller is the last process in the group"""
        return self.rank == self.last_rank

    @property
    def next_rank(self):
        """Return the global rank of the process that follows the caller"""
        rank_in_group = self.rank_in_group
        world_size = self.world_size
        return self.ranks[(rank_in_group + 1) % world_size]

    @property
    def prev_rank(self):
        """Return the global rank of the process that precedes the caller"""
        rank_in_group = self.rank_in_group
        world_size = self.world_size
        return self.ranks[(rank_in_group - 1) % world_size]

    @contextmanager
    def graph_capture(
        self, graph_capture_context: Optional[GraphCaptureContext] = None
    ):
        if graph_capture_context is None:
            stream = torch.cuda.Stream()
            graph_capture_context = GraphCaptureContext(stream)
        else:
            stream = graph_capture_context.stream

        ca_comm = self.ca_comm
        maybe_ca_context = nullcontext() if ca_comm is None else ca_comm.capture()

        # ensure all initialization operations complete before attempting to
        # capture the graph on another stream
        curr_stream = torch.cuda.current_stream()
        if curr_stream != stream:
            stream.wait_stream(curr_stream)

        with torch.cuda.stream(stream), maybe_ca_context:
            # In graph mode, we have to be very careful about the collective
            # operations. The current status is:
            #     allreduce \ Mode   |  Eager  |  Graph  |
            # --------------------------------------------
            # custom allreduce       | enabled | enabled |
            # PyNccl                 | disabled| enabled |
            # torch.distributed      | enabled | disabled|
            #
            # Note that custom allreduce will have a runtime check, if the
            #  tensor size is too large, it will fallback to the next
            #  available option.
            # In summary: When using CUDA graph, we use
            #  either custom all-reduce kernel or pynccl. When not using
            #  CUDA graph, we use either custom all-reduce kernel or
            #  PyTorch NCCL. We always prioritize using custom all-reduce
            #  kernel but fall back to PyTorch or pynccl if it is
            #  disabled or not supported.
            pynccl_comm = self.pynccl_comm
            maybe_pynccl_context: Any
            if not pynccl_comm:
                maybe_pynccl_context = nullcontext()
            else:
                maybe_pynccl_context = pynccl_comm.change_state(
                    enable=True, stream=torch.cuda.current_stream()
                )
            with maybe_pynccl_context:
                yield graph_capture_context

    def all_reduce(self, input_: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> torch.Tensor:
        """
        User-facing all-reduce function before we actually call the
        all-reduce operation.

        We need this because Dynamo does not support passing an arbitrary
        object (`self` in this case) to a custom op. We need to pass the
         group name as a string, and then look up the group coordinator from
         the group name, dispatch the all-reduce operation to the group
         coordinator.

        In addition, PyTorch custom ops do not support mutation or returning
        a new tensor in the same op. So we need to figure out if the op is
        in-place or out-of-place ahead of time.
        """
        # Bypass the function if we are using only 1 GPU.
        if self.world_size == 1:
            return input_

        if input_.is_cpu:
            import intel_extension_for_pytorch as ipex

            ipex.distributed.all_reduce(input_, group=self.device_group)
            return input_

        if not supports_custom_op():
            self._all_reduce_in_place(input_)
            return input_

        if self.tpu_communicator is not None and not self.tpu_communicator.disabled:
            # TPU handles Dynamo with its own logic.
            return self.tpu_communicator.all_reduce(input_)

        if self.hpu_communicator is not None and not self.hpu_communicator.disabled:
            return self.hpu_communicator.all_reduce(input_)

        if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
            return self.xpu_communicator.all_reduce(input_)

        if (
            self.ca_comm is not None
            and not self.ca_comm.disabled
            and self.ca_comm.should_custom_ar(input_)
        ):
            return torch.ops.vllm.outplace_all_reduce(
                input_, group_name=self.unique_name, bsz_tensor=bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap
            )
        else:
            #assert self.ca_comm is not None
            #assert not self.ca_comm.disabled
            #assert self.ca_comm.should_custom_ar(input_)
            torch.ops.vllm.inplace_all_reduce(input_, group_name=self.unique_name)
            return input_

    def _all_reduce_out_place(self, input_: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> torch.Tensor:
        ca_comm = self.ca_comm
        assert ca_comm is not None
        assert not ca_comm.disabled
        out = ca_comm.custom_all_reduce(input_, bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap)
        assert out is not None
        return out

    def _all_reduce_in_place(self, input_: torch.Tensor) -> None:
        pynccl_comm = self.pynccl_comm
        if pynccl_comm is not None and not pynccl_comm.disabled:
            pynccl_comm.all_reduce(input_)
        else:
            torch.distributed.all_reduce(input_, group=self.device_group)

    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
        world_size = self.world_size
        # Bypass the function if we are using only 1 GPU.
        if world_size == 1:
            return input_
        assert (
            -input_.dim() <= dim < input_.dim()
        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"

        # For TPUs, use TPU communicator.
        tpu_comm = self.tpu_communicator
        if tpu_comm is not None and not tpu_comm.disabled:
            return tpu_comm.all_gather(input_, dim)

        # For HPUs, use HPU communicator.
        hpu_comm = self.hpu_communicator
        if hpu_comm is not None and not hpu_comm.disabled:
            return hpu_comm.all_gather(input_, dim)

        if dim < 0:
            # Convert negative dim to positive.
            dim += input_.dim()
        input_size = input_.size()
        # NOTE: we have to use concat-style all-gather here,
        # stack-style all-gather has compatibility issues with
        # torch.compile . see https://github.com/pytorch/pytorch/issues/138795
        output_size = (input_size[0] * world_size,) + input_size[1:]
        # Allocate output tensor.
        output_tensor = torch.empty(
            output_size, dtype=input_.dtype, device=input_.device
        )
        # All-gather.
        torch.distributed.all_gather_into_tensor(
            output_tensor, input_, group=self.device_group
        )
        # Reshape
        output_tensor = output_tensor.reshape((world_size,) + input_size)
        output_tensor = output_tensor.movedim(0, dim)
        output_tensor = output_tensor.reshape(
            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
        )
        return output_tensor

    def gather(
        self, input_: torch.Tensor, dst: int = 0, dim: int = -1
    ) -> Optional[torch.Tensor]:
        """
        NOTE: We assume that the input tensor is on the same device across
        all the ranks.
        NOTE: `dst` is the local rank of the destination rank.
        """
        world_size = self.world_size
        # Bypass the function if we are using only 1 GPU.
        if world_size == 1:
            return input_
        assert (
            -input_.dim() <= dim < input_.dim()
        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
        if dim < 0:
            # Convert negative dim to positive.
            dim += input_.dim()
        if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
            return self.xpu_communicator.gather(input_, self.rank_in_group, dst, dim)
        # Allocate output tensor.
        if self.rank_in_group == dst:
            gather_list = [torch.empty_like(input_) for _ in range(world_size)]
        else:
            gather_list = None
        # Gather.
        torch.distributed.gather(
            input_, gather_list, dst=self.ranks[dst], group=self.device_group
        )
        if self.rank_in_group == dst:
            output_tensor = torch.cat(gather_list, dim=dim)
        else:
            output_tensor = None
        return output_tensor

    def broadcast(self, input_: torch.Tensor, src: int = 0):
        """Broadcast the input tensor.
        NOTE: `src` is the local rank of the source rank.
        """
        assert src < self.world_size, f"Invalid src rank ({src})"

        # Bypass the function if we are using only 1 GPU.
        if self.world_size == 1:
            return input_
        # Broadcast.
        torch.distributed.broadcast(
            input_, src=self.ranks[src], group=self.device_group
        )
        return input_

    def broadcast_object(self, obj: Optional[Any] = None, src: int = 0):
        """Broadcast the input object.
        NOTE: `src` is the local rank of the source rank.
        """
        assert src < self.world_size, f"Invalid src rank ({src})"

        # Bypass the function if we are using only 1 GPU.
        if self.world_size == 1:
            return obj
        if self.mq_broadcaster is not None:
            assert src == 0, "Message queue broadcaster only supports src=0"
            return self.mq_broadcaster.broadcast_object(obj)
        if self.rank_in_group == src:
            torch.distributed.broadcast_object_list(
                [obj], src=self.ranks[src], group=self.cpu_group
            )
            return obj
        else:
            recv = [None]
            torch.distributed.broadcast_object_list(
                recv, src=self.ranks[src], group=self.cpu_group
            )
            return recv[0]

    def broadcast_object_list(
        self, obj_list: List[Any], src: int = 0, group: Optional[ProcessGroup] = None
    ):
        """Broadcast the input object list.
        NOTE: `src` is the local rank of the source rank.
        """
        assert src < self.world_size, f"Invalid src rank ({src})"

        # Bypass the function if we are using only 1 GPU.
        if self.world_size == 1:
            return obj_list
        # Broadcast.
        torch.distributed.broadcast_object_list(
            obj_list, src=self.ranks[src], group=self.device_group
        )
        return obj_list

    def send_object(self, obj: Any, dst: int) -> None:
        """Send the input object list to the destination rank."""
        """NOTE: `dst` is the local rank of the destination rank."""

        assert dst < self.world_size, f"Invalid dst rank ({dst})"

        assert dst != self.rank_in_group, (
            "Invalid destination rank. Destination rank is the same "
            "as the current rank."
        )

        # Serialize object to tensor and get the size as well
        object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8)

        size_tensor = torch.tensor(
            [object_tensor.numel()], dtype=torch.long, device="cpu"
        )

        # Send object size

        torch.distributed.send(size_tensor, dst=self.ranks[dst], group=self.cpu_group)

        # Send object
        torch.distributed.send(object_tensor, dst=self.ranks[dst], group=self.cpu_group)

        return None

    def recv_object(self, src: int) -> Any:
        """Receive the input object list from the source rank."""
        """NOTE: `src` is the local rank of the source rank."""

        assert src < self.world_size, f"Invalid src rank ({src})"

        assert (
            src != self.rank_in_group
        ), "Invalid source rank. Source rank is the same as the current rank."

        size_tensor = torch.empty(1, dtype=torch.long, device="cpu")

        # Receive object size
        rank_size = torch.distributed.recv(
            size_tensor, src=self.ranks[src], group=self.cpu_group
        )

        # Tensor to receive serialized objects into.
        object_tensor = torch.empty(  # type: ignore[call-overload]
            size_tensor.item(),  # type: ignore[arg-type]
            dtype=torch.uint8,
            device="cpu",
        )

        rank_object = torch.distributed.recv(
            object_tensor, src=self.ranks[src], group=self.cpu_group
        )

        assert (
            rank_object == rank_size
        ), "Received object sender rank does not match the size sender rank."

        obj = pickle.loads(object_tensor.numpy().tobytes())

        return obj

    def broadcast_tensor_dict(
        self,
        tensor_dict: Optional[Dict[str, Union[torch.Tensor, Any]]] = None,
        src: int = 0,
        group: Optional[ProcessGroup] = None,
        metadata_group: Optional[ProcessGroup] = None,
    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
        """Broadcast the input tensor dictionary.
        NOTE: `src` is the local rank of the source rank.
        """
        # Bypass the function if we are using only 1 GPU.
        if not torch.distributed.is_initialized() or self.world_size == 1:
            return tensor_dict

        group = self.device_group
        metadata_group = self.cpu_group
        assert src < self.world_size, f"Invalid src rank ({src})"

        rank_in_group = self.rank_in_group
        if rank_in_group == src:
            metadata_list: List[Tuple[Any, Any]] = []
            assert isinstance(
                tensor_dict, dict
            ), f"Expecting a dictionary, got {type(tensor_dict)}"
            metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
            # `metadata_list` lives in CPU memory.
            # `broadcast_object_list` has serialization & deserialization,
            # all happening on CPU. Therefore, we can use the CPU group.
            self.broadcast_object(metadata_list, src=src)
            async_handles = []
            for tensor in tensor_list:
                if tensor.numel() == 0:
                    # Skip broadcasting empty tensors.
                    continue
                if tensor.is_cpu:
                    # use metadata_group for CPU tensors
                    handle = torch.distributed.broadcast(
                        tensor, src=self.ranks[src], group=metadata_group, async_op=True
                    )
                else:
                    # use group for GPU tensors
                    handle = torch.distributed.broadcast(
                        tensor, src=self.ranks[src], group=group, async_op=True
                    )
                async_handles.append(handle)
            for async_handle in async_handles:
                async_handle.wait()

        else:
            metadata_list = self.broadcast_object(None, src=src)
            tensor_dict = {}
            async_handles = []
            for key, value in metadata_list:
                if isinstance(value, TensorMetadata):
                    tensor = torch.empty(
                        value.size, dtype=value.dtype, device=value.device
                    )
                    if tensor.numel() == 0:
                        # Skip broadcasting empty tensors.
                        tensor_dict[key] = tensor
                        continue
                    if tensor.is_cpu:
                        # use metadata_group for CPU tensors
                        handle = torch.distributed.broadcast(
                            tensor,
                            src=self.ranks[src],
                            group=metadata_group,
                            async_op=True,
                        )
                    else:
                        # use group for GPU tensors
                        handle = torch.distributed.broadcast(
                            tensor, src=self.ranks[src], group=group, async_op=True
                        )
                    async_handles.append(handle)
                    tensor_dict[key] = tensor
                else:
                    tensor_dict[key] = value
            for async_handle in async_handles:
                async_handle.wait()
        return tensor_dict

    def send_tensor_dict(
        self,
        tensor_dict: Dict[str, Union[torch.Tensor, Any]],
        dst: Optional[int] = None,
        all_gather_group: Optional["GroupCoordinator"] = None,
    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
        """Send the input tensor dictionary.
        NOTE: `dst` is the local rank of the source rank.
        """
        # Bypass the function if we are using only 1 GPU.
        if not torch.distributed.is_initialized() or self.world_size == 1:
            return tensor_dict

        all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
        all_gather_rank = (
            0 if all_gather_group is None else all_gather_group.rank_in_group
        )

        group = self.device_group
        metadata_group = self.cpu_group

        if dst is None:
            dst = (self.rank_in_group + 1) % self.world_size
        assert dst < self.world_size, f"Invalid dst rank ({dst})"

        metadata_list: List[Tuple[Any, Any]] = []
        assert isinstance(
            tensor_dict, dict
        ), f"Expecting a dictionary, got {type(tensor_dict)}"
        metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
        # `metadata_list` lives in CPU memory.
        # `send_object_list` has serialization & deserialization,
        # all happening on CPU. Therefore, we can use the CPU group.
        self.send_object(metadata_list, dst=dst)
        for tensor in tensor_list:
            if tensor.numel() == 0:
                # Skip sending empty tensors.
                continue

            # send-allgather: send only a slice, then do allgather.
            if all_gather_group is not None and tensor.numel() % all_gather_size == 0:
                tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]

            if tensor.is_cpu:
                # use metadata_group for CPU tensors
                torch.distributed.send(
                    tensor, dst=self.ranks[dst], group=metadata_group
                )
            else:
                # use group for GPU tensors
                torch.distributed.send(tensor, dst=self.ranks[dst], group=group)
        return None

    def recv_tensor_dict(
        self,
        src: Optional[int] = None,
        all_gather_group: Optional["GroupCoordinator"] = None,
    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
        """Recv the input tensor dictionary.
        NOTE: `src` is the local rank of the source rank.
        """
        # Bypass the function if we are using only 1 GPU.
        if not torch.distributed.is_initialized() or self.world_size == 1:
            return None

        all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
        all_gather_rank = (
            0 if all_gather_group is None else all_gather_group.rank_in_group
        )

        group = self.device_group
        metadata_group = self.cpu_group

        if src is None:
            src = (self.rank_in_group - 1) % self.world_size
        assert src < self.world_size, f"Invalid src rank ({src})"

        recv_metadata_list = self.recv_object(src=src)
        tensor_dict: Dict[str, Any] = {}
        for key, value in recv_metadata_list:
            if isinstance(value, TensorMetadata):
                tensor = torch.empty(value.size, dtype=value.dtype, device=value.device)
                if tensor.numel() == 0:
                    # Skip broadcasting empty tensors.
                    tensor_dict[key] = tensor
                    continue

                # send-allgather: send only a slice, then do allgather.
                use_all_gather = (
                    all_gather_group is not None
                    and tensor.numel() % all_gather_size == 0
                )

                if use_all_gather:
                    orig_shape = tensor.shape
                    tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]

                if tensor.is_cpu:
                    # use metadata_group for CPU tensors
                    torch.distributed.recv(
                        tensor, src=self.ranks[src], group=metadata_group
                    )
                else:
                    # use group for GPU tensors
                    torch.distributed.recv(tensor, src=self.ranks[src], group=group)
                if use_all_gather:
                    # do the allgather
                    tensor = all_gather_group.all_gather(tensor, dim=0)  # type: ignore
                    tensor = tensor.reshape(orig_shape)

                tensor_dict[key] = tensor
            else:
                tensor_dict[key] = value
        return tensor_dict

    def barrier(self):
        """Barrier synchronization among the group.
        NOTE: don't use `device_group` here! `barrier` in NCCL is
        terrible because it is internally a broadcast operation with
        secretly created GPU tensors. It is easy to mess up the current
        device. Use the CPU group instead.
        """
        torch.distributed.barrier(group=self.cpu_group)

    def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None:
        """Sends a tensor to the destination rank in a non-blocking way"""
        """NOTE: `dst` is the local rank of the destination rank."""
        if dst is None:
            dst = (self.rank_in_group + 1) % self.world_size

        pynccl_comm = self.pynccl_comm
        if pynccl_comm is not None and not pynccl_comm.disabled:
            pynccl_comm.send(tensor, dst)
        else:
            torch.distributed.send(tensor, self.ranks[dst], self.device_group)

    def recv(
        self, size: torch.Size, dtype: torch.dtype, src: Optional[int] = None
    ) -> torch.Tensor:
        """Receives a tensor from the source rank."""
        """NOTE: `src` is the local rank of the source rank."""
        if src is None:
            src = (self.rank_in_group - 1) % self.world_size

        tensor = torch.empty(size, dtype=dtype, device=self.device)
        pynccl_comm = self.pynccl_comm
        if pynccl_comm is not None and not pynccl_comm.disabled:
            pynccl_comm.recv(tensor, src)
        else:
            torch.distributed.recv(tensor, self.ranks[src], self.device_group)
        return tensor

    def destroy(self):
        if self.device_group is not None:
            torch.distributed.destroy_process_group(self.device_group)
            self.device_group = None
        if self.cpu_group is not None:
            torch.distributed.destroy_process_group(self.cpu_group)
            self.cpu_group = None
        if self.pynccl_comm is not None:
            self.pynccl_comm = None
        if self.ca_comm is not None:
            self.ca_comm = None
        if self.mq_broadcaster is not None:
            self.mq_broadcaster = None


_WORLD: Optional[GroupCoordinator] = None


def get_world_group() -> GroupCoordinator:
    assert _WORLD is not None, "world group is not initialized"
    return _WORLD


def init_world_group(
    ranks: List[int], local_rank: int, backend: str
) -> GroupCoordinator:
    return GroupCoordinator(
        group_ranks=[ranks],
        local_rank=local_rank,
        torch_distributed_backend=backend,
        use_pynccl=False,
        use_custom_allreduce=False,
        use_tpu_communicator=False,
        use_hpu_communicator=False,
        use_xpu_communicator=False,
        group_name="world",
    )


def init_model_parallel_group(
    group_ranks: List[List[int]],
    local_rank: int,
    backend: str,
    use_custom_allreduce: Optional[bool] = None,
    use_message_queue_broadcaster: bool = False,
    group_name: Optional[str] = None,
) -> GroupCoordinator:
    if use_custom_allreduce is None:
        use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
    return GroupCoordinator(
        group_ranks=group_ranks,
        local_rank=local_rank,
        torch_distributed_backend=backend,
        use_pynccl=True,
        use_custom_allreduce=use_custom_allreduce,
        use_tpu_communicator=True,
        use_hpu_communicator=True,
        use_xpu_communicator=True,
        use_message_queue_broadcaster=use_message_queue_broadcaster,
        group_name=group_name,
    )


_TP: Optional[GroupCoordinator] = None


def get_tp_group() -> GroupCoordinator:
    assert _TP is not None, "tensor model parallel group is not initialized"
    return _TP


# kept for backward compatibility
get_tensor_model_parallel_group = get_tp_group

_PP: Optional[GroupCoordinator] = None


def get_pp_group() -> GroupCoordinator:
    assert _PP is not None, "pipeline model parallel group is not initialized"
    return _PP


# kept for backward compatibility
get_pipeline_model_parallel_group = get_pp_group


@contextmanager
def graph_capture():
    """
    `graph_capture` is a context manager which should surround the code that
    is capturing the CUDA graph. Its main purpose is to ensure that the
    some operations will be run after the graph is captured, before the graph
    is replayed. It returns a `GraphCaptureContext` object which contains the
    necessary data for the graph capture. Currently, it only contains the
    stream that the graph capture is running on. This stream is set to the
    current CUDA stream when the context manager is entered and reset to the
    default stream when the context manager is exited. This is to ensure that
    the graph capture is running on a separate stream from the default stream,
    in order to explicitly distinguish the kernels to capture
    from other kernels possibly launched on background in the default stream.
    """
    with get_tp_group().graph_capture() as context, get_pp_group().graph_capture(
        context
    ):
        yield context


_ENABLE_CUSTOM_ALL_REDUCE = True


def set_custom_all_reduce(enable: bool):
    global _ENABLE_CUSTOM_ALL_REDUCE
    _ENABLE_CUSTOM_ALL_REDUCE = enable


def init_distributed_environment(
    world_size: int = -1,
    rank: int = -1,
    distributed_init_method: str = "env://",
    local_rank: int = -1,
    backend: str = "nccl",
):
    print(
        "world_size=%d rank=%d local_rank=%d " "distributed_init_method=%s backend=%s",
        world_size,
        rank,
        local_rank,
        distributed_init_method,
        backend,
    )
    if not torch.distributed.is_initialized():
        assert distributed_init_method is not None, (
            "distributed_init_method must be provided when initializing "
            "distributed environment"
        )
        # this backend is used for WORLD
        torch.distributed.init_process_group(
            backend=backend,
            init_method=distributed_init_method,
            world_size=world_size,
            rank=rank,
        )
    # set the local rank
    # local_rank is not available in torch ProcessGroup,
    # see https://github.com/pytorch/pytorch/issues/122816
    if local_rank == -1:
        # local rank not set, this usually happens in single-node
        # setting, where we can use rank as local rank
        if distributed_init_method == "env://":
            local_rank = envs.LOCAL_RANK
        else:
            local_rank = rank
    global _WORLD
    if _WORLD is None:
        ranks = list(range(torch.distributed.get_world_size()))
        _WORLD = init_world_group(ranks, local_rank, backend)
    else:
        assert (
            _WORLD.world_size == torch.distributed.get_world_size()
        ), "world group already initialized with a different world size"


def initialize_model_parallel(
    tensor_model_parallel_size: int = 1,
    pipeline_model_parallel_size: int = 1,
    backend: Optional[str] = None,
) -> None:
    """
    Initialize model parallel groups.

    Arguments:
        tensor_model_parallel_size: number of GPUs used for tensor model
            parallelism.
        pipeline_model_parallel_size: number of GPUs used for pipeline model
            parallelism.

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
    the model pipeline. The present function will
    create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
        4 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 pipeline model-parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size: int = torch.distributed.get_world_size()
    backend = backend or torch.distributed.get_backend(get_world_group().device_group)

    if world_size != tensor_model_parallel_size * pipeline_model_parallel_size:
        raise RuntimeError(
            f"world_size ({world_size}) is not equal to "
            f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
            f"pipeline_model_parallel_size ({pipeline_model_parallel_size})"
        )

    # Build the tensor model-parallel groups.
    num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
    global _TP
    assert _TP is None, "tensor model parallel group is already initialized"
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        ranks = list(
            range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
        )
        group_ranks.append(ranks)

    # message queue broadcaster is only used in tensor model parallel group
    _TP = init_model_parallel_group(
        group_ranks,
        get_world_group().local_rank,
        backend,
        use_message_queue_broadcaster=True,
        group_name="tp",
    )

    # Build the pipeline model-parallel groups.
    num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
    global _PP
    assert _PP is None, "pipeline model parallel group is already initialized"
    group_ranks = []
    for i in range(num_pipeline_model_parallel_groups):
        ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
        group_ranks.append(ranks)
    # pipeline parallel does not need custom allreduce
    _PP = init_model_parallel_group(
        group_ranks,
        get_world_group().local_rank,
        backend,
        use_custom_allreduce=False,
        group_name="pp",
    )


def ensure_model_parallel_initialized(
    tensor_model_parallel_size: int,
    pipeline_model_parallel_size: int,
    backend: Optional[str] = None,
) -> None:
    """Helper to initialize model parallel groups if they are not initialized,
    or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
    values if the model parallel groups are initialized.
    """
    backend = backend or torch.distributed.get_backend(get_world_group().device_group)
    if not model_parallel_is_initialized():
        initialize_model_parallel(
            tensor_model_parallel_size, pipeline_model_parallel_size, backend
        )
        return

    assert get_tensor_model_parallel_world_size() == tensor_model_parallel_size, (
        "tensor parallel group already initialized, but of unexpected size: "
        f"{get_tensor_model_parallel_world_size()=} vs. "
        f"{tensor_model_parallel_size=}"
    )
    pp_world_size = get_pp_group().world_size
    assert pp_world_size == pipeline_model_parallel_size, (
        "pipeline parallel group already initialized, but of unexpected size: "
        f"{pp_world_size=} vs. "
        f"{pipeline_model_parallel_size=}"
    )


def model_parallel_is_initialized():
    """Check if tensor and pipeline parallel groups are initialized."""
    return _TP is not None and _PP is not None


_TP_STATE_PATCHED = False


@contextmanager
def patch_tensor_parallel_group(tp_group: GroupCoordinator):
    """Patch the tp group temporarily until this function ends.

    This method is for draft workers of speculative decoding to run draft model
    with different tp degree from that of target model workers.

    Args:
        tp_group (GroupCoordinator): the tp group coordinator
    """
    global _TP_STATE_PATCHED
    assert not _TP_STATE_PATCHED, "Should not call when it's already patched"

    _TP_STATE_PATCHED = True
    old_tp_group = get_tp_group()
    global _TP
    _TP = tp_group
    try:
        yield
    finally:
        # restore the original state
        _TP_STATE_PATCHED = False
        _TP = old_tp_group


def get_tensor_model_parallel_world_size():
    """Return world size for the tensor model parallel group."""
    return get_tp_group().world_size


def get_tensor_model_parallel_rank():
    """Return my rank for the tensor model parallel group."""
    return get_tp_group().rank_in_group


def destroy_model_parallel():
    """Set the groups to none and destroy them."""
    global _TP
    if _TP:
        _TP.destroy()
    _TP = None

    global _PP
    if _PP:
        _PP.destroy()
    _PP = None


def destroy_distributed_environment():
    global _WORLD
    if _WORLD:
        _WORLD.destroy()
    _WORLD = None
    if torch.distributed.is_initialized():
        torch.distributed.destroy_process_group()


def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
    destroy_model_parallel()
    destroy_distributed_environment()
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    if shutdown_ray:
        import ray  # Lazy import Ray

        ray.shutdown()
    gc.collect()
    if not current_platform.is_cpu():
        torch.cuda.empty_cache()


def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]:
    """
    This is a collective operation that returns if each rank is in the same node
    as the source rank. It tests if processes are attached to the same
    memory system (shared access to shared memory).
    """
    assert (
        torch.distributed.get_backend(pg) != torch.distributed.Backend.NCCL
    ), "in_the_same_node_as should be tested with a non-NCCL group."
    # local rank inside the group
    rank = torch.distributed.get_rank(group=pg)
    world_size = torch.distributed.get_world_size(group=pg)

    # local tensor in each process to store the result
    is_in_the_same_node = torch.tensor([0] * world_size, dtype=torch.int32)

    # global ranks of the processes in the group
    ranks = torch.distributed.get_process_group_ranks(pg)

    magic_message = b"magic_message"
    shm = None

    try:
        with contextlib.suppress(OSError):
            if rank == source_rank:
                # create a shared memory segment
                shm = shared_memory.SharedMemory(create=True, size=128)
                shm.buf[: len(magic_message)] = magic_message
                torch.distributed.broadcast_object_list(
                    [shm.name], src=ranks[source_rank], group=pg
                )
                is_in_the_same_node[rank] = 1
            else:
                # try to open the shared memory segment
                recv = [None]
                torch.distributed.broadcast_object_list(
                    recv, src=ranks[source_rank], group=pg
                )
                name = recv[0]
                # fix to https://stackoverflow.com/q/62748654/9191338
                # Python incorrectly tracks shared memory even if it is not
                # created by the process. The following patch is a workaround.
                with patch(
                    "multiprocessing.resource_tracker.register",
                    lambda *args, **kwargs: None,
                ):
                    shm = shared_memory.SharedMemory(name=name)
                if shm.buf[: len(magic_message)] == magic_message:
                    is_in_the_same_node[rank] = 1
    except Exception as e:
        print("Error ignored in is_in_the_same_node: %s", e)
    finally:
        if shm:
            shm.close()

    torch.distributed.barrier(group=pg)

    # clean up the shared memory segment
    with contextlib.suppress(OSError):
        if rank == source_rank and shm:
            shm.unlink()
    torch.distributed.all_reduce(is_in_the_same_node, group=pg)

    return [x == 1 for x in is_in_the_same_node.tolist()]


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/distributed/pynccl.py
================================================
from contextlib import contextmanager
from typing import Optional, Union

# ===================== import region =====================
import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup, ReduceOp

from server.inference.distributed.pynccl_wrapper import (
    NCCLLibrary,
    buffer_type,
    cudaStream_t,
    ncclComm_t,
    ncclDataTypeEnum,
    ncclRedOpTypeEnum,
    ncclUniqueId,
)
from server.inference.distributed.utils import StatelessProcessGroup


class PyNcclCommunicator:

    def __init__(
        self,
        group: Union[ProcessGroup, StatelessProcessGroup],
        device: Union[int, str, torch.device],
        library_path: Optional[str] = None,
    ):
        """
        Args:
            group: the process group to work on. If None, it will use the
                default process group.
            device: the device to bind the PyNcclCommunicator to. If None,
                it will be bind to f"cuda:{local_rank}".
            library_path: the path to the NCCL library. If None, it will
                use the default library path.
        It is the caller's responsibility to make sure each communicator
        is bind to a unique device.
        """
        if not isinstance(group, StatelessProcessGroup):
            assert dist.is_initialized()
            assert (
                dist.get_backend(group) != dist.Backend.NCCL
            ), "PyNcclCommunicator should be attached to a non-NCCL group."
            # note: this rank is the rank in the group
            self.rank = dist.get_rank(group)
            self.world_size = dist.get_world_size(group)
        else:
            self.rank = group.rank
            self.world_size = group.world_size

        self.group = group

        # if world_size == 1, no need to create communicator
        if self.world_size == 1:
            self.available = False
            self.disabled = True
            self.stream = None
            return
        try:
            self.nccl = NCCLLibrary(library_path)
        except Exception:
            # disable because of missing NCCL library
            # e.g. in a non-GPU environment
            self.available = False
            self.disabled = True
            self.stream = None
            return

        self.available = True
        self.disabled = False

        print("vLLM is using nccl==%s", self.nccl.ncclGetVersion())

        if self.rank == 0:
            # get the unique id from NCCL
            self.unique_id = self.nccl.ncclGetUniqueId()
        else:
            # construct an empty unique id
            self.unique_id = ncclUniqueId()

        if not isinstance(group, StatelessProcessGroup):
            tensor = torch.ByteTensor(list(self.unique_id.internal))
            ranks = dist.get_process_group_ranks(group)
            # arg `src` in `broadcast` is the global rank
            dist.broadcast(tensor, src=ranks[0], group=group)
            byte_list = tensor.tolist()
            for i, byte in enumerate(byte_list):
                self.unique_id.internal[i] = byte
        else:
            self.unique_id = group.broadcast_obj(self.unique_id, src=0)
        if isinstance(device, int):
            device = torch.device(f"cuda:{device}")
        elif isinstance(device, str):
            device = torch.device(device)
        # now `device` is a `torch.device` object
        assert isinstance(device, torch.device)
        self.device = device
        # nccl communicator and stream will use this device
        # `torch.cuda.device` is a context manager that changes the
        # current cuda device to the specified one
        with torch.cuda.device(device):
            self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
                self.world_size, self.unique_id, self.rank
            )
            self.stream = torch.cuda.Stream()

            # A small all_reduce for warmup.
            data = torch.zeros(1, device=device)
            self.all_reduce(data)
            self.stream.synchronize()
            del data

        # by default it is disabled, e.g. in profiling models and prefill phase.
        # to use it, use under `with obj.change_state(enable=True)`, usually
        # when we are using CUDA graph.
        self.disabled = True

    def all_reduce(
        self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, stream=None
    ):
        if self.disabled:
            return
        # nccl communicator created on a specific device
        # will only work on tensors on the same device
        # otherwise it will cause "illegal memory access"
        assert tensor.device == self.device, (
            f"this nccl communicator is created to work on {self.device}, "
            f"but the input tensor is on {tensor.device}"
        )
        if stream is None:
            stream = self.stream
        self.nccl.ncclAllReduce(
            buffer_type(tensor.data_ptr()),
            buffer_type(tensor.data_ptr()),
            tensor.numel(),
            ncclDataTypeEnum.from_torch(tensor.dtype),
            ncclRedOpTypeEnum.from_torch(op),
            self.comm,
            cudaStream_t(stream.cuda_stream),
        )

    def send(self, tensor: torch.Tensor, dst: int, stream=None):
        if self.disabled:
            return
        assert tensor.device == self.device, (
            f"this nccl communicator is created to work on {self.device}, "
            f"but the input tensor is on {tensor.device}"
        )
        if stream is None:
            stream = self.stream
        self.nccl.ncclSend(
            buffer_type(tensor.data_ptr()),
            tensor.numel(),
            ncclDataTypeEnum.from_torch(tensor.dtype),
            dst,
            self.comm,
            cudaStream_t(stream.cuda_stream),
        )

    def recv(self, tensor: torch.Tensor, src: int, stream=None):
        if self.disabled:
            return
        assert tensor.device == self.device, (
            f"this nccl communicator is created to work on {self.device}, "
            f"but the input tensor is on {tensor.device}"
        )
        if stream is None:
            stream = self.stream
        self.nccl.ncclRecv(
            buffer_type(tensor.data_ptr()),
            tensor.numel(),
            ncclDataTypeEnum.from_torch(tensor.dtype),
            src,
            self.comm,
            cudaStream_t(stream.cuda_stream),
        )

    @contextmanager
    def change_state(
        self, enable: Optional[bool] = None, stream: Optional[torch.cuda.Stream] = None
    ):
        """
        A context manager to change the state of the communicator.
        """
        if enable is None:
            # guess a default value when not specified
            enable = self.available

        if stream is None:
            stream = self.stream

        old_disable = self.disabled
        old_stream = self.stream

        self.stream = stream
        self.disabled = not enable
        yield

        self.disabled = old_disable
        self.stream = old_stream


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/distributed/pynccl_wrapper.py
================================================
# This file is a pure Python wrapper for the NCCL library.
# The main purpose is to use NCCL combined with CUDA graph.
# Before writing this script, we tried the following approach:
# 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself
#  often gets stuck when initializing the NCCL communicator.
# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce`
#  contains many other potential cuda APIs, that are not allowed during
#  capturing the CUDA graph. For further details, please check
# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ .
#
# Another rejected idea is to write a C/C++ binding for NCCL. It is usually
# doable, but we often encounter issues related with nccl versions, and need
# to switch between different versions of NCCL. See
# https://github.com/NVIDIA/nccl/issues/1234 for more details.
# A C/C++ binding is not flexible enough to handle this. It requires
# recompilation of the code every time we want to switch between different
# versions. This current implementation, with a **pure** Python wrapper, is
# more flexible. We can easily switch between different versions of NCCL by
# changing the environment variable `VLLM_NCCL_SO_PATH`, or the `so_file`
# variable in the code.

import ctypes
import platform
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

import torch
from torch.distributed import ReduceOp

from server.utils import find_nccl_library


# === export types and functions from nccl to Python ===
# for the original nccl definition, please check
# https://github.com/NVIDIA/nccl/blob/master/src/nccl.h.in

ncclResult_t = ctypes.c_int
ncclComm_t = ctypes.c_void_p


class ncclUniqueId(ctypes.Structure):
    _fields_ = [("internal", ctypes.c_byte * 128)]


cudaStream_t = ctypes.c_void_p
buffer_type = ctypes.c_void_p

ncclDataType_t = ctypes.c_int


class ncclDataTypeEnum:
    ncclInt8 = 0
    ncclChar = 0
    ncclUint8 = 1
    ncclInt32 = 2
    ncclInt = 2
    ncclUint32 = 3
    ncclInt64 = 4
    ncclUint64 = 5
    ncclFloat16 = 6
    ncclHalf = 6
    ncclFloat32 = 7
    ncclFloat = 7
    ncclFloat64 = 8
    ncclDouble = 8
    ncclBfloat16 = 9
    ncclNumTypes = 10

    @classmethod
    def from_torch(cls, dtype: torch.dtype) -> int:
        if dtype == torch.int8:
            return cls.ncclInt8
        if dtype == torch.uint8:
            return cls.ncclUint8
        if dtype == torch.int32:
            return cls.ncclInt32
        if dtype == torch.int64:
            return cls.ncclInt64
        if dtype == torch.float16:
            return cls.ncclFloat16
        if dtype == torch.float32:
            return cls.ncclFloat32
        if dtype == torch.float64:
            return cls.ncclFloat64
        if dtype == torch.bfloat16:
            return cls.ncclBfloat16
        raise ValueError(f"Unsupported dtype: {dtype}")


ncclRedOp_t = ctypes.c_int


class ncclRedOpTypeEnum:
    ncclSum = 0
    ncclProd = 1
    ncclMax = 2
    ncclMin = 3
    ncclAvg = 4
    ncclNumOps = 5

    @classmethod
    def from_torch(cls, op: ReduceOp) -> int:
        if op == ReduceOp.SUM:
            return cls.ncclSum
        if op == ReduceOp.PRODUCT:
            return cls.ncclProd
        if op == ReduceOp.MAX:
            return cls.ncclMax
        if op == ReduceOp.MIN:
            return cls.ncclMin
        if op == ReduceOp.AVG:
            return cls.ncclAvg
        raise ValueError(f"Unsupported op: {op}")


@dataclass
class Function:
    name: str
    restype: Any
    argtypes: List[Any]


class NCCLLibrary:
    exported_functions = [
        # const char* ncclGetErrorString(ncclResult_t result)
        Function("ncclGetErrorString", ctypes.c_char_p, [ncclResult_t]),
        # ncclResult_t  ncclGetVersion(int *version);
        Function("ncclGetVersion", ncclResult_t,
                 [ctypes.POINTER(ctypes.c_int)]),
        # ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
        Function("ncclGetUniqueId", ncclResult_t,
                 [ctypes.POINTER(ncclUniqueId)]),
        # ncclResult_t  ncclCommInitRank(
        #   ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
        # note that ncclComm_t is a pointer type, so the first argument
        # is a pointer to a pointer
        Function("ncclCommInitRank", ncclResult_t, [
            ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId,
            ctypes.c_int
        ]),
        # ncclResult_t  ncclAllReduce(
        #   const void* sendbuff, void* recvbuff, size_t count,
        #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
        #   cudaStream_t stream);
        # note that cudaStream_t is a pointer type, so the last argument
        # is a pointer
        Function("ncclAllReduce", ncclResult_t, [
            buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
            ncclRedOp_t, ncclComm_t, cudaStream_t
        ]),

        # ncclResult_t  ncclSend(
        #   const void* sendbuff, size_t count, ncclDataType_t datatype,
        #   int dest, ncclComm_t comm, cudaStream_t stream);
        Function("ncclSend", ncclResult_t, [
            buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int,
            ncclComm_t, cudaStream_t
        ]),

        # ncclResult_t  ncclRecv(
        #   void* recvbuff, size_t count, ncclDataType_t datatype,
        #   int src, ncclComm_t comm, cudaStream_t stream);
        Function("ncclRecv", ncclResult_t, [
            buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int,
            ncclComm_t, cudaStream_t
        ]),

        # be cautious! this is a collective call, it will block until all
        # processes in the communicator have called this function.
        # because Python object destruction can happen in random order,
        # it is better not to call it at all.
        # ncclResult_t  ncclCommDestroy(ncclComm_t comm);
        Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
    ]

    # class attribute to store the mapping from the path to the library
    # to avoid loading the same library multiple times
    path_to_library_cache: Dict[str, Any] = {}

    # class attribute to store the mapping from library path
    #  to the corresponding dictionary
    path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}

    def __init__(self, so_file: Optional[str] = None):

        so_file = so_file or find_nccl_library()

        try:
            if so_file not in NCCLLibrary.path_to_dict_mapping:
                lib = ctypes.CDLL(so_file)
                NCCLLibrary.path_to_library_cache[so_file] = lib
            self.lib = NCCLLibrary.path_to_library_cache[so_file]
        except Exception as e:
            print(
                "Failed to load NCCL library from %s ."
                "It is expected if you are not running on NVIDIA/AMD GPUs."
                "Otherwise, the nccl library might not exist, be corrupted "
                "or it does not support the current platform %s."
                "If you already have the library, please set the "
                "environment variable VLLM_NCCL_SO_PATH"
                " to point to the correct nccl library path.", so_file,
                platform.platform())
            raise e

        if so_file not in NCCLLibrary.path_to_dict_mapping:
            _funcs: Dict[str, Any] = {}
            for func in NCCLLibrary.exported_functions:
                f = getattr(self.lib, func.name)
                f.restype = func.restype
                f.argtypes = func.argtypes
                _funcs[func.name] = f
            NCCLLibrary.path_to_dict_mapping[so_file] = _funcs
        self._funcs = NCCLLibrary.path_to_dict_mapping[so_file]

    def ncclGetErrorString(self, result: ncclResult_t) -> str:
        return self._funcs["ncclGetErrorString"](result).decode("utf-8")

    def NCCL_CHECK(self, result: ncclResult_t) -> None:
        if result != 0:
            error_str = self.ncclGetErrorString(result)
            raise RuntimeError(f"NCCL error: {error_str}")

    def ncclGetVersion(self) -> str:
        version = ctypes.c_int()
        self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version)))
        version_str = str(version.value)
        # something like 21903 --> "2.19.3"
        major = version_str[0].lstrip("0")
        minor = version_str[1:3].lstrip("0")
        patch = version_str[3:].lstrip("0")
        return f"{major}.{minor}.{patch}"

    def ncclGetUniqueId(self) -> ncclUniqueId:
        unique_id = ncclUniqueId()
        self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](
            ctypes.byref(unique_id)))
        return unique_id

    def ncclCommInitRank(self, world_size: int, unique_id: ncclUniqueId,
                         rank: int) -> ncclComm_t:
        comm = ncclComm_t()
        self.NCCL_CHECK(self._funcs["ncclCommInitRank"](ctypes.byref(comm),
                                                        world_size, unique_id,
                                                        rank))
        return comm

    def ncclAllReduce(self, sendbuff: buffer_type, recvbuff: buffer_type,
                      count: int, datatype: int, op: int, comm: ncclComm_t,
                      stream: cudaStream_t) -> None:
        # `datatype` actually should be `ncclDataType_t`
        # and `op` should be `ncclRedOp_t`
        # both are aliases of `ctypes.c_int`
        # when we pass int to a function, it will be converted to `ctypes.c_int`
        # by ctypes automatically
        self.NCCL_CHECK(self._funcs["ncclAllReduce"](sendbuff, recvbuff, count,
                                                     datatype, op, comm,
                                                     stream))

    def ncclSend(self, sendbuff: buffer_type, count: int, datatype: int,
                 dest: int, comm: ncclComm_t, stream: cudaStream_t) -> None:
        self.NCCL_CHECK(self._funcs["ncclSend"](sendbuff, count, datatype,
                                                dest, comm, stream))

    def ncclRecv(self, recvbuff: buffer_type, count: int, datatype: int,
                 src: int, comm: ncclComm_t, stream: cudaStream_t) -> None:
        self.NCCL_CHECK(self._funcs["ncclRecv"](recvbuff, count, datatype, src,
                                                comm, stream))

    def ncclCommDestroy(self, comm: ncclComm_t) -> None:
        self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))


__all__ = [
    "NCCLLibrary", "ncclDataTypeEnum", "ncclRedOpTypeEnum", "ncclUniqueId",
    "ncclComm_t", "cudaStream_t", "buffer_type"
]


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/distributed/utils.py
================================================
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import dataclasses
import pickle
import time
from collections import deque
from typing import Any, Deque, Dict, Optional, Sequence, Tuple

import torch
from torch.distributed import TCPStore

import server.envs as envs


def ensure_divisibility(numerator, denominator):
    """Ensure that numerator is divisible by the denominator."""
    assert numerator % denominator == 0, "{} is not divisible by {}".format(
        numerator, denominator
    )


def divide(numerator, denominator):
    """Ensure that numerator is divisible by the denominator and return
    the division value."""
    ensure_divisibility(numerator, denominator)
    return numerator // denominator


def split_tensor_along_last_dim(
    tensor: torch.Tensor,
    num_partitions: int,
    contiguous_split_chunks: bool = False,
) -> Sequence[torch.Tensor]:
    """Split a tensor along its last dimension.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.

    Returns:
        A list of Tensors
    """
    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
    # Split.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    # NOTE: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)

    return tensor_list


def get_pp_indices(
    num_hidden_layers: int, pp_rank: int, pp_size: int
) -> Tuple[int, int]:
    """Try to evenly distribute layers across partitions.
    If the number of layers is not divisible by the number of partitions,
    the last partition will have the remaining layers.
    """
    partition_list_str = envs.VLLM_PP_LAYER_PARTITION
    if partition_list_str is not None:
        try:
            partitions = [int(layer) for layer in partition_list_str.split(",")]
        except ValueError as err:
            raise ValueError(
                "Invalid partition string: {}".format(partition_list_str)
            ) from err
        if len(partitions) != pp_size:
            raise ValueError(f"{len(partitions)=} does not match {pp_size=}.")
        if sum(partitions) != num_hidden_layers:
            raise ValueError(f"{sum(partitions)=} does not match {num_hidden_layers=}.")
        start_layer = sum(partitions[:pp_rank])
        end_layer = start_layer + partitions[pp_rank]
    else:
        layers_per_partition = num_hidden_layers // pp_size
        start_layer = pp_rank * layers_per_partition
        end_layer = start_layer + layers_per_partition

        if pp_rank == pp_size - 1:
            end_layer = num_hidden_layers

    return (start_layer, end_layer)


@dataclasses.dataclass
class StatelessProcessGroup:
    """A dataclass to hold a metadata store, and the rank, world_size of the
    group. Only use it to communicate metadata between processes.
    For data-plane communication, create NCCL-related objects.
    """

    rank: int
    world_size: int
    store: torch._C._distributed_c10d.Store
    data_expiration_seconds: int = 3600  # 1 hour

    # dst rank -> counter
    send_dst_counter: Dict[int, int] = dataclasses.field(default_factory=dict)
    # src rank -> counter
    recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict)
    broadcast_send_counter: int = 0
    broadcast_recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict)

    # A deque to store the data entries, with key and timestamp.
    entries: Deque[Tuple[str, float]] = dataclasses.field(default_factory=deque)

    def __post_init__(self):
        assert self.rank < self.world_size
        self.send_dst_counter = {i: 0 for i in range(self.world_size)}
        self.recv_src_counter = {i: 0 for i in range(self.world_size)}
        self.broadcast_recv_src_counter = {i: 0 for i in range(self.world_size)}

    def send_obj(self, obj: Any, dst: int):
        """Send an object to a destination rank."""
        self.expire_data()
        key = f"send_to/{dst}/{self.send_dst_counter[dst]}"
        self.store.set(key, pickle.dumps(obj))
        self.send_dst_counter[dst] += 1
        self.entries.append((key, time.time()))

    def expire_data(self):
        """Expire data that is older than `data_expiration_seconds` seconds."""
        while self.entries:
            # check the oldest entry
            key, timestamp = self.entries[0]
            if time.time() - timestamp > self.data_expiration_seconds:
                self.store.delete_key(key)
                self.entries.popleft()
            else:
                break

    def recv_obj(self, src: int) -> Any:
        """Receive an object from a source rank."""
        obj = pickle.loads(
            self.store.get(f"send_to/{self.rank}/{self.recv_src_counter[src]}")
        )
        self.recv_src_counter[src] += 1
        return obj

    def broadcast_obj(self, obj: Optional[Any], src: int) -> Any:
        """Broadcast an object from a source rank to all other ranks.
        It does not clean up after all ranks have received the object.
        Use it for limited times, e.g., for initialization.
        """
        if self.rank == src:
            self.expire_data()
            key = f"broadcast_from/{src}/" f"{self.broadcast_send_counter}"
            self.store.set(key, pickle.dumps(obj))
            self.broadcast_send_counter += 1
            self.entries.append((key, time.time()))
            return obj
        else:
            key = f"broadcast_from/{src}/" f"{self.broadcast_recv_src_counter[src]}"
            recv_obj = pickle.loads(self.store.get(key))
            self.broadcast_recv_src_counter[src] += 1
            return recv_obj

    def all_gather_obj(self, obj: Any) -> list[Any]:
        """All gather an object from all ranks."""
        gathered_objs = []
        for i in range(self.world_size):
            if i == self.rank:
                gathered_objs.append(obj)
                self.broadcast_obj(obj, src=self.rank)
            else:
                recv_obj = self.broadcast_obj(None, src=i)
                gathered_objs.append(recv_obj)
        return gathered_objs

    def barrier(self):
        """A barrier to synchronize all ranks."""
        for i in range(self.world_size):
            if i == self.rank:
                self.broadcast_obj(None, src=self.rank)
            else:
                self.broadcast_obj(None, src=i)

    @staticmethod
    def create(
        host: str,
        port: int,
        rank: int,
        world_size: int,
        data_expiration_seconds: int = 3600,
    ) -> "StatelessProcessGroup":
        """A replacement for `torch.distributed.init_process_group` that does not
        pollute the global state.

        If we have process A and process B called `torch.distributed.init_process_group`
        to form a group, and then we want to form another group with process A, B, C,
        D, it is not possible in PyTorch, because process A and process B have already
        formed a group, and process C and process D cannot join that group. This
        function is a workaround for this issue.

        `torch.distributed.init_process_group` is a global call, while this function
        is a stateless call. It will return a `StatelessProcessGroup` object that can be
        used for exchanging metadata. With this function, process A and process B
        can call `StatelessProcessGroup.create` to form a group, and then process A, B,
        C, and D can call `StatelessProcessGroup.create` to form another group.
        """  # noqa
        store = TCPStore(
            host_name=host,
            port=port,
            world_size=world_size,
            is_master=(rank == 0),
        )

        return StatelessProcessGroup(
            rank=rank,
            world_size=world_size,
            store=store,
            data_expiration_seconds=data_expiration_seconds,
        )


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/forward_batch.py
================================================
'''
Date: 2024-11-12 14:15:16
LastEditors: Xie Weiyu ervinxie@qq.com
LastEditTime: 2024-11-26 08:12:49
'''
import torch
from ktransformers.server.balance_serve.settings import sched_ext
from ktransformers.server.balance_serve.inference.query_manager import QueryManager, QueryInfo
import time
from ktransformers.server.config.config import Config
class ForwardBatchInput:

    class ForwardMiniBatch:
        q_indptr: torch.Tensor
        kv_indptr: torch.Tensor
        kv_indices: torch.Tensor
        kv_last_page_len: torch.Tensor
        kv_len: torch.Tensor
        position_ids: torch.Tensor
        tokens: torch.Tensor
        batch_indices: torch.Tensor
        positions: torch.Tensor
        chunk_size: int
        decode_batch: int        
        is_last_prefill_chunk: bool
        logits_start: list

        temperatures: torch.Tensor
        top_ps: torch.Tensor

        def __init__(self, prefill_querys_info: list[QueryInfo], decode_querys_info: list[QueryInfo], prefill_s: list[int] = None, prefill_l: list[int] = None, device = torch.device('cuda'), page_size = 256):
            batch_decode = len(decode_querys_info)
            batch_prefill = len(prefill_querys_info)

            self.q_indptr = torch.tensor([0], device=device, dtype=torch.int32)
            self.kv_indptr = torch.tensor([0], device=device, dtype=torch.int32)
            self.kv_indices = torch.tensor([], device=device, dtype=torch.int32)
            self.kv_len = torch.tensor([], device=device, dtype=torch.int32)
            self.kv_last_page_len = torch.tensor([], device=device, dtype=torch.int32)
            self.position_ids = torch.tensor([], device=device, dtype=torch.int32)
            self.tokens = torch.tensor([], device=device, dtype=torch.int32)

            self.temperatures = torch.tensor([], device=device, dtype=torch.float32)
            self.top_ps = torch.tensor([], device=device, dtype=torch.float32)

            self.logits_start = []
            self.decode_batch = batch_decode
            self.num_tokens = batch_decode + sum(prefill_l)
            self.batch_size = batch_decode + batch_prefill
            
            for i, prefill_query_info in enumerate(prefill_querys_info):
                if prefill_query_info != None:
                    prefill_kv_block_len = (prefill_query_info.active_position + prefill_l[i] + page_size - 1) // page_size if prefill_query_info is not None else 0
                    # print(f"block_len: {prefill_kv_block_len}, page_size: {page_size}")
                    self.q_indptr = torch.concat((self.q_indptr, torch.tensor([prefill_l[i] + self.q_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
                    self.kv_indptr = torch.concat((self.kv_indptr, torch.tensor([prefill_kv_block_len + self.kv_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
                    self.kv_indices = torch.concat((self.kv_indices, prefill_query_info.block_index[:prefill_kv_block_len]), dim=0)
                    self.kv_last_page_len = torch.concat((self.kv_last_page_len, torch.tensor([(prefill_query_info.active_position + prefill_l[i]) % page_size if (prefill_query_info.active_position + prefill_l[i]) % page_size != 0 else page_size], device=device, dtype=torch.int32)), dim=0)
                    self.kv_len = torch.concat((self.kv_len, torch.tensor([(prefill_query_info.active_position + prefill_l[i])], device=device, dtype=torch.int32)), dim=0)
                    self.position_ids = torch.concat((self.position_ids, torch.arange(prefill_s[i], prefill_l[i] + prefill_s[i], device=device, dtype=torch.int32)), dim=0)
                    self.tokens = torch.concat((self.tokens, prefill_query_info.query_tokens[prefill_s[i]:prefill_s[i] + prefill_l[i]]), dim=0)
                    self.logits_start.append(prefill_l[i] - 1 if len(self.logits_start) == 0 else sum(prefill_l[:i+1])-1)

                    self.temperatures = torch.concat((self.temperatures, torch.tensor([prefill_query_info.temperature], device=device, dtype=torch.float32)), dim=0)
                    self.top_ps = torch.concat((self.top_ps, torch.tensor([prefill_query_info.top_p], device=device, dtype=torch.float32)), dim=0)

            for decode_query_info in decode_querys_info:
                decode_kv_block_len = (decode_query_info.active_position + 1 + page_size - 1) // page_size
                self.q_indptr = torch.concat((self.q_indptr, torch.tensor([1 + self.q_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
                self.kv_indptr = torch.concat((self.kv_indptr, torch.tensor([decode_kv_block_len+self.kv_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
                self.kv_indices = torch.concat((self.kv_indices, decode_query_info.block_index[:decode_kv_block_len]), dim=0)
                self.kv_last_page_len = torch.concat((self.kv_last_page_len, torch.tensor([(decode_query_info.active_position + 1) % page_size if (decode_query_info.active_position + 1) % page_size != 0 else page_size], device=device, dtype=torch.int32)), dim=0)
                self.kv_len = torch.concat((self.kv_len, torch.tensor([(decode_query_info.active_position + 1)], device=device, dtype=torch.int32)), dim=0)
                self.position_ids = torch.concat((self.position_ids, torch.arange(decode_query_info.active_position, decode_query_info.active_position + 1, device=device, dtype=torch.int32)), dim=0)
                if decode_query_info.active_position > 0:
                    self.tokens = torch.concat((self.tokens, decode_query_info.query_tokens[decode_query_info.active_position:decode_query_info.active_position+1]), dim=0)
                else: 
                    self.tokens = torch.concat((self.tokens, torch.tensor([0], device=device, dtype=torch.int32)), dim=0)
                self.logits_start.append(0 if len(self.logits_start) == 0 else self.logits_start[-1]+1)

                self.temperatures = torch.concat((self.temperatures, torch.tensor([decode_query_info.temperature], device=device, dtype=torch.float32)), dim=0)
                self.top_ps = torch.concat((self.top_ps, torch.tensor([decode_query_info.top_p], device=device, dtype=torch.float32)), dim=0)

            self.q_indptr = self.q_indptr.contiguous()
            self.kv_indptr = self.kv_indptr.contiguous()
            self.kv_indices = self.kv_indices.contiguous()
            self.kv_len = self.kv_len.contiguous()
            self.kv_last_page_len = self.kv_last_page_len.contiguous()
            self.position_ids = self.position_ids.contiguous()
            self.tokens = self.tokens.contiguous()

            self.bsz_tensor = torch.tensor([self.batch_size], device=device, dtype=torch.int32)

        def fill(self, prefill_querys_info: list[QueryInfo], decode_querys_info: list[QueryInfo], prefill_s: list[int] = None, prefill_l: list[int] = None, device = torch.device('cuda'), page_size = 256):
            batch_decode = len(decode_querys_info)
            batch_prefill = len(prefill_querys_info)

            self.q_indptr = torch.tensor([0], device=device, dtype=torch.int32)
            self.kv_indptr = torch.tensor([0], device=device, dtype=torch.int32)
            self.kv_indices = torch.tensor([], device=device, dtype=torch.int32)
            self.kv_len = torch.tensor([], device=device, dtype=torch.int32)
            self.kv_last_page_len = torch.tensor([], device=device, dtype=torch.int32)
            new_position_ids = torch.tensor([], device=device, dtype=torch.int32)
            new_tokens = torch.tensor([], device=device, dtype=torch.int32)

            self.temperatures = torch.tensor([], device=device, dtype=torch.float32)
            self.top_ps = torch.tensor([], device=device, dtype=torch.float32)

            self.logits_start = []
            self.decode_batch = batch_decode
            self.num_tokens = batch_decode + sum(prefill_l)
            self.batch_size = batch_decode + batch_prefill

            for i, prefill_query_info in enumerate(prefill_querys_info):
                prefill_kv_block_len = (prefill_query_info.active_position + prefill_l[i] + page_size - 1) // page_size if prefill_query_info is not None else 0
            # print(f"block_len: {prefill_kv_block_len}, page_size: {page_size}")
                self.q_indptr = torch.concat((self.q_indptr, torch.tensor([prefill_l[i] + self.q_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
                self.kv_indptr = torch.concat((self.kv_indptr, torch.tensor([prefill_kv_block_len + self.kv_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
                self.kv_indices = torch.concat((self.kv_indices, prefill_query_info.block_index[:prefill_kv_block_len]), dim=0)
                self.kv_last_page_len = torch.concat((self.kv_last_page_len, torch.tensor([(prefill_query_info.active_position + prefill_l[i]) % page_size if (prefill_query_info.active_position + prefill_l[i]) % page_size != 0 else page_size], device=device, dtype=torch.int32)), dim=0)
                self.kv_len = torch.concat((self.kv_len, torch.tensor([(prefill_query_info.active_position + prefill_l[i])], device=device, dtype=torch.int32)), dim=0)
                new_position_ids = torch.concat((new_position_ids, torch.arange(prefill_s[i], prefill_l[i] + prefill_s[i], device=device, dtype=torch.int32)), dim=0)
                new_tokens = torch.concat((new_tokens, prefill_query_info.query_tokens[prefill_s[i]:prefill_s[i] + prefill_l[i]]), dim=0)
                self.logits_start.append(prefill_l[i] - 1 if len(self.logits_start) == 0 else sum(prefill_l[:i+1])-1)

                self.temperatures = torch.concat((self.temperatures, torch.tensor([prefill_query_info.temperature], device=device, dtype=torch.float32)), dim=0)
                self.top_ps = torch.concat((self.top_ps, torch.tensor([prefill_query_info.top_p], device=device, dtype=torch.float32)), dim=0)


            for decode_query_info in decode_querys_info:
                decode_kv_block_len = (decode_query_info.active_position + 1 + page_size - 1) // page_size
                self.q_indptr = torch.concat((self.q_indptr, torch.tensor([1 + self.q_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
                self.kv_indptr = torch.concat((self.kv_indptr, torch.tensor([decode_kv_block_len+self.kv_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
                self.kv_indices = torch.concat((self.kv_indices, decode_query_info.block_index[:decode_kv_block_len]), dim=0)
                self.kv_last_page_len = torch.concat((self.kv_last_page_len, torch.tensor([(decode_query_info.active_position + 1) % page_size if (decode_query_info.active_position + 1) % page_size != 0 else page_size], device=device, dtype=torch.int32)), dim=0)
                self.kv_len = torch.concat((self.kv_len, torch.tensor([(decode_query_info.active_position + 1)], device=device, dtype=torch.int32)), dim=0)
                new_position_ids = torch.concat((new_position_ids, torch.arange(decode_query_info.active_position, decode_query_info.active_position + 1, device=device, dtype=torch.int32)), dim=0)
                if decode_query_info.active_position > 0:
                    new_tokens = torch.concat((new_tokens, decode_query_info.query_tokens[decode_query_info.active_position:decode_query_info.active_position+1]), dim=0)
                else: 
                    new_tokens = torch.concat((new_tokens, torch.tensor([0], device=device, dtype=torch.int32)), dim=0)
                self.logits_start.append(0 if len(self.logits_start) == 0 else self.logits_start[-1]+1)

                self.temperatures = torch.concat((self.temperatures, torch.tensor([decode_query_info.temperature], device=device, dtype=torch.float32)), dim=0)
                self.top_ps = torch.concat((self.top_ps, torch.tensor([decode_query_info.top_p], device=device, dtype=torch.float32)), dim=0)


            self.q_indptr = self.q_indptr.contiguous()
            self.kv_indptr = self.kv_indptr.contiguous()
            self.kv_indices = self.kv_indices.contiguous()
            self.kv_len = self.kv_len.contiguous()
            self.kv_last_page_len = self.kv_last_page_len.contiguous()

            self.bsz_tensor = torch.tensor([self.batch_size], device=device, dtype=torch.int32)
            
            # copy new_position_ids and new_tokens to self.position_ids and self.tokens
            # print("new_position_ids: ", new_position_ids)
            # self.print()
            self.position_ids[:new_position_ids.size(0)].copy_(new_position_ids)
            self.position_ids[new_position_ids.size(0):].zero_()
            self.tokens[:new_tokens.size(0)].copy_(new_tokens)


    forward_minibatchs: list[ForwardMiniBatch]
    batch_size: int
    minibatch: ForwardMiniBatch


    def __init__(self, batch : sched_ext.BatchQueryTodo = None, query_manager: QueryManager = None, device=None, tokens: torch.Tensor = None):
        
        if batch is None:
            return


        prefill_minibatches = batch.prefill_mini_batches
        decode_mini_batches = [item for sublist in batch.decode_mini_batches for item in sublist]
        prefill_querys_info = []
        prefill_s = []
        prefill_l = []
        decode_querys_info = []
        self.batch_size = 1
        for (id, s, l) in prefill_minibatches:
            prefill_querys_info.append(query_manager.query_map[id])
            prefill_s.append(s)
            prefill_l.append(l)
        for decode_batch_idx in decode_mini_batches:
            if query_manager.query_map[decode_batch_idx].decode_start_time is None:
                query_manager.query_map[decode_batch_idx].decode_start_time =time.time()
            decode_querys_info.append(query_manager.query_map[decode_batch_idx])


        minibatch = ForwardBatchInput.ForwardMiniBatch(prefill_querys_info, decode_querys_info, prefill_s, prefill_l, device = query_manager.device, page_size = query_manager.page_size)
 
        self.minibatch = minibatch

    @classmethod
    def gen_max_forward_batch(
        cls,
        device=None,
        tokens: torch.Tensor = None,
        num_mini_batches: int = 1,
        max_seq_length: int = 4096, # TODO: add to yaml
        prefill_query_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size, # TODO: use config
        prefill_active_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size,
        gen_prefill: bool = True,
        decode_batch_size: int = Config().max_decode_batch_size,
        decode_active_position: torch.Tensor = None,
        page_size = 256,
        cuda_lens = 1
    ):
        instance = cls()
        
        instance.batch_size = num_mini_batches
        page_size = page_size
     
        prefill_query_info = []
        offset = 0
        if gen_prefill and prefill_query_length != 0:
            for i in range(Config().max_prefill_batch_size):
                prefill_query_info.append(QueryInfo(i, prefill_query_length, max_seq_length, page_size, device, offset=offset))
                offset += max_seq_length // page_size

        decode_querys_info = []
        for i in range(min(decode_batch_size, cuda_lens)):
            query_info = QueryInfo(i+Config().max_prefill_batch_size, prefill_query_length, 256, page_size, device, is_prefill=False, offset=offset)
            offset += max_seq_length // page_size
            if tokens is not None:
                query_info.query_tokens[prefill_active_length:prefill_active_length + 1].copy_(tokens)            
            if decode_active_position is None:
                query_info.active_position = 255
            else: 
                query_info.active_position = decode_active_position[i]

            decode_querys_info.append(query_info)
        
        if prefill_query_length*Config().max_prefill_batch_size + len(decode_querys_info) < cuda_lens:
            decode_querys_info.append(query_info)

        instance.minibatch = ForwardBatchInput.ForwardMiniBatch(prefill_query_info, decode_querys_info, [0, 0], [prefill_active_length for _ in range(Config().max_prefill_batch_size)], device, page_size)
        
        return instance

    def fill(self, batch : sched_ext.BatchQueryTodo = None, query_manager: QueryManager = None, page_size = 256):
        if batch is None:
            return
        prefill_minibatches = batch.prefill_mini_batches
        decode_mini_batches = [item for sublist in batch.decode_mini_batches for item in sublist]

        prefill_querys_info = []
        prefill_s = []
        prefill_l = []
        decode_querys_info = []
        self.batch_size = 1
        for (id, s, l) in prefill_minibatches:
            prefill_querys_info.append(query_manager.query_map[id])
            prefill_s.append(s)
            prefill_l.append(l)
        for decode_batch_idx in decode_mini_batches:
            if query_manager.query_map[decode_batch_idx].decode_start_time is None:
                query_manager.query_map[decode_batch_idx].decode_start_time =time.time()
            decode_querys_info.append(query_manager.query_map[decode_batch_idx])

        self.minibatch.fill(prefill_querys_info, decode_querys_info, prefill_s, prefill_l, device=query_manager.device, page_size=page_size)


class ForwardBatchOutput:
    logits: list[torch.Tensor]
    num_batchs: int
    batch_sizes: list[int]
    generated_tokens_num: list[int]
    lm_start: list[int]
    
    temperatures: list[torch.Tensor]
    top_ps: list[torch.Tensor]

    def __init__(self):
        self.logits = []
        self.batch_sizes = []
        self.generated_tokens_num = []
        self.top_ps = []
        self.temperatures = []
        self.num_batchs = 1

================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/model_runner.py
================================================
"""
Date: 2024-11-07 07:02:20
LastEditors: djw
LastEditTime: 2024-12-10 08:48:32
"""

import torch
from torch import nn
import queue
import signal
import queue
from typing import AsyncIterable
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from contextlib import asynccontextmanager
from pydantic import BaseModel, Field
import asyncio
import multiprocessing
import time
import torch.multiprocessing as mp
import random
import torch.distributed as dist
import zmq
import tempfile
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput

from ktransformers.server.config.config import Config
from ktransformers.models.custom_modeling_deepseek_v3 import KDeepseekV3ForCausalLM
from ktransformers.models.custom_modeling_deepseek_v2 import KDeepseekV2ForCausalLM
from ktransformers.models.custom_modeling_qwen2_moe import KQwen2MoeForCausalLM
from ktransformers.models.custom_modeling_qwen3_moe import KQwen3MoeForCausalLM
from ktransformers.server.balance_serve.inference.query_manager import QueryManager
from ktransformers.server.balance_serve.settings import sched_ext


def pad_num_tokens(num_tokens):
    return (num_tokens + 63) // 64 * 64

def deduplicate_and_sort(lst):
    return sorted(set(lst))
def generate_cuda_graphs(chunk_size: int) -> list:
    assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must <= 1024 or a multiple of 1024"
    base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size]

    if chunk_size <= 1024:
        return deduplicate_and_sort(base_list)

    multiples = [i for i in range(1024, chunk_size + 1, 1024)]

    return deduplicate_and_sort(base_list + multiples)
class ModelRunner:
    """A CudaGraphRunner runs the forward pass of a model with CUDA graph and torch.compile."""

    model: KDeepseekV3ForCausalLM  | KQwen2MoeForCausalLM | KQwen3MoeForCausalLM 
    input: ForwardBatchInput | list[ForwardBatchInput]
    output: ForwardBatchOutput
    
    def __init__(self, model = None, device = None, use_cuda_graph = False, max_decode_batch_size = 1, max_chunk_size = 4096, num_mini_batches: int = 1, page_size = 256, block_num = 8):
        
        self.stream = torch.cuda.Stream(device=device)
        self.model = model  # Compile and move model to the specified device
        self.device = device
        self.input = None
        self.features_buf = None
        self.output = None
        self.graph_memory_pool = None
        self.cuda_graphs = generate_cuda_graphs(Config().chunk_size)
        self.use_cuda_graph = use_cuda_graph
        self.model_time = 0
        self.page_size = page_size
        self.block_num = block_num
        # GPU timing for model execution
        self.start_model_event = torch.cuda.Event(enable_timing=True)
        self.end_model_event = torch.cuda.Event(enable_timing=True)

        self.graphs = [torch.cuda.CUDAGraph() for _ in range(len(self.cuda_graphs))]
        self.page_idx_buf = [torch.zeros([self.cuda_graphs[i]], dtype=torch.int32, device = self.device) for i in range(len(self.cuda_graphs))]
        self.page_offset_buf = [torch.zeros([self.cuda_graphs[i]], dtype=torch.int32, device = self.device) for i in range(len(self.cuda_graphs))]
 
        self.num_mini_batches = num_mini_batches

        self.max_chunk_size = max_chunk_size

        self.bsz_tensor_buf = torch.empty((1, ),dtype=torch.int32, device=device)
        self.num_tokens_tensor_buf = torch.empty((1, ),dtype=torch.int32, device=device)

    def model_attn_plan(self, batch, cuda_graph_idx=0):
        if isinstance(self.model, KDeepseekV3ForCausalLM):
            self.model.flash_infer_attn_plan(batch, self.bsz_tensor_buf, self.num_tokens_tensor_buf,
                                             num_heads=self.model.config.num_attention_heads, head_dim_ckv=self.model.config.kv_lora_rank, 
                                             head_dim_kpe=self.model.config.qk_rope_head_dim, page_size=self.model.cache.page_size, causal=True,
                                             sm_scale=self.model.model.layers[0].self_attn.softmax_scale, q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16)
        elif isinstance(self.model, KQwen2MoeForCausalLM) or isinstance(self.model, KQwen3MoeForCausalLM):
            self.model.flash_infer_attn_plan(batch, self.bsz_tensor_buf, self.num_tokens_tensor_buf,
                                             num_q_heads=self.model.config.num_attention_heads, num_kv_heads=self.model.config.num_key_value_heads,
                                             head_dim=self.model.config.head_dim if hasattr(self.model.config, 'head_dim') else self.model.config.hidden_size // self.model.config.num_attention_heads, 
                                             page_size=self.model.cache.page_size, causal=True,
                                             q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16, cuda_graph_idx=cuda_graph_idx)
        else:
            assert False, "model type not supported"


    def warmup(self):

        def capture_graphs(cuda_graph_idx):
            with torch.cuda.graph(self.graphs[cuda_graph_idx], pool=self.graph_memory_pool, stream=self.stream):
                self.outputs_buf[cuda_graph_idx] = self.model(self.input[cuda_graph_idx], self.features_buf[cuda_graph_idx], self.bsz_tensor_buf, self.num_tokens_tensor_buf, self.page_idx_buf[cuda_graph_idx], self.page_offset_buf[cuda_graph_idx], cuda_graph_idx=cuda_graph_idx)   
            self.graph_memory_pool = self.graphs[cuda_graph_idx].pool()

        self.input = []
        self.features_buf = []
        self.outputs_buf = []
        self.bsz_tensor_buf = torch.tensor([0], dtype=torch.int32, device=self.device)
        self.num_tokens_tensor_buf = torch.tensor([0], dtype=torch.int32, device=self.device)
        for i in range(len(self.cuda_graphs)):
            prefill_query_length = (self.cuda_graphs[i] - Config().max_decode_batch_size) // Config().max_prefill_batch_size if self.cuda_graphs[i] > Config().max_decode_batch_size else 0  #@TODO only supprot 2 prefill batch
            self.input.append(ForwardBatchInput.gen_max_forward_batch(device=self.device, num_mini_batches = self.num_mini_batches, prefill_query_length=prefill_query_length, prefill_active_length=prefill_query_length, page_size=self.page_size, cuda_lens=self.cuda_graphs[i]))

            self.features_buf.append(self.model.batch_embeddings(self.input[i]))
            batch_size = self.input[i].minibatch.q_indptr.size(0)-1
            num_tokens = self.features_buf[i][0].size(0)
            print("capturing cuda graph", batch_size, num_tokens)

            if isinstance(self.model, KQwen2MoeForCausalLM) or isinstance(self.model, KQwen3MoeForCausalLM):
                self.model.init_wrapper(self.use_cuda_graph, self.device, num_tokens ,batch_size, self.block_num, i) # TODO: 1024 is a magic number(max_batch_tokens)

            self.bsz_tensor_buf[0] = batch_size
            self.num_tokens_tensor_buf[0] = num_tokens

            self.model_attn_plan(self.input[i], i)
        
            page_idx, page_offset = self.model.cache.get_page_table(self.input[i].minibatch.position_ids, self.input[i].minibatch.q_indptr, self.input[i].minibatch.kv_indptr, self.input[i].minibatch.kv_indices, self.num_tokens_tensor_buf)

            
            self.page_idx_buf[i][:num_tokens].copy_(page_idx[:num_tokens])
            self.page_offset_buf[i][:num_tokens].copy_(page_offset[:num_tokens])

            self.page_idx_buf[i][num_tokens:].fill_(self.model.cache.max_cache_len // self.model.cache.page_size -1) 
        
            self.outputs_buf.append(None)
        
            torch.cuda.synchronize()
            for warm_up_iters in range(11):
                with torch.cuda.stream(self.stream):
                    self.outputs_buf[i] = self.model(self.input[i], self.features_buf[i], self.bsz_tensor_buf, self.num_tokens_tensor_buf, self.page_idx_buf[i], self.page_offset_buf[i], cuda_graph_idx=i)
            torch.cuda.synchronize()

            self.outputs_buf[i].num_batchs = batch_size

            capture_graphs(i)

            with torch.cuda.stream(self.stream):
                self.graphs[i].replay()

            self.sync(calc_time=False)
            print(f"cuda_graph: {i+1}/{len(self.cuda_graphs)}, warmup finished.")
        
    def run(self, batch: sched_ext.BatchQueryTodo = None, query_manager: QueryManager = None):
        with torch.cuda.stream(self.stream):

            batch_size = len(batch.prefill_mini_batches) # TODO: calc this
            num_tokens = 0
            for i in range(len(batch.decode_mini_batches)):
                batch_size += len(batch.decode_mini_batches[i])
                num_tokens += len(batch.decode_mini_batches[i])
                print(f'decode_batch_i: {len(batch.decode_mini_batches[i])},')

            for i in range(len(batch.prefill_mini_batches)):
                num_tokens += batch.prefill_mini_batches[i][2]
                print(f'prefill_batch_i: {batch.prefill_mini_batches[i][2]},')


            # cuda graph idx equal to min idx i in self.cuda_graphs, that self.cuda_graphs[i] > num_tokens
            cuda_graph_idx = next((i for i, token in enumerate(self.cuda_graphs) if token >= num_tokens), len(self.cuda_graphs))
            if not self.use_cuda_graph:
                cuda_graph_idx = 0
            # if cuda_graph_idx == len(self.cuda_graphs):
            #     assert False, "num_tokens is too large"
    
            if self.use_cuda_graph:
                self.input[cuda_graph_idx].fill(batch, query_manager, self.page_size)
            else:
                self.input = [ForwardBatchInput(batch=batch, query_manager=query_manager, device=self.device)]
                

            if self.use_cuda_graph:
                self.features = self.model.batch_embeddings(self.input[cuda_graph_idx], device=self.device)
            else:
                self.features = self.model.batch_embeddings(self.input[cuda_graph_idx], device=self.device)


            self.bsz_tensor_buf.copy_(batch_size)
            self.num_tokens_tensor_buf.copy_(torch.tensor([num_tokens], dtype=torch.int32, device=self.device))

            if self.use_cuda_graph:
                self.features_buf[cuda_graph_idx][0].copy_(self.features[0], non_blocking=True)

            self.model_attn_plan(self.input[cuda_graph_idx], cuda_graph_idx)
            self.start_model_event.record(self.stream)
            page_idx, page_offset = self.model.cache.get_page_table(self.input[cuda_graph_idx].minibatch.position_ids, self.input[cuda_graph_idx].minibatch.q_indptr, self.input[cuda_graph_idx].minibatch.kv_indptr, self.input[cuda_graph_idx].minibatch.kv_indices, self.num_tokens_tensor_buf)
            if self.use_cuda_graph:
                self.page_idx_buf[cuda_graph_idx][:num_tokens].copy_(page_idx[:num_tokens])
                self.page_offset_buf[cuda_graph_idx][:num_tokens].copy_(page_offset[:num_tokens])

                self.page_idx_buf[cuda_graph_idx][num_tokens:].fill_(self.model.cache.max_cache_len // self.model.cache.page_size -1)
                self.replay(cuda_graph_idx)
                self.output = ForwardBatchOutput()
                
                self.output.top_ps.append(self.input[cuda_graph_idx].minibatch.top_ps)
                self.output.temperatures.append(self.input[cuda_graph_idx].minibatch.temperatures)


                self.output.logits.append(self.outputs_buf[cuda_graph_idx].logits[0][self.input[cuda_graph_idx].minibatch.logits_start].clone())
            else:
                self.output = self.model(self.input[cuda_graph_idx], self.features, self.bsz_tensor_buf, self.num_tokens_tensor_buf, page_idx, page_offset)
                self.output.logits[0] = self.output.logits[0][self.input[cuda_graph_idx].minibatch.logits_start]
                self.output.top_ps.append(self.input[cuda_graph_idx].minibatch.top_ps)
                self.output.temperatures.append(self.input[cuda_graph_idx].minibatch.temperatures)
            self.end_model_event.record(self.stream)


    def replay(self, cuda_graph_idx=-1):
        with torch.cuda.stream(self.stream):
            if cuda_graph_idx != -1:
                self.graphs[cuda_graph_idx].replay()
            else:
                self.graphs.replay()


    def sync(self, calc_time = True):
        self.stream.synchronize()
        if calc_time:
            self.model_time = self.start_model_event.elapsed_time(self.end_model_event)  # In ms

================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/query_manager.py
================================================
'''
Date: 2024-11-14 12:23:45
LastEditors: djw
LastEditTime: 2024-11-20 04:06:23
'''
import torch
from ktransformers.server.balance_serve.settings import sched_ext
import random
import time

class QueryInfo:
    id: int
    active_position: int
    query_length: int
    is_prefill: int
    block_index: torch.Tensor
    query_tokens: torch.Tensor
    stop_criteria: list[torch.Tensor]

    temperature: float
    top_p: float

    max_length: int 

    def __init__(self, id, query_length: int, max_length: int, page_size: int, device: torch.device, is_prefill: bool = True, offset: int = 0, active_position: int = 0, temperature: float = 0.01, top_p: float = 1.0):
        self.id = id
        self.is_prefill = is_prefill
        self.active_position = active_position
        self.max_length = max_length - 1
        self.query_tokens = torch.zeros((max_length,), dtype=torch.int, device = device)
        self.stop_criteria = []
        self.block_index = torch.arange(offset, offset + (max_length + active_position + page_size - 1) // page_size, dtype=torch.int, device = device)
        self.query_length = query_length
        self.enqueue_time = time.time()
        self.decode_start_time = None
        self.speculative_token = {} # {position: (accept, token)}

        self.temperature = temperature
        self.top_p = top_p

    def check_stop(self):
        if self.active_position >= self.max_length - 2:
            return True

        for stop_tensor in self.stop_criteria:
            stop_len = len(stop_tensor)
            
            if stop_len >= self.active_position:
                continue
            
            #print(f"stop_tensor: {stop_tensor}, stop_len: {stop_len}, active_position: {self.active_position}, query_token: {self.query_tokens[self.active_position - stop_len - 1:self.active_position - 1]}")

            if (torch.equal(self.query_tokens[self.active_position - stop_len - 1:self.active_position - 1], stop_tensor) and self.active_position) or self.max_length <= self.active_position + 3:
                self.life_time = time.time() - self.enqueue_time
                self.decode_duration_time = time.time() - self.decode_start_time
                self.decode_tps = (self.active_position -  self.query_length) / self.decode_duration_time
                print(f"prefill length: {self.query_length}, prefill time: {self.prefill_duration_time}, prefill tps {self.prefill_tps}, decode length: {self.active_position -  self.query_length}, decode time: {self.decode_duration_time}, decode tps {self.decode_tps}")
                return True
                
        
        return False


    def print(self):
        print(f"active_position: {self.active_position}, query_length: {self.query_length}, is_prefill: {self.is_prefill}")
        print(f"block_index_shape: {self.block_index.shape}, query_tokens_shape: {self.query_tokens.shape}")


class QueryManager:

    page_size: int = 256
    device: torch.device
    query_map : dict[int, QueryInfo]

    def __init__(self, page_size = 256, device = torch.device('cuda')):
        self.page_size = page_size
        self.device = device
        self.query_map = {}

    def add_query(self, batch: sched_ext.BatchQueryTodo):

        for i in range(len(batch.query_ids)):
            id = batch.query_ids[i]
            if id not in self.query_map:
                print(f"add query id: {id}, batch.query_lengths: {batch.query_lengths[i]}, batch_query_tokens: {batch.query_tokens[i].shape}, batch.block_indexes: {batch.block_indexes[i]}")
                query_info = QueryInfo(id=id, query_length=batch.query_lengths[i], max_length=batch.query_tokens[i].size(0) + 1, page_size=self.page_size, device=self.device, temperature=batch.sample_options[i].temperature, top_p=batch.sample_options[i].top_p)
                query_info.query_tokens[:query_info.query_length].copy_(batch.query_tokens[i][:query_info.query_length].to(self.device))
                
                for stop_token_list in batch.stop_criteria[i]:
                    query_info.stop_criteria.append(torch.tensor(stop_token_list, dtype=torch.int, device = self.device))

                block_num = batch.block_indexes[i].size(0)
                query_info.block_index[:block_num].copy_(batch.block_indexes[i].to(self.device))

                self.query_map[id] = query_info
                
                prefill_mini_batches = batch.prefill_mini_batches
                for (prefill_id, s, l) in prefill_mini_batches:
                    if prefill_id == id:
                        self.query_map[prefill_id].active_position = s


    def update(self, batch: sched_ext.BatchQueryTodo) -> list[sched_ext.QueryUpdate]:
        query_updates = []

        prefill_mini_batches = batch.prefill_mini_batches

        for (id, s, l) in prefill_mini_batches:

            if id not in self.query_map:
                assert False, f"query id {id} not found in query_map"

            # update query_info
            query_info = self.query_map[id]
            query_info.active_position += l

            if query_info.active_position >= query_info.query_length and query_info.is_prefill:
                query_info.is_prefill = False
                query_info.prefill_duration_time = time.time() - query_info.enqueue_time
                query_info.prefill_tps = query_info.query_length / query_info.prefill_duration_time
                

            # generate schedule query_update
            query_update = sched_ext.QueryUpdate()
            query_update.id = id
            query_update.ok = True
            query_update.is_prefill = query_info.is_prefill
            query_update.active_position = query_info.active_position
            # if(not query_info.is_prefill):
            query_updates.append(query_update)


        decode_mini_batches = batch.decode_mini_batches

        for ids in decode_mini_batches:
            for id in ids:
                if id not in self.query_map:
                    assert False, f"query id {id} not found in query_map"

                query_info = self.query_map[id]
                query_info.active_position += 1

                query_update = sched_ext.QueryUpdate()
                query_update.id = id
                query_update.ok = True
                query_update.is_prefill = query_info.is_prefill

                query_update.decode_done = query_info.check_stop()

                query_update.active_position = query_info.active_position
                query_updates.append(query_update)

        return query_updates


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/sampling/penaltylib/__init__.py
================================================
from .orchestrator import BatchedPenalizerOrchestrator
from .penalizers.frequency_penalty import BatchedFrequencyPenalizer
from .penalizers.min_new_tokens import BatchedMinNewTokensPenalizer
from .penalizers.presence_penalty import BatchedPresencePenalizer
from .penalizers.repetition_penalty import BatchedRepetitionPenalizer

__all__ = [
    "BatchedFrequencyPenalizer",
    "BatchedMinNewTokensPenalizer",
    "BatchedPresencePenalizer",
    "BatchedRepetitionPenalizer",
    "BatchedPenalizerOrchestrator",
]


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/sampling/penaltylib/orchestrator.py
================================================
import abc
import dataclasses
import typing

import torch


@dataclasses.dataclass
class _ReqLike:
    origin_input_ids: typing.Union[torch.Tensor, typing.List[int]]


@dataclasses.dataclass
class _BatchLike:
    reqs: typing.List[_ReqLike]

    def batch_size(self):
        return len(self.reqs)


class BatchedPenalizerOrchestrator:
    batch: _BatchLike
    device: str
    vocab_size: int
    penalizers: typing.Dict[typing.Type["_BatchedPenalizer"], "_BatchedPenalizer"]

    def __init__(
        self,
        vocab_size: int,
        batch: _BatchLike,
        device: str,
        Penalizers: typing.Set[typing.Type["_BatchedPenalizer"]],
    ):
        self.vocab_size = vocab_size
        self.batch = batch
        self.device = device

        self.penalizers = {Penalizer: Penalizer(self) for Penalizer in Penalizers}

        is_required = False
        for penalizer in self.penalizers.values():
            pen_is_required = penalizer.prepare_if_required()
            is_required |= pen_is_required
        self.is_required = is_required

        if self.is_required:
            self.cumulate_input_tokens(
                input_ids=[req.origin_input_ids for req in self.reqs()]
            )

    def reqs(self):
        return self.batch.reqs

    def batch_size(self):
        return self.batch.batch_size()

    def cumulate_input_tokens(
        self,
        input_ids: typing.Union[
            typing.List[torch.Tensor], typing.List[typing.List[int]]
        ],
    ):
        """
        Feed the input tokens to the penalizers.

        Args:
            input_ids (typing.Union[typing.List[torch.Tensor], typing.List[typing.List[int]]]): The input tokens.
        """
        token_ids = _TokenIDs(orchestrator=self, token_ids=input_ids)

        for penalizer in self.penalizers.values():
            penalizer.cumulate_input_tokens(input_ids=token_ids)

    def cumulate_output_tokens(
        self,
        output_ids: typing.Union[
            typing.List[torch.Tensor], typing.List[typing.List[int]]
        ],
    ):
        """
        Feed the output tokens to the penalizers.

        Args:
            output_ids (typing.Union[typing.List[torch.Tensor], typing.List[typing.List[int]]]): The output tokens.
        """
        if not self.is_required:
            return

        token_ids = _TokenIDs(orchestrator=self, token_ids=output_ids)

        for penalizer in self.penalizers.values():
            penalizer.cumulate_output_tokens(output_ids=token_ids)

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        """
        Apply the penalizers to the logits.
        Note that it may apply the penalizers in-place.

        Args:
            logits (torch.Tensor): The logits to apply the penalizers to.

        Returns:
            torch.Tensor: The logits after applying the penalizers.
        """
        if not self.is_required:
            return

        for penalizer in self.penalizers.values():
            logits = penalizer.apply(logits)

        return logits

    def filter(
        self,
        indices_to_keep: typing.List[int],
        indices_tensor_to_keep: torch.Tensor = None,
    ):
        """
        Filter the penalizers based on the indices to keep in the batch.

        Args:
            indices_to_keep (typing.List[int]): List of indices to keep in the batch.
            indices_tensor_to_keep (torch.Tensor = None): Tensor of indices to keep in the batch. If not None, it will be used instead of converting indices_to_keep to a tensor.
        """
        if not self.is_required:
            return

        empty_indices = len(indices_to_keep) == 0

        is_required = False
        for penalizer in self.penalizers.values():
            tmp_is_required = penalizer.is_required()
            is_required = is_required or tmp_is_required
            if not tmp_is_required or empty_indices:
                penalizer.teardown()
            else:
                # create tensor index only when it's needed
                if indices_tensor_to_keep is None:
                    indices_tensor_to_keep = torch.tensor(
                        indices_to_keep, dtype=torch.int32, device=self.device
                    )

                penalizer.filter(
                    indices_to_keep=indices_to_keep,
                    indices_tensor_to_keep=indices_tensor_to_keep,
                )
        self.is_required = is_required

    def merge(self, their: "BatchedPenalizerOrchestrator"):
        """
        Merge the penalizers of another orchestrator into this one.

        Note that this function **must** be called _before_ self.batch.reqs is updated (filtered).
        Each unprepared penalizers would have to be prepared (creating tensors, etc.) first before merging.
        This step requires the original batch.reqs, before it gets merged with other batch.reqs.

        Args:
            their (BatchedPenalizerOrchestrator): The orchestrator to merge into this one.
        """
        if not self.is_required and not their.is_required:
            return

        self.is_required |= their.is_required
        for Penalizer, their_penalizer in their.penalizers.items():
            if Penalizer not in self.penalizers:
                raise ValueError(f"Penalizer {Penalizer} not found in self.penalizers")

            self.penalizers[Penalizer].merge(their_penalizer)


class _TokenIDs:
    """
    A class that wraps token IDs to provide additional utility functions to penalizers.

    Attributes:
        orchestrator (BatchedPenalizerOrchestrator): The orchestrator that this token IDs belong to.
        token_ids (typing.Union[torch.Tensor, typing.List[torch.Tensor]]): The token IDs.
        cached_counts (torch.Tensor): The cached occurrence count tensor.
    """

    orchestrator: BatchedPenalizerOrchestrator
    token_ids: typing.Union[torch.Tensor, typing.List[torch.Tensor]]
    cached_counts: torch.Tensor = None

    def __init__(
        self,
        orchestrator: BatchedPenalizerOrchestrator,
        token_ids: typing.Union[
            typing.List[torch.Tensor], typing.List[typing.List[int]]
        ],
    ):
        self.orchestrator = orchestrator

        if not isinstance(token_ids[0], torch.Tensor):
            token_ids = [
                torch.tensor(
                    data=ids, dtype=torch.int64, device=self.orchestrator.device
                )
                for ids in token_ids
            ]

        self.token_ids = token_ids

    def occurrence_count(self) -> torch.Tensor:
        """
        Returns a tensor of shape (batch_size, vocab_size) where each element is the number of times the corresponding token appears in the batch.

        Returns:
            torch.Tensor: The occurrence count tensor.
        """
        if self.cached_counts is not None:
            return self.cached_counts

        token_ids = self.token_ids

        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.unsqueeze(1)

            # needs to be long to be used as index in scatter_add
            if token_ids.dtype != torch.int64:
                token_ids = token_ids.to(torch.int64)

        padded_token_ids = torch.nn.utils.rnn.pad_sequence(
            sequences=token_ids,
            batch_first=True,
            padding_value=self.orchestrator.vocab_size,
        )

        self.cached_counts = torch.zeros(
            size=(self.orchestrator.batch_size(), self.orchestrator.vocab_size + 1),
            dtype=torch.int64,
            device=self.orchestrator.device,
        ).scatter_add_(
            dim=1,
            index=padded_token_ids,
            src=torch.ones_like(padded_token_ids),
        )[
            :, : self.orchestrator.vocab_size
        ]

        return self.cached_counts


class _BatchedPenalizer(abc.ABC):
    """
    An abstract class for a batched penalizer.
    """

    orchestrator: BatchedPenalizerOrchestrator
    _is_prepared: bool = False

    def __init__(self, orchestrator: BatchedPenalizerOrchestrator):
        self.orchestrator = orchestrator

    def is_prepared(self) -> bool:
        return self._is_prepared

    def is_required(self) -> bool:
        return self._is_required()

    def prepare(self):
        if not self.is_prepared():
            self._prepare()
            self._is_prepared = True

    def prepare_if_required(self):
        if self.is_required():
            self.prepare()
            return True
        else:
            return False

    def teardown(self):
        if self.is_prepared():
            self._teardown()
            self._is_prepared = False

    def cumulate_input_tokens(self, input_ids: _TokenIDs):
        if not self.is_prepared():
            return

        self._cumulate_input_tokens(input_ids=input_ids)

    def cumulate_output_tokens(self, output_ids: _TokenIDs):
        if not self.is_prepared():
            return

        self._cumulate_output_tokens(output_ids=output_ids)

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        if not self.is_prepared():
            return logits

        return self._apply(logits=logits)

    def filter(
        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
    ):
        if not self.is_prepared():
            return

        self._filter(
            indices_to_keep=indices_to_keep,
            indices_tensor_to_keep=indices_tensor_to_keep,
        )

    def merge(self, their: "_BatchedPenalizer"):
        if not self.is_prepared() and not their.is_prepared():
            return

        self.prepare()
        their.prepare()
        self._merge(their)

    @abc.abstractmethod
    def _is_required(self) -> bool:
        """
        Check if the penalizer is required to be prepared.
        """
        pass

    @abc.abstractmethod
    def _prepare(self):
        """
        Prepare the penalizer.
        Usually, this is where the penalizer initializes its tensors.
        """
        pass

    @abc.abstractmethod
    def _teardown(self):
        """
        Tear down the penalizer.
        Usually, this is where the penalizer frees its tensors.
        """
        pass

    @abc.abstractmethod
    def _cumulate_input_tokens(self, input_ids: _TokenIDs):
        """
        Cumulate the input tokens.
        Orchestrator will call this function to feed the input tokens to the penalizer.
        """
        pass

    @abc.abstractmethod
    def _cumulate_output_tokens(self, output_ids: _TokenIDs):
        """
        Cumulate the output tokens.
        Orchestrator will call this function to feed the output tokens to the penalizer.
        """
        pass

    @abc.abstractmethod
    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
        """
        Apply the penalizer to the logits.
        Penalizers can modify the logits in-place if needed.
        """
        pass

    @abc.abstractmethod
    def _filter(
        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
    ):
        """
        Filter the penalizer (tensors or underlying data) based on the indices to keep in the batch.
        """
        pass

    @abc.abstractmethod
    def _merge(self, their: "_BatchedPenalizer"):
        """
        Merge the penalizer with another penalizer.
        """
        pass


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/sampling/penaltylib/penalizers/frequency_penalty.py
================================================
import typing

import torch

from ..orchestrator import _BatchedPenalizer, _TokenIDs


class BatchedFrequencyPenalizer(_BatchedPenalizer):
    """
    Frequency penalizer penalizes tokens based on their frequency in the output.
    """

    frequency_penalties: torch.Tensor = None
    cumulated_frequency_penalties: torch.Tensor = None

    def _is_required(self) -> bool:
        return any(
            req.sampling_params.frequency_penalty != 0.0
            for req in self.orchestrator.reqs()
        )

    def _prepare(self):
        self.cumulated_frequency_penalties = (
            torch.tensor(
                data=[0.0 for _ in self.orchestrator.reqs()],
                dtype=torch.float32,
                device=self.orchestrator.device,
            )
            .unsqueeze_(1)
            .repeat(1, self.orchestrator.vocab_size)
        )

        self.frequency_penalties = (
            torch.tensor(
                data=[
                    req.sampling_params.frequency_penalty
                    for req in self.orchestrator.reqs()
                ],
                dtype=torch.float32,
                device=self.orchestrator.device,
            )
            .unsqueeze_(1)
            .expand_as(self.cumulated_frequency_penalties)
        )

    def _teardown(self):
        del self.frequency_penalties
        del self.cumulated_frequency_penalties

        self.frequency_penalties = None
        self.cumulated_frequency_penalties = None

    def _cumulate_input_tokens(self, input_ids: _TokenIDs):
        pass

    def _cumulate_output_tokens(self, output_ids: _TokenIDs):
        self.cumulated_frequency_penalties += (
            self.frequency_penalties * output_ids.occurrence_count()
        )

    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
        logits -= self.cumulated_frequency_penalties
        return logits

    def _filter(
        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
    ):
        self.frequency_penalties = self.frequency_penalties[indices_tensor_to_keep]
        self.cumulated_frequency_penalties = self.cumulated_frequency_penalties[
            indices_tensor_to_keep
        ]

    def _merge(self, their: "BatchedFrequencyPenalizer"):
        self.frequency_penalties = torch.cat(
            [self.frequency_penalties, their.frequency_penalties], dim=0
        )
        self.cumulated_frequency_penalties = torch.cat(
            [self.cumulated_frequency_penalties, their.cumulated_frequency_penalties],
            dim=0,
        )


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/sampling/penaltylib/penalizers/min_new_tokens.py
================================================
import typing

import torch

from ..orchestrator import _BatchedPenalizer, _TokenIDs


class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
    """
    Min new tokens penalizer penalizes tokens based on the length of the output.
    """

    min_new_tokens: torch.Tensor = None
    stop_token_penalties: torch.Tensor = None
    len_output_tokens: torch.Tensor = None

    def _is_required(self) -> bool:
        return any(
            req.sampling_params.min_new_tokens > 0 for req in self.orchestrator.reqs()
        )

    def _prepare(self):
        self.min_new_tokens = torch.tensor(
            data=[
                req.sampling_params.min_new_tokens for req in self.orchestrator.reqs()
            ],
            dtype=torch.int32,
            device=self.orchestrator.device,
        ).unsqueeze_(1)

        padded_stop_token_ids = torch.nn.utils.rnn.pad_sequence(
            sequences=[
                torch.tensor(
                    data=(
                        list(
                            (req.sampling_params.stop_token_ids or set())
                            | (req.tokenizer.additional_stop_token_ids or set())
                            | {req.tokenizer.eos_token_id}
                        )
                    ),
                    dtype=torch.int64,
                    device=self.orchestrator.device,
                )
                for req in self.orchestrator.reqs()
            ],
            batch_first=True,
            padding_value=self.orchestrator.vocab_size,
        )
        self.stop_token_penalties = torch.zeros(
            size=(self.orchestrator.batch_size(), self.orchestrator.vocab_size + 1),
            dtype=torch.float32,
            device=self.orchestrator.device,
        ).scatter_add_(
            dim=1,
            index=padded_stop_token_ids,
            src=torch.full_like(
                input=padded_stop_token_ids,
                dtype=torch.float32,
                fill_value=float("-inf"),
                device=self.orchestrator.device,
            ),
        )[
            :, : self.orchestrator.vocab_size
        ]

        self.len_output_tokens = torch.zeros(
            size=(self.orchestrator.batch_size(), 1),
            dtype=torch.int32,
            device=self.orchestrator.device,
        )

    def _teardown(self):
        del self.min_new_tokens
        del self.stop_token_penalties
        del self.len_output_tokens

        self.min_new_tokens = None
        self.stop_token_penalties = None
        self.len_output_tokens = None

    def _cumulate_input_tokens(self, input_ids: _TokenIDs):
        pass

    def _cumulate_output_tokens(self, output_ids: _TokenIDs):
        self.len_output_tokens += 1

    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
        mask = (self.len_output_tokens < self.min_new_tokens).expand_as(logits)
        logits[mask] += self.stop_token_penalties[mask]
        return logits

    def _filter(
        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
    ):
        self.min_new_tokens = self.min_new_tokens[indices_tensor_to_keep]
        self.stop_token_penalties = self.stop_token_penalties[indices_tensor_to_keep]
        self.len_output_tokens = self.len_output_tokens[indices_tensor_to_keep]

    def _merge(self, their: "BatchedMinNewTokensPenalizer"):
        self.min_new_tokens = torch.cat(
            [self.min_new_tokens, their.min_new_tokens], dim=0
        )
        self.stop_token_penalties = torch.cat(
            [self.stop_token_penalties, their.stop_token_penalties], dim=0
        )
        self.len_output_tokens = torch.cat(
            [self.len_output_tokens, their.len_output_tokens], dim=0
        )


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/sampling/penaltylib/penalizers/presence_penalty.py
================================================
import typing

import torch

from ..orchestrator import _BatchedPenalizer, _TokenIDs


class BatchedPresencePenalizer(_BatchedPenalizer):
    """
    Presence penalizer penalizes tokens based on their presence in the output.
    """

    presence_penalties: torch.Tensor = None
    cumulated_presence_penalties: torch.Tensor = None

    def _is_required(self) -> bool:
        return any(
            req.sampling_params.presence_penalty != 0.0
            for req in self.orchestrator.reqs()
        )

    def _prepare(self):
        self.cumulated_presence_penalties = (
            torch.tensor(
                data=[0.0 for _ in self.orchestrator.reqs()],
                dtype=torch.float32,
                device=self.orchestrator.device,
            )
            .unsqueeze_(1)
            .repeat(1, self.orchestrator.vocab_size)
        )

        self.presence_penalties = (
            torch.tensor(
                data=[
                    req.sampling_params.presence_penalty
                    for req in self.orchestrator.reqs()
                ],
                dtype=torch.float32,
                device=self.orchestrator.device,
            )
            .unsqueeze_(1)
            .expand_as(self.cumulated_presence_penalties)
        )

    def _teardown(self):
        del self.presence_penalties
        del self.cumulated_presence_penalties

        self.presence_penalties = None
        self.cumulated_presence_penalties = None

    def _cumulate_input_tokens(self, input_ids: _TokenIDs):
        pass

    def _cumulate_output_tokens(self, output_ids: _TokenIDs):
        mask = output_ids.occurrence_count() > 0
        self.cumulated_presence_penalties[mask] = self.presence_penalties[mask]

    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
        logits -= self.cumulated_presence_penalties
        return logits

    def _filter(
        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
    ):
        self.presence_penalties = self.presence_penalties[indices_tensor_to_keep]
        self.cumulated_presence_penalties = self.cumulated_presence_penalties[
            indices_tensor_to_keep
        ]

    def _merge(self, their: "BatchedPresencePenalizer"):
        self.presence_penalties = torch.cat(
            [self.presence_penalties, their.presence_penalties], dim=0
        )
        self.cumulated_presence_penalties = torch.cat(
            [self.cumulated_presence_penalties, their.cumulated_presence_penalties],
            dim=0,
        )


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/sampling/penaltylib/penalizers/repetition_penalty.py
================================================
import typing

import torch

from ..orchestrator import _BatchedPenalizer, _TokenIDs


class BatchedRepetitionPenalizer(_BatchedPenalizer):
    """
    Repetition penalizer penalizes tokens based on their repetition in the input and output.
    """

    repetition_penalties: torch.Tensor = None
    cumulated_repetition_penalties: torch.Tensor = None

    def _is_required(self) -> bool:
        return any(
            req.sampling_params.repetition_penalty != 1.0
            for req in self.orchestrator.reqs()
        )

    def _prepare(self):
        self.cumulated_repetition_penalties = (
            torch.tensor(
                data=[1.0 for _ in self.orchestrator.reqs()],
                dtype=torch.float32,
                device=self.orchestrator.device,
            )
            .unsqueeze_(1)
            .repeat(1, self.orchestrator.vocab_size)
        )

        self.repetition_penalties = (
            torch.tensor(
                data=[
                    req.sampling_params.repetition_penalty
                    for req in self.orchestrator.reqs()
                ],
                dtype=torch.float32,
                device=self.orchestrator.device,
            )
            .unsqueeze_(1)
            .expand_as(self.cumulated_repetition_penalties)
        )

    def _teardown(self):
        del self.repetition_penalties
        del self.cumulated_repetition_penalties

        self.repetition_penalties = None
        self.cumulated_repetition_penalties = None

    def _cumulate_input_tokens(self, input_ids: _TokenIDs):
        mask = input_ids.occurrence_count() > 0
        self.cumulated_repetition_penalties[mask] = self.repetition_penalties[mask]

    def _cumulate_output_tokens(self, output_ids: _TokenIDs):
        mask = output_ids.occurrence_count() > 0
        self.cumulated_repetition_penalties[mask] = self.repetition_penalties[mask]

    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
        return torch.where(
            logits > 0,
            logits / self.cumulated_repetition_penalties,
            logits * self.cumulated_repetition_penalties,
        )

    def _filter(
        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
    ):
        self.repetition_penalties = self.repetition_penalties[indices_tensor_to_keep]
        self.cumulated_repetition_penalties = self.cumulated_repetition_penalties[
            indices_tensor_to_keep
        ]

    def _merge(self, their: "BatchedRepetitionPenalizer"):
        self.repetition_penalties = torch.cat(
            [self.repetition_penalties, their.repetition_penalties], dim=0
        )
        self.cumulated_repetition_penalties = torch.cat(
            [self.cumulated_repetition_penalties, their.cumulated_repetition_penalties],
            dim=0,
        )


================================================
FILE: kt-sft/ktransformers/server/balance_serve/inference/sampling/sampler.py
================================================
'''
Date: 2024-11-14 12:23:45
LastEditors: Xie Weiyu ervinxie@qq.com
LastEditTime: 2024-11-25 08:59:23
'''
import logging
import torch
from torch import nn
from transformers import GenerationConfig

from flashinfer.sampling import (
	min_p_sampling_from_probs,
	top_k_renorm_probs,
	top_k_top_p_sampling_from_logits,
	top_p_renorm_probs,
)

logger = logging.getLogger(__name__)

class SamplingOptions():
	# Batched sampling params
	temperatures: torch.Tensor
	top_ps: torch.Tensor
	top_ks: torch.Tensor
	min_ps: torch.Tensor

	# All requests use greedy sampling
	is_all_greedy: bool

	# Dispatch in CUDA graph
	need_min_p_sampling: bool
	
	def __init__(self, bsz = 1, device = torch.device('cuda'), pretrained_config:GenerationConfig = None, temperatures: torch.Tensor = None, top_ps: torch.Tensor = None):
		if pretrained_config is None and temperatures is None:
			self.temperatures = torch.full((bsz, 1), 0, device=device, dtype=torch.float32)
			self.top_ps = torch.ones((bsz, 1), device=device, dtype=torch.float32)
			self.top_ks = torch.ones((bsz, 1), device=device, dtype=torch.float32)
			self.need_min_p_sampling = False
			self.is_all_greedy = True
		else:
			if temperatures is not None:
				self.temperatures = temperatures.unsqueeze(-1)
			else:
				self.temperatures = torch.full((bsz, 1), pretrained_config.temperature, device=device, dtype=torch.float32)
			
			if top_ps is not None:
				self.top_ps = top_ps.unsqueeze(-1)
			else:	
				self.top_ps = torch.full((bsz, 1), pretrained_config.top_p, device=device, dtype=torch.float32)
			self.top_ks = torch.full((bsz, 1), pretrained_config.top_k, device=device, dtype=torch.float32)
			self.need_min_p_sampling = False
			self.is_all_greedy = False

class Sampler(nn.Module):
	def __init__(self):
		super().__init__()
	
	def forward(
		self,
		logits: torch.Tensor,
		sampling_config: SamplingOptions = None,
	):
		if sampling_config == None:
			sampling_config = SamplingOptions()

		logits = logits.contiguous()
		origin_logits = logits.clone()
		if sampling_config.is_all_greedy:
			# Use torch.argmax if all requests use greedy sampling
			probs = logits
			batch_next_token_ids = torch.argmax(logits, -1)
		else:
			# Post process logits
			logits.div_(sampling_config.temperatures)
			max_top_k_round, batch_size = 32, logits.shape[0]
			if sampling_config.need_min_p_sampling:
				probs = torch.softmax(logits, dim=-1)
				logits = None
				del logits
				probs = top_k_renorm_probs(probs, sampling_config.top_ks)
				probs = top_p_renorm_probs(probs, sampling_config.top_ps)
				batch_next_token_ids = min_p_sampling_from_probs(
					probs, sampling_config.min_ps
				)
				temperature_0_idx = torch.where(sampling_config.temperatures == 0)[0]
				batch_next_token_ids[temperature_0_idx] = torch.argmax(origin_logits[temperature_0_idx], -1).to(torch.int32)
			else:
				# TODO: use different kernel when don't need top_k or top_p
				# @TODO get probs
				probs = logits
				batch_next_token_ids = top_k_top_p_sampling_from_logits(
					logits,
					sampling_config.top_ks,
					sampling_config.top_ps,
					filter_apply_order="joint",
				)
				temperature_0_idx = torch.where(sampling_config.temperatures == 0)[0]
				batch_next_token_ids[temperature_0_idx] = torch.argmax(origin_logits[temperature_0_idx], -1).to(torch.int32)
			
		return batch_next_token_ids.to(torch.int32), probs

================================================
FILE: kt-sft/ktransformers/server/balance_serve/sched_rpc.py
================================================
from datetime import datetime
import os
from typing import Optional
import zmq
import pickle
import threading
import torch.multiprocessing as mp
import sys
current_file_path = os.path.abspath(__file__)
# sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
import pickle
import argparse
from ktransformers.server.balance_serve.settings import sched_ext, create_sched_settings, create_sched_settings_qwen2moe, create_sched_settings_qwen3moe


if mp.get_start_method(allow_none=True) is None:
    print('set start method')
    mp.set_start_method('spawn')
else:
    print(f'start method already set to {mp.get_start_method(allow_none=True)}')


class SchedulerServer:
    def __init__(self, settings, main_args):
        self.sched = sched_ext.create_scheduler(settings)
    
        self.context = zmq.Context()
        self.frontend = self.context.socket(zmq.ROUTER)
        print(f"sched zmq rpc server on port {main_args.sched_port}")
        self.frontend.bind(f"tcp://*:{main_args.sched_port}") 

        self.backend = self.context.socket(zmq.DEALER)
        self.backend.bind("inproc://backend")

    def run_scheduler(self):
        self.sched.run()

    def stop_scheduler(self):
        self.sched.stop()

    def start_proxy(self):
        zmq.proxy(self.frontend, self.backend)

    def worker_routine(self):
        worker = self.context.socket(zmq.REP)
        worker.connect("inproc://backend")
        while True:
            try:
                message = worker.recv()
                data = pickle.loads(message)

                method = data.get('method')
                params = data.get('params', {})
                # print(f"Received request: {method}")

                if method == 'add_query':
                    query_add = params.get('query')
                    query_id = self.sched.add_query(query_add)
                    response = {'status': 'ok', 'query_id': query_id}
                    worker.send(pickle.dumps(response))

                elif method == 'cancel_query':
                    query_id = params.get('query_id')
                    self.sched.cancel(query_id)
                    response = {'status': 'ok'}
                    worker.send(pickle.dumps(response))

                elif method == 'update_last_batch':
                    updates = params.get('updates')

                    batch_todo = self.sched.update_last_batch(updates)

                    response = {'status': 'ok', 'batch_todo': batch_todo}
                    # print (batch_todo.query_lengths, batch_todo.query_ids)
                    worker.send(pickle.dumps(response))

                elif method == 'get_inference_context':
                    inference_context = self.sched.get_inference_context()
                    data = {
                        "k_cache":inference_context.k_cache,
                        "v_cache":inference_context.v_cache
                    }
                    print(f"Serializing KVCache")
                    data["k_cache"] = [mp.reductions.reduce_tensor(t) for t in data['k_cache']]
                    data["v_cache"] = [mp.reductions.reduce_tensor(t) for t in data['v_cache']]
                    # print(data)
                    response = {'status': 'ok', 'inference_context': data}

                    worker.send(pickle.dumps(response))
                    # response['inference_context'].k_cache[0][0, 0, 0, 0, 0] = 1 
                    # print("k_cache update")

                else:
                    response = {'status': 'error', 'message': 'Unknown method'}
                    worker.send(pickle.dumps(response))

            except Exception as e:
                response = {'status': 'error', 'message': str(e)}
                worker.send(pickle.dumps(response))

    def start_rpc_service(self):
        try:
            print("Scheduler RPC service is running...")

            threading.Thread(target=self.run_scheduler, daemon=True).start()

            for _ in range(10):
                threading.Thread(target=self.worker_routine, daemon=True).start()

            self.start_proxy()

        except KeyboardInterrupt:
            print("Shutting down scheduler RPC service...")
            self.stop_rpc_service()

    def stop_rpc_service(self):
        self.stop_scheduler()
        self.frontend.close()
        self.backend.close()
        self.context.term()

def start_server(settings, main_args):
    server = SchedulerServer(settings, main_args)
    server.start_rpc_service()


# Add async client for webserver
class SchedulerClient:
    def __init__(self, sched_port):
        address=f'tcp://localhost:{sched_port}'
        self.address = address
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REQ)
        self.socket.connect(self.address)
        print(f"Connected to server at {self.address}")
    
    def __del__(self):
        self.socket.close()
        self.context.term()
    
    def send_request(self, method, params=None):
        if params is None:
            params = {}
        request = {
            'method': method,
            'params': params
        }
        # print(f'send request {request}')
        self.socket.send(pickle.dumps(request))
        response = self.socket.recv()
        # print(response)
        response = pickle.loads(response)
        if response.get('status') == 'ok':
            return response
        else:
            raise Exception(f"Error from server: {response.get('message')}")
    
    def add_query(self, query):
        response = self.send_request('add_query', {'query': query})
        return response.get('query_id')
    
    def cancel_query(self, query_id):
        self.send_request('cancel_query', {'query_id': query_id})
    
    def update_last_batch(self, updates):
        response = self.send_request('update_last_batch', {'updates': updates})
        # print(f"update_last_batch response {response}")
        return response.get('batch_todo')
    
    def rebuild_inferece_context(self,response):
        data = response.get('inference_context')
        inference_context = sched_ext.InferenceContext()
        print('Rebuilding kvcache')
        inference_context.k_cache = [fn(*args) for fn,args in data['k_cache']]
        inference_context.v_cache = [fn(*args) for fn,args in data['v_cache']]
        return inference_context

    def get_inference_context_raw(self):
        response = self.send_request('get_inference_context')
        return response
       

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, required=True)
    args = parser.parse_args()
    with open(args.config, "rb") as f:
        main_args = pickle.load(f)
    if main_args.architectures == "Qwen2MoeForCausalLM": 
        settings = create_sched_settings_qwen2moe(main_args)
    elif main_args.architectures == "Qwen3MoeForCausalLM":
        settings = create_sched_settings_qwen3moe(main_args)
    else:
        settings = create_sched_settings(main_args)
    start_server(settings, main_args)


================================================
FILE: kt-sft/ktransformers/server/balance_serve/settings.py
================================================
'''
Date: 2024-11-13 09:43:39
LastEditors: djw
LastEditTime: 2024-11-18 16:41:03
'''
import sys, os
import yaml, json
from time import sleep


import sched_ext
from transformers import AutoConfig

from ktransformers.models.configuration_qwen3_moe import Qwen3MoeConfig

def create_sched_settings(args):
    default_sample_options = sched_ext.SampleOptions()
    model_name = os.path.basename(os.path.normpath(args.model_dir))
    input_model_settings = sched_ext.ModelSettings()
    input_model_settings.model_path = args.model_dir
    input_model_settings.params_count = int(0)
    model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
    input_model_settings.layer_count = model_config.num_hidden_layers
    input_model_settings.num_k_heads = 1 # model_config["num_key_value_heads"]
    input_model_settings.k_head_dim = 576
    input_model_settings.bytes_per_params = 2
    input_model_settings.bytes_per_kv_cache_element = 2
    settings = sched_ext.Settings()
    settings.model_name = model_name
    settings.quant_type = "BF16"
    settings.model_settings = input_model_settings
    settings.page_size = args.page_size
    settings.gpu_device_count = 1 # tp
    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
    # settings.gpu_memory_size = args.cache_lens*576*2
    settings.gpu_memory_size = args.gpu_memory_size
    settings.memory_utilization_percentage = args.utilization_percentage
    max_batch_size = args.max_batch_size
    chunk_size = args.chunk_size

    max_decode_batch_size = max_batch_size - 2

    settings.max_batch_size = max_batch_size
    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
    settings.sample_options = default_sample_options
    settings.sched_metrics_port = args.sched_metrics_port
    settings.gpu_only = args.memory_gpu_only
    settings.use_self_defined_head_dim = True
    settings.self_defined_head_dim = 576
    settings.full_kv_cache_on_each_gpu = True
    settings.k_cache_on = True
    settings.v_cache_on = False

    settings.kvc2_root_path = '/mnt/data/persist-kvc'
    settings.kvc2_config_path = args.kvc2_config_dir
    settings.memory_pool_size_GB = args.cpu_memory_size_GB
    settings.evict_count = 40
    settings.kvc2_metrics_port = args.kvc2_metrics_port
    settings.load_from_disk = False
    settings.save_to_disk = True


    settings.strategy_name = args.sched_strategy

    settings.auto_derive()
    return settings


def create_sched_settings_qwen2moe(args):
    default_sample_options = sched_ext.SampleOptions()
    model_name = os.path.basename(os.path.normpath(args.model_dir))
    input_model_settings = sched_ext.ModelSettings()
    input_model_settings.model_path = args.model_dir
    input_model_settings.params_count = int(0)
    model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
    input_model_settings.layer_count = model_config.num_hidden_layers
    input_model_settings.num_k_heads = model_config.num_key_value_heads # model_config["num_key_value_heads"]
    input_model_settings.k_head_dim = 128
    input_model_settings.bytes_per_params = 2
    input_model_settings.bytes_per_kv_cache_element = 2
    settings = sched_ext.Settings()
    settings.model_name = model_name
    settings.quant_type = "BF16"
    settings.model_settings = input_model_settings
    settings.page_size = args.page_size
    settings.gpu_device_count = 1 # tp
    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
    # settings.gpu_memory_size = args.cache_lens*576*2
    settings.gpu_memory_size = args.gpu_memory_size
    settings.memory_utilization_percentage = args.utilization_percentage
    max_batch_size = args.max_batch_size
    chunk_size = args.chunk_size

    max_decode_batch_size = max_batch_size - 2

    settings.max_batch_size = max_batch_size
    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
    settings.sample_options = default_sample_options
    settings.sched_metrics_port = args.sched_metrics_port
    settings.gpu_only = args.memory_gpu_only
    settings.use_self_defined_head_dim = False
    settings.self_defined_head_dim = 576
    settings.full_kv_cache_on_each_gpu = True
    settings.k_cache_on = True
    settings.v_cache_on = True

    settings.kvc2_root_path = '/mnt/data/persist-kvc'
    settings.kvc2_config_path = args.kvc2_config_dir
    settings.memory_pool_size_GB = args.cpu_memory_size_GB
    settings.evict_count = 40
    settings.kvc2_metrics_port = args.kvc2_metrics_port
    settings.load_from_disk = False
    settings.save_to_disk = True


    settings.strategy_name = args.sched_strategy

    settings.auto_derive()
    return settings


def create_sched_settings_qwen3moe(args):
    default_sample_options = sched_ext.SampleOptions()
    model_name = os.path.basename(os.path.normpath(args.model_dir))
    input_model_settings = sched_ext.ModelSettings()
    input_model_settings.model_path = args.model_dir
    input_model_settings.params_count = int(0)
    model_config = Qwen3MoeConfig.from_pretrained(args.model_dir, trust_remote_code=True)
    input_model_settings.layer_count = model_config.num_hidden_layers
    input_model_settings.num_k_heads = model_config.num_key_value_heads # model_config["num_key_value_heads"]
    input_model_settings.k_head_dim = 128
    input_model_settings.bytes_per_params = 2
    input_model_settings.bytes_per_kv_cache_element = 2
    settings = sched_ext.Settings()
    settings.model_name = model_name
    settings.quant_type = "BF16"
    settings.model_settings = input_model_settings
    settings.page_size = args.page_size
    settings.gpu_device_count = 1 # tp
    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
    # settings.gpu_memory_size = args.cache_lens*576*2
    settings.gpu_memory_size = args.gpu_memory_size
    settings.memory_utilization_percentage = args.utilization_percentage
    max_batch_size = args.max_batch_size
    chunk_size = args.chunk_size

    max_decode_batch_size = max_batch_size - 2

    settings.max_batch_size = max_batch_size
    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
    settings.sample_options = default_sample_options
    settings.sched_metrics_port = args.sched_metrics_port
    settings.gpu_only = args.memory_gpu_only
    settings.use_self_defined_head_dim = False
    settings.self_defined_head_dim = 576
    settings.full_kv_cache_on_each_gpu = True
    settings.k_cache_on = True
    settings.v_cache_on = True

    settings.kvc2_root_path = '/mnt/data/persist-kvc'
    settings.kvc2_config_path = args.kvc2_config_dir
    settings.memory_pool_size_GB = args.cpu_memory_size_GB
    settings.evict_count = 40
    settings.kvc2_metrics_port = args.kvc2_metrics_port
    settings.load_from_disk = False
    settings.save_to_disk = True


    settings.strategy_name = args.sched_strategy

    settings.auto_derive()
    return settings


================================================
FILE: kt-sft/ktransformers/server/config/config.py
================================================
#!/usr/bin/env python
# coding=utf-8
"""
Description  :
Author       : unicornchan
Date         : 2024-06-11 16:35:42
Version      : 1.0.0
LastEditors  : WuHao
LastEditTime : 2024-08-12 06:31:14
"""
import os
import shutil
import yaml
import psutil

from ktransformers.server.config.singleton import Singleton
from typing import Optional


class Config(metaclass=Singleton):
    """Singleton pattern Config class, used to get all configurations."""

    CONFIG_FILE_NAME = "config.yaml"

    @staticmethod
    def load() -> dict:
        """load config file

        Returns:
            dict: all configs
        """
        base_path: str = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        config_yaml: str = os.path.join(base_path, "configs", Config.CONFIG_FILE_NAME)

        user_path: str = os.path.expanduser("~")
        localstore_path: str = os.path.join(user_path, ".ktransformers")
        kvc2_config_dir = os.path.join(localstore_path, "kvc2")
        config_path: str = os.path.join(localstore_path, Config.CONFIG_FILE_NAME)
        if not os.path.exists(config_yaml):
            print(f"Can't find config file, {config_yaml}")
            exit(-1)
        if not os.path.exists(localstore_path):
            os.mkdir(localstore_path)
        if not os.path.exists(kvc2_config_dir):
            os.mkdir(kvc2_config_dir)
        if not os.path.exists(config_path):
            shutil.copyfile(config_yaml, config_path)
        with open(config_path, "r", encoding="utf-8") as fp:
            config = yaml.safe_load(fp)
        return config

    @staticmethod
    def to_path(path: str) -> str:
        """
        process file path
        """
        base_path = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        real_path = path if os.path.isabs(path) else os.path.join(base_path, path)
        return real_path

    def __init__(self):
        cfg = Config.load()
        self.base_path = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        self.user_path: str = os.path.expanduser("~")
        self.localstore_path: str = os.path.join(self.user_path, ".ktransformers")
        # log configs
        self.log_dir = os.path.join(self.localstore_path, cfg["log"]["dir"])
        if not os.path.exists(self.log_dir):
            os.mkdir(self.log_dir)
        self.log_file = cfg["log"]["file"]
        self.log_level = cfg["log"]["level"]
        self.backup_count = cfg["log"]["backup_count"]

        self.kvc2_config_dir = os.path.join(self.localstore_path, "kvc2")
        # server configs
        self.server: dict = cfg.get("server", {})
        self.server_ip = self.server.get("ip", "0.0.0.0")
        self.server_port = self.server.get("port", 9016)
        self.api_key = self.server.get("api_key", "")

        # db configs
        self.db_configs: dict = cfg.get("db", {})
        self.db_type = self.db_configs.get("type", "")
        self.db_host = self.localstore_path
        self.db_port = self.db_configs.get("port", "")
        self.db_name = self.db_configs.get("database", "")
        self.db_pool_size = self.db_configs.get("pool_size")
        self.db_database = self.db_configs.get("database", "")

        # user config
        self.user_config: dict = cfg.get("user", {})
        self.user_secret_key = self.user_config.get("secret_key", "")
        self.user_algorithm = self.user_config.get("algorithm", "")
        self.user_force_think = self.user_config.get("force_think", False)

        # model config
        self.model: dict = cfg.get("model", {})
        self.backend_type: str = self.model.get("type", "transformers")
        self.model_dir: str = self.model.get("path", "")
        # to make sure it consistent with previous version
        self.model_path: str = self.model_dir
        self.model_name: str = self.model.get("name", "")
        self.architectures: str = self.model.get("name", "")
        self.model_device: str = self.model.get("device", "cuda:0")
        self.gguf_path: Optional[str] = self.model.get("gguf_path", None)
        self.use_cuda_graph = self.model.get("use_cuda_graph", True)
        self.trust_remote_code = self.model.get("trust_remote_code", True)
        # self.model_cache_lens = self.model.get("cache_lens")
        self.optimize_config_path: Optional[str] = self.model.get(
            "optimize_config_path", None
        )
        
        self.max_new_tokens = self.model.get("max_new_tokens", 2000)
        self.json_mode = self.model.get("json_mode", False)
        self.healing = self.model.get("healing", False)
        self.ban_strings: Optional[list] = self.model.get("ban_strings", None)
        self.gpu_split: Optional[str] = self.model.get("gpu_split", None)
        self.length: Optional[int] = self.model.get("length", None)
        self.rope_scale: Optional[float] = self.model.get("rope_scale", None)
        self.rope_alpha: Optional[float] = self.model.get("rope_alpha", None)
        self.no_flash_attn = self.model.get("no_flash_attn", False)
        self.low_mem = self.model.get("low_mem", False)
        self.experts_per_token: Optional[int] = self.model.get("experts_per_token", None)
        self.load_q4 = self.model.get("load_q4", False)
        self.fast_safetensors = self.model.get("fast_safetensors", False)
        self.draft_model_dir: Optional[str] = self.model.get("draft_model_dir", None)
        self.no_draft_scale = self.model.get("no_draft_scale", False)
        self.modes = self.model.get("modes", False)
        self.mode = self.model.get("mode", "llama")
        self.username = self.model.get("username", "User")
        self.botname = self.model.get("botname", "Chatbort")
        self.system_prompt: Optional[str] = self.model.get("system_prompt", None)
        self.temperature = self.model.get("temperature", 0.95)
        self.smoothing_factor = self.model.get("smoothing_factor", 0.0)
        self.dynamic_temperature: Optional[str] = self.model.get("dynamic_temperature", None)
        self.top_k = self.model.get("top_k", 50)
        self.top_p = self.model.get("top_p", 0.8)
        self.top_a = self.model.get("top_a", 0.0)
        self.skew = self.model.get("skew", 0.0)
        self.typical = self.model.get("typical", 0.0)
        self.repetition_penalty = self.model.get("repetition_penalty", 1.01)
        self.frequency_penalty = self.model.get("frequency_penalty", 0.0)
        self.presence_penalty = self.model.get("presence_penalty", 0.0)
        self.response_chunk = self.model.get("response_chunk", 250)
        self.no_code_formatting = self.model.get("no_code_formatting", False)
        self.cache_8bit = self.model.get("cache_8bit", False)
        self.cache_q4 = self.model.get("cache_q4", True)
        self.ngram_decoding = self.model.get("ngram_decoding", False)
        self.print_timings = self.model.get("print_timings", False)
        self.amnesia = self.model.get("amnesia", False)
        self.batch_size = self.model.get("batch_size", 1)
        self.cache_lens = self.model.get("cache_lens", 4096)
        self.device = self.model.get("device", "cuda:2")

        # web config
        self.web: dict = cfg.get("web", {})
        self.web_cross_domain: bool = self.web.get("open_cross_domain", True)
        self.mount_web: bool = self.web.get("mount", False)

        # ext
        self.ext: dict = cfg.get("ext", {})
        self.cpu_infer = psutil.cpu_count(logical=False) - 3

        # file config
        self.local_store_configs: dict = cfg.get("local_store", {})
        self.file_upload_dir: str = os.path.join(
            self.localstore_path, self.local_store_configs.get("file_upload_dir", "")
        )
        self.assistant_store_dir: str = os.path.join(
            self.localstore_path, self.local_store_configs.get("assistant_store_dir", "")
        )

        # long context config
        self.long_context_config: dict = cfg.get("long_context", {})
        self.max_seq_len = self.long_context_config.get("max_seq_len", 32000)
        self.block_size = self.long_context_config.get("block_size", 128)
        self.local_windows_len = self.long_context_config.get("local_windows_len", 4096)
        self.second_select_num = self.long_context_config.get("second_select_num", 32)
        self.anchor_type = self.long_context_config.get("anchor_type", "DYNAMIC")
        self.kv_type = self.long_context_config.get("kv_type", "FP16")
        self.dense_layer_num = self.long_context_config.get("dense_layer_num", 2)
        self.anchor_num = self.long_context_config.get("anchor_num", 1)
        self.preselect_block = self.long_context_config.get("preselect_block", True)
        self.head_select_mode = self.long_context_config.get("head_select_mode", "SHARED")
        self.preselect_block_count = self.long_context_config.get("preselect_block_count", 32)
        self.layer_step = self.long_context_config.get("layer_step", 1)
        self.token_step = self.long_context_config.get("token_step", 100)

        # local chat
        self.local_chat_config: dict = cfg.get("local_chat", {})
        self.prompt_file = self.local_chat_config.get("prompt_file", None)

        # asyncserver
        self.sched_strategy = cfg["async_server"]["sched_strategy"]
        self.sched_port = cfg["async_server"]["sched_port"]
        self.sched_metrics_port = cfg["async_server"]["sched_metrics_port"]
        self.kvc2_metrics_port = cfg["async_server"]["kvc2_metrics_port"]
        self.max_batch_size = cfg["async_server"]["max_batch_size"]
        self.page_size = cfg["attn"]["page_size"]
        self.chunk_size = cfg["attn"]["chunk_size"]
        self.memory_gpu_only = cfg["kvc2"]["gpu_only"]
        self.cache_lens = ((self.cache_lens + self.page_size - 1) // self.page_size) * self.page_size
        self.gpu_memory_size = 2*576*61*self.cache_lens
        self.utilization_percentage = 1.0 #cfg["kvc2"]["utilization_percentage"]
        self.cpu_memory_size_GB = cfg["kvc2"]["cpu_memory_size_GB"]
        # only support 2 prefill task
        self.max_prefill_batch_size = 2
        self.max_decode_batch_size = self.max_batch_size - self.max_prefill_batch_size 


================================================
FILE: kt-sft/ktransformers/server/config/log.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : unicornchan
Date         : 2024-06-12 02:48:39
Version      : 1.0.0
LastEditors  : chenxl 
LastEditTime : 2024-07-27 01:55:50
'''

import codecs
import logging
import os
import re
import locale
from pathlib import Path
from logging.handlers import BaseRotatingHandler
import time
import colorlog

from ktransformers.server.config.config import Config


class DailyRotatingFileHandler(BaseRotatingHandler):
    """
    such as 'logging.TimeRotatingFileHandler', Additional features:
     - support multiprocess
     - support rotating daily
    """

    def __init__(self, filename, backupCount=0, encoding=None, delay=False, utc=False, **kwargs): # pylint: disable=unused-argument
        self.backup_count = backupCount
        self.utc = utc
        self.suffix = "%Y-%m-%d"
        self.base_log_path = Path(filename)
        if not os.path.exists(self.base_log_path.parent):
            os.makedirs(self.base_log_path.parent)
        self.base_filename = self.base_log_path.name
        self.current_filename = self._compute_fn()
        self.current_log_path = self.base_log_path.with_name(
            self.current_filename)
        BaseRotatingHandler.__init__(self, filename, 'a', encoding, delay)

    # pylint: disable=unused-argument, invalid-name
    def shouldRollover(self, record):
        """
        Determine whether to rotate the log. If the log filename corresponding to the current 
        time is not consistent with the currently opened log filename, then it is necessary
        to rotate the log
        Args:
            record: record is not used, as we are just comparing times, but it is needed so
        the method signatures are the same
        """
        if self.current_filename != self._compute_fn():
            return True
        return False

    def doRollover(self):
        """
        roll over
        """
        # close last log file
        if self.stream:
            self.stream.close()
            self.stream = None  # type: ignore

        # gen new log file name
        self.current_filename = self._compute_fn()
        self.current_log_path = self.base_log_path.with_name(
            self.current_filename)

        if not self.delay:
            self.stream = self._open() # type: ignore

        self.delete_expired_files()

    def _compute_fn(self):
        """
        gen log file name
        """
        return self.base_filename + "." + time.strftime(self.suffix, time.localtime())

    def _open(self):
        """
        open a new log file, create soft link
        """
        if self.encoding is None:
            stream = open(str(self.current_log_path), self.mode, encoding=locale.getpreferredencoding())
        else:
            stream = codecs.open(str(self.current_log_path), self.mode, self.encoding)

        if self.base_log_path.exists():
            try:
                if not self.base_log_path.is_symlink() or os.readlink(self.base_log_path) != self.current_filename:
                    os.remove(self.base_log_path)
            except OSError:
                pass

        try:
            os.symlink(self.current_filename, str(self.base_log_path))
        except OSError:
            pass
        return stream

    def delete_expired_files(self):
        """
        delete expired files every day
        """
        if self.backup_count <= 0:
            return

        file_names = os.listdir(str(self.base_log_path.parent))
        result = []
        prefix = self.base_filename + "."
        plen = len(prefix)
        for file_name in file_names:
            if file_name[:plen] == prefix:
                suffix = file_name[plen:]
                if re.match(r"^\d{4}-\d{2}-\d{2}(\.\w+)?$", suffix):
                    result.append(file_name)
        if len(result) < self.backup_count:
            result = []
        else:
            result.sort()
            result = result[:len(result) - self.backup_count]

        for file_name in result:
            os.remove(str(self.base_log_path.with_name(file_name)))


class Logger(object):
    """
    logger class
    """
    level_relations = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warn': logging.WARNING,
        'error': logging.ERROR,
        'crit': logging.CRITICAL
    }

    def __init__(self, level: str = 'info'):
        fmt = '%(asctime)s %(levelname)s %(pathname)s[%(lineno)d] %(funcName)s: %(message)s'
        cfg: Config = Config()
        filename: str = os.path.join(cfg.log_dir, cfg.log_file)
        backup_count: int = cfg.backup_count
        th = DailyRotatingFileHandler(filename=filename, when='MIDNIGHT', backupCount=backup_count, encoding="utf-8")
        th.setFormatter(logging.Formatter(fmt))


        color_fmt = (
            '%(log_color)s%(asctime)s %(levelname)s %(pathname)s[%(lineno)d]: %(message)s'
        )
        color_formatter = colorlog.ColoredFormatter(
            color_fmt,
            log_colors={
                'DEBUG': 'cyan',
                'INFO': 'green',
                'WARNING': 'yellow',
                'ERROR': 'red',
                'CRITICAL': 'bold_red'
            }
        )

        sh = logging.StreamHandler()
        sh.setFormatter(color_formatter)

        self.logger = logging.getLogger(filename)
        self.logger.setLevel(self.level_relations.get(level)) # type: ignore
        self.logger.addHandler(th)
        self.logger.addHandler(sh)


logger = Logger(level=Config().log_level).logger


================================================
FILE: kt-sft/ktransformers/server/config/singleton.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  : Implement singleton
Author       : unicornchan
Date         : 2024-06-11 17:08:36
Version      : 1.0.0
LastEditors  : chenxl 
LastEditTime : 2024-07-27 01:55:56
'''
import abc

class Singleton(abc.ABCMeta, type):
    """_summary_

    Args:
        abc.ABCMeta: Provide a mechanism for defining abstract methods and properties,
            enforcing subclasses to implement these methods and properties.
        type: Inherit from 'type' to make 'Singleton' a metaclass,
            enabling the implementation of the Singleton
    """
    _instances = {}

    def __call__(cls, *args, **kwds):
        if cls not in cls._instances:
            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwds)
        return cls._instances[cls]

class AbstractSingleton(abc.ABC, metaclass=Singleton):
    """Provided an abstract Singleton base class, any class inheriting from
       this base class will automatically become a Singleton class.

    Args:
        abc.ABC: Abstract base class, it cannot be instantiated, only inherited. 
    """


================================================
FILE: kt-sft/ktransformers/server/crud/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/server/crud/assistants/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/server/crud/assistants/assistants.py
================================================
from time import time
from typing import Optional,List
from uuid import uuid4

from ktransformers.server.models.assistants.assistants import Assistant
from ktransformers.server.schemas.assistants.assistants import AssistantCreate,AssistantObject,AssistantModify
from ktransformers.server.utils.sql_utils import SQLUtil
from ktransformers.server.config.log import logger
from ktransformers.server.schemas.base import Order


class AssistantDatabaseManager:
    def __init__(self) -> None:
        self.sql_util = SQLUtil()

    def create_assistant_object(self, assistant: AssistantCreate) -> AssistantObject:
        assistant = AssistantObject(
            **assistant.model_dump(mode='json'),
            id=str(uuid4()),
            object='assistant',
            created_at=int(time()),
        )
        return assistant

    def db_count_assistants(self) -> int:
        with self.sql_util.get_db() as db:
            return db.query(Assistant).count()

    def db_create_assistant(self, assistant: AssistantCreate):
        ass_obj = self.create_assistant_object(assistant)
        ass_obj.sync_db()
        return ass_obj

    def db_list_assistants(self, limit: Optional[int], order: Order) -> List[AssistantObject]:
        with self.sql_util.get_db() as db:
            query = db.query(Assistant).order_by(
                order.to_sqlalchemy_order()(Assistant.created_at))
            if limit is not None:
                db_assistants = query.limit(limit)
            else:
                db_assistants = query.all()
            return [AssistantObject.model_validate(a.__dict__) for a in db_assistants]

    def db_get_assistant_by_id(self, assistant_id: str) -> Optional[AssistantObject]:
        with self.sql_util.get_db() as db:
            db_assistant = db.query(Assistant).filter(
                Assistant.id == assistant_id).first()
            if db_assistant is None:
                logger.debug(f"no assistant with id {str}")
                return None
            return AssistantObject.model_validate(db_assistant.__dict__)

    def db_update_assistant_by_id(self, assistant_id: str, assistant: AssistantModify):
        with self.sql_util.get_db() as db:
            db_assistant = db.query(Assistant).filter(
                Assistant.id == assistant_id).first()
            self.sql_util.db_update_commit_refresh(db, db_assistant, assistant)
            return AssistantObject.model_validate(db_assistant.__dict__)

    def db_delete_assistant_by_id(self, assistant_id: str):
        with self.sql_util.get_db() as db:
            db_assistant = db.query(Assistant).filter(
                Assistant.id == assistant_id).first()
            db.delete(db_assistant)
            db.commit()


================================================
FILE: kt-sft/ktransformers/server/crud/assistants/messages.py
================================================
from time import time
from typing import Optional
from uuid import uuid4

from ktransformers.server.models.assistants.messages import Message
from ktransformers.server.schemas.assistants.messages import MessageCore, MessageCreate,  MessageObject
from ktransformers.server.schemas.base import Order,ObjectID
from ktransformers.server.utils.sql_utils import SQLUtil

class MessageDatabaseManager:
    def __init__(self) -> None:
        self.sql_util = SQLUtil()

    @staticmethod
    def create_db_message_by_core(message: MessageCore):
        message_dict = message.model_dump(mode="json")
        return Message(**message_dict, id=str(uuid4()), created_at=int(time()))

    def create_db_message(self, message: MessageCreate):
        return MessageDatabaseManager.create_db_message_by_core(message.to_core())

    def db_add_message(self, message: Message):
        with self.sql_util.get_db() as db:
            db.add(message)
            self.sql_util.db_add_commit_refresh(db, message)

    def db_create_message(self, thread_id: str, message: MessageCreate, status: MessageObject.Status):
        db_message = self.create_db_message(message)
        db_message.status = status.value
        db_message.thread_id = thread_id
        self.db_add_message(db_message)
        return MessageObject.model_validate(db_message.__dict__)

    @staticmethod
    def create_message_object(thread_id: ObjectID, run_id: ObjectID, message: MessageCreate):
        core = message.to_core()
        return MessageObject(
            **core.model_dump(mode='json'),
            id=str(uuid4()),
            object='thread.message',
            created_at=int(time()),
            thread_id=thread_id,
            run_id=run_id,
            status=MessageObject.Status.in_progress,
        )

    def db_sync_message(self, message: MessageObject):
        db_message = Message(
            **message.model_dump(mode="json"),
        )
        with self.sql_util.get_db() as db:
            self.sql_util.db_merge_commit(db, db_message)

    def db_list_messages_of_thread(
            self, thread_id: str, limit: Optional[int] = None, order: Order = Order.DESC):

        # logger.debug(
        #     f"list messages of: {thread_id}, limit {limit}, order {order}")
        with self.sql_util.get_db() as db:
            query = (
                db.query(Message)
                .filter(Message.thread_id == thread_id)
                .order_by(order.to_sqlalchemy_order()(Message.created_at))
            )
            if limit is not None:
                messages = query.limit(limit)
            else:
                messages = query.all()
            message_list = [MessageObject.model_validate(m.__dict__) for m in messages]
        return message_list

    def db_get_message_by_id(self, thread_id: ObjectID, message_id: ObjectID) -> MessageObject:
        with self.sql_util.get_db() as db:
            message = db.query(Message).filter(
                Message.id == message_id).first()
        assert message.thread_id == thread_id
        message_info = MessageObject.model_validate(message.__dict__)
        return message_info

    def db_delete_message_by_id(self, thread_id: ObjectID, message_id: ObjectID):
        with self.sql_util.get_db() as db:
            message = db.query(Message).filter(
                Message.id == message_id).first()
            assert message.thread_id == thread_id
            db.delete(message)
            db.commit()


================================================
FILE: kt-sft/ktransformers/server/crud/assistants/runs.py
================================================
from time import time
from uuid import uuid4

from ktransformers.server.models.assistants.runs import Run
from ktransformers.server.schemas.assistants.runs import RunCreate,RunObject
from ktransformers.server.schemas.base import ObjectID
from ktransformers.server.utils.sql_utils import SQLUtil


class RunsDatabaseManager:
    def __init__(self) -> None:
        self.sql_util = SQLUtil()

    def create_run_object(self, thread_id: ObjectID, run: RunCreate) -> RunObject:
        run_obj = RunObject(
            **run.model_dump(mode='json', exclude={"stream"}),
            id=str(uuid4()),
            object='run',
            created_at=int(time()),
            thread_id=thread_id,
            status=RunObject.Status.queued,
        )
        run_obj.set_compute_save(0)
        return run_obj

    def db_create_run(self, thread_id: str, run: RunCreate):
        db_run = Run(
            **run.model_dump(mode="json", exclude={"stream"}),
            id=str(uuid4()),
            created_at=int(time()),
            status="queued",
            thread_id=thread_id,
        )
        with self.sql_util.get_db() as db:
            self.sql_util.db_add_commit_refresh(db, db_run)
            run_obj = RunObject.model_validate(db_run.__dict__)
            run_obj.set_compute_save(0)
        return run_obj

    def db_sync_run(self, run: RunObject) -> None:
        db_run = Run(
            **run.model_dump(mode='json'),
        )
        with self.sql_util.get_db() as db:
            self.sql_util.db_merge_commit(db, db_run)

    def db_get_run(self, run_id: ObjectID) -> RunObject:
        with self.sql_util.get_db() as db:
            db_run = db.query(Run).filter(Run.id == run_id).first()
            return RunObject.model_validate(db_run.__dict__)


================================================
FILE: kt-sft/ktransformers/server/crud/assistants/threads.py
================================================
from time import time
from typing import Optional,List
from uuid import uuid4

from ktransformers.server.models.assistants.messages import Message
from ktransformers.server.models.assistants.threads import Thread
from ktransformers.server.schemas.assistants.threads import ThreadCreate,ThreadObject
from ktransformers.server.schemas.base import ObjectID, Order
from ktransformers.server.schemas.conversation import ThreadPreview
from ktransformers.server.utils.sql_utils import SQLUtil
from ktransformers.server.crud.assistants.messages import MessageDatabaseManager
from ktransformers.server.config.log import logger
from ktransformers.server.crud.assistants.assistants import AssistantDatabaseManager

class ThreadsDatabaseManager:
    def __init__(self) -> None:
        self.sql_util = SQLUtil()
        self.message_manager = MessageDatabaseManager()
        self.assistant_maanager = AssistantDatabaseManager()

    def db_create_thread(self, thread: ThreadCreate):
        thread_id = str(uuid4())
        db_messages = []
        with self.sql_util.get_db() as db:
            if thread.messages is not None:
                logger.debug("Creating messages first for thread")
                for message in thread.messages:
                    db_message: Message = MessageDatabaseManager.create_db_message_by_core(
                        message)
                    db_message.role = "user"
                    db_message.thread_id = thread_id
                    db.add(db_message)
                    db_messages.append(db_message)

            db_thread = Thread(
                **thread.model_dump(exclude="messages"),
                id=str(uuid4()),
                created_at=int(time()),
                messages=db_messages,
            )

            self.sql_util.db_add_commit_refresh(db, db_thread)
            thread_obj = ThreadObject.model_validate(db_thread.__dict__)

            if 'assistant_id' in thread.meta_data:
#                assistant = self.assistant_maanager.db_get_assistant_by_id(thread.meta_data['assistant_id'], db)
                assistant = self.assistant_maanager.db_get_assistant_by_id(thread.meta_data['assistant_id'])
                logger.info(
                    f'Append this related thread to assistant {assistant.id}')
                assistant.append_related_threads([thread_obj.id])
                assistant.sync_db(db)
        return thread_obj

    def db_get_thread_by_id(self, thread_id: ObjectID):
        with self.sql_util.get_db() as db:
            db_thread = db.query(Thread).filter(Thread.id == thread_id).first()
            return ThreadObject.model_validate(db_thread.__dict__)

    def db_list_threads(self, limit: Optional[int], order: Order) -> List[ThreadObject]:
        with self.sql_util.get_db() as db:
            query = db.query(Thread).order_by(order.to_sqlalchemy_order()(
                Thread.created_at)).filter(~Thread.meta_data.contains('assistant_id'))

            if limit is not None:
                db_threads = query.limit(limit)
            else:
                db_threads = query.all()

            return [ThreadObject.model_validate(tool.__dict__) for tool in db_threads]

    def db_list_threads_preview(self, limit: Optional[int], order: Order) -> List[ThreadPreview]:
        threads = self.db_list_threads(limit, order)
        previews = []
        for thread in threads:
            messages = self.message_manager.db_list_messages_of_thread(
                thread.id, limit=2, order=Order.ASC)
            if len(messages) == 2:
                message = messages[0]
                assistant = self.assistant_maanager.db_get_assistant_by_id(
                    messages[1].assistant_id)
            else:
                message = None
                assistant = None
            previews.append(ThreadPreview(
                assistant=assistant, thread=thread, first_message=message))
        return previews

    def db_delete_thread_by_id(self, thread_id: ObjectID):
        with self.sql_util.get_db() as db:
            db_thread = db.query(Thread).filter(Thread.id == thread_id).first()
            db.delete(db_thread)
            # TODO delete related messages and runs and other stuff or just gc
            db.commit()


================================================
FILE: kt-sft/ktransformers/server/exceptions.py
================================================
from fastapi import HTTPException, status


def db_exception():
    return HTTPException(
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        detail="DB Error",
    )


def not_implemented(what):
    return HTTPException(
        status_code=status.HTTP_501_NOT_IMPLEMENTED,
        detail=f"{what} not implemented",
    )


def internal_server_error(what):
    return HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"{what}")


def request_error(what):
    return HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"{what}")


================================================
FILE: kt-sft/ktransformers/server/main.py
================================================
import os
import re
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
import uvicorn.logging
import uvicorn
import sys
project_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
from fastapi.middleware.cors import CORSMiddleware
from ktransformers.server.args import ArgumentParser
from ktransformers.server.config.config import Config
from ktransformers.server.utils.create_interface import create_interface, GlobalInterface
from fastapi.openapi.utils import get_openapi
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from ktransformers.server.api import router, post_db_creation_operations
from ktransformers.server.utils.sql_utils import Base, SQLUtil
from ktransformers.server.config.log import logger


def mount_app_routes(mount_app: FastAPI):
    sql_util = SQLUtil()
    logger.info("Creating SQL tables")
    Base.metadata.create_all(bind=sql_util.sqlalchemy_engine)
    post_db_creation_operations()
    mount_app.include_router(router)


def create_app():
    cfg = Config()
    if(hasattr(GlobalInterface.interface, "lifespan")):
        app = FastAPI(lifespan=GlobalInterface.interface.lifespan)
    else:
        app = FastAPI()
    if Config().web_cross_domain:
        app.add_middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_credentials=True,
            allow_methods=["*"],
            allow_headers=["*"],
        )
    mount_app_routes(app)
    if cfg.mount_web:
        mount_index_routes(app)
    return app


def update_web_port(config_file: str):
    ip_port_pattern = (
        r"(localhost|((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)):[0-9]{1,5}"
    )
    with open(config_file, "r", encoding="utf-8") as f_cfg:
        web_config = f_cfg.read()
    ip_port = "localhost:" + str(Config().server_port)
    new_web_config = re.sub(ip_port_pattern, ip_port, web_config)
    with open(config_file, "w", encoding="utf-8") as f_cfg:
        f_cfg.write(new_web_config)


def mount_index_routes(app: FastAPI):
    project_dir = os.path.dirname(os.path.dirname(__file__))
    web_dir = os.path.join(project_dir, "website/dist")
    web_config_file = os.path.join(web_dir, "config.js")
    update_web_port(web_config_file)
    if os.path.exists(web_dir):
        app.mount("/web", StaticFiles(directory=web_dir), name="static")
    else:
        err_str = f"No website resources in {web_dir}, please complile the website by npm first"
        logger.error(err_str)
        print(err_str)
        exit(1)


def run_api(app, host, port, **kwargs):
    if kwargs.get("ssl_keyfile") and kwargs.get("ssl_certfile"):
        uvicorn.run(
            app,
            host=host,
            port=port,
            ssl_keyfile=kwargs.get("ssl_keyfile"),
            ssl_certfile=kwargs.get("ssl_certfile"),
        )
    else:
        uvicorn.run(app, host=host, port=port, log_level="debug")


def custom_openapi(app):
    if app.openapi_schema:
        return app.openapi_schema
    openapi_schema = get_openapi(
        title="ktransformers server",
        version="1.0.0",
        summary="This is a server that provides a RESTful API for ktransformers.",
        description="We provided chat completion and openai assistant interfaces.",
        routes=app.routes,
    )
    openapi_schema["info"]["x-logo"] = {"url": "https://kvcache.ai/media/icon_1.png"}
    app.openapi_schema = openapi_schema
    return app.openapi_schema


def main():
    cfg = Config()

    arg_parser = ArgumentParser(cfg)

    args = arg_parser.parse_args()
    create_interface(config=cfg, default_args=cfg)
    app = create_app()
    custom_openapi(app)

    run_api(
        app=app,
        host=args.host,
        port=args.port,
        ssl_keyfile=args.ssl_keyfile,
        ssl_certfile=args.ssl_certfile,
    )

if __name__ == "__main__":
    main()


================================================
FILE: kt-sft/ktransformers/server/models/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/server/models/assistants/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/server/models/assistants/assistants.py
================================================
from sqlalchemy import JSON, Column, Float, Integer, String, Text
from sqlalchemy.orm import relationship

from ktransformers.server.utils.sql_utils import Base


class Assistant(Base):
    __tablename__ = "assistants"

    id = Column(String, primary_key=True, index=True)
    object = Column(String, default="assistant")
    created_at = Column(Integer)

    name = Column(String, nullable=True)
    description = Column(String, nullable=True)
    model = Column(String)
    instructions = Column(Text, nullable=True)
    tools = Column(JSON)
    tool_resources = Column(JSON)
    temperature = Column(Float, nullable=True)
    meta_data = Column(JSON, nullable=True)
    top_p = Column(Float, nullable=True)
    response_format = Column(JSON, default="auto")

    build_status = Column(JSON, nullable=True)

    runs = relationship("Run", back_populates="assistant")

    messages = relationship("Message", back_populates="assistant")


================================================
FILE: kt-sft/ktransformers/server/models/assistants/messages.py
================================================
from sqlalchemy import JSON, Column, ForeignKey, Integer, String
from sqlalchemy.orm import relationship

from ktransformers.server.utils.sql_utils import Base


class Message(Base):
    __tablename__ = "messages"

    id = Column(String, primary_key=True, index=True)
    object = Column(String, default="thread.message")
    created_at = Column(Integer)

    thread_id = Column(String, ForeignKey("threads.id"))
    status = Column(String, default="in_progress")
    incomplete_details = Column(JSON, nullable=True)
    completed_at = Column(Integer, nullable=True)
    incomplete_at = Column(Integer, nullable=True)
    role = Column(JSON)
    content = Column(JSON)
    assistant_id = Column(String, ForeignKey("assistants.id"), nullable=True)
    run_id = Column(String, ForeignKey("runs.id"), nullable=True)
    attachments = Column(JSON, nullable=True)
    meta_data = Column(JSON, nullable=True)

    thread = relationship("Thread", back_populates="messages")
    assistant = relationship("Assistant", back_populates="messages")
    run = relationship("Run", back_populates="message")


================================================
FILE: kt-sft/ktransformers/server/models/assistants/run_steps.py
================================================
from sqlalchemy import JSON, Column, ForeignKey, Integer, String
from sqlalchemy.orm import relationship

from ktransformers.server.utils.sql_utils import Base


class RunStep(Base):
    __tablename__ = "run_steps"
    # todo
    id = Column(String, primary_key=True, index=True)
    object = Column(String, default="thread.run.step")
    created_at = Column(Integer)

    assistant_id = Column(String, ForeignKey("assistants.id"))
    thread_id = Column(String, ForeignKey("threads.id"))
    run_id = Column(String, ForeignKey("runs.id"))
    type = Column(String)
    status = Column(String)
    step_details = Column(JSON)
    last_error = Column(JSON, nullable=True)
    expires_at = Column(Integer, nullable=True)
    cancelled_at = Column(Integer, nullable=True)
    failed_at = Column(Integer, nullable=True)
    completed_at = Column(Integer, nullable=True)

    meta_data = Column(JSON, nullable=True)
    usage = Column(JSON, nullable=True)

    assistant = relationship("Assistant", back_populates="run_steps")
    thread = relationship("Thread", back_populates="run_steps")
    run = relationship("Run", back_populates="run_steps")


================================================
FILE: kt-sft/ktransformers/server/models/assistants/runs.py
================================================
from sqlalchemy import JSON, Column, Float, ForeignKey, Integer, String, Text
from sqlalchemy.orm import relationship

from ktransformers.server.utils.sql_utils import Base


class Run(Base):
    __tablename__ = "runs"

    id = Column(String, primary_key=True, index=True)
    object = Column(String, default="thread.run")
    created_at = Column(Integer)
    thread_id = Column(String, ForeignKey("threads.id"))
    assistant_id = Column(String, ForeignKey("assistants.id"))
    status = Column(String)
    required_action = Column(JSON, nullable=True)
    last_error = Column(JSON, nullable=True)
    expires_at = Column(Integer, nullable=True)
    started_at = Column(Integer, nullable=True)
    cancelled_at = Column(Integer, nullable=True)
    failed_at = Column(Integer, nullable=True)
    completed_at = Column(Integer, nullable=True)
    incomplete_details = Column(JSON, nullable=True)
    # get from assistant
    model = Column(String)
    instructions = Column(Text, nullable=True)
    tools = Column(JSON)
    meta_data = Column(JSON, nullable=True)
    usage = Column(JSON, nullable=True)
    temperature = Column(Float, nullable=True)
    top_p = Column(Float, nullable=True)
    max_propmp_tokens = Column(Integer, nullable=True)
    truncation_strategy = Column(JSON)
    tool_choice = Column(JSON)
    response_format = Column(JSON, default="auto")

    thread = relationship("Thread", back_populates="runs")
    assistant = relationship("Assistant", back_populates="runs")
    message = relationship("Message", back_populates="run")


================================================
FILE: kt-sft/ktransformers/server/models/assistants/threads.py
================================================
from sqlalchemy import JSON, Column, Integer, String
from sqlalchemy.orm import relationship

from ktransformers.server.utils.sql_utils import Base


class Thread(Base):
    __tablename__ = "threads"

    id = Column(String, primary_key=True, index=True)
    object = Column(String, default="thread")
    created_at = Column(Integer)

    tool_resources = Column(JSON, nullable=True)
    meta_data = Column(JSON, nullable=True)

    runs = relationship("Run", back_populates="thread")
    messages = relationship("Message", back_populates="thread")


================================================
FILE: kt-sft/ktransformers/server/schemas/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/server/schemas/assistants/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/server/schemas/assistants/assistants.py
================================================
from enum import Enum
from time import time
from typing import AsyncIterable, Callable, Dict, List, Optional, Union
from asyncio import Lock, Queue

from fastapi import logger
from pydantic import BaseModel, Field, PrivateAttr, field_validator, model_validator
import torch

from ktransformers.server.config.config import Config
from ktransformers.server.models.assistants.assistants import Assistant
from ktransformers.server.models.assistants.threads import Thread
from ktransformers.server.schemas.assistants.messages import Role
from ktransformers.server.schemas.assistants.runs import RunObject,RunStreamResponse,ObjectWithCreatedTime
from ktransformers.server.schemas.assistants.threads import ThreadObject
from ktransformers.server.schemas.base import Metadata,MetadataField,ObjectID
from ktransformers.server.schemas.assistants.tool import Tool,CodeInterpreter,FileSearch,RelatedThreads,FuntionTool,ToolResource,CodeInterpreterResource,FileSearchResource,RelatedThreadsResource,ToolType
from ktransformers.server.utils.sql_utils import SQLUtil


class AssistantBase(BaseModel):
    name: Optional[str] = Field(None,description='The name of the assistant.') 
    description: Optional[str] = Field(None,description='The description of the assistant.')
    instructions: Optional[str] = Field(None,description='Instructions which is added in front of the input of LLM') 
    tools: List[Tool] = Field([], max_length=128)

    @field_validator('tools', mode='before')
    def validate_tools(cls, value):
        re = []
        if not isinstance(value, list):
            raise ValueError('Invalid type for tools')

        for tool in value:
            if 'type' not in tool:
                raise ValueError('Invalid type for tools')
            if tool['type'] == 'code_interpreter':
                re.append(CodeInterpreter(**tool))
            elif tool['type'] == 'file_search':
                re.append(FileSearch(**tool))
            elif tool['type'] == 'related_threads':
                re.append(RelatedThreads(**tool))
            elif tool['type'] == 'function':
                re.append(FuntionTool(**tool))
            else:
                raise ValueError('Invalid type for tools')
        return re

    tool_resources: List[ToolResource] = Field([], max_length=128)

    @field_validator('tool_resources', mode='before')
    def validate_tool_resources(cls, value):
        re = []
        if not isinstance(value, list):
            raise ValueError('Invalid type for tool resources')

        for tool_re in value:
            if 'file_ids' in tool_re:
                re.append(CodeInterpreterResource(**tool_re))
            elif 'vector_stores' in tool_re:
                re.append(FileSearchResource(**tool_re))
            elif 'thread_ids' in tool_re:
                re.append(RelatedThreadsResource(**tool_re))
            else:
                raise ValueError('Invalid type for tool resources')
        return re

    meta_data: Metadata = MetadataField

    @model_validator(mode='before')
    def convert_meta_data(cls, values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values
    temperature: Optional[float] = Field(ge=0.0, le=2.0, default=1)
    top_p: Optional[float] = Field(ge=0.0, le=1.0, default=1)
    response_format: Union[str, Dict[str, str]] = "auto"


class AssistantCreate(AssistantBase):
    model: str


class AssistantBuildStatus(BaseModel):
    class Status(Enum):
        not_build = "not_build"
        in_queue = "in_queue"
        parsing = "parsing"
        prefilling = "prefilling"
        dumping = "dumping"
        completed = "completed"
        paused = "paused"

    _lock: Lock = PrivateAttr(default_factory=Lock)
    _queue: Optional[Queue] = PrivateAttr(None)

    status: Status = Field(default=Status.not_build)
    total_file_count: int = Field(default=0)
    parsed_file_count: int = Field(default=0)

    prefilling_current: int = Field(default=0)
    prefilling_total: int = Field(default=0)

    build_started_time: Optional[int] = Field(default=None)
    build_completed_time: Optional[int] = Field(default=None)

    # in megabytes
    assistant_usage: int = Field(default=0, description='')
    assistant_total_usage: int = Field(default=0)
    disk_free_space: int = Field(default=0)
    disk_total_space: int = Field(default=0)

    def to_stream_reply(self) -> str:
        return f"event: assistant.build.status\ndata: {self.model_dump_json()}\n\n"


class AssistantObject(AssistantBase, ObjectWithCreatedTime):
    model: Optional[str] = Field(
        default=Config().model_name)
    related_threads_objects: Optional[List] = Field(None, exclude=True)
    _encoded_instruction: Optional[torch.Tensor] = PrivateAttr(default=None)
    build_status: AssistantBuildStatus = Field(default=AssistantBuildStatus())

    def as_api_response(self):
        return self.model_dump(exclude={'build_status'})

    def get_related_threads_ids(self) -> List[ObjectID]:
        re = []
        for tool, tool_re in zip(self.tools, self.tool_resources):
            if tool.type == ToolType.RELATED_THREADS:
                re += tool_re.thread_ids or []
        return re

    def get_related_threads_objects(self) -> List:
        # raise NotImplementedError  # should be replaced
        sql_utils = SQLUtil()
        if self.related_threads_objects is None:
            with sql_utils.get_db() as db:
                db_threads = db.query(Thread).all()
            self.related_threads_objects = [tool for tool in [ThreadObject.model_validate(
                tool.__dict__) for tool in db_threads] if tool.is_related_threads and tool.meta_data['assistant_id'] == self.id]
            # logger.debug(
            #     f'Found {len(self.related_threads_objects)} related threads')
        return self.related_threads_objects

    def append_related_threads(self, thread_ids: List[ObjectID]):
        # logger.debug(f'{self.tools} {self.tool_resources}')
        for tool, tool_re in zip(self.tools, self.tool_resources):
            if tool.type == ToolType.RELATED_THREADS:
                tool_re.thread_ids += thread_ids
                return

        self.tools.append(RelatedThreads(type=ToolType.RELATED_THREADS))
        self.tool_resources.append(
            RelatedThreadsResource(thread_ids=thread_ids))

    async def update_build_status(self, events: AsyncIterable) -> AsyncIterable:
        async for event in events:
            # logger.debug(event)
            if isinstance(event, RunStreamResponse):
                if event.event == RunObject.Status.completed:
                    self.build_status.status = AssistantBuildStatus.Status.completed
                    self.build_status.build_completed_time = int(time())
                    self.sync_db()
                    yield self.build_status.model_copy()
            elif isinstance(event, dict):
                # logger.debug('dict')
                if 'stage' in event:
                    if event['stage'] == 'prefill':
                        self.build_status.status = AssistantBuildStatus.Status.prefilling
                        self.build_status.prefilling_current = event['curr_progress']
                        self.build_status.prefilling_total = event['max_progress']
                    if event['stage'] == 'parse':
                        self.build_status.status = AssistantBuildStatus.Status.parsing
                        self.build_status.parsed_file_count = event['curr_progress']
                        self.build_status.total_file_count = event['max_progress']
                    yield self.build_status.model_copy()

    def get_build_status(self) -> AssistantBuildStatus:
        return self.build_status
     
    
    def sync_db(self)->None:
        # raise NotImplementedError # should be replaced
        sql_utils = SQLUtil()
        db_assistant = Assistant(
            **self.model_dump(mode='json'),
        )
        with sql_utils.get_db() as db:
            sql_utils.db_merge_commit(db, db_assistant)
    
    def get_encoded_instruction(self,encode_fn:Callable)->torch.Tensor:
        if self._encoded_instruction is None:
            logger.info(f'encoding assistant instruction: {self.instructions}')
            self._encoded_instruction = encode_fn(self.instructions, Role.user)
        return self._encoded_instruction


class AssistantModify(AssistantBase):
    model: Optional[str] = None


# Non API Backend


================================================
FILE: kt-sft/ktransformers/server/schemas/assistants/messages.py
================================================
from enum import Enum
from typing import ForwardRef, List, Optional, Union,Callable

import torch
from pydantic import BaseModel, PrivateAttr, model_validator

from ktransformers.server.exceptions import not_implemented
from ktransformers.server.config.log import logger
from ktransformers.server.models.assistants.messages import Message
from ktransformers.server.schemas.base import Metadata, MetadataField, ObjectWithCreatedTime
from ktransformers.server.schemas.assistants.tool import Field,CodeInterpreter,FileSearch
from ktransformers.server.utils.sql_utils import SQLUtil


class IncompleteDetails(BaseModel):
    reason: str


class ContentType(Enum):
    image_file = "image_file"
    image_url = "image_url"
    text = "text"


class ContentObject(BaseModel):
    type: ContentType


class ImageFile(BaseModel):
    file_id: str
    detail: str


class ImageFileObject(ContentObject):
    image_file: ImageFile


class ImageUrl(BaseModel):
    url: str
    detail: str


class ImageUrlObject(ContentObject):
    image_url: ImageUrl


class Annotation(BaseModel):
    todo: str


class Text(BaseModel):
    value: str
    annotations: List[Annotation] = Field(default=[])


class TextObject(ContentObject):
    text: Text
    delta_index: int = Field(default=0,exclude=True)
    special_tokens_on: bool = Field(default=False,exclude=True) 
    last_two: str= Field(default='',exclude=True)  

    def filter_append(self,text:str):     
        self.text.value+=text
        self.delta_index+=1
        return True  


Content = Union[ImageFileObject, ImageUrlObject, TextObject]


class Attachment(BaseModel):
    file_id: Optional[str] = Field(default=None)
    tools: Optional[List[Union[CodeInterpreter, FileSearch]]] = Field(default=None)


class Role(Enum):
    user = "user"
    assistant = "assistant"

    def is_user(self)->bool:
        return self == Role.user


class MessageCore(BaseModel):
    role: Role
    content: List[Content]
    attachments: Optional[List[Attachment]]
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values


class MessageBase(MessageCore):
    class Status(Enum):
        created = "created" # only used for stream
        in_progress = "in_progress"
        incomplete = "incomplete"
        completed = "completed"
    thread_id: str
    status: Status
    incomplete_details: Optional[IncompleteDetails] = None
    completed_at: Optional[int] = None
    incomplete_at: Optional[int] = None

    assistant_id: Optional[str] = None
    run_id: Optional[str]


MessageStreamResponse = ForwardRef('MessageStreamResponse')

class MessageObject(MessageBase, ObjectWithCreatedTime):
    _encoded_content: Optional[torch.Tensor] = PrivateAttr(default=None)
    

    def get_text_content(self) -> str:
        text_content = ""
        for content in self.content:
            if content.type == ContentType.text:
                text_content += content.text.value
            else:
                raise not_implemented("Content other than text")
        return text_content

    async def get_encoded_content(self,encode_fn:Callable):
        if self._encoded_content is None:
            logger.info(f'encoding {self.role.value} message({self.status.value}): {self.get_text_content()}')
            self._encoded_content = encode_fn(self.get_text_content(),self.role)

            for f in self.get_attached_files():
                logger.info(f'encoding file: {f.filename}')
                self._encoded_content = torch.cat([self._encoded_content, encode_fn(await f.get_str(),self.role)],dim=-1)
                yield None 

        yield self._encoded_content


    def get_attached_files(self):
        raise NotImplementedError # should be replaced 


    def append_message_delta(self,text:str):
        raise NotImplementedError # should be replaced 
    
    def sync_db(self):
        # raise NotImplementedError # should be replaced
        sql_utils = SQLUtil()
        db_message = Message(
            **self.model_dump(mode="json"),
        )
        with sql_utils.get_db() as db:
            sql_utils.db_merge_commit(db, db_message)
    

    def stream_response_with_event(self, event: MessageBase.Status) -> MessageStreamResponse:
        match event:
            case MessageObject.Status.created:
                self.status = MessageObject.Status.in_progress
            case _:
                self.status = event
        return MessageStreamResponse(message=self, event=event)
   

class MessageStreamResponse(BaseModel):
    message: MessageObject
    event: MessageObject.Status

    def to_stream_reply(self):
        return f"event: thread.message.{self.event.value}\ndata: {self.message.model_dump_json()}\n\n"


class MessageCreate(BaseModel):
    role: Role = Field(default=Role.user)
    content: Union[str | List[Content]]
    attachments: Optional[List[Attachment]] = None
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values

    def to_core(self) -> MessageCore:
        # logger.debug(f"Converting message create to core {self.model_dump()}")
        core = MessageCore(
            role=self.role,
            content=[],
            attachments=self.attachments,
            meta_data=self.meta_data,
        )
        if isinstance(self.content, str):
            core.content = [TextObject(type="text", text=Text(value=self.content, annotations=[]))]
        elif isinstance(self.content, list):
            core.content = self.content
        else:
            raise ValueError("Invalid content type")
        return core


class MessageModify(BaseModel):
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values


================================================
FILE: kt-sft/ktransformers/server/schemas/assistants/runs.py
================================================
from enum import Enum
from typing import Dict, List, Optional, Union, ForwardRef

from pydantic import BaseModel, Field, model_validator

from ktransformers.server.models.assistants.runs import Run
from ktransformers.server.schemas.base import TODO, Metadata, MetadataField, ObjectWithCreatedTime
from ktransformers.server.schemas.assistants.threads import ThreadCreate
from ktransformers.server.schemas.assistants.tool import Tool, ToolResource
from ktransformers.server.utils.sql_utils import SQLUtil


class ToolCall(BaseModel):
    id: str
    type: str
    function: TODO


class SubmitToolOutputs(BaseModel):
    tool_calls: List[ToolCall]


class RequiredAction(BaseModel):
    type: str
    submit_tool_outputs: TODO


class LastError(BaseModel):
    code: str
    message: str


class IncompleteDetails(BaseModel):
    reason: str


class Usage(BaseModel):
    completion_tokens: int
    prompt_tokens: int
    total_tokens: int


class TruncationStrategy(BaseModel):
    type: str = "auto"
    last_message: Optional[int]


class ToolChoiceType(Enum):
    none = "none"
    auto = "auto"
    required = "required"


class RunBase(BaseModel):
    class Status(Enum):
        created = "created" # only stream event will have this created status
        queued = "queued"
        in_progress = "in_progress"
        requires_action = "requires_action"
        cancelling = "cancelling"
        cancelled = "cancelled"
        failed = "failed"
        completed = "completed"
        expired = "expired"


    thread_id: str
    assistant_id: str
    status: Status = Status.queued
    required_action: Optional[RequiredAction] = Field(None)
    last_error: Optional[LastError] = Field(None)
    expires_at: Optional[int]= Field(None)
    started_at: Optional[int] = Field(None)
    cancelled_at: Optional[int] = Field(None)
    failed_at: Optional[int] = Field(None)
    completed_at: Optional[int] = Field(None)
    incomplete_details: Optional[IncompleteDetails] = Field(None)
    model: Optional[str] = Field(None)
    instructions: Optional[str] = Field(None)
    tools: Optional[List[Tool]] = Field([])
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values
    
    def set_compute_save(self,save:int):
        self.meta_data['compute_save'] = str(save)


    usage: Optional[Usage] = Field(None)
    temperature: Optional[float] = Field(None)
    top_p: Optional[float]= Field(None)
    max_propmp_tokens: Optional[int]= Field(None)
    truncation_strategy: Optional[TruncationStrategy]= Field(None)
    tool_choice: Optional[Union[ToolChoiceType, dict]]= Field(None)
    response_format: Union[str, Dict[str, str]] = "auto"


RunStreamResponse = ForwardRef('RunStreamResponse')

class RunObject(RunBase, ObjectWithCreatedTime):
    def stream_response_with_event(self,event:RunBase.Status)->RunStreamResponse:
        match event:
            case RunBase.Status.created:
                self.status = RunBase.Status.queued
            case _:
                self.status = event
        return RunStreamResponse(run=self, event=event)
 
    
    def sync_db(self):
        # raise NotImplementedError # should be replaced in crud
        sql_utils = SQLUtil()
        db_run = Run(
            **self.model_dump(mode='json'),
        )
        with sql_utils.get_db() as db:
            sql_utils.db_merge_commit(db, db_run)
    
    def create_message_creation_step(self):
        raise NotImplementedError # should be replaced 
        

class RunStreamResponse(BaseModel):
    run: RunObject
    event: RunObject.Status
    def to_stream_reply(self):
        return f"event: thread.run.{self.event.value}\ndata: {self.run.model_dump_json()}\n\n"

class RunCreate(BaseModel):
    assistant_id: str
    model: Optional[str] = Field(default=None)
    instructions: Optional[str] = Field(default=None)
    # TODO: Add this
    # additional_instructions: Optional[str]
    # additional_messages: Optional[List[MessageCore]]
    tools: List[Tool] = Field(default=[])
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values
    temperature: Optional[float] = Field(default=None)
    top_p: Optional[float] = Field(default=None)
    stream: Optional[bool] = Field(default=None)
    max_propmp_tokens: Optional[int] = Field(default=None)
    # TODO: Add this
    # max_completion_tokens: Optional[int]
    truncation_strategy: Optional[TruncationStrategy] = Field(default=None)
    tool_choice: Optional[Union[ToolChoiceType, dict]] = Field(default=None)
    response_format: Union[str, Dict[str, str]] = Field(default="auto")


class RunThreadCreate(BaseModel):
    assistant_id: str
    thread: Optional[ThreadCreate]
    model: Optional[str]
    instructions: Optional[str]
    tools: List[Tool]
    tool_resources: List[ToolResource]
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values
    temperature: Optional[float]
    top_p: Optional[float]
    stream: Optional[bool]
    max_propmp_tokens: Optional[int]
    # TODO: Add this
    # max_completion_tokens: Optional[int]
    truncation_strategy: TruncationStrategy
    tool_choice: Union[ToolChoiceType, dict]
    response_format: Union[str, Dict[str, str]] = "auto"


class RunModify(BaseModel):
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values


class ToolOutput(BaseModel):
    tool_call_id: Optional[str]
    output: Optional[str]


class RunSubmit(BaseModel):
    tool_outputs: List[ToolOutput]
    stream: Optional[bool]


================================================
FILE: kt-sft/ktransformers/server/schemas/assistants/streaming.py
================================================
import asyncio
from typing import AsyncIterable, List, Union

from fastapi import Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

from ktransformers.server.schemas.assistants.runs import RunStreamResponse
from ktransformers.server.schemas.endpoints.chat import ChatCompletionChunk
from ktransformers.server.config.log import logger
from ktransformers.server.schemas.base import Object
from ktransformers.server.schemas.assistants.messages import ContentType, ImageFileObject, ImageUrlObject, MessageObject, Text, TextObject


class TextObjectWithIndex(TextObject):
    index: int


class ImageFileObjectWithIndex(ImageFileObject):
    index: int


class ImageUrlObjectWithIndex(ImageUrlObject):
    index: int


ContentWithIndex = Union[TextObjectWithIndex,
                         ImageFileObjectWithIndex, ImageUrlObjectWithIndex]


class MessageDeltaImpl(BaseModel):
    # role: Optional[str]
    content: List[ContentWithIndex]


class MessageDelta(Object):
    delta: MessageDeltaImpl

    def to_stream_reply(self):
        return f"event: thread.message.delta\ndata: {self.model_dump_json()}\n\n"


def text_delta(index: int, text: str):
    return MessageDeltaImpl(content=[TextObjectWithIndex(index=index, type=ContentType.text, text=Text(value=text))])


def append_message_delta(self: MessageObject, text: str):

    if len(self.content) == 0:
        self.content.append(TextObject(type=ContentType.text,
                            text=Text(value=''), delta_index=0))

    text_object: TextObject = self.content[0]
    if text_object.filter_append(text):
        return MessageDelta(id=self.id, object="thread.message.delta", delta=text_delta(text_object.delta_index, text))
    else:
        return None


MessageObject.append_message_delta = append_message_delta


class RunStepDeltaImpl(BaseModel):
    pass


class RunStepDelta(Object):
    delta: RunStepDeltaImpl

    def to_stream_reply(self):
        return f"event: thread.run.step.delta\ndata: {self.model_dump_json()}\n\n"


class Done():
    def to_stream_reply(self):
        return f"data: [DONE]\n\n"


async def check_client_link(request: Request, async_events: AsyncIterable):
    async for event in async_events:
        if await request.is_disconnected():
            break
        yield event


async def add_done(async_events: AsyncIterable):
    async for event in async_events:
        yield event
    yield Done()


async def to_stream_reply(async_events: AsyncIterable):
    async for event in async_events:
        if isinstance(event, str):
            yield event
        else:
            yield event.to_stream_reply()


async def filter_api_event(async_events: AsyncIterable):
    async for event in async_events:
        if isinstance(event, MessageDelta) or isinstance(event, RunStepDelta) or isinstance(event, RunStreamResponse) or isinstance(event, Done):
            yield event


async def filter_chat_chunk(async_events: AsyncIterable):
    async for event in async_events:
        if isinstance(event, ChatCompletionChunk):
            yield event


async def filter_by_types(async_events: AsyncIterable, types: List):
    async for event in async_events:
        for type in types:
            if isinstance(event, type):
                yield event
                continue


def api_stream_response(request: Request, async_events: AsyncIterable):
    return StreamingResponse(check_client_link(request, to_stream_reply(add_done(filter_api_event(async_events)))), media_type="text/event-stream")


def chat_stream_response(request: Request, async_events: AsyncIterable):
    return StreamingResponse(check_client_link(request, to_stream_reply(add_done(filter_chat_chunk(async_events)))), media_type="text/event-stream")


def stream_response(request: Request, async_events: AsyncIterable):
    return StreamingResponse(check_client_link(request, to_stream_reply(add_done(async_events))), media_type="text/event-stream")


def check_link_response(request: Request, async_events: AsyncIterable):
    return StreamingResponse(check_client_link(request, async_events), media_type="text/event-stream")


def wrap_async_generator_into_queue(async_events: AsyncIterable) -> asyncio.Queue:
    queue = asyncio.Queue()

    async def inner():
        # logger.debug('run inner')
        async for event in async_events:
            # logger.debug(f'put: {event}')
            await queue.put(event)
            await asyncio.sleep(0)
        # logger.debug(f'put: None')
        await queue.put(None)
    asyncio.create_task(inner())
    return queue


async def unwrap_async_queue(queue: asyncio.Queue) -> AsyncIterable:
    while True:
        events = [await queue.get()]
        events.extend([queue.get_nowait() for _ in range(queue.qsize())])

        logger.debug(f'getting {len(events)} events')
        for event in events:
            if event is None:
                break
            yield event


async def unwrap_async_queue_slow(queue: asyncio.Queue) -> AsyncIterable:
    while True:
        event = await queue.get()
        # logger.debug(f'unwrap_async_queue {event}')
        if event is None:
            break
        yield event


================================================
FILE: kt-sft/ktransformers/server/schemas/assistants/threads.py
================================================
from enum import Enum
from typing import List
from typing_extensions import Self 

from pydantic import BaseModel, Field, model_validator

from ktransformers.server.schemas.base import Metadata, MetadataField, ObjectWithCreatedTime
from ktransformers.server.schemas.assistants.tool import ToolResource
from ktransformers.server.schemas.assistants.messages import MessageCore


class ThreadBase(BaseModel):
    meta_data: Metadata = MetadataField
    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls,values):
        if 'meta_data' in values:
            values['metadata'] = values['meta_data']
        return values

    tool_resources: List[ToolResource] = Field([], max_length=128)


class ThreadObject(ThreadBase, ObjectWithCreatedTime):
    is_related_threads:bool = Field(False,exclude=True)

    @model_validator(mode='after')
    def check_is_related_threads(self)->Self:
        # logger.debug(f'check thread {self.id} is related thread? by {self}')
        if 'assistant_id' in self.meta_data:
            self.is_related_threads = True
        return self

    class StreamEvent(Enum):
        created = 'created'

    def to_stream_reply(self,event:StreamEvent):
        return f"event: thread.{event.value}\ndata: {self.model_dump_json()}\n\n"
    

class ThreadCreate(ThreadBase):
    messages: List[MessageCore] = Field(default=[])


class ThreadModify(ThreadBase):
    pass


# other than OpenAI API


================================================
FILE: kt-sft/ktransformers/server/schemas/assistants/tool.py
================================================
from enum import Enum
from typing import List, Optional, Union

from pydantic import BaseModel, Field

from ktransformers.server.schemas.base import ObjectID


class ToolType(str, Enum):
    CODE_INTERPRETER = "code_interpreter"
    FILE_SEARCH = "file_search"
    RELATED_THREADS = "related_threads"
    FUNCTION = "function"


class ToolBase(BaseModel):
    type: ToolType


class CodeInterpreter(ToolBase):
    pass


class FileSearch(ToolBase):
    pass


class RelatedThreads(ToolBase):
    pass


class FuntionTool(ToolBase):
    description: str
    name: str
    parameters: List[str]


Tool = Union[CodeInterpreter, FileSearch, RelatedThreads, FuntionTool]


class CodeInterpreterResource(BaseModel):
    file_ids: Optional[List[str]] = Field(default_factory=list, max_length=20)


class FileSearchResource(BaseModel):
    vector_store_ids: Optional[List[str]] = Field(default_factory=list, max_length=1)
    vector_stores: Optional[List[str]] = Field(default_factory=list, max_length=1)


class RelatedThreadsResource(BaseModel):
    thread_ids: List[ObjectID] = Field(default=[])


ToolResource = Union[CodeInterpreterResource,FileSearchResource,RelatedThreadsResource] 


================================================
FILE: kt-sft/ktransformers/server/schemas/base.py
================================================
from enum import Enum
from typing import Dict

import sqlalchemy
from pydantic import BaseModel, ConfigDict, Field

TODO = BaseModel

ObjectID = str


class Object(BaseModel):
    id: ObjectID
    object: str

    model_config = ConfigDict(from_attributes=True)


# Pydantic Base Models
class ObjectWithCreatedTime(Object):
    created_at: int


class Order(str, Enum):
    ASC = "asc"
    DESC = "desc"

    def to_sqlalchemy_order(self):
        match self:
            case Order.ASC:
                return sqlalchemy.asc
            case Order.DESC:
                return sqlalchemy.desc


Metadata = Dict[str, str]
MetadataField: Metadata = Field({},max_length=16, alias="metadata")


class DeleteResponse(Object):
    deleted: bool = True

class OperationResponse(BaseModel):
    operation: str
    status: str


================================================
FILE: kt-sft/ktransformers/server/schemas/conversation.py
================================================
from typing import Optional

from pydantic import BaseModel

from .assistants.assistants import AssistantObject
from .assistants.threads import ThreadObject
from .assistants.messages import MessageObject

class ThreadPreview(BaseModel):
    assistant: Optional[AssistantObject] = None
    thread: ThreadObject
    first_message: Optional[MessageObject] = None


================================================
FILE: kt-sft/ktransformers/server/schemas/endpoints/chat.py
================================================
from typing import List, Optional, Union, Dict, Any
from typing_extensions import Literal
from enum import Enum
from pydantic import BaseModel, Field
from ktransformers.server.config.config import Config
from ktransformers.server.schemas.base import Object


from openai.types.chat.chat_completion_chunk import Choice

from uuid import uuid4

class CompletionUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    prompt_tokens_details: Optional[Dict[str, Any]] = None
    completion_tokens_details: Optional[Dict[str, Any]] = None
    prefill_time: Optional[float] = None
    decode_time: Optional[float] = None

class Role(Enum):
    system = 'system'
    user = 'user'
    assistant = 'assistant'
    tool = 'tool'
    function = 'function'

class Message(BaseModel):
    content: Optional[str] = None
    role: Role
    name: Optional[str] = None
    tool_calls: Optional[List[Dict[str, Any]]] = {}
    tool_call_id: Optional[str] = None
    
    def to_tokenizer_message(self):
        message = {'role': self.role.value}
        if self.content is not None:
            message['content'] = self.content
        if self.name is not None:
            message['name'] = self.name
        if self.tool_calls is not {}:
            message['tool_calls'] = self.tool_calls
        if self.tool_call_id is not None:
            message['tool_call_id'] = self.tool_call_id
        return message

class FunctionParameters(BaseModel):
    type: str = "object"
    properties: Dict[str, Any] = {}
    required: Optional[List[str]] = None

class FunctionDefinition(BaseModel):
    name: str
    description: Optional[str] = None
    parameters: FunctionParameters = Field(default_factory=FunctionParameters)

class ToolFunction(BaseModel):
    function: FunctionDefinition
    
class Tool(BaseModel):
    type: Literal["function"]
    function: FunctionDefinition

class ChatCompletionCreate(BaseModel):
    messages: List[Message]
    model: str
    stream: bool = False
    temperature: Optional[float] = Field(default=Config().temperature)
    top_p: Optional[float] = Field(default=Config().top_p)
    tools: Optional[List[Tool]] = None
    tool_choice: Optional[Union[str, Dict[str, Any]]] = None
    stream_options: Optional[Dict[str, Any]] = None
    frequency_penalty: float = 0
    presence_penalty: float = 0
    max_tokens: Optional[int] = Field(default=None)
    max_completion_tokens: Optional[int] = Field(default=None)
    return_speed: Optional[bool] = Field(default=False)
    def get_tokenizer_messages(self):
        return [m.to_tokenizer_message() for m in self.messages]

class ChatCompletionChunk(BaseModel):
    id: str
    choices: List[Choice]
    created: int
    model: str
    object: Literal["chat.completion.chunk"]
    service_tier: Optional[Literal["scale", "default"]] = None
    system_fingerprint: Optional[str] = None
    usage: Optional[CompletionUsage] = None

    def to_stream_reply(self):
        return f"data: {self.model_dump_json()}\n\n"

class RawUsage(BaseModel):
    tokenize_time: float
    prefill_time: float
    decode_time: float
    prefill_count: int
    decode_count: int

================================================
FILE: kt-sft/ktransformers/server/schemas/legacy/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/server/schemas/legacy/completions.py
================================================
from typing import List, Optional
from enum import Enum
from pydantic import BaseModel, Field
from ktransformers.server.config.config import Config
from ..base import Object

class CompletionCreate(BaseModel):
    model: str
    prompt: str | List[str]
    stream: bool = False
    temperature: Optional[float] = Field(default=Config().temperature)
    top_p: Optional[float] = Field(default=Config().top_p)
    max_tokens: Optional[int] = Field(default=None)
    max_completion_tokens: Optional[int] = Field(default=None)
    
    def get_tokenizer_messages(self):
        if isinstance(self.prompt,List):
            self.get_tokenizer_messages('\n'.join(self.prompt))
        return [{'content':self.prompt,'role':'user'}]


class FinishReason(Enum):
    stop = 'stop'
    length = 'length'

class Choice(BaseModel):
    index: int
    text: str
    logprobs: Optional[str] = None
    finish_reason: FinishReason = None


class CompletionObject(Object):
    created:int
    choices: List[Choice] = []
    model:str = 'not implmented'
    system_fingerprint:str = 'not implmented'
    usage: Optional[str] = None

    def set_token(self,token:str):
        if len(self.choices)==0:
            self.choices.append(Choice(index=0,text=''))
        self.choices[0].text = token    

    def append_token(self,token:str):
        if len(self.choices)==0:
            self.choices.append(Choice(index=0,text=''))
        self.choices[0].text += token

    def to_stream_reply(self):
        return f"data:{self.model_dump_json()}\n\n"


================================================
FILE: kt-sft/ktransformers/server/utils/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/server/utils/create_interface.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : qiyuxinlin
Date         : 2024-07-25 11:50:16
Version      : 1.0.0
LastEditors  : qiyuxinlin 
LastEditTime : 2024-07-25 12:54:48
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
from ktransformers.server.config.config import Config
from ktransformers.server.backend.args import ConfigArgs
from ktransformers.server.backend.context_manager import ThreadContextManager
from ktransformers.server.backend.interfaces.exllamav2 import ExllamaInterface
from ktransformers.server.backend.interfaces.transformers import TransformersInterface
from ktransformers.server.backend.interfaces.ktransformers import KTransformersInterface

def create_interface(config: Config, default_args: ConfigArgs):
    if config.backend_type=='transformers':
        from ktransformers.server.backend.interfaces.transformers import  TransformersInterface as BackendInterface
    elif config.backend_type == 'exllamav2':
        from ktransformers.server.backend.interfaces.exllamav2 import  ExllamaInterface as BackendInterface
    elif config.backend_type == 'ktransformers':
        from ktransformers.server.backend.interfaces.ktransformers import  KTransformersInterface as BackendInterface
    elif config.backend_type == 'balance_serve':
        from ktransformers.server.backend.interfaces.balance_serve import BalanceServeInterface as BackendInterface
    else:
        raise NotImplementedError(f'{config.backend_type} not implemented')
    GlobalInterface.interface = BackendInterface(default_args)
    GlobalContextManager.context_manager = ThreadContextManager(GlobalInterface.interface)

class GlobalContextManager:
    context_manager: ThreadContextManager
class GlobalInterface:
    interface:  TransformersInterface | KTransformersInterface | ExllamaInterface 
    
def get_thread_context_manager() -> GlobalContextManager:
    return GlobalContextManager.context_manager
def get_interface() -> GlobalInterface:
    return GlobalInterface.interface

================================================
FILE: kt-sft/ktransformers/server/utils/multi_timer.py
================================================
import time


def format_time(seconds):
    units = [
        ("hours", 3600),
        ("minutes", 60),
        ("seconds", 1),
        ("milliseconds", 1e-3),
        ("microseconds", 1e-6),
    ]

    for unit_name, unit_value in units:
        if seconds >= unit_value:
            time_value = seconds / unit_value
            return f"{time_value:.2f} {unit_name}"
    return "0 seconds"  # Handle case for 0 seconds


class Profiler:
    def __init__(self):
        self.timers = {}
        self.counters = {}

    def create_timer(self, name):
        self.timers[name] = {
            "start_time": None,
            "elapsed_time": 0,
            "running": False,
        }

    def start_timer(self, name):
        if name not in self.timers:
            raise ValueError(f"Timer '{name}' does not exist.")
        if self.timers[name]["running"]:
            raise ValueError(f"Timer '{name}' is already running.")
        self.timers[name]["start_time"] = time.time()
        self.timers[name]["running"] = True

    def pause_timer(self, name):
        if name not in self.timers:
            raise ValueError(f"Timer '{name}' does not exist.")
        if not self.timers[name]["running"]:
            raise ValueError(f"Timer '{name}' is not running.")
        self.timers[name]["elapsed_time"] += time.time() - self.timers[name]["start_time"]
        self.timers[name]["running"] = False

    def get_timer_sec(self, name):
        if name not in self.timers:
            raise ValueError(f"Timer '{name}' does not exist.")
        if self.timers[name]["running"]:
            current_time = self.timers[name]["elapsed_time"] + (time.time() - self.timers[name]["start_time"])
        else:
            current_time = self.timers[name]["elapsed_time"]
        return current_time

    def get_all_timers(self):
        all_timers = {}
        for name in self.timers:
            all_timers[name] = self.get_timer_sec(name)
        return all_timers

    def report_timer_string(self, name):
        return f"{name} elapsed time: {format_time(self.get_timer_sec(name))}"

    def create_and_start_timer(self, name):
        self.create_timer(name)
        self.start_timer(name)


    # Counter
    def inc(self,key:str,delta:int=1):
        self.counters[key] = self.counters.get(key,0) + delta

    def set_counter(self,key:str,to=0):
        self.counters[key] = to

    def get_counter(self,key:str):
        return self.counters.get(key,0)


================================================
FILE: kt-sft/ktransformers/server/utils/sql_utils.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : chenxl
Date         : 2024-06-12 09:12:58
Version      : 1.0.0
LastEditors  : chenxl 
LastEditTime : 2024-07-27 01:56:04
'''

from urllib.parse import urlparse
import os
from contextlib import contextmanager
from sqlalchemy import create_engine
from sqlalchemy.orm import Session, sessionmaker, declarative_base

from ktransformers.server.config.config import Config
from ktransformers.server.config.singleton import Singleton
from ktransformers.server.config.log import logger
from ktransformers.server.exceptions import db_exception


Base = declarative_base()


class SQLUtil(metaclass=Singleton):
    """
    database connections init and management
    """
    sqlalchemy_engine = None
    session_local = None

    def __init__(self) -> None:
        self.cfg: Config = Config()
        if not self.sqlalchemy_engine:
            SQLUtil.init_engine(self.cfg)

    @contextmanager
    def get_db(self):
        """
        After you finish using the session, it's crucial to close it.
        """
        if not SQLUtil.sqlalchemy_engine:
            SQLUtil.init_engine(self.cfg)
        session = self.session_local()  # type: ignore pylint: disable=not-callable
        try:
            yield session
        finally:
            session.close()

    @staticmethod
    def init_engine(cfg: Config):
        """
        initial engine and session maker Factory
        """
        pool_size = cfg.db_pool_size
        if SQLUtil.sqlalchemy_engine is None:
            if cfg.db_type == "sqllite":
                db_url = SQLUtil.create_sqllite_url(cfg)
            else:
                logger.error("Unsupported database type %s", cfg.db_type)
                exit(-1)
            SQLUtil.sqlalchemy_engine = create_engine(
                db_url, connect_args={"check_same_thread": False}, pool_size=pool_size)
            SQLUtil.session_local = sessionmaker(
                autocommit=False, autoflush=False, bind=SQLUtil.sqlalchemy_engine)

    @staticmethod
    def create_sqllite_url(cfg):
        """
        create and validate SQLLite url
        """
        path: str = cfg.db_host
        database: str = cfg.db_database
        absolute_path: str = os.path.join(path, database)
        url = 'sqlite:///' + absolute_path
        try:
            result = urlparse(url)
            if all([result.scheme, result.path, result.scheme == 'sqlite']):
                return url
            else:
                logger.error("invalid sqllite url: %s", url)
                exit(-1)
        except ValueError:
            logger.error("invalid sqllite url: %s", url)
            exit(-1)

    def db_add_commit_refresh(self, session: Session, what):
        """
        add data to database
        """
        try:
            session.add(what)
            session.commit()
            session.refresh(what)
        except Exception as e:
            logger.exception("db commit error with data %s", str(what.__dict__))
            ex = db_exception()
            ex.detail = str(e)
            session.rollback()
            raise ex from e

    def db_merge_commit(self, session: Session, what):
        try:
            session.merge(what)
            session.commit()
        except Exception as e:
            ex = db_exception()
            ex.detail = str(e)
            logger.exception("db merge commit error with data %s", str(what.__dict__))
            session.rollback()
            raise ex from e

    def db_update_commit_refresh(self, session: Session, existing, what):
        what = what.model_dump(mode="json")
        try:
            for key in what.keys():
                if what[key] is not None:
                    setattr(existing, key, what[key])
            session.commit()
            session.refresh(existing)
        except Exception as e:
            ex = db_exception()
            ex.detail = str(e)
            logger.exception("db update commit refresh error with data %s", str(what.__dict__))
            session.rollback()
            raise ex from e


================================================
FILE: kt-sft/ktransformers/sft/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/sft/flops_utils/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/sft/flops_utils/custom_profile.py
================================================
from distutils.version import LooseVersion

from thop.vision.basic_hooks import *
from thop.rnn_hooks import *
from thop.utils import prGreen, prRed, prYellow
import sys, os

project_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
sys.path.insert(0, project_dir)

from ktransformers.util.utils import prefill_and_generate

# logger = logging.getLogger(__name__)
# logger.setLevel(logging.INFO)


if LooseVersion(torch.__version__) < LooseVersion("1.0.0"):
    logging.warning(
        "You are using an old version PyTorch {version}, which THOP does NOT support.".format(
            version=torch.__version__
        )
    )

default_dtype = torch.float64

register_hooks = {
    nn.ZeroPad2d: zero_ops,  # padding does not involve any multiplication.
    nn.Conv1d: count_convNd,
    nn.Conv2d: count_convNd,
    nn.Conv3d: count_convNd,
    nn.ConvTranspose1d: count_convNd,
    nn.ConvTranspose2d: count_convNd,
    nn.ConvTranspose3d: count_convNd,
    nn.BatchNorm1d: count_normalization,
    nn.BatchNorm2d: count_normalization,
    nn.BatchNorm3d: count_normalization,
    nn.LayerNorm: count_normalization,
    nn.InstanceNorm1d: count_normalization,
    nn.InstanceNorm2d: count_normalization,
    nn.InstanceNorm3d: count_normalization,
    nn.PReLU: count_prelu,
    nn.Softmax: count_softmax,
    nn.ReLU: zero_ops,
    nn.ReLU6: zero_ops,
    nn.LeakyReLU: count_relu,
    nn.MaxPool1d: zero_ops,
    nn.MaxPool2d: zero_ops,
    nn.MaxPool3d: zero_ops,
    nn.AdaptiveMaxPool1d: zero_ops,
    nn.AdaptiveMaxPool2d: zero_ops,
    nn.AdaptiveMaxPool3d: zero_ops,
    nn.AvgPool1d: count_avgpool,
    nn.AvgPool2d: count_avgpool,
    nn.AvgPool3d: count_avgpool,
    nn.AdaptiveAvgPool1d: count_adap_avgpool,
    nn.AdaptiveAvgPool2d: count_adap_avgpool,
    nn.AdaptiveAvgPool3d: count_adap_avgpool,
    nn.Linear: count_linear,
    nn.Dropout: zero_ops,
    nn.Upsample: count_upsample,
    nn.UpsamplingBilinear2d: count_upsample,
    nn.UpsamplingNearest2d: count_upsample,
    nn.RNNCell: count_rnn_cell,
    nn.GRUCell: count_gru_cell,
    nn.LSTMCell: count_lstm_cell,
    nn.RNN: count_rnn,
    nn.GRU: count_gru,
    nn.LSTM: count_lstm,
    nn.Sequential: zero_ops,
    nn.PixelShuffle: zero_ops,
}

if LooseVersion(torch.__version__) >= LooseVersion("1.1.0"):
    register_hooks.update({nn.SyncBatchNorm: count_normalization})


def profile_origin(model, inputs, custom_ops=None, verbose=True, report_missing=False):
    handler_collection = []
    types_collection = set()
    if custom_ops is None:
        custom_ops = {}
    if report_missing:
        verbose = True

    def add_hooks(m):
        if len(list(m.children())) > 0:
            return

        if hasattr(m, "total_ops") or hasattr(m, "total_params"):
            logging.warning(
                "Either .total_ops or .total_params is already defined in %s. "
                "Be careful, it might change your code's behavior." % str(m)
            )

        m.register_buffer("total_ops", torch.zeros(1, dtype=default_dtype))
        m.register_buffer("total_params", torch.zeros(1, dtype=default_dtype))

        for p in m.parameters():
            m.total_params += torch.DoubleTensor([p.numel()])

        m_type = type(m)

        fn = None
        if (
            m_type in custom_ops
        ):  # if defined both op maps, use custom_ops to overwrite.
            fn = custom_ops[m_type]
            if m_type not in types_collection and verbose:
                print("[INFO] Customize rule %s() %s." % (fn.__qualname__, m_type))
        elif m_type in register_hooks:
            fn = register_hooks[m_type]
            if m_type not in types_collection and verbose:
                print("[INFO] Register %s() for %s." % (fn.__qualname__, m_type))
        else:
            if m_type not in types_collection and report_missing:
                prRed(
                    "[WARN] Cannot find rule for %s. Treat it as zero Macs and zero Params."
                    % m_type
                )

        if fn is not None:
            handler = m.register_forward_hook(fn)
            handler_collection.append(handler)
        types_collection.add(m_type)

    training = model.training

    model.eval()
    model.apply(add_hooks)

    with torch.no_grad():
        model(*inputs)

    total_ops = 0
    total_params = 0
    for m in model.modules():
        if len(list(m.children())) > 0:  # skip for non-leaf module
            continue
        total_ops += m.total_ops
        total_params += m.total_params

    total_ops = total_ops.item()
    total_params = total_params.item()

    # reset model to original status
    model.train(training)
    for handler in handler_collection:
        handler.remove()

    # remove temporal buffers
    for n, m in model.named_modules():
        if len(list(m.children())) > 0:
            continue
        if "total_ops" in m._buffers:
            m._buffers.pop("total_ops")
        if "total_params" in m._buffers:
            m._buffers.pop("total_params")

    return total_ops, total_params


def custom_profile(
    model: nn.Module,
    inputs,
    content,
    tokenizer,
    custom_ops=None,
    verbose=True,
    ret_layer_info=False,
    report_missing=False,
):
    handler_collection = {}
    types_collection = set()
    if custom_ops is None:
        custom_ops = {}
    if report_missing:
        # overwrite `verbose` option when enable report_missing
        verbose = True

    def add_hooks(m: nn.Module):
        m.register_buffer("total_ops", torch.zeros(1, dtype=torch.float64))
        m.register_buffer("total_params", torch.zeros(1, dtype=torch.float64))

        # for p in m.parameters():
        #     m.total_params += torch.DoubleTensor([p.numel()])

        m_type = type(m)

        fn = None
        if m_type in custom_ops:
            # if defined both op maps, use custom_ops to overwrite.
            fn = custom_ops[m_type]
            if m_type not in types_collection and verbose:
                print("[INFO] Customize rule %s() %s." % (fn.__qualname__, m_type))
        elif m_type in register_hooks:
            fn = register_hooks[m_type]
            if m_type not in types_collection and verbose:
                print("[INFO] Register %s() for %s." % (fn.__qualname__, m_type))
        else:
            if m_type not in types_collection and report_missing:
                prRed(
                    "[WARN] Cannot find rule for %s. Treat it as zero Macs and zero Params."
                    % m_type
                )

        if fn is not None:
            handler_collection[m] = (
                m.register_forward_hook(fn),
                m.register_forward_hook(count_parameters),
            )
        types_collection.add(m_type)

    prev_training_status = model.training

    model.eval()
    model.apply(add_hooks)
    
    messages = [{"role": "user", "content": content}]
    input_tensor = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    )

    with torch.no_grad():
        # model(*inputs)
        # TODO: model.model to deal with the PeftModelForCaualLM temp
        simple_prefill_and_generate_for_test(
            model.model, tokenizer, input_tensor.cuda(), max_new_tokens=1000, use_cuda_graph=False, mode = 'normal', force_think = False, chunk_prefill_size = 8192,
        )

    def dfs_count(module: nn.Module, prefix="\t") -> (int, int):
        total_ops, total_params = module.total_ops.item(), 0
        ret_dict = {}
        for n, m in module.named_children():
            # if not hasattr(m, "total_ops") and not hasattr(m, "total_params"):  # and len(list(m.children())) > 0:
            #     m_ops, m_params = dfs_count(m, prefix=prefix + "\t")
            # else:
            #     m_ops, m_params = m.total_ops, m.total_params
            next_dict = {}
            if m in handler_collection and not isinstance(
                m, (nn.Sequential, nn.ModuleList)
            ):
                m_ops, m_params = m.total_ops.item(), m.total_params.item()
            else:
                m_ops, m_params, next_dict = dfs_count(m, prefix=prefix + "\t")
            ret_dict[n] = (m_ops, m_params, next_dict)
            total_ops += m_ops
            total_params += m_params
        # print(prefix, module._get_name(), (total_ops, total_params))
        return total_ops, total_params, ret_dict

    total_ops, total_params, ret_dict = dfs_count(model)

    # reset model to original status
    model.train(prev_training_status)
    for m, (op_handler, params_handler) in handler_collection.items():
        op_handler.remove()
        params_handler.remove()
        m._buffers.pop("total_ops")
        m._buffers.pop("total_params")

    if ret_layer_info:
        return total_ops, total_params, ret_dict
    return total_ops, total_params


================================================
FILE: kt-sft/ktransformers/sft/flops_utils/lora_test_utils.py
================================================
from torch.profiler import profile, record_function, ProfilerActivity
import os
from transformers import TrainerCallback

class ProfilerCallback(TrainerCallback):
    def __init__(self, profiler):
        self.profiler = profiler

    def on_step_end(self, args, state, control, **kwargs):
        self.profiler.step()

def _short(t):
    return tuple(t.shape) if isinstance(t, torch.Tensor) else type(t)

def install_shape_probes(model):
    if os.environ.get("KT_DEBUG_MOE","0") != "1":
        print("[KT_DEBUG_MOE] off"); return

    try:
        acc = trainer.accelerator
        cfg = getattr(acc, "dataloader_config", None)
        if cfg is not None:
            print("[ACCEL DL CONFIG]",
                  "split_batches=", getattr(cfg,"split_batches",None),
                  "dispatch_batches=", getattr(cfg,"dispatch_batches",None),
                  "even_batches=", getattr(cfg,"even_batches",None),
                  "use_seedable_sampler=", getattr(cfg,"use_seedable_sampler",None),
                  "non_blocking=", getattr(cfg,"non_blocking",None))
    except Exception as e:
        print("[ACCEL DL CONFIG] <err>", e)

    try:
        emb = model.base_model.model.model.embed_tokens
        def _emb_pre(mod, inp):
            x = inp[0]
            if not hasattr(mod, "_dbg_once"):
                print(f"[DBG] embed input_ids shape = {tuple(x.shape)}  (expect B,S)")
                mod._dbg_once = True
        emb.register_forward_pre_hook(_emb_pre)
    except Exception as e:
        print("[DBG] embed hook failed:", e)

    try:
        first_layer = model.base_model.model.model.layers[0]
        _orig_fwd = first_layer.forward
        def _wrap_fwd(self, *args, **kwargs):
            hs = args[0] if args else kwargs.get("hidden_states")
            if not hasattr(self, "_dbg_once_in"):
                print(f"[DBG] L0.in hidden_states = {_short(hs)}  (expect B,S,H)")
                self._dbg_once_in = True
            out = _orig_fwd(*args, **kwargs)
            hs_out = out[0] if isinstance(out, (tuple, list)) else out
            if not hasattr(self, "_dbg_once_out"):
                print(f"[DBG] L0.out hidden_states = {_short(hs_out)}")
                self._dbg_once_out = True
            return out
        first_layer.forward = MethodType(_wrap_fwd, first_layer)
    except Exception as e:
        print("[DBG] L0 wrap failed:", e)

    try:
        moe_layer = None
        for i, lyr in enumerate(model.base_model.model.model.layers):
            if hasattr(lyr, "mlp"):
                moe_layer = lyr.mlp
                moe_idx = i
                break
        if moe_layer is not None:
            _moe_orig = moe_layer.forward
            def _moe_wrap(self, *args, **kwargs):
                x = args[0] if args else kwargs.get("hidden_states")
                if not hasattr(self, "_dbg_once"):
                    print(f"[DBG] MLP(in) @layer{moe_idx} hidden_states = {_short(x)}")
                    if isinstance(x, torch.Tensor) and x.dim() == 3:
                        B,S,H = x.shape
                        print(f"[DBG] tokens before flatten = B*S = {B}*{S} = {B*S}")
                    self._dbg_once = True
                return _moe_orig(*args, **kwargs)
            moe_layer.forward = MethodType(_moe_wrap, moe_layer)
        else:
            print("[DBG] no moe_layer found")
    except Exception as e:
        print("[DBG] moe wrap failed:", e)

    try:
        from ktransformers.operators.experts import KTransformersExperts
        def _experts_pre(mod, args):
            if hasattr(mod, "_dbg_once"): return
            try:
                input_tensor, expert_ids, weights = args[:3]
                print(f"[DBG] experts.in input_tensor={tuple(input_tensor.shape)} "
                      f"expert_ids={tuple(expert_ids.shape)} weights={tuple(weights.shape)}")
                if input_tensor.dim()==2:
                    N = input_tensor.shape[0]
                    print(f"[DBG] N(input rows)={N}")
                if expert_ids.dim()==2:
                    T,K = expert_ids.shape
                    print(f"[DBG] tokens(T)={T}, K={K}, T*K={T*K}")
                mod._dbg_once = True
            except Exception as e:
                print("[DBG] experts hook parse err:", e)
        count=0
        for name,m in model.named_modules():
            if isinstance(m, KTransformersExperts):
                m.register_forward_pre_hook(_experts_pre); count+=1
        print(f"[KT_DEBUG_MOE] installed experts hook on {count} modules.")
    except Exception as e:
        print("[DBG] experts hook failed:", e)

def inspect_device(model, write_file):
    for name, module in model.named_modules(): 
        with open(write_file, 'a') as file:
            file.write(f"Layer: {name}\n")
        for param_name, param in module.named_parameters(recurse=False): 
            with open(write_file, 'a') as file:
                file.write(f"  Parameter '{param_name}' device: {param.device}\n")
        for buffer_name, buffer in module.named_buffers(recurse=False): 
            with open(write_file, 'a') as file:
                file.write(f"  Buffer '{buffer_name}' device: {buffer.device}\n")

def print_model_params(model):
    # for layer_idx in range(len(model.model.orig_module.layers)):
    for layer_idx in range(0, 3):
        layer = model.model.orig_module.layers[layer_idx]
        
        print(f"\n================ Layer {layer_idx} Attention ================")
        
        q_proj = layer.self_attn.orig_module.q_proj.orig_module
        print(f"\nq_proj.generate_linear.weight (shape: {q_proj.generate_linear.weight.shape})")
        print(q_proj.generate_linear.weight.cpu())
        
        # kv_a_proj = layer.self_attn.orig_module.kv_a_proj_with_mqa.orig_module
        # print(f"\nkv_a_proj.weight (shape: {kv_a_proj.weight.shape})")
        # print(kv_a_proj.weight.data[:3, :5].detach().cpu().numpy())
        
        # o_proj = layer.self_attn.orig_module.o_proj.orig_module
        # print(f"\no_proj.weight (shape: {o_proj.weight.shape})")
        # print(o_proj.weight.data[:3, :5].detach().cpu().numpy())
        
        # print(f"\n================ Layer {layer_idx} MLP/MoE ================")
        
        # if layer_idx == 0:
        #     mlp = layer.mlp
        #     for proj_type in ['gate_proj', 'up_proj', 'down_proj']:
        #         module = getattr(mlp, proj_type).orig_module
        #         print(f"\n{proj_type}.weight (shape: {module.weight.shape})")
        #         print(module.weight.data[:3, :5].detach().cpu().numpy())
        # else:
        #     moe = layer.mlp.orig_module
        #     print("\n[Shared Experts]")
        #     for proj_type in ['gate_proj', 'up_proj', 'down_proj']:
        #         module = getattr(moe.shared_experts, proj_type).orig_module
        #         print(f"\nshared_{proj_type}.weight (shape: {module.weight.shape})")
        #         print(module.weight.data[:3, :5].detach().cpu().numpy())
            
        #     print("\n[Experts]")
        #     for expert_idx in range(3):
        #         expert = moe.experts.orig_module[expert_idx]
        #         print(f"\nExpert {expert_idx}:")
        #         for proj_type in ['gate_proj', 'up_proj', 'down_proj']:
        #             module = getattr(expert, proj_type)
        #             print(f"{proj_type}.weight (shape: {module.weight.shape})")
        #             print(module.weight.data[:3, :5].detach().cpu().numpy())

def print_lora_params(model):
    # for layer_idx in range(len(model.model.orig_module.layers)):
    for layer_idx in range(0, 3):
        layer = model.base_model.model.model.orig_module.layers[layer_idx]
        # layer = model.model.orig_module.layers[layer_idx]
        
        q_proj_module = layer.self_attn.orig_module.q_proj.orig_module
        
        linear_weight = q_proj_module.generate_linear.weight
        lora_A_weight = q_proj_module.lora_A["default"].weight
        lora_B_weight = q_proj_module.lora_B["default"].weight
        
        print(f"\n=================== Layer {layer_idx} ===================")
        
        print("\nOriginal Linear (first row slice):")
        print(linear_weight.cpu())
        
        print("\nLora_A (first row slice):")
        print(lora_A_weight.cpu())
        
        print("\nLora_B (first row slice):")
        print(lora_B_weight.cpu())

def print_grad_fn(grad_fn, indent=0):
    """递归打印计算图节点"""
    if grad_fn is None:
        return
    print(' ' * indent, f"Node: {str(grad_fn).split('(')[0]}")
    print(' ' * indent, f"  Metadata: {grad_fn.metadata}")
    for child in getattr(grad_fn, 'next_functions', []):
        if child[0] is not None:
            print_grad_fn(child[0], indent + 2)

def forward_hook(module, inputs, output):
    if isinstance(output, (tuple, list)):
        for i, o in enumerate(output):
            if o is None:
                print(f"{module.__class__.__name__} output index {i} is None")
            else:
                print(f"{module.__class__.__name__} output index {i}: requires_grad={o.requires_grad}, grad_fn={o.grad_fn}")
    elif output is None:
        print(f"{module.__class__.__name__} returned None")
    else:
        print(f"{module.__class__.__name__}: requires_grad={output.requires_grad}, grad_fn={output.grad_fn}")

def check_moe_gradients(model):
    moe_layer = model.base_model.model.model.orig_module.layers[1].mlp.orig_module
    for name, param in moe_layer.named_parameters():
        if param.requires_grad and param.grad is not None:
            grad_norm = torch.norm(param.grad)
            print(f"MoE参数 {name} 梯度范数: {grad_norm}")
        else:
            print(f"MoE参数 {name} 无梯度")

def disable_all_dropout(module):
        for name, child in module.named_children():
            if isinstance(child, nn.Dropout):
                child.p = 0
                child.inplace = False
            disable_all_dropout(child)

def verify_lora_layers(model):
    for layer_path in target_layers:
        module = model.get_submodule(layer_path)
        orig_module = module.orig_module
        
        W = orig_module.weight.data  # [576, 2048] -> [2048, 576]
        lora_A = module.lora_A['default'].weight.data  # [8, 2048]
        lora_B = module.lora_B['default'].weight.data  # [576, 8]
        alpha_over_r = 32/8  # alpha=32, r=8
        
        input_tensor = layer_data[layer_path]['input']  # [1, 512, 2048]
        
        try:
            original_output = torch.matmul(input_tensor, W)  # [1,512,2048] @ [2048,576] => [1,512,576]
        except:
            original_output = torch.matmul(input_tensor, W.T)  # [1,512,2048] @ [2048,576] => [1,512,576]
        
        lora_effect = torch.matmul(
            torch.matmul(input_tensor, lora_A.T),  # [1,512,2048] @ [2048,8] => [1,512,8]
            lora_B.T  # [1,512,8] @ [8,576] => [1,512,576]
        ) * alpha_over_r
        
        manual_output = original_output + lora_effect  # [1,512,576]
        
        model_output = layer_data[layer_path]['output']

        print(f"manual_output:{manual_output}")
        print(f"model_output:{model_output}")
        
        if torch.allclose(manual_output, model_output, atol=1e-5):
            print(f"{layer_path} 验证通过")
        else:
            print(f"{layer_path} 验证失败！最大误差：{torch.max(torch.abs(manual_output - model_output))}")

def print_moe_stats(moe_layer: KExpertsTorch):
    print(f"Total Params: {moe_layer.total_params/1e6:.2f}M")
    
    total_time = sum(moe_layer.times)
    gflops = (moe_layer.total_flops / 1e9) / total_time if total_time !=0 else 0
    
    print(f"Total Calls: {moe_layer.call_count}")
    # print(f"Avg GFLOPS per Call: {gflops/moe_layer.call_count:.2f}")
    print(f"Overall GFLOPS: {gflops:.2f}")
    
    if moe_layer.call_count > 0:
        last_flops = moe_layer.flops_per_call[-1]
        last_time = moe_layer.times[-1]
        print(f"\nLast Call - FLOPs: {last_flops/1e9:.2f}G  Time: {last_time*1000:.2f}ms  "
              f"GFLOPS: {(last_flops/1e9)/last_time:.2f}")
        
def recursive_traverse(model, parent_name=''):
    """
    递归遍历模型，查找MoE层并调用print_moe_stats。
    """
    for name, module in model.named_children():
        full_name = f"{parent_name}.{name}" if parent_name else name
        
        if isinstance(module, KTransformersExperts):
            print(f"Found MoE layer: {full_name}")
            print_moe_stats(module.generate_experts)
        
        recursive_traverse(module, full_name)

def log_step_state(
    step: int,
    inputs: dict,
    loss: torch.Tensor,
    model: nn.Module,
    log_dir: str = "train_logs",
):
    """
    把当前 step 的输入 / loss / grad / param 保存到 log_dir/step_{step}.pt
    """
    Path(log_dir).mkdir(parents=True, exist_ok=True)

    logged_inputs = {
        k: v.detach().cpu()
        for k, v in inputs.items()
        if isinstance(v, torch.Tensor)
    }

    loss_val = loss.detach().cpu()

    params, grads = {}, {}
    for name, p in model.named_parameters():
        params[name] = p.detach().cpu()
        grads[name] = p.grad.detach().cpu() if p.grad is not None else None

    torch.save(
        {
            "step": step,
            "inputs": logged_inputs,
            "loss": loss_val,
            "params": params,
            "grads": grads,
        },
        f"{log_dir}/step_{step:08d}.pt",
    )

def collect_gradients(model, input_ids):
    torch.manual_seed(42)
    
    output = model(input_ids=input_ids)
    
    logits = output.logits
    loss = logits.mean()
    
    model.zero_grad()
    loss.backward()
    
    grads = []
    for name, param in model.named_parameters():
        if param.requires_grad and param.grad is not None:
            grads.append(f"{name}: {param.grad.norm().item():.6f}")
    
    return grads

def report_meta_tensors(model):
    import torch, inspect
    meta_modules = []
    for mod_name, mod in model.named_modules():
        metas = []
        for n, p in list(mod.named_parameters(recurse=False)):
            if getattr(p, "is_meta", False) and p.is_meta:
                metas.append(("param", n, tuple(p.shape)))
        for n, b in list(mod.named_buffers(recurse=False)):
            if getattr(b, "is_meta", False) and b.is_meta:
                metas.append(("buffer", n, tuple(b.shape)))
        if metas:
            print(f"[META] {mod_name} ({type(mod).__name__}): {metas}")
            meta_modules.append((mod_name, type(mod).__name__, metas))
    return meta_modules

# def lora_and_load_adapter(model, tokenizer, sft_data_path, save_adapter_path, is_profiler=False):
    # show some lora test
    
    '''
    # multi-gpu dataloader test
    # _ = report_meta_tensors(model)
    
    # print("=== SAMPLE INSPECT ===")
    # for i in range(2):
    #     summary = {}
    #     for k,v in ex.items():
    #         if isinstance(v, list):
    #             if len(v)>0 and isinstance(v[0], list):
    #                 summary[k] = f"list-of-lists len={len(v)} x len0={len(v[0])}"
    #             else:
    #                 summary[k] = f"list len={len(v)}"
    #         elif torch.is_tensor(v):
    #             summary[k] = f"tensor shape={tuple(v.shape)}"
    #         else:
    #             summary[k] = str(type(v))
    #     print(f"[SAMPLE {i}]", summary)
    
    # trainer.accelerator = Accelerator(device_placement=False)
    # first_batch = next(iter(trainer.get_train_dataloader()))
    # print("Batch keys:", list(first_batch.keys()))
    
    # acc = KAccelerator(device_placement=False)
    # acc.state.device_ids = [0]
    # acc.state.num_processes = 1
    # acc.state.num_gpus = 1
    # trainer.accelerator = acc

    # print("Accelerator device_ids:", trainer.accelerator.state.device_ids)
    # print(f"type(trainer.model):{type(trainer.model)}")
    # print(f"type(trainer.accelerator):{type(trainer.accelerator)}")
    
    
    # print("-------------------------START TRAINING!!!-------------------------")

    # cfg = getattr(trainer.accelerator, "dataloader_config", None)
    # print(
    #     "[ACCEL DL CONFIG]",
    #     "split_batches=", getattr(cfg, "split_batches", None),
    #     "dispatch_batches=", getattr(cfg, "dispatch_batches", None),
    #     "even_batches=", getattr(cfg, "even_batches", None),
    #     "use_seedable_sampler=", getattr(cfg, "use_seedable_sampler", None),
    #     "non_blocking=", getattr(cfg, "non_blocking", None),
    # )
    # print("--------------------NEW DEBUG--------------------")
    # install_shape_probes(trainer.model) # print some debug info about multi-gpu placement.

    # input_ids = torch.randint(0, 1000, (32, 128), device="cuda:0")
    # gradients = collect_gradients(model, input_ids)
    '''
    
    # with open(f"/home/lpl/kt-sft/tmp/KSFTExpertsCPU_grads.txt", "w") as f:
    #     f.write("\n".join(gradients))
    # print(xx)
    
    # total_length = 0
    # valid_count = 0
    # for batch in tqdm(train_dataloader):
    #     input_ids = batch['input_ids']
    #     # print(f"Token count per sample: {[len(ids) for ids in input_ids]}")
    #     for ids in input_ids:
    #         if not torch.equal(ids, torch.tensor([100001])):
    #             total_length += len(ids)
    #     valid_count += 1
    #     # print(f"Input tensor: {input_ids}")
    #     # print(f"total_length:{total_length}")
    #     # break

    # if valid_count > 0:
    #     average_length = total_length / valid_count
    # else:

    # print(xx)
    
    # from ktransformers.sft.flops_utils.custom_profile import custom_profile

    # for module in model.modules():
    #     if not hasattr(module, 'total_ops'):
    #         module.register_buffer('total_ops', torch.zeros(1, dtype=torch.float64))
    #     if not hasattr(module, 'total_params'):
    #         module.register_buffer('total_params', torch.zeros(1, dtype=torch.float64))
            
    # # print(f"input:{input}")
    # for inputs in tqdm(train_dataloader):
    #     # input_ids = batch['input_ids']
    #     # del inputs['instruction']
    #     # del inputs['input']
    #     # del inputs['output']
    #     # output = model(**inputs)
    #     model.eval()
    #     content = inputs['instruction'][0] + inputs['input'][0]
    #     # flops,params = custom_profile(model, inputs=inputs, content=content, tokenizer=tokenizer, custom_ops={YourModule: count_your_model})
    #     # print('FLOPs = ' + str(flops / 1000 ** 3) + 'G')
    #     # print('Params = ' + str(params / 1000 ** 2) + 'M')

    #     messages = [{"role": "user", "content": content}]
    #     input_tensor = tokenizer.apply_chat_template(
    #         messages, add_generation_prompt=True, return_tensors="pt"
    #     )
    #     with torch.no_grad():
    #         # model(*inputs)
    #         # model.model to deal with the PeftModelForCaualLM temp
    #         prefill_and_generate(
    #             model.model, tokenizer, input_tensor.cuda(), max_new_tokens=1000, use_cuda_graph=False, mode = 'normal', force_think = False, chunk_prefill_size = 8192,
    #         )
    #     recursive_traverse(model)
    
    # output = model(input_ids=torch.tensor([[1,2,3]], dtype=torch.int32, device="cuda:0"))
    # loss = output.logits.mean()
        
    # dot = make_dot(loss, params=dict(model.named_parameters()))
    # dot.render("KT_compute_cpuinfer_moe_model_graph", format="svg")

    # with open("tmp/output_loss_KCPU.txt", "w") as file:
    #     file.write("Output (logits):\n")
    #     file.write("\n\nLoss:\n")
    
    # disable_all_dropout(model)

    # def print_dropout_status(module, prefix=""):
    #     for name, child in module.named_children():
    #         if isinstance(child, nn.Dropout):
    #             print(f"{prefix}{name}: p={child.p}, training={child.training}")
    #         print_dropout_status(child, prefix + name + ".")
    
    # print_dropout_status(model)

    # for layer_path in target_layers:
    #     module = model.get_submodule(layer_path)
    #     hook = module.register_forward_hook(
    #         lambda m, i, o, ln=layer_path: record_layer_io(m, i, o, ln)
    #     )
    #     hooks.append(hook)

    
    # if is_profiler:
    #     profiler = profile(
    #         activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    #         schedule=torch.profiler.schedule(
    #         ),
    #         on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs'),
    #         record_shapes=False,
    #         with_stack=False
    #     )

    #     # profiler_args = {
    #     #     "schedule": torch.profiler.schedule(
    #     #     )
    #     # }

    #     trainer = KTrainer(
    #         model=model,
    #         train_dataset=train_dataset,
    #         data_collator=DataCollatorForSeq2Seq(
    #             tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    #         ),
    #         callbacks=[ProfilerCallback(profiler)]
    #     )

    #     with profiler:
    #         trainer.train()

    #     print("Training finished. Exporting profiler data...")
    #     with open("profiler_output.txt", "w") as f:
    #         f.write(profiler.key_averages().table(sort_by="cuda_time_total", row_limit=20))
    
    #   profiler.export_chrome_trace("trace.json")
    
    
    # verify_lora_layers(model)

    # model.save_pretrained(save_adapter_path)

    '''
    ----------------------- START: Lora Test -----------------------
    

    # for name, module in model.named_modules():
    #     if "q_proj" in name or "kv_a_proj" in name or "o_proj" in name:
    #         print(name)

    # print_model_params(model)

    # model = KTransformersLinearLora()

    # inspect_device(model, '/home/yj/ktransformers/device1.txt')
    # with open('/home/yj/ktransformers/device1.txt', 'a') as file:
    #     file.write(f"Base model device: {model.base_model.device}\n")
        # file.write(f"LoRA adapter device: {model.lora_config['target_modules'].device}\n")
    # print(f"Base model device: {model.base_model.device}") 
    # print(f"LoRA adapter device: {model.lora_config['target_modules'].device}") 


    # model = model.to('cuda')

    # for name, module in model.named_modules():
    #     module.register_forward_hook(forward_hook)

    # for name, parms in model.named_parameters():	
    #     # parms.requires_grad = True
    #     print('-->name:', name)
    #     print('-->para:', parms)
    #     print('-->grad_requirs:',parms.requires_grad)
    #     print('-->grad_fn:',parms.grad_fn)
    #     print('-->grad_value:',parms.grad)
    #     print("===")

    # output = model(input_ids=torch.tensor([[1,2,3]], dtype=torch.int32, device="cuda:0"))
    # loss = output.logits.mean()

    # dot = make_dot(loss, params=dict(model.named_parameters()))
    # dot.render("KT_compute_graph", format="svg")

    # inspect_device(model, '/home/yj/ktransformers/device2.txt')
    # with open('/home/yj/ktransformers/device2.txt', 'a') as file:
    #     file.write(f"Base model device: {model.base_model.device}\n")
        # file.write(f"LoRA adapter device: {model.lora_config['target_modules'].device}\n")
    # print(f"Base model device: {model.base_model.device}") 
    # print(f"LoRA adapter device: {model.lora_config['target_modules'].device}") 

    # print_lora_params(model)

    # trainer = KTrainer(
    #     model=model,
    #     train_dataset=train_dataset,
    #     args=transformers.TrainingArguments(
    #         output_dir=save_adapter_path,
    #         per_device_train_batch_size=1,
    #         gradient_accumulation_steps=16,
    #         num_train_epochs=10,
    #         learning_rate=3e-4,
    #         fp16=False,
    #         logging_steps=10,
    #         save_steps=200,
    #         dataloader_drop_last=True,
    #         ddp_find_unused_parameters=False 
    #     ),
    #     data_collator=DataCollatorForSeq2Seq(
    #         tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    #     ),
    # )

    # model(input_ids=torch.tensor([[1,2,3]], dtype=torch.int32, device="cuda:0"))

    # trainer.train()

    # print_lora_params(model)

    # model = model.merge_and_unload()
    ----------------------- END: Lora Test -----------------------

    '''

================================================
FILE: kt-sft/ktransformers/sft/lora.py
================================================
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import Trainer
from transformers.training_args import OptimizerNames
from transformers.trainer_utils import seed_worker
from transformers.utils import (
    is_datasets_available,
    is_sagemaker_mp_enabled,
    is_torch_xpu_available,
    is_torch_mlu_available,
    is_torch_musa_available,
    is_torch_npu_available,
    is_torch_mps_available,
    is_torch_hpu_available,
    is_accelerate_available,
    is_apex_available,
    logging,
)
from packaging import version
import os
import inspect
import functools
from typing import Union, Any, Dict, List

import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from torch.utils.data import DataLoader, IterableDataset
from torch.utils.data import Dataset as TorchDataset

from peft import LoraConfig, TaskType
from datasets import Dataset
from torchviz import make_dot
from tqdm import tqdm
import os, json
from pathlib import Path
from accelerate import Accelerator
if is_accelerate_available("0.28.0"):
    from accelerate.utils import DataLoaderConfiguration
from accelerate import __version__ as accelerate_version
if version.parse(accelerate_version) > version.parse("1.3.0"):
        from accelerate.utils import TorchTensorParallelPlugin
if is_sagemaker_mp_enabled():
    from transformers.trainer_utils import smp_forward_backward

from ktransformers.sft.peft_utils.mapping import get_peft_model

logger = logging.get_logger(__name__)

class KAccelerator(Accelerator):
    def __init__(self, *args, **kwargs):
        kwargs.setdefault("device_placement", False)
        super().__init__(*args, **kwargs)
        
    def prepare_model(self, model, *args, **kwargs):
        return model
    
    def prepare(self, *args, **kwargs):
        prepped = []
        for obj in args:
            if isinstance(obj, nn.Module):
                prepped.append(self.prepare_model(obj, **kwargs))
            else:
                prepped.append(super().prepare(obj, **kwargs))
        return tuple(prepped) if len(prepped) > 1 else prepped[0]

class KTrainer(Trainer):
    def save_model(self, output_dir=None, _internal_call=False):
        output_dir = output_dir or self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        # only save LoRA adapter, including adapter_config.json
        self.model.save_pretrained(output_dir)
        
    def _move_model_to_device(self, model, device):
        print("[KTrainer] Due to the placement feature in KTransformers, skip moving model to", device)
        return model
    
    def _wrap_model(self, model, training=True, dataloader=None):
        self.model_wrapped = model
        return model
    
    def create_accelerator_and_postprocess(self):
        # We explicitly don't rely on the `Accelerator` to do gradient accumulation
        grad_acc_kwargs = {}
        if is_accelerate_available("0.28.0") and self.args.accelerator_config.gradient_accumulation_kwargs is not None:
            grad_acc_kwargs = self.args.accelerator_config.gradient_accumulation_kwargs

        # check if num_steps is attempted to be passed in gradient_accumulation_kwargs
        if "num_steps" in grad_acc_kwargs:
            if self.args.gradient_accumulation_steps > 1:
                # raise because we do not know which setting is intended.
                raise ValueError(
                    "The `AcceleratorConfig`'s `num_steps` is set but `gradient_accumulation_steps` is greater than 1 in the passed `TrainingArguments`"
                    "If using the passed `AcceleratorConfig` is desired, do not set the `TrainingArguments` `gradient_accumulation_steps`."
                )
            else:
                self.args.gradient_accumulation_steps = grad_acc_kwargs["num_steps"]

        accelerator_config = self.args.accelerator_config.to_dict()

        if is_accelerate_available("0.28.0"):
            # Extract dataloader config params from accelerator config
            dataloader_params = ["split_batches", "dispatch_batches", "even_batches", "use_seedable_sampler"]
            dataloader_config_dict = {param: accelerator_config.pop(param) for param in dataloader_params if param in accelerator_config}
            if DataLoaderConfiguration is None:
                raise ImportError("Your accelerate does not provide DataLoaderConfiguration but Trainer expects it.")
            dataloader_config = DataLoaderConfiguration(**dataloader_config_dict)
            if is_accelerate_available("1.1.0"):
                dataloader_config.data_seed = self.args.data_seed
        else:
            dataloader_config = None

        non_blocking = accelerator_config.pop("non_blocking", False)
        if not is_accelerate_available("0.30.0"):
            if non_blocking:
                raise ImportError(
                    "`non_blocking` is only supported in accelerate v0.30.0 and above. Please upgrade accelerate to use this feature."
                )
        else:
            if non_blocking and not self.args.dataloader_pin_memory:
                logger.warning("`non_blocking` is enabled but `dataloader_pin_memory` is not. For best performance, enable both.")
            if dataloader_config is not None:
                dataloader_config.non_blocking = non_blocking

        accelerator_config.pop("gradient_accumulation_kwargs", None)

        args = {
            "deepspeed_plugin": self.args.deepspeed_plugin,
            "device_placement": False,
        }

        if is_accelerate_available("0.28.0"):
            args["dataloader_config"] = dataloader_config
        else:
            args.update(accelerator_config)

        if getattr(self.args, "tp_size", 1) > 1:
            self.is_tp_enabled = True
            if version.parse(accelerate_version) > version.parse("1.3.0") and TorchTensorParallelPlugin is not None:
                args["torch_tp_plugin"] = TorchTensorParallelPlugin(tp_size=self.args.tp_size)
            else:
                raise ValueError("Requires accelerate>1.3.0 to use Tensor Parallelism.")

        self.accelerator = KAccelerator(**args)

        try:
            self.accelerator.state.device_ids = [0]
            self.accelerator.state.num_processes = 1
            self.accelerator.state.num_gpus = 1
        except Exception:
            pass

        # some Trainer classes need to use `gather` instead of `gather_for_metrics`, thus we store a flag
        self.gather_function = self.accelerator.gather_for_metrics

        if "use_gather_object" in inspect.signature(self.gather_function).parameters.keys():
            self.gather_function = functools.partial(
                self.gather_function, use_gather_object=self.args.eval_use_gather_object
            )

        # deepspeed and accelerate flags covering both trainer args and accelerate launcher
        self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None
        self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None
        self.is_tp_enabled = getattr(self.accelerator.state, "torch_tp_plugin", None) is not None
        # post accelerator creation setup
        if self.is_fsdp_enabled:
            fsdp_plugin = self.accelerator.state.fsdp_plugin
            for param in ["limit_all_gathers", "activation_checkpointing"]:
                setattr(fsdp_plugin, param, self.args.fsdp_config.get(param, getattr(fsdp_plugin, param)))
            if fsdp_plugin.activation_checkpointing and self.args.gradient_checkpointing:
                raise ValueError(
                    "The activation_checkpointing in FSDP config and the gradient_checkpointing in training arg "
                    "can't be set to True simultaneously. Please use FSDP's activation_checkpointing logic "
                    "when using FSDP."
                )

        if self.is_deepspeed_enabled and getattr(self.args, "hf_deepspeed_config", None) is None:
            self.propagate_args_to_deepspeed()

        # `save_only_model` can't be used with DeepSpeed/FSDP along with `load_best_model_at_end`
        if (
            self.args.save_only_model
            and (self.is_deepspeed_enabled or self.is_fsdp_enabled)
            and self.args.load_best_model_at_end
        ):
            wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP"
            raise ValueError(f"{wrapper} can't be used with `save_only_model` along with `load_best_model_at_end`.")

        # `auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3
        if (
            self.is_deepspeed_enabled
            and self.accelerator.state.deepspeed_plugin.zero_stage == 3
            and self.args.auto_find_batch_size
        ):
            raise ValueError(
                "`auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3. Please consider using Zero-2, Zero-1, or FSDP"
            )
        if (
            self.args.save_only_model
            and self.is_fsdp_enabled
            and "SHARDED_STATE_DICT" in str(self.accelerator.state.fsdp_plugin.state_dict_type)
        ):
            raise ValueError("save_only_model option is not compatible with FSDP state dict type 'SHARDED_STATE_DICT'")
        
        if dataloader_config is not None:
            dataloader_config.split_batches = False
            dataloader_config.dispatch_batches = False
            dataloader_config.even_batches = False
            
    def get_train_dataloader(self) -> DataLoader:
        """
        Returns the training DataLoader with per_device_train_batch_size
        (no implicit multipliers by number of visible GPUs).
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        train_dataset = self.train_dataset
        data_collator = self.data_collator

        if is_datasets_available():
            try:
                import datasets
                if isinstance(train_dataset, datasets.Dataset):
                    train_dataset = self._remove_unused_columns(train_dataset, description="training")
                else:
                    data_collator = self._get_collator_with_removed_columns(data_collator, description="training")
            except Exception:
                data_collator = self._get_collator_with_removed_columns(data_collator, description="training")
        else:
            data_collator = self._get_collator_with_removed_columns(data_collator, description="training")

        dataloader_params = {
            "batch_size": self.args.per_device_train_batch_size,
            "collate_fn": data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
            "persistent_workers": self.args.dataloader_persistent_workers,
        }

        if not isinstance(train_dataset, IterableDataset):
            dataloader_params["sampler"] = self._get_train_sampler()
            dataloader_params["drop_last"] = self.args.dataloader_drop_last
            dataloader_params["worker_init_fn"] = seed_worker
            if self.args.dataloader_num_workers > 0 and self.args.dataloader_prefetch_factor is not None:
                dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor

        dl = DataLoader(train_dataset, **dataloader_params)

        try:
            prepared = self.accelerator.prepare(dl, device_placement=[False])
        except TypeError:
            prepared = self.accelerator.prepare(dl)

        return prepared
    
    def training_step(
        self,
        model: torch.nn.Module,
        inputs: dict[str, Union[torch.Tensor, Any]],
        num_items_in_batch=None
    ) -> torch.Tensor:
        model.train()
        if hasattr(self.optimizer, "train") and callable(self.optimizer.train):
            self.optimizer.train()

        inputs = self._prepare_inputs(inputs)

        if is_sagemaker_mp_enabled():
            loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
            return loss_mb.reduce_mean().detach().to(self.args.device)

        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)

        del inputs

        if (
            self.args.torch_empty_cache_steps is not None
            and self.state.global_step % self.args.torch_empty_cache_steps == 0
        ):
            if is_torch_xpu_available():
                torch.xpu.empty_cache()
            elif is_torch_mlu_available():
                torch.mlu.empty_cache()
            elif is_torch_musa_available():
                torch.musa.empty_cache()
            elif is_torch_npu_available():
                torch.npu.empty_cache()
            elif is_torch_mps_available(min_version="2.0"):
                torch.mps.empty_cache()
            elif is_torch_hpu_available():
                logger.warning(
                    "`torch_empty_cache_steps` is set but HPU device/backend does not support empty_cache()."
                )
            else:
                torch.cuda.empty_cache()

        kwargs = {}

        if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
            kwargs["learning_rate"] = self._get_learning_rate()

        if self.args.n_gpu > 1:
            loss = loss.mean()

        if self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:  # type: ignore
                scaled_loss.backward()
        else:
            if not self.model_accepts_loss_kwargs and self.compute_loss_func is None:
                loss = loss / self.args.gradient_accumulation_steps

            if getattr(self.accelerator, "distributed_type", None) and \
               str(self.accelerator.distributed_type) == "DistributedType.DEEPSPEED":
                kwargs["scale_wrt_gas"] = False

            self.accelerator.backward(loss, **kwargs)

        ret = loss.detach()
        if ret.device != self.args.device:
            ret = ret.to(self.args.device, non_blocking=True)

        if os.environ.get("KT_DBG_STEP", "0") == "1" and not hasattr(self, "_kt_dbg_once"):
            try:
                print(f"[KT-DBG] args.device={self.args.device}  loss(before)={loss.device}  loss(return)={ret.device}")
            except Exception:
                pass
            self._kt_dbg_once = True

        return ret

class SFTJsonListDataset(TorchDataset):
    def __init__(self, path: str, tokenizer: AutoTokenizer, max_len: int = 512):
        super().__init__()
        with open(path, "r", encoding="utf-8") as f:
            self.samples: List[Dict] = json.load(f)
        self.tok = tokenizer
        self.max_len = max_len

    @staticmethod
    def build_example(ins: str, inp: str, out: str) -> Dict[str, str]:
        ins = (ins or "").strip()
        inp = (inp or "").strip()
        out = (out or "").strip()
        prompt = (ins + inp) if ins else inp
        return {"prompt": prompt, "response": out}

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx: int):
        rec = self.samples[idx]
        eg = self.build_example(rec.get("instruction", ""), rec.get("input", ""), rec.get("output", ""))

        prompt_ids = self.tok(
            eg["prompt"],
            max_length=self.max_len,
            truncation=True,
            add_special_tokens=False,
        )["input_ids"]

        response_ids = self.tok(
            eg["response"],
            max_length=self.max_len,
            truncation=True,
            add_special_tokens=False,
        )["input_ids"]

        eos_id = self.tok.eos_token_id
        input_ids = prompt_ids + response_ids + ([eos_id] if eos_id is not None else [])
        input_ids = input_ids[: self.max_len]

        labels = [-100] * min(len(prompt_ids), self.max_len)
        tail = input_ids[len(labels):]
        labels = labels + tail
        labels = labels[: self.max_len]

        attention_mask = [1] * len(input_ids)

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
        }

def lora_and_load_adapter(model, tokenizer, sft_data_path, save_adapter_path):
    
    Path(save_adapter_path).mkdir(parents=True, exist_ok=True)
    
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        target_modules=[
            "q_proj", # FOR DeepSeek-V2-Lite
            "q_a_proj", # FOR DeepSeek-V3&R1
            "q_b_proj",
            "kv_a_proj_with_mqa",
            "kv_b_proj",
            "o_proj",
            "mlp.gate_proj",
            "mlp.up_proj",
            "mlp.down_proj",
            "shared_experts.gate_proj",
            "shared_experts.up_proj",
            "shared_experts.down_proj",
        ],
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    
    train_dataset = SFTJsonListDataset(sft_data_path, tokenizer, max_len=512)
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=save_adapter_path,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        num_train_epochs=1,
        # max_steps=30, # TODO: FOR TEST, will override any value given in num_train_epochs
        learning_rate=1e-4,
        fp16=False,
        logging_steps=10,
        save_steps=200,
        dataloader_drop_last=True,
        ddp_find_unused_parameters=False,
    )
    
    debug_path = os.path.join(save_adapter_path, "model_infra_debug.json")
    with open(debug_path, "w", encoding="utf-8") as f:
        json.dump({"model": str(model)}, f, ensure_ascii=False, indent=2)
    
    # output = model(input_ids=torch.tensor([[1,2,3]], dtype=torch.int32, device="cuda:0"))
    # loss = output.logits.mean()
        
    # dot = make_dot(loss, params=dict(model.named_parameters()))
    # dot.render("KT_compute_cpuinfer_moe_model_graph", format="svg")
    
    trainer = KTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )
    model.config.use_cache = False
    # model.gradient_checkpointing_enable()
    # if hasattr(model, "enable_input_require_grads"):
    #     model.enable_input_require_grads()
    
    trainer.train()

def inject_lora_layer(model, use_adapter_path):

    cfg_path = os.path.join(use_adapter_path, "adapter_config.json")
    with open(cfg_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    task_type_str = (data.get("task_type") or "CAUSAL_LM").upper()
    bias = data.get("bias", "none")
    if bias in (None, False):
        bias = "none"
    if data.get("lora_bias") is True and bias == "none":
        bias = "lora_only"

    tmods = data.get("target_modules")
    if isinstance(tmods, str):
        tmods = [m.strip() for m in tmods.split(",") if m.strip()]

    mts = data.get("modules_to_save", None)
    if isinstance(mts, str):
        mts = [m.strip() for m in mts.split(",") if m.strip()]

    rank_pattern = data.get("rank_pattern") or None
    alpha_pattern = data.get("alpha_pattern") or None

    lora_config = LoraConfig(
        r=data.get("r", 8),
        lora_alpha=data.get("lora_alpha", 32),
        lora_dropout=float(data.get("lora_dropout", 0.0)),
        bias=bias,
        task_type=TaskType[task_type_str],
        target_modules=tmods,
        modules_to_save=mts,
        init_lora_weights=bool(data.get("init_lora_weights", True)),
        inference_mode=bool(data.get("inference_mode", True)),
        use_rslora=bool(data.get("use_rslora", False)),
        use_dora=bool(data.get("use_dora", False)),
    )
    print(f"lora_config:{lora_config.__dict__}")
    
    # model = inject_adapter_in_model(lora_config, model)
    model = get_peft_model(model, lora_config)
    model.config.use_cache = False
    model.eval()

================================================
FILE: kt-sft/ktransformers/sft/metrics.py
================================================
# Copyright 2025 HuggingFace Inc., THUDM, and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's transformers library and the THUDM's ChatGLM implementation.
# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py
# https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional

import numpy as np
import torch
from transformers.utils import is_jieba_available, is_nltk_available

from ktransformers.sft.metrics_utils.constants import IGNORE_INDEX
from ktransformers.sft.metrics_utils.misc import numpify
from ktransformers.sft.metrics_utils.packages import is_rouge_available


if TYPE_CHECKING:
    from transformers import EvalPrediction, PreTrainedTokenizer


if is_jieba_available():
    import jieba  # type: ignore


if is_nltk_available():
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu  # type: ignore


if is_rouge_available():
    from rouge_chinese import Rouge  # type: ignore


def eval_logit_processor(logits: "torch.Tensor", labels: "torch.Tensor") -> "torch.Tensor":
    r"""Compute the token with the largest likelihood to reduce memory footprint."""
    if isinstance(logits, (list, tuple)):
        if logits[0].dim() == 3:  # (batch_size, seq_len, vocab_size)
            logits = logits[0]
        else:  # moe models have aux loss
            logits = logits[1]

    if logits.dim() != 3:
        raise ValueError("Cannot process the logits.")

    return torch.argmax(logits, dim=-1)

@dataclass
class ComputeSimilarity:
    r"""Compute text similarity scores and support `batch_eval_metrics`.

    Wraps the tokenizer into metric functions, used in CustomSeq2SeqTrainer.
    """

    tokenizer: "PreTrainedTokenizer"

    def _dump(self) -> Optional[dict[str, float]]:
        result = None
        if hasattr(self, "score_dict"):
            result = {k: float(np.mean(v)) for k, v in self.score_dict.items()}

        self.score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], 
                           "bleu-1": [], "bleu-2": [], "bleu-3": [], "bleu-4": []}
        return result

    def __post_init__(self):
        self._dump()

    def __call__(self, eval_preds: "EvalPrediction", compute_result: bool = True) -> Optional[dict[str, float]]:
        preds, labels = numpify(eval_preds.predictions), numpify(eval_preds.label_ids)

        preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id)
        labels = np.where(labels != IGNORE_INDEX, labels, self.tokenizer.pad_token_id)

        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        for pred, label in zip(decoded_preds, decoded_labels):
            hypothesis = list(jieba.cut(pred))
            reference = list(jieba.cut(label))

            if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0:
                result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}}
            else:
                rouge = Rouge()
                scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference))
                result = scores[0]
                
                refs = [reference]
                hyp  = hypothesis
                smooth = SmoothingFunction().method3
                bleu1 = sentence_bleu(refs, hyp, weights=(1.0, 0.0, 0.0, 0.0), smoothing_function=smooth)
                bleu2 = sentence_bleu(refs, hyp, weights=(0.5, 0.5, 0.0, 0.0), smoothing_function=smooth)
                bleu3 = sentence_bleu(refs, hyp, weights=(1/3, 1/3, 1/3, 0.0), smoothing_function=smooth)
                bleu4 = sentence_bleu(refs, hyp, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)

            for k, v in result.items():
                self.score_dict[k].append(round(v["f"] * 100, 4))

            # bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3)
            # self.score_dict["bleu-4"].append(round(bleu_score * 100, 4))
            
            self.score_dict["bleu-1"].append(round(bleu1 * 100, 4))
            self.score_dict["bleu-2"].append(round(bleu2 * 100, 4))
            self.score_dict["bleu-3"].append(round(bleu3 * 100, 4))
            self.score_dict["bleu-4"].append(round(bleu4 * 100, 4))

        if compute_result:
            return self._dump()

================================================
FILE: kt-sft/ktransformers/sft/metrics_utils/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/sft/metrics_utils/constants.py
================================================
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from collections import OrderedDict, defaultdict
from enum import Enum, unique
from typing import Optional

from peft.utils import SAFETENSORS_WEIGHTS_NAME as SAFE_ADAPTER_WEIGHTS_NAME
from peft.utils import WEIGHTS_NAME as ADAPTER_WEIGHTS_NAME
from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME


AUDIO_PLACEHOLDER = os.getenv("AUDIO_PLACEHOLDER", "<audio>")

CHECKPOINT_NAMES = {
    SAFE_ADAPTER_WEIGHTS_NAME,
    ADAPTER_WEIGHTS_NAME,
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
    WEIGHTS_INDEX_NAME,
    WEIGHTS_NAME,
}

CHOICES = ["A", "B", "C", "D"]

DATA_CONFIG = "dataset_info.json"

DEFAULT_TEMPLATE = defaultdict(str)

FILEEXT2TYPE = {
    "arrow": "arrow",
    "csv": "csv",
    "json": "json",
    "jsonl": "json",
    "parquet": "parquet",
    "txt": "text",
}

IGNORE_INDEX = -100

IMAGE_PLACEHOLDER = os.getenv("IMAGE_PLACEHOLDER", "<image>")

LAYERNORM_NAMES = {"norm", "ln"}

LLAMABOARD_CONFIG = "llamaboard_config.yaml"

METHODS = ["full", "freeze", "lora"]

MOD_SUPPORTED_MODELS = {"bloom", "falcon", "gemma", "llama", "mistral", "mixtral", "phi", "starcoder2"}

MULTIMODAL_SUPPORTED_MODELS = set()

PEFT_METHODS = {"lora"}

RUNNING_LOG = "running_log.txt"

SUBJECTS = ["Average", "STEM", "Social Sciences", "Humanities", "Other"]

SUPPORTED_MODELS = OrderedDict()

TRAINER_LOG = "trainer_log.jsonl"

TRAINING_ARGS = "training_args.yaml"

TRAINING_STAGES = {
    "Supervised Fine-Tuning": "sft",
    "Reward Modeling": "rm",
    "PPO": "ppo",
    "DPO": "dpo",
    "KTO": "kto",
    "Pre-Training": "pt",
}

STAGES_USE_PAIR_DATA = {"rm", "dpo"}

SUPPORTED_CLASS_FOR_S2ATTN = {"llama"}

SWANLAB_CONFIG = "swanlab_public_config.json"

VIDEO_PLACEHOLDER = os.getenv("VIDEO_PLACEHOLDER", "<video>")

V_HEAD_WEIGHTS_NAME = "value_head.bin"

V_HEAD_SAFE_WEIGHTS_NAME = "value_head.safetensors"


class AttentionFunction(str, Enum):
    AUTO = "auto"
    DISABLED = "disabled"
    SDPA = "sdpa"
    FA2 = "fa2"


class EngineName(str, Enum):
    HF = "huggingface"
    VLLM = "vllm"
    SGLANG = "sglang"


class DownloadSource(str, Enum):
    DEFAULT = "hf"
    MODELSCOPE = "ms"
    OPENMIND = "om"


@unique
class QuantizationMethod(str, Enum):
    r"""Borrowed from `transformers.utils.quantization_config.QuantizationMethod`."""

    BNB = "bnb"
    GPTQ = "gptq"
    AWQ = "awq"
    AQLM = "aqlm"
    QUANTO = "quanto"
    EETQ = "eetq"
    HQQ = "hqq"
    MXFP4 = "mxfp4"


class RopeScaling(str, Enum):
    LINEAR = "linear"
    DYNAMIC = "dynamic"
    YARN = "yarn"
    LLAMA3 = "llama3"


def register_model_group(
    models: dict[str, dict[DownloadSource, str]],
    template: Optional[str] = None,
    multimodal: bool = False,
) -> None:
    for name, path in models.items():
        SUPPORTED_MODELS[name] = path
        if template is not None and (
            any(suffix in name for suffix in ("-Chat", "-Distill", "-Instruct", "-Thinking")) or multimodal
        ):
            DEFAULT_TEMPLATE[name] = template

        if multimodal:
            MULTIMODAL_SUPPORTED_MODELS.add(name)


register_model_group(
    models={
        "Aya-23-8B-Chat": {
            DownloadSource.DEFAULT: "CohereForAI/aya-23-8B",
        },
        "Aya-23-35B-Chat": {
            DownloadSource.DEFAULT: "CohereForAI/aya-23-35B",
        },
    },
    template="cohere",
)


register_model_group(
    models={
        "Baichuan-7B-Base": {
            DownloadSource.DEFAULT: "baichuan-inc/Baichuan-7B",
            DownloadSource.MODELSCOPE: "baichuan-inc/baichuan-7B",
        },
        "Baichuan-13B-Base": {
            DownloadSource.DEFAULT: "baichuan-inc/Baichuan-13B-Base",
            DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan-13B-Base",
        },
        "Baichuan-13B-Chat": {
            DownloadSource.DEFAULT: "baichuan-inc/Baichuan-13B-Chat",
            DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan-13B-Chat",
        },
    },
    template="baichuan",
)


register_model_group(
    models={
        "Baichuan2-7B-Base": {
            DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-7B-Base",
            DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-7B-Base",
        },
        "Baichuan2-13B-Base": {
            DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-13B-Base",
            DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-13B-Base",
            DownloadSource.OPENMIND: "Baichuan/Baichuan2_13b_base_pt",
        },
        "Baichuan2-7B-Chat": {
            DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-7B-Chat",
            DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-7B-Chat",
            DownloadSource.OPENMIND: "Baichuan/Baichuan2_7b_chat_pt",
        },
        "Baichuan2-13B-Chat": {
            DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-13B-Chat",
            DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-13B-Chat",
            DownloadSource.OPENMIND: "Baichuan/Baichuan2_13b_chat_pt",
        },
    },
    template="baichuan2",
)


register_model_group(
    models={
        "BLOOM-560M": {
            DownloadSource.DEFAULT: "bigscience/bloom-560m",
            DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-560m",
        },
        "BLOOM-3B": {
            DownloadSource.DEFAULT: "bigscience/bloom-3b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-3b",
        },
        "BLOOM-7B1": {
            DownloadSource.DEFAULT: "bigscience/bloom-7b1",
            DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-7b1",
        },
    },
)


register_model_group(
    models={
        "BLOOMZ-560M": {
            DownloadSource.DEFAULT: "bigscience/bloomz-560m",
            DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-560m",
        },
        "BLOOMZ-3B": {
            DownloadSource.DEFAULT: "bigscience/bloomz-3b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-3b",
        },
        "BLOOMZ-7B1-mt": {
            DownloadSource.DEFAULT: "bigscience/bloomz-7b1-mt",
            DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-7b1-mt",
        },
    },
)


register_model_group(
    models={
        "BlueLM-7B-Base": {
            DownloadSource.DEFAULT: "vivo-ai/BlueLM-7B-Base",
            DownloadSource.MODELSCOPE: "vivo-ai/BlueLM-7B-Base",
        },
        "BlueLM-7B-Chat": {
            DownloadSource.DEFAULT: "vivo-ai/BlueLM-7B-Chat",
            DownloadSource.MODELSCOPE: "vivo-ai/BlueLM-7B-Chat",
        },
    },
    template="bluelm",
)


register_model_group(
    models={
        "Breeze-7B": {
            DownloadSource.DEFAULT: "MediaTek-Research/Breeze-7B-Base-v1_0",
        },
        "Breeze-7B-Instruct": {
            DownloadSource.DEFAULT: "MediaTek-Research/Breeze-7B-Instruct-v1_0",
        },
    },
    template="breeze",
)


register_model_group(
    models={
        "ChatGLM2-6B-Chat": {
            DownloadSource.DEFAULT: "zai-org/chatglm2-6b",
            DownloadSource.MODELSCOPE: "ZhipuAI/chatglm2-6b",
        }
    },
    template="chatglm2",
)


register_model_group(
    models={
        "ChatGLM3-6B-Base": {
            DownloadSource.DEFAULT: "zai-org/chatglm3-6b-base",
            DownloadSource.MODELSCOPE: "ZhipuAI/chatglm3-6b-base",
        },
        "ChatGLM3-6B-Chat": {
            DownloadSource.DEFAULT: "zai-org/chatglm3-6b",
            DownloadSource.MODELSCOPE: "ZhipuAI/chatglm3-6b",
        },
    },
    template="chatglm3",
)


register_model_group(
    models={
        "Chinese-Llama-2-1.3B": {
            DownloadSource.DEFAULT: "hfl/chinese-llama-2-1.3b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-1.3b",
        },
        "Chinese-Llama-2-7B": {
            DownloadSource.DEFAULT: "hfl/chinese-llama-2-7b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-7b",
        },
        "Chinese-Llama-2-13B": {
            DownloadSource.DEFAULT: "hfl/chinese-llama-2-13b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-13b",
        },
        "Chinese-Alpaca-2-1.3B-Chat": {
            DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-1.3b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-1.3b",
        },
        "Chinese-Alpaca-2-7B-Chat": {
            DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-7b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-7b",
        },
        "Chinese-Alpaca-2-13B-Chat": {
            DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-13b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-13b",
        },
    },
    template="llama2_zh",
)


register_model_group(
    models={
        "CodeGeeX4-9B-Chat": {
            DownloadSource.DEFAULT: "zai-org/codegeex4-all-9b",
            DownloadSource.MODELSCOPE: "ZhipuAI/codegeex4-all-9b",
        },
    },
    template="codegeex4",
)


register_model_group(
    models={
        "CodeGemma-7B": {
            DownloadSource.DEFAULT: "google/codegemma-7b",
        },
        "CodeGemma-7B-Instruct": {
            DownloadSource.DEFAULT: "google/codegemma-7b-it",
            DownloadSource.MODELSCOPE: "AI-ModelScope/codegemma-7b-it",
        },
        "CodeGemma-1.1-2B": {
            DownloadSource.DEFAULT: "google/codegemma-1.1-2b",
        },
        "CodeGemma-1.1-7B-Instruct": {
            DownloadSource.DEFAULT: "google/codegemma-1.1-7b-it",
        },
    },
    template="gemma",
)


register_model_group(
    models={
        "Codestral-22B-v0.1-Chat": {
            DownloadSource.DEFAULT: "mistralai/Codestral-22B-v0.1",
            DownloadSource.MODELSCOPE: "swift/Codestral-22B-v0.1",
        },
    },
    template="mistral",
)


register_model_group(
    models={
        "CommandR-35B-Chat": {
            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-v01",
            DownloadSource.MODELSCOPE: "AI-ModelScope/c4ai-command-r-v01",
        },
        "CommandR-Plus-104B-Chat": {
            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-plus",
            DownloadSource.MODELSCOPE: "AI-ModelScope/c4ai-command-r-plus",
        },
        "CommandR-35B-4bit-Chat": {
            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-v01-4bit",
            DownloadSource.MODELSCOPE: "mirror013/c4ai-command-r-v01-4bit",
        },
        "CommandR-Plus-104B-4bit-Chat": {
            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-plus-4bit",
        },
    },
    template="cohere",
)


register_model_group(
    models={
        "DBRX-132B-Base": {
            DownloadSource.DEFAULT: "databricks/dbrx-base",
            DownloadSource.MODELSCOPE: "AI-ModelScope/dbrx-base",
        },
        "DBRX-132B-Instruct": {
            DownloadSource.DEFAULT: "databricks/dbrx-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/dbrx-instruct",
        },
    },
    template="dbrx",
)


register_model_group(
    models={
        "DeepSeek-LLM-7B-Base": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-7b-base",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-7b-base",
        },
        "DeepSeek-LLM-67B-Base": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-67b-base",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-67b-base",
        },
        "DeepSeek-LLM-7B-Chat": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-7b-chat",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-7b-chat",
        },
        "DeepSeek-LLM-67B-Chat": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-67b-chat",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-67b-chat",
        },
        "DeepSeek-Math-7B-Base": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-math-7b-base",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-math-7b-base",
        },
        "DeepSeek-Math-7B-Instruct": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-math-7b-instruct",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-math-7b-instruct",
        },
        "DeepSeek-MoE-16B-Base": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-base",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-base",
        },
        "DeepSeek-MoE-16B-Chat": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-chat",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-chat",
        },
        "DeepSeek-V2-16B-Base": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Lite",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Lite",
        },
        "DeepSeek-V2-236B-Base": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2",
        },
        "DeepSeek-V2-16B-Chat": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Lite-Chat",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Lite-Chat",
        },
        "DeepSeek-V2-236B-Chat": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Chat",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Chat",
        },
        "DeepSeek-Coder-V2-16B-Base": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Lite-Base",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-Coder-V2-Lite-Base",
        },
        "DeepSeek-Coder-V2-236B-Base": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Base",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-Coder-V2-Base",
        },
        "DeepSeek-Coder-V2-16B-Instruct": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
        },
        "DeepSeek-Coder-V2-236B-Instruct": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Instruct",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-Coder-V2-Instruct",
        },
    },
    template="deepseek",
)


register_model_group(
    models={
        "DeepSeek-Coder-6.7B-Base": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-6.7b-base",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-6.7b-base",
        },
        "DeepSeek-Coder-7B-Base": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-7b-base-v1.5",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-7b-base-v1.5",
        },
        "DeepSeek-Coder-33B-Base": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-33b-base",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-33b-base",
        },
        "DeepSeek-Coder-6.7B-Instruct": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-6.7b-instruct",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-6.7b-instruct",
        },
        "DeepSeek-Coder-7B-Instruct": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
        },
        "DeepSeek-Coder-33B-Instruct": {
            DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-33b-instruct",
            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-33b-instruct",
        },
    },
    template="deepseekcoder",
)


register_model_group(
    models={
        "DeepSeek-V2-0628-236B-Chat": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Chat-0628",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Chat-0628",
        },
        "DeepSeek-V2.5-236B-Chat": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2.5",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2.5",
        },
        "DeepSeek-V2.5-1210-236B-Chat": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2.5-1210",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2.5-1210",
        },
        "DeepSeek-V3-671B-Base": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V3-Base",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V3-Base",
        },
        "DeepSeek-V3-671B-Chat": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V3",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V3",
        },
        "DeepSeek-V3-0324-671B-Chat": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V3-0324",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V3-0324",
        },
    },
    template="deepseek3",
)


register_model_group(
    models={
        "DeepSeek-R1-1.5B-Distill": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        },
        "DeepSeek-R1-7B-Distill": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        },
        "DeepSeek-R1-8B-Distill": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
        },
        "DeepSeek-R1-14B-Distill": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        },
        "DeepSeek-R1-32B-Distill": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        },
        "DeepSeek-R1-70B-Distill": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
        },
        "DeepSeek-R1-671B-Chat-Zero": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Zero",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Zero",
        },
        "DeepSeek-R1-671B-Chat": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1",
        },
        "DeepSeek-R1-0528-8B-Distill": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
        },
        "DeepSeek-R1-0528-671B-Chat": {
            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-0528",
            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-0528",
        },
    },
    template="deepseekr1",
)


register_model_group(
    models={
        "Devstral-Small-2507-Instruct": {
            DownloadSource.DEFAULT: "mistralai/Devstral-Small-2507",
            DownloadSource.MODELSCOPE: "mistralai/Devstral-Small-2507",
        },
    },
    template="mistral_small",
)


register_model_group(
    models={
        "EXAONE-3.0-7.8B-Instruct": {
            DownloadSource.DEFAULT: "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
        },
    },
    template="exaone",
)


register_model_group(
    models={
        "Falcon-7B": {
            DownloadSource.DEFAULT: "tiiuae/falcon-7b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-7b",
        },
        "Falcon-11B": {
            DownloadSource.DEFAULT: "tiiuae/falcon-11B",
            DownloadSource.MODELSCOPE: "tiiuae/falcon-11B",
        },
        "Falcon-40B": {
            DownloadSource.DEFAULT: "tiiuae/falcon-40b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-40b",
        },
        "Falcon-180B": {
            DownloadSource.DEFAULT: "tiiuae/falcon-180b",
            DownloadSource.MODELSCOPE: "modelscope/falcon-180B",
        },
        "Falcon-7B-Instruct": {
            DownloadSource.DEFAULT: "tiiuae/falcon-7b-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-7b-instruct",
        },
        "Falcon-40B-Instruct": {
            DownloadSource.DEFAULT: "tiiuae/falcon-40b-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-40b-instruct",
        },
        "Falcon-180B-Chat": {
            DownloadSource.DEFAULT: "tiiuae/falcon-180b-chat",
            DownloadSource.MODELSCOPE: "modelscope/falcon-180B-chat",
        },
    },
    template="falcon",
)

register_model_group(
    models={
        "Falcon-H1-0.5B-Base": {
            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-0.5B-Base",
            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-0.5B-Base",
        },
        "Falcon-H1-1.5B-Base": {
            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-1.5B-Base",
            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-1.5B-Base",
        },
        "Falcon-H1-1.5B-Deep-Base": {
            DownloadSource.DEFAULT: "tiuae/Falcon-H1-1.5B-Deep-Base",
            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-1.5B-Deep-Base",
        },
        "Falcon-H1-3B-Base": {
            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-3B-Base",
            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-3B-Base",
        },
        "Falcon-H1-7B-Base": {
            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-7B-Base",
            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-7B-Base",
        },
        "Falcon-H1-34B-Base": {
            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-34B-Base",
            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-34B-Base",
        },
        "Falcon-H1-0.5B-Instruct": {
            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-0.5B-Instruct",
            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-0.5B-Instruct",
        },
        "Falcon-H1-1.5B-Instruct": {
            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-1.5B-Instruct",
            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-1.5B-Instruct",
        },
        "Falcon-H1-1.5B-Deep-Instruct": {
            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-1.5B-Deep-Instruct",
            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-1.5B-Deep-Instruct",
        },
        "Falcon-H1-3B-Instruct": {
            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-3B-Instruct",
            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-3B-Instruct",
        },
        "Falcon-H1-7B-Instruct": {
            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-7B-Instruct",
            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-7B-Instruct",
        },
        "Falcon-H1-34B-Instruct": {
            DownloadSource.DEFAULT: "tiiuae/Falcon-H1-34B-Instruct",
            DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-34B-Instruct",
        },
    },
    template="falcon_h1",
)


register_model_group(
    models={
        "Gemma-2B": {
            DownloadSource.DEFAULT: "google/gemma-2b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/gemma-2b",
        },
        "Gemma-7B": {
            DownloadSource.DEFAULT: "google/gemma-7b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/gemma-2b-it",
        },
        "Gemma-2B-Instruct": {
            DownloadSource.DEFAULT: "google/gemma-2b-it",
            DownloadSource.MODELSCOPE: "AI-ModelScope/gemma-7b",
        },
        "Gemma-7B-Instruct": {
            DownloadSource.DEFAULT: "google/gemma-7b-it",
            DownloadSource.MODELSCOPE: "AI-ModelScope/gemma-7b-it",
        },
        "Gemma-1.1-2B-Instruct": {
            DownloadSource.DEFAULT: "google/gemma-1.1-2b-it",
        },
        "Gemma-1.1-7B-Instruct": {
            DownloadSource.DEFAULT: "google/gemma-1.1-7b-it",
        },
    },
    template="gemma",
)


register_model_group(
    models={
        "Gemma-2-2B": {
            DownloadSource.DEFAULT: "google/gemma-2-2b",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-2b",
        },
        "Gemma-2-9B": {
            DownloadSource.DEFAULT: "google/gemma-2-9b",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-9b",
        },
        "Gemma-2-27B": {
            DownloadSource.DEFAULT: "google/gemma-2-27b",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-27b",
        },
        "Gemma-2-2B-Instruct": {
            DownloadSource.DEFAULT: "google/gemma-2-2b-it",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-2b-it",
            DownloadSource.OPENMIND: "LlamaFactory/gemma-2-2b-it",
        },
        "Gemma-2-9B-Instruct": {
            DownloadSource.DEFAULT: "google/gemma-2-9b-it",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-9b-it",
            DownloadSource.OPENMIND: "LlamaFactory/gemma-2-9b-it",
        },
        "Gemma-2-27B-Instruct": {
            DownloadSource.DEFAULT: "google/gemma-2-27b-it",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-27b-it",
        },
        "Gemma-3-1B": {
            DownloadSource.DEFAULT: "google/gemma-3-1b-pt",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-1b-pt",
        },
        "Gemma-3-1B-Instruct": {
            DownloadSource.DEFAULT: "google/gemma-3-1b-it",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-1b-it",
        },
        "MedGemma-27B-Instruct": {
            DownloadSource.DEFAULT: "google/medgemma-27b-text-it",
            DownloadSource.MODELSCOPE: "google/medgemma-27b-text-it",
        },
    },
    template="gemma2",
)


register_model_group(
    models={
        "Gemma-3-4B": {
            DownloadSource.DEFAULT: "google/gemma-3-4b-pt",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-4b-pt",
        },
        "Gemma-3-12B": {
            DownloadSource.DEFAULT: "google/gemma-3-12b-pt",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-12b-pt",
        },
        "Gemma-3-27B": {
            DownloadSource.DEFAULT: "google/gemma-3-27b-pt",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-27b-pt",
        },
        "Gemma-3-4B-Instruct": {
            DownloadSource.DEFAULT: "google/gemma-3-4b-it",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-4b-it",
        },
        "Gemma-3-12B-Instruct": {
            DownloadSource.DEFAULT: "google/gemma-3-12b-it",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-12b-it",
        },
        "Gemma-3-27B-Instruct": {
            DownloadSource.DEFAULT: "google/gemma-3-27b-it",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3-27b-it",
        },
        "MedGemma-4B": {
            DownloadSource.DEFAULT: "google/medgemma-4b-pt",
            DownloadSource.MODELSCOPE: "google/medgemma-4b-pt",
        },
        "MedGemma-4B-Instruct": {
            DownloadSource.DEFAULT: "google/medgemma-4b-it",
            DownloadSource.MODELSCOPE: "google/medgemma-4b-it",
        },
    },
    template="gemma3",
    multimodal=True,
)


register_model_group(
    models={
        "Gemma-3n-E2B": {
            DownloadSource.DEFAULT: "google/gemma-3n-E2B",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3n-E2B",
        },
        "Gemma-3n-E4B": {
            DownloadSource.DEFAULT: "google/gemma-3n-E4B",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3n-E4B",
        },
        "Gemma-3n-E2B-Instruct": {
            DownloadSource.DEFAULT: "google/gemma-3n-E2B-it",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3n-E2B-it",
        },
        "Gemma-3n-E4B-Instruct": {
            DownloadSource.DEFAULT: "google/gemma-3n-E4B-it",
            DownloadSource.MODELSCOPE: "LLM-Research/gemma-3n-E4B-it",
        },
    },
    template="gemma3n",
    multimodal=True,
)


register_model_group(
    models={
        "GLM-4-9B": {
            DownloadSource.DEFAULT: "zai-org/glm-4-9b",
            DownloadSource.MODELSCOPE: "ZhipuAI/glm-4-9b",
        },
        "GLM-4-9B-Chat": {
            DownloadSource.DEFAULT: "zai-org/glm-4-9b-chat",
            DownloadSource.MODELSCOPE: "ZhipuAI/glm-4-9b-chat",
            DownloadSource.OPENMIND: "LlamaFactory/glm-4-9b-chat",
        },
        "GLM-4-9B-1M-Chat": {
            DownloadSource.DEFAULT: "zai-org/glm-4-9b-chat-1m",
            DownloadSource.MODELSCOPE: "ZhipuAI/glm-4-9b-chat-1m",
        },
        "GLM-4-0414-9B-Chat": {
            DownloadSource.DEFAULT: "zai-org/GLM-4-9B-0414",
            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-9B-0414",
        },
        "GLM-4-0414-32B-Base": {
            DownloadSource.DEFAULT: "zai-org/GLM-4-32B-Base-0414",
            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-32B-Base-0414",
        },
        "GLM-4-0414-32B-Chat": {
            DownloadSource.DEFAULT: "zai-org/GLM-4-32B-0414",
            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-32B-0414",
        },
    },
    template="glm4",
)


register_model_group(
    models={
        "GLM-4.1V-9B-Base": {
            DownloadSource.DEFAULT: "zai-org/GLM-4.1V-9B-Base",
            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.1V-9B-Base",
        },
        "GLM-4.1V-9B-Thinking": {
            DownloadSource.DEFAULT: "zai-org/GLM-4.1V-9B-Thinking",
            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.1V-9B-Thinking",
        },
    },
    template="glm4v",
    multimodal=True,
)


register_model_group(
    models={
        "GLM-4.5-Air-Base": {
            DownloadSource.DEFAULT: "zai-org/GLM-4.5-Air-Base",
            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.5-Air-Base",
        },
        "GLM-4.5-Base": {
            DownloadSource.DEFAULT: "zai-org/GLM-4.5-Base",
            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.5-Base",
        },
        "GLM-4.5-Air-Thinking": {
            DownloadSource.DEFAULT: "zai-org/GLM-4.5-Air",
            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.5-Air",
        },
        "GLM-4.5-Thinking": {
            DownloadSource.DEFAULT: "zai-org/GLM-4.5",
            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.5",
        },
    },
    template="glm4_moe",
)


register_model_group(
    models={
        "GLM-4.5V-Air-Thinking":{
            DownloadSource.DEFAULT: "zai-org/GLM-4.5V",
            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.5V",
        }
    },
    template="glm45v",
    multimodal=True,
)


register_model_group(
    models={
        "GLM-Z1-0414-9B-Chat": {
            DownloadSource.DEFAULT: "zai-org/GLM-Z1-9B-0414",
            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-Z1-9B-0414",
        },
        "GLM-Z1-0414-32B-Chat": {
            DownloadSource.DEFAULT: "zai-org/GLM-Z1-32B-0414",
            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-Z1-32B-0414",
        },
    },
    template="glmz1",
)


register_model_group(
    models={
        "GPT-2-Small": {
            DownloadSource.DEFAULT: "openai-community/gpt2",
            DownloadSource.MODELSCOPE: "AI-ModelScope/gpt2",
        },
        "GPT-2-Medium": {
            DownloadSource.DEFAULT: "openai-community/gpt2-medium",
            DownloadSource.MODELSCOPE: "AI-ModelScope/gpt2-medium",
        },
        "GPT-2-Large": {
            DownloadSource.DEFAULT: "openai-community/gpt2-large",
            DownloadSource.MODELSCOPE: "AI-ModelScope/gpt2-large",
        },
        "GPT-2-XL": {
            DownloadSource.DEFAULT: "openai-community/gpt2-xl",
            DownloadSource.MODELSCOPE: "goodbai95/GPT2-xl",
        },
    },
)


register_model_group(
    models={
        "GPT-OSS-20B-Thinking": {
            DownloadSource.DEFAULT: "openai/gpt-oss-20b",
            DownloadSource.MODELSCOPE: "openai/gpt-oss-20b",
        },
        "GPT-OSS-120B-Thinking": {
            DownloadSource.DEFAULT: "openai/gpt-oss-120b",
            DownloadSource.MODELSCOPE: "openai/gpt-oss-120b",
        },
    },
    template="gpt",
)


register_model_group(
    models={
        "Granite-3.0-1B-A400M-Base": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-1b-a400m-base",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-1b-a400m-base",
        },
        "Granite-3.0-3B-A800M-Base": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-3b-a800m-base",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-3b-a800m-base",
        },
        "Granite-3.0-2B-Base": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-2b-base",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-2b-base",
        },
        "Granite-3.0-8B-Base": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-8b-base",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-8b-base",
        },
        "Granite-3.0-1B-A400M-Instruct": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-1b-a400m-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-1b-a400m-instruct",
        },
        "Granite-3.0-3B-A800M-Instruct": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-3b-a800m-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-3b-a800m-instruct",
        },
        "Granite-3.0-2B-Instruct": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-2b-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-2b-instruct",
        },
        "Granite-3.0-8B-Instruct": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.0-8b-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.0-8b-instruct",
        },
        "Granite-3.1-1B-A400M-Base": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-1b-a400m-base",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-1b-a400m-base",
        },
        "Granite-3.1-3B-A800M-Base": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-3b-a800m-base",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-3b-a800m-base",
        },
        "Granite-3.1-2B-Base": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-2b-base",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-2b-base",
        },
        "Granite-3.1-8B-Base": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-8b-base",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-8b-base",
        },
        "Granite-3.1-1B-A400M-Instruct": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-1b-a400m-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-1b-a400m-instruct",
        },
        "Granite-3.1-3B-A800M-Instruct": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-3b-a800m-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-3b-a800m-instruct",
        },
        "Granite-3.1-2B-Instruct": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-2b-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-2b-instruct",
        },
        "Granite-3.1-8B-Instruct": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.1-8b-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.1-8b-instruct",
        },
        "Granite-3.2-2B-Instruct": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.2-2b-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.2-2b-instruct",
        },
        "Granite-3.2-8B-Instruct": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.2-8b-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.2-8b-instruct",
        },
        "Granite-3.3-2B-Base": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.3-2b-base",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.3-2b-base",
        },
        "Granite-3.3-8B-Base": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.3-8b-base",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.3-8b-base",
        },
        "Granite-3.3-2B-Instruct": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.3-2b-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.3-2b-instruct",
        },
        "Granite-3.3-8B-Instruct": {
            DownloadSource.DEFAULT: "ibm-granite/granite-3.3-8b-instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-3.3-8b-instruct",
        },
    },
    template="granite3",
)


register_model_group(
    models={
        "Granite-Vision-3.2-2B": {
            DownloadSource.DEFAULT: "ibm-granite/granite-vision-3.2-2b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/granite-vision-3.2-2b",
        },
    },
    template="granite3_vision",
    multimodal=True,
)


register_model_group(
    models={
        "Granite-4.0-tiny-preview": {
            DownloadSource.DEFAULT: "ibm-granite/granite-4.0-tiny-preview",
            DownloadSource.MODELSCOPE: "ibm-granite/granite-4.0-tiny-preview",
        },
    },
    template="granite4",
)


register_model_group(
    models={
        "Hunyuan-7B-Instruct": {
            DownloadSource.DEFAULT: "tencent/Hunyuan-7B-Instruct",
            DownloadSource.MODELSCOPE: "AI-ModelScope/Hunyuan-7B-Instruct",
        },
    },
    template="hunyuan",
)


register_model_group(
    models={
        "Index-1.9B-Base": {
            DownloadSource.DEFAULT: "IndexTeam/Index-1.9B",
            DownloadSource.MODELSCOPE: "IndexTeam/Index-1.9B",
        },
        "Index-1.9B-Base-Pure": {
            DownloadSource.DEFAULT: "IndexTeam/Index-1.9B-Pure",
            DownloadSource.MODELSCOPE: "IndexTeam/Index-1.9B-Pure",
        },
        "Index-1.9B-Chat": {
            DownloadSource.DEFAULT: "IndexTeam/Index-1.9B-Chat",
            DownloadSource.MODELSCOPE: "IndexTeam/Index-1.9B-Chat",
        },
        "Index-1.9B-Character-Chat": {
            DownloadSource.DEFAULT: "IndexTeam/Index-1.9B-Character",
            DownloadSource.MODELSCOPE: "IndexTeam/Index-1.9B-Character",
        },
        "Index-1.9B-Chat-32K": {
            DownloadSource.DEFAULT: "IndexTeam/Index-1.9B-32K",
            DownloadSource.MODELSCOPE: "IndexTeam/Index-1.9B-32K",
        },
    },
    template="index",
)


register_model_group(
    models={
        "InternLM-7B": {
            DownloadSource.DEFAULT: "internlm/internlm-7b",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-7b",
        },
        "InternLM-20B": {
            DownloadSource.DEFAULT: "internlm/internlm-20b",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-20b",
        },
        "InternLM-7B-Chat": {
            DownloadSource.DEFAULT: "internlm/internlm-chat-7b",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-chat-7b",
        },
        "InternLM-20B-Chat": {
            DownloadSource.DEFAULT: "internlm/internlm-chat-20b",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-chat-20b",
        },
    },
    template="intern",
)


register_model_group(
    models={
        "InternLM2-7B": {
            DownloadSource.DEFAULT: "internlm/internlm2-7b",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-7b",
        },
        "InternLM2-20B": {
            DownloadSource.DEFAULT: "internlm/internlm2-20b",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-20b",
        },
        "InternLM2-7B-Chat": {
            DownloadSource.DEFAULT: "internlm/internlm2-chat-7b",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-chat-7b",
        },
        "InternLM2-20B-Chat": {
            DownloadSource.DEFAULT: "internlm/internlm2-chat-20b",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-chat-20b",
        },
        "InternLM2.5-1.8B": {
            DownloadSource.DEFAULT: "internlm/internlm2_5-1_8b",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-1_8b",
            DownloadSource.OPENMIND: "Intern/internlm2_5-1_8b",
        },
        "InternLM2.5-7B": {
            DownloadSource.DEFAULT: "internlm/internlm2_5-7b",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-7b",
        },
        "InternLM2.5-20B": {
            DownloadSource.DEFAULT: "internlm/internlm2_5-20b",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-20b",
            DownloadSource.OPENMIND: "Intern/internlm2_5-20b",
        },
        "InternLM2.5-1.8B-Chat": {
            DownloadSource.DEFAULT: "internlm/internlm2_5-1_8b-chat",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-1_8b-chat",
            DownloadSource.OPENMIND: "Intern/internlm2_5-1_8b-chat",
        },
        "InternLM2.5-7B-Chat": {
            DownloadSource.DEFAULT: "internlm/internlm2_5-7b-chat",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-7b-chat",
            DownloadSource.OPENMIND: "Intern/internlm2_5-7b-chat",
        },
        "InternLM2.5-7B-1M-Chat": {
            DownloadSource.DEFAULT: "internlm/internlm2_5-7b-chat-1m",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-7b-chat-1m",
            DownloadSource.OPENMIND: "Intern/internlm2_5-7b-chat-1m",
        },
        "InternLM2.5-20B-Chat": {
            DownloadSource.DEFAULT: "internlm/internlm2_5-20b-chat",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2_5-20b-chat",
            DownloadSource.OPENMIND: "Intern/internlm2_5-20b-chat",
        },
        "InternLM3-8B-Chat": {
            DownloadSource.DEFAULT: "internlm/internlm3-8b-instruct",
            DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm3-8b-instruct",
        },
    },
    template="intern2",
)


register_model_group(
    models={
        "InternVL2.5-2B-MPO": {
            DownloadSource.DEFAULT: "OpenGVLab/InternVL2_5-2B-MPO-hf",
            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL2_5-2B-MPO-hf",
        },
        "InternVL2.5-8B-MPO": {
            DownloadSource.DEFAULT: "OpenGVLab/InternVL2_5-8B-MPO-hf",
            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL2_5-8B-MPO-hf",
        },
        "InternVL3-1B-hf": {
            DownloadSource.DEFAULT: "OpenGVLab/InternVL3-1B-hf",
            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3-1B-hf",
        },
        "InternVL3-2B-hf": {
            DownloadSource.DEFAULT: "OpenGVLab/InternVL3-2B-hf",
            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3-2B-hf",
        },
        "InternVL3-8B-hf": {
            DownloadSource.DEFAULT: "OpenGVLab/InternVL3-8B-hf",
            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3-8B-hf",
        },
        "InternVL3-14B-hf": {
            DownloadSource.DEFAULT: "OpenGVLab/InternVL3-14B-hf",
            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3-14B-hf",
        },
        "InternVL3-38B-hf": {
            DownloadSource.DEFAULT: "OpenGVLab/InternVL3-38B-hf",
            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3-38B-hf",
        },
        "InternVL3-78B-hf": {
            DownloadSource.DEFAULT: "OpenGVLab/InternVL3-78B-hf",
            DownloadSource.MODELSCOPE: "OpenGVLab/InternVL3-78B-hf",
        },
    },
    template="intern_vl",
    multimodal=True,
)


register_model_group(
    models={
        "Jamba-v0.1": {
            DownloadSource.DEFAULT: "ai21labs/Jamba-v0.1",
            DownloadSource.MODELSCOPE: "AI-ModelScope/Jamba-v0.1",
        }
    },
)


register_model_group(
    models={
        "Keye-VL-8B-Chat": {
            DownloadSource.DEFAULT: "Kwai-Keye/Keye-VL-8B-Preview",
            DownloadSource.MODELSCOPE: "Kwai-Keye/Keye-VL-8B-Preview",
        },
    },
    template="keye_vl",
    multimodal=True,
)


register_model_group(
    models={
        "Kimi-Dev-72B-Instruct": {
            DownloadSource.DEFAULT: "moonshotai/Kimi-Dev-72B",
            DownloadSource.MODELSCOPE: "moonshotai/Kimi-Dev-72B",
        },
    },
    template="qwen",
)


register_model_group(
    models={
        "Kimi-VL-A3B-Instruct": {
            DownloadSource.DEFAULT: "moonshotai/Kimi-VL-A3B-Instruct",
            DownloadSource.MODELSCOPE: "moonshotai/Kimi-VL-A3B-Instruct",
        },
        "Kimi-VL-A3B-Thinking": {
            DownloadSource.DEFAULT: "moonshotai/Kimi-VL-A3B-Thinking",
            DownloadSource.MODELSCOPE: "moonshotai/Kimi-VL-A3B-Thinking",
        },
        "Kimi-VL-A3B-Thinking-2506": {
            DownloadSource.DEFAULT: "moonshotai/Kimi-VL-A3B-Thinking-2506",
            DownloadSource.MODELSCOPE: "moonshotai/Kimi-VL-A3B-Thinking-2506",
        },
    },
    template="kimi_vl",
    multimodal=True,
)


register_model_group(
    models={
        "LingoWhale-8B": {
            DownloadSource.DEFAULT: "deeplang-ai/LingoWhale-8B",
            DownloadSource.MODELSCOPE: "DeepLang/LingoWhale-8B",
        }
    },
)


register_model_group(
    models={
        "Llama-7B": {
            DownloadSource.DEFAULT: "huggyllama/llama-7b",
            DownloadSource.MODELSCOPE: "skyline2006/llama-7b",
        },
        "Llama-13B": {
            DownloadSource.DEFAULT: "huggyllama/llama-13b",
            DownloadSource.MODELSCOPE: "skyline2006/llama-13b",
        },
        "Llama-30B": {
            DownloadSource.DEFAULT: "huggyllama/llama-30b",
            DownloadSource.MODELSCOPE: "skyline2006/llama-30b",
        },
        "Llama-65B": {
            DownloadSource.DEFAULT: "huggyllama/llama-65b",
            DownloadSource.MODELSCOPE: "skyline2006/llama-65b",
        },
    }
)


register_model_group(
    models={
        "Llama-2-7B": {
            DownloadSource.DEFAULT: "meta-llama/Llama-2-7b-hf",
            DownloadSource.MODELSCOPE: "modelscope/Llama-2-7b-ms",
        },
        "Llama-2-13B": {
            DownloadSource.DEFAULT: "meta-llama/Llama-2-13b-hf",
            DownloadSource.MODELSCOPE: "modelscope/Llama-2-13b-ms",
        },
        "Llama-2-70B": {
            DownloadSource.DEFAULT: "meta-llama/Llama-2-70b-hf",
            DownloadSource.MODELSCOPE: "modelscope/Llama-2-70b-ms",
        },
        "Llama-2-7B-Chat": {
            DownloadSource.DEFAULT: "meta-llama/Llama-2-7b-chat-hf",
            DownloadSource.MODELSCOPE: "modelscope/Llama-2-7b-chat-ms",
        },
        "Llama-2-13B-Chat": {
            DownloadSource.DEFAULT: "meta-llama/Llama-2-13b-chat-hf",
            DownloadSource.MODELSCOPE: "modelscope/Llama-2-13b-chat-ms",
        },
        "Llama-2-70B-Chat": {
            DownloadSource.DEFAULT: "meta-llama/Llama-2-70b-chat-hf",
            DownloadSource.MODELSCOPE: "modelscope/Llama-2-70b-chat-ms",
        },
    },
    template="llama2",
)


register_model_group(
    models={
        "Llama-3-8B": {
            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-8B",
            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-8B",
        },
        "Llama-3-70B": {
            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-70B",
            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-70B",
        },
        "Llama-3-8B-Instruct": {
            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-8B-Instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-8B-Instruct",
        },
        "Llama-3-70B-Instruct": {
            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-70B-Instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-70B-Instruct",
        },
        "Llama-3-8B-Chinese-Chat": {
            DownloadSource.DEFAULT: "shenzhi-wang/Llama3-8B-Chinese-Chat",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama3-8B-Chinese-Chat",
            DownloadSource.OPENMIND: "LlamaFactory/Llama3-Chinese-8B-Instruct",
        },
        "Llama-3-70B-Chinese-Chat": {
            DownloadSource.DEFAULT: "shenzhi-wang/Llama3-70B-Chinese-Chat",
        },
        "Llama-3.1-8B": {
            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3.1-8B",
            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3.1-8B",
        },
        "Llama-3.1-70B": {
            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3.1-70B",
            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3.1-70B",
        },
        "Llama-3.1-405B": {
            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3.1-405B",
            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3.1-405B",
        },
        "Llama-3.1-8B-Instruct": {
            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3.1-8B-Instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3.1-8B-Instruct",
        },
        "Llama-3.1-70B-Instruct": {
            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3.1-70B-Instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3.1-70B-Instruct",
        },
        "Llama-3.1-405B-Instruct": {
            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3.1-405B-Instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3.1-405B-Instruct",
        },
        "Llama-3.1-8B-Chinese-Chat": {
            DownloadSource.DEFAULT: "shenzhi-wang/Llama3.1-8B-Chinese-Chat",
            DownloadSource.MODELSCOPE: "XD_AI/Llama3.1-8B-Chinese-Chat",
        },
        "Llama-3.1-70B-Chinese-Chat": {
            DownloadSource.DEFAULT: "shenzhi-wang/Llama3.1-70B-Chinese-Chat",
            DownloadSource.MODELSCOPE: "XD_AI/Llama3.1-70B-Chinese-Chat",
        },
        "Llama-3.2-1B": {
            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-1B",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-1B",
        },
        "Llama-3.2-3B": {
            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-3B",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-3B",
        },
        "Llama-3.2-1B-Instruct": {
            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-1B-Instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-1B-Instruct",
        },
        "Llama-3.2-3B-Instruct": {
            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-3B-Instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-3B-Instruct",
        },
        "Llama-3.3-70B-Instruct": {
            DownloadSource.DEFAULT: "meta-llama/Llama-3.3-70B-Instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.3-70B-Instruct",
        },
    },
    template="llama3",
)


register_model_group(
    models={
        "Llama-3.2-11B-Vision": {
            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-11B-Vision",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-11B-Vision",
        },
        "Llama-3.2-11B-Vision-Instruct": {
            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-11B-Vision-Instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-11B-Vision-Instruct",
        },
        "Llama-3.2-90B-Vision": {
            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-90B-Vision",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-90B-Vision",
        },
        "Llama-3.2-90B-Vision-Instruct": {
            DownloadSource.DEFAULT: "meta-llama/Llama-3.2-90B-Vision-Instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama-3.2-90B-Vision-Instruct",
        },
    },
    template="mllama",
    multimodal=True,
)


register_model_group(
    models={
        "Llama-4-Scout-17B-16E": {
            DownloadSource.DEFAULT: "meta-llama/Llama-4-Scout-17B-16E",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama-4-Scout-17B-16E",
        },
        "Llama-4-Scout-17B-16E-Instruct": {
            DownloadSource.DEFAULT: "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama-4-Scout-17B-16E-Instruct",
        },
        "Llama-4-Maverick-17B-128E": {
            DownloadSource.DEFAULT: "meta-llama/Llama-4-Maverick-17B-128E",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama-4-Maverick-17B-128E",
        },
        "Llama-4-Maverick-17B-128E-Instruct": {
            DownloadSource.DEFAULT: "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Llama-4-Maverick-17B-128E-Instruct",
        },
    },
    template="llama4",
    multimodal=True,
)


register_model_group(
    models={
        "LLaVA-1.5-7B-Chat": {
            DownloadSource.DEFAULT: "llava-hf/llava-1.5-7b-hf",
            DownloadSource.MODELSCOPE: "swift/llava-1.5-7b-hf",
        },
        "LLaVA-1.5-13B-Chat": {
            DownloadSource.DEFAULT: "llava-hf/llava-1.5-13b-hf",
            DownloadSource.MODELSCOPE: "swift/llava-1.5-13b-hf",
        },
    },
    template="llava",
    multimodal=True,
)


register_model_group(
    models={
        "LLaVA-NeXT-7B-Chat": {
            DownloadSource.DEFAULT: "llava-hf/llava-v1.6-vicuna-7b-hf",
            DownloadSource.MODELSCOPE: "swift/llava-v1.6-vicuna-7b-hf",
        },
        "LLaVA-NeXT-13B-Chat": {
            DownloadSource.DEFAULT: "llava-hf/llava-v1.6-vicuna-13b-hf",
            DownloadSource.MODELSCOPE: "swift/llava-v1.6-vicuna-13b-hf",
        },
    },
    template="llava_next",
    multimodal=True,
)


register_model_group(
    models={
        "LLaVA-NeXT-Mistral-7B-Chat": {
            DownloadSource.DEFAULT: "llava-hf/llava-v1.6-mistral-7b-hf",
            DownloadSource.MODELSCOPE: "swift/llava-v1.6-mistral-7b-hf",
        },
    },
    template="llava_next_mistral",
    multimodal=True,
)


register_model_group(
    models={
        "LLaVA-NeXT-Llama3-8B-Chat": {
            DownloadSource.DEFAULT: "llava-hf/llama3-llava-next-8b-hf",
            DownloadSource.MODELSCOPE: "swift/llama3-llava-next-8b-hf",
        },
    },
    template="llava_next_llama3",
    multimodal=True,
)


register_model_group(
    models={
        "LLaVA-NeXT-34B-Chat": {
            DownloadSource.DEFAULT: "llava-hf/llava-v1.6-34b-hf",
            DownloadSource.MODELSCOPE: "LLM-Research/llava-v1.6-34b-hf",
        },
    },
    template="llava_next_yi",
    multimodal=True,
)


register_model_group(
    models={
        "LLaVA-NeXT-72B-Chat": {
            DownloadSource.DEFAULT: "llava-hf/llava-next-72b-hf",
            DownloadSource.MODELSCOPE: "AI-ModelScope/llava-next-72b-hf",
        },
        "LLaVA-NeXT-110B-Chat": {
            DownloadSource.DEFAULT: "llava-hf/llava-next-110b-hf",
            DownloadSource.MODELSCOPE: "AI-ModelScope/llava-next-110b-hf",
        },
    },
    template="llava_next_qwen",
    multimodal=True,
)


register_model_group(
    models={
        "LLaVA-NeXT-Video-7B-Chat": {
            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-7B-hf",
            DownloadSource.MODELSCOPE: "swift/LLaVA-NeXT-Video-7B-hf",
        },
        "LLaVA-NeXT-Video-7B-DPO-Chat": {
            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-7B-DPO-hf",
            DownloadSource.MODELSCOPE: "swift/LLaVA-NeXT-Video-7B-DPO-hf",
        },
    },
    template="llava_next_video",
    multimodal=True,
)


register_model_group(
    models={
        "LLaVA-NeXT-Video-7B-32k-Chat": {
            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-7B-32K-hf",
            DownloadSource.MODELSCOPE: "swift/LLaVA-NeXT-Video-7B-32K-hf",
        },
    },
    template="llava_next_video_mistral",
    multimodal=True,
)


register_model_group(
    models={
        "LLaVA-NeXT-Video-34B-Chat": {
            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-34B-hf",
            DownloadSource.MODELSCOPE: "swift/LLaVA-NeXT-Video-34B-hf",
        },
        "LLaVA-NeXT-Video-34B-DPO-Chat": {
            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-34B-DPO-hf",
        },
    },
    template="llava_next_video_yi",
    multimodal=True,
)


register_model_group(
    models={
        "Marco-o1-Chat": {
            DownloadSource.DEFAULT: "AIDC-AI/Marco-o1",
            DownloadSource.MODELSCOPE: "AIDC-AI/Marco-o1",
        },
    },
    template="marco",
)


register_model_group(
    models={
        "MiMo-7B-Base": {
            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-7B-Base",
            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-7B-Base",
        },
        "MiMo-7B-Instruct": {
            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-7B-SFT",
            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-7B-SFT",
        },
        "MiMo-7B-Instruct-RL": {
            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-7B-RL",
            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-7B-RL",
        },
        "MiMo-7B-RL-ZERO": {
            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-7B-RL-ZERO",
            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-7B-RL-ZERO",
        },
    },
    template="mimo",
)


register_model_group(
    models={
        "MiMo-7B-VL-Instruct": {
            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-VL-7B-SFT",
            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-VL-7B-SFT",
        },
        "MiMo-7B-VL-RL": {
            DownloadSource.DEFAULT: "XiaomiMiMo/MiMo-VL-7B-RL",
            DownloadSource.MODELSCOPE: "XiaomiMiMo/MiMo-VL-7B-RL",
        },
    },
    template="mimo_vl",
    multimodal=True,
)


register_model_group(
    models={
        "MiniCPM-2B-SFT-Chat": {
            DownloadSource.DEFAULT: "openbmb/MiniCPM-2B-sft-bf16",
            DownloadSource.MODELSCOPE: "OpenBMB/miniCPM-bf16",
        },
        "MiniCPM-2B-DPO-Chat": {
            DownloadSource.DEFAULT: "openbmb/MiniCPM-2B-dpo-bf16",
            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM-2B-dpo-bf16",
        },
    },
    template="cpm",
)


register_model_group(
    models={
        "MiniCPM3-4B-Chat": {
            DownloadSource.DEFAULT: "openbmb/MiniCPM3-4B",
            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM3-4B",
            DownloadSource.OPENMIND: "LlamaFactory/MiniCPM3-4B",
        },
    },
    template="cpm3",
)


register_model_group(
    models={
        "MiniCPM4-0.5B-Chat": {
            DownloadSource.DEFAULT: "openbmb/MiniCPM4-0.5B",
            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM4-0.5B",
        },
        "MiniCPM4-8B-Chat": {
            DownloadSource.DEFAULT: "openbmb/MiniCPM4-8B",
            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM4-8B",
        },
    },
    template="cpm4",
)


register_model_group(
    models={
        "MiniCPM-o-2_6": {
            DownloadSource.DEFAULT: "openbmb/MiniCPM-o-2_6",
            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM-o-2_6",
        },
    },
    template="minicpm_o",
    multimodal=True,
)


register_model_group(
    models={
        "MiniCPM-V-2_6": {
            DownloadSource.DEFAULT: "openbmb/MiniCPM-V-2_6",
            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM-V-2_6",
        },
    },
    template="minicpm_v",
    multimodal=True,
)


register_model_group(
    models={
        "MiniCPM-V-4": {
            DownloadSource.DEFAULT: "openbmb/MiniCPM-V-4",
            DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM-V-4",
        },
    },
    template="minicpm_v",
    multimodal=True,
)


register_model_group(
    models={
        "Ministral-8B-Instruct-2410": {
            DownloadSource.DEFAULT: "mistralai/Ministral-8B-Instruct-2410",
            DownloadSource.MODELSCOPE: "mistralai/Ministral-8B-Instruct-2410",
        },
        "Mistral-Nemo-Base-2407": {
            DownloadSource.DEFAULT: "mistralai/Mistral-Nemo-Base-2407",
            DownloadSource.MODELSCOPE: "LLM-Research/Mistral-Nemo-Base-2407",
        },
        "Mistral-Nemo-Instruct-2407": {
            DownloadSource.DEFAULT: "mistralai/Mistral-Nemo-Instruct-2407",
            DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-Nemo-Instruct-2407",
        },
    },
    template="ministral",
)


register_model_group(
    models={
        "Mistral-7B-v0.1": {
            DownloadSource.DEFAULT: "mistralai/Mistral-7B-v0.1",
            DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-v0.1",
        },
        "Mistral-7B-v0.2": {
            DownloadSource.DEFAULT: "alpindale/Mistral-7B-v0.2-hf",
            DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-v0.2-hf",
        },
        "Mistral-7B-v0.3": {
            DownloadSource.DEFAULT: "mistralai/Mistral-7B-v0.3",
            DownloadSource.MODELSCOPE: "LLM-Research/mistral-7b-v0.3",
        },
        "Mistral-7B-Instruct-v0.1": {
            DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.1",
            DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.1",
        },
        "Mistral-7B-Instruct-v0.2": {
            DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.2",
            DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.2",
        },
        "Mistral-7B-Instruct-v0.3": {
            DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.3",
            DownloadSource.MODELSCOPE: "LLM-Research/Mistral-7B-Instruct-v0.3",
        },
    },
    template="mistral",
)


register_model_group(
    models={
        "Mistral-Small-24B-Base-2501": {
            DownloadSource.DEFAULT: "mistralai/Mistral-Small-24B-Base-2501",
            DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-24B-Base-2501",
        },
        "Mistral-Small-24B-Instruct-2501": {
            DownloadSource.DEFAULT: "mistralai/Mistral-Small-24B-Instruct-2501",
            DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-24B-Instruct-2501",
        },
    },
    template="mistral_small",
)


register_model_group(
    models={
        "Mistral-Small-3.1-24B-Base": {
            DownloadSource.DEFAULT: "mistralai/Mistral-Small-3.1-24B-Base-2503",
            DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-3.1-24B-Base-2503",
        },
        "Mistral-Small-3.1-24B-Instruct": {
            DownloadSource.DEFAULT: "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
            DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        },
        "Mistral-Small-3.2-24B-Instruct": {
            DownloadSource.DEFAULT: "mistralai/Mistral-Small-3.2-24B-Instruct-2506",
            DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-3.2-24B-Instruct-2506",
        },
    },
    template="mistral_small",
    multimodal=True,
)


register_model_group(
    models={
        "Mixtral-8x7B-v0.1": {
            DownloadSource.DEFAULT: "mistralai/Mixtral-8x7B-v0.1",
            DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x7B-v0.1",
        },
        "Mixtral-8x22B-v0.1": {
            DownloadSource.DEFAULT: "mistralai/Mixtral-8x22B-v0.1",
            DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x22B-v0.1",
        },
        "Mixtral-8x7B-v0.1-Instruct": {
            DownloadSource.DEFAULT: "mistralai/Mixtral-8x7B-Instruct-v0.1",
            DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x7B-Instruct-v0.1",
        },
        "Mixtral-8x22B-v0.1-Instruct": {
            DownloadSource.DEFAULT: "mistralai/Mixtral-8x22B-Instruct-v0.1",
            DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x22B-Instruct-v0.1",
        },
    },
    template="mistral",
)


register_model_group(
    models={
        "Moonlight-16B-A3B": {
            DownloadSource.DEFAULT: "moonshotai/Moonlight-16B-A3B",
            DownloadSource.MODELSCOPE: "moonshotai/Moonlight-16B-A3B",
        },
        "Moonlight-16B-A3B-Instruct": {
            DownloadSource.DEFAULT: "moonshotai/Moonlight-16B-A3B-Instruct",
            DownloadSource.MODELSCOPE: "moonshotai/Moonlight-16B-A3B-Instruct",
        },
    },
    template="moonlight",
)


register_model_group(
    models={
        "OLMo-1B": {
            DownloadSource.DEFAULT: "allenai/OLMo-1B-hf",
        },
        "OLMo-7B": {
            DownloadSource.DEFAULT: "allenai/OLMo-7B-hf",
        },
        "OLMo-7B-Chat": {
            DownloadSource.DEFAULT: "ssec-uw/OLMo-7B-Instruct-hf",
        },
        "OLMo-1.7-7B": {
            DownloadSource.DEFAULT: "allenai/OLMo-1.7-7B-hf",
        },
    },
)


register_model_group(
    models={
        "OpenChat3.5-7B-Chat": {
            DownloadSource.DEFAULT: "openchat/openchat-3.5-0106",
            DownloadSource.MODELSCOPE: "xcwzxcwz/openchat-3.5-0106",
        }
    },
    template="openchat",
)


register_model_group(
    models={
        "OpenChat3.6-8B-Chat": {
            DownloadSource.DEFAULT: "openchat/openchat-3.6-8b-20240522",
        }
    },
    template="openchat-3.6",
)


register_model_group(
    models={
        "OpenCoder-1.5B-Base": {
            DownloadSource.DEFAULT: "infly/OpenCoder-1.5B-Base",
            DownloadSource.MODELSCOPE: "infly/OpenCoder-1.5B-Base",
        },
        "OpenCoder-8B-Base": {
            DownloadSource.DEFAULT: "infly/OpenCoder-8B-Base",
            DownloadSource.MODELSCOPE: "infly/OpenCoder-8B-Base",
        },
        "OpenCoder-1.5B-Instruct": {
            DownloadSource.DEFAULT: "infly/OpenCoder-1.5B-Instruct",
            DownloadSource.MODELSCOPE: "infly/OpenCoder-1.5B-Instruct",
        },
        "OpenCoder-8B-Instruct": {
            DownloadSource.DEFAULT: "infly/OpenCoder-8B-Instruct",
            DownloadSource.MODELSCOPE: "infly/OpenCoder-8B-Instruct",
        },
    },
    template="opencoder",
)


register_model_group(
    models={
        "Orion-14B-Base": {
            DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-Base",
            DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-Base",
        },
        "Orion-14B-Chat": {
            DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-Chat",
            DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-Chat",
        },
        "Orion-14B-Long-Chat": {
            DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-LongChat",
            DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-LongChat",
        },
        "Orion-14B-RAG-Chat": {
            DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-Chat-RAG",
            DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-Chat-RAG",
        },
        "Orion-14B-Plugin-Chat": {
            DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-Chat-Plugin",
            DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-Chat-Plugin",
        },
    },
    template="orion",
)


register_model_group(
    models={
        "PaliGemma-3B-pt-224": {
            DownloadSource.DEFAULT: "google/paligemma-3b-pt-224",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-pt-224",
        },
        "PaliGemma-3B-pt-448": {
            DownloadSource.DEFAULT: "google/paligemma-3b-pt-448",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-pt-448",
        },
        "PaliGemma-3B-pt-896": {
            DownloadSource.DEFAULT: "google/paligemma-3b-pt-896",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-pt-896",
        },
        "PaliGemma-3B-mix-224": {
            DownloadSource.DEFAULT: "google/paligemma-3b-mix-224",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-mix-224",
        },
        "PaliGemma-3B-mix-448": {
            DownloadSource.DEFAULT: "google/paligemma-3b-mix-448",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-mix-448",
        },
    },
    template="paligemma",
    multimodal=True,
)


register_model_group(
    models={
        "PaliGemma2-3B-pt-224": {
            DownloadSource.DEFAULT: "google/paligemma2-3b-pt-224",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-3b-pt-224",
        },
        "PaliGemma2-3B-pt-448": {
            DownloadSource.DEFAULT: "google/paligemma2-3b-pt-448",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-3b-pt-448",
        },
        "PaliGemma2-3B-pt-896": {
            DownloadSource.DEFAULT: "google/paligemma2-3b-pt-896",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-3b-pt-896",
        },
        "PaliGemma2-10B-pt-224": {
            DownloadSource.DEFAULT: "google/paligemma2-10b-pt-224",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-10b-pt-224",
        },
        "PaliGemma2-10B-pt-448": {
            DownloadSource.DEFAULT: "google/paligemma2-10b-pt-448",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-10b-pt-448",
        },
        "PaliGemma2-10B-pt-896": {
            DownloadSource.DEFAULT: "google/paligemma2-10b-pt-896",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-10b-pt-896",
        },
        "PaliGemma2-28B-pt-224": {
            DownloadSource.DEFAULT: "google/paligemma2-28b-pt-224",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-28b-pt-224",
        },
        "PaliGemma2-28B-pt-448": {
            DownloadSource.DEFAULT: "google/paligemma2-28b-pt-448",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-28b-pt-448",
        },
        "PaliGemma2-28B-pt-896": {
            DownloadSource.DEFAULT: "google/paligemma2-28b-pt-896",
            DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma2-28b-pt-896",
        },
        "PaliGemma2-3B-mix-224": {
            DownloadSource.DEFAULT: "google/paligemma2-3b-mix-224",
            DownloadSource.MODELSCOPE: "mlx-community/paligemma2-3b-mix-224-bf16",
        },
        "PaliGemma2-3B-mix-448": {
            DownloadSource.DEFAULT: "google/paligemma2-3b-mix-448",
            DownloadSource.MODELSCOPE: "mlx-community/paligemma2-3b-mix-448-bf16",
        },
        "PaliGemma2-10B-mix-224": {
            DownloadSource.DEFAULT: "google/paligemma2-10b-mix-224",
            DownloadSource.MODELSCOPE: "mlx-community/paligemma2-10b-mix-224-bf16",
        },
        "PaliGemma2-10B-mix-448": {
            DownloadSource.DEFAULT: "google/paligemma2-10b-mix-448",
            DownloadSource.MODELSCOPE: "mlx-community/paligemma2-10b-mix-448-bf16",
        },
        "PaliGemma2-28B-mix-224": {
            DownloadSource.DEFAULT: "google/paligemma2-28b-mix-224",
            DownloadSource.MODELSCOPE: "mlx-community/paligemma2-28b-mix-224-bf16",
        },
        "PaliGemma2-28B-mix-448": {
            DownloadSource.DEFAULT: "google/paligemma2-28b-mix-448",
            DownloadSource.MODELSCOPE: "mlx-community/paligemma2-28b-mix-448-bf16",
        },
    },
    template="paligemma",
    multimodal=True,
)


register_model_group(
    models={
        "Phi-1.5-1.3B": {
            DownloadSource.DEFAULT: "microsoft/phi-1_5",
            DownloadSource.MODELSCOPE: "allspace/PHI_1-5",
        },
        "Phi-2-2.7B": {
            DownloadSource.DEFAULT: "microsoft/phi-2",
            DownloadSource.MODELSCOPE: "AI-ModelScope/phi-2",
        },
    }
)


register_model_group(
    models={
        "Phi-3-4B-4k-Instruct": {
            DownloadSource.DEFAULT: "microsoft/Phi-3-mini-4k-instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-mini-4k-instruct",
        },
        "Phi-3-4B-128k-Instruct": {
            DownloadSource.DEFAULT: "microsoft/Phi-3-mini-128k-instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-mini-128k-instruct",
        },
        "Phi-3-14B-8k-Instruct": {
            DownloadSource.DEFAULT: "microsoft/Phi-3-medium-4k-instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-medium-4k-instruct",
        },
        "Phi-3-14B-128k-Instruct": {
            DownloadSource.DEFAULT: "microsoft/Phi-3-medium-128k-instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-medium-128k-instruct",
        },
        "Phi-3.5-4B-instruct": {
            DownloadSource.DEFAULT: "microsoft/Phi-3.5-mini-instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3.5-mini-instruct",
        },
        "Phi-3.5-MoE-42B-A6.6B-instruct": {
            DownloadSource.DEFAULT: "microsoft/Phi-3.5-MoE-instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3.5-MoE-instruct",
        },
    },
    template="phi",
)


register_model_group(
    models={
        "Phi-3-7B-8k-Instruct": {
            DownloadSource.DEFAULT: "microsoft/Phi-3-small-8k-instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-small-8k-instruct",
        },
        "Phi-3-7B-128k-Instruct": {
            DownloadSource.DEFAULT: "microsoft/Phi-3-small-128k-instruct",
            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-small-128k-instruct",
        },
    },
    template="phi_small",
)


register_model_group(
    models={
        "Phi-4-14B-Instruct": {
            DownloadSource.DEFAULT: "microsoft/phi-4",
            DownloadSource.MODELSCOPE: "LLM-Research/phi-4",
        },
    },
    template="phi4",
)


register_model_group(
    models={
        "Pixtral-12B": {
            DownloadSource.DEFAULT: "mistral-community/pixtral-12b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b",
        }
    },
    template="pixtral",
    multimodal=True,
)


register_model_group(
    models={
        "Qwen-1.8B": {
            DownloadSource.DEFAULT: "Qwen/Qwen-1_8B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-1_8B",
        },
        "Qwen-7B": {
            DownloadSource.DEFAULT: "Qwen/Qwen-7B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-7B",
        },
        "Qwen-14B": {
            DownloadSource.DEFAULT: "Qwen/Qwen-14B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-14B",
        },
        "Qwen-72B": {
            DownloadSource.DEFAULT: "Qwen/Qwen-72B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-72B",
        },
        "Qwen-1.8B-Chat": {
            DownloadSource.DEFAULT: "Qwen/Qwen-1_8B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-1_8B-Chat",
        },
        "Qwen-7B-Chat": {
            DownloadSource.DEFAULT: "Qwen/Qwen-7B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-7B-Chat",
        },
        "Qwen-14B-Chat": {
            DownloadSource.DEFAULT: "Qwen/Qwen-14B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-14B-Chat",
        },
        "Qwen-72B-Chat": {
            DownloadSource.DEFAULT: "Qwen/Qwen-72B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-72B-Chat",
        },
        "Qwen-1.8B-Chat-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen-1_8B-Chat-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-1_8B-Chat-Int8",
        },
        "Qwen-1.8B-Chat-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen-1_8B-Chat-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-1_8B-Chat-Int4",
        },
        "Qwen-7B-Chat-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen-7B-Chat-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-7B-Chat-Int8",
        },
        "Qwen-7B-Chat-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen-7B-Chat-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-7B-Chat-Int4",
        },
        "Qwen-14B-Chat-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen-14B-Chat-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-14B-Chat-Int8",
        },
        "Qwen-14B-Chat-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen-14B-Chat-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-14B-Chat-Int4",
        },
        "Qwen-72B-Chat-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen-72B-Chat-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-72B-Chat-Int8",
        },
        "Qwen-72B-Chat-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen-72B-Chat-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen-72B-Chat-Int4",
        },
    },
    template="qwen",
)


register_model_group(
    models={
        "Qwen1.5-0.5B": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-0.5B",
        },
        "Qwen1.5-1.8B": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-1.8B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-1.8B",
        },
        "Qwen1.5-4B": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-4B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-4B",
        },
        "Qwen1.5-7B": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-7B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-7B",
        },
        "Qwen1.5-14B": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-14B",
        },
        "Qwen1.5-32B": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-32B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-32B",
        },
        "Qwen1.5-72B": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-72B",
        },
        "Qwen1.5-110B": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-110B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-110B",
        },
        "Qwen1.5-MoE-A2.7B": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-MoE-A2.7B",
        },
        "Qwen1.5-0.5B-Chat": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-0.5B-Chat",
        },
        "Qwen1.5-1.8B-Chat": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-1.8B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-1.8B-Chat",
        },
        "Qwen1.5-4B-Chat": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-4B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-4B-Chat",
        },
        "Qwen1.5-7B-Chat": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-7B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-7B-Chat",
        },
        "Qwen1.5-14B-Chat": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-14B-Chat",
        },
        "Qwen1.5-32B-Chat": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-32B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-32B-Chat",
        },
        "Qwen1.5-72B-Chat": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-72B-Chat",
        },
        "Qwen1.5-110B-Chat": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-110B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-110B-Chat",
        },
        "Qwen1.5-MoE-A2.7B-Chat": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-MoE-A2.7B-Chat",
        },
        "Qwen1.5-0.5B-Chat-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8",
        },
        "Qwen1.5-0.5B-Chat-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-0.5B-Chat-AWQ",
        },
        "Qwen1.5-1.8B-Chat-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8",
        },
        "Qwen1.5-1.8B-Chat-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-1.8B-Chat-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-1.8B-Chat-AWQ",
        },
        "Qwen1.5-4B-Chat-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-4B-Chat-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-4B-Chat-GPTQ-Int8",
        },
        "Qwen1.5-4B-Chat-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-4B-Chat-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-4B-Chat-AWQ",
        },
        "Qwen1.5-7B-Chat-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-7B-Chat-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-7B-Chat-GPTQ-Int8",
        },
        "Qwen1.5-7B-Chat-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-7B-Chat-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-7B-Chat-AWQ",
        },
        "Qwen1.5-14B-Chat-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B-Chat-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-14B-Chat-GPTQ-Int8",
        },
        "Qwen1.5-14B-Chat-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B-Chat-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-14B-Chat-AWQ",
        },
        "Qwen1.5-32B-Chat-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-32B-Chat-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-32B-Chat-AWQ",
        },
        "Qwen1.5-72B-Chat-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-72B-Chat-GPTQ-Int8",
        },
        "Qwen1.5-72B-Chat-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-72B-Chat-AWQ",
        },
        "Qwen1.5-110B-Chat-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-110B-Chat-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-110B-Chat-AWQ",
        },
        "Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
        },
        "CodeQwen1.5-7B": {
            DownloadSource.DEFAULT: "Qwen/CodeQwen1.5-7B",
            DownloadSource.MODELSCOPE: "Qwen/CodeQwen1.5-7B",
        },
        "CodeQwen1.5-7B-Chat": {
            DownloadSource.DEFAULT: "Qwen/CodeQwen1.5-7B-Chat",
            DownloadSource.MODELSCOPE: "Qwen/CodeQwen1.5-7B-Chat",
        },
        "CodeQwen1.5-7B-Chat-AWQ": {
            DownloadSource.DEFAULT: "Qwen/CodeQwen1.5-7B-Chat-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/CodeQwen1.5-7B-Chat-AWQ",
        },
    },
    template="qwen",
)


register_model_group(
    models={
        "Qwen2-0.5B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-0.5B",
        },
        "Qwen2-1.5B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-1.5B",
        },
        "Qwen2-7B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-7B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-7B",
        },
        "Qwen2-72B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-72B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-72B",
        },
        "Qwen2-MoE-57B-A14B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-57B-A14B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-57B-A14B",
        },
        "Qwen2-0.5B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-0.5B-Instruct",
            DownloadSource.OPENMIND: "LlamaFactory/Qwen2-0.5B-Instruct",
        },
        "Qwen2-1.5B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-1.5B-Instruct",
            DownloadSource.OPENMIND: "LlamaFactory/Qwen2-1.5B-Instruct",
        },
        "Qwen2-7B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-7B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-7B-Instruct",
            DownloadSource.OPENMIND: "LlamaFactory/Qwen2-7B-Instruct",
        },
        "Qwen2-72B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-72B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-72B-Instruct",
        },
        "Qwen2-MoE-57B-A14B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-57B-A14B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-57B-A14B-Instruct",
        },
        "Qwen2-0.5B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-0.5B-Instruct-GPTQ-Int8",
        },
        "Qwen2-0.5B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-0.5B-Instruct-GPTQ-Int4",
        },
        "Qwen2-0.5B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-0.5B-Instruct-AWQ",
        },
        "Qwen2-1.5B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int8",
        },
        "Qwen2-1.5B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
        },
        "Qwen2-1.5B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-1.5B-Instruct-AWQ",
        },
        "Qwen2-7B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-7B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-7B-Instruct-GPTQ-Int8",
        },
        "Qwen2-7B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-7B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-7B-Instruct-GPTQ-Int4",
        },
        "Qwen2-7B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-7B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-7B-Instruct-AWQ",
        },
        "Qwen2-72B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-72B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-72B-Instruct-GPTQ-Int8",
        },
        "Qwen2-72B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-72B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-72B-Instruct-GPTQ-Int4",
        },
        "Qwen2-72B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-72B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-72B-Instruct-AWQ",
        },
        "Qwen2-57B-A14B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4",
        },
        "Qwen2-Math-1.5B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-Math-1.5B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Math-1.5B",
        },
        "Qwen2-Math-7B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-Math-7B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Math-7B",
        },
        "Qwen2-Math-72B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-Math-72B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Math-72B",
        },
        "Qwen2-Math-1.5B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-Math-1.5B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Math-1.5B-Instruct",
        },
        "Qwen2-Math-7B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-Math-7B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Math-7B-Instruct",
        },
        "Qwen2-Math-72B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-Math-72B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Math-72B-Instruct",
        },
    },
    template="qwen",
)


register_model_group(
    models={
        "Qwen2.5-0.5B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-0.5B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-0.5B",
        },
        "Qwen2.5-1.5B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-1.5B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-1.5B",
        },
        "Qwen2.5-3B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-3B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-3B",
        },
        "Qwen2.5-7B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-7B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-7B",
        },
        "Qwen2.5-14B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-14B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-14B",
        },
        "Qwen2.5-32B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-32B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-32B",
        },
        "Qwen2.5-72B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-72B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-72B",
        },
        "Qwen2.5-0.5B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-0.5B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-0.5B-Instruct",
        },
        "Qwen2.5-1.5B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-1.5B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-1.5B-Instruct",
        },
        "Qwen2.5-3B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-3B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-3B-Instruct",
        },
        "Qwen2.5-7B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-7B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-7B-Instruct",
        },
        "Qwen2.5-14B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-14B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-14B-Instruct",
        },
        "Qwen2.5-32B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-32B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-32B-Instruct",
        },
        "Qwen2.5-72B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-72B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-72B-Instruct",
        },
        "Qwen2.5-7B-Instruct-1M": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-7B-Instruct-1M",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-7B-Instruct-1M",
        },
        "Qwen2.5-14B-Instruct-1M": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-14B-Instruct-1M",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-14B-Instruct-1M",
        },
        "Qwen2.5-0.5B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8",
        },
        "Qwen2.5-0.5B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4",
        },
        "Qwen2.5-0.5B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-0.5B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-0.5B-Instruct-AWQ",
        },
        "Qwen2.5-1.5B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8",
        },
        "Qwen2.5-1.5B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4",
        },
        "Qwen2.5-1.5B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-1.5B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-1.5B-Instruct-AWQ",
        },
        "Qwen2.5-3B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8",
        },
        "Qwen2.5-3B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4",
        },
        "Qwen2.5-3B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-3B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-3B-Instruct-AWQ",
        },
        "Qwen2.5-7B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8",
        },
        "Qwen2.5-7B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4",
        },
        "Qwen2.5-7B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-7B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-7B-Instruct-AWQ",
        },
        "Qwen2.5-14B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8",
        },
        "Qwen2.5-14B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4",
        },
        "Qwen2.5-14B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-14B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-14B-Instruct-AWQ",
        },
        "Qwen2.5-32B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8",
        },
        "Qwen2.5-32B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4",
        },
        "Qwen2.5-32B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-32B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-32B-Instruct-AWQ",
        },
        "Qwen2.5-72B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8",
        },
        "Qwen2.5-72B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4",
        },
        "Qwen2.5-72B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-72B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-72B-Instruct-AWQ",
        },
        "Qwen2.5-Coder-0.5B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-0.5B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-0.5B",
        },
        "Qwen2.5-Coder-1.5B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-1.5B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-1.5B",
        },
        "Qwen2.5-Coder-3B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-3B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-3B",
        },
        "Qwen2.5-Coder-7B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-7B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-7B",
        },
        "Qwen2.5-Coder-14B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-14B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-14B",
        },
        "Qwen2.5-Coder-32B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-32B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-32B",
        },
        "Qwen2.5-Coder-0.5B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-0.5B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-0.5B-Instruct",
        },
        "Qwen2.5-Coder-1.5B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-1.5B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-1.5B-Instruct",
        },
        "Qwen2.5-Coder-3B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-3B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-3B-Instruct",
        },
        "Qwen2.5-Coder-7B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-7B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-7B-Instruct",
        },
        "Qwen2.5-Coder-14B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-14B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-14B-Instruct",
        },
        "Qwen2.5-Coder-32B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Coder-32B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-32B-Instruct",
        },
        "Qwen2.5-Math-1.5B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Math-1.5B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Math-1.5B",
        },
        "Qwen2.5-Math-7B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Math-7B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Math-7B",
        },
        "Qwen2.5-Math-72B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Math-72B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Math-72B",
        },
        "Qwen2.5-Math-1.5B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Math-1.5B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-1.5B-Instruct",
        },
        "Qwen2.5-Math-7B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Math-7B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-7B-Instruct",
        },
        "Qwen2.5-Math-72B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Math-72B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Coder-72B-Instruct",
        },
        "QwQ-32B-Preview-Instruct": {
            DownloadSource.DEFAULT: "Qwen/QwQ-32B-Preview",
            DownloadSource.MODELSCOPE: "Qwen/QwQ-32B-Preview",
        },
        "QwQ-32B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/QwQ-32B",
            DownloadSource.MODELSCOPE: "Qwen/QwQ-32B",
        },
    },
    template="qwen",
)


register_model_group(
    models={
        "Qwen3-0.6B-Base": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-0.6B-Base",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-0.6B-Base",
        },
        "Qwen3-1.7B-Base": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-1.7B-Base",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-1.7B-Base",
        },
        "Qwen3-4B-Base": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-4B-Base",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-Base",
        },
        "Qwen3-8B-Base": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-8B-Base",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-8B-Base",
        },
        "Qwen3-14B-Base": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-14B-Base",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-14B-Base",
        },
        "Qwen3-30B-A3B-Base": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-Base",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-Base",
        },
        "Qwen3-0.6B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-0.6B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-0.6B",
        },
        "Qwen3-1.7B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-1.7B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-1.7B",
        },
        "Qwen3-4B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-4B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B",
        },
        "Qwen3-4B-Thinking-2507": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-4B-Thinking-2507",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-Thinking-2507",
        },
        "Qwen3-8B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-8B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-8B",
        },
        "Qwen3-14B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-14B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-14B",
        },
        "Qwen3-32B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-32B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-32B",
        },
        "Qwen3-30B-A3B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B",
        },
        "Qwen3-30B-A3B-Thinking-2507": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-Thinking-2507",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-Thinking-2507",
        },
        "Qwen3-235B-A22B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B",
        },
        "Qwen3-235B-A22B-Thinking-2507": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-Thinking-2507",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-Thinking-2507",
        },
        "Qwen3-0.6B-Thinking-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-0.6B-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-0.6B-GPTQ-Int8",
        },
        "Qwen3-1.7B-Thinking-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-1.7B-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-1.7B-GPTQ-Int8",
        },
        "Qwen3-4B-Thinking-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-4B-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-AWQ",
        },
        "Qwen3-8B-Thinking-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-8B-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-8B-AWQ",
        },
        "Qwen3-14B-Thinking-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-14B-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-14B-AWQ",
        },
        "Qwen3-32B-Thinking-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-32B-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-32B-AWQ",
        },
        "Qwen3-30B-A3B-Thinking-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-GPTQ-Int4",
        },
        "Qwen3-235B-A22B-Thinking-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-GPTQ-Int4",
        },
    },
    template="qwen3",
)


register_model_group(
    models={
        "Qwen3-4B-Instruct-2507": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-4B-Instruct-2507",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-Instruct-2507",
        },
        "Qwen3-30B-A3B-Instruct-2507": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-Instruct-2507",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-Instruct-2507",
        },
        "Qwen3-235B-A22B-Instruct-2507": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-Instruct-2507",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-Instruct-2507",
        },
    },
    template="qwen3_nothink",
)


register_model_group(
    models={
        "Qwen2-Audio-7B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-Audio-7B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Audio-7B",
        },
        "Qwen2-Audio-7B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-Audio-7B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-Audio-7B-Instruct",
        },
    },
    template="qwen2_audio",
    multimodal=True,
)


register_model_group(
    models={
        "Qwen2.5-Omni-3B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Omni-3B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Omni-3B",
        },
        "Qwen2.5-Omni-7B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Omni-7B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Omni-7B",
        },
        "Qwen2.5-Omni-7B-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Omni-7B-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Omni-7B-GPTQ-Int4",
        },
        "Qwen2.5-Omni-7B-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-Omni-7B-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-Omni-7B-AWQ",
        },
    },
    template="qwen2_omni",
    multimodal=True,
)


register_model_group(
    models={
        "Qwen2-VL-2B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-2B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-2B",
        },
        "Qwen2-VL-7B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-7B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-7B",
        },
        "Qwen2-VL-72B": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-72B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-72B",
        },
        "Qwen2-VL-2B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-2B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-2B-Instruct",
            DownloadSource.OPENMIND: "LlamaFactory/Qwen2-VL-2B-Instruct",
        },
        "Qwen2-VL-7B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-7B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-7B-Instruct",
            DownloadSource.OPENMIND: "LlamaFactory/Qwen2-VL-7B-Instruct",
        },
        "Qwen2-VL-72B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-72B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-72B-Instruct",
        },
        "Qwen2-VL-2B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8",
        },
        "Qwen2-VL-2B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4",
        },
        "Qwen2-VL-2B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-2B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-2B-Instruct-AWQ",
        },
        "Qwen2-VL-7B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8",
        },
        "Qwen2-VL-7B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4",
        },
        "Qwen2-VL-7B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-7B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-7B-Instruct-AWQ",
        },
        "Qwen2-VL-72B-Instruct-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8",
        },
        "Qwen2-VL-72B-Instruct-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4",
        },
        "Qwen2-VL-72B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2-VL-72B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2-VL-72B-Instruct-AWQ",
        },
        "QVQ-72B-Preview": {
            DownloadSource.DEFAULT: "Qwen/QVQ-72B-Preview",
            DownloadSource.MODELSCOPE: "Qwen/QVQ-72B-Preview",
        },
        "Qwen2.5-VL-3B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-3B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-3B-Instruct",
        },
        "Qwen2.5-VL-7B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-7B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-7B-Instruct",
        },
        "Qwen2.5-VL-32B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-32B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-32B-Instruct",
        },
        "Qwen2.5-VL-72B-Instruct": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-72B-Instruct",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-72B-Instruct",
        },
        "Qwen2.5-VL-3B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-3B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-3B-Instruct-AWQ",
        },
        "Qwen2.5-VL-7B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
        },
        "Qwen2.5-VL-72B-Instruct-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen2.5-VL-72B-Instruct-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen2.5-VL-72B-Instruct-AWQ",
        },
    },
    template="qwen2_vl",
    multimodal=True,
)


register_model_group(
    models={
        "Seed-Coder-8B-Base": {
            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-Coder-8B-Base",
        },
        "Seed-Coder-8B-Instruct": {
            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-Coder-8B-Instruct",
        },
        "Seed-Coder-8B-Instruct-Reasoning": {
            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16",
        },
    },
    template="seed_coder",
)


register_model_group(
    models={
        "Skywork-13B-Base": {
            DownloadSource.DEFAULT: "Skywork/Skywork-13B-base",
            DownloadSource.MODELSCOPE: "skywork/Skywork-13B-base",
        }
    }
)


register_model_group(
    models={
        "Skywork-o1-Open-Llama-3.1-8B": {
            DownloadSource.DEFAULT: "Skywork/Skywork-o1-Open-Llama-3.1-8B",
            DownloadSource.MODELSCOPE: "AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B",
        }
    },
    template="skywork_o1",
)


register_model_group(
    models={
        "SmolLM-135M": {
            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM-135M",
            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM-135M",
        },
        "SmolLM-360M": {
            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM-360M",
            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM-360M",
        },
        "SmolLM-1.7B": {
            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM-1.7B",
            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM-1.7B",
        },
        "SmolLM-135M-Instruct": {
            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM-135M-Instruct",
            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM-135M-Instruct",
        },
        "SmolLM-360M-Instruct": {
            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM-360M-Instruct",
            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM-360M-Instruct",
        },
        "SmolLM-1.7B-Instruct": {
            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM-1.7B-Instruct",
            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM-1.7B-Instruct",
        },
    },
    template="smollm",
)


register_model_group(
    models={
        "SmolLM2-135M": {
            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM2-135M",
            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM2-135M",
        },
        "SmolLM2-360M": {
            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM2-360M",
            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM2-360M",
        },
        "SmolLM2-1.7B": {
            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM2-1.7B",
            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM2-1.7B",
        },
        "SmolLM2-135M-Instruct": {
            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM2-135M-Instruct",
            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM2-135M-Instruct",
        },
        "SmolLM2-360M-Instruct": {
            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM2-360M-Instruct",
            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM2-360M-Instruct",
        },
        "SmolLM2-1.7B-Instruct": {
            DownloadSource.DEFAULT: "HuggingFaceTB/SmolLM2-1.7B-Instruct",
            DownloadSource.MODELSCOPE: "HuggingFaceTB/SmolLM2-1.7B-Instruct",
        },
    },
    template="smollm2",
)


register_model_group(
    models={
        "SOLAR-10.7B-v1.0": {
            DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-v1.0",
        },
        "SOLAR-10.7B-Instruct-v1.0": {
            DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-Instruct-v1.0",
            DownloadSource.MODELSCOPE: "AI-ModelScope/SOLAR-10.7B-Instruct-v1.0",
        },
    },
    template="solar",
)


register_model_group(
    models={
        "StarCoder2-3B": {
            DownloadSource.DEFAULT: "bigcode/starcoder2-3b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/starcoder2-3b",
        },
        "StarCoder2-7B": {
            DownloadSource.DEFAULT: "bigcode/starcoder2-7b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/starcoder2-7b",
        },
        "StarCoder2-15B": {
            DownloadSource.DEFAULT: "bigcode/starcoder2-15b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/starcoder2-15b",
        },
    }
)


register_model_group(
    models={
        "TeleChat-1B-Chat": {
            DownloadSource.DEFAULT: "Tele-AI/TeleChat-1B",
            DownloadSource.MODELSCOPE: "TeleAI/TeleChat-1B",
        },
        "TeleChat-7B-Chat": {
            DownloadSource.DEFAULT: "Tele-AI/telechat-7B",
            DownloadSource.MODELSCOPE: "TeleAI/telechat-7B",
            DownloadSource.OPENMIND: "TeleAI/TeleChat-7B-pt",
        },
        "TeleChat-12B-Chat": {
            DownloadSource.DEFAULT: "Tele-AI/TeleChat-12B-v2",
            DownloadSource.MODELSCOPE: "TeleAI/TeleChat-12B-v2",
            DownloadSource.OPENMIND: "TeleAI/TeleChat-12B-pt",
        },
        "TeleChat-52B-Chat": {
            DownloadSource.DEFAULT: "Tele-AI/TeleChat-52B",
        },
    },
    template="telechat",
)


register_model_group(
    models={
        "TeleChat2-3B-Chat": {
            DownloadSource.DEFAULT: "Tele-AI/TeleChat2-3B",
            DownloadSource.MODELSCOPE: "TeleAI/TeleChat2-3B",
        },
        "TeleChat2-7B-Chat": {
            DownloadSource.DEFAULT: "Tele-AI/TeleChat2-7B",
            DownloadSource.MODELSCOPE: "TeleAI/TeleChat2-7B",
        },
        "TeleChat2-35B-Chat": {
            DownloadSource.MODELSCOPE: "TeleAI/TeleChat2-35B-Nov",
        },
        "TeleChat2-115B-Chat": {
            DownloadSource.DEFAULT: "Tele-AI/TeleChat2-115B",
            DownloadSource.MODELSCOPE: "TeleAI/TeleChat2-115B",
        },
    },
    template="telechat2",
)


register_model_group(
    models={
        "Vicuna-v1.5-7B-Chat": {
            DownloadSource.DEFAULT: "lmsys/vicuna-7b-v1.5",
            DownloadSource.MODELSCOPE: "Xorbits/vicuna-7b-v1.5",
        },
        "Vicuna-v1.5-13B-Chat": {
            DownloadSource.DEFAULT: "lmsys/vicuna-13b-v1.5",
            DownloadSource.MODELSCOPE: "Xorbits/vicuna-13b-v1.5",
        },
    },
    template="vicuna",
)


register_model_group(
    models={
        "Video-LLaVA-7B-Chat": {
            DownloadSource.DEFAULT: "LanguageBind/Video-LLaVA-7B-hf",
        },
    },
    template="video_llava",
    multimodal=True,
)


register_model_group(
    models={
        "XuanYuan-6B": {
            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B",
            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B",
        },
        "XuanYuan-70B": {
            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B",
            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B",
        },
        "XuanYuan2-70B": {
            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B",
            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B",
        },
        "XuanYuan-6B-Chat": {
            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B-Chat",
            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B-Chat",
        },
        "XuanYuan-70B-Chat": {
            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat",
            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B-Chat",
        },
        "XuanYuan2-70B-Chat": {
            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B-Chat",
            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B-Chat",
        },
        "XuanYuan-6B-Chat-8bit": {
            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B-Chat-8bit",
            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B-Chat-8bit",
        },
        "XuanYuan-6B-Chat-4bit": {
            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B-Chat-4bit",
            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B-Chat-4bit",
        },
        "XuanYuan-70B-Chat-8bit": {
            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat-8bit",
            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B-Chat-8bit",
        },
        "XuanYuan-70B-Chat-4bit": {
            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat-4bit",
            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B-Chat-4bit",
        },
        "XuanYuan2-70B-Chat-8bit": {
            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B-Chat-8bit",
            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B-Chat-8bit",
        },
        "XuanYuan2-70B-Chat-4bit": {
            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B-Chat-4bit",
            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B-Chat-4bit",
        },
    },
    template="xuanyuan",
)


register_model_group(
    models={
        "XVERSE-7B": {
            DownloadSource.DEFAULT: "xverse/XVERSE-7B",
            DownloadSource.MODELSCOPE: "xverse/XVERSE-7B",
        },
        "XVERSE-13B": {
            DownloadSource.DEFAULT: "xverse/XVERSE-13B",
            DownloadSource.MODELSCOPE: "xverse/XVERSE-13B",
        },
        "XVERSE-65B": {
            DownloadSource.DEFAULT: "xverse/XVERSE-65B",
            DownloadSource.MODELSCOPE: "xverse/XVERSE-65B",
        },
        "XVERSE-65B-2": {
            DownloadSource.DEFAULT: "xverse/XVERSE-65B-2",
            DownloadSource.MODELSCOPE: "xverse/XVERSE-65B-2",
        },
        "XVERSE-7B-Chat": {
            DownloadSource.DEFAULT: "xverse/XVERSE-7B-Chat",
            DownloadSource.MODELSCOPE: "xverse/XVERSE-7B-Chat",
        },
        "XVERSE-13B-Chat": {
            DownloadSource.DEFAULT: "xverse/XVERSE-13B-Chat",
            DownloadSource.MODELSCOPE: "xverse/XVERSE-13B-Chat",
        },
        "XVERSE-65B-Chat": {
            DownloadSource.DEFAULT: "xverse/XVERSE-65B-Chat",
            DownloadSource.MODELSCOPE: "xverse/XVERSE-65B-Chat",
        },
        "XVERSE-MoE-A4.2B": {
            DownloadSource.DEFAULT: "xverse/XVERSE-MoE-A4.2B",
            DownloadSource.MODELSCOPE: "xverse/XVERSE-MoE-A4.2B",
        },
        "XVERSE-7B-Chat-GPTQ-Int8": {
            DownloadSource.DEFAULT: "xverse/XVERSE-7B-Chat-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "xverse/XVERSE-7B-Chat-GPTQ-Int8",
        },
        "XVERSE-7B-Chat-GPTQ-Int4": {
            DownloadSource.DEFAULT: "xverse/XVERSE-7B-Chat-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "xverse/XVERSE-7B-Chat-GPTQ-Int4",
        },
        "XVERSE-13B-Chat-GPTQ-Int8": {
            DownloadSource.DEFAULT: "xverse/XVERSE-13B-Chat-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "xverse/XVERSE-13B-Chat-GPTQ-Int8",
        },
        "XVERSE-13B-Chat-GPTQ-Int4": {
            DownloadSource.DEFAULT: "xverse/XVERSE-13B-Chat-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "xverse/XVERSE-13B-Chat-GPTQ-Int4",
        },
        "XVERSE-65B-Chat-GPTQ-Int4": {
            DownloadSource.DEFAULT: "xverse/XVERSE-65B-Chat-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "xverse/XVERSE-65B-Chat-GPTQ-Int4",
        },
    },
    template="xverse",
)


register_model_group(
    models={
        "Yayi-7B": {
            DownloadSource.DEFAULT: "wenge-research/yayi-7b-llama2",
            DownloadSource.MODELSCOPE: "AI-ModelScope/yayi-7b-llama2",
        },
        "Yayi-13B": {
            DownloadSource.DEFAULT: "wenge-research/yayi-13b-llama2",
            DownloadSource.MODELSCOPE: "AI-ModelScope/yayi-13b-llama2",
        },
    },
    template="yayi",
)


register_model_group(
    models={
        "Yi-6B": {
            DownloadSource.DEFAULT: "01-ai/Yi-6B",
            DownloadSource.MODELSCOPE: "01ai/Yi-6B",
        },
        "Yi-9B": {
            DownloadSource.DEFAULT: "01-ai/Yi-9B",
            DownloadSource.MODELSCOPE: "01ai/Yi-9B",
        },
        "Yi-34B": {
            DownloadSource.DEFAULT: "01-ai/Yi-34B",
            DownloadSource.MODELSCOPE: "01ai/Yi-34B",
        },
        "Yi-6B-Chat": {
            DownloadSource.DEFAULT: "01-ai/Yi-6B-Chat",
            DownloadSource.MODELSCOPE: "01ai/Yi-6B-Chat",
        },
        "Yi-34B-Chat": {
            DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat",
            DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat",
        },
        "Yi-6B-Chat-8bits": {
            DownloadSource.DEFAULT: "01-ai/Yi-6B-Chat-8bits",
            DownloadSource.MODELSCOPE: "01ai/Yi-6B-Chat-8bits",
        },
        "Yi-6B-Chat-4bits": {
            DownloadSource.DEFAULT: "01-ai/Yi-6B-Chat-4bits",
            DownloadSource.MODELSCOPE: "01ai/Yi-6B-Chat-4bits",
        },
        "Yi-34B-Chat-8bits": {
            DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat-8bits",
            DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat-8bits",
        },
        "Yi-34B-Chat-4bits": {
            DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat-4bits",
            DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat-4bits",
        },
        "Yi-1.5-6B": {
            DownloadSource.DEFAULT: "01-ai/Yi-1.5-6B",
            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-6B",
        },
        "Yi-1.5-9B": {
            DownloadSource.DEFAULT: "01-ai/Yi-1.5-9B",
            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-9B",
        },
        "Yi-1.5-34B": {
            DownloadSource.DEFAULT: "01-ai/Yi-1.5-34B",
            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-34B",
        },
        "Yi-1.5-6B-Chat": {
            DownloadSource.DEFAULT: "01-ai/Yi-1.5-6B-Chat",
            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-6B-Chat",
            DownloadSource.OPENMIND: "LlamaFactory/Yi-1.5-6B-Chat",
        },
        "Yi-1.5-9B-Chat": {
            DownloadSource.DEFAULT: "01-ai/Yi-1.5-9B-Chat",
            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-9B-Chat",
        },
        "Yi-1.5-34B-Chat": {
            DownloadSource.DEFAULT: "01-ai/Yi-1.5-34B-Chat",
            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-34B-Chat",
        },
        "Yi-Coder-1.5B": {
            DownloadSource.DEFAULT: "01-ai/Yi-Coder-1.5B",
            DownloadSource.MODELSCOPE: "01ai/Yi-Coder-1.5B",
        },
        "Yi-Coder-9B": {
            DownloadSource.DEFAULT: "01-ai/Yi-Coder-9B",
            DownloadSource.MODELSCOPE: "01ai/Yi-Coder-9B",
        },
        "Yi-Coder-1.5B-Chat": {
            DownloadSource.DEFAULT: "01-ai/Yi-Coder-1.5B-Chat",
            DownloadSource.MODELSCOPE: "01ai/Yi-Coder-1.5B-Chat",
        },
        "Yi-Coder-9B-Chat": {
            DownloadSource.DEFAULT: "01-ai/Yi-Coder-9B-Chat",
            DownloadSource.MODELSCOPE: "01ai/Yi-Coder-9B-Chat",
        },
    },
    template="yi",
)


register_model_group(
    models={
        "Yi-VL-6B-Chat": {
            DownloadSource.DEFAULT: "BUAADreamer/Yi-VL-6B-hf",
        },
        "Yi-VL-34B-Chat": {
            DownloadSource.DEFAULT: "BUAADreamer/Yi-VL-34B-hf",
        },
    },
    template="yi_vl",
    multimodal=True,
)


register_model_group(
    models={
        "Yuan2-2B-Chat": {
            DownloadSource.DEFAULT: "IEITYuan/Yuan2-2B-hf",
            DownloadSource.MODELSCOPE: "YuanLLM/Yuan2.0-2B-hf",
        },
        "Yuan2-51B-Chat": {
            DownloadSource.DEFAULT: "IEITYuan/Yuan2-51B-hf",
            DownloadSource.MODELSCOPE: "YuanLLM/Yuan2.0-51B-hf",
        },
        "Yuan2-102B-Chat": {
            DownloadSource.DEFAULT: "IEITYuan/Yuan2-102B-hf",
            DownloadSource.MODELSCOPE: "YuanLLM/Yuan2.0-102B-hf",
        },
    },
    template="yuan",
)


register_model_group(
    models={
        "Zephyr-7B-Alpha-Chat": {
            DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-7b-alpha",
            DownloadSource.MODELSCOPE: "AI-ModelScope/zephyr-7b-alpha",
        },
        "Zephyr-7B-Beta-Chat": {
            DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-7b-beta",
            DownloadSource.MODELSCOPE: "modelscope/zephyr-7b-beta",
        },
        "Zephyr-141B-ORPO-Chat": {
            DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1",
        },
    },
    template="zephyr",
)


================================================
FILE: kt-sft/ktransformers/sft/metrics_utils/env.py
================================================
# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/commands/env.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import platform

import accelerate
import datasets
import peft
import torch
import transformers
# import trl
from transformers.utils import is_torch_cuda_available, is_torch_npu_available


VERSION = "0.9.4.dev0"


def print_env() -> None:
    info = {
        "`llamafactory` version": VERSION,
        "Platform": platform.platform(),
        "Python version": platform.python_version(),
        "PyTorch version": torch.__version__,
        "Transformers version": transformers.__version__,
        "Datasets version": datasets.__version__,
        "Accelerate version": accelerate.__version__,
        "PEFT version": peft.__version__,
        "TRL version": "0.21.0",
    }

    if is_torch_cuda_available():
        info["PyTorch version"] += " (GPU)"
        info["GPU type"] = torch.cuda.get_device_name()
        info["GPU number"] = torch.cuda.device_count()
        info["GPU memory"] = f"{torch.cuda.mem_get_info()[1] / (1024**3):.2f}GB"

    if is_torch_npu_available():
        info["PyTorch version"] += " (NPU)"
        info["NPU type"] = torch.npu.get_device_name()
        info["CANN version"] = torch.version.cann

    try:
        import deepspeed  # type: ignore

        info["DeepSpeed version"] = deepspeed.__version__
    except Exception:
        pass

    try:
        import bitsandbytes  # type: ignore

        info["Bitsandbytes version"] = bitsandbytes.__version__
    except Exception:
        pass

    try:
        import vllm

        info["vLLM version"] = vllm.__version__
    except Exception:
        pass

    try:
        import subprocess

        commit_info = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True)
        commit_hash = commit_info.stdout.strip()
        info["Git commit"] = commit_hash
    except Exception:
        pass

    if os.path.exists("data"):
        info["Default data directory"] = "detected"
    else:
        info["Default data directory"] = "not detected"

    print("\n" + "\n".join([f"- {key}: {value}" for key, value in info.items()]) + "\n")


================================================
FILE: kt-sft/ktransformers/sft/metrics_utils/logging.py
================================================
# Copyright 2025 Optuna, HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/utils/logging.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import sys
import threading
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
from typing import Optional

from .constants import RUNNING_LOG


_thread_lock = threading.RLock()
_default_handler: Optional["logging.Handler"] = None
_default_log_level: "logging._Level" = logging.INFO


class LoggerHandler(logging.Handler):
    r"""Redirect the logging output to the logging file for LLaMA Board."""

    def __init__(self, output_dir: str) -> None:
        super().__init__()
        self._formatter = logging.Formatter(
            fmt="[%(levelname)s|%(asctime)s] %(filename)s:%(lineno)s >> %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        self.setLevel(logging.INFO)
        os.makedirs(output_dir, exist_ok=True)
        self.running_log = os.path.join(output_dir, RUNNING_LOG)
        if os.path.exists(self.running_log):
            os.remove(self.running_log)

        self.thread_pool = ThreadPoolExecutor(max_workers=1)

    def _write_log(self, log_entry: str) -> None:
        with open(self.running_log, "a", encoding="utf-8") as f:
            f.write(log_entry + "\n")

    def emit(self, record) -> None:
        if record.name == "httpx":
            return

        log_entry = self._formatter.format(record)
        self.thread_pool.submit(self._write_log, log_entry)

    def close(self) -> None:
        self.thread_pool.shutdown(wait=True)
        return super().close()


class _Logger(logging.Logger):
    r"""A logger that supports rank0 logging."""

    def info_rank0(self, *args, **kwargs) -> None:
        self.info(*args, **kwargs)

    def warning_rank0(self, *args, **kwargs) -> None:
        self.warning(*args, **kwargs)

    def warning_rank0_once(self, *args, **kwargs) -> None:
        self.warning(*args, **kwargs)


def _get_default_logging_level() -> "logging._Level":
    r"""Return the default logging level."""
    env_level_str = os.getenv("LLAMAFACTORY_VERBOSITY", None)
    if env_level_str:
        if env_level_str.upper() in logging._nameToLevel:
            return logging._nameToLevel[env_level_str.upper()]
        else:
            raise ValueError(f"Unknown logging level: {env_level_str}.")

    return _default_log_level


def _get_library_name() -> str:
    return __name__.split(".")[0]


def _get_library_root_logger() -> "_Logger":
    return logging.getLogger(_get_library_name())


def _configure_library_root_logger() -> None:
    r"""Configure root logger using a stdout stream handler with an explicit format."""
    global _default_handler

    with _thread_lock:
        if _default_handler:  # already configured
            return

        formatter = logging.Formatter(
            fmt="[%(levelname)s|%(asctime)s] %(name)s:%(lineno)s >> %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        _default_handler = logging.StreamHandler(sys.stdout)
        _default_handler.setFormatter(formatter)
        library_root_logger = _get_library_root_logger()
        library_root_logger.addHandler(_default_handler)
        library_root_logger.setLevel(_get_default_logging_level())
        library_root_logger.propagate = False


def get_logger(name: Optional[str] = None) -> "_Logger":
    r"""Return a logger with the specified name. It it not supposed to be accessed externally."""
    if name is None:
        name = _get_library_name()

    _configure_library_root_logger()
    return logging.getLogger(name)


def add_handler(handler: "logging.Handler") -> None:
    r"""Add a handler to the root logger."""
    _configure_library_root_logger()
    _get_library_root_logger().addHandler(handler)


def remove_handler(handler: logging.Handler) -> None:
    r"""Remove a handler to the root logger."""
    _configure_library_root_logger()
    _get_library_root_logger().removeHandler(handler)


def info_rank0(self: "logging.Logger", *args, **kwargs) -> None:
    if int(os.getenv("LOCAL_RANK", "0")) == 0:
        self.info(*args, **kwargs)


def warning_rank0(self: "logging.Logger", *args, **kwargs) -> None:
    if int(os.getenv("LOCAL_RANK", "0")) == 0:
        self.warning(*args, **kwargs)


@lru_cache(None)
def warning_rank0_once(self: "logging.Logger", *args, **kwargs) -> None:
    if int(os.getenv("LOCAL_RANK", "0")) == 0:
        self.warning(*args, **kwargs)


logging.Logger.info_rank0 = info_rank0
logging.Logger.warning_rank0 = warning_rank0
logging.Logger.warning_rank0_once = warning_rank0_once


================================================
FILE: kt-sft/ktransformers/sft/metrics_utils/misc.py
================================================
# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's PEFT library.
# https://github.com/huggingface/peft/blob/v0.10.0/src/peft/peft_model.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import os
import socket
from typing import TYPE_CHECKING, Any, Literal, Optional, Union

import torch
import torch.distributed as dist
import transformers.dynamic_module_utils
from huggingface_hub.utils import WeakFileLock
from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList
from transformers.dynamic_module_utils import get_relative_imports
from transformers.utils import (
    is_torch_bf16_gpu_available,
    is_torch_cuda_available,
    is_torch_mps_available,
    is_torch_npu_available,
    is_torch_xpu_available,
)
from transformers.utils.versions import require_version

from . import logging


_is_fp16_available = is_torch_npu_available() or is_torch_cuda_available()
try:
    _is_bf16_available = is_torch_bf16_gpu_available() or (is_torch_npu_available() and torch.npu.is_bf16_supported())
except Exception:
    _is_bf16_available = False


if TYPE_CHECKING:
    from numpy.typing import NDArray

    from ..hparams import ModelArguments


logger = logging.get_logger(__name__)


class AverageMeter:
    r"""Compute and store the average and current value."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def check_version(requirement: str, mandatory: bool = False) -> None:
    r"""Optionally check the package version."""
    if is_env_enabled("DISABLE_VERSION_CHECK") and not mandatory:
        logger.warning_rank0_once("Version checking has been disabled, may lead to unexpected behaviors.")
        return

    if "gptmodel" in requirement or "autoawq" in requirement:
        pip_command = f"pip install {requirement} --no-build-isolation"
    else:
        pip_command = f"pip install {requirement}"

    if mandatory:
        hint = f"To fix: run `{pip_command}`."
    else:
        hint = f"To fix: run `{pip_command}` or set `DISABLE_VERSION_CHECK=1` to skip this check."

    require_version(requirement, hint)


def check_dependencies() -> None:
    r"""Check the version of the required packages."""
    check_version("transformers>=4.49.0,<=4.55.0")
    check_version("datasets>=2.16.0,<=3.6.0")
    check_version("accelerate>=1.3.0,<=1.7.0")
    check_version("peft>=0.14.0,<=0.15.2")
    check_version("trl>=0.8.6,<=0.9.6")


def calculate_tps(dataset: list[dict[str, Any]], metrics: dict[str, float], stage: Literal["sft", "rm"]) -> float:
    r"""Calculate effective tokens per second."""
    effective_token_num = 0
    for data in dataset:
        if stage == "sft":
            effective_token_num += len(data["input_ids"])
        elif stage == "rm":
            effective_token_num += len(data["chosen_input_ids"]) + len(data["rejected_input_ids"])

    result = effective_token_num * metrics["epoch"] / metrics["train_runtime"]
    return result / dist.get_world_size() if dist.is_initialized() else result


def count_parameters(model: "torch.nn.Module") -> tuple[int, int]:
    r"""Return the number of trainable parameters and number of all parameters in the model."""
    trainable_params, all_param = 0, 0
    for param in model.parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        # Due to the design of 4bit linear layers from bitsandbytes, multiply the number of parameters by itemsize
        if param.__class__.__name__ == "Params4bit":
            if hasattr(param, "quant_storage") and hasattr(param.quant_storage, "itemsize"):
                num_bytes = param.quant_storage.itemsize
            elif hasattr(param, "element_size"):  # for older pytorch version
                num_bytes = param.element_size()
            else:
                num_bytes = 1

            num_params = num_params * 2 * num_bytes

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    return trainable_params, all_param


def get_current_device() -> "torch.device":
    r"""Get the current available device."""
    if is_torch_xpu_available():
        device = "xpu:{}".format(os.getenv("LOCAL_RANK", "0"))
    elif is_torch_npu_available():
        device = "npu:{}".format(os.getenv("LOCAL_RANK", "0"))
    elif is_torch_mps_available():
        device = "mps:{}".format(os.getenv("LOCAL_RANK", "0"))
    elif is_torch_cuda_available():
        device = "cuda:{}".format(os.getenv("LOCAL_RANK", "0"))
    else:
        device = "cpu"

    return torch.device(device)


def get_device_count() -> int:
    r"""Get the number of available devices."""
    if is_torch_xpu_available():
        return torch.xpu.device_count()
    elif is_torch_npu_available():
        return torch.npu.device_count()
    elif is_torch_mps_available():
        return torch.mps.device_count()
    elif is_torch_cuda_available():
        return torch.cuda.device_count()
    else:
        return 0


def get_logits_processor() -> "LogitsProcessorList":
    r"""Get logits processor that removes NaN and Inf logits."""
    logits_processor = LogitsProcessorList()
    logits_processor.append(InfNanRemoveLogitsProcessor())
    return logits_processor


def get_current_memory() -> tuple[int, int]:
    r"""Get the available and total memory for the current device (in Bytes)."""
    if is_torch_xpu_available():
        return torch.xpu.mem_get_info()
    elif is_torch_npu_available():
        return torch.npu.mem_get_info()
    elif is_torch_mps_available():
        return torch.mps.current_allocated_memory(), torch.mps.recommended_max_memory()
    elif is_torch_cuda_available():
        return torch.cuda.mem_get_info()
    else:
        return 0, -1


def get_peak_memory() -> tuple[int, int]:
    r"""Get the peak memory usage (allocated, reserved) for the current device (in Bytes)."""
    if is_torch_xpu_available():
        return torch.xpu.max_memory_allocated(), torch.xpu.max_memory_reserved()
    elif is_torch_npu_available():
        return torch.npu.max_memory_allocated(), torch.npu.max_memory_reserved()
    elif is_torch_mps_available():
        return torch.mps.current_allocated_memory(), -1
    elif is_torch_cuda_available():
        return torch.cuda.max_memory_allocated(), torch.cuda.max_memory_reserved()
    else:
        return 0, -1


def has_tokenized_data(path: "os.PathLike") -> bool:
    r"""Check if the path has a tokenized dataset."""
    return os.path.isdir(path) and len(os.listdir(path)) > 0


def infer_optim_dtype(model_dtype: Optional["torch.dtype"]) -> "torch.dtype":
    r"""Infer the optimal dtype according to the model_dtype and device compatibility."""
    if _is_bf16_available and (model_dtype == torch.bfloat16 or model_dtype is None):
        return torch.bfloat16
    elif _is_fp16_available:
        return torch.float16
    else:
        return torch.float32


def is_accelerator_available() -> bool:
    r"""Check if the accelerator is available."""
    return (
        is_torch_xpu_available() or is_torch_npu_available() or is_torch_mps_available() or is_torch_cuda_available()
    )


def is_env_enabled(env_var: str, default: str = "0") -> bool:
    r"""Check if the environment variable is enabled."""
    return os.getenv(env_var, default).lower() in ["true", "y", "1"]


def numpify(inputs: Union["NDArray", "torch.Tensor"]) -> "NDArray":
    r"""Cast a torch tensor or a numpy array to a numpy array."""
    if isinstance(inputs, torch.Tensor):
        inputs = inputs.cpu()
        if inputs.dtype == torch.bfloat16:  # numpy does not support bfloat16 until 1.21.4
            inputs = inputs.to(torch.float32)

        inputs = inputs.numpy()

    return inputs


def skip_check_imports() -> None:
    r"""Avoid flash attention import error in custom model files."""
    if not is_env_enabled("FORCE_CHECK_IMPORTS"):
        transformers.dynamic_module_utils.check_imports = get_relative_imports


def torch_gc() -> None:
    r"""Collect the device memory."""
    gc.collect()
    if is_torch_xpu_available():
        torch.xpu.empty_cache()
    elif is_torch_npu_available():
        torch.npu.empty_cache()
    elif is_torch_mps_available():
        torch.mps.empty_cache()
    elif is_torch_cuda_available():
        torch.cuda.empty_cache()


def try_download_model_from_other_hub(model_args: "ModelArguments") -> str:
    if (not use_modelscope() and not use_openmind()) or os.path.exists(model_args.model_name_or_path):
        return model_args.model_name_or_path

    if use_modelscope():
        check_version("modelscope>=1.14.0", mandatory=True)
        from modelscope import snapshot_download  # type: ignore
        from modelscope.hub.api import HubApi  # type: ignore

        if model_args.ms_hub_token:
            api = HubApi()
            api.login(model_args.ms_hub_token)

        revision = "master" if model_args.model_revision == "main" else model_args.model_revision
        with WeakFileLock(os.path.abspath(os.path.expanduser("~/.cache/llamafactory/modelscope.lock"))):
            model_path = snapshot_download(
                model_args.model_name_or_path,
                revision=revision,
                cache_dir=model_args.cache_dir,
            )

        return model_path

    if use_openmind():
        check_version("openmind>=0.8.0", mandatory=True)
        from openmind.utils.hub import snapshot_download  # type: ignore

        with WeakFileLock(os.path.abspath(os.path.expanduser("~/.cache/llamafactory/openmind.lock"))):
            model_path = snapshot_download(
                model_args.model_name_or_path,
                revision=model_args.model_revision,
                cache_dir=model_args.cache_dir,
            )

        return model_path


def use_modelscope() -> bool:
    return is_env_enabled("USE_MODELSCOPE_HUB")


def use_openmind() -> bool:
    return is_env_enabled("USE_OPENMIND_HUB")


def use_ray() -> bool:
    return is_env_enabled("USE_RAY")


def find_available_port() -> int:
    r"""Find an available port on the local machine."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(("", 0))
    port = sock.getsockname()[1]
    sock.close()
    return port


def fix_proxy(ipv6_enabled: bool = False) -> None:
    r"""Fix proxy settings for gradio ui."""
    os.environ["no_proxy"] = "localhost,127.0.0.1,0.0.0.0"
    if ipv6_enabled:
        os.environ.pop("http_proxy", None)
        os.environ.pop("HTTP_PROXY", None)


================================================
FILE: kt-sft/ktransformers/sft/metrics_utils/packages.py
================================================
# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/utils/import_utils.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib.metadata
import importlib.util
from functools import lru_cache
from typing import TYPE_CHECKING

from packaging import version


if TYPE_CHECKING:
    from packaging.version import Version


def _is_package_available(name: str) -> bool:
    return importlib.util.find_spec(name) is not None


def _get_package_version(name: str) -> "Version":
    try:
        return version.parse(importlib.metadata.version(name))
    except Exception:
        return version.parse("0.0.0")


def is_pyav_available():
    return _is_package_available("av")


def is_librosa_available():
    return _is_package_available("librosa")


def is_fastapi_available():
    return _is_package_available("fastapi")


def is_galore_available():
    return _is_package_available("galore_torch")


def is_apollo_available():
    return _is_package_available("apollo_torch")


def is_gradio_available():
    return _is_package_available("gradio")


def is_matplotlib_available():
    return _is_package_available("matplotlib")


def is_pillow_available():
    return _is_package_available("PIL")


def is_ray_available():
    return _is_package_available("ray")


def is_requests_available():
    return _is_package_available("requests")


def is_rouge_available():
    return _is_package_available("rouge_chinese")


def is_starlette_available():
    return _is_package_available("sse_starlette")


@lru_cache
def is_transformers_version_greater_than(content: str):
    return _get_package_version("transformers") >= version.parse(content)


def is_uvicorn_available():
    return _is_package_available("uvicorn")


def is_vllm_available():
    return _is_package_available("vllm")


def is_sglang_available():
    return _is_package_available("sglang")


================================================
FILE: kt-sft/ktransformers/sft/metrics_utils/ploting.py
================================================
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import math
import os
from typing import Any

from transformers.trainer import TRAINER_STATE_NAME

from . import logging
from .packages import is_matplotlib_available


if is_matplotlib_available():
    import matplotlib.figure
    import matplotlib.pyplot as plt


logger = logging.get_logger(__name__)


def smooth(scalars: list[float]) -> list[float]:
    r"""EMA implementation according to TensorBoard."""
    if len(scalars) == 0:
        return []

    last = scalars[0]
    smoothed = []
    weight = 1.8 * (1 / (1 + math.exp(-0.05 * len(scalars))) - 0.5)  # a sigmoid function
    for next_val in scalars:
        smoothed_val = last * weight + (1 - weight) * next_val
        smoothed.append(smoothed_val)
        last = smoothed_val
    return smoothed


def gen_loss_plot(trainer_log: list[dict[str, Any]]) -> "matplotlib.figure.Figure":
    r"""Plot loss curves in LlamaBoard."""
    plt.close("all")
    plt.switch_backend("agg")
    fig = plt.figure()
    ax = fig.add_subplot(111)
    steps, losses = [], []
    for log in trainer_log:
        if log.get("loss", None):
            steps.append(log["current_steps"])
            losses.append(log["loss"])

    ax.plot(steps, losses, color="#1f77b4", alpha=0.4, label="original")
    ax.plot(steps, smooth(losses), color="#1f77b4", label="smoothed")
    ax.legend()
    ax.set_xlabel("step")
    ax.set_ylabel("loss")
    return fig


def plot_loss(save_dictionary: str, keys: list[str] = ["loss"]) -> None:
    r"""Plot loss curves and saves the image."""
    plt.switch_backend("agg")
    with open(os.path.join(save_dictionary, TRAINER_STATE_NAME), encoding="utf-8") as f:
        data = json.load(f)

    for key in keys:
        steps, metrics = [], []
        for i in range(len(data["log_history"])):
            if key in data["log_history"][i]:
                steps.append(data["log_history"][i]["step"])
                metrics.append(data["log_history"][i][key])

        if len(metrics) == 0:
            logger.warning_rank0(f"No metric {key} to plot.")
            continue

        plt.figure()
        plt.plot(steps, metrics, color="#1f77b4", alpha=0.4, label="original")
        plt.plot(steps, smooth(metrics), color="#1f77b4", label="smoothed")
        plt.title(f"training {key} of {save_dictionary}")
        plt.xlabel("step")
        plt.ylabel(key)
        plt.legend()
        figure_path = os.path.join(save_dictionary, "training_{}.png".format(key.replace("/", "_")))
        plt.savefig(figure_path, format="png", dpi=100)
        print("Figure saved at:", figure_path)


================================================
FILE: kt-sft/ktransformers/sft/monkey_patch_torch_module.py
================================================
import torch
from collections import OrderedDict
from torch.nn.modules import Module

_ORIG_MODULE_INIT = Module.__init__

def _patched_module_init(self, *args, **kwargs):
    torch._C._log_api_usage_once("python.nn_module")

    if self.call_super_init is False and bool(kwargs):
        raise TypeError(
            f"{type(self).__name__}.__init__() got an unexpected keyword argument '{next(iter(kwargs))}'"
        )
    if self.call_super_init is False and bool(args):
        raise TypeError(
            f"{type(self).__name__}.__init__() takes 1 positional argument but {len(args) + 1} were given"
        )

    object.__setattr__(self, "training", True)
    object.__setattr__(self, "_parameters", {})
    object.__setattr__(self, "_buffers", {})
    object.__setattr__(self, "_non_persistent_buffers_set", set())
    object.__setattr__(self, "_backward_pre_hooks", OrderedDict())
    object.__setattr__(self, "_backward_hooks", OrderedDict())
    object.__setattr__(self, "_is_full_backward_hook", None)
    object.__setattr__(self, "_forward_hooks", OrderedDict())
    object.__setattr__(self, "_forward_hooks_with_kwargs", OrderedDict())
    object.__setattr__(self, "_forward_hooks_always_called", OrderedDict())
    object.__setattr__(self, "_forward_pre_hooks", OrderedDict())
    object.__setattr__(self, "_forward_pre_hooks_with_kwargs", OrderedDict())
    object.__setattr__(self, "_state_dict_hooks", OrderedDict())
    object.__setattr__(self, "_state_dict_pre_hooks", OrderedDict())
    object.__setattr__(self, "_load_state_dict_pre_hooks", OrderedDict())
    object.__setattr__(self, "_load_state_dict_post_hooks", OrderedDict())

    if not (hasattr(self, "orig_module") and isinstance(self.orig_module, torch.nn.modules.linear.Linear)):
        object.__setattr__(self, "_modules", {})

    if self.call_super_init:
        object.__init__(self)

def install_patch():
    Module.__init__ = _patched_module_init

def restore_patch():
    Module.__init__ = _ORIG_MODULE_INIT

install_patch()

================================================
FILE: kt-sft/ktransformers/sft/peft_utils/__init__.py
================================================


================================================
FILE: kt-sft/ktransformers/sft/peft_utils/lora_layer.py
================================================
from abc import ABC
from copy import deepcopy
import math
import warnings
from typing import Any, Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from accelerate.utils.imports import is_xpu_available
from torch import BufferDict, svd_lowrank, transpose
from transformers.pytorch_utils import Conv1D

from peft.tuners.lora.config import LoraConfig

from ktransformers.operators.linear import KTransformersLinear, KLinearTorch, KLinearBase
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.inference_state import InferenceState

def dispatch_default(
    target: torch.nn.Module,
    adapter_name: str,
    lora_config: LoraConfig,
    **kwargs,
) -> Optional[torch.nn.Module]:
    new_module = None

    if isinstance(target, BaseTunerLayer):
        target_orig_module = target.get_orig_module()
    else:
        target_orig_module = target

    if isinstance(target_orig_module, torch.nn.Embedding):
        embedding_kwargs = kwargs.copy()
        embedding_kwargs.pop("fan_in_fan_out", None)
        embedding_kwargs.update(lora_config.loftq_config)
        new_module = Embedding(target, adapter_name, **embedding_kwargs)

    elif isinstance(target_orig_module, torch.nn.Linear):
        if kwargs["fan_in_fan_out"]:
            warnings.warn(
                "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
                "Setting fan_in_fan_out to False."
            )
            kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
        kwargs.update(lora_config.loftq_config)
        new_module = Linear(target, adapter_name, **kwargs)

    elif isinstance(target_orig_module, KTransformersLinear):
        kwargs.update(lora_config.loftq_config)
        new_module = KTransformersLinearLora(target, adapter_name, **kwargs)

    return new_module

class BaseTunerLayer(ABC):
    r"""
    A tuner layer mixin that provides the common methods and attributes for all tuners.

    Args:
        is_pluggable (`bool`, *optional*):
            Whether the adapter layer can be plugged to any pytorch module
        active_adapters (Union[List[`str`], `str`], *optional*):
            The name of the active adapter.
    """

    # All names of layers that may contain adapter (trainable) weights
    adapter_layer_names: tuple[str, ...] = ()
    # All names of other parameters that may contain adapter-related parameters
    other_param_names: tuple[str, ...] = ()

    # indicates whether all adapters should be disabled
    _disable_adapters: bool = False

    # the currently active adapter(s)
    _active_adapter: str | list[str] = "default"

    # List all merged adapters
    merged_adapters: list[str] = []

    def get_orig_module(self) -> nn.Module:
        """
        (Recursively) get the orig_module.

        This is necessary for the case that the tuner layer wraps another tuner layer.

        """
        orig_module = self
        while hasattr(orig_module, "orig_module"):
            orig_module = orig_module.orig_module
        return orig_module

    @property
    def weight(self) -> torch.Tensor:
        # This is required for some transformers code, e.g. for T5, weight is accessed as:
        #     self.wo.weight
        # where "wo" is the adapter layer.
        # https://github.com/huggingface/transformers/blob/78f6ed6c70b29c1560780e3869a7ad4c6b3d2710/src/transformers
        # /models/t5/modeling_t5.py#L292
        orig_module = self.get_orig_module()
        if hasattr(orig_module, "qweight"):
            # QuantLinear
            weight = orig_module.qweight
        else:
            # Other layers
            weight = orig_module.weight
        return weight

    @property
    def bias(self) -> torch.Tensor:
        orig_module = self.get_orig_module()
        return orig_module.bias

    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
        raise NotImplementedError

    def unmerge(self) -> None:
        raise NotImplementedError

    @property
    def merged(self) -> bool:
        return bool(self.merged_adapters)

    @property
    def disable_adapters(self) -> bool:
        # use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method
        return self._disable_adapters

    @property
    def active_adapter(self) -> str | list[str]:
        # use a property to ensure that active_adapter is not set directly, instead use the set_adapter method
        return self._active_adapter

    def _get_available_adapters(self) -> set[str]:
        """Return all adapter names that can be found on this module."""
        adapters = set()
        for layer_name in self.adapter_layer_names:
            module = getattr(self, layer_name)
            if not isinstance(module, (nn.ModuleDict, nn.ParameterDict)):
                continue
            adapters.update(set(module.keys()))
        return adapters

    @property
    def active_adapters(self):
        if isinstance(self.active_adapter, str):
            return [self.active_adapter]
        # is already a list of str
        return self.active_adapter

    def enable_adapters(self, enabled: bool) -> None:
        """Toggle the enabling and disabling of adapters

        Takes care of setting the requires_grad flag for the adapter weights.

        Args:
            enabled (bool): True to enable adapters, False to disable adapters
        """
        if enabled:
            self.set_adapter(self.active_adapters)
            self._disable_adapters = False
        else:
            # disable grads on all adapter layers
            for layer_name in self.adapter_layer_names:
                layer = getattr(self, layer_name)
                layer.requires_grad_(False)
            self._disable_adapters = True

    def set_adapter(self, adapter_names: str | list[str]) -> None:
        """Set the active adapter(s).

        Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is
        not desired, use the following code.

        ```py
        >>> for name, param in model_peft.named_parameters():
        ...     if ...:  # some check on name (ex. if 'lora' in name)
        ...         param.requires_grad = False
        ```

        Args:
            adapter_name (`str` or `List[str]`): Name of the adapter(s) to be activated.
        """
        if isinstance(adapter_names, str):
            adapter_names = [adapter_names]

        # Deactivate grads on the inactive adapter and activate grads on the active adapter
        for layer_name in self.adapter_layer_names:
            module_dict = getattr(self, layer_name)
            for key, layer in module_dict.items():
                if key in adapter_names:
                    # Note: It is possible that not a single layer is called with requires_grad_(True) here. This may
                    # happen if a completely different adapter layer is being activated.
                    layer.requires_grad_(True)
                else:
                    layer.requires_grad_(False)

        self._active_adapter = adapter_names

    def _all_available_adapter_names(self) -> list[str]:
        """Return a sorted list of all available adapter names"""
        adapter_names = set()
        for name in self.adapter_layer_names + self.other_param_names:
            # we check each possible attribute and if it's a dict or ModuleDict, we assume that the keys are the adapter
            # names
            attr = getattr(self, name)
            if hasattr(attr, "keys"):
                adapter_names.update(attr.keys())
        return sorted(adapter_names)

    def delete_adapter(self, adapter_name: str) -> None:
        """
        Delete an adapter from the layer

        This should be called on all adapter layers, or else we will get an inconsistent state.

        This method will also set a new active adapter if the deleted adapter was an active adapter. It is important
        that the new adapter is chosen in a deterministic way, so that the same adapter is chosen on all layers.

        Args:
            adapter_name (`str`): The name of the adapter to delete

        """
        for attr in self.adapter_layer_names + self.other_param_names:
            if adapter_name in getattr(self, attr):
                del getattr(self, attr)[adapter_name]

        if adapter_name in self.active_adapters:
            # choose a new active adapter
            active_adapters = self.active_adapters[:]
            active_adapters.remove(adapter_name)
            if active_adapters:
                self.set_adapter(active_adapters)
            else:
                # no active adapters left, set a new default adapter
                # here we get the list of all adapters existing adapter names and choose the first one
                remaining_adapters = self._all_available_adapter_names()
                if not remaining_adapters:
                    self.set_adapter([])
                else:
                    new_active_adapter = remaining_adapters[0]
                    warnings.warn(
                        f"Adapter {adapter_name} was active which is now deleted. Setting active adapter to "
                        f"{new_active_adapter}."
                    )
                    self.set_adapter(remaining_adapters[0])

    def _move_adapter_to_device_of_orig_module(self, adapter_name: str, device: Optional[torch.device] = None) -> None:
        """
        Move the adapter of the given name to the device of the base layer.
        """
        if device is None:
            # check weight and qweight (for GPTQ)
            for weight_name in ("weight", "qweight"):
                weight = getattr(self.get_orig_module(), weight_name, None)
                if weight is not None:
                    device = weight.device
                    dtype = weight.dtype
                    break
            else:
                # no break encountered: could not determine the device
                return

        meta = torch.device("meta")

        # loop through all potential adapter layers and move them to the device of the base layer; be careful to only
        # move this specific adapter to the device, as the other adapters could be on different devices
        # see #1639
        for adapter_layer_name in self.adapter_layer_names + self.other_param_names:
            adapter_layer = getattr(self, adapter_layer_name, None)
            if not isinstance(adapter_layer, (nn.ModuleDict, nn.ParameterDict, BufferDict)):
                continue
            if adapter_name not in adapter_layer:
                continue
            if any(p.device == meta for p in adapter_layer.parameters()):
                continue

            if weight.dtype.is_floating_point or weight.dtype.is_complex:
                adapter_layer[adapter_name] = adapter_layer[adapter_name].to(device, dtype=dtype)
            else:
                adapter_layer[adapter_name] = adapter_layer[adapter_name].to(device)


class LoraLayer(BaseTunerLayer):
    # All names of layers that may contain (trainable) adapter weights
    adapter_layer_names = ("lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B")
    # All names of other parameters that may contain adapter-related parameters
    other_param_names = ("r", "lora_alpha", "scaling", "lora_dropout")

    def __init__(self, orig_module: nn.Module, ephemeral_gpu_offload: bool = False, **kwargs) -> None:
        self.orig_module = orig_module
        self.r = {}
        self.lora_alpha = {}
        self.scaling = {}
        self.lora_dropout = nn.ModuleDict({})
        self.lora_A = nn.ModuleDict({})
        self.lora_B = nn.ModuleDict({})
        # For Embedding layer
        self.lora_embedding_A = nn.ParameterDict({})
        self.lora_embedding_B = nn.ParameterDict({})
        # Mark the weight as unmerged
        self._disable_adapters = False
        self.merged_adapters = []
        self.use_dora: dict[str, bool] = {}
        self.lora_bias: dict[str, bool] = {}
        self.lora_magnitude_vector = torch.nn.ModuleDict()  # for DoRA
        self._caches: dict[str, Any] = {}
        self.ephemeral_gpu_offload: bool = ephemeral_gpu_offload
        self.kwargs = kwargs

        orig_module = self.get_orig_module()
        if isinstance(orig_module, nn.Linear):
            in_features, out_features = orig_module.in_features, orig_module.out_features
        elif isinstance(orig_module, nn.Embedding):
            in_features, out_features = orig_module.num_embeddings, orig_module.embedding_dim
        else:
            raise TypeError(f"unknown type of {orig_module}, not in Linear or Embedding.")

        self.in_features = in_features
        self.out_features = out_features

    def update_layer(
        self,
        adapter_name,
        r,
        lora_alpha,
        lora_dropout,
        init_lora_weights,
        use_rslora: bool = False,
        use_dora: bool = False,
        lora_bias: bool = False,
    ):
        # This code works for linear layers, override for other layer types
        if r <= 0:
            raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")

        self.r[adapter_name] = r
        self.lora_alpha[adapter_name] = lora_alpha
        if lora_dropout > 0.0:
            lora_dropout_layer = nn.Dropout(p=lora_dropout)
        else:
            lora_dropout_layer = nn.Identity()

        self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer}))
        # Actual trainable parameters
        self.lora_A[adapter_name] = nn.Linear(self.in_features, r, bias=False)
        self.lora_B[adapter_name] = nn.Linear(r, self.out_features, bias=lora_bias)
        self.lora_bias[adapter_name] = lora_bias

        if use_rslora:
            self.scaling[adapter_name] = lora_alpha / math.sqrt(r)
        else:
            self.scaling[adapter_name] = lora_alpha / r

        # for inits that require access to the base weight, use gather_param_ctx so that the weight is gathered when using DeepSpeed
        if init_lora_weights == "eva":
            nn.init.zeros_(self.lora_B[adapter_name].weight)
        elif init_lora_weights:
            self.reset_lora_parameters(adapter_name, init_lora_weights)
        # call this before dora_init
        self._move_adapter_to_device_of_orig_module(adapter_name)

        self.set_adapter(self.active_adapters)

    def reset_lora_parameters(self, adapter_name, init_lora_weights):
        if init_lora_weights is False:
            return

        if adapter_name in self.lora_A.keys():
            if init_lora_weights is True:
                # initialize A the same way as the default for nn.Linear and B to zero
                # https://github.com/microsoft/LoRA/blob/a0a92e0f26c067cf94747bdbf1ce73793fa44d19/loralib/layers.py#L124
                nn.init.kaiming_uniform_(self.lora_A[adapter_name].weight, a=math.sqrt(5))
            elif init_lora_weights.lower() == "gaussian":
                nn.init.normal_(self.lora_A[adapter_name].weight, std=1 / self.r[adapter_name])
            else:
                raise ValueError(f"Unknown initialization {init_lora_weights=}")
            nn.init.zeros_(self.lora_B[adapter_name].weight)
            if self.lora_bias[adapter_name]:
                nn.init.zeros_(self.lora_B[adapter_name].bias)
        if adapter_name in self.lora_embedding_A.keys():
            # Initialize A to zeros and B the same way as the default for nn.Embedding, see:
            # https://github.com/microsoft/LoRA/blob/4c0333854cb905966f8cc4e9a74068c1e507c7b7/loralib/layers.py#L59-L60
            nn.init.zeros_(self.lora_embedding_A[adapter_name])
            nn.init.normal_(self.lora_embedding_B[adapter_name])
            if self.lora_bias[adapter_name]:
                # embeddings are not supported at the moment, but still adding this for consistency
                nn.init.zeros_(self.lora_embedding_B[adapter_name].bias)

    def olora_init(self, adapter_name):
        orig_module = self.get_orig_module()
        orig_weight = orig_module.weight
        dtype = orig_weight.dtype

        if dtype in [torch.float32, torch.float16, torch.bfloat16]:
            weight_tensor = orig_weight
        else:
            raise TypeError(f"Unsupported data type for the base layer. Got {dtype}.")

        scale_factor = self.scaling[adapter_name]
        r = self.r[adapter_name]
        weight_tensor = weight_tensor.to(torch.float32)
        Q, R = torch.linalg.qr(weight_tensor.data)

        Qr, Rr = Q[:, :r], R[:r]

        self.lora_A[adapter_name].weight.data = Rr.contiguous()
        self.lora_B[adapter_name].weight.data = Qr.contiguous()

        weight_tensor.data -= scale_factor * self.lora_B[adapter_name].weight @ self.lora_A[adapter_name].weight
        weight_tensor = weight_tensor.to(dtype)
            
        orig_module.weight.data = weight_tensor

    def pissa_init(self, adapter_name, init_lora_weights):
        weight = self.get_orig_module().weight
        dtype = weight.dtype
        if dtype not in [torch.float32, torch.float16, torch.bfloat16]:
            raise TypeError(
                "Please initialize PiSSA under float32, float16, or bfloat16. "
                "Subsequently, re-quantize the residual model to help minimize quantization errors."
            )
        weight = transpose(weight.to(torch.float32), self.fan_in_fan_out)
        if init_lora_weights == "pissa":
            # USV^T = W <-> VSU^T = W^T, where W^T = weight.data in R^{out_channel, in_channel},
            V, S, Uh = torch.linalg.svd(weight.data, full_matrices=False)
            Vr = V[:, : self.r[adapter_name]]
            Sr = S[: self.r[adapter_name]]
            Sr /= self.scaling[adapter_name]
            Uhr = Uh[: self.r[adapter_name]]
        elif len(init_lora_weights.split("_niter_")) == 2:
            Vr, Sr, Ur = svd_lowrank(
                weight.data, self.r[adapter_name], niter=int(init_lora_weights.split("_niter_")[-1])
            )
            Sr /= self.scaling[adapter_name]
            Uhr = Ur.t()
        else:
            raise ValueError(
                f"init_lora_weights should be 'pissa' or 'pissa_niter_[number of iters]', got {init_lora_weights} instead."
            )

        lora_A = torch.diag(torch.sqrt(Sr)) @ Uhr
        lora_B = Vr @ torch.diag(torch.sqrt(Sr))
        self.lora_A[adapter_name].weight.data = lora_A
        self.lora_B[adapter_name].weight.data = lora_B
        weight = weight.data - self.scaling[adapter_name] * lora_B @ lora_A
        weight = transpose(weight.to(dtype), self.fan_in_fan_out)
        self.get_orig_module().weight.data = weight

    def loftq_init(self, adapter_name):
        from peft.utils.loftq_utils import loftq_init

        weight = self.get_orig_module().weight
        kwargs = {
            "num_bits": self.kwargs.get("loftq_bits", 4),
            "reduced_rank": self.r[adapter_name],
            "num_iter": self.kwargs.get("loftq_iter", 1),
        }

        qweight, lora_A, lora_B = loftq_init(weight, **kwargs)
        if adapter_name in self.lora_A.keys():
            # initialize A the same way as the default for nn.Linear and B to zero
            self.lora_A[adapter_name].weight.data = lora_A
            self.lora_B[adapter_name].weight.data = lora_B
        if adapter_name in self.lora_embedding_A.keys():
            # initialize a the same way as the default for nn.linear and b to zero
            self.lora_embedding_A[adapter_name].weight.data = lora_A
            self.lora_embedding_B[adapter_name].weight.data = lora_B
        self.get_orig_module().weight.data = qweight

    def _cache_store(self, key: str, value: Any) -> None:
        self._caches[key] = value

    def _cache_pop(self, key: str) -> Any:
        value = self._caches.pop(key)
        return value

    def set_scale(self, adapter, scale):
        if adapter not in self.scaling:
            # Ignore the case where the adapter is not in the layer
            return
        self.scaling[adapter] = scale * self.lora_alpha[adapter] / self.r[adapter]

    def scale_layer(self, scale: float) -> None:
        if scale == 1:
            return

        for active_adapter in self.active_adapters:
            if active_adapter not in self.lora_A.keys():
                continue

            self.scaling[active_adapter] *= scale

    def unscale_layer(self, scale=None) -> None:
        for active_adapter in self.active_adapters:
            if active_adapter not in self.lora_A.keys():
                continue

            if scale is None:
                self.scaling[active_adapter] = self.lora_alpha[active_adapter] / self.r[active_adapter]
            else:
                self.scaling[active_adapter] /= scale

    def _check_forward_args(self, x, *args, **kwargs):
        """Check if the arguments are compatible with the configs and state of the model"""
        adapter_names = kwargs.get("adapter_names", None)
        if adapter_names is None:
            return

        if len(x) != len(adapter_names):
            msg = (
                "Length of `adapter_names` should be the same as the number of inputs, but got "
                f"{len(adapter_names)} and {len(x)} respectively."
            )
            raise ValueError(msg)

        if self.merged:
            # It is unclear what would be the right thing to do if users pass adapter_names and there are merged
            # adapters. Therefore, it is better to raise an error in this case.
            msg = "Cannot pass `adapter_names` when there are merged adapters, please call `unmerge_adapter` first."
            raise ValueError(msg)

        # DoRA is not supported (yet), check that it's not being used. Don't check "__base__", as this is the
        # placeholder for the base model.
        unique_adapters = {name for name in adapter_names if name != "__base__"}
        for adapter_name in unique_adapters:
            if self.use_dora.get(adapter_name, False):
                msg = "Cannot pass `adapter_names` when DoRA is enabled."
                raise ValueError(msg)

    def _mixed_batch_forward(
        self, x: torch.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any
    ) -> torch.Tensor:
        # This is a special method that handles the case when users pass the argument `adapter_names`. This is an
        # extra argument that allows mixing different adapters in the same batch at inference time.
        result = self.orig_module(x, *args, **kwargs)
        torch_result_dtype = result.dtype

        unique_adapters = set(adapter_names)
        sub_batch_indices_list = []
        for adapter in unique_adapters:
            sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter])

        for i, active_adapter in enumerate(unique_adapters):
            if active_adapter == "__base__":
                continue
            if active_adapter not in self.lora_A.keys():
                continue

            lora_A = self.lora_A[active_adapter]
            lora_B = self.lora_B[active_adapter]
            dropout = self.lora_dropout[active_adapter]
            scaling = self.scaling[active_adapter]

            # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear
            # layer output
            sub_batch = x[sub_batch_indices_list[i]].to(lora_A.weight.dtype)
            lora_output = lora_B(lora_A(dropout(sub_batch))) * scaling
            result[sub_batch_indices_list[i]] += lora_output.to(torch_result_dtype)

        return result

class Linear(nn.Module, LoraLayer):
    # Lora implemented in a dense layer
    def __init__(
        self,
        orig_module,
        adapter_name: str,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        is_target_conv_1d_layer: bool = False,
        init_lora_weights: Union[bool, str] = True,
        use_rslora: bool = False,
        use_dora: bool = False,
        lora_bias: bool = False,
        **kwargs,
    ) -> None:
        super().__init__()
        LoraLayer.__init__(self, orig_module, **kwargs)
        self.fan_in_fan_out = fan_in_fan_out

        self._active_adapter = adapter_name
        self.update_layer(
            adapter_name,
            r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            init_lora_weights=init_lora_weights,
            use_rslora=use_rslora,
            use_dora=use_dora,
            lora_bias=lora_bias,
        )
        self.is_target_conv_1d_layer = is_target_conv_1d_layer

    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
        """
        Merge the active adapter weights into the base weights

        Args:
            safe_merge (`bool`, *optional*):
                If True, the merge operation will be performed in a copy of the original weights and check for NaNs
                before merging the weights. This is useful if you want to check if the merge operation will produce
                NaNs. Defaults to `False`.
            adapter_names (`list[str]`, *optional*):
                The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
                to `None`.
        """
        if not adapter_names:
            # no adapter to merge
            return

        for active_adapter in adapter_names:
            if active_adapter in self.lora_A.keys():
                orig_module = self.get_orig_module()
                if safe_merge:
                    # Note that safe_merge will be slower than the normal merge
                    # because of the copy operation.
                    orig_weights = orig_module.weight.data.clone()
                    delta_weight = self.get_delta_weight(active_adapter)
                    if not self.use_dora[active_adapter]:
                        orig_weights += delta_weight
                    else:
                        # handle dora
                        # since delta_weight already includes scaling, set it to 1 here
                        weight_norm = (
                            self.lora_magnitude_vector[active_adapter]
                            .get_weight_norm(orig_weights, transpose(delta_weight, self.fan_in_fan_out), scaling=1)
                            .detach()
                        )
                        # We need to cache weight_norm because it has to be based on the original weights. We
                        # cannot calculate it on the fly based on the merged weights when unmerging because its a
                        # different value
                        self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
                        dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
                        dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out)
                        orig_weights = dora_factor * (orig_weights + delta_weight)

                    if not torch.isfinite(orig_weights).all():
                        raise ValueError(
                            f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                        )

                    orig_module.weight.data = orig_weights

                    if self.lora_bias[active_adapter]:
                        new_bias = orig_module.bias + self.lora_B[active_adapter].bias
                        if not torch.isfinite(new_bias).all():
                            raise ValueError(
                                f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                            )
                        orig_module.bias.data = new_bias

                else:
                    delta_weight = self.get_delta_weight(active_adapter)
                    if not self.use_dora[active_adapter]:
                        orig_module.weight.data += delta_weight
                    else:
                        # handle dora
                        # since delta_weight already includes scaling, set it to 1 here
                        weight_norm = (
                            self.lora_magnitude_vector[active_adapter]
                            .get_weight_norm(
                                orig_module.weight, transpose(delta_weight, self.fan_in_fan_out), scaling=1
                            )
                            .detach()
                        )
                        # We need to cache weight_norm because it has to be based on the original weights. We
                        # cannot calculate it on the fly based on the merged weights when unmerging because its a
                        # different value
                        self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
                        dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
                        dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out)
                        new_weight = dora_factor * (orig_module.weight.data + delta_weight)
                        orig_module.weight.data = new_weight

                    if self.lora_bias[active_adapter]:
                        orig_module.bias.data += self.lora_B[active_adapter].bias

                self.merged_adapters.append(active_adapter)

    def unmerge(self) -> None:
        """
        This method unmerges all merged adapter layers from the base weights.
        """
        if not self.merged:
            warnings.warn("Already unmerged. Nothing to do.")
            return
        while len(self.merged_adapters) > 0:
            active_adapter = self.merged_adapters.pop()
            if active_adapter in self.lora_A.keys():
                weight = self.get_orig_module().weight
                delta_weight = self.get_delta_weight(active_adapter)
                if not self.use_dora[active_adapter]:
                    weight.data -= delta_weight
                else:
                    weight_norm = self._cache_pop(f"{active_adapter}-weight_norm")
                    dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
                    weight_orig = weight.data / dora_factor.view(-1, 1) - delta_weight
                    weight.data = weight_orig

                if self.lora_bias[active_adapter]:
                    self.get_orig_module().bias.data -= self.lora_B[active_adapter].bias

    def get_delta_weight(self, adapter) -> torch.Tensor:
        """
        Compute the delta weight for the given adapter.

        Args:
            adapter (str):
                The name of the adapter for which the delta weight should be computed.
        """
        device = self.lora_B[adapter].weight.device
        dtype = self.lora_B[adapter].weight.dtype

        # In case users wants to merge the adapter weights that are in
        # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
        # (b)float16 because some CPUs have slow bf16/fp16 matmuls.
        cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)

        weight_A = self.lora_A[adapter].weight
        weight_B = self.lora_B[adapter].weight

        if cast_to_fp32:
            weight_A = weight_A.float()
            weight_B = weight_B.float()

        output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter]

        if cast_to_fp32:
            output_tensor = output_tensor.to(dtype=dtype)

            # cast back the weights
            self.lora_A[adapter].weight.data = weight_A.to(dtype)
            self.lora_B[adapter].weight.data = weight_B.to(dtype)

        return output_tensor

    def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
        self._check_forward_args(x, *args, **kwargs)
        adapter_names = kwargs.pop("adapter_names", None)

        if self.disable_adapters:
            if self.merged:
                self.unmerge()
            result = self.orig_module(x, *args, **kwargs)
        elif adapter_names is not None:
            result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
        elif self.merged:
            result = self.orig_module(x, *args, **kwargs)
        else:
            result = self.orig_module(x, *args, **kwargs)
            torch_result_dtype = result.dtype
            for active_adapter in self.active_adapters:
                if active_adapter not in self.lora_A.keys():
                    continue
                lora_A = self.lora_A[active_adapter]
                lora_B = self.lora_B[active_adapter]
                dropout = self.lora_dropout[active_adapter]
                scaling = self.scaling[active_adapter]
                x = x.to(lora_A.weight.dtype)

                # TODO: Remove dora method up to now.
                result = result + lora_B(lora_A(dropout(x))) * scaling
                
            result = result.to(torch_result_dtype)

        return result

    def __repr__(self) -> str:
        rep = super().__repr__()
        return "lora." + rep


class Embedding(nn.Module, LoraLayer):
    # LoRA implemented in a Embedding layer
    def __init__(
        self,
        orig_module: nn.Module,
        adapter_name: str,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        init_lora_weights: Union[bool, str] = True,
        use_rslora: bool = False,
        use_dora: bool = False,
        lora_bias: bool = False,
        **kwargs,
    ) -> None:
        if lora_bias:
            # lora_bias=True is not supported (yet) for embedding layers, as they use nn.Parameter
            raise ValueError(f"lora_bias={lora_bias} is not supported for {self.__class__.__name__}.")

        super().__init__()
        LoraLayer.__init__(self, orig_module)

        self._active_adapter = adapter_name
        self.update_layer(
            adapter_name,
            r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            init_lora_weights=init_lora_weights,
            use_rslora=use_rslora,
            use_dora=use_dora,
            lora_bias=lora_bias,
        )

    def update_layer(
        self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, lora_bias
    ):
        if r <= 0:
            raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")

        self.r[adapter_name] = r
        self.lora_alpha[adapter_name] = lora_alpha
        if lora_dropout > 0.0:
            lora_dropout_layer = nn.Dropout(p=lora_dropout)
        else:
            lora_dropout_layer = nn.Identity()

        self.lora_dropout[adapter_name] = lora_dropout_layer
        # Actual trainable parameters
        weight_A = torch.randn((r, self.in_features))
        weight_B = torch.randn((self.out_features, r))
        self.lora_embedding_A[adapter_name] = nn.Parameter(weight_A)
        self.lora_embedding_B[adapter_name] = nn.Parameter(weight_B)
        self.lora_bias[adapter_name] = lora_bias

        if use_rslora:
            self.scaling[adapter_name] = lora_alpha / math.sqrt(r)
        else:
            self.scaling[adapter_name] = lora_alpha / r

        if init_lora_weights == "loftq":
            self.loftq_init(adapter_name)
        elif init_lora_weights:
            self.reset_lora_parameters(adapter_name, init_lora_weights)

        # call this before dora_init
        self._move_adapter_to_device_of_orig_module(adapter_name)

        self.set_adapter(self.active_adapters)

    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
        """
        Merge the active adapter weights into the base weights

        Args:
            safe_merge (`bool`, *optional*):
                If True, the merge operation will be performed in a copy of the original weights and check for NaNs
                before merging the weights. This is useful if you want to check if the merge operation will produce
                NaNs. Defaults to `False`.
            adapter_names (`list[str]`, *optional*):
                The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
                to `None`.
        """
        if not adapter_names:
            # no adapter to merge
            return

        for active_adapter in adapter_names:
            if active_adapter in self.lora_embedding_A.keys():
                orig_module = self.get_orig_module()
                if safe_merge:
                    # Note that safe_merge will be slower than the normal merge
                    # because of the copy operation.
                    orig_weights = orig_module.weight.data.clone()
                    orig_weights += self.get_delta_weight(active_adapter)

                    if not torch.isfinite(orig_weights).all():
                        raise ValueError(
                            f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                        )

                    orig_module.weight.data = orig_weights
                else:
                    orig_module.weight.data += self.get_delta_weight(active_adapter)
                self.merged_adapters.append(active_adapter)

    def unmerge(self) -> None:
        """
        This method unmerges all merged adapter layers from the base weights.
        """
        if not self.merged:
            warnings.warn("Already unmerged. Nothing to do.")
            return
        while len(self.merged_adapters) > 0:
            active_adapter = self.merged_adapters.pop()
            if active_adapter in self.lora_embedding_A.keys():
                self.get_orig_module().weight.data -= self.get_delta_weight(active_adapter)

    def get_delta_weight(self, adapter) -> torch.Tensor:
        """
        Compute the delta weight for the given adapter.

        Args:
            adapter (str):
                The name of the adapter for which the delta weight should be computed.
        """
        device = self.lora_embedding_B[adapter].device
        dtype = self.lora_embedding_A[adapter].dtype

        # In case users wants to merge the adapter weights that are in
        # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
        # (b)float16 because some CPUs have slow bf16/fp16 matmuls.
        cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)

        weight_A = self.lora_embedding_A[adapter]
        weight_B = self.lora_embedding_B[adapter]

        if cast_to_fp32:
            weight_A = weight_A.float()
            weight_B = weight_B.float()

        output_tensor = transpose(weight_B @ weight_A, True) * self.scaling[adapter]

        if cast_to_fp32:
            output_tensor = output_tensor.to(dtype=dtype)

            # cast back the weights
            self.lora_embedding_A[adapter] = weight_A.to(dtype)
            self.lora_embedding_B[adapter] = weight_B.to(dtype)

        return output_tensor

    def _mixed_batch_forward(
        self, x: torch.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any
    ) -> torch.Tensor:
        # This is a special method that handles the case when users pass the argument `adapter_names`. This is an
        # extra argument that allows mixing different adapters in the same batch at inference time.
        result = self.orig_module(x, *args, **kwargs)

        unique_adapters = set(adapter_names)
        sub_batch_indices_list = []
        for adapter in unique_adapters:
            sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter])

        for i, active_adapter in enumerate(unique_adapters):
            if active_adapter == "__base__":
                continue
            if active_adapter not in self.lora_embedding_A.keys():
                continue

            embedding_A = self.lora_embedding_A[active_adapter].T
            embedding_B = self.lora_embedding_B[active_adapter].T
            scaling = self.scaling[active_adapter]

            # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear
            # layer output
            sub_batch = x[sub_batch_indices_list[i]]
            after_A = self._embed(sub_batch, embedding_A)
            result[sub_batch_indices_list[i]] += (after_A @ embedding_B) * scaling

        return result

    def _embed(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
        orig_module = self.get_orig_module()
        return F.embedding(
            input,
            weight,
            padding_idx=orig_module.padding_idx,
            max_norm=orig_module.max_norm,
            norm_type=orig_module.norm_type,
            scale_grad_by_freq=orig_module.scale_grad_by_freq,
            sparse=orig_module.sparse,
        )

    def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
        # TODO: no dtype conversion here, unlike in Linear, is that correct?
        self._check_forward_args(x, *args, **kwargs)
        adapter_names = kwargs.pop("adapter_names", None)

        if self.disable_adapters:
            if self.merged:
                self.unmerge()
            result = self.orig_module(x, *args, **kwargs)
        elif adapter_names is not None:
            result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
        elif self.merged:
            result = self.orig_module(x, *args, **kwargs)
        else:
            result = self.orig_module(x, *args, **kwargs)
            torch_result_dtype = result.dtype
            for active_adapter in self.active_adapters:
                if active_adapter not in self.lora_embedding_A:
                    continue
                embedding_A = self.lora_embedding_A[active_adapter].T
                embedding_B = self.lora_embedding_B[active_adapter].T
                scaling = self.scaling[active_adapter]

                if not self.use_dora[active_adapter]:
                    after_A = self._embed(x, embedding_A)
                    result = result + (after_A @ embedding_B) * scaling
                else:
                    mag_norm_scale, dora_result = self.lora_magnitude_vector[active_adapter](
                        x,
                        lora_A=embedding_A,
                        lora_B=embedding_B,
                        scaling=scaling,
                        orig_module=self.get_orig_module(),
                        embed_fn=self._embed,
                    )
                    result = mag_norm_scale * result + dora_result
            result = result.to(torch_result_dtype)

        return result

    def __repr__(self) -> str:
        rep = super().__repr__()
        return "lora." + rep
    
class KTransformersLinearLora(KTransformersLinear, LoraLayer):
    def __init__(
        self,
        orig_module: KTransformersLinear,
        adapter_name: str,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        is_target_conv_1d_layer: bool = False,
        init_lora_weights: Union[bool, str] = True,
        use_rslora: bool = False,
        use_dora: bool = False,
        lora_bias: bool = False,
        **kwargs,
    ):
        # super().__init__(orig_module, **kwargs)
        # print(f"KTransformersLinearLora:{KTransformersLinearLora.__mro__}")
        
        KTransformersLinear.__init__(
            self,
            key=orig_module.key,
            gguf_loader=orig_module.gguf_loader,
            config=orig_module.config,
            orig_module=orig_module.orig_module,
            generate_device=orig_module.generate_device,
            prefill_device=orig_module.prefill_device,
            prefill_op="KLinearTorch",
            generate_op="KLinearTorch",
            **kwargs
        )

        LoraLayer.__init__(self, orig_module=orig_module.orig_module, **kwargs)

        # self.load(mode = InferenceState.GENERATE) # for test

        self._active_adapter = adapter_name

        
        self.update_layer(
            adapter_name,
            r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            init_lora_weights=init_lora_weights,
            use_rslora=use_rslora,
            use_dora=use_dora,
            lora_bias=lora_bias,
        )

        self.is_target_conv_1d_layer = is_target_conv_1d_layer

    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
        if not adapter_names:
            return

        for active_adapter in adapter_names:
            if active_adapter in self.lora_A:
                orig_module = self.get_orig_module()
                if safe_merge:
                    orig_weights = orig_module.weight.data.clone()
                    delta_weight = self.get_delta_weight(active_adapter)
                    if not self.use_dora.get(active_adapter, False):
                        orig_weights += delta_weight
                    else:
                        weight_norm = self.lora_magnitude_vector[active_adapter].get_weight_norm(
                            orig_weights, 
                            transpose(delta_weight, self.fan_in_fan_out), 
                            scaling=1
                        ).detach()
                        self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
                        dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
                        dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out)
                        orig_weights = dora_factor * (orig_weights + delta_weight)

                    if not torch.isfinite(orig_weights).all():
                        raise ValueError(f"NaNs detected when merging adapter {active_adapter}")
                    orig_module.weight.data = orig_weights

                    if self.lora_bias.get(active_adapter, False):
                        new_bias = orig_module.bias.data + self.lora_B[active_adapter].bias
                        if not torch.isfinite(new_bias).all():
                            raise ValueError(f"NaNs detected in bias when merging adapter {active_adapter}")
                        orig_module.bias.data = new_bias
                else:
                    delta_weight = self.get_delta_weight(active_adapter)
                    if not self.use_dora.get(active_adapter, False):
                        orig_module.weight.data += delta_weight
                    else:
                        weight_norm = self.lora_magnitude_vector[active_adapter].get_weight_norm(
                            orig_module.weight.data,
                            transpose(delta_weight, self.fan_in_fan_out),
                            scaling=1
                        ).detach()
                        self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
                        dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
                        dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out)
                        orig_module.weight.data = dora_factor * (orig_module.weight.data + delta_weight)

                    if self.lora_bias.get(active_adapter, False):
                        orig_module.bias.data += self.lora_B[active_adapter].bias

                self.merged_adapters.append(active_adapter)

    def unmerge(self) -> None:
        if not self.merged:
            warnings.warn("Already unmerged. Nothing to do.")
            return
        while self.merged_adapters:
            active_adapter = self.merged_adapters.pop()
            if active_adapter in self.lora_A:
                orig_module = self.get_orig_module()
                delta_weight = self.get_delta_weight(active_adapter)
                if not self.use_dora.get(active_adapter, False):
                    orig_module.weight.data -= delta_weight
                else:
                    weight_norm = self._cache_pop(f"{active_adapter}-weight_norm")
                    dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
                    orig_weights = orig_module.weight.data / dora_factor.view(-1, 1) - delta_weight
                    orig_module.weight.data = orig_weights

                if self.lora_bias.get(active_adapter, False):
                    orig_module.bias.data -= self.lora_B[active_adapter].bias

    def get_delta_weight(self, adapter: str) -> torch.Tensor:
        lora_A = self.lora_A[adapter].weight
        lora_B = self.lora_B[adapter].weight
        delta_weight = transpose(lora_B @ lora_A, self.fan_in_fan_out) * self.scaling[adapter]
        return delta_weight

    def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
        result = super().forward(x, *args, **kwargs)
        if self.disable_adapters or self.merged:
            return result

        for active_adapter in self.active_adapters:
            if active_adapter not in self.lora_A:
                continue
            lora_A = self.lora_A[active_adapter]
            lora_B = self.lora_B[active_adapter]
            dropout = self.lora_dropout[active_adapter]
            scaling = self.scaling[active_adapter]
            x = dropout(x)
            x = x.to(lora_A.weight.dtype)
            lora_output = lora_B(lora_A(x)) * scaling
            result += lora_output.to(result.dtype)

        return result
    

================================================
FILE: kt-sft/ktransformers/sft/peft_utils/lora_model.py
================================================
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

from abc import ABC
import math
import operator
import warnings
from contextlib import contextmanager
from dataclasses import asdict, replace
from enum import Enum
from functools import partial, reduce
from typing import Literal, Optional, Union
import logging

import torch
from torch import nn
from tqdm import tqdm

from peft.utils.other import get_pattern_key
from peft.utils import ModulesToSaveWrapper, _get_submodules
from peft.tuners.tuners_utils import check_target_module_exists
from peft.config import PeftConfig

from ktransformers.sft.peft_utils.lora_layer import dispatch_default, LoraLayer, BaseTunerLayer

logger = logging.getLogger(__name__)

class LoraModel(nn.Module, ABC):
    """
    Creates Low Rank Adapter (LoRA) model from a pretrained transformers model.

    The method is described in detail in https://arxiv.org/abs/2106.09685.

    Args:
        model ([`torch.nn.Module`]): The model to be adapted.
        config ([`LoraConfig`]): The configuration of the Lora model.
        adapter_name (`str`): The name of the adapter, defaults to `"default"`.
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process.

    Returns:
        `torch.nn.Module`: The Lora model.

    Example:

        ```py
        >>> from transformers import AutoModelForSeq2SeqLM
        >>> from peft import LoraModel, LoraConfig

        >>> config = LoraConfig(
        ...     task_type="SEQ_2_SEQ_LM",
        ...     r=8,
        ...     lora_alpha=32,
        ...     target_modules=["q", "v"],
        ...     lora_dropout=0.01,
        ... )

        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
        >>> lora_model = LoraModel(model, config, "default")
        ```

        ```py
        >>> import torch
        >>> import transformers
        >>> from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

        >>> rank = ...
        >>> target_modules = ["q_proj", "k_proj", "v_proj", "out_proj", "fc_in", "fc_out", "wte"]
        >>> config = LoraConfig(
        ...     r=4, lora_alpha=16, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
        ... )
        >>> quantization_config = transformers.BitsAndBytesConfig(load_in_8bit=True)

        >>> tokenizer = transformers.AutoTokenizer.from_pretrained(
        ...     "kakaobrain/kogpt",
        ...     revision="KoGPT6B-ryan1.5b-float16",  # or float32 version: revision=KoGPT6B-ryan1.5b
        ...     bos_token="[BOS]",
        ...     eos_token="[EOS]",
        ...     unk_token="[UNK]",
        ...     pad_token="[PAD]",
        ...     mask_token="[MASK]",
        ... )
        >>> model = transformers.GPTJForCausalLM.from_pretrained(
        ...     "kakaobrain/kogpt",
        ...     revision="KoGPT6B-ryan1.5b-float16",  # or float32 version: revision=KoGPT6B-ryan1.5b
        ...     pad_token_id=tokenizer.eos_token_id,
        ...     use_cache=False,
        ...     device_map={"": rank},
        ...     torch_dtype=torch.float16,
        ...     quantization_config=quantization_config,
        ... )
        >>> model = prepare_model_for_kbit_training(model)
        >>> lora_model = get_peft_model(model, config)
        ```

    **Attributes**:
        - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted.
        - **peft_config** ([`LoraConfig`]): The configuration of the Lora model.
    """

    prefix: str = "lora_"

    def __init__(
        self,
        model,
        peft_config: Union[PeftConfig, dict[str, PeftConfig]],
        adapter_name: str,
        low_cpu_mem_usage: bool = False,
    ) -> None:
        super().__init__()

        self.model = model
        self.targeted_module_names: list[str] = []

        # For advanced developers, if you want to attach multiple adapters to your
        # model, just add a `peft_config` dict attribute to your model.
        if not hasattr(self, "peft_config"):
            self.peft_config = {adapter_name: peft_config} if isinstance(peft_config, PeftConfig) else peft_config
        else:
            logger.info(
                "Already found a `peft_config` attribute in the model. This will lead to having multiple adapters"
                " in the model. Make sure to know what you are doing!"
            )
            if isinstance(peft_config, PeftConfig):
                self.peft_config[adapter_name] = peft_config
            else:
                # user is adding a dict of PeftConfigs
                self.peft_config.update(peft_config)

        self.active_adapter: str | list[str] = adapter_name
        self._pre_injection_hook(self.model, self.peft_config[adapter_name], adapter_name)
        
        self.inject_adapter(self.model, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage)

        # Copy the peft_config in the injected model.
        self.model.peft_config = self.peft_config

    def inject_adapter(
        self, model: nn.Module, adapter_name: str, autocast_adapter_dtype: bool = True, low_cpu_mem_usage: bool = False
    ) -> None:
        r"""
        Creates adapter layers and replaces the target modules with the adapter layers. This method is called under the
        hood by `peft.mapping.get_peft_model` if a non-prompt tuning adapter class is passed.

        The corresponding PEFT config is directly retrieved from the `peft_config` attribute of the BaseTuner class.

        Args:
            model (`nn.Module`):
                The model to be tuned.
            adapter_name (`str`):
                The adapter name.
            autocast_adapter_dtype (`bool`, *optional*):
                Whether to autocast the adapter dtype. Defaults to `True`.
            low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
                Create empty adapter weights on meta device. Useful to speed up the loading process.

        """
        peft_config = self.peft_config[adapter_name]
        excluded_modules = []
        unmatched_modules = []
        # Note: If possible, all checks should be performed *at the start of this method*.
        # This way, we can raise early if something goes wrong, without leaving the model
        # in a bad (half-initialized) state.

        _check_for_modules_to_save = getattr(peft_config, "modules_to_save", None) is not None
        _has_modules_to_save = False

        key_list = [key for key, _ in model.named_modules()]

        for key in key_list:
            if not key:
                continue
            # Check for modules_to_save in case
            if _check_for_modules_to_save and any(
                key.endswith(f"{module_to_save}") for module_to_save in peft_config.modules_to_save
            ):
                # Optionally set the modules to save
                parent, target, target_name = _get_submodules(model, key)

                if not isinstance(target, ModulesToSaveWrapper):
                    new_module = ModulesToSaveWrapper(target, adapter_name)
                    setattr(parent, target_name, new_module)
                else:
                    target.update(adapter_name)

                _has_modules_to_save = True
                continue

            result = check_target_module_exists(peft_config, key)
            if not result:
                unmatched_modules.append(key)
            else:
                self.targeted_module_names.append(key)
                parent, target, target_name = _get_submodules(model, key)

                # TODO: not consider the low_cpu_mem_usage up to now
                self._create_and_replace(peft_config, adapter_name, target, target_name, parent, current_key=key)

        # It's important to set the adapter here (again), because otherwise it can happen that if a 2nd adapter is
        # added, and it targets different layer(s) than the first adapter (which is active), then those different
        # layers will be activated, which we don't want.
        # TODO: not consider multi-adapter up to now
        # self.set_adapter(self.active_adapters)
        self._mark_only_adapters_as_trainable(model)

        if self.peft_config[adapter_name].inference_mode:
            for n, p in model.named_parameters():
                if adapter_name in n:
                    p.requires_grad = False

    def _create_and_replace(
        self,
        lora_config,
        adapter_name,
        target,
        target_name,
        parent,
        current_key,
    ):

        # Regexp matching - Find key which matches current target_name in patterns provided
        r_key = get_pattern_key(lora_config.rank_pattern.keys(), current_key)
        alpha_key = get_pattern_key(lora_config.alpha_pattern.keys(), current_key)
        r = lora_config.rank_pattern.get(r_key, lora_config.r)
        alpha = lora_config.alpha_pattern.get(alpha_key, lora_config.lora_alpha)

        kwargs = {
            "r": r,
            "lora_alpha": alpha,
            "lora_dropout": lora_config.lora_dropout,
            "fan_in_fan_out": lora_config.fan_in_fan_out,
            "init_lora_weights": lora_config.init_lora_weights,
            "use_rslora": lora_config.use_rslora,
            "use_dora": lora_config.use_dora,
            "ephemeral_gpu_offload": lora_config.runtime_config.ephemeral_gpu_offload,
            "lora_bias": lora_config.lora_bias,
            "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False),
            "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False),
        }

        new_module = self._create_new_module(lora_config, adapter_name, target, parent, **kwargs)
        self._replace_module(parent, target_name, new_module, target)

    def _replace_module(self, parent, child_name, new_module, child):
        setattr(parent, child_name, new_module)
        # It's not necessary to set requires_grad here, as that is handled by
        # _mark_only_adapters_as_trainable

        # child layer wraps the original module, unpack it
        if hasattr(child, "orig_module"):
            child = child.orig_module

        if not hasattr(new_module, "orig_module"):
            if hasattr(new_module, "W_q"):  # HQQ
                new_module.W_q = child.W_q
            else:
                new_module.weight = child.weight
            if hasattr(child, "bias"):
                new_module.bias = child.bias

        if getattr(child, "state", None) is not None:
            if hasattr(new_module, "orig_module"):
                new_module.orig_module.state = child.state
            else:
                new_module.state = child.state
            new_module.to(child.weight.device)

        meta = torch.device("meta")
        # dispatch to correct device
        for name, module in new_module.named_modules():
            if (self.prefix in name) or ("ranknum" in name):
                weight = (
                    child.qweight
                    if hasattr(child, "qweight")
                    else child.W_q
                    if hasattr(child, "W_q")
                    else child.weight
                    if hasattr(child, "weight")
                    else child.generate_linear.weight
                    if hasattr(child.generate_linear, "weight")
                    else next(child.parameters())
                )
                # (orig_module): Lora.Linear(
                    # (orig_module): Linear(..),
                    # (Lora_A): Linear(..)...)
                
                if not any(p.device == meta for p in module.parameters()):
                    module.to(weight.device)

    def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None:
        for n, p in model.named_parameters():
            if self.prefix not in n:
                p.requires_grad = False

        for active_adapter in self.active_adapters:
            bias = self.peft_config[active_adapter].bias
            if bias == "none":
                continue

            if bias == "all":
                for n, p in model.named_parameters():
                    if "bias" in n:
                        p.requires_grad = True
            elif bias == "lora_only":
                for m in model.modules():
                    if isinstance(m, LoraLayer) and hasattr(m, "bias") and m.bias is not None:
                        m.bias.requires_grad = True
            else:
                raise NotImplementedError(f"Requested bias: {bias}, is not implemented.")

    @staticmethod
    def _create_new_module(lora_config, adapter_name, target, parent, **kwargs):
        # Collect dispatcher functions to decide what backend to use for the replaced LoRA layer. The order matters,
        # because the first match is always used. Therefore, the default layers should be checked last.
        dispatchers = []

        dispatchers.extend(
            [
                dispatch_default, # TODO
            ]
        )

        new_module = None
        for dispatcher in dispatchers:
            new_module = dispatcher(target=target, adapter_name=adapter_name, lora_config=lora_config, **kwargs)
            if new_module is not None:  # first match wins
                break

        if new_module is None:
            # no module could be matched
            raise ValueError(
                f"Target module {target} is not supported. Currently, only the following modules are supported: "
                "`torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `torch.nn.Conv3d`, "
                "`transformers.pytorch_utils.Conv1D`."
            )

        return new_module

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            if name == "model":  # see #1892: prevent infinite recursion if class is not initialized
                raise
            return getattr(self.model, name)

    def _pre_injection_hook(self, model: nn.Module, config: PeftConfig, adapter_name: str) -> None:
        r"""
        A hook to be called before the adapter is injected into the model. This method can be overridden by child
        classes to perform any pre-injection operations.

        Args:
            model (`nn.Module`):
                The model to be adapted.
            config (`PeftConfig`):
                The adapter config.
            adapter_name (`str`):
                The adapter name.
        """
        pass

    def _set_adapter_layers(self, enabled: bool = True) -> None:
        for module in self.model.modules():
            if isinstance(module, BaseTunerLayer):
                module.enable_adapters(enabled)

    def disable_adapter_layers(self) -> None:
        """
        Disable all adapters in-place.

        When disabling all adapters, the model output corresponds to the output of the base model.
        """
        # TODO: deprecate in favor of enable_adapters
        self._set_adapter_layers(enabled=False)

    def enable_adapter_layers(self) -> None:
        """
        Enable all adapters in-place
        """
        # TODO: deprecate in favor of enable_adapters
        self._set_adapter_layers(enabled=True)
        
        
    # def set_adapter(self, adapter_names: str | list[str]) -> None:
    #     """Set the active adapter(s).

    #     Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is
    #     not desired, use the following code.

    #     ```py
    #     >>> for name, param in model_peft.named_parameters():
    #     ...     if ...:  # some check on name (ex. if 'lora' in name)
    #     ...         param.requires_grad = False
    #     ```

    #     Args:
    #         adapter_name (`str` or `List[str]`): Name of the adapter(s) to be activated.
    #     """
    #     if isinstance(adapter_names, str):
    #         adapter_names = [adapter_names]

    #     # Deactivate grads on the inactive adapter and activate grads on the active adapter
    #     for layer_name in self.adapter_layer_names:
    #         module_dict = getattr(self, layer_name)
    #         for key, layer in module_dict.items():
    #             if key in adapter_names:
    #                 # Note: It is possible that not a single layer is called with requires_grad_(True) here. This may
    #                 # happen if a completely different adapter layer is being activated.
    #                 layer.requires_grad_(True)
    #             else:
    #                 layer.requires_grad_(False)

    #     self._active_adapter = adapter_names
    
    @property
    def active_adapters(self) -> list[str]:
        if isinstance(self.active_adapter, str):
            return [self.active_adapter]
        # is already a list of str
        return self.active_adapter

================================================
FILE: kt-sft/ktransformers/sft/peft_utils/mapping.py
================================================
import torch
from transformers import PreTrainedModel
import warnings
from typing import TYPE_CHECKING, Any, Optional

from peft.config import PeftConfig

from ktransformers.sft.peft_utils.lora_model import LoraModel
from ktransformers.sft.peft_utils.peft_model import PeftModel, PeftModelForCausalLM

def get_peft_model(
    model: PreTrainedModel,
    peft_config: PeftConfig,
    adapter_name: str = "default",
    mixed: bool = False,
    autocast_adapter_dtype: bool = True,
    revision: Optional[str] = None,
    low_cpu_mem_usage: bool = False,
) -> PeftModel:
    """
    Returns a Peft model object from a model and a config.

    Args:
        model ([`transformers.PreTrainedModel`]):
            Model to be wrapped.
        peft_config ([`PeftConfig`]):
            Configuration object containing the parameters of the Peft model.
        adapter_name (`str`, `optional`, defaults to `"default"`):
            The name of the adapter to be injected, if not provided, the default adapter name is used ("default").
        mixed (`bool`, `optional`, defaults to `False`):
            Whether to allow mixing different (compatible) adapter types.
        autocast_adapter_dtype (`bool`, *optional*):
            Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
            using float16 or bfloat16 to float32, as this is typically required for stable training, and only affect
            select PEFT tuners.
        revision (`str`, `optional`, defaults to `main`):
            The revision of the base model. If this isn't set, the saved peft model will load the `main` revision for
            the base model
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process. Leave this setting as
            False if you intend on training the model, unless the adapter weights will be replaced by different weights
            before training starts.
    """
    new_name = model.__dict__.get("name_or_path", None)
    peft_config.base_model_name_or_path = new_name

    return PeftModelForCausalLM(
        model,
        peft_config,
        adapter_name=adapter_name,
        autocast_adapter_dtype=autocast_adapter_dtype,
        low_cpu_mem_usage=low_cpu_mem_usage,
    )

def inject_adapter_in_model(
    peft_config: PeftConfig, model: torch.nn.Module, adapter_name: str = "default", low_cpu_mem_usage: bool = False
) -> torch.nn.Module:
    r"""
    A simple API to create and inject adapter in-place into a model. Currently the API does not support prompt learning
    methods and adaption prompt. Make sure to have the correct `target_names` set in the `peft_config` object. The API
    calls `get_peft_model` under the hood but would be restricted only to non-prompt learning methods.

    Args:
        peft_config (`PeftConfig`):
            Configuration object containing the parameters of the Peft model.
        model (`torch.nn.Module`):
            The input model where the adapter will be injected.
        adapter_name (`str`, `optional`, defaults to `"default"`):
            The name of the adapter to be injected, if not provided, the default adapter name is used ("default").
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process.
    """
    # tuner_cls = PEFT_TYPE_TO_TUNER_MAPPING["LORA"]

    # By instantiating a peft model we are injecting randomly initialized LoRA layers into the model's modules.
    peft_model = LoraModel(model, peft_config, adapter_name=adapter_name, low_cpu_mem_usage=low_cpu_mem_usage)

    return peft_model.model


================================================
FILE: kt-sft/ktransformers/sft/peft_utils/peft_model.py
================================================
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import collections
import copy
import inspect
import os
import warnings
from contextlib import contextmanager, nullcontext
from copy import deepcopy
from dataclasses import dataclass
from typing import Any, Literal, Optional, Union

import packaging.version
from peft import __version__
import torch
import transformers
from accelerate import dispatch_model, infer_auto_device_map, init_empty_weights
from accelerate.hooks import AlignDevicesHook, add_hook_to_module, remove_hook_from_submodules
from accelerate.utils import get_balanced_memory, named_module_tensors
from huggingface_hub import HfFileSystem, ModelCard, ModelCardData, hf_hub_download
from safetensors import safe_open
from safetensors.torch import save_file as safe_save_file
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import Cache, DynamicCache, EncoderDecoderCache, PreTrainedModel
from transformers.modeling_outputs import QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput
from transformers.utils import PushToHubMixin

from peft.utils.constants import DUMMY_MODEL_CONFIG, PEFT_TYPE_TO_PREFIX_MAPPING

from peft.config import PeftConfig
from .lora_layer import BaseTunerLayer
from peft.utils import (
    SAFETENSORS_WEIGHTS_NAME,
    TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,
    WEIGHTS_NAME,
    PeftType,
    TaskType,
    _get_batch_size,
    _prepare_prompt_learning_config,
    _set_adapter,
    _set_trainable,
    get_peft_model_state_dict,
    id_tensor_storage,
    infer_device,
    load_peft_weights,
    map_cache_to_layer_device_map,
    set_peft_model_state_dict,
    shift_tokens_right,
)

from ktransformers.sft.peft_utils.lora_model import LoraModel


class PeftModel(PushToHubMixin, torch.nn.Module):
    """
    Base model encompassing various Peft methods.

    Args:
        model ([`~transformers.PreTrainedModel`]): The base transformer model used for Peft.
        peft_config ([`PeftConfig`]): The configuration of the Peft model.
        adapter_name (`str`,  *optional*): The name of the adapter, defaults to `"default"`.
        autocast_adapter_dtype (`bool`, *optional*):
            Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
            using float16 and bfloat16 to float32, as this is typically required for stable training, and only affect
            select PEFT tuners.
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading loading process.

            <Tip>

            Don't use `low_cpu_mem_usage=True` when creating a new PEFT adapter for training.

            </Tip>

    **Attributes**:
        - **base_model** ([`torch.nn.Module`]) -- The base transformer model used for Peft.
        - **peft_config** ([`PeftConfig`]) -- The configuration of the Peft model.
        - **modules_to_save** (`list` of `str`) -- The list of sub-module names to save when
            saving the model.
        - **prompt_encoder** ([`PromptEncoder`]) -- The prompt encoder used for Peft if
            using [`PromptLearningConfig`].
        - **prompt_tokens** (`torch.Tensor`) -- The virtual prompt tokens used for Peft if
            using [`PromptLearningConfig`].
        - **transformer_backbone_name** (`str`) -- The name of the transformer
            backbone in the base model if using [`PromptLearningConfig`].
        - **word_embeddings** (`torch.nn.Embedding`) -- The word embeddings of the transformer backbone
            in the base model if using [`PromptLearningConfig`].
    """

    def __init__(
        self,
        model: PreTrainedModel,
        peft_config: PeftConfig,
        adapter_name: str = "default",
        autocast_adapter_dtype: bool = True,
        low_cpu_mem_usage: bool = False,
    ) -> None:
        super().__init__()
        self.modules_to_save = None
        self.active_adapter = adapter_name
        self.peft_type = peft_config.peft_type
        # These args are special PEFT arguments that users can pass. They need to be removed before passing them to
        # forward.
        self.special_peft_forward_args = {"adapter_names"}

        self._is_prompt_learning = peft_config.is_prompt_learning
        if self._is_prompt_learning:
            self._peft_config = {adapter_name: peft_config}
            self.base_model = model
            self.add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage)
        else:
            self._peft_config = None
            ctx = init_empty_weights if low_cpu_mem_usage else nullcontext
            with ctx():
                self.base_model = LoraModel(model, {adapter_name: peft_config}, adapter_name)
            self.set_additional_trainable_modules(peft_config, adapter_name)

        if hasattr(self.base_model, "_cast_adapter_dtype"):
            self.base_model._cast_adapter_dtype(
                adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype
            )

        if getattr(model, "is_gradient_checkpointing", True):
            model = self._prepare_model_for_gradient_checkpointing(model)

        # the `pretraining_tp` is set for some models to simulate Tensor Parallelism during inference to avoid
        # numerical differences, https://github.com/pytorch/pytorch/issues/76232 - to avoid any unexpected
        # behavior we disable that in this line.
        if hasattr(self.base_model, "config") and hasattr(self.base_model.config, "pretraining_tp"):
            self.base_model.config.pretraining_tp = 1

    @property
    def peft_config(self) -> dict[str, PeftConfig]:
        if self._is_prompt_learning:
            return self._peft_config
        return self.base_model.peft_config

    @property
    def active_adapters(self) -> list[str]:
        try:
            adapters = self.base_model.active_adapters
            if not isinstance(adapters, list):
                # Base model is probably a transformers model, see:
                # https://github.com/huggingface/transformers/pull/30790#issuecomment-2253808249
                # Unfortunately, transformers models also have an active_adapters method but it's 1) not a property and
                # 2) calling it fails because the base model (usually) has no loaded adapter. The base model can be a
                # transformers model for prompt learning, where the base model is not wrapped in a LoraModel or similar.
                adapters = self.active_adapter
                if isinstance(adapters, str):
                    adapters = [adapters]
        except AttributeError:
            adapters = self.active_adapter
            if isinstance(adapters, str):
                adapters = [adapters]
        return adapters

    @peft_config.setter
    def peft_config(self, value: dict[str, PeftConfig]):
        if self._is_prompt_learning:
            self._peft_config = value
        else:
            self.base_model.peft_config = value

    def save_pretrained(
        self,
        save_directory: str,
        safe_serialization: bool = True,
        selected_adapters: Optional[list[str]] = None,
        save_embedding_layers: Union[str, bool] = "auto",
        is_main_process: bool = True,
        path_initial_model_for_weight_conversion: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        r"""
        This function saves the adapter model and the adapter configuration files to a directory, so that it can be
        reloaded using the [`PeftModel.from_pretrained`] class method, and also used by the [`PeftModel.push_to_hub`]
        method.

        Args:
            save_directory (`str`):
                Directory where the adapter model and configuration files will be saved (will be created if it does not
                exist).
            safe_serialization (`bool`, *optional*):
                Whether to save the adapter files in safetensors format, defaults to `True`.
            selected_adapters (`List[str]`,  *optional*):
                A list of adapters to be saved. If `None`, will default to all adapters.
            save_embedding_layers (`Union[bool, str]`, *optional*, defaults to `"auto"`):
                If `True`, save the embedding layers in addition to adapter weights. If `auto`, checks the common
                embedding layers `peft.utils.other.EMBEDDING_LAYER_NAMES` in config's `target_modules` when available.
                and automatically sets the boolean flag. This only works for 🤗 transformers models.
            is_main_process (`bool`, *optional*):
                Whether the process calling this is the main process or not. Will default to `True`. Will not save the
                checkpoint if not on the main process, which is important for multi device setups (e.g. DDP).
            path_initial_model_for_weight_conversion (`str, *optional*`):
                The path to the initialized adapter, which is obtained after initializing the model with PiSSA or OLoRA
                and before performing any training. When `path_initial_model_for_weight_conversion` is not None, the
                difference in adapter before and after fine-tuning is calculated. This difference can be represented as
                the parameters of a standard LoRA adapter. Using this converted adapter does not require changes to the
                base model, thus conveniently allowing the use of multiple PiSSA or OLoRA adapters with LoRA adapters,
                and the activation or deactivation of any adapters. Note that this conversion is not supported if
                `rslora` is used in combination with `rank_pattern` or `alpha_pattern`.
            kwargs (additional keyword arguments, *optional*):
                Additional keyword arguments passed along to the `push_to_hub` method.

        """
        if os.path.isfile(save_directory):
            raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")

        if selected_adapters is None:
            selected_adapters = list(self.peft_config.keys())
        else:
            if any(
                selected_adapter_name not in list(self.peft_config.keys())
                for selected_adapter_name in selected_adapters
            ):
                raise ValueError(
                    f"You passed an invalid `selected_adapters` arguments, current supported adapter names are"
                    f" {list(self.peft_config.keys())} - got {selected_adapters}."
                )

        def save_mutated_as_lora(peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs):
            if peft_config.use_rslora and (peft_config.rank_pattern or peft_config.alpha_pattern):
                msg = (
                    "Passing `path_initial_model_for_weight_conversion` to `save_pretrained` is not supported when "
                    "using `rank_pattern` or `alpha_pattern` at the same time as `use_rslora=True`."
                )
                raise ValueError(msg)

            if not any(
                str(peft_config.init_lora_weights).lower().startswith(prefix) for prefix in ["pissa", "olora", "true"]
            ):
                warnings.warn(
                    "`path_initial_model_for_weight_conversion` only works for converting a PiSSA or OLoRA adapter to "
                    "a LoRA adapter"
                )
            initial_adapter_name = os.path.basename(path_initial_model_for_weight_conversion)
            try:
                self.load_adapter(
                    os.path.dirname(path_initial_model_for_weight_conversion),
                    subfolder=initial_adapter_name,
                    adapter_name=initial_adapter_name,
                )
                is_pissa = str(self.peft_config[initial_adapter_name].init_lora_weights).lower().startswith("pissa")
                is_olora = str(self.peft_config[initial_adapter_name].init_lora_weights).lower() == "olora"
                if is_pissa or is_olora:
                    raise ValueError(
                        "The `init_lora_weights` parameter of the initial adapter should be set to `True`. "
                        "Otherwise, `self.load_adapter` will subtract the decomposed values again based on the "
                        "residual model."
                    )
                output_state_dict = self.base_model.subtract_mutated_init(
                    output_state_dict, initial_adapter_name, kwargs
                )
            finally:
                self.delete_adapter(initial_adapter_name)
            return output_state_dict

        if is_main_process:
            os.makedirs(save_directory, exist_ok=True)
            self.create_or_update_model_card(save_directory)

        for adapter_name in selected_adapters:
            peft_config = self.peft_config[adapter_name]
            # save only the trainable weights
            output_state_dict = get_peft_model_state_dict(
                self,
                state_dict=kwargs.get("state_dict", None),
                adapter_name=adapter_name,
                save_embedding_layers=save_embedding_layers,
            )
            output_dir = os.path.join(save_directory, adapter_name) if adapter_name != "default" else save_directory
            os.makedirs(output_dir, exist_ok=True)

            if is_main_process and safe_serialization:
                # Section copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L2111-L2134
                # Safetensors does not allow tensor aliasing.
                # We're going to remove aliases before saving
                ptrs = collections.defaultdict(list)
                for name, tensor in output_state_dict.items():
                    # Sometimes in the state_dict we have non-tensor objects.
                    # e.g. in bitsandbytes we have some `str` objects in the state_dict
                    if isinstance(tensor, torch.Tensor):
                        ptrs[id_tensor_storage(tensor)].append(name)
                    else:
                        # In the non-tensor case, fall back to the pointer of the object itself
                        ptrs[id(tensor)].append(name)

                # These are all the pointers of shared tensors.
                shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}

                for _, names in shared_ptrs.items():
                    # Here we just clone the shared tensors to avoid tensor aliasing which is
                    # not supported in safetensors.
                    for shared_tensor_name in names[1:]:
                        output_state_dict[shared_tensor_name] = output_state_dict[shared_tensor_name].clone()
                if path_initial_model_for_weight_conversion is not None:
                    peft_config = copy.deepcopy(peft_config)
                    peft_config.init_lora_weights = True
                    peft_config.save_pretrained(path_initial_model_for_weight_conversion)
                    output_state_dict = save_mutated_as_lora(
                        peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs
                    )
                safe_save_file(
                    output_state_dict,
                    os.path.join(output_dir, SAFETENSORS_WEIGHTS_NAME),
                    metadata={"format": "pt"},
                )
            elif is_main_process:
                if path_initial_model_for_weight_conversion is not None:
                    peft_config = copy.deepcopy(peft_config)
                    peft_config.init_lora_weights = True
                    peft_config.save_pretrained(path_initial_model_for_weight_conversion)
                    output_state_dict = save_mutated_as_lora(
                        peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs
                    )
                torch.save(output_state_dict, os.path.join(output_dir, WEIGHTS_NAME))

            # save the config and change the inference mode to `True`
            if peft_config.base_model_name_or_path is None:
                peft_config.base_model_name_or_path = (
                    self.base_model.__dict__.get("name_or_path", None)
                    if peft_config.is_prompt_learning
                    else self.base_model.model.__dict__.get("name_or_path", None)
                )
            inference_mode = peft_config.inference_mode
            peft_config.inference_mode = True

            if peft_config.task_type is None:
                # deal with auto mapping
                base_model_class = self._get_base_model_class(
                    is_prompt_tuning=peft_config.is_prompt_learning,
                )
                parent_library = base_model_class.__module__

                auto_mapping_dict = {
                    "base_model_class": base_model_class.__name__,
                    "parent_library": parent_library,
                }
            else:
                auto_mapping_dict = None

            if is_main_process:
                if path_initial_model_for_weight_conversion is not None:
                    peft_config.init_lora_weights = True
                    peft_config.r *= 2
                    if not peft_config.use_rslora:
                        peft_config.lora_alpha *= 2
                    else:
                        # with rslora, we have scaling = alpha / sqrt(r), we thus adjust alpha to keep the same scaling
                        peft_config.lora_alpha *= 2**0.5

                    if peft_config.rank_pattern:
                        peft_config.rank_pattern = {key: 2 * val for key, val in peft_config.rank_pattern.items()}
                    if peft_config.alpha_pattern:
                        peft_config.alpha_pattern = {key: 2 * val for key, val in peft_config.alpha_pattern.items()}

                peft_config.save_pretrained(output_dir, auto_mapping_dict=auto_mapping_dict)
            peft_config.inference_mode = inference_mode

    @classmethod
    def from_pretrained(
        cls,
        model: torch.nn.Module,
        model_id: Union[str, os.PathLike],
        adapter_name: str = "default",
        is_trainable: bool = False,
        config: Optional[PeftConfig] = None,
        autocast_adapter_dtype: bool = True,
        ephemeral_gpu_offload: bool = False,
        low_cpu_mem_usage: bool = False,
        **kwargs: Any,
    ) -> PeftModel:
        r"""
        Instantiate a PEFT model from a pretrained model and loaded PEFT weights.

        Note that the passed `model` may be modified inplace.

        Args:
            model ([`torch.nn.Module`]):
                The model to be adapted. For 🤗 Transformers models, the model should be initialized with the
                [`~transformers.PreTrainedModel.from_pretrained`].
            model_id (`str` or `os.PathLike`):
                The name of the PEFT configuration to use. Can be either:
                    - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face
                      Hub.
                    - A path to a directory containing a PEFT configuration file saved using the `save_pretrained`
                      method (`./my_peft_config_directory/`).
            adapter_name (`str`, *optional*, defaults to `"default"`):
                The name of the adapter to be loaded. This is useful for loading multiple adapters.
            is_trainable (`bool`, *optional*, defaults to `False`):
                Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be
                used for inference.
            config ([`~peft.PeftConfig`], *optional*):
                The configuration object to use instead of an automatically loaded configuration. This configuration
                object is mutually exclusive with `model_id` and `kwargs`. This is useful when configuration is already
                loaded before calling `from_pretrained`.
            autocast_adapter_dtype (`bool`, *optional*):
                Whether to autocast the adapter dtype. Defaults to `True`. Only relevant for specific adapter types.
            ephemeral_gpu_offload (`bool`, *optional*):
                Whether to use ephemeral GPU offloading for partially loaded modules. Defaults to `False`. This is
                useful when parts of the model and/or components (such as adapters) are kept in CPU memory until they
                are needed. Rather than perform expensive operations on small data, the data is transferred to the GPU
                on-demand, the operation(s) performed, and the results moved back to CPU memory. This brings a slight
                momentary VRAM overhead but gives orders of magnitude speedup in certain cases.
            low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
                Create empty adapter weights on meta device before loading the saved weights. Useful to speed up the
                process.
            torch_device (`str`, *optional*, defaults to None):
                The device to load the adapter on. If `None`, the device will be inferred.
            kwargs: (`optional`):
                Additional keyword arguments passed along to the specific PEFT configuration class.
        """
        from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING

        # load the config
        if config is None:
            config = PEFT_TYPE_TO_CONFIG_MAPPING[
                PeftConfig._get_peft_type(
                    model_id,
                    subfolder=kwargs.get("subfolder", None),
                    revision=kwargs.get("revision", None),
                    cache_dir=kwargs.get("cache_dir", None),
                    use_auth_token=kwargs.get("use_auth_token", None),
                    token=kwargs.get("token", None),
                )
            ].from_pretrained(model_id, **kwargs)
        elif isinstance(config, PeftConfig):
            config.inference_mode = not is_trainable
        else:
            raise ValueError(f"The input config must be a PeftConfig, got {config.__class__}")

        # Runtime configuration, if supported
        if hasattr(config, "runtime_config"):
            config.runtime_config.ephemeral_gpu_offload = ephemeral_gpu_offload
        else:
            if ephemeral_gpu_offload:
                warnings.warn("Ephemeral GPU offloading is not supported for this model. Ignoring.")

        if hasattr(model, "hf_device_map"):
            weight_map = dict(named_module_tensors(model, recurse=True))

            # recreate the offload_index for disk-offloaded modules: we need to know the location in storage of each weight
            # before the offload hook is removed from the model
            disk_modules = set()
            index = None
            for name, module in model.named_modules():
                if hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "original_devices"):
                    if hasattr(module._hf_hook.weights_map, "dataset"):
                        index = module._hf_hook.weights_map.dataset.index
                    for key in module._hf_hook.original_devices.keys():
                        if module._hf_hook.original_devices[key] == torch.device("meta"):
                            disk_modules.add(str(name) + "." + str(key))

            if disk_modules and not kwargs.get("use_safetensors", True):
                raise ValueError("Disk offloading currently only supported for safetensors")

            if index:
                offload_index = {
                    p: {
                        "safetensors_file": index[p]["safetensors_file"],
                        "weight_name": p,
                        "dtype": str(weight_map[p].dtype).replace("torch.", ""),
                    }
                    for p in weight_map.keys()
                    if p in disk_modules
                }
                kwargs["offload_index"] = offload_index

        if (getattr(model, "hf_device_map", None) is not None) and len(
            set(model.hf_device_map.values()).intersection({"cpu", "disk"})
        ) > 0:
            remove_hook_from_submodules(model)

        if config.is_prompt_learning and is_trainable:
            raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.")
        else:
            config.inference_mode = not is_trainable
        if isinstance(getattr(model, "base_model", None), XLoraModel):
            if not isinstance(config, XLoraConfig):
                raise TypeError(f"Expected 'XLoraConfig', got '{type(config)}' instead.")
            if "adapters" in kwargs:
                config.adapters = kwargs["adapters"]
            else:
                # If the path is on HF hub, then we get the adapter names to create a subfolders list which tells
                # `load_adapter` where the adapters are.
                if not os.path.exists(model_id):
                    s = HfFileSystem()

                    # The names of the adapters which must be in folders
                    adapter_names = [
                        file["name"][len(model_id) + 1 :] for file in s.ls(model_id) if file["type"] == "directory"
                    ]
                    # Prepare a dict of adapter paths, which really just point to the hf id; we will use the subfolders
                    adapter_paths = {}
                    for adapter_name in adapter_names:
                        adapter_paths[adapter_name] = os.path.join(model_id, model_id)
                    config.adapters = adapter_paths
                    config._subfolders = adapter_names
                else:
                    if "adapters" not in kwargs:
                        raise ValueError("If model_id is a local path, then `adapters` must be passed in kwargs.")

        if config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys():
            model = cls(
                model,
                config,
                adapter_name,
                autocast_adapter_dtype=autocast_adapter_dtype,
                low_cpu_mem_usage=low_cpu_mem_usage,
            )
        else:
            model = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[config.task_type](
                model,
                config,
                adapter_name,
                autocast_adapter_dtype=autocast_adapter_dtype,
                low_cpu_mem_usage=low_cpu_mem_usage,
            )

        load_result = model.load_adapter(
            model_id,
            adapter_name,
            is_trainable=is_trainable,
            autocast_adapter_dtype=autocast_adapter_dtype,
            low_cpu_mem_usage=low_cpu_mem_usage,
            **kwargs,
        )

        # 1. Remove VB-LoRA vector bank, since it's a shared parameter set via the VBLoRAModel
        # 2. Remove the prompt encoder, as it does not need to be part of the checkpoint
        missing_keys = [
            k for k in load_result.missing_keys if "vblora_vector_bank" not in k and "prompt_encoder" not in k
        ]
        if missing_keys:
            # Let's warn here since (in contrast to load_adapter) we don't return the load result, so it could be quite
            # difficult for users to even notice that something might have gone wrong here. As we filter out non PEFT
            # keys from the missing keys, this gives no false positives.
            warnings.warn(f"Found missing adapter keys while loading the checkpoint: {missing_keys}")

        return model

    def _setup_prompt_encoder(self, adapter_name: str):
        config = self.peft_config[adapter_name]
        if not hasattr(self, "prompt_encoder"):
            self.prompt_encoder = torch.nn.ModuleDict({})
            self.prompt_tokens = {}
        transformer_backbone = None
        for name, module in self.base_model.named_children():
            for param in module.parameters():
                param.requires_grad = False
            if isinstance(module, PreTrainedModel):
                # Make sure to freeze Tranformers model
                if transformer_backbone is None:
                    transformer_backbone = module
                    self.transformer_backbone_name = name
        if transformer_backbone is None:
            transformer_backbone = self.base_model

        if config.num_transformer_submodules is None:
            config.num_transformer_submodules = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1

        # determine the word embeddings
        word_embeddings = None
        try:
            # First try to find the word embeddings based on the module name, this should work for models like Bert,
            # Roberta, Deberta, etc.
            word_embeddings = self.base_model.get_submodule("embeddings.word_embeddings")
        except AttributeError:
            pass

        if word_embeddings is None:
            # Word embeddings could not be determined. Next try to guess them by checking which parameter has the size
            # of the vocab.
            for named_param, value in list(transformer_backbone.named_parameters()):
                # for ZeRO-3, the tensor is sharded across accelerators and deepspeed modifies it to a tensor with shape
                # [0] the actual unsharded shape is stored in "ds_shape" attribute special handling is needed in case
                # the model is initialized in deepspeed.zero.Init() context or HfDeepSpeedConfig has been called before
                # For reference refer to issue: https://github.com/huggingface/peft/issues/996
                deepspeed_distributed_tensor_shape = getattr(value, "ds_shape", None)

                if value.shape[0] == self.base_model.config.vocab_size or (
                    deepspeed_distributed_tensor_shape is not None
                    and deepspeed_distributed_tensor_shape[0] == self.base_model.config.vocab_size
                ):
                    word_embeddings = transformer_backbone.get_submodule(named_param.replace(".weight", ""))
                    break

        self.word_embeddings = word_embeddings

        if config.peft_type == PeftType.PROMPT_TUNING:
            prompt_encoder = PromptEmbedding(config, self.word_embeddings)
        elif config.peft_type == PeftType.MULTITASK_PROMPT_TUNING:
            prompt_encoder = MultitaskPromptEmbedding(config, self.word_embeddings)
        elif config.peft_type == PeftType.P_TUNING:
            prompt_encoder = PromptEncoder(config)
        elif config.peft_type == PeftType.PREFIX_TUNING:
            # prefix tuning now uses Cache but that won't work with gradient checkpointing
            if any(getattr(module, "gradient_checkpointing", False) for module in self.get_base_model().modules()):
                raise ValueError("Prefix tuning does not work with gradient checkpointing.")
            prompt_encoder = PrefixEncoder(config)
        elif config.peft_type == PeftType.CPT:
            prompt_encoder = CPTEmbedding(config, self.word_embeddings)
        else:
            raise ValueError("Not supported")

        prompt_encoder = prompt_encoder.to(self.device)
        self.prompt_encoder.update(torch.nn.ModuleDict({adapter_name: prompt_encoder}))
        self.prompt_tokens[adapter_name] = torch.arange(
            config.num_virtual_tokens * config.num_transformer_submodules
        ).long()

    def _prepare_model_for_gradient_checkpointing(self, model: PreTrainedModel):
        r"""
        Prepares the model for gradient checkpointing if necessary
        """
        if not (
            getattr(model, "is_loaded_in_8bit", False)
            or getattr(model, "is_loaded_in_4bit", False)
            or getattr(model, "is_quantized", False)
        ):
            if hasattr(model, "enable_input_require_grads"):
                model.enable_input_require_grads()
            elif hasattr(model, "get_input_embeddings"):

                def make_inputs_require_grad(module, input, output):
                    output.requires_grad_(True)

                model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
        return model

    def get_prompt_embedding_to_save(self, adapter_name: str) -> torch.Tensor:
        """
        Returns the prompt embedding to save when saving the model. Only applicable when using a prompt learning
        method.
        """
        prompt_encoder = self.prompt_encoder[adapter_name]
        prompt_tokens = (
            self.prompt_tokens[adapter_name].unsqueeze(0).expand(1, -1).to(prompt_encoder.embedding.weight.device)
        )
        if self.peft_config[adapter_name].peft_type == PeftType.PREFIX_TUNING:
            prompt_tokens = prompt_tokens[:, : self.peft_config[adapter_name].num_virtual_tokens]

        if self.peft_config[adapter_name].peft_type == PeftType.MULTITASK_PROMPT_TUNING:
            prompt_embeddings = super(MultitaskPromptEmbedding, prompt_encoder).forward(prompt_tokens)
        else:
            prompt_embeddings = prompt_encoder(prompt_tokens)

        return prompt_embeddings[0].detach().cpu()

    def get_prompt(self, batch_size: int, task_ids: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Returns the virtual prompts to use for Peft. Only applicable when using a prompt learning method.
        """
        peft_config = self.active_peft_config
        prompt_encoder = self.prompt_encoder[self.active_adapter]
        prompt_tokens = (
            self.prompt_tokens[self.active_adapter]
            .unsqueeze(0)
            .expand(batch_size, -1)
            .to(prompt_encoder.embedding.weight.device)
        )
        if peft_config.peft_type == PeftType.PREFIX_TUNING:
            prompt_tokens = prompt_tokens[:, : peft_config.num_virtual_tokens]
            if peft_config.inference_mode:
                past_key_values = prompt_encoder.embedding.weight.repeat(batch_size, 1, 1)
            else:
                past_key_values = prompt_encoder(prompt_tokens)
            if self.base_model_torch_dtype is not None:
                past_key_values = past_key_values.to(self.base_model_torch_dtype)
            past_key_values = past_key_values.view(
                batch_size,
                peft_config.num_virtual_tokens,
                peft_config.num_layers * 2,
                peft_config.num_attention_heads,
                peft_config.token_dim // peft_config.num_attention_heads,
            )
            if peft_config.num_transformer_submodules == 2:
                past_key_values = torch.cat([past_key_values, past_key_values], dim=2)
            past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split(
                peft_config.num_transformer_submodules * 2
            )
            if TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING.get(self.config.model_type, None) is not None:
                post_process_fn = TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING[self.config.model_type]
                past_key_values = post_process_fn(past_key_values)
            elif peft_config.num_transformer_submodules == 1:
                # Dont' apply this to encoder-decoder models and not to models requiring special processing.
                # local import in case users use a very old transformers version
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            elif peft_config.num_transformer_submodules == 2 and self.base_model._supports_cache_class:
                # Dont' apply this to encoder-decoder models that don't support new Cachc format yet
                # If we don't apply this, prefix-tuning fails to update cross-attn cache
                past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
                past_key_values.cross_attention_cache = DynamicCache()
                past_key_values.is_updated = {
                    layer_idx: False for layer_idx in range(len(past_key_values.cross_attention_cache.key_cache))
                }
            map_cache_to_layer_device_map(self.get_base_model(), past_key_values)  # no-op if not a Cache instance
            return past_key_values
        else:
            if peft_config.peft_type == PeftType.MULTITASK_PROMPT_TUNING:
                prompts = prompt_encoder(prompt_tokens, task_ids)
            else:
                if peft_config.inference_mode:
                    prompts = prompt_encoder.embedding.weight
                else:
                    # Take only one prompt token sample and expand the output instead of expanding the input, see:
                    # https://github.com/huggingface/peft/issues/2043#issuecomment-2321522577
                    prompt_tokens = prompt_tokens[:1]
                    prompts = prompt_encoder(prompt_tokens)
                prompts = prompts.repeat(batch_size, 1, 1)
            return prompts

    def get_nb_trainable_parameters(self) -> tuple[int, int]:
        r"""
        Returns the number of trainable parameters and the number of all parameters in the model.
        """
        trainable_params = 0
        all_param = 0
        for _, param in self.named_parameters():
            num_params = param.numel()
            # if using DS Zero 3 and the weights are initialized empty
            if num_params == 0 and hasattr(param, "ds_numel"):
                num_params = param.ds_numel

            # Due to the design of 4bit linear layers from bitsandbytes
            # one needs to multiply the number of parameters by 2 to get
            # the correct number of parameters
            if param.__class__.__name__ == "Params4bit":
                if hasattr(param, "element_size"):
                    num_bytes = param.element_size()
                elif not hasattr(param, "quant_storage"):
                    num_bytes = 1
                else:
                    num_bytes = param.quant_storage.itemsize
                num_params = num_params * 2 * num_bytes

            all_param += num_params
            if param.requires_grad:
                trainable_params += num_params

        return trainable_params, all_param

    def print_trainable_parameters(self) -> None:
        """
        Prints the number of trainable parameters in the model.

        Note: print_trainable_parameters() uses get_nb_trainable_parameters() which is different from
        num_parameters(only_trainable=True) from huggingface/transformers. get_nb_trainable_parameters() returns
        (trainable parameters, all parameters) of the Peft Model which includes modified backbone transformer model.
        For techniques like LoRA, the backbone transformer model is modified in place with LoRA modules. However, for
        prompt tuning, the backbone transformer model is unmodified. num_parameters(only_trainable=True) returns number
        of trainable parameters of the backbone transformer model which can be different.
        """
        trainable_params, all_param = self.get_nb_trainable_parameters()

        print(
            f"trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param:.4f}"
        )

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            if name == "base_model":  # see #1892: prevent infinite recursion if class is not initialized
                raise
            return getattr(self.base_model, name)

    @contextmanager
    def _enable_peft_forward_hooks(self, *args, **kwargs):
        # If the base model has a method called _enable_peft_forward_hooks, it is invoked as a context. Otherwise, this
        # runs without any changes
        if hasattr(self.base_model, "_enable_peft_forward_hooks"):
            with self.base_model._enable_peft_forward_hooks(*args, **kwargs):
                yield
            return
        else:
            # nothing to enable
            yield
            return

    def forward(self, *args: Any, **kwargs: Any):
        """
        Forward pass of the model.
        """
        with self._enable_peft_forward_hooks(*args, **kwargs):
            kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args}
            return self.get_base_model()(*args, **kwargs)

    def generate(self, *args, **kwargs):
        with self._enable_peft_forward_hooks(*args, **kwargs):
            kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args}
            return self.get_base_model().generate(*args, **kwargs)

    def _get_base_model_class(self, is_prompt_tuning=False):
        """
        Returns the base model class.
        """
        if not is_prompt_tuning:
            return self.base_model.model.__class__
        return self.base_model.__class__

    @contextmanager
    def disable_adapter(self):
        """
        Context manager that disables the adapter module. Use this to run inference on the base model.

        Example:

        ```py
        >>> with model.disable_adapter():
        ...     model(inputs)
        ```
        """
        if self.peft_config[self.active_adapter].is_prompt_learning:
            try:
                # TODO: consider replacing this patching of methods with a more robust mechanism: setting a flag and
                # letting the underlying methods deal with it, same as how LoRA does it.
                old_forward = self.forward
                self.forward = self.base_model.forward
                old_prepare_inputs_for_generation = self.prepare_inputs_for_generation
                self.prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation
                yield
            finally:
                self.forward = old_forward
                self.prepare_inputs_for_generation = old_prepare_inputs_for_generation

        elif self.peft_config[self.active_adapter].is_adaption_prompt:
            try:
                self.base_model.disable_adapter_layers()
                yield
            finally:
                self.base_model.enable_adapter_layers()

        else:  # LoRA, LoHa, etc.
            model_status = self.get_model_status()
            if model_status.enabled == "irregular":
                warnings.warn(
                    "The model contains some adapter layers that are enabled and others that are disabled. "
                    "This is most likely unintentional. After exiting the disable_adapter context, all adapters "
                    "will be enabled"
                )
            try:
                self.base_model.disable_adapter_layers()
                yield
            finally:
                if model_status.enabled is not False:
                    # model_status.enabled is `True` or `"irregular"`
                    self.base_model.enable_adapter_layers()

    def get_base_model(self) -> torch.nn.Module:
        """
        Returns the base model.
        """
        return (
            self.base_model
            if (self.active_peft_config.is_prompt_learning or self.peft_type == PeftType.POLY)
            else self.base_model.model
        )

    def add_adapter(self, adapter_name: str, peft_config: PeftConfig, low_cpu_mem_usage: bool = False) -> None:
        """
        Add an adapter to the model based on the passed configuration.

        This adapter is not trained. To load a trained adapter, check out [`PeftModel.load_adapter`].

        The name for the new adapter should be unique.

        The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active
        adapter.

        Args:
            adapter_name (`str`):
                The name of the adapter to be added.
            peft_config ([`PeftConfig`]):
                The configuration of the adapter to be added.
            low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
                Create empty adapter weights on meta device. Useful to speed up the process when loading saved
                adapters. Don't use this option when creating a new PEFT adapter for training.

        """
        if peft_config.peft_type != self.peft_type:
            raise ValueError(
                f"Cannot combine adapters with different peft types. "
                f"Found {self.peft_type} and {peft_config.peft_type}."
            )

        try:
            if peft_config.is_prompt_learning:
                self.peft_config[adapter_name] = peft_config
                if hasattr(self.config, "to_dict"):
                    dict_config = self.config.to_dict()
                else:
                    dict_config = self.config

                peft_config = _prepare_prompt_learning_config(peft_config, dict_config)
                self._setup_prompt_encoder(adapter_name)
            elif peft_config.is_adaption_prompt:
                self.base_model.add_adapter(adapter_name, peft_config)
            else:
                self.peft_config[adapter_name] = peft_config
                self.base_model.inject_adapter(
                    self.base_model.model, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage
                )
        except Exception:  # something went wrong, roll back
            if adapter_name in self.peft_config:
                del self.peft_config[adapter_name]
            raise

        self.set_additional_trainable_modules(peft_config, adapter_name)

    def set_additional_trainable_modules(self, peft_config, adapter_name):
        if getattr(peft_config, "modules_to_save", None) is not None:
            if self.modules_to_save is None:
                self.modules_to_save = set(peft_config.modules_to_save)
            else:
                self.modules_to_save.update(peft_config.modules_to_save)
            _set_trainable(self, adapter_name)  # this may add a new ModulesToSaveWrapper

    def get_layer_status(self) -> list[TunerLayerStatus]:
        """Get the status of each adapter layer in the model.

        This method returns a list of `TunerLayerStatus` dataclass instances, each of which contains the following
        attributes:

        - `name` (`str`):
           The name of the adapter layer, e.g. `model.encoder.block.0.layer.0.SelfAttention.q`.
        - `module_type` (`str`):
           The type of the adapter layer, e.g. `lora.Linear`.
        - `enabled` (`bool`):
           Whether the adapter layer is enabled.
        - `active_adapters` (`list[str]`):
           The names of the active adapters, if any, e.g. `["default"]`.
        - `merged_adapters` (`list[str]`):
           The names of the merged adapters, if any, e.g. `["default"]`.
        - `available_adapters` (`list[str]`):
           The names of the available adapters, e.g. `["default"]`.

        Args:
            model ([`~PeftModel`]):
                The model to get the adapter layer status from.

        Returns:
            list[`peft.peft_model.TunerLayerStatus`]:
                A list of dataclasses, each containing the status of the corresponding adapter layer.

        """
        return get_layer_status(self)

    def get_model_status(self) -> TunerModelStatus:
        """Get the status of tuners of the model.

        This method returns a `TunerModelStatus` dataclass instance, which contains the following attributes:

        - `base_model_type` (`str`):
           The type of the base model, e.g. `T5Model`.
        - `adapter_model_type` (`str`):
           The type of the adapter model, e.g. `LoraModel`.
        - `peft_types` (`dict[str, str]`):
           The mapping of adapter name to adapter type, e.g. `{"default": "LORA"}`.
        - `trainable_params` (`int`):
           The number of trainable parameters in the model.
        - `total_params` (`int`):
           The total number of parameters in the model.
        - `num_adapter_layers` (`int`):
           The number of adapter layers in the model.
        - `enabled` (`bool`, `Literal["irregular"]`):
           Whether all adapter layers are enabled. If some are enabled and some are not, this will be `"irregular"`.
           This means that your model is in an inconsistent state and might not work as expected.
        - `active_adapters` (`list[str]`, `Literal["irregular"]`):
           The names of the active adapters. If the active adapters are not consistent across all layers, this will be
           `"irregular"`, which means that your model is in an inconsistent state and might not work as expected.
        - `merged_adapters` (`list[str]`, `Literal["irregular"]`):
           The names of the merged adapters. If the merged adapters are not consistent across all layers, this will be
           `"irregular"`, which means that your model is in an inconsistent state and might not work as expected.
        - `available_adapters` (`list[str]`):
           The names of the available adapters, e.g. `["default"]`.

        Args:
            model ([`~PeftModel`]):
                The model to get the adapter layer status from.

        Returns:
            `peft.peft_model.TunerModelStatus`:
                A dataclass containing the status of the model.

        """
        return get_model_status(self)

    @classmethod
    def _split_kwargs(cls, kwargs: dict[str, Any]):
        _kwargs_not_in_hf_hub_download_signature = ("use_auth_token",)
        hf_hub_download_kwargs = {}
        other_kwargs = {}

        for key, value in kwargs.items():
            if key in inspect.signature(hf_hub_download).parameters or key in _kwargs_not_in_hf_hub_download_signature:
                hf_hub_download_kwargs[key] = value
            else:
                other_kwargs[key] = value

        return hf_hub_download_kwargs, other_kwargs

    def _update_offload(self, offload_index: dict[str, dict[str, str]], adapters_weights: dict[str, torch.tensor]):
        """
        Update the offload_index and safetensors files for loading and mergine PeftModels with disk-offloaded modules.

        Args:
            offload_index (Dict[str: str]):
                Dictionary of disk-offloaded modules with their metadata and safetensors filenames
            adapters_weights (Dict[str: torch.tensor]):
                Dictionary of Peft adapter module names and weights
        """

        if not offload_index:
            return offload_index

        prefix = "base_model.model."
        # rename offload index weight and model names
        adapter_names = list(self.peft_config.keys())
        for adapter_name in adapter_names:
            keys = list(offload_index.keys())
            block_id = keys[0].split(".")[0] + "."  # for writing safetensors key,

            # replace original offload index keys with PeftModel keys
            for key in keys:
                suffix_pos = key.rfind(".")
                extended_prefix = prefix + key[:suffix_pos]
                module = dict(self.named_modules())[extended_prefix]
                if isinstance(module, BaseTunerLayer):
                    new_key = prefix + key[:suffix_pos] + ".base_layer" + key[suffix_pos:]
                else:
                    new_key = prefix + key
                offload_index[key]["weight_name"] = new_key
                offload_index[new_key] = offload_index[key]
                del offload_index[key]

            files_seen = set()
            # rename safetensors for dispatch
            for new_key in list(offload_index.keys()):
                fname = offload_index[new_key]["safetensors_file"]

                # make a new file name
                new_fname_list = list(fname.split(os.sep))
                for i, name in enumerate(new_fname_list):
                    if "--" in name:
                        new_fname_list[i] += "-peft"
                        break
                new_fname = os.path.join(*new_fname_list)

                if fname in files_seen:
                    continue
                safe_dict = {}
                with safe_open(fname, framework="pt") as f:
                    for safe_key in f.keys():
                        safe_tensor = f.get_tensor(safe_key)
                        metadata = f.metadata()
                        suffix_pos = safe_key.rfind(".")
                        extended_prefix = prefix + block_id + safe_key[:suffix_pos]
                        safe_module = dict(self.named_modules())[extended_prefix]
                        if isinstance(safe_module, BaseTunerLayer):
                            final_key = extended_prefix + ".base_layer" + safe_key[suffix_pos:]
                            lora_dict = {key: val for key, val in adapters_weights.items() if extended_prefix in key}

                            # add LoRA keys and values to disk offload
                            for lora_key, lora_val in lora_dict.items():
                                divide = lora_key.rfind(".")
                                new_key = lora_key[:divide] + f".{adapter_name}" + lora_key[divide:]
                                safe_dict[new_key] = lora_val
                        else:
                            final_key = prefix + block_id + safe_key
                        safe_dict[final_key] = safe_tensor
                    files_seen.add(new_fname)

                    # avoid overwriting original safetensors
                    for key in safe_dict.keys():
                        offload_index[key] = {"safetensors_file": new_fname, "weight_name": key}

                    base_name = os.path.dirname(new_fname)
                    if not os.path.exists(base_name):
                        os.makedirs(base_name)
                    safe_save_file(safe_dict, new_fname, metadata=metadata)

    def _check_new_adapter_config(self, peft_config: PeftConfig, is_trainable: bool) -> None:
        """Perform checks on newly added PEFT configs to ensure integrity."""
        if peft_config.is_prompt_learning and is_trainable:
            raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.")

        # Since PiSSA/OLoRA modifies the base weights, it should not be combined with other adapters.
        all_configs = [peft_config] + list(self.peft_config.values())
        if len(all_configs) > 1:
            if any(getattr(config, "init_lora_weights", None) == "pissa" for config in all_configs):
                msg = (
                    "PiSSA changes the base weights of the model and should thus not be used with other adapters. "
                    "Consider converting the PiSSA adapter into a normal LoRA adapter: "
                    "https://github.com/huggingface/peft/tree/main/examples/pissa_finetuning#convert-pissa-to-lora"
                )
                warnings.warn(msg)
            elif any(getattr(config, "init_lora_weights", None) == "olora" for config in all_configs):
                msg = (
                    "OLoRA changes the base weights of the model and should thus not be used with other adapters. "
                    "Consider converting the OLoRA adapter into a normal LoRA adapter: "
                    "https://github.com/huggingface/peft/tree/main/examples/olora_finetuning#olora-and-lora"
                )
                warnings.warn(msg)

    def load_adapter(
        self,
        model_id: Union[str, os.PathLike],
        adapter_name: str,
        is_trainable: bool = False,
        torch_device: Optional[str] = None,
        autocast_adapter_dtype: bool = True,
        ephemeral_gpu_offload: bool = False,
        low_cpu_mem_usage: bool = False,
        **kwargs: Any,
    ):
        """
        Load a trained adapter into the model.

        The name for the new adapter should be unique.

        The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active
        adapter.

        Args:
            model_id (`str` or `os.PathLike`):
                The name of the PEFT configuration to use. Can be either:
                    - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face
                      Hub.
                    - A path to a directory containing a PEFT configuration file saved using the `save_pretrained`
                      method (`./my_peft_config_directory/`).
            adapter_name (`str`):
                The name of the adapter to be added.
            is_trainable (`bool`, *optional*, defaults to `False`):
                Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be
                used for inference.
            torch_device (`str`, *optional*, defaults to None):
                The device to load the adapter on. If `None`, the device will be inferred.
            autocast_adapter_dtype (`bool`, *optional*, defaults to `True`):
                Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter
                weights using float16 and bfloat16 to float32, as this is typically required for stable training, and
                only affect select PEFT tuners.
            ephemeral_gpu_offload (`bool`, *optional*, defaults to `False`):
                Whether to use ephemeral GPU offloading for partially loaded modules. Defaults to `False`.
            low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
                Create empty adapter weights on meta device before loading the saved weights. Useful to speed up the
                process.
            kwargs: (`optional`):
                Additional arguments to modify the way the adapter is loaded, e.g. the token for Hugging Face Hub.
        """
        from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING

        hf_hub_download_kwargs, kwargs = self._split_kwargs(kwargs)
        if torch_device is None:
            torch_device = infer_device()

        if adapter_name not in self.peft_config:
            # load the config
            peft_config = PEFT_TYPE_TO_CONFIG_MAPPING[
                PeftConfig._get_peft_type(
                    model_id,
                    **hf_hub_download_kwargs,
                )
            ].from_pretrained(
                model_id,
                ephemeral_gpu_offload=ephemeral_gpu_offload,
                **hf_hub_download_kwargs,
            )
            self._check_new_adapter_config(peft_config, is_trainable=is_trainable)
            peft_config.inference_mode = not is_trainable
            self.add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage)

        adapters_weights = load_peft_weights(model_id, device=torch_device, **hf_hub_download_kwargs)

        # load the weights into the model
        ignore_mismatched_sizes = kwargs.get("ignore_mismatched_sizes", False)
        load_result = set_peft_model_state_dict(
            self,
            adapters_weights,
            adapter_name=adapter_name,
            ignore_mismatched_sizes=ignore_mismatched_sizes,
            low_cpu_mem_usage=low_cpu_mem_usage,
        )

        tuner = self.peft_config[adapter_name].peft_type
        tuner_prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(tuner, "")
        adapter_missing_keys = []

        # Filter missing keys specific to the current adapter and tuner prefix.
        for key in load_result.missing_keys:
            if tuner_prefix in key and adapter_name in key:
                adapter_missing_keys.append(key)

        load_result.missing_keys.clear()
        load_result.missing_keys.extend(adapter_missing_keys)

        if (
            (getattr(self, "hf_device_map", None) is not None)
            and (len(set(self.hf_device_map.values()).intersection({"cpu", "disk"})) > 0)
            and len(self.peft_config) == 1
        ):
            device_map = kwargs.get("device_map", "auto")
            max_memory = kwargs.get("max_memory", None)
            offload_dir = kwargs.get("offload_folder", None)
            offload_index = kwargs.get("offload_index", None)

            dispatch_model_kwargs = {}
            # Safety checker for previous `accelerate` versions
            # `offload_index` was introduced in https://github.com/huggingface/accelerate/pull/873/
            if "offload_index" in inspect.signature(dispatch_model).parameters:
                dispatch_model_kwargs["offload_index"] = offload_index

            no_split_module_classes = self._no_split_modules

            if device_map != "sequential":
                max_memory = get_balanced_memory(
                    self,
                    max_memory=max_memory,
                    no_split_module_classes=no_split_module_classes,
                    low_zero=(device_map == "balanced_low_0"),
                )

            if isinstance(device_map, str):
                device_map = infer_auto_device_map(
                    self, max_memory=max_memory, no_split_module_classes=no_split_module_classes
                )

            self._update_offload(offload_index, adapters_weights)
            dispatch_model_kwargs["offload_index"] = offload_index

            dispatch_model(
                self,
                device_map=device_map,
                offload_dir=offload_dir,
                **dispatch_model_kwargs,
            )

            hook = AlignDevicesHook(io_same_device=True)
            if self.peft_config[adapter_name].is_prompt_learning:
                remove_hook_from_submodules(self.prompt_encoder)
            add_hook_to_module(self.get_base_model(), hook)

        if hasattr(self.base_model, "_cast_adapter_dtype"):
            self.base_model._cast_adapter_dtype(
                adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype
            )

        # Set model in evaluation mode to deactivate Dropout modules by default
        if not is_trainable:
            self.eval()
        return load_result

    def set_adapter(self, adapter_name: str) -> None:
        """
        Sets the active adapter.

        Only one adapter can be active at a time.

        Additionally, this function will set the specified adapter to trainable (i.e., requires_grad=True). If this is
        not desired, use the following code.

        ```py
        >>> for name, param in model_peft.named_parameters():
        ...     if ...:  # some check on name (ex. if 'lora' in name)
        ...         param.requires_grad = False
        ```

        Args:
            adapter_name (`str`):
                The name of the adapter to be set as active. The adapter must be loaded first.
        """
        if adapter_name not in self.peft_config:
            raise ValueError(f"Adapter {adapter_name} not found.")
        self.active_adapter = adapter_name
        if not self.peft_config[adapter_name].is_prompt_learning:
            self.base_model.set_adapter(adapter_name)
        _set_adapter(self, adapter_name)

    @property
    def base_model_torch_dtype(self):
        return getattr(self.base_model, "dtype", None)

    @property
    def active_peft_config(self):
        return self.peft_config[self.active_adapter]

    def create_or_update_model_card(self, output_dir: str):
        """
        Updates or create model card to include information about peft:
        1. Adds `peft` library tag
        2. Adds peft version
        3. Adds base model info
        4. Adds quantization information if it was used
        """

        filename = os.path.join(output_dir, "README.md")

        card = ModelCard.load(filename) if os.path.exists(filename) else ModelCard.from_template(ModelCardData())

        card.data["library_name"] = "peft"

        model_config = getattr(self, "config", DUMMY_MODEL_CONFIG)
        if hasattr(model_config, "to_dict"):
            model_config = model_config.to_dict()
            
        model_config = None if model_config == DUMMY_MODEL_CONFIG else model_config
        if model_config is not None and "_name_or_path" in model_config:
            card.data["base_model"] = model_config["_name_or_path"]

        lines = card.text.splitlines()

        quantization_config = None
        if hasattr(model_config, "quantization_config"):
            quantization_config = self.config.quantization_config.to_dict()
        training_config_text = ""
        quantization_prefix = "The following `bitsandbytes` quantization config was used during training:"
        # Adds quantization information if it was used
        if quantization_config is not None:
            training_config_text += f"\n{quantization_prefix}\n"
            training_config_text += "\n".join([f"- {name}: {value}" for name, value in quantization_config.items()])
            training_config_text += "\n"

        training_procedure_heading = "## Training procedure"
        if quantization_prefix not in lines and bool(training_config_text):
            if training_procedure_heading in lines:
                lines.insert(lines.index(training_procedure_heading) + 2, training_config_text)
            else:
                lines.append(f"{training_procedure_heading}\n{training_config_text}")

        # Adds peft version
        framework_block_heading = "### Framework versions"
        if f"- PEFT {__version__}" not in lines:
            if framework_block_heading in lines:
                lines.insert(lines.index(framework_block_heading) + 2, f"- PEFT {__version__}")
            else:
                lines.append(f"{framework_block_heading}\n\n- PEFT {__version__}")

        card.text = "\n".join(lines)
        card.save(filename)

class PeftModelForCausalLM(PeftModel):
    """
    Peft model for causal language modeling.

    Args:
        model ([`~transformers.PreTrainedModel`]): Base transformer model.
        peft_config ([`PeftConfig`]): Peft config.
        adapter_name (`str`,  *optional*): The name of the adapter, defaults to `"default"`.
        autocast_adapter_dtype (`bool`, *optional*):
            Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
            using float16 and bfloat16 to float32, as this is typically required for stable training, and only affect
            select PEFT tuners.

    Example:

        ```py
        >>> from transformers import AutoModelForCausalLM
        >>> from peft import PeftModelForCausalLM, get_peft_config

        >>> config = {
        ...     "peft_type": "PREFIX_TUNING",
        ...     "task_type": "CAUSAL_LM",
        ...     "inference_mode": False,
        ...     "num_virtual_tokens": 20,
        ...     "token_dim": 1280,
        ...     "num_transformer_submodules": 1,
        ...     "num_attention_heads": 20,
        ...     "num_layers": 36,
        ...     "encoder_hidden_size": 1280,
        ...     "prefix_projection": False,
        ...     "postprocess_past_key_value_function": None,
        ... }

        >>> peft_config = get_peft_config(config)
        >>> model = AutoModelForCausalLM.from_pretrained("gpt2-large")
        >>> peft_model = PeftModelForCausalLM(model, peft_config)
        >>> peft_model.print_trainable_parameters()
        trainable params: 1843200 || all params: 775873280 || trainable%: 0.23756456724479544
        ```
    """

    def __init__(
        self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs
    ) -> None:
        super().__init__(model, peft_config, adapter_name, **kwargs)
        self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        task_ids=None,
        **kwargs,
    ):
        peft_config = self.active_peft_config
        if not peft_config.is_prompt_learning:
            if self.base_model.config.model_type == "mpt":
                if inputs_embeds is not None:
                    raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds")
                return self.base_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                    output_attentions=output_attentions,
                    output_hidden_states=output_hidden_states,
                    return_dict=return_dict,
                    **kwargs,
                )

            if peft_config.peft_type == PeftType.POLY:
                kwargs["task_ids"] = task_ids

            with self._enable_peft_forward_hooks(**kwargs):
                kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args}
                kwargs.pop("num_items_in_batch", None)
                if isinstance(self.base_model, LoraModel):
                    return self.base_model.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        inputs_embeds=inputs_embeds,
                        labels=labels,
                        output_attentions=output_attentions,
                        output_hidden_states=output_hidden_states,
                        return_dict=return_dict,
                        **kwargs,
                    )
                return self.base_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    inputs_embeds=inputs_embeds,
                    labels=labels,
                    output_attentions=output_attentions,
                    output_hidden_states=output_hidden_states,
                    return_dict=return_dict,
                    **kwargs,
                )

        batch_size = _get_batch_size(input_ids, inputs_embeds)
        if attention_mask is not None:
            # concat prompt attention mask
            prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device)
            attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1)

        if kwargs.get("position_ids", None) is not None:
            warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.")
            kwargs["position_ids"] = None
        if kwargs.get("token_type_ids", None) is not None:
            warnings.warn("Token type ids are not supported for parameter efficient tuning. Ignoring token type ids")
            kwargs["token_type_ids"] = None
        kwargs.update(
            {
                "attention_mask": attention_mask,
                "labels": labels,
                "output_attentions": output_attentions,
                "output_hidden_states": output_hidden_states,
                "return_dict": return_dict,
            }
        )

        if peft_config.peft_type == PeftType.PREFIX_TUNING:
            # overwrite past_kv in kwargs
            kwargs["past_key_values"] = self.get_prompt(batch_size)
            return self.base_model(input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs)
        elif peft_config.peft_type == PeftType.CPT:
            return self._cpt_forward(input_ids, inputs_embeds, peft_config, task_ids, batch_size, **kwargs)
        else:
            if inputs_embeds is None:
                inputs_embeds = self.word_embeddings(input_ids)
            # concat prompt labels
            if labels is not None:
                prefix_labels = torch.full((batch_size, peft_config.num_virtual_tokens), -100).to(labels.device)
                kwargs["labels"] = torch.cat((prefix_labels, labels), dim=1)
            prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids)
            prompts = prompts.to(inputs_embeds.dtype)
            inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1)
            return self.base_model(inputs_embeds=inputs_embeds, **kwargs)

    def _cpt_forward(
        self, input_ids=None, inputs_embeds=None, peft_config=None, task_ids=None, batch_size=None, **kwargs
    ):
        # Extract labels from kwargs
        labels = kwargs.pop("labels")
        device = [i.device for i in [input_ids, inputs_embeds, labels] if i is not None][0]
        # Extract input_type_mask from kwargs and move it to the same device as labels
        if "input_type_mask" in kwargs.keys():
            input_type_mask = kwargs.pop("input_type_mask").to(device)
        else:
            if input_ids is None:
                N_tokens = inputs_embeds.shape[1]
            else:
                N_tokens = input_ids.shape[1]
            input_type_mask = torch.ones((batch_size, N_tokens)).to(device) * 4

        cpt_token_ids = peft_config.cpt_token_ids
        cpt_tokens_type_mask = peft_config.cpt_tokens_type_mask

        # Generate embeddings if not provided
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        # Get prompt and concatenate with input embeddings
        prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids)
        prompts = prompts.to(inputs_embeds.dtype)
        inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1)
        # If labels are provided, generate prefix labels and type mask
        cpt_labels = None
        if labels is not None:
            # Generate prefix labels and concatenate with the input labels
            prefix_labels = torch.Tensor(cpt_token_ids).long().view(1, -1)
            prefix_labels = prefix_labels.repeat(batch_size, 1).to(labels.device)
            cpt_labels = torch.cat((prefix_labels, labels), dim=1)
            # Generate prefix type mask and shift input type mask values to avoid conflicts
            prefix_type_mask = torch.Tensor(cpt_tokens_type_mask).long().view(1, -1)
            prefix_type_mask = prefix_type_mask.repeat(batch_size, 1).to(labels.device)
            adjusted_input_type_mask = input_type_mask
            adjusted_input_type_mask[adjusted_input_type_mask > 0] += prefix_type_mask.max()
            # Concatenate prefix and shifted input type masks
            cpt_type_mask = torch.cat((prefix_type_mask, adjusted_input_type_mask), dim=1)
            # Identify valid label positions and mask invalid ones with -100
            labels_idx = (cpt_type_mask > 0) & (cpt_type_mask % 4 == 0)
            cpt_labels[~labels_idx] = -100
            # Update kwargs with the modified labels

        kwargs["labels"] = cpt_labels
        # Pass the modified inputs to the base model
        base_model_output = self.base_model(inputs_embeds=inputs_embeds, **kwargs)
        if labels is None:
            return base_model_output
        else:
            # Calculate the loss using the custom CPT loss function
            base_model_output = CPTEmbedding.calculate_loss(
                base_model_output, cpt_labels, cpt_type_mask, self.peft_config["default"]
            )
            return base_model_output

    def generate(self, *args, **kwargs):
        peft_config = self.active_peft_config
        self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation
        if hasattr(self.base_model, "model"):
            self.base_model.model.generation_config = self.generation_config
        else:
            self.base_model.generation_config = self.generation_config
        try:
            if not peft_config.is_prompt_learning:
                with self._enable_peft_forward_hooks(*args, **kwargs):
                    kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args}
                    outputs = self.base_model.generate(*args, **kwargs)
            else:
                outputs = self.base_model.generate(**kwargs)
        except:
            self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation
            raise
        else:
            self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation
            return outputs

    def prepare_inputs_for_generation(self, *args, task_ids: Optional[torch.Tensor] = None, **kwargs):
        peft_config = self.active_peft_config
        model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs)

        # https://github.com/huggingface/transformers/pull/26681/ introduced new cache format
        # for some architectures which requires a special fix for prompt tuning etc.
        # TODO: starting with transformers 4.38, all architectures should support caching.
        uses_transformers_4_38 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.38.0")
        uses_transformers_4_36 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.36.0")
        transformers_new_cache_archs = ["llama", "mistral", "persimmon", "phi"]
        if packaging.version.parse(transformers.__version__) > packaging.version.parse("4.43.3"):
            # https://github.com/huggingface/transformers/pull/31445
            transformers_new_cache_archs.append("bloom")

        uses_cache = uses_transformers_4_38 or (
            uses_transformers_4_36 and self.base_model.config.model_type in transformers_new_cache_archs
        )

        if peft_config.peft_type == PeftType.POLY:
            model_kwargs["task_ids"] = task_ids
        if peft_config.is_prompt_learning:
            if uses_cache and (model_kwargs.get("past_key_values", None) is not None):
                # change in the logic of `prepare_inputs_for_generation` makes the below code necessary
                # In prompt learning methods, past key values are longer when compared to the `input_ids`.
                # As such only consider the last input ids in the autogressive generation phase.
                past_key_values = model_kwargs["past_key_values"]
                if isinstance(past_key_values, (tuple, list)):
                    seq_len = past_key_values[0][0].shape[-2]
                else:  # using transformers kv cache
                    seq_len = past_key_values.get_seq_length()
                if seq_len >= model_kwargs["input_ids"].shape[1]:
                    model_kwargs["input_ids"] = model_kwargs["input_ids"][:, -1:]

            if model_kwargs.get("attention_mask", None) is not None:
                size = model_kwargs["input_ids"].shape[0], peft_config.num_virtual_tokens
                prefix_attention_mask = torch.ones(size).to(model_kwargs["input_ids"].device)
                model_kwargs["attention_mask"] = torch.cat(
                    (prefix_attention_mask, model_kwargs["attention_mask"]), dim=1
                )

            if model_kwargs.get("position_ids", None) is not None:
                warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.")
                model_kwargs["position_ids"] = None

            if kwargs.get("token_type_ids", None) is not None:
                warnings.warn(
                    "Token type ids are not supported for parameter efficient tuning. Ignoring token type ids"
                )
                kwargs["token_type_ids"] = None

            # no past_key_values or past_key_values empty cache
            requires_prompt_injection = (model_kwargs.get("past_key_values", None) is None) or (
                isinstance(model_kwargs["past_key_values"], transformers.Cache)
                and not model_kwargs["past_key_values"].get_seq_length()
            )

            if requires_prompt_injection and peft_config.peft_type == PeftType.PREFIX_TUNING:
                new_past_key_values = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0])
                model_kwargs["past_key_values"] = new_past_key_values
            elif requires_prompt_injection:
                inputs_embeds = self.word_embeddings(model_kwargs["input_ids"])
                prompts = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0], task_ids=task_ids)
                prompts = prompts.to(inputs_embeds.dtype)
                model_kwargs["inputs_embeds"] = torch.cat((prompts, inputs_embeds), dim=1)
                model_kwargs["input_ids"] = None

        # For transformers>=4.38.0 - for some architectures such as Llama, `cache_position` is
        # passed in the forward pass to keep track of the position ids of the cache. We have to
        # pop that from `model_kwargs` as `cache_position` is properly created by the model, using the passed
        # `inputs_embeds`: https://github.com/huggingface/transformers/blob/593230f0a1150ea9c0477b9d859f25daf73c8c33/src/transformers/models/llama/modeling_llama.py#L956
        _ = model_kwargs.pop("cache_position", None)

        return model_kwargs

@dataclass
class TunerLayerStatus:
    name: str
    module_type: str
    enabled: bool
    active_adapters: list[str]
    merged_adapters: list[str]
    requires_grad: dict[str, bool | Literal["irregular"]]
    available_adapters: list[str]
    devices: dict[str, list[str]]


def get_layer_status(model: torch.nn.Module) -> list[TunerLayerStatus]:
    """Get the status of each adapter layer in the model.

    This function returns a list of `TunerLayerStatus` dataclass instances, each of which contains the following
    attributes:

    - `name` (`str`):
       The name of the adapter layer, e.g. `model.encoder.block.0.layer.0.SelfAttention.q`.
    - `module_type` (`str`):
       The type of the adapter layer, e.g. `lora.Linear`.
    - `enabled` (`bool`):
       Whether the adapter layer is enabled.
    - `active_adapters` (`list[str]`):
       The names of the active adapters, if any, e.g. `["default"]`.
    - `merged_adapters` (`list[str]`):
       The names of the merged adapters, if any, e.g. `["default"]`.
    - requires_grad : dict[str, bool | Literal["irregular"]]
       The requires_grad status of the parameters for each adapter module. Ideally, it should be either `True` or
       `False`. If the requires_grad status is not consistent across all parameters, the value will be set to
       `"irregular"`.
    - `available_adapters` (`list[str]`):
       The names of the available adapters, e.g. `["default"]`.
    - `devices` (`dict[str, list[str]]`):
       The devices where the parameters of the given adapter are stored, e.g. `["cuda"]`.

    Args:
        model ([Union[`~PeftModel`, `~transformers.PreTrainedModel`, `nn.Module`]]):
            The model to get the adapter layer status from.

    Returns:
        list[`peft.peft_model.TunerLayerStatus`]:
            A list of dataclasses, each containing the status of the corresponding adapter layer.

    """
    if isinstance(model, PeftModel):
        base_model = model.base_model
    else:
        base_model = model

    layer_status: list[TunerLayerStatus] = []
    for name, module in base_model.named_modules():
        if not isinstance(module, BaseTunerLayer):
            continue

        # determine if all submodules/parameters if this module require grad or not
        mapping_requires_grad_list: dict[str, list[bool]] = collections.defaultdict(list)
        for adapter_module_name in module.adapter_layer_names:
            adapter_module = getattr(module, adapter_module_name)
            if isinstance(adapter_module, torch.nn.ModuleDict):
                for key, submodule in adapter_module.items():
                    for param in submodule.parameters():
                        mapping_requires_grad_list[key].append(param.requires_grad)
            elif isinstance(adapter_module, torch.nn.ParameterDict):
                for key, param in adapter_module.items():
                    mapping_requires_grad_list[key].append(param.requires_grad)
            else:
                # strange, we don't know how to handle this, ignore for now
                pass

        def check_irrgular(vals: list[bool]) -> bool | Literal["irregular"]:
            if all(vals):
                return True
            if not any(vals):
                return False
            return "irregular"

        requires_grad = {key: check_irrgular(vals) for key, vals in mapping_requires_grad_list.items()}

        devices_dd = collections.defaultdict(list)
        for adapter_module_name in module.adapter_layer_names + module.other_param_names:
            adapter_module = getattr(module, adapter_module_name)
            if isinstance(adapter_module, torch.nn.ModuleDict):
                for key, submodule in adapter_module.items():
                    devices_dd[key].extend([param.device.type for param in submodule.parameters()])
            elif isinstance(adapter_module, torch.nn.ParameterDict) or (
                adapter_module.__class__.__name__ == "BufferDict"
            ):  # VeRA
                for key, param in adapter_module.items():
                    devices_dd[key].append(param.device.type)
        devices = {key: sorted(set(val)) for key, val in devices_dd.items()}

        status = TunerLayerStatus(
            name=name,
            module_type=repr(module).partition("(")[0],
            enabled=not module.disable_adapters,
            active_adapters=module.active_adapters,
            merged_adapters=module.merged_adapters,
            requires_grad=requires_grad,
            available_adapters=sorted(module._get_available_adapters()),
            devices=devices,
        )
        layer_status.append(status)

    if not layer_status:
        raise ValueError(
            "No adapter layers found in the model, please ensure that it's a PEFT model or that you have PEFT adapters "
            "injected in the model."
        )

    return layer_status


@dataclass
class TunerModelStatus:
    base_model_type: str
    adapter_model_type: str
    peft_types: dict[str, str]
    trainable_params: int
    total_params: int
    num_adapter_layers: int
    enabled: bool | Literal["irregular"]
    active_adapters: list[str] | Literal["irregular"]
    merged_adapters: list[str] | Literal["irregular"]
    requires_grad: dict[str, bool | Literal["irregular"]]
    available_adapters: list[str]
    devices: dict[str, list[str]]


def get_model_status(model: torch.nn.Module) -> TunerModelStatus:
    """Get the status of tuners of the model.

    This function returns a `TunerModelStatus` dataclass instance, which contains the following attributes:

    - `base_model_type` (`str`):
       The type of the base model, e.g. `T5Model`.
    - `adapter_model_type` (`str`):
       The type of the adapter model, e.g. `LoraModel`.
    - `peft_types` (`dict[str, str]`):
       The mapping of adapter name to adapter type, e.g. `{"default": "LORA"}`.
    - `trainable_params` (`int`):
       The number of trainable parameters in the model.
    - `total_params` (`int`):
       The total number of parameters in the model.
    - `num_adapter_layers` (`int`):
       The number of adapter layers in the model.
    - `enabled` (`bool`, `Literal["irregular"]`):
       Whether all adapter layers are enabled. If some are enabled and some are not, this will be `"irregular"`. This
       means that your model is in an inconsistent state and might not work as expected.
    - `active_adapters` (`list[str]`, `Literal["irregular"]`):
       The names of the active adapters. If the active adapters are not consistent across all layers, this will be
       `"irregular"`, which means that your model is in an inconsistent state and might not work as expected.
    - `merged_adapters` (`list[str]`, `Literal["irregular"]`):
       The names of the merged adapters. If the merged adapters are not consistent across all layers, this will be
       `"irregular"`, which means that your model is in an inconsistent state and might not work as expected.
    - `requires_grad` (`dict[str, bool | Literal["irregular"]]`):
       Whether for the given adapter, all adapter layers have `requires_grad` set to `True` or `False`. If there is a
       mix, this will be set to `"irregular"`, which means that your model is in an inconsistent state and might not
       work as expected.
    - `available_adapters` (`list[str]`):
       The names of the available adapters, e.g. `["default"]`.
    - `devices` (`dict[str, list[str]]`):
       The devices where the parameters of the given adapter are stored, e.g. `["cuda"]`.

    Args:
        model ([Union[`~PeftModel`, `~transformers.PreTrainedModel`, `nn.Module`]]):
            The model to get the adapter layer status from.

    Returns:
        `peft.peft_model.TunerModelStatus`:
            A dataclass containing the status of the model.

    """
    if isinstance(model, PeftModel):
        base_model_type = model.get_base_model().__class__.__name__
        trainable_params, total_params = model.get_nb_trainable_parameters()
        base_model = model.base_model
        peft_types = {key: str(config.peft_type).partition(".")[-1] for key, config in base_model.peft_config.items()}
        adapter_model_type = base_model.__class__.__name__
    elif isinstance(model, PreTrainedModel):
        base_model_type = model.__class__.__name__
        trainable_params, total_params = PeftModel.get_nb_trainable_parameters(model)
        base_model = model
        peft_types = {}
        adapter_model_type = "None"
    else:
        base_model_type = "other"
        trainable_params, total_params = PeftModel.get_nb_trainable_parameters(model)
        base_model = model
        peft_types = {}
        adapter_model_type = "None"

    layer_status = get_layer_status(model)
    num_adapter_layers = len(layer_status)

    enabled_set: set[bool] = {status.enabled for status in layer_status}  # must be {True}, {False}, or {True, False}
    enabled: bool | Literal["irregular"]
    if len(enabled_set) == 1:
        enabled = enabled_set.pop()
    else:
        enabled = "irregular"

    available_adapters: list[str] = sorted(set().union(*(status.available_adapters for status in layer_status)))

    # ideally, active adapters should be consistent across all layers of the model, but we cannot guarantee it
    all_active_adapters: set[tuple[str, ...]] = {tuple(status.active_adapters) for status in layer_status}
    active_adapters: list[str] | Literal["irregular"]
    if not all_active_adapters:
        active_adapters = []
    elif len(all_active_adapters) == 1:
        active_adapters = list(all_active_adapters.pop())
    else:
        active_adapters = "irregular"

    # Here we determine what adapters are merged. This is not trivial because multiple adapters can be merged or not at
    # the same time. Some layers may only have adapter A, some only adapter B, so it's not as easy as just checking
    # which adapters are merged on each layer.

    # First, determine all adapters that are merged on at least on module.
    merged_all: set[str] = set()
    for status in layer_status:
        merged_all.update(status.merged_adapters)

    # Next, check if on any layer, on of these adapters is not merged.
    merged_adapters: list[str] | Literal["irregular"] = sorted(merged_all)
    for status in layer_status:
        unmerged = set(status.available_adapters) - set(status.merged_adapters)
        if unmerged & merged_all:
            # there is overlap between unmerged adapters and adapters that should be merged
            merged_adapters = "irregular"
            break

    # check status of requires_grad
    # first, merge the values for all layers
    requires_grad_all: dict[str, list[bool | Literal["irregular"]]] = collections.defaultdict(list)
    for status in layer_status:
        for key, val in status.requires_grad.items():
            requires_grad_all[key].append(val)

    # then, check if the values are consistent
    def check_irrgular(vals: list[bool | Literal["irregular"]]) -> bool | Literal["irregular"]:
        if all(val is True for val in vals):
            return True
        if all(val is False for val in vals):
            return False
        return "irregular"

    requires_grad = {key: check_irrgular(vals) for key, vals in requires_grad_all.items()}

    devices_dd = collections.defaultdict(list)
    for status in layer_status:
        for key, val in status.devices.items():
            devices_dd[key].extend(val)
    devices = {key: sorted(set(val)) for key, val in devices_dd.items()}

    adapter_model_status = TunerModelStatus(
        base_model_type=base_model_type,
        adapter_model_type=adapter_model_type,
        peft_types=peft_types,
        trainable_params=trainable_params,
        total_params=total_params,
        num_adapter_layers=num_adapter_layers,
        enabled=enabled,
        active_adapters=active_adapters,
        merged_adapters=merged_adapters,
        requires_grad=requires_grad,
        available_adapters=available_adapters,
        devices=devices,
    )
    return adapter_model_status


================================================
FILE: kt-sft/ktransformers/sft/torchviz_test.py
================================================
import torch
import torch.nn as nn
from torchviz import make_dot

class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(10, 20)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(20, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

model = SimpleNet()

input_tensor = torch.randn(1, 10)

output = model(input_tensor)

dot = make_dot(output, params=dict(model.named_parameters()))
dot.render('simple_net', format='svg', cleanup=True)    

================================================
FILE: kt-sft/ktransformers/tests/.gitignore
================================================
results/

================================================
FILE: kt-sft/ktransformers/tests/AIME_2024/eval_api.py
================================================
# adapt from https://github.com/abacaj/code-eval?tab=readme-ov-file
import argparse
import json
import os
import time
import requests
import tqdm

from evaluation import filter_answer
from prompts import instruct_prompt
import pandas as pd
from datasets import load_dataset
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'


def generate_text(api_url,question , model_name, stream=False, auth_token=None):
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization' : 'Bearer ' + auth_token if auth_token else ''
    }
    question = instruct_prompt(question)
    data = {
        "messages": [{"content": question, "role": "user"}],
        "model": model_name,
        "stream": stream,
        "temperature": 0.6,
        "max_tokens": 10240,
    }
    print(f"content: {question}")
    response = requests.post(api_url, headers=headers, json=data,verify=False)
    if response.status_code == 200:
        result = response.json()
        results = result.get('choices', [{}])[0].get('message', {}).get('content', '')
        return filter_answer(results)
    else:
        print(f"API Request failed with status code {response.status_code}")
        return None
def load_data(file_path):
        """
        Load data from a Parquet file into a list.
        Each record in the Parquet file should represent an individual record.
        """
        # dataset = load_dataset('parquet', data_files=file_path)
        data = []
        ds = load_dataset(file_path)
        df = pd.DataFrame(ds['train'])
        for _, row in df.iterrows():
            data.append(row.to_dict())
        return data

def get_score(pred, answer):
        """
        Calculate scores between the prediction and the answer.
        Uses ROUGE scores as the evaluation metric.
        :param pred: The predicted string.
        :param answer: The reference answer string.
        :return: A dictionary containing ROUGE scores.
        """
        if pred == answer:
            return 1
        # if we need to compare str with number, convert teh str to number
        try:
            pred = float(pred)
            answer = float(answer)
        except:
            pass
        if pred == answer:
            return 1
        return 0

def run_eval_api(
    api_url: str,
    model_name: str,
    out_path: str,
    format_tabs: bool = False,
    auth_token: str = None,
    problem_file: str = None,
    append: bool = False,
    skip: int = 0
):
  
    data = load_data(problem_file)
    pbar = tqdm.tqdm(total=len(data) * 1)
    pbar.update(skip)
    for i in range(len(data)):
        i = i+skip
        data_item = data[i]
        question = data_item['Problem']
        # Start the timer for this evaluation
        start_time = time.time()
        try:
            completion = generate_text(api_url, question, model_name, auth_token=auth_token)
            if completion is None:
                raise Exception(f"Failed to get prediction for {question}")
            answer = data_item['Answer']
            score = get_score(completion, answer)
            elapsed_time = time.time() - start_time
            result = {
                "index": i,
                "question_id": data_item["ID"],
                "answer": answer,
                "prediction": completion,
                "score": score,
                "time": elapsed_time
            }
            with open(out_path, "a" if append else "w") as f:
                f.write(json.dumps(result) + "\n")
            
        except Exception as e:
            print(f"Failed to get prediction for {question}")
            print(e)
            continue

        pbar.update(1)
    

def main(output_path, api_url, model_name, auth_token, format_tabs,problem_file, append,skip):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    run_eval_api(api_url, model_name, output_path, format_tabs, auth_token, problem_file,append,skip)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="API Generate Tester")
    parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")
    parser.add_argument("--model_name", type=str, default="Pro/deepseek-ai/DeepSeek-R1", help="Model Name")
    parser.add_argument("--out_path", type=str, default="results/api/eval_aime.jsonl", help="Output Path")
    parser.add_argument("--auth_token", type=str, default=None, help="Auth Token")
    parser.add_argument("--format_tabs", action="store_true", help="Format Tabs")
    parser.add_argument("--problem_file", type=str, default="Maxwell-Jia/AIME_2024", help="Evalset File")
    parser.add_argument("--no_append", action="store_false", help="Append to existing file")
    parser.add_argument("--skip", type=int, default=0, help="Skip some tasks")
    args = parser.parse_args()
    # api_url = "https://api.siliconflow.cn/v1/chat/completions"
    main(args.out_path, args.api_url, args.model_name, args.auth_token, args.format_tabs, args.problem_file, args.no_append, args.skip)

================================================
FILE: kt-sft/ktransformers/tests/AIME_2024/evaluation.py
================================================
# reference: https://github.com/declare-lab/instruct-eval/blob/main/human_eval/main.py#L35
def filter_answer(completion: str) -> str:
    # the answer is the last part of the completion, it's a int64 number
    # get the last line
    completion = completion.strip().split("\n")[-1]
    # handle the $\\boxed{...}$ format
    if "$\\boxed{" in completion:
        return completion.split("}")[0].split("{")[-1]
    return completion.split()[-1]


================================================
FILE: kt-sft/ktransformers/tests/AIME_2024/prompts.py
================================================
def instruct_prompt(prompt: str) -> str:
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nSolve the following math problem without any tests or explanation only one answer surrounede by '$\\boxed{{}}$'\n{prompt}\n\n### Response:"""


================================================
FILE: kt-sft/ktransformers/tests/dequant_gpu.py
================================================
import os 
# os.environ["CUDA_VISIBLE_DEVICES"]="1,2"
# add path
import sys
current_path = os.path.abspath(os.path.dirname(__file__))
sys.path.append(current_path+"/../..")
import numpy as np
# from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
# from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
from ktransformers.util.custom_loader import GGUFLoader
import torch
import KTransformersOps
torch.set_default_dtype(torch.bfloat16)
import time
from transformers import (
    AutoConfig,
)
import os
# CUDA_LAUNCH_BLOCKING=1
os.environ["CUDA_LAUNCH_BLOCKING"]="1"

gguf_config = GGUFLoader("/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m")
model_name = "/data/Qwen2-57B-A14B-Instruct"

# Q4k
key = "blk.1."
target = "attn_q.weight"

t1 = time.time()
q_weight_cpu = gguf_config.load_gguf_tensor(key+target, "cpu")
# q_weight_cpu = torch.from_numpy(q_weight_cpu)

t2 = time.time()
q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda:0")
t3 = time.time()
print()
allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu(), atol=1e-6)
print(f"Q4k {key+target}")
print("load gguf tensor from cpu cost: ", t2-t1)
print("load gguf tensor from gpu cost: ", t3-t2)
print("allclose: ", allclose)


# Q6k
key = "blk.0."
target = "ffn_down_exps.weight"

t1 = time.time()
q_weight_cpu = gguf_config.load_gguf_tensor(key+target, "cpu")
t2 = time.time()
q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda:0")
t3 = time.time()
print()
allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu().to(torch.float32), atol=1e-6)
print(f"Q6k {key+target}")
print("load gguf tensor from cpu cost: ", t2-t1)
print("load gguf tensor from gpu cost: ", t3-t2)
print("allclose: ", allclose)


================================================
FILE: kt-sft/ktransformers/tests/dequant_gpu_t.py
================================================
import os 
os.environ["CUDA_VISIBLE_DEVICES"]="1"
# add path
import sys
sys.path.append("../..")
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
from ktransformers.util.custom_loader import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
import torch
import KTransformersOps
torch.set_default_dtype(torch.bfloat16)
import time
from transformers import (
    AutoConfig,
)

gguf_config = GGUFLoader("/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m")
model_name = "/data/Qwen2-57B-A14B-Instruct"
key = "blk.0."
target = "ffn_up_exps.weight"

data = gguf_config.get_mmap_tensor(key + target)

_, factors, offsets, qs1, qs2= dequantize_q4_k(data)
factors_cpu = torch.from_numpy(factors)
offsets_cpu = torch.from_numpy(offsets)
qs1_cpu = torch.from_numpy(qs1)
qs2_cpu = torch.from_numpy(qs2)


_, factors, offsets, qs1, qs2 = dequantize_q4_k_gpu(data)

print(torch.allclose(factors.cpu(), factors_cpu))
print(torch.allclose(offsets.cpu(), offsets_cpu))
print(torch.allclose(qs1.cpu(), qs1_cpu))
print(torch.allclose(qs2.cpu(), qs2_cpu))

================================================
FILE: kt-sft/ktransformers/tests/function_call_test.py
================================================
from openai import OpenAI

def send_messages(messages):
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        tools=tools
    )
    return response.choices[0].message

client = OpenAI(
    api_key="placeholder",
    base_url="http://0.0.0.0:10002/v1",
)

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get weather of an location, the user shoud supply a location first",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    }
                },
                "required": ["location"]
            },
        }
    },
]

messages = [{"role": "user", "content": "How's the weather in Hangzhou?"}]
message = send_messages(messages)
print(f"User>\t {messages[0]['content']}")
print(message)
tool = message.tool_calls[0]
messages.append(message)

messages.append({"role": "tool", "tool_call_id": tool.id, "content": "24℃"})
message = send_messages(messages)
print(f"Model>\t {message.content}")

================================================
FILE: kt-sft/ktransformers/tests/humaneval/eval_api.py
================================================
# adapt from https://github.com/abacaj/code-eval?tab=readme-ov-file
import argparse
import os
import requests
from human_eval.data import write_jsonl, read_problems
import tqdm

from evaluation import filter_code, fix_indents
from prompts import instruct_prompt

def generate_text(api_url,question , model_name, stream=False, auth_token=None):
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization' : 'Bearer ' + auth_token if auth_token else ''
    }
    question = instruct_prompt(question)
    data = {
        "messages": [{"content": question, "role": "user"}],
        "model": model_name,
        "stream": stream,
        "temperature": 0.6
    }
    print(f"content: {question}")
    response = requests.post(api_url, headers=headers, json=data,verify=False)
    if response.status_code == 200:
        result = response.json()
        results = result.get('choices', [{}])[0].get('message', {}).get('content', '')
        return [filter_code(fix_indents(results))]
    else:
        print(f"API Request failed with status code {response.status_code}")
        return None

def run_eval_api(
    api_url: str,
    model_name: str,
    out_path: str,
    format_tabs: bool = False,
    auth_token: str = None,
    problem_file: str = None,
    append: bool = False,
    skip: int = 0
):
    if(problem_file is None):
        problems = read_problems()
    else:
        problems = read_problems(problem_file)
    samples = []
    pbar = tqdm.tqdm(total=len(problems) * 1)
    pbar.update(skip)
    try:
        for task_id in problems:
            # skip some tasks
            if skip > 0:
                skip -= 1
                continue

            if format_tabs:
                prompt = problems[task_id]["prompt"].replace("    ", "\t")
            else:
                prompt = problems[task_id]["prompt"]
            completion = generate_text(api_url, prompt, model_name, auth_token=auth_token)
            # samples.append({"task_id": task_id, "completion": completion})
            for sample in completion:
                result = dict(
                    task_id=task_id,
                    completion=sample,
                )
                samples += [result]
                if append:
                    write_jsonl(out_path, [result],append=append)
            pbar.update(1)
        if not append:
            write_jsonl(out_path, samples,append=append)
    except Exception as e:
        if not append:
            write_jsonl(out_path, samples,append=append)
        print(f"Error: {e}")

def main(output_path, api_url, model_name, auth_token, format_tabs,problem_file, append,skip):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    run_eval_api(api_url, model_name, output_path, format_tabs, auth_token, problem_file,append,skip)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="API Generate Tester")
    #parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")
    parser.add_argument("--api_url", type=str, default="http://localhost:10002/v1/chat/completions", help="API URL")
    parser.add_argument("--model_name", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model Name")
    parser.add_argument("--out_path", type=str, default="results/api/eval_b.jsonl", help="Output Path")
    parser.add_argument("--auth_token", type=str, default=None, help="Auth Token")
    parser.add_argument("--format_tabs", action="store_true", help="Format Tabs")
    parser.add_argument("--problem_file", type=str, default=None, help="Evalset File")
    parser.add_argument("--no_append", action="store_false", help="Append to existing file")
    parser.add_argument("--skip", type=int, default=0, help="Skip first n problems")
    args = parser.parse_args()
    # api_url = "https://api.siliconflow.cn/v1/chat/completions"
    main(args.out_path, args.api_url, args.model_name, args.auth_token, args.format_tabs, args.problem_file, args.no_append,args.skip)

================================================
FILE: kt-sft/ktransformers/tests/humaneval/evaluation.py
================================================
# reference: https://github.com/declare-lab/instruct-eval/blob/main/human_eval/main.py#L35
def filter_code(completion: str) -> str:
    # The program tends to overwrite, we only take the first function
    completion = completion.lstrip("\n")
    # we also remove ```python\n and ```
    completion = completion.replace("```python\n", "").replace("```", "")
    if 'if __name__ == "__main__":' in completion:
        completion = completion.split('if __name__ == "__main__":')[0]
    if "# Example usage" in completion:
        completion = completion.split("# Example usage")[0]
    return completion


def fix_indents(text: str) -> str:
    return text.replace("\t", "    ")


================================================
FILE: kt-sft/ktransformers/tests/humaneval/prompts.py
================================================
def instruct_prompt(prompt: str) -> str:
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following Python code without any tests or explanation\n{prompt}\n\n### Response:"""


def standard_prompt(prompt: str) -> str:
    return f"""Complete the following Python code without any tests or explanation\n{prompt}"""


def write_prompt(prompt: str) -> str:
    return f"""Write a python program to complete the following code:\n{prompt}"""


def replit_glaive_prompt(prompt: str) -> str:
    return f"""Below is an instruction that describes a task, paired with an input that provides further context.\n Write a response that appropriately completes the request.\n\n ### Instruction:\nWrite a program to perform the given task.\n\n Input:\n{prompt}\n\n### Response:"""


================================================
FILE: kt-sft/ktransformers/tests/mmlu_pro_test.py
================================================
import argparse
import random
import time
import json
import requests
import pandas as pd
from datasets import load_dataset

import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ['https_proxy'] = ''
os.environ['http_proxy'] = ''
hint = 'There is a single choice question. Answer the question by replying A, B, C, D, E, F, G, H, I, J. No other answers are accepted. Just the letter.'


class DataEvaluator:
    def __init__(self):
        # self.template_prompt = template_prompt
        self.data = []

    def load_data(self, file_path):
        """
        Load data from a Parquet file into a list.
        Each record in the Parquet file should represent an individual record.
        """
        # dataset = load_dataset('parquet', data_files=file_path)
        ds = load_dataset("TIGER-Lab/MMLU-Pro")
        df = pd.DataFrame(ds['test'])
        # print(ds)
        # # ds_1 =  ds['train']
        # ds_2 =  ds['validation']
        # ds_3 =  ds['test']
        # df_test = pd.DataFrame(ds['test'])
        # df_val = pd.DataFrame(ds['validation'])

        # for _, row in df.iterrows():
        #     self.data.append(row.to_dict())
        # df = pd.read_parquet(file_path)

        for _, row in df.iterrows():
            self.data.append(row.to_dict())

    def get_prompt(self, record):
        """
        Combine fields from a record with the template prompt to create a full prompt.
        :param record: Dictionary containing fields to populate the template.
        :return: A formatted prompt string.
        """
        options_str = "\n".join([f"{chr(65+i)}. {opt}" for i, opt in enumerate(record['options'])])
        prompt = hint + "\nQuestion: " + record['question'] + "\n" + options_str + "\nAnswer: '"
        return prompt
        
    def post_processing(self, text):
        """
        Perform post-processing on the prediction string.
        :param text: The raw prediction string.
        :return: Processed prediction string.
        """
        text = text.lstrip('\n').split('\n')[-1]
        return text[-1:]

    def score(self, pred, answers):
        """
        Calculate scores between the prediction and the answer.
        Uses ROUGE scores as the evaluation metric.
        :param pred: The predicted string.
        :param answer: The reference answer string.
        :return: A dictionary containing ROUGE scores.
        """
        for answer in answers:
            if pred == answer:
                return 1

        return 0

# Function to generate text using API
def generate_text(api_url, question, model_name, stream=False):
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization' : 'Bearer '
    }
    data = {
        "messages": [{"content": question, "role": "user"}],
        "model": model_name,
        "stream": stream,
        # "temperature": 0.0
    }
    
    print("POST data:", data)
    response = requests.post(api_url, headers=headers, json=data)
    
    if response.status_code == 200:
        result = response.json()
        return result.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
    else:
        print(f"API Request failed with status code {response.status_code}")
        return None

# Main function to handle multiple evaluations
def main(concurrent_requests, data_evaluator: DataEvaluator, result_file, log_file, api_url, model_name):
    start_total_time = time.time()

    total_score = 0

    results = []
    random.seed(42)
    random.shuffle(data_evaluator.data)
    for i in range(min(concurrent_requests, len(data_evaluator.data))):
        # Randomly select a data item from data for each request
        data_item = data_evaluator.data[i]
        question = data_evaluator.get_prompt(data_item)
        # print(question)

        # Start the timer for this evaluation
        start_time = time.time()
        try:
            # Generate prediction using the API
            prediction = generate_text(api_url, question, model_name)

            if prediction is None:
                raise Exception(f"Failed to get prediction for {question}")

            answer = data_item['answer']
            # Compute score
            score = data_evaluator.score(data_evaluator.post_processing(prediction), answer)

            # Calculate the time taken
            elapsed_time = time.time() - start_time

            # Collect the result data
            result_data = {
                "question_id": data_item['question_id'],
                "answer": answer,
                "prediction": data_evaluator.post_processing(prediction),
                "score": score,
                "time": elapsed_time
            }

            # Write results to result.json with each field on a new line
            with open(result_file, 'a', encoding='utf-8') as f:
                json.dump(result_data, f, ensure_ascii=False, indent=4)
                f.write("\n")  # Ensure each JSON object is on a new line

            results.append(result_data)

            # Aggregate scores
            total_score += score

        except Exception as e:
            print(f"Error processing request {i}: {e}")

    # Calculate total time and throughput
    total_time = time.time() - start_total_time
    throughput = concurrent_requests / total_time

    # Log the total time, throughput, and average ROUGE scores
    with open(log_file, 'a', encoding='utf-8') as log_f:
        log_f.write(f"Total Time: {total_time:.2f} seconds\n")
        log_f.write(f"Throughput: {throughput:.2f} requests per second\n")
        log_f.write(f"Average Scores: {total_score / concurrent_requests}\n")
        log_f.write('-' * 40 + '\n')

    print(f"Results saved to {result_file}")
    print(f"Log saved to {log_file}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="API Generate Tester")
    parser.add_argument("--concurrent", type=int, default=1000, help="Number of concurrent evaluations")
    parser.add_argument("--file", type=str, default="TIGER-Lab/MMLU-Pro", help="Path to the mmlu.jsonl file")
    parser.add_argument("--result", type=str, default="./mmlu_result_pro.json", help="Path to save the result JSON file")
    parser.add_argument("--log", type=str, default="./mmlu_result_pro.log", help="Path to save the log file")
    parser.add_argument("--model", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model name or path")
    parser.add_argument("--api_url", type=str, default="http://localhost:15488/v1/chat/completions", help="API URL")
    # parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")

    args = parser.parse_args()

    # Load the data from the provided file
    # template_prompt = hint + "\nQuestion: {question}\nA. {options}\nB. {option_b}\nC. {option_c}\nD. {option_d}\nAnswer: '"
    # template_prompt_pro = hint + "\nQuestion: {question}\nA. {options[0]}\nB. {options[1]}\nC. {options[2]}\nD. {options[3]}\nE. {options[4]}\nF. {options[5]}\nG. \
        # {options[6]}\nH. {options[7]}\nI. {options[8]}\nJ. {options[9]}\nAnswer: '"


    # Load the data from the provided file
    data_evaluator = DataEvaluator()
    data_evaluator.load_data(args.file)

    # Run the main function with the specified number of concurrent evaluations
    main(args.concurrent, data_evaluator, args.result, args.log, args.api_url, args.model)

================================================
FILE: kt-sft/ktransformers/tests/mmlu_test.py
================================================
import argparse
import random
import time
import json
import requests
import pandas as pd
from datasets import load_dataset

import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ['https_proxy'] = ''
os.environ['http_proxy'] = ''
hint = 'There is a single choice question. Answer the question by replying A, B, C, D. No other answers are accepted. Just the letter.'


class DataEvaluator:
    def __init__(self):
        # self.template_prompt = template_prompt
        self.data = []

    def load_data(self, file_path):
        """
        Load data from a Parquet file into a list.
        Each record in the Parquet file should represent an individual record.
        """
        # dataset = load_dataset('parquet', data_files=file_path)
        splits = {'test': 'all/test-00000-of-00001.parquet', 'validation': 'all/validation-00000-of-00001.parquet',
                  'dev': 'all/dev-00000-of-00001.parquet',
                  'auxiliary_train': 'all/auxiliary_train-00000-of-00001.parquet'}
        df = pd.read_parquet("hf://datasets/cais/mmlu/" + splits["test"])

        for _, row in df.iterrows():
            self.data.append(row.to_dict())

    def get_prompt(self, record):
        """
        Combine fields from a record with the template prompt to create a full prompt.
        :param record: Dictionary containing fields to populate the template.
        :return: A formatted prompt string.
        """
        options_str = "\n".join([f"{chr(65 + i)}. {opt}" for i, opt in enumerate(record['choices'])])
        prompt = hint + "\nQuestion: " + record['question'] + "\n" + options_str + "\nAnswer: '"
        return prompt
        
    def post_processing(self, text):
        """
        Perform post-processing on the prediction string.
        :param text: The raw prediction string.
        :return: Processed prediction string.
        """
        text = text.lstrip('\n').split('\n')[-1]
        return text[-1:]

    def score(self, pred, answers):
        """
        Calculate scores between the prediction and the answer.
        Uses ROUGE scores as the evaluation metric.
        :param pred: The predicted string.
        :param answer: The reference answer string.
        :return: A dictionary containing ROUGE scores.
        """
        for answer in answers:
            if pred == answer:
                return 1

        return 0

# Function to generate text using API
def generate_text(api_url, question, model_name, stream=False):
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization' : 'Bearer '
    }
    data = {
        "messages": [{"content": question, "role": "user"}],
        "model": model_name,
        "stream": stream,
        # "temperature": 0.0
    }
    
    print("POST data:", data)
    response = requests.post(api_url, headers=headers, json=data)
    
    if response.status_code == 200:
        result = response.json()
        return result.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
    else:
        print(f"API Request failed with status code {response.status_code}")
        return None

# Main function to handle multiple evaluations
def main(concurrent_requests, data_evaluator: DataEvaluator, result_file, log_file, api_url, model_name):
    start_total_time = time.time()

    total_score = 0

    results = []
    random.seed(42)
    random.shuffle(data_evaluator.data)
    for i in range(min(concurrent_requests, len(data_evaluator.data))):
        # Randomly select a data item from data for each request
        data_item = data_evaluator.data[i]
        question = data_evaluator.get_prompt(data_item)
        # print(question)

        # Start the timer for this evaluation
        start_time = time.time()
        try:
            # Generate prediction using the API
            prediction = generate_text(api_url, question, model_name)

            if prediction is None:
                raise Exception(f"Failed to get prediction for {question}")

            answer = chr(data_item['answer'] + 65)
            # Compute score
            score = data_evaluator.score(data_evaluator.post_processing(prediction), answer)

            # Calculate the time taken
            elapsed_time = time.time() - start_time

            # Collect the result data
            result_data = {
                "question_id": i,
                "answer": answer,
                "prediction": data_evaluator.post_processing(prediction),
                "score": score,
                "time": elapsed_time
            }

            # Write results to result.json with each field on a new line
            with open(result_file, 'a', encoding='utf-8') as f:
                json.dump(result_data, f, ensure_ascii=False, indent=4)
                f.write("\n")  # Ensure each JSON object is on a new line

            results.append(result_data)

            # Aggregate scores
            total_score += score

        except Exception as e:
            print(f"Error processing request {i}: {e}")

    # Calculate total time and throughput
    total_time = time.time() - start_total_time
    throughput = concurrent_requests / total_time

    # Log the total time, throughput, and average ROUGE scores
    with open(log_file, 'a', encoding='utf-8') as log_f:
        log_f.write(f"Total Time: {total_time:.2f} seconds\n")
        log_f.write(f"Throughput: {throughput:.2f} requests per second\n")
        log_f.write(f"Average Scores: {total_score / concurrent_requests}\n")
        log_f.write('-' * 40 + '\n')

    print(f"Results saved to {result_file}")
    print(f"Log saved to {log_file}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="API Generate Tester")
    parser.add_argument("--concurrent", type=int, default=1000, help="Number of concurrent evaluations")
    parser.add_argument("--file", type=str, default="cais/mmlu", help="Path to the mmlu.jsonl file")
    parser.add_argument("--result", type=str, default="./mmlu_result_silicon.json", help="Path to save the result JSON file")
    parser.add_argument("--log", type=str, default="./mmlu_result_silicon.log", help="Path to save the log file")
    parser.add_argument("--model", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model name or path")
    parser.add_argument("--api_url", type=str, default="http://localhost:10003/v1/chat/completions", help="API URL")
    # parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")

    args = parser.parse_args()

    # Load the data from the provided file
    # template_prompt = hint + "\nQuestion: {question}\nA. {options}\nB. {option_b}\nC. {option_c}\nD. {option_d}\nAnswer: '"
    # template_prompt_pro = hint + "\nQuestion: {question}\nA. {options[0]}\nB. {options[1]}\nC. {options[2]}\nD. {options[3]}\nE. {options[4]}\nF. {options[5]}\nG. \
        # {options[6]}\nH. {options[7]}\nI. {options[8]}\nJ. {options[9]}\nAnswer: '"


    # Load the data from the provided file
    data_evaluator = DataEvaluator()
    data_evaluator.load_data(args.file)

    # Run the main function with the specified number of concurrent evaluations
    main(args.concurrent, data_evaluator, args.result, args.log, args.api_url, args.model)

================================================
FILE: kt-sft/ktransformers/tests/mmlu_test_multi.py
================================================
import argparse
import random
import time
import json
import requests
import pandas as pd
from datasets import load_dataset
import os
import concurrent.futures
import threading
import re

os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ['https_proxy'] = ''
os.environ['http_proxy'] = ''
hint = 'There is a single choice question. Answer the question by replying A, B, C, D. No other answers are accepted. Just the letter.'


def extract_final_answer(text):
    """
    提取模型预测的最终选项（如 A/B/C/D）
    支持自然语言、多行、markdown、高亮、非末尾结论等格式
    """
    text = text.strip()

    explicit_patterns = [
        r'Answer:\s*([A-D])\b',
        r'Correct answer:\s*([A-D])\b',
        r'The correct answer is\s*\*?\*?\s*([A-D])\b',
        r'Answer is\s*([A-D])\b',
        r'Therefore,\s*answer is\s*([A-D])\b',
        r'Therefore,\s*the answer should be\s*(?:Option\s*)?([A-D])\b',
        r'The answer should be\s*(?:Option\s*)?([A-D])\b',
        r'Option\s+([A-D])\s+is correct',
    ]
    for pat in explicit_patterns:
        match = re.search(pat, text, re.IGNORECASE)
        if match:
            return match.group(1).upper()

    markdown_match = re.findall(r'\*\*\s*([A-D])[\.\s]?', text)
    if markdown_match:
        return markdown_match[-1].upper()

    quote_match = re.findall(r"['\"]([A-D])['\"]", text)
    if quote_match:
        return quote_match[-1].upper()

    lines = text.splitlines()
    for line in reversed(lines[-5:]):
        line = line.strip()
        match = re.match(r'^([A-D])([.\s]|$)', line)
        if match:
            return match.group(1).upper()
    
    return None
class DataEvaluator:
    def __init__(self):
        self.data = []

    def load_data(self, file_path):
        """
        从数据文件中加载数据，每条记录对应一个实例
        """
        splits = {'test': 'all/test-00000-of-00001.parquet', 'validation': 'all/validation-00000-of-00001.parquet',
                  'dev': 'all/dev-00000-of-00001.parquet',
                  'auxiliary_train': 'all/auxiliary_train-00000-of-00001.parquet'}
        df = pd.read_parquet("hf://datasets/cais/mmlu/" + splits["test"])
        for _, row in df.iterrows():
            self.data.append(row.to_dict())

    def get_prompt(self, record):
        """
        结合提示信息和记录数据生成完整的题目
        """
        options_str = "\n".join([f"{chr(65 + i)}. {opt}" for i, opt in enumerate(record['choices'])])
        prompt = hint + "\nQuestion: " + record['question'] + "\n" + options_str + "\nAnswer: '"
        return prompt

    def post_processing(self, text):
        """
        对生成的文本进行后处理，提取最终答案（只返回最后一个字符）
        """
        text = text.lstrip('\n').split('\n')[-1]
        return text[-1:]

    def score(self, pred, answer):
        """
        对比预测答案和正确答案，返回得分
        """
        if pred == answer:
            return 1
        return 0

def generate_text(api_url, question, model_name, stream=False):
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': 'Bearer '
    }
    data = {
        "messages": [{"content": question, "role": "user"}],
        "model": model_name,
        "stream": stream,
    }
    print("POST data:", data)
    response = requests.post(api_url, headers=headers, json=data, timeout=5000000)
    if response.status_code == 200:
        result = response.json()
        return result.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
    else:
        print(f"API Request failed with status code {response.status_code}")
        return None

def main(concurrent_requests, data_evaluator: DataEvaluator, result_file, log_file, api_url, model_name):
    start_total_time = time.time()
    total_score = 0
    total_exact_score = 0
    results = []
    file_lock = threading.Lock()
    
    random.seed(42)
    random.shuffle(data_evaluator.data)
    data_subset = data_evaluator.data[:min(concurrent_requests, len(data_evaluator.data))]
    
    batch_size = 10

    def worker(index, data_item):
        nonlocal total_score
        nonlocal total_exact_score
        question = data_evaluator.get_prompt(data_item)
        start_time = time.time()
        try:
            prediction = generate_text(api_url, question, model_name)
            if prediction is None:
                raise Exception(f"Failed to get prediction for question: {question}")
            answer = chr(data_item['answer'] + 65)
            processed_prediction = data_evaluator.post_processing(prediction)
            score = data_evaluator.score(processed_prediction, answer)
            exact_score = data_evaluator.score(extract_final_answer(prediction), answer)
            elapsed_time = time.time() - start_time
            result_data = {
                "question_id": index,
                "answer": answer,
                "prediction": processed_prediction,
                "full_prediction": prediction,
                "score": score,
                "exact_score": exact_score,
                "time": elapsed_time
            }
            with file_lock:
                with open(result_file, 'a', encoding='utf-8') as f:
                    json.dump(result_data, f, ensure_ascii=False, indent=4)
                    f.write("\n")
            return result_data
        except Exception as e:
            print(f"Error processing request {index}: {e}")
            return None

    for batch_start in range(0, len(data_subset), batch_size):
        batch = data_subset[batch_start: batch_start + batch_size]
        with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = [executor.submit(worker, batch_start + j, data_item) for j, data_item in enumerate(batch)]
            for future in concurrent.futures.as_completed(futures):
                res = future.result()
                if res is not None:
                    results.append(res)
                    total_score += res['score']
                    total_exact_score += res['exact_score']
    
    total_time = time.time() - start_total_time
    throughput = len(data_subset) / total_time if total_time > 0 else 0
    
    with open(log_file, 'a', encoding='utf-8') as log_f:
        log_f.write(f"Total Time: {total_time:.2f} seconds\n")
        log_f.write(f"Throughput: {throughput:.2f} requests per second\n")
        average_score = total_score / len(data_subset) if data_subset else 0
        log_f.write(f"Average Score: {average_score}\n")
        average_exact_score = total_exact_score / len(data_subset) if data_subset else 0
        log_f.write(f"Average Exact Score: {average_exact_score}\n")
        log_f.write('-' * 40 + '\n')
    
    print(f"Results saved to {result_file}")
    print(f"Log saved to {log_file}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="API Generate Tester")
    parser.add_argument("--concurrent", type=int, default=1000, help="需要测试的实例总数")
    parser.add_argument("--file", type=str, default="cais/mmlu", help="数据文件路径")
    parser.add_argument("--result", type=str, default="./mmlu_result_silicon.json", help="结果文件保存路径")
    parser.add_argument("--log", type=str, default="./mmlu_result_silicon.log", help="日志文件保存路径")
    parser.add_argument("--model", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="模型名称或路径")
    parser.add_argument("--api_url", type=str, default="http://localhost:10006/v1/chat/completions", help="API URL")

    args = parser.parse_args()
    
    data_evaluator = DataEvaluator()
    data_evaluator.load_data(args.file)
    
    main(args.concurrent, data_evaluator, args.result, args.log, args.api_url, args.model)

================================================
FILE: kt-sft/ktransformers/tests/score.py
================================================
import subprocess
import time
import requests
import sys
import os

def wait_for_server(base_url: str, timeout: int = None) -> None:
    start_time = time.time()
    while True:
        try:
            response = requests.get(
                f"{base_url}/v1/models",
                headers={"Authorization": "Bearer None"},
            )
            if response.status_code == 200:
                print("Server is ready.")
                break
        except requests.exceptions.RequestException:
            time.sleep(1)
            if timeout and time.time() - start_time > timeout:
                raise TimeoutError("Server did not become ready within timeout period")

server_cmd = [
    "numactl", "-N", "1", "-m", "1",
    "/home/qujing3/anaconda3/envs/ktransformers-dev/bin/ktransformers",
    "--model_path", "/home/qujing3/models/DeepSeek-R1-Q4_K_M/config",
    "--gguf_path", "/home/qujing3/models/DeepSeek-V3-GGUF/DeepSeek-V3-Q4_K_M",
    "--port", "10002",
    "--cpu_infer", "48",
    "--optimize_config_path", "ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml",
    "--max_new_tokens", "3000",
    "--cache_lens", "6000"
]

print("Starting ktransformers server...")
print(" ".join(server_cmd))
with open("/tmp/server_log.txt", "w") as f:
    server_process = subprocess.Popen(server_cmd, stdout=f, stderr=f, text=True)

try:
    wait_for_server("http://localhost:10002", timeout=600)

    eval_cmd = ["python", "ktransformers/tests/humaneval/eval_api.py"]
    print("Running eval_api.py...")
    print(f"Command: {' '.join(eval_cmd)}")
    
    env = os.environ.copy()
    env["PYTHONUNBUFFERED"] = "1"
    
    eval_process = subprocess.Popen(
        eval_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
        env=env,
        universal_newlines=True
    )
    
    import threading
    import queue
    
    def enqueue_output(out, queue):
        for line in iter(out.readline, ''):
            queue.put(line)
        out.close()
    
    stdout_queue = queue.Queue()
    stderr_queue = queue.Queue()
    
    stdout_thread = threading.Thread(target=enqueue_output, args=(eval_process.stdout, stdout_queue))
    stderr_thread = threading.Thread(target=enqueue_output, args=(eval_process.stderr, stderr_queue))
    
    stdout_thread.daemon = True
    stderr_thread.daemon = True
    stdout_thread.start()
    stderr_thread.start()
    
    while eval_process.poll() is None:
        try:
            line = stdout_queue.get_nowait()
            print(line, end='', flush=True)
        except queue.Empty:
            pass
            
        try:
            line = stderr_queue.get_nowait()
            print(line, end='', file=sys.stderr, flush=True)
        except queue.Empty:
            pass
        
        time.sleep(1)

    while not stdout_queue.empty():
        print(stdout_queue.get(), end='', flush=True)
    while not stderr_queue.empty():
        print(stderr_queue.get(), end='', file=sys.stderr, flush=True)
        
    eval_process.wait()
    print(f"eval_api.py completed with exit code: {eval_process.returncode}")

    evaluate_cmd = [
        "evaluate_functional_correctness",
        "ktransformers/tests/humaneval/results/api/eval_b.jsonl"
    ]
    print("Running evaluate_functional_correctness...")
    print(f"Command: {' '.join(evaluate_cmd)}")
    
    evaluate_process = subprocess.Popen(
        evaluate_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
        universal_newlines=True
    )
    
    for line in evaluate_process.stdout:
        print(line, end='', flush=True)
    for line in evaluate_process.stderr:
        print(line, end='', file=sys.stderr, flush=True)
        
    evaluate_process.wait()
    
    print(f"evaluate_functional_correctness completed with exit code: {evaluate_process.returncode}")
    if evaluate_process.returncode != 0:
        print(f"evaluate_functional_correctness exited with code {evaluate_process.returncode}")
        sys.exit(evaluate_process.returncode)

finally:
    print("Stopping ktransformers server...")
    server_process.terminate()
    try:
        server_process.wait(timeout=30)
    except subprocess.TimeoutExpired:
        print("Server did not terminate gracefully, forcing...")
        server_process.kill()

================================================
FILE: kt-sft/ktransformers/tests/test_client.py
================================================
import asyncio
import json
import sys
import aiohttp
import argparse

prompt_list = [
    'Please elaborate on modern world history.',
    'Please introduce Harry Potter.',
    'I want to learn Python. Please give me some advice.',
    'Please tell me a joke '
]


async def fetch_event_stream(session, payload, request_id, stream):
    try:
        headers = {
            'accept': 'application/json',
            'Content-Type': 'application/json'
        }

        async with session.post(SERVER_URL, json=payload, headers=headers, timeout=50000) as response:
            print(f"Request {request_id}: Connected, status {response.status}")

            if response.status != 200:
                print(f"Request {request_id}: Error, status {response.status}")
                return

            output_text = ""

            if stream:
                async for line in response.content:
                    try:
                        decoded_line = line.decode("utf-8").strip()
                        if not decoded_line or not decoded_line.startswith("data: "):
                            continue

                        decoded_line = decoded_line[6:].strip()
                        if not decoded_line:
                            continue

                        response_data = json.loads(decoded_line)
                        choices = response_data.get("choices", [])
                        if not choices:
                            continue

                        delta = choices[0].get("delta", {})
                        token = delta.get("content", "")

                        if token:
                            output_text += token
                            sys.stdout.write(token)
                            sys.stdout.flush()

                        finish_reason = choices[0].get("finish_reason", None)
                        if finish_reason:
                            break

                    except json.JSONDecodeError as e:
                        print(f"\nRequest {request_id}: JSON Decode Error - {e}")
                    except IndexError:
                        print(f"\nRequest {request_id}: List Index Error - choices is empty")
                    except Exception as e:
                        print(f"\nRequest {request_id}: Error parsing stream - {e}")
            else:
                response_data = await response.json()
                choices = response_data.get("choices", [])
                if choices:
                    content = choices[0].get("message", {}).get("content", "")
                    print(f"Request {request_id} Output:\n{content}")
                    output_text += content

    except Exception as e:
        print(f"\nRequest {request_id}: Exception - {e}")

async def main(prompt_id, model, stream, max_tokens, temperature, top_p):
    async with aiohttp.ClientSession() as session:
        payload = {
            "messages": [
                {"role": "system", "content": ""},
                {"role": "user", "content": prompt_list[prompt_id]}
            ],
            "model": model,
            "stream": stream,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p
        }
        tasks = [fetch_event_stream(session, payload, prompt_id, stream)]
        await asyncio.gather(*tasks)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Event Stream Request Tester")
    parser.add_argument("--question_id", type=int, default=0)
    parser.add_argument("--model", type=str, default="DeepSeek-V3")
    parser.add_argument("--stream", type=bool, default=True)  
    parser.add_argument("--max_tokens", type=int, default=500)
    parser.add_argument("--temperature", type=float, default=0.8)
    parser.add_argument("--top_p", type=float, default=1)
    parser.add_argument("--api_url", type=str, default="http://localhost:10002/v1/chat/completions", help="API URL")

    args = parser.parse_args()
    SERVER_URL = args.api_url
    asyncio.run(main(args.question_id, args.model, args.stream, args.max_tokens, args.temperature, args.top_p))


================================================
FILE: kt-sft/ktransformers/tests/test_pytorch_q8.py
================================================
import torch

class LinearModel(torch.nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = torch.nn.Linear(in_features, out_features)
    
    def forward(self, x):
        return self.linear(x)

in_features = 64
out_features = 128
model_fp32 = LinearModel(in_features, out_features)

model_int8 = torch.ao.quantization.quantize_dynamic(
    model_fp32,
    {torch.nn.Linear},
    dtype=torch.qint8
)

batch_size = 32
input_fp32 = torch.randn(1, batch_size, in_features)
output_int8 = model_int8(input_fp32)

print(f"输入形状: {input_fp32.shape}")
print(f"输出形状: {output_int8.shape}")

with torch.no_grad():
    output_fp32 = model_fp32(input_fp32)
    
print(f"FP32输出的前几个值: {output_fp32[0, :5]}")
print(f"INT8输出的前几个值: {output_int8[0, :5]}")

error = torch.abs(output_fp32 - output_int8).mean().item()
print(f"平均绝对误差: {error}")

print(f"量化前模型类型: {type(model_fp32.linear)}")
print(f"量化后模型类型: {type(model_int8.linear)}")

================================================
FILE: kt-sft/ktransformers/tests/test_speed.py
================================================
import asyncio
import json
import sys
import aiohttp
import random
import argparse
import yaml
import os
import time
from time import sleep

decodesz = 128
# Server URL (replace with your server URL)
decodesz_list = [128]
prefill_speeds = []
decode_speeds = []
ktansformer_prompt1024="""Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. 
They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.Mr. Dursley was the director of a firm called Grunnings, which made drills. 
He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. 
Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. 
The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.
The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. 
They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. 
Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. 
The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. 
The Dursleys knew that the Potters had a small son, too, but they had never even seen him. 
This boy was another good reason for keeping the Potters away; they didn't want Dudley mixing with a child like that.When Mr. and Mrs. 
Dursley woke up on the dull, gray Tuesday our story starts, there was nothing about the cloudy sky outside to suggest that strange and mysterious things would soon be happening all over the country. 
Mr. Dursley hummed as he picked out his most boring tie for work, and Mrs. Dursley gossiped away happily as she wrestled a screaming Dudley into his high chair.None of them noticed a large, tawny owl flutter past the window.
At half past eight, Mr. Dursley picked up his briefcase, pecked Mrs. Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls.
“Little tyke,” chortled Mr. Dursley as he left the house. He got into his car and backed out of number four's drive.
It was on the corner of the street that he noticed the first sign of something peculiar — a cat reading a map. 
For a second, Mr. Dursley didn't realize what he had seen — then he jerked his head around to look again. 
There was a tabby cat standing on the corner of Privet Drive, but there wasn't a map in sight. 
What could he have been thinking of? It must have been a trick of the light. 
Mr. Dursley blinked and stared at the cat. It stared back. As Mr. Dursley drove around the corner and up the road, he watched the cat in his mirror. 
It was now reading the sign that said Privet Drive — no, looking at the sign; cats couldn't read maps or signs. 
Mr. Dursley gave himself a little shake and put the cat out of his mind. 
As he drove toward town he thought of nothing except a large order of drills he was hoping to get that day.
But on the edge of town, drills were driven out of his mind by something else. 
As he sat in the usual morning traffic jam, he couldn't help noticing that there seemed to be a lot of strangely dressed people about. 
People in cloaks. Mr. Dursley couldn't bear people who dressed in funny clothes — the getups you saw on young people! 
He supposed this was some stupid new fashion. He drummed his fingers on the steering wheel and his eyes fell on a huddle of these weirdos standing quite close by. 
They were whispering excitedly together. Mr. Dursley was enraged to see that a couple of them weren't young at all; why, that man had to be older than he was, and wearing an emerald-green cloak! 
The nerve of him! But then it struck Mr. Dursley that this was probably some silly stunt — these people were obviously collecting for something… yes, that would be it. 
The traffic moved on and a few minutes later, Mr. Dursley arrived in the Grunnings parking lot, his mind back on drills.
Mr. Dursley always sat with his back to the window in his office on the ninth floor."""
async def fetch_event_stream(session, request_id, prompt, max_tokens, model):
    try:
        payload = {
            "messages": [
                {"role": "system", "content": ""},
                {"role": "user", "content": prompt}
            ],
            "model": model,
            "temperature": 0.3,
            "top_p": 1.0,
            "stream": True,
            "return_speed": True,
            "max_tokens": max_tokens,
        }

        headers = {
            'accept': 'application/json',
            'Content-Type': 'application/json'
        }

        async with session.post(SERVER_URL, json=payload, headers=headers, timeout=500000) as response:
            if response.status != 200:
                print(f"[Request {request_id}] Error: Status {response.status}")
                return

            buffer = ""  
            total_tokens = 0
            decode_start_time = None
            decode_end_time = None
            usage_info = None  

            async for line in response.content:
                try:
                    decoded_line = line.decode("utf-8").strip()
                    if not decoded_line or not decoded_line.startswith("data: "):
                        continue

                    decoded_line = decoded_line[6:].strip()
                    if not decoded_line:
                        continue

                    response_data = json.loads(decoded_line)
                    
                    if "usage" in response_data:
                        usage_info = response_data["usage"]
                    
                    choices = response_data.get("choices", [])
                    if not choices:
                        continue

                    delta = choices[0].get("delta", {})
                    token = delta.get("content", "")

                    if token:
                        if decode_start_time is None:
                            decode_start_time = time.time()
                        buffer += token
                        total_tokens += 1
                        decode_end_time = time.time()

                        while "\n" in buffer:
                            line, buffer = buffer.split("\n", 1)
                            print(f"[Request {request_id}] {line}")

                    finish_reason = choices[0].get("finish_reason", None)
                    if finish_reason:
                        break

                except Exception as e:
                    print(f"[Request {request_id}] Stream Error: {e}")

            if buffer.strip():
                print(f"[Request {request_id}] {buffer.strip()}")

            if usage_info:
                if "prefill_time" in usage_info:
                    # print(f"[Request {request_id}] Usage:")
                    # for key, value in usage_info.items():
                    #     print(f"  {key}: {value}")
                    prefill_speed = usage_info["prompt_tokens"] / usage_info["prefill_time"]
                    decode_speed = usage_info["completion_tokens"] / usage_info["decode_time"]
                    prefill_speeds.append(prefill_speed)
                    decode_speeds.append(decode_speed)
                    print(f'[Request {request_id}] prefill speed: {prefill_speed}')
                    print(f'[Request {request_id}] decode speed: {decode_speed}')

    except Exception as e:
        print(f"[Request {request_id}] Exception: {e}")

async def main(concurrent_requests , prompt, max_tokens, model):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_event_stream(session, i , prompt, max_tokens, model) for i in range(concurrent_requests)]
        await asyncio.gather(*tasks)
    if len(prefill_speeds) != 0:
        import numpy as np
        print(f"concurrency: {len(prefill_speeds)}")
        print(f"total prefill speed: {np.sum(prefill_speeds)}\n total decode speed: {np.sum(decode_speeds)}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Event Stream Request Tester")
    parser.add_argument("--concurrent", type=int, default=1, help="Number of concurrent requests")
    parser.add_argument("--model", type=str, default="DeepSeek-V3", help="Model name")
    parser.add_argument("--prompt_lens", type=int, default=1024, help="prefill prompt lens, 1024 or 2048")
    parser.add_argument("--api_url", type=str, default="http://localhost:10002/v1/chat/completions", help="API URL")
    parser.add_argument("--max_tokens", type=int, default=50, help="max decode tokens")
    
    args = parser.parse_args()
    SERVER_URL = args.api_url
    max_tokens = args.max_tokens
    model = args.model
    if args.prompt_lens == 1024:
        prompt = ktansformer_prompt1024
    elif args.prompt_lens == 2048:
        prompt = ktansformer_prompt1024 * 2
    elif args.prompt_lens == 4096:
        prompt = ktansformer_prompt1024 * 4
    asyncio.run(main(args.concurrent, prompt, max_tokens, model))


================================================
FILE: kt-sft/ktransformers/tests/triton_fp8gemm_test.py
================================================
import torch
import torch.nn.functional as F
from typing import Optional
import pytest
from typing import Tuple, Optional, Literal
import time
# use dir path
import os
import sys
sys.path.insert(0, "/home/azure/ktransformers")
print(sys.path)
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
from safetensors import safe_open

world_size = 1
rank = 0
block_size = 128
gemm_impl: Literal["bf16", "fp8"] = "bf16"
# Assuming `fp8_gemm`, `act_quant`, `weight_dequant` and other relevant functions are already defined

def test_fp8_gemm_vs_torch_matmul():
    # Test case 1: Create random matrices of size (M, K) and (K, N)
    M, K, N = 64, 128, 256  # Matrix dimensions
    x = torch.randn(M, K, dtype=torch.bfloat16, device='cuda')
    weight = torch.randn(N, K, dtype=torch.bfloat16, device='cuda')

    # Apply act_quant to both matrices
    x_quantized, scale_x = act_quant(x, block_size)
    weight_quantized, scale_w = act_quant(weight, block_size)
    
    # mk continous
    x_quantized = x_quantized.contiguous()
    weight_quantized = weight_quantized.contiguous()
    scale_x = scale_x.contiguous()
    scale_w = scale_w.contiguous()

    # Perform fp8_gemm using the quantized tensors
    result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight_quantized, scale_w)

    # Perform torch.matmul using the original floating point tensors
    result_torch_matmul = torch.matmul(x, weight.T)
    print(f'result_torch_matmul: {result_torch_matmul.shape}')
    print(f'result_fp8_gemm: {result_fp8_gemm.shape}')

    print(f"result_fp8_gemm:\n {result_fp8_gemm}")
    print(f"result_torch_matmul:\n {result_torch_matmul}")
    
def test_fp8_gemm_vs_torch_matmul_load():
    file_path = "/mnt/data/model/DeepSeek-V3/model-00001-of-000163.safetensors"
    with safe_open(file_path, framework="pt", device=0) as f:
        weight = f.get_tensor("model.layers.0.mlp.down_proj.weight")
        scale = f.get_tensor("model.layers.0.mlp.down_proj.weight_scale_inv")

    # weight_dequant
    weight_dequantized = weight_dequant(weight, scale)
    print(f"weight_dequantized: {weight_dequantized.shape}")
    N, K = weight_dequantized.shape
    M = 64
    x = torch.randn(2 ,M, K, dtype=torch.bfloat16, device='cuda')
    x_quantized, scale_x = act_quant(x, block_size)
    
    # Test case 1: quantized x matmal with undequantized weight
    result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
    print(f"result_fp8_gemm:\n {result_fp8_gemm}")
    print(f"dtype {result_fp8_gemm.dtype}")

    # Perform torch.matmul using the original floating point tensors
    result_torch_matmul = torch.matmul(x, weight_dequantized.to(torch.bfloat16).T)
    print(f"result_torch_matmul:\n {result_torch_matmul}")

def test_fp8_gemm_tplops():
    file_path = "/mnt/data/model/DeepSeek-V3/model-00001-of-000163.safetensors"
    with safe_open(file_path, framework="pt", device=0) as f:
        weight = f.get_tensor("model.layers.0.mlp.down_proj.weight")
        scale = f.get_tensor("model.layers.0.mlp.down_proj.weight_scale_inv")

    # weight_dequant
    weight_dequantized = weight_dequant(weight, scale)
    print(f"weight_dequantized: {weight_dequantized.shape}")
    N, K = weight_dequantized.shape
    M = 6400
    x = torch.randn(2 ,M, K, dtype=torch.bfloat16, device='cuda')
    # x_quantized, scale_x = act_quant(x, block_size)
    
    # Calculate time for 1000 fp8_gemm
    i = 10
    flops_per_gemm = 2 * M * N * K
    total_flops = i * flops_per_gemm
    
    x_quantized, scale_x = act_quant(x, block_size)
    result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
    x_quantized, scale_x = act_quant(x, block_size)
    result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)

    
    t0 = time.time()
    torch.cuda.synchronize()
    for i in range(i):
        x_quantized, scale_x = act_quant(x, block_size)
        result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
    torch.cuda.synchronize()
    t1 = time.time()
    
    total_time = t1 - t0
    tflops = total_flops / total_time / 1e12
    print(f"total_time: {total_time}")
    print(f"tflops: {tflops}")
    

if __name__ == "__main__":
    test_fp8_gemm_vs_torch_matmul()
    test_fp8_gemm_vs_torch_matmul_load()
    test_fp8_gemm_tplops()
    

================================================
FILE: kt-sft/ktransformers/util/cuda_graph_runner.py
================================================
'''
Description  :  
Author       : Boxin Zhang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import torch
from typing import Dict

class CUDAGraphRunner:

    def __init__(self):
        self.graph = None
        self.input_buffers: Dict[str, torch.Tensor] = {}
        self.output_buffers: Dict[str, torch.Tensor] = {}

    def capture(
        self,
        model,
        cur_token,
        position_ids,
        cache_position,
        past_key_values,
        main_device,
        **kwargs,
    ) -> None:
        assert self.graph is None
        # Capture the graph.
        torch.cuda.synchronize()
        self.graph = torch.cuda.CUDAGraph()
        #self.graph.enable_debug_mode()
        self.model = model
        inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to(main_device)
        # torch.cuda.set_device can't set "cuda", must have a index
        if main_device == "cuda":
            main_device = "cuda:0"
        torch.cuda.set_device(main_device)
        self.main_device = main_device
        capture_stream = torch.cuda.Stream()
        with torch.cuda.graph(self.graph, stream = capture_stream):
            logits=model(inputs_embeds=inputs_embeds, 
                         position_ids=position_ids,
                         cache_position=cache_position,
                         past_key_values=past_key_values,
                         **kwargs)[0]
            capture_stream.wait_stream(torch.cuda.current_stream())
            torch.cuda.set_device(main_device)
            torch.cuda.set_stream(capture_stream)
        if past_key_values != None:    
            past_key_values.change_seq_length(-1)
        torch.cuda.synchronize(self.main_device)
        #self.graph.debug_dump("cuda_graph_hooked.dot")

        # Save the input and output buffers.
        self.input_buffers = {
            "inputs_embeds": inputs_embeds,
            "position_ids": position_ids,
            "cache_position": cache_position,
        }
        self.output_buffers = {"logits": logits}
        return

    def forward(
        self,
        cur_token,
        position_ids,
        cache_position,
    ) -> torch.Tensor:
        # Copy the input tensors to the input buffers.
        inputs_embeds = self.model.model.embed_tokens(cur_token.to("cpu"))
        self.input_buffers["inputs_embeds"].copy_(inputs_embeds)
        self.input_buffers["position_ids"].copy_(position_ids)
        self.input_buffers["cache_position"].copy_(cache_position)

        # Run the graph.
        #print("begin replay")
        #time.sleep(1)
        self.graph.replay()
        torch.cuda.synchronize(self.main_device)
        # Return the output tensor.
        return self.output_buffers["logits"]

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)


================================================
FILE: kt-sft/ktransformers/util/custom_gguf.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : Azure-Tang, Boxin Zhang, chenht2022
Date         : 2024-07-26 08:48:54
Version      : 1.0.0
LastEditors  : kkk1nak0
LastEditTime : 2024-08-14 08:20:45
Adapted from https://github.com/99991/pygguf/blob/main/gguf.py
Copyright (c) 2023-2024 The ggml authors
Copyright (c) 2024 Thomas Germer
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
# copied from llama.cpp/gguf-py/gguf/constants.py to satisfy dependence of gguf
# GGUF specification
# https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
import struct
import warnings
import numpy as np
import re
import numpy.typing as npt
from typing import Sequence
import os
from enum import IntEnum
import torch
if not torch.xpu.is_available():
    import KTransformersOps
import ctypes
import math

class GGMLQuantizationType(IntEnum):
    F32     = 0
    F16     = 1
    Q4_0    = 2
    Q4_1    = 3
    Q5_0    = 6
    Q5_1    = 7
    Q8_0    = 8
    Q8_1    = 9
    Q2_K    = 10
    Q3_K    = 11
    Q4_K    = 12
    Q5_K    = 13
    Q6_K    = 14
    Q8_K    = 15
    IQ2_XXS = 16
    IQ2_XS  = 17
    IQ3_XXS = 18
    IQ1_S   = 19
    IQ4_NL  = 20
    IQ3_S   = 21
    IQ2_S   = 22
    IQ4_XS  = 23
    I8      = 24
    I16     = 25
    I32     = 26
    I64     = 27
    F64     = 28
    IQ1_M   = 29
    BF16    = 30

QK_K = 256
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
    GGMLQuantizationType.F32:     (1, 4),
    GGMLQuantizationType.F16:     (1, 2),
    GGMLQuantizationType.Q4_0:    (32, 2 + 16),
    GGMLQuantizationType.Q4_1:    (32, 2 + 2 + 16),
    GGMLQuantizationType.Q5_0:    (32, 2 + 4 + 16),
    GGMLQuantizationType.Q5_1:    (32, 2 + 2 + 4 + 16),
    GGMLQuantizationType.Q8_0:    (32, 2 + 32),
    GGMLQuantizationType.Q8_1:    (32, 4 + 4 + 32),
    GGMLQuantizationType.Q2_K:    (256, 2 + 2 + QK_K // 16 + QK_K // 4),
    GGMLQuantizationType.Q3_K:    (256, 2 + QK_K // 4 + QK_K // 8 + 12),
    GGMLQuantizationType.Q4_K:    (256, 2 + 2 + QK_K // 2 + 12),
    GGMLQuantizationType.Q5_K:    (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
    GGMLQuantizationType.Q6_K:    (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
    GGMLQuantizationType.Q8_K:    (256, 4 + QK_K + QK_K // 8),
    GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4),
    GGMLQuantizationType.IQ2_XS:  (256, 2 + QK_K // 4 + QK_K // 32),
    GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8),
    GGMLQuantizationType.IQ1_S:   (256, 2 + QK_K // 8 + QK_K // 16),
    GGMLQuantizationType.IQ4_NL:  (32, 2 + 16),
    GGMLQuantizationType.IQ3_S:   (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
    GGMLQuantizationType.IQ2_S:   (256, 2 + QK_K // 4 + QK_K // 16),
    GGMLQuantizationType.IQ4_XS:  (256, 2 + 2 + QK_K // 2 + QK_K // 64),
    GGMLQuantizationType.I8:      (1, 1),
    GGMLQuantizationType.I16:     (1, 2),
    GGMLQuantizationType.I32:     (1, 4),
    GGMLQuantizationType.I64:     (1, 8),
    GGMLQuantizationType.F64:     (1, 8),
    GGMLQuantizationType.IQ1_M:   (256, QK_K // 8 + QK_K // 16  + QK_K // 32),
    GGMLQuantizationType.BF16:    (1, 2),
}

# copied from llama.cpp/gguf-py/gguf/quants.py to avoid dependence of gguf
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
    block_size, type_size = GGML_QUANT_SIZES[quant_type]
    if shape[-1] % block_size != 0:
        raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
    return (*shape[:-1], shape[-1] // block_size * type_size)

GGML_TYPES = {
    "F32": 0,
    "F16": 1,
    "Q4_0": 2,
    "Q5_0": 6,
    "Q8_0": 8,
    "Q2_K": 10,
    "Q3_K": 11,
    "Q4_K": 12,
    "Q5_K": 13,
    "Q6_K": 14,
    "IQ4_XS": 23,
    "BF16": 30,
}

GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}

GGML_BLOCK_SIZES = {
    "F32": 4,
    "F16": 2,
    "BF16": 2,
    "Q4_0": 2 + 16,
    "Q5_0": 2 + 4 + 16,
    "Q8_0": 2 + 32,
    "Q2_K": 256 // 16 + 256 // 4 + 2 + 2,
    "Q3_K": 256 // 8 + 256 // 4 + 12 + 2,
    "Q4_K": 2 + 2 + 12 + 256 // 2,
    "Q5_K": 2 + 2 + 12 + 256 // 8 + 256 // 2,
    "Q6_K": 256 // 2 + 256 // 4 + 256 // 16 + 2,
    "IQ4_XS": 2 + 2 + 256 // 2 + 256 // 64,
    "FP8": 1,
}

GGML_ELEMENTS_PER_BLOCK = {
    "F32": 1,
    "F16": 1,
    "BF16": 1,
    "Q4_0": 32,
    "Q5_0": 32,
    "Q8_0": 32,
    "Q2_K": 256,
    "Q3_K": 256,
    "Q4_K": 256,
    "Q5_K": 256,
    "Q6_K": 256,
    "IQ4_XS": 256,
    "FP8": 1,
}

DATA_TYPES = {
    "uint8": 0,
    "int8": 1,
    "uint16": 2,
    "int16": 3,
    "uint32": 4,
    "int32": 5,
    "float32": 6,
    "bool": 7,
    "string": 8,
    "array": 9,
    "uint64": 10,
    "int64": 11,
    "float64": 12,
    "FP8": 13,
}


def read_value(f, data_type):
    if data_type == DATA_TYPES["string"]:
        length = struct.unpack("<Q", f.read(8))[0]
        return f.read(length).decode("utf-8")

    elif data_type == DATA_TYPES["bool"]:
        return bool(struct.unpack("<?", f.read(1))[0])

    elif data_type == DATA_TYPES["uint8"]:
        return struct.unpack("<B", f.read(1))[0]

    elif data_type == DATA_TYPES["int8"]:
        return struct.unpack("<b", f.read(1))[0]

    elif data_type == DATA_TYPES["uint16"]:
        return struct.unpack("<H", f.read(2))[0]

    elif data_type == DATA_TYPES["int16"]:
        return struct.unpack("<h", f.read(2))[0]

    elif data_type == DATA_TYPES["uint32"]:
        return struct.unpack("<I", f.read(4))[0]

    elif data_type == DATA_TYPES["int32"]:
        return struct.unpack("<i", f.read(4))[0]

    elif data_type == DATA_TYPES["float32"]:
        return struct.unpack("<f", f.read(4))[0]

    elif data_type == DATA_TYPES["uint64"]:
        return struct.unpack("<Q", f.read(8))[0]

    elif data_type == DATA_TYPES["int64"]:
        return struct.unpack("<q", f.read(8))[0]

    elif data_type == DATA_TYPES["float64"]:
        return struct.unpack("<d", f.read(8))[0]

    elif data_type == DATA_TYPES["array"]:
        elem_type, count = struct.unpack("<IQ", f.read(4 + 8))
        return [read_value(f, elem_type) for _ in range(count)]

    elif data_type == DATA_TYPES["FP8"]:
        return struct.unpack("<B", f.read(1))[0]

    else:
        raise NotImplementedError(f"Data type {data_type} not implemented")

def dequantize_q2_k(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1547
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L74
    block_size = GGML_BLOCK_SIZES["Q2_K"]
    num_blocks = len(data) // block_size

    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)

    dmin = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32)
    d = data_f16[:, -2].reshape(num_blocks, 1, 1).astype(np.float32)
    scales = data_u8[:, :16].reshape(num_blocks, 16, 1)
    qs = data_u8[:, 16:80].reshape(num_blocks, 64)

    tmp = np.stack([
        qs[:, 00:16] >> 0,
        qs[:, 16:32] >> 0,
        qs[:, 00:16] >> 2,
        qs[:, 16:32] >> 2,
        qs[:, 00:16] >> 4,
        qs[:, 16:32] >> 4,
        qs[:, 00:16] >> 6,
        qs[:, 16:32] >> 6,
        qs[:, 32:48] >> 0,
        qs[:, 48:64] >> 0,
        qs[:, 32:48] >> 2,
        qs[:, 48:64] >> 2,
        qs[:, 32:48] >> 4,
        qs[:, 48:64] >> 4,
        qs[:, 32:48] >> 6,
        qs[:, 48:64] >> 6,
    ], axis=1)

    return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4)

def dequantize_q2_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q2_K"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q2_K"]
    data = np.frombuffer(data, dtype=data.dtype)
    device = torch.device(device)
    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, 
    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_q2_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

def dequantize_q3_k(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1723C32-L1723C42
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L95
    block_size = GGML_BLOCK_SIZES["Q3_K"]
    num_blocks = len(data) // block_size

    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)

    d = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32)
    bits = np.unpackbits(data_u8[:, :32].reshape(num_blocks, 32, 1), axis=-1, bitorder="little")
    bits = 4 ^ (bits << 2)
    qs = data_u8[:, 32:32 + 64].astype(np.int16)
    a, b, c = data_u8[:, 96: 96 + 12].reshape(num_blocks, 3, 4).transpose(1, 0, 2)
    scales = np.zeros((num_blocks, 4, 4), dtype=np.uint8)
    scales[:, 0] = (a & 15) | ((c & 3) << 4)
    scales[:, 1] = (b & 15) | (((c >> 2) & 3) << 4)
    scales[:, 2] = (a >> 4) | (((c >> 4) & 3) << 4)
    scales[:, 3] = (b >> 4) | ((c >> 6) << 4)
    scales = scales.reshape(num_blocks, 16, 1).astype(np.int16)

    return d * (scales - 32) * np.stack([
        (((qs[:, 00:16] >> 0) & 3) - bits[:, :16, 0]),
        (((qs[:, 16:32] >> 0) & 3) - bits[:, 16:, 0]),
        (((qs[:, 00:16] >> 2) & 3) - bits[:, :16, 1]),
        (((qs[:, 16:32] >> 2) & 3) - bits[:, 16:, 1]),
        (((qs[:, 00:16] >> 4) & 3) - bits[:, :16, 2]),
        (((qs[:, 16:32] >> 4) & 3) - bits[:, 16:, 2]),
        (((qs[:, 00:16] >> 6) & 3) - bits[:, :16, 3]),
        (((qs[:, 16:32] >> 6) & 3) - bits[:, 16:, 3]),
        (((qs[:, 32:48] >> 0) & 3) - bits[:, :16, 4]),
        (((qs[:, 48:64] >> 0) & 3) - bits[:, 16:, 4]),
        (((qs[:, 32:48] >> 2) & 3) - bits[:, :16, 5]),
        (((qs[:, 48:64] >> 2) & 3) - bits[:, 16:, 5]),
        (((qs[:, 32:48] >> 4) & 3) - bits[:, :16, 6]),
        (((qs[:, 48:64] >> 4) & 3) - bits[:, 16:, 6]),
        (((qs[:, 32:48] >> 6) & 3) - bits[:, :16, 7]),
        (((qs[:, 48:64] >> 6) & 3) - bits[:, 16:, 7])
    ], axis=1)

def dequantize_q3_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q3_K"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q3_K"]
    data = np.frombuffer(data, dtype=data.dtype)
    device = torch.device(device)
    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, 
    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_q3_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

def dequantize_q4_k(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1929
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
    block_size = GGML_BLOCK_SIZES["Q4_K"]
    num_blocks = len(data) // block_size
    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
    # Casting to float32 because float16 is very slow on CPU
    scale_factors = data_f16[:, 0].reshape(num_blocks, 1, 1).astype(np.float32)
    scale_offsets = data_f16[:, 1].reshape(num_blocks, 1, 1).astype(np.float32)
    qs1 = data_u8[:, 4:16].reshape(num_blocks, 12, 1)
    qs2 = data_u8[:, 16:].reshape(num_blocks, 4, 32)
    # Dequantize scales and offsets (6 bits and 4 + 2 bits)
    factors = scale_factors * np.concatenate([qs1[:, 0:4] & 0b111111, (qs1[:, 8:] & 15) | ((qs1[:, 0:4] >> 6) << 4)], axis=1)
    offsets = scale_offsets * np.concatenate([qs1[:, 4:8] & 0b111111, (qs1[:, 8:] >> 4) | ((qs1[:, 4:8] >> 6) << 4)], axis=1)
    # Interleave low and high quantized bits
    qs2 = np.stack([qs2 & 0xf, qs2 >> 4], axis=2).reshape(num_blocks, 8, 32)
    # Dequantize final weights using scales and offsets
    return factors * qs2 - offsets

def dequantize_q4_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q4_K"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q4_K"]
    data = np.frombuffer(data, dtype=data.dtype)
    device = torch.device(device)
    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, 
    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_q4_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

def dequantize_q5_k(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2129
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L138
    block_size = GGML_BLOCK_SIZES["Q5_K"]
    num_blocks = len(data) // block_size

    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)

    d = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32)
    dmin = data_f16[:, 1].reshape(num_blocks, 1).astype(np.float32)
    scales = data_u8[:, 4:16].reshape(num_blocks, 12, 1)
    qh = data_u8[:, 16: 16 + 32].reshape(num_blocks, 32, 1)
    qs = data_u8[:, 48: 48 + 128].reshape(num_blocks, 4, 32)

    bits = np.unpackbits(qh, axis=-1, bitorder="little")

    qs_hi_4 = qs >> 4
    qs_lo_4 = qs & 15

    scales_lo_6 = scales[:, :8] & 63
    scales_hi_6 = scales[:, :8] >> 6
    scales_lo_4 = scales[:, 8:] & 15
    scales_hi_4 = scales[:, 8:] >> 4

    m1 = dmin * scales_lo_6[:, 4]
    m2 = dmin * scales_lo_6[:, 5]
    m3 = dmin * scales_lo_6[:, 6]
    m4 = dmin * scales_lo_6[:, 7]
    m5 = dmin * (scales_hi_4[:, 0] | (scales_hi_6[:, 4] << 4))
    m6 = dmin * (scales_hi_4[:, 1] | (scales_hi_6[:, 5] << 4))
    m7 = dmin * (scales_hi_4[:, 2] | (scales_hi_6[:, 6] << 4))
    m8 = dmin * (scales_hi_4[:, 3] | (scales_hi_6[:, 7] << 4))

    d1 = d * scales_lo_6[:, 0]
    d2 = d * scales_lo_6[:, 1]
    d3 = d * scales_lo_6[:, 2]
    d4 = d * scales_lo_6[:, 3]
    d5 = d * (scales_lo_4[:, 0] | (scales_hi_6[:, 0] << 4))
    d6 = d * (scales_lo_4[:, 1] | (scales_hi_6[:, 1] << 4))
    d7 = d * (scales_lo_4[:, 2] | (scales_hi_6[:, 2] << 4))
    d8 = d * (scales_lo_4[:, 3] | (scales_hi_6[:, 3] << 4))

    return np.concatenate([
        d1 * (qs_lo_4[:, 0] + (bits[:, :, 0] << 4)) - m1,
        d2 * (qs_hi_4[:, 0] + (bits[:, :, 1] << 4)) - m2,
        d3 * (qs_lo_4[:, 1] + (bits[:, :, 2] << 4)) - m3,
        d4 * (qs_hi_4[:, 1] + (bits[:, :, 3] << 4)) - m4,
        d5 * (qs_lo_4[:, 2] + (bits[:, :, 4] << 4)) - m5,
        d6 * (qs_hi_4[:, 2] + (bits[:, :, 5] << 4)) - m6,
        d7 * (qs_lo_4[:, 3] + (bits[:, :, 6] << 4)) - m7,
        d8 * (qs_hi_4[:, 3] + (bits[:, :, 7] << 4)) - m8,
    ], axis=1)

def dequantize_q5_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q5_K"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q5_K"]
    data = np.frombuffer(data, dtype=data.dtype)
    device = torch.device(device)
    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, 
    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_q5_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

def dequantize_q6_k(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2275
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L152
    block_size = GGML_BLOCK_SIZES["Q6_K"]
    num_blocks = len(data) // block_size

    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
    data_i8 = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, block_size)

    scales = data_f16[:, -1].reshape(num_blocks, 1).astype(np.float32)
    # TODO use uint8 and cast later?
    ql = data_u8[:, :128].astype(np.int16)
    qh = data_u8[:, 128:192].astype(np.int16)
    sc = data_i8[:, 192:208, np.newaxis].astype(np.float32)

    # Unpack bits, subtraction requires signed data type
    q1 = (ql[:,   :32 ] & 0xF) | (((qh[:, :32] >> 0) & 3) << 4) - 32
    q2 = (ql[:, 32:64 ] & 0xF) | (((qh[:, :32] >> 2) & 3) << 4) - 32
    q3 = (ql[:,   :32 ] >>  4) | (((qh[:, :32] >> 4) & 3) << 4) - 32
    q4 = (ql[:, 32:64 ] >>  4) | (((qh[:, :32] >> 6) & 3) << 4) - 32
    q5 = (ql[:, 64:96 ] & 0xF) | (((qh[:, 32:] >> 0) & 3) << 4) - 32
    q6 = (ql[:, 96:128] & 0xF) | (((qh[:, 32:] >> 2) & 3) << 4) - 32
    q7 = (ql[:, 64:96 ] >>  4) | (((qh[:, 32:] >> 4) & 3) << 4) - 32
    q8 = (ql[:, 96:128] >>  4) | (((qh[:, 32:] >> 6) & 3) << 4) - 32

    # Dequantize
    return scales * np.concatenate([
        sc[:,  0] * q1[:, :16],
        sc[:,  1] * q1[:, 16:],
        sc[:,  2] * q2[:, :16],
        sc[:,  3] * q2[:, 16:],
        sc[:,  4] * q3[:, :16],
        sc[:,  5] * q3[:, 16:],
        sc[:,  6] * q4[:, :16],
        sc[:,  7] * q4[:, 16:],
        sc[:,  8] * q5[:, :16],
        sc[:,  9] * q5[:, 16:],
        sc[:, 10] * q6[:, :16],
        sc[:, 11] * q6[:, 16:],
        sc[:, 12] * q7[:, :16],
        sc[:, 13] * q7[:, 16:],
        sc[:, 14] * q8[:, :16],
        sc[:, 15] * q8[:, 16:],
    ], axis=1) 

# @torch.jit.script
def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q6_K"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q6_K"]
    device = torch.device(device)
    num_blocks = len(data) // block_size
    data = np.frombuffer(data, dtype=data.dtype)
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_q6_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

kvalues_iq4nl = np.array([-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113], dtype=np.int8)

def dequantize_iq4_xs(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/21d3a308fcb7f31cb9beceaeebad4fb622f3c337/src/ggml-quants.c#L3568
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/21d3a308fcb7f31cb9beceaeebad4fb622f3c337/src/ggml-common.h#L393
    block_size = GGML_BLOCK_SIZES["IQ4_XS"]
    num_blocks = len(data) // block_size

    d = np.frombuffer(data, dtype=np.float16)[0::block_size//2].astype(np.float32).reshape(num_blocks, 1)
    scales_h = np.frombuffer(data, dtype=np.uint16)[1::block_size//2].reshape(num_blocks, 1)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)[:, 4:]
    scales_l = data_u8[:, :4].reshape(num_blocks, 4)
    qs = data_u8[:, 4:].reshape(num_blocks, block_size - 8)

    ls = np.zeros((num_blocks, QK_K // 32), dtype=np.int8)
    for ib in range(QK_K // 32):
        ls[:, ib] = ((scales_l[:, ib // 2] >> 4 * (ib % 2)) & 0xf) | (((scales_h[:, 0] >> 2 * ib) & 3) << 4)

    dl = (d * (ls - 32)).reshape(num_blocks, -1, 1)

    qs_lo_4 = qs[:, :QK_K // 2].reshape(num_blocks, -1, 16) & 0xf
    qs_hi_4 = qs[:, :QK_K // 2].reshape(num_blocks, -1, 16) >> 4

    y = np.zeros((num_blocks, QK_K), dtype=np.float32)
    for ib in range(QK_K // 32):
        y[:, ib*32:(ib*32)+16] = dl[:, ib] * kvalues_iq4nl[qs_lo_4[:, ib]]
        y[:, (ib*32)+16:(ib*32)+32] = dl[:, ib] * kvalues_iq4nl[qs_hi_4[:, ib]]

    return y.flatten()

def dequantize_iq4_xs_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["IQ4_XS"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["IQ4_XS"]
    device = torch.device(device)
    num_blocks = len(data) // block_size
    data = np.frombuffer(data, dtype=data.dtype)
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_iq4_xs(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

def dequantize_q4_0(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1515
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L141
    num_blocks = len(data) // GGML_BLOCK_SIZES["Q4_0"]

    scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 8)[:, :1].astype(np.float32)
    qs = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 16)[:, 2:]

    return np.concatenate([
        scales * ((qs & 0xf).astype(np.int8) - 8),
        scales * ((qs >> 4).astype(np.int8) - 8),
    ], axis=1)

def dequantize_q4_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    raise NotImplementedError()

def dequantize_q5_0(data):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1556
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L161
    num_blocks = len(data) // GGML_BLOCK_SIZES["Q5_0"]

    scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 2 + 8)[:, :1].astype(np.float32)
    qh = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 4 + 16)[:, 2:2 + 4]
    qs = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 4 + 16)[:, 2 + 4:]

    bits = np.unpackbits(qh, axis=-1, bitorder="little")

    x0 = ((qs & 0xf).astype(np.int8) | (bits[:, :16] << 4)) - 16
    x1 = ((qs >> 4).astype(np.int8) | (bits[:, 16:] << 4)) - 16

    return np.concatenate([
        scales * x0,
        scales * x1,
    ], axis=1)

def dequantize_q5_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    raise NotImplementedError()

def dequantize_q8_0(data):
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
    num_blocks = len(data) // GGML_BLOCK_SIZES["Q8_0"]

    scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 16)[:, :1].astype(np.float32)
    qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:]
    return scales * qs

def dequantize_q8_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
    
    block_size = GGML_BLOCK_SIZES["Q8_0"]
    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q8_0"]
    device = torch.device(device)
    data = np.frombuffer(data, dtype=data.dtype)
    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
    return KTransformersOps.dequantize_q8_0(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)


def dequantize_f32(data):
    return np.frombuffer(data, dtype=np.float32)

def dequantize_f32_gpu(data, device, target_dtype = torch.get_default_dtype()):
    data = np.frombuffer(data, dtype=np.float32)
    res = torch.from_numpy(data.copy())
    res_gpu = torch.empty_like(res, device=device, dtype=target_dtype)
    res_gpu.copy_(res)
    return res_gpu

def dequantize_f16(data):
    return np.frombuffer(data, dtype=np.float16)

def dequantize_f16_gpu(data, device, target_dtype = torch.get_default_dtype()):
    data = np.frombuffer(data, dtype=np.float16)
    res = torch.from_numpy(data.copy())
    res_gpu = torch.empty_like(res, device=device, dtype=target_dtype)
    res_gpu.copy_(res)
    return res_gpu

def dequantize_bf16_gpu(data, device, target_dtype = torch.get_default_dtype()):
    data = np.frombuffer(data, dtype=np.float16)
    res = torch.from_numpy(data.copy())
    res_gpu = torch.empty_like(res, device=device)
    res_gpu.copy_(res)
    return res_gpu

GGML_DEQUANTIZE = {
    "F32": dequantize_f32,
    "F16": dequantize_f16,
    "BF16": dequantize_f16,
    "Q4_0": dequantize_q4_0,
    "Q5_0": dequantize_q5_0,
    "Q8_0": dequantize_q8_0,
    "Q2_K": dequantize_q2_k,
    "Q3_K": dequantize_q3_k,
    "Q4_K": dequantize_q4_k,
    "Q5_K": dequantize_q5_k,
    "Q6_K": dequantize_q6_k,
    "IQ4_XS": dequantize_iq4_xs,
}

GGML_DEQUANTIZE_GPU = {
    "F32": dequantize_f32_gpu,
    "F16": dequantize_f16_gpu,
    "BF16": dequantize_bf16_gpu,
    "Q4_0": dequantize_q4_0_gpu,
    "Q5_0": dequantize_q5_0_gpu,
    "Q8_0": dequantize_q8_0_gpu,
    "Q2_K": dequantize_q2_k_gpu,
    "Q3_K": dequantize_q3_k_gpu,
    "Q4_K": dequantize_q4_k_gpu,
    "Q5_K": dequantize_q5_k_gpu,
    "Q6_K": dequantize_q6_k_gpu,
    "IQ4_XS": dequantize_iq4_xs_gpu,
}


def translate_name_to_gguf_mixtral(name):
    
    replacement_template = {
        "w1.weight": "ffn_gate",
        "w2.weight": "ffn_down",
        "w3.weight": "ffn_up"
    }  

    pattern = re.compile(r"model.layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)\.(w\d\.weight)")

    def replace_match(match):
        blk_id = match.group(1)
        expert_id = match.group(2)
        weight_type = match.group(3)
        if weight_type in replacement_template:
            return f"blk.{blk_id}.{replacement_template[weight_type]}.{expert_id}.weight"
        else:
            return match.group(0)

    new_name = re.sub(pattern, replace_match, name)
    
    return new_name

def translate_name_to_gguf(name):

    name = translate_name_to_gguf_mixtral(name)

    name = name.replace("lm_head.", "output.")
    name = name.replace("model.embed_tokens.", "token_embd.")
    name = name.replace("model.norm.", "output_norm.")
    
    name = name.replace("model.layers.", "blk.")
    name = name.replace(".input_layernorm", ".attn_norm")
    name = name.replace(".mlp.down_proj", ".ffn_down")
    name = name.replace(".mlp.gate_proj", ".ffn_gate")
    name = name.replace(".mlp.up_proj", ".ffn_up")
    name = name.replace(".post_attention_layernorm", ".ffn_norm")
    name = name.replace(".self_attn.q_proj", ".attn_q")
    name = name.replace(".self_attn.k_proj", ".attn_k")
    name = name.replace(".self_attn.v_proj", ".attn_v")
    name = name.replace(".self_attn.o_proj", ".attn_output")
    name = name.replace(".self_attn.qkv_proj", ".attn_qkv")
    name = name.replace(".self_attn.kv_a_proj_with_mqa", ".attn_kv_a_mqa")
    name = name.replace(".self_attn.kv_a_layernorm", ".attn_kv_a_norm")
    name = name.replace(".self_attn.kv_b_proj", ".attn_kv_b")
    name = name.replace(".self_attn.q_a_proj", ".attn_q_a")
    name = name.replace(".self_attn.q_a_layernorm", ".attn_q_a_norm")
    name = name.replace(".self_attn.q_b_proj", ".attn_q_b")
    
    name = name.replace(".shared_expert.", ".shared_experts.")
    name = name.replace(".shared_expert_", ".shared_experts_")
    name = name.replace(".gate_up_proj.", ".up_proj")
    
    name = name.replace(".mlp.shared_experts.down_proj", ".ffn_down_shexp")
    name = name.replace(".mlp.gate", ".ffn_gate_inp")
    name = name.replace(".mlp.shared_experts.gate_proj", ".ffn_gate_shexp")
    name = name.replace(".mlp.shared_experts.up_proj", ".ffn_up_shexp")
    name = name.replace(".mlp.shared_experts_gate", ".ffn_gate_inp_shexp")
    name = name.replace(".mlp.experts", "")
    name = name.replace(".mlp.experts.ffn_down_exps", ".ffn_down_exps")
    name = name.replace(".mlp.experts.ffn_gate_exps", ".ffn_gate_exps")
    name = name.replace(".mlp.experts.ffn_up_exps", ".ffn_up_exps")

    
    name = name.replace(".block_sparse_moe.gate.", ".ffn_gate_inp.")
    name = name.replace(".block_sparse_moe.experts", "")
    
    return name

def translate_adapter_name_to_gguf(name):

    # name = translate_name_to_gguf_mixtral(name)

    name = name.replace("lora_A.default.weight", "lora_A.weight")
    name = name.replace("lora_B.default.weight", "lora_B.weight")
    # NOT fine-tun embedding model
    # name = name.replace("base_model.model", "token_embd.")
    # name = name.replace("model.norm.", "output_norm.")
    
    name = name.replace("blk.", "model.layers.")
    # name = name.replace(".input_layernorm", ".attn_norm")
    # name = name.replace(".mlp.down_proj", ".ffn_down")
    # name = name.replace(".mlp.gate_proj", ".ffn_gate")
    # name = name.replace(".mlp.up_proj", ".ffn_up")
    # name = name.replace(".post_attention_layernorm", ".ffn_norm")
    # name = name.replace(".self_attn.q_proj", ".attn_q")
    # name = name.replace(".self_attn.k_proj", ".attn_k")
    # name = name.replace(".self_attn.v_proj", ".attn_v")
    # name = name.replace(".self_attn.o_proj", ".attn_output")
    # name = name.replace(".self_attn.qkv_proj", ".attn_qkv")
    # name = name.replace(".self_attn.kv_a_proj_with_mqa", ".attn_kv_a_mqa")
    # name = name.replace(".self_attn.kv_a_layernorm", ".attn_kv_a_norm")
    # name = name.replace(".self_attn.kv_b_proj", ".attn_kv_b")
    # name = name.replace(".self_attn.q_a_proj", ".attn_q_a")
    # name = name.replace(".self_attn.q_a_layernorm", ".attn_q_a_norm")
    # name = name.replace(".self_attn.q_b_proj", ".attn_q_b")
    
    # name = name.replace(".shared_expert.", ".shared_experts.")
    # name = name.replace(".shared_expert_", ".shared_experts_")
    # name = name.replace(".gate_up_proj.", ".up_proj")
    
    # name = name.replace(".mlp.shared_experts.down_proj", ".ffn_down_shexp")
    # name = name.replace(".mlp.gate", ".ffn_gate_inp")
    # name = name.replace(".mlp.shared_experts.gate_proj", ".ffn_gate_shexp")
    # name = name.replace(".mlp.shared_experts.up_proj", ".ffn_up_shexp")
    # name = name.replace(".mlp.shared_experts_gate", ".ffn_gate_inp_shexp")
    # name = name.replace(".mlp.experts", "")
    # name = name.replace(".mlp.experts.ffn_down_exps", ".ffn_down_exps")
    # name = name.replace(".mlp.experts.ffn_gate_exps", ".ffn_gate_exps")
    # name = name.replace(".mlp.experts.ffn_up_exps", ".ffn_up_exps")

    
    # name = name.replace(".block_sparse_moe.gate.", ".ffn_gate_inp.")
    # name = name.replace(".block_sparse_moe.experts", "")
    
    return name


if __name__ == '__main__':
    gguf_path = '/mnt/data/model/DeepSeek-Coder-V2-GGUF-WJH'
    loader = GGUFLoader(gguf_path)
    loader.load_gguf_tensor('token_embd.weight')


================================================
FILE: kt-sft/ktransformers/util/custom_loader.py
================================================
import struct
import warnings
import numpy as np
import re
import numpy.typing as npt
from typing import Sequence
import os
from enum import IntEnum
import torch
if not torch.xpu.is_available():
    import KTransformersOps
from safetensors import safe_open
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
from ktransformers.util.custom_gguf import *
from safetensors.torch import save_file
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, Union

class ModelLoader(ABC):
    """
    Abstract base class for model loaders.
    Defines the interface that all model loaders must implement.
    """
    tensor_file_map = {}
    @abstractmethod
    def has_tensor(cls, name: str):
        """
        Check if the tensor exists in the loader.
        
        Args:
            name: Name of the tensor to check
            
        Returns:
            bool: True if the tensor exists, False otherwise
        """
        pass

class SafeTensorLoader(ModelLoader):
    tensor_file_map: dict
    tensor_type_map: dict
    file_handle_map: dict
    tensor_device_map: dict
    
    def __init__(self, file_path: str):
        self.__load_tensor_file_map(file_path)

    def __load_tensor_file_map(self, file_path: str):
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Path not found: {file_path}")
        if os.path.isfile(file_path):
            folder_path = os.path.dirname(file_path)
        else:
            folder_path = file_path
        self.file_handle_map = {}
        self.tensor_file_map = {}
        self.tensor_type_map = {}
        self.tensor_device_map = {}

        found_safetensor = False
        for root, _, files in os.walk(folder_path):
            files = sorted(files)
            for file in files:
                if file.endswith(".safetensors"):
                    found_safetensor = True
                    file_path = os.path.join(root, file)
                    if file not in self.file_handle_map:
                        try:
                            handle = safe_open(file_path, framework="pt")
                            self.file_handle_map[file] = handle
                        except Exception as e:
                            print(f"Error opening Safetensor file {file_path}: {e}")
                            continue

                    f = self.file_handle_map.get(file)
                    if f is None:
                        continue
                    try:
                        for key in f.keys():
                            self.tensor_file_map[key] = file
                    except Exception as e:
                        print(f"Error reading Safetensor file {file_path}: {e}")

        # if not found_safetensor:
        #     raise FileNotFoundError(f"No Safetensor files found in {folder_path}")

    def load_tensor(self, key: str, device: str="cpu"):
        if translate_name_to_gguf(key) in self.tensor_file_map:
            key = translate_name_to_gguf(key)
        elif key in self.tensor_file_map:
            pass
        else:
            raise KeyError(f"Key {key} not found in Safetensor files")
        file = self.tensor_file_map[key]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(key)
        return tensor.to(device)

    def load_experts(self, key: str, device: str="cpu"):
        '''
        Load experts from safetensor
        key: the name of the experts
        device: the device to load the experts to
        return: dict, 
        {up: tensor, down: tensor, gate: tensor, up_type: int, down_type: int, gate_type: int}
        {xxx}_type: the type of the up tensor, corresponding to the ggml type
        '''
        if self.has_tensor(translate_name_to_gguf(key)+".ffn_gate_exps.weight"):
            # legacy branch for loading hybrid model
            base_key = translate_name_to_gguf(key)
            # Load experts from safetensor
            gate_key = f"{base_key}.ffn_gate_exps.weight"
            gate_type_key = f"{base_key}.ffn_gate_exps.ggml_type"
            up_key = f"{base_key}.ffn_up_exps.weight"
            up_type_key = f"{base_key}.ffn_up_exps.ggml_type"
            down_key = f"{base_key}.ffn_down_exps.weight"
            down_type_key = f"{base_key}.ffn_down_exps.ggml_type"
            gate_tensor = self.load_tensor(gate_key, device).numpy()
            up_tensor = self.load_tensor(up_key, device).numpy()
            down_tensor = self.load_tensor(down_key, device).numpy()
            gate_type = self.load_tensor(gate_type_key, device).item()
            up_type = self.load_tensor(up_type_key, device).item()
            down_type = self.load_tensor(down_type_key, device).item()

            return {
                "up": up_tensor,
                "gate": gate_tensor,
                "down": down_tensor,
                "up_type": up_type,
                "gate_type": gate_type,
                "down_type": down_type
            }

        else:
            # Load experts from safetensor
            base_key = key  # e.g. "model.layers.3.mlp.experts"
            experts_count = 0
            
            # First, count how many experts we have by checking for expert 0's up_proj
            while self.has_tensor(f"{base_key}.{experts_count}.up_proj.weight"):
                experts_count += 1
            
            if experts_count == 0:
                raise ValueError(f"No experts found for key {base_key}")
            
            # Initialize empty lists to store tensors for each projection type
            up_projs = []
            gate_projs = []
            down_projs = []
            
            # Load all expert weights
            for expert_id in range(experts_count):
                up_key = f"{base_key}.{expert_id}.up_proj.weight"
                gate_key = f"{base_key}.{expert_id}.gate_proj.weight"
                down_key = f"{base_key}.{expert_id}.down_proj.weight"
                
                up_tensor = self.load_tensor(up_key, device)
                gate_tensor = self.load_tensor(gate_key, device)
                down_tensor = self.load_tensor(down_key, device)
                
                up_projs.append(up_tensor)
                gate_projs.append(gate_tensor)
                down_projs.append(down_tensor)
            
            # Stack the tensors along a new dimension
            up_tensor = torch.stack(up_projs, dim=0)
            gate_tensor = torch.stack(gate_projs, dim=0)
            down_tensor = torch.stack(down_projs, dim=0)
            
            # Get original dtype for GGML type determination
            orig_up_dtype = up_tensor.dtype
            orig_gate_dtype = gate_tensor.dtype
            orig_down_dtype = down_tensor.dtype
            
            # Convert to numpy with proper bfloat16 support
            up_numpy = up_tensor.view(torch.uint16).numpy()
            gate_numpy = gate_tensor.view(torch.uint16).numpy()
            down_numpy = down_tensor.view(torch.uint16).numpy()
            
            # Determine tensor data types for GGML conversion
            def get_ggml_type(dtype):
                if dtype == torch.float32:
                    return GGMLQuantizationType.F32
                elif dtype == torch.float16:
                    return GGMLQuantizationType.F16
                elif dtype == torch.bfloat16:
                    return GGMLQuantizationType.BF16
                else:
                    raise ValueError(f"Unsupported tensor dtype: {dtype}")
            
            return {
                "up": up_numpy,
                "gate": gate_numpy,
                "down": down_numpy,
                "up_type": get_ggml_type(orig_up_dtype),
                "gate_type": get_ggml_type(orig_gate_dtype),
                "down_type": get_ggml_type(orig_down_dtype)
            }
                
    def load_gate(self, key: str, device: str="cpu"):
        '''
        Load gate from safetensor
        key: the name of the gate
        device: the device to load the gate to
        return: dict, 
        {'weight': tensor, 'e_score_correction_bias': tensor}
        '''
        target = ["weight", "e_score_correction_bias"]
        res = {'weight': None, 'e_score_correction_bias': None}
        if self.has_tensor(translate_name_to_gguf(key)+".ffn_gate_exps.weight"):
            # legacy branch for loading hybrid model
            base_key = key
            for k in target:
                translated_key = translate_name_to_gguf(f"{base_key}.{k}")
                if self.has_tensor(translated_key):
                    tensor = self.load_tensor(translated_key, device)
                    res[k] = tensor
        else:
            # Load gate from safetensor
            base_key = key
            for k in target:
                if self.has_tensor(f"{base_key}.{k}"):
                    tensor = self.load_tensor(f"{base_key}.{k}", device)
                    res[k] = tensor
        return res
    
    def close_all_handles(self):
        for handle in self.file_handle_map.values():
            handle.close()
        self.file_handle_map.clear()

    def load_dequantized_tensor(self, key:str, device: str="cpu"):
        if key in self.tensor_file_map and translate_name_to_gguf(key):
            pass
        elif translate_name_to_gguf(key) in self.tensor_file_map:
            key = translate_name_to_gguf(key)
        else:
            raise KeyError(f"Key {key} not found in Safetensor files")
        file = self.tensor_file_map[key]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(key).to(device)
        if key.endswith(".weight"):
            if key[:-7] + ".weight_scale_inv" in self.tensor_file_map:
                weight_scale_inv = f.get_tensor(key[:-7] + ".weight_scale_inv").to(device)
                tensor = weight_dequant(tensor, weight_scale_inv)
        return tensor.to(device)
    
    def has_tensor(self, name: str):
        return name in self.tensor_file_map or translate_name_to_gguf(name) in self.tensor_file_map

class GGUFLoader(ModelLoader):
    tensor_info: dict
    gguf_path: str
    tensor_file_map: dict # {tensor_name: tensor_file_path}
    gguf_file_meta: dict
    safetensor_loader: SafeTensorLoader
    def __init__(self, gguf_path: str):
        # Check dir exist
        if not os.path.exists(gguf_path):
            raise FileNotFoundError(f"GGUF dir not found: {gguf_path}")
        if os.path.isfile(gguf_path):
            gguf_path = os.path.dirname(gguf_path)

        self.safetensor_loader = None
        
        self.tensor_info = {}
        self.gguf_path = gguf_path
        self.tensor_file_map = {}
        self.file_data_map = {}
        self.gguf_file_meta = {}
        self.tensor_device_map = {}

		# I know this is ugly, but I don't want to change the original code too much
        # TODO: merge gguf load and other loads.
        safetensor_loader = SafeTensorLoader(gguf_path)
        if safetensor_loader.tensor_file_map:
            self.safetensor_loader = safetensor_loader
            return
        # Walk through all the .gguf files in the directory
        found_gguf = False
        for root, dirs, files in os.walk(gguf_path):
            for file in files:
                if file.endswith(".gguf"):
                    found_gguf = True
                    file_name = os.path.join(root, file)
                    with open(file_name, "rb") as f:
                        self.load_gguf(f)
                        if file_name not in self.file_data_map:
                            self.file_data_map[file_name] = np.memmap(file_name, mode = 'r')
        if not found_gguf:
            raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}")
                            
    def load_gguf(self, f):
        f.seek(0)
        assert f.read(4) == b'GGUF'
        values = struct.unpack("<IQQ", f.read(4+8+8))
        version, n_tensors, n_kv = values
        if version != 3:
            warnings.warn(f"Version {version} has never been tested, might not work")

        info = {}
        for _ in range(n_kv):
            name = read_value(f, DATA_TYPES["string"])

            data_type = struct.unpack("<I", f.read(4))[0]

            info[name] = read_value(f, data_type)

        tensor_info = {}
        for _ in range(n_tensors):
            name = read_value(f, DATA_TYPES["string"])
            shape_len = read_value(f, DATA_TYPES["uint32"])
            shape = [read_value(f, DATA_TYPES["uint64"]) for _ in range(shape_len)]
            ggml_type = read_value(f, DATA_TYPES["uint32"])
            bad_offset = read_value(f, DATA_TYPES["uint64"])
            n_elems = int(math.prod(shape))
            block_size, type_size = GGML_QUANT_SIZES[ggml_type]
            n_bytes = n_elems * type_size // block_size
            np_dims = tuple(reversed(shape))
        
            item_type: npt.DTypeLike
            if ggml_type == GGMLQuantizationType.F16:
                item_count = n_elems
                item_type = np.float16
            elif ggml_type == GGMLQuantizationType.F32:
                item_count = n_elems
                item_type = np.float32
            elif ggml_type == GGMLQuantizationType.F64:
                item_count = n_elems
                item_type = np.float64
            elif ggml_type == GGMLQuantizationType.I8:
                item_count = n_elems
                item_type = np.int8
            elif ggml_type == GGMLQuantizationType.I16:
                item_count = n_elems
                item_type = np.int16
            elif ggml_type == GGMLQuantizationType.I32:
                item_count = n_elems
                item_type = np.int32
            elif ggml_type == GGMLQuantizationType.I64:
                item_count = n_elems
                item_type = np.int64
            else:
                item_count = n_bytes
                item_type = np.uint8
                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)

            tensor_info[name] = {
                "ggml_type": ggml_type,
                "shape": shape,
                "bad_offset": bad_offset,
                "item_type": item_type,
                "item_count": item_count,
                "np_dims": np_dims
            }

        start = f.tell()
        # Alignment is 32 by default.
        # https://github.com/ggerganov/ggml/blob/e1daebbf9d38d510ba456c4d50b4500a73ac2b14/docs/gguf.md?plain=1#L253
        alignment = info.get("general.alignment", 32)

        # Inconveniently, the offset defined in gguf files is relative to the
        # end of the header and is unaligned.
        # We need to compute the absolute file offset ourselves instead.
        for t in tensor_info.values():
            offset = start + t["bad_offset"]
            offset += (alignment - offset % alignment) % alignment
            t["offset"] = offset
            
        for name in tensor_info:
            self.tensor_file_map[name] = f.name
        self.tensor_info.update(tensor_info)
        self.gguf_file_meta.update(info)
    
    def get_mmap_tensor(self, name):
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        mmap_data = self.file_data_map[ self.tensor_file_map[name] ]

        offset = t["offset"]
        item_type = t["item_type"]
        item_count = t["item_count"]
        itemsize = int(np.empty([], dtype = item_type).itemsize)
        return mmap_data[offset : offset + itemsize * item_count]

    def get_undequanted_tensor_and_ggml_type(self, name):
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        data = self.get_mmap_tensor(name)
        ggml_type = t["ggml_type"]
        data = torch.from_numpy(data)
        return data, ggml_type

    def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        shape = t["shape"]
        ggml_type = t["ggml_type"]
        if ggml_type not in GGML_NAMES:
            raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
        ggml_name = GGML_NAMES[ggml_type]

        # TODO: experts may fused in quant block, split it
        assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may fused in quant block, please use CPU dequant"

        blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name]
        block_size = GGML_BLOCK_SIZES[ggml_name]
        offset = expert_id * block_size * blocks_per_experts
        data = data[offset: offset + block_size * blocks_per_experts]

        if "cuda" in device.lower():
            values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype)
        else:
            values = GGML_DEQUANTIZE[ggml_name](data)
            values = torch.from_numpy(values.copy())

        if ggml_name == "BF16":
            values = values.view(torch.bfloat16)
        values = values.view(shape[-2::-1])

        return values

    def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor:
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        if target_dtype == None:
            target_dtype = torch.get_default_dtype()
        
        shape = t["shape"]
        ggml_type = t["ggml_type"]

        if ggml_type not in GGML_NAMES:
            raise NotImplementedError(f"ggml_type {ggml_type} not implemented")

        ggml_name = GGML_NAMES[ggml_type]

        data = self.get_mmap_tensor(name)

        block_size = GGML_BLOCK_SIZES[ggml_name]
        elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name]
        num_elements = int(np.prod(shape))
        num_blocks = num_elements // elements_per_block
        
        blocks_per_iter = 16384
        if num_blocks > blocks_per_iter: # dequant large tensor
            values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device)
            for i in range( (num_blocks + blocks_per_iter - 1) // blocks_per_iter):
                blocks_begin = i * blocks_per_iter
                blocks_end = min(blocks_begin + blocks_per_iter, num_blocks)
                if "cuda" in device.lower():
                    try:
                        cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin*block_size : blocks_end*block_size], device, target_dtype)
                    except:
                        cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size])
                        cur_values = torch.from_numpy(cur_values.copy()).to(device)
                else:
                    cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size])
                    cur_values = torch.from_numpy(cur_values.copy())
                
                cur_values = cur_values.view(-1, elements_per_block)
                if ggml_name == "BF16":
                    cur_values = cur_values.view(torch.bfloat16)
                values[blocks_begin : blocks_end] = cur_values
        else:
            if "cuda" in device.lower():
                values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
            else:
                np_values = np.copy(GGML_DEQUANTIZE[ggml_name](data))
                values = torch.from_numpy(np_values).to(device)
                del np_values

        if ggml_name == "BF16":
            values = values.view(torch.bfloat16)
            

        values = values.view(shape[::-1])
        if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
            n_head = self.gguf_file_meta['llama.attention.head_count']
            values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
            .swapaxes(1, 2)
            .reshape(values.shape))
        elif "attn_k" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
            n_head = self.gguf_file_meta['llama.attention.head_count_kv'] 
            values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
            .swapaxes(1, 2)
            .reshape(values.shape))
        return values
    def has_tensor(self, name: str):
        name = translate_name_to_gguf(name)
        return name in self.tensor_info

    def get_ggml_type(self, name: str):
        name = translate_name_to_gguf(name)
        if name not in self.tensor_info:
            raise KeyError(f"Key {name} not found in GGUF files")
        return self.tensor_info[name]["ggml_type"]
    
class ModelLoaderFactory:
    """
    Factory class for creating model loaders.
    Automatically detects the model format based on file extensions in the directory.
    """
    
    @staticmethod
    def create_loader(path: str):
        """
        Create a model loader for the given path by detecting the model format.
        The function checks for the presence of .safetensors or .gguf files
        in the specified path and creates the appropriate loader.
        
        Args:
            path: Path to the model directory or file
            
        Returns:
            An appropriate ModelLoader instance (SafeTensorLoader or GGUFLoader)
        
        Raises:
            FileNotFoundError: If no supported model files are found in the path
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")
            
        # Normalize to directory path if a file was provided
        if os.path.isfile(path):
            if path.endswith(".safetensors"):
                return SafeTensorLoader(path)
            elif path.endswith(".gguf"):
                return GGUFLoader(path)
            else:
                folder_path = os.path.dirname(path)
        else:
            folder_path = path
            
        # Check for safetensors files
        has_safetensors = False
        has_gguf = False
        
        for root, _, files in os.walk(folder_path):
            for file in files:
                if file.endswith(".safetensors"):
                    has_safetensors = True
                    break
                elif file.endswith(".gguf"):
                    has_gguf = True
                    break
            if has_safetensors or has_gguf:
                break
                
        # Create the appropriate loader based on detected file types
        # Prioritize SafeTensor over GGUF if both are present
        if has_safetensors:
            try:
                return SafeTensorLoader(folder_path)
            except Exception as e:
                print(f"Failed to create SafeTensorLoader: {e}")
                # Fall through to try GGUF if SafeTensor fails
                if not has_gguf:
                    raise
        
        if has_gguf:
            try:
                return GGUFLoader(folder_path)
            except Exception as e:
                print(f"Failed to create GGUFLoader: {e}")
                raise
        
        # No supported model files found
        raise FileNotFoundError(f"No .safetensors or .gguf files found in: {folder_path}")

================================================
FILE: kt-sft/ktransformers/util/globals.py
================================================
import os

class _GlobalConfig:
    def __init__(self):
        self._config = {
            "mod": 'infer', # infer or sft
        }

    def get(self, key, default=None):
        return self._config.get(key, default)

    def set(self, key, value):
        self._config[key] = value

    def update(self, **kwargs):
        self._config.update(kwargs)

    def all(self):
        return self._config

    def __getitem__(self, key):
        return self._config[key]

    def __setitem__(self, key, value):
        self._config[key] = value

GLOBAL_CONFIG = _GlobalConfig()


================================================
FILE: kt-sft/ktransformers/util/grad_wrapper.py
================================================
from functools import wraps
import torch, yaml, pathlib

import os, sys
project_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
sys.path.insert(0, project_dir)

from ktransformers.util.globals import GLOBAL_CONFIG

# print(f"start_sit: {GLOBAL_CONFIG._config['mod']}")

def maybe_no_grad(_func=None):
    # print(f"maybe_sit: {GLOBAL_CONFIG._config['mod']}")
    
    def decorator(func):
        # print(f"decorate_sit: {GLOBAL_CONFIG._config['mod']}")
        def wrapper(*args, **kwargs):
            # print(f"wrap_sit: {GLOBAL_CONFIG._config['mod']}")
            if GLOBAL_CONFIG._config["mod"] == "sft":
                return func(*args, **kwargs)
            elif GLOBAL_CONFIG._config["mod"] == "infer":
                with torch.no_grad():
                    return func(*args, **kwargs)
        return wrapper

    if _func is None:
        return decorator
    else:
        return decorator(_func)


================================================
FILE: kt-sft/ktransformers/util/inference_state.py
================================================

import enum


class InferenceState(enum.Enum):
    UNLOAD = 0
    PREFILL = 1
    GENERATE = 2
    RESTORE = 3


================================================
FILE: kt-sft/ktransformers/util/modeling_rope_utils.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import Optional, Tuple

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import is_torch_available, logging


logger = logging.get_logger(__name__)


if is_torch_available():
    import torch


def _compute_default_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        dim = int(head_dim * partial_rotary_factor)

    attention_factor = 1.0  # Unused in this type of RoPE

    # Compute the inverse frequencies
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
    return inv_freq, attention_factor


def _compute_linear_scaling_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        factor = rope_kwargs["factor"]
    elif config is not None:
        factor = config.rope_scaling["factor"]

    # Gets the default RoPE parameters
    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)

    # Then applies linear scaling to the frequencies.
    # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
    # applying scaling to the inverse frequencies is equivalent.
    inv_freq /= factor
    return inv_freq, attention_factor


def _compute_dynamic_ntk_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
        max_position_embeddings = rope_kwargs["max_position_embeddings"]
        factor = rope_kwargs["factor"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        dim = int(head_dim * partial_rotary_factor)
        max_position_embeddings = config.max_position_embeddings
        factor = config.rope_scaling["factor"]

    attention_factor = 1.0  # Unused in this type of RoPE

    # seq_len: default to max_position_embeddings, e.g. at init time
    seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings

    # Compute the inverse frequencies
    base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
    return inv_freq, attention_factor


def _compute_yarn_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://arxiv.org/abs/2309.00071)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # No need to keep BC with yarn, unreleased when this new pattern was created.
    if len(rope_kwargs) > 0:
        raise ValueError(
            f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
        )

    base = config.rope_theta
    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "qk_rope_head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)
    factor = config.rope_scaling["factor"]
    attention_factor = config.rope_scaling.get("attention_factor")
    mscale = config.rope_scaling.get("mscale")
    mscale_all_dim = config.rope_scaling.get("mscale_all_dim")

    # NOTE: DeekSeek-V3 (and potentially other models) modify `max_position_embeddings` and have a
    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
    # values to compute the default attention scaling factor, instead of using `factor`.
    if "original_max_position_embeddings" in config.rope_scaling:
        original_max_position_embeddings = config.rope_scaling["original_max_position_embeddings"]
        factor = config.max_position_embeddings / original_max_position_embeddings
    else:
        original_max_position_embeddings = config.max_position_embeddings

    def get_mscale(scale, mscale=1):
        if scale <= 1:
            return 1.0
        return 0.1 * mscale * math.log(scale) + 1.0

    # Sets the attention factor as suggested in the paper
    if attention_factor is None:
        if mscale and mscale_all_dim:
            attention_factor = float(get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dim))
        else:
            attention_factor = get_mscale(factor)

    # Optional config options
    # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
    beta_fast = config.rope_scaling.get("beta_fast") or 32
    beta_slow = config.rope_scaling.get("beta_slow") or 1

    # Compute the inverse frequencies
    def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
        """Inverse dimension formula to find the dimension based on the number of rotations"""
        return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

    def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
        """Find dimension range bounds based on rotations"""
        low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
        high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
        return max(low, 0), min(high, dim - 1)

    def linear_ramp_factor(min, max, dim):
        if min == max:
            max += 0.001  # Prevent singularity

        linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
        ramp_func = torch.clamp(linear_func, 0, 1)
        return ramp_func

    # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
    # to expand the possible context length. In other words, interpolation = apply scaling factor.
    pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
    inv_freq_extrapolation = 1.0 / pos_freqs
    inv_freq_interpolation = 1.0 / (factor * pos_freqs)

    low, high = find_correction_range(beta_fast, beta_slow, dim, base, original_max_position_embeddings)

    # Get n-dimensional rotational scaling corrected for extrapolation
    inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device)
    inv_freq = (
        inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
        + inv_freq_extrapolation * inv_freq_extrapolation_factor
    )
    return inv_freq, attention_factor


def _compute_longrope_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
    # No need to keep BC with longrope, unreleased when this new pattern was created.
    if len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got "
            f"{rope_kwargs}"
        )

    base = config.rope_theta
    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)
    long_factor = config.rope_scaling["long_factor"]
    short_factor = config.rope_scaling["short_factor"]
    factor = config.rope_scaling.get("factor")
    attention_factor = config.rope_scaling.get("attention_factor")

    # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
    # values to compute the default attention scaling factor, instead of using `factor`.
    if hasattr(config, "original_max_position_embeddings"):
        original_max_position_embeddings = config.original_max_position_embeddings
        factor = config.max_position_embeddings / config.original_max_position_embeddings
    else:
        original_max_position_embeddings = config.max_position_embeddings

    # Sets the attention factor as suggested in the paper
    if attention_factor is None:
        if factor <= 1.0:
            attention_factor = 1.0
        else:
            attention_factor = math.sqrt(1 + math.log(factor) / math.log(original_max_position_embeddings))

    # Compute the inverse frequencies -- scaled based on the target sequence length
    if seq_len and seq_len > original_max_position_embeddings:
        ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
    else:
        ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
    inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
    inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)

    return inv_freq, attention_factor


def _compute_llama3_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # Gets the default RoPE parameters
    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)

    factor = config.rope_scaling["factor"]  # `8` in the original implementation
    low_freq_factor = config.rope_scaling["low_freq_factor"]  # `1` in the original implementation
    high_freq_factor = config.rope_scaling["high_freq_factor"]  # `4` in the original implementation
    old_context_len = config.rope_scaling["original_max_position_embeddings"]  # `8192` in the original implementation

    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor

    wavelen = 2 * math.pi / inv_freq
    # wavelen < high_freq_wavelen: do nothing
    # wavelen > low_freq_wavelen: divide by factor
    inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
    # otherwise: interpolate between the two, using a smooth factor
    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
    is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
    inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)

    return inv_freq_llama, attention_factor


# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
# parameterizations, as long as the callable has the same signature.
ROPE_INIT_FUNCTIONS = {
    "default": _compute_default_rope_parameters,
    "linear": _compute_linear_scaling_rope_parameters,
    "dynamic": _compute_dynamic_ntk_parameters,
    "yarn": _compute_yarn_parameters,
    "longrope": _compute_longrope_parameters,
    "llama3": _compute_llama3_parameters,
}


def _check_received_keys(
    rope_type: str,
    received_keys: set,
    required_keys: set,
    optional_keys: Optional[set] = None,
    ignore_keys: Optional[set] = None,
):
    """Compare the received keys in `config.rope_scaling` against the expected and optional keys"""
    # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present
    if "type" in received_keys:
        received_keys -= {"type"}
        required_keys.add("rope_type")

    # Some models need to store model-specific keys, and we don't want to throw warning at them
    if ignore_keys is not None:
        received_keys -= ignore_keys

    missing_keys = required_keys - received_keys
    if missing_keys:
        raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}")

    if optional_keys is not None:
        unused_keys = received_keys - required_keys - optional_keys
    else:
        unused_keys = received_keys - required_keys
    if unused_keys:
        logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}")


def _validate_default_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)


def _validate_linear_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")


def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor"}
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    optional_keys = {"original_max_position_embeddings"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")


def _validate_yarn_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor"}
    optional_keys = {
        "attention_factor",
        "beta_fast",
        "beta_slow",
        "original_max_position_embeddings",
        "mscale",
        "mscale_all_dim",
    }
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    attention_factor = rope_scaling.get("attention_factor")
    if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0):
        logger.warning(
            f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
        )
    beta_fast = rope_scaling.get("beta_fast")
    if beta_fast is not None and not isinstance(beta_fast, float):
        logger.warning(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}")
    beta_slow = rope_scaling.get("beta_slow")
    if beta_slow is not None and not isinstance(beta_slow, float):
        logger.warning(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}")

    if (beta_fast or 32) < (beta_slow or 1):
        logger.warning(
            f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} "
            f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)"
        )


def _validate_longrope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "short_factor", "long_factor"}
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)

    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)

    short_factor = rope_scaling.get("short_factor")
    if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor):
        logger.warning(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}")
    if not len(short_factor) == dim // 2:
        logger.warning(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}")

    long_factor = rope_scaling.get("long_factor")
    if not isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor):
        logger.warning(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}")
    if not len(long_factor) == dim // 2:
        logger.warning(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}")

    # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over
    # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is
    # unique to longrope (= undesirable)
    if hasattr(config, "original_max_position_embeddings"):
        logger.warning_once(
            "This model has set a `original_max_position_embeddings` field, to be used together with "
            "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`"
            "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, "
            "as it is compatible with most model architectures."
        )
    else:
        factor = rope_scaling.get("factor")
        if factor is None:
            logger.warning("Missing required keys in `rope_scaling`: 'factor'")
        elif not isinstance(factor, float) or factor < 1.0:
            logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

        attention_factor = rope_scaling.get("attention_factor")
        if attention_factor is not None:
            if not isinstance(attention_factor, float) or attention_factor < 0.0:
                logger.warning(
                    f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
                )


def _validate_llama3_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    low_freq_factor = rope_scaling["low_freq_factor"]
    high_freq_factor = rope_scaling["high_freq_factor"]
    if low_freq_factor is None or not isinstance(low_freq_factor, float):
        logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}")
    if high_freq_factor is None or not isinstance(high_freq_factor, float):
        logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}")
    if high_freq_factor <= low_freq_factor:
        logger.warning(
            "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor="
            f"{high_freq_factor} and low_freq_factor={low_freq_factor}"
        )

    original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]
    if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int):
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be an integer, got "
            f"{original_max_position_embeddings}"
        )
    if original_max_position_embeddings >= config.max_position_embeddings:
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got "
            f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}"
        )


# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types.
ROPE_VALIDATION_FUNCTIONS = {
    "default": _validate_default_rope_parameters,
    "linear": _validate_linear_scaling_rope_parameters,
    "dynamic": _validate_dynamic_scaling_rope_parameters,
    "yarn": _validate_yarn_parameters,
    "longrope": _validate_longrope_parameters,
    "llama3": _validate_llama3_parameters,
}


def rope_config_validation(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    """
    Validate the RoPE config arguments, given a `PretrainedConfig` object
    """
    rope_scaling = getattr(config, "rope_scaling", None)  # not a default parameter in `PretrainedConfig`
    if rope_scaling is None:
        return

    # BC: "rope_type" was originally "type"
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default"))
    validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type)
    if validation_fn is not None:
        validation_fn(config, ignore_keys=ignore_keys)
    else:
        logger.warning(
            f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'"
        )

================================================
FILE: kt-sft/ktransformers/util/textstream.py
================================================
from typing import Any, List, Optional, Set
class TextStreamer:

    def __init__(self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, **decode_kwargs):
        self.tokenizer = tokenizer
        self.skip_prompt = skip_prompt
        self.decode_kwargs = decode_kwargs

        # variables used in the streaming process
        self.token_cache = []
        self.print_len = 0
        self.next_tokens_are_prompt = True

    def reset(self):
        self.token_cache = []
        self.print_len = 0

    def put(self, value)->Optional[str]:
        """
        Receives tokens, decodes them, and prints them to stdout as soon as they form entire words.
        """        
        if not isinstance(value,int):
            raise ValueError("TextStreamer only supports batch size 1, and int type input")


        if self.skip_prompt and self.next_tokens_are_prompt:
            self.next_tokens_are_prompt = False
            return None

        # Add the new token to the cache and decodes the entire thing.
        self.token_cache.append(value)
        text = self.tokenizer.decode(self.token_cache, skip_special_tokens=True,**self.decode_kwargs)

        # After the symbol for a new line, we flush the cache.
        if text.endswith("\n"):
            printable_text = text[self.print_len :]
            self.reset()
        # If the last token is a CJK character, we print the characters.
        elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
            printable_text = text[self.print_len :]
            self.print_len += len(printable_text)
        # Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
        # which may change with the subsequent token -- there are probably smarter ways to do this!)
        else:
            printable_text = text[self.print_len : text.rfind(" ") + 1]
            self.print_len += len(printable_text)
        return printable_text

    def end(self)->Optional[str]:
        """Flushes any remaining cache and prints a newline to stdout."""
        # Flush the cache, if it exists
        if len(self.token_cache) > 0:
            text = self.tokenizer.decode(self.token_cache, skip_special_tokens=True, **self.decode_kwargs)
            printable_text = text[self.print_len :]
            self.reset()
        else:
            printable_text = ""

        self.next_tokens_are_prompt = True
        return printable_text
   
    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)  #
            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
        ):  #
            return True

        return False

================================================
FILE: kt-sft/ktransformers/util/utils.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :  
Author       : Boxin Zhang, Azure-Tang
Version      : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
'''
import torch
from torch import nn
import itertools
import time
import enum
from typing import Any, List, Optional, Set
from transformers import (
    LogitsProcessorList,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
    MinPLogitsWarper,
    TypicalLogitsWarper,
    EpsilonLogitsWarper,
    EtaLogitsWarper,
)
from torchviz import make_dot
# from ktransformers.sft.peft_utils.lora_layer import KTransformersLinearLora
from ktransformers.util.custom_loader import ModelLoaderFactory, ModelLoader, SafeTensorLoader, GGUFLoader, translate_name_to_gguf, translate_adapter_name_to_gguf
from ktransformers.operators import base_operator
from ktransformers.models.custom_cache import StaticCache
from ktransformers.util.cuda_graph_runner import CUDAGraphRunner
from ktransformers.util.textstream import TextStreamer
from ktransformers.util.globals import GLOBAL_CONFIG
if not torch.xpu.is_available():
    from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton
import socket

from transformers.generation.logits_process import LogitsProcessor
# from transformers import TextStreamer # !!! this will override the TextStreamer from ktransformers.util.textstream

class NoEosUntil(LogitsProcessor):
    def __init__(self, prompt_len: int, min_gen_len: int, eos_ids):
        super().__init__()
        self.start_len = int(prompt_len)
        self.min_len   = self.start_len + int(min_gen_len)
        self.eos_ids   = list(eos_ids) if isinstance(eos_ids,(list,tuple)) else [int(eos_ids)]

    def __call__(self, input_ids, scores):
        if input_ids.shape[-1] < self.min_len:
            scores[..., self.eos_ids] = -float("inf")
        return scores

class SilentCaptureStreamer(TextStreamer):
    def __init__(self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, **decode_kwargs):
        super().__init__(tokenizer, skip_prompt=skip_prompt, **decode_kwargs)
        self._buf: List[str] = []

    def _append_piece(self, piece: Optional[str]):
        if piece:
            self._buf.append(piece)

    def put(self, value) -> str:
        tokens: List[int] = []
        if isinstance(value, int):
            tokens = [value]
        else:
            try:
                import torch
                if isinstance(value, torch.Tensor):
                    tokens = list(map(int, value.view(-1).tolist()))
                elif isinstance(value, (list, tuple)) and all(isinstance(x, int) for x in value):
                    tokens = list(value)
                else:
                    raise ValueError("Unsupported value type for SilentCaptureStreamer.put")
            except Exception:
                if isinstance(value, (list, tuple)) and all(isinstance(x, int) for x in value):
                    tokens = list(value)
                else:
                    raise ValueError("Unsupported value type for SilentCaptureStreamer.put")
        for t in tokens:
            piece = super().put(t)
            self._append_piece(piece)
        return ""

    def end(self) -> str:
        piece = super().end()
        self._append_piece(piece)
        return ""

    def getvalue(self) -> str:
        return "".join(self._buf)

    def clear(self):
        self._buf.clear()

warm_uped = False

def get_free_ports(n: int, continue_prot: list):
    sockets = []
    ports = []
    for _ in range(n):
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.bind(("", 0)) 
        port = s.getsockname()[1]
        if port in continue_prot:
            s.close()
            continue
        ports.append(port)
        sockets.append(s)
    for s in sockets:
        s.close()
    return ports

def get_compute_capability(device:torch.device = None):
    if torch.cuda.is_available():
        if device is None:
            num_gpus = torch.cuda.device_count()
            min_compute_capability_major = 100
            for gpu_id in range(num_gpus):
                gpu_props = torch.cuda.get_device_properties(gpu_id)
                min_compute_capability_major = min(min_compute_capability_major, gpu_props.major)
            return min_compute_capability_major
        else:
            return torch.cuda.get_device_properties(device)
    else:
        return 0

def set_module(model, submodule_key, module):
    tokens = submodule_key.split('.')
    sub_tokens = tokens[:-1]
    cur_mod = model
    for s in sub_tokens:
        if hasattr(cur_mod, s):
            cur_mod = getattr(cur_mod, s)
        else: # nn.ModuleList or nn.ModuleList
            cur_mod=cur_mod[int(s)]
    if hasattr(cur_mod, tokens[-1]):
        setattr(cur_mod, tokens[-1], module)
    else: # nn.ModuleList or nn.ModuleList
        cur_mod[int(tokens[-1])] = module

def set_param(module: nn.Module, name: str, weights: torch.Tensor):
    
    param=nn.parameter.Parameter(weights, requires_grad=True)
    if isinstance(module, nn.Linear) and len(weights.shape)==1:
        param.unsqueeze_(0)
    setattr(module, name, param)

def get_device(gguf_module_key:str, device_map:dict):
    if gguf_module_key in device_map:
        return device_map[gguf_module_key]["generate_device"]
    elif gguf_module_key.replace("model.layers", "blk") in device_map:
        return device_map[gguf_module_key.replace("model.layer", "blk")]["generate_device"]
    else:
        return "cuda"

def get_all_used_cuda_device(device_map:dict):
    all_device_list = set()
    for key in device_map:
        all_device_list.add(device_map[key]["generate_device"]) if "generate_device" in device_map[key] else None
        all_device_list.add(device_map[key]["prefill_device"]) if "prefill_device" in device_map[key] else None
    if "cpu" in all_device_list:
        all_device_list.remove("cpu")
    all_device_list = list(all_device_list)
    return all_device_list

def load_cur_state_dict(module: nn.Module, gguf_loader: ModelLoader, prefix: str = "", device="cuda", adapter_gguf: bool = False):
    if GLOBAL_CONFIG._config["mod"] == 'sft':
        prefix = prefix.replace("orig_module.", "")
        persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
        local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
        local_state = {k: v for k, v in local_name_params if v is not None}
        for name, param in local_state.items():
            key = prefix + name
            translated_key = translate_name_to_gguf(key)
            if adapter_gguf == True:
                translated_adapter_key = translate_adapter_name_to_gguf(key)

            # TODO: Merge all loader.
            # I know this is ugly but lets do it for now.
            if gguf_loader.safetensor_loader is not None:
                load_dequantized_tensor = gguf_loader.safetensor_loader.load_dequantized_tensor
                tensor_file_map = gguf_loader.safetensor_loader.tensor_file_map
            else:
                load_dequantized_tensor = gguf_loader.load_gguf_tensor
                tensor_file_map = gguf_loader.tensor_file_map
            # print(f"tensor_file_map:{tensor_file_map}")
            # We allow some key not be used in GGUF
            if translated_key in tensor_file_map:
                target_dtype = torch.get_default_dtype()
                device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
                print(f"loading {translated_key} to {device}")
                torch.cuda.empty_cache()
                weights = load_dequantized_tensor(translated_key, device=device).to(dtype=target_dtype)
                set_param(module, name, weights)
                del weights
            else:
                if adapter_gguf == True: # Not all module should be reload in lora adapter
                    for single_tensor_file_map in tensor_file_map:
                        if translated_adapter_key in single_tensor_file_map:
                            target_dtype = torch.get_default_dtype()
                            device = get_device(single_tensor_file_map[:single_tensor_file_map.rfind(".")], gguf_loader.tensor_device_map)
                            print(f"loading {single_tensor_file_map} to {device}")
                            torch.cuda.empty_cache()
                            weights = load_dequantized_tensor(single_tensor_file_map, device=device).to(dtype=target_dtype)
                            set_param(module, name, weights)
                            del weights

                else:
                    #print(load_config.tensor_file_map.keys())
                    raise Exception(f"can't find {translated_key} in GGUF file!")
    elif GLOBAL_CONFIG._config["mod"] == 'infer':
        prefix = prefix.replace("orig_module.", "")
        persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
        local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
        local_state = {k: v for k, v in local_name_params if v is not None}
        for name, param in local_state.items():
            key = prefix + name
            translated_key = key
            
            # TODO: Merge all loader.
            # I know this is ugly but lets do it for now.
            if isinstance(gguf_loader, SafeTensorLoader):
                load_dequantized_tensor = gguf_loader.load_dequantized_tensor
            else:
                load_dequantized_tensor = gguf_loader.load_gguf_tensor
                tensor_file_map = gguf_loader.tensor_file_map
            
            if gguf_loader.has_tensor(translated_key) or "kv_b_proj" in translated_key:
                target_dtype = torch.get_default_dtype()
                device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
                print(f"loading {translated_key} to {device}")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                elif torch.xpu.is_available():
                    torch.xpu.empty_cache()
                if "kv_b_proj" in translated_key and not gguf_loader.has_tensor(translated_key):
                    attn_k_b = load_dequantized_tensor(translated_key.replace("self_attn.kv_b_proj", "attn_k_b"), device=device).to(dtype=target_dtype)
                    attn_k_b = attn_k_b.transpose(1, 2).contiguous()
                    attn_v_b = load_dequantized_tensor(translated_key.replace("self_attn.kv_b_proj", "attn_v_b"), device=device).to(dtype=target_dtype)
                    kv_b_proj = torch.cat((attn_k_b, attn_v_b), dim=1)
                    kv_b_proj = kv_b_proj.contiguous() if kv_b_proj.ndim == 2 else kv_b_proj.flatten(0, 1).contiguous()
                    set_param(module, name, kv_b_proj)
                    del attn_k_b
                    del attn_v_b
                else:
                    weights = load_dequantized_tensor(translated_key, device=device).to(dtype=target_dtype)
                    set_param(module, name, weights)
                    del weights
            else:
                #print(load_config.tensor_file_map.keys())
                raise Exception(f"can't find {translated_key} in GGUF file!")
        

def sync_all_device(all_device_list):
    for device in all_device_list:
        if "cuda" in device.lower():
            torch.cuda.synchronize(device)
        elif "xpu" in device.lower():
            torch.xpu.synchronize(device)
        else:
            raise RuntimeError("The device {} is not available".format(device))

torch_device_mapping ={"cuda": "cuda:0", "xpu": "xpu:0"}

def xpu_fp16_model(config):
    # This function is to check if we run this model on XPU with FP16 dtype
    if not torch.xpu.is_available():
        return False
    if config.architectures[0] == "DeepseekV3ForCausalLM":
        return True
    if config.architectures[0] == "Qwen3MoeForCausalLM" and config.hidden_size == 4096:
        # Qwen3-30B seems have precision issue with FP16
        # so we only use FP16 for Qwen3-235B now
        return True
    return False

def load_weights(module:nn.Module, gguf_loader:ModelLoader, prefix='', device="cuda", adapter_gguf=False):
    #print(f"recursively loading weights {prefix}")
    if not isinstance(module, base_operator.BaseInjectedModule):
        load_cur_state_dict(module, gguf_loader, prefix, device=device, adapter_gguf=adapter_gguf, )
        for name, child in module._modules.items():
            load_weights(child, gguf_loader, prefix+name+".", device=device, adapter_gguf=adapter_gguf, )
    else:
        if adapter_gguf == True:
            # TODO: This is not the best choice, because we should change the value of gguf_loader in BaseInjectModule, but up to now, it can still work
            try: # for other class inherit from BaseInjectModule, but not inherit from KTLinear
                module.load(gguf_loader=gguf_loader, adapter_gguf=adapter_gguf)
            except: # for only KTLinear up to now
                module.load()
        else:
            module.load()

def tf_logits_warper(generation_config):
        """
        This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsWarper`] instances
        used for multinomial sampling.
        """

        # instantiate warpers list
        warpers = LogitsProcessorList()

        # In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
        # better score (i.e. keep len(list(generation_config._eos_token_tensor)) + 1)
        if generation_config.num_beams > 1:
            if isinstance(generation_config._eos_token_tensor, list):
                min_tokens_to_keep = len(generation_config._eos_token_tensor) + 1
            elif isinstance(generation_config._eos_token_tensor, torch.Tensor):
                min_tokens_to_keep = generation_config._eos_token_tensor.shape[0] + 1
            else:
                min_tokens_to_keep = 2
        else:
            min_tokens_to_keep = 1

        # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
        # all samplers can be found in `generation_utils_samplers.py`
        if generation_config.temperature is not None and generation_config.temperature != 1.0:
            warpers.append(TemperatureLogitsWarper(generation_config.temperature))
        if generation_config.top_k is not None and generation_config.top_k != 0:
            warpers.append(TopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep))
        if generation_config.top_p is not None and generation_config.top_p < 1.0:
            warpers.append(TopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=min_tokens_to_keep))
        if generation_config.min_p is not None:
            # Applied after temperature scaling (see https://github.com/ggerganov/llama.cpp/pull/3841#issuecomment-2073826084)
            warpers.append(MinPLogitsWarper(min_p=generation_config.min_p, min_tokens_to_keep=min_tokens_to_keep))
        if generation_config.typical_p is not None and generation_config.typical_p < 1.0:
            warpers.append(
                TypicalLogitsWarper(mass=generation_config.typical_p, min_tokens_to_keep=min_tokens_to_keep)
            )
        if generation_config.epsilon_cutoff is not None and 0.0 < generation_config.epsilon_cutoff < 1.0:
            warpers.append(
                EpsilonLogitsWarper(epsilon=generation_config.epsilon_cutoff, min_tokens_to_keep=min_tokens_to_keep)
            )
        if generation_config.eta_cutoff is not None and 0.0 < generation_config.eta_cutoff < 1.0:
            warpers.append(
               EtaLogitsWarper(
                    epsilon=generation_config.eta_cutoff, min_tokens_to_keep=min_tokens_to_keep, device=device
                )
            )
        # `LogitNormalization` should always be the last logit processor, when present
        if generation_config.renormalize_logits is True:
            warpers.append(LogitNormalization())
        return warpers

def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cuda_graph: bool = True,
                         mode = 'normal', force_think: bool = False, chunk_size = 16384, use_flashinfer_mla = False,
                         num_heads = None, head_dim_ckv = None, head_dim_kpe = None, q_head_dim = None):
    import os
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    torch._dynamo.config.suppress_errors = True
    batch_size, seq_length = inputs.shape
    device_map = model.gguf_loader.tensor_device_map
    torch_device = get_device('model.layers.0.self_attn', device_map)
    # torch_device = "cuda:0" if torch_device == "cuda" else torch_device
    torch_device = torch_device_mapping[torch_device] if torch_device in torch_device_mapping else torch_device
    inputs = inputs.to(torch_device)
    all_cuda_device = get_all_used_cuda_device(device_map)

    tokens = []
    
    def decode_one_tokens(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph: bool = True):
        if cuda_graph_runner is None:
            use_cuda_graph = False
        if use_cuda_graph:
            logits = cuda_graph_runner(cur_token, position_ids, cache_position)
        else:
            # custom_stream = torch.cuda.Stream()
            if torch.cuda.is_available():
                torch.cuda.set_device(torch_device)
            elif torch.xpu.is_available():
                torch.xpu.set_device(torch_device)
            else:
                raise RuntimeError(f"The device: {torch_device} is not available")
            inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to(torch_device)
            # with torch.cuda.stream(custom_stream):
            logits=model(inputs_embeds=inputs_embeds,
                        position_ids=position_ids,
                        cache_position=cache_position,
                        past_key_values=past_key_values,
                        return_dict=False, use_cache=True)[0]
        if past_key_values != None and isinstance(past_key_values, StaticCache):
            past_key_values.change_seq_length(1)
        sync_all_device(all_cuda_device)
        # print(logits)
        next_token_scores = logits_warper(inputs, logits[:, -1, :])
        if generation_config.do_sample:
            probs = nn.functional.softmax(next_token_scores, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
        else:
            next_token = torch.argmax(next_token_scores, dim=-1)
        return next_token
    
    # TODO: use CUDA Graph for chunk prefill, may get small improvement
    def chunk_prefill(inputs, cache_position, past_key_values):
        if mode == "long_context":
            inputs_embeds = model.model.embed_tokens(inputs.to("cpu"))
        else:
            print(f"torch_device:{torch_device}")
            inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
        if use_flashinfer_mla:
            MLAWrapperSingleton.update_buffer(past_key_values.max_pages)
            MLAWrapperSingleton.need_plan_all()
            
        logits = model(
            inputs_embeds = inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
        )[0][:,-1,:].unsqueeze(0).clone().to(torch_device)
        
        return logits
    
    if torch.cuda.is_available():
        torch.cuda.set_device(torch_device)
    elif torch.xpu.is_available():
        torch.xpu.set_device(torch_device)
    else:
        raise RuntimeError(f"The device: {torch_device} is not available")
    with torch.no_grad():
        
        stream = TextStreamer(tokenizer)
        if torch.xpu.is_available():
            from ipex_llm.transformers.kv import DynamicUnbalancedFp8Cache, DynamicNormalCache
            if model.config.architectures[0] in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
                past_key_values = DynamicUnbalancedFp8Cache.from_legacy_cache(None)
            else:
                past_key_values = DynamicNormalCache.from_legacy_cache(None)
        elif mode != 'long_context':
            past_key_values = StaticCache(
                config = model.config, max_batch_size = 1, max_cache_len = seq_length + max_new_tokens, device = device_map, dtype = model.dtype
            )
        else:
            past_key_values = None
        
        generation_config, model_kwargs = model._prepare_generation_config(
            None, do_sample=True
            # change this to modify generate config
            #top_k=5, top_p=0.85, temperature=0.1
        )

        logits_warper = tf_logits_warper(generation_config)

        cache_position = torch.arange(seq_length, device=torch_device, dtype=torch.int32)
        generated_ids = torch.zeros(
            batch_size, seq_length + max_new_tokens + 1, dtype=torch.int, device=torch_device
        )
        generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
        start_time = time.time()

        chunk_start = 0
        while chunk_start < seq_length:
            chunk_end = min(chunk_start + chunk_size, seq_length)
            if past_key_values != None:
                past_key_values.cur_idx=cache_position[chunk_start:chunk_end]
            logits = chunk_prefill(inputs[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end], past_key_values)
            chunk_start += chunk_size

        next_token_scores = logits_warper(inputs, logits[:, -1, :])
        if generation_config.do_sample:
            probs = nn.functional.softmax(next_token_scores, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
        else:
            next_token = torch.argmax(next_token_scores, dim=-1)
            
        # decoded_first = tokenizer.decode(next_token)
        # print(f"\n[DEBUG] first token id={next_token.item()} decoded='{decoded_first}'\n")

        first_token_time = time.time() - start_time
        
        if use_flashinfer_mla:
            MLAWrapperSingleton.reset_buffer()

        prefill_count = seq_length
        prefill_time = first_token_time
        if force_think:
            print("<think>")
        print(stream.put(next_token.item()), end="", flush=True)
        # stream.put(next_token.item())
        generated_ids[:, seq_length] = next_token
        tokens.append(int(next_token))
        inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
        cache_position = torch.tensor([seq_length], device=torch_device, dtype=torch.int32)
        position_ids = cache_position.unsqueeze(0)
        seq_length += 1
        
        cuda_graph_runner = None
            
        start_time = time.time()
        for i in range(1, max_new_tokens):
            if use_flashinfer_mla:
                MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,None,
                                             num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size,
                                             model.model.layers[0].self_attn.softmax_scale, torch.bfloat16, torch.bfloat16)
            global warm_uped
            if use_cuda_graph and ( (warm_uped == True and int(i) == 1) or (warm_uped == False and int(i) == 2) ):
                warm_uped = True
                cuda_graph_runner = CUDAGraphRunner()
                cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, torch_device, return_dict=False, use_cache=True)
            next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph).to(torch_device)
            inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
            generated_ids[:, cache_position] = next_token.int()
            tokens.append(int(next_token))
            seq_length += 1
            
            if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>':
                # print(stream.end(), end="", flush=True)
                stream.end()
                break
            else:
                print(stream.put(next_token.item()), end="", flush=True)
                # stream.put(next_token.item())
            cache_position += 1
            position_ids = cache_position.unsqueeze(0)
        

    total_time = time.time() - start_time
    tokens_generated = len(tokens)
    tokens_per_second = tokens_generated / total_time

    print("")

    print(f"prompt eval count:    {prefill_count} token(s)")
    print(f"prompt eval duration: {prefill_time}s")
    print(f"prompt eval rate:     {prefill_count/prefill_time} tokens/s")
    print(f"eval count:           {tokens_generated} token(s)")
    print(f"eval duration:        {total_time}s")
    print(f"eval rate:            {tokens_per_second} tokens/s")

    return tokens

def prefill_and_generate_capture(
    model, tokenizer, inputs,
    max_new_tokens=10000, use_cuda_graph: bool = True,
    mode='normal', force_think: bool = False, chunk_size=16384,
    use_flashinfer_mla=False, num_heads=None,
    head_dim_ckv=None, head_dim_kpe=None, q_head_dim=None,
    echo_stream: bool = True,
):
    """
    echo_stream=False 时，将不会在终端输出，只写入返回值。
    """
    import os, time, torch, torch.nn as nn
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    torch._dynamo.config.suppress_errors = True
    batch_size, seq_length = inputs.shape
    device_map = model.gguf_loader.tensor_device_map
    torch_device = get_device('model.layers.0.self_attn', device_map)
    torch_device = torch_device_mapping.get(torch_device, torch_device)
    inputs = inputs.to(torch_device)
    all_cuda_device = get_all_used_cuda_device(device_map)
    tokens = []

    def decode_one_tokens(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph: bool = True):
        if cuda_graph_runner is None:
            use_cuda_graph = False
        if use_cuda_graph:
            logits = cuda_graph_runner(cur_token, position_ids, cache_position)
        else:
            # custom_stream = torch.cuda.Stream()
            if torch.cuda.is_available():
                torch.cuda.set_device(torch_device)
            elif torch.xpu.is_available():
                torch.xpu.set_device(torch_device)
            else:
                raise RuntimeError(f"The device: {torch_device} is not available")
            inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to(torch_device)
            # with torch.cuda.stream(custom_stream):
            logits=model(inputs_embeds=inputs_embeds,
                        position_ids=position_ids,
                        cache_position=cache_position,
                        past_key_values=past_key_values,
                        return_dict=False, use_cache=True)[0]
        if past_key_values != None and isinstance(past_key_values, StaticCache):
            past_key_values.change_seq_length(1)
        sync_all_device(all_cuda_device)
        # print(logits)
        next_token_scores = logits_warper(inputs, logits[:, -1, :])
        if generation_config.do_sample:
            probs = nn.functional.softmax(next_token_scores, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
        else:
            next_token = torch.argmax(next_token_scores, dim=-1)
        return next_token
    
    # TODO: use CUDA Graph for chunk prefill, may get small improvement
    def chunk_prefill(inputs, cache_position, past_key_values):
        if mode == "long_context":
            inputs_embeds = model.model.embed_tokens(inputs.to("cpu"))
        else:
            inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
        if use_flashinfer_mla:
            MLAWrapperSingleton.update_buffer(past_key_values.max_pages)
            MLAWrapperSingleton.need_plan_all()
            
        logits = model(
            inputs_embeds = inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
        )[0][:,-1,:].unsqueeze(0).clone().to(torch_device)
        
        return logits

    if torch.cuda.is_available():
        torch.cuda.set_device(torch_device)
    elif torch.xpu.is_available():
        torch.xpu.set_device(torch_device)
    else:
        raise RuntimeError(f"The device: {torch_device} is not available")

    with torch.no_grad():
        stream = SilentCaptureStreamer(tokenizer)

        if torch.xpu.is_available():
            from ipex_llm.transformers.kv import DynamicUnbalancedFp8Cache, DynamicNormalCache
            if model.config.architectures[0] in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
                past_key_values = DynamicUnbalancedFp8Cache.from_legacy_cache(None)
            else:
                past_key_values = DynamicNormalCache.from_legacy_cache(None)
        elif mode != 'long_context':
            past_key_values = StaticCache(
                config = model.config, max_batch_size = 1, max_cache_len = seq_length + max_new_tokens, device = device_map, dtype = model.dtype
            )
        else:
            past_key_values = None
        
        generation_config, model_kwargs = model._prepare_generation_config(
            None, do_sample=True
            # change this to modify generate config
            #top_k=5, top_p=0.85, temperature=0.1
        )

        logits_warper = tf_logits_warper(generation_config)

        cache_position = torch.arange(seq_length, device=torch_device, dtype=torch.int32)
        generated_ids = torch.zeros(
            batch_size, seq_length + max_new_tokens + 1, dtype=torch.int, device=torch_device
        )
        generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
        start_time = time.time()

        chunk_start = 0
        while chunk_start < seq_length:
            chunk_end = min(chunk_start + chunk_size, seq_length)
            if past_key_values != None:
                past_key_values.cur_idx=cache_position[chunk_start:chunk_end]
            logits = chunk_prefill(inputs[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end], past_key_values)
            chunk_start += chunk_size

        next_token_scores = logits_warper(inputs, logits[:, -1, :])
        if generation_config.do_sample:
            probs = nn.functional.softmax(next_token_scores, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
        else:
            next_token = torch.argmax(next_token_scores, dim=-1)
            
        # decoded_first = tokenizer.decode(next_token)
        # print(f"\n[DEBUG] first token id={next_token.item()} decoded='{decoded_first}'\n")

        first_token_time = time.time() - start_time
        
        if use_flashinfer_mla:
            MLAWrapperSingleton.reset_buffer()

        prefill_count = seq_length
        prefill_time = first_token_time
        if force_think:
            print("<think>")
        print(stream.put(next_token.item()), end="", flush=True)
        generated_ids[:, seq_length] = next_token
        tokens.append(int(next_token))
        inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
        cache_position = torch.tensor([seq_length], device=torch_device, dtype=torch.int32)
        position_ids = cache_position.unsqueeze(0)
        seq_length += 1
        
        cuda_graph_runner = None
            
        start_time = time.time()
        for i in range(1, max_new_tokens):
            if use_flashinfer_mla:
                MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,None,
                                             num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size,
                                             model.model.layers[0].self_attn.softmax_scale, torch.bfloat16, torch.bfloat16)
            global warm_uped
            if use_cuda_graph and ( (warm_uped == True and int(i) == 1) or (warm_uped == False and int(i) == 2) ):
                warm_uped = True
                cuda_graph_runner = CUDAGraphRunner()
                cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, torch_device, return_dict=False, use_cache=True)
            next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph).to(torch_device)
            inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
            generated_ids[:, cache_position] = next_token.int()
            tokens.append(int(next_token))
            seq_length += 1
            
            if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>':
                print(stream.end(), end="", flush=True)
                break
            else:
                print(stream.put(next_token.item()), end="", flush=True)
            cache_position += 1
            position_ids = cache_position.unsqueeze(0)

        stream.end()
        return stream.getvalue()


================================================
FILE: kt-sft/ktransformers/util/vendors.py
================================================
from __future__ import annotations

from enum import IntEnum, auto
from typing import Optional, Union, List
import torch

class GPUVendor(IntEnum):
    NVIDIA = auto()
    AMD = auto()
    MooreThreads = auto()
    MetaX = auto()
    MUSA = auto()
    Unknown = auto()

class DeviceManager:
    """
    Device manager that provides a unified interface for handling different GPU vendors
    """
    def __init__(self):
        self.gpu_vendor = self._detect_gpu_vendor()
        self.available_devices = self._get_available_devices()
    
    def _detect_gpu_vendor(self) -> GPUVendor:
        """Detect GPU vendor type"""
        if not torch.cuda.is_available():
            # Check MUSA availability (assuming a musa module exists)
            try:
                import musa
                if musa.is_available():
                    return GPUVendor.MUSA
            except (ImportError, AttributeError):
                pass
            
            return GPUVendor.Unknown
        
        device_name = torch.cuda.get_device_name(0).lower()
        
        if any(name in device_name for name in ["nvidia", "geforce", "quadro", "tesla", "titan", "rtx", "gtx"]):
            return GPUVendor.NVIDIA
        elif any(name in device_name for name in ["amd", "radeon", "rx", "vega", "instinct", "firepro", "mi"]):
            return GPUVendor.AMD
        elif any(name in device_name for name in ["mthreads", "moore", "mtt"]):
            return GPUVendor.MooreThreads
        elif any(name in device_name for name in ["metax", "meta"]):
            return GPUVendor.MetaX
        elif "musa" in device_name:
            return GPUVendor.MUSA
        
        # Backend check
        try:
            if hasattr(torch.version, 'hip') and torch.version.hip is not None:
                return GPUVendor.AMD
            elif hasattr(torch.version, 'cuda') and torch.version.cuda is not None:
                return GPUVendor.NVIDIA
        except:
            pass
            
        return GPUVendor.Unknown
    
    def _get_available_devices(self) -> List[int]:
        """Get list of available device indices"""
        devices = []
        
        if self.gpu_vendor == GPUVendor.NVIDIA or self.gpu_vendor == GPUVendor.AMD:
            devices = list(range(torch.cuda.device_count()))
        elif self.gpu_vendor == GPUVendor.MUSA:
            try:
                import musa
                devices = list(range(musa.device_count()))
            except (ImportError, AttributeError):
                pass
            
        return devices
    
    def get_device_str(self, device_id: Union[int, str]) -> str:
        """
        Get device string for the given device ID
        
        Args:
            device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
            
        Returns:
            Device string representation (e.g., "cuda:0", "musa:1", "cpu")
        """
        if device_id == -1 or device_id == "cpu":
            return "cpu"
            
        if isinstance(device_id, int):
            if self.gpu_vendor == GPUVendor.NVIDIA or self.gpu_vendor == GPUVendor.AMD:
                if device_id < torch.cuda.device_count():
                    return f"cuda:{device_id}"
            elif self.gpu_vendor == GPUVendor.MUSA:
                try:
                    import musa
                    if device_id < musa.device_count():
                        return f"musa:{device_id}"
                except (ImportError, AttributeError):
                    pass
        
        return "cpu"
    
    def to_torch_device(self, device_id: Union[int, str] = 0) -> torch.device:
        """
        Convert device ID to torch.device object
        
        Args:
            device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
            
        Returns:
            torch.device object
        """
        device_str = self.get_device_str(device_id)
        
        # Handle MUSA device
        if device_str.startswith("musa:"):
            try:
                import musa
                index = int(device_str.split(":")[-1])
                return musa.device(index)
            except (ImportError, ValueError, AttributeError):
                return torch.device("cpu")
        
        # Standard PyTorch device
        return torch.device(device_str)
    
    def move_tensor_to_device(self, tensor: torch.Tensor, device_id: Union[int, str] = 0) -> torch.Tensor:
        """
        Move tensor to specified device
        
        Args:
            tensor: PyTorch tensor to move
            device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
            
        Returns:
            Tensor moved to the specified device
        """
        device = self.to_torch_device(device_id)
        return tensor.to(device)
    
    def is_available(self, index: int = 0) -> bool:
        """
        Check if device at specified index is available
        
        Args:
            index: Device index to check
            
        Returns:
            True if the device is available, False otherwise
        """
        if index < 0:
            return True  # CPU is always available
            
        return index in self.available_devices
    
    def get_all_devices(self) -> List[int]:
        """
        Get all available device indices
        
        Returns:
            List of available device indices (0, 1, 2, etc.)
        """
        return self.available_devices

# Create global device manager instance
device_manager = DeviceManager()

# Convenience functions
def get_device(device_id: Union[int, str] = 0) -> torch.device:
    """
    Get torch.device object for the specified device ID
    
    Args:
        device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
        
    Returns:
        torch.device object
    """
    return device_manager.to_torch_device(device_id)

def to_device(tensor: torch.Tensor, device_id: Union[int, str] = 0) -> torch.Tensor:
    """
    Move tensor to specified device
    
    Args:
        tensor: PyTorch tensor to move
        device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
        
    Returns:
        Tensor moved to the specified device
    """
    return device_manager.move_tensor_to_device(tensor, device_id)

# Get devices
cpu_device = get_device(-1)        # CPU using index -1
cpu_device2 = get_device("cpu")    # CPU using string "cpu"
gpu0 = get_device(0)               # First GPU

# Move tensors
x = torch.randn(3, 3)
x_gpu = to_device(x, 0)            # Move to first GPU
x_cpu1 = to_device(x, -1)          # Move to CPU using index -1
x_cpu2 = to_device(x, "cpu")       # Move to CPU using string "cpu"

================================================
FILE: kt-sft/ktransformers/util/weight_loader.py
================================================
from abc import ABC, abstractmethod
import os
import torch
import numpy as np
from safetensors import safe_open
from typing import Dict, Any, Optional, Union

class ModelLoader(ABC):
    """
    Abstract base class for model loaders.
    Defines the interface that all model loaders must implement.
    """
    
    @abstractmethod
    def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
        """
        Load a tensor by name.
        
        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to
            
        Returns:
            The loaded tensor
        """
        pass
    
    @classmethod
    @abstractmethod
    def supports_format(cls, path: str) -> bool:
        """
        Check if this loader supports the given path format.
        
        Args:
            path: Path to check
            
        Returns:
            True if this loader supports the given path, False otherwise
        """
        pass


class SafeTensorLoader(ModelLoader):
    """
    Loader for SafeTensor format models.
    """
    
    def __init__(self, path: str):
        """
        Initialize the SafeTensor loader.
        
        Args:
            path: Path to the model directory or file
        """
        self.tensor_file_map = {}  # Maps tensor names to file paths
        self.file_handle_map = {}  # Maps file names to file handles
        self._load_tensor_file_map(path)
    
    def _load_tensor_file_map(self, path: str) -> None:
        """
        Load the tensor file map from the given path.
        
        Args:
            path: Path to the model directory or file
        """
        # Normalize path to directory
        if not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")
        if os.path.isfile(path):
            folder_path = os.path.dirname(path)
        else:
            folder_path = path

        found_safetensor = False
        for root, _, files in os.walk(folder_path):
            files = sorted(files)
            for file in files:
                if file.endswith(".safetensors"):
                    found_safetensor = True
                    file_path = os.path.join(root, file)
                    if file not in self.file_handle_map:
                        try:
                            handle = safe_open(file_path, framework="pt")
                            self.file_handle_map[file] = handle
                        except Exception as e:
                            print(f"Error opening Safetensor file {file_path}: {e}")
                            continue

                    f = self.file_handle_map.get(file)
                    if f is None:
                        continue
                    try:
                        for key in f.keys():
                            self.tensor_file_map[key] = file
                    except Exception as e:
                        print(f"Error reading Safetensor file {file_path}: {e}")

        if not found_safetensor:
            # Not raising an error here allows for the factory to try other loaders
            print(f"No Safetensor files found in {folder_path}")
    
    def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
        """
        Load a tensor by name.
        
        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to
            
        Returns:
            The loaded tensor
        """
        if name not in self.tensor_file_map:
            raise KeyError(f"Key {name} not found in Safetensor files")
        file = self.tensor_file_map[name]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(name)
        return tensor.to(device)
    
    def load_dequantized_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
        """
        Load and dequantize a tensor.
        
        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to
            
        Returns:
            The dequantized tensor
        """
        if name not in self.tensor_file_map:
            raise KeyError(f"Key {name} not found in Safetensor files")
        file = self.tensor_file_map[name]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(name).to(device)
        if name.endswith(".weight"):
            if name[:-7] + ".weight_scale_inv" in self.tensor_file_map:
                weight_scale_inv = f.get_tensor(name[:-7] + ".weight_scale_inv").to(device)
                # Assuming weight_dequant function is imported
                from ktransformers.ktransformers_ext.triton.fp8gemm import weight_dequant
                tensor = weight_dequant(tensor, weight_scale_inv)
        return tensor.to(device)
    
    def close_all_handles(self) -> None:
        """
        Close all file handles.
        """
        for handle in self.file_handle_map.values():
            handle.close()
        self.file_handle_map.clear()

    @classmethod
    def supports_format(cls, path: str) -> bool:
        """
        Check if this loader supports the given path format.
        
        Args:
            path: Path to check
            
        Returns:
            True if safetensor files are found in the path, False otherwise
        """
        # Normalize path to directory
        if not os.path.exists(path):
            return False
        if os.path.isfile(path):
            if path.endswith(".safetensors"):
                return True
            folder_path = os.path.dirname(path)
        else:
            folder_path = path
            
        # Check if any safetensor files exist in the folder
        for root, _, files in os.walk(folder_path):
            for file in files:
                if file.endswith(".safetensors"):
                    return True
        return False


class GGUFLoader(ModelLoader):
    """
    Loader for GGUF format models.
    """
    
    def __init__(self, path: str):
        """
        Initialize the GGUF loader.
        
        Args:
            path: Path to the model directory or file
        """
        # Check if path exists
        if not os.path.exists(path):
            raise FileNotFoundError(f"GGUF dir not found: {path}")
        if os.path.isfile(path):
            self.gguf_path = os.path.dirname(path)
        else:
            self.gguf_path = path
            
        self.tensor_info = {}  # Stores tensor metadata
        self.tensor_file_map = {}  # Maps tensor names to file paths
        self.file_data_map = {}  # Maps file paths to memory-mapped data
        self.gguf_file_meta = {}  # Stores GGUF metadata
        
        # For compatibility with the factory pattern
        self.safetensor_loader = None
        
        # Scan all GGUF files in the directory
        found_gguf = False
        for root, _, files in os.walk(self.gguf_path):
            for file in files:
                if file.endswith(".gguf"):
                    found_gguf = True
                    file_path = os.path.join(root, file)
                    with open(file_path, "rb") as f:
                        self._load_gguf(f)
                        if file_path not in self.file_data_map:
                            self.file_data_map[file_path] = np.memmap(file_path, mode='r')
        
        if not found_gguf:
            raise FileNotFoundError(f"Cannot find any .gguf files in: {self.gguf_path}")
    
    def _load_gguf(self, f) -> None:
        """
        Load GGUF file metadata and tensor info.
        
        Args:
            f: File handle of the GGUF file
        """
        # Implementation should follow the original GGUFLoader._load_gguf
        # This is a simplified version for illustration
        f.seek(0)
        assert f.read(4) == b'GGUF'
        
        # Read header
        values = struct.unpack("<IQQ", f.read(4+8+8))
        version, n_tensors, n_kv = values
        if version != 3:
            warnings.warn(f"Version {version} has never been tested, might not work")

        # Read key-value pairs
        info = {}
        for _ in range(n_kv):
            name = self._read_value(f, 8)  # DATA_TYPES["string"]
            data_type = struct.unpack("<I", f.read(4))[0]
            info[name] = self._read_value(f, data_type)

        # Read tensor info
        tensor_info = {}
        for _ in range(n_tensors):
            name = self._read_value(f, 8)  # DATA_TYPES["string"]
            shape_len = self._read_value(f, 4)  # DATA_TYPES["uint32"]
            shape = [self._read_value(f, 10) for _ in range(shape_len)]  # DATA_TYPES["uint64"]
            ggml_type = self._read_value(f, 4)  # DATA_TYPES["uint32"]
            offset = self._read_value(f, 10)  # DATA_TYPES["uint64"]
            
            # Additional tensor metadata would be calculated here
            # For brevity, we're omitting the detailed tensor metadata calculation
            tensor_info[name] = {
                "ggml_type": ggml_type,
                "shape": shape,
                "offset": offset,
                # ... other tensor metadata
            }
            
        start = f.tell()
        alignment = info.get("general.alignment", 32)
        
        # Calculate actual file offsets
        for t in tensor_info.values():
            offset = start + t["offset"]
            offset += (alignment - offset % alignment) % alignment
            t["offset"] = offset
            
        # Update file maps
        for name in tensor_info:
            self.tensor_file_map[name] = f.name
            
        self.tensor_info.update(tensor_info)
        self.gguf_file_meta.update(info)
    
    def _read_value(self, f, data_type) -> Any:
        """
        Read a value from the file according to its data type.
        
        Args:
            f: File handle
            data_type: Type of data to read
            
        Returns:
            The read value
        """
        # Simplified implementation
        # In a complete implementation, this would handle all data types
        if data_type == 8:  # DATA_TYPES["string"]
            length = struct.unpack("<Q", f.read(8))[0]
            return f.read(length).decode("utf-8")
        elif data_type == 4:  # DATA_TYPES["uint32"]
            return struct.unpack("<I", f.read(4))[0]
        elif data_type == 10:  # DATA_TYPES["uint64"]
            return struct.unpack("<Q", f.read(8))[0]
        # ... handling for other data types
        return None
    
    def load_tensor(self, name: str, device: str = "cpu") -> torch.Tensor:
        """
        Load a tensor by name.
        
        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to
            
        Returns:
            The loaded tensor
        """
        # This should call load_gguf_tensor with the appropriate parameters
        return self.load_gguf_tensor(name, device)
    
    def load_gguf_tensor(self, name: str, device: str = "cpu", target_dtype = None) -> torch.Tensor:
        """
        Load a GGUF tensor by name.
        
        Args:
            name: Name of the tensor to load
            device: Device to load the tensor to
            target_dtype: Target data type for the tensor
            
        Returns:
            The loaded tensor
        """
        # Implementation would follow the original GGUFLoader.load_gguf_tensor
        # This is a placeholder for illustration
        if name not in self.tensor_info:
            raise KeyError(f"Tensor {name} not found")
            
        # Actual implementation would dequantize the tensor data
        # and return a torch.Tensor
        return torch.zeros(1, device=device)  # Placeholder
    
    @classmethod
    def supports_format(cls, path: str) -> bool:
        """
        Check if this loader supports the given path format.
        
        Args:
            path: Path to check
            
        Returns:
            True if GGUF files are found in the path, False otherwise
        """
        # Normalize path to directory
        if not os.path.exists(path):
            return False
        if os.path.isfile(path):
            return path.endswith(".gguf")
        
        # Check if any GGUF files exist in the folder
        for root, _, files in os.walk(path):
            for file in files:
                if file.endswith(".gguf"):
                    return True
        return False

================================================
FILE: kt-sft/ktransformers/website/.browserslistrc
================================================
> 1%
last 2 versions
not dead
not ie 11


================================================
FILE: kt-sft/ktransformers/website/.eslintrc.js
================================================
module.exports = {
  root: true,
  env: {
    node: true
  },
  'extends': [
    'plugin:vue/vue3-essential',
    'eslint:recommended',
    '@vue/typescript/recommended'
  ],
  parserOptions: {
    ecmaVersion: 2020
  },
  rules: {
    'no-console': process.env.NODE_ENV === 'production' ? 'warn' : 'off',
    'no-debugger': process.env.NODE_ENV === 'production' ? 'warn' : 'off'
  },
  overrides: [
    {
      files: [
        '**/__tests__/*.{j,t}s?(x)',
        '**/tests/unit/**/*.spec.{j,t}s?(x)'
      ],
      env: {
        jest: true
      }
    }
  ]
}


================================================
FILE: kt-sft/ktransformers/website/.gitignore
================================================
.DS_Store
node_modules
/dist


# local env files
.env.local
.env.*.local

# Log files
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*

# Editor directories and files
.idea
.vscode
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?


================================================
FILE: kt-sft/ktransformers/website/README.md
================================================
# 

## Project setup
```
npm install
```

### Compiles and hot-reloads for development
```
npm run serve
```

### Compiles and minifies for production
```
npm run build
```

### Run your unit tests
```
npm run test:unit
```

### Lints and fixes files
```
npm run lint
```

### Customize configuration
See [Configuration Reference](https://cli.vuejs.org/config/).


================================================
FILE: kt-sft/ktransformers/website/config.d.ts
================================================
declare module '*.js' {
    const config: {
      apiUrl: string;
      port:number;
    };
    export { config };
  }

================================================
FILE: kt-sft/ktransformers/website/jest.config.js
================================================
module.exports = {
  preset: '@vue/cli-plugin-unit-jest/presets/typescript'
}


================================================
FILE: kt-sft/ktransformers/website/package.json
================================================
{
  "name": "",
  "version": "",
  "private": true,
  "scripts": {
    "serve": "vue-cli-service serve",
    "build": "vue-cli-service build",
    "test:unit": "vue-cli-service test:unit",
    "lint": "vue-cli-service lint"
  },
  "dependencies": {
    "@types/pdfjs-dist": "^2.10.378",
    "@types/websocket": "^1.0.10",
    "@vue/cli": "^5.0.8",
    "ant-design-vue": "^4.2.1",
    "apexcharts": "^3.49.1",
    "axios": "^1.7.0",
    "axios-extensions": "^3.1.6",
    "better-scroll": "^2.5.1",
    "element-plus": "^2.7.3",
    "marked": "^12.0.2",
    "marked-highlight": "^2.1.1",
    "pdf-lib": "^1.17.1",
    "pdfobject": "^2.3.0",
    "v-clipboard": "^3.0.0-next.1",
    "vue": "^3.4.27",
    "vue-i18n": "^9.13.1",
    "vue-pdf": "^4.3.0",
    "vue-router": "^4.0.3",
    "vue3-apexcharts": "^1.5.3",
    "vuex": "^4.0.0",
    "webpack": "^5.91.0",
    "webpack-cli": "^5.1.4",
    "websocket": "^1.0.35"
  },
  "devDependencies": {
    "@types/jest": "^27.0.1",
    "@types/pdfobject": "^2.2.5",
    "@typescript-eslint/eslint-plugin": "^5.4.0",
    "@typescript-eslint/parser": "^5.4.0",
    "@vue/cli-plugin-eslint": "~5.0.0",
    "@vue/cli-plugin-router": "~5.0.0",
    "@vue/cli-plugin-typescript": "~5.0.0",
    "@vue/cli-plugin-unit-jest": "~5.0.0",
    "@vue/cli-plugin-vuex": "~5.0.0",
    "@vue/cli-service": "~5.0.0",
    "@vue/eslint-config-typescript": "^9.1.0",
    "@vue/test-utils": "^2.0.0-0",
    "@vue/vue3-jest": "^27.0.0-alpha.1",
    "babel-jest": "^27.0.6",
    "eslint": "^7.32.0",
    "eslint-plugin-vue": "^8.0.3",
    "jest": "^27.0.5",
    "stylus": "^0.55.0",
    "stylus-loader": "^6.1.0",
    "ts-jest": "^27.0.4",
    "typescript": "~4.5.5"
  },
  "_id": "@",
  "readme": "ERROR: No README data found!"
}


================================================
FILE: kt-sft/ktransformers/website/public/config.js
================================================
window.configWeb = {
    apiUrl: 'http://119.255.238.12:15670/v1',
    port: 8080,
  };

================================================
FILE: kt-sft/ktransformers/website/public/css/reset.css
================================================
html, body, div, span, applet, object, iframe,
h1, h2, h3, h4, h5, h6, p, blockquote, pre,
a, abbr, acronym, address, big, cite, code,
del, dfn, em, img, ins, kbd, q, s, samp,
small, strike, strong, sub, sup, tt, var,
b, u, i, center,
dl, dt, dd, ol, ul, li,
fieldset, form, label, legend,textarea,
table, caption, tbody, tfoot, thead, tr, th, td,
article, aside, canvas, details, embed,
figure, figcaption, footer, header, hgroup,
menu, nav, output, ruby, section, summary,
time, mark, audio, video {
    margin: 0;
    padding: 0;
    border: 0;
    font-size: 100%;
    *font: inherit;
    font-family: Arial, Microsoft YaHei, SimHei, Tahoma, sans-serif !important;
    vertical-align: baseline;
}
/* HTML5 display-role reset for older browsers */
article, aside, details, figcaption, figure,
footer, header, hgroup, menu, nav, section {
    display: block;
}
body {
    line-height: 1;
    -webkit-text-size-adjust: 100%!important;
    margin: 0;
}
html,body {
    height: 100%;
    width: 100%;
    overflow: hidden;
}
ol, ul {
    list-style: none;
}
blockquote, q {
    quotes: none;
}
blockquote:before, blockquote:after,
q:before, q:after {
    content: '';
    content: none;
}
table {
    border-collapse: collapse;
    border-spacing: 0;
}

.clearfix:before,
.clearfix:after {
    content:"";
    display:table
}
.clearfix:after {
    clear:both
}

/*显示省略号*/
.ellipsis{
    overflow: hidden;
    text-overflow: ellipsis;
    white-space: nowrap;
}


================================================
FILE: kt-sft/ktransformers/website/public/index.html
================================================
<!DOCTYPE html>
<html lang="">
  <head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=1.0,minimum-scale=1.0,user-scalable=no">
    <script src="./config.js"></script>
    <link rel="icon" href="./balck.ico" />
    <link type="text/css" rel="stylesheet" href="<%= BASE_URL %>/css/reset.css">
    <title>KTransformers</title>
  </head>
  <body onselectstart='return false' onselect='return false'>
    <noscript>
      <strong>We're sorry but <%= htmlWebpackPlugin.options.title %> doesn't work properly without JavaScript enabled. Please enable it to continue.</strong>
    </noscript>
    <div id="app"></div>
    <!-- built files will be auto injected -->
  </body>
</html>


================================================
FILE: kt-sft/ktransformers/website/src/App.vue
================================================
<template>
  <div class="app-container" @contextmenu.prevent.stop="">
    <keep-alive>
      <router-view/>
    </keep-alive>
  </div>
</template>

<script setup lang="ts">
</script>

<style lang="stylus">
  @import "assets/iconfont/iconfont.css"
  #app
  .app-container
    width: 100%
    height: 100%
    position: relative
</style>

================================================
FILE: kt-sft/ktransformers/website/src/api/api-client.ts
================================================
import axios, { AxiosInstance } from 'axios';
import {baseURL} from '@/conf/config';
const apiClient: AxiosInstance = axios.create({
    baseURL: baseURL,
    // baseURL: '/api',
    headers: {
        'Content-Type': 'application/json',
    },
    withCredentials: true,
});
export default apiClient;


================================================
FILE: kt-sft/ktransformers/website/src/api/assistant.ts
================================================
import apiClient from './api-client';
import { IAssistant,IDeleteResult, IAssistantWithStatus } from '../utils/types';
function filterAndConvert(
    assistantsWithStatus: IAssistantWithStatus[],
    statusCondition: string
  ): IAssistant[] {
    return assistantsWithStatus
      .filter((assistant) => assistant.build_status.status === statusCondition)
      .map(({ build_status, ...rest }) => rest);
  }

interface IAssistantData {
    model: string;
    prefix_system_prompt?: string;
    suffix_system_prompt?: string;
    name?: string;
    description?: string;
    tools?: any[];
    tool_resources?: object;
    metadata?:{[key:string]:any}
    top_p?: number;
    temperature?: number;
    response_format?: string;
    instructions?: string;
}

export const createAssistant = async (data: IAssistantData): Promise<IAssistant> => {
    const assistant_data: {
        model: string;
        instructions?: string;
        name?: string;
        description?: string;
        tools?: any[];
        tool_resources?: object;
        metadata?:{[key:string]:any}
        top_p?: number;
        temperature?: number;
        response_format?: string;
    } = {
        model: data.model
    };

    if (data.prefix_system_prompt) {
        assistant_data.instructions = data.prefix_system_prompt;
    }
    if (data.suffix_system_prompt) {
        assistant_data.instructions = data.suffix_system_prompt;
    }
    if (data.name) {
        assistant_data.name = data.name;
    }
    if (data.description) {
        assistant_data.description = data.description;
    }
    if (data.tools) {
        assistant_data.tools = data.tools;
    }
    if (data.tool_resources) {
        assistant_data.tool_resources = data.tool_resources;
    }
    if (data.metadata) {
        assistant_data.metadata = data.metadata
    }
    if (typeof data.top_p !== 'undefined') {
        assistant_data.top_p = data.top_p;
    }
    if (typeof data.temperature !== 'undefined') {
        assistant_data.temperature = data.temperature;
    }
    if (data.response_format) {
        assistant_data.response_format = data.response_format;
    }
    if (data.instructions) {
        assistant_data.instructions = data.instructions;
    }
    console.log(assistant_data)
    const response = await apiClient.post<IAssistant>(
        '/assistants/',
        assistant_data
    );
    console.log("response", response)
    return response.data;
};


export const listAssistants = async (
    limit?: number,
    order?: string,
    after?: string,
    before?: string,
    run_id?: string,
): Promise<IAssistant[]> => {
    const params: {
        limit?: number,
        order?: string,
        after?: string,
        before?: string,
        run_id?: string
    } = {};

    if (typeof limit !== 'undefined') {
        params.limit = limit;
    }
    if (typeof order !== 'undefined') {
        params.order = order;
    }
    if (typeof after !== 'undefined') {
        params.after = after;
    }
    if (typeof before !== 'undefined') {
        params.before = before;
    }
    if (typeof run_id !== 'undefined') {
        params.run_id = run_id;
    }
    const response = await apiClient.get<IAssistantWithStatus[]>('/assistants/status', {
        params
    });
    let tmp = response.data
    let result = [] as IAssistant[]
    const filteredAssistants = filterAndConvert(tmp, 'completed');
    return filteredAssistants
};

export const getAssistant = async (
    assistant_id: string
): Promise<IAssistant> => {
    const response = await apiClient.get<IAssistant>(`/assistants/${assistant_id}`);
    return response.data;
}

export const deleteAssistant = async (
    assistant_id: string
): Promise<IDeleteResult> => {
    const response = await apiClient.delete<IDeleteResult>(`/assistants/${assistant_id}`);
    return response.data;
}

export const getRelatedThreadId = async (
    assistant_id: string
): Promise<string[]> => {
    const response = await apiClient.get<string[]>(`/assistants/${assistant_id}/related_thread`);
    return response.data;
}

export const listAssistantsWithStatus = async (
    limit?: number,
    order?: string,
    after?: string,
    before?: string,
    run_id?: string,
): Promise<IAssistantWithStatus[]> => {
    const params: {
        limit?: number,
        order?: string,
        after?: string,
        before?: string,
        run_id?: string
    } = {};

    if (typeof limit !== 'undefined') {
        params.limit = limit;
    }
    if (typeof order !== 'undefined') {
        params.order = order;
    }
    if (typeof after !== 'undefined') {
        params.after = after;
    }
    if (typeof before !== 'undefined') {
        params.before = before;
    }
    if (typeof run_id !== 'undefined') {
        params.run_id = run_id;
    }
    console.log(params)
    const response = await apiClient.get<IAssistantWithStatus[]>('/assistants/status', {
        params
    });

    return response.data;
};


================================================
FILE: kt-sft/ktransformers/website/src/api/message.ts
================================================
import apiClient from './api-client';
import { IMessage,IDeleteResult } from '../utils/types';

export const createMessage = async (
    thread_id: string,
    content: string,
    role?: string,
    attachments?: any[],
    metadata?:{[key:string]:any}
): Promise<IMessage> => {
    const message_data: {
        content: string;
        role?: string;
        attachments?: any[];
        metadata?:{[key:string]:any}
    } = {
        content,
    };

    if (metadata) {
        message_data.metadata = metadata;
    }
    if (role) {
        message_data.role = role;
    }
    if (attachments) {
        message_data.attachments = attachments;
    }
    const response = await apiClient.post<IMessage>(`/threads/${thread_id}/messages`, message_data);
    return response.data;
};


export const listMessages = async (
    thread_id: string,
    limit?: number,
    order?: string,
    after?: string,
    before?: string,
    run_id?: string,
): Promise<IMessage[]> => {
    const params: {
        limit?: number,
        order?: string,
        after?: string,
        before?: string,
        run_id?: string
    } = {};

    if (typeof limit !== 'undefined') {
        params.limit = limit;
    }
    if (typeof order !== 'undefined') {
        params.order = order;
    }
    if (typeof after !== 'undefined') {
        params.after = after;
    }
    if (typeof before !== 'undefined') {
        params.before = before;
    }
    if (typeof run_id !== 'undefined') {
        params.run_id = run_id;
    }

    const response = await apiClient.get<IMessage[]>(`/threads/${thread_id}/messages`, {
        params
    });

    return response.data;
};
export const deleteMessage = async(thread_id:string, message_id:string): Promise<IDeleteResult> => {
    const response = await apiClient.delete<IDeleteResult>(`/threads/${thread_id}/messages/${message_id}`);
    return response.data;
}


================================================
FILE: kt-sft/ktransformers/website/src/api/run.ts
================================================
import apiClient from './api-client';
import { IRun } from '../utils/types';
import {baseURL} from '@/conf/config';
interface IRunData {
    assistant_id: string;
    model?: string;
    instructions?: string;
    additional_instructions?: string;
    additional_messages?: any[];
    tools?: any[];
    metadata?: { [key: string]: any }
    temperature?: number;
    top_p?: number;
    stream?: boolean;
    max_prompt_tokens?: number;
    max_completion_tokens?: number;
    truncation_strategy?: object;
    tool_choice?: string;
    response_format?: string | object;
}


export async function* createRun(
    data: IRunData,
    thread_id: string
): AsyncGenerator<string> {
    const run_data = {
        ...data, 
        assistant_id: data.assistant_id, 
    };

    const response = await fetch(`${baseURL}/threads/${thread_id}/runs`, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(run_data),
    });

    if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`);
    }

    if (!response.body) {
        throw new Error('Response body is missing');
    }
    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';
    try {
        while (true) {
            const { done, value } = await reader.read();
            if (done) return;
            buffer += decoder.decode(value, { stream: true });

            let eventIndex = buffer.indexOf("\n\n");
            while (eventIndex !== -1) {
                const event = buffer.slice(0, eventIndex);
                buffer = buffer.slice(eventIndex + 2);
                if (event.startsWith("event: thread.run.created")) {
                    const dataIndex = event.indexOf("data: ");
                    if (dataIndex !== -1) {
                        const datads = event.slice(39, 75)
                        yield datads;
                    }
                } else if (event.startsWith("event: thread.message.delta")) {
                    const dataIndex = event.indexOf("data: ");
                    if (dataIndex !== -1) {
                        const data = JSON.parse(event.slice(dataIndex + 6));
                        yield data.delta.content[0].text.value || '';
                    }
                } else if (event.startsWith("event: done")) {
                    return;
                }

                eventIndex = buffer.indexOf("\n\n");
            }
        }
    } catch (e) {

        console.error('An error occurred while reading the response stream:', e);
        // throw e; 
        return e
    }
}
// 定义取消运行的函数
export async function cancelRun(threadId: string, runId: string){
    const run_data = {
        thread_id:threadId,
        run_id:runId,
    };
    try {
        const response = await fetch(`${baseURL}/threads/${threadId}/runs/${runId}/cancel`, {
            method: 'POST',
        });

        if (!response.ok) {
            throw new Error(`HTTP error! status: ${response.status}`);
        }

        return response;
    } catch (error) {
        console.error('An error occurred while cancelling the run:', error);
        throw error;
    }
}

================================================
FILE: kt-sft/ktransformers/website/src/api/thread.ts
================================================
import apiClient from './api-client';
import { IThread, IMessage, IThreadAndMessageAndAssistant, IDeleteResult } from '../utils/types';
export const createThread = async (
    message?: IMessage,
    tool_resources?: object,
    metadata?: { [key: string]: any }
): Promise<IThread> => {
    const thread_data: { message?: object, metadata?: { [key: string]: any } } = {};
    if (message) {
        thread_data.message = message;
    }
    if (metadata) {
        thread_data.metadata = metadata;
    }
    const response = await apiClient.post<IThread>(
        '/threads',
        thread_data);
    return response.data;
};

export const listThreads = async (
    limit?: number,
    order?: string,
): Promise<IThreadAndMessageAndAssistant[]> => {
    const params: {
        limit?: number,
        order?: string,
    } = { limit, order };
    const response = await apiClient.get<IThreadAndMessageAndAssistant[]>('/threads', {
        params
    });

    return response.data;
};

export const deleteThread = async (
    thread_id: string
): Promise<IDeleteResult> => {
    const response = await apiClient.delete<IDeleteResult>(`/threads/${thread_id}`);
    return response.data;
}

export const getThread = async (
    thread_id: string
): Promise<IThread> => {
    const response = await apiClient.get<IThread>(`/threads/${thread_id}`);
    return response.data;
}

================================================
FILE: kt-sft/ktransformers/website/src/assets/css/mixins.styl
================================================

/*Define color variables*/
$bg_gray_light_normal = #F9F9F9
$bg_gray_light_hover = #E8E8E8
$bg_gray_light_active = #E8E8E8

$border_gray_light_normal = rgba(0, 0, 0, .15)
$border_gray_light_hover = #8080FF

$gray_20 = #333333
$gray_40 = #585858
$gray_50 = #7F7F7F
$gray_60 = #9F9F9F
$gray_70 = #BFBFBF
$gray_80 = #DFDFDF
$gray_85 = #F2F2F2
$gray_90 = #F7F7F7

$gray = #53525B
$gray_dark = #42414a
$gray_hover = #121212
$gray_action = #6C757D

$primary = #409eff
$primary_hover = #428bca
$primary_middle = #9DDDF9
$primary_light = #D4F0FC

$cyan = #66CCCC
$cyan_hover = #46C2C2


/*Define common modules*/
$input-duration = .25s
input-border()
  -webkit-transition: border-color ease-in-out $input-duration,-webkit-box-shadow ease-in-out $input-duration
  -o-transition: border-color ease-in-out $input-duration,box-shadow ease-in-out $input-duration
  transition: border-color ease-in-out $input-duration,box-shadow ease-in-out $input-duration
input-focus()
  border-color: #66afe9
  outline: 0
  z-index: 100
  -webkit-box-shadow: inset 0 1px 1px rgba(0,0,0,.075),0 0 8px rgba(102,175,233,.6)
  box-shadow: inset 0 1px 1px rgba(0,0,0,.075),0 0 8px rgba(102,175,233,.6)


/*Define common class*/
.flex-column
  display: -webkit-box
  display: -webkit-flex
  display: flex
  box-sizing: border-box
  -webkit-box-orient: vertical
  -webkit-box-direction: normal
  -webkit-flex-direction: column
  flex-direction: column
  height: 100%

.flex-row
  position: relative
  display: -webkit-box
  display: -ms-flexbox
  display: flex
  box-sizing: border-box
  -webkit-box-align: center
  -ms-flex-align: center
  align-items: center

.flex-unit
  -webkit-box-flex: 1
  -ms-flex: 1
  flex: 1
  // overflow: hidden

.clearfix
  &:after
    clear: both
    content: "\20"
    display: block
    height: 0
    visibility: hidden

a,a:hover
  text-decoration:none

button:focus
  outline: none

.btn
  display: inline-block
  margin-bottom: 0
  padding:0px 15px
  font-size: 14px
  height: 34px
  line-height: 32px
  float: left /*去掉inline-block之间的空格*/
  font-weight: normal
  text-align: center
  white-space: nowrap
  vertical-align: middle
  cursor: pointer
  background-image: none
  border-radius: 3px
  -webkit-user-select: none
  -moz-user-select: none
  -ms-user-select: none
  -o-user-select: none
  user-select: none
  &:hover
    .dropdown-list
      display: block
  i
    font-size: 16px
  .text
    float: right
    margin-left: 3px

.btn-gray
  color: $gray_action
  background-color: #FFFFFF
  border: 1px solid $gray_action
  &:not(.is-disabled):hover
    color: #FFFFFF
    background-color: $gray_action
    border: 1px solid $gray_action

.btn-primary
  color: #FFFFFF
  background-color: $primary
  border: 1px solid $primary
  &:not(.is-disabled):hover
    color: #FFFFFF
    background-color: $primary_hover
    border: 1px solid $primary_hover

.chat-box
  position: relative
  .chat-input
    border: 1px solid $border_gray_light_normal
    height: 48px
    line-height: 48px
    font-size: 16px
    outline: 0
    box-sizing: border-box
    padding:0 30px0 20px
    color: #7F7F7F
    width: 800px
    border-radius: 12px
    position: relative
    &:focus
      input-focus()
  i
    position: absolute
    font-size: 26px
    right: 13px
    bottom:0px
    color: $border_gray_light_normal
    z-index: 100
    cursor: pointer
    &:hover
      color: $border_gray_light_hover


================================================
FILE: kt-sft/ktransformers/website/src/assets/iconfont/demo.css
================================================
/* Logo 字体 */
@font-face {
  font-family: "iconfont logo";
  src: url('https://at.alicdn.com/t/font_985780_km7mi63cihi.eot?t=1545807318834');
  src: url('https://at.alicdn.com/t/font_985780_km7mi63cihi.eot?t=1545807318834#iefix') format('embedded-opentype'),
    url('https://at.alicdn.com/t/font_985780_km7mi63cihi.woff?t=1545807318834') format('woff'),
    url('https://at.alicdn.com/t/font_985780_km7mi63cihi.ttf?t=1545807318834') format('truetype'),
    url('https://at.alicdn.com/t/font_985780_km7mi63cihi.svg?t=1545807318834#iconfont') format('svg');
}

.logo {
  font-family: "iconfont logo";
  font-size: 160px;
  font-style: normal;
  -webkit-font-smoothing: antialiased;
  -moz-osx-font-smoothing: grayscale;
}

/* tabs */
.nav-tabs {
  position: relative;
}

.nav-tabs .nav-more {
  position: absolute;
  right: 0;
  bottom: 0;
  height: 42px;
  line-height: 42px;
  color: #666;
}

#tabs {
  border-bottom: 1px solid #eee;
}

#tabs li {
  cursor: pointer;
  width: 100px;
  height: 40px;
  line-height: 40px;
  text-align: center;
  font-size: 16px;
  border-bottom: 2px solid transparent;
  position: relative;
  z-index: 1;
  margin-bottom: -1px;
  color: #666;
}


#tabs .active {
  border-bottom-color: #f00;
  color: #222;
}

.tab-container .content {
  display: none;
}

/* 页面布局 */
.main {
  padding: 30px 100px;
  width: 960px;
  margin: 0 auto;
}

.main .logo {
  color: #333;
  text-align: left;
  margin-bottom: 30px;
  line-height: 1;
  height: 110px;
  margin-top: -50px;
  overflow: hidden;
  *zoom: 1;
}

.main .logo a {
  font-size: 160px;
  color: #333;
}

.helps {
  margin-top: 40px;
}

.helps pre {
  padding: 20px;
  margin: 10px 0;
  border: solid 1px #e7e1cd;
  background-color: #fffdef;
  overflow: auto;
}

.icon_lists {
  width: 100% !important;
  overflow: hidden;
  *zoom: 1;
}

.icon_lists li {
  width: 100px;
  margin-bottom: 10px;
  margin-right: 20px;
  text-align: center;
  list-style: none !important;
  cursor: default;
}

.icon_lists li .code-name {
  line-height: 1.2;
}

.icon_lists .icon {
  display: block;
  height: 100px;
  line-height: 100px;
  font-size: 42px;
  margin: 10px auto;
  color: #333;
  -webkit-transition: font-size 0.25s linear, width 0.25s linear;
  -moz-transition: font-size 0.25s linear, width 0.25s linear;
  transition: font-size 0.25s linear, width 0.25s linear;
}

.icon_lists .icon:hover {
  font-size: 100px;
}

.icon_lists .svg-icon {
  /* 通过设置 font-size 来改变图标大小 */
  width: 1em;
  /* 图标和文字相邻时，垂直对齐 */
  vertical-align: -0.15em;
  /* 通过设置 color 来改变 SVG 的颜色/fill */
  fill: currentColor;
  /* path 和 stroke 溢出 viewBox 部分在 IE 下会显示
      normalize.css 中也包含这行 */
  overflow: hidden;
}

.icon_lists li .name,
.icon_lists li .code-name {
  color: #666;
}

/* markdown 样式 */
.markdown {
  color: #666;
  font-size: 14px;
  line-height: 1.8;
}

.highlight {
  line-height: 1.5;
}

.markdown img {
  vertical-align: middle;
  max-width: 100%;
}

.markdown h1 {
  color: #404040;
  font-weight: 500;
  line-height: 40px;
  margin-bottom: 24px;
}

.markdown h2,
.markdown h3,
.markdown h4,
.markdown h5,
.markdown h6 {
  color: #404040;
  margin: 1.6em 0 0.6em 0;
  font-weight: 500;
  clear: both;
}

.markdown h1 {
  font-size: 28px;
}

.markdown h2 {
  font-size: 22px;
}

.markdown h3 {
  font-size: 16px;
}

.markdown h4 {
  font-size: 14px;
}

.markdown h5 {
  font-size: 12px;
}

.markdown h6 {
  font-size: 12px;
}

.markdown hr {
  height: 1px;
  border: 0;
  background: #e9e9e9;
  margin: 16px 0;
  clear: both;
}

.markdown p {
  margin: 1em 0;
}

.markdown>p,
.markdown>blockquote,
.markdown>.highlight,
.markdown>ol,
.markdown>ul {
  width: 80%;
}

.markdown ul>li {
  list-style: circle;
}

.markdown>ul li,
.markdown blockquote ul>li {
  margin-left: 20px;
  padding-left: 4px;
}

.markdown>ul li p,
.markdown>ol li p {
  margin: 0.6em 0;
}

.markdown ol>li {
  list-style: decimal;
}

.markdown>ol li,
.markdown blockquote ol>li {
  margin-left: 20px;
  padding-left: 4px;
}

.markdown code {
  margin: 0 3px;
  padding: 0 5px;
  background: #eee;
  border-radius: 3px;
}

.markdown strong,
.markdown b {
  font-weight: 600;
}

.markdown>table {
  border-collapse: collapse;
  border-spacing:0;
  empty-cells: show;
  border: 1px solid #e9e9e9;
  width: 95%;
  margin-bottom: 24px;
}

.markdown>table th {
  white-space: nowrap;
  color: #333;
  font-weight: 600;
}

.markdown>table th,
.markdown>table td {
  border: 1px solid #e9e9e9;
  padding: 8px 16px;
  text-align: left;
}

.markdown>table th {
  background: #F7F7F7;
}

.markdown blockquote {
  font-size: 90%;
  color: #999;
  border-left: 4px solid #e9e9e9;
  padding-left: 0.8em;
  margin: 1em 0;
}

.markdown blockquote p {
  margin: 0;
}

.markdown .anchor {
  opacity: 0;
  transition: opacity 0.3s ease;
  margin-left: 8px;
}

.markdown .waiting {
  color: #ccc;
}

.markdown h1:hover .anchor,
.markdown h2:hover .anchor,
.markdown h3:hover .anchor,
.markdown h4:hover .anchor,
.markdown h5:hover .anchor,
.markdown h6:hover .anchor {
  opacity: 1;
  display: inline-block;
}

.markdown>br,
.markdown>p>br {
  clear: both;
}


.hljs {
  display: block;
  background: white;
  padding: 0.5em;
  color: #333333;
  overflow-x: auto;
}

.hljs-comment,
.hljs-meta {
  color: #969896;
}

.hljs-string,
.hljs-variable,
.hljs-template-variable,
.hljs-strong,
.hljs-emphasis,
.hljs-quote {
  color: #df5000;
}

.hljs-keyword,
.hljs-selector-tag,
.hljs-type {
  color: #a71d5d;
}

.hljs-literal,
.hljs-symbol,
.hljs-bullet,
.hljs-attribute {
  color: #0086b3;
}

.hljs-section,
.hljs-name {
  color: #63a35c;
}

.hljs-tag {
  color: #333333;
}

.hljs-title,
.hljs-attr,
.hljs-selector-id,
.hljs-selector-class,
.hljs-selector-attr,
.hljs-selector-pseudo {
  color: #795da3;
}

.hljs-addition {
  color: #55a532;
  background-color: #eaffea;
}

.hljs-deletion {
  color: #bd2c00;
  background-color: #ffecec;
}

.hljs-link {
  text-decoration: underline;
}

/* 代码高亮 */
/* PrismJS 1.15.0
https://prismjs.com/download.html#themes=prism&languages=markup+css+clike+javascript */
/**
 * prism.js default theme for JavaScript, CSS and HTML
 * Based on dabblet (http://dabblet.com)
 * @author Lea Verou
 */
code[class*="language-"],
pre[class*="language-"] {
  color: black;
  background: none;
  text-shadow: 0 1px white;
  font-family: Consolas, Monaco, 'Andale Mono', 'Ubuntu Mono', monospace;
  text-align: left;
  white-space: pre;
  word-spacing: normal;
  word-break: normal;
  word-wrap: normal;
  line-height: 1.5;

  -moz-tab-size: 4;
  -o-tab-size: 4;
  tab-size: 4;

  -webkit-hyphens: none;
  -moz-hyphens: none;
  -ms-hyphens: none;
  hyphens: none;
}

pre[class*="language-"]::-moz-selection,
pre[class*="language-"] ::-moz-selection,
code[class*="language-"]::-moz-selection,
code[class*="language-"] ::-moz-selection {
  text-shadow: none;
  background: #b3d4fc;
}

pre[class*="language-"]::selection,
pre[class*="language-"] ::selection,
code[class*="language-"]::selection,
code[class*="language-"] ::selection {
  text-shadow: none;
  background: #b3d4fc;
}

@media print {

  code[class*="language-"],
  pre[class*="language-"] {
    text-shadow: none;
  }
}

/* Code blocks */
pre[class*="language-"] {
  padding: 1em;
  margin: .5em 0;
  overflow: auto;
}

:not(pre)>code[class*="language-"],
pre[class*="language-"] {
  background: #f5f2f0;
}

/* Inline code */
:not(pre)>code[class*="language-"] {
  padding: .1em;
  border-radius: .3em;
  white-space: normal;
}

.token.comment,
.token.prolog,
.token.doctype,
.token.cdata {
  color: slategray;
}

.token.punctuation {
  color: #999;
}

.namespace {
  opacity: .7;
}

.token.property,
.token.tag,
.token.boolean,
.token.number,
.token.constant,
.token.symbol,
.token.deleted {
  color: #905;
}

.token.selector,
.token.attr-name,
.token.string,
.token.char,
.token.builtin,
.token.inserted {
  color: #690;
}

.token.operator,
.token.entity,
.token.url,
.language-css .token.string,
.style .token.string {
  color: #9a6e3a;
  background: hsla(0, 0%, 100%, .5);
}

.token.atrule,
.token.attr-value,
.token.keyword {
  color: #07a;
}

.token.function,
.token.class-name {
  color: #DD4A68;
}

.token.regex,
.token.important,
.token.variable {
  color: #e90;
}

.token.important,
.token.bold {
  font-weight: bold;
}

.token.italic {
  font-style: italic;
}

.token.entity {
  cursor: help;
}


================================================
FILE: kt-sft/ktransformers/website/src/assets/iconfont/demo_index.html
================================================
<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8"/>
  <title>iconfont Demo</title>
  <link rel="shortcut icon" href="//img.alicdn.com/imgextra/i4/O1CN01Z5paLz1O0zuCC7osS_!!6000000001644-55-tps-83-82.svg" type="image/x-icon"/>
  <link rel="icon" type="image/svg+xml" href="//img.alicdn.com/imgextra/i4/O1CN01Z5paLz1O0zuCC7osS_!!6000000001644-55-tps-83-82.svg"/>
  <link rel="stylesheet" href="https://g.alicdn.com/thx/cube/1.3.2/cube.min.css">
  <link rel="stylesheet" href="demo.css">
  <link rel="stylesheet" href="iconfont.css">
  <script src="iconfont.js"></script>
  <!-- jQuery -->
  <script src="https://a1.alicdn.com/oss/uploads/2018/12/26/7bfddb60-08e8-11e9-9b04-53e73bb6408b.js"></script>
  <!-- 代码高亮 -->
  <script src="https://a1.alicdn.com/oss/uploads/2018/12/26/a3f714d0-08e6-11e9-8a15-ebf944d7534c.js"></script>
  <style>
    .main .logo {
      margin-top: 0;
      height: auto;
    }

    .main .logo a {
      display: flex;
      align-items: center;
    }

    .main .logo .sub-title {
      margin-left: 0.5em;
      font-size: 22px;
      color: #fff;
      background: linear-gradient(-45deg, #3967FF, #B500FE);
      -webkit-background-clip: text;
      -webkit-text-fill-color: transparent;
    }
  </style>
</head>
<body>
  <div class="main">
    <h1 class="logo"><a href="https://www.iconfont.cn/" title="iconfont 首页" target="_blank">
      <img width="200" src="https://img.alicdn.com/imgextra/i3/O1CN01Mn65HV1FfSEzR6DKv_!!6000000000514-55-tps-228-59.svg">
      
    </a></h1>
    <div class="nav-tabs">
      <ul id="tabs" class="dib-box">
        <li class="dib active"><span>Unicode</span></li>
        <li class="dib"><span>Font class</span></li>
        <li class="dib"><span>Symbol</span></li>
      </ul>
      
      <a href="https://www.iconfont.cn/manage/index?manage_type=myprojects&projectId=4550268" target="_blank" class="nav-more">查看项目</a>
      
    </div>
    <div class="tab-container">
      <div class="content unicode" style="display: block;">
          <ul class="icon_lists dib-box">
          
            <li class="dib">
              <span class="icon iconfont">&#xe8b0;</span>
                <div class="name">复制</div>
                <div class="code-name">&amp;#xe8b0;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe85e;</span>
                <div class="name">箭头下</div>
                <div class="code-name">&amp;#xe85e;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe651;</span>
                <div class="name">进度</div>
                <div class="code-name">&amp;#xe651;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe617;</span>
                <div class="name">环形进度条</div>
                <div class="code-name">&amp;#xe617;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe779;</span>
                <div class="name">向左1</div>
                <div class="code-name">&amp;#xe779;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe608;</span>
                <div class="name">点</div>
                <div class="code-name">&amp;#xe608;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe7dd;</span>
                <div class="name">编辑</div>
                <div class="code-name">&amp;#xe7dd;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe614;</span>
                <div class="name">删除</div>
                <div class="code-name">&amp;#xe614;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe618;</span>
                <div class="name">上传</div>
                <div class="code-name">&amp;#xe618;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe621;</span>
                <div class="name">探索-选中</div>
                <div class="code-name">&amp;#xe621;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe657;</span>
                <div class="name">ellipsis</div>
                <div class="code-name">&amp;#xe657;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe60c;</span>
                <div class="name">发送</div>
                <div class="code-name">&amp;#xe60c;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe62d;</span>
                <div class="name">列表</div>
                <div class="code-name">&amp;#xe62d;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe639;</span>
                <div class="name">列表</div>
                <div class="code-name">&amp;#xe639;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe6bd;</span>
                <div class="name">重试</div>
                <div class="code-name">&amp;#xe6bd;</div>
              </li>
          
            <li class="dib">
              <span class="icon iconfont">&#xe826;</span>
                <div class="name">Fork 记录</div>
                <div class="code-name">&amp;#xe826;</div>
              </li>
          
          </ul>
          <div class="article markdown">
          <h2 id="unicode-">Unicode 引用</h2>
          <hr>

          <p>Unicode 是字体在网页端最原始的应用方式，特点是：</p>
          <ul>
            <li>支持按字体的方式去动态调整图标大小，颜色等等。</li>
            <li>默认情况下不支持多色，直接添加多色图标会自动去色。</li>
          </ul>
          <blockquote>
            <p>注意：新版 iconfont 支持两种方式引用多色图标：SVG symbol 引用方式和彩色字体图标模式。（使用彩色字体图标需要在「编辑项目」中开启「彩色」选项后并重新生成。）</p>
          </blockquote>
          <p>Unicode 使用步骤如下：</p>
          <h3 id="-font-face">第一步：拷贝项目下面生成的 <code>@font-face</code></h3>
<pre><code class="language-css"
>@font-face {
  font-family: 'iconfont';
  src: url('iconfont.woff2?t=1717950820214') format('woff2'),
       url('iconfont.woff?t=1717950820214') format('woff'),
       url('iconfont.ttf?t=1717950820214') format('truetype'),
       url('iconfont.svg?t=1717950820214#iconfont') format('svg');
}
</code></pre>
          <h3 id="-iconfont-">第二步：定义使用 iconfont 的样式</h3>
<pre><code class="language-css"
>.iconfont {
  font-family: "iconfont" !important;
  font-size: 16px;
  font-style: normal;
  -webkit-font-smoothing: antialiased;
  -moz-osx-font-smoothing: grayscale;
}
</code></pre>
          <h3 id="-">第三步：挑选相应图标并获取字体编码，应用于页面</h3>
<pre>
<code class="language-html"
>&lt;span class="iconfont"&gt;&amp;#x33;&lt;/span&gt;
</code></pre>
          <blockquote>
            <p>"iconfont" 是你项目下的 font-family。可以通过编辑项目查看，默认是 "iconfont"。</p>
          </blockquote>
          </div>
      </div>
      <div class="content font-class">
        <ul class="icon_lists dib-box">
          
          <li class="dib">
            <span class="icon iconfont icon-copy"></span>
            <div class="name">
              复制
            </div>
            <div class="code-name">.icon-copy
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-arrow-down"></span>
            <div class="name">
              箭头下
            </div>
            <div class="code-name">.icon-arrow-down
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-usage-progress"></span>
            <div class="name">
              进度
            </div>
            <div class="code-name">.icon-usage-progress
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-gen-progress"></span>
            <div class="name">
              环形进度条
            </div>
            <div class="code-name">.icon-gen-progress
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-back"></span>
            <div class="name">
              向左1
            </div>
            <div class="code-name">.icon-back
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-point"></span>
            <div class="name">
              点
            </div>
            <div class="code-name">.icon-point
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-edit"></span>
            <div class="name">
              编辑
            </div>
            <div class="code-name">.icon-edit
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-delete"></span>
            <div class="name">
              删除
            </div>
            <div class="code-name">.icon-delete
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-upload-1"></span>
            <div class="name">
              上传
            </div>
            <div class="code-name">.icon-upload-1
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-explore"></span>
            <div class="name">
              探索-选中
            </div>
            <div class="code-name">.icon-explore
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-ellipsis"></span>
            <div class="name">
              ellipsis
            </div>
            <div class="code-name">.icon-ellipsis
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-sent"></span>
            <div class="name">
              发送
            </div>
            <div class="code-name">.icon-sent
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-list-list"></span>
            <div class="name">
              列表
            </div>
            <div class="code-name">.icon-list-list
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-list-icon"></span>
            <div class="name">
              列表
            </div>
            <div class="code-name">.icon-list-icon
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-zhongshi"></span>
            <div class="name">
              重试
            </div>
            <div class="code-name">.icon-zhongshi
            </div>
          </li>
          
          <li class="dib">
            <span class="icon iconfont icon-log"></span>
            <div class="name">
              Fork 记录
            </div>
            <div class="code-name">.icon-log
            </div>
          </li>
          
        </ul>
        <div class="article markdown">
        <h2 id="font-class-">font-class 引用</h2>
        <hr>

        <p>font-class 是 Unicode 使用方式的一种变种，主要是解决 Unicode 书写不直观，语意不明确的问题。</p>
        <p>与 Unicode 使用方式相比，具有如下特点：</p>
        <ul>
          <li>相比于 Unicode 语意明确，书写更直观。可以很容易分辨这个 icon 是什么。</li>
          <li>因为使用 class 来定义图标，所以当要替换图标时，只需要修改 class 里面的 Unicode 引用。</li>
        </ul>
        <p>使用步骤如下：</p>
        <h3 id="-fontclass-">第一步：引入项目下面生成的 fontclass 代码：</h3>
<pre><code class="language-html">&lt;link rel="stylesheet" href="./iconfont.css"&gt;
</code></pre>
        <h3 id="-">第二步：挑选相应图标并获取类名，应用于页面：</h3>
<pre><code class="language-html">&lt;span class="iconfont icon-xxx"&gt;&lt;/span&gt;
</code></pre>
        <blockquote>
          <p>"
            iconfont" 是你项目下的 font-family。可以通过编辑项目查看，默认是 "iconfont"。</p>
        </blockquote>
      </div>
      </div>
      <div class="content symbol">
          <ul class="icon_lists dib-box">
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-copy"></use>
                </svg>
                <div class="name">复制</div>
                <div class="code-name">#icon-copy</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-arrow-down"></use>
                </svg>
                <div class="name">箭头下</div>
                <div class="code-name">#icon-arrow-down</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-usage-progress"></use>
                </svg>
                <div class="name">进度</div>
                <div class="code-name">#icon-usage-progress</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-gen-progress"></use>
                </svg>
                <div class="name">环形进度条</div>
                <div class="code-name">#icon-gen-progress</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-back"></use>
                </svg>
                <div class="name">向左1</div>
                <div class="code-name">#icon-back</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-point"></use>
                </svg>
                <div class="name">点</div>
                <div class="code-name">#icon-point</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-edit"></use>
                </svg>
                <div class="name">编辑</div>
                <div class="code-name">#icon-edit</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-delete"></use>
                </svg>
                <div class="name">删除</div>
                <div class="code-name">#icon-delete</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-upload-1"></use>
                </svg>
                <div class="name">上传</div>
                <div class="code-name">#icon-upload-1</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-explore"></use>
                </svg>
                <div class="name">探索-选中</div>
                <div class="code-name">#icon-explore</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-ellipsis"></use>
                </svg>
                <div class="name">ellipsis</div>
                <div class="code-name">#icon-ellipsis</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-sent"></use>
                </svg>
                <div class="name">发送</div>
                <div class="code-name">#icon-sent</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-list-list"></use>
                </svg>
                <div class="name">列表</div>
                <div class="code-name">#icon-list-list</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-list-icon"></use>
                </svg>
                <div class="name">列表</div>
                <div class="code-name">#icon-list-icon</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-zhongshi"></use>
                </svg>
                <div class="name">重试</div>
                <div class="code-name">#icon-zhongshi</div>
            </li>
          
            <li class="dib">
                <svg class="icon svg-icon" aria-hidden="true">
                  <use xlink:href="#icon-log"></use>
                </svg>
                <div class="name">Fork 记录</div>
                <div class="code-name">#icon-log</div>
            </li>
          
          </ul>
          <div class="article markdown">
          <h2 id="symbol-">Symbol 引用</h2>
          <hr>

          <p>这是一种全新的使用方式，应该说这才是未来的主流，也是平台目前推荐的用法。相关介绍可以参考这篇<a href="">文章</a>
            这种用法其实是做了一个 SVG 的集合，与另外两种相比具有如下特点：</p>
          <ul>
            <li>支持多色图标了，不再受单色限制。</li>
            <li>通过一些技巧，支持像字体那样，通过 <code>font-size</code>, <code>color</code> 来调整样式。</li>
            <li>兼容性较差，支持 IE9+，及现代浏览器。</li>
            <li>浏览器渲染 SVG 的性能一般，还不如 png。</li>
          </ul>
          <p>使用步骤如下：</p>
          <h3 id="-symbol-">第一步：引入项目下面生成的 symbol 代码：</h3>
<pre><code class="language-html">&lt;script src="./iconfont.js"&gt;&lt;/script&gt;
</code></pre>
          <h3 id="-css-">第二步：加入通用 CSS 代码（引入一次就行）：</h3>
<pre><code class="language-html">&lt;style&gt;
.icon {
  width: 1em;
  height: 1em;
  vertical-align: -0.15em;
  fill: currentColor;
  overflow: hidden;
}
&lt;/style&gt;
</code></pre>
          <h3 id="-">第三步：挑选相应图标并获取类名，应用于页面：</h3>
<pre><code class="language-html">&lt;svg class="icon" aria-hidden="true"&gt;
  &lt;use xlink:href="#icon-xxx"&gt;&lt;/use&gt;
&lt;/svg&gt;
</code></pre>
          </div>
      </div>

    </div>
  </div>
  <script>
  $(document).ready(function () {
      $('.tab-container .content:first').show()

      $('#tabs li').click(function (e) {
        var tabContent = $('.tab-container .content')
        var index = $(this).index()

        if ($(this).hasClass('active')) {
          return
        } else {
          $('#tabs li').removeClass('active')
          $(this).addClass('active')

          tabContent.hide().eq(index).fadeIn()
        }
      })
    })
  </script>
</body>
</html>


================================================
FILE: kt-sft/ktransformers/website/src/assets/iconfont/iconfont.css
================================================
@font-face {
  font-family: "iconfont"; /* Project id 4550268 */
  src: url('iconfont.woff2?t=1717950820214') format('woff2'),
       url('iconfont.woff?t=1717950820214') format('woff'),
       url('iconfont.ttf?t=1717950820214') format('truetype'),
       url('iconfont.svg?t=1717950820214#iconfont') format('svg');
}

.iconfont {
  font-family: "iconfont" !important;
  font-size: 16px;
  font-style: normal;
  -webkit-font-smoothing: antialiased;
  -moz-osx-font-smoothing: grayscale;
}

.icon-copy:before {
  content: "\e8b0";
}

.icon-arrow-down:before {
  content: "\e85e";
}

.icon-usage-progress:before {
  content: "\e651";
}

.icon-gen-progress:before {
  content: "\e617";
}

.icon-back:before {
  content: "\e779";
}

.icon-point:before {
  content: "\e608";
}

.icon-edit:before {
  content: "\e7dd";
}

.icon-delete:before {
  content: "\e614";
}

.icon-upload-1:before {
  content: "\e618";
}

.icon-explore:before {
  content: "\e621";
}

.icon-ellipsis:before {
  content: "\e657";
}

.icon-sent:before {
  content: "\e60c";
}

.icon-list-list:before {
  content: "\e62d";
}

.icon-list-icon:before {
  content: "\e639";
}

.icon-zhongshi:before {
  content: "\e6bd";
}

.icon-log:before {
  content: "\e826";
}


================================================
FILE: kt-sft/ktransformers/website/src/assets/iconfont/iconfont.js
================================================
window._iconfont_svg_string_4550268='<svg><symbol id="icon-copy" viewBox="0 0 1024 1024"><path d="M394.666667 106.666667h448a74.666667 74.666667 0 0 1 74.666666 74.666666v448a74.666667 74.666667 0 0 1-74.666666 74.666667H394.666667a74.666667 74.666667 0 0 1-74.666667-74.666667V181.333333a74.666667 74.666667 0 0 1 74.666667-74.666666z m0 64a10.666667 10.666667 0 0 0-10.666667 10.666666v448a10.666667 10.666667 0 0 0 10.666667 10.666667h448a10.666667 10.666667 0 0 0 10.666666-10.666667V181.333333a10.666667 10.666667 0 0 0-10.666666-10.666666H394.666667z m245.333333 597.333333a32 32 0 0 1 64 0v74.666667a74.666667 74.666667 0 0 1-74.666667 74.666666H181.333333a74.666667 74.666667 0 0 1-74.666666-74.666666V394.666667a74.666667 74.666667 0 0 1 74.666666-74.666667h74.666667a32 32 0 0 1 0 64h-74.666667a10.666667 10.666667 0 0 0-10.666666 10.666667v448a10.666667 10.666667 0 0 0 10.666666 10.666666h448a10.666667 10.666667 0 0 0 10.666667-10.666666v-74.666667z" fill="#000000" ></path></symbol><symbol id="icon-arrow-down" viewBox="0 0 1024 1024"><path d="M554.666667 690.005333l228.864-228.864 60.330666 60.330667L512 853.333333l-331.861333-331.861333 60.330666-60.330667L469.333333 690.005333V170.666667h85.333334v519.338666z"  ></path></symbol><symbol id="icon-usage-progress" viewBox="0 0 1024 1024"><path d="M512 125.098667A386.901333 386.901333 0 1 1 125.098667 512 386.901333 386.901333 0 0 1 512 125.098667z" fill="#ACE9C5" ></path><path d="M512 318.634667A193.365333 193.365333 0 1 1 318.634667 512 193.365333 193.365333 0 0 1 512 318.634667z" fill="#2BA866" ></path></symbol><symbol id="icon-gen-progress" viewBox="0 0 1024 1024"><path d="M692.004733 714.930578l96.018649 96.017519C715.492309 877.950022 618.525386 918.887417 512 918.887417c-104.225342 0-199.297978-39.187779-271.287664-103.631964l96.127152-96.126023C384.097201 759.135506 445.230905 783.258278 512 783.258278c69.07253 0 132.114084-25.817007 180.004733-68.3277z m-202.61185-609.200883L489.395143 241.670781C350.16053 253.157439 240.741722 369.800759 240.741722 512c0 66.767965 24.122773 127.900539 64.127717 175.160512l-96.126022 96.126022C144.299232 711.295717 105.112583 616.225342 105.112583 512c0-217.130949 170.07894-394.539514 384.2803-406.270305z m325.8637 134.984901C879.700768 312.702022 918.887417 407.774658 918.887417 512c0 101.921907-37.474331 195.091214-99.395814 266.479611l-96.270694-96.268432C760.774358 635.667779 783.258278 576.460009 783.258278 512c0-66.767965-24.122773-127.901669-64.128848-175.161642l96.127153-96.124892zM534.608247 105.728565c95.334852 5.221722 181.928406 43.261174 248.678287 103.013722l-96.127152 96.127152c-41.869845-35.444415-94.631841-58.422252-152.553395-63.199788l0.00226-135.941086z" fill="#448AFF" fill-opacity=".6" ></path><path d="M489.392883 105.729695L489.395143 241.670781C350.16053 253.157439 240.741722 369.800759 240.741722 512c0 66.767965 24.122773 127.900539 64.127717 175.160512l-96.126022 96.126022C144.299232 711.295717 105.112583 616.225342 105.112583 512c0-217.130949 170.07894-394.539514 384.2803-406.270305z" fill="#448AFF" ></path></symbol><symbol id="icon-back" viewBox="0 0 1024 1024"><path d="M671.968176 911.99957c-12.287381 0-24.576482-4.67206-33.951566-14.047144L286.048434 545.984249c-18.751888-18.719204-18.751888-49.12028 0-67.872168L638.016611 126.111222c18.751888-18.751888 49.12028-18.751888 67.872168 0 18.751888 18.719204 18.751888 49.12028 0 67.872168l-318.016611 318.047574L705.888778 830.047574c18.751888 18.751888 18.751888 49.12028 0 67.872168C696.544658 907.32751 684.255557 911.99957 671.968176 911.99957z" fill="#2c2c2c" ></path></symbol><symbol id="icon-point" viewBox="0 0 1024 1024"><path d="M512 307.2a204.86826667 204.86826667 0 0 1 0 409.6 204.8 204.8 0 0 1 0-409.6z" fill="" ></path></symbol><symbol id="icon-edit" viewBox="0 0 1024 1024"><path d="M899.072 125.44c-28.672-28.672-67.072-44.544-107.52-44.544s-78.848 15.872-107.52 44.544L251.392 558.08c-34.304 34.304-60.416 74.752-78.336 119.808L88.576 896c-4.608 11.264-1.536 24.064 7.168 32.768 5.632 5.632 13.824 9.216 21.504 9.216 3.584 0 7.68-0.512 11.264-2.048l218.624-84.48c45.056-17.408 85.504-44.032 119.808-78.336l351.744-351.744 80.896-80.896c58.88-59.392 58.88-155.648-0.512-215.04z m-475.648 604.16c-28.16 28.16-61.44 50.176-98.816 64.512l-153.6 59.392 59.392-153.6c14.336-37.376 35.84-70.656 64.512-98.816L625.152 271.36l128.512 128.512-330.24 329.728z m432.64-432.128l-58.88 58.88-128.512-128.512L727.552 168.96c16.896-16.896 39.936-26.624 64.512-26.624s47.104 9.216 64.512 26.624c34.816 35.328 34.816 92.672-0.512 128.512z" fill="#333333" ></path></symbol><symbol id="icon-delete" viewBox="0 0 1024 1024"><path d="M742.4 944H281.6c-49.4 0-89.6-43.1-89.6-96V368h64v480c0 17.3 11.7 32 25.6 32h460.8c13.9 0 25.6-14.7 25.6-32V368h64v480c0 52.9-40.2 96-89.6 96z"  ></path><path d="M384 368h64v416h-64zM592 368h64v416h-64zM64 224h896v64H64z"  ></path><path d="M768 288H256V160c0-52.9 43.1-96 96-96h320c52.9 0 96 43.1 96 96v128z m-448-64h384v-64c0-17.6-14.4-32-32-32H352c-17.6 0-32 14.4-32 32v64z"  ></path></symbol><symbol id="icon-upload-1" viewBox="0 0 1024 1024"><path d="M323.034074 291.934815l383.620741 0c9.481481 0 17.256296-8.533333 17.256296-18.962963 0-10.42963-7.68-18.962963-17.256296-18.962963L323.034074 254.008889c-9.481481 0-17.256296 8.533333-17.256296 18.962963C305.777778 283.496296 313.457778 291.934815 323.034074 291.934815z" fill="#272536" ></path><path d="M522.05037 328.628148c-1.232593-1.232593-2.844444-1.896296-4.740741-1.991111-1.706667-0.094815-3.318519-0.094815-5.025185 0-1.896296 0.094815-3.508148 0.758519-4.740741 1.991111L349.013333 487.253333c-3.887407 3.887407-1.896296 12.325926 4.456296 18.773333 6.447407 6.447407 14.791111 8.438519 18.773333 4.456296l125.060741-125.060741 0 367.122963c0 9.671111 7.86963 17.540741 17.540741 17.540741l0 0c9.671111 0 17.540741-7.86963 17.540741-17.540741L532.385185 385.327407l125.060741 125.060741c3.887407 3.887407 12.325926 1.896296 18.773333-4.456296 6.447407-6.447407 8.438519-14.791111 4.456296-18.773333L522.05037 328.628148z" fill="#272536" ></path></symbol><symbol id="icon-explore" viewBox="0 0 1024 1024"><path d="M926.352541 89.231277c-0.029676-7.432273-1.212618-13.651928-2.837628-19.264762-31.228235-8.264221-71.898517 1.24127-106.283652 17.927301-7.049556 3.41068-23.762193 13.583366-48.51597 28.643364-10.237155 6.250354-19.264762 11.739369-23.251563 14.002922-0.384763 0.224104-0.608867 0.63752-0.958838 0.861624-67.557652-41.147142-146.571217-65.327868-231.319389-65.327868-246.251474 0-446.569802 200.319351-446.569802 446.564685 0 82.554204 22.904663 159.683862 62.105476 226.062666-46.315862 71.387887-69.2809 122.93182-63.283302 157.863401 1.24127 7.144724 13.555737 8.28878 20.316721 8.28878 137.989771 0 453.393207-302.802444 492.628814-341.399507C751.64859 393.022235 926.449755 184.667883 926.352541 89.231277L926.352541 89.231277zM305.847292 611.014084c-43.956118 0-79.744205-35.757388-79.744205-79.743182 0-43.956118 35.789111-79.744205 79.744205-79.744205 43.956118 0 79.743182 35.789111 79.743182 79.744205C385.591497 575.256696 349.803409 611.014084 305.847292 611.014084L305.847292 611.014084zM446.19783 387.730719c-52.760644 0-95.694479-42.937928-95.694479-95.692433 0-52.760644 42.933835-95.694479 95.694479-95.694479 52.761668 0 95.694479 42.933835 95.694479 95.694479C541.892309 344.79279 498.958474 387.730719 446.19783 387.730719L446.19783 387.730719zM893.595486 279.9469c-66.889433 99.330286-172.055634 218.596623-276.967032 321.751005-28.551266 28.104081-201.624067 195.822944-346.982666 285.198507 0.12689-0.097214 0.223081-0.160659 0.349971-0.224104 70.049403 45.708018 153.491837 72.536037 243.189741 72.536037 246.246357 0 446.565708-200.318328 446.565708-446.570825C959.716416 427.317319 935.282934 347.82587 893.595486 279.9469L893.595486 279.9469zM638.54051 799.720957c-35.180244 0-63.793932-28.614711-63.793932-63.794955 0-35.184337 28.613688-63.799048 63.793932-63.799048 35.184337 0 63.793932 28.614711 63.793932 63.799048C702.334441 771.106246 673.724847 799.720957 638.54051 799.720957L638.54051 799.720957zM638.54051 799.720957" fill="#615CED" ></path></symbol><symbol id="icon-ellipsis" viewBox="0 0 1024 1024"><path d="M322.292 505.5m-66 0a66 66 0 1 0 132 0 66 66 0 1 0-132 0Z" fill="#272636" ></path><path d="M509.791 505.5m-66 0a66 66 0 1 0 132 0 66 66 0 1 0-132 0Z" fill="#272636" ></path><path d="M701.791 505.5m-66 0a66 66 0 1 0 132 0 66 66 0 1 0-132 0Z" fill="#272636" ></path></symbol><symbol id="icon-sent" viewBox="0 0 1024 1024"><path d="M998.976 554.3232C1031.232 539.6032 1031.328 515.7952 998.976 501.0432L122.88 101.3312C90.624 86.6112 64.448 103.5072 64.384 138.4832L64 426.9952 773.568 527.6672 64 628.3392 64.384 916.8832C64.448 952.1152 90.528 968.7872 122.88 954.0352L998.976 554.3232Z"  ></path></symbol><symbol id="icon-list-list" viewBox="0 0 1024 1024"><path d="M419.037 287.953h413.124c17.673 0 32-14.327 32-32s-14.327-32-32-32H419.037c-17.673 0-32 14.327-32 32s14.327 32 32 32zM419.028 543.17h411.608c17.673 0 32-14.327 32-32s-14.327-32-32-32H419.028c-17.673 0-32 14.327-32 32s14.327 32 32 32zM832.161 735.802H419.037c-17.673 0-32 14.327-32 32s14.327 32 32 32h413.124c17.673 0 32-14.327 32-32s-14.327-32-32-32z" fill="" ></path><path d="M256.037 255.953m-64 0a64 64 0 1 0 128 0 64 64 0 1 0-128 0Z" fill="" ></path><path d="M256.037 510.787m-64 0a64 64 0 1 0 128 0 64 64 0 1 0-128 0Z" fill="" ></path><path d="M256.037 767.621m-64 0a64 64 0 1 0 128 0 64 64 0 1 0-128 0Z" fill="" ></path></symbol><symbol id="icon-list-icon" viewBox="0 0 1024 1024"><path d="M841.6 489.6h-214.4c-48 0-86.4-38.4-86.4-86.4V188.8c0-48 38.4-86.4 86.4-86.4h214.4c48 0 86.4 38.4 86.4 86.4v214.4c0 48-38.4 86.4-86.4 86.4z m-211.2-320c-12.8 0-22.4 9.6-22.4 22.4v214.4c0 12.8 9.6 22.4 22.4 22.4h214.4c12.8 0 22.4-9.6 22.4-22.4V188.8c0-12.8-9.6-22.4-22.4-22.4h-214.4zM393.6 489.6H182.4c-48 0-86.4-38.4-86.4-86.4V188.8c0-48 38.4-86.4 86.4-86.4h214.4c48 0 86.4 38.4 86.4 86.4v214.4c-3.2 48-41.6 86.4-89.6 86.4z m-211.2-320c-12.8 0-22.4 9.6-22.4 19.2v214.4c0 12.8 9.6 22.4 22.4 22.4h214.4c12.8 0 22.4-9.6 22.4-22.4V188.8c0-12.8-9.6-22.4-22.4-22.4H182.4zM841.6 937.6h-214.4c-48 0-86.4-38.4-86.4-86.4v-214.4c0-48 38.4-86.4 86.4-86.4h214.4c48 0 86.4 38.4 86.4 86.4v214.4c0 48-38.4 86.4-86.4 86.4z m-211.2-323.2c-12.8 0-22.4 9.6-22.4 22.4v214.4c0 12.8 9.6 22.4 22.4 22.4h214.4c12.8 0 22.4-9.6 22.4-22.4v-214.4c0-12.8-9.6-22.4-22.4-22.4h-214.4zM393.6 937.6H182.4c-48 0-86.4-38.4-86.4-86.4v-214.4c0-48 38.4-86.4 86.4-86.4h214.4c48 0 86.4 38.4 86.4 86.4v214.4c-3.2 48-41.6 86.4-89.6 86.4zM182.4 614.4c-12.8 0-22.4 9.6-22.4 22.4v214.4c0 12.8 9.6 22.4 22.4 22.4h214.4c12.8 0 22.4-9.6 22.4-22.4v-214.4c0-12.8-9.6-22.4-22.4-22.4H182.4z" fill="#333333" ></path></symbol><symbol id="icon-zhongshi" viewBox="0 0 1024 1024"><path d="M973.53044 167.133265l-65.609003 50.468463A491.226376 491.226376 0 0 0 522.971405 33.282123C253.074841 33.282123 34.74388 247.370807 34.378166 512.220525c-0.365714 265.142289 218.550389 480.108685 488.593239 480.108686 211.016691 0 390.728306-131.291147 459.189873-315.245039a9.069695 9.069695 0 0 0-5.851416-11.775975l-65.82843-22.308523a9.435408 9.435408 0 0 0-11.775975 5.485702 392.48373 392.48373 0 0 1-92.525516 141.896839 402.650566 402.650566 0 0 1-282.915965 115.12661c-54.125598 0-106.495772-10.386263-155.793952-30.793077a398.627717 398.627717 0 0 1-212.845258-209.188123 383.779749 383.779749 0 0 1-31.451361-152.868244c0-53.1016 10.532549-104.374633 31.451361-152.868243 20.114243-46.738186 49.005609-88.795238 85.723245-124.85459a401.260854 401.260854 0 0 1 282.915965-115.12661c54.052456 0 106.422629 10.459406 155.720809 30.866219a398.627717 398.627717 0 0 1 159.52423 120.100314l-69.997565 53.686742a9.069695 9.069695 0 0 0 3.437707 16.091394l204.287562 49.151895c5.851416 1.316569 11.556547-2.998851 11.556547-8.777124l0.950855-206.554986a9.508551 9.508551 0 0 0-15.213681-7.167985z" fill="#000000" ></path></symbol><symbol id="icon-log" viewBox="0 0 1024 1024"><path d="M288 64c70.692 0 128 57.308 128 128 0 58.192-38.833 107.315-91.998 122.867L324 571.5h225c48.8 0 84.134-19.864 110.1-62.009 15.655-25.408 27.76-58.805 36.092-100.127C648.71 390.177 616 344.408 616 291c0-70.692 57.308-128 128-128 70.692 0 128 57.308 128 128 0 62.814-45.245 115.06-104.923 125.925-9.94 52.391-25.407 95.81-46.677 130.334-38.644 62.721-96.365 95.58-169.189 96.231l-2.211 0.01H324l0.002 65.633c52.52 15.363 91.052 63.486 91.98 120.75L416 832c0 70.692-57.308 128-128 128-70.692 0-128-57.308-128-128 0-58.193 38.833-107.315 91.999-122.868V314.868C198.833 299.315 160 250.193 160 192c0-70.692 57.308-128 128-128z" fill="#333333" ></path></symbol></svg>',function(l){var t=(t=document.getElementsByTagName("script"))[t.length-1],c=t.getAttribute("data-injectcss"),t=t.getAttribute("data-disable-injectsvg");if(!t){var i,o,e,a,h,n=function(t,c){c.parentNode.insertBefore(t,c)};if(c&&!l.__iconfont__svg__cssinject__){l.__iconfont__svg__cssinject__=!0;try{document.write("<style>.svgfont {display: inline-block;width: 1em;height: 1em;fill: currentColor;vertical-align: -0.1em;font-size:16px;}</style>")}catch(t){console&&console.log(t)}}i=function(){var t,c=document.createElement("div");c.innerHTML=l._iconfont_svg_string_4550268,(c=c.getElementsByTagName("svg")[0])&&(c.setAttribute("aria-hidden","true"),c.style.position="absolute",c.style.width=0,c.style.height=0,c.style.overflow="hidden",c=c,(t=document.body).firstChild?n(c,t.firstChild):t.appendChild(c))},document.addEventListener?~["complete","loaded","interactive"].indexOf(document.readyState)?setTimeout(i,0):(o=function(){document.removeEventListener("DOMContentLoaded",o,!1),i()},document.addEventListener("DOMContentLoaded",o,!1)):document.attachEvent&&(e=i,a=l.document,h=!1,d(),a.onreadystatechange=function(){"complete"==a.readyState&&(a.onreadystatechange=null,s())})}function s(){h||(h=!0,e())}function d(){try{a.documentElement.doScroll("left")}catch(t){return void setTimeout(d,50)}s()}}(window);

================================================
FILE: kt-sft/ktransformers/website/src/assets/iconfont/iconfont.json
================================================
{
  "id": "4550268",
  "name": "Lexllama",
  "font_family": "iconfont",
  "css_prefix_text": "icon-",
  "description": "Lexllama开源项目使用",
  "glyphs": [
    {
      "icon_id": "11372665",
      "name": "复制",
      "font_class": "copy",
      "unicode": "e8b0",
      "unicode_decimal": 59568
    },
    {
      "icon_id": "34202237",
      "name": "箭头下",
      "font_class": "arrow-down",
      "unicode": "e85e",
      "unicode_decimal": 59486
    },
    {
      "icon_id": "7766233",
      "name": "进度",
      "font_class": "usage-progress",
      "unicode": "e651",
      "unicode_decimal": 58961
    },
    {
      "icon_id": "38865122",
      "name": "环形进度条",
      "font_class": "gen-progress",
      "unicode": "e617",
      "unicode_decimal": 58903
    },
    {
      "icon_id": "577406",
      "name": "向左1",
      "font_class": "back",
      "unicode": "e779",
      "unicode_decimal": 59257
    },
    {
      "icon_id": "1920286",
      "name": "点",
      "font_class": "point",
      "unicode": "e608",
      "unicode_decimal": 58888
    },
    {
      "icon_id": "8866967",
      "name": "编辑",
      "font_class": "edit",
      "unicode": "e7dd",
      "unicode_decimal": 59357
    },
    {
      "icon_id": "10199175",
      "name": "删除",
      "font_class": "delete",
      "unicode": "e614",
      "unicode_decimal": 58900
    },
    {
      "icon_id": "1010111",
      "name": "上传",
      "font_class": "upload-1",
      "unicode": "e618",
      "unicode_decimal": 58904
    },
    {
      "icon_id": "351773",
      "name": "探索-选中",
      "font_class": "explore",
      "unicode": "e621",
      "unicode_decimal": 58913
    },
    {
      "icon_id": "564941",
      "name": "ellipsis",
      "font_class": "ellipsis",
      "unicode": "e657",
      "unicode_decimal": 58967
    },
    {
      "icon_id": "1048859",
      "name": "发送",
      "font_class": "sent",
      "unicode": "e60c",
      "unicode_decimal": 58892
    },
    {
      "icon_id": "1304951",
      "name": "列表",
      "font_class": "list-list",
      "unicode": "e62d",
      "unicode_decimal": 58925
    },
    {
      "icon_id": "8676284",
      "name": "列表",
      "font_class": "list-icon",
      "unicode": "e639",
      "unicode_decimal": 58937
    },
    {
      "icon_id": "22290034",
      "name": "重试",
      "font_class": "zhongshi",
      "unicode": "e6bd",
      "unicode_decimal": 59069
    },
    {
      "icon_id": "22961085",
      "name": "Fork 记录",
      "font_class": "log",
      "unicode": "e826",
      "unicode_decimal": 59430
    }
  ]
}


================================================
FILE: kt-sft/ktransformers/website/src/components/chat/index.vue
================================================
<template>
  <div class="chat-panel">
    <!-- <div class="chat-model">{{ activeAssistant?.model }}</div> -->
    <div class="chat-panel-inner flex-column">
      <div class="chat-init flex-unit flex-column" v-if="isNotChating">
        <div class="assistant-info flex-column flex-unit">
          <div class="avatar">
            <img src="../../../public/images/avatar.png" />
          </div>
          <div class="name">
            {{ activeAssistant.name }}
          </div>
          <div class="desc">
            {{ activeAssistant.description }}
          </div>
        </div>
      </div>
      <div class="chat-msg flex-unit" v-else>
        <ul>
          <li
            class="chat-msg-item flex-row"
            v-for="(msg, index) in localMessages"
            :key="index"
          >
            <div class="avatar" v-if="msg.role == 'user'">
              <img src="../../../public/images/user-filling.png" />
            </div>
            <div class="avatar" v-else>
              <img src="../../../public/images/avatar.png" />
            </div>
            <div class="msg flex-unit">
              <div class="title flex-row">
                <div class="name">{{ msg.role }}</div>
                <div class="time flex-row">
                  {{ timeFormat(msg.created_at) }}
                </div>
              </div>
              <div
                class="content"
                v-html="markedText(msg.content)"
                ref="content_Ref"
              ></div>
              <div class="copy-btn flex-row" v-show="msgBttnBoxShow[index]">
                <i
                  class="iconfont icon-copy"
                  @click="copy(createText(msg.content))"
                ></i>
              </div>
            </div>
          </li>
        </ul>
      </div>
      <div class="scroll-box" v-show="showScrollButton" @click="scrollToBottom">
        <i class="iconfont icon-arrow-down"></i>
      </div>
      <div class="chat-send">
        <div
          class="chat-box flex-row"
          :style="{ height: textareaHeight + 'px' }"
          ref="chatBox_Ref"
        >
          <button @click="StopOutput" class="stop-btn" v-show="isRunning">
            stop
          </button>
          <textarea
            name="chat-input"
            class="chat-input flex-unit"
            :placeholder="inputPlaceholder"
            v-model="inputQuestion"
            @keydown="keyBoardCommitQuestion"
            :disabled="inputDisabled"
            :style="{ height: textareaHeight + 'px' }"
            @input="handleInput"
            ref="textarea_ref"
            maxlength="2000"
            cols="20"
          ></textarea>
          <i class="iconfont icon-sent" @click="clickCommitQuestion"></i>
        </div>
      </div>
    </div>
  </div>
</template>

<script lang="ts">
import {
  defineComponent,
  nextTick,
  PropType,
  ref,
  watch,
  computed,
  onMounted,
} from "vue";
import { IThread, IMessageData, IAssistant } from "@/utils/types";
import { marked } from "marked";
import { createMessage } from "@/api/message";
import { createRun, cancelRun } from "@/api/run";
import { getAssistant } from "@/api/assistant";
import { createThread } from "@/api/thread";
import BScroll from "better-scroll";
import { useRouter, useRoute } from "vue-router";
import { useI18n } from "vue-i18n";
import { ElMessage } from "element-plus";
import { tr } from "element-plus/es/locale";
import copy from "@/utils/copy";
export default defineComponent({
  name: "ChatChat",
  props: {
    messages: {
      type: Array as PropType<IMessageData[]>,
      required: true,
    },
    chatInit: {
      type: Boolean,
      required: true,
    },
    activeAssistant: {
      type: Object as PropType<IAssistant>,
      required: true,
    },
    activeThread: {
      type: Object as PropType<IThread>,
      required: true,
    },
    inputDisabled: {
      type: Boolean,
      default: false,
    },
  },
  setup(props, context) {
    const { t } = useI18n();
    const router = useRouter();
    const route = useRoute();
    const localMessages = ref<IMessageData[]>([...props.messages]);
    const showScrollButton = ref(false);
    const messageScroll = ref<BScroll | null>(null);
    const inputQuestion = ref<string>("");
    const inputDisabled = ref(false);
    const msgBttnBoxShow = ref<boolean[]>([]);
    const answer = ref("");
    const activeThread = ref<IThread>({} as IThread);
    const activeAssistant = ref<IAssistant>({} as IAssistant);
    const isNotChating = ref(true);
    const isRunning = ref(false);
    const stopRunId = ref<string>("");
    const shouldContinueReceiving = ref(true);
    const textareaHeight = ref(48);
    const chatBox_Ref = ref();
    const textarea_ref = ref();
    const content_Ref = ref();
    // Boolean if go
    isNotChating.value = props.chatInit;
    activeThread.value = props.activeThread;
    activeAssistant.value = props.activeAssistant;
    watch(
      () => props.messages,
      (newMessages) => {
        localMessages.value = [...newMessages];
        msgBttnBoxShow.value = new Array(newMessages.length).fill(true);
      }
    );
    watch(
      () => props.inputDisabled,
      (newValue) => {
        inputDisabled.value = newValue;
      }
    );
    // Update scrollbars and scrolling events
    watch(
      () => localMessages.value,
      (newMessages) => {
        if (messageScroll.value) {
          scrollToTop();
          messageScroll.value.destroy();
          messageScroll.value = null;
        }
        if (!isNotChating.value) {
          nextTick(() => {
            messageScroll.value = new BScroll(".chat-msg", {
              click: true,
              mouseWheel: true,
              probeType: 3, //Only when set to 3 can the event of scrolling binding be triggered
            });
          });
        }
      },
      {
        immediate: true,
        deep: true,
      }
    );
    watch(
      () => messageScroll.value,
      (newValue) => {
        if (newValue) {
          messageScroll.value?.on("scroll", handleScroll);
          showScrollButton.value = false;
          scrollToBottom();
        }
      }
    );
    watch(
      () => props.chatInit,
      (newValue) => {
        isNotChating.value = newValue;
      }
    );
    watch(
      () => props.activeThread,
      (newValue) => {
        activeThread.value = newValue;
      }
    );
    watch(
      () => props.activeAssistant,
      (newValue) => {
        activeAssistant.value = newValue;
      }
    );

    const handleInput = (event:any) => {
      adjustHeight();
      const maxLength = 2000; 
      if (inputQuestion.value?.length > maxLength) {
        event.preventDefault(); 
        inputQuestion.value = inputQuestion.value.substring(0, maxLength); 
      }
    };
    const adjustHeight = () => {
      const currentScrollTop = textarea_ref.value.scrollTop;
      textarea_ref.value.style.height = textarea_ref.value.scrollHeight + "px";
      chatBox_Ref.value.style.height = textarea_ref.value.style.height;
      textarea_ref.value.scrollTop = currentScrollTop;
    };

    const inputPlaceholder = computed(() => {
      if (typeof activeAssistant.value.name != "undefined") {
        return replaceAssistant(t("chat.inputTip"), activeAssistant.value.name);
      } else {
        return t("chat.inputTip");
      }
    });
    // Block events
    const StopOutput = async () => {
      shouldContinueReceiving.value = false;
      try {
        const response = await cancelRun(
          activeThread.value.id,
          stopRunId.value
        );
        if (!response.ok) {
          console.error("Failed to cancel run");
        }
      } catch (error) {
        console.error("Failed to cancel run:", error);
      }
    };
    // dialogue
    const commitQuestion: () => void = async () => {
      const question = inputQuestion.value;
      // If it came in by clicking on assistants without clicking on thread, or through preview
      if (Object.keys(activeThread.value).length == 0) {
        try {
          let res = {} as IThread;
          // If you click thread and do not select assistant
          if (route.name == "preview") {
            let metadata = {
              hidden: "true",
            };
            res = await createThread(undefined, undefined, metadata);
          } else {
            res = await createThread();
          }
          activeThread.value = res;
        } catch (err) {
          console.error(err);
        }
      }
      //If you click thread and do not select assistant
      else if (Object.keys(activeAssistant.value).length == 0) {
        try {
          const messageOfAssistant = props.messages.find(
            (message) => message.role === "assistant"
          );
          if (messageOfAssistant && messageOfAssistant.assistant_id) {
            const res = await getAssistant(messageOfAssistant.assistant_id);
            activeAssistant.value = res;
          }
        } catch (err) {
          console.error(err);
        }
      }
      if (question) {
        inputQuestion.value = "";
        textareaHeight.value = 48;
        // inputDisabled.value = true;
        isNotChating.value = false;
        isRunning.value = true;
        await createMessage(activeThread.value.id, question)
          .then((res: any) => {})
          .catch((err: any) => {
            ElMessage({
              type: "warning",
              message: "Request error",
            });
            return;
          });
        // Current message queue insertion issue
        localMessages.value.push({
          role: "user",
          content: [
            { type: "text", text: { value: question }, annotatons: [] },
          ],
          created_at: Date.now() / 1000,
        });
        msgBttnBoxShow.value.push(true);
        // Insert answer into the current message queue
        localMessages.value.push({
          role: "assistant",
          content: [{ type: "text", text: { value: "" }, annotatons: [] }],
          created_at: Date.now() / 1000,
        });
        msgBttnBoxShow.value.push(false);
        try {
          const asyncGenerator = createRun(
            {
              assistant_id: activeAssistant.value.id,
              stream: true,
            },
            activeThread.value.id
          );
          for await (const word of asyncGenerator) {
            if (!shouldContinueReceiving.value) {
              break;
            }
            if (word.length == 36) {
              stopRunId.value = word;
              console.log(stopRunId.value);
            } else {
              answer.value += word;
              const index = localMessages.value.length - 1;
              localMessages.value[index].content[0].text.value += word;
              if (answer.value.length <= 3) {
                localMessages.value[index].created_at = Date.now() / 1000;
              }
            }
          }
        } catch (err) {
          console.error(err);
        }
        shouldContinueReceiving.value = true;
        answer.value = "";
        inputDisabled.value = false;
        msgBttnBoxShow.value[msgBttnBoxShow.value.length - 1] = true;
        scrollToBottom();
        isRunning.value = false;
        context.emit("updateAssistant", true);
        textarea_ref.value.focus();
      }
    };
    // Keyboard event stabilization
    const keyBoardCommitQuestion = (event: any) => {
      const question = inputQuestion.value?.trim();
      if (event.keyCode === 13) {
        event.preventDefault();

        const cursorPosition = event.target.selectionStart;
        if ((event.metaKey || event.ctrlKey) && question) {
          event.target.value =
            event.target.value.substring(0, cursorPosition) +
            "\n" +
            event.target.value.substring(cursorPosition);
          event.target.selectionStart = event.target.selectionEnd =
            cursorPosition + 1;
          adjustHeight();
          return;
        }
        if (!question) {
          ElMessage({
            message: "Please enter the content!",
            type: "warning",
            plain: true,
          });
          return;
        }
        if (!isRunning.value) {
          commitQuestion();
          inputQuestion.value = "";
        }
      }
    };
    const clickCommitQuestion = () => {
      if (!isRunning.value && inputQuestion.value?.trim() != "") {
        commitQuestion();
        return;
      }
      ElMessage({
        message: "Please enter the content!",
        type: "warning",
        plain: true,
      });
    };
    //Bottom scrolling
    const scrollToBottom = () => {
      //If messageScroll. value exists
      if (messageScroll.value) {
        //Call the scrollTo method of messageScroll. value and scroll to the bottom
        messageScroll.value.scrollTo(0, messageScroll.value?.maxScrollY, 800);
      }
    };
    // Top scrolling
    const scrollToTop = () => {
      if (messageScroll.value) {
        messageScroll.value.scrollTo(0, messageScroll.value?.minScrollY, 800);
      }
    };
    // Handling rolling events
    const handleScroll = (pos: any) => {
      if (messageScroll.value) {
        const distanceToBottom =
          messageScroll.value.y - messageScroll.value.maxScrollY;
        showScrollButton.value = distanceToBottom > 100;
      }
    };
    // Replace characters

    function replaceAssistant(input: string, newString: string) {
      return input.replace(/assistant/g, newString);
    }
    // Extract the markup text to convert the passed in object array into an HTML string parsed by market.js
    const markedText = (content: object[]) => {
      let context = "";
      for (const item of content) {
        if ((item as { type: string }).type === "text") {
          context += ((item as { text: object }).text as { value: string })
            .value;
        }
      }
      return marked.parse(context);
    };
    // Extract text content
    const createText = (content: object[]) => {
      let context = "";
      for (const item of content) {
        if ((item as { type: string }).type === "text") {
          context += ((item as { text: object }).text as { value: string })
            .value;
        }
      }
      return context;
    };
    // Time formatting
    const timeFormat = (timestamp: number | undefined) => {
      if (!timestamp) {
        return "";
      }
      const date = new Date(timestamp * 1000);
      // Obtain various time sections
      const year = date.getFullYear();
      const month = String(date.getMonth() + 1).padStart(2, "0"); // The month starts from 0 and needs to be increased by 1, with zeros added
      const day = String(date.getDate()).padStart(2, "0"); // Zero padding
      const hours = String(date.getHours()).padStart(2, "0"); // Zero padding
      const minutes = String(date.getMinutes()).padStart(2, "0"); // Zero padding
      const seconds = String(date.getSeconds()).padStart(2, "0"); // Zero padding
      // Format as "YYYY-MM-DD HH: mm: ss"
      const formattedDate = `${year}-${month}-${day} ${hours}:${minutes}:${seconds}`;
      return formattedDate;
    };
    onMounted(() => {
      adjustHeight();
    });
    return {
      inputQuestion,
      inputDisabled,
      msgBttnBoxShow,
      localMessages,
      textareaHeight,
      answer,
      StopOutput,
      isNotChating,
      handleInput,
      chatBox_Ref,
      adjustHeight,
      content_Ref,
      markedText,
      timeFormat,
      createText,
      inputPlaceholder,
      keyBoardCommitQuestion,
      clickCommitQuestion,
      messageScroll,
      showScrollButton,
      commitQuestion,
      scrollToBottom,
      scrollToTop,
      isRunning,
      copy,
      replaceAssistant,
      textarea_ref,
    };
  },
});
</script>

<style scoped lang="stylus">
@import '@/assets/css/mixins.styl';

.chat-panel {
  justify-content: center;
  display: flex;
  position: relative;
  height: 100%;

  .chat-model {
    font-size: 16px;
    font-weight: bold;
    position: absolute;
    top: 20px;
    left: 30px;
  }

  .chat-panel-inner {
    width: 920px;
    padding-top: 80px;
  }

  .chat-init {
    padding: 0 20px;

    .assistant-info {
      text-align: center;
      align-items: center;
      justify-content: center;

      .avatar img {
        width: 70px;
        height: 70px;
      }

      .name {
        margin: 40px 0;
        font-size: 20px;
        font-weight: bold;
      }

      .desc {
        color: $gray_40;
      }
    }

    .assistant-tips {
      margin-bottom: 80px;

      .tips-item {
        width: 44%;
        height: 70px;
        line-height: 70px;
        float: left;
        border: 1px solid $border_gray_light_normal;
        border-radius: 8px;
        margin-top: 10px;
        margin-bottom: 10px;
        padding: 0 20px;
        color: $gray_40;

        &:nth-child(odd) {
          margin-left: 4%;
          margin-right: 4%;
        }

        &:nth-child(even) {
          margin-right: 4%;
        }

        .tips-ops {
          display: none;
          width: 24px;
          height: 24px;
          line-height: 24px;
          border-radius: 4px;
          text-align: center;
          border: 1px solid $border_gray_light_normal;

          i {
            font-size: 20px;
          }
        }

        &:hover {
          cursor: pointer;
          background-color: $bg_gray_light_hover;

          .tips-ops {
            display: block;
            background-color: #FFFFFF;
          }
        }
      }
    }
  }

  .chat-msg {
    overflow-y: hidden;

    ul {
      li.chat-msg-item {
        margin-bottom: 40px;
        align-items: flex-start !important;
        // border: 1px solid;
        border-radius: 15px;
        padding: 20px;
        margin-right: 20px;
        background-color: #313344;
        box-shadow: 12.5px 12.5px 10px rgba(0, 0, 0, 0.035), 10px 10px 8px rgba(0, 0, 0, 0.07);

        .avatar {
          margin-right: 15px;
          width: 36px;
          height: 36px;

          img {
            width: 100%;
            height: 100%;
            border-radius: 25px;
          }
        }

        .msg {
          .title {
            display: flex;
            align-items: center;
            justify-content: space-between;
            margin-bottom: 12px;
            height: 36px;
            line-height: 24px;

            .time {
              justify-content: center;
              // margin-bottom: 12px;
              line-height: 20px;
              font-size: 14px;
              color: $gray_80;
            }

            .name {
              color: #edf2ea;
              font-size: 16px;
              font-weight: bold;
              margin-right: 15px;
            }

            .tips {
              font-size: 14px;
              color: $gray_50;
            }
          }

          .content {
            max-width: 829px;
            color: #edf2ea;
            font-size: 14px;
            line-height: 20px;
            word-wrap: break-word;
            margin-bottom: 12px;
          }

          .copy-btn {
            margin-top: 10px;
            justify-content: left;

            i {
              font-size: 20px;
              color: $gray_70;

              &:hover {
                cursor: pointer;
                color: $gray_50;

                .tips-ops {
                  display: block;
                  background-color: #FFFFFF;
                }
              }
            }
          }
        }
      }
    }
  }

  .chat-send {
    width: 900px;
    padding: 40px 0;
    position: relative;

    .chat-box {
      width: 100%;
      height: auto;
      min-height: 48px;
      max-height: 192px !important;
      border: none;
      border-radius: 15px;
      background: white;
      line-height: 48px;

      // overflow: hidden;
      .chat-input {
        height: auto;
        min-width: 900px;
        max-height: 192px !important;
        width: 100%;
        border: none;
        overflow-anchor: auto;
        overflow-x: hidden;
        overflow-y: auto;
        resize: none;
        background: white;
        display: inline-block;
      }

      .chat-input::-webkit-scrollbar {
        width: 10px;
      }

      .chat-input::-webkit-scrollbar-track {
        background-color: #f1f1f1;
      }

      .chat-input::-webkit-scrollbar-thumb {
        background-color: #888;
        border-radius: 5px;
      }

      .chat-input::-webkit-scrollbar-thumb:hover {
        background-color: #555;
      }

      .chat-input::-webkit-resizer {
        display: none;
      }

      .stop-btn {
        border: none;
        width: 60px;
        position: absolute;
        right: 50%;
        transform: translateX(50%);
        top: -40px;
        -webkit-border-radius: 50;
        -moz-border-radius: 50;
        border-radius: 50px;
        font-family: Arial;
        color: #ffffff;
        font-size: 16px;
        background: #cacdd1;
        padding: 10px 15px 10px 15px;
        text-decoration: none;
      }

      .stop-btn:hover {
        background: #8080e1;
        text-decoration: none;
        cursor: pointer;
      }
    }
  }
}

.scroll-box {
  position: absolute;
  bottom: 130px;
  right: 50%;
  transform: translateX(50%);
  margin: 0 auto;
  width: 32px;
  height: 32px;
  border-radius: 16px;
  border: 1px solid $gray_80;
  background-color: var(--el-bg-color-overlay);
  box-shadow: var(--el-box-shadow-lighter);
  text-align: center;
  line-height: 32px;
  color: #1989fa;

  i {
    font-size: 24px;
    color: $gray_60;
  }

  &:hover {
    cursor: pointer;
    background-color: $bg_gray_light_hover;

    i {
      color: $gray_50;
    }
  }
}
</style>

================================================
FILE: kt-sft/ktransformers/website/src/conf/config.ts
================================================
declare global {
    interface Window {
      configWeb: {
        apiUrl: string;
        port: string;
       };
     }
  }

export const baseURL = window.configWeb.apiUrl;
export const basePort = window.configWeb.port;


================================================
FILE: kt-sft/ktransformers/website/src/locals/en.js
================================================
// en.js
export default {
    home: {
        explore: 'Explore',
        language: 'Choose Language',
        english: 'English',
        chinese: 'Chinese',
        today: 'Today',
        previous:'Previous',
        withoutAssistantTip:'The KTransformers of this record has been deleted. The user can only view historical conversation information and cannot continue the conversation!',
        deleteThreadTip:'Deleting records will clear historical information~'
    },
    chat:{
        inputTip:"Send a message and chat with the KTransformers ~",
    },
    explore:{
        description: "Based on Lexllama, let’s create your own KTransformers~",
        configuring: "Configuring",
        completed: "Completed",
        assistantName: "Name",
        assistantDescription: "Description",
        assistantStatus: "Status",
        createAssistant: "Create New KTransformers",
        deleteAssistant: "Are you sure to delete this? After deleting the KTransformers, its KVCache will also be cleared simultaneously~",
    },
    config:{
        title:'Configure your KTransformers',
        fileTip:"Only support text, docx, .ppt, .pdf format.",
        reConfigTip:'Reconfig KTransformers needs to delete kvcache, please choose carefully',
        secletFile:'Select Files',
        outOfSize:'File size exceeds 10MB, please reselect',
        fileExist:'The file already exists, please reselect',
        createAssistant:'Assistant created successfully, click the build button to start building KVCache',
    },
    build:{
        title:'Building Logs',
        step1:'Parse uploded files',
        parsingFileStep1:'File upload and reception completed',
        parsingFileStep2:{
            parse:"Parsing",
            file:"file(s)",
            total:'total',
        },
        parsingFileStep3:'Prompt loaded, ready to generate KVCache',
        step2:'Generate KVCache',
        generateStep1:'Generate KVCache calculation plan',
        generateStep2:{
            calculate:"calculating",
            token:"tokens",
            total:'total',
        },
        generateStep3:'KVCache has been generated successfully',
        durationTime:'Duration:',
        remainTime:'Time left:',
        buildProgress:'Building Progress',
        storageUsage:'KVCache Storage Usage',
    }
}


================================================
FILE: kt-sft/ktransformers/website/src/locals/index.js
================================================
// index.js
import { createI18n } from 'vue-i18n'
import zh from './zh'
import en from './en'

const messages = {
  en,
  zh,
}
const language = (navigator.language || 'en').toLocaleLowerCase() // 这是获取浏览器的语言
const i18n = createI18n({
  legacy: false, // you must set `false`, to use Compostion API
  locale: localStorage.getItem('lang') || language.split('-')[0] || 'en', // 首先从缓存里拿，没有的话就用浏览器语言，
  fallbackLocale: 'en', // 设置备用语言
  messages, 
})

export default i18n

================================================
FILE: kt-sft/ktransformers/website/src/locals/zh.js
================================================
// zh.js
export default {
    home: {
        explore: '探索',
        language: '选择语言',
        english: '英语',
        chinese: '中文',
        today: '今天',
        previous:'历史',
        withoutAssistantTip:'本记录的KTransformers已被删除，用户只能查看历史对话信息而无法继续对话!',
        deleteThreadTip:'删除记录会清除历史信息哦～'
    },
    chat:{
        inputTip:"发送信息和 KTransformers 畅聊吧～",
    },
    explore:{
        description: "基于Lexllama，一起来创建你的专属KTransformers吧~",
        configuring: "配置中",
        completed: "完成",
        assistantName: "名称",
        assistantDescription: "描述",
        assistantStatus: "Status",
        createAssistant: "创建新的KTransformers",
        deleteAssistant: "是否确认删除KTransformers，删除KTransformers之后其KVCache也会被同步清理掉哦~",
    },
    config:{
        title:'配置你的KTransformers',
        fileTip:"仅支持上传文件格式为 .text, docx, .ppt, .pdf format.",
        secletFile:'选择文件',
        outOfSize:'文件大小超出10MB，请重新选择',
        fileExist:'文件已存在，请重新选择',
        createAssistant:'KTransformers创建成功，点击build按钮开始构建KVCache',
    },
    build:{
        title:'构建日志',
        step1:'解析上传文件',
        parsingFileStep1:'文件上传接收完成',
        parsingFileStep2:{
            parse:"正在解析第",
            file:"文件",
            total:'共',
        },
        parsingFileStep3:'Prompt装载完毕，准备生成KVCache',
        step2:'生成 KVCache',
        generateStep1:'生成KVCache计算计划',
        generateStep2:{
            calculate:"正在计算",
            token:"tokens",
            total:'共',
        },
        generateStep3:'KVCache已生成完成',
        durationTime:'持续时间：',
        remainTime:'剩余时间：',
        buildProgress:'构建进度',
        storageUsage:'存储使用：',
        
    }
}


================================================
FILE: kt-sft/ktransformers/website/src/main.ts
================================================
import { createApp } from 'vue'
import App from './App.vue'
import router from './router'
import store from './store'
import ElementPlus from 'element-plus'
import 'element-plus/dist/index.css'
import VueApexCharts from "vue3-apexcharts"
import i18n from '@/locals'

const app = createApp(App)

app.use(ElementPlus)

app.use(i18n)
app.use(VueApexCharts)
app.use(store)
app.use(router)
app.mount('#app')


================================================
FILE: kt-sft/ktransformers/website/src/router/index.ts
================================================
import { createRouter, createWebHashHistory, RouteRecordRaw, createWebHistory } from 'vue-router'
import HomeView from '@/views/home.vue'

const routes: Array<RouteRecordRaw> = [
  {
    path: '/',
    name: 'home',
    component: HomeView,
    redirect: '/chat',
    children: [{
      path: '/chat',
      name: '',
      component: () => import(/* webpackChunkName: "about" */ '../components/chat/index.vue')
    },]
  },

]

const router = createRouter({
  history: createWebHashHistory(),
  routes
})

export default router


================================================
FILE: kt-sft/ktransformers/website/src/shims-vue.d.ts
================================================
/* eslint-disable */
declare module '*.vue' {
  import type { DefineComponent } from 'vue'
  const component: DefineComponent<{}, {}, any>
  export default component
  
}

declare module '@/locals'
declare module 'pdfobject';


================================================
FILE: kt-sft/ktransformers/website/src/store/index.ts
================================================
import { createStore } from 'vuex'

export default createStore({
  state: {
  },
  getters: {
  },
  mutations: {
  },
  actions: {
  },
  modules: {
  }
})


================================================
FILE: kt-sft/ktransformers/website/src/utils/copy.ts
================================================
import { ElMessage } from "element-plus";
const copy = (value: string) => {
  //Try using the navigator.clipboard.writeText method
  if (navigator.clipboard && window.isSecureContext) {
    navigator.clipboard.writeText(value)
      .then(() => {
        //Using ElMessage to Display Success Messages in Windows Systems
        if (navigator.appVersion.includes("Win")) {
          ElMessage({
            message: "内容复制成功!",
            type: "success",
            plain: true,
          });
        } else {
          //Using custom DOM elements to display success messages in macOS system
          showCopySuccessMessage();
        }
      })
      .catch(() => {
        //Using ElMessage to Display Failure Messages in Windows Systems
        if (navigator.appVersion.includes("Win")) {
          ElMessage({
            message: "内容复制失败!",
            type: "error",
            plain: true,
          });
        } else {
          //Using custom DOM elements to display failure messages in macOS system
          showCopyErrorMessage();
        }
      });
  } else {
    const textarea = document.createElement("textarea");
    textarea.value = value;
    document.body.appendChild(textarea);
    textarea.select();
    try {
      const successful = document.execCommand('copy');
      if (successful) {
        if (navigator.appVersion.includes("Win")) {
          ElMessage({
            message: "内容复制成功!",
            type: "success",
            plain: true,
          });
        } else {
          showCopySuccessMessage();
        }
      } else {
        if (navigator.appVersion.includes("Win")) {
          ElMessage({
            message: "内容复制失败!",
            type: "error",
            plain: true,
          });
        } else {
          showCopyErrorMessage();
        }
      }
    } catch (err) {
      if (navigator.appVersion.includes("Win")) {
        ElMessage({
          message: "内容复制失败!",
          type: "error",
          plain: true,
        });
      } else {
        showCopyErrorMessage();
      }
    }
    document.body.removeChild(textarea);
  }
};

function showCopySuccessMessage() {
  const messageElement = document.createElement('div');
  messageElement.textContent = '内容复制成功!';
  messageElement.style.position = 'fixed';
  messageElement.style.bottom = '10px';
  messageElement.style.left = '50%';
  messageElement.style.transform = 'translateX(-50%)';
  messageElement.style.padding = '10px';
  messageElement.style.backgroundColor = '#4CAF50';
  messageElement.style.color = 'white';
  messageElement.style.borderRadius = '15px';
  messageElement.style.zIndex = '1000';
  document.body.appendChild(messageElement);
  setTimeout(() => {
    document.body.removeChild(messageElement);
  }, 3000);
}

function showCopyErrorMessage() {
  const messageElement = document.createElement('div');
  messageElement.textContent = '内容复制失败!';
  messageElement.style.position = 'fixed';
  messageElement.style.bottom = '10px';
  messageElement.style.left = '50%';
  messageElement.style.transform = 'translateX(-50%)';
  messageElement.style.padding = '10px';
  messageElement.style.backgroundColor = '#F44336';
  messageElement.style.color = 'white';
  messageElement.style.borderRadius = '5px';
  messageElement.style.zIndex = '1000';
  document.body.appendChild(messageElement);
  setTimeout(() => {
    document.body.removeChild(messageElement);
  }, 3000);
}

export default copy;

================================================
FILE: kt-sft/ktransformers/website/src/utils/types.ts
================================================
export interface IAssistant {
  id: string;
  object: string;
  created_at: number;
  name?: string;
  description?: string;
  model: string;
  instructions?: string;
  tools: any[];
  tool_resources?: object;
  metadata?:{[key:string]:any}
  top_p?: number;
  temperature?: number;
  response_format: string | object;
}

export interface IAssistantWithStatus {
  build_status:{status:string}
  id: string;
  object: string;
  created_at: number;
  name?: string;
  description?: string;
  model: string;
  instructions?: string;
  tools: any[];
  tool_resources?: object;
  metadata?:{[key:string]:any}
  top_p?: number;
  temperature?: number;
  response_format: string | object;
}

export interface IMessage {
  id: string;
  object: string;
  created_at: number;
  thread_id: string;
  status: string;
  incomplete_details?: object;
  completed_at?: number;
  incomplete_at?: number;
  role: string;
  content: any[];
  assistant_id?: string;
  run_id?: string;
  attachments?: any[];
  metadata:{[key:string]:any}
}

export interface IThread {
  id: string;
  object: string;
  created_at: number;
  tool_resources?: object;
  metadata?:{[key:string]:any}
}

export interface IRun {
  id: string;
  object: string;
  created_at: number;
  thread_id: string,
  assistant_id: string,
  status: string,
  required_action?: object,
  last_error?: object,
  expires_at?: number,
  started_at?: number,
  cancelled_at?: number,
  failed_at?: number,
  completed_at?: number,
  incomplete_details?: object,
  model: string,
  instructions: string,
  tools: any[],
  metadata: Map<string, string>,
  usage?: object,
  temperature?: number,
  top_p?: number,
  max_prompt_tokens?: number,
  max_completion_tokens?: number,
  truncation_strategy: object,
  tool_choice: string | object,
  response_format: string | object,
}

export interface IFile {
  id: string,
  bytes: number,
  created_at: number,
  filename: string,
  object: string,
  purpose: string,
}

export interface IMessageData {
  role: string;
  content: any[];
  created_at?: number;
  assistant_id?: string,
}

export interface IThreadAndMessageAndAssistant {

  thread: IThread;
  first_message: IMessage;
  assistant: IAssistantWithStatus
}
export interface IDeleteResult {
  id: string;
  object: string;
  deleted: boolean;
}
export interface IBuildData {
  parsed_file_count:number;
  total_file_count:number;
  prefilling_current:number;
  prefilling_total:number;
  build_completed_time:number;
  build_started_time:number;
  storage_total:number;
  storage_usage:number;
  status:string
}

================================================
FILE: kt-sft/ktransformers/website/src/views/home.vue
================================================
<template>
  <div class="home flex-row">
    <nav class="left-panel flex-column">
      <div class="logo-box">
        <div class="logo flex-row">
          <img class="img" src="../../public/images/three.png" />
          <span class="text">{{ projectName }}</span>
        </div>
        <div class="version">{{ projectVersion }}</div>
      </div>
      <div class="divider"></div>
      <div class="assistant-box">
        <div class="assistant-list">
          <ul>
            <li
              class="assistant-item flex-row"
              v-for="(item, index) in assistantList"
              :key="index"
              @click="setActiveAssistant(item)"
            >
              <img src="../../public/images/avatar.png" />
              <span class="name flex-unit">{{ item.name }}</span>
              <i class="iconfont icon-edit"></i>
            </li>
          </ul>
        </div>
      </div>
      <div class="divider"></div>
      <!-- History area -->
      <div class="history-box flex-unit">
        <div class="">
          <div class="date">{{ $t("home.today") }}</div>
          <ul>
            <li
              v-for="(item, index) in todayThreads"
              :key="index"
              class="chat-item"
              :class="{ active: activeThreadIndex === index }"
              @click="setActiveThreadIndex(index)"
            >
              <div class="chat-abbr">
                {{ firstMessages[index] }}
              </div>
              <div class="chat-ops flex-row">
                <img src="../../public/images/avatar.png" />
                <div class="name flex-unit">
                  {{ assistantOfThread[index].name || "" }}
                </div>
                <i class="iconfont icon-delete" @click="delThread(index)"></i>
              </div>
            </li>
          </ul>
          <div class="date" v-if="previousThreads.length > 0">
            {{ $t("home.previous") }}
          </div>
          <ul>
            <li
              v-for="(item, index) in previousThreads"
              :key="index"
              class="chat-item"
              :class="{
                active: activeThreadIndex === index + todayThreads.length,
              }"
              @click="setActiveThreadIndex(index + todayThreads.length)"
            >
              <div class="chat-abbr">
                {{ firstMessages[index + todayThreads.length] }}
              </div>
              <div class="chat-ops flex-row">
                <img src="../../public/images/avatar.png" />
                <div class="name flex-unit">
                  {{
                    assistantOfThread[index + todayThreads.length].name || ""
                  }}
                </div>
                <i
                  class="iconfont icon-delete"
                  @click="delThread(index + todayThreads.length)"
                ></i>
              </div>
            </li>
          </ul>
        </div>
      </div>
      <div class="icon-box example-2">
        <div class="iconhub icon-content" @click="navigateToIconHub">
          <svg
            xmlns="http://www.w3.org/2000/svg"
            width="16"
            height="16"
            fill="currentColor"
            class="bi bi-github"
            viewBox="0 0 16 16"
            xml:space="preserve"
          >
            <path
              d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27s1.36.09 2 .27c1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.01 8.01 0 0 0 16 8c0-4.42-3.58-8-8-8"
              fill="currentColor"
            ></path>
          </svg>
          <div class="tooltip">GitHub</div>
        </div>
        <div class="iconlanguage" @click="changeLanguage">
          <svg
            v-if="!flag"
            t="1719306572024"
            class="icon"
            viewBox="0 0 1024 1024"
            version="1.1"
            xmlns="http://www.w3.org/2000/svg"
            p-id="16849"
            data-spm-anchor-id="a313x.search_index.0.i21.366e3a81tz0TYS"
            width="18"
            height="18"
          >
            <path
              d="M64.064 768V192H448.64v64H127.936v192h320v64h-320v192h320v64H64.064z m511.872 0V192h64l256 447.68V192h64v576h-64l-256-447.168V768h-64z"
              p-id="16850"
              data-spm-anchor-id="a313x.search_index.0.i22.366e3a81tz0TYS"
              class="selected"
              fill="#000000"
            ></path>
          </svg>
          <svg
            v-else
            t="1719306494614"
            class="icon"
            viewBox="0 0 1024 1024"
            version="1.1"
            xmlns="http://www.w3.org/2000/svg"
            p-id="12325"
            width="18"
            height="18"
          >
            <path
              d="M1023.488 831.552h-96l-265.472-451.904c-8.96-12.8-16-25.344-21.44-37.888H638.08c2.176 12.992 3.2 40.128 3.2 81.408v408.32L576 836.928V256h101.568l257.024 445.632c14.592 20.992 23.232 34.368 25.92 40.128h1.6c-2.688-16.512-4.032-44.8-4.032-84.736v-399.36L1024 256l-0.512 575.552zM435.008 804.224c-42.752 21.76-96.384 32.64-160.896 32.64-83.2 0-149.76-25.6-199.488-76.736C24.896 708.928 0 641.344 0 557.12c0-90.432 27.968-163.2 84.032-218.368C140.032 283.52 211.072 256 297.344 256c55.552 0 101.376 7.616 137.6 22.848v75.84a284.992 284.992 0 0 0-136.832-33.408c-64.768 0-117.504 20.864-158.208 62.592-40.768 41.728-61.184 98.048-61.184 168.96 0 67.2 19.008 120.576 57.024 160.128 38.016 39.552 87.744 59.328 149.248 59.328 57.536 0 107.52-12.544 150.016-37.76v69.696z"
              fill="#000000"
              p-id="12326"
              data-spm-anchor-id="a313x.search_index.0.i16.366e3a81tz0TYS"
              class="selected"
            ></path>
          </svg>
        </div>
      </div>
    </nav>
    <router-view v-slot="{ Component }" class="main-panel flex-unit">
      <component
        :is="Component"
        :chatInit="chatInit"
        :activeAssistant="activeAssistant"
        :activeThread="activeThread"
        :messages="allMessageInCurrentThread"
        :completedAssistant="assistantList"
        :inputDisabled="inputDisabled"
        @updateAssistant="handleUpdateAssistant"
      />
    </router-view>
  </div>
</template>

<script lang="ts">
import { defineComponent, ref, onMounted, computed, nextTick } from "vue";
import {
  IThread,
  IAssistant,
  IMessageData,
  IThreadAndMessageAndAssistant,
  IAssistantWithStatus,
} from "@/utils/types";
import { listThreads, deleteThread, getThread } from "@/api/thread";
import { ElMessage, ElMessageBox } from "element-plus";
import { listAssistants } from "@/api/assistant";
import { listMessages } from "@/api/message";
import { useRouter } from "vue-router";
import BScroll from "better-scroll";
import { useI18n } from "vue-i18n";

export default defineComponent({
  name: "HomeView",
  setup() {
    const assistantList = ref<IAssistant[]>([]);
    const threadsList = ref<IThread[]>([]);
    const firstMessages = ref<string[]>([]);
    const activeAssistant = ref({} as IAssistant);
    const assistantOfThread = ref<IAssistantWithStatus[]>([]);
    const threadAndMessages = ref<IThreadAndMessageAndAssistant[]>([]);
    const assistantScroll = ref<BScroll | null>(null);
    const historyScroll = ref<BScroll | null>(null);
    const router = useRouter();
    const { t, locale } = useI18n();
    const flag = ref(true);
    const changeLanguage = () => {
      if (flag.value) {
        locale.value = "zh";
        localStorage.setItem("lang", "zh");
        flag.value = false;
      } else {
        locale.value = "en";
        flag.value = true;
        localStorage.setItem("lang", "en");
      }
    };
    // Initialize data
    const initData = async () => {
      try {
        threadsList.value = [];
        firstMessages.value = [];
        assistantOfThread.value = [];

        const assistantsRes = await listAssistants();
        if (assistantsRes && assistantsRes.length > 0) {
          assistantList.value = assistantsRes;
          activeAssistant.value = assistantsRes[0];
        }

        const threadsRes = await listThreads(100);
        if (threadsRes) {
          threadAndMessages.value = threadsRes;
          for (let t of threadsRes) {
            if (t.thread && !t.thread.metadata?.hidden) {
              threadsList.value.push(t.thread);
              if (
                t.first_message &&
                t.first_message.content &&
                t.first_message.content.length > 0
              ) {
                firstMessages.value.push(t.first_message.content[0].text.value);
              } else {
                firstMessages.value.push("no message yet");
              }
              assistantOfThread.value.push(
                t.assistant || ({} as IAssistantWithStatus)
              );
            }
          }
        }

        assistantScroll.value = new BScroll(".assistant-list", {
          click: true,
          mouseWheel: true,
          scrollbar: {
            fade: true,
            interactive: true,
          },
        });

        historyScroll.value = new BScroll(".history-box", {
          click: true,
          mouseWheel: true,
          scrollbar: {
            fade: true,
            interactive: true,
          },
        });
      } catch (err) {
        console.error("Failed to initialize data:", err);
      }
    };
    const navigateToIconHub = () => {
      window.open("https://github.com/kvcache-ai/Lexllama");
    };
    const isEmptyObject = (obj: object): boolean => {
      //Determine if the object is empty
      return Object.keys(obj).length === 0;
    };
    //Jump route
    const navigateToExplore = () => {
      router.push("/explore");
    };
    const navigatorToChat = () => {
      router.push("/chat");
    };
    // Calculate date
    const todayThreads = computed(() => {
      const today = Math.floor(Date.now() / 1000);
      return threadsList.value.filter((thread) => {
        return today - thread.created_at <= 86400;
      });
    });
    const previousThreads = computed(() => {
      const today = Math.floor(Date.now() / 1000);
      return threadsList.value.filter((thread) => {
        return today - thread.created_at > 86400;
      });
    });

    onMounted(async () => {
      initData();
    });

    return {
      t,
      flag,
      assistantList,
      isEmptyObject,
      activeAssistant,
      navigateToExplore,
      navigatorToChat,
      threadsList,
      firstMessages,
      navigateToIconHub,
      assistantScroll,
      historyScroll,
      assistantOfThread,
      changeLanguage,
      initData,
      todayThreads,
      previousThreads,
    };
  },
  data() {
    return {
      projectName: "KTransformers",
      projectVersion: "v0.01",
      activeThreadIndex: -1,
      chatInit: true,
      activeThread: {} as IThread,
      allMessageInCurrentThread: [] as IMessageData[],
      inputDisabled: false,
      isSettingActiveThread: false,
      isDeletingThread: false,
      threadAndMessages: <IThreadAndMessageAndAssistant[]>[],
    };
  },
  methods: {
    setActiveAssistant(assistant: IAssistant) {
      this.chatInit = true;
      this.inputDisabled = false;
      this.activeThreadIndex = -1;
      this.activeAssistant = assistant;
      this.activeThread = {} as IThread;
      this.allMessageInCurrentThread = [];
      if (this.$route.path != "/chat") {
        this.navigatorToChat();
      }
    },
    async setActiveThreadIndex(index: number) {
      //If setting up an active thread, return directly
      if (this.isSettingActiveThread) {
        return;
      }
      this.isSettingActiveThread = true;
      this.activeThreadIndex = index;
      this.chatInit = false;
      this.inputDisabled = false;
      this.activeAssistant = {} as IAssistant;
      this.activeThread = this.threadsList[index];
      //If the assistant of the current thread is an empty object
      if (this.isEmptyObject(this.assistantOfThread[index])) {
        ElMessage({
          message: this.t("home.withoutAssistantTip"),
          type: "warning",
        });
        this.inputDisabled = true;
      }
      try {
        //Call asynchronous function to obtain the message list of the current thread
        const res = await listMessages(this.activeThread.id, 100, "asc");
        //Convert the obtained message list to the specified format and assign values to all messages of the current thread
        this.allMessageInCurrentThread = res.map((m) => ({
          role: m.role,
          content: m.content,
          assistant_id: m.assistant_id,
          created_at: m.created_at,
        }));
      } catch (err) {
        console.log(err);
      } finally {
        this.isSettingActiveThread = false;
      }
      if (this.$route.path != "/chat") {
        this.navigatorToChat();
      }
    },

    async delThread(index: number) {
      // If the thread is currently being deleted, return directly
      if (this.isDeletingThread) {
        return;
      }
      this.isDeletingThread = true;
      try {
        //Pop up a confirmation box and ask the user if they are sure to delete the thread
        await ElMessageBox.confirm(this.t("home.deleteThreadTip"), "Warning", {
          confirmButtonText: "OK",
          cancelButtonText: "Cancel",
          type: "warning",
        });

        const res = await deleteThread(this.threadsList[index].id);
        this.threadsList.splice(index, 1);
        this.firstMessages.splice(index, 1);
        this.assistantOfThread.splice(index, 1);
        // Jump to the first assistant or other suitable page
        this.setActiveAssistant(this.assistantList[0]);
        ElMessage({
          type: "success",
          message: "Delete completed",
        });
      } catch (err) {
        // Specific error handling, such as logging or displaying specific error messages to users
        console.error("Delete session failed:", err);
        ElMessage({
          type: "error",
          message: `Delete failed`, // Display specific error messages
        });
      } finally {
        this.isDeletingThread = false; //Ensure that the delete thread flag is reset no matter what
      }
    },
    // Handles the update of the assistant asynchronously.
    async handleUpdateAssistant(value: any) {
      await this.initData();
      if (this.activeThreadIndex != -1) {
        this.setActiveThreadIndex(this.activeThreadIndex);
      } else if (this.activeAssistant.id) {
        this.setActiveThreadIndex(0);
      } else {
        this.setActiveAssistant(this.assistantList[0]);
      }
    },
  },
});
</script>


<style lang="stylus" rel="stylesheet/stylus" scoped>
@import '../assets/css/mixins.styl';

.home {
  width: 100%;
  height: 100%;
  position: relative;
}

.left-panel {
  width: 320px;
  height: 100%;
  background-color: #363433;
  padding: 30px 30px;
  .logo-box {
    .logo {
      .img {
        width: 36px;
        height: 36px;
      }

      .text {
        font-size: 28px;
        font-weight: bold;
        margin-left: 10px;
        color: #edf2ea;
      }
    }

    .version {
      text-align: right;
      font-size: 14px;
      color: #bdbdbd;
    }
  }

  .divider {
    border-bottom: 1px solid #D7D7D7;
    width: 30%;
    margin: 30px auto;
  }

  .lang-box {
    position: relative;
    width: 100%;
    height: 30px;
    margin: auto;
    margin-bottom: 10px;

    .el-dropdown {
      font-size: 14px;
      position: absolute;
      top: 50%;
      left: 50%;
      transform: translate(-50%, -50%);
    }
  }

  .assistant-box {
    .assistant-list {
      min-height: 50px;
      max-height: 300px;
      overflow: hidden;
      position: relative;

      ul > li.assistant-item {
        padding: 8px 15px;
        color: #edf2ea;

        img {
          width: 32px;
          height: 32px;
        }

        .name {
          margin-left: 12px;
          font-size: 14px;
          color: #edf2ea;
        }

        i.iconfont {
          display: none;
          margin-left: 10px;
        }

        &:hover {
          background-color: $bg_gray_light_hover;
          cursor: pointer;
          border-radius: 4px;

          .name {
            color: #313433;
          }

          i.iconfont {
            display: block;
          }
        }
      }
    }

    .explore {
      position: relative;
      justify-content: center;
      display: flex;
      margin-top: 10px;

      .explore-btn {
        margin: 0 auto;
        padding: 0 20px;
        justify-content: center;
        height: 32px;
        line-height: 32px;
        background-color: #FFFFFF;
        border: 1px solid RGBA(0, 0, 0, 0.15);
        border-radius: 16px;

        i {
          color: #8080FF;
        }

        .text {
          color: #7F7F7F;
          margin-left: 4px;
        }

        &:hover {
          background-color: #FAFAFA;
          cursor: pointer;
        }
      }
    }
  }

  .history-box {
    position: relative;

    .date {
      font-size: 14px;
      color: #7F7F7F;
      margin: 8px 0;

      &:first-child {
        margin-top: 0;
      }
    }

    li.chat-item {
      padding: 12px 15px;
      cursor: pointer;
      background-color: #edf2ea;
      border-radius: 4px;
      margin-bottom: 10px;
      font-size: 16px;

      .chat-abbr {
        font-size: 14px;
        color: #313433;
        white-space: nowrap;
        overflow: hidden;
        text-overflow: ellipsis;
      }

      .chat-ops {
        display: flex;
        margin-top: 5px;

        img {
          width: 16px;
          height: 16px;
        }

        .name {
          font-size: 12px;
          color: #898989;
          margin-left: 8px;
        }

        i.iconfont {
          color: $gray_60;
        }
      }

      &:hover, &.active {
        transition: 0.3s all;
        cursor: pointer;
        background-color: #a2a79f;
        .chat-abbr {
          color: black;
        }

        .name, i.iconfont {
          color: black;
        }
      }
    }
  }

  .icon-box {
    width: 100%;
    display: flex;
    flex-direction: row;
    justify-content: flex-end;
    align-items: center;

    .iconhub {
      width: 32px;
      height: 24px;
      background: white;
      font-size: 30px;
      border: none;
      ovferflow: hidden;
      border-radius: 15%;
      display: flex;
      flex-direction: column;
      justify-content: center;
      align-items: center;
      color: #898989;
      transition: all 0.5s;
      cursor: pointer;
    }

    .iconhub:hover {
      background: #e5e5e5;
      text-decoration: none;
    }

    .iconlanguage {
      margin-left: 15px;
      width: 32px;
      height: 24px;
      background: white;
      font-size: 30px;
      border: none;
      ovferflow: hidden;
      border-radius: 15%;
      display: flex;
      flex-direction: column;
      justify-content: center;
      align-items: center;
      color: #898989;
      transition: all 0.5s;
      cursor: pointer;
    }

    .iconlanguage:hover {
      background: #e5e5e5;
      text-decoration: none;
    }
  }
}

ul {
  list-style: none;
}

.example-2 {
  display: flex;
  justify-content: center;
  align-items: center;
}

.example-2 .icon-content {
  margin: 0 10px;
  position: relative;
}

.example-2 .icon-content .tooltip {
  position: absolute;
  top: -30px;
  left: 50%;
  transform: translateX(-50%);
  color: #fff;
  padding: 6px 10px;
  border-radius: 5px;
  opacity: 0;
  visibility: hidden;
  font-size: 14px;
  transition: all 0.3s ease;
}

.example-2 .icon-content:hover .tooltip {
  opacity: 1;
  visibility: visible;
  top: -50px;
}

.main-panel {
  height: 100%;
  background-color: #f1f0ed;
}
</style>


================================================
FILE: kt-sft/ktransformers/website/tests/unit/example.spec.ts
================================================
import { shallowMount } from '@vue/test-utils'
import HelloWorld from '@/components/HelloWorld.vue'

describe('HelloWorld.vue', () => {
  it('renders props.msg when passed', () => {
    const msg = 'new message'
    const wrapper = shallowMount(HelloWorld, {
      props: { msg }
    })
    expect(wrapper.text()).toMatch(msg)
  })
})


================================================
FILE: kt-sft/ktransformers/website/tsconfig.json
================================================
{
  "compilerOptions": {
    "target": "es5",
    "module": "esnext",
    "strict": true,
    "jsx": "preserve",
    "importHelpers": true,
    "moduleResolution": "node",
    "skipLibCheck": true,
    "esModuleInterop": true,
    "allowSyntheticDefaultImports": true,
    "forceConsistentCasingInFileNames": true,
    "useDefineForClassFields": true,
    "sourceMap": true,
    "allowJs": true,
    "baseUrl": ".",
    "types": [
      "webpack-env",
      "jest"
    ],
    "paths": {
      "@/*": [
        "src/*"
      ]
    },
    "lib": [
      "esnext",
      "dom",
      "dom.iterable",
      "scripthost"
    ]
  },
  "include": [
    "src/**/*.ts",
    "src/**/*.tsx",
    "src/**/*.vue",
    "tests/**/*.ts",
    "tests/**/*.tsx",
    "config.d.ts"
  ],
 
  "exclude": [
    "node_modules"
  ]
}

================================================
FILE: kt-sft/ktransformers/website/vue.config.js
================================================

module.exports = {
  // 配置 webpack-dev-server 行为。
  devServer: {
    open: false, // 编译后默认打开浏览器
    host: '0.0.0.0',  // 域名
    port: 8082,  // 端口
    https: false,  // 是否https
    proxy: {
        '/api': {
          target: 'http://localhost:9016/v1', // 你的后端服务器地址
          changeOrigin: true, // 是否允许跨域
          pathRewrite: {
            '/api': '' // 将 '/api' 前缀替换为空，如果你的后端不需要这个前缀
          }
        }
      }
},
publicPath: '/web/',  // 基本路径
outputDir: 'dist', // 构建时的输出目录
assetsDir: 'static', // 放置静态资源的目录
indexPath: 'index.html', // html 的输出路径
filenameHashing: true, // 文件名哈希值
lintOnSave: false, // 是否在保存的时候使用 `eslint-loader` 进行检查。

// 组件是如何被渲染到页面中的？ （ast：抽象语法树；vDom：虚拟DOM）
// template ---> ast ---> render ---> vDom ---> 真实的Dom ---> 页面
// runtime-only：将template在打包的时候，就已经编译为render函数
// runtime-compiler：在运行的时候才去编译template
runtimeCompiler: false,

transpileDependencies: [], // babel-loader 默认会跳过 node_modules 依赖。
productionSourceMap: false, // 是否为生产环境构建生成 source map

//调整内部的 webpack 配置
configureWebpack: () => {},

chainWebpack: () => {},
  
}

================================================
FILE: kt-sft/merge_tensors/merge_safetensor_gguf.py
================================================
# this script targets to merge the fp8 safe tensor and the gguf quantized tensors.

import os
# insert the path of the project
import sys
# sys.path.insert(0, "/home/azure/ktransformers")
import argparse
import torch
from ktransformers.util.custom_loader import GGUFLoader, translate_name_to_gguf
from safetensors import safe_open
from safetensors.torch import save_file
import re
from collections import defaultdict

def read_safetensor_keys_from_folder(folder_path)->dict:
    """    
    :param folder_path: folder path
    :return: key_to_file_map
    """
    # check if the folder path is exist
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"GGUF dir not found: {folder_path}")
    if os.path.isfile(folder_path):
        folder_path = os.path.dirname(folder_path)
    
    key_to_file_map = {}

    found_safetensor = False
    for root, dirs, files in os.walk(folder_path):
        # sort files
        files = sorted(files)
        for file in files:
            if file.endswith(".safetensors"):
                found_safetensor = True
                file_path = os.path.join(root, file)
                try:
                    with safe_open(file_path, framework="pt") as f:
                        for key in f.keys():
                            if "model.layers.61" in key:
                                # skip MTP layer
                                continue
                            # try:
                            #     if int(key.split('.')[2]) > 4:
                            #         continue
                            # except:
                            #     pass
                            key_to_file_map[key] = file_path
                except Exception as e:
                    print(f"Error reading Safetensor file {file_path}: {e}")
    
    if not found_safetensor:
        raise FileNotFoundError(f"No Safetensor files found in {folder_path}")
    
    return key_to_file_map

tensor_from_gguf = [] # todo: add keys in gguf that should be used in the final tensor

def translate_name(name:str)->str:
    """
    :param name: name of the tensor
    :return: translated name
    """
    name = translate_name_to_gguf(name)
    name = name.replace(".up_proj.", ".ffn_up_exps.")
    name = name.replace(".down_proj.", ".ffn_down_exps.")
    name = name.replace(".gate_proj.", ".ffn_gate_exps.")
    name = name.replace(".ffn_gate_inp.e_score_correction_bias", ".exp_probs_b.bias") 
    return name
    

def combine_tensor_sources(safetensor_path:str, gguf_path:str):
    gguf_loader = GGUFLoader(gguf_path)
    gguf_tensor_file_map = gguf_loader.tensor_file_map
    safetensor_tensor_file_map = read_safetensor_keys_from_folder(safetensor_path)
    
    # build a map for the key to the tensor
    # according to the key, we can get the tensor from the file
    
    target_tensor_map = {}
    for key in safetensor_tensor_file_map.keys():
        # for all experts, we use the gguf tensor
        if ".mlp.experts." in key:
            if '.weight_scale_inv' in key:
                continue
            key = '.'.join(key.split('.')[:5]+key.split('.')[-2:])
            translated_key = translate_name(key)
            target_tensor_map[key] = gguf_tensor_file_map[translated_key]
            continue
        
        if any(target_key in key for target_key in tensor_from_gguf):
            target_tensor_map[key] = gguf_tensor_file_map[translate_name(key)]
        else:
            target_tensor_map[key] = safetensor_tensor_file_map[key]
    
    return target_tensor_map, gguf_loader

def write_combined_tensor(target_tensor_map: dict, output_path: str, gguf_loader: GGUFLoader):
    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)
    
    # Cache for safetensor file handles and GGUF loaders
    safetensors_cache = {}
    gguf_cache = {}
    
    # Group tensors by layer
    layer_groups = defaultdict(list)
    non_layer_keys = []
    layer_pattern = re.compile(r'\.layers\.(\d+)\.')
    
    for key in target_tensor_map:
        match = layer_pattern.search(key)
        if match:
            layer_num = int(match.group(1))
            layer_groups[layer_num].append(key)
        else:
            non_layer_keys.append(key)
    
    # Calculate total shards
    total_shards = len(layer_groups) + (1 if non_layer_keys else 0) - 1
    if total_shards == 0:
        raise ValueError("No tensors to save")
    
    shard_idx = 0
    
    # Save non-layer tensors to the first shard if they exist
    if non_layer_keys:
        tensors = {}
        for key in non_layer_keys:
            file_path = target_tensor_map[key]
            tensor = None
            ggml_type = None
            if file_path.endswith('.safetensors'):
                if file_path not in safetensors_cache:
                    safetensors_cache[file_path] = safe_open(file_path, framework='pt')
                f = safetensors_cache[file_path]
                tensor = f.get_tensor(key)
            elif file_path.endswith('.gguf'):
                gguf_name = translate_name(key)
                tensor, ggml_type = gguf_loader.get_undequanted_tensor_and_ggml_type(gguf_name)
            else:
                raise ValueError(f"Unsupported file format: {file_path}")
            tensors[translate_name(key)] = tensor
            if ggml_type:
                ggml_type = torch.tensor(ggml_type)
                ggml_key = translate_name(key)[:-7] + ".ggml_type" if translate_name(key).endswith(".weight") else translate_name(key) + ".ggml_type"
                tensors[ggml_key] = ggml_type
        
        output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors")
        print(f"Saving non-layer tensors to {output_file}")
        save_file(tensors, output_file)
        print(tensors.keys())

        shard_idx += 1
    
    # Save each layer's tensors to subsequent shards
    for layer_num in sorted(layer_groups.keys()):
        layer_keys = layer_groups[layer_num]
        tensors = {}
        for key in layer_keys:
            file_path = target_tensor_map[key]
            tensor = None
            ggml_type = None
            if file_path.endswith('.safetensors'):
                if file_path not in safetensors_cache:
                    safetensors_cache[file_path] = safe_open(file_path, framework='pt')
                f = safetensors_cache[file_path]
                tensor = f.get_tensor(key)
                tensor_info = tensor.shape
            elif file_path.endswith('.gguf'):
                gguf_name = translate_name(key)
                tensor, ggml_type = gguf_loader.get_undequanted_tensor_and_ggml_type(gguf_name)
                # tensor_info = gguf_loader.tensor_info[gguf_name]
                # ggml_type = gguf_loader.tensor_info[gguf_name]['ggml_type']
            else:
                raise ValueError(f"Unsupported file format: {file_path}")
            tensors[translate_name(key)] = tensor
            if ggml_type:
                ggml_type = torch.tensor(ggml_type)
                ggml_key = translate_name(key)[:-7] + ".ggml_type" if translate_name(key).endswith(".weight") else translate_name(key) + ".ggml_type"
                tensors[ggml_key] = ggml_type
        
        output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors")
        print(f"Saving layer {layer_num} to {output_file}")
        # print(tensors.keys())
        save_file(tensors, output_file)
        shard_idx += 1
    
    return
    
def main():
    # 创建命令行参数解析器
    parser = argparse.ArgumentParser(description="Read parameters from Safetensor and GGUF files")
    parser.add_argument("--safetensor_path", type=str, help="Path to the Safetensor file", default="/mnt/data/model/DeepSeek-V3")
    parser.add_argument("--gguf_path", type=str, help="Path to the GGUF file", default="/mnt/data/model/DeepseekV3-q4km-gguf")
    parser.add_argument("--output_path", type=str, help="Path to the output file", default="/mnt/data/model/ktrans-safetensors/DeepSeek-V3-q4km-fp8")
    
    # print all the arguments
    print("All the arguments:")
    print(parser.parse_args())
    
    # 解析命令行参数
    args = parser.parse_args()

    safetensor_path = args.safetensor_path
    gguf_path = args.gguf_path
    output_path = args.output_path
    
    target_tensor_map, gguf_loader = combine_tensor_sources(safetensor_path, gguf_path)
    write_combined_tensor(target_tensor_map, output_path, gguf_loader)
    
    return

if __name__ == "__main__":
    main()

================================================
FILE: kt-sft/pyproject.toml
================================================
[build-system]
requires = [
  "setuptools",
  "wheel",
  "cmake >= 3.20",
  "torch >= 2.3.0", 
  "ninja",
  "packaging",
  "cpufeature"
  ]
build-backend = "setuptools.build_meta"

[project]

name = "ktransformers"

dynamic = ["version"]

dependencies = [
  "torch >= 2.3.0",
  "transformers == 4.51.3",
  "peft == 0.14.0",
  "fastapi >= 0.111.0",
  "uvicorn >= 0.30.1",
  "langchain >= 0.2.0",
  "blessed >= 1.20.0",
  "accelerate >= 0.31.0",
  "sentencepiece >= 0.1.97",
  "setuptools",
  "ninja",
  "wheel",
  "colorlog",
  "build",
  "fire",
  "protobuf",
  "datasets",
  "torchviz",
]

requires-python = ">=3.10"

authors = [
  {name = "KVCache.AI", email = "zhang.mingxing@outlook.com"}
]

maintainers = [
  {name = "james0zan", email = "zhang.mingxing@outlook.com"},
  {name = "awake", email = "awake@approaching.ai"},
  {name = "unicorn chan", email = "nl@approaching.ai"}
]

description = "KTransformers, pronounced as Quick Transformers, is designed to enhance your Transformers experience with advanced kernel optimizations and placement/parallelism strategies."

readme = "README.md"
license = "Apache-2.0"
license-files = ["LICENSE"]

keywords = ["ktransformers", "llm"]

classifiers = [
  "Development Status :: 4 - Beta",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12"
]

[project.urls]
Homepage = "https://kvcache.ai"
Repository = "https://github.com/kvcache-ai/ktransformers.git"
Issues = "https://github.com/kvcache-ai/ktransformers/issues"


[project.scripts]
ktransformers = "ktransformers.server.main:main"

[tool.setuptools.packages.find]
where = ["./", ]
include = ["ktransformers","ktransformers.*"]
[tool.black]
line-length = 120
preview = true
unstable = true


================================================
FILE: kt-sft/requirements-sft.txt
================================================
absl-py==2.3.1
aiohappyeyeballs==2.6.1
aiohttp==3.11.18
aiosignal==1.3.2
attrs==25.3.0
colorama==0.4.6
conda-pack==0.8.1
datasets==3.6.0
dill==0.3.8
einops==0.8.1
frozenlist==1.6.0
graphviz==0.20.3
joblib==1.5.1
multidict==6.4.4
multiprocess==0.70.16
nltk==3.9.1
nvidia-cufile-cu12==1.11.1.6
pandas==2.2.3
peft==0.14.0
propcache==0.3.1
pyarrow==20.0.0
python-dateutil==2.9.0.post0
python_helper==0.3.74
pytz==2025.2
rouge_score==0.1.2
six==1.17.0
tabulate==0.9.0
thop==0.1.1.post2209072238
torchviz==0.0.3
tzdata==2025.2
xxhash==3.5.0
yarl==1.20.0
torchviz

================================================
FILE: kt-sft/setup.py
================================================
#!/usr/bin/env python
# coding=utf-8
'''
Description  :
Author       : chenxl
Date         : 2024-07-27 16:15:27
Version      : 1.0.0
LastEditors  : chenxl
LastEditTime : 2024-08-14 16:36:19
Adapted from:
https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
Copyright (c) 2023, Tri Dao.
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''

import os
import sys
import re
import ast
from collections import deque
import subprocess
import select
import time
import platform
import shutil
from typing import List, Optional, Literal
import http.client
import urllib.request
import urllib.error
from pathlib import Path
from packaging.version import parse
import torch
import torch.version
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
from setuptools import setup, Extension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
from packaging.requirements import Requirement
try:
    from torch_musa.utils.simple_porting import SimplePorting
    from torch_musa.utils.musa_extension import BuildExtension, MUSAExtension, MUSA_HOME
except ImportError:
    MUSA_HOME=None
try:
    import tomllib  # Py3.11+
except Exception:
    import tomli as tomllib  # 兼容老 Python

def _load_pyproject_deps():
    with open("pyproject.toml", "rb") as f:
        data = tomllib.load(f)
    return list(data.get("project", {}).get("dependencies", []) or [])

KTRANSFORMERS_BUILD_XPU = torch.xpu.is_available()

# 检测 DEV_BACKEND 环境变量
dev_backend = os.environ.get("DEV_BACKEND", "").lower()
if dev_backend == "xpu":
    triton_dep = [
        "pytorch-triton-xpu==3.3.0"
    ]
else:
    triton_dep = []

base_deps = _load_pyproject_deps()
combined_deps = base_deps + triton_dep


def _strip_req(reqs, name: str):
    out = []
    for r in reqs:
        try:
            rn = Requirement(r).name.lower()
        except Exception:
            rn = r.split()[0].lower()
        if rn != name.lower():
            out.append(r)
    return out

_tver = parse(torch.__version__)
_tlow = f"{_tver.major}.{_tver.minor}"
_thigh = f"{_tver.major}.{_tver.minor + 1}"
TORCH_RANGE = f"torch>={_tlow},<{_thigh}"
install_requires_pinned = _strip_req(combined_deps, "torch") + [TORCH_RANGE]

with_balance = os.environ.get("USE_BALANCE_SERVE", "0") == "1"

class CpuInstructInfo:
    CPU_INSTRUCT = os.getenv("CPU_INSTRUCT", "NATIVE")
    FANCY = "FANCY"
    AVX512 = "AVX512"
    AVX2 = "AVX2"
    CMAKE_NATIVE = "-DLLAMA_NATIVE=ON"
    CMAKE_FANCY = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON -DLLAMA_AVX512_FANCY_SIMD=ON"
    CMAKE_AVX512 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON"
    CMAKE_AVX2 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON"

class VersionInfo:
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    PACKAGE_NAME = "ktransformers"
    BASE_WHEEL_URL:str = (
        "https://github.com/kvcache-ai/ktransformers/releases/download/{tag_name}/{wheel_filename}"
    )
    FORCE_BUILD = os.getenv("KTRANSFORMERS_FORCE_BUILD", "FALSE") == "TRUE"

    def get_musa_bare_metal_version(self, musa_dir):
        raw_output = subprocess.run(
            [musa_dir + "/bin/mcc", "-v"], check=True,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout.decode("utf-8")
        output = raw_output.split()
        release_idx = output.index("version") + 1
        bare_metal_version = parse(output[release_idx].split(",")[0])
        musa_version = f"{bare_metal_version.major}{bare_metal_version.minor}"
        return musa_version

    def get_rocm_bare_metal_version(self, rocm_dir):
        """
        Get the ROCm version from the ROCm installation directory.

        Args:
            rocm_dir: Path to the ROCm installation directory

        Returns:
            A string representation of the ROCm version (e.g., "63" for ROCm 6.3)
        """
        try:
            # Try using rocm_agent_enumerator to get version info
            raw_output = subprocess.check_output(
                [rocm_dir + "/bin/rocminfo", "--version"],
                universal_newlines=True,
                stderr=subprocess.STDOUT)
            # Extract version number from output
            match = re.search(r'(\d+\.\d+)', raw_output)
            if match:
                version_str = match.group(1)
                version = parse(version_str)
                rocm_version = f"{version.major}{version.minor}"
                return rocm_version
        except (subprocess.CalledProcessError, FileNotFoundError):
            # If rocminfo --version fails, try alternative methods
            pass

        try:
            # Try reading version from release file
            with open(os.path.join(rocm_dir, "share/doc/hip/version.txt"), "r") as f:
                version_str = f.read().strip()
                version = parse(version_str)
                rocm_version = f"{version.major}{version.minor}"
                return rocm_version
        except (FileNotFoundError, IOError):
            pass

        # If all else fails, try to extract from directory name
        dir_name = os.path.basename(os.path.normpath(rocm_dir))
        match = re.search(r'rocm-(\d+\.\d+)', dir_name)
        if match:
            version_str = match.group(1)
            version = parse(version_str)
            rocm_version = f"{version.major}{version.minor}"
            return rocm_version

        # Fallback to extracting from hipcc version
        try:
            raw_output = subprocess.check_output(
                [rocm_dir + "/bin/hipcc", "--version"],
                universal_newlines=True,
                stderr=subprocess.STDOUT)
            match = re.search(r'HIP version: (\d+\.\d+)', raw_output)
            if match:
                version_str = match.group(1)
                version = parse(version_str)
                rocm_version = f"{version.major}{version.minor}"
                return rocm_version
        except (subprocess.CalledProcessError, FileNotFoundError):
            pass

        # If we still can't determine the version, raise an error
        raise ValueError(f"Could not determine ROCm version from directory: {rocm_dir}")

    def get_cuda_bare_metal_version(self, cuda_dir):
        raw_output = subprocess.check_output(
            [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
        output = raw_output.split()
        release_idx = output.index("release") + 1
        bare_metal_version = parse(output[release_idx].split(",")[0])
        cuda_version = f"{bare_metal_version.major}{bare_metal_version.minor}"
        return cuda_version

    def get_cuda_version_of_torch(self):
        torch_cuda_version = parse(torch.version.cuda)
        cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}"
        return cuda_version

    def get_platform(self,):
        """
        Returns the platform name as used in wheel filenames.
        """
        if sys.platform.startswith("linux"):
            return f'linux_{platform.uname().machine}'
        elif sys.platform == "win32":
            return "win_amd64"
        else:
            raise ValueError("Unsupported platform: {}".format(sys.platform))

    def get_cpu_instruct(self,):
        if CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.FANCY:
            return "fancy"
        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX512:
            return "avx512"
        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX2:
            return "avx2"
        else:
            print("Using native cpu instruct")
        if sys.platform.startswith("linux"):
            with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f:
                cpuinfo = cpu_f.read()
            flags_line = [line for line in cpuinfo.split(
                '\n') if line.startswith('flags')][0]
            flags = flags_line.split(':')[1].strip().split(' ')
            # fancy with AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI
            for flag in flags:
                if 'avx512bw' in flag:
                    return 'fancy'
            for flag in flags:
                if 'avx512' in flag:
                    return 'avx512'
            for flag in flags:
                if 'avx2' in flag:
                    return 'avx2'
            raise ValueError(
                "Unsupported cpu Instructions: {}".format(flags_line))
        elif sys.platform == "win32":
            from cpufeature.extension import CPUFeature

            if CPUFeature.get("AVX512bw", False):
                return 'fancy'
            if CPUFeature.get("AVX512f", False):
                return 'avx512'
            if CPUFeature.get("AVX2", False):
                return 'avx2'
            raise ValueError(
                "Unsupported cpu Instructions: {}".format(str(CPUFeature)))
        else:
            raise ValueError("Unsupported platform: {}".format(sys.platform))

    def get_torch_version(self,):
        torch_version_raw = parse(torch.__version__)
        torch_version = f"{torch_version_raw.major}{torch_version_raw.minor}"
        return torch_version

    def get_flash_version(self,):
        version_file = os.path.join(
            Path(VersionInfo.THIS_DIR), VersionInfo.PACKAGE_NAME, "__init__.py")
        with open(version_file, "r", encoding="utf-8") as f:
            version_match = re.search(
                r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE)
        flash_version = ast.literal_eval(version_match.group(1))
        return flash_version

    def get_package_version(self, full_version=False):
        flash_version = str(self.get_flash_version())
        torch_version = self.get_torch_version()
        cpu_instruct = self.get_cpu_instruct()
        backend_version = ""
        if CUDA_HOME is not None:
            backend_version = f"cu{self.get_cuda_version_of_torch()}"
        elif MUSA_HOME is not None:
            backend_version = f"mu{self.get_musa_bare_metal_version(MUSA_HOME)}"
        elif ROCM_HOME is not None:
            backend_version = f"rocm{self.get_rocm_bare_metal_version(ROCM_HOME)}"
        elif torch.xpu.is_available():
            backend_version = f"xpu"
        else:
            raise ValueError("Unsupported backend: CUDA_HOME MUSA_HOME ROCM_HOME all not set and XPU is not available.")
        package_version = f"{flash_version}+{backend_version}torch{torch_version}{cpu_instruct}"
        if full_version:
            return package_version
        if not VersionInfo.FORCE_BUILD:
            return flash_version
        return package_version


class BuildWheelsCommand(_bdist_wheel):
    def get_wheel_name(self,):
        version_info = VersionInfo()
        package_version = version_info.get_package_version(full_version=True)
        flash_version = version_info.get_flash_version()
        python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
        wheel_filename = f"{VersionInfo.PACKAGE_NAME}-{package_version}-{python_version}-{python_version}-{version_info.get_platform()}.whl"
        wheel_url = VersionInfo.BASE_WHEEL_URL.format(tag_name=f"v{flash_version}", wheel_filename=wheel_filename)
        return wheel_filename, wheel_url


    def run(self):
        if VersionInfo.FORCE_BUILD:
            super().run()
            return
        wheel_filename, wheel_url = self.get_wheel_name()
        print("Guessing wheel URL: ", wheel_url)
        try:
            urllib.request.urlretrieve(wheel_url, wheel_filename)
            # Make the archive
            # Lifted from the root wheel processing command
            # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85
            if not os.path.exists(self.dist_dir):
                os.makedirs(self.dist_dir)

            impl_tag, abi_tag, plat_tag = self.get_tag()
            archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}"

            wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
            print("Raw wheel path", wheel_path)
            shutil.move(wheel_filename, wheel_path)
        except (urllib.error.HTTPError, urllib.error.URLError, http.client.RemoteDisconnected):
            print("Precompiled wheel not found. Building from source...")
            # If the wheel could not be downloaded, build from source
            super().run()


ANSI_ESCAPE = re.compile(
    r'\033[@-Z\\-_\[\]P]|\033\[[0-?]*[ -/]*[@-~]|\033][^\007\033]*\007|[\000-\037]'
)

def colored(text, color=None, bold=False):
    fmt = []
    if color== 'red':
        fmt.append('31')
    elif color == 'green':
        fmt.append('32')
    if bold:
        fmt.append('1')

    return f"\033[{';'.join(fmt)}m{text}\033[0m"


def split_line(text: str) -> List[str]:
    """Split text into lines based on terminal width."""
    term_width = shutil.get_terminal_size().columns or 80
    if not text.strip():
        return []
    # Split by explicit newlines and wrap long lines
    lines = []
    for line in text.split('\n'):
        while len(line) > term_width:
            lines.append(line[:term_width])
            line = line[term_width:]
        if line:
            lines.append(line)
    return lines


ANSI_ESCAPE = re.compile(
    r'\033[@-Z\\-_\[\]P]|\033\[[0-?]*[ -/]*[@-~]|\033][^\007\033]*\007|[\000-\037]'
)

def colored(text, color=None, bold=False):
    fmt = []
    if color== 'red':
        fmt.append('31')
    elif color == 'green':
        fmt.append('32')
    if bold:
        fmt.append('1')

    return f"\033[{';'.join(fmt)}m{text}\033[0m"


def split_line(text: str) -> List[str]:
    """Split text into lines based on terminal width."""
    term_width = shutil.get_terminal_size().columns or 80
    if not text.strip():
        return []
    # Split by explicit newlines and wrap long lines
    lines = []
    for line in text.split('\n'):
        while len(line) > term_width:
            lines.append(line[:term_width])
            line = line[term_width:]
        if line:
            lines.append(line)
    return lines


def run_command_with_live_tail(ext: str, command: List[str], output_lines: int = 20,
                               refresh_rate: float = 0.1, cwd: Optional[str] = None):
    """
    Execute a script-like command with real-time output of the last `output_lines` lines.

    - during execution: displays the last `output_lines` lines of output in real-time.
    - On success: Clears the displayed output.
    - On failure: Prints the full command output.

    Args:
        ext (str): the name of the native extension currently building.
        command (List[str]): The command to execute, as a list of arguments.
        output_lines (int, optional): Number of terminal lines to display during live output. Defaults to 20.
        refresh_rate (float, optional): Time in seconds between output refreshes. Defaults to 0.1.
        cwd (Optional[str], optional): Working directory to run the command in. Defaults to current directory.
    """
    # Dump all subprocess output without any buffering if stdout is not a terminal
    if not sys.stdout.isatty():
        return subprocess.run(command, cwd=cwd, check=True)
    # Start time for elapsed time calculation
    start = time.time()
    # Buffer for all output
    all_output = []
    write_buffer = deque(maxlen=output_lines)
    # Current number of lines from sub process displayed
    current_lines = 0

    # ANSI escape codes for terminal control
    CLEAR_LINE = '\033[K'
    MOVE_UP = '\033[1A'
    SAVE_CURSOR = '\0337'
    RESTORE_CURSOR = '\0338'
    CLEAR_REMAINING = '\033[J'

    def write_progress(status: Literal['RUNNING', 'SUCCEED', 'FAILED'] = 'RUNNING',
                       new_line: Optional[str] = None):
        """Update terminal display with latest output"""
        nonlocal current_lines, process
        sys.stdout.write(SAVE_CURSOR)
        sys.stdout.write(MOVE_UP * current_lines)
        banner = f"ext={ext} pid={process.pid} status={status.upper()} elapsed=({time.time()-start:.2f}S)\n"
        if status != 'FAILED':
            banner = colored(banner, 'green', bold=True)
        else:
            banner = colored(banner, 'red', bold=True)
        sys.stdout.write(CLEAR_LINE + banner)
        if new_line is not None:
            all_output.append(new_line)
            write_buffer.extend(split_line(ANSI_ESCAPE.sub('', new_line).rstrip()))
        elif status == 'RUNNING':
            sys.stdout.write(RESTORE_CURSOR)
            sys.stdout.flush()
            return

        sys.stdout.write(CLEAR_REMAINING)
        if status == 'RUNNING':
            current_lines = 1 + len(write_buffer)
            for text in write_buffer:
                sys.stdout.write(text + '\n')
        elif status == 'FAILED':
            for text in all_output:
                sys.stdout.write(text)
        sys.stdout.flush()

    # Start subprocess
    sys.stdout.write(colored(f'ext={ext} command={" ".join(str(c) for c in command)}\n', bold=True))
    sys.stdout.flush()
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        cwd=cwd,
        text=True,
        bufsize=1
    )

    try:
        write_progress()
        poll_obj = select.poll()
        poll_obj.register(process.stdout, select.POLLIN)
        while process.poll() is None:
            poll_result = poll_obj.poll(refresh_rate * 1000)
            if poll_result:
                write_progress(new_line=process.stdout.readline())
            else:
                write_progress()

        # Get any remaining output
        while True:
            line = process.stdout.readline()
            if not line:
                break
            write_progress(new_line=line)
    except BaseException as e:
        process.terminate()
        raise e
    finally:
        exit_code = process.wait()
        write_progress(status='SUCCEED' if exit_code == 0 else 'FAILED')


# Convert distutils Windows platform specifiers to CMake -A arguments
PLAT_TO_CMAKE = {
    "win32": "Win32",
    "win-amd64": "x64",
    "win-arm32": "ARM",
    "win-arm64": "ARM64",
}


class CMakeExtension(Extension):
    def __init__(self, name: str, sourcedir: str) -> None:
        super().__init__(name, sources=[])
        print(name, sourcedir)
        self.sourcedir = sourcedir

def get_cmake_abi_args(cmake_args):
    if torch.compiled_with_cxx11_abi():
        cmake_args.append("-D_GLIBCXX_USE_CXX11_ABI=1")
    else:
        cmake_args.append("-D_GLIBCXX_USE_CXX11_ABI=0")
    return cmake_args

class CMakeBuild(BuildExtension):

    def build_extension(self, ext) -> None:
        if not isinstance(ext, CMakeExtension):
            super().build_extension(ext)
            return
        ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)
        extdir = ext_fullpath.parent.resolve()

        # Using this requires trailing slash for auto-detection & inclusion of
        # auxiliary "native" libs

        debug = int(os.environ.get("DEBUG", 0)
                    ) if self.debug is None else self.debug
        cfg = "Debug" if debug else "Release"

        # CMake lets you override the generator - we need to check this.
        # Can be set with Conda-Build, for example.
        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")

        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
        # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code
        # from Python.
        cmake_args = [
            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}",
            f"-DPYTHON_EXECUTABLE={sys.executable}",
            f"-DCMAKE_BUILD_TYPE={cfg}",  # not used on MSVC, but no harm
        ]

        if CUDA_HOME is not None:
            cmake_args += ["-DKTRANSFORMERS_USE_CUDA=ON"]
        elif MUSA_HOME is not None:
            cmake_args += ["-DKTRANSFORMERS_USE_MUSA=ON"]
        elif ROCM_HOME is not None:
            cmake_args += ["-DKTRANSFORMERS_USE_ROCM=ON"]
        elif KTRANSFORMERS_BUILD_XPU:
            cmake_args += ["-DKTRANSFORMERS_USE_XPU=ON", "-DKTRANSFORMERS_USE_CUDA=OFF"]
        else:
            raise ValueError("Unsupported backend: CUDA_HOME, MUSA_HOME, and ROCM_HOME are not set and XPU is not available.")
        
        cmake_args = get_cmake_abi_args(cmake_args)
        # log cmake_args
        print("CMake args:", cmake_args)

        build_args = []
        if "CMAKE_ARGS" in os.environ:
            cmake_args += [
                item for item in os.environ["CMAKE_ARGS"].split(" ") if item]

        if CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.FANCY:
            cpu_args = CpuInstructInfo.CMAKE_FANCY
        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX512:
            cpu_args = CpuInstructInfo.CMAKE_AVX512
        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX2:
            cpu_args = CpuInstructInfo.CMAKE_AVX2
        else:
            cpu_args = CpuInstructInfo.CMAKE_NATIVE

        cmake_args += [
            item for item in cpu_args.split(" ") if item
        ]
        # In this example, we pass in the version to C++. You might not need to.
        cmake_args += [
            f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]
        if self.compiler.compiler_type != "msvc":
            if not cmake_generator or cmake_generator == "Ninja":
                pass
                # try:
                #     import ninja

                #     ninja_executable_path = Path(ninja.BIN_DIR) / "ninja"
                #     cmake_args += [
                #         "-GNinja",
                #         f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
                #     ]
                # except ImportError:
                #     pass

        else:
            # Single config generators are handled "normally"
            single_config = any(
                x in cmake_generator for x in {"NMake", "Ninja"})

            # CMake allows an arch-in-generator style for backward compatibility
            contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})
            if not single_config and not contains_arch and cmake_generator:
                cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]]

            # Multi-config generators have a different way to specify configs
            if not single_config:
                cmake_args += [
                    f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"
                ]
                build_args += ["--config", cfg]

        if sys.platform.startswith("darwin"):
            # Cross-compile support for macOS - respect ARCHFLAGS if set
            archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
            if archs:
                cmake_args += [
                    "-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]

        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            cpu_count = os.cpu_count()
            if cpu_count is None:
                cpu_count = 1
            if hasattr(self, "parallel") and self.parallel:
                build_args += [f"--parallel={self.parallel}"]
            else:
                build_args += [f"--parallel={cpu_count}"]
        print("CMake args:", cmake_args)
        build_temp = Path(ext.sourcedir) / "build"
        print("build_temp:", build_temp)

        if not build_temp.exists():
            build_temp.mkdir(parents=True)
        run_command_with_live_tail(ext.name,
            ["cmake", ext.sourcedir, *cmake_args], cwd=build_temp
        )
        run_command_with_live_tail(ext.name,
            ["cmake", "--build", build_temp, "--verbose", *build_args], cwd=build_temp
        )

if CUDA_HOME is not None or ROCM_HOME is not None:
    ops_module = CUDAExtension('KTransformersOps', [
        'csrc/ktransformers_ext/cuda/custom_gguf/dequant.cu',
        'csrc/ktransformers_ext/cuda/binding.cpp',
        'csrc/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
    ],
    extra_compile_args={
            'cxx': ['-O3', '-DKTRANSFORMERS_USE_CUDA'],
            'nvcc': [
                '-O3',
                # '--use_fast_math',
                '-Xcompiler', '-fPIC',
                '-DKTRANSFORMERS_USE_CUDA',
            ]
        }
    )
elif MUSA_HOME is not None:
    SimplePorting(cuda_dir_path="csrc/ktransformers_ext/cuda", mapping_rule={
        # Common rules
        "at::cuda": "at::musa",
        "#include <ATen/cuda/CUDAContext.h>": "#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"",
        "#include <c10/cuda/CUDAGuard.h>": "#include \"torch_musa/csrc/core/MUSAGuard.h\"",
        "nv_bfloat16": "mt_bfloat16",
        }).run()
    ops_module = MUSAExtension('KTransformersOps', [
        'csrc/ktransformers_ext/cuda_musa/custom_gguf/dequant.mu',
        'csrc/ktransformers_ext/cuda_musa/binding.cpp',
        # TODO: Add Marlin support for MUSA.
        # 'csrc/ktransformers_ext/cuda_musa/gptq_marlin/gptq_marlin.mu'
    ],
    extra_compile_args={
            'cxx': ['force_mcc'],
            'mcc': [
                '-O3',
                '-DKTRANSFORMERS_USE_MUSA',
                '-DTHRUST_IGNORE_CUB_VERSION_CHECK',
            ]
        }
    )
elif torch.xpu.is_available(): #XPUExtension is not available now.
    ops_module = None
else:
    raise ValueError("Unsupported backend: CUDA_HOME ROCM_HOME MUSA_HOME are not set and XPU is not available.")

if not torch.xpu.is_available():
    ext_modules = [
        CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")),
        ops_module,
        CUDAExtension(
            'vLLMMarlin', [
                'csrc/custom_marlin/binding.cpp',
                'csrc/custom_marlin/gptq_marlin/gptq_marlin.cu',
                'csrc/custom_marlin/gptq_marlin/gptq_marlin_repack.cu',
            ],
            extra_compile_args={
                'cxx': ['-O3'],
                'nvcc': ['-O3', '-Xcompiler', '-fPIC'],
            },
        )
    ]
    if with_balance:
        print("using balance_serve")
        ext_modules.append(
            CMakeExtension("balance_serve", os.fspath(Path("").resolve()/ "csrc"/ "balance_serve"))
        )
else:
    ext_modules = [
        CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")),
    ]

setup(
    name=VersionInfo.PACKAGE_NAME,
    version=VersionInfo().get_package_version(),
    install_requires=install_requires_pinned,
    cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
    ext_modules=ext_modules
)


================================================
FILE: kt-sft/test_adapter/data_transfer.py
================================================
import json

converted_data = []
with open('/data/user23202791/lpl/LLaMA-Factory/examples/KT_used/translation.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        converted_data.append({
            "instruction": "",
            "input": data["问"],
            "output": data["答"]
        })

with open('/data/user23202791/lpl/LLaMA-Factory/examples/KT_used/sft_translation.json', 'w', encoding='utf-8') as f:
    json.dump(converted_data, f, ensure_ascii=False, indent=4)

================================================
FILE: kt-sft/test_adapter/infer_with_adapter.py
================================================
import torch
import os

checkpoint_dir = "/home/yj/ktransformers/test_adapter/demo_adapter_KT_target_module/checkpoint-6600"  # 请将此处替换为实际文件夹路径

for filename in os.listdir(checkpoint_dir):
    file_path = os.path.join(checkpoint_dir, filename)
    if filename.endswith(('.pt', '.bin', '.pth')):
        try:
            loaded_data = torch.load(file_path)
            print(f"===== 文件: {filename} =====")
            print(f"数据类型: {type(loaded_data)}")
            
            if isinstance(loaded_data, dict):
                print("字典包含的键:", list(loaded_data.keys()))
                # 示例：打印优化器状态的部分参数（若为优化器文件）
                if "state" in loaded_data and "param_groups" in loaded_data:
                    print("优化器示例参数：")
                    print("param_groups 前2项:", loaded_data["param_groups"][:2])
                    print("state 中前2个参数的状态:", list(loaded_data["state"].items())[:2])
            elif isinstance(loaded_data, torch.nn.Module):
                print("模块参数列表:")
                for name, param in loaded_data.named_parameters():
                    print(f"参数名: {name}, 形状: {param.shape}")
            else:
                print("数据内容预览:", loaded_data)
        except Exception as e:
            print(f"读取 {filename} 时出错: {str(e)}")

================================================
FILE: kt-sft/test_adapter/inspect_adapter.py
================================================
# -*- coding: utf-8 -*-
"""
inspect_adapter.py  ‒  查看 LoRA / Adapter checkpoint 信息
------------------------------------------------------------
示例：
  python inspect_adapter.py ./checkpoint
  python inspect_adapter.py ./checkpoint --show-params            # 打印全部权重行
  python inspect_adapter.py ./checkpoint --param lora_A.weight    # 只看某个权重
  python inspect_adapter.py ./checkpoint --dump-all               # 导出所有张量
"""
import argparse
import json
from pathlib import Path

import torch
from safetensors.torch import load_file as safe_load
from tabulate import tabulate


def load_json(p: Path):
    with open(p, "r", encoding="utf-8") as f:
        return json.load(f)


def human_readable(num: int) -> str:
    for unit in ["", "K", "M", "B"]:
        if abs(num) < 1000:
            return f"{num:,.0f}{unit}"
        num /= 1000
    return f"{num:.1f}T"


def inspect_adapter_weights(weight_path: Path):
    """
    读取 adapter_model.safetensors / .bin / .pt
    返回 (rows, total_params, state) 三元组
    """
    if weight_path.suffix == ".safetensors":
        state = safe_load(str(weight_path))
    else:
        state = torch.load(str(weight_path), map_location="cpu")

    rows, total = [], 0
    for name, tensor in state.items():
        n = tensor.numel()
        total += n
        rows.append([
            name,
            list(tensor.shape),
            str(tensor.dtype).replace("torch.", ""),
            human_readable(n)
        ])
    rows.sort(key=lambda x: x[0])
    return rows, total, state


def maybe_print_optimizer(optimizer_pt: Path, max_keys: int = 20):
    try:
        opt_state = torch.load(str(optimizer_pt), map_location="cpu")
    except Exception as e:
        print(f"[optimizer.pt] 读取失败：{e}")
        return
    print("\n====== optimizer.pt 结构 (部分) ======")
    if isinstance(opt_state, dict):
        for i, k in enumerate(opt_state.keys()):
            if i >= max_keys:
                print("... (省略)")
                break
            print(f"{k}: type={type(opt_state[k])}")
    else:
        print(f"type={type(opt_state)} 非典型，请自行查看。")


def maybe_print_scheduler(scheduler_pt: Path, max_keys: int = 20):
    try:
        sch_state = torch.load(str(scheduler_pt), map_location="cpu")
    except Exception as e:
        print(f"[scheduler.pt] 读取失败：{e}")
        return
    print("\n====== scheduler.pt 结构 (部分) ======")
    if isinstance(sch_state, dict):
        for i, (k, v) in enumerate(sch_state.items()):
            if i >= max_keys:
                print("... (省略)")
                break
            print(f"{k}: type={type(v)}")
    else:
        print(f"type={type(sch_state)} 非典型，请自行查看。")


def maybe_print_rng(rng_pth: Path):
    try:
        rng = torch.load(str(rng_pth), map_location="cpu")
    except Exception as e:
        print(f"[rng_state.pth] 读取失败：{e}")
        return
    print("\n====== rng_state.pth 键列表 ======")
    if isinstance(rng, dict):
        for k in rng.keys():
            print(f"- {k}")
    else:
        print(f"type={type(rng)} 非典型，请自行查看。")


def dump_tensors(state: dict, out_dir="tensor_dump"):
    """
    将 state 的每个张量写入 txt（repr）并可选保存二进制 .pt
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(exist_ok=True)
    torch.set_printoptions(sci_mode=False, linewidth=180)

    for name, tensor in state.items():
        safe_name = name.replace("/", "_")
        txt_path = out_dir / f"{safe_name}.txt"
        with open(txt_path, "w") as f:
            f.write(repr(tensor))

        # 若需要二进制，取消下一行注释
        # torch.save(tensor, out_dir / f"{safe_name}.pt")

    print(f"[done] 已把 {len(state)} 个张量写入 {out_dir}/")


def main():
    parser = argparse.ArgumentParser(
        description="检查 LoRA / Adapter checkpoint 内容")
    parser.add_argument("ckpt_dir", type=str,
                        help="包含 adapter_config.json / adapter_model.safetensors 的目录")
    parser.add_argument("--show-params", action="store_true",
                        help="打印所有权重摘要（默认只显示前 30 行）")
    parser.add_argument("--param", type=str,
                        help="仅打印指定参数的完整张量")
    parser.add_argument("--dump-all", action="store_true",
                        help="把所有张量完整写入文件夹")
    args = parser.parse_args()

    d = Path(args.ckpt_dir).expanduser()
    if not d.exists():
        raise FileNotFoundError(d)

    # ========== adapter_config.json ==========
    cfg_path = d / "adapter_config.json"
    if cfg_path.exists():
        print("====== adapter_config.json ======")
        print(json.dumps(load_json(cfg_path), indent=2, ensure_ascii=False))
    else:
        print("未找到 adapter_config.json")

    # ========== trainer_state.json ==========
    ts_path = d / "trainer_state.json"
    if ts_path.exists():
        ts = load_json(ts_path)
        print("\n====== trainer_state.json (节选) ======")
        sel = {k: ts.get(k, None) for k in
               ["global_step", "best_metric", "best_model_checkpoint", "log_history"]}
        if isinstance(sel.get("log_history"), list) and len(sel["log_history"]) > 3:
            sel["log_history"] = sel["log_history"][-3:]
        print(json.dumps(sel, indent=2, ensure_ascii=False))
    else:
        print("\n未找到 trainer_state.json")

    # ========== adapter_model.* ==========
    st_path = next((d / n for n in
                   ["adapter_model.safetensors", "adapter_model.bin", "adapter_model.pt"]
                   if (d / n).exists()), None)

    if st_path is None:
        print("\n未找到 adapter_model.* (safetensors/bin/pt)")
        state = {}
    else:
        rows, total, state = inspect_adapter_weights(st_path)

        # 若用户指定 --param，仅打印该张量
        if args.param is not None:
            if args.param not in state:
                raise KeyError(f"参数 {args.param!r} 不存在！")
            torch.set_printoptions(sci_mode=False, linewidth=180, profile="full")
            print(f"\n====== {args.param} 的完整张量 ======")
            print(state[args.param])
            return  # 提前结束

        print(f"\n====== {st_path.name} 中的可训练参数（共 {human_readable(total)} 个元素）======")
        if args.show_params:
            print(tabulate(rows, headers=["参数名", "形状", "dtype", "元素数"], tablefmt="github"))
        else:
            head = rows[:30]
            print(tabulate(head, headers=["参数名", "形状", "dtype", "元素数"], tablefmt="github"))
            if len(rows) > 30:
                print(f"... 还有 {len(rows) - 30} 个参数未展示，使用 --show-params 查看全部。")

        # --dump-all 时将所有张量写文件
        if args.dump_all:
            dump_tensors(state, out_dir=f"{st_path.stem}_dump")

    # ========== 其它 state_dict ==========
    if (d / "optimizer.pt").exists():
        maybe_print_optimizer(d / "optimizer.pt")
    if (d / "scheduler.pt").exists():
        maybe_print_scheduler(d / "scheduler.pt")
    if (d / "rng_state.pth").exists():
        maybe_print_rng(d / "rng_state.pth")

    print("\nDone.")


if __name__ == "__main__":
    main()


================================================
FILE: kt-sft/test_adapter/pred2metrics.py
================================================
import json
import argparse
from pathlib import Path
from ktransformers.sft.metrics import ComputeSimilarity
from transformers import AutoTokenizer
from transformers.trainer_utils import EvalPrediction

def load_pred_ref(pred_file: Path):
    data = json.loads(pred_file.read_text(encoding="utf-8"))
    preds, refs = [], []
    for it in data:
        preds.append("" if it.get("prediction") is None else str(it.get("prediction")))
        refs.append("" if it.get("label") is None else str(it.get("label")))
    return preds, refs

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--pred-file", type=str, required=True)
    parser.add_argument("--output-dir", type=str, required=True)
    parser.add_argument("--tokenizer", type=str, required=True)
    args = parser.parse_args()

    pred_file = Path(args.pred_file)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    metric_file = output_dir / "metrics.json"

    preds, refs = load_pred_ref(pred_file)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True)
    compute_metrics = ComputeSimilarity(tokenizer)
    enc_pred = tokenizer(preds, add_special_tokens=False, padding=True, return_tensors="np")
    enc_ref  = tokenizer(refs,  add_special_tokens=False, padding=True, return_tensors="np")
    ep = EvalPrediction(predictions=enc_pred["input_ids"], label_ids=enc_ref["input_ids"])
    metrics = compute_metrics(ep, compute_result=True)

    with metric_file.open("w", encoding="utf-8") as f:
        json.dump(metrics, f, ensure_ascii=False, indent=2)

    print(f"[OK] sample length: {len(preds)}")
    print(f"[OK] saved to: {metric_file}")

if __name__ == "__main__":
    main()


================================================
FILE: kt-sft/test_adapter/test_grad.py
================================================
import torch, glob

records = sorted(glob.glob("/home/lpl/kt-sft/tmp/train_logs/step_*.pt"))
example = torch.load(records[1])

# print("step:", example["step"])
# print("inputs keys:", list(example["inputs"].keys()))
# print("loss:", example["loss"])


# print("param 'base_model.model.model.orig_module.layers.1.mlp.orig_module.gate.weight' 形状:",
#       example["params"]["base_model.model.model.orig_module.layers.1.mlp.orig_module.gate.weight"].shape)
# print("grad 'base_model.model.model.orig_module.layers.1.mlp.orig_module.gate.weight':", example["grads"]["base_model.model.model.orig_module.layers.1.mlp.orig_module.gate.weight"])

print(example)


================================================
FILE: kt-sft/test_adapter/time_test_lora_train.py
================================================
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)


with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    with record_function("model_inference"):
        model(inputs)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
# ---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
#                              Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
# ---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
#          aten::mkldnn_convolution        73.87%      37.241ms        74.04%      37.326ms       7.465ms       9.25 Mb           0 b             5  
#                       aten::addmm        12.98%       6.545ms        13.11%       6.609ms       2.203ms     179.53 Kb     179.53 Kb             3  
#     aten::max_pool2d_with_indices         6.63%       3.343ms         6.63%       3.343ms       1.114ms       5.05 Mb       5.05 Mb             3  
#                   aten::clamp_min         2.12%       1.071ms         2.12%       1.071ms     153.000us           0 b           0 b             7  
#                  aten::bernoulli_         1.20%     607.000us         1.23%     622.000us     311.000us           0 b    -260.00 Kb             2  
# ---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
# Self CPU time total: 50.416ms

print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))
# ---------------------------------  ------------  -------------------------------------------
#                              Name     CPU total                                 Input Shapes
# ---------------------------------  ------------  -------------------------------------------
#                   model_inference      57.503ms                                           []
#                      aten::conv2d       8.008ms      [5,64,56,56], [64,64,3,3], [], ..., []]
#                 aten::convolution       7.956ms     [[5,64,56,56], [64,64,3,3], [], ..., []]  #卷积统计
#                aten::_convolution       7.909ms     [[5,64,56,56], [64,64,3,3], [], ..., []]
#          aten::mkldnn_convolution       7.834ms     [[5,64,56,56], [64,64,3,3], [], ..., []]
#                      aten::conv2d       6.332ms    [[5,512,7,7], [512,512,3,3], [], ..., []]
#                 aten::convolution       6.303ms    [[5,512,7,7], [512,512,3,3], [], ..., []]  #卷积统计
#                aten::_convolution       6.273ms    [[5,512,7,7], [512,512,3,3], [], ..., []]
#          aten::mkldnn_convolution       6.233ms    [[5,512,7,7], [512,512,3,3], [], ..., []]
#                      aten::conv2d       4.751ms  [[5,256,14,14], [256,256,3,3], [], ..., []]
# ---------------------------------  ------------  -------------------------------------------
# Self CPU time total: 57.549ms

model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()

with profile(activities=[
        ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    with record_function("model_inference"):
        model(inputs)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

# -------------------------------------------------------  ------------  ------------
#                                                    Name     Self CUDA    CUDA total
# -------------------------------------------------------  ------------  ------------
#                                         model_inference       0.000us      11.666ms
#                                            aten::conv2d       0.000us      10.484ms
#                                       aten::convolution       0.000us      10.484ms
#                                      aten::_convolution       0.000us      10.484ms
#                              aten::_convolution_nogroup       0.000us      10.484ms
#                                       aten::thnn_conv2d       0.000us      10.484ms
#                               aten::thnn_conv2d_forward      10.484ms      10.484ms
# void at::native::im2col_kernel<float>(long, float co...       3.844ms       3.844ms
#                                       sgemm_32x32x32_NN       3.206ms       3.206ms
#                                   sgemm_32x32x32_NN_vec       3.093ms       3.093ms
# -------------------------------------------------------  ------------  ------------
# Self CPU time total: 23.015ms
# Self CUDA time total: 11.666ms

model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

with profile(activities=[ProfilerActivity.CPU],
        profile_memory=True, record_shapes=True) as prof:
    model(inputs)

print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10)) # 算子自身使用的内存总量，不包括子算子

print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))


model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    model(inputs)

prof.export_chrome_trace("trace.json")


with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    with_stack=True,
) as prof:
    model(inputs)

# Print aggregated stats
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=2)) # 启用stack tracing会带来额外开销
# -------------------------  -----------------------------------------------------------
#                      Name  Source Location
# -------------------------  -----------------------------------------------------------
# aten::thnn_conv2d_forward  .../torch/nn/modules/conv.py(439): _conv_forward
#                            .../torch/nn/modules/conv.py(443): forward
#                            .../torch/nn/modules/module.py(1051): _call_impl
#                            .../site-packages/torchvision/models/resnet.py(63): forward
#                            .../torch/nn/modules/module.py(1051): _call_impl
# aten::thnn_conv2d_forward  .../torch/nn/modules/conv.py(439): _conv_forward
#                            .../torch/nn/modules/conv.py(443): forward
#                            .../torch/nn/modules/module.py(1051): _call_impl
#                            .../site-packages/torchvision/models/resnet.py(59): forward
#                            .../torch/nn/modules/module.py(1051): _call_impl
# -------------------------  -----------------------------------------------------------
# Self CPU time total: 34.016ms
# Self CUDA time total: 11.659ms

================================================
FILE: kt-sft/withoutKT_PEFT.py
================================================
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig, inject_adapter_in_model, TaskType
from transformers import Trainer, TrainingArguments
from datasets import Dataset, load_dataset
import transformers
from transformers.trainer import TRAINING_ARGS_NAME
import os
import torch
from datasets import load_dataset, Dataset, DatasetDict
from torch.utils.data import DataLoader
from torchviz import make_dot

# 加载 tokenizer 和模型
tokenizer = AutoTokenizer.from_pretrained('/home/yj/ktransformers/DeepSeek-V2-Lite-Chat', trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained('/data/model/Qwen2.5-7B-Instruct', trust_remote_code=True)
save_path = '/home/yj/ktransformers/tmp/Qwen_Lora_model'
data_file = '/home/yj/ktransformers/test_adapter/sft_translation.json'

dataset = Dataset.from_json(data_file)

def preprocess_function(examples):
    inputs = examples["input"]
    targets = examples["output"]
    
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)
    labels = tokenizer(targets, padding="max_length", truncation=True, max_length=512)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def print_model_with_params(model, prefix="", max_layers=3, max_params=5):
    print(f"\n{prefix}模型结构:")
    print(model)  # 原始结构打印
    
    print(f"\n{prefix}参数示例:")
    total_params = 0
    for name, param in model.named_parameters():
        if total_params >= max_layers:  # 控制打印层数
            break
        # 过滤非LoRA相关参数（可根据需要调整）
        if "lora" not in name and "embed" not in name and "proj" not in name:
            continue
        print(f"层名: {name}")
        print(f"形状: {param.shape}")
        print(f"数据类型: {param.dtype}")
        print(f"参数示例值 (前{max_params}个): {param.data.flatten()[:max_params].cpu().numpy()}\n")
        total_params += 1

processed_dataset = dataset.map(preprocess_function, batched=True)
split_dataset = processed_dataset.train_test_split(test_size=0.1)

train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)

model = AutoModelForCausalLM.from_pretrained(
    '/home/yj/ktransformers/DeepSeek-V2-Lite-Chat', 
    trust_remote_code=True,
    torch_dtype=torch.float16)
# model = AutoModelForCausalLM.from_pretrained('/data/model/Qwen2.5-7B-Instruct', trust_remote_code=True)

print_model_with_params(model, prefix="原始模型")

# 配置 LoRA
lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        target_modules=[
            # "q_proj"
            "kv_a_proj_with_mqa",
            "kv_b_proj",
            # "o_proj"
        ],
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
    )

model = get_peft_model(model, lora_config)
# model = inject_adapter_in_model(lora_config, model)

for name, parms in model.named_parameters():	
        print('-->name:', name)
        print('-->para:', parms)
        print('-->grad_requirs:',parms.requires_grad)
        print('-->grad_fn:',parms.grad_fn)
        print('-->grad_value:',parms.grad)
        print("===")

# print(model)

model.train()

# for name, parms in model.named_parameters():	
#         print('-->name:', name)
#         print('-->para:', parms)
#         print('-->grad_requirs:',parms.requires_grad)
#         print('-->grad_fn:',parms.grad_fn)
#         print('-->grad_value:',parms.grad)
#         print("===")

model.to(device='cuda')
x = torch.tensor([[1,2,3]], dtype=torch.int32).to("cuda")
output = model(x)
loss = output.logits.mean()
print(f"output:{output}")
print(f"loss:{loss}")

# output = model(input_ids=torch.tensor([[1,2,3]], dtype=torch.int32))
# loss = output.logits.mean()
# # print_grad_fn(loss.grad_fn)
# # 生成计算图
dot = make_dot(loss, params=dict(model.named_parameters()))
dot.render("PEFT_compute_one_layer_model_graph", format="svg")  # 保存为SVG格式的文件

# 暂时先不训练
# model = model.to('cuda')
# model.config.use_cache = False

# # 定义训练参数
# training_args = TrainingArguments(
#     output_dir='./results',         # 模型保存和日志输出的目录路径
#     num_train_epochs=3,             # 训练的总轮数（epochs）
#     per_device_train_batch_size=1, # 每个设备（如GPU或CPU）上的训练批次大小，16表示每次输入模型的数据数量
#     learning_rate=5e-5,             # 学习率
#     logging_steps=10,               # 每隔多少步（steps）进行一次日志记录
#     save_steps=100,                 # 每隔多少步保存模型
#     save_total_limit=2,             # 保留最近的两个模型
#     fp16=True,                   
# )

class KTrainer(Trainer):
    def save_model(self, output_dir=None, _internal_call=False):
        # 改写trainer的save_model，在checkpoint的时候只存lora权重
        os.makedirs(output_dir, exist_ok=True)
        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
        saved_params = {
            k: v.to("cpu") for k, v in self.model.named_parameters() if v.requires_grad
        }
        torch.save(saved_params, os.path.join(output_dir, "adapter_model.bin"))

trainer = KTrainer(
    model=model,
    train_dataset=train_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=8,
        gradient_accumulation_steps=16,
        num_train_epochs=10,
        learning_rate=3e-4,
        fp16=True,
        logging_steps=10,
        save_steps=200,
        output_dir=save_path
    ),
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)

trainer.train()
# model.save_pretrained(save_path)

# print_model_with_params(model, prefix="LoRA微调模型")

# model.print_trainable_parameters() 

# model = model.merge_and_unload()

# print_model_with_params(model, prefix="合并后模型")

for name, parms in model.named_parameters():	
        print('-->name:', name)
        print('-->para:', parms)
        print('-->grad_requirs:',parms.requires_grad)
        print('-->grad_fn:',parms.grad_fn)
        print('-->grad_value:',parms.grad)
        print("===")

================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["setuptools>=61"]
build-backend = "setuptools.build_meta"

[project]
name = "ktransformers"
dynamic = ["version", "dependencies"]
description = "KTransformers: CPU-GPU heterogeneous inference framework for LLMs"
readme = "README.md"
authors = [{ name = "kvcache-ai" }]
license = "Apache-2.0"
requires-python = ">=3.8"
classifiers = [
  "Programming Language :: Python :: 3",
  "Operating System :: POSIX :: Linux",
]

[project.urls]
Homepage = "https://github.com/kvcache-ai/ktransformers"

[tool.setuptools]
# No actual Python packages — this is a meta-package
packages = []


================================================
FILE: setup.py
================================================
"""Meta-package: pip install ktransformers → installs kt-kernel + sglang-kt."""
from pathlib import Path
from setuptools import setup

_version_file = Path(__file__).resolve().parent / "version.py"
_ns = {}
exec(_version_file.read_text(), _ns)
_v = _ns["__version__"]

setup(
    version=_v,
    install_requires=[
        f"kt-kernel=={_v}",
        f"sglang-kt=={_v}",
    ],
)


================================================
FILE: third_party/llamafile/README.md
================================================
The code in this folder is copied from [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile). Special thanks to the Mozilla-Ocho team.


================================================
FILE: third_party/llamafile/bench.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/bench.h
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#pragma once

#include <stdio.h>

#include "micros.h"

#define BENCH(x)                                                                       \
    do {                                                                               \
        x;                                                                             \
        __asm__ volatile("" ::: "memory");                                             \
        long long start = micros();                                                    \
        for (int i = 0; i < ITERATIONS; ++i) {                                         \
            __asm__ volatile("" ::: "memory");                                         \
            x;                                                                         \
            __asm__ volatile("" ::: "memory");                                         \
        }                                                                              \
        printf("%9lld us %s\n", (micros() - start + ITERATIONS - 1) / ITERATIONS, #x); \
    } while (0)


================================================
FILE: third_party/llamafile/flags.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#include "flags.h"

bool FLAG_precise = false;


================================================
FILE: third_party/llamafile/flags.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#pragma once

extern bool FLAG_precise;


================================================
FILE: third_party/llamafile/iqk_mul_mat.inc
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat.inc
// Copyrigth 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp fenc=utf-8 :vi
//
// Copyright 2024 Iwan Kawrakow
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cstring>
#include <type_traits>
#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64)

#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "sgemm.h"

// For i-quants, I had to explicitely specify which
// functions to inline / not inline (at least for some
// of the functions), else performance would be significantly
// lower. This is worrysome as things can change with,
// e.g., a different compiler version or running on a different
// CPU.
#ifdef _MSC_VER
#define IQK_NOINLINE __declspec(noinline)
#define IQK_ALWAYS_INLINE inline
#else
#define IQK_NOINLINE __attribute__((__noinline__))
#define IQK_ALWAYS_INLINE __attribute__((always_inline))
#endif

#define GGML_COMMON_IMPL_C
#include "llama.cpp/ggml-common.h"

// clang-format off

// This matrix - vector and matrix - matrix multiplication implementation
// for legacy quants, k-quants and i-quants makes prompt processing 150-200%
// (legacy and k-quants) or 250-400% (i-quants) faster.
// compared to mainline llama.cpp (and llamafile).
// It provides implementations for ARM_NEON (all quants) and AVX2
// (all quants except sub-4 bit i-quants).
//
// Main idea is that unpacking the quants and the block scales to
// be ready for dot products with the corresponding Q8_Y quants
// takes time (here 'Y' stands for K, 0, or 1, depending on quantization type).
// Hence, if we are performing a QX x Q8_Y matrix matrix
// multiplication (as needed for prompt processing), we can get
// a significant speedup by reusing the unpacked QX quants and scales
// for multiplication with several Q8_K columns. We also achieve fewer
// loads from memory, which is the main purpose of tiling in general
// purpose matrix multiplication packages.

#include <utility>
#include <array>

#endif

constexpr ggml_type GGML_TYPE_Q8_0_X4 = static_cast<ggml_type>(98);
constexpr ggml_type GGML_TYPE_Q8_1_X4 = static_cast<ggml_type>(99);


namespace {

typedef struct {
    int32_t i1;
    int32_t i2;
} mmid_row_mapping;

struct DataInfo {
    float       * s;
    const char  * cy;
    size_t        bs;
    size_t        by;
    int           cur_y = 0;
    int           ne11;
    const mmid_row_mapping * row_mapping = nullptr;
    size_t        bs2 = 0;

    inline const char * src1_row(int iy) const {
        if (!row_mapping) return cy + (cur_y + iy)*by;
        int i11 = row_mapping[cur_y + iy].i1 % ne11;
        int i12 = row_mapping[cur_y + iy].i2;
        return cy + (i11 + i12*ne11)*by;
    }

    inline void store(int ix, int iy, float result) const {
        *(dst_row(iy) + ix) = result;
        //dst_row(iy)[ix] = result;
    }
    inline float * dst_row(int iy) const {
        if (!row_mapping) return s + (cur_y + iy)*bs;
        int i12 = row_mapping[cur_y + iy].i2;
        int i1  = row_mapping[cur_y + iy].i1;
        int i2  = i12;
        return s + i1*bs + i2*bs2;
    }
};

/*
moonll 
change param for set_mul_mat 
add func16
*/

typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x);

struct MulMat {
    std::array<mul_mat_t, 8> funcs = {};
    mul_mat_t func16 = nullptr;
    //inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
    IQK_NOINLINE void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
        constexpr int k_x_step = 64; // This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile size are small)

        if (func16 && nrc_y >= 16) {
            int n_step = (nrc_y - info.cur_y)/16;
            for (int ix = 0; ix < nrc_x; ix += k_x_step) {
                auto this_info = info;
                this_info.s += ix;
                int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
                for (int iy = 0; iy < n_step; ++iy) {
                    func16(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
                    this_info.cur_y += 16;
                }
            }
            info.cur_y += 16 * n_step;
            if (info.cur_y == nrc_y) return;
        }

        int n_step = (nrc_y - info.cur_y)/funcs.size();
        if (n_step > 0) {
            for (int ix = 0; ix < nrc_x; ix += k_x_step) {
                auto this_info = info;
                this_info.s += ix;
                int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
                for (int iy = 0; iy < n_step; ++iy) {
                    funcs.back()(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
                    this_info.cur_y += funcs.size();
                }
            }
            info.cur_y += funcs.size() * n_step;
        }
        int n_left = nrc_y - info.cur_y;
        if (n_left > 0) {
            funcs[n_left-1](n, vx, bx, info, nrc_x);
        }
    }
    static IQK_NOINLINE bool set_mul_mat(int typeA, int typeB,int ne00, MulMat& mm, int Ny);
private:
    template <typename Dequantizer> static IQK_NOINLINE void set_functions(MulMat& m);
};

inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) {
    const uint16_t * scales = (const uint16_t *)scales8;
    const uint32_t a0 = scales[0] | (scales[1] << 16);
    const uint32_t a1 = scales[2] | (scales[3] << 16);
    const uint32_t a2 = scales[4] | (scales[5] << 16);
    aux32[3] = ((a2 >> 4) & 0x0f0f0f0f) | ((a1 >> 2) & 0x30303030);
    aux32[1] = ((a2 >> 0) & 0x0f0f0f0f) | ((a0 >> 2) & 0x30303030);
    aux32[2] = a1 & 0x3f3f3f3f;
    aux32[0] = a0 & 0x3f3f3f3f;
}

/*
moonll
decoding tables
*/
#ifdef __AVX2__
static const uint64_t iq1s_grid_us[2048] = {
    0x0000000000000000, 0x0000000000000002, 0x0000000000000101, 0x0000000000000200,
    0x0000000000000202, 0x0000000000010001, 0x0000000000010101, 0x0000000000020000,
    0x0000000000020002, 0x0000000000020200, 0x0000000000020202, 0x0000000001000101,
    0x0000000001010001, 0x0000000001010100, 0x0000000001010102, 0x0000000001020101,
    0x0000000002000000, 0x0000000002000002, 0x0000000002000200, 0x0000000002000202,
    0x0000000002010101, 0x0000000002020000, 0x0000000002020002, 0x0000000002020200,
    0x0000000002020202, 0x0000000100000100, 0x0000000100000101, 0x0000000100010001,
    0x0000000100010100, 0x0000000100010102, 0x0000000100010201, 0x0000000100010202,
    0x0000000100020101, 0x0000000101000001, 0x0000000101000102, 0x0000000101000201,
    0x0000000101010002, 0x0000000101010101, 0x0000000101010202, 0x0000000101020001,
    0x0000000101020100, 0x0000000101020102, 0x0000000101020200, 0x0000000102000101,
    0x0000000102010001, 0x0000000102010100, 0x0000000102010102, 0x0000000102020101,
    0x0000000200000000, 0x0000000200000002, 0x0000000200000200, 0x0000000200000202,
    0x0000000200010101, 0x0000000200020000, 0x0000000200020002, 0x0000000200020200,
    0x0000000200020202, 0x0000000201000101, 0x0000000201010001, 0x0000000201010201,
    0x0000000201020100, 0x0000000201020201, 0x0000000202000000, 0x0000000202000002,
    0x0000000202000200, 0x0000000202000202, 0x0000000202010001, 0x0000000202010101,
    0x0000000202010201, 0x0000000202020000, 0x0000000202020002, 0x0000000202020200,
    0x0000000202020202, 0x0000010000010001, 0x0000010000010100, 0x0000010000010102,
    0x0000010000020101, 0x0000010001000001, 0x0000010001000201, 0x0000010001010101,
    0x0000010001010202, 0x0000010001020100, 0x0000010001020101, 0x0000010002010001,
    0x0000010002010201, 0x0000010002020101, 0x0000010100000001, 0x0000010100000100,
    0x0000010100000101, 0x0000010100000102, 0x0000010100010101, 0x0000010100010200,
    0x0000010100010202, 0x0000010100020201, 0x0000010101000000, 0x0000010101000101,
    0x0000010101000202, 0x0000010101010000, 0x0000010101010001, 0x0000010101010100,
    0x0000010101010101, 0x0000010101010102, 0x0000010101010201, 0x0000010101020000,
    0x0000010101020002, 0x0000010101020101, 0x0000010101020200, 0x0000010101020202,
    0x0000010102000001, 0x0000010102010001, 0x0000010102010101, 0x0000010102010200,
    0x0000010102010202, 0x0000010102020001, 0x0000010102020100, 0x0000010102020101,
    0x0000010102020102, 0x0000010102020201, 0x0000010200010100, 0x0000010200010201,
    0x0000010201000001, 0x0000010201000100, 0x0000010201010000, 0x0000010201010002,
    0x0000010201010101, 0x0000010201010200, 0x0000010201020000, 0x0000010201020001,
    0x0000010201020102, 0x0000010201020201, 0x0000010202000101, 0x0000010202010001,
    0x0000010202010100, 0x0000010202010201, 0x0000020000000000, 0x0000020000000002,
    0x0000020000000200, 0x0000020000000202, 0x0000020000010101, 0x0000020000020000,
    0x0000020000020002, 0x0000020000020200, 0x0000020000020202, 0x0000020001000101,
    0x0000020001010001, 0x0000020001010102, 0x0000020001020101, 0x0000020002000000,
    0x0000020002000002, 0x0000020002000200, 0x0000020002000202, 0x0000020002010101,
    0x0000020002020000, 0x0000020002020002, 0x0000020002020200, 0x0000020002020202,
    0x0000020100000101, 0x0000020100010001, 0x0000020100010100, 0x0000020100010201,
    0x0000020100020100, 0x0000020100020101, 0x0000020101000001, 0x0000020101010000,
    0x0000020101010001, 0x0000020101010101, 0x0000020101020001, 0x0000020101020100,
    0x0000020101020201, 0x0000020102010001, 0x0000020102010100, 0x0000020102010102,
    0x0000020102010201, 0x0000020102020101, 0x0000020200000000, 0x0000020200000002,
    0x0000020200000200, 0x0000020200000202, 0x0000020200010101, 0x0000020200020000,
    0x0000020200020002, 0x0000020200020200, 0x0000020200020202, 0x0000020201000101,
    0x0000020201010001, 0x0000020201010201, 0x0000020201020001, 0x0000020201020101,
    0x0000020202000000, 0x0000020202000002, 0x0000020202000101, 0x0000020202000200,
    0x0000020202000202, 0x0000020202010101, 0x0000020202020000, 0x0000020202020002,
    0x0000020202020200, 0x0000020202020202, 0x0001000000010000, 0x0001000000010001,
    0x0001000000010100, 0x0001000000010201, 0x0001000000020100, 0x0001000000020101,
    0x0001000001000001, 0x0001000001000100, 0x0001000001010000, 0x0001000001010101,
    0x0001000001010200, 0x0001000001020001, 0x0001000001020100, 0x0001000001020101,
    0x0001000001020201, 0x0001000002010001, 0x0001000002010100, 0x0001000002010102,
    0x0001000002020001, 0x0001000002020101, 0x0001000100000001, 0x0001000100000100,
    0x0001000100000102, 0x0001000100000201, 0x0001000100010000, 0x0001000100010002,
    0x0001000100010101, 0x0001000100010200, 0x0001000100020001, 0x0001000100020100,
    0x0001000100020201, 0x0001000101000101, 0x0001000101000202, 0x0001000101010000,
    0x0001000101010001, 0x0001000101010002, 0x0001000101010100, 0x0001000101010101,
    0x0001000101010102, 0x0001000101010201, 0x0001000101020000, 0x0001000101020101,
    0x0001000102000100, 0x0001000102010002, 0x0001000102010101, 0x0001000102020001,
    0x0001000102020100, 0x0001000200010001, 0x0001000200010100, 0x0001000200010102,
    0x0001000200020101, 0x0001000201000000, 0x0001000201000102, 0x0001000201000201,
    0x0001000201010002, 0x0001000201010101, 0x0001000201010200, 0x0001000201010202,
    0x0001000201020100, 0x0001000201020102, 0x0001000202000101, 0x0001000202010001,
    0x0001000202010100, 0x0001000202010102, 0x0001000202020101, 0x0001010000000001,
    0x0001010000000102, 0x0001010000000201, 0x0001010000010100, 0x0001010000010101,
    0x0001010000010200, 0x0001010000010201, 0x0001010000020001, 0x0001010000020102,
    0x0001010001000001, 0x0001010001000101, 0x0001010001000102, 0x0001010001000200,
    0x0001010001000202, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101,
    0x0001010001010102, 0x0001010001010201, 0x0001010001020002, 0x0001010001020101,
    0x0001010001020200, 0x0001010002000100, 0x0001010002000201, 0x0001010002010000,
    0x0001010002010100, 0x0001010002010101, 0x0001010002010200, 0x0001010002010201,
    0x0001010002010202, 0x0001010002020001, 0x0001010002020100, 0x0001010002020101,
    0x0001010002020201, 0x0001010100000002, 0x0001010100000101, 0x0001010100000202,
    0x0001010100010001, 0x0001010100010100, 0x0001010100010101, 0x0001010100010102,
    0x0001010100010201, 0x0001010100020000, 0x0001010100020002, 0x0001010100020101,
    0x0001010100020200, 0x0001010100020202, 0x0001010101000001, 0x0001010101000100,
    0x0001010101000101, 0x0001010101000102, 0x0001010101010001, 0x0001010101010002,
    0x0001010101010100, 0x0001010101010101, 0x0001010101010102, 0x0001010101010201,
    0x0001010101010202, 0x0001010101020001, 0x0001010101020100, 0x0001010101020101,
    0x0001010101020102, 0x0001010101020201, 0x0001010102000000, 0x0001010102000002,
    0x0001010102000100, 0x0001010102000101, 0x0001010102000200, 0x0001010102000202,
    0x0001010102010000, 0x0001010102010001, 0x0001010102010100, 0x0001010102010101,
    0x0001010102010102, 0x0001010102010201, 0x0001010102010202, 0x0001010102020000,
    0x0001010102020002, 0x0001010102020101, 0x0001010200000001, 0x0001010200000100,
    0x0001010200000101, 0x0001010200000102, 0x0001010200010101, 0x0001010200010102,
    0x0001010200010200, 0x0001010200010202, 0x0001010200020001, 0x0001010200020102,
    0x0001010201000000, 0x0001010201000002, 0x0001010201000100, 0x0001010201000101,
    0x0001010201000200, 0x0001010201000202, 0x0001010201010001, 0x0001010201010101,
    0x0001010201010102, 0x0001010201010200, 0x0001010201010201, 0x0001010201020001,
    0x0001010201020100, 0x0001010201020101, 0x0001010201020200, 0x0001010201020201,
    0x0001010201020202, 0x0001010202000102, 0x0001010202000202, 0x0001010202010002,
    0x0001010202010101, 0x0001010202020100, 0x0001010202020201, 0x0001020000010001,
    0x0001020000010102, 0x0001020000020101, 0x0001020001000001, 0x0001020001000100,
    0x0001020001000102, 0x0001020001000201, 0x0001020001010000, 0x0001020001010101,
    0x0001020001010200, 0x0001020001010202, 0x0001020001020000, 0x0001020001020001,
    0x0001020001020100, 0x0001020001020102, 0x0001020001020201, 0x0001020002000101,
    0x0001020002010001, 0x0001020002010100, 0x0001020002020101, 0x0001020100010000,
    0x0001020100010002, 0x0001020100010101, 0x0001020100010202, 0x0001020100020001,
    0x0001020100020101, 0x0001020101000002, 0x0001020101000100, 0x0001020101000101,
    0x0001020101000200, 0x0001020101010001, 0x0001020101010100, 0x0001020101010101,
    0x0001020101010102, 0x0001020101010201, 0x0001020101010202, 0x0001020101020000,
    0x0001020101020101, 0x0001020101020202, 0x0001020102000201, 0x0001020102010001,
    0x0001020102010002, 0x0001020102010101, 0x0001020102010200, 0x0001020102020001,
    0x0001020102020102, 0x0001020102020201, 0x0001020200000201, 0x0001020200010102,
    0x0001020200020100, 0x0001020200020102, 0x0001020201000100, 0x0001020201000102,
    0x0001020201000201, 0x0001020201010000, 0x0001020201010002, 0x0001020201010101,
    0x0001020201010200, 0x0001020201020001, 0x0001020201020102, 0x0001020201020201,
    0x0001020202000101, 0x0001020202010001, 0x0001020202010102, 0x0001020202010202,
    0x0002000000000000, 0x0002000000000002, 0x0002000000000200, 0x0002000000000202,
    0x0002000000010101, 0x0002000000020000, 0x0002000000020002, 0x0002000000020101,
    0x0002000000020200, 0x0002000000020202, 0x0002000001000101, 0x0002000001010001,
    0x0002000001010201, 0x0002000001020001, 0x0002000001020101, 0x0002000002000000,
    0x0002000002000002, 0x0002000002000200, 0x0002000002000202, 0x0002000002010101,
    0x0002000002020000, 0x0002000002020002, 0x0002000002020101, 0x0002000002020200,
    0x0002000002020202, 0x0002000100000101, 0x0002000100010001, 0x0002000100010100,
    0x0002000100010201, 0x0002000100020101, 0x0002000101000002, 0x0002000101000100,
    0x0002000101000201, 0x0002000101010101, 0x0002000101010200, 0x0002000101010202,
    0x0002000101020001, 0x0002000101020100, 0x0002000101020101, 0x0002000101020102,
    0x0002000102000101, 0x0002000102010000, 0x0002000102010102, 0x0002000102010201,
    0x0002000102020101, 0x0002000200000001, 0x0002000200000200, 0x0002000200000202,
    0x0002000200010001, 0x0002000200010101, 0x0002000200020000, 0x0002000200020002,
    0x0002000200020200, 0x0002000200020202, 0x0002000201000101, 0x0002000201010001,
    0x0002000201010102, 0x0002000201010201, 0x0002000201020101, 0x0002000202000001,
    0x0002000202000200, 0x0002000202000202, 0x0002000202010001, 0x0002000202010101,
    0x0002000202020000, 0x0002000202020002, 0x0002000202020200, 0x0002000202020202,
    0x0002010000000101, 0x0002010000010100, 0x0002010000010102, 0x0002010000010201,
    0x0002010000020101, 0x0002010001000100, 0x0002010001000101, 0x0002010001000102,
    0x0002010001000201, 0x0002010001010002, 0x0002010001010101, 0x0002010001010200,
    0x0002010001010202, 0x0002010001020102, 0x0002010002000101, 0x0002010002010001,
    0x0002010002010100, 0x0002010002010201, 0x0002010002020001, 0x0002010002020101,
    0x0002010100000201, 0x0002010100010101, 0x0002010100020001, 0x0002010100020201,
    0x0002010101000000, 0x0002010101000101, 0x0002010101000200, 0x0002010101010001,
    0x0002010101010100, 0x0002010101010101, 0x0002010101010201, 0x0002010101020002,
    0x0002010101020101, 0x0002010101020200, 0x0002010102000201, 0x0002010102010000,
    0x0002010102010100, 0x0002010102010101, 0x0002010102010200, 0x0002010102010202,
    0x0002010102020001, 0x0002010102020100, 0x0002010102020102, 0x0002010102020201,
    0x0002010200000101, 0x0002010200010000, 0x0002010200010002, 0x0002010200010201,
    0x0002010200020101, 0x0002010201000001, 0x0002010201000201, 0x0002010201010101,
    0x0002010201020000, 0x0002010201020001, 0x0002010201020201, 0x0002010202000100,
    0x0002010202000102, 0x0002010202010000, 0x0002010202010202, 0x0002020000000000,
    0x0002020000000002, 0x0002020000000200, 0x0002020000000202, 0x0002020000010101,
    0x0002020000020000, 0x0002020000020002, 0x0002020000020200, 0x0002020000020202,
    0x0002020001000101, 0x0002020001010001, 0x0002020001010100, 0x0002020001020101,
    0x0002020002000000, 0x0002020002000002, 0x0002020002000200, 0x0002020002000202,
    0x0002020002020000, 0x0002020002020002, 0x0002020002020200, 0x0002020002020202,
    0x0002020100000201, 0x0002020100010001, 0x0002020100010100, 0x0002020100010201,
    0x0002020100020101, 0x0002020101000102, 0x0002020101000201, 0x0002020101010002,
    0x0002020101010101, 0x0002020101020001, 0x0002020101020100, 0x0002020101020102,
    0x0002020101020201, 0x0002020102000101, 0x0002020102010000, 0x0002020102010102,
    0x0002020102010201, 0x0002020102020100, 0x0002020102020101, 0x0002020200000000,
    0x0002020200000002, 0x0002020200000200, 0x0002020200000202, 0x0002020200020000,
    0x0002020200020002, 0x0002020200020200, 0x0002020200020202, 0x0002020201000101,
    0x0002020201010001, 0x0002020201010102, 0x0002020201010201, 0x0002020201020101,
    0x0002020202000000, 0x0002020202000002, 0x0002020202000200, 0x0002020202000202,
    0x0002020202010101, 0x0002020202020000, 0x0002020202020002, 0x0002020202020200,
    0x0002020202020202, 0x0100000000000101, 0x0100000000010001, 0x0100000000010102,
    0x0100000000020101, 0x0100000001000201, 0x0100000001010002, 0x0100000001010101,
    0x0100000001010200, 0x0100000001010202, 0x0100000001020001, 0x0100000001020100,
    0x0100000001020102, 0x0100000002010100, 0x0100000002010201, 0x0100000002020001,
    0x0100000002020102, 0x0100000100000000, 0x0100000100000001, 0x0100000100000100,
    0x0100000100000102, 0x0100000100000201, 0x0100000100010002, 0x0100000100010101,
    0x0100000100010102, 0x0100000100010200, 0x0100000100010202, 0x0100000100020001,
    0x0100000100020102, 0x0100000100020201, 0x0100000101000101, 0x0100000101000200,
    0x0100000101000202, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101,
    0x0100000101010102, 0x0100000101010201, 0x0100000101010202, 0x0100000101020101,
    0x0100000101020200, 0x0100000101020202, 0x0100000102000001, 0x0100000102000100,
    0x0100000102000102, 0x0100000102010000, 0x0100000102010002, 0x0100000102010101,
    0x0100000102020000, 0x0100000102020001, 0x0100000102020002, 0x0100000200000101,
    0x0100000200010001, 0x0100000200010100, 0x0100000200010102, 0x0100000200020101,
    0x0100000201000001, 0x0100000201010002, 0x0100000201010101, 0x0100000201010202,
    0x0100000201020100, 0x0100000201020201, 0x0100000202000201, 0x0100000202010100,
    0x0100000202020101, 0x0100010000000001, 0x0100010000010101, 0x0100010000010201,
    0x0100010000020201, 0x0100010001000101, 0x0100010001000200, 0x0100010001000202,
    0x0100010001010001, 0x0100010001010100, 0x0100010001010101, 0x0100010001010102,
    0x0100010001020001, 0x0100010001020002, 0x0100010001020101, 0x0100010001020200,
    0x0100010001020202, 0x0100010002000001, 0x0100010002000102, 0x0100010002000201,
    0x0100010002010000, 0x0100010002010002, 0x0100010002010101, 0x0100010002020000,
    0x0100010002020001, 0x0100010002020201, 0x0100010100000001, 0x0100010100000002,
    0x0100010100000101, 0x0100010100000202, 0x0100010100010001, 0x0100010100010100,
    0x0100010100010101, 0x0100010100010102, 0x0100010100010201, 0x0100010100020000,
    0x0100010100020101, 0x0100010100020202, 0x0100010101000001, 0x0100010101000100,
    0x0100010101000101, 0x0100010101000102, 0x0100010101000201, 0x0100010101010000,
    0x0100010101010001, 0x0100010101010100, 0x0100010101010101, 0x0100010101010102,
    0x0100010101010200, 0x0100010101010201, 0x0100010101020001, 0x0100010101020100,
    0x0100010101020101, 0x0100010101020102, 0x0100010101020201, 0x0100010102000002,
    0x0100010102000100, 0x0100010102000101, 0x0100010102000200, 0x0100010102010001,
    0x0100010102010100, 0x0100010102010101, 0x0100010102010102, 0x0100010102010201,
    0x0100010102010202, 0x0100010102020101, 0x0100010102020200, 0x0100010102020202,
    0x0100010200000001, 0x0100010200000101, 0x0100010200000201, 0x0100010200010100,
    0x0100010200010101, 0x0100010200010200, 0x0100010200010202, 0x0100010200020001,
    0x0100010200020100, 0x0100010200020201, 0x0100010201000000, 0x0100010201000002,
    0x0100010201000101, 0x0100010201000200, 0x0100010201010000, 0x0100010201010001,
    0x0100010201010002, 0x0100010201010101, 0x0100010201010102, 0x0100010201010201,
    0x0100010201020002, 0x0100010201020101, 0x0100010201020200, 0x0100010202000001,
    0x0100010202000101, 0x0100010202000202, 0x0100010202010100, 0x0100010202010101,
    0x0100010202020001, 0x0100010202020100, 0x0100010202020102, 0x0100020000000101,
    0x0100020000010001, 0x0100020000010101, 0x0100020000010202, 0x0100020000020101,
    0x0100020001000002, 0x0100020001000201, 0x0100020001010000, 0x0100020001010101,
    0x0100020001010200, 0x0100020001020001, 0x0100020001020100, 0x0100020001020102,
    0x0100020001020201, 0x0100020002000101, 0x0100020002010001, 0x0100020002010100,
    0x0100020002010102, 0x0100020002010201, 0x0100020002020101, 0x0100020100000001,
    0x0100020100000101, 0x0100020100000102, 0x0100020100000202, 0x0100020100010000,
    0x0100020100010100, 0x0100020100010101, 0x0100020100010200, 0x0100020100020001,
    0x0100020100020100, 0x0100020100020102, 0x0100020101000000, 0x0100020101000101,
    0x0100020101000202, 0x0100020101010001, 0x0100020101010002, 0x0100020101010100,
    0x0100020101010101, 0x0100020101010102, 0x0100020101010201, 0x0100020101020000,
    0x0100020101020002, 0x0100020101020101, 0x0100020101020102, 0x0100020101020202,
    0x0100020102000102, 0x0100020102000201, 0x0100020102010002, 0x0100020102010101,
    0x0100020102010102, 0x0100020102010200, 0x0100020102020001, 0x0100020102020100,
    0x0100020102020102, 0x0100020102020201, 0x0100020200010102, 0x0100020201000100,
    0x0100020201000102, 0x0100020201000201, 0x0100020201010101, 0x0100020201010200,
    0x0100020201010202, 0x0100020201020100, 0x0100020201020201, 0x0100020202010100,
    0x0100020202020101, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101,
    0x0101000000000102, 0x0101000000000201, 0x0101000000010002, 0x0101000000010101,
    0x0101000000010202, 0x0101000000020001, 0x0101000000020100, 0x0101000000020201,
    0x0101000001000000, 0x0101000001000101, 0x0101000001000200, 0x0101000001010001,
    0x0101000001010100, 0x0101000001010101, 0x0101000001010102, 0x0101000001010201,
    0x0101000001020101, 0x0101000001020200, 0x0101000002000102, 0x0101000002000201,
    0x0101000002010101, 0x0101000002010200, 0x0101000002020000, 0x0101000002020001,
    0x0101000002020102, 0x0101000002020201, 0x0101000100000101, 0x0101000100000200,
    0x0101000100000201, 0x0101000100000202, 0x0101000100010001, 0x0101000100010100,
    0x0101000100010101, 0x0101000100010102, 0x0101000100010200, 0x0101000100010201,
    0x0101000100020000, 0x0101000100020101, 0x0101000100020102, 0x0101000100020200,
    0x0101000100020202, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101,
    0x0101000101000102, 0x0101000101000201, 0x0101000101010000, 0x0101000101010001,
    0x0101000101010002, 0x0101000101010100, 0x0101000101010101, 0x0101000101010102,
    0x0101000101010200, 0x0101000101010201, 0x0101000101010202, 0x0101000101020001,
    0x0101000101020100, 0x0101000101020101, 0x0101000101020102, 0x0101000101020201,
    0x0101000102000002, 0x0101000102000101, 0x0101000102010001, 0x0101000102010100,
    0x0101000102010101, 0x0101000102010102, 0x0101000102010201, 0x0101000102020000,
    0x0101000102020101, 0x0101000102020202, 0x0101000200000001, 0x0101000200000102,
    0x0101000200010002, 0x0101000200010101, 0x0101000200010202, 0x0101000200020001,
    0x0101000200020100, 0x0101000201000002, 0x0101000201000101, 0x0101000201000202,
    0x0101000201010001, 0x0101000201010100, 0x0101000201010101, 0x0101000201010102,
    0x0101000201010201, 0x0101000201020002, 0x0101000201020101, 0x0101000202000101,
    0x0101000202010000, 0x0101000202010002, 0x0101000202010101, 0x0101000202010201,
    0x0101000202010202, 0x0101000202020100, 0x0101010000000100, 0x0101010000000101,
    0x0101010000010001, 0x0101010000010100, 0x0101010000010101, 0x0101010000010102,
    0x0101010000010200, 0x0101010000010201, 0x0101010000020001, 0x0101010000020101,
    0x0101010000020200, 0x0101010000020202, 0x0101010001000001, 0x0101010001000100,
    0x0101010001000101, 0x0101010001000102, 0x0101010001000201, 0x0101010001000202,
    0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 0x0101010001010101,
    0x0101010001010102, 0x0101010001010200, 0x0101010001010201, 0x0101010001010202,
    0x0101010001020001, 0x0101010001020002, 0x0101010001020100, 0x0101010001020101,
    0x0101010001020102, 0x0101010001020201, 0x0101010002000000, 0x0101010002000200,
    0x0101010002000202, 0x0101010002010001, 0x0101010002010100, 0x0101010002010101,
    0x0101010002010102, 0x0101010002010201, 0x0101010002020001, 0x0101010002020100,
    0x0101010002020101, 0x0101010002020202, 0x0101010100000001, 0x0101010100000002,
    0x0101010100000100, 0x0101010100000101, 0x0101010100000102, 0x0101010100000201,
    0x0101010100010000, 0x0101010100010001, 0x0101010100010002, 0x0101010100010100,
    0x0101010100010101, 0x0101010100010102, 0x0101010100010201, 0x0101010100010202,
    0x0101010100020001, 0x0101010100020100, 0x0101010100020101, 0x0101010100020102,
    0x0101010100020201, 0x0101010101000000, 0x0101010101000001, 0x0101010101000002,
    0x0101010101000100, 0x0101010101000101, 0x0101010101000102, 0x0101010101000200,
    0x0101010101000201, 0x0101010101010000, 0x0101010101010001, 0x0101010101010002,
    0x0101010101010100, 0x0101010101010101, 0x0101010101010102, 0x0101010101010200,
    0x0101010101010201, 0x0101010101010202, 0x0101010101020000, 0x0101010101020001,
    0x0101010101020100, 0x0101010101020101, 0x0101010101020102, 0x0101010101020200,
    0x0101010101020201, 0x0101010101020202, 0x0101010102000001, 0x0101010102000100,
    0x0101010102000101, 0x0101010102000201, 0x0101010102000202, 0x0101010102010000,
    0x0101010102010001, 0x0101010102010100, 0x0101010102010101, 0x0101010102010102,
    0x0101010102010200, 0x0101010102010201, 0x0101010102020001, 0x0101010102020100,
    0x0101010102020101, 0x0101010102020102, 0x0101010102020201, 0x0101010200000000,
    0x0101010200000001, 0x0101010200000002, 0x0101010200000100, 0x0101010200000102,
    0x0101010200000200, 0x0101010200000201, 0x0101010200010001, 0x0101010200010100,
    0x0101010200010101, 0x0101010200010200, 0x0101010200010201, 0x0101010200020000,
    0x0101010200020001, 0x0101010200020002, 0x0101010200020100, 0x0101010200020101,
    0x0101010200020102, 0x0101010200020200, 0x0101010200020201, 0x0101010201000001,
    0x0101010201000101, 0x0101010201000102, 0x0101010201000200, 0x0101010201000201,
    0x0101010201000202, 0x0101010201010000, 0x0101010201010001, 0x0101010201010002,
    0x0101010201010100, 0x0101010201010101, 0x0101010201010102, 0x0101010201010200,
    0x0101010201010201, 0x0101010201010202, 0x0101010201020001, 0x0101010201020100,
    0x0101010201020101, 0x0101010201020201, 0x0101010202000002, 0x0101010202000101,
    0x0101010202000102, 0x0101010202000200, 0x0101010202000201, 0x0101010202000202,
    0x0101010202010001, 0x0101010202010101, 0x0101010202010202, 0x0101010202020002,
    0x0101010202020101, 0x0101010202020102, 0x0101010202020200, 0x0101010202020201,
    0x0101020000000100, 0x0101020000000101, 0x0101020000000102, 0x0101020000000201,
    0x0101020000010000, 0x0101020000010101, 0x0101020000010200, 0x0101020000020001,
    0x0101020000020202, 0x0101020001000101, 0x0101020001000200, 0x0101020001000202,
    0x0101020001010001, 0x0101020001010100, 0x0101020001010101, 0x0101020001010102,
    0x0101020001010200, 0x0101020001010201, 0x0101020001020000, 0x0101020001020002,
    0x0101020001020100, 0x0101020001020101, 0x0101020002000002, 0x0101020002000201,
    0x0101020002010000, 0x0101020002010002, 0x0101020002010101, 0x0101020002010200,
    0x0101020002020001, 0x0101020002020201, 0x0101020100000001, 0x0101020100000002,
    0x0101020100000101, 0x0101020100000202, 0x0101020100010001, 0x0101020100010100,
    0x0101020100010101, 0x0101020100010102, 0x0101020100010201, 0x0101020100020101,
    0x0101020101000001, 0x0101020101000100, 0x0101020101000101, 0x0101020101000102,
    0x0101020101000201, 0x0101020101010000, 0x0101020101010001, 0x0101020101010002,
    0x0101020101010100, 0x0101020101010101, 0x0101020101010102, 0x0101020101010200,
    0x0101020101010201, 0x0101020101010202, 0x0101020101020001, 0x0101020101020100,
    0x0101020101020101, 0x0101020101020102, 0x0101020101020201, 0x0101020102000001,
    0x0101020102000101, 0x0101020102000201, 0x0101020102010001, 0x0101020102010100,
    0x0101020102010101, 0x0101020102010102, 0x0101020102010200, 0x0101020102010201,
    0x0101020102020101, 0x0101020200000100, 0x0101020200000200, 0x0101020200010101,
    0x0101020200010202, 0x0101020200020000, 0x0101020200020101, 0x0101020200020102,
    0x0101020200020201, 0x0101020201000101, 0x0101020201000200, 0x0101020201000201,
    0x0101020201010001, 0x0101020201010101, 0x0101020201010102, 0x0101020201010200,
    0x0101020201010201, 0x0101020201020002, 0x0101020201020101, 0x0101020201020200,
    0x0101020201020202, 0x0101020202000001, 0x0101020202000202, 0x0101020202010002,
    0x0101020202010101, 0x0101020202010102, 0x0101020202010200, 0x0101020202010202,
    0x0101020202020001, 0x0102000000000101, 0x0102000000010100, 0x0102000000010102,
    0x0102000000010201, 0x0102000000020101, 0x0102000001000100, 0x0102000001010000,
    0x0102000001010101, 0x0102000001010102, 0x0102000001010200, 0x0102000001010202,
    0x0102000001020001, 0x0102000001020100, 0x0102000001020102, 0x0102000001020201,
    0x0102000002000001, 0x0102000002010102, 0x0102000002020101, 0x0102000100000001,
    0x0102000100000100, 0x0102000100000102, 0x0102000100000201, 0x0102000100010002,
    0x0102000100010101, 0x0102000100020001, 0x0102000100020002, 0x0102000100020102,
    0x0102000100020201, 0x0102000101000101, 0x0102000101000201, 0x0102000101010001,
    0x0102000101010101, 0x0102000101010102, 0x0102000101010201, 0x0102000101020101,
    0x0102000101020102, 0x0102000101020202, 0x0102000102000100, 0x0102000102000202,
    0x0102000102010002, 0x0102000102010101, 0x0102000102020001, 0x0102000102020102,
    0x0102000102020201, 0x0102000200010001, 0x0102000200010102, 0x0102000200010201,
    0x0102000201000000, 0x0102000201000001, 0x0102000201000102, 0x0102000201010101,
    0x0102000201010102, 0x0102000201010200, 0x0102000201020000, 0x0102000202000101,
    0x0102000202010001, 0x0102000202010102, 0x0102000202020101, 0x0102010000010001,
    0x0102010000010002, 0x0102010000010101, 0x0102010000010102, 0x0102010000010202,
    0x0102010000020001, 0x0102010000020102, 0x0102010000020201, 0x0102010001000000,
    0x0102010001000002, 0x0102010001000101, 0x0102010001000200, 0x0102010001000202,
    0x0102010001010001, 0x0102010001010100, 0x0102010001010101, 0x0102010001010102,
    0x0102010001010201, 0x0102010001010202, 0x0102010001020000, 0x0102010001020002,
    0x0102010001020101, 0x0102010002000100, 0x0102010002000101, 0x0102010002000201,
    0x0102010002010000, 0x0102010002010002, 0x0102010002010100, 0x0102010002010101,
    0x0102010002010102, 0x0102010002010200, 0x0102010002010202, 0x0102010002020001,
    0x0102010002020100, 0x0102010002020201, 0x0102010100000101, 0x0102010100000200,
    0x0102010100000202, 0x0102010100010001, 0x0102010100010101, 0x0102010100010102,
    0x0102010100010201, 0x0102010101000100, 0x0102010101000101, 0x0102010101000102,
    0x0102010101000201, 0x0102010101010000, 0x0102010101010001, 0x0102010101010100,
    0x0102010101010101, 0x0102010101010102, 0x0102010101010201, 0x0102010101020001,
    0x0102010101020100, 0x0102010101020101, 0x0102010101020102, 0x0102010101020201,
    0x0102010102000102, 0x0102010102000201, 0x0102010102000202, 0x0102010102010001,
    0x0102010102010101, 0x0102010102010102, 0x0102010102010201, 0x0102010102010202,
    0x0102010102020002, 0x0102010102020101, 0x0102010102020102, 0x0102010102020200,
    0x0102010200000002, 0x0102010200000201, 0x0102010200010101, 0x0102010200020000,
    0x0102010200020102, 0x0102010200020200, 0x0102010200020201, 0x0102010201000000,
    0x0102010201000101, 0x0102010201000200, 0x0102010201000202, 0x0102010201010001,
    0x0102010201010100, 0x0102010201010101, 0x0102010201010102, 0x0102010201010200,
    0x0102010201010202, 0x0102010201020000, 0x0102010201020101, 0x0102010201020200,
    0x0102010202000000, 0x0102010202000002, 0x0102010202000101, 0x0102010202000202,
    0x0102010202010100, 0x0102010202010102, 0x0102010202010200, 0x0102010202010201,
    0x0102010202020000, 0x0102010202020100, 0x0102010202020102, 0x0102010202020202,
    0x0102020000010102, 0x0102020000010201, 0x0102020000020101, 0x0102020001000001,
    0x0102020001010002, 0x0102020001010101, 0x0102020001010202, 0x0102020001020001,
    0x0102020001020201, 0x0102020002000101, 0x0102020002010001, 0x0102020002010200,
    0x0102020002020102, 0x0102020100000001, 0x0102020100000100, 0x0102020100010000,
    0x0102020100010101, 0x0102020100020001, 0x0102020100020100, 0x0102020100020102,
    0x0102020100020201, 0x0102020101000000, 0x0102020101000001, 0x0102020101000101,
    0x0102020101000102, 0x0102020101000200, 0x0102020101010001, 0x0102020101010100,
    0x0102020101010101, 0x0102020101010102, 0x0102020101010201, 0x0102020101020000,
    0x0102020101020101, 0x0102020101020202, 0x0102020102000002, 0x0102020102000100,
    0x0102020102000202, 0x0102020102010101, 0x0102020102020001, 0x0102020102020100,
    0x0102020102020101, 0x0102020102020201, 0x0102020200010001, 0x0102020200010102,
    0x0102020200010200, 0x0102020201000001, 0x0102020201000100, 0x0102020201000201,
    0x0102020201010000, 0x0102020201010101, 0x0102020201010200, 0x0102020201010202,
    0x0102020201020100, 0x0102020201020101, 0x0102020201020201, 0x0102020202000102,
    0x0102020202010100, 0x0102020202010200, 0x0102020202010202, 0x0102020202020102,
    0x0200000000000000, 0x0200000000000002, 0x0200000000000200, 0x0200000000000202,
    0x0200000000020000, 0x0200000000020002, 0x0200000000020200, 0x0200000000020202,
    0x0200000001000101, 0x0200000001010000, 0x0200000001010001, 0x0200000001010100,
    0x0200000001010102, 0x0200000001010201, 0x0200000001020101, 0x0200000002000000,
    0x0200000002000002, 0x0200000002000200, 0x0200000002000202, 0x0200000002010101,
    0x0200000002020000, 0x0200000002020002, 0x0200000002020200, 0x0200000002020202,
    0x0200000100000101, 0x0200000100010001, 0x0200000100010100, 0x0200000100010102,
    0x0200000100010201, 0x0200000100020101, 0x0200000101000001, 0x0200000101000100,
    0x0200000101000201, 0x0200000101010000, 0x0200000101010002, 0x0200000101010101,
    0x0200000101010102, 0x0200000101010200, 0x0200000101010201, 0x0200000101020100,
    0x0200000101020102, 0x0200000101020201, 0x0200000102000101, 0x0200000102000201,
    0x0200000102010100, 0x0200000102010102, 0x0200000102010201, 0x0200000102020101,
    0x0200000200000000, 0x0200000200000002, 0x0200000200000200, 0x0200000200000202,
    0x0200000200010101, 0x0200000200020000, 0x0200000200020002, 0x0200000200020200,
    0x0200000200020202, 0x0200000201010001, 0x0200000201010100, 0x0200000201010201,
    0x0200000201020101, 0x0200000202000000, 0x0200000202000002, 0x0200000202000200,
    0x0200000202000202, 0x0200000202010101, 0x0200000202020000, 0x0200000202020002,
    0x0200000202020200, 0x0200000202020202, 0x0200010000010100, 0x0200010000010201,
    0x0200010001000001, 0x0200010001000100, 0x0200010001010001, 0x0200010001010101,
    0x0200010001010202, 0x0200010001020001, 0x0200010001020100, 0x0200010001020201,
    0x0200010002010100, 0x0200010002010201, 0x0200010100000001, 0x0200010100000201,
    0x0200010100010002, 0x0200010100010101, 0x0200010100010202, 0x0200010100020102,
    0x0200010100020201, 0x0200010101000000, 0x0200010101000001, 0x0200010101000101,
    0x0200010101000200, 0x0200010101010001, 0x0200010101010100, 0x0200010101010101,
    0x0200010101010102, 0x0200010101010201, 0x0200010101010202, 0x0200010101020101,
    0x0200010101020102, 0x0200010101020200, 0x0200010101020202, 0x0200010102000001,
    0x0200010102000100, 0x0200010102000102, 0x0200010102000201, 0x0200010102010000,
    0x0200010102010002, 0x0200010102010101, 0x0200010102010200, 0x0200010102020102,
    0x0200010200010001, 0x0200010200010102, 0x0200010200010201, 0x0200010200020101,
    0x0200010201000001, 0x0200010201000100, 0x0200010201000201, 0x0200010201000202,
    0x0200010201010000, 0x0200010201010101, 0x0200010201010201, 0x0200010201010202,
    0x0200010201020001, 0x0200010201020102, 0x0200010201020202, 0x0200010202000101,
    0x0200010202010001, 0x0200010202010202, 0x0200010202020100, 0x0200020000000000,
    0x0200020000000002, 0x0200020000000200, 0x0200020000000202, 0x0200020000010101,
    0x0200020000020000, 0x0200020000020002, 0x0200020000020200, 0x0200020000020202,
    0x0200020001000001, 0x0200020001000101, 0x0200020001010001, 0x0200020001010100,
    0x0200020001010201, 0x0200020001020101, 0x0200020001020201, 0x0200020002000000,
    0x0200020002000002, 0x0200020002000200, 0x0200020002000202, 0x0200020002010101,
    0x0200020002020000, 0x0200020002020002, 0x0200020002020200, 0x0200020002020202,
    0x0200020100000101, 0x0200020100000102, 0x0200020100010001, 0x0200020100010100,
    0x0200020100010102, 0x0200020100020101, 0x0200020101000001, 0x0200020101000100,
    0x0200020101000102, 0x0200020101000201, 0x0200020101010000, 0x0200020101010002,
    0x0200020101010101, 0x0200020101010202, 0x0200020101020001, 0x0200020101020100,
    0x0200020102000101, 0x0200020102010102, 0x0200020102010201, 0x0200020102020101,
    0x0200020200000000, 0x0200020200000002, 0x0200020200000200, 0x0200020200000202,
    0x0200020200010101, 0x0200020200020000, 0x0200020200020002, 0x0200020200020200,
    0x0200020200020202, 0x0200020201000101, 0x0200020201010001, 0x0200020201010100,
    0x0200020201010102, 0x0200020202000000, 0x0200020202000002, 0x0200020202000200,
    0x0200020202000202, 0x0200020202010101, 0x0200020202020000, 0x0200020202020002,
    0x0200020202020200, 0x0200020202020202, 0x0201000000000101, 0x0201000000010001,
    0x0201000000010102, 0x0201000000010200, 0x0201000000010201, 0x0201000000020101,
    0x0201000001000001, 0x0201000001000102, 0x0201000001000201, 0x0201000001010101,
    0x0201000001010200, 0x0201000001010202, 0x0201000001020201, 0x0201000001020202,
    0x0201000002000101, 0x0201000002010001, 0x0201000002010100, 0x0201000002010102,
    0x0201000002010201, 0x0201000002020101, 0x0201000100000001, 0x0201000100000100,
    0x0201000100000102, 0x0201000100000201, 0x0201000100010000, 0x0201000100010101,
    0x0201000100010200, 0x0201000100010202, 0x0201000100020001, 0x0201000100020100,
    0x0201000100020102, 0x0201000100020201, 0x0201000101000000, 0x0201000101000101,
    0x0201000101010000, 0x0201000101010001, 0x0201000101010100, 0x0201000101010101,
    0x0201000101010102, 0x0201000101010201, 0x0201000101020002, 0x0201000101020101,
    0x0201000102000100, 0x0201000102000102, 0x0201000102010002, 0x0201000102010101,
    0x0201000102010200, 0x0201000102020001, 0x0201000102020100, 0x0201000102020102,
    0x0201000102020201, 0x0201000200000101, 0x0201000200010001, 0x0201000200010100,
    0x0201000200010201, 0x0201000200020101, 0x0201000201000100, 0x0201000201000102,
    0x0201000201000201, 0x0201000201010000, 0x0201000201010002, 0x0201000201010101,
    0x0201000201010200, 0x0201000201020102, 0x0201000201020201, 0x0201000202000101,
    0x0201000202010100, 0x0201000202010102, 0x0201000202020201, 0x0201010000000001,
    0x0201010000000100, 0x0201010000000102, 0x0201010000010000, 0x0201010000010101,
    0x0201010000010200, 0x0201010000020102, 0x0201010001000000, 0x0201010001000202,
    0x0201010001010001, 0x0201010001010100, 0x0201010001010101, 0x0201010001010102,
    0x0201010001010200, 0x0201010001010201, 0x0201010001020000, 0x0201010001020001,
    0x0201010001020002, 0x0201010001020101, 0x0201010002000100, 0x0201010002000102,
    0x0201010002010002, 0x0201010002010100, 0x0201010002010101, 0x0201010002010200,
    0x0201010002020001, 0x0201010002020201, 0x0201010100000000, 0x0201010100000101,
    0x0201010100000200, 0x0201010100000202, 0x0201010100010000, 0x0201010100010001,
    0x0201010100010100, 0x0201010100010101, 0x0201010100010102, 0x0201010100010201,
    0x0201010100020001, 0x0201010100020101, 0x0201010100020201, 0x0201010100020202,
    0x0201010101000001, 0x0201010101000100, 0x0201010101000101, 0x0201010101000102,
    0x0201010101000201, 0x0201010101010000, 0x0201010101010001, 0x0201010101010002,
    0x0201010101010100, 0x0201010101010101, 0x0201010101010102, 0x0201010101010200,
    0x0201010101010201, 0x0201010101010202, 0x0201010101020001, 0x0201010101020100,
    0x0201010101020101, 0x0201010101020102, 0x0201010101020201, 0x0201010102000001,
    0x0201010102000101, 0x0201010102000200, 0x0201010102010001, 0x0201010102010002,
    0x0201010102010100, 0x0201010102010101, 0x0201010102010102, 0x0201010102010201,
    0x0201010102010202, 0x0201010102020000, 0x0201010102020002, 0x0201010102020101,
    0x0201010102020200, 0x0201010102020202, 0x0201010200000001, 0x0201010200000100,
    0x0201010200010000, 0x0201010200010101, 0x0201010200010201, 0x0201010200020000,
    0x0201010200020102, 0x0201010200020201, 0x0201010201000101, 0x0201010201000200,
    0x0201010201000201, 0x0201010201010001, 0x0201010201010002, 0x0201010201010101,
    0x0201010201010102, 0x0201010201010201, 0x0201010201020101, 0x0201010201020200,
    0x0201010202000002, 0x0201010202000100, 0x0201010202000201, 0x0201010202000202,
    0x0201010202010002, 0x0201010202010100, 0x0201010202010101, 0x0201010202020100,
    0x0201010202020102, 0x0201010202020201, 0x0201020000000101, 0x0201020000010102,
    0x0201020000010201, 0x0201020000020101, 0x0201020001000001, 0x0201020001000102,
    0x0201020001010000, 0x0201020001010002, 0x0201020001010101, 0x0201020001010102,
    0x0201020001010202, 0x0201020001020100, 0x0201020001020101, 0x0201020002000101,
    0x0201020002010001, 0x0201020002010102, 0x0201020002010201, 0x0201020002020101,
    0x0201020100000100, 0x0201020100000102, 0x0201020100000201, 0x0201020100010000,
    0x0201020100010002, 0x0201020100010101, 0x0201020100010200, 0x0201020100010202,
    0x0201020100020000, 0x0201020100020001, 0x0201020100020100, 0x0201020100020102,
    0x0201020101000000, 0x0201020101000002, 0x0201020101000101, 0x0201020101000200,
    0x0201020101000202, 0x0201020101010001, 0x0201020101010100, 0x0201020101010101,
    0x0201020101010102, 0x0201020101010201, 0x0201020101020002, 0x0201020101020101,
    0x0201020101020102, 0x0201020101020202, 0x0201020102000001, 0x0201020102000100,
    0x0201020102010000, 0x0201020102010002, 0x0201020102010101, 0x0201020102010202,
    0x0201020102020001, 0x0201020102020102, 0x0201020200000101, 0x0201020200010101,
    0x0201020200020101, 0x0201020201000100, 0x0201020201000102, 0x0201020201000201,
    0x0201020201010000, 0x0201020201010101, 0x0201020201010200, 0x0201020201020001,
    0x0201020202000101, 0x0201020202010001, 0x0201020202010100, 0x0201020202010101,
    0x0201020202010102, 0x0202000000000000, 0x0202000000000002, 0x0202000000000200,
    0x0202000000000202, 0x0202000000010101, 0x0202000000020000, 0x0202000000020002,
    0x0202000000020200, 0x0202000000020202, 0x0202000001000101, 0x0202000001010001,
    0x0202000001010100, 0x0202000001010102, 0x0202000001010201, 0x0202000002000000,
    0x0202000002000002, 0x0202000002000200, 0x0202000002000202, 0x0202000002010101,
    0x0202000002020000, 0x0202000002020002, 0x0202000002020200, 0x0202000002020202,
    0x0202000100000101, 0x0202000100000201, 0x0202000100010001, 0x0202000100010100,
    0x0202000100010102, 0x0202000100010201, 0x0202000100010202, 0x0202000101000102,
    0x0202000101000201, 0x0202000101010001, 0x0202000101010101, 0x0202000101010200,
    0x0202000101010202, 0x0202000101020001, 0x0202000101020100, 0x0202000102000101,
    0x0202000102010000, 0x0202000102010002, 0x0202000102010102, 0x0202000102010201,
    0x0202000200000002, 0x0202000200000200, 0x0202000200000202, 0x0202000200010000,
    0x0202000200010201, 0x0202000200020002, 0x0202000200020200, 0x0202000200020202,
    0x0202000201000101, 0x0202000201010001, 0x0202000201010102, 0x0202000201010201,
    0x0202000201020101, 0x0202000202000000, 0x0202000202000002, 0x0202000202000200,
    0x0202000202000202, 0x0202000202010101, 0x0202000202020000, 0x0202000202020002,
    0x0202000202020200, 0x0202000202020202, 0x0202010000010201, 0x0202010000020101,
    0x0202010001000001, 0x0202010001000100, 0x0202010001010000, 0x0202010001010100,
    0x0202010001010101, 0x0202010001010200, 0x0202010001010202, 0x0202010001020001,
    0x0202010001020101, 0x0202010001020102, 0x0202010001020200, 0x0202010001020201,
    0x0202010002000101, 0x0202010100000102, 0x0202010100000201, 0x0202010100010000,
    0x0202010100010002, 0x0202010100010101, 0x0202010100010200, 0x0202010100020102,
    0x0202010100020201, 0x0202010101000002, 0x0202010101000101, 0x0202010101010001,
    0x0202010101010100, 0x0202010101010101, 0x0202010101010102, 0x0202010101010201,
    0x0202010101020101, 0x0202010101020202, 0x0202010102000001, 0x0202010102000100,
    0x0202010102000101, 0x0202010102000102, 0x0202010102000201, 0x0202010102010002,
    0x0202010102010101, 0x0202010102010200, 0x0202010200000101, 0x0202010200010001,
    0x0202010200010102, 0x0202010200010202, 0x0202010200020001, 0x0202010200020101,
    0x0202010201000100, 0x0202010201000102, 0x0202010201000202, 0x0202010201010002,
    0x0202010201010101, 0x0202010201010102, 0x0202010201010200, 0x0202010201020000,
    0x0202010201020002, 0x0202010202000102, 0x0202010202010000, 0x0202010202010101,
    0x0202010202010102, 0x0202010202010201, 0x0202010202020001, 0x0202010202020100,
    0x0202010202020102, 0x0202020000000000, 0x0202020000000002, 0x0202020000000200,
    0x0202020000000202, 0x0202020000020000, 0x0202020000020002, 0x0202020000020200,
    0x0202020000020202, 0x0202020001010001, 0x0202020001010100, 0x0202020001010102,
    0x0202020001010201, 0x0202020002000000, 0x0202020002000002, 0x0202020002000200,
    0x0202020002000202, 0x0202020002010101, 0x0202020002020000, 0x0202020002020002,
    0x0202020002020200, 0x0202020002020202, 0x0202020100000101, 0x0202020100010100,
    0x0202020100010201, 0x0202020100020001, 0x0202020100020101, 0x0202020101000001,
    0x0202020101010000, 0x0202020101010101, 0x0202020101010202, 0x0202020101020001,
    0x0202020101020102, 0x0202020101020201, 0x0202020102010000, 0x0202020102010102,
    0x0202020200000000, 0x0202020200000002, 0x0202020200000200, 0x0202020200000202,
    0x0202020200020000, 0x0202020200020002, 0x0202020200020200, 0x0202020200020202,
    0x0202020201010001, 0x0202020201010100, 0x0202020201010102, 0x0202020202000000,
    0x0202020202000002, 0x0202020202000200, 0x0202020202000202, 0x0202020202010101,
    0x0202020202020000, 0x0202020202020002, 0x0202020202020200, 0x0202020202020202,
};
#else
static const uint32_t iq1s_grid_us[2048] = {
    0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
    0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
    0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
    0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
    0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
    0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
    0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
    0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
    0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
    0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
    0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
    0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
    0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
    0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
    0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
    0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
    0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
    0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
    0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
    0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
    0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
    0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
    0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
    0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
    0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
    0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
    0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
    0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
    0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
    0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
    0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
    0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
    0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
    0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
    0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
    0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
    0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
    0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
    0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
    0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
    0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
    0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
    0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
    0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
    0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
    0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
    0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
    0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
    0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
    0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
    0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
    0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
    0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
    0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
    0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
    0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
    0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
    0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
    0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
    0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
    0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
    0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
    0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
    0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
    0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
    0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
    0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
    0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
    0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
    0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
    0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
    0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
    0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
    0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
    0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
    0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
    0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
    0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
    0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
    0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
    0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
    0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
    0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
    0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
    0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
    0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
    0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
    0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
    0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
    0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
    0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
    0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
    0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
    0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
    0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
    0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
    0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
    0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
    0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
    0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
    0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
    0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
    0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
    0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
    0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
    0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
    0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
    0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
    0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
    0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
    0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
    0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
    0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
    0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
    0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
    0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
    0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
    0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
    0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
    0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
    0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
    0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
    0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
    0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
    0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
    0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
    0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
    0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
    0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
    0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
    0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
    0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
    0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
    0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
    0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
    0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
    0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
    0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
    0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
    0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
    0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
    0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
    0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
    0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
    0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
    0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
    0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
    0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
    0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
    0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
    0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
    0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
    0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
    0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
    0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
    0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
    0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
    0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
    0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
    0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
    0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
    0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
    0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
    0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
    0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
    0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
    0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
    0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
    0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
    0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
    0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
    0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
    0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
    0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
    0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
    0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
    0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
    0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
    0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
    0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
    0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
    0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
    0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
    0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
    0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
    0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
    0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
    0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
    0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
    0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
    0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
    0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
    0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
    0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
    0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
    0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
    0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
    0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
    0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
    0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
    0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
    0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
    0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
    0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
    0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
    0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
    0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
    0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
    0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
    0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
    0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
    0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
    0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
    0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
    0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
    0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
    0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
    0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
    0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
    0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
    0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
    0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
    0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
    0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
    0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
    0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
    0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
    0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
    0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
    0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
    0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
    0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
    0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
    0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
    0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
    0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
    0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
    0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
    0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
    0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
    0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
    0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
    0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
    0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
    0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
    0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
    0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
    0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
    0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
    0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
    0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
    0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
    0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
    0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
    0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
    0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
};
#endif

#ifndef HAVE_FANCY_SIMD
const uint64_t keven_signs[128] = {
    0x0101010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0x010101010101ffff,
    0xff01010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0xff01010101ffffff,
    0xff010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0xff010101ff01ffff,
    0x01010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0x01010101ffffffff,
    0xff0101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0xff0101ff0101ffff,
    0x010101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0x010101ff01ffffff,
    0x010101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0x010101ffff01ffff,
    0xff0101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0xff0101ffffffffff,
    0xff01ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0xff01ff010101ffff,
    0x0101ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0x0101ff0101ffffff,
    0x0101ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0x0101ff01ff01ffff,
    0xff01ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0xff01ff01ffffffff,
    0x0101ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0x0101ffff0101ffff,
    0xff01ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0xff01ffff01ffffff,
    0xff01ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0xff01ffffff01ffff,
    0x0101ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0x0101ffffffffffff,
    0xffff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0xffff01010101ffff,
    0x01ff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0x01ff010101ffffff,
    0x01ff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0x01ff0101ff01ffff,
    0xffff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0xffff0101ffffffff,
    0x01ff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0x01ff01ff0101ffff,
    0xffff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0xffff01ff01ffffff,
    0xffff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0xffff01ffff01ffff,
    0x01ff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0x01ff01ffffffffff,
    0x01ffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0x01ffff010101ffff,
    0xffffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0xffffff0101ffffff,
    0xffffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0xffffff01ff01ffff,
    0x01ffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0x01ffff01ffffffff,
    0xffffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0xffffffff0101ffff,
    0x01ffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0x01ffffff01ffffff,
    0x01ffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0x01ffffffff01ffff,
    0xffffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0xffffffffffffffff,
};
#endif

}

/* moonll change mulmat
add typeB and strideB
}*/

bool iqk_mul_mat(long Nx, long Ny, long ne00,
    int typeA, const void * A, long strideA,
    int typeB, const void * B, long strideB,
    float * C, long stride_C, int ith, int nth) {

        MulMat mm;
    
        if (!MulMat::set_mul_mat(typeA, typeB, ne00, mm, Ny)) {
            return false;
        }

        size_t row_size_qx = strideA*ggml_type_size(ggml_type(typeA));
        size_t row_size_qy = strideB*ggml_type_size(ggml_type(typeB));
      
        
        auto nrc_x = (Nx + nth - 1)/nth;
        auto first_x = ith*nrc_x;
        if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;

        DataInfo info{C + first_x, (const char *)B, (size_t)stride_C, row_size_qy, 0, 1, nullptr, 0};

        mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);

        return true;
}


bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const void * A, const void * B,
        float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) {
    const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping;
    assert(row_mapping != nullptr);

    MulMat mm;
    int row_size_q8;
    /* moonll

    if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
        return false;
    }*/
    int row_size_qx = ggml_row_size((ggml_type)typeA, ne00);
    int nrc_x = (Nx + nth - 1)/nth;
    int first_x = ith*nrc_x;
    if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;
    DataInfo info{C + first_x, (const char *)B, nb1/sizeof(float), (size_t)row_size_q8, 0, ne11, row_mapping, nb2/sizeof(float)};
    mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
    return true;
}

#if defined __x86_64__ || defined(_M_X64)

#if defined HAVE_FANCY_SIMD
    #undef HAVE_FANCY_SIMD
#endif
#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
    #define HAVE_FANCY_SIMD
#endif
//#define HAVE_FANCY_SIMD

namespace {

inline float hsum_float_4(__m128 x) {
    x = _mm_add_ps(x, _mm_movehl_ps(x, x));
    x = _mm_add_ss(x, _mm_movehdup_ps(x));
    return _mm_cvtss_f32(x);
}
inline float hsum_float_8(__m256 x) {
    return hsum_float_4(_mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1)));
}

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)


template <int nrc, typename block_q8 = block_q8_K> struct Q8 {

    constexpr static int nrc_y = nrc;

    Q8(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
    }

#ifdef HAVE_FANCY_SIMD
    inline __m512i load_quants64(int iy, int i, int j) const { return _mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); }
#endif
    inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); }
    inline __m256i load_bsums(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].bsums); }
    inline float scale(int iy, int i) const { return y[iy][i].d; }

    const block_q8 * y[nrc_y];
};

// Handles q4_K and q5_K scales/mins
struct Scales8K {
    template <typename Q8>
    inline __m256i process_mins_and_scales(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) {
        make_q4_scales(data, utmp);
        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
        const __m128i mins128 = _mm256_extracti128_si256(mins_and_scales, 1);
        accum_mins(mins128, q8, i, c, accd);
        const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
        return MM256_SET_M128I(sc128, sc128);
    }
#ifdef HAVE_FANCY_SIMD
    template <typename Q8>
    inline __m512i process_mins_and_scales_64(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) {
        auto scales = process_mins_and_scales(data, c, i, q8, accd);
        return _mm512_inserti32x8(_mm512_castsi256_si512(scales), scales, 1);
    }
#endif
    template <typename Q8>
    inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
        const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0]));
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i q8s = q8.load_bsums(iy, i);
            const __m256i prod = _mm256_madd_epi16(mins, q8s);
            accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
        }
    }
#ifdef HAVE_FANCY_SIMD
    const __m512i shuffles512[2] = {
        _mm512_set_epi64(0x0706070607060706, 0x0302030203020302, 0x0706070607060706, 0x0302030203020302,
                         0x0504050405040504, 0x0100010001000100, 0x0504050405040504, 0x0100010001000100),
        _mm512_set_epi64(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a,
                         0x0d0c0d0c0d0c0d0c, 0x0908090809080908, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
    };
#endif
    const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100),
                                 _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)};

    uint32_t utmp[4];
};

template <typename Q8>
inline void process_mins_16(const __m256i& all_scales, const Q8& q8, int i, float d, __m256 * accm) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        const __m256i prod  = _mm256_madd_epi16(all_scales, q8.load_bsums(iy, i));
        accm[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d * q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accm[iy]);
    }
}
inline void prepare_scales_16(const __m256i& all_scales, __m256i * scales) {
    const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
    const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
    scales[0] = MM256_SET_M128I(l_scales, l_scales);
    scales[1] = MM256_SET_M128I(h_scales, h_scales);
}

struct ScaleQ3 {
    inline __m128i make_scales(const uint16_t * s8) const {
        const uint16_t * scales16 = (const uint16_t *)s8;
        uint32_t aux0 = scales16[0] | (scales16[1] << 16);
        uint32_t aux1 = scales16[2] | (scales16[3] << 16);
        uint32_t aux2 = scales16[4] | (scales16[5] << 16);
        __m128i scales128 = _mm_set_epi32(
            ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030),
            ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030),
             (aux1       & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030),
             (aux0       & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030));
        return _mm_add_epi8(scales128, m32);
    }
    const __m128i m32 = _mm_set1_epi8(-32);
};

struct ScaleIQ4XS {
    inline __m128i make_scales(const uint32_t scales_l, const uint16_t scales_h) {
        uint32_t tmp32 = scales_h | (scales_h << 14);
        const __m128i sh = _mm_slli_epi16(_mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(tmp32), hshift), hmask), 4);
        const __m128i sl = _mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(scales_l), lshift), lmask);
        return _mm_add_epi16(_mm_or_si128(sh, _mm_cvtepi8_epi16(_mm_shuffle_epi8(sl, lshuffle))), m32);
    }
    const __m128i hshift = _mm_set_epi32(12, 8, 4, 0);
    const __m128i lshift = _mm_set_epi32(4, 0, 4, 0);
    const __m128i hmask  = _mm_set1_epi16(0x03);
    const __m128i lmask  = _mm_set1_epi8(0xf);
    const __m128i lshuffle = _mm_set_epi32(0x07030602, 0x05010400, 0x07030602, 0x05010400);
    const __m128i m32 = _mm_set1_epi16(-32);
};

struct Scales8KBase {
    template <typename Q8>
    inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
        const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0]));
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i q8s = q8.load_bsums(iy, i);
            const __m256i prod = _mm256_madd_epi16(mins, q8s);
            accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
        }
    }
    inline __m256i shuffle(__m128i mins) const {
        return MM256_SET_M128I(_mm_shuffle_epi8(mins, shuffles[1]), _mm_shuffle_epi8(mins, shuffles[0]));
    }
    const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100),
                                 _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)};
};

template <typename Block>
struct BaseDequantizer {
    BaseDequantizer(const void * vx, size_t bx) : vx(vx), bx(bx) {}
    inline void new_row(int ix) {
        x = (const Block *)((const char *)vx + bx*ix);
    }

    const void *  vx;
    size_t        bx;
    const Block * x;

    float d;
};

__m128i inline load_iq4nl_values_128() {
    static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
    return _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
}

__m256i inline load_iq4nl_values_256() {
    auto val128 = load_iq4nl_values_128();
    return MM256_SET_M128I(val128, val128);
}

#ifdef HAVE_FANCY_SIMD
//====================================== Zen4 ==================================================

struct BlockPermuter {
    const __m512i permute1 = _mm512_set_epi64(11, 10,  9,  8, 3, 2, 1, 0);
    const __m512i permute2 = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4);
};

struct Q4Bits {
    inline void prepare(const uint8_t * q4) {
        auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0);
        auto tmp1 = _mm512_and_si512(q4bits, ml);
        auto tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
        values[0] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2);
        values[1] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2);
        q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1);
        tmp1 = _mm512_and_si512(q4bits, ml);
        tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
        values[2] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2);
        values[3] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2);
    }
    inline void prepare64(const uint8_t * q4) {
        auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0);
        values[0] = _mm512_and_si512(q4bits, ml);
        values[1] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
        q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1);
        values[2] = _mm512_and_si512(q4bits, ml);
        values[3] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
    }
    __m512i values[4];
    const __m512i ml = _mm512_set1_epi8(0xf);
    BlockPermuter perm;
};

struct Q2Bits {
    inline void prepare(const uint8_t * q2) {

        auto q2bits = _mm512_loadu_si512((const __m512i*)q2);
        auto tmp = _mm512_srli_epi16(q2bits, 2);

        values[0] = _mm512_permutex2var_epi64(q2bits, perm.permute1, tmp);
        values[2] = _mm512_permutex2var_epi64(q2bits, perm.permute2, tmp);
        values[1] = _mm512_and_si512(_mm512_srli_epi16(values[0], 4), ml);
        values[3] = _mm512_and_si512(_mm512_srli_epi16(values[2], 4), ml);
        values[0] = _mm512_and_si512(values[0], ml);
        values[2] = _mm512_and_si512(values[2], ml);
    }
    __m512i values[4];
    const __m512i ml = _mm512_set1_epi8(0x03);
    BlockPermuter perm;
};

struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
        scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
    }

    Q4Bits bits;
    Scales8K s8k;
};

/*
moonll DequantizerIQ4XS
*/

__m512i inline load_iq4nl_values_512() {
    auto val256 = load_iq4nl_values_256();
    return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1);
}

struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
    DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_512()) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        prepare(x[i].qs);
        auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h);
        s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
        auto scales256 = MM256_SET_M128I(scales128, scales128);
        auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1);
        scales[0] = _mm512_shuffle_epi8(all_scales, shuffles[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, shuffles[1]);
        scales[2] = _mm512_shuffle_epi8(all_scales, shuffles[2]);
        scales[3] = _mm512_shuffle_epi8(all_scales, shuffles[3]);
    }
    inline void prepare(const uint8_t * q4) {
        bits.prepare64(q4);
        // We now have in bits.valuse[0]: 0...15, 32...47, 64...79, 96...111
        //                bits.valuse[1]: 16..31, 48...63, 80...95, 112..127
        //                etc.
        auto tmp = _mm512_permutex2var_epi64(bits.values[0], permute1, bits.values[1]);
        bits.values[1] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[0], permute2, bits.values[1]));
        bits.values[0] = _mm512_shuffle_epi8(values, tmp);
        tmp = _mm512_permutex2var_epi64(bits.values[2], permute1, bits.values[3]);
        bits.values[3] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[2], permute2, bits.values[3]));
        bits.values[2] = _mm512_shuffle_epi8(values, tmp);
    }

    Q4Bits bits;
    Scales8KBase s8k;
    ScaleIQ4XS siq4;
    const __m512i values;
    const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2,  9,  8, 1, 0);
    const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
    const __m512i shuffles[4] = {
        _mm512_inserti32x8(_mm512_set1_epi16(0x0100), _mm256_set1_epi16(0x0302), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0504), _mm256_set1_epi16(0x0706), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0908), _mm256_set1_epi16(0x0b0a), 1),
        _mm512_inserti32x8(_mm512_set1_epi16(0x0d0c), _mm256_set1_epi16(0x0f0e), 1),
    };
};

struct HighBit5 {
    inline void apply(const uint8_t * h, Q4Bits& bits) {
        auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
        auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(hbits, mh));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
    }
    const __m512i mh = _mm512_set1_epi8(0x10);
};

struct HighBit3 {
    inline void apply(const uint8_t * h, Q2Bits& bits) {
        auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
        auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(hbits, mh));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 4), mh));
    }
    const __m512i mh = _mm512_set1_epi8(0x04);
};

struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        hbits.apply(x[i].qh, bits);
        auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
        scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
    }

    Q4Bits bits;
    HighBit5 hbits;
    Scales8K s8k;
};

struct Scale16 {
    inline void make_scales(const __m128i& scales8, __m512i * scales) const {
        auto all_scales8 = MM256_SET_M128I(scales8, scales8);
        auto scales1 = _mm256_shuffle_epi8(all_scales8, shuffle1);
        auto scales2 = _mm256_shuffle_epi8(all_scales8, shuffle2);
        scales[0] = _mm512_cvtepi8_epi16(scales1);
        scales[1] = _mm512_cvtepi8_epi16(scales2);
    }
    template <typename Q8>
    inline void process_mins_and_scales(int i, float c, const __m128i& mins8, const __m128i& scales8,
        const Q8& q8, __m256 * accm, __m512i * scales) const {
        process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, c, accm);
        make_scales(scales8, scales);
    }
    const __m256i shuffle1 = _mm256_set_epi32(0x07070707, 0x03030303, 0x06060606, 0x02020202,
                                              0x05050505, 0x01010101, 0x04040404, 0x00000000);
    const __m256i shuffle2 = _mm256_set_epi32(0x0f0f0f0f, 0x0b0b0b0b, 0x0e0e0e0e, 0x0a0a0a0a,
                                              0x0d0d0d0d, 0x09090909, 0x0c0c0c0c, 0x08080808);
};

struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
        sc16.process_mins_and_scales(i, -GGML_FP16_TO_FP32(x[i].dmin), mins8, scales8, q8, accm, scales);
    }

    Q2Bits bits;
    Scale16 sc16;
    const __m128i m4 = _mm_set1_epi8(0xf);

};

struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        hbits.apply(x[i].hmask, bits);
        auto scales128 = sc3.make_scales((const uint16_t *)x[i].scales);
        sc16.process_mins_and_scales(i, -4.f*d, scales128, scales128, q8, accm, scales);
    }

    Q2Bits bits;
    HighBit3 hbits;
    ScaleQ3 sc3;
    Scale16 sc16;
    const __m128i m4  = _mm_set1_epi8(0xf);
    const __m128i m32 = _mm_set1_epi8(-32);
};

struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare64(x[i].ql);
        add_high_bits(x[i].qh, bits);
        auto scales128 = _mm_loadu_si128((const __m128i *)x[i].scales);
        sc16.process_mins_and_scales(i, -32.f*d, scales128, scales128, q8, accm, scales);
    }

    inline void add_high_bits(const uint8_t * qh, Q4Bits& bits) const {
        auto hbits = _mm512_loadu_si512((const __m512i *)qh);
        auto tmp1 = _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh);
        auto tmp2 = _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2));
        tmp1 = _mm512_and_si512(hbits, mh);
        tmp2 = _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh);
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2));
    }

    Q4Bits bits;
    HighBit3 hbits;
    Scale16 sc16;

    const __m512i mh = _mm512_set1_epi8(0x30);

};

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accm[nrc_y];
    __m512  accd[nrc_y];
    __m512i scales[2];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
        for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accm, scales);

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants(iy, i, 0));
                const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants(iy, i, 1));
                const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants(iy, i, 2));
                const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants(iy, i, 3));
                auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
                sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
                accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
            info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
        }

    }
}
template <typename Q8>
inline void compute_block(int iy, int i, float d, const Q8& q8, const __m512i * values, const __m512i * scales, __m512 * accd) {
    const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[0], q8.load_quants64(iy, i, 0));
    const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[1], q8.load_quants64(iy, i, 1));
    const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[2], q8.load_quants64(iy, i, 2));
    const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[3], q8.load_quants64(iy, i, 3));
    auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
    sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
    accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
}

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accm[nrc_y];
    __m512  accd[nrc_y];
    __m512i scales[2];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
        for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accm, scales);

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants64(iy, i, 0));
                const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants64(iy, i, 1));
                const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants64(iy, i, 2));
                const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants64(iy, i, 3));
                auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
                sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
                accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
            info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
        }

    }
}

template <typename Dequantizer, int nrc_y>
static void mul_mat_iqX_k_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accm[nrc_y];
    __m512  accd[nrc_y];
    __m512i scales[4];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
        for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accm, scales);

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m512i p1 = _mm512_maddubs_epi16(deq.bits.values[0], q8.load_quants64(iy, i, 0));
                const __m512i p2 = _mm512_maddubs_epi16(deq.bits.values[1], q8.load_quants64(iy, i, 1));
                const __m512i p3 = _mm512_maddubs_epi16(deq.bits.values[2], q8.load_quants64(iy, i, 2));
                const __m512i p4 = _mm512_maddubs_epi16(deq.bits.values[3], q8.load_quants64(iy, i, 3));
                auto sumi = _mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_setzero_si512(),
                                    p1, scales[0]), p2, scales[1]), p3, scales[2]), p4, scales[3]);
                accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
            info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
        }

    }
}

template <typename Dequantizer>
static void mul_mat_qX_K_q8_K_AVX512_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    constexpr int k_nx = 2;

    Q8<1> q8(info);

    Dequantizer deq1(vx, bx);
    Dequantizer deq2(vx, bx);

    Dequantizer * deq[k_nx];
    deq[0] = &deq1;
    deq[1] = &deq2;

    __m512i scales[2*k_nx];

    for (int ix = 0; ix < nrc_x; ++ix) {

        auto accd = _mm512_setzero_ps();
        auto accm = _mm256_setzero_ps();

        for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_row(ix);

        for (int i = 0; i < nb/k_nx; ++i) {

            for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_block(k_nx*i+kx, q8, &accm, scales+2*kx);

            for (int kx = 0; kx < k_nx; ++kx) {
                compute_block(0, k_nx*i+kx, deq[kx]->d, q8, deq[kx]->bits.values, scales+2*kx, &accd);
            }

        }
        if (2*(nb/2) < nb) {
            int i0 = 2*(nb/2);
            deq[0]->new_block(i0, q8, &accm, scales);
            compute_block(0, i0, deq[0]->d, q8, deq[0]->bits.values, scales, &accd);
        }

        auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd), _mm512_extractf32x8_ps(accd, 1));
        info.store(ix, 0, hsum_float_8(_mm256_add_ps(accm, sum256)));
    }
}

#else
// ===================================== Vanilla AVX2 =====================================

struct Q4Bits {
    inline void prepare(const uint8_t * q4, int j) {
        auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0);
        values[0] = _mm256_and_si256(q4bits, ml);
        values[1] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
        q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1);
        values[2] = _mm256_and_si256(q4bits, ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
    }
    inline void prepare64(const uint8_t * q4, int j) {
        auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0);
        values[0] = _mm256_and_si256(q4bits, ml);
        values[2] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
        q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1);
        values[1] = _mm256_and_si256(q4bits, ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
    }
    inline void prepare16(const uint8_t * q4, int j) {
        values[0] = dequant16(q4 + 64*j +  0);
        values[1] = dequant16(q4 + 64*j + 16);
        values[2] = dequant16(q4 + 64*j + 32);
        values[3] = dequant16(q4 + 64*j + 48);
    }
    inline __m256i dequant16(const uint8_t * qs) const {
        const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
        const __m256i aux256 = MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128);
        return _mm256_and_si256(ml, aux256);
    };
    __m256i values[4];
    const __m256i ml = _mm256_set1_epi8(0xf);
};

struct Q2Bits {
    inline void prepare(const uint8_t * q2, int j) {
        auto q2bits = _mm256_loadu_si256((const __m256i *)q2 + j);
        values[0] = _mm256_and_si256(q2bits, ml);
        values[1] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), ml);
        values[2] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), ml);
    }
    __m256i values[4];
    const __m256i ml = _mm256_set1_epi8(0x03);
};

struct HighBit5 {
    inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); }
    inline void apply(Q4Bits& bits, bool do_shift) {
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 3), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh));
        if (do_shift) {
            hbits = _mm256_srli_epi16(hbits, 4);
        }
    }
    const __m256i mh = _mm256_set1_epi8(0x10);
    __m256i hbits;
};

struct HighBit3 {
    inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); }
    inline void apply(Q2Bits& bits, bool do_shift) {
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 1), mh));
        if (do_shift) {
            hbits = _mm256_srli_epi16(hbits, 4);
        }
    }
    const __m256i mh = _mm256_set1_epi8(0x04);
    __m256i hbits;
};


/*
template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
    if (j == 0) {
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
            sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4));
        }
    } else {
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
        }
    }
}*/

struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
    }

    Q4Bits bits;
    Scales8K s8k;
};

struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
    DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_values()) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h);
        s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
        return MM256_SET_M128I(scales128, scales128);
    }
    inline void prepare(int i, int j) {
        bits.prepare16(x[i].qs, j);
        bits.values[0] = _mm256_shuffle_epi8(values, bits.values[0]);
        bits.values[1] = _mm256_shuffle_epi8(values, bits.values[1]);
        bits.values[2] = _mm256_shuffle_epi8(values, bits.values[2]);
        bits.values[3] = _mm256_shuffle_epi8(values, bits.values[3]);
    }

    static __m256i load_values() {
        static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
        auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
        return MM256_SET_M128I(val128, val128);
    }

    Q4Bits bits;
    Scales8K s8k;
    ScaleIQ4XS siq4;
    const __m256i values;
};

struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        hbits.load(x[i].qh);
        return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
        hbits.apply(bits, j == 0);
    }

    Q4Bits  bits;
    HighBit5 hbits;
    Scales8K s8k;
};

template <typename Q8>
inline void process_mins_and_scales_16(const __m128i& scales128, const Q8& q8, int i, float d,
    __m256 * accm, __m256i * scales) {
    const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
    process_mins_16(all_scales, q8, i, d, accm);
    prepare_scales_16(all_scales, scales);
}

struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}

    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        hbits.load(x[i].hmask);
        process_mins_and_scales_16(sc3.make_scales((const uint16_t *)x[i].scales), q8, i, -4.f*d, accm, scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
        hbits.apply(bits, j == 0);
    }

    Q2Bits  bits;
    HighBit3 hbits;
    ScaleQ3 sc3;

    const __m128i m32 = _mm_set1_epi8(-32);
};

struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}

    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
        process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, -GGML_FP16_TO_FP32(x[i].dmin), accm);
        prepare_scales_16(_mm256_cvtepi8_epi16(scales8), scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
    }

    Q2Bits  bits;

    const __m128i m4 = _mm_set1_epi8(0xf);
};

struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        process_mins_and_scales_16(_mm_loadu_si128((const __m128i *)x[i].scales), q8, i, -32.f*d, accm, scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare64(x[i].ql, j);
        auto hbits = _mm256_loadu_si256((const __m256i *)x[i].qh + j);
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 2), mh));
    }

    Q4Bits  bits;
    const __m256i mh = _mm256_set1_epi8(0x30);
};

inline __m256i get_scale_shuffle_8(int i);

inline void set_scales_8(const __m256i& all_scales, int j, __m256i* scales);

inline __m256i get_scale_shuffle_16(int i);

inline void set_scales_16(const __m256i& all_scales, __m256i* scales);


template <typename Dequantizer, int nrc_y>
static void mul_mat_qY_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%QK_K == 0);
    const int nb = n/QK_K;

    Q8<nrc_y> q8(info);

    __m256i all_scales[2];
    __m256i scales[4];
    __m256  accd[nrc_y];

    Dequantizer deq(vx, bx);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accd, all_scales);

            __m256i sumi[nrc_y];

            for (int j = 0; j < QK_K/128; ++j) {
                deq.prepare(i, j);
                set_scales_16(all_scales[j], scales);
                multiply_add(deq.bits, scales, j, i, q8, sumi);
            }

            for (int iy = 0; iy < nrc_y; ++iy) {
                accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(iy, i)), _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }

    }

}

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accd[nrc_y];
    __m256i scales[4];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            auto all_scales = deq.new_block(i, q8, accd);

            __m256i sumi[nrc_y];

            for (int j = 0; j < QK_K/128; ++j) {

                deq.prepare(i, j);

                set_scales_8(all_scales, j, scales);

                multiply_add(deq.bits, scales, j, i, q8, sumi);

            }

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i));
                accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }

    }
}
#endif  // Zen4 or vanilla AVX2


//
// ============================== Legacy quants
//

struct DotHelper {
    const __m256i m1 = _mm256_set1_epi16(1);
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
    inline __m256i dot(__m256i x, __m256i y) const {
        return _mm256_dpbusd_epi32(_mm256_setzero_si256(), x, y);
    }
#else
    inline __m256i dot(__m256i x, __m256i y) const {
        return _mm256_madd_epi16(m1, _mm256_maddubs_epi16(x, y));
    }
#endif
};

struct SignedDot {
    DotHelper helper;
    inline __m256i compute(__m256i x, __m256i y) const {
        return helper.dot(_mm256_sign_epi8(x, x), _mm256_sign_epi8(y, x));
    }
};
struct UnsignedDot {
    DotHelper helper;
    inline __m256i compute(__m256i x, __m256i y) const {
        return helper.dot(x, y);
    }
};
template <typename Q8, typename Dot> struct Sum4 {
    Dot dot;
    inline __m256i compute(const __m256i * qx, const Q8 * y) const {
        const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
        const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
        const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
        const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
        const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1));    // 0,0, 1,1, 0,0, 1,1
        const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3));    // 2,2, 3,3, 2,2, 3,3
        return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3
    }
};

struct Sum4_Q8 {
    SignedDot dot;
    static inline __m256i add1(__m256i a, __m256i b) {
        return _mm256_add_epi32(_mm256_unpacklo_epi32(a, b), _mm256_unpackhi_epi32(a, b));
    }
    static inline __m256i add2(__m256i a, __m256i b) {
        return _mm256_add_epi32(_mm256_unpacklo_epi64(a, b), _mm256_unpackhi_epi64(a, b));
    }
    inline __m256i compute(const __m256i * qx, const block_q8_0 * y) const {
        const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
        const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
        const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
        const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
        const __m256i p01 = add1(p0, p1);  // 0,1, 0,1, 0,1, 0,1
        const __m256i p23 = add1(p2, p3);  // 2,3, 2,3, 2,3, 2,3
        return add2(p01, p23); // returns 0,1,2,3, 0,1,2,3
    }
};

struct ScaleHelperQ_0 {
    ggml_half scales8[4];
    template <typename Q>
    inline __m128 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
        return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
    }
    template <typename Q>
    inline __m128 prepare4(__m128 other_scales, const Q * y) {
        return _mm_mul_ps(other_scales, prepare4<Q>(y));
    }
    template <typename Q> inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); }
    template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
};
template <int min_value>
struct ScaleHelperQ_0_1 {
    ggml_half scales8[4];
    template <typename Q>
    inline __m256 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
        auto s4 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
        return _mm256_set_m128(_mm_mul_ps(s4, min), s4);
    }
    template <typename Q>
    inline __m256 prepare4(__m256 other_scales, const Q * y) {
        return _mm_mul256_ps(other_scales, prepare4<Q>(y));
    }
    template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
        float d = GGML_FP16_TO_FP32(y->d);
        return std::make_pair(d, -d*float(min_value));
    }
    std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
    }
    const __m128 min = _mm_set1_ps(float(-min_value));
};

struct ScaleHelperQ_1 {
    uint32_t scales8[4];
    const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100);

    template <typename Q>
    inline __m256 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) {
            // it is slightly faster to directly dereference (const uint32 *)&y[j].d, but some compilers
            // complain that this breaks strict-aliasing rules.
            memcpy(scales8 + j, &y[j].d, sizeof(uint32_t));
        }
        return _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)scales8), shuffle));
    }

    template <typename Q>
    inline __m256 prepare4(__m256 other_scales, const Q * y) {
        return _mm256_mul_ps(other_scales, prepare4<Q>(y));
    }

    template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
        return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m));
    }
    template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m));
    }
    std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
    }
};

struct MinusType0 {
    inline __m256 compute(__m128 d, int) const { return _mm256_set_m128(d, d); }
    inline float compute(float d, int) const { return d; }
    inline float result(__m256 acc, int) const { return hsum_float_8(acc); }
};

template <int nrc_y> struct MinusType1 {
    __m128 accm[nrc_y];
    MinusType1() { for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm_setzero_ps(); }
    inline __m256 compute(__m256 dm, int iy) {
        const __m128 d = _mm256_castps256_ps128(dm);
        const __m128 m = _mm256_extractf128_ps(dm, 1);
        accm[iy] = _mm_add_ps(accm[iy], m);
        return _mm256_set_m128(d, d);
    }
    inline float compute(const std::pair<float, float>& dm, int iy) {
        accm[iy] = _mm_add_ps(accm[iy], _mm_set1_ps(dm.second*0.25f));
        return dm.first;
    }
    inline float result(__m256 acc, int iy) const {
        const __m128 sum = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
        return hsum_float_4(_mm_add_ps(sum, accm[iy]));
    }
};

template <typename Minus, int nrc_y, bool is_multiple_of_4> struct AccumT {
    __m256 acc[nrc_y];
    Minus accm;
    AccumT() {  for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = _mm256_setzero_ps(); }
    template <typename Unpacker, typename Scales, typename Sum, typename Q8>
    inline void compute(int nb, Unpacker& unp, Scales& scales, Sum& sum, const Q8 ** y, const DataInfo& info, int ix) {
        auto qx = unp.quants();
        __m256 dall[nrc_y];
        for (int i = 0; i < nb/4; ++i) {
            auto other_scales = unp.set_block_4(i);
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto s12 = scales.prepare4(other_scales, y[iy] + 4*i);
                dall[iy] = accm.compute(s12, iy);
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto pall = sum.compute(qx, y[iy] + 4*i);
                acc[iy] = _mm256_fmadd_ps(dall[iy], _mm256_cvtepi32_ps(pall), acc[iy]);
            }
        }
        if (!is_multiple_of_4) {
            for (int i = 4*(nb/4); i < nb; ++i) {
                auto other_scales = unp.set_block(i);
                for (int iy = 0; iy < nrc_y; ++iy) {
                    auto s12 = scales.prepare1(other_scales, y[iy] + i);
                    auto d = accm.compute(s12, iy);
                    const __m256i p0 = sum.dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[iy][i].qs));
                    acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(p0), acc[iy]);
                }
            }
        }
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, accm.result(acc[iy], iy));
            //s[iy*bs] = accm.result(acc[iy], iy);
        }
    }
};

template <int nrc_y, bool is_multiple_of_4>
using AccumType0 = AccumT<MinusType0, nrc_y, is_multiple_of_4>;

template <int nrc_y, bool is_multiple_of_4>
using AccumType1 = AccumT<MinusType1<nrc_y>, nrc_y, is_multiple_of_4>;

using Sum4Type0 = Sum4<block_q8_0, SignedDot>;
using Sum4Type1 = Sum4<block_q8_1, UnsignedDot>;

template <typename Unpacker, typename Sum4Type, typename AccumType, typename Scales, typename Q8, int nrc_y>
void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) {
    Unpacker unp(vx, bx);
    Sum4Type sum4;
    Scales scales;
    for (int ix = 0; ix < nrc_x; ++ix) {
        unp.set_row(ix);
        AccumType accum;
        accum.compute(nb, unp, scales, sum4, y, info, ix);
    }
}

template <typename Unpacker, int nrc_y>
void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_0> q8(info);
    int nb = n/Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, true>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, false>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}

template <typename Unpacker, int nrc_y>
void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_1> q8(info);
    int nb = n/Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, true>, ScaleHelperQ_1, block_q8_1, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, false>, ScaleHelperQ_1, block_q8_1, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}

struct Dequantizer4bit {
    const __m256i m4 = _mm256_set1_epi8(0xf);
    inline __m256i dequant(const uint8_t * qs) const {
        const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
        return _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), m4);
    }
};

struct Q8_0_Dequantizer {
    inline __m256i dequant(const block_q8_0 * x) const {
        return _mm256_loadu_si256((const __m256i *)x->qs);
    }
};

struct Q8_0_1_Dequantizer {
    inline __m256i dequant(const block_q8_0 * x) const {
        return _mm256_add_epi8(_mm256_set1_epi8(127), _mm256_loadu_si256((const __m256i *)x->qs));
    }
};

struct Q4_0_Dequantizer {
    Dequantizer4bit b4;
    const __m256i m8 = _mm256_set1_epi8(-8);
    inline __m256i dequant(const block_q4_0 * x) const {
        return _mm256_add_epi8(b4.dequant(x->qs), m8);
    }
};

struct Q4_1_Dequantizer {
    Dequantizer4bit b4;
    inline __m256i dequant(const block_q4_1 * x) const {
        return b4.dequant(x->qs);
    }
};

struct HBitDequantizer {
    const __m256i shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
    const __m256i mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
    const __m256i minus1 = _mm256_set1_epi64x(-1);
    inline __m256i to_bytes(const uint8_t * bits) const {
        // Note: Data in all ggml quants is at least 2-byte aligned.
        // => we can cast to uint16_t and use or on two consecutive entries
        // which is faster than memcpy
        const uint16_t * aux16 = (const uint16_t *)bits;
        const uint32_t aux32 = aux16[0] | (aux16[1] << 16);
        //uint32_t aux32; memcpy(&aux32, bits, sizeof(uint32_t));
        __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(aux32), shuffle);
        bytes = _mm256_or_si256(bytes, mask);
        return _mm256_cmpeq_epi8(bytes, minus1);
    }
};

struct Q5_0_Dequantizer {
    Dequantizer4bit b4;
    HBitDequantizer hbit;
    const __m256i mh = _mm256_set1_epi8((char)0xF0);
    inline __m256i dequant(const block_q5_0 * x) const {
        const __m256i vqh = _mm256_andnot_si256(hbit.to_bytes(x->qh), mh);
        return _mm256_or_si256(b4.dequant(x->qs), vqh);
    }
};

struct Q5_1_Dequantizer {
    Dequantizer4bit b4;
    HBitDequantizer hbit;
    const __m256i mh = _mm256_set1_epi8(0x10);
    inline __m256i dequant(const block_q5_1 * x) const {
        const __m256i vqh = _mm256_and_si256(hbit.to_bytes(x->qh), mh);
        return _mm256_or_si256(b4.dequant(x->qs), vqh);
    }
};

template <typename Q, typename Scales, typename Dequantizer>
struct Q_Unpacker {
    Q_Unpacker(const void * vx, size_t bx) : cx_0((const char *)vx), x((const Q*)cx_0), bx(bx) {}

    const char * cx_0;
    const Q    * x;
    size_t       bx;

    Scales scales;
    Dequantizer deq;

    __m256i qx[4];

    inline const __m256i* quants() const { return qx; }

    inline void set_row(int ix) { x = (const Q*)(cx_0 + ix*bx); }

    inline auto set_block_4(int i) {
        for (int j = 0; j < 4; ++j) {
            qx[j] = deq.dequant(x + 4*i + j);
        }
        return scales.prepare4(x + 4*i);
    }
    inline auto set_block(int i) {
        qx[0] = deq.dequant(x + i);
        return scales.prepare1(x + i);
    }
};

struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_Dequantizer> {
    Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_0; }
};
struct Q8_0_1_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0_1<127>, Q8_0_1_Dequantizer> {
    Q8_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
//    using Sum4T = Sum4TypeQ81;
    inline static int block_size() { return QK8_0; }
};
struct Q4_0_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0, Q4_0_Dequantizer> {
    Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_0; }
};
struct Q5_0_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0, Q5_0_Dequantizer> {
    Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK5_0; }
};
struct Q4_1_Unpacker final : public Q_Unpacker<block_q4_1, ScaleHelperQ_1, Q4_1_Dequantizer> {
    Q4_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_1; }
};
struct Q5_1_Unpacker final : public Q_Unpacker<block_q5_1, ScaleHelperQ_1, Q5_1_Dequantizer> {
    Q5_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_1; }
};

template <int nrc_y>
void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%Q8_0_Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_0> q8(info);
    int nb = n/Q8_0_Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Q8_0_Unpacker, Sum4_Q8, AccumType0<nrc_y, true>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Q8_0_Unpacker, Sum4_Q8, AccumType0<nrc_y, false>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}


/*
moonll
add some structs for DequantizerIQ2XXS
SimpleBits
EvenSignHelper
*/
struct SimpleBits {
    __m256i values[4];
};

// fix for #829: 添加对 AVX512VPOPCNTDQ 的检测
#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__)
#define HAVE_AVX512_POPCNT 1
#else
#define HAVE_AVX512_POPCNT 0
#endif

struct EvenSignHelper {
    #if defined HAVE_FANCY_SIMD
    // #pragma message("Using AVX512VPOPCNTDQ in even sign helper")
        union sbits_t {
            __m128i vec;
            __mmask32 mask[4];
        };
        IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const {
            aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask);
            
            // fix for #829: 兼容Intel Cascade Lake架构的CPU，如果不支持AVX512VPOPCNTDQ扩展，则使用替代实现
            #if HAVE_AVX512_POPCNT
                auto pcnt = _mm256_popcnt_epi32(aux);
                
            #else
                // 提供替代实现，使用标准的位计数方法
                __m256i pcnt;
                int* pcnt_ptr = reinterpret_cast<int*>(&pcnt);
                int* aux_ptr = reinterpret_cast<int*>(&aux); // 直接获取 aux 的地址，避免不必要的复制
                
                #pragma unroll 8  // 提示编译器展开循环，提高 SIMD 计算吞吐量
                for (int i = 0; i < 8; i++) {
                    pcnt_ptr[i] = __builtin_popcount(aux_ptr[i]); // 使用编译器内置 popcount
                }
            #endif
            
            sbits_t sbits;
            sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
            values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]);
            values[1] = _mm256_mask_sub_epi8(values[1], sbits.mask[1], _mm256_setzero_si256(), values[1]);
            //auto sign_bits = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
            //const __mmask32 * m32 = (const __mmask32 *)&sign_bits;
            //values[0] = _mm256_mask_sub_epi8(values[0], m32[0], _mm256_setzero_si256(), values[0]);
            //values[1] = _mm256_mask_sub_epi8(values[1], m32[1], _mm256_setzero_si256(), values[1]);
        }
        const __m256i shifts = _mm256_set_epi32(21, 14, 7, 0, 21, 14, 7, 0);
        const __m256i mask   = _mm256_set1_epi32(127);
        const __m256i mone   = _mm256_set1_epi32(1);
    #else
        inline void sign_value(uint32_t aux32, __m256i& value) const {
            auto signs = _mm256_set_epi64x(keven_signs[(aux32 >> 21) & 127], keven_signs[(aux32 >> 14) & 127],
                                           keven_signs[(aux32 >>  7) & 127], keven_signs[(aux32 >>  0) & 127]);
            value = _mm256_sign_epi8(value, signs);
        }
    #endif
};

/*
moonll ad multiply_add for mul_mat_qX_K_q8_K_IQ_1
add func
get_scale_shuffle_8
get_scale_shuffle_16
set_scales_16
*/

inline __m256i get_scale_shuffle_8(int i) {
    return _mm256_set1_epi16((2*i) | ((2*i+1) << 8));
}

inline void set_scales_8(const __m256i& all_scales, int j, __m256i * scales) {
    scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+0));
    scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+1));
    scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+2));
    scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+3));
}


inline __m256i get_scale_shuffle_16(int i) {
    static const uint8_t k_shuffle[128] = {
         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
    };
    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
}

inline void set_scales_16(const __m256i& all_scales, __m256i * scales) {
    scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(0));
    scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(1));
    scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(2));
    scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(3));
}


template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
    if (j == 0) {
#ifdef HAVE_FANCY_SIMD
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            sumi[iy] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
        }
#else
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
            sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4));
        }
#endif
    } else {
#ifdef HAVE_FANCY_SIMD
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
            sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
        }
#else
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
        }
#endif
    }
}

/*
moonll ad multiply_add_1 for mul_mat_qX_K_q8_K_IQ_1
add func
set_scales_8_iq
set_scales_16_iq

add MUL_MAT
mul_mat_qX_K_q8_K_IQ_1
mul_mat_qX_K_q8_K_IQ_N
mul_mat_qX_K_q8_K_IQ
*/

template <typename Bits>
inline void multiply_add_1(int j, const Bits& bits, const __m256i * scales, const __m256i * q8, __m256i * sumi) {
    if (j == 0) {
#ifdef HAVE_FANCY_SIMD
        auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]);
        auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]);
        auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]);
        auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]);
        sumi[0] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_packs_epi32(p1, p2));
        sumi[1] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[1], _mm256_packs_epi32(p3, p4));
#else
        const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0]));
        const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1]));
        const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2]));
        const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3]));
        sumi[0] = _mm256_add_epi32(p1, p3);
        sumi[1] = _mm256_add_epi32(p2, p4);
#endif
    } else {
#ifdef HAVE_FANCY_SIMD
        auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]);
        auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]);
        auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]);
        auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]);
        sumi[0] = _mm256_dpwssd_epi32(sumi[0], scales[0], _mm256_packs_epi32(p1, p2));
        sumi[1] = _mm256_dpwssd_epi32(sumi[1], scales[1], _mm256_packs_epi32(p3, p4));
#else
        const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0]));
        const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1]));
        const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2]));
        const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3]));
        sumi[0] = _mm256_add_epi32(sumi[0], _mm256_add_epi32(p1, p3));
        sumi[1] = _mm256_add_epi32(sumi[1], _mm256_add_epi32(p2, p4));
#endif
    }
}


inline void set_scales_8_iq(int j, const __m256i& all_scales, __m256i * scales) {
    //#ifdef HAVE_FANCY_SIMD
        auto shuffle = j == 0 ? _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100)
                              : _mm256_set_epi64x(0x0b0a0b0a0b0a0b0a, 0x0908090809080908, 0x0b0a0b0a0b0a0b0a, 0x0908090809080908);
        scales[0] = _mm256_shuffle_epi8(all_scales, shuffle);
        scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(4)));
    //#else
    //    set_scales_8(all_scales, j, scales);
    //#endif
    }
    
inline void set_scales_16_iq(const __m256i& all_scales, __m256i * scales) {
    #ifdef HAVE_FANCY_SIMD
        auto shuffle = _mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100);
        scales[0] = _mm256_shuffle_epi8(all_scales, shuffle);
        scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(8)));
    #else
        set_scales_16(all_scales, scales);
    #endif
    }
    
template <typename Dequantizer>
static void mul_mat_qX_K_q8_K_IQ_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
        const int nb = n / QK_K;
        Q8<1> q8(info);
        Dequantizer deq(vx, bx);
        __m256i scales[2];
        __m256i q8_quants[4];
        for (int ix = 0; ix < nrc_x; ++ix) {
    
            __m256 accd = _mm256_setzero_ps();
            deq.new_row(ix);
    
            for (int i = 0; i < nb; ++i) {
    
                __m256i sumi[2], all_scales[Dequantizer::num_blocks/8];
                deq.new_block(i, all_scales);
    
                for (int j = 0; j < QK_K/128; ++j) {
                    deq.prepare(i, j, q8, q8_quants);
                    if constexpr (Dequantizer::num_blocks == 8) {
                        set_scales_8_iq(j, all_scales[0], scales);
                    } else {
                        set_scales_16_iq(all_scales[j], scales);
                    }
                    multiply_add_1(j, deq.bits, scales, q8_quants, sumi);
                }
                accd = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(0, i)), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi[0], sumi[1])), accd);
            }
    
            info.store(ix, 0, hsum_float_8(accd));
        }
    }


template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_IQ_N(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    const int nb = n / QK_K;
    Q8<nrc_y> q8(info);
    Dequantizer deq(vx, bx);
    __m256i scales[4];
    __m256  accd[nrc_y];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            __m256i sumi[nrc_y], all_scales[Dequantizer::num_blocks/8];
            //for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = _mm256_setzero_si256();
            __m256i mins;
            float dmin = deq.new_block(i, all_scales, mins);
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto bsums = q8.load_bsums(iy, i);
                auto prod  = _mm256_madd_epi16(mins, bsums);
                accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(dmin*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
            }

            for (int j = 0; j < QK_K/128; ++j) {
                deq.prepare(i, j);
                if constexpr (Dequantizer::num_blocks == 8) {
                    set_scales_8(all_scales[0], j, scales);
                } else {
                    set_scales_16(all_scales[j], scales);
                }
                //multiply_add_iq(deq.bits, scales, j, i, q8, sumi);
                multiply_add(deq.bits, scales, j, i, q8, sumi);
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i));
                accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }
        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }
    }
}

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
#ifdef HAVE_FANCY_SIMD
    if constexpr (nrc_y == 1) {
        mul_mat_qX_K_q8_K_IQ_1<Dequantizer>(n, vx, bx, info, nrc_x);
    } else {
        mul_mat_qX_K_q8_K_IQ_N<Dequantizer, nrc_y>(n, vx, bx, info, nrc_x);
    }
#else
    mul_mat_qX_K_q8_K_IQ_N<Dequantizer, nrc_y>(n, vx, bx, info, nrc_x);
#endif
}

/*
moonll iq1s
core func for iq1s mul_mat_iq1_s_q8_K

*/

template <int nrc_y>
static void mul_mat_iq1_s_q8_K(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    GGML_ASSERT(n%QK_K == 0);
    Q8<nrc_y, block_q8_K> q8(info);
    __m256i qx[8];
    __m256i scales[4];
    __m256  acc[nrc_y] = {};
    auto delta_mask = _mm_set1_epi16(-32768); // to avoid stupid overflow warnings when using 0x8000
    __m256i shuffle0 = _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100);
    for (int ix = 0; ix < nrc_x; ++ix) {
        auto iq1s = (const block_iq1_s *)((const char *)vx + ix*bx);
        for (int ibl = 0; ibl < n/QK_K; ++ibl) {
            float d = GGML_FP16_TO_FP32(iq1s[ibl].d);
            auto qhb = _mm_loadu_si128((const __m128i *)iq1s[ibl].qh);
            auto scales128 = _mm_and_si128(_mm_srli_epi16(qhb, 12), _mm_set1_epi16(7));
            scales128 = _mm_add_epi16(_mm_slli_epi16(scales128, 1), _mm_set1_epi16(1));
#ifdef HAVE_FANCY_SIMD
            auto mask = _mm_cmpeq_epi16_mask(_mm_and_si128(qhb, delta_mask), delta_mask);
            auto deltas128 = _mm_mask_blend_epi16(mask, _mm_set1_epi16(-7), _mm_set1_epi16(-9));
#else
            auto mask = _mm_cmpeq_epi16(_mm_and_si128(qhb, delta_mask), delta_mask);
            auto deltas128 = _mm_or_si128(_mm_and_si128(mask, _mm_set1_epi16(-9)), _mm_andnot_si128(mask, _mm_set1_epi16(-7)));
#endif
            deltas128 = _mm_mullo_epi16(scales128, deltas128);
            scales128 = _mm_slli_epi16(scales128, 3);
            auto deltas_l = _mm_unpacklo_epi16(deltas128, deltas128);
            auto deltas_h = _mm_unpackhi_epi16(deltas128, deltas128);
            auto deltas = MM256_SET_M128I(deltas_h, deltas_l); // blocks 0,0, 1,1, 2,2, ..., 7,7
            auto all_scales = MM256_SET_M128I(scales128, scales128);
            auto shuffle = shuffle0;
            for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
                scales[ib64] = _mm256_shuffle_epi8(all_scales, shuffle);
                shuffle = _mm256_add_epi8(shuffle, _mm256_set1_epi8(4));
            }
            const uint8_t  * qs = iq1s[ibl].qs;
            const uint16_t * qh = iq1s[ibl].qh;
            for (int ib = 0; ib < QK_K/32; ib += 2) {
                qx[ib+0] = _mm256_set_epi64x(iq1s_grid_us[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid_us[qs[2] | ((qh[ib+0] << 2) & 0x700)],
                                             iq1s_grid_us[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid_us[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
                qx[ib+1] = _mm256_set_epi64x(iq1s_grid_us[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid_us[qs[6] | ((qh[ib+1] << 2) & 0x700)],
                                             iq1s_grid_us[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid_us[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
                qs += 8;
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto bsums = q8.load_bsums(iy, ibl);
                auto sumi = _mm256_setzero_si256();
                for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
                    auto qy1 = q8.load_quants(iy, ibl, 2*ib64+0);
                    auto qy2 = q8.load_quants(iy, ibl, 2*ib64+1);
#ifdef HAVE_FANCY_SIMD
                    auto dot1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+0], qy1);
                    auto dot2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+1], qy2);
                    sumi = _mm256_dpwssd_epi32(sumi, scales[ib64], _mm256_packs_epi32(dot1, dot2));
#else
                    auto dot1 = _mm256_maddubs_epi16(qx[2*ib64+0], qy1);
                    auto dot2 = _mm256_maddubs_epi16(qx[2*ib64+1], qy2);
                    auto dot  = _mm256_add_epi16(_mm256_unpacklo_epi64(dot1, dot2), _mm256_unpackhi_epi64(dot1, dot2));
                    sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(scales[ib64], dot));
#endif
                }
#ifdef HAVE_FANCY_SIMD
                sumi = _mm256_dpwssd_epi32(sumi, bsums, deltas);
#else
                sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(bsums, deltas));
#endif
                acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d*q8.scale(iy, ibl)), _mm256_cvtepi32_ps(sumi), acc[iy]);
            }
        }
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, 0.125f*hsum_float_8(acc[iy]));
            acc[iy] = _mm256_setzero_ps();
        }
    }
}

/*
moonll iq1s
DequantizerIQ2XXS
DequantizerIQ2XXS is important Dequantizer for DequantizerIQ1_S
*/

struct DequantizerIQ2XXS final : public BaseDequantizer<block_iq2_xxs> {
    DequantizerIQ2XXS(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}

    constexpr static int num_blocks = 8;

    union Data {
        __m256i vec;
        uint32_t val[8];
    };

    inline __m128i load_scales(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        const uint16_t * a16 = (const uint16_t *)x[i].qs;
        auto scales = _mm_srli_epi16(_mm_set_epi16(a16[31], a16[27], a16[23], a16[19], a16[15], a16[11], a16[7], a16[3]), 12);
        return _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi16(1));
    }

    inline void new_block(int i, __m256i * scales) {
        auto sc16 = load_scales(i);
        scales[0] = MM256_SET_M128I(sc16, sc16);
    }
    inline float new_block(int i, __m256i * scales, __m256i& mins) {
        auto sc16 = load_scales(i);
        mins = scb.shuffle(sc16);
        scales[0] = MM256_SET_M128I(sc16, sc16);
        return -d*minv;
    }

    inline static void make4(const uint32_t * aux32, __m256i * values) {
        const uint8_t * aux8 = (const uint8_t *)aux32;
        values[0] = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[ 1]], iq2xxs_grid[aux8[ 0]]);
        values[1] = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[ 9]], iq2xxs_grid[aux8[ 8]]);
        values[2] = _mm256_set_epi64x(iq2xxs_grid[aux8[19]], iq2xxs_grid[aux8[18]], iq2xxs_grid[aux8[17]], iq2xxs_grid[aux8[16]]);
        values[3] = _mm256_set_epi64x(iq2xxs_grid[aux8[27]], iq2xxs_grid[aux8[26]], iq2xxs_grid[aux8[25]], iq2xxs_grid[aux8[24]]);
    }

    IQK_ALWAYS_INLINE void sign_values(const uint32_t * aux32, __m256i * values) const {
#ifdef HAVE_FANCY_SIMD
        esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[3]), _mm_set1_epi32(aux32[1])), values+0);
        esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[7]), _mm_set1_epi32(aux32[5])), values+2);
#else
        esh.sign_value(aux32[1], values[0]);
        esh.sign_value(aux32[3], values[1]);
        esh.sign_value(aux32[5], values[2]);
        esh.sign_value(aux32[7], values[3]);
#endif
    }
    inline void make4_signed(const uint32_t * aux32, const __m256i& min_value, __m256i * values) const {
        make4(aux32, values);
        sign_values(aux32, values);
        for (int k = 0; k < 4; ++k) values[k] = _mm256_add_epi8(values[k], min_value);
    }
    inline void make4(const uint32_t * aux32, __m256i * values, __m256i * q8) const {
        make4(aux32, values);
        sign_values(aux32, q8);
    }
    inline void prepare(int i, int j) {
        Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j);
        make4_signed(data.val, min_value, bits.values);
    }
    inline void prepare(int i, int j, const Q8<1>& q8, __m256i * q8_quants) {
        for (int k = 0; k < 4; ++k) q8_quants[k] = q8.load_quants(0, i, 4*j+k);
        Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j);
        make4(data.val, bits.values, q8_quants);
    }

    constexpr static int minv = 43;
    SimpleBits bits;
    Scales8KBase scb;
    EvenSignHelper esh;
    const __m256i min_value = _mm256_set1_epi8(minv);
    const __m256i shuffle = _mm256_set_epi32(7, 5, 3, 1, 7, 5, 3, 1);
};

/*
moonll
add Q8_0_Unpacker && DequantizerIQ2XXS support
add func mul_mat_qX_K_q8_K_IQ
*/

template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
    if constexpr (std::is_same_v<Dequantizer, Q4_0_Unpacker> || std::is_same_v<Dequantizer, Q5_0_Unpacker> ||
        std::is_same_v<Dequantizer, Q8_0_Unpacker>) {
            m.funcs[0] = mul_mat_qX_0_q8_0_T<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qX_0_q8_0_T<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_0_q8_0_T<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_0_q8_0_T<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_0_q8_0_T<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_0_q8_0_T<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_0_q8_0_T<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_0_q8_0_T<Dequantizer, 8>;
        }
        else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker>|| std::is_same_v<Dequantizer, Q8_0_1_Unpacker>) {
            m.funcs[0] = mul_mat_qX_1_q8_1_T<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qX_1_q8_1_T<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_1_q8_1_T<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_1_q8_1_T<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_1_q8_1_T<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_1_q8_1_T<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_1_q8_1_T<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_1_q8_1_T<Dequantizer, 8>;
        }
        else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2XXS>) {
            m.funcs[0] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 8>;
            }
            else {
#ifdef HAVE_FANCY_SIMD
            if constexpr (std::is_same_v<Dequantizer, DequantizerIQ4XS>) {
            m.funcs[0] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 1>;
            m.funcs[1] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 2>;
            m.funcs[2] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 3>;
            m.funcs[3] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 4>;
            m.funcs[4] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 5>;
            m.funcs[5] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 6>;
            m.funcs[6] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 7>;
            m.funcs[7] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 8>;
            } else {
            m.funcs[0] = mul_mat_qX_K_q8_K_AVX512_1<Dequantizer>;
            m.funcs[1] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 8>;
            }
#else
            if constexpr (std::is_same_v<Dequantizer, DequantizerQ2K> ||
                          std::is_same_v<Dequantizer, DequantizerQ3K> ||
                          std::is_same_v<Dequantizer, DequantizerQ6K>) {
                m.funcs[0] = mul_mat_qY_K_q8_K_T<Dequantizer, 1>;
                m.funcs[1] = mul_mat_qY_K_q8_K_T<Dequantizer, 2>;
                m.funcs[2] = mul_mat_qY_K_q8_K_T<Dequantizer, 3>;
                m.funcs[3] = mul_mat_qY_K_q8_K_T<Dequantizer, 4>;
                m.funcs[4] = mul_mat_qY_K_q8_K_T<Dequantizer, 5>;
                m.funcs[5] = mul_mat_qY_K_q8_K_T<Dequantizer, 6>;
                m.funcs[6] = mul_mat_qY_K_q8_K_T<Dequantizer, 7>;
                m.funcs[7] = mul_mat_qY_K_q8_K_T<Dequantizer, 8>;
            } else {
                m.funcs[0] = mul_mat_qX_K_q8_K_T<Dequantizer, 1>;
                m.funcs[1] = mul_mat_qX_K_q8_K_T<Dequantizer, 2>;
                m.funcs[2] = mul_mat_qX_K_q8_K_T<Dequantizer, 3>;
                m.funcs[3] = mul_mat_qX_K_q8_K_T<Dequantizer, 4>;
                m.funcs[4] = mul_mat_qX_K_q8_K_T<Dequantizer, 5>;
                m.funcs[5] = mul_mat_qX_K_q8_K_T<Dequantizer, 6>;
                m.funcs[6] = mul_mat_qX_K_q8_K_T<Dequantizer, 7>;
                m.funcs[7] = mul_mat_qX_K_q8_K_T<Dequantizer, 8>;
            }
#endif
        }
}

struct QFBase {
    #ifdef __AVX512F__
        constexpr static int k_step = 16;
        using Data = __m512;
        using Acc  = __m512;
        static inline Data load(const ggml_half * x) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)x)); }
        static inline Data load(const float * x) { return _mm512_loadu_ps(x); }
        static inline Data load(const ggml_bf16_t * x) {
            return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)x)), 16));
        }
        static inline Acc acc(Acc prev, const Data& y, const Data& x) {
            return _mm512_fmadd_ps(y, x, prev);
        }
        static inline Acc acc_first(const Data& y, const Data& x) {
            return _mm512_mul_ps(y, x);
        }
        static inline Acc add(Acc x, Acc y) { return _mm512_add_ps(x, y); }
        static inline float hsum(Acc acc) {
            return _mm512_reduce_add_ps(acc);
        }
        template <typename Float>
        static inline Data load4Floats(const Float * x) {
            return _mm512_insertf32x4(_mm512_setzero_ps(), load128(x), 0);
        }
        static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) {
            acc = _mm512_fmadd_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00), acc);
            acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc);
            acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc);
            acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc);
            return acc;
        }
        static inline Acc acc_r4_first(const Data * xv, const Data& yv) {
            auto acc = _mm512_mul_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00));
            acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc);
            acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc);
            acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc);
            return acc;
        }
        static inline __m128 hsum_r4(Acc acc) {
            auto sum1 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 0), _mm512_extractf32x4_ps(acc, 1));
            auto sum2 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 2), _mm512_extractf32x4_ps(acc, 3));
            return _mm_add_ps(sum1, sum2);
        }
    #else
        constexpr static int k_step = 8;
        using Data = __m256;
        using Acc  = __m256;
        static inline Data load(const ggml_half * x) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)x)); }
        static inline Data load(const float * x) { return _mm256_loadu_ps(x); }
        static inline Data load(const ggml_bf16_t * x) {
            return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)x)), 16));
        }
        static inline Acc acc(Acc prev, const Data& y, const Data& x) {
            return _mm256_fmadd_ps(y, x, prev);
        }
        static inline Acc add(Acc x, Acc y) { return _mm256_add_ps(x, y); }
        static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) {
            acc = _mm256_fmadd_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00), acc);
            acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc);
            acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc);
            acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc);
            return acc;
        }
        static inline Acc acc_r4_first(const Data * xv, const Data& yv) {
            auto acc = _mm256_mul_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00));
            acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc);
            acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc);
            acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc);
            return acc;
        }
        static inline Acc acc_first(const Data& y, const Data& x) {
            return _mm256_mul_ps(y, x);
        }
        static inline float hsum(Acc acc) {
            return hsum_float_8(acc);
        }
        static inline __m128 hsum_r4(Acc acc) {
            return _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
        }
        template <typename Float>
        static inline Data load4Floats(const Float * x) {
            return _mm256_insertf128_ps(_mm256_setzero_ps(), load128(x), 0);
        }
    #endif
        static inline __m128 load128(const ggml_half * x) { return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x)); }
        static inline __m128 load128(const float * x) { return _mm_loadu_ps(x); }
        static inline __m128 load128(const ggml_bf16_t * x) {
            return _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*)x)), 16));
        }
    };
    template <typename Float, int nrc_in> struct QFT final : public QFBase {
        constexpr static int nrc = nrc_in;
        QFT(const DataInfo& info) {
            for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)info.src1_row(iy);
        }
        QFT(const char * cx, size_t bx) {
            for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)(cx + iy*bx);
        }
        IQK_ALWAYS_INLINE Data load1(int iy, int i) const { return load(y[iy] + k_step*i); }
        IQK_ALWAYS_INLINE Data load_tail(int iy, int i) const { return load4Floats(y[iy] + 4*i); }
        IQK_ALWAYS_INLINE void load_r4(int ix, int i, Data * xv) const {
            xv[0] = load1(ix+0, i);
            xv[1] = load1(ix+1, i);
            xv[2] = load1(ix+2, i);
            xv[3] = load1(ix+3, i);
    #ifdef __AVX512F__
            auto t0 = _mm512_unpacklo_ps(xv[0], xv[1]);
            auto t1 = _mm512_unpacklo_ps(xv[2], xv[3]);
            auto t2 = _mm512_unpackhi_ps(xv[0], xv[1]);
            auto t3 = _mm512_unpackhi_ps(xv[2], xv[3]);
            xv[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1)));
            xv[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1)));
            xv[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3)));
            xv[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3)));
    #else
            auto t0 = _mm256_unpacklo_ps(xv[0], xv[1]);
            auto t1 = _mm256_unpacklo_ps(xv[2], xv[3]);
            auto t2 = _mm256_unpackhi_ps(xv[0], xv[1]);
            auto t3 = _mm256_unpackhi_ps(xv[2], xv[3]);
            xv[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1)));
            xv[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1)));
            xv[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3)));
            xv[3] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3)));
    #endif
        }
        const Float * y[nrc];
    };
    

template <typename Qy, typename Qx>
IQK_NOINLINE void mul_mat_Qx_Qy_MxN(int n, const char * cx, size_t bx, int ix0, const DataInfo& info) {
    int nb = n/QFBase::k_step;
    int nb4 = n/4;
    Qy y(info);
    Qx x(cx + ix0*bx, bx);
    QFBase::Data xv[Qx::nrc];
    QFBase::Acc  acc[Qx::nrc*Qy::nrc];
    auto yv = y.load1(0, 0);
    for (int ix = 0; ix < Qx::nrc; ++ix) {
        xv[ix] = x.load1(ix, 0);
        acc[ix] = QFBase::acc_first(yv, xv[ix]);
    }
    for (int iy = 1; iy < Qy::nrc; ++iy) {
        yv = y.load1(iy, 0);
        for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc_first(yv, xv[ix]);
    }
    for (int i = 1; i < nb; ++i) {
        yv = y.load1(0, i);
        for (int ix = 0; ix < Qx::nrc; ++ix) {
            xv[ix] = x.load1(ix, i);
            acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]);
        }
        for (int iy = 1; iy < Qy::nrc; ++iy) {
            yv = y.load1(iy, i);
            for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
        }
    }
    for (int i = (QFBase::k_step/4)*nb; i < nb4; ++i) {
        yv = y.load_tail(0, i);
        for (int ix = 0; ix < Qx::nrc; ++ix) {
            xv[ix] = x.load_tail(ix, i);
            acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]);
        }
        for (int iy = 1; iy < Qy::nrc; ++iy) {
            yv = y.load_tail(iy, i);
            for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
        }
    }
    for (int iy = 0; iy < Qy::nrc; ++iy) for (int ix = 0; ix < Qx::nrc; ++ix) info.store(ix0+ix, iy, QFBase::hsum(acc[Qx::nrc*iy+ix]));
}
// This will handle any of f16 x f32, f32 x f16, f16 x f16, f32 x f32, with computations done
// in f32 (i.e., f16 is first converted to f32). It is easy to extend to computations done in
// f16, but I don't have a CPU capable of f16 vector arithmetic, so not doing it for now.
template <int nrc_y, typename FloatX, typename FloatY>
void mul_mat_fX_fY_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    const char * cx = (const char *)vx;
    // TBD if we want this
    //if constexpr (nrc_y == 1) {
    //    constexpr int k_nx = 2;
    //    for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
    //        mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, k_nx>>(n, cx, bx, ix*k_nx, info);
    //    }
    //    if (int lastx = k_nx*(nrc_x/k_nx); lastx < nrc_x) {
    //        int nx = nrc_x - lastx;
    //        switch (nx) {
    //            case 1: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info); break;
    //            case 2: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, lastx, info); break;
    //            case 3: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, lastx, info); break;
    //        }
    //        //mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info);
    //    }
    //    return;
    //}
#ifdef __AVX512F__
    constexpr int k_nx = 5;
#else
    constexpr int k_nx = nrc_y == 1 ? 4 : 2;
#endif
    for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
        mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, k_nx>>(n, cx, bx, ix*k_nx, info);
    }
    int last_x = k_nx*(nrc_x/k_nx);
    if (last_x == nrc_x) return;
    int nx = nrc_x - last_x;
#ifdef __AVX512F__
    switch (nx) {
        case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
        case 2: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, last_x, info); break;
        case 3: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, last_x, info); break;
        case 4: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 4>>(n, cx, bx, last_x, info); break;
    }
#else
    if constexpr (nrc_y == 1) {
        switch (nx) {
            case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
            case 2: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, last_x, info); break;
            case 3: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, last_x, info); break;
        }
    } else {
        switch (nx) {
            case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
        }
    }
#endif
}

template <typename FloatX, typename FloatY>
void set_mul_mat_f(MulMat& mm) {
    for (auto& f : mm.funcs) f = nullptr;
    mm.funcs[0] = mul_mat_fX_fY_T<1, FloatX, FloatY>;
    mm.funcs[1] = mul_mat_fX_fY_T<2, FloatX, FloatY>;
    mm.funcs[2] = mul_mat_fX_fY_T<3, FloatX, FloatY>;
    mm.funcs[3] = mul_mat_fX_fY_T<4, FloatX, FloatY>;
    mm.funcs[4] = mul_mat_fX_fY_T<5, FloatX, FloatY>;
#ifndef __AVX512F__
    mm.funcs[5] = mul_mat_fX_fY_T<6, FloatX, FloatY>;
#endif
}


/*
moonll
add typeb TO compare return not expected type of weight matrix
add IQ2XSS
add IQ1_S
add GGML_TYPE_IQ4_XS
*/

bool MulMat::set_mul_mat(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
    (void)Ny;

        auto expected_typeB = GGML_TYPE_Q8_K;
    switch (typeA) {
        case GGML_TYPE_Q2_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ2K>(mm);
            break;
        case GGML_TYPE_Q3_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ3K>(mm);
            break;
        case GGML_TYPE_Q4_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ4K>(mm);
            break;
        case GGML_TYPE_Q5_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ5K>(mm);
            break;
        case GGML_TYPE_Q6_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ6K>(mm);
            break;
        case GGML_TYPE_IQ4_XS:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerIQ4XS>(mm);
            break;
        case GGML_TYPE_IQ2_XXS:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerIQ2XXS>(mm);
            break;
        case GGML_TYPE_Q4_0:
            assert (ne00 % QK4_0 == 0);
            MulMat::set_functions<Q4_0_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_0;
            break;
        case GGML_TYPE_Q4_1:
            assert (ne00 % QK4_1 == 0);
            MulMat::set_functions<Q4_1_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_1_X4;
            break;
        case GGML_TYPE_Q5_0:
            assert (ne00 % QK5_0 == 0);
            MulMat::set_functions<Q5_0_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_0;
            break;
        case GGML_TYPE_Q5_1:
            assert (ne00 % QK5_1 == 0);
            MulMat::set_functions<Q5_1_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_1_X4;
            break;
        case GGML_TYPE_Q8_0:
            assert (ne00 % QK8_0 == 0);
#ifdef HAVE_FANCY_SIMD
            MulMat::set_functions<Q8_0_1_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_1_X4;
#else
            MulMat::set_functions<Q8_0_Unpacker>(mm);
            expected_typeB = GGML_TYPE_Q8_0_X4;
#endif
            break;
        case GGML_TYPE_IQ1_S:
            mm.funcs[0] = mul_mat_iq1_s_q8_K<1>;
            mm.funcs[1] = mul_mat_iq1_s_q8_K<2>;
            mm.funcs[2] = mul_mat_iq1_s_q8_K<3>;
            mm.funcs[3] = mul_mat_iq1_s_q8_K<4>;
            mm.funcs[4] = mul_mat_iq1_s_q8_K<5>;
            mm.funcs[5] = mul_mat_iq1_s_q8_K<6>;
            mm.funcs[6] = mul_mat_iq1_s_q8_K<7>;
            mm.funcs[7] = mul_mat_iq1_s_q8_K<8>;
        #ifdef HAVE_FANCY_SIMD
             mm.func16 = mul_mat_iq1_s_q8_K<16>;
        #endif
       // row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
              expected_typeB = GGML_TYPE_Q8_K;
            break;

        default:
        {
            return false;
        }
            
    }


    return ggml_type(typeB) == expected_typeB;

}

} // namespace

/*
iq1_s is not support for arm
*/
#else   // __aarch64__

//[kawrakow] Need these two for performance on Arm
typedef struct {
    ggml_half d[8];
    int8_t qs[4*QK8_1];
} block_q8_1_x4;
static_assert(sizeof(block_q8_1_x4) == 4*sizeof(block_q8_1), "wrong q8_1_x4 block size/padding");
typedef struct {
    ggml_half d[4];
    int8_t qs[4*QK8_0];
} block_q8_0_x4;
static_assert(sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0), "wrong q8_0_x4 block size/padding");

namespace {

template <int nrc, typename block_q8 = block_q8_K> struct Q8 {

    constexpr static int nrc_y = nrc;

    Q8(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
    }

    inline int8x16_t load_quants_16(int iy, int i, int j) const { return vld1q_s8(y[iy][i].qs + 16*j); }
    inline int8x16x2_t load_quants(int iy, int i, int j) const { return vld1q_s8_x2(y[iy][i].qs + 32*j); }
    inline int8x16x4_t load_quants_64(int iy, int i, int j) const { return vld1q_s8_x4(y[iy][i].qs + 64*j); }
    inline int16x8x2_t load_bsums(int iy, int i) const { return vld1q_s16_x2(y[iy][i].bsums); }
    inline int16x8_t load_bsums8(int iy, int i) const {
        auto q8s = vld1q_s16_x2(y[iy][i].bsums);
        return vpaddq_s16(q8s.val[0], q8s.val[1]);
    }
    inline float scale(int iy, int i) const { return y[iy][i].d; }

    const block_q8 * y[nrc_y];
};

template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);

    Dequantizer deq(vx, bx, nrc_y);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[nrc_y];
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

//#pragma GCC unroll 4
        for (int i = 0; i < nb; ++i) {

            int32x4_t sumi[nrc_y];
            for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0);

            if constexpr (nrc_y > 1 && Dequantizer::should_scale_quants()) {
                deq.process_scales(i, q8, acc);
                deq.prepare(i, 0);
                deq.compute(q8, i, 0, sumi);
                deq.prepare(i, 1);
                deq.compute(q8, i, 1, sumi);
            } else {
                if constexpr (Dequantizer::num_blocks() == 8) {
                    auto scales = deq.new_block(i, q8, acc);
                    deq.prepare(i, 0);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                    deq.prepare(i, 1);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
                }
                else if constexpr (Dequantizer::num_blocks() == 16) {
                    auto scales = deq.new_block(i, q8, acc);
                    deq.prepare(i, 0);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                    deq.prepare(i, 1);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
                }
                else {
                    GGML_ASSERT(false);
                }
            }

#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i)));
            }
        }

#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}
template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);

    Dequantizer deq(vx, bx, nrc_y);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[nrc_y];
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        for (int i = 0; i < nb; ++i) {

            int32x4_t sumi[nrc_y];
            for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0);

            if constexpr (Dequantizer::num_blocks() == 8) {
                auto scales = deq.new_block(i);
                deq.prepare(i, 0);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                deq.prepare(i, 1);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
            }
            else if constexpr (Dequantizer::num_blocks() == 16) {
                auto scales = deq.new_block(i);
                deq.prepare(i, 0);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                deq.prepare(i, 1);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
            }
            else {
                GGML_ASSERT(false);
            }
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i)));
            }
        }
#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

template <typename Q8>
IQK_ALWAYS_INLINE void compute_8_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8,
        const int32x4x2_t& scales, int iy, int i, int j, int32x4_t& sumi) {
    auto mzero = vdupq_n_s32(0);
    const int8x16_t * qs_1 = (const int8x16_t *)qx_1.val;
    const int8x16_t * qs_2 = (const int8x16_t *)qx_2.val;

    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[0], q8b_1.val[0]), qs_1[1], q8b_1.val[1]); // block 1
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[2], q8b_2.val[0]), qs_1[3], q8b_2.val[1]); // block 2
    auto p12 = vpaddq_s32(p1, p2);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[0], q8b_3.val[0]), qs_2[1], q8b_3.val[1]); // block 3
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[2], q8b_4.val[0]), qs_2[3], q8b_4.val[1]); // block 4
    auto p34 = vpaddq_s32(p3, p4);

    auto pall = vpaddq_s32(p12, p34);
    sumi = vmlaq_s32(sumi, scales.val[j], pall);
}
template <typename Q8>
IQK_ALWAYS_INLINE void compute_8_blocks(const int8x16_t * qx, const Q8& q8,
        const int32x4_t& scales, int iy, int i, int j, int32x4_t& sumi) {
    auto mzero = vdupq_n_s32(0);

    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[0], q8b_1.val[0]), qx[1], q8b_1.val[1]); // block 1
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[2], q8b_2.val[0]), qx[3], q8b_2.val[1]); // block 2
    auto p12 = vpaddq_s32(p1, p2);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[4], q8b_3.val[0]), qx[5], q8b_3.val[1]); // block 3
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[6], q8b_4.val[0]), qx[7], q8b_4.val[1]); // block 4
    auto p34 = vpaddq_s32(p3, p4);

    auto pall = vpaddq_s32(p12, p34);
    sumi = vmlaq_s32(sumi, scales, pall);
}

template <typename Q8>
IQK_ALWAYS_INLINE void compute_16_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8,
        const int32x4x4_t& scales, int iy, int i, int j, int32x4_t& sumi) {

    auto mzero = vdupq_n_s32(0);
    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[0]), q8b_1.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[1]), q8b_1.val[1])); // blocks 0, 0, 1, 1,
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[2]), q8b_2.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[3]), q8b_2.val[1])); // blocks 3, 3, 4, 4,
    auto p12 = vpaddq_s32(p1, p2); // blocks 0, 1, 2, 3
    sumi = vmlaq_s32(sumi, scales.val[2*j+0], p12);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[0]), q8b_3.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[1]), q8b_3.val[1])); // block 4, 4, 5, 5,
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[2]), q8b_4.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[3]), q8b_4.val[1])); // block 6, 6, 7, 7,
    auto p34 = vpaddq_s32(p3, p4); // blocks 4, 5, 6, 7
    sumi = vmlaq_s32(sumi, scales.val[2*j+1], p34);
}

template <typename Q8>
inline void accum_mins_8(const int16x8_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto q8s = q8.load_bsums8(iy, i);
        int32x4_t b1 = vmull_s16(vget_low_s16(mins), vget_low_s16(q8s));
        int32x4_t b2 = vmull_s16(vget_high_s16(mins), vget_high_s16(q8s));
        float32x4_t prod = vcvtq_f32_s32(vaddq_s32(b1, b2));
        acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i)));
    }
}
template <typename Q8>
inline void accum_mins_16(const int16x8x2_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto q8s = q8.load_bsums(iy, i);
        int32x4_t b1 = vmull_s16(vget_low_s16 (mins.val[0]), vget_low_s16 (q8s.val[0]));
        int32x4_t b2 = vmull_s16(vget_high_s16(mins.val[0]), vget_high_s16(q8s.val[0]));
        int32x4_t b3 = vmull_s16(vget_low_s16 (mins.val[1]), vget_low_s16 (q8s.val[1]));
        int32x4_t b4 = vmull_s16(vget_high_s16(mins.val[1]), vget_high_s16(q8s.val[1]));
        float32x4_t prod = vcvtq_f32_s32(vaddq_s32(vaddq_s32(b1, b2), vaddq_s32(b3, b4)));
        acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i)));
    }
}

struct Scales8 {
    uint32_t utmp[4];
    const uint8_t * sc8 = (const uint8_t *)utmp;
    template <typename Q8, typename Qx>
    inline int32x4x2_t process_scales_mins(const Qx& x, const Q8& q8, int i, float32x4_t * acc) {
        make_q4_scales(x.scales, utmp);
        int16x8_t mins = vmovl_s8(vld1_s8((const int8_t *)sc8 + 8));
        accum_mins_8(mins, q8, acc, i, -GGML_FP16_TO_FP32(x.dmin));

        uint8x8_t scales8 = vld1_u8(sc8);
        uint16x8_t scales16 = vmovl_u8(scales8);
        int32x4x2_t scales = {vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales16))),
                              vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales16)))};
        return scales;
    }
};

struct Q4bits {
    const uint8x16_t m4b = vdupq_n_u8(0xf);
    uint8x16x4_t b1, b2;
    inline void prepare4(uint8x16x4_t& b, const uint8x16_t * val) const {
        b.val[0] = vandq_u8(val[0], m4b);
        b.val[2] = vshrq_n_u8(val[0], 4);
        b.val[1] = vandq_u8(val[1], m4b);
        b.val[3] = vshrq_n_u8(val[1], 4);
    }
    inline void prepare4_16(uint8x16x4_t& b, const uint8x16_t * val) const {
        b.val[0] = vandq_u8(val[0], m4b);
        b.val[1] = vshrq_n_u8(val[0], 4);
        b.val[2] = vandq_u8(val[1], m4b);
        b.val[3] = vshrq_n_u8(val[1], 4);
    }
    inline void prepare(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x2(qs);
        prepare4(b1, q4bits.val);
        q4bits = vld1q_u8_x2(qs+32);
        prepare4(b2, q4bits.val);
    }
    inline void prepare_v2(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        prepare4(b1, q4bits.val+0);
        prepare4(b2, q4bits.val+2);
    }
    inline void prepare64(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        b1.val[0] = vandq_u8(q4bits.val[0], m4b);
        b1.val[1] = vandq_u8(q4bits.val[1], m4b);
        b1.val[2] = vandq_u8(q4bits.val[2], m4b);
        b1.val[3] = vandq_u8(q4bits.val[3], m4b);
        b2.val[0] = vshrq_n_u8(q4bits.val[0], 4);
        b2.val[1] = vshrq_n_u8(q4bits.val[1], 4);
        b2.val[2] = vshrq_n_u8(q4bits.val[2], 4);
        b2.val[3] = vshrq_n_u8(q4bits.val[3], 4);
    }
    inline void prepare16(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x2(qs);
        prepare4_16(b1, q4bits.val);
        q4bits = vld1q_u8_x2(qs+32);
        prepare4_16(b2, q4bits.val);
    }
    inline void prepare16_v2(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        prepare4_16(b1, q4bits.val+0);
        prepare4_16(b2, q4bits.val+2);
    }
};

struct Q2bits {
    const uint8x16_t m4b = vdupq_n_u8(0x03);
    uint8x16x4_t b1, b2;
    inline void prepare(const uint8_t * qs) {
        auto q2bits = vld1q_u8_x2(qs);
        b1.val[0] = vandq_u8(q2bits.val[0], m4b);
        b1.val[1] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b1.val[2] = vandq_u8(q2bits.val[0], m4b);
        b1.val[3] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b2.val[0] = vandq_u8(q2bits.val[0], m4b);
        b2.val[1] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b2.val[2] = vandq_u8(q2bits.val[0], m4b);
        b2.val[3] = vandq_u8(q2bits.val[1], m4b);
    }
};

template <typename block_q>
struct BaseDequantizer {
    BaseDequantizer(const void * vx, size_t bx, int nrc) : vx(vx), x(nullptr), bx(bx), nrc(nrc) {}
    inline void new_row(int ix) { x = (const block_q *)((const char *)vx + ix*bx); }
    const void * vx;
    const block_q * x;
    const size_t bx;
    const int nrc;
};

struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return s8.process_scales_mins(x[i], q8, i, acc);
    }
    inline void prepare(int i, int j) {
        if (nrc == 1) bits.prepare_v2(x[i].qs+64*j);
        else bits.prepare(x[i].qs+64*j);
    }

    Q4bits bits;
    Scales8 s8;

    float d;
};

struct HighBit5 {
    const uint8x16_t mhb = vdupq_n_u8(0x10);
    uint8x16x2_t bits;
    inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) {
        b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 4), mhb));
        b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 4), mhb));
        b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 3), mhb));
        b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 3), mhb));

        b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb));
        b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb));
        b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb));
        b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb));

        if (do_shift) {
            bits.val[0] = vshrq_n_u8(bits.val[0], 4);
            bits.val[1] = vshrq_n_u8(bits.val[1], 4);
        }
    }
};

struct HighBit3 {
    const uint8x16_t mhb = vdupq_n_u8(0x04);
    uint8x16x2_t bits;
    inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) {
        b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb));
        b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb));
        b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb));
        b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb));

        b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(bits.val[0], mhb));
        b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(bits.val[1], mhb));
        b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshrq_n_u8(bits.val[0], 1), mhb));
        b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshrq_n_u8(bits.val[1], 1), mhb));

        if (do_shift) {
            bits.val[0] = vshrq_n_u8(bits.val[0], 4);
            bits.val[1] = vshrq_n_u8(bits.val[1], 4);
        }
    }
};

struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        h.bits = vld1q_u8_x2(x[i].qh);
        return s8.process_scales_mins(x[i], q8, i, acc);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+64*j);
        h.apply(bits.b1, bits.b2, j == 0);
    }

    Q4bits bits;
    HighBit5 h;
    Scales8 s8;

    uint8x16x2_t hbits;

    float d;
};

inline int32x4x4_t make_wider(const int16x8x2_t& scales16) {
    int32x4x4_t scales = {
        vmovl_s16(vget_low_s16 (scales16.val[0])),
        vmovl_s16(vget_high_s16(scales16.val[0])),
        vmovl_s16(vget_low_s16 (scales16.val[1])),
        vmovl_s16(vget_high_s16(scales16.val[1])),
    };
    return scales;
}

template <typename Q8>
inline int32x4x4_t process_scales_mins_16(const int8x16_t& scales8, const Q8& q8, float32x4_t * acc, int i, float c) {
    int16x8x2_t scales16;
    scales16.val[0] = vmovl_s8(vget_low_s8(scales8));
    scales16.val[1] = vmovl_s8(vget_high_s8(scales8));
    accum_mins_16(scales16, q8, acc, i, c);
    return make_wider(scales16);
}

struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return process_scales_mins_16(vld1q_s8(x[i].scales), q8, acc, i, -32.f*d);
    }
    inline void prepare(int i, int j) {

        auto hbits = vld1q_u8_x2(x[i].qh + 32*j);

        bits.prepare64(x[i].ql+64*j);
        bits.b1.val[0] = vorrq_u8(bits.b1.val[0], vandq_u8(vshlq_n_u8(hbits.val[0], 4), mhb));
        bits.b1.val[1] = vorrq_u8(bits.b1.val[1], vandq_u8(vshlq_n_u8(hbits.val[1], 4), mhb));
        bits.b1.val[2] = vorrq_u8(bits.b1.val[2], vandq_u8(vshlq_n_u8(hbits.val[0], 2), mhb));
        bits.b1.val[3] = vorrq_u8(bits.b1.val[3], vandq_u8(vshlq_n_u8(hbits.val[1], 2), mhb));

        bits.b2.val[0] = vorrq_u8(bits.b2.val[0], vandq_u8(hbits.val[0], mhb));
        bits.b2.val[1] = vorrq_u8(bits.b2.val[1], vandq_u8(hbits.val[1], mhb));
        bits.b2.val[2] = vorrq_u8(bits.b2.val[2], vandq_u8(vshrq_n_u8(hbits.val[0], 2), mhb));
        bits.b2.val[3] = vorrq_u8(bits.b2.val[3], vandq_u8(vshrq_n_u8(hbits.val[1], 2), mhb));

    }

    Q4bits bits;

    const uint8x16_t mhb = vdupq_n_u8(0x30);

    float d;
};

struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        h.bits = vld1q_u8_x2(x[i].hmask);
        const uint16_t * sc16 = (const uint16_t *)x[i].scales;
        uint32_t aux0 = sc16[0] | (sc16[1] << 16);
        uint32_t aux1 = sc16[2] | (sc16[3] << 16);
        uint32_t aux2 = sc16[4] | (sc16[5] << 16);
        aux32[0] =  (aux0       & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030);
        aux32[1] =  (aux1       & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030);
        aux32[2] = ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030);
        aux32[3] = ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030);
        return process_scales_mins_16(vaddq_s8(vld1q_s8((const int8_t *)aux32), vdupq_n_s8(-32)), q8, acc, i, -4.f*d);
    }

    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+32*j);
        h.apply(bits.b1, bits.b2, j == 0);
    }

    uint32_t aux32[4];

    Q2bits bits;

    HighBit3 h;

    float d;
};

struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return true; }

    template <typename Q8>
    inline void process_scales(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        auto scales_and_mins = vld1q_u8(x[i].scales);
        auto mins8 = vreinterpretq_s8_u8(vshrq_n_u8(scales_and_mins, 4));
        int16x8x2_t scales16;
        scales16.val[0] = vmovl_s8(vget_low_s8(mins8));
        scales16.val[1] = vmovl_s8(vget_high_s8(mins8));
        accum_mins_16(scales16, q8, acc, i, -GGML_FP16_TO_FP32(x[i].dmin));

        scales8 = vandq_u8(scales_and_mins, vdupq_n_u8(0xf));
    }

    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        process_scales(i, q8, acc);
        int16x8x2_t scales16;
        scales16.val[0] = vmovl_s8(vget_low_s8(vreinterpretq_s8_u8(scales8)));
        scales16.val[1] = vmovl_s8(vget_high_s8(vreinterpretq_s8_u8(scales8)));
        return make_wider(scales16);
    }

    template <typename Q8>
    inline void compute(const Q8& q8, int i, int j, int32x4_t * sumi) {
        auto m1 = vdupq_n_u8(1);
        auto shuffle = vdupq_n_u8(8*j);
        bits.b1.val[0] = vmulq_u8(bits.b1.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[1] = vmulq_u8(bits.b1.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[2] = vmulq_u8(bits.b1.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[3] = vmulq_u8(bits.b1.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[0] = vmulq_u8(bits.b2.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[1] = vmulq_u8(bits.b2.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[2] = vmulq_u8(bits.b2.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[3] = vmulq_u8(bits.b2.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[0]), q8b_1.val[0]),
                    vreinterpretq_s8_u8(bits.b1.val[1]), q8b_1.val[1]);

            auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[2]), q8b_2.val[0]),
                    vreinterpretq_s8_u8(bits.b1.val[3]), q8b_2.val[1]);

            auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[0]), q8b_3.val[0]),
                    vreinterpretq_s8_u8(bits.b2.val[1]), q8b_3.val[1]);

            auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[2]), q8b_4.val[0]),
                    vreinterpretq_s8_u8(bits.b2.val[3]), q8b_4.val[1]);
        }
    }

    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+32*j);
    }

    uint32_t aux32[4];

    uint8x16_t scales8;

    Q2bits bits;

    float d;
};

// ============================= i-quants

struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {

    static int8x16_t load_values() {
        static const int8_t iq4nl_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
        return vld1q_s8(iq4nl_values);
    }

    DequantizerIQ4XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc), values(load_values()) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    inline void new_row(int ix) { x = (const block_iq4_xs *)((const char *)vx + bx*ix); }

    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        (void)q8;
        (void)acc;
        d = GGML_FP16_TO_FP32(x[i].d);
        const uint16_t scales_h = x[i].scales_h;
        const uint16_t * scales_l = (const uint16_t *)x[i].scales_l;
        aux32[0] = scales_l[0] | (scales_l[1] << 16);
        aux32[1] = aux32[0] >> 4;
        // scl is ordered as 0, 2, 4, 6, 1, 3, 5, 7
        uint8x8_t scl8 = vand_u8(vld1_u8((const uint8_t *)aux32), vdup_n_u8(0xf));
        uint16_t * aux16 = (uint16_t *)aux32;
        aux16[0] = scales_h << 4; aux16[1] = scales_h << 2; aux16[2] = scales_h; aux16[3] = scales_h >> 2;
        // sch is ordered as 0, 4, 1, 5, 2, 6, 3, 7
        uint8x8_t sch8 = vand_u8(vld1_u8((const uint8_t *)aux16), vdup_n_u8(0x30));
        int8x8_t scales8 = vadd_s8(vreinterpret_s8_u8(vorr_u8(scl8, vtbl1_u8(sch8, vreinterpret_u8_u32(hshuff)))), vdup_n_s8(-32));
        // shuffle 0, 2, 4, 6, 1, 3, 5, 7 -> 0, 1, 2, 3, 4, 5, 6, 7
        scales8 = vtbl1_s8(scales8, vreinterpret_s8_u32(hshuff));
        int16x8_t scales16 = vmovl_s8(scales8);
        int32x4x2_t scales = {vmovl_s16(vget_low_s16(scales16)), vmovl_s16(vget_high_s16(scales16))};
        return scales;
    }
    inline void prepare(int i, int j) {
        bits.prepare16(x[i].qs+64*j);
        for (int k = 0; k < 4; ++k) {
            bits.b1.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b1.val[k]));
            bits.b2.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b2.val[k]));
        }
    }

    Q4bits bits;
    const int8x16_t values;
    uint32_t aux32[2];

    constexpr static uint32x2_t hshuff = {0x05010400, 0x07030602};

    float d;
};

struct SimpleBits {
    uint8x16x4_t b1;
    uint8x16x4_t b2;
};

IQK_ALWAYS_INLINE int32x4x2_t prepare_scales_8(const uint32x4_t& v1, const uint32x4_t& v2) {
    int32x4x2_t scales;
    auto one = vdupq_n_u32(1);
    scales.val[0] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v1, 28), 1));
    scales.val[1] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v2, 28), 1));
    return scales;
}

inline void apply_signs_2(uint8x16_t * b, const uint64_t * signs, uint32_t sidx) {
    auto s1 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >> 0) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >> 7) & 127))));
    auto s2 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >>14) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >>21) & 127))));
    b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s1));
    b[1] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[1]), s2));
}

IQK_ALWAYS_INLINE int32x4_t prepare_scales_8(const uint32x4_t& v1) {
    return vreinterpretq_s32_u32(vsliq_n_u32(vdupq_n_u32(1), vshrq_n_u32(v1, 28), 1));
}

struct DequantizerIQ2XXS final : public BaseDequantizer<block_iq2_xxs> {
    DequantizerIQ2XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    IQK_ALWAYS_INLINE float new_block(int i) const { return 0.125f * GGML_FP16_TO_FP32(x[i].d); }

    inline int32x4_t unpack(int i, int j, uint8x16_t * q) const {
        auto data = vld1q_u32_x2((const uint32_t *)(x[i].qs + 16*j));
        prepare_all(data, q);
        return prepare_scales_8(vuzp2q_u32(data.val[0], data.val[1]));
    }

private:

    static inline void prepare2(uint8x16_t * b, const uint32_t * bits, const uint64_t * signs) {
        const uint8_t * idx = (const uint8_t *)bits;
        b[0] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[0]], iq2xxs_grid[idx[1]]});
        b[1] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[2]], iq2xxs_grid[idx[3]]});
        apply_signs_2(b, signs, bits[1]);
    }

    inline static void prepare_all(const uint32x4x2_t& data, uint8x16_t * quants) {
        const uint32_t * q2 = (const uint32_t *)data.val;
        prepare2(quants+0, q2+0, keven_signs);
        prepare2(quants+2, q2+2, keven_signs);
        prepare2(quants+4, q2+4, keven_signs);
        prepare2(quants+6, q2+6, keven_signs);
    }
};

inline int32x4x4_t prepare_4bit_scales16(const uint8_t * sc) {
    auto aux = vld1_u8(sc);
    auto scales_l = vand_u8(aux, vdup_n_u8(0xf));
    auto scales_h = vshr_n_u8(aux, 4);
    auto aux1 = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h));

    auto scales8 = vreinterpretq_s8_u8(vorrq_u8(vshlq_n_u8(aux1, 1), vdupq_n_u8(1)));
    int16x8x2_t scales16 = { vmovl_s8(vget_low_s8(scales8)), vmovl_s8(vget_high_s8(scales8)) };
    return make_wider(scales16);
}

struct DequantizerIQ2XS final : public BaseDequantizer<block_iq2_xs> {
    DequantizerIQ2XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x4_t new_block(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        prepare_internal(i, 0);
        return prepare_4bit_scales16(x[i].scales);
    }

    inline void prepare(int i, int j) {
        if (j == 1) prepare_internal(i, 1);
    }

private:

    static void make2(const uint16_t * qs, uint8x16_t * b) {
        auto v1 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[0] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[1] & 511))));
        auto v2 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[2] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[3] & 511))));
        auto s1 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[0] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[1] >> 9))));
        auto s2 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[2] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[3] >> 9))));
        b[0] = vreinterpretq_u8_s8(vmulq_s8(v1, s1));
        b[1] = vreinterpretq_u8_s8(vmulq_s8(v2, s2));
    }

    inline static void make4(const uint16_t * qs, uint8x16_t * b) {
        make2(qs + 0, b + 0);
        make2(qs + 4, b + 2);
    }

    IQK_ALWAYS_INLINE void prepare_internal(int i, int j) {
        make4(x[i].qs + 16*j + 0, bits.b1.val);
        make4(x[i].qs + 16*j + 8, bits.b2.val);
    }

};

// So, I hate to include this table, but with the GCC 12.3 compiler
// bundled in the Cosmopolitan tools, loading the unpacked sign bytes
// from this table using the packed 8 sign bits as index is faster than
// using the standard trick of vceqq_u8(vandq_u8(bits, mask), mask) to
// expand the bits to bytes.
static const uint64_t kall_signs[256] = {
    0x0101010101010101, 0x01010101010101ff, 0x010101010101ff01, 0x010101010101ffff,
    0x0101010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0x0101010101ffffff,
    0x01010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0x01010101ff01ffff,
    0x01010101ffff0101, 0x01010101ffff01ff, 0x01010101ffffff01, 0x01010101ffffffff,
    0x010101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0x010101ff0101ffff,
    0x010101ff01ff0101, 0x010101ff01ff01ff, 0x010101ff01ffff01, 0x010101ff01ffffff,
    0x010101ffff010101, 0x010101ffff0101ff, 0x010101ffff01ff01, 0x010101ffff01ffff,
    0x010101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0x010101ffffffffff,
    0x0101ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0x0101ff010101ffff,
    0x0101ff0101ff0101, 0x0101ff0101ff01ff, 0x0101ff0101ffff01, 0x0101ff0101ffffff,
    0x0101ff01ff010101, 0x0101ff01ff0101ff, 0x0101ff01ff01ff01, 0x0101ff01ff01ffff,
    0x0101ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0x0101ff01ffffffff,
    0x0101ffff01010101, 0x0101ffff010101ff, 0x0101ffff0101ff01, 0x0101ffff0101ffff,
    0x0101ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0x0101ffff01ffffff,
    0x0101ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0x0101ffffff01ffff,
    0x0101ffffffff0101, 0x0101ffffffff01ff, 0x0101ffffffffff01, 0x0101ffffffffffff,
    0x01ff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0x01ff01010101ffff,
    0x01ff010101ff0101, 0x01ff010101ff01ff, 0x01ff010101ffff01, 0x01ff010101ffffff,
    0x01ff0101ff010101, 0x01ff0101ff0101ff, 0x01ff0101ff01ff01, 0x01ff0101ff01ffff,
    0x01ff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0x01ff0101ffffffff,
    0x01ff01ff01010101, 0x01ff01ff010101ff, 0x01ff01ff0101ff01, 0x01ff01ff0101ffff,
    0x01ff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0x01ff01ff01ffffff,
    0x01ff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0x01ff01ffff01ffff,
    0x01ff01ffffff0101, 0x01ff01ffffff01ff, 0x01ff01ffffffff01, 0x01ff01ffffffffff,
    0x01ffff0101010101, 0x01ffff01010101ff, 0x01ffff010101ff01, 0x01ffff010101ffff,
    0x01ffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0x01ffff0101ffffff,
    0x01ffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0x01ffff01ff01ffff,
    0x01ffff01ffff0101, 0x01ffff01ffff01ff, 0x01ffff01ffffff01, 0x01ffff01ffffffff,
    0x01ffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0x01ffffff0101ffff,
    0x01ffffff01ff0101, 0x01ffffff01ff01ff, 0x01ffffff01ffff01, 0x01ffffff01ffffff,
    0x01ffffffff010101, 0x01ffffffff0101ff, 0x01ffffffff01ff01, 0x01ffffffff01ffff,
    0x01ffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0x01ffffffffffffff,
    0xff01010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0xff0101010101ffff,
    0xff01010101ff0101, 0xff01010101ff01ff, 0xff01010101ffff01, 0xff01010101ffffff,
    0xff010101ff010101, 0xff010101ff0101ff, 0xff010101ff01ff01, 0xff010101ff01ffff,
    0xff010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0xff010101ffffffff,
    0xff0101ff01010101, 0xff0101ff010101ff, 0xff0101ff0101ff01, 0xff0101ff0101ffff,
    0xff0101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0xff0101ff01ffffff,
    0xff0101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0xff0101ffff01ffff,
    0xff0101ffffff0101, 0xff0101ffffff01ff, 0xff0101ffffffff01, 0xff0101ffffffffff,
    0xff01ff0101010101, 0xff01ff01010101ff, 0xff01ff010101ff01, 0xff01ff010101ffff,
    0xff01ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0xff01ff0101ffffff,
    0xff01ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0xff01ff01ff01ffff,
    0xff01ff01ffff0101, 0xff01ff01ffff01ff, 0xff01ff01ffffff01, 0xff01ff01ffffffff,
    0xff01ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0xff01ffff0101ffff,
    0xff01ffff01ff0101, 0xff01ffff01ff01ff, 0xff01ffff01ffff01, 0xff01ffff01ffffff,
    0xff01ffffff010101, 0xff01ffffff0101ff, 0xff01ffffff01ff01, 0xff01ffffff01ffff,
    0xff01ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0xff01ffffffffffff,
    0xffff010101010101, 0xffff0101010101ff, 0xffff01010101ff01, 0xffff01010101ffff,
    0xffff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0xffff010101ffffff,
    0xffff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0xffff0101ff01ffff,
    0xffff0101ffff0101, 0xffff0101ffff01ff, 0xffff0101ffffff01, 0xffff0101ffffffff,
    0xffff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0xffff01ff0101ffff,
    0xffff01ff01ff0101, 0xffff01ff01ff01ff, 0xffff01ff01ffff01, 0xffff01ff01ffffff,
    0xffff01ffff010101, 0xffff01ffff0101ff, 0xffff01ffff01ff01, 0xffff01ffff01ffff,
    0xffff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0xffff01ffffffffff,
    0xffffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0xffffff010101ffff,
    0xffffff0101ff0101, 0xffffff0101ff01ff, 0xffffff0101ffff01, 0xffffff0101ffffff,
    0xffffff01ff010101, 0xffffff01ff0101ff, 0xffffff01ff01ff01, 0xffffff01ff01ffff,
    0xffffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0xffffff01ffffffff,
    0xffffffff01010101, 0xffffffff010101ff, 0xffffffff0101ff01, 0xffffffff0101ffff,
    0xffffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0xffffffff01ffffff,
    0xffffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0xffffffffff01ffff,
    0xffffffffffff0101, 0xffffffffffff01ff, 0xffffffffffffff01, 0xffffffffffffffff,
};

struct SignHelper {

    IQK_ALWAYS_INLINE void apply_signs_1x(uint8x16_t * b, const uint8_t * sign_bits) const {
        auto s = vreinterpretq_s8_u64(uint64x2_t{kall_signs[sign_bits[0]], kall_signs[sign_bits[1]]});
        // Normally we would expect this to be faster, but it isn't.
        // auto aux = vcombine_u8(vdup_n_u8(sign_bits[0]), vdup_n_u8(sign_bits[1]));
        // auto s = vreinterpretq_s8_u8(vorrq_u8(vceqq_u8(vandq_u8(aux, smask), smask), m1));
        b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s));
    }

    // We would need these two if we weren't loading from the unpacked sign table.
    //const uint8x16_t smask = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201));
    //const uint8x16_t m1    = vdupq_n_u8(1);
};

struct DequantizerIQ2S final : public BaseDequantizer<block_iq2_s> {
    DequantizerIQ2S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x4_t new_block(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        prepare_internal(i, 0, bits);
        return prepare_4bit_scales16(x[i].scales);
    }

    inline void prepare(int i, int j) {
        if (j == 1) prepare_internal(i, 1, bits);
    }

private:

    static void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh, uint8x16_t * b) {
        uint32_t aux32[2];
        const uint16_t * aux16 = (const uint16_t *)aux32;
        for (int k = 0; k < 2; ++k) {
            aux32[1] = (qh[k] << 4) | (qh[k] << 18);
            aux32[0] = (aux32[1] << 4) & 0x03000300;
            aux32[1] &= 0x03000300;
            b[2*k+0] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+0] | aux16[0]))),
                                   vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+1] | aux16[1]))));
            b[2*k+1] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+2] | aux16[2]))),
                                   vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+3] | aux16[3]))));
            sh.apply_signs_1x(b+2*k+0, sign_bits); sign_bits += 2;
            sh.apply_signs_1x(b+2*k+1, sign_bits); sign_bits += 2;
        }
    }

    void prepare_internal(int i, int j, SimpleBits& sb) {

        const auto * qs = x[i].qs + 16*j;
        const auto * qh = x[i].qh + 4*j;
        const auto * sign_bits = qs + QK_K/8;

        make4(sh, sign_bits+0, qs+0, qh+0, sb.b1.val);
        make4(sh, sign_bits+8, qs+8, qh+2, sb.b2.val);
    }

    SignHelper sh;
};

struct DequantizerIQ3XXS final : public BaseDequantizer<block_iq3_xxs> {
    DequantizerIQ3XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    IQK_ALWAYS_INLINE float new_block(int i) const { return 0.25f * GGML_FP16_TO_FP32(x[i].d); }

    inline int32x4_t unpack(int i, int j, uint8x16_t * q) const {
        auto q3data = vld1q_u8_x2(x[i].qs + 32*j);
        auto gas = vld1q_u32((const uint32_t *)(x[i].qs + QK_K/4 + 16*j));
        prepare_block((const uint8_t *)q3data.val, (const uint32_t *)&gas, q);
        return prepare_scales_8(gas);
    }

private:

    inline static void make2(const uint8_t * q3, const uint32_t sidx, uint8x16_t * b) {
        b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[0]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[3]]});
        b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[4]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[7]]});
        apply_signs_2(b, keven_signs, sidx);
    }
    inline static void prepare_block(const uint8_t * q3, const uint32_t * signs, uint8x16_t * quants) {
        make2(q3+ 0, signs[0], quants + 0);
        make2(q3+ 8, signs[1], quants + 2);
        make2(q3+16, signs[2], quants + 4);
        make2(q3+24, signs[3], quants + 6);
    }
};

struct DequantizerIQ3S final : public BaseDequantizer<block_iq3_s> {
    DequantizerIQ3S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x2_t new_block(int i) {
        d = GGML_FP16_TO_FP32(x[i].d);
        uint32_t scales32[2];
        auto qs = vld1q_u8_x2(x[i].qs);
        auto signs = vld1q_u8(x[i].signs);

        prepare_block((const uint8_t *)qs.val, x[i].qh, (const uint8_t *)&signs);

        std::memcpy(scales32, x[i].scales, 4);
        scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
        scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
        auto scales8 = vld1_u8((const uint8_t *)scales32); // 0, 2, 4, 6, 1, 3, 5, 7
        scales8 = vtbl1_u8(scales8, vreinterpret_u8_u64(vdup_n_u64(0x0703060205010400)));
        auto scales16 = vreinterpretq_s16_u16(vmovl_u8(scales8));
        int32x4x2_t scales;
        scales.val[0] = vmovl_s16(vget_low_s16(scales16));
        scales.val[1] = vmovl_s16(vget_high_s16(scales16));
        return scales;
    }

    inline void prepare(int i, int j) {
        if (j == 1) {
            auto qs = vld1q_u8_x2(x[i].qs + 32);
            auto signs = vld1q_u8(x[i].signs + 16);
            prepare_block((const uint8_t *)qs.val, x[i].qh + 4, (const uint8_t *)&signs);
        }
    }

private:

    static inline void make2(const SignHelper& sh, const uint8_t * sign_bits, const uint16x8_t& idx_l, uint8_t qh,
            const int16x8_t& hshift, uint8x16_t * b) {
        auto vindex = vorrq_u16(idx_l, vandq_u16(vshlq_u16(vdupq_n_u16(qh), hshift), vdupq_n_u16(256)));
        const uint16_t * idx = (const uint16_t *)&vindex;
        b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[0]], iq3s_grid[idx[1]], iq3s_grid[idx[2]], iq3s_grid[idx[3]]});
        sh.apply_signs_1x(b+0, sign_bits+0);
        b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[4]], iq3s_grid[idx[5]], iq3s_grid[idx[6]], iq3s_grid[idx[7]]});
        sh.apply_signs_1x(b+1, sign_bits+2);
    }
    static inline void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh,
            const int16x8_t& hshift, uint8x16_t * b) {
        auto idx_l = vld1q_u8(qs);
        make2(sh, sign_bits+0, vmovl_u8(vget_low_u8 (idx_l)), qh[0], hshift, b+0);
        make2(sh, sign_bits+4, vmovl_u8(vget_high_u8(idx_l)), qh[1], hshift, b+2);
    }

    static int16x8_t load_shift() {
        static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
        return vld1q_s16(k_shift);
    }

    inline void prepare_block(const uint8_t * qs, const uint8_t * qh, const uint8_t * sign_bits) {
        auto signs = vld1q_u8(sign_bits);
        auto s = (const uint8_t *)&signs;
        make4(sh, s + 0, qs+ 0, qh+0, hshift, bits.b1.val);
        make4(sh, s + 8, qs+16, qh+2, hshift, bits.b2.val);
    }

    SignHelper sh;
    const int16x8_t hshift = load_shift();

};

template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_IQXXS(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);
    Dequantizer deq(vx, bx, nrc_y);
    uint8x16_t  qx[8];
    int32x4_t   sumi[nrc_y];
    float32x4_t acc[nrc_y];

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        for (int i = 0; i < nb; ++i) {
            float d = deq.new_block(i);
            auto scales = deq.unpack(i, 0, qx);
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                sumi[iy] = vdupq_n_s32(0);
                compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 0, sumi[iy]);
            }
            scales = deq.unpack(i, 1, qx);
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 1, sumi[iy]);
                acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*q8.scale(iy, i)), vcvtq_f32_s32(sumi[iy]));
            }
        }
#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

// =========================================== Legacy quants

template <typename Block>
inline float16x4_t load_scales_q0(const Block * x, ggml_half * aux) {
    for (int k = 0; k < 4; ++k) aux[k] = x[k].d;
    return vld1_f16((const float16_t *)aux);
}

template <typename Block>
inline float16x8_t load_scales_q1(const Block * x, ggml_half * aux) {
    if constexpr (std::is_same_v<Block, block_q8_1>) {
        for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].s; }
    } else {
        for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].m; }
    }
    return vld1q_f16((const float16_t *)aux);
}

struct Q4LegacyBits {
    template <typename Block>
    inline void prepare(const Block * x) {
        for (int i = 0; i < 4; ++i) {
            auto q4bits = vld1q_u8(x[i].qs);
            b[2*i+0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b));
            b[2*i+1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4));
        }
    }
    inline void prepare1(const uint8_t * qs, int8x16_t * q) const {
        auto q4bits = vld1q_u8(qs);
        q[0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b));
        q[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4));
    }
    inline void prepare1(const uint8_t * qs) {
        prepare1(qs, b);
    }
    const uint8x16_t m4b = vdupq_n_u8(0xf);
    int8x16_t b[8];
};

// One would think this commented out version would do better than the one below
// because it offers more opportunities to execute instructions in parallel.
// Instead, it runs significantly slower. Why? If the compiler is running out of vector registers
// cannot it just do the sequential version below on its own?
//inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) {
//    const auto q8b_1 = vld1q_s8_x2(qs + 0);
//    auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b_1.val[0]), b[1], q8b_1.val[1]);
//    const auto q8b_2 = vld1q_s8_x2(qs + 32);
//    auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b_2.val[0]), b[3], q8b_2.val[1]);
//    auto p1234 = vpaddq_s32(p12, p34);
//    const auto q8b_3 = vld1q_s8_x2(qs + 64);
//    auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b_3.val[0]), b[5], q8b_3.val[1]);
//    const auto q8b_4 = vld1q_s8_x2(qs + 96);
//    auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b_4.val[0]), b[7], q8b_4.val[1]);
//    return vpaddq_s32(p1234, vpaddq_s32(p56, p78));
//}

inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) {
    auto q8b = vld1q_s8_x2(qs + 0);
    auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b.val[0]), b[1], q8b.val[1]);
    q8b = vld1q_s8_x2(qs + 32);
    auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b.val[0]), b[3], q8b.val[1]);
    auto p1234 = vpaddq_s32(p12, p34);
    q8b = vld1q_s8_x2(qs + 64);
    auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b.val[0]), b[5], q8b.val[1]);
    q8b = vld1q_s8_x2(qs + 96);
    auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b.val[0]), b[7], q8b.val[1]);
    return vpaddq_s32(p1234, vpaddq_s32(p56, p78));
}

template <int nrc> struct Q80 {

    constexpr static int nrc_y = nrc;

    Q80(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_0 *)info.src1_row(iy);
    }

    inline const int8_t * quant_data(int iy, int i) const {
        const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
        return y4->qs;
    }

    inline float16x4_t load_scales(int iy, int i) const {
        const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
        return vld1_f16((const float16_t *)y4->d);
    }

    template <typename Dequantizer>
    inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * /*acc*/) const {
        auto qx_scales = deq.new_block(i);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8_scales = load_scales(iy, i);
            sc16[iy] = vmul_f16(qx_scales, q8_scales);
        }
    }

    template <typename Dequantizer>
    inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const {
        deq.prepare1(i);
        float d = GGML_FP16_TO_FP32(deq.x[i].d);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8b = vld1q_s8_x2(y[iy][i].qs);
            auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]);
            acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p));
        }
    }

    const block_q8_0 * y[nrc_y];
};

template <int nrc> struct Q81 {

    constexpr static int nrc_y = nrc;

    Q81(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_1 *)info.src1_row(iy);
    }

    inline const int8_t * quant_data(int iy, int i) const {
        const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
        return y4->qs;
    }

    inline float16x8_t load_scales(int iy, int i) const {
        const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
        return vld1q_f16((const float16_t *)y4->d);
    }

    template <typename Dequantizer>
    inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * acc) const {
        auto qx_scales = deq.new_block(i);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8_scales = load_scales(iy, i);
            auto m = vmul_f16(vget_high_f16(qx_scales), vget_high_f16(q8_scales));
            acc[iy] = vaddq_f32(acc[iy], vcvt_f32_f16(m));
            sc16[iy] = vmul_f16(vget_low_f16(qx_scales), vget_low_f16(q8_scales));
        }
    }

    template <typename Dequantizer>
    inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const {
        deq.prepare1(i);
        float d = GGML_FP16_TO_FP32(deq.x[i].d), m = 0.25f*GGML_FP16_TO_FP32(deq.x[i].m);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8b = vld1q_s8_x2(y[iy][i].qs);
            auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]);
            acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p));
            acc[iy] = vaddq_f32(acc[iy], vdupq_n_f32(m*GGML_FP16_TO_FP32(y[iy][i].s)));
        }
    }

    const block_q8_1 * y[nrc_y];
};

template <typename block_q>
struct BaseLegacyDequantizer {

    BaseLegacyDequantizer(const void * vx, size_t bx) : vx(vx), x(nullptr), bx(bx) {}

    inline void new_row(int ix) { x = (const block_q *)((const char *)vx + bx*ix); }

    Q4LegacyBits bits;

    const void * vx;
    const block_q * x;
    size_t bx;
};

struct DequantizerQ40 final : public BaseLegacyDequantizer<block_q4_0> {

    DequantizerQ40(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        q[0] = vaddq_s8(q[0], m8);
        q[1] = vaddq_s8(q[1], m8);
    }
    inline void prepare1(int i) {
        prepare1(i, bits.b);
    }

    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vld1_f16((const float16_t *)aux);
    }

    const int8x16_t m8 = vdupq_n_s8(-8);
    //ggml_half aux[4];
};

struct DequantizerQ41 : public BaseLegacyDequantizer<block_q4_1> {

    DequantizerQ41(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i) {
        bits.prepare1(x[i].qs);
    }

    inline float16x8_t new_block(int i) {
        uint32_t aux32[4];
        const uint32_t * s32 = (const uint32_t *)&x[4*i].d;
        for (int k = 0; k < 4; ++k) {
            aux32[k] = *s32; s32 += sizeof(block_q4_1)/4;
            bits.prepare1(x[4*i+k].qs, bits.b + 2*k);
        }
        return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle)));
    }
    // Leaving this commented out attempt to be reminded that I already tried this.
    // It has basically the same performance as the version above.
    //inline float16x8_t new_block(int i) {
    //    uint32x4_t scales = {};
    //    const block_q4_1 * xi = x + 4*i;
    //    const uint32_t * s32 = (const uint32_t *)&xi->d;
    //    scales = vsetq_lane_u32(*s32, scales, 0); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[0].qs, bits.b + 0);
    //    scales = vsetq_lane_u32(*s32, scales, 1); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[1].qs, bits.b + 2);
    //    scales = vsetq_lane_u32(*s32, scales, 2); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[2].qs, bits.b + 4);
    //    scales = vsetq_lane_u32(*s32, scales, 3);
    //    bits.prepare1(xi[3].qs, bits.b + 6);
    //    return vreinterpretq_f16_u8(vqtbl1q_u8(vreinterpretq_u8_u32(scales), vreinterpretq_u8_u64(shuffle)));
    //}

    const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302};
};

struct HighBit5Legacy {
    inline uint8x16_t to_bytes(const uint8_t * qh) const {
        uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle);
        return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vreinterpretq_u8_u64(mask));
    }
    inline uint8x16_t to_negated_bytes(const uint8_t * qh) const {
        uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle);
        return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vdupq_n_u8(0));
    }
    const uint64x2_t mask = vdupq_n_u64(0x8040201008040201);
    const uint8x16_t shuffle = vcombine_u8(vdup_n_u8(0), vdup_n_u8(1));
};

struct DequantizerQ50 final : public BaseLegacyDequantizer<block_q5_0> {

    DequantizerQ50(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        auto qh = x[i].qh;
        q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_negated_bytes(qh+0))));
        q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_negated_bytes(qh+2))));
    }
    inline void prepare1(int i) {
        prepare1(i, bits.b);
    }

    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vld1_f16((const float16_t *)aux);
    }

    HighBit5Legacy hbits;

    const uint8x16_t mh = vdupq_n_u8(0xf0);

};

struct DequantizerQ80 final : public BaseLegacyDequantizer<block_q8_0> {

    DequantizerQ80(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i) {
        bits.b[0] = vld1q_s8(x[i].qs);
        bits.b[1] = vld1q_s8(x[i].qs+16);
    }

    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            bits.b[2*k+0] = vld1q_s8(x[4*i+k].qs);
            bits.b[2*k+1] = vld1q_s8(x[4*i+k].qs+16);
        }
        return vld1_f16((const float16_t *)aux);
    }

};

struct DequantizerQ51 final : public BaseLegacyDequantizer<block_q5_1> {

    DequantizerQ51(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        auto qh = x[i].qh;
        q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_bytes(qh+0))));
        q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_bytes(qh+2))));
    }
    inline void prepare1(int i) {
        bits.prepare1(x[i].qs, bits.b);
    }

    inline float16x8_t new_block(int i) {
        uint32_t aux32[4];
        const uint32_t * s32 = (const uint32_t *)&x[4*i].d;
        for (int k = 0; k < 4; ++k) {
            aux32[k] = *s32; s32 += sizeof(block_q5_1)/4;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle)));
    }

    HighBit5Legacy hbits;

    const uint8x16_t mh = vdupq_n_u8(0x10);
    const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302};

};

template <typename Dequantizer, typename Q8>
inline void sum_4(int i, Dequantizer& deq, const Q8& q8, const float16x4_t * sc16, float32x4_t * acc) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto pall = sum_4_blocks(deq.bits.b, q8.quant_data(iy, i));
        auto scale = vcvt_f32_f16(sc16[iy]);
        acc[iy] = vmlaq_f32(acc[iy], scale, vcvtq_f32_s32(pall));
    }
}

template <typename Dequantizer, typename Q8>
inline void mul_mat_qX_Y_q8_Y(int n, Dequantizer& deq, Q8& q8, const DataInfo& info, int nrc_x) {
    const int nb = n / QK4_1;

    float16x4_t sc16[Q8::nrc_y];

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[Q8::nrc_y];
        for (int iy = 0; iy < Q8::nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        for (int i = 0; i < nb/4; ++i) {
            q8.process_scales(i, deq, sc16, acc);
            sum_4(i, deq, q8, sc16, acc);
        }
        for (int i = 4*(nb/4); i < nb; ++i) {
            q8.process_1_block(i, deq, acc);
        }

        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

template <typename Dequantizer, typename Q8>
inline void mul_mat_qX_Y_q8_Y_1(int n, Dequantizer& deq1, Dequantizer& deq2, Q8& q8, const DataInfo& info, int nrc_x) {
    const int nb = n / QK4_1;

    float16x4_t sc16[2];

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq1.new_row(ix);
        deq2.new_row(ix);

        float32x4_t acc[2] = { vdupq_n_f32(0.f), vdupq_n_f32(0.f) };

        for (int i = 0; i < nb/8; ++i) {
            q8.process_scales(2*i+0, deq1, sc16+0, acc+0);
            q8.process_scales(2*i+1, deq2, sc16+1, acc+1);
            sum_4(2*i+0, deq1, q8, sc16+0, acc+0);
            sum_4(2*i+1, deq2, q8, sc16+1, acc+1);
        }
        for (int i = 2*(nb/8); i < nb/4; ++i) {
            q8.process_scales(i, deq1, sc16, acc);
            sum_4(i, deq1, q8, sc16, acc);
        }
        for (int i = 4*(nb/4); i < nb; ++i) {
            q8.process_1_block(i, deq1, acc);
        }

        info.store(ix, 0, vaddvq_f32(vaddq_f32(acc[0], acc[1])));
    }
}

template <typename Dequantizer, int nrc_y>
static void IQK_NOINLINE mul_mat_qX_1_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Q81<nrc_y> q8(info);
    if constexpr (nrc_y == 1) {
        Dequantizer deq1(vx, bx), deq2(vx, bx);
        mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
    } else {
        Dequantizer deq(vx, bx);
        mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x);
    }
}

template <typename Dequantizer, int nrc_y>
static void IQK_NOINLINE mul_mat_qX_0_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Q80<nrc_y> q8(info);
    if constexpr (nrc_y == 1) {
        Dequantizer deq1(vx, bx), deq2(vx, bx);
        mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
    } else {
        Dequantizer deq(vx, bx);
        mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x);
    }
}

template <typename Dequantizer>
static void IQK_NOINLINE mul_mat_qX_1_q8_1_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Dequantizer deq1(vx, bx), deq2(vx, bx);
    Q81<1> q8(info);
    mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
}

template <typename Dequantizer>
static void IQK_NOINLINE mul_mat_qX_0_q8_0_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Dequantizer deq1(vx, bx), deq2(vx, bx);
    Q80<1> q8(info);
    mul_mat_qX_Y_q8_Y(n, deq1, deq2, q8, info, nrc_x);
}

template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
    if constexpr (std::is_same_v<Dequantizer, DequantizerQ40> || std::is_same_v<Dequantizer, DequantizerQ50> ||
                  std::is_same_v<Dequantizer, DequantizerQ80>) {
        m.funcs[0] = mul_mat_qX_0_q8_0<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_0_q8_0<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_0_q8_0<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_0_q8_0<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_0_q8_0<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_0_q8_0<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_0_q8_0<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_0_q8_0<Dequantizer, 8>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerQ41> || std::is_same_v<Dequantizer, DequantizerQ51>) {
        m.funcs[0] = mul_mat_qX_1_q8_1<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_1_q8_1<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_1_q8_1<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_1_q8_1<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_1_q8_1<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_1_q8_1<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_1_q8_1<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_1_q8_1<Dequantizer, 8>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2XXS> || std::is_same_v<Dequantizer, DequantizerIQ3XXS>) {
        m.funcs[0] = mul_mat_qX_K_q8_K_IQXXS<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_IQXXS<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_IQXXS<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_IQXXS<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_IQXXS<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_IQXXS<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_IQXXS<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_IQXXS<8, Dequantizer>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2S> ||
                       std::is_same_v<Dequantizer, DequantizerIQ3S> ||
                       std::is_same_v<Dequantizer, DequantizerIQ2XS>) {
        m.funcs[0] = mul_mat_qX_K_q8_K_IQ<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_IQ<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_IQ<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_IQ<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_IQ<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_IQ<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_IQ<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_IQ<8, Dequantizer>;
    }
    else {
        m.funcs[0] = mul_mat_qX_K_q8_K_T<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_T<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_T<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_T<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_T<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_T<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_T<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_T<8, Dequantizer>;
    }
}

bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& m, int& row_size_q8, int Ny) {
    row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);

    (void)Ny;
    // Uncommenting out this would disable iqk_mul_mat for matrix x vector multiplications.
    //if (Ny == 1 && (typeA == GGML_TYPE_IQ2_XXS || typeA == GGML_TYPE_IQ2_XS || typeA == GGML_TYPE_IQ2_S ||
    //                typeA == GGML_TYPE_IQ3_XXS || typeA == GGML_TYPE_IQ3_S)) return false;

    switch (typeA) {
        case GGML_TYPE_Q2_K:
            MulMat::set_functions<DequantizerQ2K>(m);
            break;
        case GGML_TYPE_Q3_K:
            MulMat::set_functions<DequantizerQ3K>(m);
            break;
        case GGML_TYPE_Q4_K:
            MulMat::set_functions<DequantizerQ4K>(m);
            break;
        case GGML_TYPE_Q5_K:
            MulMat::set_functions<DequantizerQ5K>(m);
            break;
        case GGML_TYPE_Q6_K:
            MulMat::set_functions<DequantizerQ6K>(m);
            break;
        case GGML_TYPE_IQ4_XS:
            MulMat::set_functions<DequantizerIQ4XS>(m);
            break;
        case GGML_TYPE_IQ3_S:
            MulMat::set_functions<DequantizerIQ3S>(m);
            break;
        case GGML_TYPE_IQ3_XXS:
            MulMat::set_functions<DequantizerIQ3XXS>(m);
            break;
        case GGML_TYPE_IQ2_S:
            MulMat::set_functions<DequantizerIQ2S>(m);
            break;
        case GGML_TYPE_IQ2_XS:
            MulMat::set_functions<DequantizerIQ2XS>(m);
            break;
        case GGML_TYPE_IQ2_XXS:
            MulMat::set_functions<DequantizerIQ2XXS>(m);
            break;
        case GGML_TYPE_Q4_0:
            MulMat::set_functions<DequantizerQ40>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q4_1:
            MulMat::set_functions<DequantizerQ41>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;
        case GGML_TYPE_Q5_0:
            MulMat::set_functions<DequantizerQ50>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q5_1:
            MulMat::set_functions<DequantizerQ51>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;
        case GGML_TYPE_Q8_0:
            MulMat::set_functions<DequantizerQ80>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        default:
            return false;
    }
    return true;
}

}

#endif // __x86_64__ or __aarch64__

================================================
FILE: third_party/llamafile/iqk_mul_mat_amd_avx2.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_avx2.cpp
// Copyrigth 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#include "iqk_mul_mat.inc"
#endif  // __x86_64__


================================================
FILE: third_party/llamafile/iqk_mul_mat_amd_zen4.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_zen4.cpp
// Copyrigth 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define iqk_mul_mat iqk_mul_mat_zen4
#define iqk_mul_mat_moe iqk_mul_mat_moe_zen4
#include "iqk_mul_mat.inc"
#endif  // __x86_64__


================================================
FILE: third_party/llamafile/iqk_mul_mat_arm.inc
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat.inc
// Copyrigth 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp fenc=utf-8 :vi
//
// Copyright 2024 Iwan Kawrakow
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cstring>
#include <type_traits>
#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64)

#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "sgemm.h"

// For i-quants, I had to explicitely specify which
// functions to inline / not inline (at least for some
// of the functions), else performance would be significantly
// lower. This is worrysome as things can change with,
// e.g., a different compiler version or running on a different
// CPU.
#ifdef _MSC_VER
#define IQK_NOINLINE __declspec(noinline)
#define IQK_ALWAYS_INLINE inline
#else
#define IQK_NOINLINE __attribute__((__noinline__))
#define IQK_ALWAYS_INLINE __attribute__((always_inline))
#endif

#define GGML_COMMON_IMPL_C
#include "llama.cpp/ggml-common.h"

// clang-format off

// This matrix - vector and matrix - matrix multiplication implementation
// for legacy quants, k-quants and i-quants makes prompt processing 150-200%
// (legacy and k-quants) or 250-400% (i-quants) faster.
// compared to mainline llama.cpp (and llamafile).
// It provides implementations for ARM_NEON (all quants) and AVX2
// (all quants except sub-4 bit i-quants).
//
// Main idea is that unpacking the quants and the block scales to
// be ready for dot products with the corresponding Q8_Y quants
// takes time (here 'Y' stands for K, 0, or 1, depending on quantization type).
// Hence, if we are performing a QX x Q8_Y matrix matrix
// multiplication (as needed for prompt processing), we can get
// a significant speedup by reusing the unpacked QX quants and scales
// for multiplication with several Q8_K columns. We also achieve fewer
// loads from memory, which is the main purpose of tiling in general
// purpose matrix multiplication packages.

#include <utility>
#include <array>

#endif

namespace {

typedef struct {
    int32_t i1;
    int32_t i2;
} mmid_row_mapping;

struct DataInfo {
    float       * s;
    const char  * cy;
    size_t        bs;
    size_t        by;
    int           cur_y = 0;
    int           ne11;
    const mmid_row_mapping * row_mapping = nullptr;
    size_t        bs2 = 0;

    inline const char * src1_row(int iy) const {
        if (!row_mapping) return cy + (cur_y + iy)*by;
        int i11 = row_mapping[cur_y + iy].i1 % ne11;
        int i12 = row_mapping[cur_y + iy].i2;
        return cy + (i11 + i12*ne11)*by;
    }

    inline void store(int ix, int iy, float result) const {
        *(dst_row(iy) + ix) = result;
        //dst_row(iy)[ix] = result;
    }
    inline float * dst_row(int iy) const {
        if (!row_mapping) return s + (cur_y + iy)*bs;
        int i12 = row_mapping[cur_y + iy].i2;
        int i1  = row_mapping[cur_y + iy].i1;
        int i2  = i12;
        return s + i1*bs + i2*bs2;
    }
};

typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x);

struct MulMat {
    std::array<mul_mat_t, 8> funcs = {};
    //inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
    IQK_NOINLINE void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
        constexpr int k_x_step = 64; // This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile size are small)
        int n_step = (nrc_y - info.cur_y)/funcs.size();
        if (n_step > 0) {
            for (int ix = 0; ix < nrc_x; ix += k_x_step) {
                auto this_info = info;
                this_info.s += ix;
                int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
                for (int iy = 0; iy < n_step; ++iy) {
                    funcs.back()(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
                    this_info.cur_y += funcs.size();
                }
            }
            info.cur_y += funcs.size() * n_step;
        }
        int n_left = nrc_y - info.cur_y;
        if (n_left > 0) {
            funcs[n_left-1](n, vx, bx, info, nrc_x);
        }
    }
    static IQK_NOINLINE bool set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int Ny);
private:
    template <typename Dequantizer> static IQK_NOINLINE void set_functions(MulMat& m);
};

inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) {
    const uint16_t * scales = (const uint16_t *)scales8;
    const uint32_t a0 = scales[0] | (scales[1] << 16);
    const uint32_t a1 = scales[2] | (scales[3] << 16);
    const uint32_t a2 = scales[4] | (scales[5] << 16);
    aux32[3] = ((a2 >> 4) & 0x0f0f0f0f) | ((a1 >> 2) & 0x30303030);
    aux32[1] = ((a2 >> 0) & 0x0f0f0f0f) | ((a0 >> 2) & 0x30303030);
    aux32[2] = a1 & 0x3f3f3f3f;
    aux32[0] = a0 & 0x3f3f3f3f;
}

const uint64_t keven_signs[128] = {
    0x0101010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0x010101010101ffff,
    0xff01010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0xff01010101ffffff,
    0xff010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0xff010101ff01ffff,
    0x01010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0x01010101ffffffff,
    0xff0101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0xff0101ff0101ffff,
    0x010101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0x010101ff01ffffff,
    0x010101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0x010101ffff01ffff,
    0xff0101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0xff0101ffffffffff,
    0xff01ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0xff01ff010101ffff,
    0x0101ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0x0101ff0101ffffff,
    0x0101ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0x0101ff01ff01ffff,
    0xff01ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0xff01ff01ffffffff,
    0x0101ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0x0101ffff0101ffff,
    0xff01ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0xff01ffff01ffffff,
    0xff01ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0xff01ffffff01ffff,
    0x0101ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0x0101ffffffffffff,
    0xffff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0xffff01010101ffff,
    0x01ff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0x01ff010101ffffff,
    0x01ff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0x01ff0101ff01ffff,
    0xffff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0xffff0101ffffffff,
    0x01ff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0x01ff01ff0101ffff,
    0xffff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0xffff01ff01ffffff,
    0xffff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0xffff01ffff01ffff,
    0x01ff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0x01ff01ffffffffff,
    0x01ffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0x01ffff010101ffff,
    0xffffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0xffffff0101ffffff,
    0xffffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0xffffff01ff01ffff,
    0x01ffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0x01ffff01ffffffff,
    0xffffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0xffffffff0101ffff,
    0x01ffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0x01ffffff01ffffff,
    0x01ffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0x01ffffffff01ffff,
    0xffffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0xffffffffffffffff,
};

}

bool iqk_mul_mat(long Nx, long Ny, long ne00, int typeA, const void * A, const void * B,
        float * C, long stride_C, int ith, int nth) {

    MulMat mm;
    int row_size_q8;
    if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
        return false;
    }

    auto row_size_qx = ggml_row_size((ggml_type)typeA, ne00);

    auto nrc_x = (Nx + nth - 1)/nth;
    auto first_x = ith*nrc_x;
    if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;

    DataInfo info{C + first_x, (const char *)B, (size_t)stride_C, (size_t)row_size_q8, 0, 1, nullptr, 0};

    mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);

    return true;
}

bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const void * A, const void * B,
        float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) {
    const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping;
    assert(row_mapping != nullptr);

    MulMat mm;
    int row_size_q8;
    if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
        return false;
    }
    int row_size_qx = ggml_row_size((ggml_type)typeA, ne00);
    int nrc_x = (Nx + nth - 1)/nth;
    int first_x = ith*nrc_x;
    if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;
    DataInfo info{C + first_x, (const char *)B, nb1/sizeof(float), (size_t)row_size_q8, 0, ne11, row_mapping, nb2/sizeof(float)};
    mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
    return true;
}

#if defined __x86_64__ || defined(_M_X64)

#if defined HAVE_FANCY_SIMD
    #undef HAVE_FANCY_SIMD
#endif
#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
    #define HAVE_FANCY_SIMD
#endif

namespace {

inline float hsum_float_4(__m128 x) {
    x = _mm_add_ps(x, _mm_movehl_ps(x, x));
    x = _mm_add_ss(x, _mm_movehdup_ps(x));
    return _mm_cvtss_f32(x);
}
inline float hsum_float_8(__m256 x) {
    return hsum_float_4(_mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1)));
}

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)


template <int nrc, typename block_q8 = block_q8_K> struct Q8 {

    constexpr static int nrc_y = nrc;

    Q8(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
    }

#ifdef HAVE_FANCY_SIMD
    inline __m512i load_quants(int iy, int i, int j) const { return _mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); }
#else
    inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); }
#endif
    inline __m256i load_bsums(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].bsums); }
    inline float scale(int iy, int i) const { return y[iy][i].d; }

    const block_q8 * y[nrc_y];
};

// Handles q4_K and q5_K scales/mins
struct Scales8K {
    template <typename Q8>
    inline __m256i process_mins_and_scales(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) {
        make_q4_scales(data, utmp);
        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
        const __m128i mins128 = _mm256_extracti128_si256(mins_and_scales, 1);
        accum_mins(mins128, q8, i, c, accd);
        const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
        return MM256_SET_M128I(sc128, sc128);
    }
#ifdef HAVE_FANCY_SIMD
    template <typename Q8>
    inline __m512i process_mins_and_scales_64(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) {
        auto scales = process_mins_and_scales(data, c, i, q8, accd);
        return _mm512_inserti32x8(_mm512_castsi256_si512(scales), scales, 1);
    }
#endif
    template <typename Q8>
    inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
        const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0]));
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i q8s = q8.load_bsums(iy, i);
            const __m256i prod = _mm256_madd_epi16(mins, q8s);
            accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
        }
    }
#ifdef HAVE_FANCY_SIMD
    const __m512i shuffles512[2] = {
        _mm512_set_epi64(0x0706070607060706, 0x0302030203020302, 0x0706070607060706, 0x0302030203020302,
                         0x0504050405040504, 0x0100010001000100, 0x0504050405040504, 0x0100010001000100),
        _mm512_set_epi64(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a,
                         0x0d0c0d0c0d0c0d0c, 0x0908090809080908, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
    };
#endif
    const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100),
                                 _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)};

    uint32_t utmp[4];
};

template <typename Q8>
inline void process_mins_16(const __m256i& all_scales, const Q8& q8, int i, float d, __m256 * accm) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        const __m256i prod  = _mm256_madd_epi16(all_scales, q8.load_bsums(iy, i));
        accm[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d * q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accm[iy]);
    }
}
inline void prepare_scales_16(const __m256i& all_scales, __m256i * scales) {
    const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
    const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
    scales[0] = MM256_SET_M128I(l_scales, l_scales);
    scales[1] = MM256_SET_M128I(h_scales, h_scales);
}

struct ScaleQ3 {
    inline __m128i make_scales(const uint16_t * s8) const {
        const uint16_t * scales16 = (const uint16_t *)s8;
        uint32_t aux0 = scales16[0] | (scales16[1] << 16);
        uint32_t aux1 = scales16[2] | (scales16[3] << 16);
        uint32_t aux2 = scales16[4] | (scales16[5] << 16);
        __m128i scales128 = _mm_set_epi32(
            ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030),
            ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030),
             (aux1       & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030),
             (aux0       & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030));
        return _mm_add_epi8(scales128, m32);
    }
    const __m128i m32 = _mm_set1_epi8(-32);
};

struct ScaleIQ4XS {
    inline __m128i make_scales(const uint32_t scales_l, const uint16_t scales_h) {
        uint32_t tmp32 = scales_h | (scales_h << 14);
        const __m128i sh = _mm_slli_epi16(_mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(tmp32), hshift), hmask), 4);
        const __m128i sl = _mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(scales_l), lshift), lmask);
        return _mm_add_epi16(_mm_or_si128(sh, _mm_cvtepi8_epi16(_mm_shuffle_epi8(sl, lshuffle))), m32);
    }
    const __m128i hshift = _mm_set_epi32(12, 8, 4, 0);
    const __m128i lshift = _mm_set_epi32(4, 0, 4, 0);
    const __m128i hmask  = _mm_set1_epi16(0x03);
    const __m128i lmask  = _mm_set1_epi8(0xf);
    const __m128i lshuffle = _mm_set_epi32(0x07030602, 0x05010400, 0x07030602, 0x05010400);
    const __m128i m32 = _mm_set1_epi16(-32);
};

template <typename Block>
struct BaseDequantizer {
    BaseDequantizer(const void * vx, size_t bx) : vx(vx), bx(bx) {}
    inline void new_row(int ix) {
        x = (const Block *)((const char *)vx + bx*ix);
    }

    const void *  vx;
    size_t        bx;
    const Block * x;

    float d;
};

#ifdef HAVE_FANCY_SIMD
//====================================== Zen4 ==================================================

struct BlockPermuter {
    const __m512i permute1 = _mm512_set_epi64(11, 10,  9,  8, 3, 2, 1, 0);
    const __m512i permute2 = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4);
};

struct Q4Bits {
    inline void prepare(const uint8_t * q4) {
        auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0);
        auto tmp1 = _mm512_and_si512(q4bits, ml);
        auto tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
        values[0] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2);
        values[1] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2);
        q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1);
        tmp1 = _mm512_and_si512(q4bits, ml);
        tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
        values[2] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2);
        values[3] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2);
    }
    inline void prepare64(const uint8_t * q4) {
        auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0);
        values[0] = _mm512_and_si512(q4bits, ml);
        values[1] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
        q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1);
        values[2] = _mm512_and_si512(q4bits, ml);
        values[3] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
    }
    __m512i values[4];
    const __m512i ml = _mm512_set1_epi8(0xf);
    BlockPermuter perm;
};

struct Q2Bits {
    inline void prepare(const uint8_t * q2) {

        auto q2bits = _mm512_loadu_si512((const __m512i*)q2);
        auto tmp = _mm512_srli_epi16(q2bits, 2);

        values[0] = _mm512_permutex2var_epi64(q2bits, perm.permute1, tmp);
        values[2] = _mm512_permutex2var_epi64(q2bits, perm.permute2, tmp);
        values[1] = _mm512_and_si512(_mm512_srli_epi16(values[0], 4), ml);
        values[3] = _mm512_and_si512(_mm512_srli_epi16(values[2], 4), ml);
        values[0] = _mm512_and_si512(values[0], ml);
        values[2] = _mm512_and_si512(values[2], ml);
    }
    __m512i values[4];
    const __m512i ml = _mm512_set1_epi8(0x03);
    BlockPermuter perm;
};

struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
        scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
    }

    Q4Bits bits;
    Scales8K s8k;
};

struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
    DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_values()) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        prepare(x[i].qs);
        auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h);
        s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
        auto scales256 = MM256_SET_M128I(scales128, scales128);
        auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1);
        scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
    }
    static __m512i load_values() {
        static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
        auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
        auto val256 = MM256_SET_M128I(val128, val128);
        return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1);
    }
    inline void prepare(const uint8_t * q4) {
        bits.prepare64(q4);
        // We now have in bits.valuse[0]: 0...15, 32...47, 64...79, 96...111
        //                bits.valuse[1]: 16..31, 48...63, 80...95, 112..127
        //                etc.
        auto tmp = _mm512_permutex2var_epi64(bits.values[0], permute1, bits.values[1]);
        bits.values[1] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[0], permute2, bits.values[1]));
        bits.values[0] = _mm512_shuffle_epi8(values, tmp);
        tmp = _mm512_permutex2var_epi64(bits.values[2], permute1, bits.values[3]);
        bits.values[3] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[2], permute2, bits.values[3]));
        bits.values[2] = _mm512_shuffle_epi8(values, tmp);
    }

    Q4Bits bits;
    Scales8K s8k;
    ScaleIQ4XS siq4;
    const __m512i values;
    const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2,  9,  8, 1, 0);
    const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
};

struct HighBit5 {
    inline void apply(const uint8_t * h, Q4Bits& bits) {
        auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
        auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(hbits, mh));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
    }
    const __m512i mh = _mm512_set1_epi8(0x10);
};

struct HighBit3 {
    inline void apply(const uint8_t * h, Q2Bits& bits) {
        auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
        auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(hbits, mh));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 4), mh));
    }
    const __m512i mh = _mm512_set1_epi8(0x04);
};

struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        hbits.apply(x[i].qh, bits);
        auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
        scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
    }

    Q4Bits bits;
    HighBit5 hbits;
    Scales8K s8k;
};

struct Scale16 {
    inline void make_scales(const __m128i& scales8, __m512i * scales) const {
        auto all_scales8 = MM256_SET_M128I(scales8, scales8);
        auto scales1 = _mm256_shuffle_epi8(all_scales8, shuffle1);
        auto scales2 = _mm256_shuffle_epi8(all_scales8, shuffle2);
        scales[0] = _mm512_cvtepi8_epi16(scales1);
        scales[1] = _mm512_cvtepi8_epi16(scales2);
    }
    template <typename Q8>
    inline void process_mins_and_scales(int i, float c, const __m128i& mins8, const __m128i& scales8,
        const Q8& q8, __m256 * accm, __m512i * scales) const {
        process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, c, accm);
        make_scales(scales8, scales);
    }
    const __m256i shuffle1 = _mm256_set_epi32(0x07070707, 0x03030303, 0x06060606, 0x02020202,
                                              0x05050505, 0x01010101, 0x04040404, 0x00000000);
    const __m256i shuffle2 = _mm256_set_epi32(0x0f0f0f0f, 0x0b0b0b0b, 0x0e0e0e0e, 0x0a0a0a0a,
                                              0x0d0d0d0d, 0x09090909, 0x0c0c0c0c, 0x08080808);
};

struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
        sc16.process_mins_and_scales(i, -GGML_FP16_TO_FP32(x[i].dmin), mins8, scales8, q8, accm, scales);
    }

    Q2Bits bits;
    Scale16 sc16;
    const __m128i m4 = _mm_set1_epi8(0xf);

};

struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        hbits.apply(x[i].hmask, bits);
        auto scales128 = sc3.make_scales((const uint16_t *)x[i].scales);
        sc16.process_mins_and_scales(i, -4.f*d, scales128, scales128, q8, accm, scales);
    }

    Q2Bits bits;
    HighBit3 hbits;
    ScaleQ3 sc3;
    Scale16 sc16;
    const __m128i m4  = _mm_set1_epi8(0xf);
    const __m128i m32 = _mm_set1_epi8(-32);
};

struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare64(x[i].ql);
        add_high_bits(x[i].qh, bits);
        auto scales128 = _mm_loadu_si128((const __m128i *)x[i].scales);
        sc16.process_mins_and_scales(i, -32.f*d, scales128, scales128, q8, accm, scales);
    }

    inline void add_high_bits(const uint8_t * qh, Q4Bits& bits) const {
        auto hbits = _mm512_loadu_si512((const __m512i *)qh);
        auto tmp1 = _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh);
        auto tmp2 = _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2));
        tmp1 = _mm512_and_si512(hbits, mh);
        tmp2 = _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh);
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2));
    }

    Q4Bits bits;
    HighBit3 hbits;
    Scale16 sc16;

    const __m512i mh = _mm512_set1_epi8(0x30);

};

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accm[nrc_y];
    __m512  accd[nrc_y];
    __m512i scales[2];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
        for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accm, scales);

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants(iy, i, 0));
                const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants(iy, i, 1));
                const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants(iy, i, 2));
                const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants(iy, i, 3));
                auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
                sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
                accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
            info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
        }

    }
}

#else
// ===================================== Vanilla AVX2 =====================================

struct Q4Bits {
    inline void prepare(const uint8_t * q4, int j) {
        auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0);
        values[0] = _mm256_and_si256(q4bits, ml);
        values[1] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
        q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1);
        values[2] = _mm256_and_si256(q4bits, ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
    }
    inline void prepare64(const uint8_t * q4, int j) {
        auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0);
        values[0] = _mm256_and_si256(q4bits, ml);
        values[2] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
        q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1);
        values[1] = _mm256_and_si256(q4bits, ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
    }
    inline void prepare16(const uint8_t * q4, int j) {
        values[0] = dequant16(q4 + 64*j +  0);
        values[1] = dequant16(q4 + 64*j + 16);
        values[2] = dequant16(q4 + 64*j + 32);
        values[3] = dequant16(q4 + 64*j + 48);
    }
    inline __m256i dequant16(const uint8_t * qs) const {
        const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
        const __m256i aux256 = MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128);
        return _mm256_and_si256(ml, aux256);
    };
    __m256i values[4];
    const __m256i ml = _mm256_set1_epi8(0xf);
};

struct Q2Bits {
    inline void prepare(const uint8_t * q2, int j) {
        auto q2bits = _mm256_loadu_si256((const __m256i *)q2 + j);
        values[0] = _mm256_and_si256(q2bits, ml);
        values[1] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), ml);
        values[2] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), ml);
    }
    __m256i values[4];
    const __m256i ml = _mm256_set1_epi8(0x03);
};

struct HighBit5 {
    inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); }
    inline void apply(Q4Bits& bits, bool do_shift) {
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 3), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh));
        if (do_shift) {
            hbits = _mm256_srli_epi16(hbits, 4);
        }
    }
    const __m256i mh = _mm256_set1_epi8(0x10);
    __m256i hbits;
};

struct HighBit3 {
    inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); }
    inline void apply(Q2Bits& bits, bool do_shift) {
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 1), mh));
        if (do_shift) {
            hbits = _mm256_srli_epi16(hbits, 4);
        }
    }
    const __m256i mh = _mm256_set1_epi8(0x04);
    __m256i hbits;
};

inline __m256i get_scale_shuffle_8(int i) {
    return _mm256_set1_epi16((2*i) | ((2*i+1) << 8));
}

inline void set_scales_8(const __m256i& all_scales, int j, __m256i * scales) {
    scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+0));
    scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+1));
    scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+2));
    scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+3));
}

template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
    if (j == 0) {
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
            sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4));
        }
    } else {
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
        }
    }
}

struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
    }

    Q4Bits bits;
    Scales8K s8k;
};

struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
    DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_values()) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h);
        s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
        return MM256_SET_M128I(scales128, scales128);
    }
    inline void prepare(int i, int j) {
        bits.prepare16(x[i].qs, j);
        bits.values[0] = _mm256_shuffle_epi8(values, bits.values[0]);
        bits.values[1] = _mm256_shuffle_epi8(values, bits.values[1]);
        bits.values[2] = _mm256_shuffle_epi8(values, bits.values[2]);
        bits.values[3] = _mm256_shuffle_epi8(values, bits.values[3]);
    }

    static __m256i load_values() {
        static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
        auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
        return MM256_SET_M128I(val128, val128);
    }

    Q4Bits bits;
    Scales8K s8k;
    ScaleIQ4XS siq4;
    const __m256i values;
};

struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        hbits.load(x[i].qh);
        return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
        hbits.apply(bits, j == 0);
    }

    Q4Bits  bits;
    HighBit5 hbits;
    Scales8K s8k;
};

template <typename Q8>
inline void process_mins_and_scales_16(const __m128i& scales128, const Q8& q8, int i, float d,
    __m256 * accm, __m256i * scales) {
    const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
    process_mins_16(all_scales, q8, i, d, accm);
    prepare_scales_16(all_scales, scales);
}

struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}

    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        hbits.load(x[i].hmask);
        process_mins_and_scales_16(sc3.make_scales((const uint16_t *)x[i].scales), q8, i, -4.f*d, accm, scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
        hbits.apply(bits, j == 0);
    }

    Q2Bits  bits;
    HighBit3 hbits;
    ScaleQ3 sc3;

    const __m128i m32 = _mm_set1_epi8(-32);
};

struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}

    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
        process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, -GGML_FP16_TO_FP32(x[i].dmin), accm);
        prepare_scales_16(_mm256_cvtepi8_epi16(scales8), scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
    }

    Q2Bits  bits;

    const __m128i m4 = _mm_set1_epi8(0xf);
};

struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        process_mins_and_scales_16(_mm_loadu_si128((const __m128i *)x[i].scales), q8, i, -32.f*d, accm, scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare64(x[i].ql, j);
        auto hbits = _mm256_loadu_si256((const __m256i *)x[i].qh + j);
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 2), mh));
    }

    Q4Bits  bits;
    const __m256i mh = _mm256_set1_epi8(0x30);
};

inline __m256i get_scale_shuffle_16(int i) {
    static const uint8_t k_shuffle[128] = {
         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
    };
    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
}

inline void set_scales_16(const __m256i& all_scales, __m256i * scales) {
    scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(0));
    scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(1));
    scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(2));
    scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(3));
}

template <typename Dequantizer, int nrc_y>
static void mul_mat_qY_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%QK_K == 0);
    const int nb = n/QK_K;

    Q8<nrc_y> q8(info);

    __m256i all_scales[2];
    __m256i scales[4];
    __m256  accd[nrc_y];

    Dequantizer deq(vx, bx);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accd, all_scales);

            __m256i sumi[nrc_y];

            for (int j = 0; j < QK_K/128; ++j) {
                deq.prepare(i, j);
                set_scales_16(all_scales[j], scales);
                multiply_add(deq.bits, scales, j, i, q8, sumi);
            }

            for (int iy = 0; iy < nrc_y; ++iy) {
                accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(iy, i)), _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }

    }

}

template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accd[nrc_y];
    __m256i scales[4];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            auto all_scales = deq.new_block(i, q8, accd);

            __m256i sumi[nrc_y];

            for (int j = 0; j < QK_K/128; ++j) {

                deq.prepare(i, j);

                set_scales_8(all_scales, j, scales);

                multiply_add(deq.bits, scales, j, i, q8, sumi);

            }

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i));
                accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }

    }
}
#endif  // Zen4 or vanilla AVX2

//
// ============================== Legacy quants
//

struct DotHelper {
    const __m256i m1 = _mm256_set1_epi16(1);
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
    inline __m256i dot(__m256i x, __m256i y) const {
        return _mm256_dpbusd_epi32(_mm256_setzero_si256(), x, y);
    }
#else
    inline __m256i dot(__m256i x, __m256i y) const {
        return _mm256_madd_epi16(m1, _mm256_maddubs_epi16(x, y));
    }
#endif
};

struct SignedDot {
    DotHelper helper;
    inline __m256i compute(__m256i x, __m256i y) const {
        return helper.dot(_mm256_sign_epi8(x, x), _mm256_sign_epi8(y, x));
    }
};
struct UnsignedDot {
    DotHelper helper;
    inline __m256i compute(__m256i x, __m256i y) const {
        return helper.dot(x, y);
    }
};
template <typename Q8, typename Dot> struct Sum4 {
    Dot dot;
    inline __m256i compute(const __m256i * qx, const Q8 * y) const {
        const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
        const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
        const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
        const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
        const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1));    // 0,0, 1,1, 0,0, 1,1
        const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3));    // 2,2, 3,3, 2,2, 3,3
        return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3
    }
};

struct Sum4_Q8 {
    SignedDot dot;
    static inline __m256i add1(__m256i a, __m256i b) {
        return _mm256_add_epi32(_mm256_unpacklo_epi32(a, b), _mm256_unpackhi_epi32(a, b));
    }
    static inline __m256i add2(__m256i a, __m256i b) {
        return _mm256_add_epi32(_mm256_unpacklo_epi64(a, b), _mm256_unpackhi_epi64(a, b));
    }
    inline __m256i compute(const __m256i * qx, const block_q8_0 * y) const {
        const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
        const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
        const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
        const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
        const __m256i p01 = add1(p0, p1);  // 0,1, 0,1, 0,1, 0,1
        const __m256i p23 = add1(p2, p3);  // 2,3, 2,3, 2,3, 2,3
        return add2(p01, p23); // returns 0,1,2,3, 0,1,2,3
    }
};

struct ScaleHelperQ_0 {
    ggml_half scales8[4];
    template <typename Q>
    inline __m128 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
        return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
    }
    template <typename Q>
    inline __m128 prepare4(__m128 other_scales, const Q * y) {
        return _mm_mul_ps(other_scales, prepare4<Q>(y));
    }
    template <typename Q> inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); }
    template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
};

struct ScaleHelperQ_1 {
    uint32_t scales8[4];
    const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100);

    template <typename Q>
    inline __m256 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) {
            // it is slightly faster to directly dereference (const uint32 *)&y[j].d, but some compilers
            // complain that this breaks strict-aliasing rules.
            memcpy(scales8 + j, &y[j].d, sizeof(uint32_t));
        }
        return _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)scales8), shuffle));
    }

    template <typename Q>
    inline __m256 prepare4(__m256 other_scales, const Q * y) {
        return _mm256_mul_ps(other_scales, prepare4<Q>(y));
    }

    template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
        return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m));
    }
    template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m));
    }
    std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
    }
};

struct MinusType0 {
    inline __m256 compute(__m128 d, int) const { return _mm256_set_m128(d, d); }
    inline float compute(float d, int) const { return d; }
    inline float result(__m256 acc, int) const { return hsum_float_8(acc); }
};

template <int nrc_y> struct MinusType1 {
    __m128 accm[nrc_y];
    MinusType1() { for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm_setzero_ps(); }
    inline __m256 compute(__m256 dm, int iy) {
        const __m128 d = _mm256_castps256_ps128(dm);
        const __m128 m = _mm256_extractf128_ps(dm, 1);
        accm[iy] = _mm_add_ps(accm[iy], m);
        return _mm256_set_m128(d, d);
    }
    inline float compute(const std::pair<float, float>& dm, int iy) {
        accm[iy] = _mm_add_ps(accm[iy], _mm_set1_ps(dm.second*0.25f));
        return dm.first;
    }
    inline float result(__m256 acc, int iy) const {
        const __m128 sum = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
        return hsum_float_4(_mm_add_ps(sum, accm[iy]));
    }
};

template <typename Minus, int nrc_y, bool is_multiple_of_4> struct AccumT {
    __m256 acc[nrc_y];
    Minus accm;
    AccumT() {  for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = _mm256_setzero_ps(); }
    template <typename Unpacker, typename Scales, typename Sum, typename Q8>
    inline void compute(int nb, Unpacker& unp, Scales& scales, Sum& sum, const Q8 ** y, const DataInfo& info, int ix) {
        auto qx = unp.quants();
        __m256 dall[nrc_y];
        for (int i = 0; i < nb/4; ++i) {
            auto other_scales = unp.set_block_4(i);
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto s12 = scales.prepare4(other_scales, y[iy] + 4*i);
                dall[iy] = accm.compute(s12, iy);
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto pall = sum.compute(qx, y[iy] + 4*i);
                acc[iy] = _mm256_fmadd_ps(dall[iy], _mm256_cvtepi32_ps(pall), acc[iy]);
            }
        }
        if (!is_multiple_of_4) {
            for (int i = 4*(nb/4); i < nb; ++i) {
                auto other_scales = unp.set_block(i);
                for (int iy = 0; iy < nrc_y; ++iy) {
                    auto s12 = scales.prepare1(other_scales, y[iy] + i);
                    auto d = accm.compute(s12, iy);
                    const __m256i p0 = sum.dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[iy][i].qs));
                    acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(p0), acc[iy]);
                }
            }
        }
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, accm.result(acc[iy], iy));
            //s[iy*bs] = accm.result(acc[iy], iy);
        }
    }
};

template <int nrc_y, bool is_multiple_of_4>
using AccumType0 = AccumT<MinusType0, nrc_y, is_multiple_of_4>;

template <int nrc_y, bool is_multiple_of_4>
using AccumType1 = AccumT<MinusType1<nrc_y>, nrc_y, is_multiple_of_4>;

using Sum4Type0 = Sum4<block_q8_0, SignedDot>;
using Sum4Type1 = Sum4<block_q8_1, UnsignedDot>;

template <typename Unpacker, typename Sum4Type, typename AccumType, typename Scales, typename Q8, int nrc_y>
void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) {
    Unpacker unp(vx, bx);
    Sum4Type sum4;
    Scales scales;
    for (int ix = 0; ix < nrc_x; ++ix) {
        unp.set_row(ix);
        AccumType accum;
        accum.compute(nb, unp, scales, sum4, y, info, ix);
    }
}

template <typename Unpacker, int nrc_y>
void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_0> q8(info);
    int nb = n/Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, true>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, false>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}

template <typename Unpacker, int nrc_y>
void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_1> q8(info);
    int nb = n/Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, true>, ScaleHelperQ_1, block_q8_1, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, false>, ScaleHelperQ_1, block_q8_1, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}

struct Dequantizer4bit {
    const __m256i m4 = _mm256_set1_epi8(0xf);
    inline __m256i dequant(const uint8_t * qs) const {
        const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
        return _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), m4);
    }
};

struct Q8_0_Dequantizer {
    inline __m256i dequant(const block_q8_0 * x) const {
        return _mm256_loadu_si256((const __m256i *)x->qs);
    }
};

struct Q4_0_Dequantizer {
    Dequantizer4bit b4;
    const __m256i m8 = _mm256_set1_epi8(-8);
    inline __m256i dequant(const block_q4_0 * x) const {
        return _mm256_add_epi8(b4.dequant(x->qs), m8);
    }
};

struct Q4_1_Dequantizer {
    Dequantizer4bit b4;
    inline __m256i dequant(const block_q4_1 * x) const {
        return b4.dequant(x->qs);
    }
};

struct HBitDequantizer {
    const __m256i shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
    const __m256i mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
    const __m256i minus1 = _mm256_set1_epi64x(-1);
    inline __m256i to_bytes(const uint8_t * bits) const {
        // Note: Data in all ggml quants is at least 2-byte aligned.
        // => we can cast to uint16_t and use or on two consecutive entries
        // which is faster than memcpy
        const uint16_t * aux16 = (const uint16_t *)bits;
        const uint32_t aux32 = aux16[0] | (aux16[1] << 16);
        //uint32_t aux32; memcpy(&aux32, bits, sizeof(uint32_t));
        __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(aux32), shuffle);
        bytes = _mm256_or_si256(bytes, mask);
        return _mm256_cmpeq_epi8(bytes, minus1);
    }
};

struct Q5_0_Dequantizer {
    Dequantizer4bit b4;
    HBitDequantizer hbit;
    const __m256i mh = _mm256_set1_epi8((char)0xF0);
    inline __m256i dequant(const block_q5_0 * x) const {
        const __m256i vqh = _mm256_andnot_si256(hbit.to_bytes(x->qh), mh);
        return _mm256_or_si256(b4.dequant(x->qs), vqh);
    }
};

struct Q5_1_Dequantizer {
    Dequantizer4bit b4;
    HBitDequantizer hbit;
    const __m256i mh = _mm256_set1_epi8(0x10);
    inline __m256i dequant(const block_q5_1 * x) const {
        const __m256i vqh = _mm256_and_si256(hbit.to_bytes(x->qh), mh);
        return _mm256_or_si256(b4.dequant(x->qs), vqh);
    }
};

template <typename Q, typename Scales, typename Dequantizer>
struct Q_Unpacker {
    Q_Unpacker(const void * vx, size_t bx) : cx_0((const char *)vx), x((const Q*)cx_0), bx(bx) {}

    const char * cx_0;
    const Q    * x;
    size_t       bx;

    Scales scales;
    Dequantizer deq;

    __m256i qx[4];

    inline const __m256i* quants() const { return qx; }

    inline void set_row(int ix) { x = (const Q*)(cx_0 + ix*bx); }

    inline auto set_block_4(int i) {
        for (int j = 0; j < 4; ++j) {
            qx[j] = deq.dequant(x + 4*i + j);
        }
        return scales.prepare4(x + 4*i);
    }
    inline auto set_block(int i) {
        qx[0] = deq.dequant(x + i);
        return scales.prepare1(x + i);
    }
};

struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_Dequantizer> {
    Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_0; }
};
struct Q4_0_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0, Q4_0_Dequantizer> {
    Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_0; }
};
struct Q5_0_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0, Q5_0_Dequantizer> {
    Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK5_0; }
};
struct Q4_1_Unpacker final : public Q_Unpacker<block_q4_1, ScaleHelperQ_1, Q4_1_Dequantizer> {
    Q4_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_1; }
};
struct Q5_1_Unpacker final : public Q_Unpacker<block_q5_1, ScaleHelperQ_1, Q5_1_Dequantizer> {
    Q5_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_1; }
};

template <int nrc_y>
void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n%Q8_0_Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_0> q8(info);
    int nb = n/Q8_0_Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Q8_0_Unpacker, Sum4_Q8, AccumType0<nrc_y, true>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Q8_0_Unpacker, Sum4_Q8, AccumType0<nrc_y, false>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}

template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
        if constexpr (std::is_same_v<Dequantizer, Q4_0_Unpacker> || std::is_same_v<Dequantizer, Q5_0_Unpacker>) {
            m.funcs[0] = mul_mat_qX_0_q8_0_T<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qX_0_q8_0_T<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_0_q8_0_T<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_0_q8_0_T<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_0_q8_0_T<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_0_q8_0_T<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_0_q8_0_T<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_0_q8_0_T<Dequantizer, 8>;
        }
        else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker>) {
            m.funcs[0] = mul_mat_qX_1_q8_1_T<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qX_1_q8_1_T<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_1_q8_1_T<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_1_q8_1_T<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_1_q8_1_T<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_1_q8_1_T<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_1_q8_1_T<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_1_q8_1_T<Dequantizer, 8>;
        }
        else {
#ifdef HAVE_FANCY_SIMD
            m.funcs[0] = mul_mat_qX_K_q8_K_T<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qX_K_q8_K_T<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_K_q8_K_T<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_K_q8_K_T<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_K_q8_K_T<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_K_q8_K_T<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_K_q8_K_T<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_K_q8_K_T<Dequantizer, 8>;
#else
            if constexpr (std::is_same_v<Dequantizer, DequantizerQ2K> ||
                          std::is_same_v<Dequantizer, DequantizerQ3K> ||
                          std::is_same_v<Dequantizer, DequantizerQ6K>) {
                m.funcs[0] = mul_mat_qY_K_q8_K_T<Dequantizer, 1>;
                m.funcs[1] = mul_mat_qY_K_q8_K_T<Dequantizer, 2>;
                m.funcs[2] = mul_mat_qY_K_q8_K_T<Dequantizer, 3>;
                m.funcs[3] = mul_mat_qY_K_q8_K_T<Dequantizer, 4>;
                m.funcs[4] = mul_mat_qY_K_q8_K_T<Dequantizer, 5>;
                m.funcs[5] = mul_mat_qY_K_q8_K_T<Dequantizer, 6>;
                m.funcs[6] = mul_mat_qY_K_q8_K_T<Dequantizer, 7>;
                m.funcs[7] = mul_mat_qY_K_q8_K_T<Dequantizer, 8>;
            } else {
                m.funcs[0] = mul_mat_qX_K_q8_K_T<Dequantizer, 1>;
                m.funcs[1] = mul_mat_qX_K_q8_K_T<Dequantizer, 2>;
                m.funcs[2] = mul_mat_qX_K_q8_K_T<Dequantizer, 3>;
                m.funcs[3] = mul_mat_qX_K_q8_K_T<Dequantizer, 4>;
                m.funcs[4] = mul_mat_qX_K_q8_K_T<Dequantizer, 5>;
                m.funcs[5] = mul_mat_qX_K_q8_K_T<Dequantizer, 6>;
                m.funcs[6] = mul_mat_qX_K_q8_K_T<Dequantizer, 7>;
                m.funcs[7] = mul_mat_qX_K_q8_K_T<Dequantizer, 8>;
            }
#endif
        }
}

bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int) {

    if (ne00 % ggml_blck_size(GGML_TYPE_Q8_K) == 0)
        row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);

    switch (typeA) {
        case GGML_TYPE_Q2_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ2K>(mm);
            break;
        case GGML_TYPE_Q3_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ3K>(mm);
            break;
        case GGML_TYPE_Q4_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ4K>(mm);
            break;
        case GGML_TYPE_Q5_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ5K>(mm);
            break;
        case GGML_TYPE_Q6_K:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ6K>(mm);
            break;
        case GGML_TYPE_IQ4_XS:
            assert (ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerIQ4XS>(mm);
            break;
        case GGML_TYPE_Q4_0:
            assert (ne00 % QK4_0 == 0);
            MulMat::set_functions<Q4_0_Unpacker>(mm);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q4_1:
            assert (ne00 % QK4_1 == 0);
            MulMat::set_functions<Q4_1_Unpacker>(mm);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;
        case GGML_TYPE_Q5_0:
            assert (ne00 % QK5_0 == 0);
            MulMat::set_functions<Q5_0_Unpacker>(mm);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q5_1:
            assert (ne00 % QK5_1 == 0);
            MulMat::set_functions<Q5_1_Unpacker>(mm);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;

        default:
            return false;
    }

    return true;
}

} // namespace


#else   // __aarch64__

//[kawrakow] Need these two for performance on Arm
typedef struct {
    ggml_half d[8];
    int8_t qs[4*QK8_1];
} block_q8_1_x4;
static_assert(sizeof(block_q8_1_x4) == 4*sizeof(block_q8_1), "wrong q8_1_x4 block size/padding");
typedef struct {
    ggml_half d[4];
    int8_t qs[4*QK8_0];
} block_q8_0_x4;
static_assert(sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0), "wrong q8_0_x4 block size/padding");

namespace {

template <int nrc, typename block_q8 = block_q8_K> struct Q8 {

    constexpr static int nrc_y = nrc;

    Q8(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
    }

    inline int8x16_t load_quants_16(int iy, int i, int j) const { return vld1q_s8(y[iy][i].qs + 16*j); }
    inline int8x16x2_t load_quants(int iy, int i, int j) const { return vld1q_s8_x2(y[iy][i].qs + 32*j); }
    inline int8x16x4_t load_quants_64(int iy, int i, int j) const { return vld1q_s8_x4(y[iy][i].qs + 64*j); }
    inline int16x8x2_t load_bsums(int iy, int i) const { return vld1q_s16_x2(y[iy][i].bsums); }
    inline int16x8_t load_bsums8(int iy, int i) const {
        auto q8s = vld1q_s16_x2(y[iy][i].bsums);
        return vpaddq_s16(q8s.val[0], q8s.val[1]);
    }
    inline float scale(int iy, int i) const { return y[iy][i].d; }

    const block_q8 * y[nrc_y];
};

template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);

    Dequantizer deq(vx, bx, nrc_y);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[nrc_y];
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

//#pragma GCC unroll 4
        for (int i = 0; i < nb; ++i) {

            int32x4_t sumi[nrc_y];
            for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0);

            if constexpr (nrc_y > 1 && Dequantizer::should_scale_quants()) {
                deq.process_scales(i, q8, acc);
                deq.prepare(i, 0);
                deq.compute(q8, i, 0, sumi);
                deq.prepare(i, 1);
                deq.compute(q8, i, 1, sumi);
            } else {
                if constexpr (Dequantizer::num_blocks() == 8) {
                    auto scales = deq.new_block(i, q8, acc);
                    deq.prepare(i, 0);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                    deq.prepare(i, 1);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
                }
                else if constexpr (Dequantizer::num_blocks() == 16) {
                    auto scales = deq.new_block(i, q8, acc);
                    deq.prepare(i, 0);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                    deq.prepare(i, 1);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
                }
                else {
                    GGML_ASSERT(false);
                }
            }

#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i)));
            }
        }

#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}
template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);

    Dequantizer deq(vx, bx, nrc_y);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[nrc_y];
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        for (int i = 0; i < nb; ++i) {

            int32x4_t sumi[nrc_y];
            for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0);

            if constexpr (Dequantizer::num_blocks() == 8) {
                auto scales = deq.new_block(i);
                deq.prepare(i, 0);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                deq.prepare(i, 1);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
            }
            else if constexpr (Dequantizer::num_blocks() == 16) {
                auto scales = deq.new_block(i);
                deq.prepare(i, 0);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                deq.prepare(i, 1);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
            }
            else {
                GGML_ASSERT(false);
            }
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i)));
            }
        }
#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

template <typename Q8>
IQK_ALWAYS_INLINE void compute_8_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8,
        const int32x4x2_t& scales, int iy, int i, int j, int32x4_t& sumi) {
    auto mzero = vdupq_n_s32(0);
    const int8x16_t * qs_1 = (const int8x16_t *)qx_1.val;
    const int8x16_t * qs_2 = (const int8x16_t *)qx_2.val;

    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[0], q8b_1.val[0]), qs_1[1], q8b_1.val[1]); // block 1
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[2], q8b_2.val[0]), qs_1[3], q8b_2.val[1]); // block 2
    auto p12 = vpaddq_s32(p1, p2);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[0], q8b_3.val[0]), qs_2[1], q8b_3.val[1]); // block 3
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[2], q8b_4.val[0]), qs_2[3], q8b_4.val[1]); // block 4
    auto p34 = vpaddq_s32(p3, p4);

    auto pall = vpaddq_s32(p12, p34);
    sumi = vmlaq_s32(sumi, scales.val[j], pall);
}
template <typename Q8>
IQK_ALWAYS_INLINE void compute_8_blocks(const int8x16_t * qx, const Q8& q8,
        const int32x4_t& scales, int iy, int i, int j, int32x4_t& sumi) {
    auto mzero = vdupq_n_s32(0);

    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[0], q8b_1.val[0]), qx[1], q8b_1.val[1]); // block 1
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[2], q8b_2.val[0]), qx[3], q8b_2.val[1]); // block 2
    auto p12 = vpaddq_s32(p1, p2);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[4], q8b_3.val[0]), qx[5], q8b_3.val[1]); // block 3
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[6], q8b_4.val[0]), qx[7], q8b_4.val[1]); // block 4
    auto p34 = vpaddq_s32(p3, p4);

    auto pall = vpaddq_s32(p12, p34);
    sumi = vmlaq_s32(sumi, scales, pall);
}

template <typename Q8>
IQK_ALWAYS_INLINE void compute_16_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8,
        const int32x4x4_t& scales, int iy, int i, int j, int32x4_t& sumi) {

    auto mzero = vdupq_n_s32(0);
    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[0]), q8b_1.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[1]), q8b_1.val[1])); // blocks 0, 0, 1, 1,
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[2]), q8b_2.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[3]), q8b_2.val[1])); // blocks 3, 3, 4, 4,
    auto p12 = vpaddq_s32(p1, p2); // blocks 0, 1, 2, 3
    sumi = vmlaq_s32(sumi, scales.val[2*j+0], p12);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[0]), q8b_3.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[1]), q8b_3.val[1])); // block 4, 4, 5, 5,
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[2]), q8b_4.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[3]), q8b_4.val[1])); // block 6, 6, 7, 7,
    auto p34 = vpaddq_s32(p3, p4); // blocks 4, 5, 6, 7
    sumi = vmlaq_s32(sumi, scales.val[2*j+1], p34);
}

template <typename Q8>
inline void accum_mins_8(const int16x8_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto q8s = q8.load_bsums8(iy, i);
        int32x4_t b1 = vmull_s16(vget_low_s16(mins), vget_low_s16(q8s));
        int32x4_t b2 = vmull_s16(vget_high_s16(mins), vget_high_s16(q8s));
        float32x4_t prod = vcvtq_f32_s32(vaddq_s32(b1, b2));
        acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i)));
    }
}
template <typename Q8>
inline void accum_mins_16(const int16x8x2_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto q8s = q8.load_bsums(iy, i);
        int32x4_t b1 = vmull_s16(vget_low_s16 (mins.val[0]), vget_low_s16 (q8s.val[0]));
        int32x4_t b2 = vmull_s16(vget_high_s16(mins.val[0]), vget_high_s16(q8s.val[0]));
        int32x4_t b3 = vmull_s16(vget_low_s16 (mins.val[1]), vget_low_s16 (q8s.val[1]));
        int32x4_t b4 = vmull_s16(vget_high_s16(mins.val[1]), vget_high_s16(q8s.val[1]));
        float32x4_t prod = vcvtq_f32_s32(vaddq_s32(vaddq_s32(b1, b2), vaddq_s32(b3, b4)));
        acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i)));
    }
}

struct Scales8 {
    uint32_t utmp[4];
    const uint8_t * sc8 = (const uint8_t *)utmp;
    template <typename Q8, typename Qx>
    inline int32x4x2_t process_scales_mins(const Qx& x, const Q8& q8, int i, float32x4_t * acc) {
        make_q4_scales(x.scales, utmp);
        int16x8_t mins = vmovl_s8(vld1_s8((const int8_t *)sc8 + 8));
        accum_mins_8(mins, q8, acc, i, -GGML_FP16_TO_FP32(x.dmin));

        uint8x8_t scales8 = vld1_u8(sc8);
        uint16x8_t scales16 = vmovl_u8(scales8);
        int32x4x2_t scales = {vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales16))),
                              vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales16)))};
        return scales;
    }
};

struct Q4bits {
    const uint8x16_t m4b = vdupq_n_u8(0xf);
    uint8x16x4_t b1, b2;
    inline void prepare4(uint8x16x4_t& b, const uint8x16_t * val) const {
        b.val[0] = vandq_u8(val[0], m4b);
        b.val[2] = vshrq_n_u8(val[0], 4);
        b.val[1] = vandq_u8(val[1], m4b);
        b.val[3] = vshrq_n_u8(val[1], 4);
    }
    inline void prepare4_16(uint8x16x4_t& b, const uint8x16_t * val) const {
        b.val[0] = vandq_u8(val[0], m4b);
        b.val[1] = vshrq_n_u8(val[0], 4);
        b.val[2] = vandq_u8(val[1], m4b);
        b.val[3] = vshrq_n_u8(val[1], 4);
    }
    inline void prepare(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x2(qs);
        prepare4(b1, q4bits.val);
        q4bits = vld1q_u8_x2(qs+32);
        prepare4(b2, q4bits.val);
    }
    inline void prepare_v2(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        prepare4(b1, q4bits.val+0);
        prepare4(b2, q4bits.val+2);
    }
    inline void prepare64(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        b1.val[0] = vandq_u8(q4bits.val[0], m4b);
        b1.val[1] = vandq_u8(q4bits.val[1], m4b);
        b1.val[2] = vandq_u8(q4bits.val[2], m4b);
        b1.val[3] = vandq_u8(q4bits.val[3], m4b);
        b2.val[0] = vshrq_n_u8(q4bits.val[0], 4);
        b2.val[1] = vshrq_n_u8(q4bits.val[1], 4);
        b2.val[2] = vshrq_n_u8(q4bits.val[2], 4);
        b2.val[3] = vshrq_n_u8(q4bits.val[3], 4);
    }
    inline void prepare16(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x2(qs);
        prepare4_16(b1, q4bits.val);
        q4bits = vld1q_u8_x2(qs+32);
        prepare4_16(b2, q4bits.val);
    }
    inline void prepare16_v2(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        prepare4_16(b1, q4bits.val+0);
        prepare4_16(b2, q4bits.val+2);
    }
};

struct Q2bits {
    const uint8x16_t m4b = vdupq_n_u8(0x03);
    uint8x16x4_t b1, b2;
    inline void prepare(const uint8_t * qs) {
        auto q2bits = vld1q_u8_x2(qs);
        b1.val[0] = vandq_u8(q2bits.val[0], m4b);
        b1.val[1] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b1.val[2] = vandq_u8(q2bits.val[0], m4b);
        b1.val[3] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b2.val[0] = vandq_u8(q2bits.val[0], m4b);
        b2.val[1] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b2.val[2] = vandq_u8(q2bits.val[0], m4b);
        b2.val[3] = vandq_u8(q2bits.val[1], m4b);
    }
};

template <typename block_q>
struct BaseDequantizer {
    BaseDequantizer(const void * vx, size_t bx, int nrc) : vx(vx), x(nullptr), bx(bx), nrc(nrc) {}
    inline void new_row(int ix) { x = (const block_q *)((const char *)vx + ix*bx); }
    const void * vx;
    const block_q * x;
    const size_t bx;
    const int nrc;
};

struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return s8.process_scales_mins(x[i], q8, i, acc);
    }
    inline void prepare(int i, int j) {
        if (nrc == 1) bits.prepare_v2(x[i].qs+64*j);
        else bits.prepare(x[i].qs+64*j);
    }

    Q4bits bits;
    Scales8 s8;

    float d;
};

struct HighBit5 {
    const uint8x16_t mhb = vdupq_n_u8(0x10);
    uint8x16x2_t bits;
    inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) {
        b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 4), mhb));
        b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 4), mhb));
        b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 3), mhb));
        b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 3), mhb));

        b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb));
        b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb));
        b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb));
        b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb));

        if (do_shift) {
            bits.val[0] = vshrq_n_u8(bits.val[0], 4);
            bits.val[1] = vshrq_n_u8(bits.val[1], 4);
        }
    }
};

struct HighBit3 {
    const uint8x16_t mhb = vdupq_n_u8(0x04);
    uint8x16x2_t bits;
    inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) {
        b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb));
        b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb));
        b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb));
        b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb));

        b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(bits.val[0], mhb));
        b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(bits.val[1], mhb));
        b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshrq_n_u8(bits.val[0], 1), mhb));
        b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshrq_n_u8(bits.val[1], 1), mhb));

        if (do_shift) {
            bits.val[0] = vshrq_n_u8(bits.val[0], 4);
            bits.val[1] = vshrq_n_u8(bits.val[1], 4);
        }
    }
};

struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        h.bits = vld1q_u8_x2(x[i].qh);
        return s8.process_scales_mins(x[i], q8, i, acc);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+64*j);
        h.apply(bits.b1, bits.b2, j == 0);
    }

    Q4bits bits;
    HighBit5 h;
    Scales8 s8;

    uint8x16x2_t hbits;

    float d;
};

inline int32x4x4_t make_wider(const int16x8x2_t& scales16) {
    int32x4x4_t scales = {
        vmovl_s16(vget_low_s16 (scales16.val[0])),
        vmovl_s16(vget_high_s16(scales16.val[0])),
        vmovl_s16(vget_low_s16 (scales16.val[1])),
        vmovl_s16(vget_high_s16(scales16.val[1])),
    };
    return scales;
}

template <typename Q8>
inline int32x4x4_t process_scales_mins_16(const int8x16_t& scales8, const Q8& q8, float32x4_t * acc, int i, float c) {
    int16x8x2_t scales16;
    scales16.val[0] = vmovl_s8(vget_low_s8(scales8));
    scales16.val[1] = vmovl_s8(vget_high_s8(scales8));
    accum_mins_16(scales16, q8, acc, i, c);
    return make_wider(scales16);
}

struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return process_scales_mins_16(vld1q_s8(x[i].scales), q8, acc, i, -32.f*d);
    }
    inline void prepare(int i, int j) {

        auto hbits = vld1q_u8_x2(x[i].qh + 32*j);

        bits.prepare64(x[i].ql+64*j);
        bits.b1.val[0] = vorrq_u8(bits.b1.val[0], vandq_u8(vshlq_n_u8(hbits.val[0], 4), mhb));
        bits.b1.val[1] = vorrq_u8(bits.b1.val[1], vandq_u8(vshlq_n_u8(hbits.val[1], 4), mhb));
        bits.b1.val[2] = vorrq_u8(bits.b1.val[2], vandq_u8(vshlq_n_u8(hbits.val[0], 2), mhb));
        bits.b1.val[3] = vorrq_u8(bits.b1.val[3], vandq_u8(vshlq_n_u8(hbits.val[1], 2), mhb));

        bits.b2.val[0] = vorrq_u8(bits.b2.val[0], vandq_u8(hbits.val[0], mhb));
        bits.b2.val[1] = vorrq_u8(bits.b2.val[1], vandq_u8(hbits.val[1], mhb));
        bits.b2.val[2] = vorrq_u8(bits.b2.val[2], vandq_u8(vshrq_n_u8(hbits.val[0], 2), mhb));
        bits.b2.val[3] = vorrq_u8(bits.b2.val[3], vandq_u8(vshrq_n_u8(hbits.val[1], 2), mhb));

    }

    Q4bits bits;

    const uint8x16_t mhb = vdupq_n_u8(0x30);

    float d;
};

struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        h.bits = vld1q_u8_x2(x[i].hmask);
        const uint16_t * sc16 = (const uint16_t *)x[i].scales;
        uint32_t aux0 = sc16[0] | (sc16[1] << 16);
        uint32_t aux1 = sc16[2] | (sc16[3] << 16);
        uint32_t aux2 = sc16[4] | (sc16[5] << 16);
        aux32[0] =  (aux0       & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030);
        aux32[1] =  (aux1       & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030);
        aux32[2] = ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030);
        aux32[3] = ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030);
        return process_scales_mins_16(vaddq_s8(vld1q_s8((const int8_t *)aux32), vdupq_n_s8(-32)), q8, acc, i, -4.f*d);
    }

    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+32*j);
        h.apply(bits.b1, bits.b2, j == 0);
    }

    uint32_t aux32[4];

    Q2bits bits;

    HighBit3 h;

    float d;
};

struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return true; }

    template <typename Q8>
    inline void process_scales(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        auto scales_and_mins = vld1q_u8(x[i].scales);
        auto mins8 = vreinterpretq_s8_u8(vshrq_n_u8(scales_and_mins, 4));
        int16x8x2_t scales16;
        scales16.val[0] = vmovl_s8(vget_low_s8(mins8));
        scales16.val[1] = vmovl_s8(vget_high_s8(mins8));
        accum_mins_16(scales16, q8, acc, i, -GGML_FP16_TO_FP32(x[i].dmin));

        scales8 = vandq_u8(scales_and_mins, vdupq_n_u8(0xf));
    }

    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        process_scales(i, q8, acc);
        int16x8x2_t scales16;
        scales16.val[0] = vmovl_s8(vget_low_s8(vreinterpretq_s8_u8(scales8)));
        scales16.val[1] = vmovl_s8(vget_high_s8(vreinterpretq_s8_u8(scales8)));
        return make_wider(scales16);
    }

    template <typename Q8>
    inline void compute(const Q8& q8, int i, int j, int32x4_t * sumi) {
        auto m1 = vdupq_n_u8(1);
        auto shuffle = vdupq_n_u8(8*j);
        bits.b1.val[0] = vmulq_u8(bits.b1.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[1] = vmulq_u8(bits.b1.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[2] = vmulq_u8(bits.b1.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[3] = vmulq_u8(bits.b1.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[0] = vmulq_u8(bits.b2.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[1] = vmulq_u8(bits.b2.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[2] = vmulq_u8(bits.b2.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[3] = vmulq_u8(bits.b2.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[0]), q8b_1.val[0]),
                    vreinterpretq_s8_u8(bits.b1.val[1]), q8b_1.val[1]);

            auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[2]), q8b_2.val[0]),
                    vreinterpretq_s8_u8(bits.b1.val[3]), q8b_2.val[1]);

            auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[0]), q8b_3.val[0]),
                    vreinterpretq_s8_u8(bits.b2.val[1]), q8b_3.val[1]);

            auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[2]), q8b_4.val[0]),
                    vreinterpretq_s8_u8(bits.b2.val[3]), q8b_4.val[1]);
        }
    }

    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+32*j);
    }

    uint32_t aux32[4];

    uint8x16_t scales8;

    Q2bits bits;

    float d;
};

// ============================= i-quants

struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {

    static int8x16_t load_values() {
        static const int8_t iq4nl_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
        return vld1q_s8(iq4nl_values);
    }

    DequantizerIQ4XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc), values(load_values()) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    inline void new_row(int ix) { x = (const block_iq4_xs *)((const char *)vx + bx*ix); }

    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        (void)q8;
        (void)acc;
        d = GGML_FP16_TO_FP32(x[i].d);
        const uint16_t scales_h = x[i].scales_h;
        const uint16_t * scales_l = (const uint16_t *)x[i].scales_l;
        aux32[0] = scales_l[0] | (scales_l[1] << 16);
        aux32[1] = aux32[0] >> 4;
        // scl is ordered as 0, 2, 4, 6, 1, 3, 5, 7
        uint8x8_t scl8 = vand_u8(vld1_u8((const uint8_t *)aux32), vdup_n_u8(0xf));
        uint16_t * aux16 = (uint16_t *)aux32;
        aux16[0] = scales_h << 4; aux16[1] = scales_h << 2; aux16[2] = scales_h; aux16[3] = scales_h >> 2;
        // sch is ordered as 0, 4, 1, 5, 2, 6, 3, 7
        uint8x8_t sch8 = vand_u8(vld1_u8((const uint8_t *)aux16), vdup_n_u8(0x30));
        int8x8_t scales8 = vadd_s8(vreinterpret_s8_u8(vorr_u8(scl8, vtbl1_u8(sch8, vreinterpret_u8_u32(hshuff)))), vdup_n_s8(-32));
        // shuffle 0, 2, 4, 6, 1, 3, 5, 7 -> 0, 1, 2, 3, 4, 5, 6, 7
        scales8 = vtbl1_s8(scales8, vreinterpret_s8_u32(hshuff));
        int16x8_t scales16 = vmovl_s8(scales8);
        int32x4x2_t scales = {vmovl_s16(vget_low_s16(scales16)), vmovl_s16(vget_high_s16(scales16))};
        return scales;
    }
    inline void prepare(int i, int j) {
        bits.prepare16(x[i].qs+64*j);
        for (int k = 0; k < 4; ++k) {
            bits.b1.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b1.val[k]));
            bits.b2.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b2.val[k]));
        }
    }

    Q4bits bits;
    const int8x16_t values;
    uint32_t aux32[2];

    constexpr static uint32x2_t hshuff = {0x05010400, 0x07030602};

    float d;
};

struct SimpleBits {
    uint8x16x4_t b1;
    uint8x16x4_t b2;
};

IQK_ALWAYS_INLINE int32x4x2_t prepare_scales_8(const uint32x4_t& v1, const uint32x4_t& v2) {
    int32x4x2_t scales;
    auto one = vdupq_n_u32(1);
    scales.val[0] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v1, 28), 1));
    scales.val[1] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v2, 28), 1));
    return scales;
}

inline void apply_signs_2(uint8x16_t * b, const uint64_t * signs, uint32_t sidx) {
    auto s1 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >> 0) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >> 7) & 127))));
    auto s2 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >>14) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >>21) & 127))));
    b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s1));
    b[1] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[1]), s2));
}

IQK_ALWAYS_INLINE int32x4_t prepare_scales_8(const uint32x4_t& v1) {
    return vreinterpretq_s32_u32(vsliq_n_u32(vdupq_n_u32(1), vshrq_n_u32(v1, 28), 1));
}

struct DequantizerIQ2XXS final : public BaseDequantizer<block_iq2_xxs> {
    DequantizerIQ2XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    IQK_ALWAYS_INLINE float new_block(int i) const { return 0.125f * GGML_FP16_TO_FP32(x[i].d); }

    inline int32x4_t unpack(int i, int j, uint8x16_t * q) const {
        auto data = vld1q_u32_x2((const uint32_t *)(x[i].qs + 16*j));
        prepare_all(data, q);
        return prepare_scales_8(vuzp2q_u32(data.val[0], data.val[1]));
    }

private:

    static inline void prepare2(uint8x16_t * b, const uint32_t * bits, const uint64_t * signs) {
        const uint8_t * idx = (const uint8_t *)bits;
        b[0] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[0]], iq2xxs_grid[idx[1]]});
        b[1] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[2]], iq2xxs_grid[idx[3]]});
        apply_signs_2(b, signs, bits[1]);
    }

    inline static void prepare_all(const uint32x4x2_t& data, uint8x16_t * quants) {
        const uint32_t * q2 = (const uint32_t *)data.val;
        prepare2(quants+0, q2+0, keven_signs);
        prepare2(quants+2, q2+2, keven_signs);
        prepare2(quants+4, q2+4, keven_signs);
        prepare2(quants+6, q2+6, keven_signs);
    }
};

inline int32x4x4_t prepare_4bit_scales16(const uint8_t * sc) {
    auto aux = vld1_u8(sc);
    auto scales_l = vand_u8(aux, vdup_n_u8(0xf));
    auto scales_h = vshr_n_u8(aux, 4);
    auto aux1 = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h));

    auto scales8 = vreinterpretq_s8_u8(vorrq_u8(vshlq_n_u8(aux1, 1), vdupq_n_u8(1)));
    int16x8x2_t scales16 = { vmovl_s8(vget_low_s8(scales8)), vmovl_s8(vget_high_s8(scales8)) };
    return make_wider(scales16);
}

struct DequantizerIQ2XS final : public BaseDequantizer<block_iq2_xs> {
    DequantizerIQ2XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x4_t new_block(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        prepare_internal(i, 0);
        return prepare_4bit_scales16(x[i].scales);
    }

    inline void prepare(int i, int j) {
        if (j == 1) prepare_internal(i, 1);
    }

private:

    static void make2(const uint16_t * qs, uint8x16_t * b) {
        auto v1 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[0] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[1] & 511))));
        auto v2 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[2] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[3] & 511))));
        auto s1 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[0] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[1] >> 9))));
        auto s2 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[2] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[3] >> 9))));
        b[0] = vreinterpretq_u8_s8(vmulq_s8(v1, s1));
        b[1] = vreinterpretq_u8_s8(vmulq_s8(v2, s2));
    }

    inline static void make4(const uint16_t * qs, uint8x16_t * b) {
        make2(qs + 0, b + 0);
        make2(qs + 4, b + 2);
    }

    IQK_ALWAYS_INLINE void prepare_internal(int i, int j) {
        make4(x[i].qs + 16*j + 0, bits.b1.val);
        make4(x[i].qs + 16*j + 8, bits.b2.val);
    }

};

// So, I hate to include this table, but with the GCC 12.3 compiler
// bundled in the Cosmopolitan tools, loading the unpacked sign bytes
// from this table using the packed 8 sign bits as index is faster than
// using the standard trick of vceqq_u8(vandq_u8(bits, mask), mask) to
// expand the bits to bytes.
static const uint64_t kall_signs[256] = {
    0x0101010101010101, 0x01010101010101ff, 0x010101010101ff01, 0x010101010101ffff,
    0x0101010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0x0101010101ffffff,
    0x01010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0x01010101ff01ffff,
    0x01010101ffff0101, 0x01010101ffff01ff, 0x01010101ffffff01, 0x01010101ffffffff,
    0x010101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0x010101ff0101ffff,
    0x010101ff01ff0101, 0x010101ff01ff01ff, 0x010101ff01ffff01, 0x010101ff01ffffff,
    0x010101ffff010101, 0x010101ffff0101ff, 0x010101ffff01ff01, 0x010101ffff01ffff,
    0x010101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0x010101ffffffffff,
    0x0101ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0x0101ff010101ffff,
    0x0101ff0101ff0101, 0x0101ff0101ff01ff, 0x0101ff0101ffff01, 0x0101ff0101ffffff,
    0x0101ff01ff010101, 0x0101ff01ff0101ff, 0x0101ff01ff01ff01, 0x0101ff01ff01ffff,
    0x0101ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0x0101ff01ffffffff,
    0x0101ffff01010101, 0x0101ffff010101ff, 0x0101ffff0101ff01, 0x0101ffff0101ffff,
    0x0101ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0x0101ffff01ffffff,
    0x0101ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0x0101ffffff01ffff,
    0x0101ffffffff0101, 0x0101ffffffff01ff, 0x0101ffffffffff01, 0x0101ffffffffffff,
    0x01ff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0x01ff01010101ffff,
    0x01ff010101ff0101, 0x01ff010101ff01ff, 0x01ff010101ffff01, 0x01ff010101ffffff,
    0x01ff0101ff010101, 0x01ff0101ff0101ff, 0x01ff0101ff01ff01, 0x01ff0101ff01ffff,
    0x01ff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0x01ff0101ffffffff,
    0x01ff01ff01010101, 0x01ff01ff010101ff, 0x01ff01ff0101ff01, 0x01ff01ff0101ffff,
    0x01ff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0x01ff01ff01ffffff,
    0x01ff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0x01ff01ffff01ffff,
    0x01ff01ffffff0101, 0x01ff01ffffff01ff, 0x01ff01ffffffff01, 0x01ff01ffffffffff,
    0x01ffff0101010101, 0x01ffff01010101ff, 0x01ffff010101ff01, 0x01ffff010101ffff,
    0x01ffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0x01ffff0101ffffff,
    0x01ffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0x01ffff01ff01ffff,
    0x01ffff01ffff0101, 0x01ffff01ffff01ff, 0x01ffff01ffffff01, 0x01ffff01ffffffff,
    0x01ffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0x01ffffff0101ffff,
    0x01ffffff01ff0101, 0x01ffffff01ff01ff, 0x01ffffff01ffff01, 0x01ffffff01ffffff,
    0x01ffffffff010101, 0x01ffffffff0101ff, 0x01ffffffff01ff01, 0x01ffffffff01ffff,
    0x01ffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0x01ffffffffffffff,
    0xff01010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0xff0101010101ffff,
    0xff01010101ff0101, 0xff01010101ff01ff, 0xff01010101ffff01, 0xff01010101ffffff,
    0xff010101ff010101, 0xff010101ff0101ff, 0xff010101ff01ff01, 0xff010101ff01ffff,
    0xff010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0xff010101ffffffff,
    0xff0101ff01010101, 0xff0101ff010101ff, 0xff0101ff0101ff01, 0xff0101ff0101ffff,
    0xff0101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0xff0101ff01ffffff,
    0xff0101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0xff0101ffff01ffff,
    0xff0101ffffff0101, 0xff0101ffffff01ff, 0xff0101ffffffff01, 0xff0101ffffffffff,
    0xff01ff0101010101, 0xff01ff01010101ff, 0xff01ff010101ff01, 0xff01ff010101ffff,
    0xff01ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0xff01ff0101ffffff,
    0xff01ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0xff01ff01ff01ffff,
    0xff01ff01ffff0101, 0xff01ff01ffff01ff, 0xff01ff01ffffff01, 0xff01ff01ffffffff,
    0xff01ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0xff01ffff0101ffff,
    0xff01ffff01ff0101, 0xff01ffff01ff01ff, 0xff01ffff01ffff01, 0xff01ffff01ffffff,
    0xff01ffffff010101, 0xff01ffffff0101ff, 0xff01ffffff01ff01, 0xff01ffffff01ffff,
    0xff01ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0xff01ffffffffffff,
    0xffff010101010101, 0xffff0101010101ff, 0xffff01010101ff01, 0xffff01010101ffff,
    0xffff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0xffff010101ffffff,
    0xffff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0xffff0101ff01ffff,
    0xffff0101ffff0101, 0xffff0101ffff01ff, 0xffff0101ffffff01, 0xffff0101ffffffff,
    0xffff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0xffff01ff0101ffff,
    0xffff01ff01ff0101, 0xffff01ff01ff01ff, 0xffff01ff01ffff01, 0xffff01ff01ffffff,
    0xffff01ffff010101, 0xffff01ffff0101ff, 0xffff01ffff01ff01, 0xffff01ffff01ffff,
    0xffff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0xffff01ffffffffff,
    0xffffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0xffffff010101ffff,
    0xffffff0101ff0101, 0xffffff0101ff01ff, 0xffffff0101ffff01, 0xffffff0101ffffff,
    0xffffff01ff010101, 0xffffff01ff0101ff, 0xffffff01ff01ff01, 0xffffff01ff01ffff,
    0xffffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0xffffff01ffffffff,
    0xffffffff01010101, 0xffffffff010101ff, 0xffffffff0101ff01, 0xffffffff0101ffff,
    0xffffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0xffffffff01ffffff,
    0xffffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0xffffffffff01ffff,
    0xffffffffffff0101, 0xffffffffffff01ff, 0xffffffffffffff01, 0xffffffffffffffff,
};

struct SignHelper {

    IQK_ALWAYS_INLINE void apply_signs_1x(uint8x16_t * b, const uint8_t * sign_bits) const {
        auto s = vreinterpretq_s8_u64(uint64x2_t{kall_signs[sign_bits[0]], kall_signs[sign_bits[1]]});
        // Normally we would expect this to be faster, but it isn't.
        // auto aux = vcombine_u8(vdup_n_u8(sign_bits[0]), vdup_n_u8(sign_bits[1]));
        // auto s = vreinterpretq_s8_u8(vorrq_u8(vceqq_u8(vandq_u8(aux, smask), smask), m1));
        b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s));
    }

    // We would need these two if we weren't loading from the unpacked sign table.
    //const uint8x16_t smask = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201));
    //const uint8x16_t m1    = vdupq_n_u8(1);
};

struct DequantizerIQ2S final : public BaseDequantizer<block_iq2_s> {
    DequantizerIQ2S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x4_t new_block(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        prepare_internal(i, 0, bits);
        return prepare_4bit_scales16(x[i].scales);
    }

    inline void prepare(int i, int j) {
        if (j == 1) prepare_internal(i, 1, bits);
    }

private:

    static void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh, uint8x16_t * b) {
        uint32_t aux32[2];
        const uint16_t * aux16 = (const uint16_t *)aux32;
        for (int k = 0; k < 2; ++k) {
            aux32[1] = (qh[k] << 4) | (qh[k] << 18);
            aux32[0] = (aux32[1] << 4) & 0x03000300;
            aux32[1] &= 0x03000300;
            b[2*k+0] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+0] | aux16[0]))),
                                   vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+1] | aux16[1]))));
            b[2*k+1] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+2] | aux16[2]))),
                                   vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+3] | aux16[3]))));
            sh.apply_signs_1x(b+2*k+0, sign_bits); sign_bits += 2;
            sh.apply_signs_1x(b+2*k+1, sign_bits); sign_bits += 2;
        }
    }

    void prepare_internal(int i, int j, SimpleBits& sb) {

        const auto * qs = x[i].qs + 16*j;
        const auto * qh = x[i].qh + 4*j;
        const auto * sign_bits = qs + QK_K/8;

        make4(sh, sign_bits+0, qs+0, qh+0, sb.b1.val);
        make4(sh, sign_bits+8, qs+8, qh+2, sb.b2.val);
    }

    SignHelper sh;
};

struct DequantizerIQ3XXS final : public BaseDequantizer<block_iq3_xxs> {
    DequantizerIQ3XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    IQK_ALWAYS_INLINE float new_block(int i) const { return 0.25f * GGML_FP16_TO_FP32(x[i].d); }

    inline int32x4_t unpack(int i, int j, uint8x16_t * q) const {
        auto q3data = vld1q_u8_x2(x[i].qs + 32*j);
        auto gas = vld1q_u32((const uint32_t *)(x[i].qs + QK_K/4 + 16*j));
        prepare_block((const uint8_t *)q3data.val, (const uint32_t *)&gas, q);
        return prepare_scales_8(gas);
    }

private:

    inline static void make2(const uint8_t * q3, const uint32_t sidx, uint8x16_t * b) {
        b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[0]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[3]]});
        b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[4]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[7]]});
        apply_signs_2(b, keven_signs, sidx);
    }
    inline static void prepare_block(const uint8_t * q3, const uint32_t * signs, uint8x16_t * quants) {
        make2(q3+ 0, signs[0], quants + 0);
        make2(q3+ 8, signs[1], quants + 2);
        make2(q3+16, signs[2], quants + 4);
        make2(q3+24, signs[3], quants + 6);
    }
};

struct DequantizerIQ3S final : public BaseDequantizer<block_iq3_s> {
    DequantizerIQ3S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x2_t new_block(int i) {
        d = GGML_FP16_TO_FP32(x[i].d);
        uint32_t scales32[2];
        auto qs = vld1q_u8_x2(x[i].qs);
        auto signs = vld1q_u8(x[i].signs);

        prepare_block((const uint8_t *)qs.val, x[i].qh, (const uint8_t *)&signs);

        std::memcpy(scales32, x[i].scales, 4);
        scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
        scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
        auto scales8 = vld1_u8((const uint8_t *)scales32); // 0, 2, 4, 6, 1, 3, 5, 7
        scales8 = vtbl1_u8(scales8, vreinterpret_u8_u64(vdup_n_u64(0x0703060205010400)));
        auto scales16 = vreinterpretq_s16_u16(vmovl_u8(scales8));
        int32x4x2_t scales;
        scales.val[0] = vmovl_s16(vget_low_s16(scales16));
        scales.val[1] = vmovl_s16(vget_high_s16(scales16));
        return scales;
    }

    inline void prepare(int i, int j) {
        if (j == 1) {
            auto qs = vld1q_u8_x2(x[i].qs + 32);
            auto signs = vld1q_u8(x[i].signs + 16);
            prepare_block((const uint8_t *)qs.val, x[i].qh + 4, (const uint8_t *)&signs);
        }
    }

private:

    static inline void make2(const SignHelper& sh, const uint8_t * sign_bits, const uint16x8_t& idx_l, uint8_t qh,
            const int16x8_t& hshift, uint8x16_t * b) {
        auto vindex = vorrq_u16(idx_l, vandq_u16(vshlq_u16(vdupq_n_u16(qh), hshift), vdupq_n_u16(256)));
        const uint16_t * idx = (const uint16_t *)&vindex;
        b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[0]], iq3s_grid[idx[1]], iq3s_grid[idx[2]], iq3s_grid[idx[3]]});
        sh.apply_signs_1x(b+0, sign_bits+0);
        b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[4]], iq3s_grid[idx[5]], iq3s_grid[idx[6]], iq3s_grid[idx[7]]});
        sh.apply_signs_1x(b+1, sign_bits+2);
    }
    static inline void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh,
            const int16x8_t& hshift, uint8x16_t * b) {
        auto idx_l = vld1q_u8(qs);
        make2(sh, sign_bits+0, vmovl_u8(vget_low_u8 (idx_l)), qh[0], hshift, b+0);
        make2(sh, sign_bits+4, vmovl_u8(vget_high_u8(idx_l)), qh[1], hshift, b+2);
    }

    static int16x8_t load_shift() {
        static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
        return vld1q_s16(k_shift);
    }

    inline void prepare_block(const uint8_t * qs, const uint8_t * qh, const uint8_t * sign_bits) {
        auto signs = vld1q_u8(sign_bits);
        auto s = (const uint8_t *)&signs;
        make4(sh, s + 0, qs+ 0, qh+0, hshift, bits.b1.val);
        make4(sh, s + 8, qs+16, qh+2, hshift, bits.b2.val);
    }

    SignHelper sh;
    const int16x8_t hshift = load_shift();

};

template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_IQXXS(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);
    Dequantizer deq(vx, bx, nrc_y);
    uint8x16_t  qx[8];
    int32x4_t   sumi[nrc_y];
    float32x4_t acc[nrc_y];

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        for (int i = 0; i < nb; ++i) {
            float d = deq.new_block(i);
            auto scales = deq.unpack(i, 0, qx);
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                sumi[iy] = vdupq_n_s32(0);
                compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 0, sumi[iy]);
            }
            scales = deq.unpack(i, 1, qx);
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 1, sumi[iy]);
                acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*q8.scale(iy, i)), vcvtq_f32_s32(sumi[iy]));
            }
        }
#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

// =========================================== Legacy quants

template <typename Block>
inline float16x4_t load_scales_q0(const Block * x, ggml_half * aux) {
    for (int k = 0; k < 4; ++k) aux[k] = x[k].d;
    return vld1_f16((const float16_t *)aux);
}

template <typename Block>
inline float16x8_t load_scales_q1(const Block * x, ggml_half * aux) {
    if constexpr (std::is_same_v<Block, block_q8_1>) {
        for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].s; }
    } else {
        for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].m; }
    }
    return vld1q_f16((const float16_t *)aux);
}

struct Q4LegacyBits {
    template <typename Block>
    inline void prepare(const Block * x) {
        for (int i = 0; i < 4; ++i) {
            auto q4bits = vld1q_u8(x[i].qs);
            b[2*i+0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b));
            b[2*i+1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4));
        }
    }
    inline void prepare1(const uint8_t * qs, int8x16_t * q) const {
        auto q4bits = vld1q_u8(qs);
        q[0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b));
        q[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4));
    }
    inline void prepare1(const uint8_t * qs) {
        prepare1(qs, b);
    }
    const uint8x16_t m4b = vdupq_n_u8(0xf);
    int8x16_t b[8];
};

// One would think this commented out version would do better than the one below
// because it offers more opportunities to execute instructions in parallel.
// Instead, it runs significantly slower. Why? If the compiler is running out of vector registers
// cannot it just do the sequential version below on its own?
//inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) {
//    const auto q8b_1 = vld1q_s8_x2(qs + 0);
//    auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b_1.val[0]), b[1], q8b_1.val[1]);
//    const auto q8b_2 = vld1q_s8_x2(qs + 32);
//    auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b_2.val[0]), b[3], q8b_2.val[1]);
//    auto p1234 = vpaddq_s32(p12, p34);
//    const auto q8b_3 = vld1q_s8_x2(qs + 64);
//    auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b_3.val[0]), b[5], q8b_3.val[1]);
//    const auto q8b_4 = vld1q_s8_x2(qs + 96);
//    auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b_4.val[0]), b[7], q8b_4.val[1]);
//    return vpaddq_s32(p1234, vpaddq_s32(p56, p78));
//}

inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) {
    auto q8b = vld1q_s8_x2(qs + 0);
    auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b.val[0]), b[1], q8b.val[1]);
    q8b = vld1q_s8_x2(qs + 32);
    auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b.val[0]), b[3], q8b.val[1]);
    auto p1234 = vpaddq_s32(p12, p34);
    q8b = vld1q_s8_x2(qs + 64);
    auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b.val[0]), b[5], q8b.val[1]);
    q8b = vld1q_s8_x2(qs + 96);
    auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b.val[0]), b[7], q8b.val[1]);
    return vpaddq_s32(p1234, vpaddq_s32(p56, p78));
}

template <int nrc> struct Q80 {

    constexpr static int nrc_y = nrc;

    Q80(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_0 *)info.src1_row(iy);
    }

    inline const int8_t * quant_data(int iy, int i) const {
        const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
        return y4->qs;
    }

    inline float16x4_t load_scales(int iy, int i) const {
        const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
        return vld1_f16((const float16_t *)y4->d);
    }

    template <typename Dequantizer>
    inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * /*acc*/) const {
        auto qx_scales = deq.new_block(i);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8_scales = load_scales(iy, i);
            sc16[iy] = vmul_f16(qx_scales, q8_scales);
        }
    }

    template <typename Dequantizer>
    inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const {
        deq.prepare1(i);
        float d = GGML_FP16_TO_FP32(deq.x[i].d);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8b = vld1q_s8_x2(y[iy][i].qs);
            auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]);
            acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p));
        }
    }

    const block_q8_0 * y[nrc_y];
};

template <int nrc> struct Q81 {

    constexpr static int nrc_y = nrc;

    Q81(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_1 *)info.src1_row(iy);
    }

    inline const int8_t * quant_data(int iy, int i) const {
        const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
        return y4->qs;
    }

    inline float16x8_t load_scales(int iy, int i) const {
        const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
        return vld1q_f16((const float16_t *)y4->d);
    }

    template <typename Dequantizer>
    inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * acc) const {
        auto qx_scales = deq.new_block(i);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8_scales = load_scales(iy, i);
            auto m = vmul_f16(vget_high_f16(qx_scales), vget_high_f16(q8_scales));
            acc[iy] = vaddq_f32(acc[iy], vcvt_f32_f16(m));
            sc16[iy] = vmul_f16(vget_low_f16(qx_scales), vget_low_f16(q8_scales));
        }
    }

    template <typename Dequantizer>
    inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const {
        deq.prepare1(i);
        float d = GGML_FP16_TO_FP32(deq.x[i].d), m = 0.25f*GGML_FP16_TO_FP32(deq.x[i].m);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8b = vld1q_s8_x2(y[iy][i].qs);
            auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]);
            acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p));
            acc[iy] = vaddq_f32(acc[iy], vdupq_n_f32(m*GGML_FP16_TO_FP32(y[iy][i].s)));
        }
    }

    const block_q8_1 * y[nrc_y];
};

template <typename block_q>
struct BaseLegacyDequantizer {

    BaseLegacyDequantizer(const void * vx, size_t bx) : vx(vx), x(nullptr), bx(bx) {}

    inline void new_row(int ix) { x = (const block_q *)((const char *)vx + bx*ix); }

    Q4LegacyBits bits;

    const void * vx;
    const block_q * x;
    size_t bx;
};

struct DequantizerQ40 final : public BaseLegacyDequantizer<block_q4_0> {

    DequantizerQ40(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        q[0] = vaddq_s8(q[0], m8);
        q[1] = vaddq_s8(q[1], m8);
    }
    inline void prepare1(int i) {
        prepare1(i, bits.b);
    }

    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vld1_f16((const float16_t *)aux);
    }

    const int8x16_t m8 = vdupq_n_s8(-8);
    //ggml_half aux[4];
};

struct DequantizerQ41 : public BaseLegacyDequantizer<block_q4_1> {

    DequantizerQ41(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i) {
        bits.prepare1(x[i].qs);
    }

    inline float16x8_t new_block(int i) {
        uint32_t aux32[4];
        const uint32_t * s32 = (const uint32_t *)&x[4*i].d;
        for (int k = 0; k < 4; ++k) {
            aux32[k] = *s32; s32 += sizeof(block_q4_1)/4;
            bits.prepare1(x[4*i+k].qs, bits.b + 2*k);
        }
        return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle)));
    }
    // Leaving this commented out attempt to be reminded that I already tried this.
    // It has basically the same performance as the version above.
    //inline float16x8_t new_block(int i) {
    //    uint32x4_t scales = {};
    //    const block_q4_1 * xi = x + 4*i;
    //    const uint32_t * s32 = (const uint32_t *)&xi->d;
    //    scales = vsetq_lane_u32(*s32, scales, 0); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[0].qs, bits.b + 0);
    //    scales = vsetq_lane_u32(*s32, scales, 1); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[1].qs, bits.b + 2);
    //    scales = vsetq_lane_u32(*s32, scales, 2); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[2].qs, bits.b + 4);
    //    scales = vsetq_lane_u32(*s32, scales, 3);
    //    bits.prepare1(xi[3].qs, bits.b + 6);
    //    return vreinterpretq_f16_u8(vqtbl1q_u8(vreinterpretq_u8_u32(scales), vreinterpretq_u8_u64(shuffle)));
    //}

    const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302};
};

struct HighBit5Legacy {
    inline uint8x16_t to_bytes(const uint8_t * qh) const {
        uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle);
        return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vreinterpretq_u8_u64(mask));
    }
    inline uint8x16_t to_negated_bytes(const uint8_t * qh) const {
        uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle);
        return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vdupq_n_u8(0));
    }
    const uint64x2_t mask = vdupq_n_u64(0x8040201008040201);
    const uint8x16_t shuffle = vcombine_u8(vdup_n_u8(0), vdup_n_u8(1));
};

struct DequantizerQ50 final : public BaseLegacyDequantizer<block_q5_0> {

    DequantizerQ50(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        auto qh = x[i].qh;
        q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_negated_bytes(qh+0))));
        q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_negated_bytes(qh+2))));
    }
    inline void prepare1(int i) {
        prepare1(i, bits.b);
    }

    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vld1_f16((const float16_t *)aux);
    }

    HighBit5Legacy hbits;

    const uint8x16_t mh = vdupq_n_u8(0xf0);

};

struct DequantizerQ80 final : public BaseLegacyDequantizer<block_q8_0> {

    DequantizerQ80(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i) {
        bits.b[0] = vld1q_s8(x[i].qs);
        bits.b[1] = vld1q_s8(x[i].qs+16);
    }

    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            bits.b[2*k+0] = vld1q_s8(x[4*i+k].qs);
            bits.b[2*k+1] = vld1q_s8(x[4*i+k].qs+16);
        }
        return vld1_f16((const float16_t *)aux);
    }

};

struct DequantizerQ51 final : public BaseLegacyDequantizer<block_q5_1> {

    DequantizerQ51(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}

    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        auto qh = x[i].qh;
        q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_bytes(qh+0))));
        q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_bytes(qh+2))));
    }
    inline void prepare1(int i) {
        bits.prepare1(x[i].qs, bits.b);
    }

    inline float16x8_t new_block(int i) {
        uint32_t aux32[4];
        const uint32_t * s32 = (const uint32_t *)&x[4*i].d;
        for (int k = 0; k < 4; ++k) {
            aux32[k] = *s32; s32 += sizeof(block_q5_1)/4;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle)));
    }

    HighBit5Legacy hbits;

    const uint8x16_t mh = vdupq_n_u8(0x10);
    const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302};

};

template <typename Dequantizer, typename Q8>
inline void sum_4(int i, Dequantizer& deq, const Q8& q8, const float16x4_t * sc16, float32x4_t * acc) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto pall = sum_4_blocks(deq.bits.b, q8.quant_data(iy, i));
        auto scale = vcvt_f32_f16(sc16[iy]);
        acc[iy] = vmlaq_f32(acc[iy], scale, vcvtq_f32_s32(pall));
    }
}

template <typename Dequantizer, typename Q8>
inline void mul_mat_qX_Y_q8_Y(int n, Dequantizer& deq, Q8& q8, const DataInfo& info, int nrc_x) {
    const int nb = n / QK4_1;

    float16x4_t sc16[Q8::nrc_y];

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[Q8::nrc_y];
        for (int iy = 0; iy < Q8::nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        for (int i = 0; i < nb/4; ++i) {
            q8.process_scales(i, deq, sc16, acc);
            sum_4(i, deq, q8, sc16, acc);
        }
        for (int i = 4*(nb/4); i < nb; ++i) {
            q8.process_1_block(i, deq, acc);
        }

        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

template <typename Dequantizer, typename Q8>
inline void mul_mat_qX_Y_q8_Y_1(int n, Dequantizer& deq1, Dequantizer& deq2, Q8& q8, const DataInfo& info, int nrc_x) {
    const int nb = n / QK4_1;

    float16x4_t sc16[2];

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq1.new_row(ix);
        deq2.new_row(ix);

        float32x4_t acc[2] = { vdupq_n_f32(0.f), vdupq_n_f32(0.f) };

        for (int i = 0; i < nb/8; ++i) {
            q8.process_scales(2*i+0, deq1, sc16+0, acc+0);
            q8.process_scales(2*i+1, deq2, sc16+1, acc+1);
            sum_4(2*i+0, deq1, q8, sc16+0, acc+0);
            sum_4(2*i+1, deq2, q8, sc16+1, acc+1);
        }
        for (int i = 2*(nb/8); i < nb/4; ++i) {
            q8.process_scales(i, deq1, sc16, acc);
            sum_4(i, deq1, q8, sc16, acc);
        }
        for (int i = 4*(nb/4); i < nb; ++i) {
            q8.process_1_block(i, deq1, acc);
        }

        info.store(ix, 0, vaddvq_f32(vaddq_f32(acc[0], acc[1])));
    }
}

template <typename Dequantizer, int nrc_y>
static void IQK_NOINLINE mul_mat_qX_1_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Q81<nrc_y> q8(info);
    if constexpr (nrc_y == 1) {
        Dequantizer deq1(vx, bx), deq2(vx, bx);
        mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
    } else {
        Dequantizer deq(vx, bx);
        mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x);
    }
}

template <typename Dequantizer, int nrc_y>
static void IQK_NOINLINE mul_mat_qX_0_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Q80<nrc_y> q8(info);
    if constexpr (nrc_y == 1) {
        Dequantizer deq1(vx, bx), deq2(vx, bx);
        mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
    } else {
        Dequantizer deq(vx, bx);
        mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x);
    }
}

template <typename Dequantizer>
static void IQK_NOINLINE mul_mat_qX_1_q8_1_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Dequantizer deq1(vx, bx), deq2(vx, bx);
    Q81<1> q8(info);
    mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
}

template <typename Dequantizer>
static void IQK_NOINLINE mul_mat_qX_0_q8_0_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Dequantizer deq1(vx, bx), deq2(vx, bx);
    Q80<1> q8(info);
    mul_mat_qX_Y_q8_Y(n, deq1, deq2, q8, info, nrc_x);
}

template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
    if constexpr (std::is_same_v<Dequantizer, DequantizerQ40> || std::is_same_v<Dequantizer, DequantizerQ50> ||
                  std::is_same_v<Dequantizer, DequantizerQ80>) {
        m.funcs[0] = mul_mat_qX_0_q8_0<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_0_q8_0<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_0_q8_0<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_0_q8_0<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_0_q8_0<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_0_q8_0<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_0_q8_0<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_0_q8_0<Dequantizer, 8>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerQ41> || std::is_same_v<Dequantizer, DequantizerQ51>) {
        m.funcs[0] = mul_mat_qX_1_q8_1<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_1_q8_1<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_1_q8_1<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_1_q8_1<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_1_q8_1<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_1_q8_1<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_1_q8_1<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_1_q8_1<Dequantizer, 8>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2XXS> || std::is_same_v<Dequantizer, DequantizerIQ3XXS>) {
        m.funcs[0] = mul_mat_qX_K_q8_K_IQXXS<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_IQXXS<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_IQXXS<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_IQXXS<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_IQXXS<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_IQXXS<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_IQXXS<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_IQXXS<8, Dequantizer>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2S> ||
                       std::is_same_v<Dequantizer, DequantizerIQ3S> ||
                       std::is_same_v<Dequantizer, DequantizerIQ2XS>) {
        m.funcs[0] = mul_mat_qX_K_q8_K_IQ<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_IQ<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_IQ<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_IQ<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_IQ<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_IQ<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_IQ<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_IQ<8, Dequantizer>;
    }
    else {
        m.funcs[0] = mul_mat_qX_K_q8_K_T<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_T<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_T<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_T<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_T<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_T<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_T<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_T<8, Dequantizer>;
    }
}

bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& m, int& row_size_q8, int Ny) {
    row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);

    (void)Ny;
    // Uncommenting out this would disable iqk_mul_mat for matrix x vector multiplications.
    //if (Ny == 1 && (typeA == GGML_TYPE_IQ2_XXS || typeA == GGML_TYPE_IQ2_XS || typeA == GGML_TYPE_IQ2_S ||
    //                typeA == GGML_TYPE_IQ3_XXS || typeA == GGML_TYPE_IQ3_S)) return false;

    switch (typeA) {
        case GGML_TYPE_Q2_K:
            MulMat::set_functions<DequantizerQ2K>(m);
            break;
        case GGML_TYPE_Q3_K:
            MulMat::set_functions<DequantizerQ3K>(m);
            break;
        case GGML_TYPE_Q4_K:
            MulMat::set_functions<DequantizerQ4K>(m);
            break;
        case GGML_TYPE_Q5_K:
            MulMat::set_functions<DequantizerQ5K>(m);
            break;
        case GGML_TYPE_Q6_K:
            MulMat::set_functions<DequantizerQ6K>(m);
            break;
        case GGML_TYPE_IQ4_XS:
            MulMat::set_functions<DequantizerIQ4XS>(m);
            break;
        case GGML_TYPE_IQ3_S:
            MulMat::set_functions<DequantizerIQ3S>(m);
            break;
        case GGML_TYPE_IQ3_XXS:
            MulMat::set_functions<DequantizerIQ3XXS>(m);
            break;
        case GGML_TYPE_IQ2_S:
            MulMat::set_functions<DequantizerIQ2S>(m);
            break;
        case GGML_TYPE_IQ2_XS:
            MulMat::set_functions<DequantizerIQ2XS>(m);
            break;
        case GGML_TYPE_IQ2_XXS:
            MulMat::set_functions<DequantizerIQ2XXS>(m);
            break;
        case GGML_TYPE_Q4_0:
            MulMat::set_functions<DequantizerQ40>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q4_1:
            MulMat::set_functions<DequantizerQ41>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;
        case GGML_TYPE_Q5_0:
            MulMat::set_functions<DequantizerQ50>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q5_1:
            MulMat::set_functions<DequantizerQ51>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;
        case GGML_TYPE_Q8_0:
            MulMat::set_functions<DequantizerQ80>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        default:
            return false;
    }
    return true;
}

}

#endif // __x86_64__ or __aarch64__


================================================
FILE: third_party/llamafile/iqk_mul_mat_arm82.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_arm82.cpp
// Copyrigth 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#ifdef __aarch64__
#define iqk_mul_mat iqk_mul_mat_arm82
#define iqk_mul_mat_moe iqk_mul_mat_moe_arm82
#include "iqk_mul_mat_arm.inc"
#endif  // __aarch64__


================================================
FILE: third_party/llamafile/macros.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/macros.h
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#pragma once

#define MIN(X, Y) ((Y) > (X) ? (X) : (Y))
#define MAX(X, Y) ((Y) < (X) ? (X) : (Y))
#define CEIL_DIV(M, N) (((M) + (N) - 1) / (N))
#define ROUNDUP(X, K) (((X) + (K) - 1) & -(K))
#define ARRAYLEN(A) ((sizeof(A) / sizeof(*(A))) / ((unsigned)!(sizeof(A) % sizeof(*(A)))))


================================================
FILE: third_party/llamafile/micros.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/micros.h
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#pragma once

#include <ctime>

#ifndef _WIN32
#include <unistd.h>
#else
#include <windows.h>
#endif

#ifdef _WIN32
static long long GetQueryPerformanceFrequency() {
    LARGE_INTEGER t;
    QueryPerformanceFrequency(&t);
    return t.QuadPart;
}
static long long GetQueryPerformanceCounter() {
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return t.QuadPart;
}
#endif

static long long micros(void) {
#ifndef _WIN32
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return ts.tv_sec * 1000000 + (ts.tv_nsec + 999) / 1000;
#else
    static long long timer_freq = GetQueryPerformanceFrequency();
    static long long timer_start = GetQueryPerformanceCounter();
    return ((GetQueryPerformanceCounter() - timer_start) * 1000000) / timer_freq;
#endif
}


================================================
FILE: third_party/llamafile/numba.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/numba.h
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#pragma once

inline int rand32(void) {
    static unsigned long long lcg = 1;
    lcg *= 6364136223846793005;
    lcg += 1442695040888963407;
    return lcg >> 32;
}

inline int popcount(unsigned x) {
    x = x - ((x >> 1) & 0x55555555);
    x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
    x = (x + (x >> 4)) & 0x0F0F0F0F;
    x = (x + (x >> 16));
    return (x + (x >> 8)) & 0x0000003F;
}

inline int hamming(int x, int y) {
    return popcount(x ^ y);
}

inline float float01(unsigned x) {  // (0,1)
    return 1.f / 8388608 * ((x >> 9) + .5f);
}

inline float numba(void) {  // (-10,10)
    return float01(rand32()) * 2.f - 1.f;
}

template <typename T>
void randomize(T* A, int n) {
    for (int i = 0; i < n; ++i)
        A[i] = numba();
}

template <typename T>
void randomize(int m, int n, T* A, int lda) {
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
            A[lda * j + i] = numba();
}

template <typename T, typename U>
void broadcast(T* A, int n, U x) {
    for (int i = 0; i < n; ++i)
        A[i] = x;
}

template <typename T, typename U>
void broadcast(int m, int n, T* A, int lda, U x) {
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
            A[lda * j + i] = x;
}


================================================
FILE: third_party/llamafile/sgemm.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "sgemm.h"
// #include <cosmo.h>
// #include <cpuid.h>
// #include <libc/sysv/consts/hwcap.h>
#include <stdio.h>
// #include <sys/auxv.h>
#include <cassert>
// #include "llamafile.h"

static const struct GemmFuncs {
    bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
    bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
    bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
    // typeof(llamafile_sgemm)* sgemm;
    // typeof(llamafile_mixmul)* mixmul;
    // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
    GemmFuncs() {
#if defined(__x86_64__) || defined(_M_X64)
        // if (X86_HAVE(AVX)) {
        //     if (X86_HAVE(FMA)) {
        //         if (X86_HAVE(AVX2)) {
        //             if (X86_HAVE(AVX512F)) {
        //                 if (X86_HAVE(AVX512VL) &&     //
        //                     X86_HAVE(AVX512BW) &&     //
        //                     X86_HAVE(AVX512DQ) &&     //
        //                     X86_HAVE(AVX512_VNNI) &&  //
        //                     X86_HAVE(AVX512_BF16)) {
        //                     // AMD Zen4+ (2023-)
        //                     sgemm = llamafile_sgemm_amd_zen4;
        //                     mixmul = llamafile_mixmul_amd_zen4;
        //                     iqk_mixmul = iqk_mul_mat_moe_zen4;
        //                 } else {
        //                     // Intel Xeon Skylake+ (2015-)
        //                     sgemm = llamafile_sgemm_amd_avx512f;
        //                     mixmul = llamafile_mixmul_amd_avx512f;
        //                     iqk_mixmul = iqk_mul_mat_moe;
        //                 }
        //             } else if (X86_HAVE(AVXVNNI)) {
        //                 // Intel Alderlake (2021-)
        //                 sgemm = llamafile_sgemm_amd_avxvnni;
        //                 mixmul = llamafile_mixmul_amd_avxvnni;
        //                 iqk_mixmul = iqk_mul_mat_moe;
        //             } else {
        //                 // Intel Haswell/Broadwell/Skylake (2013-2020)
        //                 // AMD Excavator (2015-2022)
        //                 sgemm = llamafile_sgemm_amd_avx2;
        //                 mixmul = llamafile_mixmul_amd_avx2;
        //                 if (X86_HAVE(F16C))
        //                     iqk_mixmul = iqk_mul_mat_moe;
        //             }
        //         } else {
        //             // AMD Piledriver (2011-2014)
        //             sgemm = llamafile_sgemm_amd_fma;
        //             mixmul = llamafile_mixmul_amd_fma;
        //             if (X86_HAVE(F16C))
        //                 iqk_mixmul = iqk_mul_mat_moe;
        //         }
        //     } else {
        //         // Intel Sandybridge/Ivybridge (2010-2012)
        //         // AMD Bulldozer (2011)
        //         sgemm = llamafile_sgemm_amd_avx;
        //         mixmul = llamafile_mixmul_amd_avx;
        //     }
        // } else {
        //     // AMD K8/Barcelona (2003-2010)
        //     // Intel Core/Nehalem (2006-2009)
        //     sgemm = llamafile_sgemm_unsupported;
        //     mixmul = llamafile_mixmul_unsupported;
        // }

#if defined(__AVX__)
#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX2__)
#if defined(__AVX512F__)
#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
        // AMD Zen4+ (2023-)
        sgemm = llamafile_sgemm_amd_zen4;
        mixmul = llamafile_mixmul_amd_zen4;
        iqk_mixmul = iqk_mul_mat_moe_zen4;
#else
        // Intel Xeon Skylake+ (2015-)
        sgemm = llamafile_sgemm_amd_avx512f;
        mixmul = llamafile_mixmul_amd_avx512f;
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#elif defined(__AVXVNNI__)
        // Intel Alderlake (2021-)
        sgemm = llamafile_sgemm_amd_avxvnni;
        mixmul = llamafile_mixmul_amd_avxvnni;
        iqk_mixmul = iqk_mul_mat_moe;
#else
        // Intel Haswell/Broadwell/Skylake (2013-2020)
        // AMD Excavator (2015-2022)
        sgemm = llamafile_sgemm_amd_avx2;
        mixmul = llamafile_mixmul_amd_avx2;
#if defined(__F16C__)
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
        // AMD Piledriver (2011-2014)
        sgemm = llamafile_sgemm_amd_fma;
        mixmul = llamafile_mixmul_amd_fma;
#if defined(__F16C__)
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
        // Intel Sandybridge/Ivybridge (2010-2012)
        // AMD Bulldozer (2011)
        sgemm = llamafile_sgemm_amd_avx;
        mixmul = llamafile_mixmul_amd_avx;
#endif
#else
        // AMD K8/Barcelona (2003-2010)
        // Intel Core/Nehalem (2006-2009)
        sgemm = llamafile_sgemm_unsupported;
        mixmul = llamafile_mixmul_unsupported;
#endif

#elif defined(__aarch64__)
        // long hwcap = getauxval(AT_HWCAP);
        // if ((hwcap & HWCAP_FPHP) &&     // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1)
        //     (hwcap & HWCAP_ASIMDHP) &&  // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1)
        //     (hwcap & HWCAP_ASIMDDP)) {  // dotprod isa (ID_AA64ISAR0_EL1.DP == 1)
        //     // e.g. Apple M1, Raspberry Pi 5
            sgemm = llamafile_sgemm_arm82;
            mixmul = llamafile_mixmul_arm82;
            iqk_mixmul = iqk_mul_mat_moe_arm82;
        // } else {
            // ARM64 baseline ISA
        //     sgemm = llamafile_sgemm_arm80;
        //     mixmul = llamafile_mixmul_arm80;
        // }
#else
        sgemm = llamafile_sgemm_unsupported;
        mixmul = llamafile_mixmul_unsupported;
#endif
    }
} funcs;

/**
 * Performs optimized matrix multiplication on CPU.
 *
 * This subroutine may compute C = Aᵀ * B with column major ordering.
 * Despite its name, this isn't a generalized implementation. Work is
 * only performed when a handwritten kernel is written and available.
 * Otherwise the caller should fall back to a general matmul routine.
 *
 * @param m is rows in `A` and `C`
 * @param n is cols in `B` and `C`
 * @param k is cols in `A` and rows in `B`
 * @param A is first input matrix (always transposed)
 * @param lda is row stride of `A`
 * @param B is second input matrix (never transposed)
 * @param ldb is row stride of `B`
 * @param C is input/output array of output matrices
 * @param ldc is row stride of `C`
 * @param ith is thread id (must be less than `nth`)
 * @param nth is number of threads (must be greater than zero)
 * @param task is GGML task type
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @param precision may be used to control the internal compute type
 * @return true if this function was able to service the matmul request
 */
bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, task, Atype, Btype, Ctype,
                       precision);
}

/**
 * Performs "mixture of experts" tensor multiplication on CPU.
 */
bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) {
    return funcs.mixmul(params, weights, thought, plan, result);
}

bool llamafile_mixmul_iqk(long Nx, long Ny, long ne00, int ne11, int typeA, const void* A, const void* B, float* C, long nb1, long nb2, const void* vrow_mapping, int ith, int nth) {
    return funcs.iqk_mixmul(Nx, Ny, ne00, ne11, typeA, A, B, C, nb1, nb2, vrow_mapping, ith, nth);
}


================================================
FILE: third_party/llamafile/sgemm.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.h
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#pragma once
#include <stdbool.h>
#include <cstddef>
#ifdef __cplusplus
extern "C" {
#endif

struct ggml_tensor;
struct ggml_compute_params;
#ifdef __aarch64__

bool iqk_mul_mat(long, long, long, int, const void*, const void*, float*, long, int, int);
bool iqk_mul_mat_zen4(long, long, long, int, const void*, const void*, float*, long, int, int);
bool iqk_mul_mat_arm82(long, long, long, int, const void*, const void*, float*, long, int, int);

bool iqk_mul_mat_moe(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
bool iqk_mul_mat_moe_zen4(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
bool iqk_mul_mat_moe_arm82(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);

bool llamafile_sgemm(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_mixmul(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
size_t llamafile_mixmul_needs(const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*);

bool llamafile_sgemm_unsupported(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_avx(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_fma(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_avx2(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_avxvnni(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_avx512f(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_zen4(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_arm80(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_arm82(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);

bool llamafile_mixmul_unsupported(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_avx(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_fma(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_avx2(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_avxvnni(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_avx512f(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_zen4(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_arm80(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_arm82(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_iqk(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);

#else

bool iqk_mul_mat(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);
bool iqk_mul_mat_zen4(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);
bool iqk_mul_mat_arm82(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);


bool iqk_mul_mat_moe(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
bool iqk_mul_mat_moe_zen4(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
bool iqk_mul_mat_moe_arm82(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);

bool llamafile_sgemm(long m, long n, long k, const void* a, long lda, const void* b, long ldb, void* c, long ldc, int ith, int nth, int task_type, int a_type, int b_type, int c_type, int precision);
bool llamafile_mixmul(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
size_t llamafile_mixmul_needs(const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*);

bool llamafile_sgemm_unsupported(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_avx(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_fma(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_avx2(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_avxvnni(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_avx512f(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_amd_zen4(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_arm80(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
bool llamafile_sgemm_arm82(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);

bool llamafile_mixmul_unsupported(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_avx(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_fma(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_avx2(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_avxvnni(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_avx512f(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_amd_zen4(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_arm80(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_arm82(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
bool llamafile_mixmul_iqk(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);

#endif

#ifdef __cplusplus
}
#endif


================================================
FILE: third_party/llamafile/tinyblas_cpu.h
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu.h
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
//
//                                ██████╗ ██╗   █████╗ ██████╗
//         ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║  ██╔══██╗██╔═══╝
//         ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║  ███████║██████╗
//           ██║  ██║██▀███║╚███╔╝██╔══██╗██║  ██╔══██║╔═══██║
//           ██║  ██║██║ ██║ ███║ ██████╔╝████╗██║  ██║██████║
//           ╚═╝  ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝  ╚═╝╚═════╝
//
//                   BASIC LINEAR ALGEBRA SUBPROGRAMS
//
//
// This file implements multithreaded CPU matrix multiplication for the
// common contiguous use case C = Aᵀ * B. These kernels are designed to
// have excellent performance[1] for matrices that fit in the CPU cache
// without imposing any overhead such as cache filling or malloc calls.
//
// This implementation does not guarantee any upper bound with rounding
// errors, which grow along with k. Our goal's to maximally exploit the
// hardware for performance, and then use whatever resources remain for
// improving numerical accuracy.
//
// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].

#pragma once

#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
// #include "log.h"
#include "flags.h"
#include "sgemm.h"
// #include <cosmo.h>

#pragma GCC diagnostic ignored "-Wpedantic"
#pragma GCC diagnostic ignored "-Wignored-attributes"

#define ROW_ALIGN 64
#define MATRIX_ALIGN 4096
#define MAX_ALIGN 4096

#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE __attribute__((__noinline__))
#endif

#if defined(__ARM_NEON) || defined(__AVX512F__)
#define VECTOR_REGISTERS 32
#else
#define VECTOR_REGISTERS 16
#endif

#if 0
#define NOT_SUPPORTED tinyBLAS_not_supported(__FILE__, __LINE__)
#else
#define NOT_SUPPORTED false
#endif
#define WANT_QUANTIZATION false

namespace {

bool tinyBLAS_not_supported(const char* file, int line) {
    // tinylogf("%s:%d: tinyBLAS not supported\n", file, line);
    return false;
}

inline float unhalf(ggml_fp16_t d) {
    return GGML_FP16_TO_FP32(d);
}
inline float unhalf(ggml_bf16_t d) {
    return GGML_BF16_TO_FP32(d);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// MATRIX MEMORY INDEXING

#define NCA 1
#define NCB 2
#define NCC 4

#define INDEX(A, lda, j, i) (CONFIG & NC##A ? ((T##A**)A)[j] + i : A + lda * (j) + i)

////////////////////////////////////////////////////////////////////////////////////////////////////
// GGML TYPE TRAITS

template <typename T>
struct ggml_type_trait;
template <>
struct ggml_type_trait<float> {
    static constexpr ggml_type id = GGML_TYPE_F32;
};
template <>
struct ggml_type_trait<ggml_bf16_t> {
    static constexpr ggml_type id = GGML_TYPE_BF16;
};
template <>
struct ggml_type_trait<ggml_fp16_t> {
    static constexpr ggml_type id = GGML_TYPE_F16;
};
template <>
struct ggml_type_trait<block_q8_0> {
    static constexpr ggml_type id = GGML_TYPE_Q8_0;
};

////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED ARITHMETIC OPERATIONS

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline __m128 add(__m128 x, __m128 y) {
    return _mm_add_ps(x, y);
}
inline __m128 sub(__m128 x, __m128 y) {
    return _mm_sub_ps(x, y);
}
inline __m128 mul(__m128 x, __m128 y) {
    return _mm_mul_ps(x, y);
}
#endif  // __SSE__

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline __m256 add(__m256 x, __m256 y) {
    return _mm256_add_ps(x, y);
}
inline __m256 sub(__m256 x, __m256 y) {
    return _mm256_sub_ps(x, y);
}
inline __m256 mul(__m256 x, __m256 y) {
    return _mm256_mul_ps(x, y);
}
#endif  // __AVX__

#if defined(__AVX512F__)
inline __m512 add(__m512 x, __m512 y) {
    return _mm512_add_ps(x, y);
}
inline __m512 sub(__m512 x, __m512 y) {
    return _mm512_sub_ps(x, y);
}
inline __m512 mul(__m512 x, __m512 y) {
    return _mm512_mul_ps(x, y);
}
#endif  // __AVX512F__

#if defined(__ARM_NEON)
inline float32x4_t add(float32x4_t x, float32x4_t y) {
    return vaddq_f32(x, y);
}
inline float32x4_t sub(float32x4_t x, float32x4_t y) {
    return vsubq_f32(x, y);
}
inline float32x4_t mul(float32x4_t x, float32x4_t y) {
    return vmulq_f32(x, y);
}
#endif  // __ARM_NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
inline float16x8_t add(float16x8_t x, float16x8_t y) {
    return vaddq_f16(x, y);
}
inline float16x8_t sub(float16x8_t x, float16x8_t y) {
    return vsubq_f16(x, y);
}
inline float16x8_t mul(float16x8_t x, float16x8_t y) {
    return vmulq_f16(x, y);
}
#endif  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED FUSED MULTIPLY ADD

/**
 * Computes a * b + c.
 */
template <typename T, typename U>
inline U madd(T a, T b, U c) {
    return add(mul(a, b), c);
}

/**
 * Computes a * b + c with error correction.
 *
 * @see W. Kahan, "Further remarks on reducing truncation errors,"
 *    Communications of the ACM, vol. 8, no. 1, p. 40, Jan. 1965,
 *    doi: 10.1145/363707.363723.
 */
template <typename T, typename U>
inline U madder(T a, T b, U c, U* e) {
    U y = sub(mul(a, b), *e);
    U t = add(c, y);
    *e = sub(sub(t, c), y);
    return t;
}

#ifdef __ARM_NEON
inline float32x4_t badder(float32x4_t a, float b, float32x4_t c, float32x4_t* e) {
    float32x4_t y = sub(vmulq_n_f32(a, b), *e);
    float32x4_t t = add(c, y);
    *e = sub(sub(t, c), y);
    return t;
}
#endif

#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template <>
inline __m256 madd(__m256 a, __m256 b, __m256 c) {
    return _mm256_fmadd_ps(a, b, c);
}
#endif
#if defined(__AVX512F__)
template <>
inline __m512 madd(__m512 a, __m512 b, __m512 c) {
    return _mm512_fmadd_ps(a, b, c);
}
#endif
#endif

#if defined(__ARM_FEATURE_FMA)
template <>
inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
    return vfmaq_f32(c, a, b);
}
#if 0  // todo: this specialization chops gcc 12.3 performance in half
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) && 0
template <>
inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
    return vfmaq_f16(c, b, a);
}
#endif
#endif
#endif

#if defined(__AVX512BF16__)
template <>
inline __m512 madd(__m512bh x, __m512bh y, __m512 z) {
    return _mm512_dpbf16_ps(z, x, y);
}
template <>
inline __m512 madder(__m512bh x, __m512bh y, __m512 z, __m512* _) {
    return _mm512_dpbf16_ps(z, x, y);
}
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED HORIZONTAL SUM

#if defined(__ARM_NEON)
inline float hsum(float32x4_t x) {
    return vaddvq_f32(x);
}
#endif  // __ARM_NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
inline float hsum(float16x8_t x) {
    // todo: this works great on clang but it produces terrible code on gcc 12.3
    return vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(x)), vcvt_f32_f16(vget_high_f16(x))));
}
#endif  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline float hsum(__m128 x) {
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
    x = _mm_add_ps(x, _mm_movehl_ps(x, x));
    x = _mm_add_ss(x, _mm_movehdup_ps(x));
#else
    __m128 t;
    t = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1));
    x = _mm_add_ps(x, t);
    t = _mm_movehl_ps(t, x);
    x = _mm_add_ss(x, t);
#endif
    return _mm_cvtss_f32(x);
}
#endif

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline float hsum(__m256 x) {
    return hsum(_mm_add_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x)));
}
#endif  // __AVX__

#if defined(__AVX512F__)
inline float hsum(__m512 x) {
    return _mm512_reduce_add_ps(x);
}
#endif  // __AVX512F__

////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED MEMORY LOADING

template <typename T, typename U>
T load(const U*);

template <>
inline float load(const float* p) {
    return *p;
}
template <>
inline float load(const ggml_fp16_t* p) {
    return unhalf(*p);
}
template <>
inline float load(const ggml_bf16_t* p) {
    return unhalf(*p);
}

#if defined(__ARM_NEON)
template <>
inline float32x4_t load(const float* p) {
    return vld1q_f32(p);
}
template <>
inline float32x4_t load(const ggml_bf16_t* p) {
    return vreinterpretq_f32_u32(vshll_n_u16(vld1_u16((const unsigned short*)p), 16));
}
#if !defined(_MSC_VER)
template <>
inline float16x8_t load(const ggml_fp16_t* p) {
    return vld1q_f16((const float16_t*)p);
}
template <>
inline float32x4_t load(const ggml_fp16_t* p) {
    return vcvt_f32_f16(vld1_f16((const float16_t*)p));
}
#endif  // _MSC_VER
#endif  // __ARM_NEON

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template <>
inline __m128 load(const float* p) {
    return _mm_loadu_ps(p);
}
#endif  // __SSE__

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template <>
inline __m256 load(const float* p) {
    return _mm256_loadu_ps(p);
}
#endif  // __AVX__

#if defined(__AVX2__) || defined(__AVX512F__)
template <>
inline __m256 load(const ggml_bf16_t* p) {
    return _mm256_castsi256_ps(
        _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)p)), 16));
}
#endif  // __AVX2__

#if defined(__F16C__)
template <>
inline __m256 load(const ggml_fp16_t* p) {
    return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)p));
}
#endif  // __F16C__

#if defined(__AVX512F__)
template <>
inline __m512 load(const float* p) {
    return _mm512_loadu_ps(p);
}
template <>
inline __m512 load(const ggml_fp16_t* p) {
    return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)p));
}
template <>
inline __m512 load(const ggml_bf16_t* p) {
    return _mm512_castsi512_ps(
        _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)p)), 16));
}
#endif  // __AVX512F__

#if defined(__AVX512BF16__)
template <>
inline __m512bh load(const ggml_bf16_t* p) {
    return (__m512bh)_mm512_loadu_ps((const float*)p);
}
template <>
inline __m512bh load(const float* p) {
    return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
}
#endif  // __AVX512BF16__

////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT OUTPUT STREAMING

inline void store(float* p, float f) {
    *p = f;
}

inline void store(ggml_fp16_t* p, float f) {
    *p = GGML_FP32_TO_FP16(f);
}

inline void store(ggml_bf16_t* p, float f) {
    *p = GGML_FP32_TO_BF16(f);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION

template <int CONFIG, int KN, typename D, typename V, typename TA, typename TB, typename TC>
class tinyBLAS {
   public:
    tinyBLAS(long k, const TA* A, long lda, const TB* B, long ldb, TC* C, long ldc, int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    void matmul(long m, long n, int task) {
        if (task == GGML_TASK_TYPE_COMPUTE)
            mnpack(0, m, 0, n);
    }

   private:
    NOINLINE void mnpack(long m0, long m, long n0, long n) {
        long mc, nc, mp, np;

#if VECTOR_REGISTERS == 32
        if (!FLAG_precise) {
            switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
                case 0x55:
                    mc = 5;
                    nc = 5;
                    gemm<5, 5, false>(m0, m, n0, n);
                    break;
                case 0x54:
                case 0x53:
                case 0x52:
                case 0x45:
                case 0x44:
                case 0x43:
                case 0x42:
                case 0x35:
                case 0x34:
                case 0x33:
                case 0x32:
                case 0x25:
                case 0x24:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, false>(m0, m, n0, n);
                    break;
                case 0x51:
                case 0x41:
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, false>(m0, m, n0, n);
                    break;
                case 0x15:
                case 0x14:
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, false>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, false>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else {
            switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 3)) {
                case 0x43:
                    mc = 4;
                    nc = 3;
                    gemm<4, 3, true>(m0, m, n0, n);
                    break;
                case 0x42:
                case 0x33:
                case 0x32:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, true>(m0, m, n0, n);
                    break;
                case 0x41:
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, true>(m0, m, n0, n);
                    break;
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, true>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, true>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        }
#endif

#if VECTOR_REGISTERS == 16
        if (!FLAG_precise) {
            switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 3)) {
                case 0x43:
                    mc = 4;
                    nc = 3;
                    gemm<4, 3, false>(m0, m, n0, n);
                    break;
                case 0x42:
                case 0x33:
                case 0x32:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, false>(m0, m, n0, n);
                    break;
                case 0x41:
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, false>(m0, m, n0, n);
                    break;
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, false>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, false>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else {
            switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 2)) {
                case 0x32:
                    mc = 3;
                    nc = 2;
                    gemm<3, 2, true>(m0, m, n0, n);
                    break;
                case 0x23:
                    mc = 2;
                    nc = 3;
                    gemm<2, 3, true>(m0, m, n0, n);
                    break;
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, true>(m0, m, n0, n);
                    break;
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, true>(m0, m, n0, n);
                    break;
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, true>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, true>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        }
#endif

        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

    template <int RM, int RN, int PRECISE>
    NOINLINE void gemm(long m0, long m, long n0, long n) {
        long ytiles = RM > 1 ? (m - m0) / RM : 1;
        long xtiles = RN > 1 ? (n - n0) / RN : 1;
        long tiles = xtiles * ytiles;
        long duty = (tiles + nth - 1) / nth;
        long start = duty * ith;
        long end = start + duty;
        if (end > tiles)
            end = tiles;
        for (long job = start; job < end; ++job) {
            long ii = m0 + job / xtiles * RM;
            long jj = n0 + job % xtiles * RN;
            D Cv[RN][RM] = {};
            D Ce[RN][RM] = {};
            for (long l = 0; l < k; l += KN)
#pragma GCC unroll 100
                for (int j = 0; j < RN; ++j)
#pragma GCC unroll 100
                    for (int i = 0; i < RM; ++i)
                        if (PRECISE)
                            Cv[j][i] = madder(load<V>(INDEX(A, lda, ii + i, l)),  //
                                              load<V>(INDEX(B, ldb, jj + j, l)),  //
                                              Cv[j][i], &Ce[j][i]);
                        else
                            Cv[j][i] = madd(load<V>(INDEX(A, lda, ii + i, l)),  //
                                            load<V>(INDEX(B, ldb, jj + j, l)),  //
                                            Cv[j][i]);
#pragma GCC unroll 100
            for (int j = 0; j < RN; ++j)
#pragma GCC unroll 100
                for (int i = 0; i < RM; ++i)
                    store(INDEX(C, ldc, jj + j, ii + i), hsum(Cv[j][i]));
        }
    }

    const TA* const A;
    const TB* const B;
    TC* const C;
    const long k;
    const long lda;
    const long ldb;
    const long ldc;
    const int ith;
    const int nth;
};

//////////////////////////////////////////////////////////////////////////////////////////
// QUANT ZERO MATRIX MULTIPLICATION

#if defined(__ARM_FEATURE_DOTPROD)
template <int CONFIG, typename TA, typename TB, typename TC>
class tinyBLAS_Q0_ARM {
   public:
    tinyBLAS_Q0_ARM(long k, const TA* A, long lda, const TB* B, long ldb, TC* C, long ldc, int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    void matmul(long m, long n, int task) {
        if (task == GGML_TASK_TYPE_COMPUTE)
            mnpack(0, m, 0, n);
    }

   private:
    NOINLINE void mnpack(long m0, long m, long n0, long n) {
        long mc, nc, mp, np;

        if (!FLAG_precise) {
            switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3)) {
                case 0x33:
                    mc = 3;
                    nc = 3;
                    gemm<3, 3, false>(m0, m, n0, n);
                    break;
                case 0x32:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, false>(m0, m, n0, n);
                    break;
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, false>(m0, m, n0, n);
                    break;
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, false>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, false>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else {
            switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3)) {
                case 0x33:
                    mc = 3;
                    nc = 3;
                    gemm<3, 3, true>(m0, m, n0, n);
                    break;
                case 0x32:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, true>(m0, m, n0, n);
                    break;
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, true>(m0, m, n0, n);
                    break;
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, true>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, true>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        }

        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

    template <int RM, int RN, int PRECISE>
    NOINLINE void gemm(long m0, long m, long n0, long n) {
        long ytiles = RM > 1 ? (m - m0) / RM : 1;
        long xtiles = RN > 1 ? (n - n0) / RN : 1;
        long tiles = xtiles * ytiles;
        long duty = (tiles + nth - 1) / nth;
        long start = duty * ith;
        long end = start + duty;
        if (end > tiles)
            end = tiles;
        for (long job = start; job < end; ++job) {
            long ii = m0 + job / xtiles * RM;
            long jj = n0 + job % xtiles * RN;
            float32x4_t Cv[RN][RM] = {};
            float32x4_t Ce[RN][RM] = {};
            for (int l = 0; l < k; ++l)
#pragma GCC unroll 100
                for (int j = 0; j < RN; ++j)
#pragma GCC unroll 100
                    for (int i = 0; i < RM; ++i) {
                        float32x4_t a = vcvtq_f32_s32(vdotq_s32(
                            vdotq_s32(vdupq_n_s32(0), load_lo(INDEX(A, lda, ii + i, l)),
                                      load_lo(INDEX(B, ldb, jj + j, l))),
                            load_hi(INDEX(A, lda, ii + i, l)), load_hi(INDEX(B, ldb, jj + j, l))));
                        float b = unhalf(INDEX(A, lda, ii + i, l)->d) *
                                  unhalf(INDEX(B, ldb, jj + j, l)->d);
                        if (PRECISE)
                            Cv[j][i] = badder(a, b, Cv[j][i], &Ce[j][i]);
                        else
                            Cv[j][i] = vmlaq_n_f32(Cv[j][i], a, b);
                    }
#pragma GCC unroll 100
            for (int j = 0; j < RN; ++j)
#pragma GCC unroll 100
                for (int i = 0; i < RM; ++i)
                    store(INDEX(C, ldc, jj + j, ii + i), hsum(Cv[j][i]));
        }
    }

    inline int8x16_t load_lo(const block_q8_0* b) {
        return vld1q_s8(b->qs);
    }

    inline int8x16_t load_hi(const block_q8_0* b) {
        return vld1q_s8(b->qs + 16);
    }

    inline int8x16_t load_lo(const block_q4_0* b) {
        return vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vld1q_u8(b->qs), vdupq_n_u8(0x0f))),
                        vdupq_n_s8(0x8));
    }

    inline int8x16_t load_hi(const block_q4_0* b) {
        return vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(vld1q_u8(b->qs), 4)), vdupq_n_s8(0x8));
    }

    const TA* const A;
    const TB* const B;
    TC* const C;
    const long k;
    const long lda;
    const long ldb;
    const long ldc;
    const int ith;
    const int nth;
};
#endif  // __ARM_FEATURE_DOTPROD

#if defined(__AVX2__) || defined(__AVX512F__)
template <int CONFIG, typename TA, typename TB, typename TC>
class tinyBLAS_Q0_AVX2 {
   public:
    tinyBLAS_Q0_AVX2(long k, const TA* A, long lda, const TB* B, long ldb, TC* C, long ldc, int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    void matmul(long m, long n, int task) {
        if (task == GGML_TASK_TYPE_COMPUTE)
            mnpack(0, m, 0, n);
    }

   private:
    void mnpack(long m0, long m, long n0, long n) {
        long mc, nc, mp, np;

#if VECTOR_REGISTERS == 32
        if (!FLAG_precise) {
            switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3)) {
                case 0x33:
                    mc = 3;
                    nc = 3;
                    gemm<3, 3, false>(m0, m, n0, n);
                    break;
                case 0x32:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, false>(m0, m, n0, n);
                    break;
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, true>(m0, m, n0, n);
                    break;
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, true>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, true>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else {
            switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3)) {
                case 0x33:
                    mc = 3;
                    nc = 3;
                    gemm<3, 3, true>(m0, m, n0, n);
                    break;
                case 0x32:
                case 0x23:
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, true>(m0, m, n0, n);
                    break;
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, true>(m0, m, n0, n);
                    break;
                case 0x13:
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, true>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, true>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        }
#endif

#if VECTOR_REGISTERS == 16
        if (!FLAG_precise) {
            switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 2)) {
                case 0x32:
                    mc = 3;
                    nc = 2;
                    gemm<3, 2, false>(m0, m, n0, n);
                    break;
                case 0x23:
                    mc = 2;
                    nc = 3;
                    gemm<2, 3, false>(m0, m, n0, n);
                    break;
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm<2, 2, false>(m0, m, n0, n);
                    break;
                case 0x31:
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, false>(m0, m, n0, n);
                    break;
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, false>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, false>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        } else {
            switch ((MIN(m - m0, 2) << 4) | MIN(n - n0, 1)) {
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm<2, 1, true>(m0, m, n0, n);
                    break;
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm<1, 2, true>(m0, m, n0, n);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm<1, 1, true>(m0, m, n0, n);
                    break;
                default:
                    return;
            }
        }
#endif

        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

    template <int RM, int RN, int PRECISE>
    NOINLINE void gemm(long m0, long m, long n0, long n) {
        long ytiles = RM > 1 ? (m - m0) / RM : 1;
        long xtiles = RN > 1 ? (n - n0) / RN : 1;
        long tiles = xtiles * ytiles;
        long duty = (tiles + nth - 1) / nth;
        long start = duty * ith;
        long end = start + duty;
        if (end > tiles)
            end = tiles;
        for (long job = start; job < end; ++job) {
            long ii = m0 + job / xtiles * RM;
            long jj = n0 + job % xtiles * RN;
            __m256 Cv[RN][RM] = {};
            __m256 Ce[RN][RM] = {};
            for (long l = 0; l < k; ++l)
#pragma GCC unroll 100
                for (int j = 0; j < RN; ++j)
#pragma GCC unroll 100
                    for (int i = 0; i < RM; ++i) {
                        __m256 a = _mm256_set1_ps(unhalf(INDEX(A, lda, ii + i, l)->d) *
                                                  unhalf(INDEX(B, ldb, jj + j, l)->d));
                        __m256 b = updot(_mm256_sign_epi8(load(INDEX(A, lda, ii + i, l)),
                                                          load(INDEX(A, lda, ii + i, l))),
                                         _mm256_sign_epi8(load(INDEX(B, ldb, jj + j, l)),
                                                          load(INDEX(A, lda, ii + i, l))));
                        if (PRECISE)
                            Cv[j][i] = madder(a, b, Cv[j][i], &Ce[j][i]);
                        else
                            Cv[j][i] = madd(a, b, Cv[j][i]);
                    }
#pragma GCC unroll 100
            for (int j = 0; j < RN; ++j)
#pragma GCC unroll 100
                for (int i = 0; i < RM; ++i)
                    store(INDEX(C, ldc, jj + j, ii + i), hsum(Cv[j][i]));
        }
    }

    inline __m256i load(const block_q8_0* b) {
        return _mm256_loadu_si256((const __m256i*)b->qs);
    }

    inline __m256i load(const block_q4_0* b) {
        __m128i x = _mm_loadu_si128((const __m128i*)b->qs);
        return _mm256_sub_epi8(_mm256_and_si256(_mm256_set1_epi8(15),
                                                _mm256_insertf128_si256(_mm256_castsi128_si256(x),
                                                                        _mm_srli_epi16(x, 4), 1)),
                               _mm256_set1_epi8(8));
    }

    inline __m256 updot(__m256i u, __m256i s) {
        __m256i res;
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
        res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
#else
        res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
#endif
        return _mm256_cvtepi32_ps(res);
    }

    const TA* const A;
    const TB* const B;
    TC* const C;
    const long k;
    const long lda;
    const long ldb;
    const long ldc;
    const int ith;
    const int nth;
};
#endif  // __AVX2__

}  // namespace


================================================
FILE: third_party/llamafile/tinyblas_cpu_mixmul.inc
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul.inc
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "tinyblas_cpu.h"

//
//
//                                ██████╗ ██╗   █████╗ ██████╗
//         ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║  ██╔══██╗██╔═══╝
//         ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║  ███████║██████╗
//           ██║  ██║██▀███║╚███╔╝██╔══██╗██║  ██╔══██║╔═══██║
//           ██║  ██║██║ ██║ ███║ ██████╔╝████╗██║  ██║██████║
//           ╚═╝  ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝  ╚═╝╚═════╝
//
//               MIXTURE OF EXPERTS TENSOR MULTIPLICATION
//
//
// SHAPES
//
//   - weights [cols, rows, experts]
//   - thought [cols, tasks, tokens] w/ tasks ≤ thinkers
//   - result  [rows, thinkers, tokens] w/ thinkers ≤ experts
//   - plan    [thinkers, tokens] w/ i32 < experts
//
// DEFINITION
//
//   for thinker in range(thinkers):
//     for token in range(tokens):
//       for row in range(rows):
//         c = 0
//         for col in range(cols):
//           expert = plan[token][thinker]
//           a = weights[expert][row][col]
//           b = thought[token][thinker % tasks][col]
//           c += a * b
//         result[token][thinker][row] = c
//
// REGULARITIES
//
//   - tokens can be odd
//   - thinkers is usually 2
//   - tasks is usually 1 or 2
//   - cols should be a multiple of 64
//   - rows should be a multiple of 64
//   - experts is usually 8 but could be 60
//   - tokens is always 1 for token generation
//   - tokens can be huge for prompt processing
//
// EXAMPLE
//
//   mixtral 8x7b w/ 217 token prompt
//
//           |  ne*0 ne*1 ne*2 ne*3 | nb*0    nb*1      nb*2       nb*3 | type
//   =========================================================================
//   weights | 16384 6144    8    1 |   18  0x2400 0x3600000 0x1b000000 | q4_0
//   thought | 16384    2  217    1 |    4 0x10000   0x20000  0x1b20000 | f32
//   result  |  6144    2  217    1 |    4  0x6000    0xc000   0xa2c000 | f32
//   plan    |     2  217    1    1 |    4    0x20    0x1b20     0x1b20 | i32
//

namespace {

class MixMul {
   public:
    MixMul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result)
        : params(params),
          weights(weights),
          thought(thought),
          plan(plan),
          result(result),
          rows(weights->ne[1]),
          cols(weights->ne[0]),
          experts(weights->ne[2]),
          thinkers(plan->ne[0]),
          tasks(thought->ne[1]),
          tokens(thought->ne[2]),
          ldq((cols * 2 + ROW_ALIGN - 1) & -ROW_ALIGN),
          wdata_((char*)(((uintptr_t)params->wdata + MAX_ALIGN - 1) & -MAX_ALIGN)),
          allocated_(0) {
    }

    bool allocate_shared_memory() {
        if (!(quantized_thought_ = allocate<char>(MATRIX_ALIGN, tokens * tasks * ldq)))
            return false;
        if (!(rowptr_result_ = allocate<uintptr_t>(ROW_ALIGN, experts * tokens * thinkers)))
            return false;
        if (!(rowptr_thought_ = allocate<uintptr_t>(ROW_ALIGN, experts * tokens * thinkers)))
            return false;
        if (!(rowptr_count_ = allocate<long>(sizeof(long), experts)))
            return false;
        return true;
    }

    size_t get_allocated_bytes() {
        return (wdata_ - (char*)params->wdata) + allocated_;
    }

    bool mixmul() {
        // invariants
        assert(tasks <= thinkers);
        assert(thinkers <= experts);
        assert(tokens == plan->ne[1]);
        assert(rows == result->ne[0]);
        assert(cols == thought->ne[0]);
        assert(tokens == result->ne[2]);
        assert(thinkers == result->ne[1]);

        // dimensionality
        assert(plan->ne[2] == 1);
        assert(plan->ne[3] == 1);
        assert(result->ne[3] == 1);
        assert(weights->ne[3] == 1);
        assert(thought->ne[3] == 1);

        // miscellaneous
        assert(params->nth > 0);
        assert(params->ith < params->nth);
        assert(plan->type == GGML_TYPE_I32);

        // check nb01 is convertible to lda
        if (weights->nb[1] % ggml_type_size(weights->type))
            return false;

        // no support for column strides
        if (result->nb[0] != ggml_type_size(result->type))
            return false;
        if (thought->nb[0] != ggml_type_size(thought->type))
            return false;
        if (weights->nb[0] != ggml_type_size(weights->type))
            return false;

        // supported output types
        switch (result->type) {
            case GGML_TYPE_F32:
                return mixmuler<float>();
            default:
                return false;
        }
    }

   private:
    template <typename TC>
    bool mixmuler() {
        switch (weights->type) {
            case GGML_TYPE_F32:
                if (thought->type != GGML_TYPE_F32)
                    return false;
#if defined(__AVX512F__)
                return mixmat<16, 1, tinyBLAS<NCB | NCC, 16, __m512, __m512, float, float, TC>, float,
                              float, TC>();
#elif defined(__AVX__) || defined(__AVX2__)
                return mixmat<8, 1, tinyBLAS<NCB | NCC, 8, __m256, __m256, float, float, TC>, float,
                              float, TC>();
#elif defined(__SSE__)
                return mixmat<4, 1, tinyBLAS<NCB | NCC, 4, __m128, __m128, float, float, TC>, float,
                              float, TC>();
#elif defined(__ARM_NEON)
                return mixmat<4, 1, tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, float, float, TC>,
                              float, float, TC>();
#else
                return false;
#endif

            case GGML_TYPE_BF16:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_BF16)
                    return false;
#if defined(__AVX512BF16__)
                if (!FLAG_precise) {
                    return mixmat<
                        32, 1, tinyBLAS<NCB | NCC, 32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, TC>,
                        ggml_bf16_t, ggml_bf16_t, TC>();
                } else {
                    return mixmat<16, 1,
                                  tinyBLAS<NCB | NCC, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC>,
                                  ggml_bf16_t, ggml_bf16_t, TC>();
                }
#elif defined(__AVX512F__)
                return mixmat<16, 1,
                              tinyBLAS<NCB | NCC, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC>,
                              ggml_bf16_t, ggml_bf16_t, TC>();
#elif defined(__AVX2__)
                return mixmat<8, 1,
                              tinyBLAS<NCB | NCC, 8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, TC>,
                              ggml_bf16_t, ggml_bf16_t, TC>();
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
                return mixmat<
                    4, 1,
                    tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, ggml_bf16_t, ggml_bf16_t, TC>,
                    ggml_bf16_t, ggml_bf16_t, TC>();
#else
                return false;
#endif

            case GGML_TYPE_F16:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_F16)
                    return false;
#if defined(__AVX512F__)
                return mixmat<16, 1,
                              tinyBLAS<NCB | NCC, 16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, TC>,
                              ggml_fp16_t, ggml_fp16_t, TC>();
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
                // if (X86_CHECK(F16C)) {
                return mixmat<8, 1,
                              tinyBLAS<NCB | NCC, 8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, TC>,
                              ggml_fp16_t, ggml_fp16_t, TC>();
                // } else {
                //     return false;
                // }
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
                if (result->op_params[0] == GGML_PREC_F32) {
                    return mixmat<
                        4, 1,
                        tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, TC>,
                        ggml_fp16_t, ggml_fp16_t, TC>();
                } else {
                    return mixmat<
                        8, 1,
                        tinyBLAS<NCB | NCC, 8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, TC>,
                        ggml_fp16_t, ggml_fp16_t, TC>();
                }
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
                return mixmat<
                    4, 1,
                    tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, TC>,
                    ggml_fp16_t, ggml_fp16_t, TC>();
#else
                return false;
#endif

            case GGML_TYPE_Q4_0:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_Q8_0)
                    return false;
#if defined(__AVX2__) || defined(__AVX512F__)
                return mixmat<32, 32, tinyBLAS_Q0_AVX2<NCB | NCC, block_q4_0, block_q8_0, TC>,
                              block_q4_0, block_q8_0, TC>();
#elif defined(__ARM_FEATURE_DOTPROD)
                return mixmat<32, 32, tinyBLAS_Q0_ARM<NCB | NCC, block_q4_0, block_q8_0, TC>,
                              block_q4_0, block_q8_0, TC>();
#else
                return false;
#endif

            case GGML_TYPE_Q8_0:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_Q8_0)
                    return false;
#if defined(__AVX2__) || defined(__AVX512F__)
                return mixmat<32, 32, tinyBLAS_Q0_AVX2<NCB | NCC, block_q8_0, block_q8_0, TC>,
                              block_q8_0, block_q8_0, TC>();
#elif defined(__ARM_FEATURE_DOTPROD)
                return mixmat<32, 32, tinyBLAS_Q0_ARM<NCB | NCC, block_q8_0, block_q8_0, TC>,
                              block_q8_0, block_q8_0, TC>();
#else
                return false;
#endif

            default:
                return false;
        }
    }

    template <int KN, int BS, typename BLAS, typename TA, typename TB, typename TC>
    bool mixmat() {
        if (cols % KN)
            return false;
        switch (params->type) {
            case GGML_TASK_TYPE_INIT:
                if (thought->type != ggml_type_trait<TB>::id)
                    quantize_thought(ggml_type_trait<TB>::id);
                build_row_pointers(ggml_type_trait<TB>::id);
                return true;
            case GGML_TASK_TYPE_COMPUTE:
                assert(!(cols % BS));
                assert(!(weights->nb[1] % sizeof(TA)));
                for (int expert = 0; expert < experts; ++expert) {
                    BLAS tb{cols / BS,
                            (const TA*)((const char*)weights->data + expert * weights->nb[2]),
                            (long)(weights->nb[1] / sizeof(TA)),
                            (const TB*)(rowptr_thought_ + expert * tokens * thinkers),
                            0,
                            (TC*)(rowptr_result_ + expert * tokens * thinkers),
                            0,
                            params->ith,
                            params->nth};
                    tb.matmul(rows, rowptr_count_[expert], GGML_TASK_TYPE_COMPUTE);
                }
                return true;
            default:
                return true;
        }
    }

    void build_row_pointers(ggml_type vec_dot_type) {
        for (int expert = params->ith; expert < experts; expert += params->nth) {
            long count = 0;
            for (long token = 0; token < tokens; ++token)
                for (int thinker = 0; thinker < thinkers; ++thinker)
                    if (expert == *(const int32_t*)((const char*)plan->data +
                                                    token * plan->nb[1] + thinker * plan->nb[0])) {
                        long row = count++;
                        long idx = expert * thinkers * tokens + row;
                        rowptr_result_[idx] =
                            (uintptr_t)((char*)result->data + token * result->nb[2] +
                                        thinker * result->nb[1]);
                        if (thought->type == vec_dot_type)
                            rowptr_thought_[idx] =
                                (uintptr_t)((char*)thought->data + token * thought->nb[2] +
                                            thinker % tasks * thought->nb[1]);
                        else
                            rowptr_thought_[idx] =
                                (uintptr_t)((char*)quantized_thought_ + token * tasks * ldq +
                                            thinker % tasks * ldq);
                    }
            rowptr_count_[expert] = count;
        }
    }

    void quantize_thought(ggml_type vec_dot_type) {
        long chore = 0;
        for (long token = 0; token < tokens; ++token)
            for (int task = 0; task < tasks; ++task)
                if (chore++ % params->nth == params->ith)
                    quantize_row(quantized_thought_ + token * tasks * ldq + task * ldq,
                                 (const float*)((const char*)thought->data +
                                                token * thought->nb[2] + task * thought->nb[1]),
                                 vec_dot_type);
    }

    void quantize_row(void* dst, const float* src, ggml_type type) {
        assert((long)ggml_row_size(type, cols) <= ldq);
        switch (type) {
            case GGML_TYPE_F16:
                ggml_fp32_to_fp16_row(src, (ggml_fp16_t*)dst, cols);
                break;
            case GGML_TYPE_BF16:
                ggml_fp32_to_bf16_row(src, (ggml_bf16_t*)dst, cols);
                break;
            case GGML_TYPE_Q8_0:
                quantize_row_q8_0((const float*)src, (block_q8_0*)dst, cols);
                break;
            default:
                GGML_UNREACHABLE();
        }
    }

    template <typename T>
    T* allocate(size_t align, size_t elems) {
        T* res = nullptr;
        size_t need = sizeof(T) * elems;
        size_t base = allocated_;
        base += align - 1;
        base &= -align;
        size_t toto = base + need;
        if (toto >= allocated_ && toto <= params->wsize) {
            res = (T*)(wdata_ + base);
            allocated_ = toto;
        }
        return res;
    }

    const ggml_compute_params* const params;
    const ggml_tensor* const weights;
    const ggml_tensor* const thought;
    const ggml_tensor* const plan;
    ggml_tensor* const result;
    const long rows;
    const long cols;
    const int experts;
    const int thinkers;
    const int tasks;
    const long tokens;
    const long ldq;

    // variables
    char* const wdata_;
    size_t allocated_;

    // shared memory
    long* rowptr_count_ /*[experts]*/;
    char* quantized_thought_ /*[tokens][tasks][cols][2]*/;
    uintptr_t* rowptr_result_ /*[experts][tokens*thinkers]*/;
    uintptr_t* rowptr_thought_ /*[experts][tokens*thinkers]*/;
};

}  // namespace

/**
 * Performs "mixture of experts" tensor multiplication on CPU.
 */
bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) {
    MixMul mm{params, weights, thought, plan, result};
    return mm.allocate_shared_memory() && mm.mixmul();
}


================================================
FILE: third_party/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_avx
#include "tinyblas_cpu_mixmul.inc"

/**
 * Returns number of shared memory bytes llamafile_mixmul() needs.
 */
size_t llamafile_mixmul_needs(const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan) {
    ggml_compute_params params{};
    params.wsize = 0x7ffff000;
    params.wdata = (void*)0x1000;
    MixMul mm{&params, weights, thought, plan, 0};
    if (mm.allocate_shared_memory())
        return mm.get_allocated_bytes();
    else
        return 0;
}

#endif  // __x86_64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_avx2
#include "tinyblas_cpu_mixmul.inc"
#endif  // __x86_64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_avx512f
#include "tinyblas_cpu_mixmul.inc"
#endif  // __x86_64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_avxvnni
#include "tinyblas_cpu_mixmul.inc"
#endif  // __x86_64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_fma
#include "tinyblas_cpu_mixmul.inc"
#endif  // __x86_64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_mixmul llamafile_mixmul_amd_zen4
#include "tinyblas_cpu_mixmul.inc"
#endif  // __x86_64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_mixmul_arm80.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_arm80.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#ifdef __aarch64__
#define llamafile_mixmul llamafile_mixmul_arm80
#include "tinyblas_cpu_mixmul.inc"

/**
 * Returns number of shared memory bytes llamafile_mixmul() needs.
 */
size_t llamafile_mixmul_needs(const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan) {
    ggml_compute_params params{};
    params.wsize = 0x7ffff000;
    params.wdata = (void*)0x1000;
    MixMul mm{&params, weights, thought, plan, 0};
    if (mm.allocate_shared_memory())
        return mm.get_allocated_bytes();
    else
        return 0;
}

#endif  // __aarch64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_mixmul_arm82.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_arm82.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#ifdef __aarch64__
#define llamafile_mixmul llamafile_mixmul_arm82
#include "tinyblas_cpu_mixmul.inc"
#endif  // __aarch64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_sgemm.inc
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm.inc
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "tinyblas_cpu.h"

//
//
//                                ██████╗ ██╗   █████╗ ██████╗
//         ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║  ██╔══██╗██╔═══╝
//         ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║  ███████║██████╗
//           ██║  ██║██▀███║╚███╔╝██╔══██╗██║  ██╔══██║╔═══██║
//           ██║  ██║██║ ██║ ███║ ██████╔╝████╗██║  ██║██████║
//           ╚═╝  ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝  ╚═╝╚═════╝
//
//                   BASIC LINEAR ALGEBRA SUBPROGRAMS
//
//
// This file implements multithreaded CPU matrix multiplication for the
// common contiguous use case C = Aᵀ * B. These kernels are designed to
// have excellent performance[1] for matrices that fit in the CPU cache
// without imposing any overhead such as cache filling or malloc calls.
//
// This implementation does not guarantee any upper bound with rounding
// errors, which grow along with k. Our goal's to maximally exploit the
// hardware for performance, and then use whatever resources remain for
// improving numerical accuracy.
//
// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].

namespace {

template <typename TC>
bool llamafile_sgemm_impl(long m, long n, long k, const void* A, long lda, const void* B, long ldb, TC* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    switch (Atype) {
        case GGML_TYPE_F32: {
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
#if defined(__AVX512F__)
            if (k % 16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__AVX__) || defined(__AVX2__)
            if (k % 8)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_NEON)
            if (k % 4)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_BF16: {
#if defined(__AVX512BF16__)
            if (k % 32)
                return NOT_SUPPORTED;
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_BF16)
                return NOT_SUPPORTED;
            if (!FLAG_precise) {
                tinyBLAS<0, 32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            } else {
                tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
#elif defined(__AVX512F__)
            if (k % 16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__AVX2__)
            if (k % 8)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
            if (k % 4)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_F16: {
#if defined(__AVX512F__)
            if (k % 16)
                return NOT_SUPPORTED;
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_F16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
            // if (X86_CHECK(F16C)) {
            if (k % 8)
                return NOT_SUPPORTED;
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_F16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
            // } else {
            //     return NOT_SUPPORTED;
            // }
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
            if (n < 2 && !FLAG_precise)
                // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec?
                return NOT_SUPPORTED;
            if (precision == GGML_PREC_F32) {
                if (k % 4)
                    return NOT_SUPPORTED;
                if (Btype != GGML_TYPE_F32)
                    return NOT_SUPPORTED;
                tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            } else {
                if (k % 8)
                    return NOT_SUPPORTED;
                if (Btype == GGML_TYPE_F32)
                    return WANT_QUANTIZATION;
                if (Btype != GGML_TYPE_F16)
                    return NOT_SUPPORTED;
                tinyBLAS<0, 8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
            if (n < 2 && !FLAG_precise)
                // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec?
                return NOT_SUPPORTED;
            if (k % 4)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_Q8_0: {
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_Q8_0)
                return NOT_SUPPORTED;
#if defined(__AVX2__) || defined(__AVX512F__)
            tinyBLAS_Q0_AVX2<0, block_q8_0, block_q8_0, TC> tb{
                k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_FEATURE_DOTPROD)
            tinyBLAS_Q0_ARM<0, block_q8_0, block_q8_0, TC> tb{
                k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_Q4_0: {
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_Q8_0)
                return NOT_SUPPORTED;
#if defined(__AVX2__) || defined(__AVX512F__)
            tinyBLAS_Q0_AVX2<0, block_q4_0, block_q8_0, TC> tb{
                k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_FEATURE_DOTPROD)
            tinyBLAS_Q0_ARM<0, block_q4_0, block_q8_0, TC> tb{
                k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        default:
            return NOT_SUPPORTED;
    }

    (void)m;
    (void)n;
    (void)k;
    (void)A;
    (void)lda;
    (void)B;
    (void)ldb;
    (void)C;
    (void)ldc;
    (void)ith;
    (void)nth;
    (void)Atype;
    (void)Btype;
    (void)precision;
}

}  // namespace

/**
 * Performs optimized matrix multiplication on CPU.
 *
 * This subroutine may compute C = Aᵀ * B with column major ordering.
 * Despite its name, this isn't a generalized implementation. Work is
 * only performed when a handwritten kernel is written and available.
 * Otherwise the caller should fall back to a general matmul routine.
 *
 * For example, for single-threaded single-precision GEMM you can say
 *
 *     llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1,
 *                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32,
 *                     GGML_PREC_DEFAULT);
 *
 * @param m is rows in `A` and `C`
 * @param n is cols in `B` and `C`
 * @param k is cols in `A` and rows in `B`
 * @param A is first input matrix (always transposed)
 * @param lda is row stride of `A`
 * @param B is second input matrix (never transposed)
 * @param ldb is row stride of `B`
 * @param C is input/output array of output matrices
 * @param ldc is row stride of `C`
 * @param ith is thread id (must be less than `nth`)
 * @param nth is number of threads (must be greater than zero)
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @param precision may be used to control the internal compute type
 * @return true if this function was able to service the matmul request
 */
bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    assert(m >= 0);
    assert(n >= 0);
    assert(k >= 0);
    assert(lda >= k);
    assert(ldb >= k);
    assert(ldc >= m);
    assert(nth > 0);
    assert(ith < nth);

#if QK_K == 256
#if defined(__x86_64__) || defined(_M_X64)
#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))))
    /* 
    moonll
    more Btype accept
    }*/

    if (Ctype == GGML_TYPE_F32){
        if (iqk_mul_mat(m, n, k * ggml_blck_size(ggml_type(Atype)), Atype, A,lda,Btype, B,ldb, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }

#endif
#elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER
    if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
        if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }
    if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) {
        // assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32);
        assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32));
        if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }
#endif
#endif

    switch (Ctype) {
        case GGML_TYPE_F32:
            return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (float*)C, ldc, ith, nth, task, Atype,
                                        Btype, Ctype, precision);
        default:
            return NOT_SUPPORTED;
    }
}

================================================
FILE: third_party/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_avx
#include "tinyblas_cpu_sgemm.inc"
#endif  // __x86_64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_avx2
#include "tinyblas_cpu_sgemm.inc"
#endif  // __x86_64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_avx512f
#include "tinyblas_cpu_sgemm.inc"
#endif  // __x86_64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_avxvnni
#include "tinyblas_cpu_sgemm.inc"
#endif  // __x86_64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_fma
#include "tinyblas_cpu_sgemm.inc"
#endif  // __x86_64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#if defined(__x86_64__) || defined(_M_X64)
#define llamafile_sgemm llamafile_sgemm_amd_zen4
#define iqk_mul_mat iqk_mul_mat_zen4
#include "tinyblas_cpu_sgemm.inc"
#endif  // __x86_64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_sgemm_arm80.cpp
================================================
// // Adapted from
// // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_arm80.cpp
// // Copyrigth 2024 Mozilla Foundation.
// // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// #ifdef __aarch64__
// #define llamafile_sgemm llamafile_sgemm_arm80
// #include "tinyblas_cpu_sgemm.inc"
// #endif  // __aarch64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_sgemm_arm82.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_arm82.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#ifdef __aarch64__
#define llamafile_sgemm llamafile_sgemm_arm82
#define iqk_mul_mat iqk_mul_mat_arm82
#include "tinyblas_cpu_sgemm.inc"
#endif  // __aarch64__


================================================
FILE: third_party/llamafile/tinyblas_cpu_unsupported.cpp
================================================
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_unsupported.cpp
// Copyrigth 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "sgemm.h"

bool llamafile_sgemm_unsupported(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    return false;
}

bool llamafile_mixmul_unsupported(const struct ggml_compute_params* params,
                                  const struct ggml_tensor* weights,
                                  const struct ggml_tensor* thought,
                                  const struct ggml_tensor* plan,
                                  struct ggml_tensor* result) {
    return false;
}

bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int) {
    return false;
}


================================================
FILE: version.py
================================================
"""
KTransformers version information.
Shared across kt-kernel and kt-sft modules.
"""

__version__ = "0.5.2.post1"